some new features
@@ -0,0 +1,242 @@
"""
|
||||
Created on Tue Oct 6 12:42:11 2020
|
||||
|
||||
Author: Josef Perktold
|
||||
License: BSD-3
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
from statsmodels.stats.base import HolderTuple
|
||||
from statsmodels.stats.effect_size import _noncentrality_chisquare
|
||||
|
||||
|
||||
def test_chisquare_binning(counts, expected, sort_var=None, bins=10,
|
||||
df=None, ordered=False, sort_method="quicksort",
|
||||
alpha_nc=0.05):
|
||||
"""chisquare gof test with binning of data, Hosmer-Lemeshow type
|
||||
|
||||
``observed`` and ``expected`` are observation specific and should have
|
||||
observations in rows and choices in columns
|
||||
|
||||
Parameters
|
||||
----------
|
||||
counts : array_like
|
||||
Observed frequency, i.e. counts for all choices
|
||||
expected : array_like
|
||||
Expected counts or probability. If expected are counts, then they
|
||||
need to sum to the same total count as the sum of observed.
|
||||
If those sums are unequal and all expected values are smaller or equal
|
||||
to 1, then they are interpreted as probabilities and will be rescaled
|
||||
to match counts.
|
||||
sort_var : array_like
|
||||
1-dimensional array for binning. Groups will be formed according to
|
||||
quantiles of the sorted array ``sort_var``, so that group sizes have
|
||||
equal or approximately equal sizes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Holdertuple instance
|
||||
This instance contains the results of the chisquare test and some
|
||||
information about the data
|
||||
|
||||
- statistic : chisquare statistic of the goodness-of-fit test
|
||||
- pvalue : pvalue of the chisquare test
|
||||
= df : degrees of freedom of the test
|
||||
|
||||
Notes
|
||||
-----
|
||||
Degrees of freedom for Hosmer-Lemeshow tests are given by
|
||||
|
||||
g groups, c choices
|
||||
|
||||
- binary: `df = (g - 2)` for insample,
|
||||
Stata uses `df = g` for outsample
|
||||
- multinomial: `df = (g−2) *(c−1)`, reduces to (g-2) for binary c=2,
|
||||
(Fagerland, Hosmer, Bofin SIM 2008)
|
||||
- ordinal: `df = (g - 2) * (c - 1) + (c - 2)`, reduces to (g-2) for c=2,
|
||||
(Hosmer, ... ?)
|
||||
|
||||
Note: If there are ties in the ``sort_var`` array, then the split of
|
||||
observations into groups will depend on the sort algorithm.
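
    Examples
    --------
    A minimal sketch with synthetic data; the random data, the seed, and the
    choice of ``probs[:, -1]`` as ``sort_var`` are illustrative only.

    >>> import numpy as np
    >>> rng = np.random.default_rng(1234)
    >>> probs = rng.dirichlet([2, 3, 5], size=200)
    >>> counts = np.vstack([rng.multinomial(1, p) for p in probs])
    >>> score = probs[:, -1]
    >>> res = test_chisquare_binning(counts, probs, sort_var=score, bins=5)
    >>> stat, pval = res.statistic, res.pvalue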
    """

    observed = np.asarray(counts)
    expected = np.asarray(expected)
    n_observed = observed.sum()
    n_expected = expected.sum()
    if not np.allclose(n_observed, n_expected, atol=1e-13):
        if np.max(expected) < 1 + 1e-13:
            # expected seems to be probability, warn and rescale
            warnings.warn("sum of expected and of observed differ, "
                          "rescaling ``expected``")
            expected = expected / n_expected * n_observed
        else:
            # expected doesn't look like fractions or probabilities
            raise ValueError("total counts of expected and observed differ")

    # k = 1 if observed.ndim == 1 else observed.shape[1]
    if sort_var is not None:
        argsort = np.argsort(sort_var, kind=sort_method)
    else:
        argsort = np.arange(observed.shape[0])
    # indices = [arr for arr in np.array_split(argsort, bins, axis=0)]
    indices = np.array_split(argsort, bins, axis=0)
    # in one loop, observed expected in last dimension, too messy,
    # freqs_probs = np.array([np.vstack([observed[idx].mean(0),
    #                                    expected[idx].mean(0)]).T
    #                         for idx in indices])
    freqs = np.array([observed[idx].sum(0) for idx in indices])
    probs = np.array([expected[idx].sum(0) for idx in indices])

    # chisquare test
    resid_pearson = (freqs - probs) / np.sqrt(probs)
    chi2_stat_groups = ((freqs - probs)**2 / probs).sum(1)
    chi2_stat = chi2_stat_groups.sum()
    if df is None:
        g, c = freqs.shape
        if ordered is True:
            df = (g - 2) * (c - 1) + (c - 2)
        else:
            df = (g - 2) * (c - 1)
    pvalue = stats.chi2.sf(chi2_stat, df)
    noncentrality = _noncentrality_chisquare(chi2_stat, df, alpha=alpha_nc)

    res = HolderTuple(statistic=chi2_stat,
                      pvalue=pvalue,
                      df=df,
                      freqs=freqs,
                      probs=probs,
                      noncentrality=noncentrality,
                      resid_pearson=resid_pearson,
                      chi2_stat_groups=chi2_stat_groups,
                      indices=indices
                      )
    return res


def prob_larger_ordinal_choice(prob):
    """probability that observed category is larger than distribution prob

    This is a helper function for ordinal models, where endog is a 1-dim
    categorical variable and predicted probabilities are 2-dimensional with
    observations in rows and choices in columns.

    Parameters
    ----------
    prob : array_like
        Expected probabilities for ordinal choices, e.g. from prediction of
        an ordinal model with observations in rows and choices in columns.

    Returns
    -------
    cdf_mid : ndarray
        mid cdf, i.e. ``P(x < y) + 0.5 P(x = y)``
    r : ndarray
        Probability residual ``P(x < y) - P(x > y)`` for all possible
        choices. Computed as ``r = cdf_mid * 2 - 1``.

    References
    ----------
    .. [2] Li, Chun, and Bryan E. Shepherd. 2012. “A New Residual for Ordinal
       Outcomes.” Biometrika 99 (2): 473–80.

    See Also
    --------
    `statsmodels.stats.nonparametric.rank_compare_2ordinal`
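
    Examples
    --------
    A small illustrative sketch; the probabilities below are made up.

    >>> import numpy as np
    >>> probs = np.array([[0.2, 0.3, 0.5],
    ...                   [0.5, 0.3, 0.2]])
    >>> cdf_mid, resid = prob_larger_ordinal_choice(probs)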

    """
    # similar to `nonparametric rank_compare_2ordinal`

    prob = np.asarray(prob)
    cdf = prob.cumsum(-1)
    if cdf.ndim == 1:
        cdf_ = np.concatenate(([0], cdf))
    elif cdf.ndim == 2:
        cdf_ = np.concatenate((np.zeros((len(cdf), 1)), cdf), axis=1)
    # r_1 = cdf_[..., 1:] + cdf_[..., :-1] - 1
    cdf_mid = (cdf_[..., 1:] + cdf_[..., :-1]) / 2
    r = cdf_mid * 2 - 1
    return cdf_mid, r


def prob_larger_2ordinal(probs1, probs2):
    """Stochastically larger probability for two ordinal distributions

    Computes Pr(x1 > x2) + 0.5 * Pr(x1 = x2) for two ordered multinomial
    (ordinal) distributed random variables x1 and x2.

    This is vectorized with choices along the last axis.
    Broadcasting when ``probs2`` is 1-dimensional also appears to work
    correctly.

    Parameters
    ----------
    probs1 : array_like
        Probabilities of the ordinal choices for distribution 1, with
        choices along the last axis.
    probs2 : array_like
        Probabilities of the ordinal choices for distribution 2, with
        choices along the last axis.

    Returns
    -------
    prob1 : float
        Probability that random draw from distribution 1 is larger than a
        random draw from distribution 2. Pr(x1 > x2) + 0.5 * Pr(x1 = x2)
    prob2 : float
        prob2 = 1 - prob1 = Pr(x1 < x2) + 0.5 * Pr(x1 = x2)
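
    Examples
    --------
    A small illustrative sketch with two made-up ordinal distributions.

    >>> import numpy as np
    >>> p1 = np.array([0.2, 0.3, 0.5])
    >>> p2 = np.array([0.5, 0.3, 0.2])
    >>> p_larger, p_smaller = prob_larger_2ordinal(p1, p2)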

    """
    # count1 = np.asarray(count1)
    # count2 = np.asarray(count2)
    # nobs1, nobs2 = count1.sum(), count2.sum()
    # freq1 = count1 / nobs1
    # freq2 = count2 / nobs2

    # if freq1.ndim == 1:
    #     freq1_ = np.concatenate(([0], freq1))
    # elif freq1.ndim == 2:
    #     freq1_ = np.concatenate((np.zeros((len(freq1), 1)), freq1), axis=1)

    # if freq2.ndim == 1:
    #     freq2_ = np.concatenate(([0], freq2))
    # elif freq2.ndim == 2:
    #     freq2_ = np.concatenate((np.zeros((len(freq2), 1)), freq2), axis=1)

    freq1 = np.asarray(probs1)
    freq2 = np.asarray(probs2)
    # add zero at beginning of choices for cdf computation
    freq1_ = np.concatenate((np.zeros(freq1.shape[:-1] + (1,)), freq1),
                            axis=-1)
    freq2_ = np.concatenate((np.zeros(freq2.shape[:-1] + (1,)), freq2),
                            axis=-1)

    cdf1 = freq1_.cumsum(axis=-1)
    cdf2 = freq2_.cumsum(axis=-1)

    # mid rank cdf
    cdfm1 = (cdf1[..., 1:] + cdf1[..., :-1]) / 2
    cdfm2 = (cdf2[..., 1:] + cdf2[..., :-1]) / 2
    prob1 = (cdfm2 * freq1).sum(-1)
    prob2 = (cdfm1 * freq2).sum(-1)
    return prob1, prob2


def cov_multinomial(probs):
    """covariance matrix of multinomial distribution

    This is vectorized with choices along last axis.

    cov = diag(probs) - outer(probs, probs)
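
    Examples
    --------
    A small illustrative sketch; the probability vector is made up.

    >>> import numpy as np
    >>> cov = cov_multinomial(np.array([0.2, 0.3, 0.5]))
    >>> cov.shape
    (3, 3)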

    """

    # ensure array input, needed for shape and fancy indexing below
    probs = np.asarray(probs)
    k = probs.shape[-1]
    di = np.diag_indices(k, 2)
    cov = probs[..., None] * probs[..., None, :]
    cov *= -1
    cov[..., di[0], di[1]] += probs
    return cov


def var_multinomial(probs):
    """variance of multinomial distribution

    var = probs * (1 - probs)
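
    Examples
    --------
    A one-line illustration with a made-up probability vector.

    >>> import numpy as np
    >>> v = var_multinomial(np.array([0.2, 0.3, 0.5]))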

    """
    var = probs * (1 - probs)
    return var