some new features
@@ -0,0 +1,242 @@
"""
|
||||
Created on Tue Oct 6 12:42:11 2020
|
||||
|
||||
Author: Josef Perktold
|
||||
License: BSD-3
|
||||
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
from statsmodels.stats.base import HolderTuple
|
||||
from statsmodels.stats.effect_size import _noncentrality_chisquare
|
||||
|
||||
|
||||
def test_chisquare_binning(counts, expected, sort_var=None, bins=10,
|
||||
df=None, ordered=False, sort_method="quicksort",
|
||||
alpha_nc=0.05):
|
||||
"""chisquare gof test with binning of data, Hosmer-Lemeshow type
|
||||
|
||||
``observed`` and ``expected`` are observation specific and should have
|
||||
observations in rows and choices in columns
|
||||
|
||||
Parameters
|
||||
----------
|
||||
counts : array_like
|
||||
Observed frequency, i.e. counts for all choices
|
||||
expected : array_like
|
||||
Expected counts or probability. If expected are counts, then they
|
||||
need to sum to the same total count as the sum of observed.
|
||||
If those sums are unequal and all expected values are smaller or equal
|
||||
to 1, then they are interpreted as probabilities and will be rescaled
|
||||
to match counts.
|
||||
sort_var : array_like
|
||||
1-dimensional array for binning. Groups will be formed according to
|
||||
quantiles of the sorted array ``sort_var``, so that group sizes have
|
||||
equal or approximately equal sizes.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Holdertuple instance
|
||||
This instance contains the results of the chisquare test and some
|
||||
information about the data
|
||||
|
||||
- statistic : chisquare statistic of the goodness-of-fit test
|
||||
- pvalue : pvalue of the chisquare test
|
||||
= df : degrees of freedom of the test
|
||||
|
||||
Notes
|
||||
-----
|
||||
Degrees of freedom for Hosmer-Lemeshow tests are given by
|
||||
|
||||
g groups, c choices
|
||||
|
||||
- binary: `df = (g - 2)` for insample,
|
||||
Stata uses `df = g` for outsample
|
||||
- multinomial: `df = (g−2) *(c−1)`, reduces to (g-2) for binary c=2,
|
||||
(Fagerland, Hosmer, Bofin SIM 2008)
|
||||
- ordinal: `df = (g - 2) * (c - 1) + (c - 2)`, reduces to (g-2) for c=2,
|
||||
(Hosmer, ... ?)
|
||||
|
||||
Note: If there are ties in the ``sort_var`` array, then the split of
|
||||
observations into groups will depend on the sort algorithm.
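
    Examples
    --------
    A minimal sketch with synthetic data; the random data, the seed, and the
    choice of ``probs[:, -1]`` as ``sort_var`` are illustrative only.

    >>> import numpy as np
    >>> rng = np.random.default_rng(1234)
    >>> probs = rng.dirichlet([2, 3, 5], size=200)
    >>> counts = np.vstack([rng.multinomial(1, p) for p in probs])
    >>> score = probs[:, -1]
    >>> res = test_chisquare_binning(counts, probs, sort_var=score, bins=5)
    >>> stat, pval = res.statistic, res.pvalue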
    """

    observed = np.asarray(counts)
    expected = np.asarray(expected)
    n_observed = observed.sum()
    n_expected = expected.sum()
    if not np.allclose(n_observed, n_expected, atol=1e-13):
        if np.max(expected) < 1 + 1e-13:
            # expected seems to be probability, warn and rescale
            warnings.warn("sum of expected and of observed differ, "
                          "rescaling ``expected``")
            expected = expected / n_expected * n_observed
        else:
            # expected doesn't look like fractions or probabilities
            raise ValueError("total counts of expected and observed differ")

    # k = 1 if observed.ndim == 1 else observed.shape[1]
    if sort_var is not None:
        argsort = np.argsort(sort_var, kind=sort_method)
    else:
        argsort = np.arange(observed.shape[0])
    # indices = [arr for arr in np.array_split(argsort, bins, axis=0)]
    indices = np.array_split(argsort, bins, axis=0)
    # in one loop, observed expected in last dimension, too messy,
    # freqs_probs = np.array([np.vstack([observed[idx].mean(0),
    #                                    expected[idx].mean(0)]).T
    #                         for idx in indices])
    freqs = np.array([observed[idx].sum(0) for idx in indices])
    probs = np.array([expected[idx].sum(0) for idx in indices])

    # chisquare test
    resid_pearson = (freqs - probs) / np.sqrt(probs)
    chi2_stat_groups = ((freqs - probs)**2 / probs).sum(1)
    chi2_stat = chi2_stat_groups.sum()
    if df is None:
        g, c = freqs.shape
        if ordered is True:
            df = (g - 2) * (c - 1) + (c - 2)
        else:
            df = (g - 2) * (c - 1)
    pvalue = stats.chi2.sf(chi2_stat, df)
    noncentrality = _noncentrality_chisquare(chi2_stat, df, alpha=alpha_nc)

    res = HolderTuple(statistic=chi2_stat,
                      pvalue=pvalue,
                      df=df,
                      freqs=freqs,
                      probs=probs,
                      noncentrality=noncentrality,
                      resid_pearson=resid_pearson,
                      chi2_stat_groups=chi2_stat_groups,
                      indices=indices
                      )
    return res


def prob_larger_ordinal_choice(prob):
    """probability that observed category is larger than distribution prob

    This is a helper function for ordinal models, where endog is a 1-dim
    categorical variable and predicted probabilities are 2-dimensional with
    observations in rows and choices in columns.

    Parameters
    ----------
    prob : array_like
        Expected probabilities for ordinal choices, e.g. from prediction of
        an ordinal model with observations in rows and choices in columns.

    Returns
    -------
    cdf_mid : ndarray
        mid cdf, i.e. ``P(x < y) + 0.5 P(x = y)``
    r : ndarray
        Probability residual ``P(x < y) - P(x > y)`` for all possible
        choices. Computed as ``r = cdf_mid * 2 - 1``.

    References
    ----------
    .. [2] Li, Chun, and Bryan E. Shepherd. 2012. “A New Residual for Ordinal
       Outcomes.” Biometrika 99 (2): 473–80.

    See Also
    --------
    `statsmodels.stats.nonparametric.rank_compare_2ordinal`
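
    Examples
    --------
    A small illustrative sketch; the probabilities below are made up.

    >>> import numpy as np
    >>> probs = np.array([[0.2, 0.3, 0.5],
    ...                   [0.5, 0.3, 0.2]])
    >>> cdf_mid, resid = prob_larger_ordinal_choice(probs)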

    """
    # similar to `nonparametric rank_compare_2ordinal`

    prob = np.asarray(prob)
    cdf = prob.cumsum(-1)
    if cdf.ndim == 1:
        cdf_ = np.concatenate(([0], cdf))
    elif cdf.ndim == 2:
        cdf_ = np.concatenate((np.zeros((len(cdf), 1)), cdf), axis=1)
    # r_1 = cdf_[..., 1:] + cdf_[..., :-1] - 1
    cdf_mid = (cdf_[..., 1:] + cdf_[..., :-1]) / 2
    r = cdf_mid * 2 - 1
    return cdf_mid, r


def prob_larger_2ordinal(probs1, probs2):
    """Stochastically larger probability for two ordinal distributions

    Computes Pr(x1 > x2) + 0.5 * Pr(x1 = x2) for two ordered multinomial
    (ordinal) distributed random variables x1 and x2.

    This is vectorized with choices along the last axis.
    Broadcasting when ``probs2`` is 1-dimensional also appears to work
    correctly.

    Parameters
    ----------
    probs1 : array_like
        Probabilities of the ordinal choices for distribution 1, with
        choices along the last axis.
    probs2 : array_like
        Probabilities of the ordinal choices for distribution 2, with
        choices along the last axis.

    Returns
    -------
    prob1 : float
        Probability that random draw from distribution 1 is larger than a
        random draw from distribution 2. Pr(x1 > x2) + 0.5 * Pr(x1 = x2)
    prob2 : float
        prob2 = 1 - prob1 = Pr(x1 < x2) + 0.5 * Pr(x1 = x2)
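
    Examples
    --------
    A small illustrative sketch with two made-up ordinal distributions.

    >>> import numpy as np
    >>> p1 = np.array([0.2, 0.3, 0.5])
    >>> p2 = np.array([0.5, 0.3, 0.2])
    >>> p_larger, p_smaller = prob_larger_2ordinal(p1, p2)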

    """
    # count1 = np.asarray(count1)
    # count2 = np.asarray(count2)
    # nobs1, nobs2 = count1.sum(), count2.sum()
    # freq1 = count1 / nobs1
    # freq2 = count2 / nobs2

    # if freq1.ndim == 1:
    #     freq1_ = np.concatenate(([0], freq1))
    # elif freq1.ndim == 2:
    #     freq1_ = np.concatenate((np.zeros((len(freq1), 1)), freq1), axis=1)

    # if freq2.ndim == 1:
    #     freq2_ = np.concatenate(([0], freq2))
    # elif freq2.ndim == 2:
    #     freq2_ = np.concatenate((np.zeros((len(freq2), 1)), freq2), axis=1)

    freq1 = np.asarray(probs1)
    freq2 = np.asarray(probs2)
    # add zero at beginning of choices for cdf computation
    freq1_ = np.concatenate((np.zeros(freq1.shape[:-1] + (1,)), freq1),
                            axis=-1)
    freq2_ = np.concatenate((np.zeros(freq2.shape[:-1] + (1,)), freq2),
                            axis=-1)

    cdf1 = freq1_.cumsum(axis=-1)
    cdf2 = freq2_.cumsum(axis=-1)

    # mid rank cdf
    cdfm1 = (cdf1[..., 1:] + cdf1[..., :-1]) / 2
    cdfm2 = (cdf2[..., 1:] + cdf2[..., :-1]) / 2
    prob1 = (cdfm2 * freq1).sum(-1)
    prob2 = (cdfm1 * freq2).sum(-1)
    return prob1, prob2


def cov_multinomial(probs):
    """covariance matrix of multinomial distribution

    This is vectorized with choices along last axis.

    cov = diag(probs) - outer(probs, probs)
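
    Examples
    --------
    A small illustrative sketch; the probability vector is made up.

    >>> import numpy as np
    >>> cov = cov_multinomial(np.array([0.2, 0.3, 0.5]))
    >>> cov.shape
    (3, 3)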

    """

    # ensure array input, needed for shape and fancy indexing below
    probs = np.asarray(probs)
    k = probs.shape[-1]
    di = np.diag_indices(k, 2)
    cov = probs[..., None] * probs[..., None, :]
    cov *= -1
    cov[..., di[0], di[1]] += probs
    return cov


def var_multinomial(probs):
    """variance of multinomial distribution

    var = probs * (1 - probs)
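
    Examples
    --------
    A one-line illustration with a made-up probability vector.

    >>> import numpy as np
    >>> v = var_multinomial(np.array([0.2, 0.3, 0.5]))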

    """
    var = probs * (1 - probs)
    return var