# some new features
# (commit-message / diff-header residue from the original paste)
'''runstest

formulas for mean and var of runs taken from SAS manual NPAR tests, also idea
for runstest_1samp and runstest_2samp

Description in NIST handbook and dataplot does not explain their expected
values, or variance

Note:
There are (at least) two definitions of runs used in the literature. The
classical definition, which is also used here, is that runs are sequences of
identical observations separated by observations with different realizations.
The second definition allows for overlapping runs, or runs where counting a
run is also started after a run of a fixed length of the same kind.


TODO
* add one-sided tests where possible or where it makes sense

'''
import numpy as np
|
||||
from scipy import stats
|
||||
from scipy.special import comb
|
||||
import warnings
|
||||
from statsmodels.tools.validation import array_like
|
||||
|
||||
class Runs:
    '''class for runs in a binary sequence

    Parameters
    ----------
    x : array_like, 1d
        data array, assumed to contain 0/1 realizations

    Notes
    -----
    This was written as a more general class for runs. This has some redundant
    calculations when only the runs_test is used.

    TODO: make it lazy

    The runs test could be generalized to more than 1d if there is a use case
    for it.

    This should be extended once I figure out what the distribution of runs
    of any length k is.

    The exact distribution for the runs test is also available but not yet
    verified.
    '''

    def __init__(self, x):
        self.x = np.asarray(x)

        # indices where the value changes; the +/-inf sentinels guarantee
        # that the first and last observations each start/end a run
        self.runstart = runstart = np.nonzero(np.diff(np.r_[[-np.inf], x, [np.inf]]))[0]
        self.runs = runs = np.diff(runstart)          # length of each run
        self.runs_sign = runs_sign = x[runstart[:-1]]  # value (0/1) of each run
        self.runs_pos = runs[runs_sign == 1]
        self.runs_neg = runs[runs_sign == 0]
        self.runs_freqs = np.bincount(runs)
        self.n_runs = len(self.runs)
        self.n_pos = (x == 1).sum()

    def runs_test(self, correction=True):
        '''basic version of runs test

        Parameters
        ----------
        correction : bool
            Following the SAS manual, for samplesize below 50, the test
            statistic is corrected by 0.5. This can be turned off with
            correction=False, and was included to match R, tseries, which
            does not use any correction.

        Returns
        -------
        z : float
            test statistic, asymptotically standard normal under the null
        pval : float
            two-sided p-value based on the normal distribution

        Notes
        -----
        pvalue based on normal distribution, with integer correction
        '''
        self.npo = npo = (self.runs_pos).sum()
        self.nne = nne = (self.runs_neg).sum()

        # moments of the number of runs under the null (SAS NPAR manual)
        n = npo + nne
        npn = npo * nne
        rmean = 2. * npn / n + 1
        rvar = 2. * npn * (2.*npn - n) / n**2. / (n-1.)
        rstd = np.sqrt(rvar)
        rdemean = self.n_runs - rmean
        if n >= 50 or not correction:
            z = rdemean
        else:
            # continuity correction shrinks the statistic towards zero;
            # BUG FIX: the second branch previously tested `rdemean < 0.5`,
            # which wrongly added 0.5 for any rdemean in (-0.5, 0.5)
            if rdemean > 0.5:
                z = rdemean - 0.5
            elif rdemean < -0.5:
                z = rdemean + 0.5
            else:
                z = 0.

        z /= rstd
        pval = 2 * stats.norm.sf(np.abs(z))
        return z, pval
|
||||
|
||||
def runstest_1samp(x, cutoff='mean', correction=True):
    '''use runs test on binary discretized data above/below cutoff

    Parameters
    ----------
    x : array_like
        data, numeric
    cutoff : {'mean', 'median'} or number
        This specifies the cutoff to split the data into large and small
        values.
    correction : bool
        Following the SAS manual, for samplesize below 50, the test
        statistic is corrected by 0.5. This can be turned off with
        correction=False, and was included to match R, tseries, which
        does not use any correction.

    Returns
    -------
    z_stat : float
        test statistic, asymptotically normally distributed
    p-value : float
        p-value, reject the null hypothesis if it is below an type 1 error
        level, alpha.
    '''
    x = array_like(x, "x")

    # resolve the cutoff: named statistic of the data, or a literal number
    if cutoff == 'mean':
        threshold = np.mean(x)
    elif cutoff == 'median':
        threshold = np.median(x)
    else:
        threshold = float(cutoff)

    # dichotomize: 1 for observations at or above the cutoff, 0 below
    indicator = (x >= threshold).astype(int)
    return Runs(indicator).runs_test(correction=correction)
|
||||
|
||||
def runstest_2samp(x, y=None, groups=None, correction=True):
    '''Wald-Wolfowitz runstest for two samples

    This tests whether two samples come from the same distribution.

    Parameters
    ----------
    x : array_like
        data, numeric, contains either one group, if y is also given, or
        both groups, if additionally a group indicator is provided
    y : array_like (optional)
        data, numeric
    groups : array_like
        group labels or indicator; the data for both groups is given in a
        single 1-dimensional array, x
    correction : bool
        Following the SAS manual, for samplesize below 50, the test
        statistic is corrected by 0.5. This can be turned off with
        correction=False, and was included to match R, tseries, which
        does not use any correction.

    Returns
    -------
    z_stat : float
        test statistic, asymptotically normally distributed
    p-value : float
        p-value, reject the null hypothesis if it is below an type 1 error
        level, alpha.

    Notes
    -----
    Wald-Wolfowitz runs test.

    If there are ties, then the test statistic and p-value that is
    reported is based on the higher p-value between sorting all tied
    observations of the same group.

    This test is intended for continuous distributions.
    SAS has treatment for ties, but not clear, and sounds more complicated
    (minimum and maximum possible runs prevent use of argsort)
    (maybe it's not so difficult, idea: add small positive noise to first
    one, run test, then to the other, run test, take max(?) p-value - DONE
    This gives not the minimum and maximum of the number of runs, but should
    be close. Not true, this is close to minimum but far away from maximum.
    maximum number of runs would use alternating groups in the ties.)
    Maybe adding random noise would be the better approach.

    SAS has exact distribution for sample size <=30, does not look standard
    but should be easy to add.

    currently two-sided test only

    This has not been verified against a reference implementation. In a short
    Monte Carlo simulation where both samples are normally distributed, the
    test seems to be correctly sized for larger number of observations (30 or
    larger), but conservative (i.e. reject less often than nominal) with a
    sample size of 10 in each group.

    See Also
    --------
    runs_test_1samp
    Runs
    RunsProb
    '''
    x = np.asarray(x)
    if y is not None:
        y = np.asarray(y)
        groups = np.concatenate((np.zeros(len(x)), np.ones(len(y))))
        # note reassigning x
        x = np.concatenate((x, y))
        gruni = np.arange(2)
    elif groups is not None:
        gruni = np.unique(groups)
        if gruni.size != 2:  # pylint: disable=E1103
            raise ValueError('not exactly two groups specified')
        # require groups to be numeric ???
    else:
        raise ValueError('either y or groups is necessary')

    xargsort = np.argsort(x)
    # check for ties
    x_sorted = x[xargsort]
    x_diff = np.diff(x_sorted)  # used for detecting and handling ties
    if x_diff.min() == 0:
        # bug fix: emit a proper warning instead of the old print statement
        # (the original code carried a '#replace with warning' note)
        warnings.warn('ties detected: the reported result is based on the '
                      'tie-breaking assignment with the larger p-value')
        x_mindiff = x_diff[x_diff > 0].min()
        eps = x_mindiff / 2.
        xx = x.copy()  # do not change original, just in case

        # break ties in favor of group 0 first
        xx[groups == gruni[0]] += eps
        xargsort = np.argsort(xx)
        xindicator = groups[xargsort]
        z0, p0 = Runs(xindicator).runs_test(correction=correction)

        xx[groups == gruni[0]] -= eps  # restore xx = x
        xx[groups == gruni[1]] += eps  # then break ties in favor of group 1
        xargsort = np.argsort(xx)
        xindicator = groups[xargsort]
        z1, p1 = Runs(xindicator).runs_test(correction=correction)

        # keep the more conservative of the two results
        idx = np.argmax([p0, p1])
        return [z0, z1][idx], [p0, p1][idx]

    else:
        xindicator = groups[xargsort]
        return Runs(xindicator).runs_test(correction=correction)
|
||||
|
||||
|
||||
class TotalRunsProb:
    '''class for the probability distribution of total runs

    This is the exact probability distribution for the (Wald-Wolfowitz)
    runs test. The random variable is the total number of runs if the
    sample has (n0, n1) observations of groups 0 and 1.

    Notes
    -----
    Written as a class so I can store temporary calculations, but I do not
    think it matters much.

    Formulas taken from SAS manual for one-sided significance level.

    Could be converted to a full univariate distribution, subclassing
    scipy.stats.distributions.

    *Status*
    Not verified yet except for mean.
    '''

    def __init__(self, n0, n1):
        self.n0 = n0
        self.n1 = n1
        self.n = n = n0 + n1
        # total number of arrangements of the two groups
        self.comball = comb(n, n1)

    def runs_prob_even(self, r):
        # even r: the two groups contribute r/2 runs each, 2 orderings
        half = r // 2
        numerator = comb(self.n0 - 1, half - 1) * comb(self.n1 - 1, half - 1)
        return numerator * 2. / self.comball

    def runs_prob_odd(self, r):
        # odd r: one group has one more run than the other, either way round
        k = (r + 1) // 2
        both = (comb(self.n0 - 1, k - 1) * comb(self.n1 - 1, k - 2)
                + comb(self.n0 - 1, k - 2) * comb(self.n1 - 1, k - 1))
        return both / self.comball

    def pdf(self, r):
        '''probability mass at each total-run count in r (vectorized)'''
        r = np.asarray(r)
        odd_mask = np.mod(r, 2) > 0
        probs = np.zeros(r.shape)
        probs[odd_mask] = self.runs_prob_odd(r[odd_mask])
        probs[~odd_mask] = self.runs_prob_even(r[~odd_mask])
        return probs

    def cdf(self, r):
        '''probability of observing at most r runs (support starts at 2)'''
        support = np.arange(2, r + 1)
        total = self.runs_prob_even(support[::2]).sum()
        total += self.runs_prob_odd(support[1::2]).sum()
        return total
|
||||
|
||||
|
||||
class RunsProb:
|
||||
'''distribution of success runs of length k or more (classical definition)
|
||||
|
||||
The underlying process is assumed to be a sequence of Bernoulli trials
|
||||
of a given length n.
|
||||
|
||||
not sure yet, how to interpret or use the distribution for runs
|
||||
of length k or more.
|
||||
|
||||
Musseli also has longest success run, and waiting time distribution
|
||||
negative binomial of order k and geometric of order k
|
||||
|
||||
need to compare with Godpole
|
||||
|
||||
need a MonteCarlo function to do some quick tests before doing more
|
||||
|
||||
|
||||
'''
|
||||
|
||||
|
||||
|
||||
def pdf(self, x, k, n, p):
|
||||
'''distribution of success runs of length k or more
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : float
|
||||
count of runs of length n
|
||||
k : int
|
||||
length of runs
|
||||
n : int
|
||||
total number of observations or trials
|
||||
p : float
|
||||
probability of success in each Bernoulli trial
|
||||
|
||||
Returns
|
||||
-------
|
||||
pdf : float
|
||||
probability that x runs of length of k are observed
|
||||
|
||||
Notes
|
||||
-----
|
||||
not yet vectorized
|
||||
|
||||
References
|
||||
----------
|
||||
Muselli 1996, theorem 3
|
||||
'''
|
||||
|
||||
q = 1-p
|
||||
m = np.arange(x, (n+1)//(k+1)+1)[:,None]
|
||||
terms = (-1)**(m-x) * comb(m, x) * p**(m*k) * q**(m-1) \
|
||||
* (comb(n - m*k, m - 1) + q * comb(n - m*k, m))
|
||||
return terms.sum(0)
|
||||
|
||||
def pdf_nb(self, x, k, n, p):
|
||||
pass
|
||||
#y = np.arange(m-1, n-mk+1
|
||||
|
||||
# Exploratory doctest-style session kept as a module-level string (it is a
# bare string expression, so it is never executed); it records sanity checks
# on RunsProb.pdf: total probability per k and expected run counts.
'''
>>> [np.sum([RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]) for k in range(16)]
[0.99999332193894064, 0.99999999999999367, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
>>> [(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)]
[6.9998931510341809, 4.1406249999999929, 2.4414062500000075, 1.4343261718749996, 0.83923339843749856, 0.48875808715820324, 0.28312206268310569, 0.1629814505577086, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
>>> np.array([(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)])/11
array([ 0.63635392, 0.37642045, 0.22194602, 0.13039329, 0.07629395,
0.04443255, 0.02573837, 0.0148165 , 0. , 0. ,
0. , 0. , 0. , 0. , 0. , 0. ])
>>> np.diff([(np.arange(0,16) * [RunsProb().pdf(xi, k, 16, 10/16.) for xi in range(0,16)]).sum() for k in range(16)][::-1])
array([ 0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.16298145, 0.12014061, 0.20563602,
0.35047531, 0.59509277, 1.00708008, 1.69921875, 2.85926815])
'''
|
||||
|
||||
|
||||
|
||||
def median_test_ksample(x, groups):
    '''chisquare test for equality of median/location

    This tests whether all groups have the same fraction of observations
    above the median.

    Parameters
    ----------
    x : array_like
        data values stacked for all groups
    groups : array_like
        group labels or indicator

    Returns
    -------
    stat : float
        test statistic
    pvalue : float
        pvalue from the chisquare distribution
    others ????
       currently some test output, table and expected
    '''
    x = np.asarray(x)
    gruni = np.unique(groups)
    xli = [x[groups == group] for group in gruni]
    xmedian = np.median(x)
    counts_larger = np.array([(xg > xmedian).sum() for xg in xli])
    counts = np.array([len(xg) for xg in xli])
    counts_smaller = counts - counts_larger
    nobs = counts.sum()
    n_larger = (x > xmedian).sum()
    n_smaller = nobs - n_larger
    table = np.vstack((counts_smaller, counts_larger))

    # the following should be replaced by chisquare_contingency table
    expected = np.vstack((counts * 1. / nobs * n_smaller,
                          counts * 1. / nobs * n_larger))

    if (expected < 5).any():
        # bug fix: the old print used implicit string concatenation without
        # separating spaces ("expectedobservations"); use a proper warning
        warnings.warn('There are cells with less than 5 expected '
                      'observations. The chisquare distribution might not '
                      'be a good approximation for the true distribution.')

    # check ddof
    return stats.chisquare(table.ravel(), expected.ravel(), ddof=1), table, expected
|
||||
|
||||
|
||||
|
||||
|
||||
def cochrans_q(x):
    '''Cochran's Q test for identical effect of k treatments

    Cochran's Q is a k-sample extension of the McNemar test. If there are only
    two treatments, then Cochran's Q test and McNemar test are equivalent.

    Test that the probability of success is the same for each treatment.
    The alternative is that at least two treatments have a different
    probability of success.

    Parameters
    ----------
    x : array_like, 2d (N,k)
        data with N cases and k variables

    Returns
    -------
    q_stat : float
        test statistic
    pvalue : float
        pvalue from the chisquare distribution

    Notes
    -----
    In Wikipedia terminology, rows are blocks and columns are treatments.
    The number of rows N, should be large for the chisquare distribution to be
    a good approximation.
    The Null hypothesis of the test is that all treatments have the
    same effect.

    References
    ----------
    https://en.wikipedia.org/wiki/Cochran_test
    SAS Manual for NPAR TESTS
    '''
    warnings.warn("Deprecated, use stats.cochrans_q instead", FutureWarning)

    x = np.asarray(x)
    gruni = np.unique(x)
    N, k = x.shape
    # treat the largest unique value as "success"
    success = (x == gruni[-1])
    row_success = success.sum(1, float)   # successes per case (block)
    col_success = success.sum(0, float)   # successes per treatment
    total_row = row_success.sum()
    total_col = col_success.sum()
    assert total_row == total_col  # just a calculation check

    # this is SAS manual
    q_stat = ((k - 1) * (k * np.sum(col_success**2) - total_col**2)
              / (k * total_row - np.sum(row_success**2)))

    # Note: the denominator looks just like k times the variance of the
    # columns

    # Wikipedia uses a different, but equivalent expression
    ## q_stat = (k-1) * (k *  np.sum(row_success**2) - total_row**2) \
    ##          / (k * total_col - np.sum(col_success**2))

    return q_stat, stats.chi2.sf(q_stat, k - 1)
|
||||
|
||||
def mcnemar(x, y=None, exact=True, correction=True):
    '''McNemar test

    Parameters
    ----------
    x, y : array_like
        two paired data samples. If y is None, then x can be a 2 by 2
        contingency table. x and y can have more than one dimension, then
        the results are calculated under the assumption that axis zero
        contains the observation for the samples.
    exact : bool
        If exact is true, then the binomial distribution will be used.
        If exact is false, then the chisquare distribution will be used, which
        is the approximation to the distribution of the test statistic for
        large sample sizes.
    correction : bool
        If true, then a continuity correction is used for the chisquare
        distribution (if exact is false.)

    Returns
    -------
    stat : float or int, array
        The test statistic is the chisquare statistic if exact is false. If the
        exact binomial distribution is used, then this contains the min(n1, n2),
        where n1, n2 are cases that are zero in one sample but one in the other
        sample.
    pvalue : float or array
        p-value of the null hypothesis of equal effects.

    Raises
    ------
    ValueError
        If y is None and x is not a square 2-dimensional (2 by 2) table.

    Notes
    -----
    This is a special case of Cochran's Q test. The results when the chisquare
    distribution is used are identical, except for continuity correction.
    '''
    warnings.warn("Deprecated, use stats.TableSymmetry instead", FutureWarning)

    x = np.asarray(x)
    if y is None:
        # robustness fix: previously a 1-d x raised IndexError on x.shape[1]
        # and a non-square 2-d x fell through to the paired branch where it
        # crashed comparing against None; fail early with a clear message
        if x.ndim != 2 or x.shape[0] != x.shape[1]:
            raise ValueError('if y is not given, x needs to be a square '
                             '2-dimensional contingency table')
        if x.shape[0] != 2:
            raise ValueError('table needs to be 2 by 2')
        n1, n2 = x[1, 0], x[0, 1]
    else:
        # I'm not checking here whether x and y are binary,
        # is not this also paired sign test
        n1 = np.sum(x < y, 0)
        n2 = np.sum(x > y, 0)

    if exact:
        stat = np.minimum(n1, n2)
        # binom is symmetric with p=0.5
        pval = stats.binom.cdf(stat, n1 + n2, 0.5) * 2
        pval = np.minimum(pval, 1)  # limit to 1 if n1==n2
    else:
        corr = int(correction)  # convert bool to 0 or 1
        stat = (np.abs(n1 - n2) - corr)**2 / (1. * (n1 + n2))
        df = 1
        pval = stats.chi2.sf(stat, df)
    return stat, pval
|
||||
|
||||
|
||||
def symmetry_bowker(table):
    '''Test for symmetry of a (k, k) square contingency table

    This is an extension of the McNemar test to test the Null hypothesis
    that the contingency table is symmetric around the main diagonal, that is

    n_{i, j} = n_{j, i}  for all i, j

    Parameters
    ----------
    table : array_like, 2d, (k, k)
        a square contingency table that contains the count for k categories
        in rows and columns.

    Returns
    -------
    statistic : float
        chisquare test statistic
    p-value : float
        p-value of the test statistic based on chisquare distribution
    df : int
        degrees of freedom of the chisquare distribution

    Notes
    -----
    Implementation is based on the SAS documentation, R includes it in
    `mcnemar.test` if the table is not 2 by 2.

    The pvalue is based on the chisquare distribution which requires that the
    sample size is not very small to be a good approximation of the true
    distribution. For 2x2 contingency tables exact distribution can be
    obtained with `mcnemar`

    See Also
    --------
    mcnemar
    '''
    warnings.warn("Deprecated, use stats.TableSymmetry instead", FutureWarning)

    table = np.asarray(table)
    k, k2 = table.shape
    if k != k2:
        raise ValueError('table needs to be square')

    # low_idx = np.tril_indices(k, -1)  # this does not have Fortran order
    upp_idx = np.triu_indices(k, 1)

    lower = table.T[upp_idx]  # lower triangle in column order
    upper = table[upp_idx]    # upper triangle in row order

    # the tiny constant avoids 0/0 when a symmetric pair of cells is empty
    stat = np.sum((lower - upper)**2 / (lower + upper + 1e-20))
    df = k * (k - 1) / 2.
    return stat, stats.chi2.sf(stat, df), df
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # ad-hoc smoke tests / example usage: results are printed, not asserted

    # binary sample with 10 ones and 6 zeros
    x1 = np.array([1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1])

    print(Runs(x1).runs_test())
    print(runstest_1samp(x1, cutoff='mean'))
    print(runstest_2samp(np.arange(16,0,-1), groups=x1))
    print(TotalRunsProb(7,9).cdf(11))
    # the last two use random input, so their output varies between runs
    print(median_test_ksample(np.random.randn(100), np.random.randint(0,2,100)))
    print(cochrans_q(np.random.randint(0,2,(100,8))))
|
||||
# (web-viewer footer residue removed: "Reference in New Issue / Block a user")