reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/statsmodels/sandbox/stats/stats_dhuard.py
+++ b/venv/lib/python3.11/site-packages/statsmodels/sandbox/stats/stats_dhuard.py
@ -0,0 +1,328 @@
+'''
+from David Huard's scipy sandbox, also attached to a ticket and
+posted to the matplotlib-users mailing list (links ???)
+
+
+Notes
+=====
+
+out of bounds interpolation raises exception and would not be completely
+defined ::
+
+>>> scoreatpercentile(x, [0,25,50,100])
+Traceback (most recent call last):
+...
+    raise ValueError("A value in x_new is below the interpolation "
+ValueError: A value in x_new is below the interpolation range.
+>>> percentileofscore(x, [-50, 50])
+Traceback (most recent call last):
+...
+    raise ValueError("A value in x_new is below the interpolation "
+ValueError: A value in x_new is below the interpolation range.
+
+
+idea
+====
+
+histogram and empirical interpolated distribution
+-------------------------------------------------
+
+dual constructor
+* empirical cdf : cdf on all observations through linear interpolation
+* binned cdf : based on histogram
+both should work essentially the same, although pdf of empirical has
+many spikes, fluctuates a lot
+- alternative: binning based on interpolated cdf : example in script
+* ppf: quantileatscore based on interpolated cdf
+* rvs : generic from ppf
+* stats, expectation ? how does integration wrt cdf work - theory?
+
+Problems
+* limits, lower and upper bound of support
+  does not work or is undefined with empirical cdf and interpolation
+* extending bounds ?
+  matlab has pareto tails for empirical distribution, breaks linearity
+
+empirical distribution with higher order interpolation
+------------------------------------------------------
+
+* should work easily enough with interpolating splines
+* not piecewise linear
+* can use pareto (or other) tails
+* ppf how do I get the inverse function of a higher order spline?
+  Chuck: resample and fit spline to inverse function
+  this will have an approximation error in the inverse function
+* -> does not work: higher order spline does not preserve monotonicity
+  see mailing list for response to my question
+* pmf from derivative available in spline
+
+-> forget this and use kernel density estimator instead
+
+
+bootstrap/empirical distribution:
+---------------------------------
+
+discrete distribution on real line given observations
+what's defined?
+* cdf : step function
+* pmf : points with equal weight 1/nobs
+* rvs : resampling
+* ppf : quantileatscore on sample?
+* moments : from data ?
+* expectation ? sum_{all observations x} [func(x) * pmf(x)]
+* similar for discrete distribution on real line
+* References : ?
+* what's the point? most of it is trivial, just for the record ?
+
+
+Created on Monday, May 03, 2010, 11:47:03 AM
+Author: josef-pktd, parts based on David Huard
+License: BSD
+
+'''
+import scipy.interpolate as interpolate
+import numpy as np
+
+def scoreatpercentile(data, percentile):
+    """Return the score at the given percentile of the data.
+
+    The empirical cdf of `data` (computed by `empiricalcdf` with its default
+    method) is inverted by linear interpolation between the sorted
+    observations.
+
+    Parameters
+    ----------
+    data : array_like
+        Sample of observations (assumed 1-D, as required by ``interp1d``).
+    percentile : float or array_like
+        Percentile(s) on the 0-100 scale.
+
+    Returns
+    -------
+    ndarray
+        Interpolated score(s) for each requested percentile.
+
+    Raises
+    ------
+    ValueError
+        Raised by the interpolator when a percentile lies outside the range
+        spanned by the empirical cdf values.
+
+    Examples
+    --------
+    >>> data = randn(100)
+    >>> scoreatpercentile(data, 50)
+
+    will return the median of sample `data`.
+    """
+    probabilities = np.array(percentile) / 100.
+    # sorting both arrays pairs the k-th smallest cdf value with the k-th
+    # smallest observation
+    cdf_sorted = np.sort(empiricalcdf(data))
+    data_sorted = np.sort(data)
+    quantile_function = interpolate.interp1d(cdf_sorted, data_sorted)
+    return quantile_function(probabilities)
+
+def percentileofscore(data, score):
+    """Return the percentile-position of score relative to data.
+
+    The empirical cdf of `data` (computed by `empiricalcdf` with its default
+    method) is linearly interpolated at `score` and rescaled to 0-100.
+
+    Parameters
+    ----------
+    data : array_like
+        Sample of observations (assumed 1-D, as required by ``interp1d``).
+    score : float or array_like
+        Score(s) at which the percentile is computed.
+
+    Returns
+    -------
+    ndarray
+        Percentiles (0-100), one per score.
+
+    Raises
+    ------
+    ValueError
+        Raised by the interpolator if a score is outside the range of data.
+
+    Examples
+    --------
+    >>> r = randn(50)
+    >>> x = linspace(-2,2,100)
+    >>> percentileofscore(r,x)
+    """
+    # sorting both arrays pairs the k-th smallest observation with the k-th
+    # smallest cdf value
+    data_sorted = np.sort(data)
+    cdf_sorted = np.sort(empiricalcdf(data))
+    cdf_function = interpolate.interp1d(data_sorted, cdf_sorted)
+    fraction = cdf_function(score)
+    return fraction*100.
+
+def empiricalcdf(data, method='Hazen'):
+    """Return the empirical cdf.
+
+    Each observation is assigned the plotting position
+    ``(i - a) / (N + b)``, where ``i`` is its rank (1 to N) and ``(a, b)``
+    depend on `method`.
+
+    Methods available:
+        Hazen:       (i-0.5)/N
+        Weibull:     i/(N+1)
+        Chegodayev:  (i-.3)/(N+.4)
+        Cunnane:     (i-.4)/(N+.2)
+        Gringorten:  (i-.44)/(N+.12)
+        California:  (i-1)/N
+
+    Where i goes from 1 to N.
+
+    Parameters
+    ----------
+    data : array_like
+        Sample of observations.
+    method : str
+        Name of the plotting position, matched case-insensitively.
+
+    Returns
+    -------
+    ndarray
+        cdf value of each observation, in the same order as `data`.
+
+    Raises
+    ------
+    ValueError
+        If `method` is not one of the names listed above.
+    """
+    # (a, b) constants of the plotting position (i - a) / (N + b)
+    plotting_positions = {
+        'hazen': (0.5, 0),
+        'weibull': (0., 1.),
+        'california': (1., 0),
+        'chegodayev': (.3, .4),
+        'cunnane': (.4, .2),
+        'gringorten': (.44, .12),
+    }
+
+    # argsort of argsort is the zero-based rank of every observation;
+    # adding one makes it run from 1 to N
+    i = np.argsort(np.argsort(data)) + 1.
+    N = len(data)
+    key = method.lower()
+    if key not in plotting_positions:
+        raise ValueError('Unknown method. Choose among Weibull, Hazen, '
+                         'Chegodayev, Cunnane, Gringorten and California.')
+
+    a, b = plotting_positions[key]
+    return (i - a) / (N + b)
+
+
+class HistDist:
+    '''Distribution with piecewise linear cdf, pdf is step function
+
+    can be created from empirical distribution or from a histogram (not done yet)
+
+    work in progress, not finished
+
+    Parameters
+    ----------
+    data : array_like
+        Sample of observations; converted with ``np.atleast_1d``.
+
+    Attributes
+    ----------
+    data : ndarray
+        The observations.
+    binlimit : ndarray
+        ``[min, max]`` of the observations.
+    ranking : ndarray
+        Zero-based rank of each observation within `data`.
+    cdfintp, ppfintp : interp1d
+        Linear interpolators of the empirical cdf and of its inverse.
+    nbin : float
+        Set by `optimize_binning`; range of the data divided by the
+        selected bin width (not rounded).
+    '''
+
+    # (a, b) constants of the plotting position (i - a) / (N + b)
+    _plotting_positions = {
+        'hazen': (0.5, 0),
+        'weibull': (0., 1.),
+        'california': (1., 0),
+        'chegodayev': (.3, .4),
+        'cunnane': (.4, .2),
+        'gringorten': (.44, .12),
+    }
+
+    def __init__(self, data):
+        self.data = np.atleast_1d(data)
+        self.binlimit = np.array([self.data.min(), self.data.max()])
+        # index through self.data (an ndarray) so that list input works too
+        sortind = np.argsort(self.data)
+        self._datasorted = self.data[sortind]
+        self.ranking = np.argsort(sortind)
+
+        cdf = self.empiricalcdf()
+        self._empcdfsorted = np.sort(cdf)
+        self.cdfintp = interpolate.interp1d(self._datasorted, self._empcdfsorted)
+        self.ppfintp = interpolate.interp1d(self._empcdfsorted, self._datasorted)
+
+    def empiricalcdf(self, data=None, method='Hazen'):
+        """Return the empirical cdf.
+
+        Methods available:
+            Hazen:       (i-0.5)/N
+            Weibull:     i/(N+1)
+            Chegodayev:  (i-.3)/(N+.4)
+            Cunnane:     (i-.4)/(N+.2)
+            Gringorten:  (i-.44)/(N+.12)
+            California:  (i-1)/N
+
+        Where i goes from 1 to N.
+
+        Parameters
+        ----------
+        data : array_like, optional
+            Sample to rank. If None, the data of the instance is used.
+        method : str
+            Name of the plotting position, matched case-insensitively.
+
+        Returns
+        -------
+        ndarray
+            cdf value of each observation, in the order of the observations.
+
+        Raises
+        ------
+        ValueError
+            If `method` is not one of the names listed above.
+        """
+
+        if data is None:
+            data = self.data
+            # self.ranking is zero-based, the formulas need i = 1, ..., N
+            i = self.ranking + 1.
+        else:
+            i = np.argsort(np.argsort(data)) + 1.
+
+        N = len(data)
+        key = method.lower()
+        if key not in self._plotting_positions:
+            raise ValueError('Unknown method. Choose among Weibull, Hazen, '
+                             'Chegodayev, Cunnane, Gringorten and California.')
+
+        a, b = self._plotting_positions[key]
+        return (i - a) / (N + b)
+
+
+    def cdf_emp(self, score):
+        '''
+        Linearly interpolated empirical cdf evaluated at `score`.
+
+        Counterpart of `percentileofscore`, but on the 0-1 scale instead of
+        0-100. The interpolator raises ValueError for scores outside the
+        range of the data.
+
+        '''
+        return self.cdfintp(score)
+        #return percentileofscore(self.data, score)
+
+    def ppf_emp(self, quantile):
+        '''
+        Linearly interpolated inverse of the empirical cdf at `quantile`.
+
+        Counterpart of `scoreatpercentile`, with `quantile` on the 0-1 scale
+        instead of 0-100. The interpolator raises ValueError for quantiles
+        outside the range of the empirical cdf values.
+
+        '''
+        return self.ppfintp(quantile)
+        #return scoreatpercentile(self.data, quantile*100)
+
+
+    #from DHuard http://old.nabble.com/matplotlib-f2903.html
+    def optimize_binning(self, method='Freedman'):
+        """Find the optimal number of bins and update the bin count accordingly.
+
+        Available methods : Freedman
+                            Scott
+
+        The method name is matched case-insensitively.
+
+        Returns
+        -------
+        float
+            Range of the data divided by the selected bin width; the value
+            is not rounded and is also stored in ``self.nbin``.
+
+        Raises
+        ------
+        ValueError
+            If `method` is neither 'Freedman' nor 'Scott'.
+        """
+
+        nobs = len(self.data)
+        key = method.lower()
+        if key == 'freedman':
+            # Interquartile range (75% - 25%)
+            IQR = self.ppf_emp(0.75) - self.ppf_emp(0.25)
+            width = 2* IQR* nobs**(-1./3)
+        elif key == 'scott':
+            width = 3.49 * np.std(self.data) * nobs**(-1./3)
+        else:
+            # previously fell through and failed on the undefined `width`
+            raise ValueError('Unknown method. Choose among Freedman and Scott.')
+
+        self.nbin = (np.ptp(self.binlimit)/width)
+        return self.nbin
+
+
+#changes: josef-pktd
+if __name__ == '__main__':
+    # Demo script: plots the interpolated empirical cdf of a random normal
+    # sample and compares spline approximations of its inverse (ppf).
+    import matplotlib.pyplot as plt
+
+    nobs = 100
+    x = np.random.randn(nobs)
+
+    # add 1 to this list to also run the first set of plots
+    examples = [2]
+    if 1 in examples:
+        empiricalcdf(x)
+        print(percentileofscore(x, 0.5))
+        print(scoreatpercentile(x, 50))
+        xsupp = np.linspace(x.min(), x.max())
+        pos = percentileofscore(x, xsupp)
+        plt.plot(xsupp, pos)
+        #perc = np.linspace(2.5, 97.5)
+        #plt.plot(scoreatpercentile(x, perc), perc)
+        plt.plot(scoreatpercentile(x, pos), pos+1)
+
+
+        #emp = interpolate.PiecewisePolynomial(np.sort(empiricalcdf(x)), np.sort(x))
+        emp=interpolate.InterpolatedUnivariateSpline(np.sort(x),np.sort(empiricalcdf(x)),k=1)
+        pdfemp = np.array([emp.derivatives(xi)[1] for xi in xsupp])
+        plt.figure()
+        plt.plot(xsupp,pdfemp)
+        cdf_ongrid = emp(xsupp)
+        plt.figure()
+        plt.plot(xsupp, cdf_ongrid)
+
+        #get pdf from interpolated cdf on a regular grid
+        plt.figure()
+        plt.step(xsupp[:-1],np.diff(cdf_ongrid)/np.diff(xsupp))
+
+        #reduce number of bins/steps
+        xsupp2 = np.linspace(x.min(), x.max(), 25)
+        plt.figure()
+        plt.step(xsupp2[:-1],np.diff(emp(xsupp2))/np.diff(xsupp2))
+
+        #pdf using 25 original observations, every (nobs/25)th
+        xso = np.sort(x)
+        # slice steps must be integers: use floor division
+        xs = xso[::nobs//25]
+        plt.figure()
+        plt.step(xs[:-1],np.diff(emp(xs))/np.diff(xs))
+        #lower end looks strange
+
+
+    histd = HistDist(x)
+    print(histd.optimize_binning())
+    print(histd.cdf_emp(histd.binlimit))
+    print(histd.ppf_emp([0.25, 0.5, 0.75]))
+    print(histd.cdf_emp([-0.5, -0.25, 0, 0.25, 0.5]))
+
+
+    # piecewise linear (k=1) cdf evaluated on a fine regular grid
+    xsupp = np.linspace(x.min(), x.max(), 500)
+    emp=interpolate.InterpolatedUnivariateSpline(np.sort(x),np.sort(empiricalcdf(x)),k=1)
+    #pdfemp = np.array([emp.derivatives(xi)[1] for xi in xsupp])
+    #plt.figure()
+    #plt.plot(xsupp,pdfemp)
+    cdf_ongrid = emp(xsupp)
+    plt.figure()
+    plt.plot(xsupp, cdf_ongrid)
+    # cubic interpolating spline through the inverted grid (cdf -> x)
+    ppfintp = interpolate.InterpolatedUnivariateSpline(cdf_ongrid,xsupp,k=3)
+
+    ppfs = ppfintp(cdf_ongrid)
+    plt.plot(ppfs, cdf_ongrid)
+    #ppfemp=interpolate.InterpolatedUnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3)
+    #Do not use interpolating splines for function approximation
+    #with s=0.03 the spline is monotonic at the evaluated values
+    ppfemp=interpolate.UnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3, s=0.03)
+    ppfe = ppfemp(cdf_ongrid)
+    plt.plot(ppfe, cdf_ongrid)
+
+    # a negative minimum difference means the curve is not monotonic
+    print('negative density')
+    print('(np.diff(ppfs)).min()', (np.diff(ppfs)).min())
+    print('(np.diff(cdf_ongrid)).min()', (np.diff(cdf_ongrid)).min())
+    #plt.show()