reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/statsmodels/regression/quantile_regression.py
+++ b/venv/lib/python3.11/site-packages/statsmodels/regression/quantile_regression.py
@ -0,0 +1,416 @@
+#!/usr/bin/env python
+
+'''
+Quantile regression model
+
+Model parameters are estimated using iterated reweighted least squares. The
+asymptotic covariance matrix estimated using kernel density estimation.
+
+Author: Vincent Arel-Bundock
+License: BSD-3
+Created: 2013-03-19
+
+The original IRLS function was written for Matlab by Shapour Mohammadi,
+University of Tehran, 2008 (shmohammadi@gmail.com), with some lines based on
+code written by James P. Lesage in Applied Econometrics Using MATLAB(1999).PP.
+73-4.  Translated to python with permission from original author by Christian
+Prinoth (christian at prinoth dot name).
+'''
+
+import numpy as np
+import warnings
+import scipy.stats as stats
+from numpy.linalg import pinv
+from scipy.stats import norm
+from statsmodels.tools.decorators import cache_readonly
+from statsmodels.regression.linear_model import (RegressionModel,
+                                                 RegressionResults,
+                                                 RegressionResultsWrapper)
+from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
+                                             IterationLimitWarning)
+
+class QuantReg(RegressionModel):
+    '''Quantile Regression
+
+    Estimate a quantile regression model using iterative reweighted least
+    squares.
+
+    Parameters
+    ----------
+    endog : array or dataframe
+        endogenous/response variable
+    exog : array or dataframe
+        exogenous/explanatory variable(s)
+
+    Notes
+    -----
+    The Least Absolute Deviation (LAD) estimator is a special case where
+    quantile is set to 0.5 (q argument of the fit method).
+
+    The asymptotic covariance matrix is estimated following the procedure in
+    Greene (2008, p.407-408), using either the logistic or gaussian kernels
+    (kernel argument of the fit method).
+
+    References
+    ----------
+    General:
+
+    * Birkes, D. and Y. Dodge(1993). Alternative Methods of Regression, John Wiley and Sons.
+    * Green,W. H. (2008). Econometric Analysis. Sixth Edition. International Student Edition.
+    * Koenker, R. (2005). Quantile Regression. New York: Cambridge University Press.
+    * LeSage, J. P.(1999). Applied Econometrics Using MATLAB,
+
+    Kernels (used by the fit method):
+
+    * Green (2008) Table 14.2
+
+    Bandwidth selection (used by the fit method):
+
+    * Bofinger, E. (1975). Estimation of a density function using order statistics. Australian Journal of Statistics 17: 1-17.
+    * Chamberlain, G. (1994). Quantile regression, censoring, and the structure of wages. In Advances in Econometrics, Vol. 1: Sixth World Congress, ed. C. A. Sims, 171-209. Cambridge: Cambridge University Press.
+    * Hall, P., and S. Sheather. (1988). On the distribution of the Studentized quantile. Journal of the Royal Statistical Society, Series B 50: 381-391.
+
+    Keywords: Least Absolute Deviation(LAD) Regression, Quantile Regression,
+    Regression, Robust Estimation.
+    '''
+
+    def __init__(self, endog, exog, **kwargs):
+        self._check_kwargs(kwargs)
+        super().__init__(endog, exog, **kwargs)
+
+    def whiten(self, data):
+        """
+        QuantReg model whitener does nothing: returns data.
+        """
+        return data
+
+    def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
+            max_iter=1000, p_tol=1e-6, **kwargs):
+        """
+        Solve by Iterative Weighted Least Squares
+
+        Parameters
+        ----------
+        q : float
+            Quantile must be strictly between 0 and 1
+        vcov : str, method used to calculate the variance-covariance matrix
+            of the parameters. Default is ``robust``:
+
+            - robust : heteroskedasticity robust standard errors (as suggested
+              in Greene 6th edition)
+            - iid : iid errors (as in Stata 12)
+
+        kernel : str, kernel to use in the kernel density estimation for the
+            asymptotic covariance matrix:
+
+            - epa: Epanechnikov
+            - cos: Cosine
+            - gau: Gaussian
+            - par: Parzene
+
+        bandwidth : str, Bandwidth selection method in kernel density
+            estimation for asymptotic covariance estimate (full
+            references in QuantReg docstring):
+
+            - hsheather: Hall-Sheather (1988)
+            - bofinger: Bofinger (1975)
+            - chamberlain: Chamberlain (1994)
+        """
+
+        if q <= 0 or q >= 1:
+            raise Exception('q must be strictly between 0 and 1')
+
+        kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
+        if kernel not in kern_names:
+            raise Exception("kernel must be one of " + ', '.join(kern_names))
+        else:
+            kernel = kernels[kernel]
+
+        if bandwidth == 'hsheather':
+            bandwidth = hall_sheather
+        elif bandwidth == 'bofinger':
+            bandwidth = bofinger
+        elif bandwidth == 'chamberlain':
+            bandwidth = chamberlain
+        else:
+            raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")
+
+        endog = self.endog
+        exog = self.exog
+        nobs = self.nobs
+        exog_rank = np.linalg.matrix_rank(self.exog)
+        self.rank = exog_rank
+        self.df_model = float(self.rank - self.k_constant)
+        self.df_resid = self.nobs - self.rank
+        n_iter = 0
+        xstar = exog
+
+        beta = np.ones(exog.shape[1])
+        # TODO: better start, initial beta is used only for convergence check
+
+        # Note the following does not work yet,
+        # the iteration loop always starts with OLS as initial beta
+        # if start_params is not None:
+        #    if len(start_params) != rank:
+        #       raise ValueError('start_params has wrong length')
+        #       beta = start_params
+        #    else:
+        #       # start with OLS
+        #       beta = np.dot(np.linalg.pinv(exog), endog)
+
+        diff = 10
+        cycle = False
+
+        history = dict(params = [], mse=[])
+        while n_iter < max_iter and diff > p_tol and not cycle:
+            n_iter += 1
+            beta0 = beta
+            xtx = np.dot(xstar.T, exog)
+            xty = np.dot(xstar.T, endog)
+            beta = np.dot(pinv(xtx), xty)
+            resid = endog - np.dot(exog, beta)
+
+            mask = np.abs(resid) < .000001
+            resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
+            resid = np.where(resid < 0, q * resid, (1-q) * resid)
+            resid = np.abs(resid)
+            xstar = exog / resid[:, np.newaxis]
+            diff = np.max(np.abs(beta - beta0))
+            history['params'].append(beta)
+            history['mse'].append(np.mean(resid*resid))
+
+            if (n_iter >= 300) and (n_iter % 100 == 0):
+                # check for convergence circle, should not happen
+                for ii in range(2, 10):
+                    if np.all(beta == history['params'][-ii]):
+                        cycle = True
+                        warnings.warn("Convergence cycle detected", ConvergenceWarning)
+                        break
+
+        if n_iter == max_iter:
+            warnings.warn("Maximum number of iterations (" + str(max_iter) +
+                          ") reached.", IterationLimitWarning)
+
+        e = endog - np.dot(exog, beta)
+        # Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
+        # h = 0.9 * np.std(e) / (nobs**0.2)
+        # Instead, we calculate bandwidth as in Stata 12
+        iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
+        h = bandwidth(nobs, q)
+        h = min(np.std(endog),
+                iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))
+
+        fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))
+
+        if vcov == 'robust':
+            d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
+            xtxi = pinv(np.dot(exog.T, exog))
+            xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
+            vcov = xtxi @ xtdx @ xtxi
+        elif vcov == 'iid':
+            vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
+        else:
+            raise Exception("vcov must be 'robust' or 'iid'")
+
+        lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)
+
+        lfit.q = q
+        lfit.iterations = n_iter
+        lfit.sparsity = 1. / fhat0
+        lfit.bandwidth = h
+        lfit.history = history
+
+        return RegressionResultsWrapper(lfit)
+
+
+def _parzen(u):
+    z = np.where(np.abs(u) <= .5, 4./3 - 8. * u**2 + 8. * np.abs(u)**3,
+                 8. * (1 - np.abs(u))**3 / 3.)
+    z[np.abs(u) > 1] = 0
+    return z
+
+
+kernels = {}
+kernels['biw'] = lambda u: 15. / 16 * (1 - u**2)**2 * np.where(np.abs(u) <= 1, 1, 0)
+kernels['cos'] = lambda u: np.where(np.abs(u) <= .5, 1 + np.cos(2 * np.pi * u), 0)
+kernels['epa'] = lambda u: 3. / 4 * (1-u**2) * np.where(np.abs(u) <= 1, 1, 0)
+kernels['gau'] = norm.pdf
+kernels['par'] = _parzen
+#kernels['bet'] = lambda u: np.where(np.abs(u) <= 1, .75 * (1 - u) * (1 + u), 0)
+#kernels['log'] = lambda u: logistic.pdf(u) * (1 - logistic.pdf(u))
+#kernels['tri'] = lambda u: np.where(np.abs(u) <= 1, 1 - np.abs(u), 0)
+#kernels['trw'] = lambda u: 35. / 32 * (1 - u**2)**3 * np.where(np.abs(u) <= 1, 1, 0)
+#kernels['uni'] = lambda u: 1. / 2 * np.where(np.abs(u) <= 1, 1, 0)
+
+
+def hall_sheather(n, q, alpha=.05):
+    z = norm.ppf(q)
+    num = 1.5 * norm.pdf(z)**2.
+    den = 2. * z**2. + 1.
+    h = n**(-1. / 3) * norm.ppf(1. - alpha / 2.)**(2./3) * (num / den)**(1./3)
+    return h
+
+
+def bofinger(n, q):
+    num = 9. / 2 * norm.pdf(2 * norm.ppf(q))**4
+    den = (2 * norm.ppf(q)**2 + 1)**2
+    h = n**(-1. / 5) * (num / den)**(1. / 5)
+    return h
+
+
+def chamberlain(n, q, alpha=.05):
+    return norm.ppf(1 - alpha / 2) * np.sqrt(q*(1 - q) / n)
+
+
+class QuantRegResults(RegressionResults):
+    '''Results instance for the QuantReg model'''
+
+    @cache_readonly
+    def prsquared(self):
+        q = self.q
+        endog = self.model.endog
+        e = self.resid
+        e = np.where(e < 0, (1 - q) * e, q * e)
+        e = np.abs(e)
+        ered = endog - stats.scoreatpercentile(endog, q * 100)
+        ered = np.where(ered < 0, (1 - q) * ered, q * ered)
+        ered = np.abs(ered)
+        return 1 - np.sum(e) / np.sum(ered)
+
+    #@cache_readonly
+    def scale(self):
+        return 1.
+
+    @cache_readonly
+    def bic(self):
+        return np.nan
+
+    @cache_readonly
+    def aic(self):
+        return np.nan
+
+    @cache_readonly
+    def llf(self):
+        return np.nan
+
+    @cache_readonly
+    def rsquared(self):
+        return np.nan
+
+    @cache_readonly
+    def rsquared_adj(self):
+        return np.nan
+
+    @cache_readonly
+    def mse(self):
+        return np.nan
+
+    @cache_readonly
+    def mse_model(self):
+        return np.nan
+
+    @cache_readonly
+    def mse_total(self):
+        return np.nan
+
+    @cache_readonly
+    def centered_tss(self):
+        return np.nan
+
+    @cache_readonly
+    def uncentered_tss(self):
+        return np.nan
+
+    @cache_readonly
+    def HC0_se(self):
+        raise NotImplementedError
+
+    @cache_readonly
+    def HC1_se(self):
+        raise NotImplementedError
+
+    @cache_readonly
+    def HC2_se(self):
+        raise NotImplementedError
+
+    @cache_readonly
+    def HC3_se(self):
+        raise NotImplementedError
+
+    def summary(self, yname=None, xname=None, title=None, alpha=.05):
+        """Summarize the Regression Results
+
+        Parameters
+        ----------
+        yname : str, optional
+            Default is `y`
+        xname : list[str], optional
+            Names for the exogenous variables. Default is `var_##` for ## in
+            the number of regressors. Must match the number of parameters
+            in the model
+        title : str, optional
+            Title for the top table. If not None, then this replaces the
+            default title
+        alpha : float
+            significance level for the confidence intervals
+
+        Returns
+        -------
+        smry : Summary instance
+            this holds the summary tables and text, which can be printed or
+            converted to various output formats.
+
+        See Also
+        --------
+        statsmodels.iolib.summary.Summary : class to hold summary results
+        """
+        eigvals = self.eigenvals
+        condno = self.condition_number
+
+        top_left = [('Dep. Variable:', None),
+                    ('Model:', None),
+                    ('Method:', ['Least Squares']),
+                    ('Date:', None),
+                    ('Time:', None)
+                    ]
+
+        top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
+                     ('Bandwidth:', ["%#8.4g" % self.bandwidth]),
+                     ('Sparsity:', ["%#8.4g" % self.sparsity]),
+                     ('No. Observations:', None),
+                     ('Df Residuals:', None),
+                     ('Df Model:', None)
+                     ]
+
+        if title is None:
+            title = self.model.__class__.__name__ + ' ' + "Regression Results"
+
+        # create summary table instance
+        from statsmodels.iolib.summary import Summary
+        smry = Summary()
+        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
+                             yname=yname, xname=xname, title=title)
+        smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
+                              use_t=self.use_t)
+
+        # add warnings/notes, added to text format only
+        etext = []
+        if eigvals[-1] < 1e-10:
+            wstr = "The smallest eigenvalue is %6.3g. This might indicate "
+            wstr += "that there are\n"
+            wstr += "strong multicollinearity problems or that the design "
+            wstr += "matrix is singular."
+            wstr = wstr % eigvals[-1]
+            etext.append(wstr)
+        elif condno > 1000:  # TODO: what is recommended
+            wstr = "The condition number is large, %6.3g. This might "
+            wstr += "indicate that there are\n"
+            wstr += "strong multicollinearity or other numerical "
+            wstr += "problems."
+            wstr = wstr % condno
+            etext.append(wstr)
+
+        if etext:
+            smry.add_extra_txt(etext)
+
+        return smry