reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,416 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
'''
|
||||
Quantile regression model
|
||||
|
||||
Model parameters are estimated using iterated reweighted least squares. The
|
||||
asymptotic covariance matrix estimated using kernel density estimation.
|
||||
|
||||
Author: Vincent Arel-Bundock
|
||||
License: BSD-3
|
||||
Created: 2013-03-19
|
||||
|
||||
The original IRLS function was written for Matlab by Shapour Mohammadi,
|
||||
University of Tehran, 2008 (shmohammadi@gmail.com), with some lines based on
|
||||
code written by James P. Lesage in Applied Econometrics Using MATLAB(1999).PP.
|
||||
73-4. Translated to python with permission from original author by Christian
|
||||
Prinoth (christian at prinoth dot name).
|
||||
'''
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
import scipy.stats as stats
|
||||
from numpy.linalg import pinv
|
||||
from scipy.stats import norm
|
||||
from statsmodels.tools.decorators import cache_readonly
|
||||
from statsmodels.regression.linear_model import (RegressionModel,
|
||||
RegressionResults,
|
||||
RegressionResultsWrapper)
|
||||
from statsmodels.tools.sm_exceptions import (ConvergenceWarning,
|
||||
IterationLimitWarning)
|
||||
|
||||
class QuantReg(RegressionModel):
|
||||
'''Quantile Regression
|
||||
|
||||
Estimate a quantile regression model using iterative reweighted least
|
||||
squares.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endog : array or dataframe
|
||||
endogenous/response variable
|
||||
exog : array or dataframe
|
||||
exogenous/explanatory variable(s)
|
||||
|
||||
Notes
|
||||
-----
|
||||
The Least Absolute Deviation (LAD) estimator is a special case where
|
||||
quantile is set to 0.5 (q argument of the fit method).
|
||||
|
||||
The asymptotic covariance matrix is estimated following the procedure in
|
||||
Greene (2008, p.407-408), using either the logistic or gaussian kernels
|
||||
(kernel argument of the fit method).
|
||||
|
||||
References
|
||||
----------
|
||||
General:
|
||||
|
||||
* Birkes, D. and Y. Dodge(1993). Alternative Methods of Regression, John Wiley and Sons.
|
||||
* Green,W. H. (2008). Econometric Analysis. Sixth Edition. International Student Edition.
|
||||
* Koenker, R. (2005). Quantile Regression. New York: Cambridge University Press.
|
||||
* LeSage, J. P.(1999). Applied Econometrics Using MATLAB,
|
||||
|
||||
Kernels (used by the fit method):
|
||||
|
||||
* Green (2008) Table 14.2
|
||||
|
||||
Bandwidth selection (used by the fit method):
|
||||
|
||||
* Bofinger, E. (1975). Estimation of a density function using order statistics. Australian Journal of Statistics 17: 1-17.
|
||||
* Chamberlain, G. (1994). Quantile regression, censoring, and the structure of wages. In Advances in Econometrics, Vol. 1: Sixth World Congress, ed. C. A. Sims, 171-209. Cambridge: Cambridge University Press.
|
||||
* Hall, P., and S. Sheather. (1988). On the distribution of the Studentized quantile. Journal of the Royal Statistical Society, Series B 50: 381-391.
|
||||
|
||||
Keywords: Least Absolute Deviation(LAD) Regression, Quantile Regression,
|
||||
Regression, Robust Estimation.
|
||||
'''
|
||||
|
||||
def __init__(self, endog, exog, **kwargs):
|
||||
self._check_kwargs(kwargs)
|
||||
super().__init__(endog, exog, **kwargs)
|
||||
|
||||
def whiten(self, data):
|
||||
"""
|
||||
QuantReg model whitener does nothing: returns data.
|
||||
"""
|
||||
return data
|
||||
|
||||
def fit(self, q=.5, vcov='robust', kernel='epa', bandwidth='hsheather',
|
||||
max_iter=1000, p_tol=1e-6, **kwargs):
|
||||
"""
|
||||
Solve by Iterative Weighted Least Squares
|
||||
|
||||
Parameters
|
||||
----------
|
||||
q : float
|
||||
Quantile must be strictly between 0 and 1
|
||||
vcov : str, method used to calculate the variance-covariance matrix
|
||||
of the parameters. Default is ``robust``:
|
||||
|
||||
- robust : heteroskedasticity robust standard errors (as suggested
|
||||
in Greene 6th edition)
|
||||
- iid : iid errors (as in Stata 12)
|
||||
|
||||
kernel : str, kernel to use in the kernel density estimation for the
|
||||
asymptotic covariance matrix:
|
||||
|
||||
- epa: Epanechnikov
|
||||
- cos: Cosine
|
||||
- gau: Gaussian
|
||||
- par: Parzene
|
||||
|
||||
bandwidth : str, Bandwidth selection method in kernel density
|
||||
estimation for asymptotic covariance estimate (full
|
||||
references in QuantReg docstring):
|
||||
|
||||
- hsheather: Hall-Sheather (1988)
|
||||
- bofinger: Bofinger (1975)
|
||||
- chamberlain: Chamberlain (1994)
|
||||
"""
|
||||
|
||||
if q <= 0 or q >= 1:
|
||||
raise Exception('q must be strictly between 0 and 1')
|
||||
|
||||
kern_names = ['biw', 'cos', 'epa', 'gau', 'par']
|
||||
if kernel not in kern_names:
|
||||
raise Exception("kernel must be one of " + ', '.join(kern_names))
|
||||
else:
|
||||
kernel = kernels[kernel]
|
||||
|
||||
if bandwidth == 'hsheather':
|
||||
bandwidth = hall_sheather
|
||||
elif bandwidth == 'bofinger':
|
||||
bandwidth = bofinger
|
||||
elif bandwidth == 'chamberlain':
|
||||
bandwidth = chamberlain
|
||||
else:
|
||||
raise Exception("bandwidth must be in 'hsheather', 'bofinger', 'chamberlain'")
|
||||
|
||||
endog = self.endog
|
||||
exog = self.exog
|
||||
nobs = self.nobs
|
||||
exog_rank = np.linalg.matrix_rank(self.exog)
|
||||
self.rank = exog_rank
|
||||
self.df_model = float(self.rank - self.k_constant)
|
||||
self.df_resid = self.nobs - self.rank
|
||||
n_iter = 0
|
||||
xstar = exog
|
||||
|
||||
beta = np.ones(exog.shape[1])
|
||||
# TODO: better start, initial beta is used only for convergence check
|
||||
|
||||
# Note the following does not work yet,
|
||||
# the iteration loop always starts with OLS as initial beta
|
||||
# if start_params is not None:
|
||||
# if len(start_params) != rank:
|
||||
# raise ValueError('start_params has wrong length')
|
||||
# beta = start_params
|
||||
# else:
|
||||
# # start with OLS
|
||||
# beta = np.dot(np.linalg.pinv(exog), endog)
|
||||
|
||||
diff = 10
|
||||
cycle = False
|
||||
|
||||
history = dict(params = [], mse=[])
|
||||
while n_iter < max_iter and diff > p_tol and not cycle:
|
||||
n_iter += 1
|
||||
beta0 = beta
|
||||
xtx = np.dot(xstar.T, exog)
|
||||
xty = np.dot(xstar.T, endog)
|
||||
beta = np.dot(pinv(xtx), xty)
|
||||
resid = endog - np.dot(exog, beta)
|
||||
|
||||
mask = np.abs(resid) < .000001
|
||||
resid[mask] = ((resid[mask] >= 0) * 2 - 1) * .000001
|
||||
resid = np.where(resid < 0, q * resid, (1-q) * resid)
|
||||
resid = np.abs(resid)
|
||||
xstar = exog / resid[:, np.newaxis]
|
||||
diff = np.max(np.abs(beta - beta0))
|
||||
history['params'].append(beta)
|
||||
history['mse'].append(np.mean(resid*resid))
|
||||
|
||||
if (n_iter >= 300) and (n_iter % 100 == 0):
|
||||
# check for convergence circle, should not happen
|
||||
for ii in range(2, 10):
|
||||
if np.all(beta == history['params'][-ii]):
|
||||
cycle = True
|
||||
warnings.warn("Convergence cycle detected", ConvergenceWarning)
|
||||
break
|
||||
|
||||
if n_iter == max_iter:
|
||||
warnings.warn("Maximum number of iterations (" + str(max_iter) +
|
||||
") reached.", IterationLimitWarning)
|
||||
|
||||
e = endog - np.dot(exog, beta)
|
||||
# Greene (2008, p.407) writes that Stata 6 uses this bandwidth:
|
||||
# h = 0.9 * np.std(e) / (nobs**0.2)
|
||||
# Instead, we calculate bandwidth as in Stata 12
|
||||
iqre = stats.scoreatpercentile(e, 75) - stats.scoreatpercentile(e, 25)
|
||||
h = bandwidth(nobs, q)
|
||||
h = min(np.std(endog),
|
||||
iqre / 1.34) * (norm.ppf(q + h) - norm.ppf(q - h))
|
||||
|
||||
fhat0 = 1. / (nobs * h) * np.sum(kernel(e / h))
|
||||
|
||||
if vcov == 'robust':
|
||||
d = np.where(e > 0, (q/fhat0)**2, ((1-q)/fhat0)**2)
|
||||
xtxi = pinv(np.dot(exog.T, exog))
|
||||
xtdx = np.dot(exog.T * d[np.newaxis, :], exog)
|
||||
vcov = xtxi @ xtdx @ xtxi
|
||||
elif vcov == 'iid':
|
||||
vcov = (1. / fhat0)**2 * q * (1 - q) * pinv(np.dot(exog.T, exog))
|
||||
else:
|
||||
raise Exception("vcov must be 'robust' or 'iid'")
|
||||
|
||||
lfit = QuantRegResults(self, beta, normalized_cov_params=vcov)
|
||||
|
||||
lfit.q = q
|
||||
lfit.iterations = n_iter
|
||||
lfit.sparsity = 1. / fhat0
|
||||
lfit.bandwidth = h
|
||||
lfit.history = history
|
||||
|
||||
return RegressionResultsWrapper(lfit)
|
||||
|
||||
|
||||
def _parzen(u):
|
||||
z = np.where(np.abs(u) <= .5, 4./3 - 8. * u**2 + 8. * np.abs(u)**3,
|
||||
8. * (1 - np.abs(u))**3 / 3.)
|
||||
z[np.abs(u) > 1] = 0
|
||||
return z
|
||||
|
||||
|
||||
kernels = {}
|
||||
kernels['biw'] = lambda u: 15. / 16 * (1 - u**2)**2 * np.where(np.abs(u) <= 1, 1, 0)
|
||||
kernels['cos'] = lambda u: np.where(np.abs(u) <= .5, 1 + np.cos(2 * np.pi * u), 0)
|
||||
kernels['epa'] = lambda u: 3. / 4 * (1-u**2) * np.where(np.abs(u) <= 1, 1, 0)
|
||||
kernels['gau'] = norm.pdf
|
||||
kernels['par'] = _parzen
|
||||
#kernels['bet'] = lambda u: np.where(np.abs(u) <= 1, .75 * (1 - u) * (1 + u), 0)
|
||||
#kernels['log'] = lambda u: logistic.pdf(u) * (1 - logistic.pdf(u))
|
||||
#kernels['tri'] = lambda u: np.where(np.abs(u) <= 1, 1 - np.abs(u), 0)
|
||||
#kernels['trw'] = lambda u: 35. / 32 * (1 - u**2)**3 * np.where(np.abs(u) <= 1, 1, 0)
|
||||
#kernels['uni'] = lambda u: 1. / 2 * np.where(np.abs(u) <= 1, 1, 0)
|
||||
|
||||
|
||||
def hall_sheather(n, q, alpha=.05):
|
||||
z = norm.ppf(q)
|
||||
num = 1.5 * norm.pdf(z)**2.
|
||||
den = 2. * z**2. + 1.
|
||||
h = n**(-1. / 3) * norm.ppf(1. - alpha / 2.)**(2./3) * (num / den)**(1./3)
|
||||
return h
|
||||
|
||||
|
||||
def bofinger(n, q):
|
||||
num = 9. / 2 * norm.pdf(2 * norm.ppf(q))**4
|
||||
den = (2 * norm.ppf(q)**2 + 1)**2
|
||||
h = n**(-1. / 5) * (num / den)**(1. / 5)
|
||||
return h
|
||||
|
||||
|
||||
def chamberlain(n, q, alpha=.05):
|
||||
return norm.ppf(1 - alpha / 2) * np.sqrt(q*(1 - q) / n)
|
||||
|
||||
|
||||
class QuantRegResults(RegressionResults):
|
||||
'''Results instance for the QuantReg model'''
|
||||
|
||||
@cache_readonly
|
||||
def prsquared(self):
|
||||
q = self.q
|
||||
endog = self.model.endog
|
||||
e = self.resid
|
||||
e = np.where(e < 0, (1 - q) * e, q * e)
|
||||
e = np.abs(e)
|
||||
ered = endog - stats.scoreatpercentile(endog, q * 100)
|
||||
ered = np.where(ered < 0, (1 - q) * ered, q * ered)
|
||||
ered = np.abs(ered)
|
||||
return 1 - np.sum(e) / np.sum(ered)
|
||||
|
||||
#@cache_readonly
|
||||
def scale(self):
|
||||
return 1.
|
||||
|
||||
@cache_readonly
|
||||
def bic(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def aic(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def llf(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def rsquared(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def rsquared_adj(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def mse(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def mse_model(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def mse_total(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def centered_tss(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def uncentered_tss(self):
|
||||
return np.nan
|
||||
|
||||
@cache_readonly
|
||||
def HC0_se(self):
|
||||
raise NotImplementedError
|
||||
|
||||
@cache_readonly
|
||||
def HC1_se(self):
|
||||
raise NotImplementedError
|
||||
|
||||
@cache_readonly
|
||||
def HC2_se(self):
|
||||
raise NotImplementedError
|
||||
|
||||
@cache_readonly
|
||||
def HC3_se(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def summary(self, yname=None, xname=None, title=None, alpha=.05):
|
||||
"""Summarize the Regression Results
|
||||
|
||||
Parameters
|
||||
----------
|
||||
yname : str, optional
|
||||
Default is `y`
|
||||
xname : list[str], optional
|
||||
Names for the exogenous variables. Default is `var_##` for ## in
|
||||
the number of regressors. Must match the number of parameters
|
||||
in the model
|
||||
title : str, optional
|
||||
Title for the top table. If not None, then this replaces the
|
||||
default title
|
||||
alpha : float
|
||||
significance level for the confidence intervals
|
||||
|
||||
Returns
|
||||
-------
|
||||
smry : Summary instance
|
||||
this holds the summary tables and text, which can be printed or
|
||||
converted to various output formats.
|
||||
|
||||
See Also
|
||||
--------
|
||||
statsmodels.iolib.summary.Summary : class to hold summary results
|
||||
"""
|
||||
eigvals = self.eigenvals
|
||||
condno = self.condition_number
|
||||
|
||||
top_left = [('Dep. Variable:', None),
|
||||
('Model:', None),
|
||||
('Method:', ['Least Squares']),
|
||||
('Date:', None),
|
||||
('Time:', None)
|
||||
]
|
||||
|
||||
top_right = [('Pseudo R-squared:', ["%#8.4g" % self.prsquared]),
|
||||
('Bandwidth:', ["%#8.4g" % self.bandwidth]),
|
||||
('Sparsity:', ["%#8.4g" % self.sparsity]),
|
||||
('No. Observations:', None),
|
||||
('Df Residuals:', None),
|
||||
('Df Model:', None)
|
||||
]
|
||||
|
||||
if title is None:
|
||||
title = self.model.__class__.__name__ + ' ' + "Regression Results"
|
||||
|
||||
# create summary table instance
|
||||
from statsmodels.iolib.summary import Summary
|
||||
smry = Summary()
|
||||
smry.add_table_2cols(self, gleft=top_left, gright=top_right,
|
||||
yname=yname, xname=xname, title=title)
|
||||
smry.add_table_params(self, yname=yname, xname=xname, alpha=alpha,
|
||||
use_t=self.use_t)
|
||||
|
||||
# add warnings/notes, added to text format only
|
||||
etext = []
|
||||
if eigvals[-1] < 1e-10:
|
||||
wstr = "The smallest eigenvalue is %6.3g. This might indicate "
|
||||
wstr += "that there are\n"
|
||||
wstr += "strong multicollinearity problems or that the design "
|
||||
wstr += "matrix is singular."
|
||||
wstr = wstr % eigvals[-1]
|
||||
etext.append(wstr)
|
||||
elif condno > 1000: # TODO: what is recommended
|
||||
wstr = "The condition number is large, %6.3g. This might "
|
||||
wstr += "indicate that there are\n"
|
||||
wstr += "strong multicollinearity or other numerical "
|
||||
wstr += "problems."
|
||||
wstr = wstr % condno
|
||||
etext.append(wstr)
|
||||
|
||||
if etext:
|
||||
smry.add_extra_txt(etext)
|
||||
|
||||
return smry
|
||||
Reference in New Issue
Block a user