reconnect moved files to git repo

"""
|
||||
Created on Sun Nov 14 08:21:41 2010
|
||||
|
||||
Author: josef-pktd
|
||||
License: BSD (3-clause)
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import statsmodels.api as sm
|
||||
from statsmodels.sandbox.tools import pca
|
||||
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

# converting example Principal Component Regression to a class,
# from sandbox/example_pca_regression.py


class FactorModelUnivariate:
    '''Univariate endogenous variable regressed on PCA factors of exog.

    Todo:
    * check treatment of the constant, make it optional?
    * add hasconst (0 or 1), needed when selecting nfact + hasconst
    * options are arguments in calc_factors, should be more public instead
    * cross-validation is slow for a large number of observations
    '''
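
    # Typical usage (a sketch; the `__main__` block at the end gives a
    # complete runnable example):
    #     mod = FactorModelUnivariate(y, x)
    #     mod.fit_find_nfact(skip_crossval=False)
    #     print(mod.summary_find_nfact())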

    def __init__(self, endog, exog):
        # TODO: do this in a superclass?
        self.endog = np.asarray(endog)
        self.exog = np.asarray(exog)

    def calc_factors(self, x=None, keepdim=0, addconst=True):
        '''get factor decomposition of exogenous variables

        This uses principal component analysis to obtain the factors. The
        number of factors kept is the maximum that will be considered in
        the regression.
        '''
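        # `pca` returns the reduced-rank reconstruction of x, the factor
        # scores, and the eigenvalues and eigenvectors of the decomposition;
        # keepdim=0 keeps all components.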
        if x is None:
            x = self.exog
        else:
            x = np.asarray(x)
        xred, fact, evals, evecs = pca(x, keepdim=keepdim, normalize=1)
        self.exog_reduced = xred
        if addconst:
            self.factors = sm.add_constant(fact, prepend=True)
            self.hasconst = 1  # needs to be int
        else:
            self.factors = fact
            self.hasconst = 0  # needs to be int

        self.evals = evals
        self.evecs = evecs

    def fit_fixed_nfact(self, nfact):
        # bug fix: calc_factors stores the attribute as `factors`, not
        # `factors_wconst`, so check for the attribute that actually exists
        if not hasattr(self, 'factors'):
            self.calc_factors()
        return sm.OLS(self.endog, self.factors[:, :nfact + 1]).fit()

    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj.
        Additionally, the cross-validation prediction error sum of squares
        is calculated if `skip_crossval` is false. Cross-validation is not
        used by default because it can be time consuming to calculate.

        By default the cross-validation method is leave-one-out on the full
        dataset. A different cross-validation sample can be specified as an
        argument to `cv_iter`.

        Results are attached in `results_find_nfact`.
        '''
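        # Each row of `results_find_nfact` is [k, aic, bic, rsquared_adj,
        # prederr2]; prederr2 is nan when cross-validation is skipped.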
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError('nothing to do, number of factors (incl. '
                             'constant) should be at least 1')

        # temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        for k in range(1, maxfact + hasconst):  # k now includes the constant
            # slicing the precomputed factors is faster than recomputing
            # the pca with keepdim=k and gives the same result
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()
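            # Cross-validation: refit the model with each observation held
            # out in turn and accumulate the squared prediction error for
            # the held-out observation.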
            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    pred = res_l1o.model.predict(res_l1o.params,
                                                 fact[outidx, :])
                    # sum to a scalar so the results list stays numeric
                    prederr2 += ((y0[outidx] - pred) ** 2.).sum()
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
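        # row indices of the best k by each criterion: AIC and BIC are
        # minimized, R2_adj is maximized, and the leave-one-out prediction
        # error is minimized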
        self.best_nfact = np.r_[(np.argmin(results[:, 1:3], 0),
                                 np.argmax(results[:, 3], 0),
                                 np.argmin(results[:, -1], 0))]

    def summary_find_nfact(self):
        '''provides a summary for the selection of the number of factors

        Returns
        -------
        sumstr : str
            summary of the results for selecting the number of factors
        '''
        if not hasattr(self, 'results_find_nfact'):
            self.fit_find_nfact()

        results = self.results_find_nfact
        sumstr = ''
        sumstr += '\n' + 'Best result for k, by AIC, BIC, R2_adj, L1O'
        sumstr += '\n' + ' ' * 19 + '%5d %4d %6d %5d' % tuple(self.best_nfact)

        from statsmodels.iolib.table import SimpleTable

        headers = 'k, AIC, BIC, R2_adj, L1O'.split(', ')
        numformat = ['%6d'] + ['%10.3f'] * 4
        txt_fmt1 = dict(data_fmts=numformat)
        tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)
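
        # the title and DGP description below are hardcoded for the
        # simulated example in the __main__ block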
        sumstr += '\n' + "PCA regression on simulated data,"
        sumstr += '\n' + "DGP: 2 factors and 4 explanatory variables"
        sumstr += '\n' + tabl.__str__()
        sumstr += '\n' + "Notes: k is number of components of PCA,"
        sumstr += '\n' + "       constant is added additionally"
        sumstr += '\n' + "       k=0 means regression on constant only"
        sumstr += '\n' + "       L1O: sum of squared prediction errors for leave-one-out"

        return sumstr


if __name__ == '__main__':

    examples = [1]
    if 1 in examples:
        nobs = 500
        f0 = np.c_[np.random.normal(size=(nobs, 2)), np.ones((nobs, 1))]
        # three alternative loading matrices were tried; the last
        # assignment is the one actually used
        f2xcoef = np.c_[np.repeat(np.eye(2), 2, 0), np.arange(4)[::-1]].T
        f2xcoef = np.array([[1., 1., 0., 0.],
                            [0., 0., 1., 1.],
                            [3., 2., 1., 0.]])
        f2xcoef = np.array([[0.1, 3., 1., 0.],
                            [0., 0., 1.5, 0.1],
                            [3., 2., 1., 0.]])
        x0 = np.dot(f0, f2xcoef)
        x0 += 0.1 * np.random.normal(size=x0.shape)
        ytrue = np.dot(f0, [1., 1., 1.])
        y0 = ytrue + 0.1 * np.random.normal(size=ytrue.shape)

        mod = FactorModelUnivariate(y0, x0)
        print(mod.summary_find_nfact())
        print("with cross validation - slower")
        mod.fit_find_nfact(maxfact=None, skip_crossval=False, cv_iter=None)
        print(mod.summary_find_nfact())
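
        # a minimal sketch of fitting with a fixed number of factors; the
        # choice nfact=2 matches the 2-factor DGP above, and `res_fixed`
        # is not part of the original example
        res_fixed = mod.fit_fixed_nfact(2)
        print(res_fixed.params)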