reconnect moved files to git repo

"""
|
||||
Created on Sun Nov 14 08:21:41 2010
|
||||
|
||||
Author: josef-pktd
|
||||
License: BSD (3-clause)
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import statsmodels.api as sm
|
||||
from statsmodels.sandbox.tools import pca
|
||||
from statsmodels.sandbox.tools.cross_val import LeaveOneOut

# converting example Principal Component Regression to a class,
# from sandbox/example_pca_regression.py


class FactorModelUnivariate:
    '''Univariate endogenous variable regressed on PCA factors of exog.

    Todo:
    * check treatment of the constant, make it optional?
    * add hasconst (0 or 1), needed when selecting nfact + hasconst
    * options are arguments in calc_factors, should be more public instead
    * cross-validation is slow for a large number of observations
    '''
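
    # Typical usage (a sketch; the `__main__` block at the end gives a
    # complete runnable example):
    #     mod = FactorModelUnivariate(y, x)
    #     mod.fit_find_nfact(skip_crossval=False)
    #     print(mod.summary_find_nfact())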

    def __init__(self, endog, exog):
        # TODO: do this in a superclass?
        self.endog = np.asarray(endog)
        self.exog = np.asarray(exog)

    def calc_factors(self, x=None, keepdim=0, addconst=True):
        '''get factor decomposition of exogenous variables

        This uses principal component analysis to obtain the factors. The
        number of factors kept is the maximum that will be considered in
        the regression.
        '''
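        # `pca` returns the reduced-rank reconstruction of x, the factor
        # scores, and the eigenvalues and eigenvectors of the decomposition;
        # keepdim=0 keeps all components.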
        if x is None:
            x = self.exog
        else:
            x = np.asarray(x)
        xred, fact, evals, evecs = pca(x, keepdim=keepdim, normalize=1)
        self.exog_reduced = xred
        if addconst:
            self.factors = sm.add_constant(fact, prepend=True)
            self.hasconst = 1  # needs to be int
        else:
            self.factors = fact
            self.hasconst = 0  # needs to be int

        self.evals = evals
        self.evecs = evecs

    def fit_fixed_nfact(self, nfact):
        # bug fix: calc_factors stores the attribute as `factors`, not
        # `factors_wconst`, so check for the attribute that actually exists
        if not hasattr(self, 'factors'):
            self.calc_factors()
        return sm.OLS(self.endog, self.factors[:, :nfact + 1]).fit()

    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj.
        Additionally, the cross-validation prediction error sum of squares
        is calculated if `skip_crossval` is false. Cross-validation is not
        used by default because it can be time consuming to calculate.

        By default the cross-validation method is leave-one-out on the full
        dataset. A different cross-validation sample can be specified as an
        argument to `cv_iter`.

        Results are attached in `results_find_nfact`.
        '''
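        # Each row of `results_find_nfact` is [k, aic, bic, rsquared_adj,
        # prederr2]; prederr2 is nan when cross-validation is skipped.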
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact + hasconst) < 1:
            raise ValueError('nothing to do, number of factors (incl. '
                             'constant) should be at least 1')

        # temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        for k in range(1, maxfact + hasconst):  # k now includes the constant
            # slicing the precomputed factors is faster than recomputing
            # the pca with keepdim=k and gives the same result
            fact = self.factors[:, :k]
            res = sm.OLS(y0, fact).fit()
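            # Cross-validation: refit the model with each observation held
            # out in turn and accumulate the squared prediction error for
            # the held-out observation.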
            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                    pred = res_l1o.model.predict(res_l1o.params,
                                                 fact[outidx, :])
                    # sum to a scalar so the results list stays numeric
                    prederr2 += ((y0[outidx] - pred) ** 2.).sum()
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
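        # row indices of the best k by each criterion: AIC and BIC are
        # minimized, R2_adj is maximized, and the leave-one-out prediction
        # error is minimized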
        self.best_nfact = np.r_[(np.argmin(results[:, 1:3], 0),
                                 np.argmax(results[:, 3], 0),
                                 np.argmin(results[:, -1], 0))]

    def summary_find_nfact(self):
        '''provides a summary for the selection of the number of factors

        Returns
        -------
        sumstr : str
            summary of the results for selecting the number of factors
        '''
        if not hasattr(self, 'results_find_nfact'):
            self.fit_find_nfact()

        results = self.results_find_nfact
        sumstr = ''
        sumstr += '\n' + 'Best result for k, by AIC, BIC, R2_adj, L1O'
        sumstr += '\n' + ' ' * 19 + '%5d %4d %6d %5d' % tuple(self.best_nfact)

        from statsmodels.iolib.table import SimpleTable

        headers = 'k, AIC, BIC, R2_adj, L1O'.split(', ')
        numformat = ['%6d'] + ['%10.3f'] * 4
        txt_fmt1 = dict(data_fmts=numformat)
        tabl = SimpleTable(results, headers, None, txt_fmt=txt_fmt1)
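
        # the title and DGP description below are hardcoded for the
        # simulated example in the __main__ block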
        sumstr += '\n' + "PCA regression on simulated data,"
        sumstr += '\n' + "DGP: 2 factors and 4 explanatory variables"
        sumstr += '\n' + tabl.__str__()
        sumstr += '\n' + "Notes: k is number of components of PCA,"
        sumstr += '\n' + "       constant is added additionally"
        sumstr += '\n' + "       k=0 means regression on constant only"
        sumstr += '\n' + "       L1O: sum of squared prediction errors for leave-one-out"

        return sumstr


if __name__ == '__main__':

    examples = [1]
    if 1 in examples:
        nobs = 500
        f0 = np.c_[np.random.normal(size=(nobs, 2)), np.ones((nobs, 1))]
        # three alternative loading matrices were tried; the last
        # assignment is the one actually used
        f2xcoef = np.c_[np.repeat(np.eye(2), 2, 0), np.arange(4)[::-1]].T
        f2xcoef = np.array([[1., 1., 0., 0.],
                            [0., 0., 1., 1.],
                            [3., 2., 1., 0.]])
        f2xcoef = np.array([[0.1, 3., 1., 0.],
                            [0., 0., 1.5, 0.1],
                            [3., 2., 1., 0.]])
        x0 = np.dot(f0, f2xcoef)
        x0 += 0.1 * np.random.normal(size=x0.shape)
        ytrue = np.dot(f0, [1., 1., 1.])
        y0 = ytrue + 0.1 * np.random.normal(size=ytrue.shape)

        mod = FactorModelUnivariate(y0, x0)
        print(mod.summary_find_nfact())
        print("with cross validation - slower")
        mod.fit_find_nfact(maxfact=None, skip_crossval=False, cv_iter=None)
        print(mod.summary_find_nfact())
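
        # a minimal sketch of fitting with a fixed number of factors; the
        # choice nfact=2 matches the 2-factor DGP above, and `res_fixed`
        # is not part of the original example
        res_fixed = mod.fit_fixed_nfact(2)
        print(res_fixed.params)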