reconnect moved files to git repo
@@ -0,0 +1,441 @@
"""
Sandbox Panel Estimators

References
----------
Baltagi, Badi H. `Econometric Analysis of Panel Data.` 4th ed. Wiley, 2008.
"""
from functools import reduce

import numpy as np
from pandas import Panel

from statsmodels.regression.linear_model import GLS

__all__ = ["PanelModel"]


def group(X):
    """
    Returns unique numeric values for groups without sorting.

    Examples
    --------
    >>> X = np.array(['a','a','b','c','b','c'])
    >>> group(X)
    array([0., 0., 1., 2., 1., 2.])
    """
    uniq_dict = {}
    group = np.zeros(len(X))
    for i in range(len(X)):
        if X[i] not in uniq_dict:
            uniq_dict[X[i]] = len(uniq_dict)
        group[i] = uniq_dict[X[i]]
    return group


def repanel_cov(groups, sigmas):
    '''Calculate the error covariance matrix for a random effects model.

    Parameters
    ----------
    groups : ndarray, (nobs, nre) or (nobs,)
        array of group/category observations
    sigmas : ndarray, (nre+1,)
        array of standard deviations of the random effects;
        the last element is the standard deviation of the
        idiosyncratic error

    Returns
    -------
    omega : ndarray, (nobs, nobs)
        covariance matrix of the error
    omegainv : ndarray, (nobs, nobs)
        inverse covariance matrix of the error
    omegainvhalf : ndarray, (nobs, nobs)
        square root of the inverse covariance matrix of the error,
        such that omegainv = omegainvhalf * omegainvhalf.T

    Notes
    -----
    This does not use sparse matrices and constructs nobs by nobs
    matrices. Also, omegainvhalf is not sparse, i.e. elements are non-zero.
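
    Examples
    --------
    A minimal sketch with made-up group labels and standard deviations
    (one random effect plus the idiosyncratic error):

    >>> groups = np.array([0, 0, 1, 1])
    >>> sigmas = np.array([1.0, 0.5])
    >>> omega, omegainv, omegainvhalf = repanel_cov(groups, sigmas)
    >>> omega.shape
    (4, 4)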
    '''
    if groups.ndim == 1:
        groups = groups[:, None]
    nobs, nre = groups.shape
    omega = sigmas[-1]**2 * np.eye(nobs)   # idiosyncratic error variance
    for igr in range(nre):
        group = groups[:, igr:igr+1]
        groupuniq = np.unique(group)
        dummygr = sigmas[igr] * (group == groupuniq).astype(float)
        omega += np.dot(dummygr, dummygr.T)
    ev, evec = np.linalg.eigh(omega)   # eig does not work
    omegainv = np.dot(evec, (1/ev * evec).T)
    omegainvhalf = evec / np.sqrt(ev)
    return omega, omegainv, omegainvhalf


class PanelData(Panel):
    pass


class PanelModel:
    """
    An abstract statistical model class for panel (longitudinal) datasets.

    Parameters
    ----------
    endog : array_like or str
        If a pandas object is used then endog should be the name of the
        endogenous variable as a string.
    # exog
    # panel_arr
    # time_arr
    panel_data : pandas.Panel object

    Notes
    -----
    If a pandas object is supplied it is assumed that the major_axis is time
    and that the minor_axis has the panel variable.
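
    Examples
    --------
    A minimal sketch using plain arrays; ``y``, ``X``, ``firm_id`` and
    ``year`` are placeholder names, and ``equation`` only names the
    variables (it is not parsed as a formula):

    >>> mod = PanelModel(y, X, firm_id, year, xtnames=['firm', 'year'],
    ...                  equation='invest value capital')    # doctest: +SKIP
    >>> res = mod.fit(model='fixed', effects='oneway')       # doctest: +SKIP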
    """
    def __init__(self, endog=None, exog=None, panel=None, time=None,
                 xtnames=None, equation=None, panel_data=None):
        if panel_data is None:
#            if endog == None and exog == None and panel == None and \
#                    time == None:
#                raise ValueError("If panel_data is None then endog, exog, \
#panel_arr, and time_arr cannot be None.")
            self.initialize(endog, exog, panel, time, xtnames, equation)
#        elif aspandas != False:
#            if not isinstance(endog, str):
#                raise ValueError("If a pandas object is supplied then endog \
#must be a string containing the name of the endogenous variable")
#            if not isinstance(aspandas, Panel):
#                raise ValueError("Only pandas.Panel objects are supported")
#            self.initialize_pandas(endog, aspandas, panel_name)

    def initialize(self, endog, exog, panel, time, xtnames, equation):
        """
        Initialize plain array model.

        See PanelModel.
        """
        #TODO: for now, we are going to assume a constant, and then make the
        #first panel the base, add a flag for this....

        # get names
        names = equation.split(" ")
        self.endog_name = names[0]
        exog_names = names[1:]   # this makes the order matter in the array
        self.panel_name = xtnames[0]
        self.time_name = xtnames[1]

        novar = exog.var(0) == 0
        if novar.any():
            cons_index = np.where(novar == 1)[0][0]   # constant col. num
            exog_names.insert(cons_index, 'cons')

        self._cons_index = novar   # used again in _fit_fixed
        self.exog_names = exog_names
        self.endog = np.squeeze(np.asarray(endog))
        exog = np.asarray(exog)
        self.exog = exog
        self.panel = np.asarray(panel)
        self.time = np.asarray(time)

        self.paneluniq = np.unique(panel)
        self.timeuniq = np.unique(time)
        #TODO: this structure can possibly be extracted somewhat to deal with
        #names in general
        #TODO: add some dimension checks, etc.

#    def initialize_pandas(self, endog, aspandas):
#        """
#        Initialize pandas objects.
#
#        See PanelModel.
#        """
#        self.aspandas = aspandas
#        endog = aspandas[endog].values
#        self.endog = np.squeeze(endog)
#        exog_name = aspandas.columns.tolist()
#        exog_name.remove(endog)
#        self.exog = aspandas.filterItems(exog_name).values
#TODO: can the above be simplified to slice notation?
#        if panel_name != None:
#            self.panel_name = panel_name
#        self.exog_name = exog_name
#        self.endog_name = endog
#        self.time_arr = aspandas.major_axis
#TODO: is time always handled correctly in fromRecords?
#        self.panel_arr = aspandas.minor_axis
#TODO: all of this might need to be refactored to explicitly rely (internally)
# on the pandas LongPanel structure for speed and convenience.
# not sure this part is finished...

#TODO: does not conform to new initialize
    def initialize_pandas(self, panel_data, endog_name, exog_name):
        self.panel_data = panel_data
        endog = panel_data[endog_name].values   # does this create a copy?
        self.endog = np.squeeze(endog)
        if exog_name is None:
            exog_name = panel_data.columns.tolist()
            exog_name.remove(endog_name)
        self.exog = panel_data.filterItems(exog_name).values   # copy?
        self._exog_name = exog_name
        self._endog_name = endog_name
        self._timeseries = panel_data.major_axis   # might not need these
        self._panelseries = panel_data.minor_axis

    #TODO: this could be pulled out and just have a by kwd that takes
    # the panel or time array
    #TODO: this also needs to be expanded for 'twoway'
    def _group_mean(self, X, index='oneway', counts=False, dummies=False):
        """
        Get group means of X, either by panel or by time.

        index : 'oneway' groups by the panel variable (the default),
            'time' groups by the time variable.
        """
        if index == 'oneway':
            Y = self.panel
            uniq = self.paneluniq
        elif index == 'time':
            Y = self.time
            uniq = self.timeuniq
        else:
            raise ValueError("index %s not understood" % index)
        #TODO: use sparse matrices
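        # dummy has one row per unique group value and one column per
        # observation: dummy[g, i] is 1. when observation i belongs to
        # group g.  np.dot(dummy, X) therefore sums X within each group,
        # and dividing by dummy.sum(1) (the group sizes) gives group means.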
        dummy = (Y == uniq[:, None]).astype(float)
        if X.ndim > 1:
            mean = np.dot(dummy, X)/dummy.sum(1)[:, None]
        else:
            mean = np.dot(dummy, X)/dummy.sum(1)
        if counts is False and dummies is False:
            return mean
        elif counts is True and dummies is False:
            return mean, dummy.sum(1)
        elif counts is True and dummies is True:
            return mean, dummy.sum(1), dummy
        elif counts is False and dummies is True:
            return mean, dummy

    #TODO: Use kwd arguments or have fit_method methods?
    def fit(self, model=None, method=None, effects='oneway'):
        """
        method : LSDV, demeaned, MLE, GLS, BE, FE, optional
        model :
            between
            fixed
            random
            pooled
            [gmm]
        effects :
            oneway
            time
            twoways
        femethod : demeaned (only one implemented)
            WLS
        remethod :
            swar
            amemiya
            nerlove
            walhus

        Notes
        -----
        This is unfinished. None of the method arguments work yet.
        Only oneway effects should work.
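
        Examples
        --------
        A sketch of the calls that are expected to work at this stage
        (``panel_mod`` is a PanelModel built from plain arrays as in the
        ``__main__`` block below):

        >>> panel_mod.fit(model='pooled')                       # doctest: +SKIP
        >>> panel_mod.fit(model='between', effects='oneway')    # doctest: +SKIP
        >>> panel_mod.fit(model='fixed', effects='twoways')     # doctest: +SKIP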
        """
        if method:   # get rid of this with default
            method = method.lower()
        if model is not None:
            model = model.lower()
        if method and method not in ["lsdv", "demeaned", "mle",
                                     "gls", "be", "fe"]:
            # get rid of if method with default
            raise ValueError("%s not a valid method" % method)
#        if method == "lsdv":
#            self.fit_lsdv(model)
        if model == 'pooled':
            return GLS(self.endog, self.exog).fit()
        if model == 'between':
            return self._fit_btwn(method, effects)
        if model == 'fixed':
            return self._fit_fixed(method, effects)

#    def fit_lsdv(self, effects):
#        """
#        Fit using least squares dummy variables.
#
#        Notes
#        -----
#        Should only be used for small `nobs`.
#        """
#        pdummies = None
#        tdummies = None

    def _fit_btwn(self, method, effects):
        # group mean regression or WLS
        if effects != "twoways":
            endog = self._group_mean(self.endog, index=effects)
            exog = self._group_mean(self.exog, index=effects)
        else:
            raise ValueError("%s effects is not valid for the between "
                             "estimator" % effects)
        befit = GLS(endog, exog).fit()
        return befit

    def _fit_fixed(self, method, effects):
        endog = self.endog
        exog = self.exog
        demeantwice = False
        if effects in ["oneway", "twoways"]:
            if effects == "twoways":
                demeantwice = True
                effects = "oneway"
            endog_mean, counts = self._group_mean(endog, index=effects,
                                                  counts=True)
            exog_mean = self._group_mean(exog, index=effects)
            counts = counts.astype(int)
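            # Subtract each group's mean from its observations.  np.repeat
            # lines the means up with np.unique's sorted group order, so this
            # appears to assume the observations are stacked by panel unit in
            # that same sorted order.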
            endog = endog - np.repeat(endog_mean, counts)
            exog = exog - np.repeat(exog_mean, counts, axis=0)
        if demeantwice or effects == "time":
            endog_mean, dummies = self._group_mean(endog, index="time",
                                                   dummies=True)
            exog_mean = self._group_mean(exog, index="time")
            # This allows unbalanced panels
            endog = endog - np.dot(endog_mean, dummies)
            exog = exog - np.dot(dummies.T, exog_mean)
        fefit = GLS(endog, exog[:, ~self._cons_index]).fit()
        #TODO: might fail with one regressor
        return fefit


class SURPanel(PanelModel):
    pass


class SEMPanel(PanelModel):
    pass


class DynamicPanel(PanelModel):
    pass


if __name__ == "__main__":
    import numpy.lib.recfunctions as nprf
    import pandas
    from pandas import Panel

    import statsmodels.api as sm

    data = sm.datasets.grunfeld.load()
    # Baltagi does not include American Steel
    endog = data.endog[:-20]
    fullexog = data.exog[:-20]
#    fullexog.sort(order=['firm','year'])
    panel_arr = nprf.append_fields(fullexog, 'investment', endog, float,
                                   usemask=False)

    panel_df = pandas.DataFrame(panel_arr)
    panel_panda = panel_df.set_index(['year', 'firm']).to_panel()

    # the most cumbersome way of doing it: preprocess everything by hand
    exog = fullexog[['value', 'capital']].view(float).reshape(-1, 2)
    exog = sm.add_constant(exog, prepend=False)
    panel = group(fullexog['firm'])
    year = fullexog['year']
    panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm', 'year'],
                           equation='invest value capital')
    # note that equation does not actually do anything but name the variables
    panel_ols = panel_mod.fit(model='pooled')

    panel_be = panel_mod.fit(model='between', effects='oneway')
    panel_fe = panel_mod.fit(model='fixed', effects='oneway')

    panel_bet = panel_mod.fit(model='between', effects='time')
    panel_fet = panel_mod.fit(model='fixed', effects='time')

    panel_fe2 = panel_mod.fit(model='fixed', effects='twoways')

    # see also Baltagi (3rd ed.) 3.3 THE RANDOM EFFECTS MODEL p.35
    # for explicit formulas for the spectral decomposition,
    # but this works also for unbalanced panels
    #
    # I also just saw: 9.4.2 The Random Effects Model p.176, which is
    # partially almost the same as I did
    #
    # this needs to use sparse matrices for larger datasets

    groups = np.array([0, 0, 0, 1, 1, 2, 2, 2])
    nobs = groups.shape[0]
    groupuniq = np.unique(groups)
    periods = np.array([0, 1, 2, 1, 2, 0, 1, 2])
    perioduniq = np.unique(periods)

    dummygr = (groups[:, None] == groupuniq).astype(float)
    dummype = (periods[:, None] == perioduniq).astype(float)

    sigma = 1.
    sigmagr = np.sqrt(2.)
    sigmape = np.sqrt(3.)

#    dummyall = np.c_[sigma*np.ones((nobs,1)), sigmagr*dummygr,
#                     sigmape*dummype]
    # exclude constant ?
    dummyall = np.c_[sigmagr*dummygr, sigmape*dummype]
    # omega is the error variance-covariance matrix for the stacked
    # observations
    omega = np.dot(dummyall, dummyall.T) + sigma**2 * np.eye(nobs)
    print(omega)
    print(np.linalg.cholesky(omega))
    ev, evec = np.linalg.eigh(omega)   # eig does not work
    omegainv = np.dot(evec, (1/ev * evec).T)
    omegainv2 = np.linalg.inv(omega)
    omegacomp = np.dot(evec, (ev * evec).T)
    print(np.max(np.abs(omegacomp - omega)))
    # check
#    print(np.dot(omegainv, omega))
    print(np.max(np.abs(np.dot(omegainv, omega) - np.eye(nobs))))
    omegainvhalf = evec/np.sqrt(ev)  # not sure whether ev should not be column
    print(np.max(np.abs(np.dot(omegainvhalf, omegainvhalf.T) - omegainv)))

    # now we can use omegainvhalf in GLS (instead of the cholesky)
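    # A minimal sketch of that idea: regressing omegainvhalf.T * y on
    # omegainvhalf.T * X by plain OLS reproduces GLS with covariance omega,
    # since omegainv = omegainvhalf.dot(omegainvhalf.T).  X_demo and y_demo
    # below are made up purely for illustration (they are not the Grunfeld
    # data used above).
    X_demo = sm.add_constant(np.arange(nobs, dtype=float), prepend=False)
    y_demo = groups + periods + 0.1*np.arange(nobs)
    y_star = np.dot(omegainvhalf.T, y_demo)
    X_star = np.dot(omegainvhalf.T, X_demo)
    ols_transformed = GLS(y_star, X_star).fit()          # plain OLS here
    gls_direct = GLS(y_demo, X_demo, sigma=omega).fit()  # whitening inside GLS
    print(np.max(np.abs(ols_transformed.params - gls_direct.params)))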

    sigmas2 = np.array([sigmagr, sigmape, sigma])
    groups2 = np.column_stack((groups, periods))
    omega_, omegainv_, omegainvhalf_ = repanel_cov(groups2, sigmas2)
    print(np.max(np.abs(omega_ - omega)))
    print(np.max(np.abs(omegainv_ - omegainv)))
    print(np.max(np.abs(omegainvhalf_ - omegainvhalf)))

    # notation Baltagi (3rd) section 9.4.1 (Fixed Effects Model)
    Pgr = reduce(np.dot, [dummygr,
                          np.linalg.inv(np.dot(dummygr.T, dummygr)),
                          dummygr.T])
    Qgr = np.eye(nobs) - Pgr
    # within group effect: np.dot(Qgr, groups)
    # but this is not memory efficient, compared to groupstats
    print(np.max(np.abs(np.dot(Qgr, groups))))
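    # Qgr is the within (group-demeaning) projection: it annihilates anything
    # that is constant within a group, which is why the print above is
    # numerically zero.  As an extra illustrative check (not in the original),
    # Qgr applied to the group dummies should also vanish:
    print(np.max(np.abs(np.dot(Qgr, dummygr))))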