reconnect moved files to git repo
@@ -0,0 +1 @@
__author__ = 'luca puggini: lucapuggio@gmail.com'
@@ -0,0 +1,66 @@
"""
Cross-validation iterators for GAM

Author: Luca Puggini

"""

from abc import ABCMeta, abstractmethod
from statsmodels.compat.python import with_metaclass
import numpy as np


class BaseCrossValidator(with_metaclass(ABCMeta)):
    """
    Base class for all iterators that split the data into train and test
    sets, for example KFold or LeavePOut.
    """

    def __init__(self):
        pass

    @abstractmethod
    def split(self):
        pass


class KFold(BaseCrossValidator):
    """
    K-Folds cross-validation iterator:
    Provides train/test indices to split the data into train and test sets.

    Parameters
    ----------
    k_folds : int
        number of folds
    shuffle : bool
        If true, then the index is shuffled before splitting into train and
        test indices.

    Notes
    -----
    All folds except the last fold have size trunc(n/k); the last fold has
    the remainder.
    """

    def __init__(self, k_folds, shuffle=False):
        self.nobs = None
        self.k_folds = k_folds
        self.shuffle = shuffle

    def split(self, X, y=None, label=None):
        """yield index split into train and test sets
        """
        # TODO: X and y are redundant, we only need nobs

        nobs = X.shape[0]
        index = np.array(range(nobs))

        if self.shuffle:
            np.random.shuffle(index)

        folds = np.array_split(index, self.k_folds)
        for fold in folds:
            test_index = np.zeros(nobs, dtype=bool)
            test_index[fold] = True
            train_index = np.logical_not(test_index)
            yield train_index, test_index
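
As a quick illustration of the KFold iterator defined above, here is a minimal usage sketch (the data and fold count are invented for the example):

import numpy as np

# hypothetical data: 10 observations, 2 regressors
X = np.random.randn(10, 2)

kfold = KFold(k_folds=5, shuffle=True)
for train_index, test_index in kfold.split(X):
    # boolean masks over the 10 observations: 8 train, 2 test per fold
    X_train, X_test = X[train_index], X[test_index]
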
@@ -0,0 +1,207 @@
"""
Cross-validation classes for GAM

Author: Luca Puggini

"""

from abc import ABCMeta, abstractmethod
from statsmodels.compat.python import with_metaclass
import itertools
import numpy as np
from statsmodels.gam.smooth_basis import (GenericSmoothers,
                                          UnivariateGenericSmoother)


class BaseCV(with_metaclass(ABCMeta)):
    """
    Base class that computes the cross-validation error of a given model.
    All cross-validation classes can be derived from it
    (e.g. GamCV, LassoCV, ...).
    """

    def __init__(self, cv_iterator, endog, exog):
        self.cv_iterator = cv_iterator
        self.exog = exog
        self.endog = endog
        # TODO: cv_iterator.split only needs nobs from endog or exog
        self.train_test_cv_indices = self.cv_iterator.split(self.exog,
                                                            self.endog,
                                                            label=None)

    def fit(self, **kwargs):
        # kwargs are the input values for the fit method of the
        # cross-validated object

        cv_err = []

        for train_index, test_index in self.train_test_cv_indices:
            cv_err.append(self._error(train_index, test_index, **kwargs))

        return np.array(cv_err)

    @abstractmethod
    def _error(self, train_index, test_index, **kwargs):
        # train the model on the train set
        # and return the error on the test set
        pass


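To illustrate the template pattern used by BaseCV (fit loops over the folds and delegates to _error), here is a minimal hypothetical subclass, not part of this commit, that cross-validates an OLS fit with a squared-error cost:

class _OLSCV(BaseCV):
    # hypothetical example subclass: mean squared prediction error of OLS

    def _error(self, train_index, test_index, **kwargs):
        from statsmodels.regression.linear_model import OLS

        res = OLS(self.endog[train_index],
                  self.exog[train_index]).fit(**kwargs)
        pred = res.predict(self.exog[test_index])
        return np.mean((self.endog[test_index] - pred) ** 2)

# usage sketch:
# cv = _OLSCV(KFold(k_folds=5), endog, exog)
# fold_errors = cv.fit()  # one MSE per fold
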
def _split_train_test_smoothers(x, smoother, train_index, test_index):
    """Split smoothers into train and test sets and create GenericSmoothers

    Note: this does not take exog_linear into account
    """
    train_smoothers = []
    test_smoothers = []
    for smoother in smoother.smoothers:
        train_basis = smoother.basis[train_index]
        train_der_basis = smoother.der_basis[train_index]
        train_der2_basis = smoother.der2_basis[train_index]
        train_cov_der2 = smoother.cov_der2
        # TODO: Double check this part. cov_der2 is calculated with all data
        train_x = smoother.x[train_index]

        train_smoothers.append(
            UnivariateGenericSmoother(
                train_x, train_basis, train_der_basis, train_der2_basis,
                train_cov_der2, smoother.variable_name + ' train'))

        test_basis = smoother.basis[test_index]
        test_der_basis = smoother.der_basis[test_index]
        test_der2_basis = smoother.der2_basis[test_index]
        test_cov_der2 = smoother.cov_der2
        # TODO: Double check this part. cov_der2 is calculated with all data
        test_x = smoother.x[test_index]

        test_smoothers.append(
            UnivariateGenericSmoother(
                test_x, test_basis, test_der_basis, test_der2_basis,
                test_cov_der2, smoother.variable_name + ' test'))

    train_multivariate_smoothers = GenericSmoothers(x[train_index],
                                                    train_smoothers)
    test_multivariate_smoothers = GenericSmoothers(x[test_index],
                                                   test_smoothers)

    return train_multivariate_smoothers, test_multivariate_smoothers


class MultivariateGAMCV(BaseCV):
    def __init__(self, smoother, alphas, gam, cost, endog, exog, cv_iterator):
        self.cost = cost
        self.gam = gam
        self.smoother = smoother
        self.exog_linear = exog
        self.alphas = alphas
        self.cv_iterator = cv_iterator
        # TODO: super does not do anything with endog, exog, except get nobs
        # refactor to clean up what and where `exog` and `exog_linear` are
        # attached; exog is not used in super
        super().__init__(cv_iterator, endog, self.smoother.basis)

    def _error(self, train_index, test_index, **kwargs):
        train_smoother, test_smoother = _split_train_test_smoothers(
            self.smoother.x, self.smoother, train_index, test_index)

        endog_train = self.endog[train_index]
        endog_test = self.endog[test_index]
        if self.exog_linear is not None:
            exog_linear_train = self.exog_linear[train_index]
            exog_linear_test = self.exog_linear[test_index]
        else:
            exog_linear_train = None
            exog_linear_test = None

        gam = self.gam(endog_train, exog=exog_linear_train,
                       smoother=train_smoother, alpha=self.alphas)
        gam_res = gam.fit(**kwargs)
        # exog_linear_test and test_smoother.basis will be column_stacked
        # but not transformed in predict
        endog_est = gam_res.predict(exog_linear_test, test_smoother.basis,
                                    transform=False)

        return self.cost(endog_test, endog_est)


class BasePenaltiesPathCV(with_metaclass(ABCMeta)):
    """
    Base class for cross-validation over a grid of parameters.

    The best parameter is saved in alpha_cv.

    This class is currently not used.
    """

    def __init__(self, alphas):
        self.alphas = alphas
        self.alpha_cv = None
        self.cv_error = None
        self.cv_std = None

    def plot_path(self):
        from statsmodels.graphics.utils import _import_mpl
        plt = _import_mpl()
        plt.plot(self.alphas, self.cv_error, c='black')
        plt.plot(self.alphas, self.cv_error + 1.96 * self.cv_std,
                 c='blue')
        plt.plot(self.alphas, self.cv_error - 1.96 * self.cv_std,
                 c='blue')

        plt.plot(self.alphas, self.cv_error, 'o', c='black')
        plt.plot(self.alphas, self.cv_error + 1.96 * self.cv_std, 'o',
                 c='blue')
        plt.plot(self.alphas, self.cv_error - 1.96 * self.cv_std, 'o',
                 c='blue')

        return
        # TODO add return


class MultivariateGAMCVPath:
    """k-fold cross-validation for GAM

    Warning: The API of this class is preliminary and will change.

    Parameters
    ----------
    smoother : additive smoother instance
    alphas : list of iterables
        list of alphas for the smooth terms. The product space will be used
        as the alpha grid for cross-validation.
    gam : model class
        model class for creating a model with k-fold training data
    cost : function
        cost function for the prediction error
    endog : ndarray
        dependent (response) variable of the model
    exog : ndarray or None
        additional linear explanatory variables
    cv_iterator : instance of cross-validation iterator
    """

    def __init__(self, smoother, alphas, gam, cost, endog, exog, cv_iterator):
        self.cost = cost
        self.smoother = smoother
        self.gam = gam
        self.alphas = alphas
        self.alphas_grid = list(itertools.product(*self.alphas))
        self.endog = endog
        self.exog = exog
        self.cv_iterator = cv_iterator
        self.cv_error = np.zeros(len(self.alphas_grid))
        self.cv_std = np.zeros(len(self.alphas_grid))
        self.alpha_cv = None

    def fit(self, **kwargs):
        for i, alphas_i in enumerate(self.alphas_grid):
            gam_cv = MultivariateGAMCV(smoother=self.smoother,
                                       alphas=alphas_i,
                                       gam=self.gam,
                                       cost=self.cost,
                                       endog=self.endog,
                                       exog=self.exog,
                                       cv_iterator=self.cv_iterator)
            cv_err = gam_cv.fit(**kwargs)
            self.cv_error[i] = cv_err.mean()
            self.cv_std[i] = cv_err.std()

        self.alpha_cv = self.alphas_grid[np.argmin(self.cv_error)]
        return self
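
For context, a rough usage sketch of MultivariateGAMCVPath together with the KFold iterator above (the data, spline configuration, cost function, and alpha grid are invented for the example; GLMGam and BSplines come from statsmodels.gam.api):

import numpy as np
from statsmodels.gam.api import GLMGam, BSplines

# hypothetical data: one smooth univariate effect
np.random.seed(0)
x = np.random.uniform(-1, 1, size=(200, 1))
y = np.sin(3 * x[:, 0]) + np.random.normal(scale=0.2, size=200)

smoother = BSplines(x, df=[10], degree=[3])

def mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

gam_cv = MultivariateGAMCVPath(smoother=smoother,
                               alphas=[np.logspace(-3, 3, 7)],
                               gam=GLMGam,
                               cost=mse,
                               endog=y,
                               exog=None,
                               cv_iterator=KFold(k_folds=5, shuffle=True))
best_alpha = gam_cv.fit().alpha_cv  # tuple with one alpha per smooth term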