some new features
This commit is contained in:
@ -0,0 +1,388 @@
|
||||
import numpy as np
|
||||
from statsmodels.base.model import Results
|
||||
import statsmodels.base.wrapper as wrap
|
||||
from statsmodels.tools.decorators import cache_readonly
|
||||
|
||||
"""
|
||||
Elastic net regularization.
|
||||
|
||||
Routines for fitting regression models using elastic net
|
||||
regularization. The elastic net minimizes the objective function
|
||||
|
||||
-llf / nobs + alpha * ((1 - L1_wt) * sum(params**2) / 2 +
                       L1_wt * sum(abs(params)))
|
||||
|
||||
The algorithm implemented here closely follows the implementation in
|
||||
the R glmnet package, documented here:
|
||||
|
||||
http://cran.r-project.org/web/packages/glmnet/index.html
|
||||
|
||||
and here:
|
||||
|
||||
http://www.jstatsoft.org/v33/i01/paper
|
||||
|
||||
This routine should work for any regression model that implements
|
||||
loglike, score, and hess.
|
||||
"""
|
||||
|
||||
|
||||
def _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds):
    """
    Build the smooth part of the elastic net objective for coordinate ``k``.

    Three closures are returned: the negative log-likelihood scaled by
    ``nobs`` plus the L2 (ridge) penalty, its first derivative, and its
    second derivative.  The non-smooth L1 term is deliberately left out.

    Parameters
    ----------
    k : int
        Index into ``alpha`` selecting the penalty weight to use.
    L1_wt : scalar
        Fraction of the penalty assigned to the L1 term; the ridge
        part is weighted by ``1 - L1_wt``.
    alpha : indexable
        Penalty weights; ``alpha[k]`` is read each time a closure runs.
    loglike_kwds, score_kwds, hess_kwds : dict
        Extra keyword arguments forwarded to ``model.loglike``,
        ``model.score`` and ``model.hessian`` respectively.

    Returns
    -------
    tuple of callables
        ``(nploglike, npscore, nphess)``, each called as ``f(x, model)``
        with ``x`` the parameter value and ``model`` a model exposing
        ``nobs``, ``loglike``, ``score`` and ``hessian``.

    Notes
    -----
    Only element ``[0]`` of the score and element ``[0, 0]`` of the
    Hessian are used, so the closures are presumably meant for
    one-parameter models.
    """

    def _ridge_weight():
        # Resolved at call time so the closures always see the current
        # contents of ``alpha``.
        return alpha[k] * (1 - L1_wt)

    def nploglike(params, model):
        n_obs = model.nobs
        ridge_term = _ridge_weight() * np.sum(params**2) / 2
        log_lik = model.loglike(np.r_[params], **loglike_kwds)
        return ridge_term - log_lik / n_obs

    def npscore(params, model):
        n_obs = model.nobs
        ridge_slope = _ridge_weight() * params
        score0 = model.score(np.r_[params], **score_kwds)[0]
        return ridge_slope - score0 / n_obs

    def nphess(params, model):
        n_obs = model.nobs
        ridge_curv = _ridge_weight()
        hess00 = model.hessian(np.r_[params], **hess_kwds)[0, 0]
        return ridge_curv - hess00 / n_obs

    return nploglike, npscore, nphess
|
||||
|
||||
|
||||
def fit_elasticnet(model, method="coord_descent", maxiter=100,
                   alpha=0., L1_wt=1., start_params=None, cnvrg_tol=1e-7,
                   zero_tol=1e-8, refit=False, check_step=True,
                   loglike_kwds=None, score_kwds=None, hess_kwds=None):
    """
    Return an elastic net regularized fit to a regression model.

    Parameters
    ----------
    model : model object
        A statsmodels object implementing ``loglike``, ``score``, and
        ``hessian``.
    method : {'coord_descent'}
        Only the coordinate descent algorithm is implemented.  The
        value is not validated; it is only recorded on the results
        object returned when `refit` is True.
    maxiter : int
        The maximum number of iteration cycles (an iteration cycle
        involves running coordinate descent on all variables).
    alpha : scalar or array_like
        The penalty weight.  If a scalar, the same penalty weight
        applies to all variables in the model.  If a vector, it
        must have the same length as `params`, and contains a
        penalty weight for each coefficient.
    L1_wt : scalar
        The fraction of the penalty given to the L1 penalty term.
        Must be between 0 and 1 (inclusive).  If 0, the fit is
        a ridge fit, if 1 it is a lasso fit.
    start_params : array_like
        Starting values for `params`.  The values are copied into a
        new floating point array, so the input is never modified.
    cnvrg_tol : scalar
        If `params` changes by less than this amount (in sup-norm)
        in one iteration cycle, the algorithm terminates with
        convergence.
    zero_tol : scalar
        Any estimated coefficient smaller than this value is
        replaced with zero.
    refit : bool
        If True, the model is refit using only the variables that have
        non-zero coefficients in the regularized fit.  The refitted
        model is not regularized.
    check_step : bool
        If True, confirm that the first step is an improvement and search
        further if it is not.
    loglike_kwds : dict-like or None
        Keyword arguments for the log-likelihood function.
    score_kwds : dict-like or None
        Keyword arguments for the score function.
    hess_kwds : dict-like or None
        Keyword arguments for the Hessian function.

    Returns
    -------
    Results
        A results object.  If `refit` is False this is a
        ``RegularizedResultsWrapper``; otherwise it is an instance of
        the results class produced by ``model.fit()``, carrying the
        extra attributes ``regularized``, ``converged``, ``method``
        and ``fit_history``.

    Notes
    -----
    The ``elastic net`` penalty is a combination of L1 and L2
    penalties.

    The function that is minimized is:

    -loglike/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1)

    where |*|_1 and |*|_2 are the L1 and L2 norms.

    The computational approach used here is to obtain a quadratic
    approximation to the smooth part of the target function:

    -loglike/n + alpha*(1-L1_wt)*|params|_2^2/2

    then repeatedly optimize the L1 penalized version of this function
    along coordinate axes.
    """

    k_exog = model.exog.shape[1]

    loglike_kwds = {} if loglike_kwds is None else loglike_kwds
    score_kwds = {} if score_kwds is None else score_kwds
    hess_kwds = {} if hess_kwds is None else hess_kwds

    if np.isscalar(alpha):
        alpha = alpha * np.ones(k_exog)

    # Define starting params.  ``np.array`` always copies, accepts any
    # array_like (lists included) and forces a float dtype so that the
    # coordinate updates below are not truncated for integer input.
    if start_params is None:
        params = np.zeros(k_exog)
    else:
        params = np.array(start_params, dtype=np.float64)

    # Tolerance handed to the one-dimensional line search.
    btol = 1e-4
    # Active-set bookkeeping: True marks a coefficient frozen at zero.
    params_zero = np.zeros(len(params), dtype=bool)

    init_args = model._get_init_kwds()
    # we do not need a copy of init_args b/c get_init_kwds provides new dict
    init_args['hasconst'] = False
    model_offset = init_args.pop('offset', None)
    if 'exposure' in init_args and init_args['exposure'] is not None:
        log_exposure = np.log(init_args.pop('exposure'))
        if model_offset is None:
            model_offset = log_exposure
        else:
            # Build a new array instead of ``+=`` so that the offset
            # array owned by ``model`` is never modified in place.
            model_offset = model_offset + log_exposure

    # One (func, grad, hess) triple per coordinate, each bound to that
    # coordinate's penalty weight.
    fgh_list = [
        _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds)
        for k in range(k_exog)]

    converged = False
    # Number of completed sweeps; stays 0 when ``maxiter`` is 0.
    n_iter = 0

    for itr in range(maxiter):
        n_iter = itr + 1

        # Sweep through the parameters
        params_save = params.copy()
        for k in range(k_exog):

            # Under the active set method, if a parameter becomes
            # zero we do not try to change it again.
            # TODO : give the user the option to switch this off
            if params_zero[k]:
                continue

            # Set the offset to account for the variables that are
            # being held fixed in the current coordinate
            # optimization.
            params0 = params.copy()
            params0[k] = 0
            offset = np.dot(model.exog, params0)
            if model_offset is not None:
                # ``offset`` is a fresh array here, so adding in place
                # does not touch ``model_offset``.
                offset += model_offset

            # Create a one-variable model for optimization.
            model_1var = model.__class__(
                model.endog, model.exog[:, k], offset=offset, **init_args)

            # Do the one-dimensional optimization.
            func, grad, hess = fgh_list[k]
            params[k] = _opt_1d(
                func, grad, hess, model_1var, params[k], alpha[k]*L1_wt,
                tol=btol, check_step=check_step)

            # Update the active set (never during the first sweep)
            if itr > 0 and np.abs(params[k]) < zero_tol:
                params_zero[k] = True
                params[k] = 0.

        # Check for convergence
        pchange = np.max(np.abs(params - params_save))
        if pchange < cnvrg_tol:
            converged = True
            break

    # Set approximate zero coefficients to be exactly zero
    params[np.abs(params) < zero_tol] = 0

    if not refit:
        results = RegularizedResults(model, params)
        results.converged = converged
        return RegularizedResultsWrapper(results)

    # Fit the reduced model to get standard errors and other
    # post-estimation results.
    ii = np.flatnonzero(params)
    cov = np.zeros((k_exog, k_exog))
    # NOTE(review): unlike the one-variable models above, these keywords
    # are read directly from the model attributes rather than through
    # ``_get_init_kwds()``; confirm this is intended for models that
    # transform their init keywords.
    refit_args = {key: getattr(model, key, None) for key in model._init_keys}
    if len(ii) > 0:
        model1 = model.__class__(
            model.endog, model.exog[:, ii], **refit_args)
        rslt = model1.fit()
        params[ii] = rslt.params
        cov[np.ix_(ii, ii)] = rslt.normalized_cov_params
    else:
        # Hack: no variables were selected but we need to run fit in
        # order to get the correct results class.  So just fit a model
        # with one variable.
        model1 = model.__class__(model.endog, model.exog[:, 0], **refit_args)
        rslt = model1.fit(maxiter=0)

    # fit may return a results or a results wrapper
    if issubclass(rslt.__class__, wrap.ResultsWrapper):
        klass = rslt._results.__class__
    else:
        klass = rslt.__class__

    # Not all models have a scale
    if hasattr(rslt, 'scale'):
        scale = rslt.scale
    else:
        scale = 1.

    # The degrees of freedom should reflect the number of parameters
    # in the refit model, not including the zeros that are displayed
    # to indicate which variables were dropped.  See issue #1723 for
    # discussion about setting df parameters in model and results
    # classes.
    saved_df = model.df_model, model.df_resid
    model.df_model = len(ii)
    model.df_resid = model.nobs - model.df_model
    try:
        # Assuming a standard signature for creating results classes.
        refit_results = klass(model, params, cov, scale=scale)
    finally:
        # Restore df in model class even if the results constructor
        # raises, see issue #1723 for discussion.
        model.df_model, model.df_resid = saved_df

    refit_results.regularized = True
    refit_results.converged = converged
    refit_results.method = method
    refit_results.fit_history = {'iteration': n_iter}

    return refit_results
|
||||
|
||||
|
||||
def _opt_1d(func, grad, hess, model, start, L1_wt, tol,
            check_step=True):
    """
    One-dimensional helper for elastic net.

    Parameters
    ----------
    func : function
        A smooth function of a single variable to be optimized
        with L1 penalty.
    grad : function
        The gradient of `func`.
    hess : function
        The Hessian of `func`.
    model : statsmodels model
        The model being fit.
    start : real
        A starting value for the function argument
    L1_wt : non-negative real
        The weight for the L1 penalty function.
    tol : non-negative real
        A convergence threshold, passed to the line search.
    check_step : bool
        If True, check that the first step is an improvement and
        fall back to a line search if it is not.  If False, return
        after the first step regardless.

    Notes
    -----
    ``func``, ``grad``, and ``hess`` have argument signature (x,
    model), where ``x`` is a point in the parameter space and
    ``model`` is the model being fit.

    If the log-likelihood for the model is exactly quadratic, the
    global minimum is returned in one step.  Otherwise a line search
    (``scipy.optimize.brent``) is run on the L1-penalized objective
    ``func(x, model) + L1_wt*abs(x)``.

    Returns
    -------
    The argmin of the objective function.  ``nan`` is returned when
    the quadratic approximation produces a ``nan`` coefficient.
    """

    # Overview:
    # We want to minimize L(x) + L1_wt*abs(x), where L() is a smooth
    # loss function that includes the log-likelihood and L2 penalty.
    # This is a 1-dimensional optimization.  If L(x) is exactly
    # quadratic we can solve for the argmin exactly.  Otherwise we
    # approximate L(x) with a quadratic function Q(x) and try to use
    # the minimizer of Q(x) + L1_wt*abs(x).  But if this yields an
    # uphill step for the actual target function L(x) + L1_wt*abs(x),
    # then we fall back to an expensive line search.  The line search
    # is never needed for OLS.

    x = start
    f = func(x, model)
    b = grad(x, model)
    c = hess(x, model)
    # Linear coefficient of the quadratic approximation Q around x.
    d = b - c*x

    # The optimum is achieved by hard thresholding to zero
    if L1_wt > np.abs(d):
        return 0.

    # x + h is the minimizer of the Q(x) + L1_wt*abs(x)
    if d >= 0:
        h = (L1_wt - b) / c
    elif d < 0:
        h = -(L1_wt + b) / c
    else:
        # Neither comparison holds, i.e. d is nan.
        return np.nan

    # If the new point is not uphill for the target function, take it
    # and return.  This check is a bit expensive and un-necessary for
    # OLS
    if not check_step:
        return x + h
    f1 = func(x + h, model) + L1_wt*np.abs(x + h)
    if f1 <= f + L1_wt*np.abs(x) + 1e-10:
        return x + h

    # Fallback for models where the loss is not quadratic.  The search
    # must run on the same penalized objective that the step above was
    # judged against; searching ``func`` alone would ignore the L1 term
    # and return the unpenalized minimizer.
    from scipy.optimize import brent

    def penalized(z):
        return func(z, model) + L1_wt*np.abs(z)

    x_opt = brent(penalized, brack=(x-1, x+1), tol=tol)
    return x_opt
|
||||
|
||||
|
||||
class RegularizedResults(Results):
    """
    Container for the estimates produced by a regularized fit.

    Parameters
    ----------
    model : Model
        Model instance whose parameters were estimated.
    params : ndarray
        Parameter estimates obtained under regularization.
    """

    def __init__(self, model, params):
        # Nothing extra to store; construction is left to the base class.
        super(RegularizedResults, self).__init__(model, params)

    @cache_readonly
    def fittedvalues(self):
        """
        Model predictions evaluated at the estimated parameters.
        """
        fitted = self.model.predict(self.params)
        return fitted
|
||||
|
||||
|
||||
class RegularizedResultsWrapper(wrap.ResultsWrapper):
    # Attribute name -> axis keyword; presumably tells the wrapper which
    # labels to attach when one of these attributes is accessed.
    _attrs = dict(params='columns', resid='rows', fittedvalues='rows')
    # Same dict object as ``_attrs``.
    _wrap_attrs = _attrs


# Associate the wrapper class with the results class it wraps.
wrap.populate_wrapper(RegularizedResultsWrapper,  # noqa:E305
                      RegularizedResults)
|
||||
Reference in New Issue
Block a user