some new features
"""
A collection of smooth penalty functions.

Penalties on vectors take a vector argument and return a scalar
penalty. The gradient of the penalty is a vector with the same shape
as the input value.

Penalties on covariance matrices take two arguments: the matrix and
its inverse, both in unpacked (square) form. The returned penalty is
a scalar, and the gradient is returned as a vector that contains the
gradient with respect to the free elements in the lower triangle of
the covariance matrix.

All penalties are subtracted from the log-likelihood, so greater
penalty values correspond to a greater degree of penalization.

The penalties should be smooth so that they can be subtracted from
log-likelihood functions and optimized using standard methods (i.e. L1
penalties do not belong here).
"""
import numpy as np


class Penalty:
    """
    A class for representing a scalar-valued penalty.

    Parameters
    ----------
    weights : array_like
        A vector of weights that determines the weight of the penalty
        for each parameter.

    Notes
    -----
    The class has a member called `alpha` that scales the weights.
    """

    def __init__(self, weights=1.):
        self.weights = weights
        self.alpha = 1.

    def func(self, params):
        """
        A penalty function on a vector of parameters.

        Parameters
        ----------
        params : array_like
            A vector of parameters.

        Returns
        -------
        A scalar penalty value; greater values imply greater
        penalization.
        """
        raise NotImplementedError

    def deriv(self, params):
        """
        The gradient of a penalty function.

        Parameters
        ----------
        params : array_like
            A vector of parameters.

        Returns
        -------
        The gradient of the penalty with respect to each element in
        `params`.
        """
        raise NotImplementedError

    def _null_weights(self, params):
        """Workaround for the Null model.

        This will not be needed anymore when we can use `self._null_drop_keys`
        as in DiscreteModels.
        TODO: check other models
        """
        if np.size(self.weights) > 1:
            if len(params) == 1:
                raise  # raise to identify models where this would be needed
                return 0.

        return self.weights
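

# A minimal sketch of how a concrete Penalty instance is meant to be used,
# following the module docstring: the penalty value is subtracted from the
# log-likelihood and its gradient from the score.  The names `loglike`,
# `score` and `_example_penalized_objective` are illustrative placeholders,
# not part of this module.
def _example_penalized_objective(loglike, score, pen, params):
    # penalized objective: greater penalty values mean stronger penalization
    value = loglike(params) - pen.func(params)
    # penalized gradient, usable with standard smooth optimizers
    grad = score(params) - pen.deriv(params)
    return value, grad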


class NonePenalty(Penalty):
    """
    A penalty that does not penalize.
    """

    def __init__(self, **kwds):
        super().__init__()
        if kwds:
            import warnings
            warnings.warn('keyword arguments are ignored')

    def func(self, params):
        if params.ndim == 2:
            return np.zeros(params.shape[1:])
        else:
            return 0

    def deriv(self, params):
        return np.zeros(params.shape)

    def deriv2(self, params):
        # returns the diagonal of the hessian
        return np.zeros(params.shape[0])


class L2(Penalty):
    """
    The L2 (ridge) penalty.
    """

    def __init__(self, weights=1.):
        super().__init__(weights)

    def func(self, params):
        return np.sum(self.weights * self.alpha * params**2)

    def deriv(self, params):
        return 2 * self.weights * self.alpha * params

    def deriv2(self, params):
        return 2 * self.weights * self.alpha * np.ones(len(params))
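

# Illustrative check of the ridge penalty above; the weights, alpha and
# parameter values are made up.  With weights (0, 1, 1) the first parameter
# is unpenalized, and func/deriv/deriv2 follow w * alpha * params**2 and its
# derivatives.
def _example_l2():
    pen = L2(weights=np.array([0., 1., 1.]))
    pen.alpha = 0.5
    params = np.array([2., 3., 4.])
    value = pen.func(params)        # 0.5 * (9 + 16) = 12.5
    grad = pen.deriv(params)        # array([0., 3., 4.])
    hess_diag = pen.deriv2(params)  # array([0., 1., 1.])
    return value, grad, hess_diag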


class L2Univariate(Penalty):
    """
    The L2 (ridge) penalty applied to each parameter.
    """

    def __init__(self, weights=None):
        if weights is None:
            self.weights = 1.
        else:
            self.weights = weights

    def func(self, params):
        return self.weights * params**2

    def deriv(self, params):
        return 2 * self.weights * params

    def deriv2(self, params):
        return 2 * self.weights * np.ones(len(params))
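

# Unlike L2, the univariate version does not sum over parameters: func
# returns one penalty value per element, which is what ConstraintsPenalty
# below relies on.  The numbers here are illustrative.
def _example_l2_univariate():
    pen = L2Univariate(weights=2.)
    return pen.func(np.array([1., 3.]))   # array([2., 18.])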


class PseudoHuber(Penalty):
    """
    The pseudo-Huber penalty.
    """

    def __init__(self, dlt, weights=1.):
        super().__init__(weights)
        self.dlt = dlt

    def func(self, params):
        v = np.sqrt(1 + (params / self.dlt)**2)
        v -= 1
        v *= self.dlt**2
        return np.sum(self.weights * self.alpha * v, 0)

    def deriv(self, params):
        v = np.sqrt(1 + (params / self.dlt)**2)
        return params * self.weights * self.alpha / v

    def deriv2(self, params):
        v = np.power(1 + (params / self.dlt)**2, -3/2)
        return self.weights * self.alpha * v
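

# The pseudo-Huber penalty is approximately quadratic near zero
# (~ params**2 / 2) and approximately linear (~ dlt * |params|) far from
# zero; the sketch below illustrates this with dlt = 1 and made-up values.
def _example_pseudo_huber():
    pen = PseudoHuber(dlt=1.)
    near_zero = pen.func(np.array([0.1]))   # ~ 0.005, close to 0.1**2 / 2
    far_out = pen.func(np.array([100.]))    # ~ 99.0, close to dlt * 100
    return near_zero, far_out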


class SCAD(Penalty):
    """
    The SCAD penalty of Fan and Li.

    The SCAD penalty is linear around zero, as an L1 penalty, up to the
    threshold tau. The SCAD penalty is constant for values larger than c*tau.
    The middle segment is quadratic and connects the two segments with a
    continuous derivative.
    The penalty is symmetric around zero.

    Parameterization follows Buu, Johnson, Li and Tan 2011.
    Fan and Li use lambda instead of tau, and a instead of c. Fan and Li
    recommend setting c=3.7.

    f(x) = { tau |x|                                      if 0 <= |x| < tau
           { -(|x|^2 - 2 c tau |x| + tau^2) / (2 (c - 1)) if tau <= |x| < c tau
           { (c + 1) tau^2 / 2                            if c tau <= |x|

    Parameters
    ----------
    tau : float
        slope and threshold for linear segment
    c : float
        factor for second threshold which is c * tau
    weights : None or array
        weights for penalty of each parameter. If an entry is zero, then the
        corresponding parameter will not be penalized.

    References
    ----------
    Buu, Anne, Norman J. Johnson, Runze Li, and Xianming Tan. "New variable
    selection methods for zero-inflated count data with applications to the
    substance abuse field."
    Statistics in Medicine 30, no. 18 (2011): 2326-2340.

    Fan, Jianqing, and Runze Li. "Variable selection via nonconcave penalized
    likelihood and its oracle properties."
    Journal of the American Statistical Association 96, no. 456 (2001):
    1348-1360.
    """

    def __init__(self, tau, c=3.7, weights=1.):
        super().__init__(weights)
        self.tau = tau
        self.c = c

    def func(self, params):

        # 3 segments in absolute value
        tau = self.tau
        p_abs = np.atleast_1d(np.abs(params))
        res = np.empty(p_abs.shape, p_abs.dtype)
        res.fill(np.nan)
        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        res[mask1] = tau * p_abs[mask1]
        mask2 = ~mask1 & ~mask3
        p_abs2 = p_abs[mask2]
        tmp = (p_abs2**2 - 2 * self.c * tau * p_abs2 + tau**2)
        res[mask2] = -tmp / (2 * (self.c - 1))
        res[mask3] = (self.c + 1) * tau**2 / 2.

        return (self.weights * res).sum(0)

    def deriv(self, params):

        # 3 segments in absolute value
        tau = self.tau
        p = np.atleast_1d(params)
        p_abs = np.abs(p)
        p_sign = np.sign(p)
        res = np.empty(p_abs.shape)
        res.fill(np.nan)

        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        mask2 = ~mask1 & ~mask3
        res[mask1] = p_sign[mask1] * tau
        tmp = p_sign[mask2] * (p_abs[mask2] - self.c * tau)
        res[mask2] = -tmp / (self.c - 1)
        res[mask3] = 0

        return self.weights * res

    def deriv2(self, params):
        """Second derivative of the penalty function.

        This returns a scalar or vector with the same shape as params, not a
        square Hessian. If the return value is 1-dimensional, then it is the
        diagonal of the Hessian.
        """

        # 3 segments in absolute value
        tau = self.tau
        p = np.atleast_1d(params)
        p_abs = np.abs(p)
        res = np.zeros(p_abs.shape)

        mask1 = p_abs < tau
        mask3 = p_abs >= self.c * tau
        mask2 = ~mask1 & ~mask3
        res[mask2] = -1 / (self.c - 1)

        return self.weights * res
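

# Quick check of the three SCAD segments with tau = 1 and the default
# c = 3.7 (the evaluation points are made up): linear below tau, quadratic
# in between, constant at (c + 1) * tau**2 / 2 beyond c * tau.
def _example_scad_segments():
    pen = SCAD(tau=1.)
    linear = pen.func(np.array([0.5]))   # tau * 0.5 = 0.5
    middle = pen.func(np.array([2.0]))   # -(4 - 14.8 + 1) / 5.4 ~= 1.815
    flat = pen.func(np.array([5.0]))     # (3.7 + 1) * 1 / 2 = 2.35
    return linear, middle, flat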


class SCADSmoothed(SCAD):
    """
    The SCAD penalty of Fan and Li, quadratically smoothed around zero.

    This follows Fan and Li 2001 equation (3.7).

    Parameterization follows Buu, Johnson, Li and Tan 2011;
    see the docstring of SCAD.

    Parameters
    ----------
    tau : float
        slope and threshold for linear segment
    c : float
        factor for second threshold
    c0 : float
        threshold for quadratically smoothed segment
    restriction : None or array
        linear transformation of the parameters to which the penalty is
        applied

    Notes
    -----
    TODO: Use delegation instead of subclassing, so smoothing can be added to
    all penalty classes.
    """

    def __init__(self, tau, c=3.7, c0=None, weights=1., restriction=None):
        super().__init__(tau, c=c, weights=weights)
        self.tau = tau
        self.c = c
        self.c0 = c0 if c0 is not None else tau * 0.1
        if self.c0 > tau:
            raise ValueError('c0 cannot be larger than tau')

        # get coefficients for quadratic approximation
        c0 = self.c0
        # need to temporarily override weights for call to super
        weights = self.weights
        self.weights = 1.
        deriv_c0 = super().deriv(c0)
        value_c0 = super().func(c0)
        self.weights = weights

        self.aq1 = value_c0 - 0.5 * deriv_c0 * c0
        self.aq2 = 0.5 * deriv_c0 / c0
        self.restriction = restriction

    def func(self, params):
        # workaround for Null model
        weights = self._null_weights(params)
        # TODO: `and np.size(params) > 1` is a hack for llnull, needs a
        # better solution
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for call to super
        # Note: we have the same problem with `restriction`
        self_weights = self.weights
        self.weights = 1.
        value = super().func(params[None, ...])
        self.weights = self_weights

        # shift down so func(0) == 0
        value -= self.aq1
        # change the segment corresponding to quadratic approximation
        p_abs = np.atleast_1d(np.abs(params))
        mask = p_abs < self.c0
        p_abs_masked = p_abs[mask]
        value[mask] = self.aq2 * p_abs_masked**2

        return (weights * value).sum(0)

    def deriv(self, params):
        # workaround for Null model
        weights = self._null_weights(params)
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for call to super
        self_weights = self.weights
        self.weights = 1.
        value = super().deriv(params)
        self.weights = self_weights

        # change the segment corresponding to quadratic approximation
        p = np.atleast_1d(params)
        mask = np.abs(p) < self.c0
        value[mask] = 2 * self.aq2 * p[mask]

        if self.restriction is not None and np.size(params) > 1:
            return weights * value.dot(self.restriction)
        else:
            return weights * value

    def deriv2(self, params):
        # workaround for Null model
        weights = self._null_weights(params)
        if self.restriction is not None and np.size(params) > 1:
            params = self.restriction.dot(params)
        # need to temporarily override weights for call to super
        self_weights = self.weights
        self.weights = 1.
        value = super().deriv2(params)
        self.weights = self_weights

        # change the segment corresponding to quadratic approximation
        p = np.atleast_1d(params)
        mask = np.abs(p) < self.c0
        value[mask] = 2 * self.aq2

        if self.restriction is not None and np.size(params) > 1:
            # note: super returns 1d array for diag, i.e. hessian_diag
            # TODO: weights are missing
            return (self.restriction.T * (weights * value)
                    ).dot(self.restriction)
        else:
            return weights * value
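

# The quadratic smoothing replaces the kink of SCAD at zero with a parabola
# on [-c0, c0]: func(0) is exactly zero, and the value matches the shifted
# SCAD branch at |x| = c0.  The tau and c0 values below are made up.
def _example_scad_smoothed_continuity():
    pen = SCADSmoothed(tau=1., c0=0.1)
    at_zero = pen.func(np.array([0.]))          # 0.0
    inside = pen.func(np.array([0.1 - 1e-8]))   # quadratic segment
    outside = pen.func(np.array([0.1 + 1e-8]))  # shifted SCAD segment
    return at_zero, inside, outside             # inside ~= outside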


class ConstraintsPenalty:
    """
    Penalty applied to a linear transformation of the parameters.

    Parameters
    ----------
    penalty : instance of a penalty class
        Currently this requires an instance of a univariate, vectorized
        penalty class.
    weights : None or ndarray
        weights for adding penalties of transformed params
    restriction : None or ndarray
        If it is not None, then restriction defines a linear transformation
        of the parameters. The penalty function is applied to each transformed
        parameter independently.

    Notes
    -----
    `restriction` allows us to impose penalization on contrasts or stochastic
    constraints of the original parameters.
    Examples of such contrasts are difference penalties or all-pairs
    penalties.
    """

    def __init__(self, penalty, weights=None, restriction=None):

        self.penalty = penalty
        if weights is None:
            self.weights = 1.
        else:
            self.weights = weights

        if restriction is not None:
            restriction = np.asarray(restriction)

        self.restriction = restriction

    def func(self, params):
        """evaluate penalty function at params

        Parameters
        ----------
        params : ndarray
            array of parameters at which the penalty is evaluated

        Returns
        -------
        func : ndarray
            value(s) of penalty function
        """
        # TODO: `and np.size(params) > 1` is a hack for llnull, needs a
        # better solution
        # Is this still needed? it seems to work without
        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.func(params)

        return (self.weights * value.T).T.sum(0)

    def deriv(self, params):
        """first derivative of penalty function w.r.t. params

        Parameters
        ----------
        params : ndarray
            array of parameters at which derivative is evaluated

        Returns
        -------
        deriv : ndarray
            array of first partial derivatives
        """
        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.deriv(params)

        if self.restriction is not None:
            return self.weights * value.T.dot(self.restriction)
        else:
            return (self.weights * value.T)

    grad = deriv

    def deriv2(self, params):
        """second derivative of penalty function w.r.t. params

        Parameters
        ----------
        params : ndarray
            array of parameters at which derivative is evaluated

        Returns
        -------
        deriv2 : ndarray, 2-D
            second derivative matrix
        """

        if self.restriction is not None:
            params = self.restriction.dot(params)

        value = self.penalty.deriv2(params)

        if self.restriction is not None:
            # note: univariate penalty returns 1d array for diag,
            # i.e. hessian_diag
            v = (self.restriction.T * value * self.weights)
            value = v.dot(self.restriction)
        else:
            value = np.diag(self.weights * value)

        return value
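

# Sketch of a difference penalty, one of the contrasts mentioned in the
# Notes above: the restriction matrix maps the parameters to successive
# differences, and each difference is penalized with the univariate ridge
# penalty.  The 3-parameter restriction matrix and values are illustrative.
def _example_difference_penalty():
    restriction = np.array([[-1., 1., 0.],
                            [0., -1., 1.]])
    pen = ConstraintsPenalty(L2Univariate(), restriction=restriction)
    params = np.array([1., 2., 4.])
    # transformed parameters are (1., 2.), so the penalty is 1 + 4 = 5
    return pen.func(params)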


class L2ConstraintsPenalty(ConstraintsPenalty):
    """convenience class of ConstraintsPenalty with L2 penalization
    """

    def __init__(self, weights=None, restriction=None, sigma_prior=None):

        if sigma_prior is not None:
            raise NotImplementedError('sigma_prior is not implemented yet')

        penalty = L2Univariate()

        super().__init__(penalty, weights=weights,
                         restriction=restriction)


class CovariancePenalty:

    def __init__(self, weight):
        # weight should be scalar
        self.weight = weight

    def func(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A scalar penalty value.
        """
        raise NotImplementedError

    def deriv(self, mat, mat_inv):
        """
        Parameters
        ----------
        mat : square matrix
            The matrix to be penalized.
        mat_inv : square matrix
            The inverse of `mat`.

        Returns
        -------
        A vector containing the gradient of the penalty
        with respect to each element in the lower triangle
        of `mat`.
        """
        raise NotImplementedError


class PSD(CovariancePenalty):
    """
    A penalty that converges to +infinity as the argument matrix
    approaches the boundary of the domain of symmetric, positive
    definite matrices.
    """

    def func(self, mat, mat_inv):
        try:
            cy = np.linalg.cholesky(mat)
        except np.linalg.LinAlgError:
            return np.inf
        return -2 * self.weight * np.sum(np.log(np.diag(cy)))

    def deriv(self, mat, mat_inv):
        cy = mat_inv.copy()
        cy = 2 * cy - np.diag(np.diag(cy))
        i, j = np.tril_indices(mat.shape[0])
        return -self.weight * cy[i, j]
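

# The PSD penalty equals -2 * weight * sum(log(diag(chol(mat)))), so it
# grows without bound as `mat` approaches singularity.  For a 2x2
# correlation matrix with off-diagonal r this is -weight * log(1 - r**2);
# the value of r below is made up.
def _example_psd_penalty():
    pen = PSD(weight=1.)
    r = 0.99
    mat = np.array([[1., r], [r, 1.]])
    mat_inv = np.linalg.inv(mat)
    return pen.func(mat, mat_inv)   # ~ -log(1 - 0.99**2) ~= 3.92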