reconnect moved files to git repo
@@ -0,0 +1,576 @@
"""
This module contains the kernels for kernel smoothing.

Hopefully in the future they may be reused/extended for other kernel based
methods.

References
----------

Pointwise Kernel Confidence Bounds
(smoothconf)
http://fedc.wiwi.hu-berlin.de/xplore/ebooks/html/anr/anrhtmlframe62.html
"""

# pylint: disable-msg=C0103
# pylint: disable-msg=W0142
# pylint: disable-msg=E1101
# pylint: disable-msg=E0611
from statsmodels.compat.python import lzip, lfilter
import numpy as np
import scipy.integrate
from scipy.special import factorial
from numpy import exp, multiply, square, divide, subtract, inf


class NdKernel:
    """Generic N-dimensional kernel

    Parameters
    ----------
    n : int
        The number of series for kernel estimates
    kernels : list
        kernels

    Can be constructed from either
    a) a list of n kernels which will be treated as
    independent marginals on a Gaussian copula (specified by H)
    or b) a single univariate kernel which will be applied radially to the
    Mahalanobis distance defined by H.

    In the case of the Gaussian these are both equivalent, and the second
    construction is preferred.
    """
    def __init__(self, n, kernels=None, H=None):
        if kernels is None:
            kernels = Gaussian()

        self._kernels = kernels
        self.weights = None

        if H is None:
            H = np.matrix(np.identity(n))

        self._H = H
        self._Hrootinv = np.linalg.cholesky(H.I)

    def getH(self):
        """Getter for kernel bandwidth, H"""
        return self._H

    def setH(self, value):
        """Setter for kernel bandwidth, H"""
        self._H = value

    H = property(getH, setH, doc="Kernel bandwidth matrix")

    def density(self, xs, x):
        """Returns the kernel density estimate for point x based on x-values xs"""
        n = len(xs)
        #xs = self.in_domain( xs, xs, x )[0]

        if len(xs) > 0:  ## Need to do product of marginal distributions
            #w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
            #vectorized does not work:
            if self.weights is not None:
                w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights) / sum(self.weights)
            else:
                w = np.mean(self((xs-x) * self._Hrootinv))  #transposed
                #w = np.mean([self(xd) for xd in ((xs-x) * self._Hrootinv)] ) #transposed
            return w
        else:
            return np.nan

    def _kernweight(self, x):
        """returns the kernel weight for the independent multivariate kernel"""
        if isinstance(self._kernels, CustomKernel):
            ## Radial case
            #d = x.T * x
            #x is matrix, 2d, element wise sqrt looks wrong
            #d = np.sqrt( x.T * x )
            x = np.asarray(x)
            #d = np.sqrt( (x * x).sum(-1) )
            d = (x * x).sum(-1)
            return self._kernels(np.asarray(d))

    def __call__(self, x):
        """
        This simply returns the value of the kernel function at x

        Does the same as weight if the function is normalised
        """
        return self._kernweight(x)
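
# Illustrative sketch (an addition, not part of the original module): using
# NdKernel as a radial kernel on bivariate data. The data below are
# assumptions for demonstration; NdKernel.density expects matrix-like rows.
#
#   data = np.matrix(np.random.randn(100, 2))    # 100 bivariate observations
#   kern = NdKernel(2)                           # radial Gaussian, H = identity
#   kern.density(data, np.matrix([[0.0, 0.0]]))  # density estimate at the origin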


class CustomKernel:
    """
    Generic 1D Kernel object.
    Can be constructed by selecting a standard named Kernel,
    or providing a lambda expression and domain.
    The domain allows some algorithms to run faster for finite domain kernels.
    """
    # MC: Not sure how this will look in the end - or even still exist.
    # Main purpose of this is to allow custom kernels and to allow speed up
    # from finite support.

    def __init__(self, shape, h=1.0, domain=None, norm=None):
        """
        shape should be a function taking and returning numeric type.

        For sanity it should always return positive or zero but this is not
        enforced in case you want to do weird things. Bear in mind that the
        statistical tests etc. may not be valid for non-positive kernels.

        The bandwidth of the kernel is supplied as h.

        You may specify a domain as a list of 2 values [min, max], in which
        case the kernel will be treated as zero outside these values. This
        will speed up calculation.

        You may also specify the normalisation constant for the supplied
        kernel. If you do, this number will be stored and used as the
        normalisation without calculation. It is recommended you do this if
        you know the constant, to speed up calculation. In particular if the
        shape function provided is already normalised you should provide
        norm = 1.0.

        Warning: I think several calculations assume that the kernel is
        normalized. No tests for non-normalized kernel.
        """
        self._normconst = norm  # a value or None, if None, then calculate
        self.domain = domain
        self.weights = None
        if callable(shape):
            self._shape = shape
        else:
            raise TypeError("shape must be a callable object/function")
        self._h = h
        self._L2Norm = None
        self._kernel_var = None
        self._normal_reference_constant = None
        self._order = None

    def geth(self):
        """Getter for kernel bandwidth, h"""
        return self._h

    def seth(self, value):
        """Setter for kernel bandwidth, h"""
        self._h = value

    h = property(geth, seth, doc="Kernel Bandwidth")
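
    # Illustrative sketch (an addition, not part of the original module):
    # constructing a user-defined kernel directly. The quartic shape below is
    # the unnormalised Biweight; norm is omitted, so norm_const will be
    # computed by numerical integration on first access.
    #
    #   quartic = CustomKernel(lambda x: (1 - x*x)**2, h=0.5,
    #                          domain=[-1.0, 1.0])
    #   quartic.norm_const   # ~0.9375, via scipy.integrate.quad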

    def in_domain(self, xs, ys, x):
        """
        Returns the filtered (xs, ys) based on the Kernel domain centred on x
        """
        # Disable black-list functions: filter used for speed instead of
        # list-comprehension
        # pylint: disable-msg=W0141
        def isInDomain(xy):
            """Used for filter to check if point is in the domain"""
            u = (xy[0] - x) / self.h
            return np.all((u >= self.domain[0]) & (u <= self.domain[1]))

        if self.domain is None:
            return (xs, ys)
        else:
            filtered = lfilter(isInDomain, lzip(xs, ys))
            if len(filtered) > 0:
                xs, ys = lzip(*filtered)
                return (xs, ys)
            else:
                return ([], [])

    def density(self, xs, x):
        """Returns the kernel density estimate for point x based on x-values
        xs
        """
        xs = np.asarray(xs)
        n = len(xs)  # before in_domain?
        if self.weights is not None:
            xs, weights = self.in_domain(xs, self.weights, x)
        else:
            xs = self.in_domain(xs, xs, x)[0]
        xs = np.asarray(xs)
        #print 'len(xs)', len(xs), x
        if xs.ndim == 1:
            xs = xs[:, None]
        if len(xs) > 0:
            h = self.h
            if self.weights is not None:
                w = 1 / h * np.sum(self((xs-x)/h).T * weights, axis=1)
            else:
                w = 1. / (h * n) * np.sum(self((xs-x)/h), axis=0)
            return w
        else:
            return np.nan
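
    # Illustrative sketch (an addition, not part of the original module):
    # pointwise kernel density estimation; the sample below is an assumption.
    #
    #   xs = np.random.randn(200)    # sample
    #   kern = Gaussian(h=0.4)
    #   kern.density(xs, 0.0)        # estimated pdf at x = 0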

    def density_var(self, density, nobs):
        """approximate pointwise variance for kernel density

        not verified

        Parameters
        ----------
        density : array_like
            pdf of the kernel density
        nobs : int
            number of observations used in the KDE estimation

        Returns
        -------
        kde_var : ndarray
            estimated variance of the density estimate

        Notes
        -----
        This uses the asymptotic normal approximation to the distribution of
        the density estimate.
        """
        return np.asarray(density) * self.L2Norm / self.h / nobs

    def density_confint(self, density, nobs, alpha=0.05):
        """approximate pointwise confidence interval for kernel density

        The confidence interval is centered at the estimated density and
        ignores the bias of the density estimate.

        not verified

        Parameters
        ----------
        density : array_like
            pdf of the kernel density
        nobs : int
            number of observations used in the KDE estimation

        Returns
        -------
        conf_int : ndarray
            estimated confidence interval of the density estimate, lower bound
            in first column and upper bound in second column

        Notes
        -----
        This uses the asymptotic normal approximation to the distribution of
        the density estimate. The lower bound can be negative for density
        values close to zero.
        """
        from scipy import stats
        crit = stats.norm.isf(alpha / 2.)
        density = np.asarray(density)
        half_width = crit * np.sqrt(self.density_var(density, nobs))
        conf_int = np.column_stack((density - half_width, density + half_width))
        return conf_int
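
    # Illustrative sketch (an addition, not part of the original module):
    # asymptotic pointwise confidence interval around a density estimate,
    # continuing the density example above.
    #
    #   dens = kern.density(xs, 0.0)
    #   kern.density_confint(dens, nobs=len(xs), alpha=0.05)
    #   # -> lower bound in column 0, upper bound in column 1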

    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            w = np.sum(self((xs-x)/self.h))
            #TODO: change the below to broadcasting when shape is sorted
            v = np.sum([yy * self((xx-x)/self.h) for xx, yy in zip(xs, ys)])
            return v / w
        else:
            return np.nan

    def smoothvar(self, xs, ys, x):
        """Returns the kernel smoothing estimate of the variance at point x.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            sqresid = square(subtract(ys, fittedvals))
            w = np.sum(self((xs-x)/self.h))
            v = np.sum([rr * self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            return v / w
        else:
            return np.nan

    def smoothconf(self, xs, ys, x, alpha=0.05):
        """Returns the kernel smoothing estimate with pointwise 1-alpha
        confidence bounds
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            #fittedvals = self.smooth(xs, ys, x)  # x or xs in Haerdle
            sqresid = square(subtract(ys, fittedvals))
            w = np.sum(self((xs-x)/self.h))
            #var = sqresid.sum() / (len(sqresid) - 0)  # nonlocal var ? JP just trying
            v = np.sum([rr * self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            var = v / w
            sd = np.sqrt(var)
            K = self.L2Norm
            yhat = self.smooth(xs, ys, x)
            from scipy import stats
            crit = stats.norm.isf(alpha / 2)
            err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
            return (yhat - err, yhat, yhat + err)
        else:
            return (np.nan, np.nan, np.nan)
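
    # Illustrative sketch (an addition, not part of the original module):
    # Nadaraya-Watson smoothing with pointwise confidence bounds on a grid;
    # the data below are assumptions for demonstration.
    #
    #   xs = np.random.randn(100)
    #   ys = np.sin(xs) + 0.3 * np.random.randn(100)
    #   kern = Gaussian(h=0.5)
    #   bands = [kern.smoothconf(xs, ys, xx) for xx in np.linspace(-2, 2, 41)]
    #   # each element is (lower, fitted, upper) at one grid point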

    @property
    def L2Norm(self):
        """Returns the integral of the square of the kernel from -inf to inf"""
        if self._L2Norm is None:
            L2Func = lambda x: (self.norm_const * self._shape(x))**2
            if self.domain is None:
                self._L2Norm = scipy.integrate.quad(L2Func, -inf, inf)[0]
            else:
                self._L2Norm = scipy.integrate.quad(L2Func, self.domain[0],
                                                    self.domain[1])[0]
        return self._L2Norm

    @property
    def norm_const(self):
        """
        Normalising constant for kernel (integral from -inf to inf)
        """
        if self._normconst is None:
            if self.domain is None:
                quadres = scipy.integrate.quad(self._shape, -inf, inf)
            else:
                quadres = scipy.integrate.quad(self._shape, self.domain[0],
                                               self.domain[1])
            self._normconst = 1.0 / (quadres[0])
        return self._normconst

    @property
    def kernel_var(self):
        """Returns the second moment of the kernel"""
        if self._kernel_var is None:
            func = lambda x: x**2 * self.norm_const * self._shape(x)
            if self.domain is None:
                self._kernel_var = scipy.integrate.quad(func, -inf, inf)[0]
            else:
                self._kernel_var = scipy.integrate.quad(func, self.domain[0],
                                                        self.domain[1])[0]
        return self._kernel_var

    def moments(self, n):
        """Returns the n'th moment of the kernel (implemented for n = 1, 2)"""
        if n > 2:
            msg = "Only first and second moment currently implemented"
            raise NotImplementedError(msg)

        if n == 1:
            return 0

        if n == 2:
            return self.kernel_var

    @property
    def normal_reference_constant(self):
        """
        Constant used for the Silverman normal reference asymptotic bandwidth
        calculation.

        C = 2 * ((pi^(1/2) * (nu!)^3 * R(k)) /
                 (2 * nu * (2*nu)! * kap_nu(k)^2))^(1/(2*nu+1))

        nu = kernel order
        kap_nu = nu'th moment of kernel
        R = kernel roughness (square of L^2 norm)

        Note: L2Norm property returns square of norm.
        """
        nu = self._order

        if not nu == 2:
            msg = "Only implemented for second order kernels"
            raise NotImplementedError(msg)

        if self._normal_reference_constant is None:
            C = np.pi**(.5) * factorial(nu)**3 * self.L2Norm
            C /= (2 * nu * factorial(2 * nu) * self.moments(nu)**2)
            C = 2 * C**(1.0 / (2 * nu + 1))
            self._normal_reference_constant = C

        return self._normal_reference_constant
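
    # Illustrative note (an addition, not part of the original module): this
    # constant is meant for a Silverman-style rule-of-thumb bandwidth,
    #
    #   h = C * sigma_hat * nobs**(-1. / (2*nu + 1))   # nu = 2 -> nobs**(-0.2)
    #
    # where sigma_hat is a scale estimate of the data; the usage shown is an
    # assumption based on the formula in the docstring above.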


    def weight(self, x):
        """This returns the normalised weight at distance x"""
        return self.norm_const * self._shape(x)

    def __call__(self, x):
        """
        This simply returns the value of the kernel function at x

        Does the same as weight if the function is normalised
        """
        return self._shape(x)


class Uniform(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.5 * np.ones(x.shape),
                              h=h, domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 0.5
        self._kernel_var = 1. / 3
        self._order = 2


class Triangular(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 1 - abs(x), h=h,
                              domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 2.0 / 3.0
        self._kernel_var = 1. / 6
        self._order = 2


class Epanechnikov(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.75 * (1 - x*x), h=h,
                              domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 0.6
        self._kernel_var = 0.2
        self._order = 2


class Biweight(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.9375 * (1 - x*x)**2,
                              h=h, domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 5.0 / 7.0
        self._kernel_var = 1. / 7
        self._order = 2

    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.

        Special implementation optimized for Biweight.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            w = np.sum(square(subtract(1, square(divide(subtract(xs, x),
                                                        self.h)))))
            v = np.sum(multiply(ys, square(subtract(1, square(divide(
                subtract(xs, x), self.h))))))
            return v / w
        else:
            return np.nan

    def smoothvar(self, xs, ys, x):
        """
        Returns the kernel smoothing estimate of the variance at point x.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            rs = square(subtract(ys, fittedvals))
            w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
                                                          self.h)))))
            v = np.sum(multiply(rs, square(subtract(1, square(divide(
                subtract(xs, x), self.h))))))
            return v / w
        else:
            return np.nan

    def smoothconf_(self, xs, ys, x):
        """Returns the kernel smoothing estimate with 1-sigma confidence
        bounds
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            rs = square(subtract(ys, fittedvals))
            w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
                                                          self.h)))))
            v = np.sum(multiply(rs, square(subtract(1, square(divide(
                subtract(xs, x), self.h))))))
            var = v / w
            sd = np.sqrt(var)
            K = self.L2Norm
            yhat = self.smooth(xs, ys, x)
            err = sd * K / np.sqrt(0.9375 * w * self.h)
            return (yhat - err, yhat, yhat + err)
        else:
            return (np.nan, np.nan, np.nan)


class Triweight(CustomKernel):
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 1.09375 * (1 - x*x)**3,
                              h=h, domain=[-1.0, 1.0], norm=1.0)
        self._L2Norm = 350.0 / 429.0
        self._kernel_var = 1. / 9
        self._order = 2


class Gaussian(CustomKernel):
    """
    Gaussian (Normal) Kernel

    K(u) = 1 / (sqrt(2*pi)) exp(-0.5 u**2)
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.3989422804014327 *
                              np.exp(-x**2/2.0), h=h, domain=None, norm=1.0)
        self._L2Norm = 1.0 / (2.0 * np.sqrt(np.pi))
        self._kernel_var = 1.0
        self._order = 2

    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.

        Special implementation optimized for Gaussian.
        """
        w = np.sum(exp(multiply(square(divide(subtract(xs, x),
                                              self.h)), -0.5)))
        v = np.sum(multiply(ys, exp(multiply(square(divide(subtract(xs, x),
                                                           self.h)), -0.5))))
        return v / w


class Cosine(CustomKernel):
    """
    Cosine Kernel

    K(u) = pi/4 cos(0.5 * pi * u) between -1.0 and 1.0
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.78539816339744828 *
                              np.cos(np.pi/2.0 * x), h=h, domain=[-1.0, 1.0],
                              norm=1.0)
        self._L2Norm = np.pi**2 / 16.0
        self._kernel_var = 0.1894305308612978  # = 1 - 8 / np.pi**2
        self._order = 2


class Cosine2(CustomKernel):
    """
    Cosine2 Kernel

    K(u) = 1 + cos(2 * pi * u) between -0.5 and 0.5

    Note: this is the same Cosine kernel that Stata uses
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 1 + np.cos(2.0 * np.pi * x),
                              h=h, domain=[-0.5, 0.5], norm=1.0)
        self._L2Norm = 1.5
        self._kernel_var = 0.03267274151216444  # = 1/12. - 0.5 / np.pi**2
        self._order = 2


class Tricube(CustomKernel):
    """
    Tricube Kernel

    K(u) = 0.864197530864 * (1 - abs(u)**3)**3 between -1.0 and 1.0
    """
    def __init__(self, h=1.0):
        CustomKernel.__init__(self, shape=lambda x: 0.864197530864 *
                              (1 - abs(x)**3)**3, h=h, domain=[-1.0, 1.0],
                              norm=1.0)
        self._L2Norm = 175.0 / 247.0
        self._kernel_var = 35.0 / 243.0
        self._order = 2
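

if __name__ == "__main__":
    # Minimal smoke test (an illustrative addition, not part of the original
    # module): check each kernel's stored second moment against direct
    # numerical integration of x**2 * K(x) over the kernel domain.
    for cls in [Uniform, Triangular, Epanechnikov, Biweight, Triweight,
                Gaussian, Cosine, Cosine2, Tricube]:
        kern = cls()
        dom = kern.domain if kern.domain is not None else [-inf, inf]
        num = scipy.integrate.quad(lambda u: u**2 * kern.weight(np.asarray(u)),
                                   dom[0], dom[1])[0]
        print("%-12s stored=%-10g integrated=%-10g"
              % (cls.__name__, kern._kernel_var, num))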