reconnect moved files to git repo

This commit is contained in:
root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions


@ -0,0 +1,560 @@
# some cut and paste characters are not ASCII
'''density estimation based on orthogonal polynomials

Author: Josef Perktold
Created: 2011-05-17
License: BSD

Two versions work: one based on Fourier polynomials (FPoly) and one based on
Chebyshev T polynomials (ChebyTPoly). Hermite polynomials (HPoly) also work;
the other versions still need normalization.

TODO:

* check the Fourier case again: the basis is orthonormal, but it needs
  offsetfactor = 0 and does not integrate to 1; rescaled it looks good
* hermite: works, but DensityOrthoPoly currently requires finite bounds;
  I use it with offsetfactor 0.5 in the example
* not implemented methods:
  - add bona fide density correction
  - add transformation to domain of polynomial base - DONE
    possible problem: what is the behavior at the boundary;
    offsetfactor requires more work, check different cases, add as option
    (moved to polynomial class by default, as attribute)
* convert examples to test cases
* need examples with large density on boundary, beta ?
* organize poly classes in separate module, check new numpy.polynomials,
  polyvander
* MISE measures, order selection, ...

enhancements:
  * other polynomial bases: especially for open and half open support
  * wavelets
  * local or piecewise approximations

'''
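# In summary, the estimator implemented below is the standard orthogonal
# series density estimator: for an orthonormal basis psi_0, ..., psi_{K-1}
# on the (transformed) support, the coefficients are estimated by sample
# means, c_j = (1/n) * sum_i psi_j(x_i), and the density estimate is
# fhat(x) = sum_j c_j * psi_j(x), optionally rescaled so that it integrates
# to one (see DensityOrthoPoly._verify).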
from scipy import stats, integrate, special
import numpy as np
sqr2 = np.sqrt(2.)
class FPoly:
'''Orthonormal (for weight=1) Fourier polynomial on [0, 1]

The polynomial is orthonormal, but the density needs a correction factor
(corfactor) that I do not see how to derive analytically.

Parameterization on [0, 1] from

Sam Efromovich: Orthogonal series density estimation,
2010 John Wiley & Sons, Inc. WIREs Comp Stat 2010 2 467-476

'''
def __init__(self, order):
self.order = order
self.domain = (0, 1)
self.intdomain = self.domain
def __call__(self, x):
if self.order == 0:
return np.ones_like(x)
else:
return sqr2 * np.cos(np.pi * self.order * x)
class F2Poly:
'''Orthogonal (for weight=1) Fourier polynomial on [0, pi]

The basis is orthogonal, but the first component does not square-integrate
to 1; the final result seems to need a correction factor of sqrt(pi),
i.e. _corfactor = sqrt(pi), obtained from integrating the density.

Parameterization on [0, pi] from

Peter Hall, Cross-Validation and the Smoothing of Orthogonal Series Density
Estimators, Journal of Multivariate Analysis 21, 189-206 (1987)

'''
def __init__(self, order):
self.order = order
self.domain = (0, np.pi)
self.intdomain = self.domain
self.offsetfactor = 0
def __call__(self, x):
if self.order == 0:
return np.ones_like(x) / np.sqrt(np.pi)
else:
return sqr2 * np.cos(self.order * x) / np.sqrt(np.pi)
class ChebyTPoly:
'''Orthonormal (for weight=1) Chebyshev polynomial on (-1, 1)

Notes
-----
Integration requires staying away from the boundary, i.e. offsetfactor > 0.
Maybe this implies that we cannot use it for densities that are positive at
the boundary? Or maybe there is a mistake close to the boundary; sometimes
the integration works.
'''
def __init__(self, order):
self.order = order
from scipy.special import chebyt
self.poly = chebyt(order)
self.domain = (-1, 1)
self.intdomain = (-1+1e-6, 1-1e-6)
#not sure if I need this, in integration nans are possible on the boundary
self.offsetfactor = 0.01 #required for integration
def __call__(self, x):
if self.order == 0:
return np.ones_like(x) / (1-x**2)**(1/4.) /np.sqrt(np.pi)
else:
return self.poly(x) / (1-x**2)**(1/4.) /np.sqrt(np.pi) *np.sqrt(2)
logpi2 = np.log(np.pi)/2
class HPoly:
'''Orthonormal (for weight=1) Hermite polynomial; uses finite bounds

For the current use with DensityOrthoPoly the domain is defined as [-6, 6].
'''
def __init__(self, order):
self.order = order
from scipy.special import hermite
self.poly = hermite(order)
self.domain = (-6, +6)
self.offsetfactor = 0.5  # offset factor picked up by DensityOrthoPoly.fit
def __call__(self, x):
k = self.order
lnfact = -(1./2)*(k*np.log(2.) + special.gammaln(k+1) + logpi2) - x*x/2
fact = np.exp(lnfact)
return self.poly(x) * fact
def polyvander(x, polybase, order=5):
polyarr = np.column_stack([polybase(i)(x) for i in range(order)])
return polyarr
def inner_cont(polys, lower, upper, weight=None):
'''inner product of continuous functions (with weight=1)
Parameters
----------
polys : list of callables
polynomial instances
lower : float
lower integration limit
upper : float
upper integration limit
weight : callable or None
weighting function
Returns
-------
innp : ndarray
symmetric 2d square array with the inner product of all function pairs
err : ndarray
numerical error estimate from scipy.integrate.quad, same dimension as innp
Examples
--------
>>> from scipy.special import chebyt
>>> polys = [chebyt(i) for i in range(4)]
>>> r, e = inner_cont(polys, -1, 1)
>>> r
array([[ 2. , 0. , -0.66666667, 0. ],
[ 0. , 0.66666667, 0. , -0.4 ],
[-0.66666667, 0. , 0.93333333, 0. ],
[ 0. , -0.4 , 0. , 0.97142857]])
'''
n_polys = len(polys)
innerprod = np.empty((n_polys, n_polys))
innerprod.fill(np.nan)
interr = np.zeros((n_polys, n_polys))
for i in range(n_polys):
for j in range(i+1):
p1 = polys[i]
p2 = polys[j]
if weight is not None:
innp, err = integrate.quad(lambda x: p1(x)*p2(x)*weight(x),
lower, upper)
else:
innp, err = integrate.quad(lambda x: p1(x)*p2(x), lower, upper)
innerprod[i,j] = innp
interr[i,j] = err
if not i == j:
innerprod[j,i] = innp
interr[j,i] = err
return innerprod, interr
def is_orthonormal_cont(polys, lower, upper, rtol=0, atol=1e-08):
'''check whether functions are orthonormal
Parameters
----------
polys : list of polynomials or functions
Returns
-------
is_orthonormal : bool
is False if the inner products are not close to 0 or 1
Notes
-----
This stops as soon as the first deviation from orthonormality is found.
Examples
--------
>>> from scipy.special import chebyt
>>> polys = [chebyt(i) for i in range(4)]
>>> r, e = inner_cont(polys, -1, 1)
>>> r
array([[ 2. , 0. , -0.66666667, 0. ],
[ 0. , 0.66666667, 0. , -0.4 ],
[-0.66666667, 0. , 0.93333333, 0. ],
[ 0. , -0.4 , 0. , 0.97142857]])
>>> is_orthonormal_cont(polys, -1, 1, atol=1e-6)
False
>>> polys = [ChebyTPoly(i) for i in range(4)]
>>> r, e = inner_cont(polys, -1, 1)
>>> r
array([[ 1.00000000e+00, 0.00000000e+00, -9.31270888e-14,
0.00000000e+00],
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
-9.47850712e-15],
[ -9.31270888e-14, 0.00000000e+00, 1.00000000e+00,
0.00000000e+00],
[ 0.00000000e+00, -9.47850712e-15, 0.00000000e+00,
1.00000000e+00]])
>>> is_orthonormal_cont(polys, -1, 1, atol=1e-6)
True
'''
for i in range(len(polys)):
for j in range(i+1):
p1 = polys[i]
p2 = polys[j]
innerprod = integrate.quad(lambda x: p1(x)*p2(x), lower, upper)[0]
#print i,j, innerprod
if not np.allclose(innerprod, i==j, rtol=rtol, atol=atol):
return False
return True
#new versions
class DensityOrthoPoly:
'''Univariate density estimation by orthonormal series expansion
Uses an orthonormal polynomial basis to approximate a univariate density.
Currently all arguments can be given to fit; I might change it to require
the arguments in __init__ instead.
'''
def __init__(self, polybase=None, order=5):
if polybase is not None:
self.polybase = polybase
self.polys = polys = [polybase(i) for i in range(order)]
#try:
#self.offsetfac = 0.05
#self.offsetfac = polys[0].offsetfactor #polys maybe not defined yet
self._corfactor = 1
self._corshift = 0
def fit(self, x, polybase=None, order=5, limits=None):
'''estimate the orthogonal polynomial approximation to the density
'''
if polybase is None:
polys = self.polys[:order]
else:
self.polybase = polybase
self.polys = polys = [polybase(i) for i in range(order)]
#move to init ?
if not hasattr(self, 'offsetfac'):
self.offsetfac = polys[0].offsetfactor
xmin, xmax = x.min(), x.max()
if limits is None:
self.offset = offset = (xmax - xmin) * self.offsetfac
limits = self.limits = (xmin - offset, xmax + offset)
interval_length = limits[1] - limits[0]
xinterval = xmax - xmin
# need to cover (half-)open intervals
self.shrink = 1. / interval_length #xinterval/interval_length
offset = (interval_length - xinterval ) / 2.
self.shift = xmin - offset
self.x = x = self._transform(x)
coeffs = [(p(x)).mean() for p in polys]
self.coeffs = coeffs
self.polys = polys
self._verify() #verify that it is a proper density
return self #coeffs, polys
def evaluate(self, xeval, order=None):
xeval = self._transform(xeval)
if order is None:
order = len(self.polys)
res = sum(c*p(xeval) for c, p in list(zip(self.coeffs, self.polys))[:order])
res = self._correction(res)
return res
def __call__(self, xeval):
'''alias for evaluate, except no order argument'''
return self.evaluate(xeval)
def _verify(self):
'''check for bona fide density correction

currently only checks that the density integrates to 1;
non-negativity check is not implemented yet
'''
#watch out for circular/recursive usage
#evaluate uses domain of data, we stay offset away from bounds
intdomain = self.limits #self.polys[0].intdomain
self._corfactor = 1./integrate.quad(self.evaluate, *intdomain)[0]
#self._corshift = 0
#self._corfactor
return self._corfactor
def _correction(self, x):
'''bona fide density correction
affine shift of density to make it into a proper density
'''
if self._corfactor != 1:
x *= self._corfactor
if self._corshift != 0:
x += self._corshift
return x
def _transform(self, x): # limits=None):
'''transform observations to the domain of the density estimate

uses the shrink and shift attributes, which are set in fit
'''
#use domain from first instance
#class does not have domain self.polybase.domain[0] AttributeError
domain = self.polys[0].domain
ilen = (domain[1] - domain[0])
shift = self.shift - domain[0]/self.shrink/ilen
shrink = self.shrink * ilen
return (x - shift) * shrink
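# Minimal usage sketch for DensityOrthoPoly (illustrative only, not part of
# the original module; it mirrors the examples in the __main__ block below):
def _example_density_orthopoly():
    # fit a Chebyshev series density to a normal sample and evaluate it
    np.random.seed(12345)
    sample = np.random.randn(500)
    dop = DensityOrthoPoly().fit(sample, ChebyTPoly, order=15)
    grid = np.linspace(sample.min(), sample.max(), 101)
    fhat = dop(grid)                              # estimated density values
    total = integrate.quad(dop, *dop.limits)[0]   # should be close to 1
    return fhat, total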
#old version as a simple function
def density_orthopoly(x, polybase, order=5, xeval=None):
#polybase = legendre #chebyt #hermitenorm#
#polybase = chebyt
#polybase = FPoly
#polybase = ChtPoly
#polybase = hermite
#polybase = HPoly
if xeval is None:
xeval = np.linspace(x.min(),x.max(),50)
#polys = [legendre(i) for i in range(order)]
polys = [polybase(i) for i in range(order)]
#coeffs = [(p(x)*(1-x**2)**(-1/2.)).mean() for p in polys]
#coeffs = [(p(x)*np.exp(-x*x)).mean() for p in polys]
coeffs = [(p(x)).mean() for p in polys]
res = sum(c*p(xeval) for c, p in zip(coeffs, polys))
#res *= (1-xeval**2)**(-1/2.)
#res *= np.exp(-xeval**2./2)
return res, xeval, coeffs, polys
if __name__ == '__main__':
examples = ['chebyt', 'fourier', 'hermite']#[2]
nobs = 10000
import matplotlib.pyplot as plt
from statsmodels.distributions.mixture_rvs import (
mixture_rvs, MixtureDistribution)
#np.random.seed(12345)
## obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
## kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.75)))
mix_kwds = (dict(loc=-0.5,scale=.5),dict(loc=1,scale=.2))
obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
mix = MixtureDistribution()
#obs_dist = np.random.randn(nobs)/4. #np.sqrt(2)
if "chebyt_" in examples: # needed for Cheby example below
#obs_dist = np.clip(obs_dist, -2, 2)/2.01
#chebyt [0,1]
obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/2.0 #/4. + 2/4.0
#fourier [0,1]
#obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/4. + 2/4.0
f_hat, grid, coeffs, polys = density_orthopoly(obs_dist, ChebyTPoly, order=20, xeval=None)
#f_hat /= f_hat.sum() * (grid.max() - grid.min())/len(grid)
f_hat0 = f_hat
fint = integrate.trapz(f_hat, grid)# dx=(grid.max() - grid.min())/len(grid))
#f_hat -= fint/2.
print('f_hat.min()', f_hat.min())
f_hat = (f_hat - f_hat.min()) #/ f_hat.max() - f_hat.min
fint2 = integrate.trapz(f_hat, grid)# dx=(grid.max() - grid.min())/len(grid))
print('fint2', fint, fint2)
f_hat /= fint2
# note that this uses a *huge* grid by default
#f_hat, grid = kdensityfft(emp_dist, kernel="gauss", bw="scott")
# check the plot
doplot = 0
if doplot:
plt.hist(obs_dist, bins=50, normed=True, color='red')
plt.plot(grid, f_hat, lw=2, color='black')
plt.plot(grid, f_hat0, lw=2, color='g')
plt.show()
for i,p in enumerate(polys[:5]):
for j,p2 in enumerate(polys[:5]):
print(i,j,integrate.quad(lambda x: p(x)*p2(x), -1,1)[0])
for p in polys:
print(integrate.quad(lambda x: p(x)**2, -1,1))
#examples using the new class
if "chebyt" in examples:
dop = DensityOrthoPoly().fit(obs_dist, ChebyTPoly, order=20)
grid = np.linspace(obs_dist.min(), obs_dist.max())
xf = dop(grid)
#print('np.max(np.abs(xf - f_hat0))', np.max(np.abs(xf - f_hat0))
dopint = integrate.quad(dop, *dop.limits)[0]
print('dop F integral', dopint)
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
doplot = 1
if doplot:
plt.figure()
plt.hist(obs_dist, bins=50, normed=True, color='red')
plt.plot(grid, xf, lw=2, color='black')
plt.plot(grid, mpdf, lw=2, color='green')
plt.title('using Chebychev polynomials')
#plt.show()
if "fourier" in examples:
dop = DensityOrthoPoly()
dop.offsetfac = 0.5
dop = dop.fit(obs_dist, F2Poly, order=30)
grid = np.linspace(obs_dist.min(), obs_dist.max())
xf = dop(grid)
#print(np.max(np.abs(xf - f_hat0))
dopint = integrate.quad(dop, *dop.limits)[0]
print('dop F integral', dopint)
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
doplot = 1
if doplot:
plt.figure()
plt.hist(obs_dist, bins=50, normed=True, color='red')
plt.title('using Fourier polynomials')
plt.plot(grid, xf, lw=2, color='black')
plt.plot(grid, mpdf, lw=2, color='green')
#plt.show()
#check orthonormality:
print(np.max(np.abs(inner_cont(dop.polys[:5], 0, 1)[0] -np.eye(5))))
if "hermite" in examples:
dop = DensityOrthoPoly()
dop.offsetfac = 0
dop = dop.fit(obs_dist, HPoly, order=20)
grid = np.linspace(obs_dist.min(), obs_dist.max())
xf = dop(grid)
#print(np.max(np.abs(xf - f_hat0))
dopint = integrate.quad(dop, *dop.limits)[0]
print('dop F integral', dopint)
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
kwargs=mix_kwds)
doplot = 1
if doplot:
plt.figure()
plt.hist(obs_dist, bins=50, normed=True, color='red')
plt.plot(grid, xf, lw=2, color='black')
plt.plot(grid, mpdf, lw=2, color='green')
plt.title('using Hermite polynomials')
plt.show()
#check orthonormality:
print(np.max(np.abs(inner_cont(dop.polys[:5], 0, 1)[0] -np.eye(5))))
#check orthonormality
hpolys = [HPoly(i) for i in range(5)]
inn = inner_cont(hpolys, -6, 6)[0]
print(np.max(np.abs(inn - np.eye(5))))
print((inn*100000).astype(int))
from scipy.special import hermite, chebyt
htpolys = [hermite(i) for i in range(5)]
innt = inner_cont(htpolys, -10, 10)[0]
print((innt*100000).astype(int))
polysc = [chebyt(i) for i in range(4)]
r, e = inner_cont(polysc, -1, 1, weight=lambda x: (1-x*x)**(-1/2.))
print(np.max(np.abs(r - np.diag(np.diag(r)))))


@ -0,0 +1,212 @@
"""Examples of non-linear functions for non-parametric regression
Created on Sat Jan 05 20:21:22 2013
Author: Josef Perktold
"""
import numpy as np
## Functions
def fg1(x):
'''Fan and Gijbels example function 1
'''
return x + 2 * np.exp(-16 * x**2)
def fg1eu(x):
'''Eubank example, similar to Fan and Gijbels example function 1
'''
return x + 0.5 * np.exp(-50 * (x - 0.5)**2)
def fg2(x):
'''Fan and Gijbels example function 2
'''
return np.sin(2 * x) + 2 * np.exp(-16 * x**2)
def func1(x):
'''made up example with sin, square
'''
return np.sin(x * 5) / x + 2. * x - 1. * x**2
## Classes with Data Generating Processes
doc = {'description':
'''Base class for univariate non-linear example

Does not work on its own; it needs at least self.func in addition.
''',
'ref': ''}
class _UnivariateFunction:
#Base class for univariate non-linear example.
#Does not work on its own; needs at least self.func in addition.
__doc__ = '''%(description)s
Parameters
----------
nobs : int
number of observations to simulate
x : None or 1d array
If x is given then it is used for the exogenous variable instead of
creating a random sample
distr_x : None or distribution instance
Only used if x is None. The rvs method is used to create a random
sample of the exogenous (explanatory) variable.
distr_noise : None or distribution instance
The rvs method is used to create a random sample of the errors.
Attributes
----------
x : ndarray, 1-D
exogenous or explanatory variable. x is sorted.
y : ndarray, 1-D
endogenous or response variable
y_true : ndarray, 1-D
expected values of endogenous or response variable, i.e. values of y
without noise
func : callable
underlying function (defined by subclass)
%(ref)s
''' #% doc
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
if x is None:
if distr_x is None:
x = np.random.normal(loc=0, scale=self.s_x, size=nobs)
else:
x = distr_x.rvs(size=nobs)
x.sort()
self.x = x
if distr_noise is None:
noise = np.random.normal(loc=0, scale=self.s_noise, size=nobs)
else:
noise = distr_noise.rvs(size=nobs)
if hasattr(self, 'het_scale'):
noise *= self.het_scale(self.x)
#self.func = fg1
self.y_true = y_true = self.func(x)
self.y = y_true + noise
def plot(self, scatter=True, ax=None):
'''plot the mean function and optionally the scatter of the sample
Parameters
----------
scatter : bool
If true, then add scatter points of the sample to the plot.
ax : None or matplotlib axis instance
If None, then a matplotlib.pyplot figure is created, otherwise
the given axis, ax, is used.
Returns
-------
Figure
This is either the created figure instance or the one associated
with ax if ax is given.
'''
if ax is None:
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
if scatter:
ax.plot(self.x, self.y, 'o', alpha=0.5)
xx = np.linspace(self.x.min(), self.x.max(), 100)
ax.plot(xx, self.func(xx), lw=2, color='b', label='dgp mean')
return ax.figure
doc = {'description':
'''Fan and Gijbels example function 1
linear trend plus a hump
''',
'ref':
'''
References
----------
Fan, Jianqing, and Irene Gijbels. 1992. "Variable Bandwidth and Local
Linear Regression Smoothers."
The Annals of Statistics 20 (4) (December): 2008-2036. doi:10.2307/2242378.
'''}
class UnivariateFanGijbels1(_UnivariateFunction):
__doc__ = _UnivariateFunction.__doc__ % doc
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
self.s_x = 1.
self.s_noise = 0.7
self.func = fg1
super().__init__(nobs=nobs, x=x,
distr_x=distr_x,
distr_noise=distr_noise)
doc['description'] =\
'''Fan and Gijbels example function 2
sin plus a hump
'''
class UnivariateFanGijbels2(_UnivariateFunction):
__doc__ = _UnivariateFunction.__doc__ % doc
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
self.s_x = 1.
self.s_noise = 0.5
self.func = fg2
super().__init__(nobs=nobs, x=x,
distr_x=distr_x,
distr_noise=distr_noise)
class UnivariateFanGijbels1EU(_UnivariateFunction):
'''
Eubank p.179f
'''
def __init__(self, nobs=50, x=None, distr_x=None, distr_noise=None):
if distr_x is None:
from scipy import stats
distr_x = stats.uniform
self.s_noise = 0.15
self.func = fg1eu
super().__init__(nobs=nobs, x=x,
distr_x=distr_x,
distr_noise=distr_noise)
class UnivariateFunc1(_UnivariateFunction):
'''
made up, with sin and quadratic trend
'''
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
if x is None and distr_x is None:
from scipy import stats
distr_x = stats.uniform(-2, 4)
else:
nobs = x.shape[0]
self.s_noise = 2.
self.func = func1
super().__init__(nobs=nobs, x=x,
distr_x=distr_x,
distr_noise=distr_noise)
def het_scale(self, x):
return np.sqrt(np.abs(3+x))
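# Minimal usage sketch (illustrative only, not part of the original module):
def _example_fg1():
    # simulate the Fan and Gijbels DGP 1 and plot the mean function together
    # with the sample scatter; matplotlib is imported lazily inside plot()
    np.random.seed(12345)
    dgp = UnivariateFanGijbels1(nobs=200)
    fig = dgp.plot(scatter=True)
    return dgp.x, dgp.y, fig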


@ -0,0 +1,114 @@
from statsmodels.compat.python import lzip
import numpy as np
from statsmodels.tools.validation import array_like
from . import kernels
#TODO: should this be a function?
class KDE:
"""
Kernel Density Estimator
Parameters
----------
x : array_like
N-dimensional array from which the density is to be estimated
kernel : kernel instance, optional
Should be a kernel instance from the kernels module in this package;
defaults to kernels.Gaussian().
"""
#TODO: amend docs for Nd case?
def __init__(self, x, kernel=None):
x = array_like(x, "x", maxdim=2, contiguous=True)
if x.ndim == 1:
x = x[:,None]
nobs, n_series = x.shape
if kernel is None:
kernel = kernels.Gaussian() # no meaningful bandwidth yet
if n_series > 1:
if isinstance( kernel, kernels.CustomKernel ):
kernel = kernels.NdKernel(n_series, kernels = kernel)
self.kernel = kernel
self.n = n_series #TODO change attribute
self.x = x
def density(self, x):
return self.kernel.density(self.x, x)
def __call__(self, x, h="scott"):
return np.array([self.density(xx) for xx in x])
def evaluate(self, x, h="silverman"):
density = self.kernel.density
return np.array([density(xx) for xx in x])
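# Minimal 1-D usage sketch (illustrative only; the __main__ block below has a
# fuller demonstration, including the 2-D case):
def _example_kde_1d():
    import statsmodels.nonparametric.bandwidths as bw
    np.random.seed(12345)
    sample = np.random.standard_t(5, size=100)
    kern = kernels.Gaussian(h=bw.bw_silverman(sample))
    kde = KDE(sample, kern)
    return kde.density(0.0)   # density estimate at a single point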
if __name__ == "__main__":
from numpy import random
import matplotlib.pyplot as plt
import statsmodels.nonparametric.bandwidths as bw
from statsmodels.sandbox.nonparametric.testdata import kdetest
# 1-D case
random.seed(142)
x = random.standard_t(4.2, size = 50)
h = bw.bw_silverman(x)
#NOTE: try to do it with convolution
support = np.linspace(-10,10,512)
kern = kernels.Gaussian(h = h)
kde = KDE( x, kern)
print(kde.density(1.015469))
print(0.2034675)
Xs = np.arange(-10,10,0.1)
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(Xs, kde(Xs), "-")
ax.set_ylim(-10, 10)
ax.set_ylim(0,0.4)
# 2-D case
x = lzip(kdetest.faithfulData["eruptions"], kdetest.faithfulData["waiting"])
x = np.array(x)
x = (x - x.mean(0))/x.std(0)
nobs = x.shape[0]
H = kdetest.Hpi
kern = kernels.NdKernel( 2 )
kde = KDE( x, kern )
print(kde.density( np.matrix( [1,2 ]))) #.T
plt.figure()
plt.plot(x[:,0], x[:,1], 'o')
n_grid = 50
xsp = np.linspace(x.min(0)[0], x.max(0)[0], n_grid)
ysp = np.linspace(x.min(0)[1], x.max(0)[1], n_grid)
# xsorted = np.sort(x)
# xlow = xsorted[nobs/4]
# xupp = xsorted[3*nobs/4]
# xsp = np.linspace(xlow[0], xupp[0], n_grid)
# ysp = np.linspace(xlow[1], xupp[1], n_grid)
xr, yr = np.meshgrid(xsp, ysp)
kde_vals = np.array([kde.density( np.matrix( [xi, yi ]) ) for xi, yi in
zip(xr.ravel(), yr.ravel())])
plt.contour(xsp, ysp, kde_vals.reshape(n_grid, n_grid))
plt.show()
# 5 D case
# random.seed(142)
# mu = [1.0, 4.0, 3.5, -2.4, 0.0]
# sigma = np.matrix(
# [[ 0.6 - 0.1*abs(i-j) if i != j else 1.0 for j in xrange(5)] for i in xrange(5)])
# x = random.multivariate_normal(mu, sigma, size = 100)
# kern = kernel.Gaussian()
# kde = KernelEstimate( x, kern )


@ -0,0 +1,164 @@
'''subclassing kde
Author: josef pktd
'''
import numpy as np
from numpy.testing import assert_almost_equal, assert_
import scipy
from scipy import stats
import matplotlib.pyplot as plt
class gaussian_kde_set_covariance(stats.gaussian_kde):
'''
from Anne Archibald in mailinglist:
http://www.nabble.com/Width-of-the-gaussian-in-stats.kde.gaussian_kde---td19558924.html#a19558924
'''
def __init__(self, dataset, covariance):
self.covariance = covariance
scipy.stats.gaussian_kde.__init__(self, dataset)
def _compute_covariance(self):
self.inv_cov = np.linalg.inv(self.covariance)
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.n
class gaussian_kde_covfact(stats.gaussian_kde):
def __init__(self, dataset, covfact = 'scotts'):
self.covfact = covfact
scipy.stats.gaussian_kde.__init__(self, dataset)
def _compute_covariance_(self):
'''not used'''
self.inv_cov = np.linalg.inv(self.covariance)
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.n
def covariance_factor(self):
if self.covfact in ['sc', 'scotts']:
return self.scotts_factor()
if self.covfact in ['si', 'silverman']:
return self.silverman_factor()
elif self.covfact:
return float(self.covfact)
else:
raise ValueError('covariance factor has to be scotts, silverman or a number')
def reset_covfact(self, covfact):
self.covfact = covfact
self.covariance_factor()
self._compute_covariance()
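# Minimal usage sketch (illustrative only, not part of the original module):
# compare the Scott rule bandwidth with a fixed covariance factor on the same
# sample; assumes scipy's gaussian_kde still calls covariance_factor when it
# sets the bandwidth.
def _example_covfact():
    np.random.seed(12345)
    sample = np.random.randn(500)
    grid = np.linspace(-4, 4, 201)
    kde_scott = gaussian_kde_covfact(sample, 'scotts')
    kde_fixed = gaussian_kde_covfact(sample, 0.1)
    return kde_scott.evaluate(grid), kde_fixed.evaluate(grid)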
def plotkde(covfact):
gkde.reset_covfact(covfact)
kdepdf = gkde.evaluate(ind)
plt.figure()
# plot histogram of sample
plt.hist(xn, bins=20, normed=1)
# plot estimated density
plt.plot(ind, kdepdf, label='kde', color="g")
# plot data generating density
plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
(1-alpha) * stats.norm.pdf(ind, loc=mhigh),
color="r", label='DGP: normal mix')
plt.title('Kernel Density Estimation - ' + str(gkde.covfact))
plt.legend()
def test_kde_1d():
np.random.seed(8765678)
n_basesample = 500
xn = np.random.randn(n_basesample)
xnmean = xn.mean()
xnstd = xn.std(ddof=1)
print(xnmean, xnstd)
# get kde for original sample
gkde = stats.gaussian_kde(xn)
# evaluate the density function for the kde for some points
xs = np.linspace(-7,7,501)
kdepdf = gkde.evaluate(xs)
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
print('MSE', np.sum((kdepdf - normpdf)**2))
print('maxabserror', np.max(np.abs(kdepdf - normpdf)))
intervall = xs[1] - xs[0]
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
#assert_array_almost_equal(kdepdf, normpdf, decimal=2)
print(gkde.integrate_gaussian(0.0, 1.0))
print(gkde.integrate_box_1d(-np.inf, 0.0))
print(gkde.integrate_box_1d(0.0, np.inf))
print(gkde.integrate_box_1d(-np.inf, xnmean))
print(gkde.integrate_box_1d(xnmean, np.inf))
assert_almost_equal(gkde.integrate_box_1d(xnmean, np.inf), 0.5, decimal=1)
assert_almost_equal(gkde.integrate_box_1d(-np.inf, xnmean), 0.5, decimal=1)
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), 0.5, decimal=1)
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), 0.5, decimal=1)
assert_almost_equal(gkde.integrate_kde(gkde),
(kdepdf**2).sum()*intervall, decimal=2)
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
(kdepdf*normpdf).sum()*intervall, decimal=2)
## assert_almost_equal(gkde.integrate_gaussian(0.0, 1.0),
## (kdepdf*normpdf).sum()*intervall, decimal=2)
if __name__ == '__main__':
# generate a sample
n_basesample = 1000
np.random.seed(8765678)
alpha = 0.6 #weight for (prob of) lower distribution
mlow, mhigh = (-3,3) #mean locations for gaussian mixture
xn = np.concatenate([mlow + np.random.randn(int(alpha * n_basesample)),
mhigh + np.random.randn(int((1 - alpha) * n_basesample))])
# get kde for original sample
#gkde = stats.gaussian_kde(xn)
gkde = gaussian_kde_covfact(xn, 0.1)
# evaluate the density function for the kde for some points
ind = np.linspace(-7,7,101)
kdepdf = gkde.evaluate(ind)
plt.figure()
# plot histogram of sample
plt.hist(xn, bins=20, normed=1)
# plot estimated density
plt.plot(ind, kdepdf, label='kde', color="g")
# plot data generating density
plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
(1-alpha) * stats.norm.pdf(ind, loc=mhigh),
color="r", label='DGP: normal mix')
plt.title('Kernel Density Estimation')
plt.legend()
gkde = gaussian_kde_covfact(xn, 'scotts')
kdepdf = gkde.evaluate(ind)
plt.figure()
# plot histogram of sample
plt.hist(xn, bins=20, normed=1)
# plot estimated density
plt.plot(ind, kdepdf, label='kde', color="g")
# plot data generating density
plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
(1-alpha) * stats.norm.pdf(ind, loc=mhigh),
color="r", label='DGP: normal mix')
plt.title('Kernel Density Estimation')
plt.legend()
#plt.show()
for cv in ['scotts', 'silverman', 0.05, 0.1, 0.5]:
plotkde(cv)
test_kde_1d()
np.random.seed(8765678)
n_basesample = 1000
xn = np.random.randn(n_basesample)
xnmean = xn.mean()
xnstd = xn.std(ddof=1)
# get kde for original sample
gkde = stats.gaussian_kde(xn)


@ -0,0 +1,419 @@
"""
Multivariate Conditional and Unconditional Kernel Density Estimation
with Mixed Data Types
References
----------
[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
Princeton University Press. (2007)
[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation
and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008)
http://dx.doi.org/10.1561/0800000009
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
with Categorical and Continuous Data." Working Paper. (2000)
[4] Racine, J., Li, Q. "Kernel Estimation of Multivariate Conditional
Distributions." Annals of Economics and Finance 5, 211-235 (2004)
[5] Liu, R., Yang, L. "Kernel estimation of multivariate
cumulative distribution function."
Journal of Nonparametric Statistics (2008)
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
with Categorical and Continuous Data." Working Paper
[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
regression" Statistica Sinica 14(2004), pp. 485-512
[8] Racine, J.: "Consistent Significance Testing for Nonparametric
Regression" Journal of Business & Economics Statistics
[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of
Categorical Predictor Variables in Nonparametric Regression
Models", 2006, Econometric Reviews 25, 523-544
"""
# TODO: make default behavior efficient=True above a certain n_obs
import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles
from statsmodels.nonparametric.api import KDEMultivariate, KernelReg
from statsmodels.nonparametric._kernel_base import \
gpke, LeaveOneOut, _get_type_pos, _adjust_shape
__all__ = ['SingleIndexModel', 'SemiLinear', 'TestFForm']
class TestFForm:
"""
Nonparametric test for functional form.
Parameters
----------
endog : list
Dependent variable (training set)
exog : list of array_like objects
The independent (right-hand-side) variables
bw : array_like, str
Bandwidths for exog or specify method for bandwidth selection
fform : function
The functional form ``y = g(b, x)`` to be tested. Takes as inputs
the RHS variables `exog` and the coefficients ``b`` (betas)
and returns a fitted ``y_hat``.
var_type : str
The type of the independent `exog` variables:
- c: continuous
- o: ordered
- u: unordered
estimator : function
Must return the estimated coefficients b (betas). Takes as inputs
``(endog, exog)``. E.g. the least squares estimator::

    lambda y, x: np.dot(np.linalg.pinv(np.dot(x.T, x)), np.dot(x.T, y))
References
----------
See Racine, J.: "Consistent Significance Testing for Nonparametric
Regression" Journal of Business & Economics Statistics.
See chapter 12 in [1] pp. 355-357.
"""
def __init__(self, endog, exog, bw, var_type, fform, estimator, nboot=100):
self.endog = endog
self.exog = exog
self.var_type = var_type
self.fform = fform
self.estimator = estimator
self.nboot = nboot
self.bw = KDEMultivariate(exog, bw=bw, var_type=var_type).bw
self.sig = self._compute_sig()
def _compute_sig(self):
Y = self.endog
X = self.exog
b = self.estimator(Y, X)
m = self.fform(X, b)
n = np.shape(X)[0]
resid = Y - m
resid = resid - np.mean(resid) # center residuals
self.test_stat = self._compute_test_stat(resid)
sqrt5 = np.sqrt(5.)
fct1 = (1 - sqrt5) / 2.
fct2 = (1 + sqrt5) / 2.
u1 = fct1 * resid
u2 = fct2 * resid
r = fct2 / sqrt5
I_dist = np.empty((self.nboot,1))
for j in range(self.nboot):
u_boot = u2.copy()
prob = np.random.uniform(0,1, size = (n,))
ind = prob < r
u_boot[ind] = u1[ind]
Y_boot = m + u_boot
b_hat = self.estimator(Y_boot, X)
m_hat = self.fform(X, b_hat)
u_boot_hat = Y_boot - m_hat
I_dist[j] = self._compute_test_stat(u_boot_hat)
self.boots_results = I_dist
sig = "Not Significant"
if self.test_stat > mquantiles(I_dist, 0.9):
sig = "*"
if self.test_stat > mquantiles(I_dist, 0.95):
sig = "**"
if self.test_stat > mquantiles(I_dist, 0.99):
sig = "***"
return sig
def _compute_test_stat(self, u):
n = np.shape(u)[0]
XLOO = LeaveOneOut(self.exog)
uLOO = LeaveOneOut(u[:,None]).__iter__()
ival = 0
S2 = 0
for i, X_not_i in enumerate(XLOO):
u_j = next(uLOO)
u_j = np.squeeze(u_j)
# See Bootstrapping procedure on p. 357 in [1]
K = gpke(self.bw, data=-X_not_i, data_predict=-self.exog[i, :],
var_type=self.var_type, tosum=False)
f_i = (u[i] * u_j * K)
assert u_j.shape == K.shape
ival += f_i.sum() # See eq. 12.7 on p. 355 in [1]
S2 += (f_i**2).sum() # See Theorem 12.1 on p.356 in [1]
assert np.size(ival) == 1
assert np.size(S2) == 1
ival *= 1. / (n * (n - 1))
ix_cont = _get_type_pos(self.var_type)[0]
hp = self.bw[ix_cont].prod()
S2 *= 2 * hp / (n * (n - 1))
T = n * ival * np.sqrt(hp / S2)
return T
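# Minimal usage sketch for TestFForm (illustrative only, not part of the
# original module): test a no-intercept linear form y = x*b against the
# nonparametric alternative; the 'normal_reference' bandwidth rule of
# KDEMultivariate is assumed here.
def _example_test_fform():
    np.random.seed(12345)
    nobs = 200
    x = np.random.uniform(-2, 2, size=(nobs, 1))
    y = 2.0 * x[:, 0] + np.random.normal(scale=0.5, size=nobs)
    ols = lambda y_, x_: np.dot(np.linalg.pinv(np.dot(x_.T, x_)),
                                np.dot(x_.T, y_))
    tst = TestFForm(y, x, bw='normal_reference', var_type='c',
                    fform=lambda x_, b: np.dot(x_, b),
                    estimator=ols, nboot=100)
    return tst.test_stat, tst.sig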
class SingleIndexModel(KernelReg):
"""
Single index semiparametric model ``y = g(X * b) + e``.
Parameters
----------
endog : array_like
The dependent variable
exog : array_like
The independent variable(s)
var_type : str
The type of variables in X:
- c: continuous
- o: ordered
- u: unordered
Attributes
----------
b : array_like
The linear coefficients b (betas)
bw : array_like
Bandwidths
Methods
-------
fit(): Computes the fitted values ``E[Y|X] = g(X * b)``
and the marginal effects ``dY/dX``.
References
----------
See chapter on semiparametric models in [1]
Notes
-----
This model resembles the binary choice models. The user knows
that X and b interact linearly, but ``g(X * b)`` is unknown.
In the parametric binary choice models the user usually assumes
some distribution of g() such as normal or logistic.
"""
def __init__(self, endog, exog, var_type):
self.var_type = var_type
self.K = len(var_type)
self.var_type = self.var_type[0]
self.endog = _adjust_shape(endog, 1)
self.exog = _adjust_shape(exog, self.K)
self.nobs = np.shape(self.exog)[0]
self.data_type = self.var_type
self.ckertype = 'gaussian'
self.okertype = 'wangryzin'
self.ukertype = 'aitchisonaitken'
self.func = self._est_loc_linear
self.b, self.bw = self._est_b_bw()
def _est_b_bw(self):
params0 = np.random.uniform(size=(self.K + 1, ))
b_bw = optimize.fmin(self.cv_loo, params0, disp=0)
b = b_bw[0:self.K]
bw = b_bw[self.K:]
bw = self._set_bw_bounds(bw)
return b, bw
def cv_loo(self, params):
# See p. 254 in Textbook
params = np.asarray(params)
b = params[0 : self.K]
bw = params[self.K:]
LOO_X = LeaveOneOut(self.exog)
LOO_Y = LeaveOneOut(self.endog).__iter__()
L = 0
for i, X_not_i in enumerate(LOO_X):
Y = next(LOO_Y)
#print b.shape, np.dot(self.exog[i:i+1, :], b).shape, bw,
G = self.func(bw, endog=Y, exog=-np.dot(X_not_i, b)[:,None],
#data_predict=-b*self.exog[i, :])[0]
data_predict=-np.dot(self.exog[i:i+1, :], b))[0]
#print G.shape
L += (self.endog[i] - G) ** 2
# Note: There might be a way to vectorize this. See p.72 in [1]
return L / self.nobs
def fit(self, data_predict=None):
if data_predict is None:
data_predict = self.exog
else:
data_predict = _adjust_shape(data_predict, self.K)
N_data_predict = np.shape(data_predict)[0]
mean = np.empty((N_data_predict,))
mfx = np.empty((N_data_predict, self.K))
for i in range(N_data_predict):
mean_mfx = self.func(self.bw, self.endog,
np.dot(self.exog, self.b)[:,None],
data_predict=np.dot(data_predict[i:i+1, :],self.b))
mean[i] = mean_mfx[0]
mfx_c = np.squeeze(mean_mfx[1])
mfx[i, :] = mfx_c
return mean, mfx
def __repr__(self):
"""Provide something sane to print."""
repr = "Single Index Model \n"
repr += "Number of variables: K = " + str(self.K) + "\n"
repr += "Number of samples: nobs = " + str(self.nobs) + "\n"
repr += "Variable types: " + self.var_type + "\n"
repr += "BW selection method: cv_ls" + "\n"
repr += "Estimator type: local constant" + "\n"
return repr
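# Minimal usage sketch for SingleIndexModel (illustrative only, not part of
# the original module); the coefficient/bandwidth search starts from a random
# point, so results depend on the seed and fitting can be slow.
def _example_single_index():
    np.random.seed(12345)
    nobs = 200
    x = np.random.uniform(-1, 1, size=(nobs, 2))
    y = np.sin(np.dot(x, [1.0, 2.0])) + 0.1 * np.random.randn(nobs)
    model = SingleIndexModel(endog=y, exog=x, var_type='cc')
    mean, mfx = model.fit()
    return model.b, model.bw, mean, mfx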
class SemiLinear(KernelReg):
"""
Semiparametric partially linear model, ``Y = Xb + g(Z) + e``.
Parameters
----------
endog : array_like
The dependent variable
exog : array_like
The linear component in the regression
exog_nonparametric : array_like
The nonparametric component in the regression
var_type : str
The type of the variables in the nonparametric component;
- c: continuous
- o: ordered
- u: unordered
k_linear : int
The number of variables that comprise the linear component.
Attributes
----------
bw : array_like
Bandwidths for the nonparametric component exog_nonparametric
b : array_like
Coefficients in the linear component
nobs : int
The number of observations.
k_linear : int
The number of variables that comprise the linear component.
Methods
-------
fit
Returns the fitted mean and marginal effects dy/dz
Notes
-----
This model uses only the local constant regression estimator
References
----------
See chapter on Semiparametric Models in [1]
"""
def __init__(self, endog, exog, exog_nonparametric, var_type, k_linear):
self.endog = _adjust_shape(endog, 1)
self.exog = _adjust_shape(exog, k_linear)
self.K = len(var_type)
self.exog_nonparametric = _adjust_shape(exog_nonparametric, self.K)
self.k_linear = k_linear
self.nobs = np.shape(self.exog)[0]
self.var_type = var_type
self.data_type = self.var_type
self.ckertype = 'gaussian'
self.okertype = 'wangryzin'
self.ukertype = 'aitchisonaitken'
self.func = self._est_loc_linear
self.b, self.bw = self._est_b_bw()
def _est_b_bw(self):
"""
Computes the (beta) coefficients and the bandwidths.
Minimizes ``cv_loo`` with respect to ``b`` and ``bw``.
"""
params0 = np.random.uniform(size=(self.k_linear + self.K, ))
b_bw = optimize.fmin(self.cv_loo, params0, disp=0)
b = b_bw[0 : self.k_linear]
bw = b_bw[self.k_linear:]
#bw = self._set_bw_bounds(np.asarray(bw))
return b, bw
def cv_loo(self, params):
"""
Similar to the cross validation leave-one-out estimator.
Modified to reflect the linear components.
Parameters
----------
params : array_like
Vector consisting of the coefficients (b) and the bandwidths (bw).
The first ``k_linear`` elements are the coefficients.
Returns
-------
L : float
The value of the objective function
References
----------
See p.254 in [1]
"""
params = np.asarray(params)
b = params[0 : self.k_linear]
bw = params[self.k_linear:]
LOO_X = LeaveOneOut(self.exog)
LOO_Y = LeaveOneOut(self.endog).__iter__()
LOO_Z = LeaveOneOut(self.exog_nonparametric).__iter__()
Xb = np.dot(self.exog, b)[:,None]
L = 0
for ii, X_not_i in enumerate(LOO_X):
Y = next(LOO_Y)
Z = next(LOO_Z)
Xb_j = np.dot(X_not_i, b)[:,None]
Yx = Y - Xb_j
G = self.func(bw, endog=Yx, exog=-Z,
data_predict=-self.exog_nonparametric[ii, :])[0]
lt = Xb[ii, :] #.sum() # linear term
L += (self.endog[ii] - lt - G) ** 2
return L
def fit(self, exog_predict=None, exog_nonparametric_predict=None):
"""Computes fitted values and marginal effects"""
if exog_predict is None:
exog_predict = self.exog
else:
exog_predict = _adjust_shape(exog_predict, self.k_linear)
if exog_nonparametric_predict is None:
exog_nonparametric_predict = self.exog_nonparametric
else:
exog_nonparametric_predict = _adjust_shape(exog_nonparametric_predict, self.K)
N_data_predict = np.shape(exog_nonparametric_predict)[0]
mean = np.empty((N_data_predict,))
mfx = np.empty((N_data_predict, self.K))
Y = self.endog - np.dot(exog_predict, self.b)[:,None]
for i in range(N_data_predict):
mean_mfx = self.func(self.bw, Y, self.exog_nonparametric,
data_predict=exog_nonparametric_predict[i, :])
mean[i] = mean_mfx[0]
mfx_c = np.squeeze(mean_mfx[1])
mfx[i, :] = mfx_c
return mean, mfx
def __repr__(self):
"""Provide something sane to print."""
repr = "Semiparamatric Partially Linear Model \n"
repr += "Number of variables: K = " + str(self.K) + "\n"
repr += "Number of samples: N = " + str(self.nobs) + "\n"
repr += "Variable types: " + self.var_type + "\n"
repr += "BW selection method: cv_ls" + "\n"
repr += "Estimator type: local constant" + "\n"
return repr
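# Minimal usage sketch for SemiLinear (illustrative only, not part of the
# original module): one linear regressor and one nonparametric regressor.
def _example_semilinear():
    np.random.seed(12345)
    nobs = 200
    x = np.random.uniform(-1, 1, size=(nobs, 1))
    z = np.random.uniform(-2, 2, size=(nobs, 1))
    y = 2.0 * x[:, 0] + np.sin(z[:, 0]) + 0.1 * np.random.randn(nobs)
    model = SemiLinear(endog=y, exog=x, exog_nonparametric=z,
                       var_type='c', k_linear=1)
    mean, mfx = model.fit()
    return model.b, model.bw, mean, mfx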


@ -0,0 +1,576 @@
"""
This module contains the kernels for kernel smoothing.

Hopefully in the future they may be reused/extended for other kernel based
methods.

References
----------
Pointwise Kernel Confidence Bounds (smoothconf)
http://fedc.wiwi.hu-berlin.de/xplore/ebooks/html/anr/anrhtmlframe62.html
"""
# pylint: disable-msg=C0103
# pylint: disable-msg=W0142
# pylint: disable-msg=E1101
# pylint: disable-msg=E0611
from statsmodels.compat.python import lzip, lfilter
import numpy as np
import scipy.integrate
from scipy.special import factorial
from numpy import exp, multiply, square, divide, subtract, inf
class NdKernel:
"""Generic N-dimensial kernel
Parameters
----------
n : int
The number of series for kernel estimates
kernels : list
kernels
Can be constructed from either
a) a list of n kernels which will be treated as
indepent marginals on a gaussian copula (specified by H)
or b) a single univariate kernel which will be applied radially to the
mahalanobis distance defined by H.
In the case of the Gaussian these are both equivalent, and the second constructiong
is prefered.
"""
def __init__(self, n, kernels = None, H = None):
if kernels is None:
kernels = Gaussian()
self._kernels = kernels
self.weights = None
if H is None:
H = np.matrix( np.identity(n))
self._H = H
self._Hrootinv = np.linalg.cholesky( H.I )
def getH(self):
"""Getter for kernel bandwidth, H"""
return self._H
def setH(self, value):
"""Setter for kernel bandwidth, H"""
self._H = value
H = property(getH, setH, doc="Kernel bandwidth matrix")
def density(self, xs, x):
n = len(xs)
#xs = self.in_domain( xs, xs, x )[0]
if len(xs)>0: ## Need to do product of marginal distributions
#w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
#vectorized does not work:
if self.weights is not None:
w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights)/sum(self.weights)
else:
w = np.mean(self((xs-x) * self._Hrootinv )) #transposed
#w = np.mean([self(xd) for xd in ((xs-x) * self._Hrootinv)] ) #transposed
return w
else:
return np.nan
def _kernweight(self, x ):
"""returns the kernel weight for the independent multivariate kernel"""
if isinstance( self._kernels, CustomKernel ):
## Radial case
#d = x.T * x
#x is matrix, 2d, element wise sqrt looks wrong
#d = np.sqrt( x.T * x )
x = np.asarray(x)
#d = np.sqrt( (x * x).sum(-1) )
d = (x * x).sum(-1)
return self._kernels( np.asarray(d) )
def __call__(self, x):
"""
This simply returns the value of the kernel function at x
Does the same as weight if the function is normalised
"""
return self._kernweight(x)
class CustomKernel:
"""
Generic 1D Kernel object.
Can be constructed by selecting a standard named Kernel,
or providing a lambda expression and domain.
The domain allows some algorithms to run faster for finite domain kernels.
"""
# MC: Not sure how this will look in the end - or even still exist.
# Main purpose of this is to allow custom kernels and to allow speed up
# from finite support.
def __init__(self, shape, h = 1.0, domain = None, norm = None):
"""
shape should be a function taking and returning numeric type.
For sanity it should always return positive or zero but this is not
enforced in case you want to do weird things. Bear in mind that the
statistical tests etc. may not be valid for non-positive kernels.
The bandwidth of the kernel is supplied as h.
You may specify a domain as a list of 2 values [min, max], in which case
kernel will be treated as zero outside these values. This will speed up
calculation.
You may also specify the normalisation constant for the supplied Kernel.
If you do, this number will be stored and used as the normalisation
without calculation. It is recommended you do this if you know the
constant, to speed up calculation. In particular if the shape function
provided is already normalised you should provide norm = 1.0.
Warning: I think several calculations assume that the kernel is
normalized. No tests for non-normalized kernel.
"""
self._normconst = norm # a value or None, if None, then calculate
self.domain = domain
self.weights = None
if callable(shape):
self._shape = shape
else:
raise TypeError("shape must be a callable object/function")
self._h = h
self._L2Norm = None
self._kernel_var = None
self._normal_reference_constant = None
self._order = None
def geth(self):
"""Getter for kernel bandwidth, h"""
return self._h
def seth(self, value):
"""Setter for kernel bandwidth, h"""
self._h = value
h = property(geth, seth, doc="Kernel Bandwidth")
def in_domain(self, xs, ys, x):
"""
Returns the filtered (xs, ys) based on the Kernel domain centred on x
"""
# Disable black-list functions: filter used for speed instead of
# list-comprehension
# pylint: disable-msg=W0141
def isInDomain(xy):
"""Used for filter to check if point is in the domain"""
u = (xy[0]-x)/self.h
return np.all((u >= self.domain[0]) & (u <= self.domain[1]))
if self.domain is None:
return (xs, ys)
else:
filtered = lfilter(isInDomain, lzip(xs, ys))
if len(filtered) > 0:
xs, ys = lzip(*filtered)
return (xs, ys)
else:
return ([], [])
def density(self, xs, x):
"""Returns the kernel density estimate for point x based on x-values
xs
"""
xs = np.asarray(xs)
n = len(xs) # before in_domain?
if self.weights is not None:
xs, weights = self.in_domain( xs, self.weights, x )
else:
xs = self.in_domain( xs, xs, x )[0]
xs = np.asarray(xs)
#print 'len(xs)', len(xs), x
if xs.ndim == 1:
xs = xs[:,None]
if len(xs)>0:
h = self.h
if self.weights is not None:
w = 1 / h * np.sum(self((xs-x)/h).T * weights, axis=1)
else:
w = 1. / (h * n) * np.sum(self((xs-x)/h), axis=0)
return w
else:
return np.nan
def density_var(self, density, nobs):
"""approximate pointwise variance for kernel density
not verified
Parameters
----------
density : array_like
pdf of the kernel density
nobs : int
number of observations used in the KDE estimation
Returns
-------
kde_var : ndarray
estimated variance of the density estimate
Notes
-----
This uses the asymptotic normal approximation to the distribution of
the density estimate.
"""
return np.asarray(density) * self.L2Norm / self.h / nobs
def density_confint(self, density, nobs, alpha=0.05):
"""approximate pointwise confidence interval for kernel density
The confidence interval is centered at the estimated density and
ignores the bias of the density estimate.
not verified
Parameters
----------
density : array_like
pdf of the kernel density
nobs : int
number of observations used in the KDE estimation
Returns
-------
conf_int : ndarray
estimated confidence interval of the density estimate, lower bound
in first column and upper bound in second column
Notes
-----
This uses the asymptotic normal approximation to the distribution of
the density estimate. The lower bound can be negative for density
values close to zero.
"""
from scipy import stats
crit = stats.norm.isf(alpha / 2.)
density = np.asarray(density)
half_width = crit * np.sqrt(self.density_var(density, nobs))
conf_int = np.column_stack((density - half_width, density + half_width))
return conf_int
def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
Not expected to be called by the user.
"""
xs, ys = self.in_domain(xs, ys, x)
if len(xs)>0:
w = np.sum(self((xs-x)/self.h))
#TODO: change the below to broadcasting when shape is sorted
v = np.sum([yy*self((xx-x)/self.h) for xx, yy in zip(xs, ys)])
return v / w
else:
return np.nan
def smoothvar(self, xs, ys, x):
"""Returns the kernel smoothing estimate of the variance at point x.
"""
xs, ys = self.in_domain(xs, ys, x)
if len(xs) > 0:
fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
sqresid = square( subtract(ys, fittedvals) )
w = np.sum(self((xs-x)/self.h))
v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
return v / w
else:
return np.nan
def smoothconf(self, xs, ys, x, alpha=0.05):
"""Returns the kernel smoothing estimate with confidence 1sigma bounds
"""
xs, ys = self.in_domain(xs, ys, x)
if len(xs) > 0:
fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
#fittedvals = self.smooth(xs, ys, x) # x or xs in Haerdle
sqresid = square(
subtract(ys, fittedvals)
)
w = np.sum(self((xs-x)/self.h))
#var = sqresid.sum() / (len(sqresid) - 0) # nonlocal var ? JP just trying
v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
var = v / w
sd = np.sqrt(var)
K = self.L2Norm
yhat = self.smooth(xs, ys, x)
from scipy import stats
crit = stats.norm.isf(alpha / 2)
err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
return (yhat - err, yhat, yhat + err)
else:
return (np.nan, np.nan, np.nan)
@property
def L2Norm(self):
"""Returns the integral of the square of the kernal from -inf to inf"""
if self._L2Norm is None:
L2Func = lambda x: (self.norm_const*self._shape(x))**2
if self.domain is None:
self._L2Norm = scipy.integrate.quad(L2Func, -inf, inf)[0]
else:
self._L2Norm = scipy.integrate.quad(L2Func, self.domain[0],
self.domain[1])[0]
return self._L2Norm
@property
def norm_const(self):
"""
Normalising constant for kernel (integral from -inf to inf)
"""
if self._normconst is None:
if self.domain is None:
quadres = scipy.integrate.quad(self._shape, -inf, inf)
else:
quadres = scipy.integrate.quad(self._shape, self.domain[0],
self.domain[1])
self._normconst = 1.0/(quadres[0])
return self._normconst
@property
def kernel_var(self):
"""Returns the second moment of the kernel"""
if self._kernel_var is None:
func = lambda x: x**2 * self.norm_const * self._shape(x)
if self.domain is None:
self._kernel_var = scipy.integrate.quad(func, -inf, inf)[0]
else:
self._kernel_var = scipy.integrate.quad(func, self.domain[0],
self.domain[1])[0]
return self._kernel_var
def moments(self, n):
if n > 2:
msg = "Only first and second moment currently implemented"
raise NotImplementedError(msg)
if n == 1:
return 0
if n == 2:
return self.kernel_var
@property
def normal_reference_constant(self):
"""
Constant used for the Silverman normal reference asymptotic bandwidth
calculation.
C = 2((pi^(1/2)*(nu!)^3 R(k))/(2nu(2nu)!kap_nu(k)^2))^(1/(2nu+1))
nu = kernel order
kap_nu = nu'th moment of kernel
R = kernel roughness (square of L^2 norm)
Note: L2Norm property returns square of norm.
"""
nu = self._order
if not nu == 2:
msg = "Only implemented for second order kernels"
raise NotImplementedError(msg)
if self._normal_reference_constant is None:
C = np.pi**(.5) * factorial(nu)**3 * self.L2Norm
C /= (2 * nu * factorial(2 * nu) * self.moments(nu)**2)
C = 2*C**(1.0/(2*nu+1))
self._normal_reference_constant = C
return self._normal_reference_constant
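# Worked example: for the Gaussian kernel nu = 2, R(k) = L2Norm =
# 1/(2*sqrt(pi)) and kap_2(k) = 1, so the formula gives
# C = 2 * (1/24)**(1/5) = (4/3)**(1/5), approximately 1.0592, which is
# Silverman's normal reference constant.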
def weight(self, x):
"""This returns the normalised weight at distance x"""
return self.norm_const*self._shape(x)
def __call__(self, x):
"""
This simply returns the value of the kernel function at x
Does the same as weight if the function is normalised
"""
return self._shape(x)
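# Minimal sketch of a user-defined kernel (illustrative only, not part of the
# original module): a double-exponential (Laplace) shape; norm is left as
# None, so the normalising constant is computed numerically.
def _example_custom_kernel():
    laplace = CustomKernel(shape=lambda x: np.exp(-np.abs(x)), h=1.0,
                           domain=None, norm=None)
    # the integral of exp(-|x|) over the real line is 2, so norm_const ~ 0.5
    return laplace.norm_const, laplace.L2Norm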
class Uniform(CustomKernel):
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 0.5 * np.ones(x.shape), h=h,
domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = 0.5
self._kernel_var = 1. / 3
self._order = 2
class Triangular(CustomKernel):
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 1 - abs(x), h=h,
domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = 2.0/3.0
self._kernel_var = 1. / 6
self._order = 2
class Epanechnikov(CustomKernel):
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 0.75*(1 - x*x), h=h,
domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = 0.6
self._kernel_var = 0.2
self._order = 2
class Biweight(CustomKernel):
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 0.9375*(1 - x*x)**2, h=h,
domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = 5.0/7.0
self._kernel_var = 1. / 7
self._order = 2
def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
Not expected to be called by the user.
Special implementation optimized for Biweight.
"""
xs, ys = self.in_domain(xs, ys, x)
if len(xs) > 0:
w = np.sum(square(subtract(1, square(divide(subtract(xs, x),
self.h)))))
v = np.sum(multiply(ys, square(subtract(1, square(divide(
subtract(xs, x), self.h))))))
return v / w
else:
return np.nan
def smoothvar(self, xs, ys, x):
"""
Returns the kernel smoothing estimate of the variance at point x.
"""
xs, ys = self.in_domain(xs, ys, x)
if len(xs) > 0:
fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
rs = square(subtract(ys, fittedvals))
w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
self.h)))))
v = np.sum(multiply(rs, square(subtract(1, square(divide(
subtract(xs, x), self.h))))))
return v / w
else:
return np.nan
def smoothconf_(self, xs, ys, x):
"""Returns the kernel smoothing estimate with confidence 1sigma bounds
"""
xs, ys = self.in_domain(xs, ys, x)
if len(xs) > 0:
fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
rs = square(subtract(ys, fittedvals))
w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
self.h)))))
v = np.sum(multiply(rs, square(subtract(1, square(divide(
subtract(xs, x), self.h))))))
var = v / w
sd = np.sqrt(var)
K = self.L2Norm
yhat = self.smooth(xs, ys, x)
err = sd * K / np.sqrt(0.9375 * w * self.h)
return (yhat - err, yhat, yhat + err)
else:
return (np.nan, np.nan, np.nan)
class Triweight(CustomKernel):
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 1.09375*(1 - x*x)**3, h=h,
domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = 350.0/429.0
self._kernel_var = 1. / 9
self._order = 2
class Gaussian(CustomKernel):
"""
Gaussian (Normal) Kernel
K(u) = 1 / (sqrt(2*pi)) exp(-0.5 u**2)
"""
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape = lambda x: 0.3989422804014327 *
np.exp(-x**2/2.0), h = h, domain = None, norm = 1.0)
self._L2Norm = 1.0/(2.0*np.sqrt(np.pi))
self._kernel_var = 1.0
self._order = 2
def smooth(self, xs, ys, x):
"""Returns the kernel smoothing estimate for point x based on x-values
xs and y-values ys.
Not expected to be called by the user.
Special implementation optimized for Gaussian.
"""
w = np.sum(exp(multiply(square(divide(subtract(xs, x),
self.h)),-0.5)))
v = np.sum(multiply(ys, exp(multiply(square(divide(subtract(xs, x),
self.h)), -0.5))))
return v/w
class Cosine(CustomKernel):
"""
Cosine Kernel
K(u) = pi/4 cos(0.5 * pi * u) between -1.0 and 1.0
"""
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 0.78539816339744828 *
np.cos(np.pi/2.0 * x), h=h, domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = np.pi**2/16.0
self._kernel_var = 0.1894305308612978 # = 1 - 8 / np.pi**2
self._order = 2
class Cosine2(CustomKernel):
"""
Cosine2 Kernel
K(u) = 1 + cos(2 * pi * u) between -0.5 and 0.5
Note: this is the same Cosine kernel that Stata uses
"""
def __init__(self, h=1.0):
CustomKernel.__init__(self, shape=lambda x: 1 + np.cos(2.0 * np.pi * x)
, h=h, domain=[-0.5, 0.5], norm = 1.0)
self._L2Norm = 1.5
self._kernel_var = 0.03267274151216444 # = 1/12. - 0.5 / np.pi**2
self._order = 2
class Tricube(CustomKernel):
"""
Tricube Kernel
K(u) = 0.864197530864 * (1 - abs(x)**3)**3 between -1.0 and 1.0
"""
def __init__(self,h=1.0):
CustomKernel.__init__(self,shape=lambda x: 0.864197530864 * (1 - abs(x)**3)**3,
h=h, domain=[-1.0, 1.0], norm = 1.0)
self._L2Norm = 175.0/247.0
self._kernel_var = 35.0/243.0
self._order = 2


@ -0,0 +1,401 @@
"""
This module contains scatterplot smoothers, that is, classes
that generate a smooth fit of a set of (x, y) pairs.
"""
# pylint: disable-msg=C0103
# pylint: disable-msg=W0142
# pylint: disable-msg=E0611
# pylint: disable-msg=E1101
import numpy as np
from . import kernels
class KernelSmoother:
"""
1D Kernel Density Regression/Kernel Smoother
Requires:
x - array_like of x values
y - array_like of y values
Kernel - Kernel object, Default is Gaussian.
"""
def __init__(self, x, y, Kernel = None):
if Kernel is None:
Kernel = kernels.Gaussian()
self.Kernel = Kernel
self.x = np.array(x)
self.y = np.array(y)
def fit(self):
pass
def __call__(self, x):
return np.array([self.predict(xx) for xx in x])
def predict(self, x):
"""
Returns the kernel smoothed prediction at x
If x is a real number then a single value is returned.
Otherwise an attempt is made to cast x to numpy.ndarray and an array of
corresponding y-points is returned.
"""
if np.size(x) == 1: # if isinstance(x, numbers.Real):
return self.Kernel.smooth(self.x, self.y, x)
else:
return np.array([self.Kernel.smooth(self.x, self.y, xx) for xx
in np.array(x)])
def conf(self, x):
"""
Returns the fitted curve and 1-sigma upper and lower point-wise
confidence.
These bounds are based on variance only, and do not include the bias.
If the bandwidth is much larger than the curvature of the underlying
function then the bias could be large.
x is the points on which you want to evaluate the fit and the errors.
Alternatively if x is specified as a positive integer, then the fit and
confidence bands points will be returned after every
xth sample point - so they are closer together where the data
is denser.
"""
if isinstance(x, int):
sorted_x = np.array(self.x)
sorted_x.sort()
confx = sorted_x[::x]
conffit = self.conf(confx)
return (confx, conffit)
else:
return np.array([self.Kernel.smoothconf(self.x, self.y, xx)
for xx in x])
def var(self, x):
return np.array([self.Kernel.smoothvar(self.x, self.y, xx) for xx in x])
def std(self, x):
return np.sqrt(self.var(x))
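# Minimal usage sketch for KernelSmoother (illustrative only, not part of the
# original module):
def _example_kernel_smoother():
    np.random.seed(12345)
    x = np.linspace(0, 2 * np.pi, 100)
    y = np.sin(x) + 0.3 * np.random.randn(100)
    ks = KernelSmoother(x, y, Kernel=kernels.Gaussian(h=0.5))
    yhat = ks(x)          # smoothed fit at the sample points
    band = ks.conf(10)    # (x, (lower, fit, upper)) at every 10th sorted x
    return yhat, band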
class PolySmoother:
"""
Polynomial smoother up to a given order.
Fit based on weighted least squares.
The x values can be specified at instantiation or when called.
This is a 3 liner with OLS or WLS, see test.
It's here as a test smoother for GAM
"""
#JP: heavily adjusted to work as plugin replacement for bspline
# smoother in gam.py initialized by function default_smoother
# Only fixed exceptions; I did not check whether it is statistically
# correct, and I think it is not. There may still be some dimension
# problems, and there were some dimension problems initially.
# TODO: undo adjustments and fix dimensions correctly
# comment: this is just like polyfit with initialization options
# and additional results (OLS on polynomial of x (x is 1d?))
def __init__(self, order, x=None):
#order = 4 # set this because we get knots instead of order
self.order = order
#print order, x.shape
self.coef = np.zeros((order+1,), np.float64)
if x is not None:
if x.ndim > 1:
print('Warning: 2d x detected in PolySmoother init, shape:', x.shape)
x=x[0,:] #check orientation
self.X = np.array([x**i for i in range(order+1)]).T
def df_fit(self):
'''alias of df_model for backwards compatibility
'''
return self.df_model()
def df_model(self):
"""
Degrees of freedom used in the fit.
"""
return self.order + 1
def gram(self, d=None):
#fake for spline imitation
pass
def smooth(self,*args, **kwds):
'''alias for fit, for backwards compatibility,
do we need it with different behavior than fit?
'''
return self.fit(*args, **kwds)
def df_resid(self):
"""
Residual degrees of freedom from last fit.
"""
return self.N - self.order - 1
def __call__(self, x=None):
return self.predict(x=x)
def predict(self, x=None):
if x is not None:
            #if x.ndim > 1: x=x[0,:]  #why this? it should select a column, not a row
if x.ndim > 1:
print('Warning: 2d x detected in PolySmoother predict, shape:', x.shape)
x=x[:,0] #TODO: check and clean this up
X = np.array([(x**i) for i in range(self.order+1)])
else:
X = self.X
#return np.squeeze(np.dot(X.T, self.coef))
#need to check what dimension this is supposed to be
if X.shape[1] == self.coef.shape[0]:
return np.squeeze(np.dot(X, self.coef))#[0]
else:
return np.squeeze(np.dot(X.T, self.coef))#[0]
def fit(self, y, x=None, weights=None):
self.N = y.shape[0]
if y.ndim == 1:
y = y[:,None]
if weights is None or np.isnan(weights).all():
weights = 1
_w = 1
else:
_w = np.sqrt(weights)[:,None]
if x is None:
if not hasattr(self, "X"):
raise ValueError("x needed to fit PolySmoother")
else:
if x.ndim > 1:
                print('Warning: 2d x detected in PolySmoother fit, shape:', x.shape)
#x=x[0,:] #TODO: check orientation, row or col
self.X = np.array([(x**i) for i in range(self.order+1)]).T
#print _w.shape
X = self.X * _w
_y = y * _w#[:,None]
#self.coef = np.dot(L.pinv(X).T, _y[:,None])
#self.coef = np.dot(L.pinv(X), _y)
self.coef = np.linalg.lstsq(X, _y, rcond=-1)[0]
self.params = np.squeeze(self.coef)
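
# Equivalence sketch (added for illustration): the docstring's "three-liner"
# with OLS. PolySmoother(order, x).fit(y) should reproduce the coefficients of
# an OLS regression of y on the Vandermonde basis [1, x, ..., x**order]; the
# helper name is only for this sketch.
def _example_polysmoother_vs_ols(x, y, order=2):
    from statsmodels.regression.linear_model import OLS

    X = np.vander(x, order + 1, increasing=True)   # columns 1, x, ..., x**order
    params_ols = OLS(y, X).fit().params
    pmod = PolySmoother(order, x)
    pmod.fit(y)
    return pmod.params, params_ols                 # should agree up to rounding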
# comment out for now to remove dependency on _hbspline
##class SmoothingSpline(BSpline):
##
## penmax = 30.
##
## def fit(self, y, x=None, weights=None, pen=0.):
## banded = True
##
## if x is None:
## x = self.tau[(self.M-1):-(self.M-1)] # internal knots
##
## if pen == 0.: # cannot use cholesky for singular matrices
## banded = False
##
## if x.shape != y.shape:
## raise ValueError('x and y shape do not agree, by default x are the Bspline\'s internal knots')
##
## bt = self.basis(x)
## if pen >= self.penmax:
## pen = self.penmax
##
## if weights is None:
## weights = np.array(1.)
##
## wmean = weights.mean()
## _w = np.sqrt(weights / wmean)
## bt *= _w
##
## # throw out rows with zeros (this happens at boundary points!)
##
## mask = np.flatnonzero(1 - np.alltrue(np.equal(bt, 0), axis=0))
##
## bt = bt[:, mask]
## y = y[mask]
##
## self.df_total = y.shape[0]
##
## if bt.shape[1] != y.shape[0]:
## raise ValueError("some x values are outside range of B-spline knots")
## bty = np.dot(bt, _w * y)
## self.N = y.shape[0]
## if not banded:
## self.btb = np.dot(bt, bt.T)
## _g = _band2array(self.g, lower=1, symmetric=True)
## self.coef, _, self.rank = L.lstsq(self.btb + pen*_g, bty)[0:3]
## self.rank = min(self.rank, self.btb.shape[0])
## else:
## self.btb = np.zeros(self.g.shape, np.float64)
## nband, nbasis = self.g.shape
## for i in range(nbasis):
## for k in range(min(nband, nbasis-i)):
## self.btb[k, i] = (bt[i] * bt[i+k]).sum()
##
## bty.shape = (1, bty.shape[0])
## self.chol, self.coef = solveh_banded(self.btb +
## pen*self.g,
## bty, lower=1)
##
## self.coef = np.squeeze(self.coef)
## self.resid = np.sqrt(wmean) * (y * _w - np.dot(self.coef, bt))
## self.pen = pen
##
## def gcv(self):
## """
## Generalized cross-validation score of current fit.
## """
##
## norm_resid = (self.resid**2).sum()
## return norm_resid / (self.df_total - self.trace())
##
## def df_resid(self):
## """
## self.N - self.trace()
##
## where self.N is the number of observations of last fit.
## """
##
## return self.N - self.trace()
##
## def df_fit(self):
## """
## = self.trace()
##
## How many degrees of freedom used in the fit?
## """
## return self.trace()
##
## def trace(self):
## """
## Trace of the smoothing matrix S(pen)
## """
##
## if self.pen > 0:
## _invband = _hbspline.invband(self.chol.copy())
## tr = _trace_symbanded(_invband, self.btb, lower=1)
## return tr
## else:
## return self.rank
##
##class SmoothingSplineFixedDF(SmoothingSpline):
## """
## Fit smoothing spline with approximately df degrees of freedom
## used in the fit, i.e. so that self.trace() is approximately df.
##
## In general, df must be greater than the dimension of the null space
## of the Gram inner product. For cubic smoothing splines, this means
## that df > 2.
## """
##
## target_df = 5
##
## def __init__(self, knots, order=4, coef=None, M=None, target_df=None):
## if target_df is not None:
## self.target_df = target_df
## BSpline.__init__(self, knots, order=order, coef=coef, M=M)
## self.target_reached = False
##
## def fit(self, y, x=None, df=None, weights=None, tol=1.0e-03):
##
## df = df or self.target_df
##
## apen, bpen = 0, 1.0e-03
## olddf = y.shape[0] - self.m
##
## if not self.target_reached:
## while True:
## curpen = 0.5 * (apen + bpen)
## SmoothingSpline.fit(self, y, x=x, weights=weights, pen=curpen)
## curdf = self.trace()
## if curdf > df:
## apen, bpen = curpen, 2 * curpen
## else:
## apen, bpen = apen, curpen
## if apen >= self.penmax:
## raise ValueError("penalty too large, try setting penmax higher or decreasing df")
## if np.fabs(curdf - df) / df < tol:
## self.target_reached = True
## break
## else:
## SmoothingSpline.fit(self, y, x=x, weights=weights, pen=self.pen)
##
##class SmoothingSplineGCV(SmoothingSpline):
##
## """
## Fit smoothing spline trying to optimize GCV.
##
## Try to find a bracketing interval for scipy.optimize.golden
## based on bracket.
##
## It is probably best to use target_df instead, as it is
## sometimes difficult to find a bracketing interval.
##
## """
##
## def fit(self, y, x=None, weights=None, tol=1.0e-03,
## bracket=(0,1.0e-03)):
##
## def _gcv(pen, y, x):
## SmoothingSpline.fit(y, x=x, pen=np.exp(pen), weights=weights)
## a = self.gcv()
## return a
##
## a = golden(_gcv, args=(y,x), brack=(-100,20), tol=tol)
##
##def _trace_symbanded(a,b, lower=0):
## """
## Compute the trace(a*b) for two upper or lower banded real symmetric matrices.
## """
##
## if lower:
## t = _zero_triband(a * b, lower=1)
## return t[0].sum() + 2 * t[1:].sum()
## else:
## t = _zero_triband(a * b, lower=0)
## return t[-1].sum() + 2 * t[:-1].sum()
##
##
##
##def _zero_triband(a, lower=0):
## """
## Zero out unnecessary elements of a real symmetric banded matrix.
## """
##
## nrow, ncol = a.shape
## if lower:
## for i in range(nrow): a[i,(ncol-i):] = 0.
## else:
## for i in range(nrow): a[i,0:i] = 0.
## return a

View File

@ -0,0 +1,57 @@
"""
Created on Fri Mar 04 07:36:28 2011
@author: Mike
"""
import numpy as np
class kdetest:
Hpi = np.matrix([[ 0.05163034, 0.5098923 ],
[0.50989228, 8.8822365 ]])
faithfulData = dict(
eruptions=[
3.6, 1.8, 3.333, 2.283, 4.533, 2.883, 4.7, 3.6, 1.95, 4.35, 1.833, 3.917,
4.2, 1.75, 4.7, 2.167, 1.75, 4.8, 1.6, 4.25, 1.8, 1.75, 3.45, 3.067, 4.533,
3.6, 1.967, 4.083, 3.85, 4.433, 4.3, 4.467, 3.367, 4.033, 3.833, 2.017, 1.867,
4.833, 1.833, 4.783, 4.35, 1.883, 4.567, 1.75, 4.533, 3.317, 3.833, 2.1, 4.633,
2, 4.8, 4.716, 1.833, 4.833, 1.733, 4.883, 3.717, 1.667, 4.567, 4.317, 2.233, 4.5,
1.75, 4.8, 1.817, 4.4, 4.167, 4.7, 2.067, 4.7, 4.033, 1.967, 4.5, 4, 1.983, 5.067,
2.017, 4.567, 3.883, 3.6, 4.133, 4.333, 4.1, 2.633, 4.067, 4.933, 3.95, 4.517, 2.167,
4, 2.2, 4.333, 1.867, 4.817, 1.833, 4.3, 4.667, 3.75, 1.867, 4.9, 2.483, 4.367, 2.1, 4.5,
4.05, 1.867, 4.7, 1.783, 4.85, 3.683, 4.733, 2.3, 4.9, 4.417, 1.7, 4.633, 2.317, 4.6,
1.817, 4.417, 2.617, 4.067, 4.25, 1.967, 4.6, 3.767, 1.917, 4.5, 2.267, 4.65, 1.867,
4.167, 2.8, 4.333, 1.833, 4.383, 1.883, 4.933, 2.033, 3.733, 4.233, 2.233, 4.533,
4.817, 4.333, 1.983, 4.633, 2.017, 5.1, 1.8, 5.033, 4, 2.4, 4.6, 3.567, 4, 4.5, 4.083,
1.8, 3.967, 2.2, 4.15, 2, 3.833, 3.5, 4.583, 2.367, 5, 1.933, 4.617, 1.917, 2.083,
4.583, 3.333, 4.167, 4.333, 4.5, 2.417, 4, 4.167, 1.883, 4.583, 4.25, 3.767, 2.033,
4.433, 4.083, 1.833, 4.417, 2.183, 4.8, 1.833, 4.8, 4.1, 3.966, 4.233, 3.5, 4.366,
2.25, 4.667, 2.1, 4.35, 4.133, 1.867, 4.6, 1.783, 4.367, 3.85, 1.933, 4.5, 2.383,
4.7, 1.867, 3.833, 3.417, 4.233, 2.4, 4.8, 2, 4.15, 1.867, 4.267, 1.75, 4.483, 4,
4.117, 4.083, 4.267, 3.917, 4.55, 4.083, 2.417, 4.183, 2.217, 4.45, 1.883, 1.85,
4.283, 3.95, 2.333, 4.15, 2.35, 4.933, 2.9, 4.583, 3.833, 2.083, 4.367, 2.133, 4.35,
2.2, 4.45, 3.567, 4.5, 4.15, 3.817, 3.917, 4.45, 2, 4.283, 4.767, 4.533, 1.85, 4.25,
1.983, 2.25, 4.75, 4.117, 2.15, 4.417, 1.817, 4.467],
waiting=[
79, 54, 74, 62, 85, 55, 88, 85, 51, 85, 54, 84, 78, 47, 83, 52,
62, 84, 52, 79, 51, 47, 78, 69, 74, 83, 55, 76, 78, 79, 73, 77,
66, 80, 74, 52, 48, 80, 59, 90, 80, 58, 84, 58, 73, 83, 64, 53,
82, 59, 75, 90, 54, 80, 54, 83, 71, 64, 77, 81, 59, 84, 48, 82,
60, 92, 78, 78, 65, 73, 82, 56, 79, 71, 62, 76, 60, 78, 76, 83,
75, 82, 70, 65, 73, 88, 76, 80, 48, 86, 60, 90, 50, 78, 63, 72,
84, 75, 51, 82, 62, 88, 49, 83, 81, 47, 84, 52, 86, 81, 75, 59,
89, 79, 59, 81, 50, 85, 59, 87, 53, 69, 77, 56, 88, 81, 45, 82,
55, 90, 45, 83, 56, 89, 46, 82, 51, 86, 53, 79, 81, 60, 82, 77,
76, 59, 80, 49, 96, 53, 77, 77, 65, 81, 71, 70, 81, 93, 53, 89,
45, 86, 58, 78, 66, 76, 63, 88, 52, 93, 49, 57, 77, 68, 81, 81,
73, 50, 85, 74, 55, 77, 83, 83, 51, 78, 84, 46, 83, 55, 81, 57,
76, 84, 77, 81, 87, 77, 51, 78, 60, 82, 91, 53, 78, 46, 77, 84,
49, 83, 71, 80, 49, 75, 64, 76, 53, 94, 55, 76, 50, 82, 54, 75,
78, 79, 78, 78, 70, 79, 70, 54, 86, 50, 90, 54, 54, 77, 79, 64,
75, 47, 86, 63, 85, 82, 57, 82, 67, 74, 54, 83, 73, 73, 88, 80,
71, 83, 56, 79, 78, 84, 58, 83, 43, 60, 75, 81, 46, 90, 46, 74]
)

View File

@ -0,0 +1,87 @@
"""Example for gam.AdditiveModel and PolynomialSmoother
This example was written as a test case.
The data generating process is chosen so the parameters are well identified
and estimated.
Created on Fri Nov 04 13:45:43 2011
Author: Josef Perktold
"""
from statsmodels.compat.python import lrange
import numpy as np
from statsmodels.sandbox.gam import AdditiveModel
from statsmodels.regression.linear_model import OLS
np.random.seed(8765993)
#seed is chosen for nice result, not randomly
#other seeds are pretty off in the prediction
#DGP: simple polynomial
order = 3
sigma_noise = 0.5
nobs = 1000  #with 1000, OLS and AdditiveModel agree in params at 2 decimals
lb, ub = -3.5, 4#2.5
x1 = np.linspace(lb, ub, nobs)
x2 = np.sin(2*x1)
x = np.column_stack((x1/x1.max()*2, x2))
exog = (x[:,:,None]**np.arange(order+1)[None, None, :]).reshape(nobs, -1)
idx = lrange((order+1)*2)
del idx[order+1]
exog_reduced = exog[:,idx] #remove duplicate constant
y_true = exog.sum(1) / 2.
z = y_true #alias check
d = x
y = y_true + sigma_noise * np.random.randn(nobs)
example = 1
if example == 1:
m = AdditiveModel(d)
m.fit(y)
y_pred = m.results.predict(d)
for ss in m.smoothers:
print(ss.params)
res_ols = OLS(y, exog_reduced).fit()
print(res_ols.params)
#from numpy.testing import assert_almost_equal
#assert_almost_equal(y_pred, res_ols.fittedvalues, 3)
if example > 0:
import matplotlib.pyplot as plt
plt.figure()
plt.plot(exog)
y_pred = m.results.mu# + m.results.alpha #m.results.predict(d)
plt.figure()
plt.subplot(2,2,1)
plt.plot(y, '.', alpha=0.25)
plt.plot(y_true, 'k-', label='true')
    plt.plot(res_ols.fittedvalues, 'g-', label='OLS', lw=2, alpha=.7)
plt.plot(y_pred, 'r-', label='AM')
plt.legend(loc='upper left')
plt.title('gam.AdditiveModel')
counter = 2
for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:,0], x[:,1]]):
sortidx = np.argsort(xx)
#plt.figure()
plt.subplot(2, 2, counter)
plt.plot(xx[sortidx], y[sortidx], '.', alpha=0.25)
plt.plot(xx[sortidx], y_true[sortidx], 'k.', label='true', lw=2)
plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='AM')
plt.legend(loc='upper left')
plt.title('gam.AdditiveModel ' + ii)
counter += 1
plt.show()

View File

@ -0,0 +1,132 @@
"""Example for GAM with Poisson Model and PolynomialSmoother
This example was written as a test case.
The data generating process is chosen so the parameters are well identified
and estimated.
Created on Fri Nov 04 13:45:43 2011
Author: Josef Perktold
"""
from statsmodels.compat.python import lrange
import time
import numpy as np
from scipy import stats
from statsmodels.sandbox.gam import Model as GAM
from statsmodels.genmod.families import family
from statsmodels.genmod.generalized_linear_model import GLM
np.seterr(all='raise')
np.random.seed(8765993)
#seed is chosen for nice result, not randomly
#other seeds are pretty off in the prediction or end in overflow
#DGP: simple polynomial
order = 3
sigma_noise = 0.1
nobs = 1000
#lb, ub = -0.75, 3#1.5#0.75 #2.5
lb, ub = -3.5, 3
x1 = np.linspace(lb, ub, nobs)
x2 = np.sin(2*x1)
x = np.column_stack((x1/x1.max()*1, 1.*x2))
exog = (x[:,:,None]**np.arange(order+1)[None, None, :]).reshape(nobs, -1)
idx = lrange((order+1)*2)
del idx[order+1]
exog_reduced = exog[:,idx] #remove duplicate constant
y_true = exog.sum(1) #/ 4.
z = y_true #alias check
d = x
y = y_true + sigma_noise * np.random.randn(nobs)
example = 3
if example == 2:
print("binomial")
f = family.Binomial()
mu_true = f.link.inverse(z)
#b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)])
b = np.asarray([stats.bernoulli.rvs(p) for p in f.link.inverse(z)])
b.shape = y.shape
m = GAM(b, d, family=f)
toc = time.time()
m.fit(b)
tic = time.time()
print(tic-toc)
#for plotting
yp = f.link.inverse(y)
p = b
if example == 3:
print("Poisson")
f = family.Poisson()
#y = y/y.max() * 3
yp = f.link.inverse(z)
p = np.asarray([stats.poisson.rvs(val) for val in f.link.inverse(z)],
float)
p.shape = y.shape
m = GAM(p, d, family=f)
toc = time.time()
m.fit(p)
tic = time.time()
print(tic-toc)
for ss in m.smoothers:
print(ss.params)
if example > 1:
import matplotlib.pyplot as plt
plt.figure()
for i in np.array(m.history[2:15:3]):
plt.plot(i.T)
plt.figure()
plt.plot(exog)
#plt.plot(p, '.', lw=2)
plt.plot(y_true, lw=2)
y_pred = m.results.mu # + m.results.alpha #m.results.predict(d)
plt.figure()
plt.subplot(2,2,1)
plt.plot(p, '.')
plt.plot(yp, 'b-', label='true')
plt.plot(y_pred, 'r-', label='GAM')
plt.legend(loc='upper left')
plt.title('gam.GAM Poisson')
counter = 2
for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:,0], x[:,1]]):
sortidx = np.argsort(xx)
#plt.figure()
plt.subplot(2, 2, counter)
plt.plot(xx[sortidx], p[sortidx], 'k.', alpha=0.5)
plt.plot(xx[sortidx], yp[sortidx], 'b.', label='true')
plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='GAM')
plt.legend(loc='upper left')
plt.title('gam.GAM Poisson ' + ii)
counter += 1
res = GLM(p, exog_reduced, family=f).fit()
#plot component, compared to true component
x1 = x[:,0]
x2 = x[:,1]
f1 = exog[:,:order+1].sum(1) - 1 #take out constant
f2 = exog[:,order+1:].sum(1) - 1
plt.figure()
    #Note: need to correct for the constant, which is indeterminately distributed
#plt.plot(x1, m.smoothers[0](x1)-m.smoothers[0].params[0]+1, 'r')
#better would be subtract f(0) m.smoothers[0](np.array([0]))
plt.plot(x1, f1, linewidth=2)
plt.plot(x1, m.smoothers[0](x1)-m.smoothers[0].params[0], 'r')
plt.figure()
plt.plot(x2, f2, linewidth=2)
plt.plot(x2, m.smoothers[1](x2)-m.smoothers[1].params[0], 'r')
plt.show()

View File

@ -0,0 +1,61 @@
"""
Created on Fri Nov 04 10:51:39 2011
@author: josef
"""
import numpy as np
from statsmodels.sandbox.nonparametric import smoothers
from statsmodels.regression.linear_model import OLS, WLS
#DGP: simple polynomial
order = 3
sigma_noise = 0.5
nobs = 100
lb, ub = -1, 2
x = np.linspace(lb, ub, nobs)
x = np.sin(x)
exog = x[:,None]**np.arange(order+1)
y_true = exog.sum(1)
y = y_true + sigma_noise * np.random.randn(nobs)
#xind = np.argsort(x)
pmod = smoothers.PolySmoother(2, x)
pmod.fit(y) #no return
y_pred = pmod.predict(x)
error = y - y_pred
mse = (error*error).mean()
print(mse)
res_ols = OLS(y, exog[:,:3]).fit()
print(np.squeeze(pmod.coef) - res_ols.params)
weights = np.ones(nobs)
weights[:nobs//3] = 0.1
weights[-nobs//5:] = 2
pmodw = smoothers.PolySmoother(2, x)
pmodw.fit(y, weights=weights) #no return
y_predw = pmodw.predict(x)
error = y - y_predw
mse = (error*error).mean()
print(mse)
res_wls = WLS(y, exog[:,:3], weights=weights).fit()
print(np.squeeze(pmodw.coef) - res_wls.params)
doplot = 1
if doplot:
import matplotlib.pyplot as plt
plt.plot(y, '.')
plt.plot(y_true, 'b-', label='true')
plt.plot(y_pred, '-', label='poly')
plt.plot(y_predw, '-', label='poly -w')
plt.legend(loc='upper left')
plt.close()
#plt.show()

View File

@ -0,0 +1,78 @@
import numpy as np
import numpy.testing as npt
from statsmodels.sandbox.nonparametric.kernel_extras import SemiLinear
class KernelExtrasTestBase:
@classmethod
def setup_class(cls):
nobs = 60
np.random.seed(123456)
cls.o = np.random.binomial(2, 0.7, size=(nobs, 1))
cls.o2 = np.random.binomial(3, 0.7, size=(nobs, 1))
cls.c1 = np.random.normal(size=(nobs, 1))
cls.c2 = np.random.normal(10, 1, size=(nobs, 1))
cls.c3 = np.random.normal(10, 2, size=(nobs, 1))
cls.noise = np.random.normal(size=(nobs, 1))
b0 = 0.3
b1 = 1.2
b2 = 3.7 # regression coefficients
cls.y = b0 + b1 * cls.c1 + b2 * cls.c2 + cls.noise
cls.y2 = b0 + b1 * cls.c1 + b2 * cls.c2 + cls.o + cls.noise
# Italy data from R's np package (the first 50 obs) R>> data (Italy)
cls.Italy_gdp = \
[8.556, 12.262, 9.587, 8.119, 5.537, 6.796, 8.638,
6.483, 6.212, 5.111, 6.001, 7.027, 4.616, 3.922,
4.688, 3.957, 3.159, 3.763, 3.829, 5.242, 6.275,
8.518, 11.542, 9.348, 8.02, 5.527, 6.865, 8.666,
6.672, 6.289, 5.286, 6.271, 7.94, 4.72, 4.357,
4.672, 3.883, 3.065, 3.489, 3.635, 5.443, 6.302,
9.054, 12.485, 9.896, 8.33, 6.161, 7.055, 8.717,
6.95]
cls.Italy_year = \
[1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951,
1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1952,
1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952,
1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1953, 1953,
1953, 1953, 1953, 1953, 1953, 1953]
# OECD panel data from NP R>> data(oecdpanel)
cls.growth = \
[-0.0017584, 0.00740688, 0.03424461, 0.03848719, 0.02932506,
0.03769199, 0.0466038, 0.00199456, 0.03679607, 0.01917304,
-0.00221, 0.00787269, 0.03441118, -0.0109228, 0.02043064,
-0.0307962, 0.02008947, 0.00580313, 0.00344502, 0.04706358,
0.03585851, 0.01464953, 0.04525762, 0.04109222, -0.0087903,
0.04087915, 0.04551403, 0.036916, 0.00369293, 0.0718669,
0.02577732, -0.0130759, -0.01656641, 0.00676429, 0.08833017,
0.05092105, 0.02005877, 0.00183858, 0.03903173, 0.05832116,
0.0494571, 0.02078484, 0.09213897, 0.0070534, 0.08677202,
0.06830603, -0.00041, 0.0002856, 0.03421225, -0.0036825]
cls.oecd = \
[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
0, 0, 0, 0]
class TestSemiLinear(KernelExtrasTestBase):
def test_basic(self):
nobs = 300
np.random.seed(1234)
C1 = np.random.normal(0,2, size=(nobs, ))
C2 = np.random.normal(2, 1, size=(nobs, ))
e = np.random.normal(size=(nobs, ))
b1 = 1.3
b2 = -0.7
Y = b1 * C1 + np.exp(b2 * C2) + e
model = SemiLinear(endog=[Y], exog=[C1], exog_nonparametric=[C2],
var_type='c', k_linear=1)
b_hat = np.squeeze(model.b)
# Only tests for the linear part of the regression
# Currently does not work well with the nonparametric part
# Needs some more work
npt.assert_allclose(b1, b_hat, rtol=0.1)

View File

@ -0,0 +1,110 @@
"""
Created on Fri Nov 04 10:51:39 2011
Author: Josef Perktold
License: BSD-3
"""
import numpy as np
from numpy.testing import assert_almost_equal, assert_equal
from statsmodels.sandbox.nonparametric import smoothers
from statsmodels.regression.linear_model import OLS, WLS
class CheckSmoother:
def test_predict(self):
assert_almost_equal(self.res_ps.predict(self.x),
self.res2.fittedvalues, decimal=13)
assert_almost_equal(self.res_ps.predict(self.x[:10]),
self.res2.fittedvalues[:10], decimal=13)
def test_coef(self):
#TODO: check dim of coef
assert_almost_equal(self.res_ps.coef.ravel(),
self.res2.params, decimal=14)
def test_df(self):
#TODO: make into attributes
assert_equal(self.res_ps.df_model(), self.res2.df_model+1) #with const
assert_equal(self.res_ps.df_fit(), self.res2.df_model+1) #alias
assert_equal(self.res_ps.df_resid(), self.res2.df_resid)
class BasePolySmoother:
@classmethod
def setup_class(cls):
#DGP: simple polynomial
order = 3
sigma_noise = 0.5
nobs = 100
lb, ub = -1, 2
cls.x = x = np.linspace(lb, ub, nobs)
cls.exog = exog = x[:,None]**np.arange(order+1)
y_true = exog.sum(1)
np.random.seed(987567)
cls.y = y = y_true + sigma_noise * np.random.randn(nobs)
class TestPolySmoother1(BasePolySmoother, CheckSmoother):
@classmethod
def setup_class(cls):
super().setup_class() #initialize DGP
y, x, exog = cls.y, cls.x, cls.exog
#use order = 2 in regression
pmod = smoothers.PolySmoother(2, x)
pmod.fit(y) #no return
cls.res_ps = pmod
cls.res2 = OLS(y, exog[:,:2+1]).fit()
class TestPolySmoother2(BasePolySmoother, CheckSmoother):
@classmethod
def setup_class(cls):
super().setup_class() #initialize DGP
y, x, exog = cls.y, cls.x, cls.exog
#use order = 3 in regression
pmod = smoothers.PolySmoother(3, x)
#pmod.fit(y) #no return
pmod.smooth(y) #no return, use alias for fit
cls.res_ps = pmod
cls.res2 = OLS(y, exog[:,:3+1]).fit()
class TestPolySmoother3(BasePolySmoother, CheckSmoother):
@classmethod
def setup_class(cls):
super().setup_class() #initialize DGP
y, x, exog = cls.y, cls.x, cls.exog
nobs = y.shape[0]
weights = np.ones(nobs)
weights[:nobs//3] = 0.1
weights[-nobs//5:] = 2
#use order = 2 in regression
pmod = smoothers.PolySmoother(2, x)
pmod.fit(y, weights=weights) #no return
cls.res_ps = pmod
cls.res2 = WLS(y, exog[:,:2+1], weights=weights).fit()
if __name__ == '__main__':
t1 = TestPolySmoother1()
t1.test_predict()
t1.test_coef()
    t1.test_df()
t3 = TestPolySmoother3()
t3.test_predict()