reconnect moved files to git repo
@@ -0,0 +1,560 @@
# some cut and paste characters are not ASCII
'''density estimation based on orthogonal polynomials


Author: Josef Perktold
Created: 2011-05-17
License: BSD

Two versions work: Fourier (FPoly) and chebyshev T (ChebyTPoly);
hermite polynomials (HPoly) also work.
Other versions still need normalization.


TODO:

* check the fourier case again: the basis is orthonormal,
  but needs offsetfact = 0 and does not integrate to 1; rescaled it looks good
* hermite: works, but DensityOrthoPoly currently requires finite bounds
  I use it with offsetfactor 0.5 in the example
* not implemented methods:
  - add bona fide density correction
  - add transformation to the domain of the polynomial base - DONE
    possible problem: what is the behavior at the boundary;
    offsetfact requires more work, check different cases, add as option
    (moved to the polynomial class by default, as an attribute)
* convert examples to test cases
* need examples with large density on the boundary, beta ?
* organize the poly classes in a separate module, check new numpy.polynomials,
  polyvander
* MISE measures, order selection, ...


enhancements:
  * other polynomial bases: especially for open and half-open support
  * wavelets
  * local or piecewise approximations


'''
from scipy import stats, integrate, special

import numpy as np


sqr2 = np.sqrt(2.)


class FPoly:
|
||||
'''Orthonormal (for weight=1) Fourier Polynomial on [0,1]
|
||||
|
||||
orthonormal polynomial, but the density needs a correction factor
for which I do not see the analytical expression
|
||||
|
||||
parameterization on [0,1] from
|
||||
|
||||
Sam Efromovich: Orthogonal series density estimation,
|
||||
2010 John Wiley & Sons, Inc. WIREs Comp Stat 2010 2 467-476
|
||||
|
||||
|
||||
'''
|
||||
|
||||
def __init__(self, order):
|
||||
self.order = order
|
||||
self.domain = (0, 1)
|
||||
self.intdomain = self.domain
|
||||
|
||||
def __call__(self, x):
|
||||
if self.order == 0:
|
||||
return np.ones_like(x)
|
||||
else:
|
||||
return sqr2 * np.cos(np.pi * self.order * x)
|
||||
|
||||
class F2Poly:
|
||||
'''Orthogonal (for weight=1) Fourier Polynomial on [0,pi]
|
||||
|
||||
is orthogonal but first component does not square-integrate to 1
|
||||
final result seems to need a correction factor of sqrt(pi)
|
||||
_corfactor = sqrt(pi) from integrating the density
|
||||
|
||||
Parameterization on [0, pi] from
|
||||
|
||||
Peter Hall, Cross-Validation and the Smoothing of Orthogonal Series Density
|
||||
Estimators, JOURNAL OF MULTIVARIATE ANALYSIS 21, 189-206 (1987)
|
||||
|
||||
'''
|
||||
|
||||
def __init__(self, order):
|
||||
self.order = order
|
||||
self.domain = (0, np.pi)
|
||||
self.intdomain = self.domain
|
||||
self.offsetfactor = 0
|
||||
|
||||
def __call__(self, x):
|
||||
if self.order == 0:
|
||||
return np.ones_like(x) / np.sqrt(np.pi)
|
||||
else:
|
||||
return sqr2 * np.cos(self.order * x) / np.sqrt(np.pi)
|
||||
|
||||
class ChebyTPoly:
|
||||
'''Orthonormal (for weight=1) Chebychev Polynomial on (-1,1)
|
||||
|
||||
|
||||
Notes
|
||||
-----
|
||||
integration requires staying away from the boundary, offsetfactor > 0;
maybe this implies that we cannot use it for densities that are > 0 at
the boundary ???

or maybe there is a mistake close to the boundary; sometimes the integration works.
|
||||
|
||||
'''
|
||||
|
||||
def __init__(self, order):
|
||||
self.order = order
|
||||
from scipy.special import chebyt
|
||||
self.poly = chebyt(order)
|
||||
self.domain = (-1, 1)
|
||||
self.intdomain = (-1+1e-6, 1-1e-6)
|
||||
#not sure if I need this, in integration nans are possible on the boundary
|
||||
self.offsetfactor = 0.01 #required for integration
|
||||
|
||||
|
||||
def __call__(self, x):
|
||||
if self.order == 0:
|
||||
return np.ones_like(x) / (1-x**2)**(1/4.) /np.sqrt(np.pi)
|
||||
|
||||
else:
|
||||
return self.poly(x) / (1-x**2)**(1/4.) /np.sqrt(np.pi) *np.sqrt(2)
|
||||
|
||||
|
||||
logpi2 = np.log(np.pi)/2
|
||||
|
||||
class HPoly:
|
||||
'''Orthonormal (for weight=1) Hermite Polynomial, uses finite bounds
|
||||
|
||||
for current use with DensityOrthoPoly domain is defined as [-6,6]
|
||||
|
||||
'''
|
||||
def __init__(self, order):
|
||||
self.order = order
|
||||
from scipy.special import hermite
|
||||
self.poly = hermite(order)
|
||||
self.domain = (-6, +6)
|
||||
self.offsetfactor = 0.5 # note this is
|
||||
|
||||
def __call__(self, x):
|
||||
k = self.order
|
||||
|
||||
lnfact = -(1./2)*(k*np.log(2.) + special.gammaln(k+1) + logpi2) - x*x/2
|
||||
fact = np.exp(lnfact)
|
||||
|
||||
return self.poly(x) * fact
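

# The next helper is an illustrative addition (not in the original file); it
# mirrors the orthonormality check done in the __main__ section below: the
# inner products of the first n Hermite basis functions over the finite
# domain should be close to the identity matrix.
def _check_hpoly_orthonormality(n=5):
    hpolys = [HPoly(i) for i in range(n)]
    inn, err = inner_cont(hpolys, -6, 6)
    # maximum deviation from orthonormality
    return np.max(np.abs(inn - np.eye(n)))
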
|
||||
|
||||
def polyvander(x, polybase, order=5):
|
||||
polyarr = np.column_stack([polybase(i)(x) for i in range(order)])
|
||||
return polyarr
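

# Illustrative usage sketch (added, not part of the original module): evaluate
# the first four Fourier basis functions on a small grid; rows correspond to
# grid points and columns to basis functions.
def _example_polyvander():
    x = np.linspace(0, 1, 5)
    vander = polyvander(x, FPoly, order=4)
    return vander    # shape (5, 4)
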
|
||||
|
||||
def inner_cont(polys, lower, upper, weight=None):
|
||||
'''inner product of continuous function (with weight=1)
|
||||
|
||||
Parameters
|
||||
----------
|
||||
polys : list of callables
|
||||
polynomial instances
|
||||
lower : float
|
||||
lower integration limit
|
||||
upper : float
|
||||
upper integration limit
|
||||
weight : callable or None
|
||||
weighting function
|
||||
|
||||
Returns
|
||||
-------
|
||||
innp : ndarray
|
||||
symmetric 2d square array with innerproduct of all function pairs
|
||||
err : ndarray
|
||||
numerical error estimate from scipy.integrate.quad, same dimension as innp
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.special import chebyt
|
||||
>>> polys = [chebyt(i) for i in range(4)]
|
||||
>>> r, e = inner_cont(polys, -1, 1)
|
||||
>>> r
|
||||
array([[ 2. , 0. , -0.66666667, 0. ],
|
||||
[ 0. , 0.66666667, 0. , -0.4 ],
|
||||
[-0.66666667, 0. , 0.93333333, 0. ],
|
||||
[ 0. , -0.4 , 0. , 0.97142857]])
|
||||
|
||||
'''
|
||||
n_polys = len(polys)
|
||||
innerprod = np.empty((n_polys, n_polys))
|
||||
innerprod.fill(np.nan)
|
||||
interr = np.zeros((n_polys, n_polys))
|
||||
|
||||
for i in range(n_polys):
|
||||
for j in range(i+1):
|
||||
p1 = polys[i]
|
||||
p2 = polys[j]
|
||||
if weight is not None:
|
||||
innp, err = integrate.quad(lambda x: p1(x)*p2(x)*weight(x),
|
||||
lower, upper)
|
||||
else:
|
||||
innp, err = integrate.quad(lambda x: p1(x)*p2(x), lower, upper)
|
||||
innerprod[i,j] = innp
|
||||
interr[i,j] = err
|
||||
if not i == j:
|
||||
innerprod[j,i] = innp
|
||||
interr[j,i] = err
|
||||
|
||||
return innerprod, interr
|
||||
|
||||
|
||||
def is_orthonormal_cont(polys, lower, upper, rtol=0, atol=1e-08):
|
||||
'''check whether functions are orthonormal
|
||||
|
||||
Parameters
|
||||
----------
|
||||
polys : list of polynomials or function
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_orthonormal : bool
|
||||
is False if the innerproducts are not close to 0 or 1
|
||||
|
||||
Notes
|
||||
-----
|
||||
this stops as soon as the first deviation from orthonormality is found.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from scipy.special import chebyt
|
||||
>>> polys = [chebyt(i) for i in range(4)]
|
||||
>>> r, e = inner_cont(polys, -1, 1)
|
||||
>>> r
|
||||
array([[ 2. , 0. , -0.66666667, 0. ],
|
||||
[ 0. , 0.66666667, 0. , -0.4 ],
|
||||
[-0.66666667, 0. , 0.93333333, 0. ],
|
||||
[ 0. , -0.4 , 0. , 0.97142857]])
|
||||
>>> is_orthonormal_cont(polys, -1, 1, atol=1e-6)
|
||||
False
|
||||
|
||||
>>> polys = [ChebyTPoly(i) for i in range(4)]
|
||||
>>> r, e = inner_cont(polys, -1, 1)
|
||||
>>> r
|
||||
array([[ 1.00000000e+00, 0.00000000e+00, -9.31270888e-14,
|
||||
0.00000000e+00],
|
||||
[ 0.00000000e+00, 1.00000000e+00, 0.00000000e+00,
|
||||
-9.47850712e-15],
|
||||
[ -9.31270888e-14, 0.00000000e+00, 1.00000000e+00,
|
||||
0.00000000e+00],
|
||||
[ 0.00000000e+00, -9.47850712e-15, 0.00000000e+00,
|
||||
1.00000000e+00]])
|
||||
>>> is_orthonormal_cont(polys, -1, 1, atol=1e-6)
|
||||
True
|
||||
|
||||
'''
|
||||
for i in range(len(polys)):
|
||||
for j in range(i+1):
|
||||
p1 = polys[i]
|
||||
p2 = polys[j]
|
||||
innerprod = integrate.quad(lambda x: p1(x)*p2(x), lower, upper)[0]
|
||||
#print i,j, innerprod
|
||||
if not np.allclose(innerprod, i==j, rtol=rtol, atol=atol):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
|
||||
#new versions
|
||||
|
||||
|
||||
class DensityOrthoPoly:
|
||||
'''Univariate density estimation by orthonormal series expansion
|
||||
|
||||
|
||||
Uses an orthonormal polynomial basis to approximate a univariate density.
|
||||
|
||||
|
||||
currently all arguments can be given to fit; I might change it to require
the arguments in __init__ instead.
|
||||
'''
|
||||
|
||||
def __init__(self, polybase=None, order=5):
|
||||
if polybase is not None:
|
||||
self.polybase = polybase
|
||||
self.polys = polys = [polybase(i) for i in range(order)]
|
||||
#try:
|
||||
#self.offsetfac = 0.05
|
||||
#self.offsetfac = polys[0].offsetfactor #polys maybe not defined yet
|
||||
self._corfactor = 1
|
||||
self._corshift = 0
|
||||
|
||||
|
||||
def fit(self, x, polybase=None, order=5, limits=None):
|
||||
'''estimate the orthogonal polynomial approximation to the density
|
||||
|
||||
'''
|
||||
if polybase is None:
|
||||
polys = self.polys[:order]
|
||||
else:
|
||||
self.polybase = polybase
|
||||
self.polys = polys = [polybase(i) for i in range(order)]
|
||||
|
||||
#move to init ?
|
||||
if not hasattr(self, 'offsetfac'):
|
||||
self.offsetfac = polys[0].offsetfactor
|
||||
|
||||
|
||||
xmin, xmax = x.min(), x.max()
|
||||
if limits is None:
|
||||
self.offset = offset = (xmax - xmin) * self.offsetfac
|
||||
limits = self.limits = (xmin - offset, xmax + offset)
|
||||
|
||||
interval_length = limits[1] - limits[0]
|
||||
xinterval = xmax - xmin
|
||||
# need to cover (half-)open intervals
|
||||
self.shrink = 1. / interval_length #xinterval/interval_length
|
||||
offset = (interval_length - xinterval ) / 2.
|
||||
self.shift = xmin - offset
|
||||
|
||||
self.x = x = self._transform(x)
|
||||
|
||||
coeffs = [(p(x)).mean() for p in polys]
|
||||
self.coeffs = coeffs
|
||||
self.polys = polys
|
||||
self._verify() #verify that it is a proper density
|
||||
|
||||
return self #coeffs, polys
|
||||
|
||||
def evaluate(self, xeval, order=None):
|
||||
xeval = self._transform(xeval)
|
||||
if order is None:
|
||||
order = len(self.polys)
|
||||
res = sum(c*p(xeval) for c, p in list(zip(self.coeffs, self.polys))[:order])
|
||||
res = self._correction(res)
|
||||
return res
|
||||
|
||||
def __call__(self, xeval):
|
||||
'''alias for evaluate, except no order argument'''
|
||||
return self.evaluate(xeval)
|
||||
|
||||
def _verify(self):
|
||||
'''check for bona fide density correction
|
||||
|
||||
currently only checks that density integrates to 1
|
||||
|
||||
non-negativity - NotImplementedYet
|
||||
'''
|
||||
#watch out for circular/recursive usage
|
||||
|
||||
#evaluate uses domain of data, we stay offset away from bounds
|
||||
intdomain = self.limits #self.polys[0].intdomain
|
||||
self._corfactor = 1./integrate.quad(self.evaluate, *intdomain)[0]
|
||||
#self._corshift = 0
|
||||
#self._corfactor
|
||||
return self._corfactor
|
||||
|
||||
|
||||
|
||||
def _correction(self, x):
|
||||
'''bona fide density correction
|
||||
|
||||
affine shift of density to make it into a proper density
|
||||
|
||||
'''
|
||||
if self._corfactor != 1:
|
||||
x *= self._corfactor
|
||||
|
||||
if self._corshift != 0:
|
||||
x += self._corshift
|
||||
|
||||
return x
|
||||
|
||||
def _transform(self, x): # limits=None):
|
||||
'''transform observation to the domain of the density
|
||||
|
||||
|
||||
uses the shrink and shift attributes, which are set in fit, to map the data
into the domain of the polynomial base
|
||||
|
||||
|
||||
'''
|
||||
|
||||
#use domain from first instance
|
||||
#class does not have domain self.polybase.domain[0] AttributeError
|
||||
domain = self.polys[0].domain
|
||||
|
||||
ilen = (domain[1] - domain[0])
|
||||
shift = self.shift - domain[0]/self.shrink/ilen
|
||||
shrink = self.shrink * ilen
|
||||
|
||||
return (x - shift) * shrink
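

# Illustrative usage sketch (added, not part of the original module).  It
# mirrors the "chebyt" example in the __main__ section below: fit the series
# density to a sample and evaluate it on a grid.
def _example_density_orthopoly(rvs=None, order=10):
    if rvs is None:
        rvs = np.random.randn(1000)   # any 1d sample works
    dop = DensityOrthoPoly().fit(rvs, ChebyTPoly, order=order)
    grid = np.linspace(rvs.min(), rvs.max(), 200)
    pdf_hat = dop(grid)   # estimated density; integrates to ~1 over dop.limits
    return grid, pdf_hat
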
|
||||
|
||||
|
||||
#old version as a simple function
|
||||
def density_orthopoly(x, polybase, order=5, xeval=None):
|
||||
#polybase = legendre #chebyt #hermitenorm#
|
||||
#polybase = chebyt
|
||||
#polybase = FPoly
|
||||
#polybase = ChtPoly
|
||||
#polybase = hermite
|
||||
#polybase = HPoly
|
||||
|
||||
if xeval is None:
|
||||
xeval = np.linspace(x.min(),x.max(),50)
|
||||
|
||||
#polys = [legendre(i) for i in range(order)]
|
||||
polys = [polybase(i) for i in range(order)]
|
||||
#coeffs = [(p(x)*(1-x**2)**(-1/2.)).mean() for p in polys]
|
||||
#coeffs = [(p(x)*np.exp(-x*x)).mean() for p in polys]
|
||||
coeffs = [(p(x)).mean() for p in polys]
|
||||
res = sum(c*p(xeval) for c, p in zip(coeffs, polys))
|
||||
#res *= (1-xeval**2)**(-1/2.)
|
||||
#res *= np.exp(-xeval**2./2)
|
||||
return res, xeval, coeffs, polys
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
examples = ['chebyt', 'fourier', 'hermite']#[2]
|
||||
|
||||
nobs = 10000
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
from statsmodels.distributions.mixture_rvs import (
|
||||
mixture_rvs, MixtureDistribution)
|
||||
|
||||
#np.random.seed(12345)
|
||||
## obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
|
||||
## kwargs = (dict(loc=-1,scale=.5),dict(loc=1,scale=.75)))
|
||||
mix_kwds = (dict(loc=-0.5,scale=.5),dict(loc=1,scale=.2))
|
||||
obs_dist = mixture_rvs([1/3.,2/3.], size=nobs, dist=[stats.norm, stats.norm],
|
||||
kwargs=mix_kwds)
|
||||
mix = MixtureDistribution()
|
||||
|
||||
#obs_dist = np.random.randn(nobs)/4. #np.sqrt(2)
|
||||
|
||||
|
||||
if "chebyt_" in examples: # needed for Cheby example below
|
||||
#obs_dist = np.clip(obs_dist, -2, 2)/2.01
|
||||
#chebyt [0,1]
|
||||
obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/2.0 #/4. + 2/4.0
|
||||
#fourier [0,1]
|
||||
#obs_dist = obs_dist[(obs_dist>-2) & (obs_dist<2)]/4. + 2/4.0
|
||||
f_hat, grid, coeffs, polys = density_orthopoly(obs_dist, ChebyTPoly, order=20, xeval=None)
|
||||
#f_hat /= f_hat.sum() * (grid.max() - grid.min())/len(grid)
|
||||
f_hat0 = f_hat
|
||||
fint = integrate.trapz(f_hat, grid)# dx=(grid.max() - grid.min())/len(grid))
|
||||
#f_hat -= fint/2.
|
||||
print('f_hat.min()', f_hat.min())
|
||||
f_hat = (f_hat - f_hat.min()) #/ f_hat.max() - f_hat.min
|
||||
fint2 = integrate.trapz(f_hat, grid)# dx=(grid.max() - grid.min())/len(grid))
|
||||
print('fint2', fint, fint2)
|
||||
f_hat /= fint2
|
||||
|
||||
# note that this uses a *huge* grid by default
|
||||
#f_hat, grid = kdensityfft(emp_dist, kernel="gauss", bw="scott")
|
||||
|
||||
# check the plot
|
||||
|
||||
doplot = 0
|
||||
if doplot:
|
||||
plt.hist(obs_dist, bins=50, normed=True, color='red')
|
||||
plt.plot(grid, f_hat, lw=2, color='black')
|
||||
plt.plot(grid, f_hat0, lw=2, color='g')
|
||||
plt.show()
|
||||
|
||||
for i,p in enumerate(polys[:5]):
|
||||
for j,p2 in enumerate(polys[:5]):
|
||||
print(i,j,integrate.quad(lambda x: p(x)*p2(x), -1,1)[0])
|
||||
|
||||
for p in polys:
|
||||
print(integrate.quad(lambda x: p(x)**2, -1,1))
|
||||
|
||||
|
||||
#examples using the new class
|
||||
|
||||
if "chebyt" in examples:
|
||||
dop = DensityOrthoPoly().fit(obs_dist, ChebyTPoly, order=20)
|
||||
grid = np.linspace(obs_dist.min(), obs_dist.max())
|
||||
xf = dop(grid)
|
||||
#print('np.max(np.abs(xf - f_hat0))', np.max(np.abs(xf - f_hat0))
|
||||
dopint = integrate.quad(dop, *dop.limits)[0]
|
||||
print('dop F integral', dopint)
|
||||
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
|
||||
kwargs=mix_kwds)
|
||||
|
||||
doplot = 1
|
||||
if doplot:
|
||||
plt.figure()
|
||||
plt.hist(obs_dist, bins=50, normed=True, color='red')
|
||||
plt.plot(grid, xf, lw=2, color='black')
|
||||
plt.plot(grid, mpdf, lw=2, color='green')
|
||||
plt.title('using Chebychev polynomials')
|
||||
#plt.show()
|
||||
|
||||
if "fourier" in examples:
|
||||
dop = DensityOrthoPoly()
|
||||
dop.offsetfac = 0.5
|
||||
dop = dop.fit(obs_dist, F2Poly, order=30)
|
||||
grid = np.linspace(obs_dist.min(), obs_dist.max())
|
||||
xf = dop(grid)
|
||||
#print(np.max(np.abs(xf - f_hat0))
|
||||
dopint = integrate.quad(dop, *dop.limits)[0]
|
||||
print('dop F integral', dopint)
|
||||
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
|
||||
kwargs=mix_kwds)
|
||||
|
||||
doplot = 1
|
||||
if doplot:
|
||||
plt.figure()
|
||||
plt.hist(obs_dist, bins=50, normed=True, color='red')
|
||||
plt.title('using Fourier polynomials')
|
||||
plt.plot(grid, xf, lw=2, color='black')
|
||||
plt.plot(grid, mpdf, lw=2, color='green')
|
||||
#plt.show()
|
||||
|
||||
#check orthonormality:
|
||||
print(np.max(np.abs(inner_cont(dop.polys[:5], 0, 1)[0] -np.eye(5))))
|
||||
|
||||
if "hermite" in examples:
|
||||
dop = DensityOrthoPoly()
|
||||
dop.offsetfac = 0
|
||||
dop = dop.fit(obs_dist, HPoly, order=20)
|
||||
grid = np.linspace(obs_dist.min(), obs_dist.max())
|
||||
xf = dop(grid)
|
||||
#print(np.max(np.abs(xf - f_hat0))
|
||||
dopint = integrate.quad(dop, *dop.limits)[0]
|
||||
print('dop F integral', dopint)
|
||||
|
||||
mpdf = mix.pdf(grid, [1/3.,2/3.], dist=[stats.norm, stats.norm],
|
||||
kwargs=mix_kwds)
|
||||
|
||||
doplot = 1
|
||||
if doplot:
|
||||
plt.figure()
|
||||
plt.hist(obs_dist, bins=50, normed=True, color='red')
|
||||
plt.plot(grid, xf, lw=2, color='black')
|
||||
plt.plot(grid, mpdf, lw=2, color='green')
|
||||
plt.title('using Hermite polynomials')
|
||||
plt.show()
|
||||
|
||||
#check orthonormality:
|
||||
print(np.max(np.abs(inner_cont(dop.polys[:5], 0, 1)[0] -np.eye(5))))
|
||||
|
||||
|
||||
#check orthonormality
|
||||
|
||||
hpolys = [HPoly(i) for i in range(5)]
|
||||
inn = inner_cont(hpolys, -6, 6)[0]
|
||||
print(np.max(np.abs(inn - np.eye(5))))
|
||||
print((inn*100000).astype(int))
|
||||
|
||||
from scipy.special import hermite, chebyt
|
||||
htpolys = [hermite(i) for i in range(5)]
|
||||
innt = inner_cont(htpolys, -10, 10)[0]
|
||||
print((innt*100000).astype(int))
|
||||
|
||||
polysc = [chebyt(i) for i in range(4)]
|
||||
r, e = inner_cont(polysc, -1, 1, weight=lambda x: (1-x*x)**(-1/2.))
|
||||
print(np.max(np.abs(r - np.diag(np.diag(r)))))
|
||||
@@ -0,0 +1,212 @@
|
||||
"""Examples of non-linear functions for non-parametric regression
|
||||
|
||||
Created on Sat Jan 05 20:21:22 2013
|
||||
|
||||
Author: Josef Perktold
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
## Functions
|
||||
|
||||
def fg1(x):
|
||||
'''Fan and Gijbels example function 1
|
||||
|
||||
'''
|
||||
return x + 2 * np.exp(-16 * x**2)
|
||||
|
||||
def fg1eu(x):
|
||||
'''Eubank similar to Fan and Gijbels example function 1
|
||||
|
||||
'''
|
||||
return x + 0.5 * np.exp(-50 * (x - 0.5)**2)
|
||||
|
||||
def fg2(x):
|
||||
'''Fan and Gijbels example function 2
|
||||
|
||||
'''
|
||||
return np.sin(2 * x) + 2 * np.exp(-16 * x**2)
|
||||
|
||||
def func1(x):
|
||||
'''made up example with sin, square
|
||||
|
||||
'''
|
||||
return np.sin(x * 5) / x + 2. * x - 1. * x**2
|
||||
|
||||
## Classes with Data Generating Processes
|
||||
|
||||
doc = {'description':
|
||||
'''Base Class for Univariate non-linear example
|
||||
|
||||
Does not work on its own.
needs at least self.func in addition
|
||||
''',
|
||||
'ref': ''}
|
||||
|
||||
class _UnivariateFunction:
|
||||
#Base Class for Univariate non-linear example.
|
||||
#Does not work on its own. needs at least self.func in addition
|
||||
__doc__ = '''%(description)s
|
||||
|
||||
Parameters
|
||||
----------
|
||||
nobs : int
|
||||
number of observations to simulate
|
||||
x : None or 1d array
|
||||
If x is given then it is used for the exogenous variable instead of
|
||||
creating a random sample
|
||||
distr_x : None or distribution instance
|
||||
Only used if x is None. The rvs method is used to create a random
|
||||
sample of the exogenous (explanatory) variable.
|
||||
distr_noise : None or distribution instance
|
||||
The rvs method is used to create a random sample of the errors.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
x : ndarray, 1-D
|
||||
exogenous or explanatory variable. x is sorted.
|
||||
y : ndarray, 1-D
|
||||
endogenous or response variable
|
||||
y_true : ndarray, 1-D
|
||||
expected values of endogenous or response variable, i.e. values of y
|
||||
without noise
|
||||
func : callable
|
||||
underlying function (defined by subclass)
|
||||
|
||||
%(ref)s
|
||||
''' #% doc
|
||||
|
||||
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
|
||||
|
||||
if x is None:
|
||||
if distr_x is None:
|
||||
x = np.random.normal(loc=0, scale=self.s_x, size=nobs)
|
||||
else:
|
||||
x = distr_x.rvs(size=nobs)
|
||||
x.sort()
|
||||
|
||||
self.x = x
|
||||
|
||||
if distr_noise is None:
|
||||
noise = np.random.normal(loc=0, scale=self.s_noise, size=nobs)
|
||||
else:
|
||||
noise = distr_noise.rvs(size=nobs)
|
||||
|
||||
if hasattr(self, 'het_scale'):
|
||||
noise *= self.het_scale(self.x)
|
||||
|
||||
#self.func = fg1
|
||||
self.y_true = y_true = self.func(x)
|
||||
self.y = y_true + noise
|
||||
|
||||
|
||||
def plot(self, scatter=True, ax=None):
|
||||
'''plot the mean function and optionally the scatter of the sample
|
||||
|
||||
Parameters
|
||||
----------
|
||||
scatter : bool
|
||||
If true, then add scatterpoints of sample to plot.
|
||||
ax : None or matplotlib axis instance
|
||||
If None, then a matplotlib.pyplot figure is created, otherwise
|
||||
the given axis, ax, is used.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Figure
|
||||
This is either the created figure instance or the one associated
|
||||
with ax if ax is given.
|
||||
|
||||
'''
|
||||
if ax is None:
|
||||
import matplotlib.pyplot as plt
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(1, 1, 1)
|
||||
|
||||
if scatter:
|
||||
ax.plot(self.x, self.y, 'o', alpha=0.5)
|
||||
|
||||
xx = np.linspace(self.x.min(), self.x.max(), 100)
|
||||
ax.plot(xx, self.func(xx), lw=2, color='b', label='dgp mean')
|
||||
return ax.figure
|
||||
|
||||
doc = {'description':
|
||||
'''Fan and Gijbels example function 1
|
||||
|
||||
linear trend plus a hump
|
||||
''',
|
||||
'ref':
|
||||
'''
|
||||
References
|
||||
----------
|
||||
Fan, Jianqing, and Irene Gijbels. 1992. "Variable Bandwidth and Local
|
||||
Linear Regression Smoothers."
|
||||
The Annals of Statistics 20 (4) (December): 2008-2036. doi:10.2307/2242378.
|
||||
|
||||
'''}
|
||||
|
||||
class UnivariateFanGijbels1(_UnivariateFunction):
|
||||
__doc__ = _UnivariateFunction.__doc__ % doc
|
||||
|
||||
|
||||
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
|
||||
self.s_x = 1.
|
||||
self.s_noise = 0.7
|
||||
self.func = fg1
|
||||
super(self.__class__, self).__init__(nobs=nobs, x=x,
|
||||
distr_x=distr_x,
|
||||
distr_noise=distr_noise)
|
||||
|
||||
doc['description'] =\
|
||||
'''Fan and Gijbels example function 2
|
||||
|
||||
sin plus a hump
|
||||
'''
|
||||
|
||||
class UnivariateFanGijbels2(_UnivariateFunction):
|
||||
__doc__ = _UnivariateFunction.__doc__ % doc
|
||||
|
||||
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
|
||||
self.s_x = 1.
|
||||
self.s_noise = 0.5
|
||||
self.func = fg2
|
||||
super(self.__class__, self).__init__(nobs=nobs, x=x,
|
||||
distr_x=distr_x,
|
||||
distr_noise=distr_noise)
|
||||
|
||||
class UnivariateFanGijbels1EU(_UnivariateFunction):
|
||||
'''
|
||||
|
||||
Eubank p.179f
|
||||
'''
|
||||
|
||||
def __init__(self, nobs=50, x=None, distr_x=None, distr_noise=None):
|
||||
if distr_x is None:
|
||||
from scipy import stats
|
||||
distr_x = stats.uniform
|
||||
self.s_noise = 0.15
|
||||
self.func = fg1eu
|
||||
super(self.__class__, self).__init__(nobs=nobs, x=x,
|
||||
distr_x=distr_x,
|
||||
distr_noise=distr_noise)
|
||||
|
||||
class UnivariateFunc1(_UnivariateFunction):
|
||||
'''
|
||||
|
||||
made up, with sin and quadratic trend
|
||||
'''
|
||||
|
||||
def __init__(self, nobs=200, x=None, distr_x=None, distr_noise=None):
|
||||
if x is None and distr_x is None:
|
||||
from scipy import stats
|
||||
distr_x = stats.uniform(-2, 4)
|
||||
else:
|
||||
nobs = x.shape[0]
|
||||
self.s_noise = 2.
|
||||
self.func = func1
|
||||
super().__init__(nobs=nobs, x=x,
|
||||
distr_x=distr_x,
|
||||
distr_noise=distr_noise)
|
||||
|
||||
def het_scale(self, x):
|
||||
return np.sqrt(np.abs(3+x))
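

# Illustrative usage sketch (added, not part of the original module): simulate
# one of the data generating processes defined above and plot the sample
# together with the true mean function.
def _example_dgp_plot():
    dgp = UnivariateFanGijbels1(nobs=200)
    fig = dgp.plot(scatter=True)   # scatter of (x, y) plus the dgp mean
    return dgp.x, dgp.y, dgp.y_true, fig
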
|
||||
@@ -0,0 +1,114 @@
|
||||
from statsmodels.compat.python import lzip
|
||||
|
||||
import numpy as np
|
||||
|
||||
from statsmodels.tools.validation import array_like
|
||||
from . import kernels
|
||||
|
||||
|
||||
#TODO: should this be a function?
|
||||
class KDE:
|
||||
"""
|
||||
Kernel Density Estimator
|
||||
|
||||
Parameters
|
||||
----------
|
||||
x : array_like
|
||||
N-dimensional array from which the density is to be estimated
|
||||
kernel : Kernel Class
|
||||
Should be a class from *
|
||||
"""
|
||||
#TODO: amend docs for Nd case?
|
||||
def __init__(self, x, kernel=None):
|
||||
x = array_like(x, "x", maxdim=2, contiguous=True)
|
||||
if x.ndim == 1:
|
||||
x = x[:,None]
|
||||
|
||||
nobs, n_series = x.shape
|
||||
|
||||
if kernel is None:
|
||||
kernel = kernels.Gaussian() # no meaningful bandwidth yet
|
||||
|
||||
if n_series > 1:
|
||||
if isinstance( kernel, kernels.CustomKernel ):
|
||||
kernel = kernels.NdKernel(n_series, kernels = kernel)
|
||||
|
||||
self.kernel = kernel
|
||||
self.n = n_series #TODO change attribute
|
||||
self.x = x
|
||||
|
||||
def density(self, x):
|
||||
return self.kernel.density(self.x, x)
|
||||
|
||||
def __call__(self, x, h="scott"):
|
||||
return np.array([self.density(xx) for xx in x])
|
||||
|
||||
def evaluate(self, x, h="silverman"):
|
||||
density = self.kernel.density
|
||||
return np.array([density(xx) for xx in x])
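

# Illustrative usage sketch (added, not part of the original module), mirroring
# the 1-D example in the __main__ block below; bw_silverman comes from
# statsmodels.nonparametric.bandwidths.
def _example_kde_1d(nobs=50):
    import statsmodels.nonparametric.bandwidths as bw
    data = np.random.standard_t(4.2, size=nobs)
    kern = kernels.Gaussian(h=bw.bw_silverman(data))
    kde = KDE(data, kern)
    grid = np.arange(-10, 10, 0.1)
    return grid, kde(grid)   # pointwise density estimates on the grid
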
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
from numpy import random
|
||||
import matplotlib.pyplot as plt
|
||||
import statsmodels.nonparametric.bandwidths as bw
|
||||
from statsmodels.sandbox.nonparametric.testdata import kdetest
|
||||
|
||||
# 1-D case
|
||||
random.seed(142)
|
||||
x = random.standard_t(4.2, size = 50)
|
||||
h = bw.bw_silverman(x)
|
||||
#NOTE: try to do it with convolution
|
||||
support = np.linspace(-10,10,512)
|
||||
|
||||
|
||||
kern = kernels.Gaussian(h = h)
|
||||
kde = KDE( x, kern)
|
||||
print(kde.density(1.015469))
|
||||
print(0.2034675)
|
||||
Xs = np.arange(-10,10,0.1)
|
||||
|
||||
fig = plt.figure()
|
||||
ax = fig.add_subplot(111)
|
||||
ax.plot(Xs, kde(Xs), "-")
|
||||
ax.set_ylim(-10, 10)
|
||||
ax.set_ylim(0,0.4)
|
||||
|
||||
|
||||
# 2-D case
|
||||
x = lzip(kdetest.faithfulData["eruptions"], kdetest.faithfulData["waiting"])
|
||||
x = np.array(x)
|
||||
x = (x - x.mean(0))/x.std(0)
|
||||
nobs = x.shape[0]
|
||||
H = kdetest.Hpi
|
||||
kern = kernels.NdKernel( 2 )
|
||||
kde = KDE( x, kern )
|
||||
print(kde.density( np.matrix( [1,2 ]))) #.T
|
||||
plt.figure()
|
||||
plt.plot(x[:,0], x[:,1], 'o')
|
||||
|
||||
|
||||
n_grid = 50
|
||||
xsp = np.linspace(x.min(0)[0], x.max(0)[0], n_grid)
|
||||
ysp = np.linspace(x.min(0)[1], x.max(0)[1], n_grid)
|
||||
# xsorted = np.sort(x)
|
||||
# xlow = xsorted[nobs/4]
|
||||
# xupp = xsorted[3*nobs/4]
|
||||
# xsp = np.linspace(xlow[0], xupp[0], n_grid)
|
||||
# ysp = np.linspace(xlow[1], xupp[1], n_grid)
|
||||
xr, yr = np.meshgrid(xsp, ysp)
|
||||
kde_vals = np.array([kde.density( np.matrix( [xi, yi ]) ) for xi, yi in
|
||||
zip(xr.ravel(), yr.ravel())])
|
||||
plt.contour(xsp, ysp, kde_vals.reshape(n_grid, n_grid))
|
||||
|
||||
plt.show()
|
||||
|
||||
|
||||
# 5 D case
|
||||
# random.seed(142)
|
||||
# mu = [1.0, 4.0, 3.5, -2.4, 0.0]
|
||||
# sigma = np.matrix(
|
||||
# [[ 0.6 - 0.1*abs(i-j) if i != j else 1.0 for j in xrange(5)] for i in xrange(5)])
|
||||
# x = random.multivariate_normal(mu, sigma, size = 100)
|
||||
# kern = kernel.Gaussian()
|
||||
# kde = KernelEstimate( x, kern )
|
||||
@@ -0,0 +1,164 @@
|
||||
'''subclassing kde
|
||||
|
||||
Author: josef pktd
|
||||
'''
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_almost_equal, assert_
|
||||
import scipy
|
||||
from scipy import stats
|
||||
import matplotlib.pylab as plt
|
||||
|
||||
|
||||
class gaussian_kde_set_covariance(stats.gaussian_kde):
|
||||
'''
|
||||
from Anne Archibald in mailinglist:
|
||||
http://www.nabble.com/Width-of-the-gaussian-in-stats.kde.gaussian_kde---td19558924.html#a19558924
|
||||
'''
|
||||
def __init__(self, dataset, covariance):
|
||||
self.covariance = covariance
|
||||
scipy.stats.gaussian_kde.__init__(self, dataset)
|
||||
|
||||
def _compute_covariance(self):
|
||||
self.inv_cov = np.linalg.inv(self.covariance)
|
||||
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.n
|
||||
|
||||
|
||||
class gaussian_kde_covfact(stats.gaussian_kde):
|
||||
def __init__(self, dataset, covfact = 'scotts'):
|
||||
self.covfact = covfact
|
||||
scipy.stats.gaussian_kde.__init__(self, dataset)
|
||||
|
||||
def _compute_covariance_(self):
|
||||
'''not used'''
|
||||
self.inv_cov = np.linalg.inv(self.covariance)
|
||||
self._norm_factor = np.sqrt(np.linalg.det(2*np.pi*self.covariance)) * self.n
|
||||
|
||||
def covariance_factor(self):
|
||||
if self.covfact in ['sc', 'scotts']:
|
||||
return self.scotts_factor()
|
||||
if self.covfact in ['si', 'silverman']:
|
||||
return self.silverman_factor()
|
||||
elif self.covfact:
|
||||
return float(self.covfact)
|
||||
else:
|
||||
raise ValueError('covariance factor has to be scotts, silverman or a number')
|
||||
|
||||
def reset_covfact(self, covfact):
|
||||
self.covfact = covfact
|
||||
self.covariance_factor()
|
||||
self._compute_covariance()
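

# Illustrative usage sketch (added, not part of the original module): compare a
# plug-in bandwidth factor with a small fixed factor on the same sample.
def _example_covfact(nobs=500):
    xn = np.random.randn(nobs)
    gkde = gaussian_kde_covfact(xn, 'scotts')
    grid = np.linspace(-4, 4, 101)
    pdf_scott = gkde.evaluate(grid)
    gkde.reset_covfact(0.1)   # switch to a fixed, smaller bandwidth factor
    pdf_small = gkde.evaluate(grid)
    return grid, pdf_scott, pdf_small
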
|
||||
|
||||
def plotkde(covfact):
|
||||
gkde.reset_covfact(covfact)
|
||||
kdepdf = gkde.evaluate(ind)
|
||||
plt.figure()
|
||||
# plot histogram of sample
|
||||
plt.hist(xn, bins=20, normed=1)
|
||||
# plot estimated density
|
||||
plt.plot(ind, kdepdf, label='kde', color="g")
|
||||
# plot data generating density
|
||||
plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
|
||||
(1-alpha) * stats.norm.pdf(ind, loc=mhigh),
|
||||
color="r", label='DGP: normal mix')
|
||||
plt.title('Kernel Density Estimation - ' + str(gkde.covfact))
|
||||
plt.legend()
|
||||
|
||||
|
||||
def test_kde_1d():
|
||||
np.random.seed(8765678)
|
||||
n_basesample = 500
|
||||
xn = np.random.randn(n_basesample)
|
||||
xnmean = xn.mean()
|
||||
xnstd = xn.std(ddof=1)
|
||||
print(xnmean, xnstd)
|
||||
|
||||
# get kde for original sample
|
||||
gkde = stats.gaussian_kde(xn)
|
||||
|
||||
# evaluate the density function for the kde for some points
|
||||
xs = np.linspace(-7,7,501)
|
||||
kdepdf = gkde.evaluate(xs)
|
||||
normpdf = stats.norm.pdf(xs, loc=xnmean, scale=xnstd)
|
||||
print('MSE', np.sum((kdepdf - normpdf)**2))
|
||||
print('axabserror', np.max(np.abs(kdepdf - normpdf)))
|
||||
intervall = xs[1] - xs[0]
|
||||
assert_(np.sum((kdepdf - normpdf)**2)*intervall < 0.01)
|
||||
#assert_array_almost_equal(kdepdf, normpdf, decimal=2)
|
||||
print(gkde.integrate_gaussian(0.0, 1.0))
|
||||
print(gkde.integrate_box_1d(-np.inf, 0.0))
|
||||
print(gkde.integrate_box_1d(0.0, np.inf))
|
||||
print(gkde.integrate_box_1d(-np.inf, xnmean))
|
||||
print(gkde.integrate_box_1d(xnmean, np.inf))
|
||||
|
||||
assert_almost_equal(gkde.integrate_box_1d(xnmean, np.inf), 0.5, decimal=1)
|
||||
assert_almost_equal(gkde.integrate_box_1d(-np.inf, xnmean), 0.5, decimal=1)
|
||||
assert_almost_equal(gkde.integrate_box(xnmean, np.inf), 0.5, decimal=1)
|
||||
assert_almost_equal(gkde.integrate_box(-np.inf, xnmean), 0.5, decimal=1)
|
||||
|
||||
assert_almost_equal(gkde.integrate_kde(gkde),
|
||||
(kdepdf**2).sum()*intervall, decimal=2)
|
||||
assert_almost_equal(gkde.integrate_gaussian(xnmean, xnstd**2),
|
||||
(kdepdf*normpdf).sum()*intervall, decimal=2)
|
||||
## assert_almost_equal(gkde.integrate_gaussian(0.0, 1.0),
|
||||
## (kdepdf*normpdf).sum()*intervall, decimal=2)
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# generate a sample
|
||||
n_basesample = 1000
|
||||
np.random.seed(8765678)
|
||||
alpha = 0.6 #weight for (prob of) lower distribution
|
||||
mlow, mhigh = (-3,3) #mean locations for gaussian mixture
|
||||
xn = np.concatenate([mlow + np.random.randn(int(alpha * n_basesample)),
                     mhigh + np.random.randn(int((1-alpha) * n_basesample))])
|
||||
|
||||
# get kde for original sample
|
||||
#gkde = stats.gaussian_kde(xn)
|
||||
gkde = gaussian_kde_covfact(xn, 0.1)
|
||||
# evaluate the density function for the kde for some points
|
||||
ind = np.linspace(-7,7,101)
|
||||
kdepdf = gkde.evaluate(ind)
|
||||
|
||||
plt.figure()
|
||||
# plot histogram of sample
|
||||
plt.hist(xn, bins=20, normed=1)
|
||||
# plot estimated density
|
||||
plt.plot(ind, kdepdf, label='kde', color="g")
|
||||
# plot data generating density
|
||||
plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
|
||||
(1-alpha) * stats.norm.pdf(ind, loc=mhigh),
|
||||
color="r", label='DGP: normal mix')
|
||||
plt.title('Kernel Density Estimation')
|
||||
plt.legend()
|
||||
|
||||
gkde = gaussian_kde_covfact(xn, 'scotts')
|
||||
kdepdf = gkde.evaluate(ind)
|
||||
plt.figure()
|
||||
# plot histogram of sample
|
||||
plt.hist(xn, bins=20, normed=1)
|
||||
# plot estimated density
|
||||
plt.plot(ind, kdepdf, label='kde', color="g")
|
||||
# plot data generating density
|
||||
plt.plot(ind, alpha * stats.norm.pdf(ind, loc=mlow) +
|
||||
(1-alpha) * stats.norm.pdf(ind, loc=mhigh),
|
||||
color="r", label='DGP: normal mix')
|
||||
plt.title('Kernel Density Estimation')
|
||||
plt.legend()
|
||||
#plt.show()
|
||||
for cv in ['scotts', 'silverman', 0.05, 0.1, 0.5]:
|
||||
plotkde(cv)
|
||||
|
||||
test_kde_1d()
|
||||
|
||||
|
||||
np.random.seed(8765678)
|
||||
n_basesample = 1000
|
||||
xn = np.random.randn(n_basesample)
|
||||
xnmean = xn.mean()
|
||||
xnstd = xn.std(ddof=1)
|
||||
|
||||
# get kde for original sample
|
||||
gkde = stats.gaussian_kde(xn)
|
||||
@@ -0,0 +1,419 @@
|
||||
"""
|
||||
Multivariate Conditional and Unconditional Kernel Density Estimation
|
||||
with Mixed Data Types
|
||||
|
||||
References
|
||||
----------
|
||||
[1] Racine, J., Li, Q. Nonparametric econometrics: theory and practice.
|
||||
Princeton University Press. (2007)
|
||||
[2] Racine, Jeff. "Nonparametric Econometrics: A Primer," Foundation
|
||||
and Trends in Econometrics: Vol 3: No 1, pp1-88. (2008)
|
||||
http://dx.doi.org/10.1561/0800000009
|
||||
[3] Racine, J., Li, Q. "Nonparametric Estimation of Distributions
|
||||
with Categorical and Continuous Data." Working Paper. (2000)
|
||||
[4] Racine, J. Li, Q. "Kernel Estimation of Multivariate Conditional
|
||||
Distributions Annals of Economics and Finance 5, 211-235 (2004)
|
||||
[5] Liu, R., Yang, L. "Kernel estimation of multivariate
|
||||
cumulative distribution function."
|
||||
Journal of Nonparametric Statistics (2008)
|
||||
[6] Li, R., Ju, G. "Nonparametric Estimation of Multivariate CDF
|
||||
with Categorical and Continuous Data." Working Paper
|
||||
[7] Li, Q., Racine, J. "Cross-validated local linear nonparametric
|
||||
regression" Statistica Sinica 14(2004), pp. 485-512
|
||||
[8] Racine, J.: "Consistent Significance Testing for Nonparametric
|
||||
Regression" Journal of Business & Economics Statistics
|
||||
[9] Racine, J., Hart, J., Li, Q., "Testing the Significance of
|
||||
Categorical Predictor Variables in Nonparametric Regression
|
||||
Models", 2006, Econometric Reviews 25, 523-544
|
||||
|
||||
"""
|
||||
|
||||
# TODO: make default behavior efficient=True above a certain n_obs
|
||||
import numpy as np
|
||||
from scipy import optimize
|
||||
from scipy.stats.mstats import mquantiles
|
||||
|
||||
from statsmodels.nonparametric.api import KDEMultivariate, KernelReg
|
||||
from statsmodels.nonparametric._kernel_base import \
|
||||
gpke, LeaveOneOut, _get_type_pos, _adjust_shape
|
||||
|
||||
|
||||
__all__ = ['SingleIndexModel', 'SemiLinear', 'TestFForm']
|
||||
|
||||
|
||||
class TestFForm:
|
||||
"""
|
||||
Nonparametric test for functional form.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endog : list
|
||||
Dependent variable (training set)
|
||||
exog : list of array_like objects
|
||||
The independent (right-hand-side) variables
|
||||
bw : array_like, str
|
||||
Bandwidths for exog or specify method for bandwidth selection
|
||||
fform : function
|
||||
The functional form ``y = g(x, b)`` to be tested. Takes as inputs
|
||||
the RHS variables `exog` and the coefficients ``b`` (betas)
|
||||
and returns a fitted ``y_hat``.
|
||||
var_type : str
|
||||
The type of the independent `exog` variables:
|
||||
|
||||
- c: continuous
|
||||
- o: ordered
|
||||
- u: unordered
|
||||
|
||||
estimator : function
|
||||
Must return the estimated coefficients b (betas). Takes as inputs
|
||||
``(endog, exog)``. E.g. least square estimator::
|
||||
|
||||
    lambda y, x: np.dot(np.linalg.pinv(np.dot(x.T, x)), np.dot(x.T, y))
|
||||
|
||||
References
|
||||
----------
|
||||
See Racine, J.: "Consistent Significance Testing for Nonparametric
|
||||
Regression" Journal of Business & Economics Statistics.
|
||||
|
||||
See chapter 12 in [1] pp. 355-357.
|
||||
"""
|
||||
def __init__(self, endog, exog, bw, var_type, fform, estimator, nboot=100):
|
||||
self.endog = endog
|
||||
self.exog = exog
|
||||
self.var_type = var_type
|
||||
self.fform = fform
|
||||
self.estimator = estimator
|
||||
self.nboot = nboot
|
||||
self.bw = KDEMultivariate(exog, bw=bw, var_type=var_type).bw
|
||||
self.sig = self._compute_sig()
|
||||
|
||||
def _compute_sig(self):
|
||||
Y = self.endog
|
||||
X = self.exog
|
||||
b = self.estimator(Y, X)
|
||||
m = self.fform(X, b)
|
||||
n = np.shape(X)[0]
|
||||
resid = Y - m
|
||||
resid = resid - np.mean(resid) # center residuals
|
||||
self.test_stat = self._compute_test_stat(resid)
|
||||
sqrt5 = np.sqrt(5.)
|
||||
fct1 = (1 - sqrt5) / 2.
|
||||
fct2 = (1 + sqrt5) / 2.
|
||||
u1 = fct1 * resid
|
||||
u2 = fct2 * resid
|
||||
r = fct2 / sqrt5
|
||||
I_dist = np.empty((self.nboot,1))
|
||||
for j in range(self.nboot):
|
||||
u_boot = u2.copy()
|
||||
|
||||
prob = np.random.uniform(0,1, size = (n,))
|
||||
ind = prob < r
|
||||
u_boot[ind] = u1[ind]
|
||||
Y_boot = m + u_boot
|
||||
b_hat = self.estimator(Y_boot, X)
|
||||
m_hat = self.fform(X, b_hat)
|
||||
u_boot_hat = Y_boot - m_hat
|
||||
I_dist[j] = self._compute_test_stat(u_boot_hat)
|
||||
|
||||
self.boots_results = I_dist
|
||||
sig = "Not Significant"
|
||||
if self.test_stat > mquantiles(I_dist, 0.9):
|
||||
sig = "*"
|
||||
if self.test_stat > mquantiles(I_dist, 0.95):
|
||||
sig = "**"
|
||||
if self.test_stat > mquantiles(I_dist, 0.99):
|
||||
sig = "***"
|
||||
return sig
|
||||
|
||||
def _compute_test_stat(self, u):
|
||||
n = np.shape(u)[0]
|
||||
XLOO = LeaveOneOut(self.exog)
|
||||
uLOO = LeaveOneOut(u[:,None]).__iter__()
|
||||
ival = 0
|
||||
S2 = 0
|
||||
for i, X_not_i in enumerate(XLOO):
|
||||
u_j = next(uLOO)
|
||||
u_j = np.squeeze(u_j)
|
||||
# See Bootstrapping procedure on p. 357 in [1]
|
||||
K = gpke(self.bw, data=-X_not_i, data_predict=-self.exog[i, :],
|
||||
var_type=self.var_type, tosum=False)
|
||||
f_i = (u[i] * u_j * K)
|
||||
assert u_j.shape == K.shape
|
||||
ival += f_i.sum() # See eq. 12.7 on p. 355 in [1]
|
||||
S2 += (f_i**2).sum() # See Theorem 12.1 on p.356 in [1]
|
||||
assert np.size(ival) == 1
|
||||
assert np.size(S2) == 1
|
||||
|
||||
ival *= 1. / (n * (n - 1))
|
||||
ix_cont = _get_type_pos(self.var_type)[0]
|
||||
hp = self.bw[ix_cont].prod()
|
||||
S2 *= 2 * hp / (n * (n - 1))
|
||||
T = n * ival * np.sqrt(hp / S2)
|
||||
return T
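

# Illustrative usage sketch (added, not part of the original module): test a
# linear functional form on simulated data.  The OLS helper and the
# 'normal_reference' bandwidth choice are assumptions made for this sketch.
def _example_test_fform(nobs=100):
    x = np.random.uniform(-2, 2, size=(nobs, 1))
    y = 1.0 + 2.0 * x[:, 0] + np.random.randn(nobs)

    def fform(x, b):
        # linear functional form under the null hypothesis
        return b[0] + b[1] * x[:, 0]

    def ols(y, x):
        xx = np.column_stack((np.ones(len(x)), x[:, 0]))
        return np.linalg.lstsq(xx, y, rcond=None)[0]

    tst = TestFForm(y, x, bw='normal_reference', var_type='c',
                    fform=fform, estimator=ols, nboot=100)
    return tst.sig   # '*', '**', '***' or "Not Significant"
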
|
||||
|
||||
|
||||
class SingleIndexModel(KernelReg):
|
||||
"""
|
||||
Single index semiparametric model ``y = g(X * b) + e``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endog : array_like
|
||||
The dependent variable
|
||||
exog : array_like
|
||||
The independent variable(s)
|
||||
var_type : str
|
||||
The type of variables in X:
|
||||
|
||||
- c: continuous
|
||||
- o: ordered
|
||||
- u: unordered
|
||||
|
||||
Attributes
|
||||
----------
|
||||
b : array_like
|
||||
The linear coefficients b (betas)
|
||||
bw : array_like
|
||||
Bandwidths
|
||||
|
||||
Methods
|
||||
-------
|
||||
fit(): Computes the fitted values ``E[Y|X] = g(X * b)``
|
||||
and the marginal effects ``dY/dX``.
|
||||
|
||||
References
|
||||
----------
|
||||
See chapter on semiparametric models in [1]
|
||||
|
||||
Notes
|
||||
-----
|
||||
This model resembles the binary choice models. The user knows
|
||||
that X and b interact linearly, but ``g(X * b)`` is unknown.
|
||||
In the parametric binary choice models the user usually assumes
|
||||
some distribution of g() such as normal or logistic.
|
||||
"""
|
||||
def __init__(self, endog, exog, var_type):
|
||||
self.var_type = var_type
|
||||
self.K = len(var_type)
|
||||
self.var_type = self.var_type[0]
|
||||
self.endog = _adjust_shape(endog, 1)
|
||||
self.exog = _adjust_shape(exog, self.K)
|
||||
self.nobs = np.shape(self.exog)[0]
|
||||
self.data_type = self.var_type
|
||||
self.ckertype = 'gaussian'
|
||||
self.okertype = 'wangryzin'
|
||||
self.ukertype = 'aitchisonaitken'
|
||||
self.func = self._est_loc_linear
|
||||
|
||||
self.b, self.bw = self._est_b_bw()
|
||||
|
||||
def _est_b_bw(self):
|
||||
params0 = np.random.uniform(size=(self.K + 1, ))
|
||||
b_bw = optimize.fmin(self.cv_loo, params0, disp=0)
|
||||
b = b_bw[0:self.K]
|
||||
bw = b_bw[self.K:]
|
||||
bw = self._set_bw_bounds(bw)
|
||||
return b, bw
|
||||
|
||||
def cv_loo(self, params):
|
||||
# See p. 254 in Textbook
|
||||
params = np.asarray(params)
|
||||
b = params[0 : self.K]
|
||||
bw = params[self.K:]
|
||||
LOO_X = LeaveOneOut(self.exog)
|
||||
LOO_Y = LeaveOneOut(self.endog).__iter__()
|
||||
L = 0
|
||||
for i, X_not_i in enumerate(LOO_X):
|
||||
Y = next(LOO_Y)
|
||||
#print b.shape, np.dot(self.exog[i:i+1, :], b).shape, bw,
|
||||
G = self.func(bw, endog=Y, exog=-np.dot(X_not_i, b)[:,None],
|
||||
#data_predict=-b*self.exog[i, :])[0]
|
||||
data_predict=-np.dot(self.exog[i:i+1, :], b))[0]
|
||||
#print G.shape
|
||||
L += (self.endog[i] - G) ** 2
|
||||
|
||||
# Note: There might be a way to vectorize this. See p.72 in [1]
|
||||
return L / self.nobs
|
||||
|
||||
def fit(self, data_predict=None):
|
||||
if data_predict is None:
|
||||
data_predict = self.exog
|
||||
else:
|
||||
data_predict = _adjust_shape(data_predict, self.K)
|
||||
|
||||
N_data_predict = np.shape(data_predict)[0]
|
||||
mean = np.empty((N_data_predict,))
|
||||
mfx = np.empty((N_data_predict, self.K))
|
||||
for i in range(N_data_predict):
|
||||
mean_mfx = self.func(self.bw, self.endog,
|
||||
np.dot(self.exog, self.b)[:,None],
|
||||
data_predict=np.dot(data_predict[i:i+1, :],self.b))
|
||||
mean[i] = mean_mfx[0]
|
||||
mfx_c = np.squeeze(mean_mfx[1])
|
||||
mfx[i, :] = mfx_c
|
||||
|
||||
return mean, mfx
|
||||
|
||||
def __repr__(self):
|
||||
"""Provide something sane to print."""
|
||||
repr = "Single Index Model \n"
|
||||
repr += "Number of variables: K = " + str(self.K) + "\n"
|
||||
repr += "Number of samples: nobs = " + str(self.nobs) + "\n"
|
||||
repr += "Variable types: " + self.var_type + "\n"
|
||||
repr += "BW selection method: cv_ls" + "\n"
|
||||
repr += "Estimator type: local constant" + "\n"
|
||||
return repr
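

# Illustrative usage sketch (added, not part of the original module): fit a
# single index model to simulated data; a logistic link is used only to
# generate the data, the model itself leaves g() unspecified.
def _example_single_index(nobs=200):
    x = np.random.uniform(-2, 2, size=(nobs, 2))
    b_true = np.array([1.0, -0.5])
    index = np.dot(x, b_true)
    y = 1.0 / (1.0 + np.exp(-index)) + 0.1 * np.random.randn(nobs)
    model = SingleIndexModel(endog=y, exog=x, var_type='cc')
    y_hat, mfx = model.fit()
    return model.b, model.bw, y_hat, mfx
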
|
||||
|
||||
|
||||
class SemiLinear(KernelReg):
|
||||
"""
|
||||
Semiparametric partially linear model, ``Y = Xb + g(Z) + e``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
endog : array_like
|
||||
The dependent variable
|
||||
exog : array_like
|
||||
The linear component in the regression
|
||||
exog_nonparametric : array_like
|
||||
The nonparametric component in the regression
|
||||
var_type : str
|
||||
The type of the variables in the nonparametric component;
|
||||
|
||||
- c: continuous
|
||||
- o: ordered
|
||||
- u: unordered
|
||||
|
||||
k_linear : int
|
||||
The number of variables that comprise the linear component.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
bw : array_like
|
||||
Bandwidths for the nonparametric component exog_nonparametric
|
||||
b : array_like
|
||||
Coefficients in the linear component
|
||||
nobs : int
|
||||
The number of observations.
|
||||
k_linear : int
|
||||
The number of variables that comprise the linear component.
|
||||
|
||||
Methods
|
||||
-------
|
||||
fit
|
||||
Returns the fitted mean and marginal effects dy/dz
|
||||
|
||||
Notes
|
||||
-----
|
||||
This model uses only the local constant regression estimator
|
||||
|
||||
References
|
||||
----------
|
||||
See chapter on Semiparametric Models in [1]
|
||||
"""
|
||||
|
||||
def __init__(self, endog, exog, exog_nonparametric, var_type, k_linear):
|
||||
self.endog = _adjust_shape(endog, 1)
|
||||
self.exog = _adjust_shape(exog, k_linear)
|
||||
self.K = len(var_type)
|
||||
self.exog_nonparametric = _adjust_shape(exog_nonparametric, self.K)
|
||||
self.k_linear = k_linear
|
||||
self.nobs = np.shape(self.exog)[0]
|
||||
self.var_type = var_type
|
||||
self.data_type = self.var_type
|
||||
self.ckertype = 'gaussian'
|
||||
self.okertype = 'wangryzin'
|
||||
self.ukertype = 'aitchisonaitken'
|
||||
self.func = self._est_loc_linear
|
||||
|
||||
self.b, self.bw = self._est_b_bw()
|
||||
|
||||
def _est_b_bw(self):
|
||||
"""
|
||||
Computes the (beta) coefficients and the bandwidths.
|
||||
|
||||
Minimizes ``cv_loo`` with respect to ``b`` and ``bw``.
|
||||
"""
|
||||
params0 = np.random.uniform(size=(self.k_linear + self.K, ))
|
||||
b_bw = optimize.fmin(self.cv_loo, params0, disp=0)
|
||||
b = b_bw[0 : self.k_linear]
|
||||
bw = b_bw[self.k_linear:]
|
||||
#bw = self._set_bw_bounds(np.asarray(bw))
|
||||
return b, bw
|
||||
|
||||
def cv_loo(self, params):
|
||||
"""
|
||||
Similar to the cross validation leave-one-out estimator.
|
||||
|
||||
Modified to reflect the linear components.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
params : array_like
|
||||
Vector consisting of the coefficients (b) and the bandwidths (bw).
|
||||
The first ``k_linear`` elements are the coefficients.
|
||||
|
||||
Returns
|
||||
-------
|
||||
L : float
|
||||
The value of the objective function
|
||||
|
||||
References
|
||||
----------
|
||||
See p.254 in [1]
|
||||
"""
|
||||
params = np.asarray(params)
|
||||
b = params[0 : self.k_linear]
|
||||
bw = params[self.k_linear:]
|
||||
LOO_X = LeaveOneOut(self.exog)
|
||||
LOO_Y = LeaveOneOut(self.endog).__iter__()
|
||||
LOO_Z = LeaveOneOut(self.exog_nonparametric).__iter__()
|
||||
Xb = np.dot(self.exog, b)[:,None]
|
||||
L = 0
|
||||
for ii, X_not_i in enumerate(LOO_X):
|
||||
Y = next(LOO_Y)
|
||||
Z = next(LOO_Z)
|
||||
Xb_j = np.dot(X_not_i, b)[:,None]
|
||||
Yx = Y - Xb_j
|
||||
G = self.func(bw, endog=Yx, exog=-Z,
|
||||
data_predict=-self.exog_nonparametric[ii, :])[0]
|
||||
lt = Xb[ii, :] #.sum() # linear term
|
||||
L += (self.endog[ii] - lt - G) ** 2
|
||||
|
||||
return L
|
||||
|
||||
def fit(self, exog_predict=None, exog_nonparametric_predict=None):
|
||||
"""Computes fitted values and marginal effects"""
|
||||
|
||||
if exog_predict is None:
|
||||
exog_predict = self.exog
|
||||
else:
|
||||
exog_predict = _adjust_shape(exog_predict, self.k_linear)
|
||||
|
||||
if exog_nonparametric_predict is None:
|
||||
exog_nonparametric_predict = self.exog_nonparametric
|
||||
else:
|
||||
exog_nonparametric_predict = _adjust_shape(exog_nonparametric_predict, self.K)
|
||||
|
||||
N_data_predict = np.shape(exog_nonparametric_predict)[0]
|
||||
mean = np.empty((N_data_predict,))
|
||||
mfx = np.empty((N_data_predict, self.K))
|
||||
Y = self.endog - np.dot(exog_predict, self.b)[:,None]
|
||||
for i in range(N_data_predict):
|
||||
mean_mfx = self.func(self.bw, Y, self.exog_nonparametric,
|
||||
data_predict=exog_nonparametric_predict[i, :])
|
||||
mean[i] = mean_mfx[0]
|
||||
mfx_c = np.squeeze(mean_mfx[1])
|
||||
mfx[i, :] = mfx_c
|
||||
|
||||
return mean, mfx
|
||||
|
||||
def __repr__(self):
|
||||
"""Provide something sane to print."""
|
||||
repr = "Semiparamatric Partially Linear Model \n"
|
||||
repr += "Number of variables: K = " + str(self.K) + "\n"
|
||||
repr += "Number of samples: N = " + str(self.nobs) + "\n"
|
||||
repr += "Variable types: " + self.var_type + "\n"
|
||||
repr += "BW selection method: cv_ls" + "\n"
|
||||
repr += "Estimator type: local constant" + "\n"
|
||||
return repr
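

# Illustrative usage sketch (added, not part of the original module): partially
# linear model with one linear regressor and one nonparametric regressor.
def _example_semilinear(nobs=300):
    x = np.random.uniform(-2, 2, size=(nobs, 1))   # enters linearly
    z = np.random.uniform(-2, 2, size=(nobs, 1))   # enters nonparametrically
    y = 0.5 * x[:, 0] + np.sin(2 * z[:, 0]) + 0.2 * np.random.randn(nobs)
    model = SemiLinear(endog=y, exog=x, exog_nonparametric=z,
                       var_type='c', k_linear=1)
    y_hat, mfx = model.fit()
    return model.b, model.bw, y_hat, mfx
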
|
||||
@@ -0,0 +1,576 @@
|
||||
"""
|
||||
This module contains the kernels for kernel smoothing.

Hopefully in the future they may be reused/extended for other kernel based
methods.
|
||||
|
||||
References:
|
||||
----------
|
||||
|
||||
Pointwise Kernel Confidence Bounds
|
||||
(smoothconf)
|
||||
http://fedc.wiwi.hu-berlin.de/xplore/ebooks/html/anr/anrhtmlframe62.html
|
||||
"""
|
||||
|
||||
# pylint: disable-msg=C0103
|
||||
# pylint: disable-msg=W0142
|
||||
# pylint: disable-msg=E1101
|
||||
# pylint: disable-msg=E0611
|
||||
from statsmodels.compat.python import lzip, lfilter
|
||||
import numpy as np
|
||||
import scipy.integrate
|
||||
from scipy.special import factorial
|
||||
from numpy import exp, multiply, square, divide, subtract, inf
|
||||
|
||||
|
||||
class NdKernel:
|
||||
"""Generic N-dimensial kernel
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n : int
|
||||
The number of series for kernel estimates
|
||||
kernels : list
|
||||
kernels
|
||||
|
||||
Can be constructed from either
|
||||
a) a list of n kernels which will be treated as
|
||||
independent marginals on a gaussian copula (specified by H)
or b) a single univariate kernel which will be applied radially to the
Mahalanobis distance defined by H.

In the case of the Gaussian these are both equivalent, and the second
construction is preferred.
|
||||
"""
|
||||
def __init__(self, n, kernels = None, H = None):
|
||||
if kernels is None:
|
||||
kernels = Gaussian()
|
||||
|
||||
self._kernels = kernels
|
||||
self.weights = None
|
||||
|
||||
if H is None:
|
||||
H = np.matrix( np.identity(n))
|
||||
|
||||
self._H = H
|
||||
self._Hrootinv = np.linalg.cholesky( H.I )
|
||||
|
||||
def getH(self):
|
||||
"""Getter for kernel bandwidth, H"""
|
||||
return self._H
|
||||
|
||||
def setH(self, value):
|
||||
"""Setter for kernel bandwidth, H"""
|
||||
self._H = value
|
||||
|
||||
H = property(getH, setH, doc="Kernel bandwidth matrix")
|
||||
|
||||
def density(self, xs, x):
|
||||
|
||||
n = len(xs)
|
||||
#xs = self.in_domain( xs, xs, x )[0]
|
||||
|
||||
if len(xs)>0: ## Need to do product of marginal distributions
|
||||
#w = np.sum([self(self._Hrootinv * (xx-x).T ) for xx in xs])/n
|
||||
#vectorized does not work:
|
||||
if self.weights is not None:
|
||||
w = np.mean(self((xs-x) * self._Hrootinv).T * self.weights)/sum(self.weights)
|
||||
else:
|
||||
w = np.mean(self((xs-x) * self._Hrootinv )) #transposed
|
||||
#w = np.mean([self(xd) for xd in ((xs-x) * self._Hrootinv)] ) #transposed
|
||||
return w
|
||||
else:
|
||||
return np.nan
|
||||
|
||||
def _kernweight(self, x ):
|
||||
"""returns the kernel weight for the independent multivariate kernel"""
|
||||
if isinstance( self._kernels, CustomKernel ):
|
||||
## Radial case
|
||||
#d = x.T * x
|
||||
#x is matrix, 2d, element wise sqrt looks wrong
|
||||
#d = np.sqrt( x.T * x )
|
||||
x = np.asarray(x)
|
||||
#d = np.sqrt( (x * x).sum(-1) )
|
||||
d = (x * x).sum(-1)
|
||||
return self._kernels( np.asarray(d) )
|
||||
|
||||
def __call__(self, x):
|
||||
"""
|
||||
This simply returns the value of the kernel function at x
|
||||
|
||||
Does the same as weight if the function is normalised
|
||||
"""
|
||||
return self._kernweight(x)
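

# Illustrative usage sketch (added, not part of the original module) for the
# CustomKernel class defined below: an Epanechnikov-shaped kernel with finite
# domain, used for a pointwise density estimate and for a smoothed fit with
# confidence bounds.  The shape is already normalised, so norm=1.0.
def _example_custom_kernel(nobs=200):
    epan = CustomKernel(lambda u: 0.75 * (1 - u**2), h=0.5,
                        domain=[-1.0, 1.0], norm=1.0)
    xs = np.sort(np.random.uniform(0, 1, size=nobs))
    ys = np.sin(2 * np.pi * xs) + 0.3 * np.random.randn(nobs)
    f0 = epan.density(xs, 0.5)                         # density estimate at x=0.5
    lower, yhat, upper = epan.smoothconf(xs, ys, 0.5)  # smoother with bounds
    return f0, (lower, yhat, upper)
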
|
||||
|
||||
|
||||
class CustomKernel:
|
||||
"""
|
||||
Generic 1D Kernel object.
|
||||
Can be constructed by selecting a standard named Kernel,
|
||||
or providing a lambda expression and domain.
|
||||
The domain allows some algorithms to run faster for finite domain kernels.
|
||||
"""
|
||||
# MC: Not sure how this will look in the end - or even still exist.
|
||||
# Main purpose of this is to allow custom kernels and to allow speed up
|
||||
# from finite support.
|
||||
|
||||
def __init__(self, shape, h = 1.0, domain = None, norm = None):
|
||||
"""
|
||||
shape should be a function taking and returning numeric type.
|
||||
|
||||
For sanity it should always return positive or zero but this is not
|
||||
enforced in case you want to do weird things. Bear in mind that the
|
||||
statistical tests etc. may not be valid for non-positive kernels.
|
||||
|
||||
The bandwidth of the kernel is supplied as h.
|
||||
|
||||
You may specify a domain as a list of 2 values [min, max], in which case
|
||||
kernel will be treated as zero outside these values. This will speed up
|
||||
calculation.
|
||||
|
||||
You may also specify the normalisation constant for the supplied Kernel.
|
||||
If you do this number will be stored and used as the normalisation
|
||||
without calculation. It is recommended you do this if you know the
|
||||
constant, to speed up calculation. In particular if the shape function
|
||||
provided is already normalised you should provide norm = 1.0.
|
||||
|
||||
Warning: I think several calculations assume that the kernel is
|
||||
normalized. No tests for non-normalized kernel.
|
||||
"""
|
||||
self._normconst = norm # a value or None, if None, then calculate
|
||||
self.domain = domain
|
||||
self.weights = None
|
||||
if callable(shape):
|
||||
self._shape = shape
|
||||
else:
|
||||
raise TypeError("shape must be a callable object/function")
|
||||
self._h = h
|
||||
self._L2Norm = None
|
||||
self._kernel_var = None
|
||||
self._normal_reference_constant = None
|
||||
self._order = None
|
||||
|
||||
def geth(self):
|
||||
"""Getter for kernel bandwidth, h"""
|
||||
return self._h
|
||||
def seth(self, value):
|
||||
"""Setter for kernel bandwidth, h"""
|
||||
self._h = value
|
||||
h = property(geth, seth, doc="Kernel Bandwidth")
|
||||
|
||||
def in_domain(self, xs, ys, x):
|
||||
"""
|
||||
Returns the filtered (xs, ys) based on the Kernel domain centred on x
|
||||
"""
|
||||
# Disable black-list functions: filter used for speed instead of
|
||||
# list-comprehension
|
||||
# pylint: disable-msg=W0141
|
||||
def isInDomain(xy):
|
||||
"""Used for filter to check if point is in the domain"""
|
||||
u = (xy[0]-x)/self.h
|
||||
return np.all((u >= self.domain[0]) & (u <= self.domain[1]))
|
||||
|
||||
if self.domain is None:
|
||||
return (xs, ys)
|
||||
else:
|
||||
filtered = lfilter(isInDomain, lzip(xs, ys))
|
||||
if len(filtered) > 0:
|
||||
xs, ys = lzip(*filtered)
|
||||
return (xs, ys)
|
||||
else:
|
||||
return ([], [])
|
||||
|
||||
def density(self, xs, x):
|
||||
"""Returns the kernel density estimate for point x based on x-values
|
||||
xs
|
||||
"""
|
||||
xs = np.asarray(xs)
|
||||
n = len(xs) # before in_domain?
|
||||
if self.weights is not None:
|
||||
xs, weights = self.in_domain( xs, self.weights, x )
|
||||
else:
|
||||
xs = self.in_domain( xs, xs, x )[0]
|
||||
xs = np.asarray(xs)
|
||||
#print 'len(xs)', len(xs), x
|
||||
if xs.ndim == 1:
|
||||
xs = xs[:,None]
|
||||
if len(xs)>0:
|
||||
h = self.h
|
||||
if self.weights is not None:
|
||||
w = 1 / h * np.sum(self((xs-x)/h).T * weights, axis=1)
|
||||
else:
|
||||
w = 1. / (h * n) * np.sum(self((xs-x)/h), axis=0)
|
||||
return w
|
||||
else:
|
||||
return np.nan
|
||||
|
||||
def density_var(self, density, nobs):
|
||||
"""approximate pointwise variance for kernel density
|
||||
|
||||
not verified
|
||||
|
||||
Parameters
|
||||
----------
|
||||
density : array_like
|
||||
pdf of the kernel density
|
||||
nobs : int
|
||||
number of observations used in the KDE estimation
|
||||
|
||||
Returns
|
||||
-------
|
||||
kde_var : ndarray
|
||||
estimated variance of the density estimate
|
||||
|
||||
Notes
|
||||
-----
|
||||
This uses the asymptotic normal approximation to the distribution of
|
||||
the density estimate.
|
||||
"""
|
||||
return np.asarray(density) * self.L2Norm / self.h / nobs
|
||||
|
||||
def density_confint(self, density, nobs, alpha=0.05):
|
||||
"""approximate pointwise confidence interval for kernel density
|
||||
|
||||
The confidence interval is centered at the estimated density and
|
||||
ignores the bias of the density estimate.
|
||||
|
||||
not verified
|
||||
|
||||
Parameters
|
||||
----------
|
||||
density : array_like
|
||||
pdf of the kernel density
|
||||
nobs : int
|
||||
number of observations used in the KDE estimation
|
||||
|
||||
Returns
|
||||
-------
|
||||
conf_int : ndarray
|
||||
estimated confidence interval of the density estimate, lower bound
|
||||
in first column and upper bound in second column
|
||||
|
||||
Notes
|
||||
-----
|
||||
This uses the asymptotic normal approximation to the distribution of
|
||||
the density estimate. The lower bound can be negative for density
|
||||
values close to zero.
|
||||
"""
|
||||
from scipy import stats
|
||||
crit = stats.norm.isf(alpha / 2.)
|
||||
density = np.asarray(density)
|
||||
half_width = crit * np.sqrt(self.density_var(density, nobs))
|
||||
conf_int = np.column_stack((density - half_width, density + half_width))
|
||||
return conf_int
|
||||
|
||||
    def smooth(self, xs, ys, x):
        """Returns the kernel smoothing estimate for point x based on x-values
        xs and y-values ys.
        Not expected to be called by the user.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs)>0:
            w = np.sum(self((xs-x)/self.h))
            #TODO: change the below to broadcasting when shape is sorted
            v = np.sum([yy*self((xx-x)/self.h) for xx, yy in zip(xs, ys)])
            return v / w
        else:
            return np.nan

    def smoothvar(self, xs, ys, x):
        """Returns the kernel smoothing estimate of the variance at point x.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            sqresid = square( subtract(ys, fittedvals) )
            w = np.sum(self((xs-x)/self.h))
            v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            return v / w
        else:
            return np.nan

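    # smooth() above is the Nadaraya-Watson estimator
    #     yhat(x) = sum_i K((x_i - x)/h) * y_i / sum_i K((x_i - x)/h)
    # and smoothvar() applies the same local weighting to the squared
    # residuals; smoothconf() below adds approximate pointwise bounds.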
    def smoothconf(self, xs, ys, x, alpha=0.05):
        """Returns the kernel smoothing estimate at x with pointwise
        confidence bounds as a (lower, estimate, upper) tuple; the default
        alpha=0.05 gives 95% bounds.
        """
        xs, ys = self.in_domain(xs, ys, x)

        if len(xs) > 0:
            fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
            #fittedvals = self.smooth(xs, ys, x)  # x or xs in Haerdle
            sqresid = square(
                subtract(ys, fittedvals)
            )
            w = np.sum(self((xs-x)/self.h))
            #var = sqresid.sum() / (len(sqresid) - 0)  # nonlocal var ? JP just trying
            v = np.sum([rr*self((xx-x)/self.h) for xx, rr in zip(xs, sqresid)])
            var = v / w
            sd = np.sqrt(var)
            K = self.L2Norm
            yhat = self.smooth(xs, ys, x)
            from scipy import stats
            crit = stats.norm.isf(alpha / 2)
            err = crit * sd * np.sqrt(K) / np.sqrt(w * self.h * self.norm_const)
            return (yhat - err, yhat, yhat + err)
        else:
            return (np.nan, np.nan, np.nan)

    @property
    def L2Norm(self):
        """Returns the integral of the square of the kernel from -inf to inf"""
        if self._L2Norm is None:
            L2Func = lambda x: (self.norm_const*self._shape(x))**2
            if self.domain is None:
                self._L2Norm = scipy.integrate.quad(L2Func, -inf, inf)[0]
            else:
                self._L2Norm = scipy.integrate.quad(L2Func, self.domain[0],
                                                    self.domain[1])[0]
        return self._L2Norm

    @property
    def norm_const(self):
        """
        Normalising constant for kernel (integral from -inf to inf)
        """
        if self._normconst is None:
            if self.domain is None:
                quadres = scipy.integrate.quad(self._shape, -inf, inf)
            else:
                quadres = scipy.integrate.quad(self._shape, self.domain[0],
                                               self.domain[1])
            self._normconst = 1.0/(quadres[0])
        return self._normconst

    @property
    def kernel_var(self):
        """Returns the second moment of the kernel"""
        if self._kernel_var is None:
            func = lambda x: x**2 * self.norm_const * self._shape(x)
            if self.domain is None:
                self._kernel_var = scipy.integrate.quad(func, -inf, inf)[0]
            else:
                self._kernel_var = scipy.integrate.quad(func, self.domain[0],
                                                        self.domain[1])[0]
        return self._kernel_var

    def moments(self, n):

        if n > 2:
            msg = "Only first and second moment currently implemented"
            raise NotImplementedError(msg)

        if n == 1:
            return 0

        if n == 2:
            return self.kernel_var

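    # Added note: moments(1) == 0 relies on the kernel being symmetric
    # around zero, which holds for all kernels defined in this module.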
    @property
    def normal_reference_constant(self):
        """
        Constant used for the Silverman normal reference asymptotic bandwidth
        calculation.

        C = 2 * ( (pi**(1/2) * (nu!)**3 * R(k)) /
                  (2 * nu * (2*nu)! * kap_nu(k)**2) )**(1/(2*nu+1))

        nu     = kernel order
        kap_nu = nu'th moment of the kernel
        R(k)   = kernel roughness (integral of the squared kernel)

        Note: the L2Norm property returns R(k), i.e. the square of the
        L^2 norm.
        """
        nu = self._order

        if not nu == 2:
            msg = "Only implemented for second order kernels"
            raise NotImplementedError(msg)

        if self._normal_reference_constant is None:
            C = np.pi**(.5) * factorial(nu)**3 * self.L2Norm
            C /= (2 * nu * factorial(2 * nu) * self.moments(nu)**2)
            C = 2*C**(1.0/(2*nu+1))
            self._normal_reference_constant = C

        return self._normal_reference_constant

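    # Added note: for the Gaussian kernel (nu=2, R(k)=1/(2*sqrt(pi)),
    # kap_2=1) the formula above gives C = (4/3)**(1/5) ~ 1.0593, i.e. the
    # familiar Silverman rule-of-thumb factor in h = C * sigma * n**(-1/5).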
    def weight(self, x):
        """This returns the normalised weight at distance x"""
        return self.norm_const*self._shape(x)

    def __call__(self, x):
        """
        This simply returns the value of the kernel function at x

        Does the same as weight if the function is normalised
        """
        return self._shape(x)


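# Illustrative sketch (added comment; names below are only examples): a
# kernel that is not predefined in this module can be used directly through
# CustomKernel, e.g. a quartic-like shape on [-1, 1] with the normalising
# constant left to numerical quadrature:
#
#   >>> quartic = CustomKernel(shape=lambda u: (1 - u*u)**2, h=0.5,
#   ...                        domain=[-1.0, 1.0], norm=None)
#   >>> quartic.norm_const        # ~ 0.9375, computed by quadrature
#
# Passing norm explicitly (as the subclasses below do) skips the quadrature.

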
class Uniform(CustomKernel):
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 0.5 * np.ones(x.shape), h=h,
|
||||
domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = 0.5
|
||||
self._kernel_var = 1. / 3
|
||||
self._order = 2
|
||||
|
||||
|
||||
class Triangular(CustomKernel):
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 1 - abs(x), h=h,
|
||||
domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = 2.0/3.0
|
||||
self._kernel_var = 1. / 6
|
||||
self._order = 2
|
||||
|
||||
|
||||
class Epanechnikov(CustomKernel):
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 0.75*(1 - x*x), h=h,
|
||||
domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = 0.6
|
||||
self._kernel_var = 0.2
|
||||
self._order = 2
|
||||
|
||||
|
||||
class Biweight(CustomKernel):
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 0.9375*(1 - x*x)**2, h=h,
|
||||
domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = 5.0/7.0
|
||||
self._kernel_var = 1. / 7
|
||||
self._order = 2
|
||||
|
||||
def smooth(self, xs, ys, x):
|
||||
"""Returns the kernel smoothing estimate for point x based on x-values
|
||||
xs and y-values ys.
|
||||
Not expected to be called by the user.
|
||||
|
||||
Special implementation optimized for Biweight.
|
||||
"""
|
||||
xs, ys = self.in_domain(xs, ys, x)
|
||||
|
||||
if len(xs) > 0:
|
||||
w = np.sum(square(subtract(1, square(divide(subtract(xs, x),
|
||||
self.h)))))
|
||||
v = np.sum(multiply(ys, square(subtract(1, square(divide(
|
||||
subtract(xs, x), self.h))))))
|
||||
return v / w
|
||||
else:
|
||||
return np.nan
|
||||
|
||||
def smoothvar(self, xs, ys, x):
|
||||
"""
|
||||
Returns the kernel smoothing estimate of the variance at point x.
|
||||
"""
|
||||
xs, ys = self.in_domain(xs, ys, x)
|
||||
|
||||
if len(xs) > 0:
|
||||
fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
|
||||
rs = square(subtract(ys, fittedvals))
|
||||
w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
|
||||
self.h)))))
|
||||
v = np.sum(multiply(rs, square(subtract(1, square(divide(
|
||||
subtract(xs, x), self.h))))))
|
||||
return v / w
|
||||
else:
|
||||
return np.nan
|
||||
|
||||
def smoothconf_(self, xs, ys, x):
|
||||
"""Returns the kernel smoothing estimate with confidence 1sigma bounds
|
||||
"""
|
||||
xs, ys = self.in_domain(xs, ys, x)
|
||||
|
||||
if len(xs) > 0:
|
||||
fittedvals = np.array([self.smooth(xs, ys, xx) for xx in xs])
|
||||
rs = square(subtract(ys, fittedvals))
|
||||
w = np.sum(square(subtract(1.0, square(divide(subtract(xs, x),
|
||||
self.h)))))
|
||||
v = np.sum(multiply(rs, square(subtract(1, square(divide(
|
||||
subtract(xs, x), self.h))))))
|
||||
var = v / w
|
||||
sd = np.sqrt(var)
|
||||
K = self.L2Norm
|
||||
yhat = self.smooth(xs, ys, x)
|
||||
err = sd * K / np.sqrt(0.9375 * w * self.h)
|
||||
return (yhat - err, yhat, yhat + err)
|
||||
else:
|
||||
return (np.nan, np.nan, np.nan)
|
||||
|
||||
class Triweight(CustomKernel):
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 1.09375*(1 - x*x)**3, h=h,
|
||||
domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = 350.0/429.0
|
||||
self._kernel_var = 1. / 9
|
||||
self._order = 2
|
||||
|
||||
|
||||
class Gaussian(CustomKernel):
|
||||
"""
|
||||
Gaussian (Normal) Kernel
|
||||
|
||||
K(u) = 1 / (sqrt(2*pi)) exp(-0.5 u**2)
|
||||
"""
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape = lambda x: 0.3989422804014327 *
|
||||
np.exp(-x**2/2.0), h = h, domain = None, norm = 1.0)
|
||||
self._L2Norm = 1.0/(2.0*np.sqrt(np.pi))
|
||||
self._kernel_var = 1.0
|
||||
self._order = 2
|
||||
|
||||
def smooth(self, xs, ys, x):
|
||||
"""Returns the kernel smoothing estimate for point x based on x-values
|
||||
xs and y-values ys.
|
||||
Not expected to be called by the user.
|
||||
|
||||
Special implementation optimized for Gaussian.
|
||||
"""
|
||||
w = np.sum(exp(multiply(square(divide(subtract(xs, x),
|
||||
self.h)),-0.5)))
|
||||
v = np.sum(multiply(ys, exp(multiply(square(divide(subtract(xs, x),
|
||||
self.h)), -0.5))))
|
||||
return v/w
|
||||
|
||||
class Cosine(CustomKernel):
|
||||
"""
|
||||
Cosine Kernel
|
||||
|
||||
K(u) = pi/4 cos(0.5 * pi * u) between -1.0 and 1.0
|
||||
"""
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 0.78539816339744828 *
|
||||
np.cos(np.pi/2.0 * x), h=h, domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = np.pi**2/16.0
|
||||
self._kernel_var = 0.1894305308612978 # = 1 - 8 / np.pi**2
|
||||
self._order = 2
|
||||
|
||||
|
||||
class Cosine2(CustomKernel):
|
||||
"""
|
||||
Cosine2 Kernel
|
||||
|
||||
K(u) = 1 + cos(2 * pi * u) between -0.5 and 0.5
|
||||
|
||||
Note: this is the same Cosine kernel that Stata uses
|
||||
"""
|
||||
def __init__(self, h=1.0):
|
||||
CustomKernel.__init__(self, shape=lambda x: 1 + np.cos(2.0 * np.pi * x)
|
||||
, h=h, domain=[-0.5, 0.5], norm = 1.0)
|
||||
self._L2Norm = 1.5
|
||||
self._kernel_var = 0.03267274151216444 # = 1/12. - 0.5 / np.pi**2
|
||||
self._order = 2
|
||||
|
||||
class Tricube(CustomKernel):
|
||||
"""
|
||||
Tricube Kernel
|
||||
|
||||
K(u) = 0.864197530864 * (1 - abs(x)**3)**3 between -1.0 and 1.0
|
||||
"""
|
||||
def __init__(self,h=1.0):
|
||||
CustomKernel.__init__(self,shape=lambda x: 0.864197530864 * (1 - abs(x)**3)**3,
|
||||
h=h, domain=[-1.0, 1.0], norm = 1.0)
|
||||
self._L2Norm = 175.0/247.0
|
||||
self._kernel_var = 35.0/243.0
|
||||
self._order = 2
|
||||
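

# ----------------------------------------------------------------------
# Added usage sketch (not part of the original module): a minimal kernel
# density estimate with pointwise confidence bounds, assuming numpy and the
# classes above are available as defined in this file.
if __name__ == '__main__':
    np.random.seed(12345)
    sample = np.random.standard_normal(200)        # toy data
    kern = Gaussian(h=0.5)
    grid = np.linspace(-3, 3, 7)
    # pointwise density estimate and approximate 95% confidence interval
    dens = np.array([kern.density(sample, xi) for xi in grid]).squeeze()
    conf = kern.density_confint(dens, nobs=len(sample))
    print(np.column_stack((grid, dens, conf)))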
@ -0,0 +1,401 @@
|
||||
"""
|
||||
This module contains scatterplot smoothers, that is classes
|
||||
who generate a smooth fit of a set of (x,y) pairs.
|
||||
"""
|
||||
|
||||
# pylint: disable-msg=C0103
|
||||
# pylint: disable-msg=W0142
|
||||
# pylint: disable-msg=E0611
|
||||
# pylint: disable-msg=E1101
|
||||
|
||||
import numpy as np
|
||||
from . import kernels
|
||||
|
||||
|
||||
class KernelSmoother:
|
||||
"""
|
||||
1D Kernel Density Regression/Kernel Smoother
|
||||
|
||||
Requires:
|
||||
x - array_like of x values
|
||||
y - array_like of y values
|
||||
Kernel - Kernel object, Default is Gaussian.
|
||||
"""
|
||||
def __init__(self, x, y, Kernel = None):
|
||||
if Kernel is None:
|
||||
Kernel = kernels.Gaussian()
|
||||
self.Kernel = Kernel
|
||||
self.x = np.array(x)
|
||||
self.y = np.array(y)
|
||||
|
||||
def fit(self):
|
||||
pass
|
||||
|
||||
def __call__(self, x):
|
||||
return np.array([self.predict(xx) for xx in x])
|
||||
|
||||
def predict(self, x):
|
||||
"""
|
||||
Returns the kernel smoothed prediction at x
|
||||
|
||||
If x is a real number then a single value is returned.
|
||||
|
||||
Otherwise an attempt is made to cast x to numpy.ndarray and an array of
|
||||
corresponding y-points is returned.
|
||||
"""
|
||||
if np.size(x) == 1: # if isinstance(x, numbers.Real):
|
||||
return self.Kernel.smooth(self.x, self.y, x)
|
||||
else:
|
||||
return np.array([self.Kernel.smooth(self.x, self.y, xx) for xx
|
||||
in np.array(x)])
|
||||
|
||||
def conf(self, x):
|
||||
"""
|
||||
Returns the fitted curve and 1-sigma upper and lower point-wise
|
||||
confidence.
|
||||
These bounds are based on variance only, and do not include the bias.
|
||||
If the bandwidth is much larger than the curvature of the underlying
|
||||
function then the bias could be large.
|
||||
|
||||
x is the points on which you want to evaluate the fit and the errors.
|
||||
|
||||
Alternatively if x is specified as a positive integer, then the fit and
|
||||
confidence bands points will be returned after every
|
||||
xth sample point - so they are closer together where the data
|
||||
is denser.
|
||||
"""
|
||||
if isinstance(x, int):
|
||||
sorted_x = np.array(self.x)
|
||||
sorted_x.sort()
|
||||
confx = sorted_x[::x]
|
||||
conffit = self.conf(confx)
|
||||
return (confx, conffit)
|
||||
else:
|
||||
return np.array([self.Kernel.smoothconf(self.x, self.y, xx)
|
||||
for xx in x])
|
||||
|
||||
|
||||
def var(self, x):
|
||||
return np.array([self.Kernel.smoothvar(self.x, self.y, xx) for xx in x])
|
||||
|
||||
def std(self, x):
|
||||
return np.sqrt(self.var(x))
|
||||
|
||||
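# Added usage sketch (illustrative only; assumes x and y are 1d arrays and
# that this module is imported together with its kernels module):
#
#   >>> ks = KernelSmoother(x, y, Kernel=kernels.Biweight(h=0.5))
#   >>> yhat = ks(np.linspace(x.min(), x.max(), 50))    # smoothed curve
#   >>> xconf, bands = ks.conf(5)
#
# conf(5) evaluates smoothconf (lower, fit, upper) at every 5th sorted x
# value; passing an array instead evaluates the bounds at those points.

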
class PolySmoother:
|
||||
"""
|
||||
Polynomial smoother up to a given order.
|
||||
Fit based on weighted least squares.
|
||||
|
||||
The x values can be specified at instantiation or when called.
|
||||
|
||||
This is a 3 liner with OLS or WLS, see test.
|
||||
It's here as a test smoother for GAM
|
||||
"""
|
||||
#JP: heavily adjusted to work as plugin replacement for bspline
|
||||
# smoother in gam.py initialized by function default_smoother
|
||||
# Only fixed exceptions; I did not check whether it is statistically
# correct, and I think it is not. There may still be some dimension
# problems, and there were some dimension problems initially.
|
||||
# TODO: undo adjustments and fix dimensions correctly
|
||||
# comment: this is just like polyfit with initialization options
|
||||
# and additional results (OLS on polynomial of x (x is 1d?))
|
||||
|
||||
|
||||
def __init__(self, order, x=None):
|
||||
#order = 4 # set this because we get knots instead of order
|
||||
self.order = order
|
||||
|
||||
#print order, x.shape
|
||||
self.coef = np.zeros((order+1,), np.float64)
|
||||
if x is not None:
|
||||
if x.ndim > 1:
|
||||
print('Warning: 2d x detected in PolySmoother init, shape:', x.shape)
|
||||
x=x[0,:] #check orientation
|
||||
self.X = np.array([x**i for i in range(order+1)]).T
|
||||
|
||||
def df_fit(self):
|
||||
'''alias of df_model for backwards compatibility
|
||||
'''
|
||||
return self.df_model()
|
||||
|
||||
def df_model(self):
|
||||
"""
|
||||
Degrees of freedom used in the fit.
|
||||
"""
|
||||
return self.order + 1
|
||||
|
||||
def gram(self, d=None):
|
||||
#fake for spline imitation
|
||||
pass
|
||||
|
||||
def smooth(self,*args, **kwds):
|
||||
'''alias for fit, for backwards compatibility,
|
||||
|
||||
do we need it with different behavior than fit?
|
||||
|
||||
'''
|
||||
return self.fit(*args, **kwds)
|
||||
|
||||
def df_resid(self):
|
||||
"""
|
||||
Residual degrees of freedom from last fit.
|
||||
"""
|
||||
return self.N - self.order - 1
|
||||
|
||||
def __call__(self, x=None):
|
||||
return self.predict(x=x)
|
||||
|
||||
|
||||
def predict(self, x=None):
|
||||
|
||||
if x is not None:
|
||||
#if x.ndim > 1: x=x[0,:]  # why this? it should select a column, not a row
|
||||
if x.ndim > 1:
|
||||
print('Warning: 2d x detected in PolySmoother predict, shape:', x.shape)
|
||||
x=x[:,0] #TODO: check and clean this up
|
||||
X = np.array([(x**i) for i in range(self.order+1)])
|
||||
else:
|
||||
X = self.X
|
||||
#return np.squeeze(np.dot(X.T, self.coef))
|
||||
#need to check what dimension this is supposed to be
|
||||
if X.shape[1] == self.coef.shape[0]:
|
||||
return np.squeeze(np.dot(X, self.coef))#[0]
|
||||
else:
|
||||
return np.squeeze(np.dot(X.T, self.coef))#[0]
|
||||
|
||||
def fit(self, y, x=None, weights=None):
|
||||
self.N = y.shape[0]
|
||||
if y.ndim == 1:
|
||||
y = y[:,None]
|
||||
if weights is None or np.isnan(weights).all():
|
||||
weights = 1
|
||||
_w = 1
|
||||
else:
|
||||
_w = np.sqrt(weights)[:,None]
|
||||
if x is None:
|
||||
if not hasattr(self, "X"):
|
||||
raise ValueError("x needed to fit PolySmoother")
|
||||
else:
|
||||
if x.ndim > 1:
|
||||
print('Warning: 2d x detected in PolySmoother fit, shape:', x.shape)
|
||||
#x=x[0,:] #TODO: check orientation, row or col
|
||||
self.X = np.array([(x**i) for i in range(self.order+1)]).T
|
||||
#print _w.shape
|
||||
|
||||
X = self.X * _w
|
||||
|
||||
_y = y * _w#[:,None]
|
||||
#self.coef = np.dot(L.pinv(X).T, _y[:,None])
|
||||
#self.coef = np.dot(L.pinv(X), _y)
|
||||
self.coef = np.linalg.lstsq(X, _y, rcond=-1)[0]
|
||||
self.params = np.squeeze(self.coef)
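
# Added note (illustrative, matching the class docstring's "3 liner with
# OLS"): after  pmod = PolySmoother(order, x); pmod.fit(y)  the coefficients
# in pmod.coef agree with
#
#   >>> exog = np.vander(x, order + 1, increasing=True)   # 1, x, x**2, ...
#   >>> OLS(y, exog).fit().params
#
# (and with WLS when weights are passed to fit); see the smoother tests
# added later in this commit.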
||||
|
||||
# comment out for now to remove dependency on _hbspline
|
||||
|
||||
##class SmoothingSpline(BSpline):
|
||||
##
|
||||
## penmax = 30.
|
||||
##
|
||||
## def fit(self, y, x=None, weights=None, pen=0.):
|
||||
## banded = True
|
||||
##
|
||||
## if x is None:
|
||||
## x = self.tau[(self.M-1):-(self.M-1)] # internal knots
|
||||
##
|
||||
## if pen == 0.: # cannot use cholesky for singular matrices
|
||||
## banded = False
|
||||
##
|
||||
## if x.shape != y.shape:
|
||||
## raise ValueError('x and y shape do not agree, by default x are the Bspline\'s internal knots')
|
||||
##
|
||||
## bt = self.basis(x)
|
||||
## if pen >= self.penmax:
|
||||
## pen = self.penmax
|
||||
##
|
||||
## if weights is None:
|
||||
## weights = np.array(1.)
|
||||
##
|
||||
## wmean = weights.mean()
|
||||
## _w = np.sqrt(weights / wmean)
|
||||
## bt *= _w
|
||||
##
|
||||
## # throw out rows with zeros (this happens at boundary points!)
|
||||
##
|
||||
## mask = np.flatnonzero(1 - np.alltrue(np.equal(bt, 0), axis=0))
|
||||
##
|
||||
## bt = bt[:, mask]
|
||||
## y = y[mask]
|
||||
##
|
||||
## self.df_total = y.shape[0]
|
||||
##
|
||||
## if bt.shape[1] != y.shape[0]:
|
||||
## raise ValueError("some x values are outside range of B-spline knots")
|
||||
## bty = np.dot(bt, _w * y)
|
||||
## self.N = y.shape[0]
|
||||
## if not banded:
|
||||
## self.btb = np.dot(bt, bt.T)
|
||||
## _g = _band2array(self.g, lower=1, symmetric=True)
|
||||
## self.coef, _, self.rank = L.lstsq(self.btb + pen*_g, bty)[0:3]
|
||||
## self.rank = min(self.rank, self.btb.shape[0])
|
||||
## else:
|
||||
## self.btb = np.zeros(self.g.shape, np.float64)
|
||||
## nband, nbasis = self.g.shape
|
||||
## for i in range(nbasis):
|
||||
## for k in range(min(nband, nbasis-i)):
|
||||
## self.btb[k, i] = (bt[i] * bt[i+k]).sum()
|
||||
##
|
||||
## bty.shape = (1, bty.shape[0])
|
||||
## self.chol, self.coef = solveh_banded(self.btb +
|
||||
## pen*self.g,
|
||||
## bty, lower=1)
|
||||
##
|
||||
## self.coef = np.squeeze(self.coef)
|
||||
## self.resid = np.sqrt(wmean) * (y * _w - np.dot(self.coef, bt))
|
||||
## self.pen = pen
|
||||
##
|
||||
## def gcv(self):
|
||||
## """
|
||||
## Generalized cross-validation score of current fit.
|
||||
## """
|
||||
##
|
||||
## norm_resid = (self.resid**2).sum()
|
||||
## return norm_resid / (self.df_total - self.trace())
|
||||
##
|
||||
## def df_resid(self):
|
||||
## """
|
||||
## self.N - self.trace()
|
||||
##
|
||||
## where self.N is the number of observations of last fit.
|
||||
## """
|
||||
##
|
||||
## return self.N - self.trace()
|
||||
##
|
||||
## def df_fit(self):
|
||||
## """
|
||||
## = self.trace()
|
||||
##
|
||||
## How many degrees of freedom used in the fit?
|
||||
## """
|
||||
## return self.trace()
|
||||
##
|
||||
## def trace(self):
|
||||
## """
|
||||
## Trace of the smoothing matrix S(pen)
|
||||
## """
|
||||
##
|
||||
## if self.pen > 0:
|
||||
## _invband = _hbspline.invband(self.chol.copy())
|
||||
## tr = _trace_symbanded(_invband, self.btb, lower=1)
|
||||
## return tr
|
||||
## else:
|
||||
## return self.rank
|
||||
##
|
||||
##class SmoothingSplineFixedDF(SmoothingSpline):
|
||||
## """
|
||||
## Fit smoothing spline with approximately df degrees of freedom
|
||||
## used in the fit, i.e. so that self.trace() is approximately df.
|
||||
##
|
||||
## In general, df must be greater than the dimension of the null space
|
||||
## of the Gram inner product. For cubic smoothing splines, this means
|
||||
## that df > 2.
|
||||
## """
|
||||
##
|
||||
## target_df = 5
|
||||
##
|
||||
## def __init__(self, knots, order=4, coef=None, M=None, target_df=None):
|
||||
## if target_df is not None:
|
||||
## self.target_df = target_df
|
||||
## BSpline.__init__(self, knots, order=order, coef=coef, M=M)
|
||||
## self.target_reached = False
|
||||
##
|
||||
## def fit(self, y, x=None, df=None, weights=None, tol=1.0e-03):
|
||||
##
|
||||
## df = df or self.target_df
|
||||
##
|
||||
## apen, bpen = 0, 1.0e-03
|
||||
## olddf = y.shape[0] - self.m
|
||||
##
|
||||
## if not self.target_reached:
|
||||
## while True:
|
||||
## curpen = 0.5 * (apen + bpen)
|
||||
## SmoothingSpline.fit(self, y, x=x, weights=weights, pen=curpen)
|
||||
## curdf = self.trace()
|
||||
## if curdf > df:
|
||||
## apen, bpen = curpen, 2 * curpen
|
||||
## else:
|
||||
## apen, bpen = apen, curpen
|
||||
## if apen >= self.penmax:
|
||||
## raise ValueError("penalty too large, try setting penmax higher or decreasing df")
|
||||
## if np.fabs(curdf - df) / df < tol:
|
||||
## self.target_reached = True
|
||||
## break
|
||||
## else:
|
||||
## SmoothingSpline.fit(self, y, x=x, weights=weights, pen=self.pen)
|
||||
##
|
||||
##class SmoothingSplineGCV(SmoothingSpline):
|
||||
##
|
||||
## """
|
||||
## Fit smoothing spline trying to optimize GCV.
|
||||
##
|
||||
## Try to find a bracketing interval for scipy.optimize.golden
|
||||
## based on bracket.
|
||||
##
|
||||
## It is probably best to use target_df instead, as it is
|
||||
## sometimes difficult to find a bracketing interval.
|
||||
##
|
||||
## """
|
||||
##
|
||||
## def fit(self, y, x=None, weights=None, tol=1.0e-03,
|
||||
## bracket=(0,1.0e-03)):
|
||||
##
|
||||
## def _gcv(pen, y, x):
|
||||
## SmoothingSpline.fit(y, x=x, pen=np.exp(pen), weights=weights)
|
||||
## a = self.gcv()
|
||||
## return a
|
||||
##
|
||||
## a = golden(_gcv, args=(y,x), brack=(-100,20), tol=tol)
|
||||
##
|
||||
##def _trace_symbanded(a,b, lower=0):
|
||||
## """
|
||||
## Compute the trace(a*b) for two upper or lower banded real symmetric matrices.
|
||||
## """
|
||||
##
|
||||
## if lower:
|
||||
## t = _zero_triband(a * b, lower=1)
|
||||
## return t[0].sum() + 2 * t[1:].sum()
|
||||
## else:
|
||||
## t = _zero_triband(a * b, lower=0)
|
||||
## return t[-1].sum() + 2 * t[:-1].sum()
|
||||
##
|
||||
##
|
||||
##
|
||||
##def _zero_triband(a, lower=0):
|
||||
## """
|
||||
## Zero out unnecessary elements of a real symmetric banded matrix.
|
||||
## """
|
||||
##
|
||||
## nrow, ncol = a.shape
|
||||
## if lower:
|
||||
## for i in range(nrow): a[i,(ncol-i):] = 0.
|
||||
## else:
|
||||
## for i in range(nrow): a[i,0:i] = 0.
|
||||
## return a
|
||||
@ -0,0 +1,57 @@
|
||||
"""
|
||||
Created on Fri Mar 04 07:36:28 2011
|
||||
|
||||
@author: Mike
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
class kdetest:
|
||||
|
||||
|
||||
Hpi = np.matrix([[ 0.05163034, 0.5098923 ],
|
||||
[0.50989228, 8.8822365 ]])
|
||||
|
||||
|
||||
faithfulData = dict(
|
||||
eruptions=[
|
||||
3.6, 1.8, 3.333, 2.283, 4.533, 2.883, 4.7, 3.6, 1.95, 4.35, 1.833, 3.917,
|
||||
4.2, 1.75, 4.7, 2.167, 1.75, 4.8, 1.6, 4.25, 1.8, 1.75, 3.45, 3.067, 4.533,
|
||||
3.6, 1.967, 4.083, 3.85, 4.433, 4.3, 4.467, 3.367, 4.033, 3.833, 2.017, 1.867,
|
||||
4.833, 1.833, 4.783, 4.35, 1.883, 4.567, 1.75, 4.533, 3.317, 3.833, 2.1, 4.633,
|
||||
2, 4.8, 4.716, 1.833, 4.833, 1.733, 4.883, 3.717, 1.667, 4.567, 4.317, 2.233, 4.5,
|
||||
1.75, 4.8, 1.817, 4.4, 4.167, 4.7, 2.067, 4.7, 4.033, 1.967, 4.5, 4, 1.983, 5.067,
|
||||
2.017, 4.567, 3.883, 3.6, 4.133, 4.333, 4.1, 2.633, 4.067, 4.933, 3.95, 4.517, 2.167,
|
||||
4, 2.2, 4.333, 1.867, 4.817, 1.833, 4.3, 4.667, 3.75, 1.867, 4.9, 2.483, 4.367, 2.1, 4.5,
|
||||
4.05, 1.867, 4.7, 1.783, 4.85, 3.683, 4.733, 2.3, 4.9, 4.417, 1.7, 4.633, 2.317, 4.6,
|
||||
1.817, 4.417, 2.617, 4.067, 4.25, 1.967, 4.6, 3.767, 1.917, 4.5, 2.267, 4.65, 1.867,
|
||||
4.167, 2.8, 4.333, 1.833, 4.383, 1.883, 4.933, 2.033, 3.733, 4.233, 2.233, 4.533,
|
||||
4.817, 4.333, 1.983, 4.633, 2.017, 5.1, 1.8, 5.033, 4, 2.4, 4.6, 3.567, 4, 4.5, 4.083,
|
||||
1.8, 3.967, 2.2, 4.15, 2, 3.833, 3.5, 4.583, 2.367, 5, 1.933, 4.617, 1.917, 2.083,
|
||||
4.583, 3.333, 4.167, 4.333, 4.5, 2.417, 4, 4.167, 1.883, 4.583, 4.25, 3.767, 2.033,
|
||||
4.433, 4.083, 1.833, 4.417, 2.183, 4.8, 1.833, 4.8, 4.1, 3.966, 4.233, 3.5, 4.366,
|
||||
2.25, 4.667, 2.1, 4.35, 4.133, 1.867, 4.6, 1.783, 4.367, 3.85, 1.933, 4.5, 2.383,
|
||||
4.7, 1.867, 3.833, 3.417, 4.233, 2.4, 4.8, 2, 4.15, 1.867, 4.267, 1.75, 4.483, 4,
|
||||
4.117, 4.083, 4.267, 3.917, 4.55, 4.083, 2.417, 4.183, 2.217, 4.45, 1.883, 1.85,
|
||||
4.283, 3.95, 2.333, 4.15, 2.35, 4.933, 2.9, 4.583, 3.833, 2.083, 4.367, 2.133, 4.35,
|
||||
2.2, 4.45, 3.567, 4.5, 4.15, 3.817, 3.917, 4.45, 2, 4.283, 4.767, 4.533, 1.85, 4.25,
|
||||
1.983, 2.25, 4.75, 4.117, 2.15, 4.417, 1.817, 4.467],
|
||||
waiting=[
|
||||
79, 54, 74, 62, 85, 55, 88, 85, 51, 85, 54, 84, 78, 47, 83, 52,
|
||||
62, 84, 52, 79, 51, 47, 78, 69, 74, 83, 55, 76, 78, 79, 73, 77,
|
||||
66, 80, 74, 52, 48, 80, 59, 90, 80, 58, 84, 58, 73, 83, 64, 53,
|
||||
82, 59, 75, 90, 54, 80, 54, 83, 71, 64, 77, 81, 59, 84, 48, 82,
|
||||
60, 92, 78, 78, 65, 73, 82, 56, 79, 71, 62, 76, 60, 78, 76, 83,
|
||||
75, 82, 70, 65, 73, 88, 76, 80, 48, 86, 60, 90, 50, 78, 63, 72,
|
||||
84, 75, 51, 82, 62, 88, 49, 83, 81, 47, 84, 52, 86, 81, 75, 59,
|
||||
89, 79, 59, 81, 50, 85, 59, 87, 53, 69, 77, 56, 88, 81, 45, 82,
|
||||
55, 90, 45, 83, 56, 89, 46, 82, 51, 86, 53, 79, 81, 60, 82, 77,
|
||||
76, 59, 80, 49, 96, 53, 77, 77, 65, 81, 71, 70, 81, 93, 53, 89,
|
||||
45, 86, 58, 78, 66, 76, 63, 88, 52, 93, 49, 57, 77, 68, 81, 81,
|
||||
73, 50, 85, 74, 55, 77, 83, 83, 51, 78, 84, 46, 83, 55, 81, 57,
|
||||
76, 84, 77, 81, 87, 77, 51, 78, 60, 82, 91, 53, 78, 46, 77, 84,
|
||||
49, 83, 71, 80, 49, 75, 64, 76, 53, 94, 55, 76, 50, 82, 54, 75,
|
||||
78, 79, 78, 78, 70, 79, 70, 54, 86, 50, 90, 54, 54, 77, 79, 64,
|
||||
75, 47, 86, 63, 85, 82, 57, 82, 67, 74, 54, 83, 73, 73, 88, 80,
|
||||
71, 83, 56, 79, 78, 84, 58, 83, 43, 60, 75, 81, 46, 90, 46, 74]
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,87 @@
|
||||
"""Example for gam.AdditiveModel and PolynomialSmoother
|
||||
|
||||
This example was written as a test case.
|
||||
The data generating process is chosen so the parameters are well identified
|
||||
and estimated.
|
||||
|
||||
Created on Fri Nov 04 13:45:43 2011
|
||||
|
||||
Author: Josef Perktold
|
||||
|
||||
"""
|
||||
from statsmodels.compat.python import lrange
|
||||
|
||||
import numpy as np
|
||||
|
||||
from statsmodels.sandbox.gam import AdditiveModel
|
||||
from statsmodels.regression.linear_model import OLS
|
||||
|
||||
np.random.seed(8765993)
|
||||
#seed is chosen for nice result, not randomly
|
||||
#other seeds are pretty off in the prediction
|
||||
|
||||
#DGP: simple polynomial
|
||||
order = 3
|
||||
sigma_noise = 0.5
|
||||
nobs = 1000 #1000 #with 1000, OLS and Additivemodel agree in params at 2 decimals
|
||||
lb, ub = -3.5, 4#2.5
|
||||
x1 = np.linspace(lb, ub, nobs)
|
||||
x2 = np.sin(2*x1)
|
||||
x = np.column_stack((x1/x1.max()*2, x2))
|
||||
exog = (x[:,:,None]**np.arange(order+1)[None, None, :]).reshape(nobs, -1)
|
||||
idx = lrange((order+1)*2)
|
||||
del idx[order+1]
|
||||
exog_reduced = exog[:,idx] #remove duplicate constant
|
||||
y_true = exog.sum(1) / 2.
|
||||
z = y_true #alias check
|
||||
d = x
|
||||
y = y_true + sigma_noise * np.random.randn(nobs)
|
||||
|
||||
example = 1
|
||||
|
||||
if example == 1:
|
||||
m = AdditiveModel(d)
|
||||
m.fit(y)
|
||||
|
||||
y_pred = m.results.predict(d)
|
||||
|
||||
|
||||
for ss in m.smoothers:
|
||||
print(ss.params)
|
||||
|
||||
res_ols = OLS(y, exog_reduced).fit()
|
||||
print(res_ols.params)
|
||||
|
||||
#from numpy.testing import assert_almost_equal
|
||||
#assert_almost_equal(y_pred, res_ols.fittedvalues, 3)
|
||||
|
||||
if example > 0:
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
plt.figure()
|
||||
plt.plot(exog)
|
||||
|
||||
y_pred = m.results.mu# + m.results.alpha #m.results.predict(d)
|
||||
plt.figure()
|
||||
plt.subplot(2,2,1)
|
||||
plt.plot(y, '.', alpha=0.25)
|
||||
plt.plot(y_true, 'k-', label='true')
|
||||
|
||||
plt.plot(res_ols.fittedvalues, 'g-', label='OLS', lw=2, alpha=0.7)
|
||||
plt.plot(y_pred, 'r-', label='AM')
|
||||
plt.legend(loc='upper left')
|
||||
plt.title('gam.AdditiveModel')
|
||||
|
||||
counter = 2
|
||||
for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:,0], x[:,1]]):
|
||||
sortidx = np.argsort(xx)
|
||||
#plt.figure()
|
||||
plt.subplot(2, 2, counter)
|
||||
plt.plot(xx[sortidx], y[sortidx], '.', alpha=0.25)
|
||||
plt.plot(xx[sortidx], y_true[sortidx], 'k.', label='true', lw=2)
|
||||
plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='AM')
|
||||
plt.legend(loc='upper left')
|
||||
plt.title('gam.AdditiveModel ' + ii)
|
||||
counter += 1
|
||||
|
||||
plt.show()
|
||||
@ -0,0 +1,132 @@
|
||||
"""Example for GAM with Poisson Model and PolynomialSmoother
|
||||
|
||||
This example was written as a test case.
|
||||
The data generating process is chosen so the parameters are well identified
|
||||
and estimated.
|
||||
|
||||
Created on Fri Nov 04 13:45:43 2011
|
||||
|
||||
Author: Josef Perktold
|
||||
"""
|
||||
from statsmodels.compat.python import lrange
|
||||
import time
|
||||
|
||||
import numpy as np
|
||||
|
||||
from scipy import stats
|
||||
|
||||
from statsmodels.sandbox.gam import Model as GAM
|
||||
from statsmodels.genmod.families import family
|
||||
from statsmodels.genmod.generalized_linear_model import GLM
|
||||
|
||||
np.seterr(all='raise')
|
||||
np.random.seed(8765993)
|
||||
#seed is chosen for nice result, not randomly
|
||||
#other seeds are pretty off in the prediction or end in overflow
|
||||
|
||||
#DGP: simple polynomial
|
||||
order = 3
|
||||
sigma_noise = 0.1
|
||||
nobs = 1000
|
||||
#lb, ub = -0.75, 3#1.5#0.75 #2.5
|
||||
lb, ub = -3.5, 3
|
||||
x1 = np.linspace(lb, ub, nobs)
|
||||
x2 = np.sin(2*x1)
|
||||
x = np.column_stack((x1/x1.max()*1, 1.*x2))
|
||||
exog = (x[:,:,None]**np.arange(order+1)[None, None, :]).reshape(nobs, -1)
|
||||
idx = lrange((order+1)*2)
|
||||
del idx[order+1]
|
||||
exog_reduced = exog[:,idx] #remove duplicate constant
|
||||
y_true = exog.sum(1) #/ 4.
|
||||
z = y_true #alias check
|
||||
d = x
|
||||
y = y_true + sigma_noise * np.random.randn(nobs)
|
||||
|
||||
example = 3
|
||||
|
||||
if example == 2:
|
||||
print("binomial")
|
||||
f = family.Binomial()
|
||||
mu_true = f.link.inverse(z)
|
||||
#b = np.asarray([scipy.stats.bernoulli.rvs(p) for p in f.link.inverse(y)])
|
||||
b = np.asarray([stats.bernoulli.rvs(p) for p in f.link.inverse(z)])
|
||||
b.shape = y.shape
|
||||
m = GAM(b, d, family=f)
|
||||
toc = time.time()
|
||||
m.fit(b)
|
||||
tic = time.time()
|
||||
print(tic-toc)
|
||||
#for plotting
|
||||
yp = f.link.inverse(y)
|
||||
p = b
|
||||
|
||||
|
||||
if example == 3:
|
||||
print("Poisson")
|
||||
f = family.Poisson()
|
||||
#y = y/y.max() * 3
|
||||
yp = f.link.inverse(z)
|
||||
p = np.asarray([stats.poisson.rvs(val) for val in f.link.inverse(z)],
|
||||
float)
|
||||
p.shape = y.shape
|
||||
m = GAM(p, d, family=f)
|
||||
toc = time.time()
|
||||
m.fit(p)
|
||||
tic = time.time()
|
||||
print(tic-toc)
|
||||
|
||||
for ss in m.smoothers:
|
||||
print(ss.params)
|
||||
|
||||
if example > 1:
|
||||
import matplotlib.pyplot as plt
|
||||
plt.figure()
|
||||
for i in np.array(m.history[2:15:3]):
|
||||
plt.plot(i.T)
|
||||
|
||||
plt.figure()
|
||||
plt.plot(exog)
|
||||
#plt.plot(p, '.', lw=2)
|
||||
plt.plot(y_true, lw=2)
|
||||
|
||||
y_pred = m.results.mu # + m.results.alpha #m.results.predict(d)
|
||||
plt.figure()
|
||||
plt.subplot(2,2,1)
|
||||
plt.plot(p, '.')
|
||||
plt.plot(yp, 'b-', label='true')
|
||||
plt.plot(y_pred, 'r-', label='GAM')
|
||||
plt.legend(loc='upper left')
|
||||
plt.title('gam.GAM Poisson')
|
||||
|
||||
counter = 2
|
||||
for ii, xx in zip(['z', 'x1', 'x2'], [z, x[:,0], x[:,1]]):
|
||||
sortidx = np.argsort(xx)
|
||||
#plt.figure()
|
||||
plt.subplot(2, 2, counter)
|
||||
plt.plot(xx[sortidx], p[sortidx], 'k.', alpha=0.5)
|
||||
plt.plot(xx[sortidx], yp[sortidx], 'b.', label='true')
|
||||
plt.plot(xx[sortidx], y_pred[sortidx], 'r.', label='GAM')
|
||||
plt.legend(loc='upper left')
|
||||
plt.title('gam.GAM Poisson ' + ii)
|
||||
counter += 1
|
||||
|
||||
res = GLM(p, exog_reduced, family=f).fit()
|
||||
|
||||
#plot component, compared to true component
|
||||
x1 = x[:,0]
|
||||
x2 = x[:,1]
|
||||
f1 = exog[:,:order+1].sum(1) - 1 #take out constant
|
||||
f2 = exog[:,order+1:].sum(1) - 1
|
||||
plt.figure()
|
||||
#Note: need to correct for the constant, which is arbitrarily distributed across components
|
||||
#plt.plot(x1, m.smoothers[0](x1)-m.smoothers[0].params[0]+1, 'r')
|
||||
#better would be subtract f(0) m.smoothers[0](np.array([0]))
|
||||
plt.plot(x1, f1, linewidth=2)
|
||||
plt.plot(x1, m.smoothers[0](x1)-m.smoothers[0].params[0], 'r')
|
||||
|
||||
plt.figure()
|
||||
plt.plot(x2, f2, linewidth=2)
|
||||
plt.plot(x2, m.smoothers[1](x2)-m.smoothers[1].params[0], 'r')
|
||||
|
||||
|
||||
plt.show()
|
||||
@ -0,0 +1,61 @@
|
||||
"""
|
||||
Created on Fri Nov 04 10:51:39 2011
|
||||
|
||||
@author: josef
|
||||
"""
|
||||
import numpy as np
|
||||
|
||||
from statsmodels.sandbox.nonparametric import smoothers
|
||||
from statsmodels.regression.linear_model import OLS, WLS
|
||||
|
||||
|
||||
#DGP: simple polynomial
|
||||
order = 3
|
||||
sigma_noise = 0.5
|
||||
nobs = 100
|
||||
lb, ub = -1, 2
|
||||
x = np.linspace(lb, ub, nobs)
|
||||
x = np.sin(x)
|
||||
exog = x[:,None]**np.arange(order+1)
|
||||
y_true = exog.sum(1)
|
||||
y = y_true + sigma_noise * np.random.randn(nobs)
|
||||
|
||||
|
||||
|
||||
#xind = np.argsort(x)
|
||||
pmod = smoothers.PolySmoother(2, x)
|
||||
pmod.fit(y) #no return
|
||||
y_pred = pmod.predict(x)
|
||||
error = y - y_pred
|
||||
mse = (error*error).mean()
|
||||
print(mse)
|
||||
res_ols = OLS(y, exog[:,:3]).fit()
|
||||
print(np.squeeze(pmod.coef) - res_ols.params)
|
||||
|
||||
|
||||
weights = np.ones(nobs)
|
||||
weights[:nobs//3] = 0.1
|
||||
weights[-nobs//5:] = 2
|
||||
|
||||
pmodw = smoothers.PolySmoother(2, x)
|
||||
pmodw.fit(y, weights=weights) #no return
|
||||
y_predw = pmodw.predict(x)
|
||||
error = y - y_predw
|
||||
mse = (error*error).mean()
|
||||
print(mse)
|
||||
res_wls = WLS(y, exog[:,:3], weights=weights).fit()
|
||||
print(np.squeeze(pmodw.coef) - res_wls.params)
|
||||
|
||||
|
||||
|
||||
doplot = 1
|
||||
if doplot:
|
||||
import matplotlib.pyplot as plt
|
||||
plt.plot(y, '.')
|
||||
plt.plot(y_true, 'b-', label='true')
|
||||
plt.plot(y_pred, '-', label='poly')
|
||||
plt.plot(y_predw, '-', label='poly -w')
|
||||
plt.legend(loc='upper left')
|
||||
|
||||
plt.close()
|
||||
#plt.show()
|
||||
@ -0,0 +1,78 @@
|
||||
import numpy as np
|
||||
import numpy.testing as npt
|
||||
|
||||
from statsmodels.sandbox.nonparametric.kernel_extras import SemiLinear
|
||||
|
||||
|
||||
class KernelExtrasTestBase:
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
nobs = 60
|
||||
np.random.seed(123456)
|
||||
cls.o = np.random.binomial(2, 0.7, size=(nobs, 1))
|
||||
cls.o2 = np.random.binomial(3, 0.7, size=(nobs, 1))
|
||||
cls.c1 = np.random.normal(size=(nobs, 1))
|
||||
cls.c2 = np.random.normal(10, 1, size=(nobs, 1))
|
||||
cls.c3 = np.random.normal(10, 2, size=(nobs, 1))
|
||||
cls.noise = np.random.normal(size=(nobs, 1))
|
||||
b0 = 0.3
|
||||
b1 = 1.2
|
||||
b2 = 3.7 # regression coefficients
|
||||
cls.y = b0 + b1 * cls.c1 + b2 * cls.c2 + cls.noise
|
||||
cls.y2 = b0 + b1 * cls.c1 + b2 * cls.c2 + cls.o + cls.noise
|
||||
# Italy data from R's np package (the first 50 obs) R>> data (Italy)
|
||||
|
||||
cls.Italy_gdp = \
|
||||
[8.556, 12.262, 9.587, 8.119, 5.537, 6.796, 8.638,
|
||||
6.483, 6.212, 5.111, 6.001, 7.027, 4.616, 3.922,
|
||||
4.688, 3.957, 3.159, 3.763, 3.829, 5.242, 6.275,
|
||||
8.518, 11.542, 9.348, 8.02, 5.527, 6.865, 8.666,
|
||||
6.672, 6.289, 5.286, 6.271, 7.94, 4.72, 4.357,
|
||||
4.672, 3.883, 3.065, 3.489, 3.635, 5.443, 6.302,
|
||||
9.054, 12.485, 9.896, 8.33, 6.161, 7.055, 8.717,
|
||||
6.95]
|
||||
|
||||
cls.Italy_year = \
|
||||
[1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951,
|
||||
1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1951, 1952,
|
||||
1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952,
|
||||
1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1952, 1953, 1953,
|
||||
1953, 1953, 1953, 1953, 1953, 1953]
|
||||
|
||||
# OECD panel data from NP R>> data(oecdpanel)
|
||||
cls.growth = \
|
||||
[-0.0017584, 0.00740688, 0.03424461, 0.03848719, 0.02932506,
|
||||
0.03769199, 0.0466038, 0.00199456, 0.03679607, 0.01917304,
|
||||
-0.00221, 0.00787269, 0.03441118, -0.0109228, 0.02043064,
|
||||
-0.0307962, 0.02008947, 0.00580313, 0.00344502, 0.04706358,
|
||||
0.03585851, 0.01464953, 0.04525762, 0.04109222, -0.0087903,
|
||||
0.04087915, 0.04551403, 0.036916, 0.00369293, 0.0718669,
|
||||
0.02577732, -0.0130759, -0.01656641, 0.00676429, 0.08833017,
|
||||
0.05092105, 0.02005877, 0.00183858, 0.03903173, 0.05832116,
|
||||
0.0494571, 0.02078484, 0.09213897, 0.0070534, 0.08677202,
|
||||
0.06830603, -0.00041, 0.0002856, 0.03421225, -0.0036825]
|
||||
|
||||
cls.oecd = \
|
||||
[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
|
||||
0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0,
|
||||
0, 0, 0, 0]
|
||||
|
||||
|
||||
class TestSemiLinear(KernelExtrasTestBase):
|
||||
|
||||
def test_basic(self):
|
||||
nobs = 300
|
||||
np.random.seed(1234)
|
||||
C1 = np.random.normal(0,2, size=(nobs, ))
|
||||
C2 = np.random.normal(2, 1, size=(nobs, ))
|
||||
e = np.random.normal(size=(nobs, ))
|
||||
b1 = 1.3
|
||||
b2 = -0.7
|
||||
Y = b1 * C1 + np.exp(b2 * C2) + e
|
||||
model = SemiLinear(endog=[Y], exog=[C1], exog_nonparametric=[C2],
|
||||
var_type='c', k_linear=1)
|
||||
b_hat = np.squeeze(model.b)
|
||||
# Only tests for the linear part of the regression
|
||||
# Currently does not work well with the nonparametric part
|
||||
# Needs some more work
|
||||
npt.assert_allclose(b1, b_hat, rtol=0.1)
|
||||
@ -0,0 +1,110 @@
|
||||
"""
|
||||
Created on Fri Nov 04 10:51:39 2011
|
||||
|
||||
Author: Josef Perktold
|
||||
License: BSD-3
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_almost_equal, assert_equal
|
||||
|
||||
from statsmodels.sandbox.nonparametric import smoothers
|
||||
from statsmodels.regression.linear_model import OLS, WLS
|
||||
|
||||
|
||||
class CheckSmoother:
|
||||
|
||||
def test_predict(self):
|
||||
assert_almost_equal(self.res_ps.predict(self.x),
|
||||
self.res2.fittedvalues, decimal=13)
|
||||
assert_almost_equal(self.res_ps.predict(self.x[:10]),
|
||||
self.res2.fittedvalues[:10], decimal=13)
|
||||
|
||||
def test_coef(self):
|
||||
#TODO: check dim of coef
|
||||
assert_almost_equal(self.res_ps.coef.ravel(),
|
||||
self.res2.params, decimal=14)
|
||||
|
||||
def test_df(self):
|
||||
#TODO: make into attributes
|
||||
assert_equal(self.res_ps.df_model(), self.res2.df_model+1) #with const
|
||||
assert_equal(self.res_ps.df_fit(), self.res2.df_model+1) #alias
|
||||
assert_equal(self.res_ps.df_resid(), self.res2.df_resid)
|
||||
|
||||
|
||||
|
||||
class BasePolySmoother:
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
#DGP: simple polynomial
|
||||
order = 3
|
||||
sigma_noise = 0.5
|
||||
nobs = 100
|
||||
lb, ub = -1, 2
|
||||
cls.x = x = np.linspace(lb, ub, nobs)
|
||||
cls.exog = exog = x[:,None]**np.arange(order+1)
|
||||
y_true = exog.sum(1)
|
||||
np.random.seed(987567)
|
||||
cls.y = y = y_true + sigma_noise * np.random.randn(nobs)
|
||||
|
||||
|
||||
class TestPolySmoother1(BasePolySmoother, CheckSmoother):
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super().setup_class() #initialize DGP
|
||||
|
||||
y, x, exog = cls.y, cls.x, cls.exog
|
||||
|
||||
#use order = 2 in regression
|
||||
pmod = smoothers.PolySmoother(2, x)
|
||||
pmod.fit(y) #no return
|
||||
|
||||
cls.res_ps = pmod
|
||||
cls.res2 = OLS(y, exog[:,:2+1]).fit()
|
||||
|
||||
class TestPolySmoother2(BasePolySmoother, CheckSmoother):
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super().setup_class() #initialize DGP
|
||||
|
||||
y, x, exog = cls.y, cls.x, cls.exog
|
||||
|
||||
#use order = 3 in regression
|
||||
pmod = smoothers.PolySmoother(3, x)
|
||||
#pmod.fit(y) #no return
|
||||
pmod.smooth(y) #no return, use alias for fit
|
||||
|
||||
cls.res_ps = pmod
|
||||
cls.res2 = OLS(y, exog[:,:3+1]).fit()
|
||||
|
||||
class TestPolySmoother3(BasePolySmoother, CheckSmoother):
|
||||
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
super().setup_class() #initialize DGP
|
||||
|
||||
y, x, exog = cls.y, cls.x, cls.exog
|
||||
nobs = y.shape[0]
|
||||
weights = np.ones(nobs)
|
||||
weights[:nobs//3] = 0.1
|
||||
weights[-nobs//5:] = 2
|
||||
|
||||
#use order = 2 in regression
|
||||
pmod = smoothers.PolySmoother(2, x)
|
||||
pmod.fit(y, weights=weights) #no return
|
||||
|
||||
cls.res_ps = pmod
|
||||
cls.res2 = WLS(y, exog[:,:2+1], weights=weights).fit()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
t1 = TestPolySmoother1()
|
||||
t1.test_predict()
|
||||
t1.test_coef()
|
||||
t1.test_df()
|
||||
|
||||
t3 = TestPolySmoother3()
|
||||
t3.test_predict()
|
||||