reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,328 @@
|
||||
'''
|
||||
from David Huard's scipy sandbox, also attached to a ticket and
|
||||
in the matplotlib-user mailing list (links ???)
|
||||
|
||||
|
||||
Notes
|
||||
=====
|
||||
|
||||
out of bounds interpolation raises exception and would not be completely
|
||||
defined ::
|
||||
|
||||
>>> scoreatpercentile(x, [0,25,50,100])
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
raise ValueError("A value in x_new is below the interpolation "
|
||||
ValueError: A value in x_new is below the interpolation range.
|
||||
>>> percentileofscore(x, [-50, 50])
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
raise ValueError("A value in x_new is below the interpolation "
|
||||
ValueError: A value in x_new is below the interpolation range.
|
||||
|
||||
|
||||
idea
|
||||
====
|
||||
|
||||
histogram and empirical interpolated distribution
|
||||
-------------------------------------------------
|
||||
|
||||
dual constructor
|
||||
* empirical cdf : cdf on all observations through linear interpolation
|
||||
* binned cdf : based on histogram
|
||||
both should work essentially the same, although pdf of empirical has
|
||||
many spikes, fluctuates a lot
|
||||
- alternative: binning based on interpolated cdf : example in script
|
||||
* ppf: quantileatscore based on interpolated cdf
|
||||
* rvs : generic from ppf
|
||||
* stats, expectation ? how does integration wrt cdf work - theory?
|
||||
|
||||
Problems
|
||||
* limits, lower and upper bound of support
|
||||
does not work or is undefined with empirical cdf and interpolation
|
||||
* extending bounds ?
|
||||
matlab has pareto tails for empirical distribution, breaks linearity
|
||||
|
||||
empirical distribution with higher order interpolation
|
||||
------------------------------------------------------
|
||||
|
||||
* should work easily enough with interpolating splines
|
||||
* not piecewise linear
|
||||
* can use pareto (or other) tails
|
||||
* ppf how do I get the inverse function of a higher order spline?
|
||||
Chuck: resample and fit spline to inverse function
|
||||
this will have an approximation error in the inverse function
|
||||
* -> does not work: higher order spline does not preserve monotonicity
|
||||
see mailing list for response to my question
|
||||
* pmf from derivative available in spline
|
||||
|
||||
-> forget this and use kernel density estimator instead
|
||||
|
||||
|
||||
bootstrap/empirical distribution:
|
||||
---------------------------------
|
||||
|
||||
discrete distribution on real line given observations
|
||||
what's defined?
|
||||
* cdf : step function
|
||||
* pmf : points with equal weight 1/nobs
|
||||
* rvs : resampling
|
||||
* ppf : quantileatscore on sample?
|
||||
* moments : from data ?
|
||||
* expectation ? sum_{all observations x} [func(x) * pmf(x)]
|
||||
* similar for discrete distribution on real line
|
||||
* References : ?
|
||||
* what's the point? most of it is trivial, just for the record ?
|
||||
|
||||
|
||||
Created on Monday, May 03, 2010, 11:47:03 AM
|
||||
Author: josef-pktd, parts based on David Huard
|
||||
License: BSD
|
||||
|
||||
'''
|
||||
import scipy.interpolate as interpolate
|
||||
import numpy as np
|
||||
|
||||
def scoreatpercentile(data, percentile):
    """Interpolate the score (sample value) found at a given percentile.

    The sorted empirical cdf of `data` is paired with the sorted data and
    joined by a linear interpolator mapping cdf values to scores. That
    interpolator is evaluated at ``percentile / 100``.

    Parameters
    ----------
    data : array_like
        Sample of observations.
    percentile : scalar or array_like
        Percentile(s) on the 0-100 scale.

    Returns
    -------
    ndarray
        Interpolated score(s), one per requested percentile.

    Notes
    -----
    The interpolator is built with the default bounds handling of
    ``interp1d``, so percentiles outside the range covered by the
    empirical cdf raise ``ValueError``.

    Example
    -------
    >>> data = np.random.randn(100)
    >>> scoreatpercentile(data, 50)

    will return the median of sample `data`.
    """
    requested = np.array(percentile)
    cdf_sorted = np.sort(empiricalcdf(data))
    scores_sorted = np.sort(data)
    # cdf value -> score, i.e. the interpolated quantile function
    to_score = interpolate.interp1d(cdf_sorted, scores_sorted)
    return to_score(requested/100.)
|
||||
|
||||
def percentileofscore(data, score):
    """Interpolate the percentile rank (0-100) of `score` within `data`.

    The sorted data is paired with the sorted empirical cdf and joined by
    a linear interpolator mapping scores to cdf values. The interpolated
    cdf value is scaled by 100.

    Parameters
    ----------
    data : array_like
        Sample of observations.
    score : scalar or array_like
        Score(s) at which the percentile is computed.

    Returns
    -------
    ndarray
        Percentile(s) in the range 0-100.

    Raises
    ------
    ValueError
        If a score lies outside the range of `data` (raised by the
        interpolator).

    Example
    -------
    r = np.random.randn(50)
    x = np.linspace(-2, 2, 100)
    percentileofscore(r, x)
    """
    scores_sorted = np.sort(data)
    cdf_sorted = np.sort(empiricalcdf(data))
    # score -> cdf value, i.e. the interpolated cdf
    to_cdf = interpolate.interp1d(scores_sorted, cdf_sorted)
    return to_cdf(score)*100.
|
||||
|
||||
def empiricalcdf(data, method='Hazen'):
    """Return the empirical cdf of `data`, in the original order of `data`.

    Every observation gets a rank ``i`` (1 for the smallest, N for the
    largest) which is turned into a cdf value by a plotting-position
    formula of the form ``(i - a) / (N + b)``.

    Methods available:
        Hazen:       (i-0.5)/N
        Weibull:     i/(N+1)
        Chegodayev:  (i-.3)/(N+.4)
        Cunnane:     (i-.4)/(N+.2)
        Gringorten:  (i-.44)/(N+.12)
        California:  (i-1)/N

    Where i goes from 1 to N.

    Parameters
    ----------
    data : array_like
        One-dimensional sample of observations.
    method : str
        Name of the plotting-position formula; matched case-insensitively.

    Returns
    -------
    ndarray
        cdf value for each element of `data` (same ordering as `data`).

    Raises
    ------
    ValueError
        If `method` is not one of the names listed above.
    """
    # constants (a, b) of cdf = (i - a) / (N + b), keyed by lower-case name
    plotting_positions = {
        'hazen': (0.5, 0.),
        'weibull': (0., 1.),
        'california': (1., 0.),
        'chegodayev': (.3, .4),
        'cunnane': (.4, .2),
        'gringorten': (.44, .12),
    }

    constants = plotting_positions.get(method.lower())
    if constants is None:
        raise ValueError('Unknown method %r. Choose among Weibull, Hazen, '
                         'Chegodayev, Cunnane, Gringorten and California.'
                         % (method,))
    a, b = constants

    # argsort of argsort gives the 0-based rank of every observation;
    # +1. makes it 1-based and float. Tied values get distinct ranks.
    i = np.argsort(np.argsort(data)) + 1.
    N = len(data)
    return (i - a)/(N + b)
|
||||
|
||||
|
||||
class HistDist:
    '''Distribution with piecewise linear cdf, pdf is step function

    can be created from empirical distribution or from a histogram
    (not done yet)

    work in progress, not finished

    Parameters
    ----------
    data : array_like
        One-dimensional sample of observations.

    Attributes set by the constructor
    ---------------------------------
    data : ndarray, the observations
    binlimit : ndarray, ``[min(data), max(data)]``
    ranking : ndarray, 0-based rank of each observation within `data`
    cdfintp : linear interpolator, score -> empirical cdf
    ppfintp : linear interpolator, empirical cdf -> score
    '''

    # constants (a, b) of cdf = (i - a) / (N + b), keyed by lower-case name
    _PLOTTING_POSITIONS = {
        'hazen': (0.5, 0.),
        'weibull': (0., 1.),
        'california': (1., 0.),
        'chegodayev': (.3, .4),
        'cunnane': (.4, .2),
        'gringorten': (.44, .12),
    }

    def __init__(self, data):
        self.data = np.atleast_1d(data)
        self.binlimit = np.array([self.data.min(), self.data.max()])
        # index with the array version of the data so that plain sequences
        # (lists, tuples) are accepted as well
        sortind = np.argsort(self.data)
        self._datasorted = self.data[sortind]
        # 0-based rank of every observation; empiricalcdf adds 1
        self.ranking = np.argsort(sortind)

        cdf = self.empiricalcdf()
        self._empcdfsorted = np.sort(cdf)
        self.cdfintp = interpolate.interp1d(self._datasorted,
                                            self._empcdfsorted)
        self.ppfintp = interpolate.interp1d(self._empcdfsorted,
                                            self._datasorted)

    def empiricalcdf(self, data=None, method='Hazen'):
        """Return the empirical cdf.

        Methods available:
            Hazen:       (i-0.5)/N
            Weibull:     i/(N+1)
            Chegodayev:  (i-.3)/(N+.4)
            Cunnane:     (i-.4)/(N+.2)
            Gringorten:  (i-.44)/(N+.12)
            California:  (i-1)/N

        Where i goes from 1 to N.

        Parameters
        ----------
        data : array_like or None
            Sample to rank. If None, the data stored on the instance (and
            its precomputed ranking) is used.
        method : str
            Name of the plotting-position formula; case-insensitive.

        Returns
        -------
        ndarray
            cdf value for each observation, in the order of the data.

        Raises
        ------
        ValueError
            If `method` is not one of the names listed above.
        """
        constants = self._PLOTTING_POSITIONS.get(method.lower())
        if constants is None:
            raise ValueError('Unknown method %r. Choose among Weibull, '
                             'Hazen, Chegodayev, Cunnane, Gringorten and '
                             'California.' % (method,))
        a, b = constants

        if data is None:
            data = self.data
            # self.ranking is 0-based, the formulas need ranks 1..N;
            # without the shift the cdf of the minimum can be negative
            i = self.ranking + 1.
        else:
            i = np.argsort(np.argsort(data)) + 1.

        N = len(data)
        return (i - a)/(N + b)

    def cdf_emp(self, score):
        '''Linearly interpolated empirical cdf evaluated at `score`.

        Counterpart of ``percentileofscore`` in David Huard's code, but on
        the cdf scale of the stored interpolator instead of 0-100.

        Scores outside ``binlimit`` are rejected by the interpolator.
        '''
        return self.cdfintp(score)

    def ppf_emp(self, quantile):
        '''Linearly interpolated empirical quantile function at `quantile`.

        Counterpart of ``scoreatpercentile`` in David Huard's code, with
        `quantile` given on the cdf scale instead of 0-100.

        Quantiles outside the range of the empirical cdf are rejected by
        the interpolator.
        '''
        return self.ppfintp(quantile)

    #from DHuard http://old.nabble.com/matplotlib-f2903.html
    def optimize_binning(self, method='Freedman'):
        """Find the optimal number of bins and update the bin count
        accordingly.

        Available methods : Freedman
                            Scott

        Parameters
        ----------
        method : str
            Rule used for the bin width, matched case-insensitively.
            'Freedman' uses ``2 * IQR * nobs**(-1/3)``, 'Scott' uses
            ``3.49 * std * nobs**(-1/3)``.

        Returns
        -------
        float
            Data range divided by the bin width. The value is not rounded;
            it is also stored as ``self.nbin``.

        Raises
        ------
        ValueError
            If `method` is neither 'Freedman' nor 'Scott'.
        """
        nobs = len(self.data)
        rule = method.lower()
        if rule == 'freedman':
            # interquartile range (75% - 25%) from the interpolated ppf
            IQR = self.ppf_emp(0.75) - self.ppf_emp(0.25)
            width = 2 * IQR * nobs**(-1./3)
        elif rule == 'scott':
            width = 3.49 * np.std(self.data) * nobs**(-1./3)
        else:
            raise ValueError('Unknown method %r. Choose among Freedman '
                             'and Scott.' % (method,))

        self.nbin = (np.ptp(self.binlimit)/width)
        return self.nbin
|
||||
|
||||
|
||||
#changes: josef-pktd
|
||||
if __name__ == '__main__':
    # Demo script: plots several views of the empirical cdf/pdf of a random
    # normal sample and compares spline approximations of the ppf.
    import matplotlib.pyplot as plt

    nobs = 100
    x = np.random.randn(nobs)

    # NOTE(review): the extent of the ``if 1 in examples`` body was
    # reconstructed from the data dependencies of the statements (the
    # second part redefines ``xsupp`` and ``emp``) -- confirm against the
    # original file.
    examples = [2]
    if 1 in examples:
        empiricalcdf(x)
        print(percentileofscore(x, 0.5))
        print(scoreatpercentile(x, 50))
        xsupp = np.linspace(x.min(), x.max())
        pos = percentileofscore(x, xsupp)
        plt.plot(xsupp, pos)
        #perc = np.linspace(2.5, 97.5)
        #plt.plot(scoreatpercentile(x, perc), perc)
        plt.plot(scoreatpercentile(x, pos), pos+1)

        #emp = interpolate.PiecewisePolynomial(np.sort(empiricalcdf(x)), np.sort(x))
        emp = interpolate.InterpolatedUnivariateSpline(
            np.sort(x), np.sort(empiricalcdf(x)), k=1)
        # the first derivative of the piecewise linear cdf is the pdf
        pdfemp = np.array([emp.derivatives(xi)[1] for xi in xsupp])
        plt.figure()
        plt.plot(xsupp, pdfemp)
        cdf_ongrid = emp(xsupp)
        plt.figure()
        plt.plot(xsupp, cdf_ongrid)

        #get pdf from interpolated cdf on a regular grid
        plt.figure()
        plt.step(xsupp[:-1], np.diff(cdf_ongrid)/np.diff(xsupp))

        #reduce number of bins/steps
        xsupp2 = np.linspace(x.min(), x.max(), 25)
        plt.figure()
        plt.step(xsupp2[:-1], np.diff(emp(xsupp2))/np.diff(xsupp2))

        #pdf using 25 original observations, every (nobs/25)th
        xso = np.sort(x)
        # slice steps must be integers, so use floor division
        xs = xso[::nobs // 25]
        plt.figure()
        plt.step(xs[:-1], np.diff(emp(xs))/np.diff(xs))
        #lower end looks strange

    histd = HistDist(x)
    print(histd.optimize_binning())
    print(histd.cdf_emp(histd.binlimit))
    print(histd.ppf_emp([0.25, 0.5, 0.75]))
    print(histd.cdf_emp([-0.5, -0.25, 0, 0.25, 0.5]))

    xsupp = np.linspace(x.min(), x.max(), 500)
    emp = interpolate.InterpolatedUnivariateSpline(
        np.sort(x), np.sort(empiricalcdf(x)), k=1)
    #pdfemp = np.array([emp.derivatives(xi)[1] for xi in xsupp])
    #plt.figure()
    #plt.plot(xsupp,pdfemp)
    cdf_ongrid = emp(xsupp)
    plt.figure()
    plt.plot(xsupp, cdf_ongrid)
    # cubic interpolating spline through the inverted (cdf, x) grid
    ppfintp = interpolate.InterpolatedUnivariateSpline(cdf_ongrid, xsupp, k=3)

    ppfs = ppfintp(cdf_ongrid)
    plt.plot(ppfs, cdf_ongrid)
    #ppfemp=interpolate.InterpolatedUnivariateSpline(np.sort(empiricalcdf(x)),np.sort(x),k=3)
    #Do not use interpolating splines for function approximation
    #with s=0.03 the spline is monotonic at the evaluated values
    ppfemp = interpolate.UnivariateSpline(
        np.sort(empiricalcdf(x)), np.sort(x), k=3, s=0.03)
    ppfe = ppfemp(cdf_ongrid)
    plt.plot(ppfe, cdf_ongrid)

    # a negative minimum difference means the curve is not monotonic,
    # i.e. the implied density is negative somewhere
    print('negative density')
    print('(np.diff(ppfs)).min()', (np.diff(ppfs)).min())
    print('(np.diff(cdf_ongrid)).min()', (np.diff(cdf_ongrid)).min())
    #plt.show()
|
||||
Reference in New Issue
Block a user