reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,310 @@
|
||||
"""
|
||||
Implements Lilliefors corrected Kolmogorov-Smirnov tests for normal and
|
||||
exponential distributions.
|
||||
|
||||
`kstest_fit` is provided as a top-level function to access both tests.
|
||||
`kstest_normal` and `kstest_exponential` are provided as convenience functions
|
||||
with the appropriate test as the default.
|
||||
`lilliefors` is provided as an alias for `kstest_fit`.
|
||||
|
||||
Created on Sat Oct 01 13:16:49 2011
|
||||
|
||||
Author: Josef Perktold
|
||||
License: BSD-3
|
||||
|
||||
pvalues for Lilliefors test are based on formula and table in
|
||||
|
||||
An Analytic Approximation to the Distribution of Lilliefors's Test Statistic
|
||||
for Normality
|
||||
Author(s): Gerard E. Dallal and Leland Wilkinson. Source: The American
|
||||
Statistician, Vol. 40, No. 4 (Nov., 1986), pp. 294-296
|
||||
Published by: American Statistical Association
|
||||
Stable URL: http://www.jstor.org/stable/2684607 .
|
||||
|
||||
On the Kolmogorov-Smirnov Test for Normality with Mean and Variance Unknown
|
||||
Hubert W. Lilliefors
|
||||
Journal of the American Statistical Association, Vol. 62, No. 318.
|
||||
(Jun., 1967), pp. 399-402.
|
||||
|
||||
---
|
||||
|
||||
Updated 2017-07-23
|
||||
Jacob C. Kimmel
|
||||
|
||||
Ref:
|
||||
Lilliefors, H.W.
|
||||
On the Kolmogorov-Smirnov test for the exponential distribution with mean
|
||||
unknown. Journal of the American Statistical Association, Vol 64, No. 325.
|
||||
(1969), pp. 387–389.
|
||||
"""
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
from statsmodels.tools.validation import string_like
|
||||
from ._lilliefors_critical_values import (critical_values,
|
||||
asymp_critical_values,
|
||||
PERCENTILES)
|
||||
from .tabledist import TableDist
|
||||
|
||||
|
||||
def _make_asymptotic_function(params):
    """
    Build a callable that evaluates asymptotic values from a param matrix

    For each row ``a`` of ``params`` the returned callable evaluates
    exp(a[0] + a[1] * log(n) + a[2] * log(n)**2)

    Parameters
    ----------
    params : ndarray
        Array with shape (nalpha, 3) where nalpha is the number of
        significance levels

    Returns
    -------
    callable
        Function of a sample size ``n`` that returns one value per row of
        ``params``.
    """

    def asymptotic_cv(n):
        # Quadratic in log(n), evaluated for all rows at once and mapped
        # back from the log scale.
        log_n = np.log(n)
        basis = np.array([1, log_n, log_n ** 2])
        return np.exp(np.dot(basis, params.T))

    return asymptotic_cv
|
||||
|
||||
|
||||
def ksstat(x, cdf, alternative='two_sided', args=()):
    """
    Compute the Kolmogorov-Smirnov goodness-of-fit statistic

    The empirical distribution G(x) of the observations is compared with a
    hypothesized distribution F(x). Under the null hypothesis both agree,
    G(x)=F(x). Only the statistic is computed here; no p-value is returned,
    so the result can serve as a distance measure or as input to case
    specific p-value code. The KS test is only valid for continuous
    distributions.

    Parameters
    ----------
    x : array_like, 1d
        array of observations
    cdf : str, callable or object with a ``cdf`` attribute
        string: name of a distribution in scipy.stats
        callable: function to evaluate cdf
        object: its ``cdf`` attribute is used
    alternative : 'two_sided' (default), 'less' or 'greater'
        selects which statistic is returned. Any value other than 'less'
        or 'greater' gives the two-sided statistic.
    args : tuple, sequence
        distribution parameters for call to cdf

    Returns
    -------
    D : float
        KS test statistic: D+ for 'greater', D- for 'less', otherwise
        max(D+, D-)

    See Also
    --------
    scipy.stats.kstest

    Notes
    -----
    D+ is the largest amount by which the empirical step function (taken
    just after each observation) exceeds F, and D- is the largest amount by
    which F exceeds the empirical step function (taken just before each
    observation).
    """
    n_obs = float(len(x))

    # Resolve the hypothesized cdf: distribution name, distribution object,
    # or a plain callable that is used as given.
    if isinstance(cdf, str):
        cdf = getattr(stats.distributions, cdf).cdf
    elif hasattr(cdf, 'cdf'):
        cdf = cdf.cdf

    ordered = np.sort(x)
    theoretical = cdf(ordered, *args)

    # Empirical cdf heights just after (i/n) and just before ((i-1)/n) each
    # ordered observation.
    ecdf_after = np.arange(1.0, n_obs + 1) / n_obs
    ecdf_before = np.arange(0.0, n_obs) / n_obs

    d_plus = (ecdf_after - theoretical).max()
    d_minus = (theoretical - ecdf_before).max()

    if alternative == 'greater':
        return d_plus
    if alternative == 'less':
        return d_minus
    return np.max([d_plus, d_minus])
|
||||
|
||||
|
||||
def get_lilliefors_table(dist='norm'):
    """
    Build the table of Lilliefors critical values for one distribution

    The finite-sample critical values and the asymptotic parameters are
    read from the imported ``critical_values`` and ``asymp_critical_values``
    mappings and wrapped in a ``TableDist``.

    Parameters
    ----------
    dist : str
        distribution being tested in set {'norm', 'exp'}.

    Returns
    -------
    lf : TableDist object.
        table of critical values

    Raises
    ------
    ValueError
        If ``dist`` (after mapping 'norm' to 'normal') is not a key of
        ``critical_values``.
    """
    # The stored tables use the key 'normal' for the normal distribution.
    table_key = 'normal' if dist == 'norm' else dist
    if table_key not in critical_values:
        raise ValueError("Invalid dist parameter. Must be 'norm' or 'exp'")

    # for this test alpha is sf probability, i.e. right tail probability;
    # the order is reversed, and the critical-value columns and asymptotic
    # parameter rows below are reversed to match.
    alpha = (1 - np.array(PERCENTILES) / 100.0)[::-1]

    finite_sample = critical_values[table_key]
    asymptotic = asymp_critical_values[table_key]

    sample_sizes = sorted(finite_sample)
    size = np.array(sample_sizes, dtype=float)
    crit_lf = np.array([finite_sample[k] for k in sample_sizes])[:, ::-1]

    asym_params = np.array([asymptotic[k] for k in sorted(asymptotic)])
    asymp_fn = _make_asymptotic_function(asym_params[::-1])

    return TableDist(alpha, size, crit_lf, asymptotic=asymp_fn)
|
||||
|
||||
|
||||
# Table for the normal case, built at module level; used by kstest_fit
# when dist == 'norm'.
lilliefors_table_norm = get_lilliefors_table(dist='norm')
|
||||
# Table for the exponential case, built at module level; used by kstest_fit
# when dist == 'exp'.
lilliefors_table_expon = get_lilliefors_table(dist='exp')
|
||||
|
||||
|
||||
def pval_lf(d_max, n):
    """
    Approximate pvalues for Lilliefors test

    This is only valid for pvalues smaller than 0.1 which is not checked in
    this function.

    Parameters
    ----------
    d_max : array_like
        two-sided Kolmogorov-Smirnov test statistic. The input is never
        modified; lists and tuples are converted to float arrays.
    n : int or float
        sample size

    Returns
    -------
    p-value : float or ndarray
        pvalue according to approximation formula of Dallal and Wilkinson.

    Notes
    -----
    This is mainly a helper function where the calling code should dispatch
    on bound violations. Therefore it does not check whether the pvalue is in
    the valid range.

    For n > 100 the statistic is rescaled by (n / 100) ** 0.49 and the
    formula is evaluated at n = 100.

    Precision for the pvalues is around 2 to 3 decimals. This approximation is
    also used by other statistical packages (e.g. R:fBasics) but might not be
    the most precise available.

    References
    ----------
    DallalWilkinson1986
    """
    # todo: check boundaries, valid range for n and Dmax

    # Plain sequences do not support the float arithmetic below.
    if isinstance(d_max, (list, tuple)):
        d_max = np.asarray(d_max, dtype=float)

    if n > 100:
        # Out-of-place multiply: an in-place `*=` would overwrite the
        # caller's array and fails outright for integer arrays.
        d_max = d_max * (n / 100.) ** 0.49
        n = 100

    pval = np.exp(-7.01256 * d_max ** 2 * (n + 2.78019)
                  + 2.99587 * d_max * np.sqrt(n + 2.78019) - 0.122119
                  + 0.974598 / np.sqrt(n) + 1.67997 / n)
    return pval
|
||||
|
||||
|
||||
def kstest_fit(x, dist='norm', pvalmethod="table"):
    """
    Test assumed normal or exponential distribution using Lilliefors' test.

    Lilliefors' test is a Kolmogorov-Smirnov test with estimated parameters.

    Parameters
    ----------
    x : array_like, 1d
        Data to test. A two-dimensional input with a single column is also
        accepted and flattened to that column.
    dist : {'norm', 'exp'}, optional
        The assumed distribution.
    pvalmethod : {'approx', 'table'}, optional
        The method used to compute the p-value of the test statistic. In
        general, 'table' is preferred and makes use of a very large simulation.
        'approx' is only valid for normality. If `dist = 'exp'`, 'table' is
        always used. 'approx' uses the approximation formula of Dallal and
        Wilkinson, valid for pvalues < 0.1. If the pvalue is larger than 0.1,
        then the result of 'table' is returned.

    Returns
    -------
    ksstat : float
        Kolmogorov-Smirnov test statistic with estimated parameters.
    pvalue : float
        If the pvalue is lower than some threshold, e.g. 0.05, then we can
        reject the Null hypothesis that the sample comes from the assumed
        distribution.

    Raises
    ------
    ValueError
        If `x` is neither one-dimensional nor a single column, if `dist` is
        not 'norm' or 'exp', or if there are fewer than 4 ('norm') or
        3 ('exp') observations.

    Notes
    -----
    'table' uses an improved table based on 10,000,000 simulations. The
    critical values are approximated using
    log(cv_alpha) = b_alpha + c[0] log(n) + c[1] log(n)**2
    where cv_alpha is the critical value for a test with size alpha,
    b_alpha is an alpha-specific intercept term and c[0] and c[1] are
    coefficients that are shared by all alphas.
    Values in the table are linearly interpolated. Values outside the
    range are returned as bounds, 0.990 for large and 0.001 for small
    pvalues.

    For implementation details, see lilliefors_critical_value_simulation.py in
    the test directory.
    """
    pvalmethod = string_like(pvalmethod,
                             "pvalmethod",
                             options=("approx", "table"))
    x = np.asarray(x)
    if x.ndim == 2 and x.shape[1] == 1:
        x = x[:, 0]
    elif x.ndim != 1:
        raise ValueError("Invalid parameter `x`: must be a one-dimensional"
                         " array-like or a single-column DataFrame")

    # Validate `dist` first so an invalid distribution is still reported
    # ahead of a too-small sample.
    if dist == 'norm':
        min_nobs = 4
    elif dist == 'exp':
        min_nobs = 3
    else:
        raise ValueError("Invalid dist parameter, must be 'norm' or 'exp'")

    # Check the sample size before standardizing: taking mean/std of an
    # empty or single-element array would emit numpy RuntimeWarnings ahead
    # of the intended ValueError.
    nobs = len(x)
    if nobs < min_nobs:
        raise ValueError('Test for distribution {} requires at least {} '
                         'observations'.format(dist, min_nobs))

    if dist == 'norm':
        # Standardize with the sample mean and the ddof=1 standard deviation.
        z = (x - x.mean()) / x.std(ddof=1)
        test_d = stats.norm.cdf
        lilliefors_table = lilliefors_table_norm
    else:
        # Scale by the sample mean; the approximation formula is only for
        # the normal case, so the table is always used here.
        z = x / x.mean()
        test_d = stats.expon.cdf
        lilliefors_table = lilliefors_table_expon
        pvalmethod = 'table'

    d_ks = ksstat(z, test_d, alternative='two_sided')

    if pvalmethod == 'approx':
        pval = pval_lf(d_ks, nobs)
        # The approximation is only used for pvalues up to 0.1; otherwise
        # fall back to the table.
        if pval > 0.1:
            pval = lilliefors_table.prob(d_ks, nobs)
    else:  # pvalmethod == 'table'
        pval = lilliefors_table.prob(d_ks, nobs)

    return d_ks, pval
|
||||
|
||||
|
||||
# `lilliefors` is an alias for `kstest_fit`.
lilliefors = kstest_fit
|
||||
# Same callable as `kstest_fit`, whose default `dist` is 'norm'.
kstest_normal = kstest_fit
|
||||
# `kstest_fit` with `dist` pre-set to 'exp' through functools.partial.
kstest_exponential = partial(kstest_fit, dist='exp')
|
||||
Reference in New Issue
Block a user