reconnect moved files to git repo

This commit is contained in:
root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions

View File

@ -0,0 +1,126 @@
"""
Created on Wed Feb 19 12:39:49 2014
Author: Josef Perktold
"""
import numpy as np
from scipy import stats
from statsmodels.sandbox.distributions.extras import (SkewNorm_gen, skewnorm,
ACSkewT_gen,
NormExpan_gen, pdf_moments,
ExpTransf_gen, LogTransf_gen)
from statsmodels.stats.moment_helpers import mc2mvsk, mnc2mc, mvsk2mnc
def example_n():
    """Print skew-normal values next to reference distributions.

    Every comparison line shows the ``skewnorm`` value, the reference value
    and their difference.  Shape 0 is compared with ``stats.norm`` and the
    shapes +/-1000 with ``stats.chi`` (one degree of freedom).  Two random
    samples of size 500 are drawn as well and their mean and variance are
    printed next to the output of ``skewnorm.stats``.
    """

    def show(value, reference):
        # one output line: value, reference, difference
        print(value, reference, value - reference)

    # density comparisons
    show(skewnorm.pdf(1, 0), stats.norm.pdf(1))
    show(skewnorm.pdf(1, 1000), stats.chi.pdf(1, 1))
    show(skewnorm.pdf(-1, -1000), stats.chi.pdf(1, 1))

    # sample moments against the moments reported by the distribution
    for shape in (0, 5):
        sample = skewnorm.rvs(shape, size=500)
        print('sample mean var: ', sample.mean(), sample.var())
        print('theoretical mean var', skewnorm.stats(shape))

    # cdf and sf comparisons
    show(skewnorm.cdf(1, 0), stats.norm.cdf(1))
    show(skewnorm.cdf(1, 1000), stats.chi.cdf(1, 1))
    show(skewnorm.sf(0.05, 1000), stats.chi.sf(0.05, 1))
def example_T():
    """Print sample and theoretical moments of the ``ACSkewT_gen`` distribution.

    A sample of size 500 is drawn with arguments ``(10, 0)`` and its mean and
    variance are printed together with ``skewt.stats(10, 0)`` and
    ``stats.t.stats(10)``.  The moments for ``(10, 1000)`` are then printed
    next to the sample moments of the absolute value of 1000 t(10) draws.
    """
    skewt = ACSkewT_gen()

    sample = skewt.rvs(10, 0, size=500)
    print('sample mean var: ', sample.mean(), sample.var())
    print('theoretical mean var', skewt.stats(10, 0))
    print('t mean var', stats.t.stats(10))

    # -> folded t distribution, as alpha -> inf
    print(skewt.stats(10, 1000))
    folded = np.abs(stats.t.rvs(10, size=1000))
    print(folded.mean(), folded.var())
def examples_normexpand():
    """Print moments and pdf values of a ``NormExpan_gen`` built from a sample.

    A sample of size 100 is drawn from ``SkewNorm_gen`` with shape 5 and used
    to construct the expansion (``mode='sample'``).  The sample statistics,
    the moments reported by the expansion, their difference, the derived
    central and non-central moments, and two pdf evaluations (from
    ``pdf_moments`` and from the expansion itself) are printed.
    """
    sample = SkewNorm_gen().rvs(5, size=100)
    normexpan = NormExpan_gen(sample, mode='sample')

    # sample statistics: everything after the first two fields of describe
    sample_mvsk = stats.describe(sample)[2:]
    print('sample: mu,sig,sk,kur')
    print(sample_mvsk)

    dist_mvsk = normexpan.stats(moments='mvsk')
    print('normexpan: mu,sig,sk,kur')
    print(dist_mvsk)
    print('mvsk diff distribution - sample')
    print(np.array(dist_mvsk) - np.array(sample_mvsk))

    print('normexpan attributes mvsk')
    print(mc2mvsk(normexpan.cnt))
    print(normexpan.mvsk)

    # convert mean/variance/skew/kurtosis to non-central, then central moments
    noncentral = mvsk2mnc(dist_mvsk)
    central = mnc2mc(noncentral)
    print('central moments')
    print(central)
    print('non-central moments')
    print(noncentral)

    # evaluate both pdf versions one unit below and above central[0]
    approx_pdf = pdf_moments(central)
    lower, upper = central[0] - 1, central[0] + 1
    print('\npdf approximation from moments')
    print('pdf at', lower, upper)
    print(approx_pdf([lower, upper]))
    print(normexpan.pdf([lower, upper]))
def examples_transf():
    """Print transformed-distribution results next to their scipy counterparts.

    Three groups of output are produced:

    * ``ExpTransf_gen(stats.norm, ...)`` against ``stats.lognorm``,
    * ``LogTransf_gen(stats.gamma)`` against ``stats.loggamma``,
    * ``LogTransf_gen`` / ``ExpTransf_gen`` of ``stats.laplace`` against
      ``stats.loglaplace``.

    The private ``_cdf`` is called directly for the shape-parameter cases;
    the original author noted that the public ``cdf`` would require a change
    in scipy.stats.
    """
    print('Results for lognormal')
    lognormalg = ExpTransf_gen(stats.norm, a=0,
                               name='Log transformed normal general')
    print(lognormalg.cdf(1))
    print(stats.lognorm.cdf(1, 1))
    print(lognormalg.stats())
    print(stats.lognorm.stats(1))
    print(lognormalg.rvs(size=5))

    print('Results for expgamma')
    loggammaexpg = LogTransf_gen(stats.gamma)
    print(loggammaexpg._cdf(1, 10))
    print(stats.loggamma.cdf(1, 10))
    print(loggammaexpg._cdf(2, 15))
    print(stats.loggamma.cdf(2, 15))
    # this requires change in scipy.stats.distribution
    #print(loggammaexpg.cdf(1,10))

    print('Results for loglaplace')
    loglaplaceg = LogTransf_gen(stats.laplace)
    print(loglaplaceg._cdf(2))
    print(stats.loglaplace.cdf(2, 1))
    loglaplaceexpg = ExpTransf_gen(stats.laplace)
    print(loglaplaceexpg._cdf(2))
    # These two values were previously computed and thrown away; print them
    # so the comparison is visible.  Both were recorded by the original
    # author as 0.98148148148148151.
    print(stats.loglaplace.cdf(3, 3))
    print(loglaplaceexpg._cdf(3, 0, 1./3))
if __name__ == '__main__':
    # run every example in the order they are defined above
    for run_example in (example_n, example_T,
                        examples_normexpand, examples_transf):
        run_example()

View File

@ -0,0 +1,28 @@
'''Example for estimating distribution parameters when some are fixed.
This uses currently a patched version of the distributions, two methods are
added to the continuous distributions. This has no side effects.
It also adds bounds to vonmises, which changes the behavior of it for some
methods.
'''
import numpy as np
from scipy import stats
# Note the following import attaches methods to scipy.stats.distributions
# and adds bounds to stats.vonmises
# from statsmodels.sandbox.distributions import sppatch
# NOTE(review): `fit_fr` is presumably one of the methods attached by the
# `sppatch` import that is commented out above -- confirm before running.

# --- gamma sample: shape 2.5, loc 0, scale 1.2 ---
np.random.seed(12345)
x = stats.gamma.rvs(2.5, loc=0, scale=1.2, size=200)

# estimate all parameters with the regular fit
print(stats.gamma.fit(x))
# fit_fr: nan marks a parameter to estimate, a number keeps it fixed;
# first everything free, then shape parameter only
for frozen in ([np.nan, np.nan, np.nan], [np.nan, 0., 1.2]):
    print(stats.gamma.fit_fr(x, frozen=frozen))

# --- lognormal sample: loc fixed at 0, shape and scale estimated ---
np.random.seed(12345)
x = stats.lognorm.rvs(2, loc=0, scale=2, size=200)
print(stats.lognorm.fit_fr(x, frozen=[np.nan, 0., np.nan]))

View File

@ -0,0 +1,11 @@
from scipy import stats
from statsmodels.stats import gof
# Draw 200 Poisson(0.6) variates and run two chi-square goodness-of-fit checks.
poissrvs = stats.poisson.rvs(0.6, size=200)

# manual route: bin the sample, then hand the frequencies to stats.chisquare
freq, expfreq, histsupp = gof.gof_binning_discrete(
    poissrvs, stats.poisson, (0.6,), nsupp=20)
chi2val, pval = stats.chisquare(freq, expfreq)
print(chi2val, pval)

# single-call route through the gof helper
print(gof.gof_chisquare_discrete(
    stats.poisson, (0.6,), poissrvs, 0.05, 'Poisson'))

View File

@ -0,0 +1,162 @@
"""examples for multivariate normal and t distributions
Created on Fri Jun 03 16:00:26 2011
@author: josef
for comparison I used R mvtnorm version 0.9-96
"""
import numpy as np
from numpy.testing import assert_array_almost_equal
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.distributions.mixture_rvs as mix
import statsmodels.sandbox.distributions.mv_normal as mvd
# Covariance matrix and mean vector shared by all examples in this script.
cov3 = np.array([[ 1. , 0.5 , 0.75],
[ 0.5 , 1.5 , 0.6 ],
[ 0.75, 0.6 , 2. ]])
mu = np.array([-1, 0.0, 2.0])
#************** multivariate normal distribution ***************
mvn3 = mvd.MVNormal(mu, cov3)
#compare with random sample
# large random sample, reused below for empirical frequencies and moments
x = mvn3.rvs(size=1000000)
# four evaluation points, one per row
xli = [[2., 1., 1.5],
[0., 2., 1.5],
[1.5, 1., 2.5],
[0., 1., 1.5]]
# same points transposed with a leading axis added: shape (1, 3, 4)
xliarr = np.asarray(xli).T[None,:, :]
#from R session
#pmvnorm(lower=-Inf,upper=(x[0,.]-mu)/sqrt(diag(cov3)),mean=rep(0,3),corr3)
# reference cdf values from R; r_cdf_errors is kept for the record only and
# is not used in this section
r_cdf = [0.3222292, 0.3414643, 0.5450594, 0.3116296]
r_cdf_errors = [1.715116e-05, 1.590284e-05, 5.356471e-05, 3.567548e-05]
n_cdf = [mvn3.cdf(a) for a in xli]
assert_array_almost_equal(r_cdf, n_cdf, decimal=4)
print(n_cdf)
print('')
# empirical counterparts of the cdf: fraction of draws below the first point,
# then below each of the four points (presumably x is (nobs, 3) -- TODO confirm)
print((x<np.array(xli[0])).all(-1).mean(0))
print((x[...,None]<xliarr).all(1).mean(0))
# the same two indicator functions passed to expect_mc with size=100000
print(mvn3.expect_mc(lambda x: (x<xli[0]).all(-1), size=100000))
print(mvn3.expect_mc(lambda x: (x[...,None]<xliarr).all(1), size=100000))
#other methods
# normalized(): asserted below to have cov equal to corr and a zero mean
mvn3n = mvn3.normalized()
assert_array_almost_equal(mvn3n.cov, mvn3n.corr, decimal=15)
assert_array_almost_equal(mvn3n.mean, np.zeros(3), decimal=15)
# normalize(x) applied to the sample; sample moments checked to 2 decimals
xn = mvn3.normalize(x)
xn_cov = np.cov(xn, rowvar=0)
assert_array_almost_equal(mvn3n.cov, xn_cov, decimal=2)
assert_array_almost_equal(np.zeros(3), xn.mean(0), decimal=2)
mvn3n2 = mvn3.normalized2()
assert_array_almost_equal(mvn3n.cov, mvn3n2.cov, decimal=2)
#mistake: "normalized2" standardizes - FIXED
#assert_array_almost_equal(np.eye(3), mvn3n2.cov, decimal=2)
xs = mvn3.standardize(x)
# NOTE(review): the covariance below is computed from xn, not xs; the
# original comment calls this a mixup -- confirm which one is intended
xs_cov = np.cov(xn, rowvar=0)
#another mixup xs is normalized
#assert_array_almost_equal(np.eye(3), xs_cov, decimal=2)
assert_array_almost_equal(mvn3.corr, xs_cov, decimal=2)
assert_array_almost_equal(np.zeros(3), xs.mean(0), decimal=2)
# marginal and conditional distributions: print their mean and cov
mv2m = mvn3.marginal(np.array([0,1]))
print(mv2m.mean)
print(mv2m.cov)
mv2c = mvn3.conditional(np.array([0,1]), [0])
print(mv2c.mean)
print(mv2c.cov)
mv2c = mvn3.conditional(np.array([0]), [0, 0])
print(mv2c.mean)
print(mv2c.cov)
# OLS of the first sample column on a constant and the other two columns;
# its predictions are printed next to the conditional means for comparison
mod = sm.OLS(x[:,0], sm.add_constant(x[:,1:], prepend=True))
res = mod.fit()
print(res.model.predict(np.array([1,0,0])))
mv2c = mvn3.conditional(np.array([0]), [0, 0])
print(mv2c.mean)
mv2c = mvn3.conditional(np.array([0]), [1, 1])
print(res.model.predict(np.array([1,1,1])))
print(mv2c.mean)
#the following wrong input does not raise an exception but produces wrong numbers
#mv2c = mvn3.conditional(np.array([0]), [[1, 1],[2,2]])
#************** multivariate t distribution ***************
# multivariate t built from the same mu and cov3, with third argument 4
# (presumably the degrees of freedom -- TODO confirm against mvd.MVT)
mvt3 = mvd.MVT(mu, cov3, 4)
# random sample reused below for all empirical checks
xt = mvt3.rvs(size=100000)
assert_array_almost_equal(mvt3.cov, np.cov(xt, rowvar=0), decimal=1)
mvt3s = mvt3.standardized()
mvt3n = mvt3.normalized()
#the following should be equal or correct up to numerical precision of float
assert_array_almost_equal(mvt3.corr, mvt3n.sigma, decimal=15)
assert_array_almost_equal(mvt3n.corr, mvt3n.sigma, decimal=15)
assert_array_almost_equal(np.eye(3), mvt3s.sigma, decimal=15)
# transform the sample both ways and compute its empirical cov / corr
xts = mvt3.standardize(xt)
xts_cov = np.cov(xts, rowvar=0)
xtn = mvt3.normalize(xt)
xtn_cov = np.cov(xtn, rowvar=0)
xtn_corr = np.corrcoef(xtn, rowvar=0)
assert_array_almost_equal(mvt3n.mean, xtn.mean(0), decimal=2)
#the following might fail sometimes (random test), add seed in tests
assert_array_almost_equal(mvt3n.corr, xtn_corr, decimal=1)
#watch out cov is not the same as sigma for t distribution, what's right here?
#normalize by sigma or by cov ? now normalized by sigma
assert_array_almost_equal(mvt3n.cov, xtn_cov, decimal=1)
assert_array_almost_equal(mvt3s.cov, xts_cov, decimal=1)
# cdf at a first point: computed value, empirical frequency in the sample,
# and two reference values from R
a = [0.0, 1.0, 1.5]
mvt3_cdf0 = mvt3.cdf(a)
print(mvt3_cdf0)
print((xt<np.array(a)).all(-1).mean(0))
print('R', 0.3026741) # "error": 0.0004832187
print('R', 0.3026855) # error 3.444375e-06 with smaller abseps
print('diff', mvt3_cdf0 - 0.3026855)
# same comparison at a second point
a = [0.0, 0.5, 1.0]
mvt3_cdf1 = mvt3.cdf(a)
print(mvt3_cdf1)
print((xt<np.array(a)).all(-1).mean(0))
print('R', 0.1946621) # "error": 0.0002524817)
print('R', 0.1946217) # "error:"2.748699e-06 with smaller abseps)
print('diff', mvt3_cdf1 - 0.1946217)
# both cdf values must agree with the more precise R values to 5 decimals
assert_array_almost_equal(mvt3_cdf0, 0.3026855, decimal=5)
assert_array_almost_equal(mvt3_cdf1, 0.1946217, decimal=5)
# Second normal component with shifted mean and halved covariance.
mu2 = np.array([4, 2.0, 2.0])
mvn32 = mvd.MVNormal(mu2, cov3/2., 4)

# Mixture draws with weights 0.4 / 0.6: five from the two t distributions
# (result not used further) and 2000 from the two normal distributions.
md = mix.mv_mixture_rvs([0.4, 0.6], 5, [mvt3, mvt3n], 3)
rvs = mix.mv_mixture_rvs([0.4, 0.6], 2000, [mvn3, mvn32], 3)
#rvs2 = rvs[:,:2]

# Pairwise scatter plots of the three columns of the normal mixture sample,
# one subplot per pair: (x column, y column, title).
fig = plt.figure()
panels = ((0, 1, '1 versus 0'), (0, 2, '2 versus 0'), (1, 2, '2 versus 1'))
for position, (xcol, ycol, title) in enumerate(panels, start=1):
    fig.add_subplot(2, 2, position)
    plt.plot(rvs[:, xcol], rvs[:, ycol], '.', alpha=0.25)
    plt.title(title)
#plt.show()

View File

@ -0,0 +1,314 @@
"""
Created on Sun May 09 22:23:22 2010
Author: josef-pktd
License: BSD
"""
import numpy as np
from numpy.testing import assert_almost_equal
from scipy import stats
from statsmodels.sandbox.distributions.extras import (
ExpTransf_gen, LogTransf_gen,
squarenormalg, absnormalg, negsquarenormalg, squaretg)
# Shared evaluation points, read as module globals by the checks below.
l = 0.0                             # passed as loc=
s = 1.0                             # passed as scale=
ppfq = [0.1, 0.5, 0.9]              # probabilities for ppf comparisons
xx = [0.95, 1.0, 1.1]               # points for cdf/sf/pdf comparisons
nxx = [-point for point in xx]      # mirrored points
def test_loggamma():
    """Compare ``LogTransf_gen(stats.gamma)._cdf`` with ``stats.loggamma.cdf``.

    Both are evaluated at (x, shape) = (1, 10) and (2, 15) and must agree to
    14 decimals.
    """
    loggammaexpg = LogTransf_gen(stats.gamma)
    for point, shape in ((1, 10), (2, 15)):
        transformed = loggammaexpg._cdf(point, shape)
        reference = stats.loggamma.cdf(point, shape)
        assert_almost_equal(reference, transformed, 14)
def test_loglaplace():
    """Compare ``ExpTransf_gen(stats.laplace)._cdf`` with ``stats.loglaplace.cdf``.

    If x is laplace then y = exp(x) is loglaplace.  The parameters are
    tricky: the ``stats.loglaplace`` parameter is the inverse scale of x,
    so shape 3 there corresponds to scale 1/3 for the underlying laplace.
    """
    transformed_dist = ExpTransf_gen(stats.laplace)
    # recorded value: 0.98148148148148151
    reference = stats.loglaplace.cdf(3, 3)
    # arguments after x are loc and scale of the underlying laplace
    transformed = transformed_dist._cdf(3, 0, 1./3)
    assert_almost_equal(reference, transformed, 14)
class CheckDistEquivalence:
    """Checks that a transformed distribution matches a scipy distribution.

    Subclasses provide these attributes:

    * ``dist``, ``trargs``, ``trkwds`` -- the transformed distribution and
      the positional / keyword arguments used to call it,
    * ``statsdist``, ``stargs``, ``stkwds`` -- the reference distribution
      and its arguments.

    The evaluation points ``xx`` and probabilities ``ppfq`` are read from
    module globals.
    """

    def test_cdf(self):
        """cdf and the complement of sf both match the reference cdf."""
        cdftr = self.dist.cdf(xx, *self.trargs, **self.trkwds)
        sfctr = 1 - self.dist.sf(xx, *self.trargs, **self.trkwds)  # sf complement
        cdfst = self.statsdist.cdf(xx, *self.stargs, **self.stkwds)
        assert_almost_equal(cdfst, cdftr, 14)
        assert_almost_equal(cdfst, sfctr, 14)

    def test_pdf(self):
        """pdf matches the reference pdf."""
        pdftr = self.dist.pdf(xx, *self.trargs, **self.trkwds)
        pdfst = self.statsdist.pdf(xx, *self.stargs, **self.stkwds)
        assert_almost_equal(pdfst, pdftr, 13)

    def test_ppf(self):
        """ppf matches the reference ppf."""
        ppftr = self.dist.ppf(ppfq, *self.trargs, **self.trkwds)
        ppfst = self.statsdist.ppf(ppfq, *self.stargs, **self.stkwds)
        assert_almost_equal(ppfst, ppftr, 13)

    def test_rvs(self):
        """Mean of 100 random draws is roughly the theoretical mean.

        Only ``trargs`` is used here, for both the draws and the moments.
        """
        rvs = self.dist.rvs(*self.trargs, size=100)
        mean_s = rvs.mean(0)
        mean_d, var_d = self.dist.stats(*self.trargs, moments='mv')
        if np.any(np.abs(mean_d) < 1):
            # small means: compare absolute values to one decimal
            assert_almost_equal(mean_d, mean_s, 1)
        else:
            # larger means: loose check (decimal=0) that the ratio is near 1
            assert_almost_equal(mean_s/mean_d, 1., 0)

    def test_stats(self):
        """Mean/variance match closely, skew/kurtosis roughly."""
        # Each side gets its own keywords.  The transformed side previously
        # reused self.stkwds, which silently ignored self.trkwds.
        trkwds = {'moments': 'mvsk'}
        trkwds.update(self.trkwds)
        stkwds = {'moments': 'mvsk'}
        stkwds.update(self.stkwds)
        mvsktr = np.array(self.dist.stats(*self.trargs, **trkwds))
        mvskst = np.array(self.statsdist.stats(*self.stargs, **stkwds))
        assert_almost_equal(mvskst[:2], mvsktr[:2], 8)
        if np.any(np.abs(mvskst[2:]) < 1):
            assert_almost_equal(mvskst[2:], mvsktr[2:], 1)
        else:
            # loose check (decimal=0) that the ratios are near 1
            assert_almost_equal(mvskst[2:]/mvsktr[2:], np.ones(2), 0)
class TestLoggamma_1(CheckDistEquivalence):
    """Log-transformed gamma (shape 10) against ``stats.loggamma`` (shape 10)."""

    # Class-level attributes instead of __init__: pytest does not collect
    # test classes that define __init__, so the inherited checks never ran.
    # Direct instantiation keeps working and sees the same attributes.
    dist = LogTransf_gen(stats.gamma)
    trargs = (10,)
    trkwds = {}
    statsdist = stats.loggamma
    stargs = (10,)
    stkwds = {}
class TestSquaredNormChi2_1(CheckDistEquivalence):
    """``squarenormalg`` against ``stats.chi2`` with one degree of freedom."""

    # Class-level attributes instead of __init__: pytest does not collect
    # test classes that define __init__, so the inherited checks never ran.
    # Direct instantiation keeps working and sees the same attributes.
    dist = squarenormalg
    trargs = ()
    trkwds = {}
    statsdist = stats.chi2
    stargs = (1,)
    stkwds = {}
class TestSquaredNormChi2_2(CheckDistEquivalence):
    """``squarenormalg`` against ``stats.chi2(1)``, both with loc=-10, scale=20."""

    # Class-level attributes instead of __init__: pytest does not collect
    # test classes that define __init__, so the inherited checks never ran.
    # Direct instantiation keeps working and sees the same attributes.
    dist = squarenormalg
    trargs = ()
    trkwds = dict(loc=-10, scale=20)
    statsdist = stats.chi2
    stargs = (1,)
    stkwds = dict(loc=-10, scale=20)
class TestAbsNormHalfNorm(CheckDistEquivalence):
    """``absnormalg`` against ``stats.halfnorm``."""

    # Class-level attributes instead of __init__: pytest does not collect
    # test classes that define __init__, so the inherited checks never ran.
    # Direct instantiation keeps working and sees the same attributes.
    dist = absnormalg
    trargs = ()
    trkwds = {}
    statsdist = stats.halfnorm
    stargs = ()
    stkwds = {}
class TestSquaredTF(CheckDistEquivalence):
    """``squaretg`` with argument 10 against ``stats.f`` with arguments (1, 10)."""

    # Class-level attributes instead of __init__: pytest does not collect
    # test classes that define __init__, so the inherited checks never ran.
    # Direct instantiation keeps working and sees the same attributes.
    dist = squaretg
    trargs = (10,)
    trkwds = {}
    statsdist = stats.f
    stargs = (1, 10)
    stkwds = {}
def test_squared_normal_chi2():
    """cdf of ``squarenormalg`` equals the chi2(1) cdf at the points ``xx``.

    The square of a standard normal random variable is chisquare with dof=1
    distributed.  Both the cdf and the complement of the sf are compared to
    14 decimals, using the module-level ``l`` and ``s`` as loc and scale.
    """
    loc_scale = dict(loc=l, scale=s)
    transformed_cdf = squarenormalg.cdf(xx, **loc_scale)
    sf_complement = 1 - squarenormalg.sf(xx, **loc_scale)
    reference_cdf = stats.chi2.cdf(xx, 1)
    assert_almost_equal(reference_cdf, transformed_cdf, 14)
    assert_almost_equal(reference_cdf, sf_complement, 14)
if __name__ == '__main__':
    # Script mode: print the transformed distributions next to their scipy
    # counterparts, run a large-sample moment check, draw a histogram, and
    # finally run the tests in this file through pytest.
    #Examples for Transf2_gen, u- or hump shaped transformation
    #copied from transformtwo.py
    l,s = 0.0, 1.0
    ppfq = [0.1, 0.5, 0.9]
    xx = [0.95, 1.0, 1.1]
    nxx = [-0.95, -1.0, -1.1]
    # was a bare `print`, which does nothing in Python 3
    print()
    print('\nsquare of standard normal random variable is chisquare with dof=1 distributed')
    print('sqnorm cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), squarenormalg.cdf(xx,loc=l, scale=s))
    print('sqnorm 1-sf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), 1-squarenormalg.sf(xx,loc=l, scale=s))
    print('chi2 cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.chi2.cdf(xx,1))
    print('sqnorm pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), squarenormalg.pdf(xx,loc=l, scale=s))
    print('chi2 pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.chi2.pdf(xx,1))
    print('sqnorm ppf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), squarenormalg.ppf(ppfq,loc=l, scale=s))
    print('chi2 ppf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.chi2.ppf(ppfq,1))
    print('sqnorm cdf with loc scale', squarenormalg.cdf(xx,loc=-10, scale=20))
    print('chi2 cdf with loc scale', stats.chi2.cdf(xx,1,loc=-10, scale=20))

    print('\nabsolute value of standard normal random variable is foldnorm(0) and ')
    print('halfnorm distributed:')
    print('absnorm cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), absnormalg.cdf(xx,loc=l, scale=s))
    print('absnorm 1-sf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), 1-absnormalg.sf(xx,loc=l, scale=s))
    print('foldn cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.foldnorm.cdf(xx,1e-5))
    print('halfn cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.halfnorm.cdf(xx))
    print('absnorm pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), absnormalg.pdf(xx,loc=l, scale=s))
    print('foldn pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.foldnorm.pdf(xx,1e-5))
    print('halfn pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.halfnorm.pdf(xx))
    print('absnorm ppf for (%3.2f, %3.2f, %3.2f):' % tuple(ppfq), absnormalg.ppf(ppfq,loc=l, scale=s))
    print('foldn ppf for (%3.2f, %3.2f, %3.2f):' % tuple(ppfq), stats.foldnorm.ppf(ppfq,1e-5))
    print('halfn ppf for (%3.2f, %3.2f, %3.2f):' % tuple(ppfq), stats.halfnorm.ppf(ppfq))

    print('\nnegative square of standard normal random variable is')
    print('1-chisquare with dof=1 distributed')
    print('this is mainly for testing')
    print('the following should be outside of the support - returns nan')
    print('nsqnorm cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), negsquarenormalg.cdf(xx,loc=l, scale=s))
    print('nsqnorm 1-sf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), 1-negsquarenormalg.sf(xx,loc=l, scale=s))
    print('nsqnorm pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), negsquarenormalg.pdf(xx,loc=l, scale=s))
    print('nsqnorm cdf for (%3.2f, %3.2f, %3.2f):' % tuple(nxx), negsquarenormalg.cdf(nxx,loc=l, scale=s))
    print('nsqnorm 1-sf for (%3.2f, %3.2f, %3.2f):' % tuple(nxx), 1-negsquarenormalg.sf(nxx,loc=l, scale=s))
    print('chi2 sf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.chi2.sf(xx,1))
    print('nsqnorm pdf for (%3.2f, %3.2f, %3.2f):' % tuple(nxx), negsquarenormalg.pdf(nxx,loc=l, scale=s))
    print('chi2 pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.chi2.pdf(xx,1))
    print('nsqnorm pdf for (%3.2f, %3.2f, %3.2f):' % tuple(nxx), negsquarenormalg.pdf(nxx,loc=l, scale=s))

    print('\nsquare of a t distributed random variable with dof=10 is')
    print(' F with dof=1,10 distributed')
    print('sqt cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), squaretg.cdf(xx,10))
    print('sqt 1-sf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), 1-squaretg.sf(xx,10))
    print('f cdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.f.cdf(xx,1,10))
    print('sqt pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), squaretg.pdf(xx,10))
    print('f pdf for (%3.2f, %3.2f, %3.2f):' % tuple(xx), stats.f.pdf(xx,1,10))
    print('sqt ppf for (%3.2f, %3.2f, %3.2f):' % tuple(ppfq), squaretg.ppf(ppfq,10))
    print('f ppf for (%3.2f, %3.2f, %3.2f):' % tuple(ppfq), stats.f.ppf(ppfq,1,10))
    print('sqt cdf for 100:', squaretg.cdf(100,10))
    print('f cdf for 100:', stats.f.cdf(100,1,10))
    print('sqt stats:', squaretg.stats(10, moments='mvsk'))
    print('f stats:', stats.f.stats(1,10, moments='mvsk'))
    #Note the results differ for skew and kurtosis. I think the 3rd and 4th moment
    # in the scipy.stats.f distribution is incorrect.
    # I corrected it now in stats.distributions.py in bzr branch
    v1=1
    v2=10
    g1 = 2*(v2+2*v1-2.)/(v2-6.)*np.sqrt(2*(v2-4.)/(v1*(v2+v1-2.)))
    g2 = 3/(2.*v2-16)*(8+g1*g1*(v2-6.))
    print('corrected skew, kurtosis of f(1,10) is', g1, g2)
    print(squarenormalg.rvs())
    print(squarenormalg.rvs(size=(2,4)))
    print('sqt random variables')
    print(stats.f.rvs(1,10,size=4))
    print(squaretg.rvs(10,size=4))

    #a large number check:
    # These values used to be computed and thrown away; print them so they
    # can be compared with the recorded session below.
    np.random.seed(464239857)
    rvstsq = squaretg.rvs(10,size=100000)
    print('sqt moment(4):', squaretg.moment(4,10))
    print('sample 4th moment:', (rvstsq**4).mean())
    print('sqt moment(3):', squaretg.moment(3,10))
    print('sample 3rd moment:', (rvstsq**3).mean())
    print('sqt stats:', squaretg.stats(10, moments='mvsk'))
    print('sample describe:', stats.describe(rvstsq))
    '''
    >>> np.random.seed(464239857)
    >>> rvstsq = squaretg.rvs(10,size=100000)
    >>> squaretg.moment(4,10)
    2734.3750000000009
    >>> (rvstsq**4).mean()
    2739.672765170933
    >>> squaretg.moment(3,10)
    78.124999999997044
    >>> (rvstsq**3).mean()
    84.13950048850549
    >>> squaretg.stats(10, moments='mvsk')
    (array(1.2500000000000022), array(4.6874999999630909), array(5.7735026919777912), array(106.00000000170148))
    >>> stats.describe(rvstsq)
    (100000, (3.2953470738423724e-009, 92.649615690914473), 1.2534924690963247, 4.7741427958594098, 6.1562177957041895, 100.99331166052181)
    '''
    # checking the distribution
    # fraction of observations in each decile
    dec = squaretg.ppf(np.linspace(0.,1,11),10)
    freq,edges = np.histogram(rvstsq, bins=dec)
    print(freq/float(len(rvstsq)))

    import matplotlib.pyplot as plt
    # `density` replaces the `normed` keyword, which matplotlib has removed
    freq,edges,_ = plt.hist(rvstsq, bins=50, range=(0,4),density=True)
    # shift the bin edges by half a bin width to get bin centers
    edges += (edges[1]-edges[0])/2.0
    plt.plot(edges[:-1], squaretg.pdf(edges[:-1], 10), 'r')
    #plt.show()
    #plt.close()
    '''
    >>> plt.plot(edges[:-1], squaretg.pdf(edges[:-1], 10), 'r')
    [<matplotlib.lines.Line2D object at 0x06EBFDB0>]
    >>> plt.fill(edges[4:8], squaretg.pdf(edges[4:8], 10), 'r')
    [<matplotlib.patches.Polygon object at 0x0725BA90>]
    >>> plt.show()
    >>> plt.fill_between(edges[4:8], squaretg.pdf(edges[4:8], 10), y2=0, 'r')
    SyntaxError: non-keyword arg after keyword arg (<console>, line 1)
    >>> plt.fill_between(edges[4:8], squaretg.pdf(edges[4:8], 10), 0, 'r')
    Traceback (most recent call last):
    AttributeError: 'module' object has no attribute 'fill_between'
    >>> fig = figure()
    Traceback (most recent call last):
    NameError: name 'figure' is not defined
    >>> ax1 = fig.add_subplot(311)
    Traceback (most recent call last):
    NameError: name 'fig' is not defined
    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(111)
    >>> ax1.fill_between(edges[4:8], squaretg.pdf(edges[4:8], 10), 0, 'r')
    Traceback (most recent call last):
    AttributeError: 'AxesSubplot' object has no attribute 'fill_between'
    >>> ax1.fill(edges[4:8], squaretg.pdf(edges[4:8], 10), 0, 'r')
    Traceback (most recent call last):
    '''
    # run the tests of this file: verbose, stop at first failure, open pdb
    import pytest
    pytest.main([__file__, '-vvs', '-x', '--pdb'])

View File

@ -0,0 +1,260 @@
'''given a 1D sample of observation, find a matching distribution
* estimate maximum likelihood parameter for each distribution
* rank estimated distribution by Kolmogorov-Smirnov and Anderson-Darling
test statistics
Author: Josef Pktd
License: Simplified BSD
original December 2008
TODO:
* refactor to result class
* split estimation by support, add option and choose automatically
*
'''
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
#stats.distributions.beta_gen._fitstart = lambda self, data : (5,5,0,1)
def plothist(x,distfn, args, loc, scale, right=1):
    """Plot a histogram of ``x`` with a fitted pdf and a reference t pdf.

    Opens a new figure, draws a 25-bin density histogram of ``x``, overlays
    the pdf of ``distfn`` evaluated at the bin edges (red dashed, clipped at
    the tallest bar) and the pdf of a t distribution with 10 degrees of
    freedom and scale 10 multiplied by ``right`` (blue).  The height of the
    tallest bar is printed.

    Parameters
    ----------
    x : array_like
        Sample to plot.
    distfn : distribution
        Object providing ``pdf(points, *args, loc=..., scale=...)`` and a
        ``name`` attribute, e.g. a scipy.stats distribution.
    args : tuple
        Shape parameters passed positionally to ``distfn.pdf``.
    loc, scale : float
        Location and scale passed to ``distfn.pdf`` and shown in the title.
    right : float, optional
        Factor applied to the reference t pdf.
    """
    plt.figure()
    # the histogram of the data; `density` replaces the `normed` keyword,
    # which matplotlib has removed
    _, bins, patches = plt.hist(x, 25, density=True, facecolor='green',
                                alpha=0.75)
    maxheight = max(p.get_height() for p in patches)
    print(maxheight)

    # add a 'best fit' line, clipped so that a spike in the pdf does not
    # dwarf the histogram
    yt = distfn.pdf(bins, *args, loc=loc, scale=scale)
    yt[yt>maxheight]=maxheight
    plt.plot(bins, yt, 'r--', linewidth=1)

    # reference density
    ys = stats.t.pdf(bins, 10, scale=10)*right
    plt.plot(bins, ys, 'b-', linewidth=1)

    plt.xlabel('Smarts')
    plt.ylabel('Probability')
    plt.title(fr'$\mathrm{{Testing: {distfn.name} :}}\ \mu={loc:f},\ \sigma={scale:f}$')
    plt.grid(True)
    plt.draw()
    #plt.show()
    #plt.close()
# Candidate distribution names, looked up by name on scipy.stats below.
# The last two lines of the list are presumably the discrete distributions;
# the classification loop only keeps entries that have a `_pdf` attribute.
# NOTE(review): some of these names may not exist in the installed scipy
# version -- verify before running.
# Shorter alternative start of the list, kept for reference:
#targetdist = ['norm','t','truncnorm','johnsonsu','johnsonsb',
targetdist = ['norm','alpha', 'anglit', 'arcsine',
'beta', 'betaprime', 'bradford', 'burr', 'fisk', 'cauchy',
'chi', 'chi2', 'cosine', 'dgamma', 'dweibull', 'erlang',
'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy',
'f', 'foldnorm', 'frechet_r', 'weibull_min', 'frechet_l',
'weibull_max', 'genlogistic', 'genpareto', 'genexpon', 'genextreme',
'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'gumbel_r',
'gumbel_l', 'halfcauchy', 'halflogistic', 'halfnorm', 'hypsecant',
'gausshyper', 'invgamma', 'invnorm', 'invweibull', 'johnsonsb',
'johnsonsu', 'laplace', 'levy', 'levy_l',
'logistic', 'loggamma', 'loglaplace', 'lognorm', 'gilbrat',
'maxwell', 'mielke', 'nakagami', 'ncx2', 'ncf', 't',
'nct', 'pareto', 'lomax', 'powerlaw', 'powerlognorm', 'powernorm',
'rdist', 'rayleigh', 'reciprocal', 'rice', 'recipinvgauss',
'semicircular', 'triang', 'truncexpon', 'truncnorm',
'tukeylambda', 'uniform', 'vonmises', 'wald', 'wrapcauchy',
'binom', 'bernoulli', 'nbinom', 'geom', 'hypergeom', 'logser',
'poisson', 'planck', 'boltzmann', 'randint', 'zipf', 'dlaplace']
# Lists that collect distribution names by support category; they are
# filled through `categ` by the classification loop that follows.
left = []
right = []
finite = []
unbound = []
other = []
contdist = []
discrete = []

# (kind of lower bound, kind of upper bound) -> list collecting the names.
# A first version of this dict mapped to string labels and was overwritten
# immediately; only the list-valued mapping is kept.
categ = {
    ('open', 'open'): unbound,
    ('0', 'open'): right,
    ('open', '0'): left,
    ('finite', 'finite'): finite,
    ('oth', 'oth'): other,
}

# Hand-recorded classification of the continuous distributions by support.
categ2 = {
    ('open', '0') : ['frechet_l', 'weibull_max', 'levy_l'],
    ('finite', 'finite') : ['anglit', 'cosine', 'rdist', 'semicircular'],
    ('0', 'open') : ['alpha', 'burr', 'fisk', 'chi', 'chi2', 'erlang',
        'expon', 'exponweib', 'exponpow', 'fatiguelife', 'foldcauchy', 'f',
        'foldnorm', 'frechet_r', 'weibull_min', 'genpareto', 'genexpon',
        'gamma', 'gengamma', 'genhalflogistic', 'gompertz', 'halfcauchy',
        'halflogistic', 'halfnorm', 'invgamma', 'invnorm', 'invweibull',
        'levy', 'loglaplace', 'lognorm', 'gilbrat', 'maxwell', 'mielke',
        'nakagami', 'ncx2', 'ncf', 'lomax', 'powerlognorm', 'rayleigh',
        'rice', 'recipinvgauss', 'truncexpon', 'wald'],
    ('open', 'open') : ['cauchy', 'dgamma', 'dweibull', 'genlogistic', 'genextreme',
        'gumbel_r', 'gumbel_l', 'hypsecant', 'johnsonsu', 'laplace',
        'logistic', 'loggamma', 't', 'nct', 'powernorm', 'reciprocal',
        'truncnorm', 'tukeylambda', 'vonmises'],
    ('0', 'finite') : ['arcsine', 'beta', 'betaprime', 'bradford', 'gausshyper',
        'johnsonsb', 'powerlaw', 'triang', 'uniform', 'wrapcauchy'],
    ('finite', 'open') : ['pareto']
}
#Note: weibull_max == frechet_l

# 'genextreme' is listed under ('open', 'open') above but is also included
# in the set of names that get the positive part of the sample
right_incorrect = ['genextreme']
right_all = (categ2[('0', 'open')] + categ2[('0', 'finite')]
             + categ2[('finite', 'open')] + right_incorrect)
# Classify every continuous candidate by the kind of its support bounds and
# record it in `contdist` and in the matching `categ` list.
for distname in targetdist:
    # Skip names the installed scipy does not provide instead of aborting
    # the whole module with an AttributeError.
    distfn = getattr(stats, distname, None)
    if distfn is None:
        print('skipping %s: not available in scipy.stats' % distname)
        continue
    # only objects with a `_pdf` attribute are classified (presumably this
    # filters out the discrete distributions)
    if not hasattr(distfn, '_pdf'):
        continue

    # lower bound of the support
    if np.isinf(distfn.a):
        low = 'open'
    elif distfn.a == 0:
        low = '0'
    else:
        low = 'finite'

    # upper bound of the support
    if np.isinf(distfn.b):
        high = 'open'
    elif distfn.b == 0:
        high = '0'
    else:
        high = 'finite'

    contdist.append(distname)
    # combinations without a predefined list get a new one
    categ.setdefault((low, high), []).append(distname)
# Distributions excluded from the fitting run.  Earlier exclusion lists and
# an earlier selection restricted to the ('open', 'open') category were
# assigned here and overwritten straight away; only the effective values
# remain.  Notes carried over from those alternatives:
#   'genextreme' is right (or left?), 'reciprocal' requires 0<a<b,
#   'vonmises' no a,b
#   'wrapcauchy' requires additional parameter (scale) in argcheck
not_good = ['genexpon', 'vonmises']

# fit every continuous distribution that is not excluded
targetdist = [f for f in contdist if f not in not_good]
# alternatives for quick experiments:
#targetdist = contdist
#targetdist = not_good
#targetdist = ['t', 'f']
#targetdist = ['norm','burr']
if __name__ == '__main__':
    # Script mode: draw a mixture sample, fit every distribution in
    # `targetdist`, rank the fits by Kolmogorov-Smirnov p-value, and save
    # one histogram plot per distribution.
    from operator import itemgetter
    import os

    #TODO: calculate correct tail probability for mixture
    prefix = 'run_conv500_1_'
    convol = 0.75       # fraction of the sample drawn from the t distribution
    n = 500
    dgp_arg = 10
    dgp_scale = 10
    results = []
    for i in range(1):
        # `size` must be an integer: n*convol is a float
        n_t = int(round(n * convol))
        n_halflogistic = n - n_t
        rvs_orig = stats.t.rvs(dgp_arg, scale=dgp_scale, size=n_t)
        rvs_orig = np.hstack((rvs_orig,
                              stats.halflogistic.rvs(loc=0.4, scale=5.0,
                                                     size=n_halflogistic)))
        # distributions in `right_all` are fitted to the positive part only
        rvs_pos = rvs_orig[rvs_orig>0]
        rightfactor = 1
        rvs_right = rvs_pos
        print('='*50)
        print('samplesize = ', n)
        for distname in targetdist:
            distfn = getattr(stats,distname)
            if distname in right_all:
                rvs = rvs_right
                rind = rightfactor
            else:
                rvs = rvs_orig
                rind = 1
            print('-'*30)
            print('target = %s' % distname)
            sm = rvs.mean()
            sstd = np.sqrt(rvs.var())
            ssupp = (rvs.min(), rvs.max())

            # starting values for the fit depend on the distribution
            if distname in ['truncnorm','betaprime','reciprocal']:
                par0 = (sm-2*sstd,sm+2*sstd)
                par_est = tuple(distfn.fit(rvs,loc=sm,scale=sstd,*par0))
            elif distname == 'norm':
                par_est = tuple(distfn.fit(rvs,loc=sm,scale=sstd))
            elif distname == 'genextreme':
                par_est = tuple(distfn.fit(rvs,-5,loc=sm,scale=sstd))
            elif distname == 'wrapcauchy':
                par_est = tuple(distfn.fit(rvs,0.5,loc=0,scale=sstd))
            elif distname == 'f':
                par_est = tuple(distfn.fit(rvs,10,15,loc=0,scale=1))
            elif distname in right:
                par_est = tuple(distfn.fit(rvs,loc=0,scale=1))
            else:
                par_est = tuple(distfn.fit(rvs,loc=sm,scale=sstd))
            print('fit', par_est)

            # fit returns the shape parameters followed by loc and scale
            arg_est = par_est[:-2]
            loc_est = par_est[-2]
            scale_est = par_est[-1]
            rvs_normed = (rvs-loc_est)/scale_est
            ks_stat, ks_pval = stats.kstest(rvs_normed,distname, arg_est)
            print('kstest', ks_stat, ks_pval)

            quant = 0.1
            # Pass only the shape parameters positionally: par_est already
            # ends with loc and scale, so combining it with the loc/scale
            # keywords supplied both twice.
            crit = distfn.ppf(1-quant*float(rind), *arg_est,
                              loc=loc_est, scale=scale_est)
            tail_prob = stats.t.sf(crit,dgp_arg,scale=dgp_scale)
            print('crit, prob', quant, crit, tail_prob)
            results.append([distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob ])

    #TODO: collect results and compare tail quantiles
    res_sort = sorted(results, key = itemgetter(2))
    res_sort.reverse() #kstest statistic: smaller is better, pval larger is better
    print('number of distributions', len(res_sort))

    imagedir = 'matchresults'
    os.makedirs(imagedir, exist_ok=True)
    for ii,di in enumerate(res_sort):
        distname,ks_stat, ks_pval,arg_est,loc_est,scale_est,crit,tail_prob = di[:]
        distfn = getattr(stats,distname)
        if distname in right_all:
            rvs = rvs_right
            rind = rightfactor
            ri = 'r'
        else:
            rvs = rvs_orig
            ri = ''
            rind = 1
        print('%s ks-stat = %f, ks-pval = %f tail_prob = %f)' % \
              (distname, ks_stat, ks_pval, tail_prob))
        plothist(rvs,distfn,arg_est,loc_est,scale_est,right = rind)
        plt.savefig(os.path.join(imagedir,'%s%s%02d_%s.png'% (prefix, ri,ii, distname)))
        # the figure is on disk; close it so the run does not keep one open
        # figure per distribution
        plt.close()
        ##plt.show()