some new features

ilgazca
2025-07-30 17:09:11 +03:00
parent db5d46760a
commit 8019bd3b7c
20616 changed files with 4375466 additions and 8 deletions


@@ -0,0 +1,21 @@
"""
This module contains the one-parameter exponential families used
for fitting GLMs and GAMs.
These families are described in
P. McCullagh and J. A. Nelder. "Generalized linear models."
Monographs on Statistics and Applied Probability.
Chapman & Hall, London, 1983.
"""
from statsmodels.genmod.families import links
from .family import (Gaussian, Family, Poisson, Gamma, InverseGaussian,
                     Binomial, NegativeBinomial, Tweedie)
from statsmodels.tools._test_runner import PytestTester
__all__ = ['test', 'links', 'Family', 'Gamma', 'Gaussian', 'Poisson',
'InverseGaussian', 'Binomial', 'NegativeBinomial', 'Tweedie']
test = PytestTester()
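
A minimal usage sketch, not part of the diff, of how these exports pair up; Poisson, links.Log, and the link/variance attributes are statsmodels API, while the data is invented:

import numpy as np
from statsmodels.genmod.families import Poisson, links

fam = Poisson(links.Log())       # a family built with one of its valid links
mu = np.array([0.5, 2.0, 10.0])
print(fam.link(mu))              # the log link applied to the means
print(fam.variance(mu))          # Poisson variance equals the mean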

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,109 @@
"""
Test functions for genmod.families.family
"""
import warnings
import pytest
import numpy as np
from numpy.testing import assert_allclose
from scipy import integrate
from statsmodels.compat.scipy import SP_LT_17
from statsmodels.tools.sm_exceptions import (
ValueWarning,
)
import statsmodels.genmod.families as F
from statsmodels.genmod.families.family import Tweedie
import statsmodels.genmod.families.links as L
all_links = {
L.Logit, L.logit, L.Power, L.inverse_power, L.sqrt, L.inverse_squared,
L.identity, L.Log, L.log, L.CDFLink, L.probit, L.cauchy, L.LogLog,
L.loglog, L.CLogLog, L.cloglog, L.NegativeBinomial, L.nbinom
}
poisson_links = {L.Log, L.log, L.identity, L.sqrt}
gaussian_links = {L.Log, L.log, L.identity, L.inverse_power}
gamma_links = {L.Log, L.log, L.identity, L.inverse_power}
binomial_links = {
L.Logit, L.logit, L.probit, L.cauchy, L.Log, L.log, L.CLogLog,
L.cloglog, L.LogLog, L.loglog, L.identity
}
inverse_gaussian_links = {
L.inverse_squared, L.inverse_power, L.identity, L.Log, L.log
}
negative_binomial_links = {
L.Log, L.log, L.CLogLog, L.cloglog, L.identity, L.NegativeBinomial,
L.nbinom, L.Power
}
tweedie_links = {L.Log, L.log, L.Power}
link_cases = [
(F.Poisson, poisson_links),
(F.Gaussian, gaussian_links),
(F.Gamma, gamma_links),
(F.Binomial, binomial_links),
(F.InverseGaussian, inverse_gaussian_links),
    (F.NegativeBinomial, negative_binomial_links),
(F.Tweedie, tweedie_links)
]
@pytest.mark.parametrize("family, links", link_cases)
def test_invalid_family_link(family, links):
    invalid_links = all_links - links
    with warnings.catch_warnings():
        msg = ("Negative binomial dispersion parameter alpha not set. "
               "Using default value alpha=1.0.")
        warnings.filterwarnings("ignore", message=msg,
                                category=ValueWarning)
        warnings.filterwarnings("ignore",
                                category=FutureWarning)
        for link in invalid_links:
            # every unsupported link must be rejected at construction
            with pytest.raises(ValueError):
                family(link())
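
The contract exercised above, as a standalone illustration using the same F and L aliases; Logit is one of the links the sets above mark invalid for Poisson:

F.Poisson(L.Log())        # supported: Log is in poisson_links
try:
    F.Poisson(L.Logit())  # Logit is not in poisson_links
except ValueError as exc:
    print(exc)            # construction is rejected with ValueError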
@pytest.mark.parametrize("family, links", link_cases)
def test_family_link(family, links):
with warnings.catch_warnings():
msg = ("Negative binomial dispersion parameter alpha not set. "
"Using default value alpha=1.0.")
warnings.filterwarnings("ignore", message=msg,
category=ValueWarning)
warnings.filterwarnings("ignore",
category=FutureWarning)
for link in links:
assert family(link())
@pytest.mark.parametrize("family, links", link_cases)
def test_family_link_check(family, links):
    # check that we can turn off all link checks
    class Hugo:
        pass
with warnings.catch_warnings():
msg = ("Negative binomial dispersion parameter alpha not set. "
"Using default value alpha=1.0.")
warnings.filterwarnings("ignore", message=msg,
category=ValueWarning)
assert family(Hugo(), check_link=False)
@pytest.mark.skipif(SP_LT_17, reason="Scipy too old, function not available")
@pytest.mark.parametrize("power", (1.1, 1.5, 1.9))
def test_tweedie_loglike_obs(power):
"""Test that Tweedie loglike is normalized to 1."""
tweedie = Tweedie(var_power=power, eql=False)
mu = 2.0
scale = 2.9
def pdf(y):
return np.squeeze(
np.exp(
tweedie.loglike_obs(endog=y, mu=mu, scale=scale)
)
)
assert_allclose(pdf(0) + integrate.quad(pdf, 0, 1e2)[0], 1, atol=1e-4)
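
For background, a standalone sketch of why that normalization holds: for 1 < var_power < 2 the Tweedie distribution is a compound Poisson-gamma, an atom at zero plus a continuous density. The parametrization below (mean mu, variance scale * mu**p) is the standard one and does not rely on statsmodels internals; tweedie_pdf is a name invented here.

import numpy as np
from scipy import integrate, special


def tweedie_pdf(y, mu=2.0, scale=2.9, p=1.5, n_terms=200):
    """Compound Poisson-gamma density of a Tweedie variable, 1 < p < 2."""
    lam = mu ** (2 - p) / ((2 - p) * scale)    # Poisson rate
    alpha = (2 - p) / (p - 1)                  # gamma shape per event
    theta = scale * (p - 1) * mu ** (p - 1)    # gamma scale
    if y == 0:
        return np.exp(-lam)                    # point mass at zero
    # sum over the number of events n >= 1, in logs to avoid overflow
    n = np.arange(1, n_terms + 1)
    log_terms = (-lam + n * np.log(lam) - special.gammaln(n + 1)
                 + (n * alpha - 1) * np.log(y) - y / theta
                 - special.gammaln(n * alpha) - n * alpha * np.log(theta))
    return np.exp(special.logsumexp(log_terms))


# the same property the test asserts: atom plus continuous part sum to 1
print(tweedie_pdf(0.0) + integrate.quad(tweedie_pdf, 0, 1e2)[0])  # ~1.0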


@@ -0,0 +1,195 @@
"""
Test functions for genmod.families.links
"""
import numpy as np
from numpy.testing import assert_allclose, assert_equal, assert_array_less
from scipy import stats
import pytest
import statsmodels.genmod.families as families
from statsmodels.tools import numdiff as nd
# Family instances
links = families.links
logit = links.Logit()
inverse_power = links.InversePower()
sqrt = links.Sqrt()
inverse_squared = links.InverseSquared()
identity = links.Identity()
log = links.Log()
logc = links.LogC()
probit = links.Probit()
cauchy = links.Cauchy()
cloglog = links.CLogLog()
loglog = links.LogLog()
negbinom = links.NegativeBinomial()
# TODO: parametrize all these tests
Links = [logit, inverse_power, sqrt, inverse_squared, identity,
log, logc, probit, cauchy, cloglog, loglog, negbinom]
# links with defined second derivative of inverse link.
LinksISD = [inverse_power, sqrt, inverse_squared, identity,
logc, cauchy, probit, loglog]
def get_domainvalue(link):
"""
    Get a value in the domain of the given link function.
"""
z = -np.log(np.random.uniform(0, 1))
if isinstance(link, links.CLogLog): # prone to overflow
z = min(z, 3)
elif isinstance(link, links.LogLog):
z = max(z, -3)
elif isinstance(link, (links.NegativeBinomial, links.LogC)):
# domain is negative numbers
z = -z
return z
def test_inverse():
    # Check that link.inverse(link(p)) == p and link(link.inverse(z)) == z,
    # i.e. that both compositions are the identity.
np.random.seed(3285)
for link in Links:
for k in range(10):
p = np.random.uniform(0, 1) # In domain for all families
d = link.inverse(link(p))
assert_allclose(d, p, atol=1e-8, err_msg=str(link))
z = get_domainvalue(link)
d = link(link.inverse(z))
assert_allclose(d, z, atol=1e-8, err_msg=str(link))
def test_deriv():
# Check link function derivatives using numeric differentiation.
np.random.seed(24235)
for link in Links:
for k in range(10):
p = np.random.uniform(0, 1)
if isinstance(link, links.Cauchy):
p = np.clip(p, 0.03, 0.97)
d = link.deriv(p)
da = nd.approx_fprime(np.r_[p], link)
assert_allclose(d, da, rtol=1e-6, atol=1e-6,
err_msg=str(link))
if not isinstance(link, (type(inverse_power),
type(inverse_squared),
type(logc))):
# check monotonically increasing
assert_array_less(-d, 0)
def test_deriv2():
# Check link function second derivatives using numeric differentiation.
np.random.seed(24235)
for link in Links:
for k in range(10):
p = np.random.uniform(0, 1)
p = np.clip(p, 0.01, 0.99)
            if isinstance(link, links.Cauchy):
p = np.clip(p, 0.03, 0.97)
d = link.deriv2(p)
da = nd.approx_fprime(np.r_[p], link.deriv)
assert_allclose(d, da, rtol=5e-6, atol=1e-6,
err_msg=str(link))
def test_inverse_deriv():
    # Check that link.inverse_deriv(z) == 1 / link.deriv(link.inverse(z)),
    # which follows from the inverse function theorem.
np.random.seed(24235)
for link in Links:
for k in range(10):
z = get_domainvalue(link)
d = link.inverse_deriv(z)
f = 1 / link.deriv(link.inverse(z))
assert_allclose(d, f, rtol=1e-8, atol=1e-10,
err_msg=str(link))
def test_inverse_deriv2():
# Check second derivative of inverse link using numeric differentiation.
np.random.seed(24235)
for link in LinksISD:
for k in range(10):
z = get_domainvalue(link)
d2 = link.inverse_deriv2(z)
d2a = nd.approx_fprime(np.r_[z], link.inverse_deriv)
assert_allclose(d2, d2a, rtol=5e-6, atol=1e-6,
err_msg=str(link))
def test_invlogit_stability():
z = [1123.4910007309222, 1483.952316802719, 1344.86033748641,
706.339159002542, 1167.9986375146532, 663.8345826933115,
1496.3691686913917, 1563.0763842182257, 1587.4309332296314,
697.1173174974248, 1333.7256198289665, 1388.7667560586933,
819.7605431778434, 1479.9204150555015, 1078.5642245164856,
480.10338454985896, 1112.691659145772, 534.1061908007274,
918.2011296406588, 1280.8808515887802, 758.3890788775948,
673.503699841035, 1556.7043357878208, 819.5269028006679,
1262.5711060356423, 1098.7271535253608, 1482.811928490097,
796.198809756532, 893.7946963941745, 470.3304989319786,
1427.77079226037, 1365.2050226373822, 1492.4193201661922,
871.9922191949931, 768.4735925445908, 732.9222777654679,
812.2382651982667, 495.06449978924525]
zinv = logit.inverse(z)
assert_equal(zinv, np.ones_like(z))
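
A sketch of the overflow this test guards against; scipy.special.expit is shown only as an independent stable reference, not as what statsmodels uses internally:

import numpy as np
from scipy.special import expit

z = np.array([1123.49, 1587.43])

# the naive inverse logit overflows: exp(z) -> inf, inf / (1 + inf) -> nan
with np.errstate(over="ignore", invalid="ignore"):
    print(np.exp(z) / (1 + np.exp(z)))  # [nan nan]

# the stable form only exponentiates the negative side, which underflows
# harmlessly to zero, giving exactly 1.0 for large z
print(1 / (1 + np.exp(-z)))             # [1. 1.]
print(expit(z))                         # [1. 1.]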
class MyCLogLog(links.Link):
def __call__(self, p):
# p = self._clean(p)
return np.log(-np.log(1 - p))
def inverse(self, z):
return 1 - np.exp(-np.exp(z))
def deriv(self, p):
# p = self._clean(p)
return 1. / ((p - 1) * (np.log(1 - p)))
class CasesCDFLink:
    # just a namespace to hold cases for test_cdflink
link_pairs = [
(links.CDFLink(dbn=stats.gumbel_l), links.CLogLog()),
(links.CDFLink(dbn=stats.gumbel_r), links.LogLog()),
(links.CDFLink(dbn=stats.norm), links.Probit()),
(links.CDFLink(dbn=stats.logistic), links.Logit()),
(links.CDFLink(dbn=stats.t(1)), links.Cauchy()),
# approximation of t by normal is not good enough for rtol, atol
# (links.CDFLink(dbn=stats.t(1000000)), links.Probit()),
(MyCLogLog(), links.CLogLog()), # not a cdflink, but compares
]
methods = ['__call__', 'deriv', 'inverse', 'inverse_deriv', 'deriv2',
'inverse_deriv2']
p = np.linspace(0, 1, 6)
eps = 1e-3
p = np.clip(p, eps, 1 - eps)
@pytest.mark.parametrize("m", CasesCDFLink.methods)
@pytest.mark.parametrize("link1, link2", CasesCDFLink.link_pairs)
def test_cdflink(m, link1, link2):
p = CasesCDFLink.p
res1 = getattr(link1, m)(p)
res2 = getattr(link2, m)(p)
assert_allclose(res1, res2, atol=1e-8, rtol=1e-8)
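
A quick numeric illustration of the equivalence these pairs encode, using the probit case; 1.959964 is the standard normal 97.5% quantile:

from scipy import stats
import statsmodels.genmod.families.links as links

probit_via_cdf = links.CDFLink(dbn=stats.norm)
print(probit_via_cdf(0.975))             # ~1.959964, link(p) = dbn.ppf(p)
print(links.Probit()(0.975))             # same value
print(probit_via_cdf.inverse(1.959964))  # ~0.975,    inverse(z) = dbn.cdf(z)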


@@ -0,0 +1,283 @@
"""
Variance functions for use with the link functions in statsmodels.family.links
"""
import numpy as np
FLOAT_EPS = np.finfo(float).eps
class VarianceFunction:
"""
Relates the variance of a random variable to its mean. Defaults to 1.
Methods
-------
call
Returns an array of ones that is the same shape as `mu`
Notes
-----
After a variance function is initialized, its call method can be used.
Alias for VarianceFunction:
constant = VarianceFunction()
See Also
--------
statsmodels.genmod.families.family
"""
def __call__(self, mu):
"""
Default variance function
Parameters
----------
mu : array_like
mean parameters
Returns
-------
v : ndarray
ones(mu.shape)
"""
mu = np.asarray(mu)
return np.ones(mu.shape, np.float64)
def deriv(self, mu):
"""
Derivative of the variance function v'(mu)
"""
return np.zeros_like(mu)
constant = VarianceFunction()
constant.__doc__ = """
The call method of constant returns a constant variance, i.e., a vector of
ones.
constant is an alias of VarianceFunction()
"""
class Power:
"""
Power variance function
Parameters
----------
power : float
exponent used in power variance function
Methods
-------
call
Returns the power variance
Notes
-----
Formulas
V(mu) = numpy.fabs(mu)**power
Aliases for Power:
mu = Power()
mu_squared = Power(power=2)
mu_cubed = Power(power=3)
"""
def __init__(self, power=1.):
self.power = power
def __call__(self, mu):
"""
Power variance function
Parameters
----------
mu : array_like
mean parameters
Returns
-------
variance : ndarray
numpy.fabs(mu)**self.power
"""
return np.power(np.fabs(mu), self.power)
def deriv(self, mu):
"""
Derivative of the variance function v'(mu)
May be undefined at zero.
"""
der = self.power * np.fabs(mu) ** (self.power - 1)
ii = np.flatnonzero(mu < 0)
der[ii] *= -1
return der
mu = Power()
mu.__doc__ = """
Returns np.fabs(mu)
Notes
-----
This is an alias of Power()
"""
mu_squared = Power(power=2)
mu_squared.__doc__ = """
Returns np.fabs(mu)**2
Notes
-----
This is an alias of statsmodels.family.links.Power(power=2)
"""
mu_cubed = Power(power=3)
mu_cubed.__doc__ = """
Returns np.fabs(mu)**3
Notes
-----
This is an alias of statsmodels.family.links.Power(power=3)
"""
class Binomial:
"""
Binomial variance function
Parameters
----------
n : int, optional
The number of trials for a binomial variable. The default is 1 for
p in (0,1)
Methods
-------
call
Returns the binomial variance
Notes
-----
Formulas :
V(mu) = p * (1 - p) * n
where p = mu / n
Alias for Binomial:
binary = Binomial()
A private method _clean trims the data by machine epsilon so that p is
in (0,1)
"""
def __init__(self, n=1):
self.n = n
def _clean(self, p):
return np.clip(p, FLOAT_EPS, 1 - FLOAT_EPS)
def __call__(self, mu):
"""
Binomial variance function
Parameters
----------
mu : array_like
mean parameters
Returns
-------
variance : ndarray
variance = mu/n * (1 - mu/n) * self.n
"""
p = self._clean(mu / self.n)
return p * (1 - p) * self.n
# TODO: inherit from super
    def deriv(self, mu):
        """
        Derivative of the variance function v'(mu).

        V(mu) = mu - mu**2 / n, so v'(mu) = 1 - 2 * mu / n.
        """
        return 1 - 2 * mu / self.n
binary = Binomial()
binary.__doc__ = """
The binomial variance function for n = 1
Notes
-----
This is an alias of Binomial(n=1)
"""
class NegativeBinomial:
'''
Negative binomial variance function
Parameters
----------
alpha : float
The ancillary parameter for the negative binomial variance function.
`alpha` is assumed to be nonstochastic. The default is 1.
Methods
-------
call
Returns the negative binomial variance
Notes
-----
Formulas :
V(mu) = mu + alpha*mu**2
Alias for NegativeBinomial:
nbinom = NegativeBinomial()
A private method _clean trims the data by machine epsilon so that p is
in (0,inf)
'''
def __init__(self, alpha=1.):
self.alpha = alpha
def _clean(self, p):
return np.clip(p, FLOAT_EPS, np.inf)
def __call__(self, mu):
"""
Negative binomial variance function
Parameters
----------
mu : array_like
mean parameters
Returns
-------
variance : ndarray
variance = mu + alpha*mu**2
"""
p = self._clean(mu)
return p + self.alpha*p**2
def deriv(self, mu):
"""
Derivative of the negative binomial variance function.
"""
p = self._clean(mu)
return 1 + 2 * self.alpha * p
nbinom = NegativeBinomial()
nbinom.__doc__ = """
Negative Binomial variance function.
Notes
-----
This is an alias of NegativeBinomial(alpha=1.)
"""