reconnect moved files to git repo
@@ -0,0 +1,806 @@
# pylint: disable=F841
"""
unit test for GAM

Author: Luca Puggini

Created on 08/07/2015
"""

import os
import numpy as np
from numpy.testing import assert_allclose
import pandas as pd
from scipy.linalg import block_diag
import pytest

from statsmodels.tools.linalg import matrix_sqrt
from statsmodels.gam.smooth_basis import (
    UnivariatePolynomialSmoother, PolynomialSmoother, BSplines,
    GenericSmoothers, UnivariateCubicSplines, CyclicCubicSplines)
from statsmodels.gam.generalized_additive_model import (
    GLMGam, LogitGam, make_augmented_matrix, penalized_wls)
from statsmodels.gam.gam_cross_validation.gam_cross_validation import (
    MultivariateGAMCV, MultivariateGAMCVPath, _split_train_test_smoothers)
from statsmodels.gam.gam_penalties import (UnivariateGamPenalty,
                                           MultivariateGamPenalty)
from statsmodels.gam.gam_cross_validation.cross_validators import KFold
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families.family import Gaussian
import statsmodels.regression.linear_model as lm

sigmoid = np.vectorize(lambda x: 1.0 / (1.0 + np.exp(-x)))
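# Note: equivalent to scipy.special.expit, which is vectorized and
# numerically robust; the explicit lambda is kept here for transparency.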


def polynomial_sample_data():
    """A polynomial of degree 4

    poly = ax^4 + bx^3 + cx^2 + dx + e
    second der = 12ax^2 + 6bx + 2c
    integral from -1 to 1 of second der^2 is
    (288 a^2)/5 + 32 a c + 8 (3 b^2 + c^2)
    the gradient of the integral with respect to (a, b, c, d, e) is
    [576*a/5 + 32 * c, 48*b, 32*a + 16*c, 0, 0]

    Returns
    -------
    poly : smoother instance
    y : ndarray
        generated function values, demeaned
    """
    n = 10000
    x = np.linspace(-1, 1, n)
    y = 2 * x ** 3 - x
    y -= y.mean()

    degree = [4]
    pol = PolynomialSmoother(x, degree)

    return pol, y
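

# Optional symbolic re-derivation of the closed forms quoted in the
# docstring of ``polynomial_sample_data``. This helper is a sketch for
# documentation purposes and is not run by the test suite; it assumes
# sympy is available (sympy is not otherwise required by this module).
def _check_penalty_formulas_sympy():
    import sympy as sp

    a, b, c, d, e, t = sp.symbols('a b c d e t')
    poly = a * t**4 + b * t**3 + c * t**2 + d * t + e
    der2 = sp.diff(poly, t, 2)  # 12*a*t**2 + 6*b*t + 2*c
    itg = sp.integrate(der2**2, (t, -1, 1))
    expected = (sp.Rational(288, 5) * a**2 + 32 * a * c
                + 8 * (3 * b**2 + c**2))
    assert sp.simplify(itg - expected) == 0

    grd = [sp.diff(itg, v) for v in (a, b, c, d, e)]
    expected_grd = [sp.Rational(576, 5) * a + 32 * c, 48 * b,
                    32 * a + 16 * c, 0, 0]
    assert all(sp.simplify(g1 - g2) == 0
               for g1, g2 in zip(grd, expected_grd))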


def integral(params):
    # params are the polynomial coefficients in ascending power order
    # (d, c, b, a); the constant term e is not part of the basis
    d, c, b, a = params
    itg = (288 * a ** 2) / 5 + (32 * a * c) + 8 * (3 * b ** 2 + c ** 2)
    itg /= 2
    return itg


def grad(params):
    d, c, b, a = params
    grd = np.array([576 * a / 5 + 32 * c, 48 * b, 32 * a + 16 * c, 0])
    # reverse to match the ascending power order of params
    grd = grd[::-1]
    return grd / 2


def hessian(params):
    # constant Hessian of the quadratic penalty, in descending power
    # order (a, b, c, d); the tests flip it to match the params order
    hess = np.array([[576 / 5, 0, 32, 0],
                     [0, 48, 0, 0],
                     [32, 0, 16, 0],
                     [0, 0, 0, 0]
                     ])
    return hess / 2
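

# Numerical cross-check (also not run by the suite): the penalty is
# quadratic, so ``grad`` is linear in params and central finite
# differences of it recover ``hessian`` almost exactly. The flips
# convert the descending power order of ``hessian`` to the ascending
# order used by ``grad``, mirroring test_gam_hessian below. A sketch
# under the stated assumptions, with an arbitrary fixed seed.
def _check_grad_hessian_consistency(eps=1e-4):
    rng = np.random.RandomState(12345)
    params = rng.uniform(-2, 2, 4)
    hess = np.fliplr(np.flipud(hessian(params)))
    for i in range(4):
        step = np.zeros(4)
        step[i] = eps
        fd_col = (grad(params + step) - grad(params - step)) / (2 * eps)
        assert_allclose(fd_col, hess[:, i], atol=1e-8)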


def cost_function(params, pol, y, alpha):
    # this should be the MSE or log likelihood value
    lin_pred = np.dot(pol.basis, params)
    gaussian = Gaussian()
    expval = gaussian.link.inverse(lin_pred)
    loglike = gaussian.loglike(y, expval)

    # this is the value of the GAM penalty for the example polynomial
    itg = integral(params)

    # return the cost function of the GAM for the given polynomial
    return loglike - alpha * itg, loglike, itg
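
# Sign convention: the penalized log-likelihood maximized by GLMGam is
# the log-likelihood minus the weighted penalty; the first element
# returned by ``cost_function`` is exactly loglike - alpha * itg, which
# test_approximation below compares against GLMGam.loglike.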


def test_gam_penalty():
    """
    test the func method of the gam penalty
    """
    pol, y = polynomial_sample_data()
    univ_pol = pol.smoothers[0]
    alpha = 1
    gp = UnivariateGamPenalty(alpha=alpha, univariate_smoother=univ_pol)

    for _ in range(10):
        params = np.random.randint(-2, 2, 4)
        gp_score = gp.func(params)
        itg = integral(params)
        assert_allclose(gp_score, itg, atol=1.e-1)


def test_gam_gradient():
    # test the gam gradient for the example polynomial
    np.random.seed(1)
    pol, y = polynomial_sample_data()

    alpha = 1
    smoother = pol.smoothers[0]
    gp = UnivariateGamPenalty(alpha=alpha, univariate_smoother=smoother)

    for _ in range(10):
        params = np.random.uniform(-2, 2, 4)
        # Note: the fixed params below override the random draw above,
        # so each iteration evaluates the same point
        params = np.array([1, 1, 1, 1])
        gam_grad = gp.deriv(params)
        grd = grad(params)

        assert_allclose(gam_grad, grd, rtol=1.e-2, atol=1.e-2)


def test_gam_hessian():
    # test the deriv2 method of the gam penalty
    np.random.seed(1)
    pol, y = polynomial_sample_data()
    univ_pol = pol.smoothers[0]
    alpha = 1
    gp = UnivariateGamPenalty(alpha=alpha, univariate_smoother=univ_pol)

    for _ in range(10):
        # the basis has 4 columns; the penalty is quadratic, so deriv2
        # does not depend on the value of params
        params = np.random.randint(-2, 2, 4)
        gam_der2 = gp.deriv2(params)
        hess = hessian(params)
        # flip to the ascending power order used by params
        hess = np.flipud(hess)
        hess = np.fliplr(hess)
        assert_allclose(gam_der2, hess, atol=1.e-13, rtol=1.e-3)


def test_approximation():
    np.random.seed(1)
    poly, y = polynomial_sample_data()
    alpha = 1
    for _ in range(10):
        params = np.random.uniform(-1, 1, 4)
        cost, err, itg = cost_function(params, poly, y, alpha)
        glm_gam = GLMGam(y, smoother=poly, alpha=alpha)
        # TODO: why do we need pen_weight=1
        gam_loglike = glm_gam.loglike(params, scale=1, pen_weight=1)
        assert_allclose(err - itg, cost, rtol=1e-10)
        assert_allclose(gam_loglike, cost, rtol=0.1)


def test_gam_glm():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [3]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = np.asarray(data_from_r.y_est)

    alpha = 0.1  # chosen by trial and error

    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_gam0 = np.dot(bsplines.basis, res_glm_gam.params)
    y_gam = np.asarray(res_glm_gam.fittedvalues)
    assert_allclose(y_gam, y_gam0, rtol=1e-10)

    # plt.plot(x, y_gam, '.', label='gam')
    # plt.plot(x, y_mgcv, '.', label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, atol=1.e-2)


def test_gam_discrete():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.ybin.values

    df = [10]
    degree = [5]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = data_from_r.ybin_est

    alpha = 0.00002
    # gp = UnivariateGamPenalty(alpha=alpha, univariate_smoother=bsplines)
    # lg_gam = LogitGam(y, bsplines.basis, penal=gp)
    #
    lg_gam = LogitGam(y, bsplines, alpha=alpha)
    res_lg_gam = lg_gam.fit(maxiter=10000)
    y_gam = np.dot(bsplines.basis, res_lg_gam.params)
    y_gam = sigmoid(y_gam)
    y_mgcv = sigmoid(y_mgcv)

    # plt.plot(x, y_gam, label='gam')
    # plt.plot(x, y_mgcv, label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.ylim(-0.4, 1.4)
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, rtol=1.e-10, atol=1.e-1)


def multivariate_sample_data(seed=1):
    n = 1000
    x1 = np.linspace(-1, 1, n)
    x2 = np.linspace(-10, 10, n)
    x = np.vstack([x1, x2]).T
    np.random.seed(seed)
    y = x1 * x1 * x1 + x2 + np.random.normal(0, 0.01, n)
    degree1 = 4
    degree2 = 3
    degrees = [degree1, degree2]
    pol = PolynomialSmoother(x, degrees)
    return x, y, pol


def test_multivariate_penalty():
    # check that the multivariate penalty is additive across components:
    # func sums, deriv concatenates, deriv2 is block diagonal
    alphas = [1, 2]
    weights = [1, 1]
    np.random.seed(1)
    x, y, pol = multivariate_sample_data()

    univ_pol1 = UnivariatePolynomialSmoother(x[:, 0], degree=pol.degrees[0])
    univ_pol2 = UnivariatePolynomialSmoother(x[:, 1], degree=pol.degrees[1])

    gp1 = UnivariateGamPenalty(alpha=alphas[0], univariate_smoother=univ_pol1)
    gp2 = UnivariateGamPenalty(alpha=alphas[1], univariate_smoother=univ_pol2)
    with pytest.warns(UserWarning, match="weights is currently ignored"):
        mgp = MultivariateGamPenalty(multivariate_smoother=pol, alpha=alphas,
                                     weights=weights)

    for _ in range(10):
        params1 = np.random.randint(-3, 3, pol.smoothers[0].dim_basis)
        params2 = np.random.randint(-3, 3, pol.smoothers[1].dim_basis)
        params = np.concatenate([params1, params2])
        c1 = gp1.func(params1)
        c2 = gp2.func(params2)
        c = mgp.func(params)
        assert_allclose(c, c1 + c2, atol=1.e-10, rtol=1.e-10)

        d1 = gp1.deriv(params1)
        d2 = gp2.deriv(params2)
        d12 = np.concatenate([d1, d2])
        d = mgp.deriv(params)
        assert_allclose(d, d12)

        h1 = gp1.deriv2(params1)
        h2 = gp2.deriv2(params2)
        h12 = block_diag(h1, h2)
        h = mgp.deriv2(params)
        assert_allclose(h, h12)


def test_generic_smoother():
    x, y, poly = multivariate_sample_data()
    alphas = [0.4, 0.7]
    weights = [1, 1]  # noqa: F841

    gs = GenericSmoothers(poly.x, poly.smoothers)
    gam_gs = GLMGam(y, smoother=gs, alpha=alphas)
    gam_gs_res = gam_gs.fit()

    gam_poly = GLMGam(y, smoother=poly, alpha=alphas)
    gam_poly_res = gam_poly.fit()

    assert_allclose(gam_gs_res.params, gam_poly_res.params)


def test_multivariate_gam_1d_data():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y

    df = [10]
    degree = [3]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)
    y_mgcv = data_from_r.y_est

    # alpha was adjusted manually to reduce the discrepancy in fittedvalues
    alpha = [0.0168 * 0.0251 / 2 * 500]
    gp = MultivariateGamPenalty(bsplines, alpha=alpha)  # noqa: F841

    glm_gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bsplines,
                     alpha=alpha)
    # "nm" converges to different params; "bfgs" params are close to pirls
    # res_glm_gam = glm_gam.fit(method='nm', max_start_irls=0,
    #                           disp=1, maxiter=10000, maxfun=5000)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_gam = res_glm_gam.fittedvalues

    # plt.plot(x, y_gam, '.', label='gam')
    # plt.plot(x, y_mgcv, '.', label='mgcv')
    # plt.plot(x, y, '.', label='y')
    # plt.legend()
    # plt.show()

    assert_allclose(y_gam, y_mgcv, atol=0.01)


def test_multivariate_gam_cv():
    # smoke test: no values are checked, this only verifies that the
    # cross-validation code runs without a runtime error

    def cost(x1, x2):
        return np.linalg.norm(x1 - x2) / len(x1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)
    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values

    df = [10]
    degree = [5]
    bsplines = BSplines(x, degree=degree, df=df)
    # y_mgcv is obtained from R with the following code
    # g = gam(y~s(x, k = 10, bs = "cr"), data = data, scale = 80)

    alphas = [2]
    cv = KFold(3)

    gp = MultivariateGamPenalty(bsplines, alpha=alphas)  # noqa: F841
    gam_cv = MultivariateGAMCV(smoother=bsplines, alphas=alphas, gam=GLMGam,
                               cost=cost, endog=y, exog=None, cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841


def test_multivariate_gam_cv_path():
    def sample_metric(y1, y2):
        return np.linalg.norm(y1 - y2) / len(y1)

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    y_mgcv = data_from_r.y_mgcv_gcv  # noqa: F841

    df = [10]
    degree = [6]

    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    gam = GLMGam
    alphas = [np.linspace(0, 2, 10)]
    k = 3
    cv = KFold(k_folds=k, shuffle=True)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    gam_cv = MultivariateGAMCVPath(smoother=bsplines, alphas=alphas, gam=gam,
                                   cost=sample_metric, endog=y, exog=None,
                                   cv_iterator=cv)
    gam_cv_res = gam_cv.fit()  # noqa: F841

    glm_gam = GLMGam(y, smoother=bsplines, alpha=gam_cv.alpha_cv)
    res_glm_gam = glm_gam.fit(method='irls', max_start_irls=0,
                              disp=1, maxiter=10000)
    y_est = res_glm_gam.predict(bsplines.basis)

    # plt.plot(x, y, '.', label='y')
    # plt.plot(x, y_est, '.', label='y est')
    # plt.plot(x, y_mgcv, '.', label='y mgcv')
    # plt.legend()
    # plt.show()

    # The test compares to the result obtained with GCV, not k-fold CV,
    # because mgcv does not support k-fold CV.
    assert_allclose(data_from_r.y_mgcv_gcv, y_est, atol=1.e-1, rtol=1.e-1)

    # Note: kfold cv uses random shuffle
    np.random.seed(123)
    alpha_cv, res_cv = glm_gam.select_penweight_kfold(alphas=alphas, k_folds=3)
    assert_allclose(alpha_cv, gam_cv.alpha_cv, rtol=1e-12)


def test_train_test_smoothers():
    n = 6
    x = np.zeros(shape=(n, 2))
    x[:, 0] = range(6)
    x[:, 1] = range(6, 12)
    poly = PolynomialSmoother(x, degrees=[3, 3])
    train_index = list(range(3))
    test_index = list(range(3, 6))
    train_smoother, test_smoother = _split_train_test_smoothers(poly.x, poly,
                                                                train_index,
                                                                test_index)

    # basis columns are [x0, x0**2, x0**3, x1, x1**2, x1**3] evaluated
    # at the train, respectively test, rows
    expected_train_basis = [[0., 0., 0., 6., 36., 216.],
                            [1., 1., 1., 7., 49., 343.],
                            [2., 4., 8., 8., 64., 512.]]
    assert_allclose(train_smoother.basis, expected_train_basis)

    expected_test_basis = [[3., 9., 27., 9., 81., 729.],
                           [4., 16., 64., 10., 100., 1000.],
                           [5., 25., 125., 11., 121., 1331.]]
    assert_allclose(test_smoother.basis, expected_test_basis)


def test_get_sqrt():
    n = 1000
    np.random.seed(1)
    x = np.random.normal(0, 1, (n, 3))
    x2 = np.dot(x.T, x)

    sqrt_x2 = matrix_sqrt(x2)

    x2_reconstruction = np.dot(sqrt_x2.T, sqrt_x2)
    assert_allclose(x2_reconstruction, x2)


def test_make_augmented_matrix():
    np.random.seed(1)
    n = 500
    x = np.random.uniform(-1, 1, (n, 3))
    s = np.dot(x.T, x)
    y = np.array(list(range(n)))
    w = np.random.uniform(0, 1, n)
    nobs, n_columns = x.shape

    # matrix_sqrt removes redundant rows;
    # if alpha is zero, then no augmentation is needed
    alpha = 0
    aug_y, aug_x, aug_w = make_augmented_matrix(y, x, alpha * s, w)
    expected_aug_x = x
    assert_allclose(aug_x, expected_aug_x)
    expected_aug_y = y
    assert_allclose(aug_y, expected_aug_y)
    expected_aug_w = w
    assert_allclose(aug_w, expected_aug_w)

    alpha = 1
    aug_y, aug_x, aug_w = make_augmented_matrix(y, x, alpha * s, w)
    rs = matrix_sqrt(alpha * s)
    # alternative version to matrix_sqrt using cholesky is not available
    # rs = sp.linalg.cholesky(alpha * s)
    assert_allclose(np.dot(rs.T, rs), alpha * s)
    expected_aug_x = np.vstack([x, rs])
    assert_allclose(aug_x, expected_aug_x)
    expected_aug_y = np.zeros(shape=(nobs + n_columns,))
    expected_aug_y[:nobs] = y
    assert_allclose(aug_y, expected_aug_y)
    expected_aug_w = np.concatenate((w, [1] * n_columns), axis=0)
    assert_allclose(aug_w, expected_aug_w)
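

# The augmentation identity exercised above, spelled out: weighted least
# squares on the augmented data solves the penalized WLS problem
#     min_b (y - X b)' W (y - X b) + b' (alpha * S) b
# because the extra rows R satisfy R'R = alpha * S. A hedged,
# self-contained sketch, not part of the test suite; all names below are
# local to this example.
def _check_augmentation_equivalence():
    np.random.seed(5)
    n, k = 50, 3
    x = np.random.normal(0, 1, (n, k))
    y = np.random.normal(0, 1, n)
    w = np.random.uniform(0.5, 1.5, n)
    s = np.dot(x.T, x) / n  # any symmetric positive semi-definite penalty
    alpha = 0.7

    # direct solution of the penalized normal equations
    # (X'WX + alpha * S) b = X'Wy
    xtwx = np.dot(x.T * w, x)
    beta_direct = np.linalg.solve(xtwx + alpha * s, np.dot(x.T, w * y))

    # same solution via the augmented system and ordinary WLS
    aug_y, aug_x, aug_w = make_augmented_matrix(y, x, alpha * s, w)
    beta_aug = lm.WLS(aug_y, aug_x, weights=aug_w).fit().params

    assert_allclose(beta_aug, beta_direct, rtol=1e-8)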


def test_penalized_wls():
    np.random.seed(1)
    n = 20
    p = 3
    x = np.random.normal(0, 1, (n, p))
    y = x[:, 1] - x[:, 2] + np.random.normal(0, .1, n)
    y -= y.mean()

    weights = np.ones(shape=(n,))
    s = np.random.normal(0, 1, (p, p))

    # with a zero penalty, penalized_wls must agree with OLS
    pen_wls_res = penalized_wls(y, x, 0 * s, weights)
    ls_res = lm.OLS(y, x).fit()

    assert_allclose(ls_res.params, pen_wls_res.params)


def test_cyclic_cubic_splines():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results",
                             "cubic_cyclic_splines_from_mgcv.csv")
    data_from_r = pd.read_csv(file_path)

    x = data_from_r[['x0', 'x2']].values
    y = data_from_r['y'].values
    y_est_mgcv = data_from_r[['y_est']].values  # noqa: F841
    s_mgcv = data_from_r[['s(x0)', 's(x2)']].values

    dfs = [10, 10]
    ccs = CyclicCubicSplines(x, df=dfs)
    alpha = [0.05 / 2, 0.0005 / 2]
    # TODO: if alpha changes in pirls this should be updated

    gam = GLMGam(y, smoother=ccs, alpha=alpha)
    gam_res = gam.fit(method='pirls')

    s0 = np.dot(ccs.basis[:, ccs.mask[0]],
                gam_res.params[ccs.mask[0]])
    # TODO: Mean has to be removed
    # removing mean could be replaced by options for intercept handling
    s0 -= s0.mean()

    s1 = np.dot(ccs.basis[:, ccs.mask[1]],
                gam_res.params[ccs.mask[1]])
    s1 -= s1.mean()  # TODO: Mean has to be removed

    # plt.subplot(2, 1, 1)
    # plt.plot(x[:, 0], s0, '.', label='s0')
    # plt.plot(x[:, 0], s_mgcv[:, 0], '.', label='s0_mgcv')
    # plt.legend(loc='best')
    #
    # plt.subplot(2, 1, 2)
    # plt.plot(x[:, 1], s1, '.', label='s1_est')
    # plt.plot(x[:, 1], s_mgcv[:, 1], '.', label='s1_mgcv')
    # plt.legend(loc='best')
    # plt.show()

    assert_allclose(s0, s_mgcv[:, 0], atol=0.02)
    assert_allclose(s1, s_mgcv[:, 1], atol=0.33)


def test_multivariate_cubic_splines():
    np.random.seed(0)
    from statsmodels.gam.smooth_basis import CubicSplines

    n = 500
    x1 = np.linspace(-3, 3, n)
    x2 = np.linspace(0, 1, n)**2

    x = np.vstack([x1, x2]).T
    y1 = np.sin(x1) / x1
    y2 = x2 * x2
    y0 = y1 + y2
    # need small enough noise variance to get good estimate for this test
    y = y0 + np.random.normal(0, .3 / 2, n)
    y -= y.mean()
    y0 -= y0.mean()

    alphas = [1e-3, 1e-3]
    cs = CubicSplines(x, df=[10, 10], constraints='center')

    gam = GLMGam(y, exog=np.ones((n, 1)), smoother=cs, alpha=alphas)
    gam_res = gam.fit(method='pirls')

    y_est = gam_res.fittedvalues
    y_est -= y_est.mean()

    # cut the tails
    index = list(range(50, n - 50))
    y_est = y_est[index]
    y0 = y0[index]
    y = y[index]

    # plt.plot(y_est, label='y est')
    # plt.plot(y0, label='y0')
    # plt.plot(y, '.', label='y')
    # plt.legend(loc='best')
    # plt.show()

    assert_allclose(y_est, y0, atol=0.04)


def test_glm_pirls_compatibility():
    np.random.seed(0)

    n = 500
    x1 = np.linspace(-3, 3, n)
    x2 = np.random.rand(n)

    x = np.vstack([x1, x2]).T
    y1 = np.sin(x1) / x1
    y2 = x2 * x2
    y0 = y1 + y2
    y = y0 + np.random.normal(0, .3, n)
    y -= y.mean()
    y0 -= y0.mean()

    # TODO: alphas_glm is currently unused; both models are fit with alphas
    alphas = [5.75] * 2
    alphas_glm = [1.2] * 2  # noqa: F841
    # using constraints avoids singular exog.
    cs = BSplines(x, df=[10, 10], degree=[3, 3], constraints='center')

    gam_pirls = GLMGam(y, smoother=cs, alpha=alphas)
    gam_glm = GLMGam(y, smoother=cs, alpha=alphas)

    gam_res_glm = gam_glm.fit(method='nm', max_start_irls=0,
                              disp=1, maxiter=20000)
    gam_res_glm = gam_glm.fit(start_params=gam_res_glm.params,
                              method='bfgs', max_start_irls=0,
                              disp=1, maxiter=20000)
    gam_res_pirls = gam_pirls.fit()

    y_est_glm = np.dot(cs.basis, gam_res_glm.params)
    y_est_glm -= y_est_glm.mean()
    y_est_pirls = np.dot(cs.basis, gam_res_pirls.params)
    y_est_pirls -= y_est_pirls.mean()

    # plt.plot(y_est_pirls)
    # plt.plot(y_est_glm)
    # plt.plot(y, '.')
    # plt.show()
    assert_allclose(gam_res_glm.params, gam_res_pirls.params, atol=5e-5,
                    rtol=5e-5)
    assert_allclose(y_est_glm, y_est_pirls, atol=5e-5)


def test_zero_penalty():
    x, y, poly = multivariate_sample_data()
    alphas = [0, 0]
    gam_gs = GLMGam(y, smoother=poly, alpha=alphas)
    gam_gs_res = gam_gs.fit()
    y_est_gam = gam_gs_res.predict()

    glm = GLM(y, poly.basis).fit()
    y_est = glm.predict()

    assert_allclose(y_est, y_est_gam)


def test_spl_s():
    # matrix from R
    spl_s_R = [[0, 0, 0.000000000, 0.000000000, 0.000000000, 0.000000000],
               [0, 0, 0.000000000, 0.000000000, 0.000000000, 0.000000000],
               [0, 0, 0.001400000, 0.000200000, -0.001133333, -0.001000000],
               [0, 0, 0.000200000, 0.002733333, 0.001666667, -0.001133333],
               [0, 0, -0.001133333, 0.001666667, 0.002733333, 0.000200000],
               [0, 0, -0.001000000, -0.001133333, 0.000200000, 0.001400000]]

    np.random.seed(1)
    x = np.random.normal(0, 1, 10)
    xk = np.array([0.2, .4, .6, .8])
    cs = UnivariateCubicSplines(x, df=4)
    cs.knots = xk

    spl_s = cs._splines_s()
    assert_allclose(spl_s_R, spl_s, atol=4.e-10)


def test_partial_values2():
    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()
    alpha = 0.0
    # BUG: mask is incorrect if exog is not None, start_idx missing
    # bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2)
    # glm_gam = GLMGam(y, exog=np.ones((len(y), 1)), smoother=bsplines,
    #                  alpha=alpha)
    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2,
                        include_intercept=[True, False])
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)  # noqa: F841

    # case with constant column in exog is currently wrong
    # ex = np.column_stack((np.zeros((len(y), 1)), bsplines.smoothers[0].basis,
    #                       np.zeros_like(bsplines.smoothers[1].basis) ))
    ex = np.column_stack((bsplines.smoothers[0].basis,
                          np.zeros_like(bsplines.smoothers[1].basis)))

    y_est = res_glm_gam.predict(ex, transform=False)
    y_partial_est, se = res_glm_gam.partial_values(0)

    assert_allclose(y_est, y_partial_est, atol=0.05)
    assert se.min() < 100
    # TODO: sometimes the SE reported by partial_values is very large.
    # This should be double checked


def test_partial_values():
    # this test is only approximate because we do not use the same spline
    # basis functions (knots) as mgcv
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se
    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df, include_intercept=True)

    # TODO: alpha found by trial and error to pass assert
    alpha = 0.025 / 115 * 500
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(maxiter=10000, method='bfgs')
    # TODO: if IRLS is used, res_glm_gam does not have partial_values

    univ_bsplines = bsplines.smoothers[0]  # noqa: F841
    hat_y, se = res_glm_gam.partial_values(0)

    assert_allclose(hat_y, data_from_r["y_est"], rtol=0, atol=0.008)
    # TODO: bug, missing scale
    bug_fact = np.sqrt(res_glm_gam.scale) * 0.976  # equals 0.106
    assert_allclose(se, se_from_mgcv * bug_fact, rtol=0, atol=0.008)


@pytest.mark.matplotlib
def test_partial_plot():
    # verify that plot and partial_values method agree
    # the model only has one component so partial values is the same as
    # fittedvalues
    # Generate a plot to visually analyze the result.

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(cur_dir, "results", "prediction_from_mgcv.csv")

    data_from_r = pd.read_csv(file_path)

    # dataset used to train the R model
    x = data_from_r.x.values
    y = data_from_r.y.values
    se_from_mgcv = data_from_r.y_est_se  # noqa: F841
    df = [10]
    degree = [6]
    bsplines = BSplines(x, degree=degree, df=df)

    alpha = 0.03
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(maxiter=10000, method='bfgs')
    fig = res_glm_gam.plot_partial(0)
    xp, yp = fig.axes[0].get_children()[0].get_data()
    # Note: xp and yp are sorted by x
    sort_idx = np.argsort(x)
    hat_y, se = res_glm_gam.partial_values(0)
    # assert that the main plot line is the prediction
    assert_allclose(xp, x[sort_idx])
    assert_allclose(yp, hat_y[sort_idx])

    # Uncomment to visualize the plot
    # import matplotlib.pyplot as plt
    # res_glm_gam.plot_partial(0)
    # plt.plot(x, y, '.')
    # plt.show()


def test_cov_params():

    np.random.seed(0)
    n = 1000
    x = np.random.uniform(0, 1, (n, 2))
    x = x - x.mean()
    y = x[:, 0] * x[:, 0] + np.random.normal(0, .01, n)
    y -= y.mean()

    bsplines = BSplines(x, degree=[3] * 2, df=[10] * 2, constraints='center')
    alpha = [0, 0]
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)
    glm = GLM(y, bsplines.basis)
    res_glm = glm.fit()

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=0.0025)

    alpha = 1e-13
    glm_gam = GLMGam(y, smoother=bsplines, alpha=alpha)
    res_glm_gam = glm_gam.fit(method='pirls', max_start_irls=0,
                              disp=0, maxiter=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    atol=1e-10)

    res_glm_gam = glm_gam.fit(method='bfgs', max_start_irls=0,
                              disp=0, maxiter=5000)

    assert_allclose(res_glm.cov_params(), res_glm_gam.cov_params(),
                    rtol=1e-4, atol=1e-8)