some new features
This commit is contained in:
@ -0,0 +1,515 @@
|
||||
from io import StringIO
|
||||
|
||||
import numpy as np
|
||||
|
||||
from statsmodels.stats.anova import anova_lm
|
||||
from statsmodels.formula.api import ols
|
||||
from pandas import read_csv
|
||||
|
||||
kidney_table = StringIO("""Days Duration Weight ID
|
||||
0.0 1 1 1
|
||||
2.0 1 1 2
|
||||
1.0 1 1 3
|
||||
3.0 1 1 4
|
||||
0.0 1 1 5
|
||||
2.0 1 1 6
|
||||
0.0 1 1 7
|
||||
5.0 1 1 8
|
||||
6.0 1 1 9
|
||||
8.0 1 1 10
|
||||
2.0 1 2 1
|
||||
4.0 1 2 2
|
||||
7.0 1 2 3
|
||||
12.0 1 2 4
|
||||
15.0 1 2 5
|
||||
4.0 1 2 6
|
||||
3.0 1 2 7
|
||||
1.0 1 2 8
|
||||
5.0 1 2 9
|
||||
20.0 1 2 10
|
||||
15.0 1 3 1
|
||||
10.0 1 3 2
|
||||
8.0 1 3 3
|
||||
5.0 1 3 4
|
||||
25.0 1 3 5
|
||||
16.0 1 3 6
|
||||
7.0 1 3 7
|
||||
30.0 1 3 8
|
||||
3.0 1 3 9
|
||||
27.0 1 3 10
|
||||
0.0 2 1 1
|
||||
1.0 2 1 2
|
||||
1.0 2 1 3
|
||||
0.0 2 1 4
|
||||
4.0 2 1 5
|
||||
2.0 2 1 6
|
||||
7.0 2 1 7
|
||||
4.0 2 1 8
|
||||
0.0 2 1 9
|
||||
3.0 2 1 10
|
||||
5.0 2 2 1
|
||||
3.0 2 2 2
|
||||
2.0 2 2 3
|
||||
0.0 2 2 4
|
||||
1.0 2 2 5
|
||||
1.0 2 2 6
|
||||
3.0 2 2 7
|
||||
6.0 2 2 8
|
||||
7.0 2 2 9
|
||||
9.0 2 2 10
|
||||
10.0 2 3 1
|
||||
8.0 2 3 2
|
||||
12.0 2 3 3
|
||||
3.0 2 3 4
|
||||
7.0 2 3 5
|
||||
15.0 2 3 6
|
||||
4.0 2 3 7
|
||||
9.0 2 3 8
|
||||
6.0 2 3 9
|
||||
1.0 2 3 10
|
||||
""")
|
||||
|
||||
kidney_table.seek(0)
|
||||
kidney_table = read_csv(kidney_table, sep=r"\s+", engine='python').astype(int)
|
||||
|
||||
class TestAnovaLM:
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
# kidney data taken from JT's course
|
||||
# do not know the license
|
||||
cls.data = kidney_table
|
||||
cls.kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight)',
|
||||
data=cls.data).fit()
|
||||
|
||||
def test_results(self):
|
||||
Df = np.array([1, 2, 2, 54])
|
||||
sum_sq = np.array([2.339693, 16.97129, 0.6356584, 28.9892])
|
||||
mean_sq = np.array([2.339693, 8.485645, 0.3178292, 0.536837])
|
||||
f_value = np.array([4.358293, 15.80674, 0.5920404, np.nan])
|
||||
pr_f = np.array([0.0415617, 3.944502e-06, 0.5567479, np.nan])
|
||||
|
||||
results = anova_lm(self.kidney_lm)
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
np.testing.assert_almost_equal(results['sum_sq'].values, sum_sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, f_value, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, pr_f)
|
||||
|
||||
|
||||
class TestAnovaLMNoconstant:
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
# kidney data taken from JT's course
|
||||
# do not know the license
|
||||
cls.data = kidney_table
|
||||
cls.kidney_lm = ols('np.log(Days+1) ~ C(Duration) * C(Weight) - 1',
|
||||
data=cls.data).fit()
|
||||
|
||||
def test_results(self):
|
||||
Df = np.array([2, 2, 2, 54])
|
||||
sum_sq = np.array([158.6415227, 16.97129, 0.6356584, 28.9892])
|
||||
mean_sq = np.array([79.3207613, 8.485645, 0.3178292, 0.536837])
|
||||
f_value = np.array([147.7557648, 15.80674, 0.5920404, np.nan])
|
||||
pr_f = np.array([1.262324e-22, 3.944502e-06, 0.5567479, np.nan])
|
||||
|
||||
results = anova_lm(self.kidney_lm)
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
np.testing.assert_almost_equal(results['sum_sq'].values, sum_sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, f_value, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, pr_f)
|
||||
|
||||
# > sum2.lm = lm(logDays ~ Duration * Weight - 1, contrasts=list(Duration=contr.sum, Weight=contr.sum))
|
||||
# > anova.lm.sum2 <- anova(sum2.lm)
|
||||
# > anova.lm.sum2
|
||||
# Analysis of Variance Table
|
||||
#
|
||||
# Response: logDays
|
||||
# Df Sum Sq Mean Sq F value Pr(>F)
|
||||
# Duration 2 158.642 79.321 147.756 < 2.2e-16 ***
|
||||
# Weight 2 16.971 8.486 15.807 3.945e-06 ***
|
||||
# Duration:Weight 2 0.636 0.318 0.592 0.5567
|
||||
# Residuals 54 28.989 0.537
|
||||
# ---
|
||||
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
|
||||
|
||||
|
||||
|
||||
class TestAnovaLMCompare(TestAnovaLM):
|
||||
def test_results(self):
|
||||
new_model = ols("np.log(Days+1) ~ C(Duration) + C(Weight)",
|
||||
self.data).fit()
|
||||
results = anova_lm(new_model, self.kidney_lm)
|
||||
|
||||
Res_Df = np.array([
|
||||
56, 54
|
||||
])
|
||||
RSS = np.array([
|
||||
29.62486, 28.9892
|
||||
])
|
||||
Df = np.array([
|
||||
0, 2
|
||||
])
|
||||
Sum_of_Sq = np.array([
|
||||
np.nan, 0.6356584
|
||||
])
|
||||
F = np.array([
|
||||
np.nan, 0.5920404
|
||||
])
|
||||
PrF = np.array([
|
||||
np.nan, 0.5567479
|
||||
])
|
||||
|
||||
np.testing.assert_equal(results["df_resid"].values, Res_Df)
|
||||
np.testing.assert_almost_equal(results["ssr"].values, RSS, 4)
|
||||
np.testing.assert_almost_equal(results["df_diff"].values, Df)
|
||||
np.testing.assert_almost_equal(results["ss_diff"].values, Sum_of_Sq)
|
||||
np.testing.assert_almost_equal(results["F"].values, F)
|
||||
np.testing.assert_almost_equal(results["Pr(>F)"].values, PrF)
|
||||
|
||||
|
||||
class TestAnovaLMCompareNoconstant(TestAnovaLM):
|
||||
def test_results(self):
|
||||
new_model = ols("np.log(Days+1) ~ C(Duration) + C(Weight) - 1",
|
||||
self.data).fit()
|
||||
results = anova_lm(new_model, self.kidney_lm)
|
||||
|
||||
Res_Df = np.array([
|
||||
56, 54
|
||||
])
|
||||
RSS = np.array([
|
||||
29.62486, 28.9892
|
||||
])
|
||||
Df = np.array([
|
||||
0, 2
|
||||
])
|
||||
Sum_of_Sq = np.array([
|
||||
np.nan, 0.6356584
|
||||
])
|
||||
F = np.array([
|
||||
np.nan, 0.5920404
|
||||
])
|
||||
PrF = np.array([
|
||||
np.nan, 0.5567479
|
||||
])
|
||||
|
||||
np.testing.assert_equal(results["df_resid"].values, Res_Df)
|
||||
np.testing.assert_almost_equal(results["ssr"].values, RSS, 4)
|
||||
np.testing.assert_almost_equal(results["df_diff"].values, Df)
|
||||
np.testing.assert_almost_equal(results["ss_diff"].values, Sum_of_Sq)
|
||||
np.testing.assert_almost_equal(results["F"].values, F)
|
||||
np.testing.assert_almost_equal(results["Pr(>F)"].values, PrF)
|
||||
|
||||
|
||||
class TestAnova2(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
3.067066, 13.27205, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 2, 2, 51
|
||||
])
|
||||
F_value = np.array([
|
||||
5.667033, 12.26141, 0.1760025, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
0.02106078, 4.487909e-05, 0.8391231, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_ii, typ="II")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F_value, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
|
||||
class TestAnova2Noconstant(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum) - 1",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
154.7131692, 13.27205, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
2, 2, 2, 51
|
||||
])
|
||||
F_value = np.array([
|
||||
142.9321191, 12.26141, 0.1760025, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
1.238624e-21, 4.487909e-05, 0.8391231, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_ii, typ="II")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F_value, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
# > sum2.lm.dropped <- lm(logDays ~ Duration * Weight - 1, dta.dropped,
|
||||
# contrasts=list(Duration=contr.sum, Weight=contr.sum))
|
||||
# > anova.ii.dropped2 <- Anova(sum2.lm.dropped, type='II')
|
||||
# > anova.ii.dropped2
|
||||
# Anova Table (Type II tests)
|
||||
#
|
||||
# Response: logDays
|
||||
# Sum Sq Df F value Pr(>F)
|
||||
# Duration 154.713 2 142.932 < 2.2e-16 ***
|
||||
# Weight 13.272 2 12.261 4.488e-05 ***
|
||||
# Duration:Weight 0.191 2 0.176 0.8391
|
||||
# Residuals 27.602 51
|
||||
|
||||
|
||||
class TestAnova2HC0(TestAnovaLM):
|
||||
#NOTE: R does not return SSq with robust covariance. Why?
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
6.972744, 13.7804, 0.1709936, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
0.01095599, 1.641682e-05, 0.8433081, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_ii, typ="II", robust="hc0")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova2HC1(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
6.238771, 12.32983, 0.1529943, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
0.01576555, 4.285456e-05, 0.858527, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_ii, typ="II", robust="hc1")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova2HC2(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
6.267499, 12.25354, 0.1501224, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
0.01554009, 4.511826e-05, 0.8609815, np.nan
|
||||
])
|
||||
|
||||
|
||||
results = anova_lm(anova_ii, typ="II", robust="hc2")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova2HC3(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_ii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
5.633786, 10.89842, 0.1317223, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
0.02142223, 0.0001145965, 0.8768817, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_ii, typ="II", robust="hc3")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova3(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 1, 2, 2, 51
|
||||
])
|
||||
F_value = np.array([
|
||||
279.7545, 5.367071, 12.43245, 0.1760025, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
2.379855e-22, 0.02457384, 3.999431e-05, 0.8391231, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_iii, typ="III")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F_value, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova3HC0(TestAnovaLM):
|
||||
#NOTE: R does not return SSq with robust covariance. Why?
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
298.3404, 5.723638, 13.76069, 0.1709936, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
5.876255e-23, 0.02046031, 1.662826e-05, 0.8433081, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_iii, typ="III", robust="hc0")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova3HC1(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
266.9361, 5.12115, 12.3122, 0.1529943, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
6.54355e-22, 0.02792296, 4.336712e-05, 0.858527, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_iii, typ="III", robust="hc1")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova3HC2(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
264.5137, 5.074677, 12.19158, 0.1501224, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
7.958286e-22, 0.02860926, 4.704831e-05, 0.8609815, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_iii, typ="III", robust="hc2")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
|
||||
class TestAnova3HC3(TestAnovaLM):
|
||||
# drop some observations to make an unbalanced, disproportionate panel
|
||||
# to make sure things are okay
|
||||
def test_results(self):
|
||||
data = self.data.drop([0,1,2])
|
||||
anova_iii = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)",
|
||||
data).fit()
|
||||
|
||||
Sum_Sq = np.array([
|
||||
151.4065, 2.904723, 13.45718, 0.1905093, 27.60181
|
||||
])
|
||||
Df = np.array([
|
||||
1, 1, 2, 2, 51
|
||||
])
|
||||
F = np.array([
|
||||
234.4026, 4.496996, 10.79903, 0.1317223, np.nan
|
||||
])
|
||||
PrF = np.array([
|
||||
1.037224e-20, 0.03883841, 0.0001228716, 0.8768817, np.nan
|
||||
])
|
||||
|
||||
results = anova_lm(anova_iii, typ="III", robust="hc3")
|
||||
np.testing.assert_equal(results['df'].values, Df)
|
||||
#np.testing.assert_almost_equal(results['sum_sq'].values, Sum_Sq, 4)
|
||||
np.testing.assert_almost_equal(results['F'].values, F, 4)
|
||||
np.testing.assert_almost_equal(results['PR(>F)'].values, PrF)
|
||||
Reference in New Issue
Block a user