Add tests for the knockoff regression FDR procedures
@@ -0,0 +1,157 @@
import numpy as np
from numpy.testing import assert_allclose, assert_array_equal, assert_equal
import pytest
import statsmodels.api as sm
from statsmodels.stats import knockoff_regeffects as kr
from statsmodels.stats._knockoff import (RegressionFDR,
                                          _design_knockoff_equi,
                                          _design_knockoff_sdp)
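
# cvxopt is an optional dependency; the SDP-based knockoff construction
# requires it, so the SDP tests are skipped when it is not installed.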
try:
    import cvxopt  # noqa:F401
    has_cvxopt = True
except ImportError:
    has_cvxopt = False


def test_equi():
    # Test the structure of the equivariant knockoff construction.

    np.random.seed(2342)
    exog = np.random.normal(size=(10, 4))

    exog1, exog2, sl = _design_knockoff_equi(exog)

    exoga = np.concatenate((exog1, exog2), axis=1)

    gmat = np.dot(exoga.T, exoga)

    cm1 = gmat[0:4, 0:4]
    cm2 = gmat[4:, 4:]
    cm3 = gmat[0:4, 4:]
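
    # The knockoff columns must reproduce the Gram matrix of the original
    # design, and the cross-product block may differ from it only on the
    # diagonal, by the amount sl returned by the construction.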
    assert_allclose(cm1, cm2, rtol=1e-4, atol=1e-4)
    assert_allclose(cm1 - cm3, np.diag(sl * np.ones(4)), rtol=1e-4, atol=1e-4)


def test_sdp():
    # Test the structure of the SDP knockoff construction.

    if not has_cvxopt:
        pytest.skip("cvxopt is not installed")

    np.random.seed(2342)
    exog = np.random.normal(size=(10, 4))

    exog1, exog2, sl = _design_knockoff_sdp(exog)

    exoga = np.concatenate((exog1, exog2), axis=1)

    gmat = np.dot(exoga.T, exoga)

    cm1 = gmat[0:4, 0:4]
    cm2 = gmat[4:, 4:]
    cm3 = gmat[0:4, 4:]
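
    # The same structural identities must hold when sl is chosen by
    # semidefinite programming rather than by the equivariant rule.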
    assert_allclose(cm1, cm2, rtol=1e-4, atol=1e-4)
    assert_allclose(cm1 - cm3, np.diag(sl * np.ones(4)), rtol=1e-5, atol=1e-5)


@pytest.mark.parametrize("p", [49, 50])
@pytest.mark.parametrize("tester", [
    kr.CorrelationEffects(),
    kr.ForwardEffects(pursuit=False),
    kr.ForwardEffects(pursuit=True),
    kr.OLSEffects(),
    kr.RegModelEffects(sm.OLS),
    kr.RegModelEffects(sm.OLS, True,
                       fit_kws={"L1_wt": 0, "alpha": 1}),
])
@pytest.mark.parametrize("method", ["equi", "sdp"])
def test_testers(p, tester, method):

    if method == "sdp" and not has_cvxopt:
        pytest.skip("cvxopt is not installed")

    np.random.seed(2432)
    n = 200

    y = np.random.normal(size=n)
    x = np.random.normal(size=(n, p))
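
    # Each tester should return one knockoff statistic and one estimated
    # FDR value per exogenous variable.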
    kn = RegressionFDR(y, x, tester, design_method=method)
    assert_equal(len(kn.stats), p)
    assert_equal(len(kn.fdr), p)
    kn.summary()  # smoke test


@pytest.mark.slow
@pytest.mark.parametrize("method", ["equi", "sdp"])
@pytest.mark.parametrize("tester,n,p,es", [
    [kr.CorrelationEffects(), 300, 100, 6],
    [kr.ForwardEffects(pursuit=False), 300, 100, 3.5],
    [kr.ForwardEffects(pursuit=True), 300, 100, 3.5],
    [kr.OLSEffects(), 3000, 200, 3.5],
])
def test_sim(method, tester, n, p, es):
    # This function assesses the performance of the knockoff approach
    # relative to its theoretical claims.

    if method == "sdp" and not has_cvxopt:
        pytest.skip("cvxopt is not installed")

    np.random.seed(43234)

    # Number of variables with a non-zero coefficient
    npos = 30

    # Aim to control the FDR at this level
    target_fdr = 0.2

    # Number of simulation replications
    nrep = 10

    fdr, power = 0, 0
    for k in range(nrep):

        # Generate the predictors
        x = np.random.normal(size=(n, p))
        x /= np.sqrt(np.sum(x*x, 0))

        # Generate the response variable
        coeff = es * (-1)**np.arange(npos)
        y = np.dot(x[:, 0:npos], coeff) + np.random.normal(size=n)
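
        # Variables npos, ..., p - 1 have zero coefficients; any of them
        # that are selected below are counted as false positives.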

        kn = RegressionFDR(y, x, tester, design_method=method)

        # Threshold to achieve the target FDR
        tr = kn.threshold(target_fdr)

        # Number of selected coefficients
        cp = np.sum(kn.stats >= tr)

        # Number of false positives
        fp = np.sum(kn.stats[npos:] >= tr)

        # Observed FDR
        fdr += fp / max(cp, 1)

        # Proportion of true positives that are detected
        power += np.mean(kn.stats[0:npos] >= tr)

        # The estimated FDR should never exceed the target FDR
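        # It is computed as the number of statistics at or below -tr,
        # relative to one plus the number of selected variables.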
        estimated_fdr = (np.sum(kn.stats <= -tr) /
                         (1 + np.sum(kn.stats >= tr)))
        assert_equal(estimated_fdr < target_fdr, True)

    power /= nrep
    fdr /= nrep

    # Check for reasonable power
    assert_array_equal(power > 0.6, True)

    # Check that we are close to the target FDR
    assert_array_equal(fdr < target_fdr + 0.1, True)