some new features

2025-07-30 17:09:11 +03:00
parent db5d46760a
commit 8019bd3b7c
20616 changed files with 4375466 additions and 8 deletions
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/init.py
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/init.py
@ -0,0 +1 @@
+# -*- coding: utf-8 -*-
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/init.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/init.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/test_base.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/test_base.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/test_dates.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/test_dates.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/test_fourier.cpython-312.pyc
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/pycache/test_fourier.cpython-312.pyc
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/test_base.py
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/test_base.py
@ -0,0 +1,41 @@
+# -*- coding: utf-8 -*-
+
+from pmdarima.preprocessing.exog import base
+from pmdarima import datasets
+import numpy as np
+import pandas as pd
+
+wineind = datasets.load_wineind()
+
+
+class RandomExogFeaturizer(base.BaseExogFeaturizer):
+    """Creates random exog features. This is just used to test base func"""
+
+    def _get_prefix(self):
+        return "RND"
+
+    def fit(self, y, X, **_):
+        return self
+
+    def transform(self, y, X=None, n_periods=0, **_):
+        Xt = np.random.rand(y.shape[0], 4)
+        Xt = self._safe_hstack(X, Xt)
+        return y, Xt
+
+
+def test_default_get_feature_names():
+    feat = RandomExogFeaturizer()
+    y_trans, X = feat.fit_transform(wineind)
+    assert y_trans is wineind
+    assert X.columns.tolist() == \
+        ['RND_0', 'RND_1', 'RND_2', 'RND_3']
+
+
+def test_default_get_feature_names_with_X():
+    feat = RandomExogFeaturizer()
+    X = pd.DataFrame.from_records(
+        np.random.rand(wineind.shape[0], 2), columns=['a', 'b'])
+    y_trans, X_trans = feat.fit_transform(wineind, X)
+    assert y_trans is wineind
+    assert X_trans.columns.tolist() == \
+        ['a', 'b', 'RND_0', 'RND_1', 'RND_2', 'RND_3']
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/test_dates.py
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/test_dates.py
@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+from pmdarima.datasets._base import load_date_example
+from pmdarima.preprocessing.exog import DateFeaturizer
+from pmdarima.compat.pytest import pytest_error_str
+
+from numpy.testing import assert_array_equal
+import pytest
+
+y, X = load_date_example()
+
+
+def test_no_options_warns():
+    feat = DateFeaturizer(column_name="date",
+                          with_day_of_month=False,
+                          with_day_of_week=False)
+
+    with pytest.warns(UserWarning) as w:
+        y_prime, X_prime = feat.fit_transform(y, X)
+
+    assert w is not None
+    assert_array_equal(y, y_prime)
+    assert X.equals(X_prime)
+
+
+def test_illegal_column_fails():
+    X_prime = X.copy()
+    X_prime["date2"] = X_prime["date"].astype(str)
+
+    feat = DateFeaturizer(column_name="date2")
+    with pytest.raises(ValueError) as ve:
+        feat.fit_transform(y, X_prime)
+
+    assert "pd.Timestamp type" in pytest_error_str(ve)
+
+
+def test_missing_column_fails():
+    feat = DateFeaturizer(column_name="date2")
+    with pytest.raises(ValueError) as ve:
+        feat.fit_transform(y, X)
+
+    assert "must exist" in pytest_error_str(ve)
+
+
+def test_numpy_array_fails():
+    feat = DateFeaturizer(column_name="date")
+    with pytest.raises(TypeError) as te:
+        feat.fit_transform(y, X.values)
+
+    assert "X must be" in pytest_error_str(te)
+
+
+def _dummy_assertions(X_prime):
+    # they are dummies, so they should sum to 1 along the row axis
+    dummies = X_prime[[n for n in X_prime.columns if 'WEEKDAY' in n]]
+    assert (dummies.values.sum(axis=1) == 1).all()
+
+
+def _ordinal_assertions(X_prime):
+    # it's the day of the month, so they should all be > 0
+    series = X_prime["DATE-DAY-OF-MONTH"]
+    assert (series.values.ravel() > 0).all()
+
+
+def test_all_true():
+    feat = DateFeaturizer(column_name="date",
+                          with_day_of_month=True,
+                          with_day_of_week=True)
+
+    y_prime, X_prime = feat.fit_transform(y, X)
+
+    assert_array_equal(y, y_prime)
+    assert y is not y_prime
+
+    # there should be 8 columns in the X_prime (7 for days of the week, 1 for
+    # ordinal)
+    assert X_prime.shape[1] == 8
+
+    _dummy_assertions(X_prime)
+    _ordinal_assertions(X_prime)
+
+    # date column should not be there anymore
+    assert "date" not in X_prime.columns.tolist()
+
+
+def test_dummy_only():
+    feat = DateFeaturizer(column_name="date",
+                          prefix="DATE",
+                          with_day_of_month=False,
+                          with_day_of_week=True)
+
+    y_prime, X_prime = feat.fit_transform(y, X)
+
+    assert_array_equal(y, y_prime)
+    assert y is not y_prime
+
+    # there should be 7 columns in the X_prime (7 for days of the week)
+    assert X_prime.shape[1] == 7
+
+    _dummy_assertions(X_prime)
+
+    # show ordinal col not here
+    assert "DATE-DAY-OF-MONTH" not in X_prime.columns.tolist()
+
+    # date column should not be there anymore
+    assert "date" not in X_prime.columns.tolist()
+
+
+def test_ordinal_only():
+    feat = DateFeaturizer(column_name="date",
+                          prefix="DATE",
+                          with_day_of_month=True,
+                          with_day_of_week=False)
+
+    y_prime, X_prime = feat.fit_transform(y, X)
+
+    assert_array_equal(y, y_prime)
+    assert y is not y_prime
+
+    # there should be 1 column in the X_prime df
+    assert X_prime.shape[1] == 1
+
+    _ordinal_assertions(X_prime)
+
+    # show ordinal col not here
+    assert not [n for n in X_prime.columns.tolist() if "WEEKDAY" in n]
+
+    # date column should not be there anymore
+    assert "date" not in X_prime.columns.tolist()
--- a/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/test_fourier.py
+++ b/.venv/lib/python3.12/site-packages/pmdarima/preprocessing/exog/tests/test_fourier.py
@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+from numpy.testing import assert_array_almost_equal
+
+from pmdarima.preprocessing.exog import FourierFeaturizer
+from pmdarima.compat.pytest import pytest_error_str
+import pmdarima as pm
+
+import pytest
+
+wineind = pm.datasets.load_wineind()
+
+
+class TestFourierREquivalency:
+    """Checks FourierFeaturizer(m=5, k=2) output against reference values
+    taken from R's ``forecast::fourier`` (see the R session below)."""
+
+    # The following R code is what we want to reproduce:
+    #   > set.seed(99)
+    #   > n = 20
+    #   > m = 5
+    #   > y <- ts(rnorm(n) + (1:n)%%100/30, f=m)
+    #   > library(forecast)
+    #   > exog = fourier(y, K=2)
+    #   > head(exog, 2)
+    #             S1-5      C1-5       S2-5      C2-5
+    #   [1,] 0.9510565  0.309017  0.5877853 -0.809017
+    #   [2,] 0.5877853 -0.809017 -0.9510565  0.309017
+
+    # 20 observations; presumably the values R generated for `y` in the
+    # session above -- TODO confirm
+    y = pm.c(
+        0.24729584, 0.54632480, 0.18782870, 0.57719184, -0.19617125,
+        0.32267403, -0.63051185, 0.75629093, -0.06411691, -0.96090867,
+        -0.37910238, 1.32155036, 1.18338768, -2.04188735, -2.54093410,
+        0.53359913, 0.17264767, -1.14502766, 1.13196478, 0.93762046)
+
+    # Expected Fourier terms, one row per observation, in the column order
+    # S1-5, C1-5, S2-5, C2-5. The first two rows are the `head(exog, 2)`
+    # output shown above; the pattern repeats every 5 rows.
+    expected = np.array([
+        [0.9510565, 0.309017, 0.5877853, -0.809017],
+        [0.5877853, -0.809017, -0.9510565, 0.309017],
+        [-0.5877853, -0.809017, 0.9510565, 0.309017],
+        [-0.9510565, 0.309017, -0.5877853, -0.809017],
+        [0.0000000, 1.000000, 0.0000000, 1.000000],
+        [0.9510565, 0.309017, 0.5877853, -0.809017],
+        [0.5877853, -0.809017, -0.9510565, 0.309017],
+        [-0.5877853, -0.809017, 0.9510565, 0.309017],
+        [-0.9510565, 0.309017, -0.5877853, -0.809017],
+        [0.0000000, 1.000000, 0.0000000, 1.000000],
+        [0.9510565, 0.309017, 0.5877853, -0.809017],
+        [0.5877853, -0.809017, -0.9510565, 0.309017],
+        [-0.5877853, -0.809017, 0.9510565, 0.309017],
+        [-0.9510565, 0.309017, -0.5877853, -0.809017],
+        [0.0000000, 1.000000, 0.0000000, 1.000000],
+        [0.9510565, 0.309017, 0.5877853, -0.809017],
+        [0.5877853, -0.809017, -0.9510565, 0.309017],
+        [-0.5877853, -0.809017, 0.9510565, 0.309017],
+        [-0.9510565, 0.309017, -0.5877853, -0.809017],
+        [0.0000000, 1.000000, 0.0000000, 1.000000],
+    ])
+
+    # Run once with no exogenous input and once with a random 3-column X.
+    # `y` here is the class attribute above, read while the class body is
+    # being executed. NOTE(review): the random X is unseeded, unlike the
+    # RandomState(1) data used by other tests in this module.
+    @pytest.mark.parametrize(
+        'X', [
+            None,
+            np.random.rand(y.shape[0], 3)
+        ]
+    )
+    def test_r_equivalency(self, X):
+        """The Fourier terms equal the R reference, with or without an X."""
+        y = self.y
+        expected = self.expected
+
+        trans = FourierFeaturizer(m=5, k=2).fit(y)
+        _, xreg = trans.transform(y, X)
+
+        # maybe subset
+        # (anything exposing `.iloc` is reduced to its underlying array;
+        # the Fourier terms are then the last 4 columns)
+        if hasattr(xreg, 'iloc'):
+            xreg = xreg.values
+        assert_array_almost_equal(expected, xreg[:, -4:])
+
+        # maybe assert on X
+        # (the caller's X must come through as the first 3 columns)
+        if X is not None:
+            assert_array_almost_equal(X, xreg[:, :3])
+
+            # Test a bad forecast (X dim does not match n_periods dim)
+            with pytest.raises(ValueError):
+                trans.transform(y, np.random.rand(5, 3), n_periods=2)
+
+
+def test_hyndman_blog():
+    # This is the exact code Hyndman ran in his blog post on the matter:
+    # https://robjhyndman.com/hyndsight/longseasonality/
+    n = 2000
+    m = 200
+    y = np.random.RandomState(1).normal(size=n) + \
+        (np.arange(1, n + 1) % 100 / 30)
+
+    trans = FourierFeaturizer(m=m, k=5).fit(y)
+    _, xreg = trans.transform(y)
+
+    arima = pm.auto_arima(y,
+                          X=xreg,
+                          seasonal=False,
+                          maxiter=1,  # very short
+                          start_p=4,
+                          max_p=5,
+                          d=0,
+                          max_q=1,
+                          start_q=0,
+                          simple_differencing=True)  # type: pm.ARIMA
+
+    # Show we can forecast 10 in the future
+    _, xreg_test = trans.transform(y, n_periods=10)
+    arima.predict(n_periods=10, X=xreg_test)
+
+
+def test_update_transform():
+    n = 150
+    m = 10
+    y = np.random.RandomState(1).normal(size=n) + \
+        (np.arange(1, n + 1) % 100 / 30)
+
+    train, test = y[:100], y[100:]
+
+    trans = FourierFeaturizer(m=m, k=5).fit(train)
+    _, xreg = trans.transform(train)
+
+    # Now update with the test set and show the xreg is diff
+    yt, Xt = trans.update_and_transform(test, X=None)
+    assert yt is test
+    assert Xt.shape[0] == test.shape[0]
+    assert trans.n_ == y.shape[0]
+
+    # Now assert that if we do a vanilla transform with no n_periods, the last
+    # 50 are the same as the Xt we just got and the first 100 are the same as
+    # we got earlier
+    _, xreg2 = trans.transform(y)
+    assert_array_almost_equal(xreg2[:100], xreg)
+    assert_array_almost_equal(xreg2[100:], Xt)
+
+
+def test_value_error_check():
+    feat = FourierFeaturizer(m=12)
+    with pytest.raises(ValueError) as ve:
+        feat._check_y_X(wineind, None, null_allowed=False)
+    assert 'non-None' in pytest_error_str(ve)
+
+
+def test_value_error_on_fit():
+    feat = FourierFeaturizer(m=12, k=8)
+    with pytest.raises(ValueError) as ve:
+        feat.fit_transform(wineind)
+    assert 'k must be' in pytest_error_str(ve)