some new features
199  .venv/lib/python3.12/site-packages/patsy/state.py  Normal file
@@ -0,0 +1,199 @@
# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# Stateful transform protocol:
#   def __init__(self):
#       pass
#   def memorize_chunk(self, input_data):
#       return None
#   def memorize_finish(self):
#       return None
#   def transform(self, input_data):
#       return output_data
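#
# Illustrative sketch (names are made up, not part of this module): a minimal
# class following the protocol above, accumulating a running mean in
# memorize_chunk and subtracting it in transform, might look like
#
#   class ExampleDemean(object):
#       def __init__(self):
#           self._total, self._count = 0.0, 0
#       def memorize_chunk(self, input_data):
#           arr = np.asarray(input_data, dtype=float)
#           self._total += arr.sum()
#           self._count += arr.size
#       def memorize_finish(self):
#           self._mean = self._total / self._count
#       def transform(self, input_data):
#           return np.asarray(input_data, dtype=float) - self._mean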

# BETTER WAY: always run the first row of data through the builder alone, and
# check that it gives the same output row as when running the whole block of
# data through at once. This gives us the same information, but it's robust
# against people writing their own centering functions.

# QUESTION: right now we refuse to even fit a model that contains a
# my_transform(x)-style function. Maybe we should allow it to be fit (with a
# warning), and only disallow making predictions with it? Need to revisit this
# question once it's clearer what exactly our public API will look like,
# because right now I'm not sure how to tell whether we are being called for
# fitting versus being called for prediction.

from functools import wraps
import numpy as np
from patsy.util import (
    atleast_2d_column_default,
    asarray_or_pandas,
    pandas_friendly_reshape,
    wide_dtype_for,
    safe_issubdtype,
    no_pickling,
    assert_no_pickling,
)

# These are made available in the patsy.* namespace
__all__ = [
    "stateful_transform",
    "center",
    "standardize",
    "scale",
]


def stateful_transform(class_):
    """Create a stateful transform callable object from a class that fulfills
    the :ref:`stateful transform protocol <stateful-transform-protocol>`.
    """

    @wraps(class_)
    def stateful_transform_wrapper(*args, **kwargs):
        transform = class_()
        transform.memorize_chunk(*args, **kwargs)
        transform.memorize_finish()
        return transform.transform(*args, **kwargs)

    stateful_transform_wrapper.__patsy_stateful_transform__ = class_
    return stateful_transform_wrapper
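
# Usage sketch (using the hypothetical ExampleDemean from the comment above;
# not part of patsy's own examples): the wrapper produced here behaves like a
# plain function whose single call runs memorize_chunk, memorize_finish, and
# transform in order
#
#   example_demean = stateful_transform(ExampleDemean)
#   example_demean([1.0, 2.0, 3.0])   # -> array([-1.,  0.,  1.])
#
# Patsy's formula machinery instead looks up the wrapped class through the
# __patsy_stateful_transform__ attribute set above, so that memorization
# (over possibly many chunks) and transformation can happen in separate
# passes over the data.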


# class NonIncrementalStatefulTransform(object):
#     def __init__(self):
#         self._data = []
#
#     def memorize_chunk(self, input_data, *args, **kwargs):
#         self._data.append(input_data)
#         self._args = args
#         self._kwargs = kwargs
#
#     def memorize_finish(self):
#         all_data = np.vstack(self._data)
#         args = self._args
#         kwargs = self._kwargs
#         del self._data
#         del self._args
#         del self._kwargs
#         self.memorize_all(all_data, *args, **kwargs)
#
#     def memorize_all(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
#     def transform(self, input_data, *args, **kwargs):
#         raise NotImplementedError
#
# class QuantileEstimatingTransform(NonIncrementalStatefulTransform):
#     def memorize_all(self, input_data, *args, **kwargs):
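#
# Hypothetical continuation (illustration only; the sketch above trails off).
# A whole-data subclass could, for example, compute column medians once every
# memorized chunk has been stacked:
#
#     def memorize_all(self, input_data, *args, **kwargs):
#         self._medians = np.median(input_data, axis=0)
#
#     def transform(self, input_data, *args, **kwargs):
#         return np.asarray(input_data) - self._medians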


class Center(object):
    """center(x)

    A stateful transform that centers input data, i.e., subtracts the mean.

    If input has multiple columns, centers each column separately.

    Equivalent to ``standardize(x, rescale=False)``
    """

    def __init__(self):
        self._sum = None
        self._count = 0

    def memorize_chunk(self, x):
        x = atleast_2d_column_default(x)
        self._count += x.shape[0]
        this_total = np.sum(x, 0, dtype=wide_dtype_for(x))
        # This is to handle potentially multi-column x's:
        if self._sum is None:
            self._sum = this_total
        else:
            self._sum += this_total

    def memorize_finish(self):
        pass

    def transform(self, x):
        x = asarray_or_pandas(x)
        # This doesn't copy data unless our input is a DataFrame that has
        # heterogeneous types. And in that case we're going to be munging the
        # types anyway, so copying isn't a big deal.
        x_arr = np.asarray(x)
        if safe_issubdtype(x_arr.dtype, np.integer):
            dt = float
        else:
            dt = x_arr.dtype
        mean_val = np.asarray(self._sum / self._count, dtype=dt)
        centered = atleast_2d_column_default(x, preserve_pandas=True) - mean_val
        return pandas_friendly_reshape(centered, x.shape)

    __getstate__ = no_pickling


center = stateful_transform(Center)
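
# Usage sketch (values are what the code above computes, not taken from
# patsy's docs):
#
#   center([1.0, 2.0, 3.0, 6.0])   # mean 3.0 -> array([-2., -1.,  0.,  3.])
#   center([[1, 10], [3, 30]])     # per-column means -> array([[-1., -10.], [ 1.,  10.]])
#
# Incremental use goes through the class itself, which is what the formula
# machinery does when data arrive in chunks:
#
#   c = Center()
#   c.memorize_chunk([1.0, 2.0])
#   c.memorize_chunk([3.0, 6.0])
#   c.memorize_finish()
#   c.transform([1.0, 2.0, 3.0, 6.0])   # same mean (3.0) as the one-shot call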


# See:
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm
# or page 232 of Knuth vol. 3 (3rd ed.).
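#
# The per-observation update performed in memorize_chunk below (Welford's
# recurrences):
#
#     n     <- n + 1
#     delta  = x_i - mean
#     mean  <- mean + delta / n
#     M2    <- M2 + delta * (x_i - mean)    # uses the already-updated mean
#
# after which M2 / n is the population variance and, more generally,
# M2 / (n - ddof) follows numpy.std's ddof convention.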
class Standardize(object):
    """standardize(x, center=True, rescale=True, ddof=0)

    A stateful transform that standardizes input data, i.e. it subtracts the
    mean and divides by the sample standard deviation.

    Either centering or rescaling or both can be disabled by use of keyword
    arguments. The `ddof` argument controls the delta degrees of freedom when
    computing the standard deviation (cf. :func:`numpy.std`). The default of
    ``ddof=0`` produces the maximum likelihood estimate; use ``ddof=1`` if you
    prefer the square root of the unbiased estimate of the variance.

    If input has multiple columns, standardizes each column separately.

    .. note:: This function computes the mean and standard deviation using a
       memory-efficient online algorithm, making it suitable for use with
       large incrementally processed data-sets.
    """

    def __init__(self):
        self.current_n = 0
        self.current_mean = None
        self.current_M2 = None

    def memorize_chunk(self, x, center=True, rescale=True, ddof=0):
        x = atleast_2d_column_default(x)
        if self.current_mean is None:
            self.current_mean = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
            self.current_M2 = np.zeros(x.shape[1], dtype=wide_dtype_for(x))
        # XX this can surely be vectorized but I am feeling lazy:
        for i in range(x.shape[0]):
            self.current_n += 1
            delta = x[i, :] - self.current_mean
            self.current_mean += delta / self.current_n
            self.current_M2 += delta * (x[i, :] - self.current_mean)

    def memorize_finish(self):
        pass

    def transform(self, x, center=True, rescale=True, ddof=0):
        # XX: this forces all inputs to double-precision real, even if the
        # input is single- or extended-precision or complex. But I got all
        # tangled up in knots trying to do that without breaking something
        # else (e.g. by requiring an extra copy).
        x = asarray_or_pandas(x, copy=True, dtype=float)
        x_2d = atleast_2d_column_default(x, preserve_pandas=True)
        if center:
            x_2d -= self.current_mean
        if rescale:
            x_2d /= np.sqrt(self.current_M2 / (self.current_n - ddof))
        return pandas_friendly_reshape(x_2d, x.shape)

    __getstate__ = no_pickling


standardize = stateful_transform(Standardize)
# R compatibility:
scale = standardize
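
# Usage sketch (values are what the code above computes, not taken from
# patsy's docs):
#
#   standardize([1.0, 3.0, 5.0, 7.0])
#       # mean 4.0, population std sqrt(5) -> approx. [-1.342, -0.447, 0.447, 1.342]
#   standardize([1.0, 3.0, 5.0, 7.0], ddof=1)
#       # sample std sqrt(20 / 3) -> approx. [-1.162, -0.387, 0.387, 1.162]
#   standardize([1.0, 3.0, 5.0, 7.0], rescale=False)   # same result as center(...)
#   scale([1.0, 3.0, 5.0, 7.0])                        # R-style alias defined above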