some new features

2025-07-30 17:09:11 +03:00
parent db5d46760a
commit 8019bd3b7c
20616 changed files with 4375466 additions and 8 deletions
--- a/.venv/lib/python3.12/site-packages/statsmodels/nonparametric/smoothers_lowess.py
+++ b/.venv/lib/python3.12/site-packages/statsmodels/nonparametric/smoothers_lowess.py
@ -0,0 +1,265 @@
+"""Lowess - wrapper for cythonized extension
+
+Author : Chris Jordan-Squire
+Author : Carl Vogel
+Author : Josef Perktold
+
+"""
+
+import numpy as np
+from ._smoothers_lowess import lowess as _lowess
+
+def lowess(endog, exog, frac=2.0/3.0, it=3, delta=0.0, xvals=None, is_sorted=False,
+           missing='drop', return_sorted=True):
+    '''LOWESS (Locally Weighted Scatterplot Smoothing)
+
+    A lowess function that outs smoothed estimates of endog
+    at the given exog values from points (exog, endog)
+
+    Parameters
+    ----------
+    endog : 1-D numpy array
+        The y-values of the observed points
+    exog : 1-D numpy array
+        The x-values of the observed points
+    frac : float
+        Between 0 and 1. The fraction of the data used
+        when estimating each y-value.
+    it : int
+        The number of residual-based reweightings
+        to perform.
+    delta : float
+        Distance within which to use linear-interpolation
+        instead of weighted regression.
+    xvals: 1-D numpy array
+        Values of the exogenous variable at which to evaluate the regression.
+        If supplied, cannot use delta.
+    is_sorted : bool
+        If False (default), then the data will be sorted by exog before
+        calculating lowess. If True, then it is assumed that the data is
+        already sorted by exog. If xvals is specified, then it too must be
+        sorted if is_sorted is True.
+    missing : str
+        Available options are 'none', 'drop', and 'raise'. If 'none', no nan
+        checking is done. If 'drop', any observations with nans are dropped.
+        If 'raise', an error is raised. Default is 'drop'.
+    return_sorted : bool
+        If True (default), then the returned array is sorted by exog and has
+        missing (nan or infinite) observations removed.
+        If False, then the returned array is in the same length and the same
+        sequence of observations as the input array.
+
+    Returns
+    -------
+    out : {ndarray, float}
+        The returned array is two-dimensional if return_sorted is True, and
+        one dimensional if return_sorted is False.
+        If return_sorted is True, then a numpy array with two columns. The
+        first column contains the sorted x (exog) values and the second column
+        the associated estimated y (endog) values.
+        If return_sorted is False, then only the fitted values are returned,
+        and the observations will be in the same order as the input arrays.
+        If xvals is provided, then return_sorted is ignored and the returned
+        array is always one dimensional, containing the y values fitted at
+        the x values provided by xvals.
+
+    Notes
+    -----
+    This lowess function implements the algorithm given in the
+    reference below using local linear estimates.
+
+    Suppose the input data has N points. The algorithm works by
+    estimating the `smooth` y_i by taking the frac*N closest points
+    to (x_i,y_i) based on their x values and estimating y_i
+    using a weighted linear regression. The weight for (x_j,y_j)
+    is tricube function applied to abs(x_i-x_j).
+
+    If it > 1, then further weighted local linear regressions
+    are performed, where the weights are the same as above
+    times the _lowess_bisquare function of the residuals. Each iteration
+    takes approximately the same amount of time as the original fit,
+    so these iterations are expensive. They are most useful when
+    the noise has extremely heavy tails, such as Cauchy noise.
+    Noise with less heavy-tails, such as t-distributions with df>2,
+    are less problematic. The weights downgrade the influence of
+    points with large residuals. In the extreme case, points whose
+    residuals are larger than 6 times the median absolute residual
+    are given weight 0.
+
+    `delta` can be used to save computations. For each `x_i`, regressions
+    are skipped for points closer than `delta`. The next regression is
+    fit for the farthest point within delta of `x_i` and all points in
+    between are estimated by linearly interpolating between the two
+    regression fits.
+
+    Judicious choice of delta can cut computation time considerably
+    for large data (N > 5000). A good choice is ``delta = 0.01 * range(exog)``.
+
+    If `xvals` is provided, the regression is then computed at those points
+    and the fit values are returned. Otherwise, the regression is run
+    at points of `exog`.
+
+    Some experimentation is likely required to find a good
+    choice of `frac` and `iter` for a particular dataset.
+
+    References
+    ----------
+    Cleveland, W.S. (1979) "Robust Locally Weighted Regression
+    and Smoothing Scatterplots". Journal of the American Statistical
+    Association 74 (368): 829-836.
+
+    Examples
+    --------
+    The below allows a comparison between how different the fits from
+    lowess for different values of frac can be.
+
+    >>> import numpy as np
+    >>> import statsmodels.api as sm
+    >>> lowess = sm.nonparametric.lowess
+    >>> x = np.random.uniform(low = -2*np.pi, high = 2*np.pi, size=500)
+    >>> y = np.sin(x) + np.random.normal(size=len(x))
+    >>> z = lowess(y, x)
+    >>> w = lowess(y, x, frac=1./3)
+
+    This gives a similar comparison for when it is 0 vs not.
+
+    >>> import numpy as np
+    >>> import scipy.stats as stats
+    >>> import statsmodels.api as sm
+    >>> lowess = sm.nonparametric.lowess
+    >>> x = np.random.uniform(low = -2*np.pi, high = 2*np.pi, size=500)
+    >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x))
+    >>> z = lowess(y, x, frac= 1./3, it=0)
+    >>> w = lowess(y, x, frac=1./3)
+
+    '''
+
+    endog = np.asarray(endog, float)
+    exog = np.asarray(exog, float)
+
+    # Whether xvals argument was provided
+    given_xvals = (xvals is not None)
+
+    # Inputs should be vectors (1-D arrays) of the
+    # same length.
+    if exog.ndim != 1:
+        raise ValueError('exog must be a vector')
+    if endog.ndim != 1:
+        raise ValueError('endog must be a vector')
+    if endog.shape[0] != exog.shape[0] :
+        raise ValueError('exog and endog must have same length')
+
+    if xvals is not None:
+        xvals = np.ascontiguousarray(xvals)
+        if xvals.ndim != 1:
+            raise ValueError('exog_predict must be a vector')
+
+    if missing in ['drop', 'raise']:
+        mask_valid = (np.isfinite(exog) & np.isfinite(endog))
+        all_valid = np.all(mask_valid)
+        if all_valid:
+            y = endog
+            x = exog
+        else:
+            if missing == 'drop':
+                x = exog[mask_valid]
+                y = endog[mask_valid]
+            else:
+                raise ValueError('nan or inf found in data')
+    elif missing == 'none':
+        y = endog
+        x = exog
+        all_valid = True   # we assume it's true if missing='none'
+    else:
+        raise ValueError("missing can only be 'none', 'drop' or 'raise'")
+
+    if not is_sorted:
+        # Sort both inputs according to the ascending order of x values
+        sort_index = np.argsort(x)
+        x = np.array(x[sort_index])
+        y = np.array(y[sort_index])
+
+    if not given_xvals:
+        # If given no explicit x values, we use the x-values in the exog array
+        xvals = exog
+        xvalues = x
+
+        xvals_all_valid = all_valid
+        if missing == 'drop':
+            xvals_mask_valid = mask_valid
+    else:
+        if delta != 0.0:
+            raise ValueError("Cannot have non-zero 'delta' and 'xvals' values")
+            # TODO: allow this again
+        mask_valid = np.isfinite(xvals)
+        if missing == "raise":
+            raise ValueError("NaN values in xvals with missing='raise'")
+        elif missing == 'drop':
+            xvals_mask_valid = mask_valid
+
+        xvalues = xvals
+        xvals_all_valid = True if missing == "none" else np.all(mask_valid)
+        # With explicit xvals, we ignore 'return_sorted' and always
+        # use the order provided
+        return_sorted = False
+
+        if missing in ['drop', 'raise']:
+            xvals_mask_valid = np.isfinite(xvals)
+            xvals_all_valid = np.all(xvals_mask_valid)
+            if xvals_all_valid:
+                xvalues = xvals
+            else:
+                if missing == 'drop':
+                    xvalues = xvals[xvals_mask_valid]
+                else:
+                    raise ValueError("nan or inf found in xvals")
+
+        if not is_sorted:
+            sort_index = np.argsort(xvalues)
+            xvalues = np.array(xvalues[sort_index])
+        else:
+            xvals_all_valid = True
+    y = np.ascontiguousarray(y)
+    x = np.ascontiguousarray(x)
+    if not given_xvals:
+        # Run LOWESS on the data points
+        res, _ = _lowess(y, x, x, np.ones_like(x),
+                        frac=frac, it=it, delta=delta, given_xvals=False)
+    else:
+        # First run LOWESS on the data points to get the weights of the data points
+        # using it-1 iterations, last iter done next
+        if it > 0:
+            _, weights = _lowess(y, x, x, np.ones_like(x),
+                                frac=frac, it=it-1, delta=delta, given_xvals=False)
+        else:
+            weights = np.ones_like(x)
+        xvalues = np.ascontiguousarray(xvalues, dtype=float)
+        # Then run once more using those supplied weights at the points provided by xvals
+        # No extra iterations are performed here since weights are fixed
+        res, _ = _lowess(y, x, xvalues, weights,
+                        frac=frac, it=0, delta=delta, given_xvals=True)
+
+    _, yfitted = res.T
+
+    if return_sorted:
+        return res
+    else:
+
+        # rebuild yfitted with original indices
+        # a bit messy: y might have been selected twice
+        if not is_sorted:
+            yfitted_ = np.empty_like(xvalues)
+            yfitted_.fill(np.nan)
+            yfitted_[sort_index] = yfitted
+            yfitted = yfitted_
+        else:
+            yfitted = yfitted
+
+        if not xvals_all_valid:
+            yfitted_ = np.empty_like(xvals)
+            yfitted_.fill(np.nan)
+            yfitted_[xvals_mask_valid] = yfitted
+            yfitted = yfitted_
+
+        # we do not need to return exog anymore
+        return yfitted