some new features

2025-07-30 17:09:11 +03:00
parent db5d46760a
commit 8019bd3b7c
20616 changed files with 4375466 additions and 8 deletions
--- a/.venv/lib/python3.12/site-packages/statsmodels/imputation/ros.py
+++ b/.venv/lib/python3.12/site-packages/statsmodels/imputation/ros.py
@ -0,0 +1,582 @@
+"""
+Implementation of Regression on Order Statistics for imputing left-
+censored (non-detect data)
+
+Method described in *Nondetects and Data Analysis* by Dennis R.
+Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
+values of a dataset.
+
+Author: Paul M. Hobson
+Company: Geosyntec Consultants (Portland, OR)
+Date: 2016-06-14
+
+"""
+
+import warnings
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+
+def _ros_sort(df, observations, censorship, warn=False):
+    """
+    This function prepares a dataframe for ROS.
+
+    It sorts ascending with
+    left-censored observations first. Censored observations larger than
+    the maximum uncensored observations are removed from the dataframe.
+
+    Parameters
+    ----------
+    df : DataFrame
+
+    observations : str
+        Name of the column in the dataframe that contains observed
+        values. Censored values should be set to the detection (upper)
+        limit.
+
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+
+    Returns
+    ------
+    sorted_df : DataFrame
+        The sorted dataframe with all columns dropped except the
+        observation and censorship columns.
+    """
+
+    # separate uncensored data from censored data
+    censored = df[df[censorship]].sort_values(observations, axis=0)
+    uncensored = df[~df[censorship]].sort_values(observations, axis=0)
+
+    if censored[observations].max() > uncensored[observations].max():
+        censored = censored[censored[observations] <= uncensored[observations].max()]
+
+        if warn:
+            msg = ("Dropping censored observations greater than "
+                   "the max uncensored observation.")
+            warnings.warn(msg)
+
+    combined = pd.concat([censored, uncensored], axis=0)
+    return combined[[observations, censorship]].reset_index(drop=True)
+
+
+def cohn_numbers(df, observations, censorship):
+    r"""
+    Computes the Cohn numbers for the detection limits in the dataset.
+
+    The Cohn Numbers are:
+
+        - :math:`A_j =` the number of uncensored obs above the jth
+          threshold.
+        - :math:`B_j =` the number of observations (cen & uncen) below
+          the jth threshold.
+        - :math:`C_j =` the number of censored observations at the jth
+          threshold.
+        - :math:`\mathrm{PE}_j =` the probability of exceeding the jth
+          threshold
+        - :math:`\mathrm{DL}_j =` the unique, sorted detection limits
+        - :math:`\mathrm{DL}_{j+1} = \mathrm{DL}_j` shifted down a
+          single index (row)
+
+    Parameters
+    ----------
+    dataframe : DataFrame
+
+    observations : str
+        Name of the column in the dataframe that contains observed
+        values. Censored values should be set to the detection (upper)
+        limit.
+
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+
+    Returns
+    -------
+    cohn : DataFrame
+    """
+
+    def nuncen_above(row):
+        """ A, the number of uncensored obs above the given threshold.
+        """
+
+        # index of observations above the lower_dl DL
+        above = df[observations] >= row['lower_dl']
+
+        # index of observations below the upper_dl DL
+        below = df[observations] < row['upper_dl']
+
+        # index of non-detect observations
+        detect = ~df[censorship]
+
+        # return the number of observations where all conditions are True
+        return df[above & below & detect].shape[0]
+
+    def nobs_below(row):
+        """ B, the number of observations (cen & uncen) below the given
+        threshold
+        """
+
+        # index of data less than the lower_dl DL
+        less_than = df[observations] < row['lower_dl']
+
+        # index of data less than or equal to the lower_dl DL
+        less_thanequal = df[observations] <= row['lower_dl']
+
+        # index of detects, non-detects
+        uncensored = ~df[censorship]
+        censored = df[censorship]
+
+        # number observations less than or equal to lower_dl DL and non-detect
+        LTE_censored = df[less_thanequal & censored].shape[0]
+
+        # number of observations less than lower_dl DL and detected
+        LT_uncensored = df[less_than & uncensored].shape[0]
+
+        # return the sum
+        return LTE_censored + LT_uncensored
+
+    def ncen_equal(row):
+        """ C, the number of censored observations at the given
+        threshold.
+        """
+
+        censored_index = df[censorship]
+        censored_data = df[observations][censored_index]
+        censored_below = censored_data == row['lower_dl']
+        return censored_below.sum()
+
+    def set_upper_limit(cohn):
+        """ Sets the upper_dl DL for each row of the Cohn dataframe. """
+        if cohn.shape[0] > 1:
+            return cohn['lower_dl'].shift(-1).fillna(value=np.inf)
+        else:
+            return [np.inf]
+
+    def compute_PE(A, B):
+        """ Computes the probability of excedance for each row of the
+        Cohn dataframe. """
+        N = len(A)
+        PE = np.empty(N, dtype='float64')
+        PE[-1] = 0.0
+        for j in range(N-2, -1, -1):
+            PE[j] = PE[j+1] + (1 - PE[j+1]) * A[j] / (A[j] + B[j])
+
+        return PE
+
+    # unique, sorted detection limts
+    censored_data = df[censorship]
+    DLs = pd.unique(df.loc[censored_data, observations])
+    DLs.sort()
+
+    # if there is a observations smaller than the minimum detection limit,
+    # add that value to the array
+    if DLs.shape[0] > 0:
+        if df[observations].min() < DLs.min():
+            DLs = np.hstack([df[observations].min(), DLs])
+
+        # create a dataframe
+        # (editted for pandas 0.14 compatibility; see commit 63f162e
+        #  when `pipe` and `assign` are available)
+        cohn = pd.DataFrame(DLs, columns=['lower_dl'])
+        cohn.loc[:, 'upper_dl'] = set_upper_limit(cohn)
+        cohn.loc[:, 'nuncen_above'] = cohn.apply(nuncen_above, axis=1)
+        cohn.loc[:, 'nobs_below'] = cohn.apply(nobs_below, axis=1)
+        cohn.loc[:, 'ncen_equal'] = cohn.apply(ncen_equal, axis=1)
+        cohn = cohn.reindex(range(DLs.shape[0] + 1))
+        cohn.loc[:, 'prob_exceedance'] = compute_PE(cohn['nuncen_above'], cohn['nobs_below'])
+
+    else:
+        dl_cols = ['lower_dl', 'upper_dl', 'nuncen_above',
+                   'nobs_below', 'ncen_equal', 'prob_exceedance']
+        cohn = pd.DataFrame(np.empty((0, len(dl_cols))), columns=dl_cols)
+
+    return cohn
+
+
+def _detection_limit_index(obs, cohn):
+    """
+    Locates the corresponding detection limit for each observation.
+
+    Basically, creates an array of indices for the detection limits
+    (Cohn numbers) corresponding to each data point.
+
+    Parameters
+    ----------
+    obs : float
+        A single observation from the larger dataset.
+
+    cohn : DataFrame
+        DataFrame of Cohn numbers.
+
+    Returns
+    -------
+    det_limit_index : int
+        The index of the corresponding detection limit in `cohn`
+
+    See Also
+    --------
+    cohn_numbers
+    """
+
+    if cohn.shape[0] > 0:
+        index, = np.where(cohn['lower_dl'] <= obs)
+        det_limit_index = index[-1]
+    else:
+        det_limit_index = 0
+
+    return det_limit_index
+
+
+def _ros_group_rank(df, dl_idx, censorship):
+    """
+    Ranks each observation within the data groups.
+
+    In this case, the groups are defined by the record's detection
+    limit index and censorship status.
+
+    Parameters
+    ----------
+    df : DataFrame
+
+    dl_idx : str
+        Name of the column in the dataframe the index of the
+        observations' corresponding detection limit in the `cohn`
+        dataframe.
+
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+
+    Returns
+    -------
+    ranks : ndarray
+        Array of ranks for the dataset.
+    """
+
+    # (editted for pandas 0.14 compatibility; see commit 63f162e
+    #  when `pipe` and `assign` are available)
+    ranks = df.copy()
+    ranks.loc[:, 'rank'] = 1
+    ranks = (
+        ranks.groupby(by=[dl_idx, censorship])['rank']
+             .transform(lambda g: g.cumsum())
+    )
+    return ranks
+
+
+def _ros_plot_pos(row, censorship, cohn):
+    """
+    ROS-specific plotting positions.
+
+    Computes the plotting position for an observation based on its rank,
+    censorship status, and detection limit index.
+
+    Parameters
+    ----------
+    row : {Series, dict}
+        Full observation (row) from a censored dataset. Requires a
+        'rank', 'detection_limit', and `censorship` column.
+
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+
+    cohn : DataFrame
+        DataFrame of Cohn numbers.
+
+    Returns
+    -------
+    plotting_position : float
+
+    See Also
+    --------
+    cohn_numbers
+    """
+
+    DL_index = row['det_limit_index']
+    rank = row['rank']
+    censored = row[censorship]
+
+    dl_1 = cohn.iloc[DL_index]
+    dl_2 = cohn.iloc[DL_index + 1]
+    if censored:
+        return (1 - dl_1['prob_exceedance']) * rank / (dl_1['ncen_equal']+1)
+    else:
+        return (1 - dl_1['prob_exceedance']) + (dl_1['prob_exceedance'] - dl_2['prob_exceedance']) * \
+                rank / (dl_1['nuncen_above']+1)
+
+
+def _norm_plot_pos(observations):
+    """
+    Computes standard normal (Gaussian) plotting positions using scipy.
+
+    Parameters
+    ----------
+    observations : array_like
+        Sequence of observed quantities.
+
+    Returns
+    -------
+    plotting_position : array of floats
+    """
+    ppos, sorted_res = stats.probplot(observations, fit=False)
+    return stats.norm.cdf(ppos)
+
+
+def plotting_positions(df, censorship, cohn):
+    """
+    Compute the plotting positions for the observations.
+
+    The ROS-specific plotting postions are based on the observations'
+    rank, censorship status, and corresponding detection limit.
+
+    Parameters
+    ----------
+    df : DataFrame
+
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+
+    cohn : DataFrame
+        DataFrame of Cohn numbers.
+
+    Returns
+    -------
+    plotting_position : array of float
+
+    See Also
+    --------
+    cohn_numbers
+    """
+
+    plot_pos = df.apply(lambda r: _ros_plot_pos(r, censorship, cohn), axis=1)
+
+    # correctly sort the plotting positions of the ND data:
+    ND_plotpos = plot_pos[df[censorship]]
+    ND_plotpos_arr = np.require(ND_plotpos, requirements="W")
+    ND_plotpos_arr.sort()
+    plot_pos.loc[df[censorship].index[df[censorship]]] = ND_plotpos_arr
+
+    return plot_pos
+
+
+def _impute(df, observations, censorship, transform_in, transform_out):
+    """
+    Executes the basic regression on order stat (ROS) proceedure.
+
+    Uses ROS to impute censored from the best-fit line of a
+    probability plot of the uncensored values.
+
+    Parameters
+    ----------
+    df : DataFrame
+    observations : str
+        Name of the column in the dataframe that contains observed
+        values. Censored values should be set to the detection (upper)
+        limit.
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+    transform_in, transform_out : callable
+        Transformations to be applied to the data prior to fitting
+        the line and after estimated values from that line. Typically,
+        `np.log` and `np.exp` are used, respectively.
+
+    Returns
+    -------
+    estimated : DataFrame
+        A new dataframe with two new columns: "estimated" and "final".
+        The "estimated" column contains of the values inferred from the
+        best-fit line. The "final" column contains the estimated values
+        only where the original observations were censored, and the original
+        observations everwhere else.
+    """
+
+    # detect/non-detect selectors
+    uncensored_mask = ~df[censorship]
+    censored_mask = df[censorship]
+
+    # fit a line to the logs of the detected data
+    fit_params = stats.linregress(
+        df['Zprelim'][uncensored_mask],
+        transform_in(df[observations][uncensored_mask])
+    )
+
+    # pull out the slope and intercept for use later
+    slope, intercept = fit_params[:2]
+
+    # model the data based on the best-fit curve
+    # (editted for pandas 0.14 compatibility; see commit 63f162e
+    #  when `pipe` and `assign` are available)
+    df.loc[:, 'estimated'] = transform_out(slope * df['Zprelim'][censored_mask] + intercept)
+    df.loc[:, 'final'] = np.where(df[censorship], df['estimated'], df[observations])
+
+    return df
+
+
+def _do_ros(df, observations, censorship, transform_in, transform_out):
+    """
+    DataFrame-centric function to impute censored valies with ROS.
+
+    Prepares a dataframe for, and then esimates the values of a censored
+    dataset using Regression on Order Statistics
+
+    Parameters
+    ----------
+    df : DataFrame
+
+    observations : str
+        Name of the column in the dataframe that contains observed
+        values. Censored values should be set to the detection (upper)
+        limit.
+
+    censorship : str
+        Name of the column in the dataframe that indicates that a
+        observation is left-censored. (i.e., True -> censored,
+        False -> uncensored)
+
+    transform_in, transform_out : callable
+        Transformations to be applied to the data prior to fitting
+        the line and after estimated values from that line. Typically,
+        `np.log` and `np.exp` are used, respectively.
+
+    Returns
+    -------
+    estimated : DataFrame
+        A new dataframe with two new columns: "estimated" and "final".
+        The "estimated" column contains of the values inferred from the
+        best-fit line. The "final" column contains the estimated values
+        only where the original observations were censored, and the original
+        observations everwhere else.
+    """
+
+    # compute the Cohn numbers
+    cohn = cohn_numbers(df, observations=observations, censorship=censorship)
+
+    # (editted for pandas 0.14 compatibility; see commit 63f162e
+    #  when `pipe` and `assign` are available)
+    modeled = _ros_sort(df, observations=observations, censorship=censorship)
+    modeled.loc[:, 'det_limit_index'] = modeled[observations].apply(_detection_limit_index, args=(cohn,))
+    modeled.loc[:, 'rank'] = _ros_group_rank(modeled, 'det_limit_index', censorship)
+    modeled.loc[:, 'plot_pos'] = plotting_positions(modeled, censorship, cohn)
+    modeled.loc[:, 'Zprelim'] = stats.norm.ppf(modeled['plot_pos'])
+
+    return _impute(modeled, observations, censorship, transform_in, transform_out)
+
+
+def impute_ros(observations, censorship, df=None, min_uncensored=2,
+               max_fraction_censored=0.8, substitution_fraction=0.5,
+               transform_in=np.log, transform_out=np.exp,
+               as_array=True):
+    """
+    Impute censored dataset using Regression on Order Statistics (ROS).
+
+    Method described in *Nondetects and Data Analysis* by Dennis R.
+    Helsel (John Wiley, 2005) to estimate the left-censored (non-detect)
+    values of a dataset. When there is insufficient non-censorded data,
+    simple substitution is used.
+
+    Parameters
+    ----------
+    observations : str or array-like
+        Label of the column or the float array of censored observations
+
+    censorship : str
+        Label of the column or the bool array of the censorship
+        status of the observations.
+
+          * True if censored,
+          * False if uncensored
+
+    df : DataFrame, optional
+        If `observations` and `censorship` are labels, this is the
+        DataFrame that contains those columns.
+
+    min_uncensored : int (default is 2)
+        The minimum number of uncensored values required before ROS
+        can be used to impute the censored observations. When this
+        criterion is not met, simple substituion is used instead.
+
+    max_fraction_censored : float (default is 0.8)
+        The maximum fraction of censored data below which ROS can be
+        used to impute the censored observations. When this fraction is
+        exceeded, simple substituion is used instead.
+
+    substitution_fraction : float (default is 0.5)
+        The fraction of the detection limit to be used during simple
+        substitution of the censored values.
+
+    transform_in : callable (default is np.log)
+        Transformation to be applied to the values prior to fitting a
+        line to the plotting positions vs. uncensored values.
+
+    transform_out : callable (default is np.exp)
+        Transformation to be applied to the imputed censored values
+        estimated from the previously computed best-fit line.
+
+    as_array : bool (default is True)
+        When True, a numpy array of the imputed observations is
+        returned. Otherwise, a modified copy of the original dataframe
+        with all of the intermediate calculations is returned.
+
+    Returns
+    -------
+    imputed : {ndarray, DataFrame}
+        The final observations where the censored values have either been
+        imputed through ROS or substituted as a fraction of the
+        detection limit.
+
+    Notes
+    -----
+    This function requires pandas 0.14 or more recent.
+    """
+
+    # process arrays into a dataframe, if necessary
+    if df is None:
+        df = pd.DataFrame({'obs': observations, 'cen': censorship})
+        observations = 'obs'
+        censorship = 'cen'
+
+    # basic counts/metrics of the dataset
+    N_observations = df.shape[0]
+    N_censored = df[censorship].astype(int).sum()
+    N_uncensored = N_observations - N_censored
+    fraction_censored = N_censored / N_observations
+
+    # add plotting positions if there are no censored values
+    # (editted for pandas 0.14 compatibility; see commit 63f162e
+    #  when `pipe` and `assign` are available)
+    if N_censored == 0:
+        output = df[[observations, censorship]].copy()
+        output.loc[:, 'final'] = df[observations]
+
+    # substitute w/ fraction of the DLs if there's insufficient
+    # uncensored data
+    # (editted for pandas 0.14 compatibility; see commit 63f162e
+    #  when `pipe` and `assign` are available)
+    elif (N_uncensored < min_uncensored) or (fraction_censored > max_fraction_censored):
+        output = df[[observations, censorship]].copy()
+        output.loc[:, 'final'] = df[observations]
+        output.loc[df[censorship], 'final'] *= substitution_fraction
+
+
+    # normal ROS stuff
+    else:
+        output = _do_ros(df, observations, censorship, transform_in, transform_out)
+
+    # convert to an array if necessary
+    if as_array:
+        output = output['final'].values
+
+    return output