reconnect moved files to git repo

2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions
--- a/venv/lib/python3.11/site-packages/statsmodels/sandbox/multilinear.py
+++ b/venv/lib/python3.11/site-packages/statsmodels/sandbox/multilinear.py
@ -0,0 +1,323 @@
+"""Analyze a set of multiple variables with a linear models
+
+multiOLS:
+    take a model and test it on a series of variables defined over a
+    pandas dataset, returning a summary for each variable
+
+multigroup:
+    take a boolean vector and the definition of several groups of variables
+    and test if the group has a fraction of true values higher than the
+    rest. It allows to test if the variables in the group are significantly
+    more significant than outside the group.
+"""
+from patsy import dmatrix
+import pandas as pd
+from statsmodels.api import OLS
+from statsmodels.api import stats
+import numpy as np
+import logging
+
+def _model2dataframe(model_endog, model_exog, model_type=OLS, **kwargs):
+    """return a series containing the summary of a linear model
+
+    All the exceding parameters will be redirected to the linear model
+    """
+    # create the linear model and perform the fit
+    model_result = model_type(model_endog, model_exog, **kwargs).fit()
+    # keeps track of some global statistics
+    statistics = pd.Series({'r2': model_result.rsquared,
+                  'adj_r2': model_result.rsquared_adj})
+    # put them togher with the result for each term
+    result_df = pd.DataFrame({'params': model_result.params,
+                              'pvals': model_result.pvalues,
+                              'std': model_result.bse,
+                              'statistics': statistics})
+    # add the complexive results for f-value and the total p-value
+    fisher_df = pd.DataFrame({'params': {'_f_test': model_result.fvalue},
+                              'pvals': {'_f_test': model_result.f_pvalue}})
+    # merge them and unstack to obtain a hierarchically indexed series
+    res_series = pd.concat([result_df, fisher_df]).unstack()
+    return res_series.dropna()
+
+
+def multiOLS(model, dataframe, column_list=None, method='fdr_bh',
+             alpha=0.05, subset=None, model_type=OLS, **kwargs):
+    """apply a linear model to several endogenous variables on a dataframe
+
+    Take a linear model definition via formula and a dataframe that will be
+    the environment of the model, and apply the linear model to a subset
+    (or all) of the columns of the dataframe. It will return a dataframe
+    with part of the information from the linear model summary.
+
+    Parameters
+    ----------
+    model : str
+        formula description of the model
+    dataframe : pandas.dataframe
+        dataframe where the model will be evaluated
+    column_list : list[str], optional
+        Names of the columns to analyze with the model.
+        If None (Default) it will perform the function on all the
+        eligible columns (numerical type and not in the model definition)
+    model_type : model class, optional
+        The type of model to be used. The default is the linear model.
+        Can be any linear model (OLS, WLS, GLS, etc..)
+    method : str, optional
+        the method used to perform the pvalue correction for multiple testing.
+        default is the Benjamini/Hochberg, other available methods are:
+
+            `bonferroni` : one-step correction
+            `sidak` : on-step correction
+            `holm-sidak` :
+            `holm` :
+            `simes-hochberg` :
+            `hommel` :
+            `fdr_bh` : Benjamini/Hochberg
+            `fdr_by` : Benjamini/Yekutieli
+
+    alpha : float, optional
+        the significance level used for the pvalue correction (default 0.05)
+    subset : bool array
+        the selected rows to be used in the regression
+
+    all the other parameters will be directed to the model creation.
+
+    Returns
+    -------
+    summary : pandas.DataFrame
+        a dataframe containing an extract from the summary of the model
+        obtained for each columns. It will give the model complexive f test
+        result and p-value, and the regression value and standard deviarion
+        for each of the regressors. The DataFrame has a hierachical column
+        structure, divided as:
+
+            - params: contains the parameters resulting from the models. Has
+            an additional column named _f_test containing the result of the
+            F test.
+            - pval: the pvalue results of the models. Has the _f_test column
+            for the significativity of the whole test.
+            - adj_pval: the corrected pvalues via the multitest function.
+            - std: uncertainties of the model parameters
+            - statistics: contains the r squared statistics and the adjusted
+            r squared.
+
+    Notes
+    -----
+    The main application of this function is on system biology to perform
+    a linear model testing of a lot of different parameters, like the
+    different genetic expression of several genes.
+
+    See Also
+    --------
+    statsmodels.stats.multitest
+        contains several functions to perform the multiple p-value correction
+
+    Examples
+    --------
+    Using the longley data as dataframe example
+
+    >>> import statsmodels.api as sm
+    >>> data = sm.datasets.longley.load_pandas()
+    >>> df = data.exog
+    >>> df['TOTEMP'] = data.endog
+
+    This will perform the specified linear model on all the
+    other columns of the dataframe
+    >>> multiOLS('GNP + 1', df)
+
+    This select only a certain subset of the columns
+    >>> multiOLS('GNP + 0', df, ['GNPDEFL', 'TOTEMP', 'POP'])
+
+    It is possible to specify a trasformation also on the target column,
+    conforming to the patsy formula specification
+    >>> multiOLS('GNP + 0', df, ['I(GNPDEFL**2)', 'center(TOTEMP)'])
+
+    It is possible to specify the subset of the dataframe
+    on which perform the analysis
+    >> multiOLS('GNP + 1', df, subset=df.GNPDEFL > 90)
+
+    Even a single column name can be given without enclosing it in a list
+    >>> multiOLS('GNP + 0', df, 'GNPDEFL')
+    """
+    # data normalization
+    # if None take all the numerical columns that are not present in the model
+    # it's not waterproof but is a good enough criterion for everyday use
+    if column_list is None:
+        column_list = [name for name in dataframe.columns
+                      if dataframe[name].dtype != object and name not in model]
+    # if it's a single string transform it in a single element list
+    if isinstance(column_list, str):
+        column_list = [column_list]
+    if subset is not None:
+        dataframe = dataframe.loc[subset]
+    # perform each model and retrieve the statistics
+    col_results = {}
+    # as the model will use always the same endogenous variables
+    # we can create them once and reuse
+    model_exog = dmatrix(model, data=dataframe, return_type="dataframe")
+    for col_name in column_list:
+        # it will try to interpret the column name as a valid dataframe
+        # index as it can be several times faster. If it fails it
+        # interpret it as a patsy formula (for example for centering)
+        try:
+            model_endog = dataframe[col_name]
+        except KeyError:
+            model_endog = dmatrix(col_name + ' + 0', data=dataframe)
+        # retrieve the result and store them
+        res = _model2dataframe(model_endog, model_exog, model_type, **kwargs)
+        col_results[col_name] = res
+    # mangle them togheter and sort by complexive p-value
+    summary = pd.DataFrame(col_results)
+    # order by the p-value: the most useful model first!
+    summary = summary.T.sort_values([('pvals', '_f_test')])
+    summary.index.name = 'endogenous vars'
+    # implementing the pvalue correction method
+    smt = stats.multipletests
+    for (key1, key2) in summary:
+        if key1 != 'pvals':
+            continue
+        p_values = summary[key1, key2]
+        corrected = smt(p_values, method=method, alpha=alpha)[1]
+        # extend the dataframe of results with the column
+        # of the corrected p_values
+        summary['adj_' + key1, key2] = corrected
+    return summary
+
+
+def _test_group(pvalues, group_name, group, exact=True):
+    """test if the objects in the group are different from the general set.
+
+    The test is performed on the pvalues set (ad a pandas series) over
+    the group specified via a fisher exact test.
+    """
+    from scipy.stats import fisher_exact, chi2_contingency
+
+    totals = 1.0 * len(pvalues)
+    total_significant = 1.0 * np.sum(pvalues)
+    cross_index = [c for c in group if c in pvalues.index]
+    missing = [c for c in group if c not in pvalues.index]
+    if missing:
+        s = ('the test is not well defined if the group '
+             'has elements not presents in the significativity '
+             'array. group name: {}, missing elements: {}')
+        logging.warning(s.format(group_name, missing))
+    # how many are significant and not in the group
+    group_total = 1.0 * len(cross_index)
+    group_sign = 1.0 * len([c for c in cross_index if pvalues[c]])
+    group_nonsign = 1.0 * (group_total - group_sign)
+    # how many are significant and not outside the group
+    extern_sign = 1.0 * (total_significant - group_sign)
+    extern_nonsign = 1.0 * (totals - total_significant - group_nonsign)
+    # make the fisher test or the chi squared
+    test = fisher_exact if exact else chi2_contingency
+    table = [[extern_nonsign, extern_sign], [group_nonsign, group_sign]]
+    pvalue = test(np.array(table))[1]
+    # is the group more represented or less?
+    part = group_sign, group_nonsign, extern_sign, extern_nonsign
+    #increase = (group_sign / group_total) > (total_significant / totals)
+    increase = np.log((totals * group_sign)
+                      / (total_significant * group_total))
+    return pvalue, increase, part
+
+
+def multigroup(pvals, groups, exact=True, keep_all=True, alpha=0.05):
+    """Test if the given groups are different from the total partition.
+
+    Given a boolean array test if each group has a proportion of positives
+    different than the complexive proportion.
+    The test can be done as an exact Fisher test or approximated as a
+    Chi squared test for more speed.
+
+    Parameters
+    ----------
+    pvals : pandas series of boolean
+        the significativity of the variables under analysis
+    groups : dict of list
+        the name of each category of variables under exam.
+        each one is a list of the variables included
+    exact : bool, optional
+        If True (default) use the fisher exact test, otherwise
+        use the chi squared test for contingencies tables.
+        For high number of elements in the array the fisher test can
+        be significantly slower than the chi squared.
+    keep_all : bool, optional
+        if False it will drop those groups where the fraction
+        of positive is below the expected result. If True (default)
+         it will keep all the significant results.
+    alpha : float, optional
+        the significativity level for the pvalue correction
+        on the whole set of groups (not inside the groups themselves).
+
+    Returns
+    -------
+    result_df: pandas dataframe
+        for each group returns:
+
+            pvals - the fisher p value of the test
+            adj_pvals - the adjusted pvals
+            increase - the log of the odd ratio between the
+                internal significant ratio versus the external one
+            _in_sign - significative elements inside the group
+            _in_non - non significative elements inside the group
+            _out_sign - significative elements outside the group
+            _out_non - non significative elements outside the group
+
+    Notes
+    -----
+    This test allow to see if a category of variables is generally better
+    suited to be described for the model. For example to see if a predictor
+    gives more information on demographic or economical parameters,
+    by creating two groups containing the endogenous variables of each
+    category.
+
+    This function is conceived for medical dataset with a lot of variables
+    that can be easily grouped into functional groups. This is because
+    The significativity of a group require a rather large number of
+    composing elements.
+
+    Examples
+    --------
+    A toy example on a real dataset, the Guerry dataset from R
+    >>> url = "https://raw.githubusercontent.com/vincentarelbundock/"
+    >>> url = url + "Rdatasets/csv/HistData/Guerry.csv"
+    >>> df = pd.read_csv(url, index_col='dept')
+
+    evaluate the relationship between the various paramenters whith the Wealth
+    >>> pvals = multiOLS('Wealth', df)['adj_pvals', '_f_test']
+
+    define the groups
+    >>> groups = {}
+    >>> groups['crime'] = ['Crime_prop', 'Infanticide',
+    ...     'Crime_parents', 'Desertion', 'Crime_pers']
+    >>> groups['religion'] = ['Donation_clergy', 'Clergy', 'Donations']
+    >>> groups['wealth'] = ['Commerce', 'Lottery', 'Instruction', 'Literacy']
+
+    do the analysis of the significativity
+    >>> multigroup(pvals < 0.05, groups)
+    """
+    pvals = pd.Series(pvals)
+    if not (set(pvals.unique()) <= {False, True}):
+        raise ValueError("the series should be binary")
+    if hasattr(pvals.index, 'is_unique') and not pvals.index.is_unique:
+        raise ValueError("series with duplicated index is not accepted")
+    results = {'pvals': {},
+        'increase': {},
+        '_in_sign': {},
+        '_in_non': {},
+        '_out_sign': {},
+        '_out_non': {}}
+    for group_name, group_list in groups.items():
+        res = _test_group(pvals, group_name, group_list, exact)
+        results['pvals'][group_name] = res[0]
+        results['increase'][group_name] = res[1]
+        results['_in_sign'][group_name] = res[2][0]
+        results['_in_non'][group_name] = res[2][1]
+        results['_out_sign'][group_name] = res[2][2]
+        results['_out_non'][group_name] = res[2][3]
+    result_df = pd.DataFrame(results).sort_values('pvals')
+    if not keep_all:
+        result_df = result_df[result_df.increase]
+    smt = stats.multipletests
+    corrected = smt(result_df['pvals'], method='fdr_bh', alpha=alpha)[1]
+    result_df['adj_pvals'] = corrected
+    return result_df