some new features
@@ -0,0 +1,771 @@
'''Multiple Testing and P-Value Correction


Author: Josef Perktold
License: BSD-3

'''

import numpy as np

from statsmodels.stats._knockoff import RegressionFDR

__all__ = ['fdrcorrection', 'fdrcorrection_twostage', 'local_fdr',
           'multipletests', 'NullDistribution', 'RegressionFDR']

# ==============================================
#
# Part 1: Multiple Tests and P-Value Correction
#
# ==============================================


def _ecdf(x):
    '''no frills empirical cdf used in fdrcorrection
    '''
    nobs = len(x)
    return np.arange(1, nobs + 1) / float(nobs)
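

# Example (illustrative): for four sorted p-values, _ecdf returns the
# i/n ranks used by the FDR step-up procedures below:
#     _ecdf(np.array([0.01, 0.02, 0.3, 0.7])) -> array([0.25, 0.5, 0.75, 1.])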

multitest_methods_names = {'b': 'Bonferroni',
                           's': 'Sidak',
                           'h': 'Holm',
                           'hs': 'Holm-Sidak',
                           'sh': 'Simes-Hochberg',
                           'ho': 'Hommel',
                           'fdr_bh': 'FDR Benjamini-Hochberg',
                           'fdr_by': 'FDR Benjamini-Yekutieli',
                           'fdr_tsbh': 'FDR 2-stage Benjamini-Hochberg',
                           'fdr_tsbky': 'FDR 2-stage Benjamini-Krieger-Yekutieli',
                           'fdr_gbs': 'FDR adaptive Gavrilov-Benjamini-Sarkar'
                           }

_alias_list = [['b', 'bonf', 'bonferroni'],
               ['s', 'sidak'],
               ['h', 'holm'],
               ['hs', 'holm-sidak'],
               ['sh', 'simes-hochberg'],
               ['ho', 'hommel'],
               ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp'],
               ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr'],
               ['fdr_tsbh', 'fdr_2sbh'],
               ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage'],
               ['fdr_gbs']
               ]

# map every alias (including the canonical name itself) to the canonical key
multitest_alias = {}
for m in _alias_list:
    multitest_alias[m[0]] = m[0]
    for a in m[1:]:
        multitest_alias[a] = m[0]
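
# A minimal sketch of how the alias table resolves user-supplied method
# names to canonical keys (restating the mapping built above):
#     multitest_alias['bonferroni']   == 'b'
#     multitest_alias['fdr_i']        == 'fdr_bh'
#     multitest_alias['fdr_twostage'] == 'fdr_tsbky'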


def multipletests(pvals, alpha=0.05, method='hs', maxiter=1,
                  is_sorted=False, returnsorted=False):
    """
    Test results and p-value correction for multiple tests

    Parameters
    ----------
    pvals : array_like, 1-d
        uncorrected p-values. Must be 1-dimensional.
    alpha : float
        FWER, family-wise error rate, e.g. 0.1
    method : str
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:

        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step-down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)

    maxiter : int or bool
        Maximum number of iterations for two-stage fdr, `fdr_tsbh` and
        `fdr_tsbky`. It is ignored by all other methods.
        maxiter=1 (default) corresponds to the two stage method.
        maxiter=-1 corresponds to full iteration, i.e. maxiter=len(pvals).
        maxiter=0 uses only a single stage fdr correction using a 'bh' or
        'bky' prior fraction of assumed true hypotheses.
    is_sorted : bool
        If False (default), the p_values will be sorted, but the corrected
        pvalues are in the original order. If True, then it is assumed that
        the pvalues are already sorted in ascending order.
    returnsorted : bool
        not tested, return sorted p-values instead of original sequence

    Returns
    -------
    reject : ndarray, boolean
        true for hypothesis that can be rejected for given alpha
    pvals_corrected : ndarray
        p-values corrected for multiple tests
    alphacSidak : float
        corrected alpha for Sidak method
    alphacBonf : float
        corrected alpha for Bonferroni method

    Notes
    -----
    There may be API changes for this function in the future.

    Except for 'fdr_twostage', the p-value correction is independent of the
    alpha specified as argument. In these cases the corrected p-values
    can also be compared with a different alpha. In the case of 'fdr_twostage',
    the corrected p-values are specific to the given alpha, see
    ``fdrcorrection_twostage``.

    The 'fdr_gbs' procedure is not verified against another package; its
    p-values are derived from scratch and are not given in the reference.
    In Monte Carlo experiments the method worked correctly and maintained
    the false discovery rate.

    All procedures that are included control FWER or FDR in the independent
    case, and most are robust in the positively correlated case.

    `fdr_gbs`: high power, fdr control for independent case and only small
    violation in positively correlated case.

    **Timing**:

    Most of the time with large arrays is spent in `argsort`. When we want
    to calculate the p-values for several methods, it is more efficient to
    presort the pvalues and put the results back into the original order
    outside of the function.

    Method='hommel' is very slow for large arrays, since it requires the
    evaluation of n partitions, where n is the number of p-values.
    """
    import gc

    pvals = np.asarray(pvals)
    alphaf = alpha  # Notation ?

    if not is_sorted:
        sortind = np.argsort(pvals)
        pvals = np.take(pvals, sortind)

    ntests = len(pvals)
    alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests)
    alphacBonf = alphaf / float(ntests)
    if method.lower() in ['b', 'bonf', 'bonferroni']:
        reject = pvals <= alphacBonf
        pvals_corrected = pvals * float(ntests)

    elif method.lower() in ['s', 'sidak']:
        reject = pvals <= alphacSidak
        pvals_corrected = -np.expm1(ntests * np.log1p(-pvals))

    elif method.lower() in ['hs', 'holm-sidak']:
        alphacSidak_all = 1 - np.power((1. - alphaf),
                                       1. / np.arange(ntests, 0, -1))
        notreject = pvals > alphacSidak_all
        del alphacSidak_all

        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        del notreject

        # It's equivalent to 1 - np.power((1. - pvals),
        #                                 np.arange(ntests, 0, -1))
        # but avoids floating point precision problems for small pvals
        pvals_corrected_raw = -np.expm1(np.arange(ntests, 0, -1) *
                                        np.log1p(-pvals))
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw

    elif method.lower() in ['h', 'holm']:
        notreject = pvals > alphaf / np.arange(ntests, 0, -1)
        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        pvals_corrected_raw = pvals * np.arange(ntests, 0, -1)
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw
        gc.collect()

    elif method.lower() in ['sh', 'simes-hochberg']:
        alphash = alphaf / np.arange(ntests, 0, -1)
        reject = pvals <= alphash
        rejind = np.nonzero(reject)
        if rejind[0].size > 0:
            rejectmax = np.max(np.nonzero(reject))
            reject[:rejectmax] = True
        pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals
        pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw

    elif method.lower() in ['ho', 'hommel']:
        # we need a copy because we overwrite it in a loop
        a = pvals.copy()
        for m in range(ntests, 1, -1):
            cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.))
            a[-m:] = np.maximum(a[-m:], cim)
            a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim))
        pvals_corrected = a
        reject = a <= alphaf

    elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='indep',
                                                is_sorted=True)
    elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='n',
                                                is_sorted=True)
    elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bky',
                                                         maxiter=maxiter,
                                                         is_sorted=True)[:2]
    elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bh',
                                                         maxiter=maxiter,
                                                         is_sorted=True)[:2]

    elif method.lower() in ['fdr_gbs']:
        # adaptive step-down in Gavrilov, Benjamini, Sarkar,
        # Annals of Statistics 2009
        ## notreject = pvals > alphaf / np.arange(ntests, 0, -1)  # alphacSidak
        ## notrejectmin = np.min(np.nonzero(notreject))
        ## notreject[notrejectmin:] = True
        ## reject = ~notreject

        ii = np.arange(1, ntests + 1)
        q = (ntests + 1. - ii) / ii * pvals / (1. - pvals)
        pvals_corrected_raw = np.maximum.accumulate(q)  # monotonicity requirement

        pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw
        reject = pvals_corrected <= alpha

    else:
        raise ValueError('method not recognized')

    if pvals_corrected is not None:  # not necessary anymore
        pvals_corrected[pvals_corrected > 1] = 1
    if is_sorted or returnsorted:
        return reject, pvals_corrected, alphacSidak, alphacBonf
    else:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[sortind] = reject
        return reject_, pvals_corrected_, alphacSidak, alphacBonf
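

# --- Usage sketch (illustrative; not part of the statsmodels API) ---
# Shows `multipletests` with the Holm-Sidak default on made-up p-values.
# The helper name `_example_multipletests` is hypothetical.
def _example_multipletests():
    raw_p = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.06, 0.074, 0.205])
    reject, p_adj, alphac_sidak, alphac_bonf = multipletests(
        raw_p, alpha=0.05, method='hs')
    # reject is boolean, aligned with raw_p; p_adj holds the corrected
    # p-values, which can also be compared against a different alpha
    return reject, p_adj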


def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''
    pvalue correction for false discovery rate.

    This covers Benjamini/Hochberg for independent or positively correlated
    tests and Benjamini/Yekutieli for general or negatively correlated tests.

    Parameters
    ----------
    pvals : array_like, 1d
        Set of p-values of the individual tests.
    alpha : float, optional
        Family-wise error rate. Defaults to ``0.05``.
    method : {'i', 'indep', 'p', 'poscorr', 'n', 'negcorr'}, optional
        Which method to use for FDR correction.
        ``{'i', 'indep', 'p', 'poscorr'}`` all refer to ``fdr_bh``
        (Benjamini/Hochberg for independent or positively
        correlated tests). ``{'n', 'negcorr'}`` both refer to ``fdr_by``
        (Benjamini/Yekutieli for general or negatively correlated tests).
        Defaults to ``'indep'``.
    is_sorted : bool, optional
        If False (default), the p_values will be sorted, but the corrected
        pvalues are in the original order. If True, then it is assumed that
        the pvalues are already sorted in ascending order.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----
    If there is prior information on the fraction of true hypotheses, then
    alpha should be set to ``alpha * m / m_0`` where m is the number of tests,
    given by the p-values, and m_0 is an estimate of the number of true
    hypotheses. (See Benjamini, Krieger and Yekutieli.)

    The two-stage method of Benjamini, Krieger and Yekutieli, which estimates
    the number of false hypotheses, is available as
    ``fdrcorrection_twostage``.

    Both methods exposed via this function (Benjamini/Hochberg and
    Benjamini/Yekutieli) are also available in the function ``multipletests``,
    as ``method="fdr_bh"`` and ``method="fdr_by"``, respectively.

    See also
    --------
    multipletests

    '''
    pvals = np.asarray(pvals)
    assert pvals.ndim == 1, "pvals must be 1-dimensional, that is of shape (n,)"

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        # Benjamini/Yekutieli correction factor c(m) = sum_{i=1}^{m} 1/i
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))  # corrected this
        ecdffactor = _ecdf(pvals_sorted) / cm
    ## elif method in ['n', 'negcorr']:
    ##     cm = np.sum(np.arange(len(pvals)))
    ##     ecdffactor = ecdf(pvals_sorted)/cm
    else:
        raise ValueError('only indep and negcorr implemented')
    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True

    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected
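

# --- Usage sketch (illustrative; made-up p-values) ---
# Compares Benjamini/Hochberg ('indep') with the more conservative
# Benjamini/Yekutieli ('negcorr') correction on the same inputs. The helper
# name `_example_fdrcorrection` is hypothetical.
def _example_fdrcorrection():
    pvals = np.array([0.002, 0.01, 0.02, 0.04, 0.3])
    rej_bh, p_bh = fdrcorrection(pvals, alpha=0.05, method='indep')
    rej_by, p_by = fdrcorrection(pvals, alpha=0.05, method='negcorr')
    # p_by >= p_bh elementwise, since fdr_by divides the ecdf factor by
    # the extra c(m) = sum(1/i) constant
    return (rej_bh, p_bh), (rej_by, p_by)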


def fdrcorrection_twostage(pvals, alpha=0.05, method='bky',
                           maxiter=1,
                           iter=None,
                           is_sorted=False):
    '''(iterated) two stage linear step-up procedure with estimation of the
    number of true hypotheses

    Benjamini, Krieger and Yekutieli, procedure in Definition 6

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'bky', 'bh'}
        see Notes for details

        * 'bky' - implements the procedure in Definition 6 of Benjamini,
          Krieger and Yekutieli 2006
        * 'bh' - the two stage method of Benjamini and Hochberg

    maxiter : int or bool
        Maximum number of iterations.
        maxiter=1 (default) corresponds to the two stage method.
        maxiter=-1 corresponds to full iteration, i.e. maxiter=len(pvals).
        maxiter=0 uses only a single stage fdr correction using a 'bh' or
        'bky' prior fraction of assumed true hypotheses.
        Boolean maxiter is allowed for backwards compatibility with the
        deprecated ``iter`` keyword:
        maxiter=False is two-stage fdr (maxiter=1),
        maxiter=True is full iteration (maxiter=-1 or maxiter=len(pvals)).

        .. versionadded:: 0.14

            Replacement for ``iter`` with additional features.

    iter : bool
        ``iter`` is deprecated, use ``maxiter`` instead.
        If iter is False, then only one iteration step is used; this is the
        two-stage method.
        If iter is True, then iterations are stopped at convergence, which
        occurs in a finite number of steps (at most len(pvals) steps).

        .. deprecated:: 0.14

            Use ``maxiter`` instead of ``iter``.

    Returns
    -------
    rejected : ndarray, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : ndarray
        pvalues adjusted for multiple hypotheses testing to limit FDR
    m0 : int
        ntests - ri, estimated number of true (not rejected) hypotheses
    alpha_stages : list of floats
        A list of alphas that have been used at each stage

    Notes
    -----
    The returned corrected p-values are specific to the given alpha; they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (fdrcorrection0 with method='indep') corrected
    for the estimated fraction of true hypotheses.
    This means that the rejection decision can be obtained with
    ``pval_corrected <= alpha``, where ``alpha`` is the original significance
    level.
    (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement. However, in their simulation the simple two-stage method (with
    iter=False) was the most robust to the presence of positive correlation.

    TODO: What should be returned?

    '''
    pvals = np.asarray(pvals)

    if iter is not None:
        import warnings
        msg = "iter keyword is deprecated, use maxiter keyword instead."
        warnings.warn(msg, FutureWarning)

    if iter is False:
        maxiter = 1
    elif iter is True or maxiter in [-1, None]:
        maxiter = len(pvals)
    # otherwise we use maxiter

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        # nothing to iterate on: none or all hypotheses rejected
        reject = rej
        pvalscorr *= fact
        ri = r1
    else:
        ri_old = ri = r1
        ntests0 = ntests  # needed if maxiter=0

        for it in range(maxiter):
            ntests0 = 1.0 * ntests - ri_old
            alpha_star = alpha_prime * ntests / ntests0
            alpha_stages.append(alpha_star)
            rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star,
                                           method='indep', is_sorted=True)
            ri = rej.sum()
            if (it >= maxiter - 1) or ri == ri_old:
                break
            elif ri < ri_old:
                # prevent cycles and endless loops
                raise RuntimeError("the number of rejections decreased "
                                   "between iterations; this should not "
                                   "happen")
            ri_old = ri

        # adjust pvalscorr to reflect the estimated number of non-null cases;
        # the rejection decision is then pvalscorr <= alpha
        pvalscorr *= ntests0 * 1.0 / ntests
        if method == 'bky':
            pvalscorr *= (1. + alpha)

    pvalscorr[pvalscorr > 1] = 1
    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages
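

# --- Usage sketch (illustrative; made-up p-values) ---
# The two-stage procedure first estimates the number of true nulls and then
# reruns the BH step-up with a less conservative alpha. Note that the
# returned corrected p-values are tied to the alpha used in the call. The
# helper name `_example_fdrcorrection_twostage` is hypothetical.
def _example_fdrcorrection_twostage():
    pvals = np.array([0.001, 0.004, 0.02, 0.03, 0.2, 0.5])
    reject, p_corr, m0, alpha_stages = fdrcorrection_twostage(
        pvals, alpha=0.05, method='bky')
    # m0 is the estimated number of true (not rejected) hypotheses;
    # alpha_stages records the alpha used at each stage
    return reject, p_corr, m0, alpha_stages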


def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30, alpha=0):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : int
        The number of bins for estimating the marginal density
        of Z-scores.
    alpha : float
        Use Poisson ridge regression with parameter alpha to estimate
        the density of non-null Z-scores.

    Returns
    -------
    fdr : array_like
        A vector of FDR values

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> from statsmodels.stats.multitest import NullDistribution
    >>> null = NullDistribution(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Rescale the design matrix
    sd = dmat.std(0)
    ii = sd > 1e-8
    dmat[:, ii] /= sd[ii]

    start = OLS(np.log(1 + zhist), dmat).fit().params

    # Poisson regression
    if alpha > 0:
        md = GLM(zhist, dmat, family=families.Poisson()).fit_regularized(
            L1_wt=0, alpha=alpha, start_params=start)
    else:
        md = GLM(zhist, dmat, family=families.Poisson()).fit(
            start_params=start)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)
    dmat_full[:, ii] /= sd[ii]

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr
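

# --- Usage sketch (illustrative; simulated data) ---
# Mixes null Z-scores with a shifted non-null block, then thresholds the
# local FDR estimates; the threshold 0.2 and the helper name
# `_example_local_fdr` are arbitrary choices for illustration.
def _example_local_fdr():
    rng = np.random.RandomState(1234)
    zscores = np.concatenate([rng.randn(900),         # true nulls
                              rng.randn(100) + 3.0])  # non-nulls, shifted
    fdr = local_fdr(zscores)
    flagged = fdr < 0.2  # declare non-null where local FDR is small
    return fdr, flagged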


class NullDistribution:
    """
    Estimate a Gaussian distribution for the null Z-scores.

    The observed Z-scores consist of both null and non-null values.
    The fitted distribution of null Z-scores is Gaussian, but may have
    non-zero mean and/or non-unit scale.

    Parameters
    ----------
    zscores : array_like
        The observed Z-scores.
    null_lb : float
        Z-scores between `null_lb` and `null_ub` are all considered to be
        true null hypotheses.
    null_ub : float
        See `null_lb`.
    estimate_mean : bool
        If True, estimate the mean of the distribution. If False, the
        mean is fixed at zero.
    estimate_scale : bool
        If True, estimate the scale of the distribution. If False, the
        scale parameter is fixed at 1.
    estimate_null_proportion : bool
        If True, estimate the proportion of true null hypotheses (i.e.
        the proportion of Z-scores with expected value zero). If False,
        this parameter is fixed at 1.

    Attributes
    ----------
    mean : float
        The estimated mean of the empirical null distribution
    sd : float
        The estimated standard deviation of the empirical null distribution
    null_proportion : float
        The estimated proportion of true null hypotheses among all hypotheses

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Notes
    -----
    See also:

    http://nipy.org/nipy/labs/enn.html#nipy.algorithms.statistics.empirical_pvalue.NormalEmpiricalNull.fdr
    """

    def __init__(self, zscores, null_lb=-1, null_ub=1, estimate_mean=True,
                 estimate_scale=True, estimate_null_proportion=False):

        # Extract the null Z-scores
        ii = np.flatnonzero((zscores >= null_lb) & (zscores <= null_ub))
        if len(ii) == 0:
            raise RuntimeError("No Z-scores fall between null_lb and null_ub")
        zscores0 = zscores[ii]

        # Number of Z-scores, and null Z-scores
        n_zs, n_zs0 = len(zscores), len(zscores0)

        # Unpack and transform the parameters to the natural scale, hold
        # parameters fixed as specified.
        def xform(params):

            mean = 0.
            sd = 1.
            prob = 1.

            ii = 0
            if estimate_mean:
                mean = params[ii]
                ii += 1
            if estimate_scale:
                sd = np.exp(params[ii])
                ii += 1
            if estimate_null_proportion:
                prob = 1 / (1 + np.exp(-params[ii]))

            return mean, sd, prob

        from scipy.stats.distributions import norm

        def fun(params):
            """
            Negative log-likelihood of z-scores.

            The function has up to three arguments, packed into a vector:

            mean : location parameter
            logscale : log of the scale parameter
            logitprop : logit of the proportion of true nulls

            The implementation follows section 4 from Efron 2008.
            """

            d, s, p = xform(params)

            # Mass within the central region
            central_mass = (norm.cdf((null_ub - d) / s) -
                            norm.cdf((null_lb - d) / s))

            # Probability that a Z-score is null and is in the central region
            cp = p * central_mass

            # Binomial term
            rval = n_zs0 * np.log(cp) + (n_zs - n_zs0) * np.log(1 - cp)

            # Truncated Gaussian term for null Z-scores
            zv = (zscores0 - d) / s
            rval += np.sum(-zv**2 / 2) - n_zs0 * np.log(s)
            rval -= n_zs0 * np.log(central_mass)

            return -rval

        # Estimate the parameters
        from scipy.optimize import minimize
        # starting values are mean = 0, scale = 1, p0 ~ 1
        mz = minimize(fun, np.r_[0., 0, 3], method="Nelder-Mead")
        mean, sd, prob = xform(mz['x'])

        self.mean = mean
        self.sd = sd
        self.null_proportion = prob

    # The fitted null density function
    def pdf(self, zscores):
        """
        Evaluates the fitted empirical null Z-score density.

        Parameters
        ----------
        zscores : scalar or array_like
            The point or points at which the density is to be
            evaluated.

        Returns
        -------
        The empirical null Z-score density evaluated at the given
        points.
        """

        zval = (zscores - self.mean) / self.sd
        return np.exp(-0.5 * zval**2 - np.log(self.sd)
                      - 0.5 * np.log(2 * np.pi))
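

# --- Usage sketch (illustrative; simulated data) ---
# Fits the empirical null on simulated Z-scores with a mild location shift,
# then feeds the fitted density into local_fdr, as suggested in the
# local_fdr docstring. The helper name `_example_null_distribution` is
# hypothetical.
def _example_null_distribution():
    rng = np.random.RandomState(0)
    zscores = np.concatenate([0.2 + rng.randn(900),   # shifted null block
                              rng.randn(100) + 3.0])  # non-nulls
    null = NullDistribution(zscores)
    # null.mean and null.sd describe the fitted Gaussian null
    fdr = local_fdr(zscores, null_pdf=null.pdf)
    return null.mean, null.sd, fdr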