some new features
This commit is contained in:
@ -0,0 +1,817 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from scipy.stats.distributions import chi2, norm
|
||||
from statsmodels.graphics import utils
|
||||
|
||||
|
||||
def _calc_survfunc_right(time, status, weights=None, entry=None, compress=True,
|
||||
retall=True):
|
||||
"""
|
||||
Calculate the survival function and its standard error for a single
|
||||
group.
|
||||
"""
|
||||
|
||||
# Convert the unique times to ranks (0, 1, 2, ...)
|
||||
if entry is None:
|
||||
utime, rtime = np.unique(time, return_inverse=True)
|
||||
else:
|
||||
tx = np.concatenate((time, entry))
|
||||
utime, rtime = np.unique(tx, return_inverse=True)
|
||||
rtime = rtime[0:len(time)]
|
||||
|
||||
# Number of deaths at each unique time.
|
||||
ml = len(utime)
|
||||
if weights is None:
|
||||
d = np.bincount(rtime, weights=status, minlength=ml)
|
||||
else:
|
||||
d = np.bincount(rtime, weights=status*weights, minlength=ml)
|
||||
|
||||
# Size of risk set just prior to each event time.
|
||||
if weights is None:
|
||||
n = np.bincount(rtime, minlength=ml)
|
||||
else:
|
||||
n = np.bincount(rtime, weights=weights, minlength=ml)
|
||||
if entry is not None:
|
||||
n = np.cumsum(n) - n
|
||||
rentry = np.searchsorted(utime, entry, side='left')
|
||||
if weights is None:
|
||||
n0 = np.bincount(rentry, minlength=ml)
|
||||
else:
|
||||
n0 = np.bincount(rentry, weights=weights, minlength=ml)
|
||||
n0 = np.cumsum(n0) - n0
|
||||
n = n0 - n
|
||||
else:
|
||||
n = np.cumsum(n[::-1])[::-1]
|
||||
|
||||
# Only retain times where an event occurred.
|
||||
if compress:
|
||||
ii = np.flatnonzero(d > 0)
|
||||
d = d[ii]
|
||||
n = n[ii]
|
||||
utime = utime[ii]
|
||||
|
||||
# The survival function probabilities.
|
||||
sp = 1 - d / n.astype(np.float64)
|
||||
ii = sp < 1e-16
|
||||
sp[ii] = 1e-16
|
||||
sp = np.log(sp)
|
||||
sp = np.cumsum(sp)
|
||||
sp = np.exp(sp)
|
||||
sp[ii] = 0
|
||||
|
||||
if not retall:
|
||||
return sp, utime, rtime, n, d
|
||||
|
||||
# Standard errors
|
||||
if weights is None:
|
||||
# Greenwood's formula
|
||||
denom = n * (n - d)
|
||||
denom = np.clip(denom, 1e-12, np.inf)
|
||||
se = d / denom.astype(np.float64)
|
||||
se[(n == d) | (n == 0)] = np.nan
|
||||
se = np.cumsum(se)
|
||||
se = np.sqrt(se)
|
||||
locs = np.isfinite(se) | (sp != 0)
|
||||
se[locs] *= sp[locs]
|
||||
se[~locs] = np.nan
|
||||
else:
|
||||
# Tsiatis' (1981) formula
|
||||
se = d / (n * n).astype(np.float64)
|
||||
se = np.cumsum(se)
|
||||
se = np.sqrt(se)
|
||||
|
||||
return sp, se, utime, rtime, n, d
|
||||
|
||||
|
||||
def _calc_incidence_right(time, status, weights=None):
|
||||
"""
|
||||
Calculate the cumulative incidence function and its standard error.
|
||||
"""
|
||||
|
||||
# Calculate the all-cause survival function.
|
||||
status0 = (status >= 1).astype(np.float64)
|
||||
sp, utime, rtime, n, d = _calc_survfunc_right(time, status0, weights,
|
||||
compress=False, retall=False)
|
||||
|
||||
ngrp = int(status.max())
|
||||
|
||||
# Number of cause-specific deaths at each unique time.
|
||||
d = []
|
||||
for k in range(ngrp):
|
||||
status0 = (status == k + 1).astype(np.float64)
|
||||
if weights is None:
|
||||
d0 = np.bincount(rtime, weights=status0, minlength=len(utime))
|
||||
else:
|
||||
d0 = np.bincount(rtime, weights=status0*weights,
|
||||
minlength=len(utime))
|
||||
d.append(d0)
|
||||
|
||||
# The cumulative incidence function probabilities.
|
||||
ip = []
|
||||
sp0 = np.r_[1, sp[:-1]] / n
|
||||
for k in range(ngrp):
|
||||
ip0 = np.cumsum(sp0 * d[k])
|
||||
ip.append(ip0)
|
||||
|
||||
# The standard error of the cumulative incidence function.
|
||||
if weights is not None:
|
||||
return ip, None, utime
|
||||
se = []
|
||||
da = sum(d)
|
||||
for k in range(ngrp):
|
||||
|
||||
ra = da / (n * (n - da))
|
||||
v = ip[k]**2 * np.cumsum(ra)
|
||||
v -= 2 * ip[k] * np.cumsum(ip[k] * ra)
|
||||
v += np.cumsum(ip[k]**2 * ra)
|
||||
|
||||
ra = (n - d[k]) * d[k] / n
|
||||
v += np.cumsum(sp0**2 * ra)
|
||||
|
||||
ra = sp0 * d[k] / n
|
||||
v -= 2 * ip[k] * np.cumsum(ra)
|
||||
v += 2 * np.cumsum(ip[k] * ra)
|
||||
|
||||
se.append(np.sqrt(v))
|
||||
|
||||
return ip, se, utime
|
||||
|
||||
|
||||
def _checkargs(time, status, entry, freq_weights, exog):
|
||||
|
||||
if len(time) != len(status):
|
||||
raise ValueError("time and status must have the same length")
|
||||
|
||||
if entry is not None and (len(entry) != len(time)):
|
||||
msg = "entry times and event times must have the same length"
|
||||
raise ValueError(msg)
|
||||
|
||||
if entry is not None and np.any(entry >= time):
|
||||
msg = "Entry times must not occur on or after event times"
|
||||
raise ValueError(msg)
|
||||
|
||||
if freq_weights is not None and (len(freq_weights) != len(time)):
|
||||
raise ValueError("weights, time and status must have the same length")
|
||||
|
||||
if exog is not None and (exog.shape[0] != len(time)):
|
||||
raise ValueError("the rows of exog should align with time")
|
||||
|
||||
|
||||
class CumIncidenceRight:
|
||||
"""
|
||||
Estimation and inference for a cumulative incidence function.
|
||||
|
||||
If J = 1, 2, ... indicates the event type, the cumulative
|
||||
incidence function for cause j is:
|
||||
|
||||
I(t, j) = P(T <= t and J=j)
|
||||
|
||||
Only right censoring is supported. If frequency weights are provided,
|
||||
the point estimate is returned without a standard error.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
time : array_like
|
||||
An array of times (censoring times or event times)
|
||||
status : array_like
|
||||
If status >= 1 indicates which event occurred at time t. If
|
||||
status = 0, the subject was censored at time t.
|
||||
title : str
|
||||
Optional title used for plots and summary output.
|
||||
freq_weights : array_like
|
||||
Optional frequency weights
|
||||
exog : array_like
|
||||
Optional, if present used to account for violation of
|
||||
independent censoring.
|
||||
bw_factor : float
|
||||
Band-width multiplier for kernel-based estimation. Only
|
||||
used if exog is provided.
|
||||
dimred : bool
|
||||
If True, proportional hazards regression models are used to
|
||||
reduce exog to two columns by predicting overall events and
|
||||
censoring in two separate models. If False, exog is used
|
||||
directly for calculating kernel weights without dimension
|
||||
reduction.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
times : array_like
|
||||
The distinct times at which the incidence rates are estimated
|
||||
cinc : list of arrays
|
||||
cinc[k-1] contains the estimated cumulative incidence rates
|
||||
for outcome k=1,2,...
|
||||
cinc_se : list of arrays
|
||||
The standard errors for the values in `cinc`. Not available when
|
||||
exog and/or frequency weights are provided.
|
||||
|
||||
Notes
|
||||
-----
|
||||
When exog is provided, a local estimate of the cumulative incidence
|
||||
rate around each point is provided, and these are averaged to
|
||||
produce an estimate of the marginal cumulative incidence
|
||||
functions. The procedure is analogous to that described in Zeng
|
||||
(2004) for estimation of the marginal survival function. The
|
||||
approach removes bias resulting from dependent censoring when the
|
||||
censoring becomes independent conditioned on the columns of exog.
|
||||
|
||||
References
|
||||
----------
|
||||
The Stata stcompet procedure:
|
||||
http://www.stata-journal.com/sjpdf.html?articlenum=st0059
|
||||
|
||||
Dinse, G. E. and M. G. Larson. 1986. A note on semi-Markov models
|
||||
for partially censored data. Biometrika 73: 379-386.
|
||||
|
||||
Marubini, E. and M. G. Valsecchi. 1995. Analysing Survival Data
|
||||
from Clinical Trials and Observational Studies. Chichester, UK:
|
||||
John Wiley & Sons.
|
||||
|
||||
D. Zeng (2004). Estimating marginal survival function by
|
||||
adjusting for dependent censoring using many covariates. Annals
|
||||
of Statistics 32:4.
|
||||
https://arxiv.org/pdf/math/0409180.pdf
|
||||
"""
|
||||
|
||||
def __init__(self, time, status, title=None, freq_weights=None,
|
||||
exog=None, bw_factor=1., dimred=True):
|
||||
|
||||
_checkargs(time, status, None, freq_weights, None)
|
||||
time = self.time = np.asarray(time)
|
||||
status = self.status = np.asarray(status)
|
||||
if freq_weights is not None:
|
||||
freq_weights = self.freq_weights = np.asarray(freq_weights)
|
||||
|
||||
if exog is not None:
|
||||
from ._kernel_estimates import _kernel_cumincidence
|
||||
exog = self.exog = np.asarray(exog)
|
||||
nobs = exog.shape[0]
|
||||
kw = nobs**(-1/3.0) * bw_factor
|
||||
kfunc = lambda x: np.exp(-x**2 / kw**2).sum(1)
|
||||
x = _kernel_cumincidence(time, status, exog, kfunc, freq_weights,
|
||||
dimred)
|
||||
self.times = x[0]
|
||||
self.cinc = x[1]
|
||||
return
|
||||
|
||||
x = _calc_incidence_right(time, status, freq_weights)
|
||||
self.cinc = x[0]
|
||||
self.cinc_se = x[1]
|
||||
self.times = x[2]
|
||||
self.title = "" if not title else title
|
||||
|
||||
|
||||
class SurvfuncRight:
|
||||
"""
|
||||
Estimation and inference for a survival function.
|
||||
|
||||
The survival function S(t) = P(T > t) is the probability that an
|
||||
event time T is greater than t.
|
||||
|
||||
This class currently only supports right censoring.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
time : array_like
|
||||
An array of times (censoring times or event times)
|
||||
status : array_like
|
||||
Status at the event time, status==1 is the 'event'
|
||||
(e.g. death, failure), meaning that the event
|
||||
occurs at the given value in `time`; status==0
|
||||
indicates that censoring has occurred, meaning that
|
||||
the event occurs after the given value in `time`.
|
||||
entry : array_like, optional An array of entry times for handling
|
||||
left truncation (the subject is not in the risk set on or
|
||||
before the entry time)
|
||||
title : str
|
||||
Optional title used for plots and summary output.
|
||||
freq_weights : array_like
|
||||
Optional frequency weights
|
||||
exog : array_like
|
||||
Optional, if present used to account for violation of
|
||||
independent censoring.
|
||||
bw_factor : float
|
||||
Band-width multiplier for kernel-based estimation. Only used
|
||||
if exog is provided.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
surv_prob : array_like
|
||||
The estimated value of the survivor function at each time
|
||||
point in `surv_times`.
|
||||
surv_prob_se : array_like
|
||||
The standard errors for the values in `surv_prob`. Not available
|
||||
if exog is provided.
|
||||
surv_times : array_like
|
||||
The points where the survival function changes.
|
||||
n_risk : array_like
|
||||
The number of subjects at risk just before each time value in
|
||||
`surv_times`. Not available if exog is provided.
|
||||
n_events : array_like
|
||||
The number of events (e.g. deaths) that occur at each point
|
||||
in `surv_times`. Not available if exog is provided.
|
||||
|
||||
Notes
|
||||
-----
|
||||
If exog is None, the standard Kaplan-Meier estimator is used. If
|
||||
exog is not None, a local estimate of the marginal survival
|
||||
function around each point is constructed, and these are then
|
||||
averaged. This procedure gives an estimate of the marginal
|
||||
survival function that accounts for dependent censoring as long as
|
||||
the censoring becomes independent when conditioning on the
|
||||
covariates in exog. See Zeng et al. (2004) for details.
|
||||
|
||||
References
|
||||
----------
|
||||
D. Zeng (2004). Estimating marginal survival function by
|
||||
adjusting for dependent censoring using many covariates. Annals
|
||||
of Statistics 32:4.
|
||||
https://arxiv.org/pdf/math/0409180.pdf
|
||||
"""
|
||||
|
||||
def __init__(self, time, status, entry=None, title=None,
|
||||
freq_weights=None, exog=None, bw_factor=1.):
|
||||
|
||||
_checkargs(time, status, entry, freq_weights, exog)
|
||||
time = self.time = np.asarray(time)
|
||||
status = self.status = np.asarray(status)
|
||||
if freq_weights is not None:
|
||||
freq_weights = self.freq_weights = np.asarray(freq_weights)
|
||||
|
||||
if entry is not None:
|
||||
entry = self.entry = np.asarray(entry)
|
||||
|
||||
if exog is not None:
|
||||
if entry is not None:
|
||||
raise ValueError("exog and entry cannot both be present")
|
||||
from ._kernel_estimates import _kernel_survfunc
|
||||
exog = self.exog = np.asarray(exog)
|
||||
nobs = exog.shape[0]
|
||||
kw = nobs**(-1/3.0) * bw_factor
|
||||
kfunc = lambda x: np.exp(-x**2 / kw**2).sum(1)
|
||||
x = _kernel_survfunc(time, status, exog, kfunc, freq_weights)
|
||||
self.surv_prob = x[0]
|
||||
self.surv_times = x[1]
|
||||
return
|
||||
|
||||
x = _calc_survfunc_right(time, status, weights=freq_weights,
|
||||
entry=entry)
|
||||
|
||||
self.surv_prob = x[0]
|
||||
self.surv_prob_se = x[1]
|
||||
self.surv_times = x[2]
|
||||
self.n_risk = x[4]
|
||||
self.n_events = x[5]
|
||||
self.title = "" if not title else title
|
||||
|
||||
def plot(self, ax=None):
|
||||
"""
|
||||
Plot the survival function.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Change the line color:
|
||||
|
||||
>>> import statsmodels.api as sm
|
||||
>>> data = sm.datasets.get_rdataset("flchain", "survival").data
|
||||
>>> df = data.loc[data.sex == "F", :]
|
||||
>>> sf = sm.SurvfuncRight(df["futime"], df["death"])
|
||||
>>> fig = sf.plot()
|
||||
>>> ax = fig.get_axes()[0]
|
||||
>>> li = ax.get_lines()
|
||||
>>> li[0].set_color('purple')
|
||||
>>> li[1].set_color('purple')
|
||||
|
||||
Do not show the censoring points:
|
||||
|
||||
>>> fig = sf.plot()
|
||||
>>> ax = fig.get_axes()[0]
|
||||
>>> li = ax.get_lines()
|
||||
>>> li[1].set_visible(False)
|
||||
"""
|
||||
|
||||
return plot_survfunc(self, ax)
|
||||
|
||||
def quantile(self, p):
|
||||
"""
|
||||
Estimated quantile of a survival distribution.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
p : float
|
||||
The probability point at which the quantile
|
||||
is determined.
|
||||
|
||||
Returns the estimated quantile.
|
||||
"""
|
||||
|
||||
# SAS uses a strict inequality here.
|
||||
ii = np.flatnonzero(self.surv_prob < 1 - p)
|
||||
|
||||
if len(ii) == 0:
|
||||
return np.nan
|
||||
|
||||
return self.surv_times[ii[0]]
|
||||
|
||||
def quantile_ci(self, p, alpha=0.05, method='cloglog'):
|
||||
"""
|
||||
Returns a confidence interval for a survival quantile.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
p : float
|
||||
The probability point for which a confidence interval is
|
||||
determined.
|
||||
alpha : float
|
||||
The confidence interval has nominal coverage probability
|
||||
1 - `alpha`.
|
||||
method : str
|
||||
Function to use for g-transformation, must be ...
|
||||
|
||||
Returns
|
||||
-------
|
||||
lb : float
|
||||
The lower confidence limit.
|
||||
ub : float
|
||||
The upper confidence limit.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The confidence interval is obtained by inverting Z-tests. The
|
||||
limits of the confidence interval will always be observed
|
||||
event times.
|
||||
|
||||
References
|
||||
----------
|
||||
The method is based on the approach used in SAS, documented here:
|
||||
|
||||
http://support.sas.com/documentation/cdl/en/statug/68162/HTML/default/viewer.htm#statug_lifetest_details03.htm
|
||||
"""
|
||||
|
||||
tr = norm.ppf(1 - alpha / 2)
|
||||
|
||||
method = method.lower()
|
||||
if method == "cloglog":
|
||||
g = lambda x: np.log(-np.log(x))
|
||||
gprime = lambda x: -1 / (x * np.log(x))
|
||||
elif method == "linear":
|
||||
g = lambda x: x
|
||||
gprime = lambda x: 1
|
||||
elif method == "log":
|
||||
g = np.log
|
||||
gprime = lambda x: 1 / x
|
||||
elif method == "logit":
|
||||
g = lambda x: np.log(x / (1 - x))
|
||||
gprime = lambda x: 1 / (x * (1 - x))
|
||||
elif method == "asinsqrt":
|
||||
g = lambda x: np.arcsin(np.sqrt(x))
|
||||
gprime = lambda x: 1 / (2 * np.sqrt(x) * np.sqrt(1 - x))
|
||||
else:
|
||||
raise ValueError("unknown method")
|
||||
|
||||
r = g(self.surv_prob) - g(1 - p)
|
||||
r /= (gprime(self.surv_prob) * self.surv_prob_se)
|
||||
|
||||
ii = np.flatnonzero(np.abs(r) <= tr)
|
||||
if len(ii) == 0:
|
||||
return np.nan, np.nan
|
||||
|
||||
lb = self.surv_times[ii[0]]
|
||||
|
||||
if ii[-1] == len(self.surv_times) - 1:
|
||||
ub = np.inf
|
||||
else:
|
||||
ub = self.surv_times[ii[-1] + 1]
|
||||
|
||||
return lb, ub
|
||||
|
||||
def summary(self):
|
||||
"""
|
||||
Return a summary of the estimated survival function.
|
||||
|
||||
The summary is a dataframe containing the unique event times,
|
||||
estimated survival function values, and related quantities.
|
||||
"""
|
||||
|
||||
df = pd.DataFrame(index=self.surv_times)
|
||||
df.index.name = "Time"
|
||||
df["Surv prob"] = self.surv_prob
|
||||
df["Surv prob SE"] = self.surv_prob_se
|
||||
df["num at risk"] = self.n_risk
|
||||
df["num events"] = self.n_events
|
||||
|
||||
return df
|
||||
|
||||
def simultaneous_cb(self, alpha=0.05, method="hw", transform="log"):
|
||||
"""
|
||||
Returns a simultaneous confidence band for the survival function.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
alpha : float
|
||||
`1 - alpha` is the desired simultaneous coverage
|
||||
probability for the confidence region. Currently alpha
|
||||
must be set to 0.05, giving 95% simultaneous intervals.
|
||||
method : str
|
||||
The method used to produce the simultaneous confidence
|
||||
band. Only the Hall-Wellner (hw) method is currently
|
||||
implemented.
|
||||
transform : str
|
||||
The used to produce the interval (note that the returned
|
||||
interval is on the survival probability scale regardless
|
||||
of which transform is used). Only `log` and `arcsin` are
|
||||
implemented.
|
||||
|
||||
Returns
|
||||
-------
|
||||
lcb : array_like
|
||||
The lower confidence limits corresponding to the points
|
||||
in `surv_times`.
|
||||
ucb : array_like
|
||||
The upper confidence limits corresponding to the points
|
||||
in `surv_times`.
|
||||
"""
|
||||
|
||||
method = method.lower()
|
||||
if method != "hw":
|
||||
msg = "only the Hall-Wellner (hw) method is implemented"
|
||||
raise ValueError(msg)
|
||||
|
||||
if alpha != 0.05:
|
||||
raise ValueError("alpha must be set to 0.05")
|
||||
|
||||
transform = transform.lower()
|
||||
s2 = self.surv_prob_se**2 / self.surv_prob**2
|
||||
nn = self.n_risk
|
||||
if transform == "log":
|
||||
denom = np.sqrt(nn) * np.log(self.surv_prob)
|
||||
theta = 1.3581 * (1 + nn * s2) / denom
|
||||
theta = np.exp(theta)
|
||||
lcb = self.surv_prob**(1/theta)
|
||||
ucb = self.surv_prob**theta
|
||||
elif transform == "arcsin":
|
||||
k = 1.3581
|
||||
k *= (1 + nn * s2) / (2 * np.sqrt(nn))
|
||||
k *= np.sqrt(self.surv_prob / (1 - self.surv_prob))
|
||||
f = np.arcsin(np.sqrt(self.surv_prob))
|
||||
v = np.clip(f - k, 0, np.inf)
|
||||
lcb = np.sin(v)**2
|
||||
v = np.clip(f + k, -np.inf, np.pi/2)
|
||||
ucb = np.sin(v)**2
|
||||
else:
|
||||
raise ValueError("Unknown transform")
|
||||
|
||||
return lcb, ucb
|
||||
|
||||
|
||||
def survdiff(time, status, group, weight_type=None, strata=None,
|
||||
entry=None, **kwargs):
|
||||
"""
|
||||
Test for the equality of two survival distributions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
time : array_like
|
||||
The event or censoring times.
|
||||
status : array_like
|
||||
The censoring status variable, status=1 indicates that the
|
||||
event occurred, status=0 indicates that the observation was
|
||||
censored.
|
||||
group : array_like
|
||||
Indicators of the two groups
|
||||
weight_type : str
|
||||
The following weight types are implemented:
|
||||
None (default) : logrank test
|
||||
fh : Fleming-Harrington, weights by S^(fh_p),
|
||||
requires exponent fh_p to be provided as keyword
|
||||
argument; the weights are derived from S defined at
|
||||
the previous event time, and the first weight is
|
||||
always 1.
|
||||
gb : Gehan-Breslow, weights by the number at risk
|
||||
tw : Tarone-Ware, weights by the square root of the number
|
||||
at risk
|
||||
strata : array_like
|
||||
Optional stratum indicators for a stratified test
|
||||
entry : array_like
|
||||
Entry times to handle left truncation. The subject is not in
|
||||
the risk set on or before the entry time.
|
||||
|
||||
Returns
|
||||
-------
|
||||
chisq : The chi-square (1 degree of freedom) distributed test
|
||||
statistic value
|
||||
pvalue : The p-value for the chi^2 test
|
||||
"""
|
||||
|
||||
time = np.asarray(time)
|
||||
status = np.asarray(status)
|
||||
group = np.asarray(group)
|
||||
|
||||
gr = np.unique(group)
|
||||
|
||||
if strata is None:
|
||||
obs, var = _survdiff(time, status, group, weight_type, gr,
|
||||
entry, **kwargs)
|
||||
else:
|
||||
strata = np.asarray(strata)
|
||||
stu = np.unique(strata)
|
||||
obs, var = 0., 0.
|
||||
for st in stu:
|
||||
# could be more efficient?
|
||||
ii = (strata == st)
|
||||
obs1, var1 = _survdiff(time[ii], status[ii], group[ii],
|
||||
weight_type, gr, entry, **kwargs)
|
||||
obs += obs1
|
||||
var += var1
|
||||
|
||||
chisq = obs.dot(np.linalg.solve(var, obs)) # (O - E).T * V^(-1) * (O - E)
|
||||
pvalue = 1 - chi2.cdf(chisq, len(gr)-1)
|
||||
|
||||
return chisq, pvalue
|
||||
|
||||
|
||||
def _survdiff(time, status, group, weight_type, gr, entry=None,
|
||||
**kwargs):
|
||||
# logrank test for one stratum
|
||||
# calculations based on https://web.stanford.edu/~lutian/coursepdf/unit6.pdf
|
||||
# formula for variance better to take from https://web.stanford.edu/~lutian/coursepdf/survweek3.pdf
|
||||
|
||||
# Get the unique times.
|
||||
if entry is None:
|
||||
utimes, rtimes = np.unique(time, return_inverse=True)
|
||||
else:
|
||||
utimes, rtimes = np.unique(np.concatenate((time, entry)),
|
||||
return_inverse=True)
|
||||
rtimes = rtimes[0:len(time)]
|
||||
|
||||
# Split entry times by group if present (should use pandas groupby)
|
||||
tse = [(gr_i, None) for gr_i in gr]
|
||||
if entry is not None:
|
||||
for k, _ in enumerate(gr):
|
||||
ii = (group == gr[k])
|
||||
entry1 = entry[ii]
|
||||
tse[k] = (gr[k], entry1)
|
||||
|
||||
# Event count and risk set size at each time point, per group and overall.
|
||||
# TODO: should use Pandas groupby
|
||||
nrisk, obsv = [], []
|
||||
ml = len(utimes)
|
||||
for g, entry0 in tse:
|
||||
|
||||
mk = (group == g)
|
||||
n = np.bincount(rtimes, weights=mk, minlength=ml)
|
||||
|
||||
ob = np.bincount(rtimes, weights=status*mk, minlength=ml)
|
||||
obsv.append(ob)
|
||||
|
||||
if entry is not None:
|
||||
n = np.cumsum(n) - n
|
||||
rentry = np.searchsorted(utimes, entry0, side='left')
|
||||
n0 = np.bincount(rentry, minlength=ml)
|
||||
n0 = np.cumsum(n0) - n0
|
||||
nr = n0 - n
|
||||
else:
|
||||
nr = np.cumsum(n[::-1])[::-1]
|
||||
|
||||
nrisk.append(nr)
|
||||
|
||||
obs = sum(obsv)
|
||||
nrisk_tot = sum(nrisk)
|
||||
ix = np.flatnonzero(nrisk_tot > 1)
|
||||
|
||||
weights = None
|
||||
if weight_type is not None:
|
||||
weight_type = weight_type.lower()
|
||||
if weight_type == "gb":
|
||||
weights = nrisk_tot
|
||||
elif weight_type == "tw":
|
||||
weights = np.sqrt(nrisk_tot)
|
||||
elif weight_type == "fh":
|
||||
if "fh_p" not in kwargs:
|
||||
msg = "weight_type type 'fh' requires specification of fh_p"
|
||||
raise ValueError(msg)
|
||||
fh_p = kwargs["fh_p"]
|
||||
# Calculate the survivor function directly to avoid the
|
||||
# overhead of creating a SurvfuncRight object
|
||||
sp = 1 - obs / nrisk_tot.astype(np.float64)
|
||||
sp = np.log(sp)
|
||||
sp = np.cumsum(sp)
|
||||
sp = np.exp(sp)
|
||||
weights = sp**fh_p
|
||||
weights = np.roll(weights, 1)
|
||||
weights[0] = 1
|
||||
else:
|
||||
raise ValueError("weight_type not implemented")
|
||||
|
||||
dfs = len(gr) - 1
|
||||
r = np.vstack(nrisk) / np.clip(nrisk_tot, 1e-10, np.inf)[None, :] # each line is timeseries of r's. line per group
|
||||
|
||||
# The variance of event counts in each group.
|
||||
groups_oe = []
|
||||
groups_var = []
|
||||
|
||||
var_denom = nrisk_tot - 1
|
||||
var_denom = np.clip(var_denom, 1e-10, np.inf)
|
||||
|
||||
# use the first group as a reference
|
||||
for g in range(1, dfs+1):
|
||||
# Difference between observed and expected number of events in the group #g
|
||||
oe = obsv[g] - r[g]*obs
|
||||
|
||||
# build one row of the dfs x dfs variance matrix
|
||||
var_tensor_part = r[1:, :].T * (np.eye(1, dfs, g-1).ravel() - r[g, :, None]) # r*(1 - r) in multidim
|
||||
var_scalar_part = obs * (nrisk_tot - obs) / var_denom
|
||||
var = var_tensor_part * var_scalar_part[:, None]
|
||||
|
||||
if weights is not None:
|
||||
oe = weights * oe
|
||||
var = (weights**2)[:, None] * var
|
||||
|
||||
# sum over times and store
|
||||
groups_oe.append(oe[ix].sum())
|
||||
groups_var.append(var[ix].sum(axis=0))
|
||||
|
||||
obs_vec = np.hstack(groups_oe)
|
||||
var_mat = np.vstack(groups_var)
|
||||
|
||||
return obs_vec, var_mat
|
||||
|
||||
|
||||
def plot_survfunc(survfuncs, ax=None):
|
||||
"""
|
||||
Plot one or more survivor functions.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
survfuncs : object or array_like
|
||||
A single SurvfuncRight object, or a list or SurvfuncRight
|
||||
objects that are plotted together.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A figure instance on which the plot was drawn.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Add a legend:
|
||||
|
||||
>>> import statsmodels.api as sm
|
||||
>>> from statsmodels.duration.survfunc import plot_survfunc
|
||||
>>> data = sm.datasets.get_rdataset("flchain", "survival").data
|
||||
>>> df = data.loc[data.sex == "F", :]
|
||||
>>> sf0 = sm.SurvfuncRight(df["futime"], df["death"])
|
||||
>>> sf1 = sm.SurvfuncRight(3.0 * df["futime"], df["death"])
|
||||
>>> fig = plot_survfunc([sf0, sf1])
|
||||
>>> ax = fig.get_axes()[0]
|
||||
>>> ax.set_position([0.1, 0.1, 0.64, 0.8])
|
||||
>>> ha, lb = ax.get_legend_handles_labels()
|
||||
>>> leg = fig.legend((ha[0], ha[1]), (lb[0], lb[1]), loc='center right')
|
||||
|
||||
Change the line colors:
|
||||
|
||||
>>> fig = plot_survfunc([sf0, sf1])
|
||||
>>> ax = fig.get_axes()[0]
|
||||
>>> ax.set_position([0.1, 0.1, 0.64, 0.8])
|
||||
>>> ha, lb = ax.get_legend_handles_labels()
|
||||
>>> ha[0].set_color('purple')
|
||||
>>> ha[1].set_color('orange')
|
||||
"""
|
||||
|
||||
fig, ax = utils.create_mpl_ax(ax)
|
||||
|
||||
# If we have only a single survival function to plot, put it into
|
||||
# a list.
|
||||
try:
|
||||
assert type(survfuncs[0]) is SurvfuncRight
|
||||
except:
|
||||
survfuncs = [survfuncs]
|
||||
|
||||
for gx, sf in enumerate(survfuncs):
|
||||
|
||||
# The estimated survival function does not include a point at
|
||||
# time 0, include it here for plotting.
|
||||
surv_times = np.concatenate(([0], sf.surv_times))
|
||||
surv_prob = np.concatenate(([1], sf.surv_prob))
|
||||
|
||||
# If the final times are censoring times they are not included
|
||||
# in the survival function so we add them here
|
||||
mxt = max(sf.time)
|
||||
if mxt > surv_times[-1]:
|
||||
surv_times = np.concatenate((surv_times, [mxt]))
|
||||
surv_prob = np.concatenate((surv_prob, [surv_prob[-1]]))
|
||||
|
||||
label = getattr(sf, "title", "Group %d" % (gx + 1))
|
||||
|
||||
li, = ax.step(surv_times, surv_prob, '-', label=label, lw=2,
|
||||
where='post')
|
||||
|
||||
# Plot the censored points.
|
||||
ii = np.flatnonzero(np.logical_not(sf.status))
|
||||
ti = np.unique(sf.time[ii])
|
||||
jj = np.searchsorted(surv_times, ti) - 1
|
||||
sp = surv_prob[jj]
|
||||
ax.plot(ti, sp, '+', ms=12, color=li.get_color(),
|
||||
label=label + " points")
|
||||
|
||||
ax.set_ylim(0, 1.01)
|
||||
|
||||
return fig
|
||||
Reference in New Issue
Block a user