reconnect moved files to git repo
This commit is contained in:
@ -0,0 +1,195 @@
|
||||
'''using multivariate dependence and divergence measures
|
||||
|
||||
The standard correlation coefficient measures only linear dependence between
|
||||
random variables.
|
||||
kendall's tau measures any monotonic relationship also non-linear.
|
||||
|
||||
mutual information measures any kind of dependence, but does not distinguish
|
||||
between positive and negative relationship
|
||||
|
||||
|
||||
mutualinfo_kde and mutualinfo_binning follow Khan et al. 2007
|
||||
|
||||
Shiraj Khan, Sharba Bandyopadhyay, Auroop R. Ganguly, Sunil Saigal,
|
||||
David J. Erickson, III, Vladimir Protopopescu, and George Ostrouchov,
|
||||
Relative performance of mutual information estimation methods for
|
||||
quantifying the dependence among short and noisy data,
|
||||
Phys. Rev. E 76, 026209 (2007)
|
||||
http://pre.aps.org/abstract/PRE/v76/i2/e026209
|
||||
|
||||
|
||||
'''
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from scipy.stats import gaussian_kde
|
||||
|
||||
import statsmodels.sandbox.infotheo as infotheo
|
||||
|
||||
|
||||
def mutualinfo_kde(y, x, normed=True):
|
||||
'''mutual information of two random variables estimated with kde
|
||||
|
||||
'''
|
||||
nobs = len(x)
|
||||
if not len(y) == nobs:
|
||||
raise ValueError('both data arrays need to have the same size')
|
||||
x = np.asarray(x, float)
|
||||
y = np.asarray(y, float)
|
||||
yx = np.vstack((y,x))
|
||||
kde_x = gaussian_kde(x)(x)
|
||||
kde_y = gaussian_kde(y)(y)
|
||||
kde_yx = gaussian_kde(yx)(yx)
|
||||
|
||||
mi_obs = np.log(kde_yx) - np.log(kde_x) - np.log(kde_y)
|
||||
mi = mi_obs.sum() / nobs
|
||||
if normed:
|
||||
mi_normed = np.sqrt(1. - np.exp(-2 * mi))
|
||||
return mi_normed
|
||||
else:
|
||||
return mi
|
||||
|
||||
def mutualinfo_kde_2sample(y, x, normed=True):
|
||||
'''mutual information of two random variables estimated with kde
|
||||
|
||||
'''
|
||||
nobs = len(x)
|
||||
x = np.asarray(x, float)
|
||||
y = np.asarray(y, float)
|
||||
#yx = np.vstack((y,x))
|
||||
kde_x = gaussian_kde(x.T)(x.T)
|
||||
kde_y = gaussian_kde(y.T)(x.T)
|
||||
#kde_yx = gaussian_kde(yx)(yx)
|
||||
|
||||
mi_obs = np.log(kde_x) - np.log(kde_y)
|
||||
if len(mi_obs) != nobs:
|
||||
raise ValueError("Wrong number of observations")
|
||||
mi = mi_obs.mean()
|
||||
if normed:
|
||||
mi_normed = np.sqrt(1. - np.exp(-2 * mi))
|
||||
return mi_normed
|
||||
else:
|
||||
return mi
|
||||
|
||||
def mutualinfo_binned(y, x, bins, normed=True):
|
||||
'''mutual information of two random variables estimated with kde
|
||||
|
||||
|
||||
|
||||
Notes
|
||||
-----
|
||||
bins='auto' selects the number of bins so that approximately 5 observations
|
||||
are expected to be in each bin under the assumption of independence. This
|
||||
follows roughly the description in Kahn et al. 2007
|
||||
|
||||
'''
|
||||
nobs = len(x)
|
||||
if not len(y) == nobs:
|
||||
raise ValueError('both data arrays need to have the same size')
|
||||
x = np.asarray(x, float)
|
||||
y = np.asarray(y, float)
|
||||
#yx = np.vstack((y,x))
|
||||
|
||||
|
||||
## fyx, binsy, binsx = np.histogram2d(y, x, bins=bins)
|
||||
## fx, binsx_ = np.histogram(x, bins=binsx)
|
||||
## fy, binsy_ = np.histogram(y, bins=binsy)
|
||||
|
||||
if bins == 'auto':
|
||||
ys = np.sort(y)
|
||||
xs = np.sort(x)
|
||||
#quantiles = np.array([0,0.25, 0.4, 0.6, 0.75, 1])
|
||||
qbin_sqr = np.sqrt(5./nobs)
|
||||
quantiles = np.linspace(0, 1, 1./qbin_sqr)
|
||||
quantile_index = ((nobs-1)*quantiles).astype(int)
|
||||
#move edges so that they do not coincide with an observation
|
||||
shift = 1e-6 + np.ones(quantiles.shape)
|
||||
shift[0] -= 2*1e-6
|
||||
binsy = ys[quantile_index] + shift
|
||||
binsx = xs[quantile_index] + shift
|
||||
|
||||
elif np.size(bins) == 1:
|
||||
binsy = bins
|
||||
binsx = bins
|
||||
elif (len(bins) == 2):
|
||||
binsy, binsx = bins
|
||||
## if np.size(bins[0]) == 1:
|
||||
## binsx = bins[0]
|
||||
## if np.size(bins[1]) == 1:
|
||||
## binsx = bins[1]
|
||||
|
||||
fx, binsx = np.histogram(x, bins=binsx)
|
||||
fy, binsy = np.histogram(y, bins=binsy)
|
||||
fyx, binsy, binsx = np.histogram2d(y, x, bins=(binsy, binsx))
|
||||
|
||||
pyx = fyx * 1. / nobs
|
||||
px = fx * 1. / nobs
|
||||
py = fy * 1. / nobs
|
||||
|
||||
|
||||
mi_obs = pyx * (np.log(pyx+1e-10) - np.log(py)[:,None] - np.log(px))
|
||||
mi = mi_obs.sum()
|
||||
|
||||
if normed:
|
||||
mi_normed = np.sqrt(1. - np.exp(-2 * mi))
|
||||
return mi_normed, (pyx, py, px, binsy, binsx), mi_obs
|
||||
else:
|
||||
return mi
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import statsmodels.api as sm
|
||||
|
||||
funtype = ['linear', 'quadratic'][1]
|
||||
nobs = 200
|
||||
sig = 2#5.
|
||||
#x = np.linspace(-3, 3, nobs) + np.random.randn(nobs)
|
||||
x = np.sort(3*np.random.randn(nobs))
|
||||
exog = sm.add_constant(x, prepend=True)
|
||||
#y = 0 + np.log(1+x**2) + sig * np.random.randn(nobs)
|
||||
if funtype == 'quadratic':
|
||||
y = 0 + x**2 + sig * np.random.randn(nobs)
|
||||
if funtype == 'linear':
|
||||
y = 0 + x + sig * np.random.randn(nobs)
|
||||
|
||||
print('correlation')
|
||||
print(np.corrcoef(y,x)[0, 1])
|
||||
print('pearsonr', stats.pearsonr(y,x))
|
||||
print('spearmanr', stats.spearmanr(y,x))
|
||||
print('kendalltau', stats.kendalltau(y,x))
|
||||
|
||||
pxy, binsx, binsy = np.histogram2d(x,y, bins=5)
|
||||
px, binsx_ = np.histogram(x, bins=binsx)
|
||||
py, binsy_ = np.histogram(y, bins=binsy)
|
||||
print('mutualinfo', infotheo.mutualinfo(px*1./nobs, py*1./nobs,
|
||||
1e-15+pxy*1./nobs, logbase=np.e))
|
||||
|
||||
print('mutualinfo_kde normed', mutualinfo_kde(y,x))
|
||||
print('mutualinfo_kde ', mutualinfo_kde(y,x, normed=False))
|
||||
mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
|
||||
mutualinfo_binned(y, x, 5, normed=True)
|
||||
print('mutualinfo_binned normed', mi_normed)
|
||||
print('mutualinfo_binned ', mi_obs.sum())
|
||||
|
||||
mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
|
||||
mutualinfo_binned(y, x, 'auto', normed=True)
|
||||
print('auto')
|
||||
print('mutualinfo_binned normed', mi_normed)
|
||||
print('mutualinfo_binned ', mi_obs.sum())
|
||||
|
||||
ys = np.sort(y)
|
||||
xs = np.sort(x)
|
||||
by = ys[((nobs-1)*np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
|
||||
bx = xs[((nobs-1)*np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
|
||||
mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
|
||||
mutualinfo_binned(y, x, (by,bx), normed=True)
|
||||
print('quantiles')
|
||||
print('mutualinfo_binned normed', mi_normed)
|
||||
print('mutualinfo_binned ', mi_obs.sum())
|
||||
|
||||
doplot = 1#False
|
||||
if doplot:
|
||||
import matplotlib.pyplot as plt
|
||||
plt.plot(x, y, 'o')
|
||||
olsres = sm.OLS(y, exog).fit()
|
||||
plt.plot(x, olsres.fittedvalues)
|
||||
Reference in New Issue
Block a user