reconnect moved files to git repo
This commit is contained in:
@@ -0,0 +1,3 @@
from statsmodels.tools._test_runner import PytestTester

test = PytestTester()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,10 @@
__all__ = [
    "PCA", "MANOVA", "Factor", "FactorResults", "CanCorr",
    "factor_rotation"
]

from .pca import PCA
from .manova import MANOVA
from .factor import Factor, FactorResults
from .cancorr import CanCorr
from . import factor_rotation
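For orientation, the names re-exported above can be used directly from statsmodels.multivariate; a minimal sketch (synthetic data, variable names illustrative):

import numpy as np
from statsmodels.multivariate import PCA

X = np.random.default_rng(0).standard_normal((100, 5))
pc = PCA(X, ncomp=2)      # reduce 5 variables to 2 principal components
print(pc.factors.shape)   # (100, 2)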
@@ -0,0 +1,176 @@
"""Canonical correlation analysis

author: Yichuan Liu
"""
import numpy as np
from numpy.linalg import svd
import scipy
import pandas as pd

from statsmodels.base.model import Model
from statsmodels.iolib import summary2
from .multivariate_ols import multivariate_stats


class CanCorr(Model):
    """
    Canonical correlation analysis using singular value decomposition

    For matrices exog=x and endog=y, find projections x_cancoef and y_cancoef
    such that:

        x1 = x * x_cancoef, x1' * x1 is identity matrix
        y1 = y * y_cancoef, y1' * y1 is identity matrix

    and the correlation between x1 and y1 is maximized.

    Attributes
    ----------
    endog : ndarray
        See Parameters.
    exog : ndarray
        See Parameters.
    cancorr : ndarray
        The canonical correlation values
    y_cancoef : ndarray
        The canonical coefficients for endog
    x_cancoef : ndarray
        The canonical coefficients for exog

    References
    ----------
    .. [*] http://numerical.recipes/whp/notes/CanonCorrBySVD.pdf
    .. [*] http://www.csun.edu/~ata20315/psy524/docs/Psy524%20Lecture%208%20CC.pdf
    .. [*] http://www.mathematica-journal.com/2014/06/canonical-correlation-analysis/
    """  # noqa:E501
    def __init__(self, endog, exog, tolerance=1e-8, missing='none',
                 hasconst=None, **kwargs):
        super().__init__(endog, exog, missing=missing,
                         hasconst=hasconst, **kwargs)
        self._fit(tolerance)

    def _fit(self, tolerance=1e-8):
        """Fit the model

        A ValueError is raised if there are singular values smaller than the
        tolerance. The treatment of singular arrays might change in the
        future.

        Parameters
        ----------
        tolerance : float
            eigenvalue tolerance; values smaller than this are considered 0
        """
        nobs, k_yvar = self.endog.shape
        nobs, k_xvar = self.exog.shape
        k = np.min([k_yvar, k_xvar])

        x = np.array(self.exog)
        x = x - x.mean(0)
        y = np.array(self.endog)
        y = y - y.mean(0)

        ux, sx, vx = svd(x, 0)
        # vx_ds = vx.T divided by sx
        vx_ds = vx.T
        mask = sx > tolerance
        if mask.sum() < len(mask):
            raise ValueError('exog is collinear.')
        vx_ds[:, mask] /= sx[mask]
        uy, sy, vy = svd(y, 0)
        # vy_ds = vy.T divided by sy
        vy_ds = vy.T
        mask = sy > tolerance
        if mask.sum() < len(mask):
            raise ValueError('endog is collinear.')
        vy_ds[:, mask] /= sy[mask]
        u, s, v = svd(ux.T.dot(uy), 0)

        # Correct any roundoff
        self.cancorr = np.array([max(0, min(s[i], 1)) for i in range(len(s))])

        self.x_cancoef = vx_ds.dot(u[:, :k])
        self.y_cancoef = vy_ds.dot(v.T[:, :k])

    def corr_test(self):
        """Approximate F test
        Perform multivariate statistical tests of the hypothesis that
        there is no canonical correlation between endog and exog.
        For each canonical correlation, its significance is tested based on
        Wilks' lambda.

        Returns
        -------
        CanCorrTestResults instance
        """
        nobs, k_yvar = self.endog.shape
        nobs, k_xvar = self.exog.shape
        eigenvals = np.power(self.cancorr, 2)
        stats = pd.DataFrame(columns=['Canonical Correlation',
                                      "Wilks' lambda",
                                      'Num DF', 'Den DF',
                                      'F Value', 'Pr > F'],
                             index=list(range(len(eigenvals) - 1, -1, -1)))
        prod = 1
        for i in range(len(eigenvals) - 1, -1, -1):
            prod *= 1 - eigenvals[i]
            p = k_yvar - i
            q = k_xvar - i
            r = (nobs - k_yvar - 1) - (p - q + 1) / 2
            u = (p * q - 2) / 4
            df1 = p * q
            if p ** 2 + q ** 2 - 5 > 0:
                t = np.sqrt(((p * q) ** 2 - 4) / (p ** 2 + q ** 2 - 5))
            else:
                t = 1
            df2 = r * t - 2 * u
            lmd = np.power(prod, 1 / t)
            F = (1 - lmd) / lmd * df2 / df1
            stats.loc[i, 'Canonical Correlation'] = self.cancorr[i]
            stats.loc[i, "Wilks' lambda"] = prod
            stats.loc[i, 'Num DF'] = df1
            stats.loc[i, 'Den DF'] = df2
            stats.loc[i, 'F Value'] = F
            pval = scipy.stats.f.sf(F, df1, df2)
            stats.loc[i, 'Pr > F'] = pval
            '''
            # Wilk's Chi square test of each canonical correlation
            df = (p - i + 1) * (q - i + 1)
            chi2 = a * np.log(prod)
            pval = stats.chi2.sf(chi2, df)
            stats.loc[i, 'Canonical correlation'] = self.cancorr[i]
            stats.loc[i, 'Chi-square'] = chi2
            stats.loc[i, 'DF'] = df
            stats.loc[i, 'Pr > ChiSq'] = pval
            '''
        ind = stats.index.values[::-1]
        stats = stats.loc[ind, :]

        # Multivariate tests (remember x has mean removed)
        stats_mv = multivariate_stats(eigenvals,
                                      k_yvar, k_xvar, nobs - k_xvar - 1)
        return CanCorrTestResults(stats, stats_mv)


class CanCorrTestResults:
    """
    Canonical correlation results class

    Attributes
    ----------
    stats : DataFrame
        Contains statistical test results for each canonical correlation
    stats_mv : DataFrame
        Contains the multivariate statistical test results
    """
    def __init__(self, stats, stats_mv):
        self.stats = stats
        self.stats_mv = stats_mv

    def __str__(self):
        return self.summary().__str__()

    def summary(self):
        summ = summary2.Summary()
        summ.add_title('Cancorr results')
        summ.add_df(self.stats)
        summ.add_dict({'': ''})
        summ.add_dict({'Multivariate Statistics and F Approximations': ''})
        summ.add_df(self.stats_mv)
        return summ
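A quick sanity-check sketch of the CanCorr class above, on synthetic data (variable names illustrative):

import numpy as np
from statsmodels.multivariate.cancorr import CanCorr

rng = np.random.default_rng(0)
x = rng.standard_normal((200, 3))                       # exog
y = x @ rng.standard_normal((3, 2)) \
    + 0.1 * rng.standard_normal((200, 2))               # endog driven by x
cc = CanCorr(y, x)                                      # endog first, as in __init__
print(cc.cancorr)                                       # canonical correlations, near 1 here
print(cc.corr_test().summary())                         # approximate F tests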
1037	venv/lib/python3.11/site-packages/statsmodels/multivariate/factor.py	Normal file
File diff suppressed because it is too large
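The diff of factor.py is suppressed above; for orientation, a minimal sketch of the Factor/FactorResults API that the file provides (per the statsmodels documentation; data illustrative):

import numpy as np
from statsmodels.multivariate.factor import Factor

X = np.random.default_rng(1).standard_normal((100, 6))
res = Factor(X, n_factor=2).fit()   # extract two factors
print(res.loadings.shape)           # (6, 2) loading matrix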
@@ -0,0 +1,32 @@
"""
Package with factor rotation algorithms.

This file contains a Python version of the gradient projection rotation
algorithms (GPA) developed by Bernaards, C.A. and Jennrich, R.I.
The code is based on the Matlab version of the code developed by Bernaards,
C.A. and Jennrich, R.I. and is ported and made available with permission of
the authors.

Additionally, several analytic rotation methods are implemented.

References
----------
[1] Bernaards, C.A. and Jennrich, R.I. (2005) Gradient Projection Algorithms and Software for Arbitrary Rotation Criteria in Factor Analysis. Educational and Psychological Measurement, 65 (5), 676-696.

[2] Jennrich, R.I. (2001). A simple general procedure for orthogonal rotation. Psychometrika, 66, 289-306.

[3] Jennrich, R.I. (2002). A simple general method for oblique rotation. Psychometrika, 67, 7-19.

[4] http://www.stat.ucla.edu/research/gpa/matlab.net

[5] http://www.stat.ucla.edu/research/gpa/GPderfree.txt
"""
from ._wrappers import rotate_factors

from ._analytic_rotation import target_rotation, procrustes, promax
from statsmodels.tools._test_runner import PytestTester

__all__ = ['rotate_factors', 'target_rotation', 'procrustes', 'promax',
           'test']

test = PytestTester()
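The package entry point re-exported here can be exercised as follows; a minimal sketch (random loadings):

import numpy as np
from statsmodels.multivariate.factor_rotation import rotate_factors

A = np.random.default_rng(2).standard_normal((8, 2))   # unrotated loadings
L, T = rotate_factors(A, 'varimax')                    # orthogonal varimax
assert np.allclose(L, A.dot(T))                        # L = A T for orthogonal methods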
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,152 @@
"""
This file contains analytic implementations of rotation methods.
"""

import numpy as np
import scipy as sp


def target_rotation(A, H, full_rank=False):
    r"""
    Analytically performs orthogonal rotations towards a target matrix,
    i.e., we minimize:

    .. math::
        \phi(L) =\frac{1}{2}\|AT-H\|^2,

    where :math:`T` is an orthogonal matrix. This problem is also known as
    an orthogonal Procrustes problem.

    Under the assumption that :math:`A^*H` has full rank, the analytical
    solution :math:`T` is given by:

    .. math::
        T = (A^*HH^*A)^{-\frac{1}{2}}A^*H,

    see Green (1952). In other cases the solution is given by :math:`T = UV`,
    where :math:`U` and :math:`V` result from the singular value decomposition
    of :math:`A^*H`:

    .. math::
        A^*H = U\Sigma V,

    see Schonemann (1966).

    Parameters
    ----------
    A : numpy matrix
        non rotated factors
    H : numpy matrix
        target matrix
    full_rank : bool (default False)
        if set to true full rank is assumed

    Returns
    -------
    The matrix :math:`T`.

    References
    ----------
    [1] Green (1952, Psychometrika) - The orthogonal approximation of an
    oblique structure in factor analysis

    [2] Schonemann (1966) - A generalized solution of the orthogonal
    procrustes problem

    [3] Gower, Dijksterhuis (2004) - Procrustes problems
    """
    ATH = A.T.dot(H)
    if full_rank or np.linalg.matrix_rank(ATH) == A.shape[1]:
        T = sp.linalg.fractional_matrix_power(ATH.dot(ATH.T), -1/2).dot(ATH)
    else:
        U, D, V = np.linalg.svd(ATH, full_matrices=False)
        T = U.dot(V)
    return T


def procrustes(A, H):
    r"""
    Analytically solves the following Procrustes problem:

    .. math::
        \phi(L) =\frac{1}{2}\|AT-H\|^2.

    (With no further conditions on :math:`H`)

    Under the assumption that :math:`A^*H` has full rank, the analytical
    solution :math:`T` is given by:

    .. math::
        T = (A^*HH^*A)^{-\frac{1}{2}}A^*H,

    see Navarra, Simoncini (2010).

    Parameters
    ----------
    A : numpy matrix
        non rotated factors
    H : numpy matrix
        target matrix

    Returns
    -------
    The matrix :math:`T`.

    References
    ----------
    [1] Navarra, Simoncini (2010) - A guide to empirical orthogonal functions
    for climate data analysis
    """
    return np.linalg.inv(A.T.dot(A)).dot(A.T).dot(H)


def promax(A, k=2):
    r"""
    Performs promax rotation of the matrix :math:`A`.

    The promax method is not described unambiguously in the literature;
    this implementation reflects one common reading of it.

    Promax rotation is performed in the following steps:

    * Determine varimax rotated patterns :math:`V`.

    * Construct a rotation target matrix :math:`|V_{ij}|^k/V_{ij}`

    * Perform procrustes rotation towards the target to obtain :math:`T`

    * Determine the patterns

    First, a target matrix :math:`H` is determined via orthogonal varimax
    rotation.
    Then, oblique target rotation is performed towards the target.

    Parameters
    ----------
    A : numpy matrix
        non rotated factors
    k : float
        parameter, should be positive

    References
    ----------
    [1] Browne (2001) - An overview of analytic rotation in exploratory
    factor analysis

    [2] Navarra, Simoncini (2010) - A guide to empirical orthogonal functions
    for climate data analysis
    """
    assert k > 0
    # define rotation target using varimax rotation:
    from ._wrappers import rotate_factors
    V, T = rotate_factors(A, 'varimax')
    H = np.abs(V)**k/V
    # solve procrustes problem
    S = procrustes(A, H)  # np.linalg.inv(A.T.dot(A)).dot(A.T).dot(H);
    # normalize
    d = np.sqrt(np.diag(np.linalg.inv(S.T.dot(S))))
    D = np.diag(d)
    T = np.linalg.inv(S.dot(D)).T
    return A.dot(T), T
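A small sanity-check sketch for target_rotation as defined above (random inputs; the assertion illustrates the orthogonality of T):

import numpy as np
from statsmodels.multivariate.factor_rotation import target_rotation

rng = np.random.default_rng(3)
A = rng.standard_normal((8, 2))
H = A + 0.05 * rng.standard_normal((8, 2))          # target close to A
T = target_rotation(A, H)
assert np.allclose(T.T @ T, np.eye(2), atol=1e-8)   # T is orthogonal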
@@ -0,0 +1,592 @@
"""
This file contains a Python version of the gradient projection rotation
algorithms (GPA) developed by Bernaards, C.A. and Jennrich, R.I.
The code is based on code developed by Bernaards, C.A. and Jennrich, R.I.
and is ported and made available with permission of the authors.

References
----------
[1] Bernaards, C.A. and Jennrich, R.I. (2005) Gradient Projection Algorithms
and Software for Arbitrary Rotation Criteria in Factor Analysis. Educational
and Psychological Measurement, 65 (5), 676-696.

[2] Jennrich, R.I. (2001). A simple general procedure for orthogonal rotation.
Psychometrika, 66, 289-306.

[3] Jennrich, R.I. (2002). A simple general method for oblique rotation.
Psychometrika, 67, 7-19.

[4] http://www.stat.ucla.edu/research/gpa/matlab.net

[5] http://www.stat.ucla.edu/research/gpa/GPderfree.txt
"""

import numpy as np


def GPA(A, ff=None, vgQ=None, T=None, max_tries=501,
        rotation_method='orthogonal', tol=1e-5):
    r"""
    The gradient projection algorithm (GPA) minimizes a target function
    :math:`\phi(L)`, where :math:`L` is a matrix with rotated factors.

    For orthogonal rotation methods :math:`L=AT`, where :math:`T` is an
    orthogonal matrix. For oblique rotation matrices :math:`L=A(T^*)^{-1}`,
    where :math:`T` is a normal matrix, i.e., :math:`TT^*=T^*T`. Oblique
    rotations relax the orthogonality constraint in order to gain simplicity
    in the interpretation.

    Parameters
    ----------
    A : numpy matrix
        non rotated factors
    T : numpy matrix (default identity matrix)
        initial guess of rotation matrix
    ff : function (default None)
        criterion :math:`\phi` to optimize. Should have A, T, L as keyword
        arguments and map to a float. Only used (and required) if vgQ is not
        provided.
    vgQ : function (default None)
        criterion :math:`\phi` to optimize and its derivative. Should have
        A, T, L as keyword arguments and map to a tuple containing a
        float and a vector. Can be omitted if ff is provided.
    max_tries : int (default 501)
        maximum number of iterations
    rotation_method : str
        should be one of {orthogonal, oblique}
    tol : float
        stop criterion, algorithm stops if Frobenius norm of gradient is
        smaller than tol
    """
    # pre processing
    if rotation_method not in ['orthogonal', 'oblique']:
        raise ValueError('rotation_method should be one of '
                         '{orthogonal, oblique}')
    if vgQ is None:
        if ff is None:
            raise ValueError('ff should be provided if vgQ is not')
        derivative_free = True
        Gff = lambda x: Gf(x, lambda y: ff(T=y, A=A, L=None))
    else:
        derivative_free = False
    if T is None:
        T = np.eye(A.shape[1])
    # pre processing for iteration
    al = 1
    table = []
    # pre processing for iteration: initialize f and G
    if derivative_free:
        f = ff(T=T, A=A, L=None)
        G = Gff(T)
    elif rotation_method == 'orthogonal':  # and not derivative_free
        L = A.dot(T)
        f, Gq = vgQ(L=L)
        G = (A.T).dot(Gq)
    else:  # i.e. rotation_method == 'oblique' and not derivative_free
        Ti = np.linalg.inv(T)
        L = A.dot(Ti.T)
        f, Gq = vgQ(L=L)
        G = -((L.T).dot(Gq).dot(Ti)).T
    # iteration
    for i_try in range(0, max_tries):
        # determine Gp
        if rotation_method == 'orthogonal':
            M = (T.T).dot(G)
            S = (M + M.T)/2
            Gp = G - T.dot(S)
        else:  # i.e. if rotation_method == 'oblique':
            Gp = G-T.dot(np.diag(np.sum(T*G, axis=0)))
        s = np.linalg.norm(Gp, 'fro')
        table.append([i_try, f, np.log10(s), al])
        # if we are close stop
        if s < tol:
            break
        # update T
        al = 2*al
        for i in range(11):
            # determine Tt
            X = T - al*Gp
            if rotation_method == 'orthogonal':
                U, D, V = np.linalg.svd(X, full_matrices=False)
                Tt = U.dot(V)
            else:  # i.e. if rotation_method == 'oblique':
                v = 1/np.sqrt(np.sum(X**2, axis=0))
                Tt = X.dot(np.diag(v))
            # calculate objective using Tt
            if derivative_free:
                ft = ff(T=Tt, A=A, L=None)
            elif rotation_method == 'orthogonal':  # and not derivative_free
                L = A.dot(Tt)
                ft, Gq = vgQ(L=L)
            else:  # i.e. rotation_method == 'oblique' and not derivative_free
                Ti = np.linalg.inv(Tt)
                L = A.dot(Ti.T)
                ft, Gq = vgQ(L=L)
            # if sufficient improvement in objective -> use this T
            if ft < f-.5*s**2*al:
                break
            al = al/2
        # post processing for next iteration
        T = Tt
        f = ft
        if derivative_free:
            G = Gff(T)
        elif rotation_method == 'orthogonal':  # and not derivative_free
            G = (A.T).dot(Gq)
        else:  # i.e. rotation_method == 'oblique' and not derivative_free
            G = -((L.T).dot(Gq).dot(Ti)).T
    # post processing
    Th = T
    Lh = rotateA(A, T, rotation_method=rotation_method)
    Phi = (T.T).dot(T)
    return Lh, Phi, Th, table


def Gf(T, ff):
    """
    Subroutine for the gradient of f using numerical derivatives.
    """
    k = T.shape[0]
    ep = 1e-4
    G = np.zeros((k, k))
    for r in range(k):
        for s in range(k):
            dT = np.zeros((k, k))
            dT[r, s] = ep
            G[r, s] = (ff(T+dT)-ff(T-dT))/(2*ep)
    return G


def rotateA(A, T, rotation_method='orthogonal'):
    r"""
    For orthogonal rotation methods :math:`L=AT`, where :math:`T` is an
    orthogonal matrix. For oblique rotation matrices :math:`L=A(T^*)^{-1}`,
    where :math:`T` is a normal matrix, i.e., :math:`TT^*=T^*T`. Oblique
    rotations relax the orthogonality constraint in order to gain simplicity
    in the interpretation.
    """
    if rotation_method == 'orthogonal':
        L = A.dot(T)
    elif rotation_method == 'oblique':
        L = A.dot(np.linalg.inv(T.T))
    else:  # invalid rotation_method
        raise ValueError('rotation_method should be one of '
                         '{orthogonal, oblique}')
    return L


def oblimin_objective(L=None, A=None, T=None, gamma=0,
                      rotation_method='orthogonal',
                      return_gradient=True):
    r"""
    Objective function for the oblimin family for orthogonal or
    oblique rotation which minimizes:

    .. math::
        \phi(L) = \frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)N),

    where :math:`L` is a :math:`p\times k` matrix, :math:`N` is a
    :math:`k\times k`
    matrix with zeros on the diagonal and ones elsewhere, :math:`C` is a
    :math:`p\times p` matrix with elements equal to :math:`1/p`,
    :math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius norm and
    :math:`\circ`
    is the element-wise product or Hadamard product.

    The gradient is given by

    .. math::
        L\circ\left[(I-\gamma C) (L \circ L)N\right].

    Either :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided.

    For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix. For oblique rotations :math:`L`
    satisfies

    .. math::
        L = A(T^*)^{-1},

    where :math:`T` is a normal matrix.

    The oblimin family is parametrized by the parameter :math:`\gamma`. For
    orthogonal rotations:

    * :math:`\gamma=0` corresponds to quartimax,
    * :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
    * :math:`\gamma=1` corresponds to varimax,
    * :math:`\gamma=\frac{1}{p}` corresponds to equamax.

    For oblique rotations:

    * :math:`\gamma=0` corresponds to quartimin,
    * :math:`\gamma=\frac{1}{2}` corresponds to biquartimin.

    Parameters
    ----------
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    gamma : float (default 0)
        oblimin family parameter
    rotation_method : str
        should be one of {orthogonal, oblique}
    return_gradient : bool (default True)
        toggles return of gradient
    """
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method=rotation_method)
    p, k = L.shape
    L2 = L**2
    N = np.ones((k, k))-np.eye(k)
    if np.isclose(gamma, 0):
        X = L2.dot(N)
    else:
        C = np.ones((p, p))/p
        X = (np.eye(p) - gamma*C).dot(L2).dot(N)
    phi = np.sum(L2*X)/4
    if return_gradient:
        Gphi = L*X
        return phi, Gphi
    else:
        return phi


def orthomax_objective(L=None, A=None, T=None, gamma=0, return_gradient=True):
    r"""
    Objective function for the orthomax family for orthogonal
    rotation which minimizes the following objective:

    .. math::
        \phi(L) = -\frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)),

    where :math:`0\leq\gamma\leq1`, :math:`L` is a :math:`p\times k` matrix,
    :math:`C` is a :math:`p\times p` matrix with elements equal to
    :math:`1/p`,
    :math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius norm and
    :math:`\circ` is the element-wise product or Hadamard product.

    Either :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided.

    For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix.

    The orthomax family is parametrized by the parameter :math:`\gamma`:

    * :math:`\gamma=0` corresponds to quartimax,
    * :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
    * :math:`\gamma=1` corresponds to varimax,
    * :math:`\gamma=\frac{1}{p}` corresponds to equamax.

    Parameters
    ----------
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    gamma : float (default 0)
        orthomax family parameter
    return_gradient : bool (default True)
        toggles return of gradient
    """
    assert 0 <= gamma <= 1, "Gamma should be between 0 and 1"
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method='orthogonal')
    p, k = L.shape
    L2 = L**2
    if np.isclose(gamma, 0):
        X = L2
    else:
        C = np.ones((p, p))/p
        X = (np.eye(p)-gamma*C).dot(L2)
    phi = -np.sum(L2*X)/4
    if return_gradient:
        Gphi = -L*X
        return phi, Gphi
    else:
        return phi


def CF_objective(L=None, A=None, T=None, kappa=0,
                 rotation_method='orthogonal',
                 return_gradient=True):
    r"""
    Objective function for the Crawford-Ferguson family for orthogonal
    and oblique rotation which minimizes the following objective:

    .. math::
        \phi(L) =\frac{1-\kappa}{4} (L\circ L,(L\circ L)N)
                  +\frac{\kappa}{4}(L\circ L,M(L\circ L)),

    where :math:`0\leq\kappa\leq1`, :math:`L` is a :math:`p\times k` matrix,
    :math:`N` is a :math:`k\times k` matrix with zeros on the diagonal and
    ones elsewhere,
    :math:`M` is a :math:`p\times p` matrix with zeros on the diagonal and
    ones elsewhere,
    :math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius norm and
    :math:`\circ` is the element-wise product or Hadamard product.

    The gradient is given by

    .. math::
        d\phi(L) = (1-\kappa) L\circ\left[(L\circ L)N\right]
                   +\kappa L\circ \left[M(L\circ L)\right].

    Either :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided.

    For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix. For oblique rotations :math:`L`
    satisfies

    .. math::
        L = A(T^*)^{-1},

    where :math:`T` is a normal matrix.

    For orthogonal rotations the oblimin (and orthomax) family of rotations is
    equivalent to the Crawford-Ferguson family. To be more precise:

    * :math:`\kappa=0` corresponds to quartimax,
    * :math:`\kappa=\frac{1}{p}` corresponds to varimax,
    * :math:`\kappa=\frac{k-1}{p+k-2}` corresponds to parsimax,
    * :math:`\kappa=1` corresponds to factor parsimony.

    Parameters
    ----------
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    kappa : float (default 0)
        Crawford-Ferguson family parameter
    rotation_method : str
        should be one of {orthogonal, oblique}
    return_gradient : bool (default True)
        toggles return of gradient
    """
    assert 0 <= kappa <= 1, "Kappa should be between 0 and 1"
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method=rotation_method)
    p, k = L.shape
    L2 = L**2
    X = None
    if not np.isclose(kappa, 1):
        N = np.ones((k, k)) - np.eye(k)
        X = (1 - kappa)*L2.dot(N)
    if not np.isclose(kappa, 0):
        M = np.ones((p, p)) - np.eye(p)
        if X is None:
            X = kappa*M.dot(L2)
        else:
            X += kappa*M.dot(L2)
    phi = np.sum(L2 * X) / 4
    if return_gradient:
        Gphi = L*X
        return phi, Gphi
    else:
        return phi


def vgQ_target(H, L=None, A=None, T=None, rotation_method='orthogonal'):
    r"""
    Subroutine for the value of vgQ using orthogonal or oblique rotation
    towards a target matrix, i.e., we minimize:

    .. math::
        \phi(L) =\frac{1}{2}\|L-H\|^2

    and the gradient is given by

    .. math::
        d\phi(L)=L-H.

    Either :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided.

    For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix. For oblique rotations :math:`L`
    satisfies

    .. math::
        L = A(T^*)^{-1},

    where :math:`T` is a normal matrix.

    Parameters
    ----------
    H : numpy matrix
        target matrix
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    rotation_method : str
        should be one of {orthogonal, oblique}
    """
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method=rotation_method)
    q = np.linalg.norm(L-H, 'fro')**2
    Gq = 2*(L-H)
    return q, Gq


def ff_target(H, L=None, A=None, T=None, rotation_method='orthogonal'):
    r"""
    Subroutine for the value of f using (orthogonal or oblique) rotation
    towards a target matrix, i.e., we minimize:

    .. math::
        \phi(L) =\frac{1}{2}\|L-H\|^2.

    Either :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided. For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix. For oblique rotations
    :math:`L` satisfies

    .. math::
        L = A(T^*)^{-1},

    where :math:`T` is a normal matrix.

    Parameters
    ----------
    H : numpy matrix
        target matrix
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    rotation_method : str
        should be one of {orthogonal, oblique}
    """
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method=rotation_method)
    return np.linalg.norm(L-H, 'fro')**2


def vgQ_partial_target(H, W=None, L=None, A=None, T=None):
    r"""
    Subroutine for the value of vgQ using orthogonal rotation towards a
    partial target matrix, i.e., we minimize:

    .. math::
        \phi(L) =\frac{1}{2}\|W\circ(L-H)\|^2,

    where :math:`\circ` is the element-wise product or Hadamard product and
    :math:`W` is a matrix whose entries can only be one or zero. The gradient
    is given by

    .. math::
        d\phi(L)=W\circ(L-H).

    Either :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided.

    For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix.

    Parameters
    ----------
    H : numpy matrix
        target matrix
    W : numpy matrix (default matrix with equal weight one for all entries)
        matrix with weights, entries can either be one or zero
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    """
    if W is None:
        return vgQ_target(H, L=L, A=A, T=T)
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method='orthogonal')
    q = np.linalg.norm(W*(L-H), 'fro')**2
    Gq = 2*W*(L-H)
    return q, Gq


def ff_partial_target(H, W=None, L=None, A=None, T=None):
    r"""
    Subroutine for the value of f using orthogonal rotation towards a
    partial target matrix, i.e., we minimize:

    .. math::
        \phi(L) =\frac{1}{2}\|W\circ(L-H)\|^2,

    where :math:`\circ` is the element-wise product or Hadamard product and
    :math:`W` is a matrix whose entries can only be one or zero. Either
    :math:`L` should be provided or :math:`A` and :math:`T` should be
    provided.

    For orthogonal rotations :math:`L` satisfies

    .. math::
        L = AT,

    where :math:`T` is an orthogonal matrix.

    Parameters
    ----------
    H : numpy matrix
        target matrix
    W : numpy matrix (default matrix with equal weight one for all entries)
        matrix with weights, entries can either be one or zero
    L : numpy matrix (default None)
        rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
    A : numpy matrix (default None)
        non rotated factors
    T : numpy matrix (default None)
        rotation matrix
    """
    if W is None:
        return ff_target(H, L=L, A=A, T=T)
    if L is None:
        assert A is not None and T is not None
        L = rotateA(A, T, rotation_method='orthogonal')
    q = np.linalg.norm(W*(L-H), 'fro')**2
    return q
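Tying the pieces above together, a short hedged sketch running GPA with the orthomax (quartimax, gamma=0) criterion on random loadings:

import numpy as np
from statsmodels.multivariate.factor_rotation._gpa_rotation import (
    GPA, orthomax_objective)

A = np.random.default_rng(4).standard_normal((8, 2))
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
    L=L, A=A, T=T, gamma=0, return_gradient=True)
L, Phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
# each row of `table` records [iteration, f, log10(gradient norm), step length]
assert np.allclose(T.T @ T, np.eye(2), atol=1e-6)   # T stays orthogonal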
@@ -0,0 +1,350 @@
from ._analytic_rotation import target_rotation
from ._gpa_rotation import oblimin_objective, orthomax_objective, CF_objective
from ._gpa_rotation import ff_partial_target, ff_target
from ._gpa_rotation import vgQ_partial_target, vgQ_target
from ._gpa_rotation import rotateA, GPA

__all__ = []


def rotate_factors(A, method, *method_args, **algorithm_kwargs):
    r"""
    Subroutine for orthogonal and oblique rotation of the matrix :math:`A`.
    For orthogonal rotations :math:`A` is rotated to :math:`L` according to

    .. math::

        L = AT,

    where :math:`T` is an orthogonal matrix. And, for oblique rotations
    :math:`A` is rotated to :math:`L` according to

    .. math::

        L = A(T^*)^{-1},

    where :math:`T` is a normal matrix.

    Parameters
    ----------
    A : numpy matrix
        non rotated factors
    method : str
        should be one of the methods listed below
    method_args : list
        additional arguments that should be provided with each method
    algorithm_kwargs : dictionary
        algorithm : str (default gpa)
            should be one of:

            * 'gpa': a numerical method
            * 'gpa_der_free': a derivative free numerical method
            * 'analytic': an analytic method

        Depending on the algorithm, there are algorithm specific keyword
        arguments. For the gpa and gpa_der_free algorithms, the following
        keyword arguments are available:

        max_tries : int (default 501)
            maximum number of iterations

        tol : float
            stop criterion, algorithm stops if Frobenius norm of gradient is
            smaller than tol

        For analytic, the supported arguments depend on the method, see above.

        See the lower level functions for more details.

    Returns
    -------
    The tuple :math:`(L,T)`

    Notes
    -----
    What follows is a list of available methods. Depending on the method
    additional arguments are required and different algorithms
    are available. The algorithm_kwargs are additional keyword arguments
    passed to the selected algorithm (see the parameters section).
    Unless stated otherwise, only the gpa and
    gpa_der_free algorithms are available.

    Below,

    * :math:`L` is a :math:`p\times k` matrix;
    * :math:`N` is a :math:`k\times k` matrix with zeros on the diagonal and
      ones elsewhere;
    * :math:`M` is a :math:`p\times p` matrix with zeros on the diagonal and
      ones elsewhere;
    * :math:`C` is a :math:`p\times p` matrix with elements equal to
      :math:`1/p`;
    * :math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius norm;
    * :math:`\circ` is the element-wise product or Hadamard product.

    oblimin : orthogonal or oblique rotation that minimizes

        .. math::
            \phi(L) = \frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)N).

        For orthogonal rotations:

        * :math:`\gamma=0` corresponds to quartimax,
        * :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
        * :math:`\gamma=1` corresponds to varimax,
        * :math:`\gamma=\frac{1}{p}` corresponds to equamax.

        For oblique rotations:

        * :math:`\gamma=0` corresponds to quartimin,
        * :math:`\gamma=\frac{1}{2}` corresponds to biquartimin.

        method_args:

        gamma : float
            oblimin family parameter
        rotation_method : str
            should be one of {orthogonal, oblique}

    orthomax : orthogonal rotation that minimizes

        .. math::
            \phi(L) = -\frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)),

        where :math:`0\leq\gamma\leq1`. The orthomax family is equivalent to
        the oblimin family (when restricted to orthogonal rotations).
        Furthermore,

        * :math:`\gamma=0` corresponds to quartimax,
        * :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
        * :math:`\gamma=1` corresponds to varimax,
        * :math:`\gamma=\frac{1}{p}` corresponds to equamax.

        method_args:

        gamma : float (between 0 and 1)
            orthomax family parameter

    CF : Crawford-Ferguson family for orthogonal and oblique rotation which
        minimizes:

        .. math::

            \phi(L) =\frac{1-\kappa}{4} (L\circ L,(L\circ L)N)
                      +\frac{\kappa}{4}(L\circ L,M(L\circ L)),

        where :math:`0\leq\kappa\leq1`. For orthogonal rotations the oblimin
        (and orthomax) family of rotations is equivalent to the
        Crawford-Ferguson family.
        To be more precise:

        * :math:`\kappa=0` corresponds to quartimax,
        * :math:`\kappa=\frac{1}{p}` corresponds to varimax,
        * :math:`\kappa=\frac{k-1}{p+k-2}` corresponds to parsimax,
        * :math:`\kappa=1` corresponds to factor parsimony.

        method_args:

        kappa : float (between 0 and 1)
            Crawford-Ferguson family parameter
        rotation_method : str
            should be one of {orthogonal, oblique}

    quartimax : orthogonal rotation method
        minimizes the orthomax objective with :math:`\gamma=0`

    biquartimax : orthogonal rotation method
        minimizes the orthomax objective with :math:`\gamma=\frac{1}{2}`

    varimax : orthogonal rotation method
        minimizes the orthomax objective with :math:`\gamma=1`

    equamax : orthogonal rotation method
        minimizes the orthomax objective with :math:`\gamma=\frac{1}{p}`

    parsimax : orthogonal rotation method
        minimizes the Crawford-Ferguson family objective with
        :math:`\kappa=\frac{k-1}{p+k-2}`

    parsimony : orthogonal rotation method
        minimizes the Crawford-Ferguson family objective with :math:`\kappa=1`

    quartimin : oblique rotation method
        minimizes the oblimin objective with :math:`\gamma=0`

    biquartimin : oblique rotation method
        minimizes the oblimin objective with :math:`\gamma=\frac{1}{2}`

    target : orthogonal or oblique rotation that rotates towards a target
        matrix :math:`H` by minimizing the objective

        .. math::

            \phi(L) =\frac{1}{2}\|L-H\|^2.

        method_args:

        H : numpy matrix
            target matrix
        rotation_method : str
            should be one of {orthogonal, oblique}

        For orthogonal rotations the algorithm can be set to analytic in which
        case the following keyword arguments are available:

        full_rank : bool (default False)
            if set to true full rank is assumed

    partial_target : orthogonal (default) or oblique rotation that partially
        rotates towards a target matrix :math:`H` by minimizing the objective:

        .. math::

            \phi(L) =\frac{1}{2}\|W\circ(L-H)\|^2.

        method_args:

        H : numpy matrix
            target matrix
        W : numpy matrix (default matrix with equal weight one for all entries)
            matrix with weights, entries can either be one or zero

    Examples
    --------
    >>> A = np.random.randn(8,2)
    >>> L, T = rotate_factors(A,'varimax')
    >>> np.allclose(L,A.dot(T))
    >>> L, T = rotate_factors(A,'orthomax',0.5)
    >>> np.allclose(L,A.dot(T))
    >>> L, T = rotate_factors(A,'quartimin')
    >>> np.allclose(L,A.dot(np.linalg.inv(T.T)))
    """
    if 'algorithm' in algorithm_kwargs:
        algorithm = algorithm_kwargs['algorithm']
        algorithm_kwargs.pop('algorithm')
    else:
        algorithm = 'gpa'
    assert not ('rotation_method' in algorithm_kwargs), (
        'rotation_method cannot be provided as keyword argument')
    L = None
    T = None
    ff = None
    vgQ = None
    p, k = A.shape
    # set ff or vgQ to appropriate objective function, compute solution using
    # recursion or analytically compute solution
    if method == 'orthomax':
        assert len(method_args) == 1, ('Only %s family parameter should be '
                                       'provided' % method)
        rotation_method = 'orthogonal'
        gamma = method_args[0]
        if algorithm == 'gpa':
            vgQ = lambda L=None, A=None, T=None: orthomax_objective(
                L=L, A=A, T=T, gamma=gamma, return_gradient=True)
        elif algorithm == 'gpa_der_free':
            ff = lambda L=None, A=None, T=None: orthomax_objective(
                L=L, A=A, T=T, gamma=gamma, return_gradient=False)
        else:
            raise ValueError('Algorithm %s is not possible for %s '
                             'rotation' % (algorithm, method))
    elif method == 'oblimin':
        assert len(method_args) == 2, ('Both %s family parameter and '
                                       'rotation_method should be '
                                       'provided' % method)
        rotation_method = method_args[1]
        assert rotation_method in ['orthogonal', 'oblique'], (
            'rotation_method should be one of {orthogonal, oblique}')
        gamma = method_args[0]
        if algorithm == 'gpa':
            vgQ = lambda L=None, A=None, T=None: oblimin_objective(
                L=L, A=A, T=T, gamma=gamma, return_gradient=True)
        elif algorithm == 'gpa_der_free':
            ff = lambda L=None, A=None, T=None: oblimin_objective(
                L=L, A=A, T=T, gamma=gamma, rotation_method=rotation_method,
                return_gradient=False)
        else:
            raise ValueError('Algorithm %s is not possible for %s '
                             'rotation' % (algorithm, method))
    elif method == 'CF':
        assert len(method_args) == 2, ('Both %s family parameter and '
                                       'rotation_method should be provided'
                                       % method)
        rotation_method = method_args[1]
        assert rotation_method in ['orthogonal', 'oblique'], (
            'rotation_method should be one of {orthogonal, oblique}')
        kappa = method_args[0]
        if algorithm == 'gpa':
            vgQ = lambda L=None, A=None, T=None: CF_objective(
                L=L, A=A, T=T, kappa=kappa, rotation_method=rotation_method,
                return_gradient=True)
        elif algorithm == 'gpa_der_free':
            ff = lambda L=None, A=None, T=None: CF_objective(
                L=L, A=A, T=T, kappa=kappa, rotation_method=rotation_method,
                return_gradient=False)
        else:
            raise ValueError('Algorithm %s is not possible for %s '
                             'rotation' % (algorithm, method))
    elif method == 'quartimax':
        return rotate_factors(A, 'orthomax', 0, **algorithm_kwargs)
    elif method == 'biquartimax':
        return rotate_factors(A, 'orthomax', 0.5, **algorithm_kwargs)
    elif method == 'varimax':
        return rotate_factors(A, 'orthomax', 1, **algorithm_kwargs)
    elif method == 'equamax':
        return rotate_factors(A, 'orthomax', 1/p, **algorithm_kwargs)
    elif method == 'parsimax':
        return rotate_factors(A, 'CF', (k-1)/(p+k-2),
                              'orthogonal', **algorithm_kwargs)
    elif method == 'parsimony':
        return rotate_factors(A, 'CF', 1, 'orthogonal', **algorithm_kwargs)
    elif method == 'quartimin':
        return rotate_factors(A, 'oblimin', 0, 'oblique', **algorithm_kwargs)
    elif method == 'biquartimin':
        return rotate_factors(A, 'oblimin', 0.5, 'oblique', **algorithm_kwargs)
    elif method == 'target':
        assert len(method_args) == 2, (
            'only the rotation target and orthogonal/oblique should be '
            'provided for %s rotation' % method)
        H = method_args[0]
        rotation_method = method_args[1]
        assert rotation_method in ['orthogonal', 'oblique'], (
            'rotation_method should be one of {orthogonal, oblique}')
        if algorithm == 'gpa':
            vgQ = lambda L=None, A=None, T=None: vgQ_target(
                H, L=L, A=A, T=T, rotation_method=rotation_method)
        elif algorithm == 'gpa_der_free':
            ff = lambda L=None, A=None, T=None: ff_target(
                H, L=L, A=A, T=T, rotation_method=rotation_method)
        elif algorithm == 'analytic':
            assert rotation_method == 'orthogonal', (
                'For analytic %s rotation only orthogonal rotation is '
                'supported' % method)
            T = target_rotation(A, H, **algorithm_kwargs)
        else:
            raise ValueError('Algorithm %s is not possible for %s rotation'
                             % (algorithm, method))
    elif method == 'partial_target':
        assert len(method_args) == 2, ('2 additional arguments are expected '
                                       'for %s rotation' % method)
        H = method_args[0]
        W = method_args[1]
        rotation_method = 'orthogonal'
        if algorithm == 'gpa':
            vgQ = lambda L=None, A=None, T=None: vgQ_partial_target(
                H, W=W, L=L, A=A, T=T)
        elif algorithm == 'gpa_der_free':
            ff = lambda L=None, A=None, T=None: ff_partial_target(
                H, W=W, L=L, A=A, T=T)
        else:
            raise ValueError('Algorithm %s is not possible for %s '
                             'rotation' % (algorithm, method))
    else:
        raise ValueError('Invalid method')
    # compute L and T if not already done
    if T is None:
        L, phi, T, table = GPA(A, vgQ=vgQ, ff=ff,
                               rotation_method=rotation_method,
                               **algorithm_kwargs)
    if L is None:
        assert T is not None, 'Cannot compute L without T'
        L = rotateA(A, T, rotation_method=rotation_method)
    return L, T
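And the oblique path through rotate_factors; a minimal sketch checking the L = A(T^*)^{-1} relation documented above:

import numpy as np
from statsmodels.multivariate.factor_rotation import rotate_factors

A = np.random.default_rng(5).standard_normal((8, 2))
L, T = rotate_factors(A, 'quartimin')                  # oblique quartimin
assert np.allclose(L, A.dot(np.linalg.inv(T.T)))       # oblique relation holds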
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,584 @@
import unittest
import numpy as np

from statsmodels.multivariate.factor_rotation._wrappers import rotate_factors
from statsmodels.multivariate.factor_rotation._gpa_rotation import (
    ff_partial_target, vgQ_partial_target, ff_target, vgQ_target, CF_objective,
    orthomax_objective, oblimin_objective, GPA)
from statsmodels.multivariate.factor_rotation._analytic_rotation import (
    target_rotation)


class TestAnalyticRotation(unittest.TestCase):
    @staticmethod
    def str2matrix(A):
        A = A.lstrip().rstrip().split('\n')
        A = np.array([row.split() for row in A]).astype(float)
        return A

    def test_target_rotation(self):
        """
        Rotation towards target matrix example
        http://www.stat.ucla.edu/research/gpa
        """
        A = self.str2matrix("""
         .830 -.396
         .818 -.469
         .777 -.470
         .798 -.401
         .786 .500
         .672 .458
         .594 .444
         .647 .333
        """)
        H = self.str2matrix("""
          .8 -.3
          .8 -.4
          .7 -.4
          .9 -.4
          .8 .5
          .6 .4
          .5 .4
          .6 .3
        """)
        T = target_rotation(A, H)
        L = A.dot(T)
        L_required = self.str2matrix("""
        0.84168 -0.37053
        0.83191 -0.44386
        0.79096 -0.44611
        0.80985 -0.37650
        0.77040 0.52371
        0.65774 0.47826
        0.58020 0.46189
        0.63656 0.35255
        """)
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        T = target_rotation(A, H, full_rank=True)
        L = A.dot(T)
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))

    def test_orthogonal_target(self):
        """
        Rotation towards target matrix example
        http://www.stat.ucla.edu/research/gpa
        """
        A = self.str2matrix("""
         .830 -.396
         .818 -.469
         .777 -.470
         .798 -.401
         .786 .500
         .672 .458
         .594 .444
         .647 .333
        """)
        H = self.str2matrix("""
          .8 -.3
          .8 -.4
          .7 -.4
          .9 -.4
          .8 .5
          .6 .4
          .5 .4
          .6 .3
        """)
        vgQ = lambda L=None, A=None, T=None: vgQ_target(H, L=L, A=A, T=T)
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
        T_analytic = target_rotation(A, H)
        self.assertTrue(np.allclose(T, T_analytic, atol=1e-05))


class TestGPARotation(unittest.TestCase):
|
||||
|
||||
@staticmethod
|
||||
def str2matrix(A):
|
||||
A = A.lstrip().rstrip().split('\n')
|
||||
A = np.array([row.split() for row in A]).astype(float)
|
||||
return A
|
||||
|
||||
@classmethod
|
||||
def get_A(cls):
|
||||
return cls.str2matrix("""
|
||||
.830 -.396
|
||||
.818 -.469
|
||||
.777 -.470
|
||||
.798 -.401
|
||||
.786 .500
|
||||
.672 .458
|
||||
.594 .444
|
||||
.647 .333
|
||||
""")
|
||||
|
||||
@classmethod
|
||||
def get_quartimin_example(cls):
|
||||
A = cls.get_A()
|
||||
table_required = cls.str2matrix("""
|
||||
0.00000 0.42806 -0.46393 1.00000
|
||||
1.00000 0.41311 -0.57313 0.25000
|
||||
2.00000 0.38238 -0.36652 0.50000
|
||||
3.00000 0.31850 -0.21011 0.50000
|
||||
4.00000 0.20937 -0.13838 0.50000
|
||||
5.00000 0.12379 -0.35583 0.25000
|
||||
6.00000 0.04289 -0.53244 0.50000
|
||||
7.00000 0.01098 -0.86649 0.50000
|
||||
8.00000 0.00566 -1.65798 0.50000
|
||||
9.00000 0.00558 -2.13212 0.25000
|
||||
10.00000 0.00557 -2.49020 0.25000
|
||||
11.00000 0.00557 -2.84585 0.25000
|
||||
12.00000 0.00557 -3.20320 0.25000
|
||||
13.00000 0.00557 -3.56143 0.25000
|
||||
14.00000 0.00557 -3.92005 0.25000
|
||||
15.00000 0.00557 -4.27885 0.25000
|
||||
16.00000 0.00557 -4.63772 0.25000
|
||||
17.00000 0.00557 -4.99663 0.25000
|
||||
18.00000 0.00557 -5.35555 0.25000
|
||||
""")
|
||||
L_required = cls.str2matrix("""
|
||||
0.891822 0.056015
|
||||
0.953680 -0.023246
|
||||
0.929150 -0.046503
|
||||
0.876683 0.033658
|
||||
0.013701 0.925000
|
||||
-0.017265 0.821253
|
||||
-0.052445 0.764953
|
||||
0.085890 0.683115
|
||||
""")
|
||||
return A, table_required, L_required
|
||||
|
||||
@classmethod
|
||||
def get_biquartimin_example(cls):
|
||||
A = cls.get_A()
|
||||
table_required = cls.str2matrix("""
|
||||
0.00000 0.21632 -0.54955 1.00000
|
||||
1.00000 0.19519 -0.46174 0.50000
|
||||
2.00000 0.09479 -0.16365 1.00000
|
||||
3.00000 -0.06302 -0.32096 0.50000
|
||||
4.00000 -0.21304 -0.46562 1.00000
|
||||
5.00000 -0.33199 -0.33287 1.00000
|
||||
6.00000 -0.35108 -0.63990 0.12500
|
||||
7.00000 -0.35543 -1.20916 0.12500
|
||||
8.00000 -0.35568 -2.61213 0.12500
|
||||
9.00000 -0.35568 -2.97910 0.06250
|
||||
10.00000 -0.35568 -3.32645 0.06250
|
||||
11.00000 -0.35568 -3.66021 0.06250
|
||||
12.00000 -0.35568 -3.98564 0.06250
|
||||
13.00000 -0.35568 -4.30635 0.06250
|
||||
14.00000 -0.35568 -4.62451 0.06250
|
||||
15.00000 -0.35568 -4.94133 0.06250
|
||||
16.00000 -0.35568 -5.25745 0.06250
|
||||
""")
|
||||
L_required = cls.str2matrix("""
|
||||
1.01753 -0.13657
|
||||
1.11338 -0.24643
|
||||
1.09200 -0.26890
|
||||
1.00676 -0.16010
|
||||
-0.26534 1.11371
|
||||
-0.26972 0.99553
|
||||
-0.29341 0.93561
|
||||
-0.10806 0.80513
|
||||
""")
|
||||
return A, table_required, L_required
|
||||
|
||||
@classmethod
|
||||
def get_biquartimin_example_derivative_free(cls):
|
||||
A = cls.get_A()
|
||||
table_required = cls.str2matrix("""
|
||||
0.00000 0.21632 -0.54955 1.00000
|
||||
1.00000 0.19519 -0.46174 0.50000
|
||||
2.00000 0.09479 -0.16365 1.00000
|
||||
3.00000 -0.06302 -0.32096 0.50000
|
||||
4.00000 -0.21304 -0.46562 1.00000
|
||||
5.00000 -0.33199 -0.33287 1.00000
|
||||
6.00000 -0.35108 -0.63990 0.12500
|
||||
7.00000 -0.35543 -1.20916 0.12500
|
||||
8.00000 -0.35568 -2.61213 0.12500
|
||||
9.00000 -0.35568 -2.97910 0.06250
|
||||
10.00000 -0.35568 -3.32645 0.06250
|
||||
11.00000 -0.35568 -3.66021 0.06250
|
||||
12.00000 -0.35568 -3.98564 0.06250
|
||||
13.00000 -0.35568 -4.30634 0.06250
|
||||
14.00000 -0.35568 -4.62451 0.06250
|
||||
15.00000 -0.35568 -4.94133 0.06250
|
||||
16.00000 -0.35568 -6.32435 0.12500
|
||||
""")
|
||||
L_required = cls.str2matrix("""
|
||||
1.01753 -0.13657
|
||||
1.11338 -0.24643
|
||||
1.09200 -0.26890
|
||||
1.00676 -0.16010
|
||||
-0.26534 1.11371
|
||||
-0.26972 0.99553
|
||||
-0.29342 0.93561
|
||||
-0.10806 0.80513
|
||||
""")
|
||||
return A, table_required, L_required
|
||||
|
||||
@classmethod
|
||||
def get_quartimax_example_derivative_free(cls):
|
||||
A = cls.get_A()
|
||||
table_required = cls.str2matrix("""
|
||||
0.00000 -0.72073 -0.65498 1.00000
|
||||
1.00000 -0.88561 -0.34614 2.00000
|
||||
2.00000 -1.01992 -1.07152 1.00000
|
||||
3.00000 -1.02237 -1.51373 0.50000
|
||||
4.00000 -1.02269 -1.96205 0.50000
|
||||
5.00000 -1.02273 -2.41116 0.50000
|
||||
6.00000 -1.02273 -2.86037 0.50000
|
||||
7.00000 -1.02273 -3.30959 0.50000
|
||||
8.00000 -1.02273 -3.75881 0.50000
|
||||
9.00000 -1.02273 -4.20804 0.50000
|
||||
10.00000 -1.02273 -4.65726 0.50000
|
||||
11.00000 -1.02273 -5.10648 0.50000
|
||||
""")
|
||||
L_required = cls.str2matrix("""
|
||||
0.89876 0.19482
|
||||
0.93394 0.12974
|
||||
0.90213 0.10386
|
||||
0.87651 0.17128
|
||||
0.31558 0.87647
|
||||
0.25113 0.77349
|
||||
0.19801 0.71468
|
||||
0.30786 0.65933
|
||||
""")
|
||||
return A, table_required, L_required
|
||||
|
||||
def test_orthomax(self):
|
||||
"""
|
||||
Quartimax example
|
||||
http://www.stat.ucla.edu/research/gpa
|
||||
"""
|
||||
A = self.get_A()
|
||||
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
|
||||
L=L, A=A, T=T, gamma=0, return_gradient=True)
|
||||
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
|
||||
table_required = self.str2matrix("""
|
||||
0.00000 -0.72073 -0.65498 1.00000
|
||||
1.00000 -0.88561 -0.34614 2.00000
|
||||
2.00000 -1.01992 -1.07152 1.00000
|
||||
3.00000 -1.02237 -1.51373 0.50000
|
||||
4.00000 -1.02269 -1.96205 0.50000
|
||||
5.00000 -1.02273 -2.41116 0.50000
|
||||
6.00000 -1.02273 -2.86037 0.50000
|
||||
7.00000 -1.02273 -3.30959 0.50000
|
||||
8.00000 -1.02273 -3.75881 0.50000
|
||||
9.00000 -1.02273 -4.20804 0.50000
|
||||
10.00000 -1.02273 -4.65726 0.50000
|
||||
11.00000 -1.02273 -5.10648 0.50000
|
||||
""")
|
||||
L_required = self.str2matrix("""
|
||||
0.89876 0.19482
|
||||
0.93394 0.12974
|
||||
0.90213 0.10386
|
||||
0.87651 0.17128
|
||||
0.31558 0.87647
|
||||
0.25113 0.77349
|
||||
0.19801 0.71468
|
||||
0.30786 0.65933
|
||||
""")
|
||||
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
|
||||
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
|
||||
# oblimin criterion gives same result
|
||||
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
|
||||
L=L, A=A, T=T, gamma=0, rotation_method='orthogonal',
|
||||
return_gradient=True)
|
||||
L_oblimin, phi2, T2, table2 = GPA(A, vgQ=vgQ,
|
||||
rotation_method='orthogonal')
|
||||
self.assertTrue(np.allclose(L, L_oblimin, atol=1e-05))
|
||||
# derivative free quartimax
|
||||
out = self.get_quartimax_example_derivative_free()
|
||||
A, table_required, L_required = out
|
||||
ff = lambda L=None, A=None, T=None: orthomax_objective(
|
||||
L=L, A=A, T=T, gamma=0, return_gradient=False)
|
||||
L, phi, T, table = GPA(A, ff=ff, rotation_method='orthogonal')
|
||||
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
|
||||
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
|
||||
|
||||
def test_equivalence_orthomax_oblimin(self):
|
||||
"""
|
||||
These criteria should be equivalent when restricted to orthogonal
|
||||
rotation.
|
||||
See Hartman 1976 page 299.
|
||||
"""
|
||||
A = self.get_A()
|
||||
gamma = 0 # quartimax
|
||||
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
|
||||
L=L, A=A, T=T, gamma=gamma, return_gradient=True)
|
||||
L_orthomax, phi, T, table = GPA(
|
||||
A, vgQ=vgQ, rotation_method='orthogonal')
|
||||
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
|
||||
L=L, A=A, T=T, gamma=gamma, rotation_method='orthogonal',
|
||||
return_gradient=True)
|
||||
L_oblimin, phi2, T2, table2 = GPA(A, vgQ=vgQ,
|
||||
rotation_method='orthogonal')
|
||||
self.assertTrue(np.allclose(L_orthomax, L_oblimin, atol=1e-05))
|
||||
gamma = 1 # varimax
|
||||
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
|
||||
L=L, A=A, T=T, gamma=gamma, return_gradient=True)
|
||||
L_orthomax, phi, T, table = GPA(
|
||||
A, vgQ=vgQ, rotation_method='orthogonal')
|
||||
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
|
||||
L=L, A=A, T=T, gamma=gamma, rotation_method='orthogonal',
|
||||
return_gradient=True)
|
||||
L_oblimin, phi2, T2, table2 = GPA(
|
||||
A, vgQ=vgQ, rotation_method='orthogonal')
|
||||
self.assertTrue(np.allclose(L_orthomax, L_oblimin, atol=1e-05))
|
||||
|
||||
    def test_orthogonal_target(self):
        """
        Rotation towards target matrix example
        http://www.stat.ucla.edu/research/gpa
        """
        A = self.get_A()
        H = self.str2matrix("""
        .8 -.3
        .8 -.4
        .7 -.4
        .9 -.4
        .8 .5
        .6 .4
        .5 .4
        .6 .3
        """)
        vgQ = lambda L=None, A=None, T=None: vgQ_target(H, L=L, A=A, T=T)
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
        table_required = self.str2matrix("""
        0.00000 0.05925 -0.61244 1.00000
        1.00000 0.05444 -1.14701 0.12500
        2.00000 0.05403 -1.68194 0.12500
        3.00000 0.05399 -2.21689 0.12500
        4.00000 0.05399 -2.75185 0.12500
        5.00000 0.05399 -3.28681 0.12500
        6.00000 0.05399 -3.82176 0.12500
        7.00000 0.05399 -4.35672 0.12500
        8.00000 0.05399 -4.89168 0.12500
        9.00000 0.05399 -5.42664 0.12500
        """)
        L_required = self.str2matrix("""
        0.84168 -0.37053
        0.83191 -0.44386
        0.79096 -0.44611
        0.80985 -0.37650
        0.77040 0.52371
        0.65774 0.47826
        0.58020 0.46189
        0.63656 0.35255
        """)
        self.assertTrue(np.allclose(table, table_required, atol=1e-05))
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        ff = lambda L=None, A=None, T=None: ff_target(H, L=L, A=A, T=T)
        L2, phi, T2, table = GPA(A, ff=ff, rotation_method='orthogonal')
        self.assertTrue(np.allclose(L, L2, atol=1e-05))
        self.assertTrue(np.allclose(T, T2, atol=1e-05))
        vgQ = lambda L=None, A=None, T=None: vgQ_target(
            H, L=L, A=A, T=T, rotation_method='oblique')
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
        ff = lambda L=None, A=None, T=None: ff_target(
            H, L=L, A=A, T=T, rotation_method='oblique')
        L2, phi, T2, table = GPA(A, ff=ff, rotation_method='oblique')
        self.assertTrue(np.allclose(L, L2, atol=1e-05))
        self.assertTrue(np.allclose(T, T2, atol=1e-05))

    def test_orthogonal_partial_target(self):
        """
        Rotation towards a partially specified target matrix example
        http://www.stat.ucla.edu/research/gpa
        """
        A = self.get_A()
        H = self.str2matrix("""
        .8 -.3
        .8 -.4
        .7 -.4
        .9 -.4
        .8 .5
        .6 .4
        .5 .4
        .6 .3
        """)
        W = self.str2matrix("""
        1 0
        0 1
        0 0
        1 1
        1 0
        1 0
        0 1
        1 0
        """)
        vgQ = lambda L=None, A=None, T=None: vgQ_partial_target(
            H, W, L=L, A=A, T=T)
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
        table_required = self.str2matrix("""
        0.00000 0.02559 -0.84194 1.00000
        1.00000 0.02203 -1.27116 0.25000
        2.00000 0.02154 -1.71198 0.25000
        3.00000 0.02148 -2.15713 0.25000
        4.00000 0.02147 -2.60385 0.25000
        5.00000 0.02147 -3.05114 0.25000
        6.00000 0.02147 -3.49863 0.25000
        7.00000 0.02147 -3.94619 0.25000
        8.00000 0.02147 -4.39377 0.25000
        9.00000 0.02147 -4.84137 0.25000
        10.00000 0.02147 -5.28897 0.25000
        """)
        L_required = self.str2matrix("""
        0.84526 -0.36228
        0.83621 -0.43571
        0.79528 -0.43836
        0.81349 -0.36857
        0.76525 0.53122
        0.65303 0.48467
        0.57565 0.46754
        0.63308 0.35876
        """)
        self.assertTrue(np.allclose(table, table_required, atol=1e-05))
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        ff = lambda L=None, A=None, T=None: ff_partial_target(
            H, W, L=L, A=A, T=T)
        L2, phi, T2, table = GPA(A, ff=ff, rotation_method='orthogonal')
        self.assertTrue(np.allclose(L, L2, atol=1e-05))
        self.assertTrue(np.allclose(T, T2, atol=1e-05))

    def test_oblimin(self):
        # quartimin
        A, table_required, L_required = self.get_quartimin_example()
        vgQ = lambda L=None, A=None, T=None: oblimin_objective(
            L=L, A=A, T=T, gamma=0, rotation_method='oblique')
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
        self.assertTrue(np.allclose(table, table_required, atol=1e-05))
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        # quartimin derivative free
        ff = lambda L=None, A=None, T=None: oblimin_objective(
            L=L, A=A, T=T, gamma=0, rotation_method='oblique',
            return_gradient=False)
        L, phi, T, table = GPA(A, ff=ff, rotation_method='oblique')
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        self.assertTrue(np.allclose(table, table_required, atol=1e-05))
        # biquartimin
        A, table_required, L_required = self.get_biquartimin_example()
        vgQ = lambda L=None, A=None, T=None: oblimin_objective(
            L=L, A=A, T=T, gamma=1/2, rotation_method='oblique')
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
        self.assertTrue(np.allclose(table, table_required, atol=1e-05))
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        # biquartimin derivative free
        out = self.get_biquartimin_example_derivative_free()
        A, table_required, L_required = out
        ff = lambda L=None, A=None, T=None: oblimin_objective(
            L=L, A=A, T=T, gamma=1/2, rotation_method='oblique',
            return_gradient=False)
        L, phi, T, table = GPA(A, ff=ff, rotation_method='oblique')
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        self.assertTrue(np.allclose(table, table_required, atol=1e-05))

    def test_CF(self):
        # quartimax
        out = self.get_quartimax_example_derivative_free()
        A, table_required, L_required = out
        vgQ = lambda L=None, A=None, T=None: CF_objective(
            L=L, A=A, T=T, kappa=0, rotation_method='orthogonal',
            return_gradient=True)
        L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        # quartimax derivative free
        ff = lambda L=None, A=None, T=None: CF_objective(
            L=L, A=A, T=T, kappa=0, rotation_method='orthogonal',
            return_gradient=False)
        L, phi, T, table = GPA(A, ff=ff, rotation_method='orthogonal')
        self.assertTrue(np.allclose(L, L_required, atol=1e-05))
        # varimax
        p, k = A.shape
        vgQ = lambda L=None, A=None, T=None: orthomax_objective(
            L=L, A=A, T=T, gamma=1, return_gradient=True)
        L_vm, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
        vgQ = lambda L=None, A=None, T=None: CF_objective(
            L=L, A=A, T=T, kappa=1/p, rotation_method='orthogonal',
            return_gradient=True)
        L_CF, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
        ff = lambda L=None, A=None, T=None: CF_objective(
            L=L, A=A, T=T, kappa=1/p, rotation_method='orthogonal',
            return_gradient=False)
        L_CF_df, phi, T, table = GPA(A, ff=ff, rotation_method='orthogonal')
        self.assertTrue(np.allclose(L_vm, L_CF, atol=1e-05))
        self.assertTrue(np.allclose(L_CF, L_CF_df, atol=1e-05))


class TestWrappers(unittest.TestCase):
    @staticmethod
    def str2matrix(A):
        A = A.lstrip().rstrip().split('\n')
        A = np.array([row.split() for row in A]).astype(float)
        return A

    def get_A(self):
        return self.str2matrix("""
        .830 -.396
        .818 -.469
        .777 -.470
        .798 -.401
        .786 .500
        .672 .458
        .594 .444
        .647 .333
        """)

    def get_H(self):
        return self.str2matrix("""
        .8 -.3
        .8 -.4
        .7 -.4
        .9 -.4
        .8 .5
        .6 .4
        .5 .4
        .6 .3
        """)

    def get_W(self):
        return self.str2matrix("""
        1 0
        0 1
        0 0
        1 1
        1 0
        1 0
        0 1
        1 0
        """)

    def _test_template(self, method, *method_args, **algorithms):
        A = self.get_A()
        algorithm1 = 'gpa' if 'algorithm1' not in algorithms else algorithms[
            'algorithm1']
        if 'algorithm2' not in algorithms:
            algorithm2 = 'gpa_der_free'
        else:
            algorithm2 = algorithms['algorithm2']
        L1, T1 = rotate_factors(A, method, *method_args, algorithm=algorithm1)
        L2, T2 = rotate_factors(A, method, *method_args, algorithm=algorithm2)
        self.assertTrue(np.allclose(L1, L2, atol=1e-5))
        self.assertTrue(np.allclose(T1, T2, atol=1e-5))

    def test_methods(self):
        """
        Test that the wrapped rotation methods agree across algorithms.
        http://www.stat.ucla.edu/research/gpa
        """
        # orthomax, oblimin and CF are tested indirectly
        methods = ['quartimin', 'biquartimin',
                   'quartimax', 'biquartimax', 'varimax', 'equamax',
                   'parsimax', 'parsimony',
                   'target', 'partial_target']
        for method in methods:
            method_args = []
            if method == 'target':
                method_args = [self.get_H(), 'orthogonal']
                self._test_template(method, *method_args)
                method_args = [self.get_H(), 'oblique']
                self._test_template(method, *method_args)
                method_args = [self.get_H(), 'orthogonal']
                self._test_template(method, *method_args,
                                    algorithm2='analytic')
            elif method == 'partial_target':
                method_args = [self.get_H(), self.get_W()]
            self._test_template(method, *method_args)
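

# A minimal usage sketch of the wrapper exercised above (doctest style; the
# loadings matrix ``A`` is hypothetical):
#
# >>> import numpy as np
# >>> from statsmodels.multivariate.factor_rotation import rotate_factors
# >>> A = np.random.standard_normal((8, 2))
# >>> L, T = rotate_factors(A, 'varimax')                  # GPA algorithm
# >>> L2, T2 = rotate_factors(A, 'varimax',
# ...                         algorithm='gpa_der_free')    # derivative free
# >>> np.allclose(A.dot(T), L)   # for orthogonal methods, L is the rotated A
# True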
@ -0,0 +1,127 @@
"""Multivariate analysis of variance

author: Yichuan Liu
"""
import numpy as np

from statsmodels.compat.pandas import Substitution
from statsmodels.base.model import Model
from .multivariate_ols import MultivariateTestResults
from .multivariate_ols import _multivariate_ols_fit
from .multivariate_ols import _multivariate_ols_test, _hypotheses_doc

__docformat__ = 'restructuredtext en'


class MANOVA(Model):
    """
    Multivariate Analysis of Variance

    The implementation of MANOVA is based on multivariate regression and does
    not assume that the explanatory variables are categorical. Any type of
    variable allowed in a regression is allowed here.

    Parameters
    ----------
    endog : array_like
        Dependent variables. A nobs x k_endog array where nobs is
        the number of observations and k_endog is the number of dependent
        variables.
    exog : array_like
        Independent variables. A nobs x k_exog array where nobs is the
        number of observations and k_exog is the number of independent
        variables. An intercept is not included by default and should be added
        by the user. Models specified using a formula include an intercept by
        default.

    Attributes
    ----------
    endog : ndarray
        See Parameters.
    exog : ndarray
        See Parameters.

    Notes
    -----
    MANOVA is used through the `mv_test` function; `fit` is not used.

    The ``from_formula`` interface is the recommended method to specify
    a model and simplifies testing without needing to manually configure
    the contrast matrices.

    References
    ----------
    .. [*] ftp://public.dhe.ibm.com/software/analytics/spss/documentation/
       statistics/20.0/en/client/Manuals/IBM_SPSS_Statistics_Algorithms.pdf
    """
    _formula_max_endog = None

    def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs):
        if len(endog.shape) == 1 or endog.shape[1] == 1:
            raise ValueError('There must be more than one dependent variable'
                             ' to fit MANOVA!')
        super().__init__(endog, exog, missing=missing,
                         hasconst=hasconst, **kwargs)
        self._fittedmod = _multivariate_ols_fit(self.endog, self.exog)

    def fit(self):
        raise NotImplementedError('fit is not needed to use MANOVA. Call'
                                  ' mv_test directly on a MANOVA instance.')

    @Substitution(hypotheses_doc=_hypotheses_doc)
    def mv_test(self, hypotheses=None, skip_intercept_test=False):
        """
        Linear hypotheses testing

        Parameters
        ----------
        %(hypotheses_doc)s
        skip_intercept_test : bool
            If True, testing the intercept is skipped; the model itself is
            not changed.
            Note: if a term has a numerically insignificant effect, an
            exception caused by empty arrays may be raised. This can happen
            for the intercept if the data have been demeaned.

        Returns
        -------
        results : MultivariateTestResults

        Notes
        -----
        Testing the linear hypotheses

            L * params * M = 0

        where `params` is the regression coefficient matrix for the
        linear model y = x * params.

        If the model is not specified using the formula interface, then the
        hypotheses test each included exogenous variable, one at a time. In
        most applications with categorical variables, the ``from_formula``
        interface should be preferred when specifying a model since it
        provides knowledge about the model when specifying the hypotheses.
        """
        if hypotheses is None:
            if (hasattr(self, 'data') and self.data is not None and
                    hasattr(self.data, 'design_info')):
                terms = self.data.design_info.term_name_slices
                hypotheses = []
                for key in terms:
                    if skip_intercept_test and key == 'Intercept':
                        continue
                    L_contrast = np.eye(self.exog.shape[1])[terms[key], :]
                    hypotheses.append([key, L_contrast, None])
            else:
                hypotheses = []
                for i in range(self.exog.shape[1]):
                    name = 'x%d' % (i)
                    L = np.zeros([1, self.exog.shape[1]])
                    L[0, i] = 1
                    hypotheses.append([name, L, None])

        results = _multivariate_ols_test(hypotheses, self._fittedmod,
                                         self.exog_names, self.endog_names)

        return MultivariateTestResults(results, self.endog_names,
                                       self.exog_names)
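

# A minimal usage sketch (hypothetical column names; as noted in the class
# docstring, ``from_formula`` is the recommended interface):
#
# >>> import pandas as pd
# >>> from statsmodels.multivariate.manova import MANOVA
# >>> df = pd.DataFrame({'y1': ..., 'y2': ..., 'y3': ..., 'group': ...})
# >>> mod = MANOVA.from_formula('y1 + y2 + y3 ~ group', data=df)
# >>> print(mod.mv_test().summary())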
@ -0,0 +1,590 @@
"""General linear model

author: Yichuan Liu
"""
import numpy as np
from numpy.linalg import eigvals, inv, solve, matrix_rank, pinv, svd
from scipy import stats
import pandas as pd
from patsy import DesignInfo

from statsmodels.compat.pandas import Substitution
from statsmodels.base.model import Model
from statsmodels.iolib import summary2

__docformat__ = 'restructuredtext en'

_hypotheses_doc = \
    """hypotheses : list[tuple]
    Hypothesis `L*B*M = C` to be tested where B is the parameters in
    the regression Y = X*B. Each element is a tuple of length 2, 3, or 4:

      * (name, contrast_L)
      * (name, contrast_L, transform_M)
      * (name, contrast_L, transform_M, constant_C)

    containing a string `name`, the contrast matrix L, the transform
    matrix M (for transforming dependent variables), and the right-hand side
    constant matrix constant_C, respectively.

    contrast_L : 2D array or an array of strings
        Left-hand side contrast matrix for hypotheses testing.
        If a 2D array, each row is a hypothesis and each column is an
        independent variable. At least 1 row
        (1 by k_exog, the number of independent variables) is required.
        If an array of strings, it will be passed to
        patsy.DesignInfo().linear_constraint.

    transform_M : 2D array or an array of strings or None, optional
        Transform matrix for the dependent variables.
        If `None` or left out, it is set to a k_endog by k_endog
        identity matrix (i.e. do not transform the y matrix).
        If an array of strings, it will be passed to
        patsy.DesignInfo().linear_constraint.

    constant_C : 2D array or None, optional
        Right-hand side constant matrix.
        If `None` or left out, it is set to a matrix of zeros.
        Must have the same number of rows as contrast_L and the same
        number of columns as transform_M.

    If `hypotheses` is None: 1) the effect of each independent variable
    on the dependent variables will be tested. Or 2) if the model is created
    using a formula, `hypotheses` will be created according to
    `design_info`. 1) and 2) are equivalent if no additional variables
    are created by the formula (e.g. dummy variables for categorical
    variables and interaction terms).
    """
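
# An illustrative ``hypotheses`` list (a sketch; the names and matrices are
# hypothetical and assume a model with three exogenous columns named
# 'x0', 'x1', 'x2' and two endogenous columns):
#
# >>> import numpy as np
# >>> hypotheses = [
# ...     ('x1 effect', np.array([[0., 1., 0.]])),               # L only
# ...     ('x1 = x2', ['x1 - x2'], None),                        # string form
# ...     ('full', np.array([[0., 1., 0.]]), np.eye(2),
# ...      np.zeros((1, 2))),                                    # L, M and C
# ... ]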


def _multivariate_ols_fit(endog, exog, method='svd', tolerance=1e-8):
    """
    Solve the multivariate linear model y = x * params
    where y contains the dependent variables and x the independent variables.

    Parameters
    ----------
    endog : array_like
        each column is a dependent variable
    exog : array_like
        each column is an independent variable
    method : str
        'svd' - Singular value decomposition
        'pinv' - Moore-Penrose pseudoinverse
    tolerance : float, a small positive number
        Tolerance for eigenvalues. Values smaller than `tolerance` are
        considered zero.

    Returns
    -------
    a tuple of matrices or values necessary for hypotheses testing

    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm

    Notes
    -----
    Status: experimental and incomplete
    """
    y = endog
    x = exog
    nobs, k_endog = y.shape
    nobs1, k_exog = x.shape
    if nobs != nobs1:
        raise ValueError('x(n=%d) and y(n=%d) should have the same number of '
                         'rows!' % (nobs1, nobs))

    # Calculate the matrices necessary for hypotheses testing
    df_resid = nobs - k_exog
    if method == 'pinv':
        # Regression coefficients matrix
        pinv_x = pinv(x)
        params = pinv_x.dot(y)

        # inverse of x'x
        inv_cov = pinv_x.dot(pinv_x.T)
        if matrix_rank(inv_cov, tol=tolerance) < k_exog:
            raise ValueError('Covariance of x singular!')

        # Sums of squares and cross-products of residuals
        # Y'Y - (X * params)'(X * params)
        t = x.dot(params)
        sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
        return (params, df_resid, inv_cov, sscpr)
    elif method == 'svd':
        u, s, v = svd(x, 0)
        if (s > tolerance).sum() < len(s):
            raise ValueError('Covariance of x singular!')
        invs = 1. / s

        params = v.T.dot(np.diag(invs)).dot(u.T).dot(y)
        inv_cov = v.T.dot(np.diag(np.power(invs, 2))).dot(v)
        t = np.diag(s).dot(v).dot(params)
        sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
        return (params, df_resid, inv_cov, sscpr)
    else:
        raise ValueError('%s is not a supported method!' % method)
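
# A quick self-check sketch of the two solvers (hypothetical random data;
# both code paths should produce the same coefficient estimates for a
# full-rank design):
#
# >>> import numpy as np
# >>> x = np.column_stack([np.ones(50), np.random.standard_normal((50, 2))])
# >>> y = np.random.standard_normal((50, 3))
# >>> p1, _, _, _ = _multivariate_ols_fit(y, x, method='svd')
# >>> p2, _, _, _ = _multivariate_ols_fit(y, x, method='pinv')
# >>> np.allclose(p1, p2)
# True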


def multivariate_stats(eigenvals,
                       r_err_sscp,
                       r_contrast, df_resid, tolerance=1e-8):
    """
    For the multivariate linear model Y = X * B, test the hypotheses
        L*B*M = 0
    where L is the contrast matrix, B is the parameters of the
    multivariate linear model and M is the dependent variable transform
    matrix.
        T = L*inv(X'X)*L'
        H = M'B'L'*inv(T)*LBM
        E = M'(Y'Y - B'X'XB)M

    Parameters
    ----------
    eigenvals : ndarray
        The eigenvalues of inv(E + H)*H
    r_err_sscp : int
        Rank of E + H
    r_contrast : int
        Rank of the T matrix
    df_resid : int
        Residual degrees of freedom (n_samples minus n_variables of X)
    tolerance : float
        Eigenvalues smaller than this are considered zero.

    Returns
    -------
    A DataFrame

    References
    ----------
    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
    """
    v = df_resid
    p = r_err_sscp
    q = r_contrast
    s = np.min([p, q])
    ind = eigenvals > tolerance
    n_e = ind.sum()
    eigv2 = eigenvals[ind]
    eigv1 = np.array([i / (1 - i) for i in eigv2])
    m = (np.abs(p - q) - 1) / 2
    n = (v - p - 1) / 2

    cols = ['Value', 'Num DF', 'Den DF', 'F Value', 'Pr > F']
    index = ["Wilks' lambda", "Pillai's trace",
             "Hotelling-Lawley trace", "Roy's greatest root"]
    results = pd.DataFrame(columns=cols,
                           index=index)

    def fn(x):
        return np.real([x])[0]

    results.loc["Wilks' lambda", 'Value'] = fn(np.prod(1 - eigv2))
    results.loc["Pillai's trace", 'Value'] = fn(eigv2.sum())
    results.loc["Hotelling-Lawley trace", 'Value'] = fn(eigv1.sum())
    results.loc["Roy's greatest root", 'Value'] = fn(eigv1.max())

    r = v - (p - q + 1)/2
    u = (p*q - 2) / 4
    df1 = p * q
    if p*p + q*q - 5 > 0:
        t = np.sqrt((p*p*q*q - 4) / (p*p + q*q - 5))
    else:
        t = 1
    df2 = r*t - 2*u
    lmd = results.loc["Wilks' lambda", 'Value']
    lmd = np.power(lmd, 1 / t)
    F = (1 - lmd) / lmd * df2 / df1
    results.loc["Wilks' lambda", 'Num DF'] = df1
    results.loc["Wilks' lambda", 'Den DF'] = df2
    results.loc["Wilks' lambda", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Wilks' lambda", 'Pr > F'] = pval

    V = results.loc["Pillai's trace", 'Value']
    df1 = s * (2*m + s + 1)
    df2 = s * (2*n + s + 1)
    F = df2 / df1 * V / (s - V)
    results.loc["Pillai's trace", 'Num DF'] = df1
    results.loc["Pillai's trace", 'Den DF'] = df2
    results.loc["Pillai's trace", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Pillai's trace", 'Pr > F'] = pval

    U = results.loc["Hotelling-Lawley trace", 'Value']
    if n > 0:
        b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
        df1 = p * q
        df2 = 4 + (p*q + 2) / (b - 1)
        c = (df2 - 2) / 2 / n
        F = df2 / df1 * U / c
    else:
        df1 = s * (2*m + s + 1)
        df2 = s * (s*n + 1)
        F = df2 / df1 / s * U
    results.loc["Hotelling-Lawley trace", 'Num DF'] = df1
    results.loc["Hotelling-Lawley trace", 'Den DF'] = df2
    results.loc["Hotelling-Lawley trace", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Hotelling-Lawley trace", 'Pr > F'] = pval

    sigma = results.loc["Roy's greatest root", 'Value']
    r = np.max([p, q])
    df1 = r
    df2 = v - r + q
    F = df2 / df1 * sigma
    results.loc["Roy's greatest root", 'Num DF'] = df1
    results.loc["Roy's greatest root", 'Den DF'] = df2
    results.loc["Roy's greatest root", 'F Value'] = F
    pval = stats.f.sf(F, df1, df2)
    results.loc["Roy's greatest root", 'Pr > F'] = pval
    return results
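
# The four statistics are simple functions of the eigenvalues of
# inv(E + H)*H; a worked sketch for hypothetical eigenvalues ``eigv2``:
#
# >>> eigv2 = np.array([0.6, 0.2])
# >>> np.prod(1 - eigv2)                 # Wilks' lambda        -> 0.32
# >>> eigv2.sum()                        # Pillai's trace       -> 0.8
# >>> (eigv2 / (1 - eigv2)).sum()        # Hotelling-Lawley     -> 1.75
# >>> (eigv2 / (1 - eigv2)).max()        # Roy's greatest root  -> 1.5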


def _multivariate_ols_test(hypotheses, fit_results, exog_names,
                           endog_names):
    def fn(L, M, C):
        # .. [1] https://support.sas.com/documentation/cdl/en/statug/63033
        # /HTML/default/viewer.htm#statug_introreg_sect012.htm
        params, df_resid, inv_cov, sscpr = fit_results
        # t1 = (L * params) * M - C
        t1 = L.dot(params).dot(M) - C
        # H = t1' * inv(L * inv(X'X) * L') * t1
        t2 = L.dot(inv_cov).dot(L.T)
        q = matrix_rank(t2)
        H = t1.T.dot(inv(t2)).dot(t1)

        # E = M'(Y'Y - B'(X'X)B)M
        E = M.T.dot(sscpr).dot(M)
        return E, H, q, df_resid

    return _multivariate_test(hypotheses, exog_names, endog_names, fn)


@Substitution(hypotheses_doc=_hypotheses_doc)
def _multivariate_test(hypotheses, exog_names, endog_names, fn):
    """
    Multivariate linear model hypotheses testing

    For y = x * params, where y are the dependent variables and x are the
    independent variables, testing L * params * M = 0 where L is the contrast
    matrix for hypotheses testing and M is the transformation matrix for
    transforming the dependent variables in y.

    Algorithm:
        T = L*inv(X'X)*L'
        H = M'B'L'*inv(T)*LBM
        E = M'(Y'Y - B'X'XB)M
    where H and E correspond to the numerator and denominator of a univariate
    F-test. Then find the eigenvalues of inv(H + E)*H from which the
    multivariate test statistics are calculated.

    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML
       /default/viewer.htm#statug_introreg_sect012.htm

    Parameters
    ----------
    %(hypotheses_doc)s
    exog_names : sequence[str]
        Names of the independent variables
    endog_names : sequence[str]
        Names of the dependent variables
    fn : function
        a function fn(contrast_L, transform_M, constant_C) that returns
        E, H, q, df_resid where q is the rank of the T matrix

    Returns
    -------
    results : dict
        A dictionary mapping each hypothesis name to its test results
    """

    k_xvar = len(exog_names)
    k_yvar = len(endog_names)
    results = {}
    for hypo in hypotheses:
        if len(hypo) == 2:
            name, L = hypo
            M = None
            C = None
        elif len(hypo) == 3:
            name, L, M = hypo
            C = None
        elif len(hypo) == 4:
            name, L, M, C = hypo
        else:
            raise ValueError('hypotheses must be a tuple of length 2, 3 or 4.'
                             ' len(hypotheses)=%d' % len(hypo))
        if any(isinstance(j, str) for j in L):
            L = DesignInfo(exog_names).linear_constraint(L).coefs
        else:
            if not isinstance(L, np.ndarray) or len(L.shape) != 2:
                raise ValueError('Contrast matrix L must be a 2-d array!')
            if L.shape[1] != k_xvar:
                raise ValueError('Contrast matrix L should have the same '
                                 'number of columns as exog! %d != %d' %
                                 (L.shape[1], k_xvar))
        if M is None:
            M = np.eye(k_yvar)
        elif any(isinstance(j, str) for j in M):
            M = DesignInfo(endog_names).linear_constraint(M).coefs.T
        else:
            if not isinstance(M, np.ndarray) or len(M.shape) != 2:
                raise ValueError('Transform matrix M must be a 2-d array!')
            if M.shape[0] != k_yvar:
                raise ValueError('Transform matrix M should have the same '
                                 'number of rows as the number of columns '
                                 'of endog! %d != %d' %
                                 (M.shape[0], k_yvar))
        if C is None:
            C = np.zeros([L.shape[0], M.shape[1]])
        elif not isinstance(C, np.ndarray):
            raise ValueError('Constant matrix C must be a 2-d array!')

        if C.shape[0] != L.shape[0]:
            raise ValueError('contrast L and constant C must have the same '
                             'number of rows! %d != %d'
                             % (L.shape[0], C.shape[0]))
        if C.shape[1] != M.shape[1]:
            raise ValueError('transform M and constant C must have the same '
                             'number of columns! %d != %d'
                             % (M.shape[1], C.shape[1]))
        E, H, q, df_resid = fn(L, M, C)
        EH = np.add(E, H)
        p = matrix_rank(EH)

        # eigenvalues of inv(E + H)H
        eigv2 = np.sort(eigvals(solve(EH, H)))
        stat_table = multivariate_stats(eigv2, p, q, df_resid)

        results[name] = {'stat': stat_table, 'contrast_L': L,
                         'transform_M': M, 'constant_C': C,
                         'E': E, 'H': H}
    return results


class _MultivariateOLS(Model):
    """
    Multivariate linear model via least squares

    Parameters
    ----------
    endog : array_like
        Dependent variables. A nobs x k_endog array where nobs is
        the number of observations and k_endog is the number of dependent
        variables.
    exog : array_like
        Independent variables. A nobs x k_exog array where nobs is the
        number of observations and k_exog is the number of independent
        variables. An intercept is not included by default and should be
        added by the user (models specified using a formula include an
        intercept by default).

    Attributes
    ----------
    endog : ndarray
        See Parameters.
    exog : ndarray
        See Parameters.
    """
    _formula_max_endog = None

    def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs):
        if len(endog.shape) == 1 or endog.shape[1] == 1:
            raise ValueError('There must be more than one dependent variable'
                             ' to fit multivariate OLS!')
        super().__init__(endog, exog, missing=missing,
                         hasconst=hasconst, **kwargs)

    def fit(self, method='svd'):
        self._fittedmod = _multivariate_ols_fit(
            self.endog, self.exog, method=method)
        return _MultivariateOLSResults(self)


class _MultivariateOLSResults:
    """
    _MultivariateOLS results class
    """
    def __init__(self, fitted_mv_ols):
        if (hasattr(fitted_mv_ols, 'data') and
                hasattr(fitted_mv_ols.data, 'design_info')):
            self.design_info = fitted_mv_ols.data.design_info
        else:
            self.design_info = None
        self.exog_names = fitted_mv_ols.exog_names
        self.endog_names = fitted_mv_ols.endog_names
        self._fittedmod = fitted_mv_ols._fittedmod

    def __str__(self):
        return self.summary().__str__()

    @Substitution(hypotheses_doc=_hypotheses_doc)
    def mv_test(self, hypotheses=None, skip_intercept_test=False):
        """
        Linear hypotheses testing

        Parameters
        ----------
        %(hypotheses_doc)s
        skip_intercept_test : bool
            If True, testing the intercept is skipped; the model itself is
            not changed.
            Note: if a term has a numerically insignificant effect, an
            exception caused by empty arrays may be raised. This can happen
            for the intercept if the data have been demeaned.

        Returns
        -------
        results : MultivariateTestResults

        Notes
        -----
        Tests hypotheses of the form

            L * params * M = C

        where `params` is the regression coefficient matrix for the
        linear model y = x * params, `L` is the contrast matrix, `M` is the
        dependent variable transform matrix and C is the constant matrix.
        """
        k_xvar = len(self.exog_names)
        if hypotheses is None:
            if self.design_info is not None:
                terms = self.design_info.term_name_slices
                hypotheses = []
                for key in terms:
                    if skip_intercept_test and key == 'Intercept':
                        continue
                    L_contrast = np.eye(k_xvar)[terms[key], :]
                    hypotheses.append([key, L_contrast, None])
            else:
                hypotheses = []
                for i in range(k_xvar):
                    name = 'x%d' % (i)
                    L = np.zeros([1, k_xvar])
                    L[0, i] = 1
                    hypotheses.append([name, L, None])

        results = _multivariate_ols_test(hypotheses, self._fittedmod,
                                         self.exog_names, self.endog_names)

        return MultivariateTestResults(results,
                                       self.endog_names,
                                       self.exog_names)

    def summary(self):
        raise NotImplementedError


class MultivariateTestResults:
    """
    Multivariate test results class

    Returned by the `mv_test` method of the `_MultivariateOLSResults` class

    Parameters
    ----------
    results : dict[str, dict]
        Dictionary containing test results. See the description
        below for the expected format.
    endog_names : sequence[str]
        A list or other sequence of endogenous variable names
    exog_names : sequence[str]
        A list or other sequence of exogenous variable names

    Attributes
    ----------
    results : dict
        Each hypothesis is contained in a single `key`. Each test must
        have the following keys:

        * 'stat' - contains the multivariate test results
        * 'contrast_L' - contains the contrast_L matrix
        * 'transform_M' - contains the transform_M matrix
        * 'constant_C' - contains the constant_C matrix
        * 'H' - contains an intermediate Hypothesis matrix,
          or the between groups sums of squares and cross-products matrix,
          corresponding to the numerator of the univariate F test.
        * 'E' - contains an intermediate Error matrix,
          corresponding to the denominator of the univariate F test.
          The Hypothesis and Error matrices can be used to calculate
          the same test statistics as in 'stat', as well as to calculate
          the discriminant function (canonical correlates) from the
          eigenvectors of inv(E)H.

    endog_names : list[str]
        The endogenous names
    exog_names : list[str]
        The exogenous names
    summary_frame : DataFrame
        Returns results as a MultiIndex DataFrame
    """

    def __init__(self, results, endog_names, exog_names):
        self.results = results
        self.endog_names = list(endog_names)
        self.exog_names = list(exog_names)

    def __str__(self):
        return self.summary().__str__()

    def __getitem__(self, item):
        return self.results[item]

    @property
    def summary_frame(self):
        """
        Return results as a MultiIndex DataFrame
        """
        df = []
        for key in self.results:
            tmp = self.results[key]['stat'].copy()
            tmp.loc[:, 'Effect'] = key
            df.append(tmp.reset_index())
        df = pd.concat(df, axis=0)
        df = df.set_index(['Effect', 'index'])
        df.index.set_names(['Effect', 'Statistic'], inplace=True)
        return df

    def summary(self, show_contrast_L=False, show_transform_M=False,
                show_constant_C=False):
        """
        Summary of test results

        Parameters
        ----------
        show_contrast_L : bool
            Whether to show the contrast_L matrix
        show_transform_M : bool
            Whether to show the transform_M matrix
        show_constant_C : bool
            Whether to show the constant_C matrix
        """
        summ = summary2.Summary()
        summ.add_title('Multivariate linear model')
        for key in self.results:
            summ.add_dict({'': ''})
            df = self.results[key]['stat'].copy()
            df = df.reset_index()
            c = list(df.columns)
            c[0] = key
            df.columns = c
            df.index = ['', '', '', '']
            summ.add_df(df)
            if show_contrast_L:
                summ.add_dict({key: ' contrast L='})
                df = pd.DataFrame(self.results[key]['contrast_L'],
                                  columns=self.exog_names)
                summ.add_df(df)
            if show_transform_M:
                summ.add_dict({key: ' transform M='})
                df = pd.DataFrame(self.results[key]['transform_M'],
                                  index=self.endog_names)
                summ.add_df(df)
            if show_constant_C:
                summ.add_dict({key: ' constant C='})
                df = pd.DataFrame(self.results[key]['constant_C'])
                summ.add_df(df)
        return summ
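

# A minimal end-to-end sketch (hypothetical data; the class is private, so
# this mirrors how MANOVA uses it internally; default exog names are
# generated by the base Model class):
#
# >>> import numpy as np
# >>> x = np.column_stack([np.ones(40), np.random.standard_normal(40)])
# >>> y = np.random.standard_normal((40, 2))
# >>> res = _MultivariateOLS(y, x).fit(method='svd')
# >>> mv = res.mv_test()        # tests each column of x, one at a time
# >>> print(mv.summary())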
@ -0,0 +1,873 @@
"""Principal Component Analysis

Author: josef-pktd
Modified by Kevin Sheppard
"""

import numpy as np
import pandas as pd

from statsmodels.tools.sm_exceptions import (ValueWarning,
                                             EstimationWarning)
from statsmodels.tools.validation import (string_like,
                                          array_like,
                                          bool_like,
                                          float_like,
                                          int_like,
                                          )


def _norm(x):
    return np.sqrt(np.sum(x * x))


class PCA:
    """
    Principal Component Analysis

    Parameters
    ----------
    data : array_like
        Variables in columns, observations in rows.
    ncomp : int, optional
        Number of components to return. If None, returns as many as the
        smaller of the number of rows or columns in data.
    standardize : bool, optional
        Flag indicating to use standardized data with mean 0 and unit
        variance. standardize being True implies demean. Using standardized
        data is equivalent to computing principal components from the
        correlation matrix of data.
    demean : bool, optional
        Flag indicating whether to demean data before computing principal
        components. demean is ignored if standardize is True. Demeaning data
        but not standardizing is equivalent to computing principal components
        from the covariance matrix of data.
    normalize : bool, optional
        Indicates whether to normalize the factors to have unit inner
        product. If False, the loadings will have unit inner product.
    gls : bool, optional
        Flag indicating to implement a two-step GLS estimator where
        in the first step principal components are used to estimate residuals,
        and then the inverse residual variance is used as a set of weights to
        estimate the final principal components. Setting gls to True requires
        ncomp to be less than the min of the number of rows or columns.
    weights : ndarray, optional
        Series weights to use after transforming data according to standardize
        or demean when computing the principal components.
    method : str, optional
        Sets the linear algebra routine used to compute eigenvectors:

        * 'svd' uses a singular value decomposition (default).
        * 'eig' uses an eigenvalue decomposition of a quadratic form
        * 'nipals' uses the NIPALS algorithm and can be faster than SVD when
          ncomp is small and nvars is large. See notes about additional
          changes when using NIPALS.
    missing : {str, None}
        Method for missing data. Choices are:

        * 'drop-row' - drop rows with missing values.
        * 'drop-col' - drop columns with missing values.
        * 'drop-min' - drop either rows or columns, choosing by data
          retention.
        * 'fill-em' - use the EM algorithm to fill missing values. ncomp
          should be set to the number of factors required.
        * `None` raises if data contains NaN values.
    tol : float, optional
        Tolerance to use when checking for convergence when using NIPALS.
    max_iter : int, optional
        Maximum iterations when using NIPALS.
    tol_em : float
        Tolerance to use when checking for convergence of the EM algorithm.
    max_em_iter : int
        Maximum iterations for the EM algorithm.
    svd_full_matrices : bool, optional
        If the 'svd' method is selected, this flag is used to set the
        parameter 'full_matrices' in the singular value decomposition
        method. Is set to False by default.

    Attributes
    ----------
    factors : array or DataFrame
        nobs by ncomp array of principal components (scores)
    scores : array or DataFrame
        nobs by ncomp array of principal components - identical to factors
    loadings : array or DataFrame
        ncomp by nvar array of principal component loadings for constructing
        the factors
    coeff : array or DataFrame
        nvar by ncomp array of principal component loadings for constructing
        the projections
    projection : array or DataFrame
        nobs by nvar array containing the projection of the data onto the
        ncomp estimated factors
    rsquare : array or Series
        ncomp array where the element in the ith position is the R-square
        of including the first i principal components. Note: values are
        calculated on the transformed data, not the original data
    ic : array or DataFrame
        ncomp by 3 array containing the Bai and Ng (2002) Information
        criteria. Each column is a different criterion, and each row
        represents the number of included factors.
    eigenvals : array or Series
        nvar array of eigenvalues
    eigenvecs : array or DataFrame
        nvar by nvar array of eigenvectors
    weights : ndarray
        nvar array of weights used to compute the principal components,
        normalized to unit length
    transformed_data : ndarray
        Standardized, demeaned and weighted data used to compute
        principal components and related quantities
    cols : ndarray
        Array of indices indicating columns used in the PCA
    rows : ndarray
        Array of indices indicating rows used in the PCA

    Notes
    -----
    The default options perform principal component analysis on the
    demeaned, unit variance version of data. Setting standardize to False
    will instead only demean, and setting both standardize and
    demean to False will not alter the data.

    Once the data have been transformed, the following relationships hold
    when the number of components (ncomp) is the same as the minimum of the
    number of observations or the number of variables.

    .. math::

        X' X = V \\Lambda V'

    .. math::

        F = X V

    .. math::

        X = F V'

    where X is the `data`, F is the array of principal components (`factors`
    or `scores`), and V is the array of eigenvectors (`loadings`) and V' is
    the array of factor coefficients (`coeff`).

    When weights are provided, the principal components are computed from
    the modified data

    .. math::

        \\Omega^{-\\frac{1}{2}} X

    where :math:`\\Omega` is a diagonal matrix composed of the weights. For
    example, when using the GLS version of PCA, the elements of
    :math:`\\Omega` will be the inverse of the variances of the residuals
    from

    .. math::

        X - F V'

    where the number of factors is less than the rank of X.

    References
    ----------
    .. [*] J. Bai and S. Ng, "Determining the number of factors in
       approximate factor models," Econometrica, vol. 70, number 1,
       pp. 191-221, 2002

    Examples
    --------
    Basic PCA using the correlation matrix of the data

    >>> import numpy as np
    >>> from statsmodels.multivariate.pca import PCA
    >>> x = np.random.randn(100)[:, None]
    >>> x = x + np.random.randn(100, 100)
    >>> pc = PCA(x)

    Note that the principal components are computed using a SVD and so the
    correlation matrix is never constructed, unless method='eig'.

    PCA using the covariance matrix of the data

    >>> pc = PCA(x, standardize=False)

    Limiting the number of factors returned to 1 computed using NIPALS

    >>> pc = PCA(x, ncomp=1, method='nipals')
    >>> pc.factors.shape
    (100, 1)
    """

    def __init__(self, data, ncomp=None, standardize=True, demean=True,
                 normalize=True, gls=False, weights=None, method='svd',
                 missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
                 max_em_iter=100, svd_full_matrices=False):
        self._index = None
        self._columns = []
        if isinstance(data, pd.DataFrame):
            self._index = data.index
            self._columns = data.columns

        self.data = array_like(data, "data", ndim=2)
        # Store inputs
        self._gls = bool_like(gls, "gls")
        self._normalize = bool_like(normalize, "normalize")
        self._svd_full_matrices = bool_like(svd_full_matrices, "svd_fm")
        self._tol = float_like(tol, "tol")
        if not 0 < self._tol < 1:
            raise ValueError('tol must be strictly between 0 and 1')
        self._max_iter = int_like(max_iter, "max_iter")
        self._max_em_iter = int_like(max_em_iter, "max_em_iter")
        self._tol_em = float_like(tol_em, "tol_em")

        # Prepare data
        self._standardize = bool_like(standardize, "standardize")
        self._demean = bool_like(demean, "demean")

        self._nobs, self._nvar = self.data.shape
        weights = array_like(weights, "weights", maxdim=1, optional=True)
        if weights is None:
            weights = np.ones(self._nvar)
        else:
            weights = np.array(weights).flatten()
            if weights.shape[0] != self._nvar:
                raise ValueError('weights should have nvar elements')
            weights = weights / np.sqrt((weights ** 2.0).mean())
        self.weights = weights

        # Check ncomp against maximum
        min_dim = min(self._nobs, self._nvar)
        self._ncomp = min_dim if ncomp is None else ncomp
        if self._ncomp > min_dim:
            import warnings

            warn = 'The requested number of components is more than can be ' \
                   'computed from data. The maximum number of components is ' \
                   'the minimum of the number of observations or variables'
            warnings.warn(warn, ValueWarning)
            self._ncomp = min_dim

        self._method = method
        # Workaround to avoid instance methods in __dict__
        if self._method not in ('eig', 'svd', 'nipals'):
            raise ValueError(f'method {method} is not known.')
        if self._method == 'svd':
            self._svd_full_matrices = True

        self.rows = np.arange(self._nobs)
        self.cols = np.arange(self._nvar)
        # Handle missing
        self._missing = string_like(missing, "missing", optional=True)
        self._adjusted_data = self.data
        self._adjust_missing()

        # Update size
        self._nobs, self._nvar = self._adjusted_data.shape
        if self._ncomp == np.min(self.data.shape):
            self._ncomp = np.min(self._adjusted_data.shape)
        elif self._ncomp > np.min(self._adjusted_data.shape):
            raise ValueError('When adjusting for missing values, user '
                             'provided ncomp must be no larger than the '
                             'smallest dimension of the '
                             'missing-value-adjusted data size.')

        # Attributes and internal values
        self._tss = 0.0
        self._ess = None
        self.transformed_data = None
        self._mu = None
        self._sigma = None
        self._ess_indiv = None
        self._tss_indiv = None
        self.scores = self.factors = None
        self.loadings = None
        self.coeff = None
        self.eigenvals = None
        self.eigenvecs = None
        self.projection = None
        self.rsquare = None
        self.ic = None

        # Prepare data
        self.transformed_data = self._prepare_data()
        # Perform the PCA
        self._pca()
        if gls:
            self._compute_gls_weights()
            self.transformed_data = self._prepare_data()
            self._pca()

        # Final calculations
        self._compute_rsquare_and_ic()
        if self._index is not None:
            self._to_pandas()

    def _adjust_missing(self):
        """
        Implements alternatives for handling missing values
        """

        def keep_col(x):
            index = np.logical_not(np.any(np.isnan(x), 0))
            return x[:, index], index

        def keep_row(x):
            index = np.logical_not(np.any(np.isnan(x), 1))
            return x[index, :], index

        if self._missing == 'drop-col':
            self._adjusted_data, index = keep_col(self.data)
            self.cols = np.where(index)[0]
            self.weights = self.weights[index]
        elif self._missing == 'drop-row':
            self._adjusted_data, index = keep_row(self.data)
            self.rows = np.where(index)[0]
        elif self._missing == 'drop-min':
            drop_col, drop_col_index = keep_col(self.data)
            drop_col_size = drop_col.size

            drop_row, drop_row_index = keep_row(self.data)
            drop_row_size = drop_row.size

            if drop_row_size > drop_col_size:
                self._adjusted_data = drop_row
                self.rows = np.where(drop_row_index)[0]
            else:
                self._adjusted_data = drop_col
                self.weights = self.weights[drop_col_index]
                self.cols = np.where(drop_col_index)[0]
        elif self._missing == 'fill-em':
            self._adjusted_data = self._fill_missing_em()
        elif self._missing is None:
            if not np.isfinite(self._adjusted_data).all():
                raise ValueError("""\
data contains non-finite values (inf, NaN). You should drop these values or
use one of the methods for adjusting data for missing-values.""")
        else:
            raise ValueError('missing method is not known.')

        if self._index is not None:
            self._columns = self._columns[self.cols]
            self._index = self._index[self.rows]

        # Check adjusted data size
        if self._adjusted_data.size == 0:
            raise ValueError('Removal of missing values has eliminated '
                             'all data.')

    def _compute_gls_weights(self):
        """
        Computes GLS weights based on percentage of data fit
        """
        projection = np.asarray(self.project(transform=False))
        errors = self.transformed_data - projection
        if self._ncomp == self._nvar:
            raise ValueError('gls can only be used when ncomp < nvar '
                             'so that residuals have non-zero variance')
        var = (errors ** 2.0).mean(0)
        weights = 1.0 / var
        weights = weights / np.sqrt((weights ** 2.0).mean())
        nvar = self._nvar
        eff_series_perc = (1.0 / sum((weights / weights.sum()) ** 2.0)) / nvar
        if eff_series_perc < 0.1:
            eff_series = int(np.round(eff_series_perc * nvar))
            import warnings

            warn = f"""\
Many series are being down weighted by GLS. Of the {nvar} series, the GLS
estimates are based on only {eff_series} (effective) series."""
            warnings.warn(warn, EstimationWarning)

        self.weights = weights
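
    # Sketch of the two-step GLS flow driven by the ``gls`` flag above
    # (hypothetical data; requires ncomp strictly less than nvar):
    #
    # >>> x = np.random.standard_normal((200, 20))
    # >>> pc = PCA(x, ncomp=5, gls=True)
    # >>> pc.weights    # inverse residual-variance weights, renormalized
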
    def _pca(self):
        """
        Main PCA routine
        """
        self._compute_eig()
        self._compute_pca_from_eig()
        self.projection = self.project()

    def __repr__(self):
        string = self.__str__()
        string = string[:-1]
        string += ', id: ' + hex(id(self)) + ')'
        return string

    def __str__(self):
        string = 'Principal Component Analysis('
        string += 'nobs: ' + str(self._nobs) + ', '
        string += 'nvar: ' + str(self._nvar) + ', '
        if self._standardize:
            kind = 'Standardize (Correlation)'
        elif self._demean:
            kind = 'Demean (Covariance)'
        else:
            kind = 'None'
        string += 'transformation: ' + kind + ', '
        if self._gls:
            string += 'GLS, '
        string += 'normalization: ' + str(self._normalize) + ', '
        string += 'number of components: ' + str(self._ncomp) + ', '
        string += 'method: ' + ('Eigenvalue' if self._method == 'eig'
                                else 'SVD')
        string += ')'
        return string

    def _prepare_data(self):
        """
        Standardize or demean data.
        """
        adj_data = self._adjusted_data
        if np.all(np.isnan(adj_data)):
            out = np.empty(adj_data.shape[1])
            out.fill(np.nan)
            return out

        self._mu = np.nanmean(adj_data, axis=0)
        self._sigma = np.sqrt(np.nanmean((adj_data - self._mu) ** 2.0,
                                         axis=0))
        if self._standardize:
            data = (adj_data - self._mu) / self._sigma
        elif self._demean:
            data = (adj_data - self._mu)
        else:
            data = adj_data
        return data / np.sqrt(self.weights)

    def _compute_eig(self):
        """
        Wrapper for actual eigenvalue method

        This is a workaround to avoid instance methods in __dict__
        """
        if self._method == 'eig':
            return self._compute_using_eig()
        elif self._method == 'svd':
            return self._compute_using_svd()
        else:  # self._method == 'nipals'
            return self._compute_using_nipals()

    def _compute_using_svd(self):
        """SVD method to compute eigenvalues and eigenvecs"""
        x = self.transformed_data
        u, s, v = np.linalg.svd(x, full_matrices=self._svd_full_matrices)
        self.eigenvals = s ** 2.0
        self.eigenvecs = v.T

    def _compute_using_eig(self):
        """
        Eigenvalue decomposition method to compute eigenvalues and
        eigenvectors
        """
        x = self.transformed_data
        self.eigenvals, self.eigenvecs = np.linalg.eigh(x.T.dot(x))

    def _compute_using_nipals(self):
        """
        NIPALS implementation to compute a small number of eigenvalues
        and eigenvectors
        """
        x = self.transformed_data
        if self._ncomp > 1:
            x = x + 0.0  # Copy

        tol, max_iter, ncomp = self._tol, self._max_iter, self._ncomp
        vals = np.zeros(self._ncomp)
        vecs = np.zeros((self._nvar, self._ncomp))
        for i in range(ncomp):
            max_var_ind = np.argmax(x.var(0))
            factor = x[:, [max_var_ind]]
            _iter = 0
            diff = 1.0
            while diff > tol and _iter < max_iter:
                vec = x.T.dot(factor) / (factor.T.dot(factor))
                vec = vec / np.sqrt(vec.T.dot(vec))
                factor_last = factor
                factor = x.dot(vec) / (vec.T.dot(vec))
                diff = _norm(factor - factor_last) / _norm(factor)
                _iter += 1
            vals[i] = (factor ** 2).sum()
            vecs[:, [i]] = vec
            if ncomp > 1:
                x -= factor.dot(vec.T)

        self.eigenvals = vals
        self.eigenvecs = vecs

    def _fill_missing_em(self):
        """
        EM algorithm to fill missing values
        """
        non_missing = np.logical_not(np.isnan(self.data))

        # If nothing missing, return without altering the data
        if np.all(non_missing):
            return self.data

        # 1. Standardize data as needed
        data = self.transformed_data = np.asarray(self._prepare_data())

        ncomp = self._ncomp

        # 2. Check for all nans
        col_non_missing = np.sum(non_missing, 1)
        row_non_missing = np.sum(non_missing, 0)
        if np.any(col_non_missing < ncomp) or np.any(row_non_missing < ncomp):
            raise ValueError('Implementation requires that all columns and '
                             'all rows have at least ncomp non-missing '
                             'values')
        # 3. Get mask
        mask = np.isnan(data)

        # 4. Compute mean
        mu = np.nanmean(data, 0)

        # 5. Replace missing with mean
        projection = np.ones((self._nobs, 1)) * mu
        projection_masked = projection[mask]
        data[mask] = projection_masked

        # 6. Compute eigenvalues and fit
        diff = 1.0
        _iter = 0
        while diff > self._tol_em and _iter < self._max_em_iter:
            last_projection_masked = projection_masked
            # Set transformed data to compute eigenvalues
            self.transformed_data = data
            # Call correct eig function here
            self._compute_eig()
            # Call function to compute factors and projection
            self._compute_pca_from_eig()
            projection = np.asarray(self.project(transform=False,
                                                 unweight=False))
            projection_masked = projection[mask]
            data[mask] = projection_masked
            delta = last_projection_masked - projection_masked
            diff = _norm(delta) / _norm(projection_masked)
            _iter += 1
        # Must copy to avoid overwriting original data since replacing values
        data = self._adjusted_data + 0.0
        projection = np.asarray(self.project())
        data[mask] = projection[mask]

        return data
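
    # EM-based imputation is driven entirely by the constructor flag; a
    # sketch (hypothetical data with a few NaNs inserted):
    #
    # >>> x = np.random.standard_normal((100, 10))
    # >>> x[::17, 3] = np.nan
    # >>> pc = PCA(x, ncomp=2, missing='fill-em')  # iterates fill and refit
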
    def _compute_pca_from_eig(self):
        """
        Compute relevant statistics after eigenvalues have been computed
        """
        # Ensure sorted largest to smallest
        vals, vecs = self.eigenvals, self.eigenvecs
        indices = np.argsort(vals)
        indices = indices[::-1]
        vals = vals[indices]
        vecs = vecs[:, indices]
        if (vals <= 0).any():
            # Discard and warn
            num_good = vals.shape[0] - (vals <= 0).sum()
            if num_good < self._ncomp:
                import warnings

                warnings.warn('Only {num:d} eigenvalues are positive. '
                              'This is the maximum number of components '
                              'that can be extracted.'.format(num=num_good),
                              EstimationWarning)

                self._ncomp = num_good
                vals[num_good:] = np.finfo(np.float64).tiny
        # Use ncomp for the remaining calculations
        vals = vals[:self._ncomp]
        vecs = vecs[:, :self._ncomp]
        self.eigenvals, self.eigenvecs = vals, vecs
        # Select correct number of components to return
        self.scores = self.factors = self.transformed_data.dot(vecs)
        self.loadings = vecs
        self.coeff = vecs.T
        if self._normalize:
            self.coeff = (self.coeff.T * np.sqrt(vals)).T
            self.factors /= np.sqrt(vals)
            self.scores = self.factors

    def _compute_rsquare_and_ic(self):
        """
        Final statistics to compute
        """
        # TSS and related calculations
        # TODO: This needs careful testing, with and without weights,
        #   gls, standardized and demean
        weights = self.weights
        ss_data = self.transformed_data * np.sqrt(weights)
        self._tss_indiv = np.sum(ss_data ** 2, 0)
        self._tss = np.sum(self._tss_indiv)
        self._ess = np.zeros(self._ncomp + 1)
        self._ess_indiv = np.zeros((self._ncomp + 1, self._nvar))
        for i in range(self._ncomp + 1):
            # Projection in the same space as transformed_data
            projection = self.project(ncomp=i, transform=False,
                                      unweight=False)
            indiv_rss = (projection ** 2).sum(axis=0)
            rss = indiv_rss.sum()
            self._ess[i] = self._tss - rss
            self._ess_indiv[i, :] = self._tss_indiv - indiv_rss
        self.rsquare = 1.0 - self._ess / self._tss
        # Information Criteria
        ess = self._ess
        invalid = ess <= 0  # Prevent log issues of 0
        if invalid.any():
            last_obs = (np.where(invalid)[0]).min()
            ess = ess[:last_obs]

        log_ess = np.log(ess)
        r = np.arange(ess.shape[0])

        nobs, nvar = self._nobs, self._nvar
        sum_to_prod = (nobs + nvar) / (nobs * nvar)
        min_dim = min(nobs, nvar)
        penalties = np.array([sum_to_prod * np.log(1.0 / sum_to_prod),
                              sum_to_prod * np.log(min_dim),
                              np.log(min_dim) / min_dim])
        penalties = penalties[:, None]
        ic = log_ess + r * penalties
        self.ic = ic.T

    def project(self, ncomp=None, transform=True, unweight=True):
        """
        Project series onto a specific number of factors.

        Parameters
        ----------
        ncomp : int, optional
            Number of components to use. If omitted, all components
            initially computed are used.
        transform : bool, optional
            Flag indicating whether to return the projection in the original
            space of the data (True, default) or in the space of the
            standardized/demeaned data.
        unweight : bool, optional
            Flag indicating whether to undo the effects of the estimation
            weights.

        Returns
        -------
        array_like
            The nobs by nvar array of the projection onto ncomp factors.
        """
        # Projection needs to be scaled/shifted based on inputs
        ncomp = self._ncomp if ncomp is None else ncomp
        if ncomp > self._ncomp:
            raise ValueError('ncomp must be smaller than the number of '
                             'components computed.')
        factors = np.asarray(self.factors)
        coeff = np.asarray(self.coeff)

        projection = factors[:, :ncomp].dot(coeff[:ncomp, :])
        if transform or unweight:
            projection *= np.sqrt(self.weights)
        if transform:
            # Remove the weights, which do not depend on transformation
            if self._standardize:
                projection *= self._sigma
            if self._standardize or self._demean:
                projection += self._mu
        if self._index is not None:
            projection = pd.DataFrame(projection,
                                      columns=self._columns,
                                      index=self._index)
        return projection
def _to_pandas(self):
|
||||
"""
|
||||
Returns pandas DataFrames for all values
|
||||
"""
|
||||
index = self._index
|
||||
# Principal Components
|
||||
num_zeros = np.ceil(np.log10(self._ncomp))
|
||||
comp_str = 'comp_{0:0' + str(int(num_zeros)) + 'd}'
|
||||
cols = [comp_str.format(i) for i in range(self._ncomp)]
|
||||
df = pd.DataFrame(self.factors, columns=cols, index=index)
|
||||
self.scores = self.factors = df
|
||||
# Projections
|
||||
df = pd.DataFrame(self.projection,
|
||||
columns=self._columns,
|
||||
index=index)
|
||||
self.projection = df
|
||||
# Weights
|
||||
df = pd.DataFrame(self.coeff, index=cols,
|
||||
columns=self._columns)
|
||||
self.coeff = df
|
||||
# Loadings
|
||||
df = pd.DataFrame(self.loadings,
|
||||
index=self._columns, columns=cols)
|
||||
self.loadings = df
|
||||
# eigenvals
|
||||
self.eigenvals = pd.Series(self.eigenvals)
|
||||
self.eigenvals.name = 'eigenvals'
|
||||
# eigenvecs
|
||||
vec_str = comp_str.replace('comp', 'eigenvec')
|
||||
cols = [vec_str.format(i) for i in range(self.eigenvecs.shape[1])]
|
||||
self.eigenvecs = pd.DataFrame(self.eigenvecs, columns=cols)
|
||||
# R2
|
||||
self.rsquare = pd.Series(self.rsquare)
|
||||
self.rsquare.index.name = 'ncomp'
|
||||
self.rsquare.name = 'rsquare'
|
||||
# IC
|
||||
self.ic = pd.DataFrame(self.ic, columns=['IC_p1', 'IC_p2', 'IC_p3'])
|
||||
self.ic.index.name = 'ncomp'
|
||||
|
||||
def plot_scree(self, ncomp=None, log_scale=True,
|
||||
cumulative=False, ax=None):
|
||||
"""
|
||||
Plot of the ordered eigenvalues
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ncomp : int, optional
|
||||
Number of components ot include in the plot. If None, will
|
||||
included the same as the number of components computed
|
||||
log_scale : boot, optional
|
||||
Flag indicating whether ot use a log scale for the y-axis
|
||||
cumulative : bool, optional
|
||||
Flag indicating whether to plot the eigenvalues or cumulative
|
||||
eigenvalues
|
||||
ax : AxesSubplot, optional
|
||||
An axes on which to draw the graph. If omitted, new a figure
|
||||
is created
|
||||
|
||||
Returns
|
||||
-------
|
||||
matplotlib.figure.Figure
|
||||
The handle to the figure.
|
||||
"""
|
||||
import statsmodels.graphics.utils as gutils
|
||||
|
||||
fig, ax = gutils.create_mpl_ax(ax)
|
||||
|
||||
ncomp = self._ncomp if ncomp is None else ncomp
|
||||
vals = np.asarray(self.eigenvals)
|
||||
vals = vals[:self._ncomp]
|
||||
if cumulative:
|
||||
vals = np.cumsum(vals)
|
||||
|
||||
if log_scale:
|
||||
ax.set_yscale('log')
|
||||
ax.plot(np.arange(ncomp), vals[: ncomp], 'bo')
|
||||
ax.autoscale(tight=True)
|
||||
xlim = np.array(ax.get_xlim())
|
||||
sp = xlim[1] - xlim[0]
|
||||
xlim += 0.02 * np.array([-sp, sp])
|
||||
ax.set_xlim(xlim)
|
||||
|
||||
ylim = np.array(ax.get_ylim())
|
||||
scale = 0.02
|
||||
if log_scale:
|
||||
sp = np.log(ylim[1] / ylim[0])
|
||||
ylim = np.exp(np.array([np.log(ylim[0]) - scale * sp,
|
||||
np.log(ylim[1]) + scale * sp]))
|
||||
else:
|
||||
sp = ylim[1] - ylim[0]
|
||||
ylim += scale * np.array([-sp, sp])
|
||||
ax.set_ylim(ylim)
|
||||
ax.set_title('Scree Plot')
|
||||
ax.set_ylabel('Eigenvalue')
|
||||
ax.set_xlabel('Component Number')
|
||||
fig.tight_layout()
|
||||
|
||||
return fig
|
||||
|
||||
def plot_rsquare(self, ncomp=None, ax=None):
|
||||
"""
|
||||
Box plots of the individual series R-square against the number of PCs.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
ncomp : int, optional
|
||||
Number of components ot include in the plot. If None, will
|
||||
plot the minimum of 10 or the number of computed components.
|
||||
ax : AxesSubplot, optional
|
||||
An axes on which to draw the graph. If omitted, new a figure
|
||||
is created.
|
||||
|
||||
Returns
|
||||
-------
|
||||
matplotlib.figure.Figure
|
||||
The handle to the figure.
|
||||
"""
|
||||
import statsmodels.graphics.utils as gutils
|
||||
|
||||
fig, ax = gutils.create_mpl_ax(ax)
|
||||
|
||||
ncomp = 10 if ncomp is None else ncomp
|
||||
ncomp = min(ncomp, self._ncomp)
|
||||
# R2s in rows, series in columns
|
||||
r2s = 1.0 - self._ess_indiv / self._tss_indiv
|
||||
r2s = r2s[1:]
|
||||
r2s = r2s[:ncomp]
|
||||
ax.boxplot(r2s.T)
|
||||
ax.set_title('Individual Input $R^2$')
|
||||
ax.set_ylabel('$R^2$')
|
||||
ax.set_xlabel('Number of Included Principal Components')
|
||||
|
||||
return fig
|
||||
|
||||
|
||||
def pca(data, ncomp=None, standardize=True, demean=True, normalize=True,
|
||||
gls=False, weights=None, method='svd'):
|
||||
"""
|
||||
Perform Principal Component Analysis (PCA).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : ndarray
|
||||
Variables in columns, observations in rows.
|
||||
ncomp : int, optional
|
||||
Number of components to return. If None, returns the as many as the
|
||||
smaller to the number of rows or columns of data.
|
||||
standardize : bool, optional
|
||||
Flag indicating to use standardized data with mean 0 and unit
|
||||
variance. standardized being True implies demean.
|
||||
demean : bool, optional
|
||||
Flag indicating whether to demean data before computing principal
|
||||
components. demean is ignored if standardize is True.
|
||||
normalize : bool , optional
|
||||
Indicates whether th normalize the factors to have unit inner
|
||||
product. If False, the loadings will have unit inner product.
|
||||
gls : bool, optional
|
||||
Flag indicating to implement a two-step GLS estimator where
|
||||
in the first step principal components are used to estimate residuals,
|
||||
and then the inverse residual variance is used as a set of weights to
|
||||
estimate the final principal components
|
||||
weights : ndarray, optional
|
||||
Series weights to use after transforming data according to standardize
|
||||
or demean when computing the principal components.
|
||||
method : str, optional
|
||||
Determines the linear algebra routine uses. 'eig', the default,
|
||||
uses an eigenvalue decomposition. 'svd' uses a singular value
|
||||
decomposition.
|
||||
|
||||
Returns
|
||||
-------
|
||||
factors : {ndarray, DataFrame}
|
||||
Array (nobs, ncomp) of principal components (also known as scores).
|
||||
loadings : {ndarray, DataFrame}
|
||||
Array (ncomp, nvar) of principal component loadings for constructing
|
||||
the factors.
|
||||
projection : {ndarray, DataFrame}
|
||||
Array (nobs, nvar) containing the projection of the data onto the ncomp
|
||||
estimated factors.
|
||||
rsquare : {ndarray, Series}
|
||||
Array (ncomp,) where the element in the ith position is the R-square
|
||||
of including the fist i principal components. The values are
|
||||
calculated on the transformed data, not the original data.
|
||||
ic : {ndarray, DataFrame}
|
||||
Array (ncomp, 3) containing the Bai and Ng (2003) Information
|
||||
criteria. Each column is a different criteria, and each row
|
||||
represents the number of included factors.
|
||||
eigenvals : {ndarray, Series}
|
||||
Array of eigenvalues (nvar,).
|
||||
eigenvecs : {ndarray, DataFrame}
|
||||
Array of eigenvectors. (nvar, nvar).
|
||||
|
||||
Notes
|
||||
-----
|
||||
This is a simple function wrapper around the PCA class. See PCA for
|
||||
more information and additional methods.
|
||||
"""
|
||||
pc = PCA(data, ncomp=ncomp, standardize=standardize, demean=demean,
|
||||
normalize=normalize, gls=gls, weights=weights, method=method)
|
||||
|
||||
return (pc.factors, pc.loadings, pc.projection, pc.rsquare, pc.ic,
|
||||
pc.eigenvals, pc.eigenvecs)
|
||||
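Editor's note: a minimal usage sketch of the `pca` wrapper above, not part of the commit. The import path `statsmodels.multivariate.pca` and the random input data are assumptions for illustration; the returned tuple follows the docstring.

import numpy as np
from statsmodels.multivariate.pca import pca  # assumed module path

rng = np.random.default_rng(0)
x = rng.standard_normal((100, 5))  # 100 observations of 5 variables
factors, loadings, projection, rsquare, ic, eigenvals, eigenvecs = pca(
    x, ncomp=2, standardize=True)
print(factors.shape)  # (100, 2): one column of scores per component
print(rsquare[2])     # R-square from including the first 2 components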
@ -0,0 +1,140 @@
import matplotlib.pyplot as plt
import numpy as np


def plot_scree(eigenvals, total_var, ncomp=None, x_label='factor'):
    """
    Plot of the ordered eigenvalues and variance explained for the loadings

    Parameters
    ----------
    eigenvals : array_like
        The eigenvalues
    total_var : float
        the total variance (for plotting percent variance explained)
    ncomp : int, optional
        Number of factors to include in the plot. If None, all possible
        loadings are included.
    x_label : str
        label of x-axis

    Returns
    -------
    Figure
        Handle to the figure.
    """
    fig = plt.figure()
    ncomp = len(eigenvals) if ncomp is None else ncomp
    vals = eigenvals
    vals = vals[:ncomp]
    # vals = np.cumsum(vals)

    ax = fig.add_subplot(121)
    ax.plot(np.arange(ncomp), vals[:ncomp], 'b-o')
    ax.autoscale(tight=True)
    xlim = np.array(ax.get_xlim())
    sp = xlim[1] - xlim[0]
    xlim += 0.02 * np.array([-sp, sp])
    ax.set_xticks(np.arange(ncomp))
    ax.set_xlim(xlim)

    ylim = np.array(ax.get_ylim())
    scale = 0.02
    sp = ylim[1] - ylim[0]
    ylim += scale * np.array([-sp, sp])
    ax.set_ylim(ylim)
    ax.set_title('Scree Plot')
    ax.set_ylabel('Eigenvalue')
    ax.set_xlabel(x_label)

    per_variance = vals / total_var
    cumper_variance = np.cumsum(per_variance)
    ax = fig.add_subplot(122)

    ax.plot(np.arange(ncomp), per_variance[:ncomp], 'b-o')
    ax.plot(np.arange(ncomp), cumper_variance[:ncomp], 'g--o')
    ax.autoscale(tight=True)
    xlim = np.array(ax.get_xlim())
    sp = xlim[1] - xlim[0]
    xlim += 0.02 * np.array([-sp, sp])
    ax.set_xticks(np.arange(ncomp))
    ax.set_xlim(xlim)

    ylim = np.array(ax.get_ylim())
    scale = 0.02
    sp = ylim[1] - ylim[0]
    ylim += scale * np.array([-sp, sp])
    ax.set_ylim(ylim)
    ax.set_title('Variance Explained')
    ax.set_ylabel('Proportion')
    ax.set_xlabel(x_label)
    ax.legend(['Proportion', 'Cumulative'], loc=5)
    fig.tight_layout()
    return fig


def plot_loadings(loadings, col_names=None, row_names=None,
                  loading_pairs=None, percent_variance=None,
                  title='Factor patterns'):
    """
    Plot factor loadings in 2-d plots

    Parameters
    ----------
    loadings : array_like
        Each column is a component (or factor)
    col_names : a list of strings
        column names of `loadings`
    row_names : a list of strings
        row names of `loadings`
    loading_pairs : None or a list of tuples
        Specify plots. Each tuple (i, j) represents one figure; i and j are
        the loading numbers for the x-axis and y-axis, respectively. If
        `None`, all combinations of the loadings will be plotted.
    percent_variance : array_like
        The percent variance explained by each factor.

    Returns
    -------
    figs : a list of figure handles
    """
    k_var, n_factor = loadings.shape
    if loading_pairs is None:
        loading_pairs = []
        for i in range(n_factor):
            for j in range(i + 1, n_factor):
                loading_pairs.append([i, j])
    if col_names is None:
        col_names = ["factor %d" % i for i in range(n_factor)]
    if row_names is None:
        row_names = ["var %d" % i for i in range(k_var)]
    figs = []
    for item in loading_pairs:
        i = item[0]
        j = item[1]
        fig = plt.figure(figsize=(7, 7))
        figs.append(fig)
        ax = fig.add_subplot(111)
        for k in range(loadings.shape[0]):
            plt.text(loadings[k, i], loadings[k, j],
                     row_names[k], fontsize=12)
        ax.plot(loadings[:, i], loadings[:, j], 'bo')
        ax.set_title(title)
        if percent_variance is not None:
            x_str = f'{col_names[i]} ({percent_variance[i]:.1f}%)'
            y_str = f'{col_names[j]} ({percent_variance[j]:.1f}%)'
            ax.set_xlabel(x_str)
            ax.set_ylabel(y_str)
        else:
            ax.set_xlabel(col_names[i])
            ax.set_ylabel(col_names[j])
        v = 1.05
        xlim = np.array([-v, v])
        ylim = np.array([-v, v])
        ax.plot(xlim, [0, 0], 'k--')
        ax.plot([0, 0], ylim, 'k--')
        ax.set_aspect('equal', 'datalim')
        ax.set_xlim(xlim)
        ax.set_ylim(ylim)
        fig.tight_layout()
    return figs
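Editor's note: a minimal sketch, not part of the commit, showing the plotting helpers above called directly. The module path `statsmodels.multivariate.plots` is an assumption based on this commit's layout, and the loadings matrix is fabricated for illustration.

import numpy as np
from statsmodels.multivariate.plots import plot_scree, plot_loadings  # assumed path

rng = np.random.default_rng(0)
loadings = rng.uniform(-1, 1, size=(6, 3))  # 6 variables, 3 factors
figs = plot_loadings(loadings)  # pairs (0, 1), (0, 2), (1, 2) -> 3 figures
eigenvals = np.array([3.0, 1.2, 0.5, 0.2, 0.08, 0.02])
fig = plot_scree(eigenvals, total_var=eigenvals.sum(), x_label='factor')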
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -0,0 +1,294 @@
from numpy import array

from statsmodels.tools.testing import Holder


data = Holder()
data.comment = 'generated data, divide by 1000'
data.name = 'data'
data.xo = array([
    [-419, -731, -1306, -1294],
    [6, 529, -200, -437],
    [-27, -833, -6, -564],
    [-304, -273, -502, -739],
    [1377, -912, 927, 280],
    [-375, -517, -514, 49],
    [247, -504, 123, -259],
    [712, 534, -773, 286],
    [195, -1080, 3256, -178],
    [-854, 75, -706, -1084],
    [-1219, -612, -15, -203],
    [550, -628, -483, -2686],
    [-365, 1376, -1266, 317],
    [-489, 544, -195, 431],
    [-656, 854, 840, -723],
    [16, -1385, -880, -460],
    [258, -2252, 96, 54],
    [2049, -750, -1115, 381],
    [-65, 280, -777, 416],
    [755, 82, -806, 1027],
    [-39, -170, -2134, 743],
    [-859, 780, 746, -133],
    [762, 252, -450, -459],
    [-941, -202, 49, -202],
    [-54, 115, 455, 388],
    [-1348, 1246, 1430, -480],
    [229, -535, -1831, 1524],
    [-651, -167, 2116, 483],
    [-1249, -1373, 888, -1092],
    [-75, -2162, 486, -496],
    [2436, -1627, -1069, 162],
    [-63, 560, -601, 587],
    [-60, 1051, -277, 1323],
    [1329, -1294, 68, 5],
    [1532, -633, -923, 696],
    [669, 895, -1762, -375],
    [1129, -548, 2064, 609],
    [1320, 573, 2119, 270],
    [-213, -412, -2517, 1685],
    [73, -979, 1312, -1220],
    [-1360, -2107, -237, 1522],
    [-645, 205, -543, -169],
    [-212, 1072, 543, -128],
    [-352, -129, -605, -904],
    [511, 85, 167, -1914],
    [1515, 1862, 942, 1622],
    [-465, 623, -495, -89],
    [-1396, -979, 1758, 128],
    [-255, -47, 980, 501],
    [-1282, -58, -49, -610],
    [-889, -1177, -492, 494],
    [1415, 1146, 696, -722],
    [1237, -224, -1609, -64],
    [-528, -1625, 231, 883],
    [-327, 1636, -476, -361],
    [-781, 793, 1882, 234],
    [-506, -561, 1988, -810],
    [-1233, 1467, -261, 2164],
    [53, 1069, 824, 2123],
    [-1200, -441, -321, 339],
    [1606, 298, -995, 1292],
    [-1740, -672, -1628, -129],
    [-1450, -354, 224, -657],
    [-2556, 1006, -706, -1453],
    [-717, -463, 345, -1821],
    [1056, -38, -420, -455],
    [-523, 565, 425, 1138],
    [-1030, -187, 683, 78],
    [-214, -312, -1171, -528],
    [819, 736, -265, 423],
    [1339, 351, 1142, 579],
    [-387, -126, -1573, 2346],
    [969, 2, 327, -134],
    [163, 227, 90, 2021],
    [1022, -1076, 174, 304],
    [1042, 1317, 311, 880],
    [2018, -840, 295, 2651],
    [-277, 566, 1147, -189],
    [20, 467, 1262, 263],
    [-663, 1061, -1552, -1159],
    [1830, 391, 2534, -199],
    [-487, 752, -1061, 351],
    [-2138, -556, -367, -457],
    [-868, -411, -559, 726],
    [1770, 819, -892, -363],
    [553, -736, -169, -490],
    [388, -503, 809, -821],
    [-516, -1452, -192, 483],
    [493, 2904, 1318, 2591],
    [175, 584, -1001, 1675],
    [1316, -1596, -460, 1500],
    [1212, 214, -644, -696],
    [-501, 338, 1197, -841],
    [-587, -469, -1101, 24],
    [-1205, 1910, 659, 1232],
    [-150, 398, 594, 394],
    [34, -663, 235, -334],
    [-1580, 647, 239, -351],
    [-2177, -345, 1215, -1494],
    [1923, 329, -152, 1128]])

princomp1 = Holder()
princomp1.comment = 'mlab.princomp(x, nout=3)'
princomp1.factors = array([
    [-.83487832815382, -1.75681522344645, -.50882660928949, -.59661466511045],
    [-.18695786699253, -.10732909330422, .23971799542554, -.75468286946853],
    [-.57403949255604, -.39667006607544, -.7927838094217, .02652621881328],
    [-.60828125251513, -.75979035898754, -.20148864200404, -.40278856050237],
    [.55997928601548, .88869370546643, -1.55474410845786, .23033958281961],
    [-.18023239851961, -.72398923145328, -.07056264751117, .29292391015376],
    [-.189029743271, -.05888596186903, -.63882208368513, -.05682951829677],
    [.94694345324739, -.33448036234864, .16665867708366, -.67190948646953],
    [-1.355171899399, 2.58899695901774, -1.53157119606928, .93743278678908],
    [-1.06797676403358, -1.01894055566289, .29181722134698, -.65261957826524],
    [-1.08919199915725, -.5395876105009, .18846579824378, .61935728909742],
    [-1.36598849770841, -1.00986627679465, -1.6090477073157, -1.82708847399443],  # noqa:E501
    [.561511276285, -.74919011595195, 1.49872898209738, -.80588545345232],
    [.04805787176428, -.05522267212748, .82943784435024, .01537039050312],
    [-1.12006939155398, .73462770352006, .58868274831601, -.67786987413505],
    [-.26087838474316, -1.33362289066951, -1.02932517860259, .24865839951801],
    [-.24666198784909, -.58247196399204, -1.78971960966265, 1.18908143657302],
    [1.80675592845666, -.73341258204636, -1.45012544705912, -.44875329121288],
    [.4794281391435, -.57169295903913, .48557628591056, -.11638075289238],
    [1.39425263398653, -.3665732682294, .06937942447187, .06683559082703],
    [1.11015707065101, -1.87631329249852, .48914958604867, .11096926802212],
    [-.85159530389901, .68543874135386, .86736021483251, -.17641002537865],
    [.34109015314112, -.25431311542374, -.36804227540019, -.95824474920131],
    [-.86253950274987, -.28796613689709, .30820634958709, .27228599921917],
    [.01266190412089, .48559962017667, .14020630700546, .18517398749337],
    [-1.56345869427724, 1.27917754070516, 1.25640847929385, -.36055181722313],
    [1.62834293379132, -1.51923809467869, .27754976407182, .79362967384835],
    [-.94400458067084, 1.77733054371289, .03595731772774, .96570688640992],
    [-2.11906234438329, -.13226430948321, -.78992396115366, .66362103473975],
    [-.94372331181891, -.37502966791165, -1.77907324401749, .97801542954941],
    [1.76575198740032, -.92309597844861, -2.3872195277998, -.21817018301121],
    [.57418226616373, -.2925257318724, .71180507312941, -.13937750314467],
    [1.01654397566275, .28855305878842, 1.25119859389106, .11257524396004],
    [.58979013567212, -.06866577243092, -1.74447546690995, .13917953157575],
    [1.62072087150051, -.5835145063711, -.99029357957459, -.06334029436682],
    [.893493925425, -1.23995040005948, .40058503790479, -1.49029669097391],
    [.26990527585623, 2.03399854143898, -1.2335089890881, .54010061879979],
    [.33504096277444, 2.42394994177782, -.6643863358332, -.42471161848557],
    [1.69952476943058, -2.1707037237448, .79694026483866, .88177267205969],
    [-1.41498253257895, .65248089992094, -1.40045976465378, -.12045332880702],
    [-.22640706265253, -.94114558124915, -.18868114063537, 2.67652245892778],
    [-.37493712386529, -.61985213642068, .5383582946365, -.17931524703276],
    [-.30437796317839, .74252786648649, .73255373596822, -.64993745548429],
    [-.68788283675831, -.84714762684627, -.10721753874211, -.59777382822281],
    [-1.00667616522842, -.06670525233919, -.92973707141688, -1.60742284256649],
    [1.95220512266515, 2.05751265066695, .79640648143073, -.59608004229343],
    [-.15504464969388, -.3882079443045, .75049869361395, -.44163703260023],
    [-1.6686863460652, .96325894557423, -.16453379247258, 1.4560996746313],
    [-.25573631707529, .88265554068571, .08984550855664, .53561910563178],
    [-1.29430028690793, -.48042359291447, .49318558750269, .03689178852848],
    [-.34391235307349, -.95154811896716, -.09714022474353, 1.19792361047367],
    [.34367523316975, 1.16641214447854, -.39528838072965, -1.72565643987406],
    [1.23887392116229, -1.27474554996132, -.65859544264097, -.81757560038832],
    [-.17739006831099, -.29057501559843, -.62533324788504, 1.7092669546224],
    [-.08610919021307, -.06524996994257, 1.3018284944661, -1.28219607271255],
    [-.95717735853496, 1.79841555744597, .75799149339397, .23542916575208],
    [-1.70175078442029, 1.33831900642462, -.73979048943944, .26157699746442],
    [.84631686421106, .32029666775009, 2.51638540556813, .90367536744335],
    [1.22693220256582, 1.45665385966518, 1.27480662666555, .78786331120259],
    [-.59251239046609, -.660398245535, .53258334042042, .81248748854679],
    [2.22723057510913, -.22856960444805, -.15586801032885, -.26957090658609],
    [-.83192612439183, -2.11983096548132, .75319973501664, .62196293266702],
    [-1.577627210601, -.3747136286972, .31736538266249, .30187577548949],
    [-2.28230005998543, -1.17283119424281, 1.83780755209602, -.75928026219594],
    [-1.90574204329052, -.34197417196464, -.59978910354131, -.68240235236779],
    [.48132729275936, -.2524965456322, -.75271273075, -.89651237903089],
    [.26961427953002, .62968227134995, .99324664633985, .59917742452108],
    [-.95910506784013, .31907970712369, .35568397653203, .60155535679072],
    [-.18528259973205, -1.31831013869974, -.09749195643548, -.39885348684496],
    [.9608404103702, .23727553971573, .20695289013955, -.65281918968052],
    [.85302395609555, 1.5303724004181, -.56440186223081, -.27348033453255],
    [1.72786301913767, -1.14859994931789, 1.16222121440674, 1.39284961909257],
    [.37711527308989, .47231886947072, -.69423676772182, -.53515102147655],
    [1.35642227654922, .53204130038923, .69844068787197, 1.04544871561741],
    [.57797880484094, .08044525072063, -1.32634695941334, .35179408060132],
    [1.29437232500619, 1.07461562326311, .54545226737269, -.6836610122092],
    [2.74736726573105, .90881277479338, -.98342785084735, 1.38171127911719],
    [-.67749479829901, 1.10093727650063, .28416704607992, -.24984509303044],
    [-.24513961858774, 1.32098977907584, .16904762754153, .00886790270539],
    [-.5392290825383, -1.43851802284774, 1.0064737206577, -1.52649870396689],
    [.19486366400459, 2.77236000318994, -1.32201258472682, -.75922390642504],
    [.33271229220962, -.78464273816827, 1.09930224781861, -.32184679755027],
    [-1.72814706427698, -1.09275114767838, .7451569579997, .72871211772761],
    [-.035506207751, -.72161367235521, .52828318684787, .87177739169758],
    [1.31224955134141, -.22742530984642, -.44682270809773, -1.72769462581607],
    [-.07125058353119, -.36850925227739, -1.01188688859296, -.24962251325969],
    [-.69840680770104, .4925285516285, -1.0255829922787, -.36214090052941],
    [-.2530614593082, -.68595709316063, -.56882710610856, 1.25787365685572],
    [1.93782484285419, 2.67095706598253, 2.4023579082791, -.09112046819432],
    [1.57782156817208, -.39819017512275, 1.01938038947667, .39718992194809],
    [1.6839282738726, -.37808442385434, -1.36566197748227, 1.22029200163339],
    [.54652714502605, -.38206797548206, -.70554510441189, -1.31224358889695],
    [-1.30026063006148, .90642495630747, .02711437433058, -.44482098905042],
    [-.1239033493518, -1.29112252171673, .18092802221218, .22673242779457],
    [.01152882540055, 1.13242883415094, 2.34980443084773, .17712319903618],
    [-.0505195424414, .6807219067402, .37771832345982, .0842510459176],
    [-.44230076745505, -.07002728477811, -.6716520563439, .09637247949641],
    [-1.31245480585229, -.01674966464909, 1.21063252882651, -.03927111631335],
    [-2.94268586886381, .20925236551048, .30321714445262, .22027672852006],
    [2.04121905977187, .58496246543101, -.5192457175416, -.37212298770116]])
princomp1.values = array([
    [1.29489288337888],
    [1.12722515391348],
    [.94682423958163],
    [.65890241090379]])
princomp1.name = 'princomp1'
princomp1.coef = array([
    [.65989917631713, .22621848650964, -.5882833472413, -.40899997165748],
    [.15824945056105, .3189419948895, .71689623797385, -.5994104597619],
    [-.3488766362785, .90294049788532, -.17151017930575, .1832151967827],
    [.64635538301471, .17832458477678, .33251578268108, .66321815082225]])

princomp2 = Holder()
princomp2.comment = 'mlab.princomp(x[:20,], nout=3)'
princomp2.factors = array([
    [.74592631465403, -.92093638563647, 1.10020213969681, -.20234362115983],
    [.40379773814409, -.23694214086306, -.53526599590626, .48048423978257],
    [-.43826559396565, -.26267383420164, .35939862515391, -.15176605914773],
    [.29427656853499, -.56363285386285, .19525662206552, -.0384830001072],
    [-1.4327917748351, 1.18414191887856, .05435949672922, .46861687286613],
    [.23033214569426, -.00452237842477, .00346120473054, -.61483888402985],
    [-.40976419499281, .10137131352284, .02570805136468, .06798926306103],
    [.83201287149759, .82736894861103, -.35298970920805, .49344802383821],
    [-3.36634598435507, -.18324521714611, -1.12118215528184, .2057949493723],
    [.70198992281665, -1.1856449495675, .02465727900177, -.08333428418838],
    [-.13789069679894, -.79430992968357, -.33106496391047, -1.01808298459082],
    [-.10779840884825, -1.41970796854378, 1.55590290358904, 1.34014813517248],
    [1.8229340670437, .13065838030104, -1.06152350166072, .11456488463131],
    [.51650051521229, .07999783864926, -1.08601194413786, -.28255247881905],
    [-.24654203558433, -1.02895891025197, -1.34475655787845, .52240852619949],
    [.03542169335227, -.01198903021187, 1.12649412049726, -.60518306798831],
    [-1.23945075955452, .48778599927278, 1.11522465483282, -.994827967694],
    [.30661562766349, 1.91993049714024, 1.08834307939522, .61608892787963],
    [.8241280516035, .43533554216801, -.48261931874702, -.22391158066897],
    [.6649139327178, 1.44597315984982, -.33359403032613, -.094219894409]])
princomp2.values = array([
    [1.16965204468073],
    [.77687367815155],
    [.72297937656591],
    [.32548581375971]])
princomp2.name = 'princomp2'
princomp2.coef = array([
    [-.13957162231397, .6561182967648, .32256106777669, .66781951188167],
    [.49534264552989, -.08241251099014, -.6919444767593, .51870674049413],
    [-.85614372781797, -.11427402995055, -.47665923729502, .16357058078438],
    [.04661912785591, .74138950947638, -.43584764555793, -.50813884128056]])

princomp3 = Holder()
princomp3.comment = 'mlab.princomp(x[:20,]-x[:20,].mean(0), nout=3)'
princomp3.factors = array([
    [.74592631465403, -.92093638563647, 1.10020213969681, -.20234362115983],
    [.40379773814409, -.23694214086306, -.53526599590626, .48048423978257],
    [-.43826559396565, -.26267383420164, .35939862515391, -.15176605914773],
    [.29427656853499, -.56363285386285, .19525662206552, -.0384830001072],
    [-1.4327917748351, 1.18414191887856, .05435949672922, .46861687286613],
    [.23033214569426, -.00452237842477, .00346120473054, -.61483888402985],
    [-.40976419499281, .10137131352284, .02570805136468, .06798926306103],
    [.83201287149759, .82736894861103, -.35298970920805, .49344802383821],
    [-3.36634598435507, -.18324521714611, -1.12118215528184, .2057949493723],
    [.70198992281665, -1.1856449495675, .02465727900177, -.08333428418838],
    [-.13789069679894, -.79430992968357, -.33106496391047, -1.01808298459082],
    [-.10779840884825, -1.41970796854378, 1.55590290358904, 1.34014813517248],
    [1.8229340670437, .13065838030104, -1.06152350166072, .11456488463131],
    [.51650051521229, .07999783864926, -1.08601194413786, -.28255247881905],
    [-.24654203558433, -1.02895891025197, -1.34475655787845, .52240852619949],
    [.03542169335227, -.01198903021187, 1.12649412049726, -.60518306798831],
    [-1.23945075955452, .48778599927278, 1.11522465483282, -.994827967694],
    [.30661562766349, 1.91993049714024, 1.08834307939522, .61608892787963],
    [.8241280516035, .43533554216801, -.48261931874702, -.22391158066897],
    [.6649139327178, 1.44597315984982, -.33359403032613, -.094219894409]])
princomp3.values = array([
    [1.16965204468073],
    [.77687367815155],
    [.72297937656591],
    [.32548581375971]])
princomp3.name = 'princomp3'
princomp3.coef = array([
    [-.13957162231397, .6561182967648, .32256106777669, .66781951188167],
    [.49534264552989, -.08241251099014, -.6919444767593, .51870674049413],
    [-.85614372781797, -.11427402995055, -.47665923729502, .16357058078438],
    [.04661912785591, .74138950947638, -.43584764555793, -.50813884128056]])
@ -0,0 +1,101 @@
var1,var2,var3,var4,var5
2.3358,0.0044,0.3163,0.8698,1.4817
3.1387,-0.1494,1.1793,2.1482,-0.2141
0.0501,0.6111,-0.892,1.0971,-2.6557
-0.029,-1.7519,-0.5098,-0.5294,0.2512
-0.0012,-0.8835,3.1745,3.6743,2.9339
1.3228,0.1419,0.6433,2.5167,0.9105
0.9066,-0.7031,-0.9839,-0.0551,0.049
-1.5817,-1.332,1.0518,-1.1584,-0.9183
2.9412,-1.9048,-1.328,0.3225,-0.2039
-1.5636,-1.506,1.6153,1.8511,0.9356
-0.5645,-0.7889,1.136,1.9609,2.5086
-0.802,-0.3495,-1.6138,-0.4514,-0.5998
0.7878,0.8931,0.3226,-1.0011,1.4319
-2.375,-0.6369,-0.5691,-1.3663,-1.7005
-0.2379,0.4552,-0.0294,-0.5558,1.4641
-1.3576,-0.1541,0.2684,-2.3244,-1.2963
0.9799,0.219,-2.0489,-3.1371,-1.0562
1.5379,2.7226,-0.0049,-3.8528,-0.4739
-0.8352,-0.8438,-0.4886,0.8641,-1.2653
1.3476,-0.0039,-0.8244,0.2143,0.0362
0.3315,-0.2731,-0.2188,-2.3388,-0.3962
-0.2723,0.6647,-0.2893,0.0999,-0.8111
-0.1344,0.695,0.6257,-0.283,-0.5913
-2.2572,-1.5057,1.3967,0.471,0.0997
1.0519,-1.3884,1.0226,-1.0947,1.3978
1.7878,1.8082,-0.694,0.6162,-0.9046
0.5601,0.8636,0.4449,0.6998,1.0791
-0.2156,1.4126,2.0064,0.3332,0.0751
-1.2607,-1.2132,-0.0598,-1.693,-1.0813
0.7107,1.9284,1.2093,-0.0175,1.0042
0.0362,1.7571,-0.0752,1.8337,2.6863
2.1175,0.8949,-1.765,0.6082,0.8375
-1.0219,0.2911,-0.727,0.2553,-1.6644
2.653,0.0148,0.4559,-0.0419,1.2743
-0.3103,0.4724,-0.6975,0.3755,3.4604
-1.7467,0.4565,-1.7263,0.9031,0.1875
-0.574,-2.3953,-0.8059,1.5461,-0.8906
-1.5758,1.8004,-1.3741,0.9648,0.0344
-1.2976,-0.6741,2.0647,2.1778,1.5391
0.6771,2.042,0.3806,-2.4027,-2.3492
-1.0357,0.5604,0.2532,-1.6972,-0.4285
-0.17,-0.0818,-2.5097,-1.4429,-0.8825
-0.9111,-0.5983,-1.3297,0.5678,2.5338
0.0865,2.3449,-1.9526,0.16,0.4645
0.7475,-0.5134,-0.598,0.5344,0.0727
-2.298,-0.8431,0.2371,-0.7896,-1.7017
3.008,-0.271,0.4868,0.4959,0.1369
0.376,1.0972,-1.4817,0.1465,0.8261
-0.2943,-1.9401,-0.4638,1.8092,0.9328
0.131,-0.8266,-1.4767,-0.5936,-2.0493
-0.1,0.265,0.4371,1.1967,1.8712
0.8886,0.945,-0.1471,-0.1363,-0.9092
0.1406,-0.5044,-1.3068,1.441,-3.8205
1.896,1.0309,1.1718,2.3715,1.6846
-2.3731,0.3547,-2.5275,0.3097,-1.4761
-0.5936,-1.5261,-1.0773,1.417,1.3027
-2.4798,-1.5857,-0.6344,-2.1682,-0.002
0.7588,0.0225,1.2982,0.01,1.1708
-0.0718,1.9237,1.3538,1.4318,1.4835
1.1017,-0.5897,-0.3399,1.2663,1.6784
-0.7308,0.6094,-0.7773,0.2373,1.013
1.0155,-0.2549,1.2958,0.6724,0.484
-0.4901,0.92,0.4208,0.2325,1.6677
0.6138,1.4609,0.3375,-0.8655,-1.2248
0.3232,-0.2704,2.8568,-0.7418,1.2925
1.1547,0.2841,0.3959,-0.2621,1.2498
-0.8148,-0.1754,-0.6326,-2.8309,-3.0651
-2.6977,-1.9161,-1.1292,-1.4923,0.3646
-3.1057,-0.2471,0.3585,-1.0263,-0.1043
0.666,0.368,0.0196,-1.1868,0.2599
-1.0735,-1.3328,-0.9537,-0.2594,-1.2733
-0.0316,2.3285,1.872,0.1398,3.1739
-0.495,-0.245,-2.0064,-1.315,-1.4454
-1.1888,-1.0905,1.0745,1.2094,1.4798
-2.7048,-0.9399,-1.1409,-1.3737,-1.2151
1.2275,2.3317,-1.3622,-0.9929,-1.5922
-2.659,-1.18,-1.6486,-0.2288,0.4164
-0.5639,2.0618,-1.9634,0.1514,1.6458
-1.8483,-0.4639,0.6209,-0.0183,2.4059
-0.4303,-0.1728,-0.3347,-0.3546,-0.7524
1.9564,-0.6527,0.4776,1.3519,-0.9619
-1.5531,-1.2717,1.4032,0.9843,0.3788
2.0049,-0.6503,0.0042,-0.3649,1.1627
-0.1315,0.5443,0.5422,0.8582,0.4374
0.5894,-0.2894,0.8457,0.641,0.3239
1.7067,-0.4797,-0.2498,1.1692,0.5081
-3.2533,1.3689,1.0815,1.6946,0.8739
2.8036,0.5355,0.0828,-0.7673,-1.0338
-1.0385,-0.6787,0.8265,-1.7571,-3.1357
-2.1853,0.2404,-0.5056,-1.7177,0.6123
2.2815,0.5445,1.2507,0.6492,-0.6182
0.247,0.1745,0.8681,-1.4099,1.3582
0.1303,0.9697,0.6633,0.3373,-0.5746
-0.6143,-0.3428,1.3671,-1.5012,-2.0953
3.2129,0.5585,0.0043,0.9622,-1.0555
-1.3977,-0.1699,-2.4553,-1.2764,-1.0301
-1.1966,-0.6408,-1.0887,-1.4875,-0.4743
-1.7013,0.2085,0.2438,-1.2822,-1.4098
-0.6957,-1.055,-0.6753,-0.3784,-1.9997
1.7702,1.1211,-0.6032,-0.6982,0.4066
@ -0,0 +1,101 @@
f1,f2,f1b,f2b,f1o,f2o,f1ob,f2ob
.77409906,.5265066,1.2342164,1.5539443,.82174469,.64430356,1.3798437,1.7324318
.90211532,.5778448,1.4354716,1.691956,.95415644,.71558883,1.5935961,1.9010544
-.55152949,.10318112,-1.0868707,.62321661,-.53884179,.01300941,-1.0209297,.44003871
-.11934901,-.53549872,.01129338,-1.8973372,-.17099081,-.54772966,-.17374833,-1.8707504
2.3941179,-.26282474,4.6079809,-1.9774392,2.3570865,.12618302,4.3932292,-1.2095023
1.0927031,.30140322,1.9283693,.6165865,1.1168835,.47345041,1.9792983,.91910478
-.13079791,-.02496757,-.22584839,-.04664732,-.13260905,-.04570687,-.22932042,-.08241165
-.33812166,-.74795931,-.33393451,-2.5509963,-.40943578,-.7926505,-.58106234,-2.5714763
-.04786263,.01681279,-.0859222,.0515025,-.04599537,.00888495,-.08049141,.03699227
1.0480495,-.81266539,2.3144765,-3.3816091,.96382241,-.63326664,1.9737471,-2.964715
1.3937318,-.33515776,2.7734803,-1.8141489,1.3544142,-.10631967,2.5833894,-1.3437932
-.60405968,-.29122039,-.99811882,-.79446532,-.62957534,-.38470365,-1.0708228,-.94484311
.1768074,.51456466,.11159002,1.777466,.22613441,.53632266,.28435898,1.7722347
-.96740945,-.723384,-1.5150277,-2.1864758,-1.0333294,-.86974388,-1.7209881,-2.4019313
.20389441,.14744239,.3268983,.44172513,.21729843,.17835522,.36840847,.48860637
-.92590203,-.31839571,-1.6216858,-.74416985,-.95253387,-.46335749,-1.6865151,-.99563074
-1.467613,.3140052,-2.8897371,1.7524627,-1.4300057,.07354455,-2.7051065,1.2641888
-1.1479964,1.2582306,-2.7173854,5.041731,-1.0198511,1.0569188,-2.2128766,4.5382765
-.17961277,-.49664318,-.12436604,-1.7124124,-.22717909,-.51908693,-.29073149,-1.7100879
-.03173505,.28265502,-.17368455,1.0251028,-.00402536,.27385425,-.07291087,.98374888
-.73229133,.02886435,-1.397402,.41212285,-.7259882,-.08944924,-1.3505629,.18168897
-.24941449,.1269423,-.52647655,.57374862,-.23584948,.08511655,-.46802849,.48146893
-.10732968,.18432286,-.29391519,.7210821,-.08884709,.16463107,-.22221031,.66433352
.39880577,-.95391695,1.1482693,-3.6091908,.30390006,-.87723612,.79090718,-3.3771456
.35872658,-.13977614,.72766572,-.67702616,.34338949,-.08017796,.6581897,-.55099642
-.1661692,.91000156,-.70379606,3.3628839,-.07665355,.87136053,-.37256614,3.2056372
.55913446,.40910597,.87726935,1.2383401,.59635784,.49381506,.99382627,1.3634605
.4792189,.40196303,.70586224,1.2632806,.51612661,.47389483,.82566748,1.36047
-.73646072,-.63906787,-1.1203902,-1.9917138,-.79526024,-.74933373,-1.3092417,-2.1461547
.4795629,.79207339,.55137636,2.6632818,.55450422,.85896807,.80841591,2.7173154
1.1127571,.59389352,1.8580276,1.6695604,1.1653594,.76535228,2.0119553,1.9470045
.08956588,.73845254,-.12614275,2.6149467,.16113731,.7432375,.12941213,2.5604956
-.52911256,-.1793241,-.91998487,-.4085011,-.54407555,-.26219769,-.95543007,-.55133367
.51942396,.61292952,.71244766,1.9736557,.57670908,.68858245,.90148216,2.0626324
.8132702,.16897349,1.4862315,.24911189,.82587019,.29774627,1.5034386,.48522013
-.12857721,-.24986049,-.11127733,-.84082875,-.1523257,-.26730639,-.19272687,-.84777395
.08574189,-.93312063,.57059076,-3.4152487,-.00564465,-.90713076,.23488998,-3.2787713
-.10989362,.20354792,-.27660743,.79824474,-.0895244,.18319224,-.19746176,.74327632
1.3642833,-.48395053,2.7669022,-2.3267277,1.3105988,-.25791284,2.5268668,-1.8507401
-1.1672069,.77791884,-2.5579395,3.3272367,-1.0857999,.57978316,-2.2213513,2.8718422
-.55165952,-.01582468,-1.0412775,.18814338,-.55057411,-.10446383,-1.0179728,.0179878
-1.1024776,-.06132851,-2.0295797,.2446312,-1.1032044,-.23808385,-1.9960589,-.08542956
.51121121,-.32752081,1.1344589,-1.4159784,.47684277,-.24091389,.99099791,-1.2147876
-.2827674,.75734733,-.8347537,2.8623962,-.20757982,.70192071,-.55169675,2.6905919
.09876273,-.01059937,.19723165,-.08903476,.09725877,.00544491,.1876112,-.056108
-.6421798,-.78126,-.88585867,-2.5329525,-.71529206,-.87448557,-1.1285977,-2.6425563
.40709836,.55585896,.51952852,1.8181175,.45935433,.61416665,.69431739,1.8780547
-.05459569,.44056716,-.27210725,1.6111778,-.01138085,.42602325,-.1137231,1.5463221
.67929573,-.6847309,1.5879431,-2.7776557,.60929897,-.56639064,1.3095598,-2.4856551
-.92607077,-.27752489,-1.6212673,-.60797893,-.94871697,-.42304737,-1.6728202,-.86115026
.87850547,.09109933,1.6219634,-.05027553,.88320204,.23139495,1.609334,.21160078
-.26970441,.46623015,-.71459682,1.8042973,-.22296261,.4167076,-.5352756,1.6656569
-.79989201,-.26861592,-1.3954051,-.61625155,-.82227074,-.39393339,-1.4488407,-.83293939
1.3522459,.72936579,2.2300877,2.0517907,1.4169156,.93762617,2.4195097,2.384166
-.88674943,-.45168537,-1.4528825,-1.2385071,-.92656338,-.58860158,-1.5667133,-1.4563288
.5197045,-.60245723,1.2605819,-2.4123386,.45848966,-.51089342,1.0193762,-2.1778289
-.74788779,-.97828004,-.98283317,-3.2156233,-.83970562,-1.0859582,-1.2916697,-3.3319334
.60152168,.20957638,1.0322015,.49464574,.61908925,.30371668,1.0755111,.65442646
.98432058,.60450513,1.5874137,1.7740498,1.0385694,.75514043,1.7528183,2.006547
.76389179,.07029355,1.4222455,-.08934065,.76710587,.19240203,1.4067588,.14088071
.12791207,.05698825,.23093481,.15178897,.13285894,.07684478,.24463378,.18699996
.62420468,.14365152,1.1006233,.24788906,.63523658,.24230553,1.1195484,.42191039
.53308959,.22886039,.90944184,.60071557,.55286335,.31172784,.96367793,.739341
-.47080363,.57870828,-1.1505061,2.3032231,-.41213726,.49533006,-.92046348,2.0878657
.73660289,.05121274,1.3362788,-.13133352,.73808663,.16917538,1.3171075,.08559093
.3750162,.38087227,.54184582,1.2062216,.41036404,.43629736,.65686931,1.2777407
-1.6628303,-.25019609,-3.0399027,-.18082388,-1.6793018,-.51473173,-3.0430496,-.66804499
-.57518971,-1.1354505,-.58045463,-3.8604945,-.68315424,-1.2132636,-.95408231,-3.9035827
-.33048237,-.71724484,-.31867622,-2.4346249,-.39883828,-.76110666,-.55453068,-2.4541665
-.21711319,.29449836,-.53874819,1.1527255,-.18736559,.25568751,-.42379218,1.0509114
-.57134442,-.67766186,-.78074037,-2.2034566,-.63469348,-.76083165,-.9918548,-2.3004322
1.1568365,.81787262,1.8212026,2.4652365,1.2310664,.99350661,2.0528831,2.7263631
-1.1158728,-.19839769,-2.0042252,-.23966229,-1.1298999,-.37552105,-2.0180431,-.55931767
.90698836,-.57501394,1.9541201,-2.467368,.84660404,-.42143564,1.7042449,-2.1204445
-.96822116,-.87756275,-1.4404227,-2.7473047,-1.0491694,-1.0220407,-1.7014187,-2.9434239
-.93949139,.96862152,-2.1843695,3.9104251,-.840576,.80467039,-1.7927011,3.5075818
-.34872923,-.92561876,-.23562044,-3.1952854,-.43731439,-.96969915,-.54603402,-3.1915211
-.01008118,.56239284,-.22983374,2.0376015,.04479942,.55342776,-.03007525,1.9739874
.6753194,-.46834193,1.4803838,-1.98315,.62643921,-.35346682,1.2799762,-1.7188435
-.34822558,-.15758844,-.58978344,-.41711157,-.36193118,-.2116136,-.62764134,-.50665223
.32773507,.16164562,.53650026,.43747533,.34193386,.21231779,.57659746,.51816881
.62853751,-.73379458,1.4879734,-2.9151351,.55399893,-.62298855,1.1966623,-2.6374401
.30674611,.26827549,.46503444,.81759821,.33144117,.31417541,.54253362,.88181986
.44192818,.14021085,.7685949,.32232893,.45349306,.20955383,.79635969,.44190477
.46887837,.03616074,.85905645,-.07268136,.47017009,.11120237,.84787727,.06661995
.48475223,.20430528,.83041791,.51731057,.5023622,.27970846,.87689861,.64429782
.73545423,-.27367592,1.5001756,-1.2762381,.70526722,-.15165716,1.3685967,-1.0179721
-.33495813,.75918345,-.96995976,2.8784489,-.25934288,.69532746,-.68469352,2.6846599
-1.0941765,-.47592994,-1.8866643,-1.2349393,-1.135366,-.64593615,-1.9980805,-1.5226689
-.48718385,-.3357568,-.76630583,-.99841793,-.51759859,-.40983563,-.85999933,-1.1087994
.3617318,.6347137,.38779162,2.1372949,.42189215,.68468566,.5943275,2.1718491
.15777211,.18310759,.21152508,.5882512,.17487318,.20612677,.26787102,.61463863
.07296441,.31367206,-.00945973,1.1123917,.1031994,.32132842,.09904208,1.096347
-.64908777,-.25472472,-1.1440559,-.62922708,-.67083062,-.35593625,-1.199954,-.80526531
.12122439,.81412903,-.13059422,2.8820144,.20002335,.82302476,.15102062,2.82336
-1.1305481,-.35912583,-1.9541954,-.81215474,-1.160176,-.53651452,-2.0240689,-1.1162794
-.75096061,-.43799405,-1.2186468,-1.2618644,-.79008663,-.55321997,-1.3358709,-1.4416571
-.70483427,-.304975,-1.2091407,-.78438434,-.73121086,-.41450863,-1.2798564,-.96887917
-.7211226,-.52612824,-1.1360481,-1.5881501,-.76898374,-.63539819,-1.2854782,-1.7503809
-.15257555,.75718097,-.60854757,2.7933029,-.07802447,.72272415,-.33330485,2.6588313
@ -0,0 +1,108 @@
import pandas as pd
from ..cancorr import CanCorr
from numpy.testing import assert_almost_equal

data_fit = pd.DataFrame([[191, 36, 50, 5, 162, 60],
                         [189, 37, 52, 2, 110, 60],
                         [193, 38, 58, 12, 101, 101],
                         [162, 35, 62, 12, 105, 37],
                         [189, 35, 46, 13, 155, 58],
                         [182, 36, 56, 4, 101, 42],
                         [211, 38, 56, 8, 101, 38],
                         [167, 34, 60, 6, 125, 40],
                         [176, 31, 74, 15, 200, 40],
                         [154, 33, 56, 17, 251, 250],
                         [169, 34, 50, 17, 120, 38],
                         [166, 33, 52, 13, 210, 115],
                         [154, 34, 64, 14, 215, 105],
                         [247, 46, 50, 1, 50, 50],
                         [193, 36, 46, 6, 70, 31],
                         [202, 37, 62, 12, 210, 120],
                         [176, 37, 54, 4, 60, 25],
                         [157, 32, 52, 11, 230, 80],
                         [156, 33, 54, 15, 225, 73],
                         [138, 33, 68, 2, 110, 43]])


def test_cancorr():
    # Compare results to SAS example:
    # https://support.sas.com/documentation/cdl/en/statug/63347/HTML/default/
    # viewer.htm#statug_cancorr_sect020.htm
    X1 = data_fit.iloc[:, :3]
    Y1 = data_fit.iloc[:, 3:]
    mod = CanCorr(Y1, X1)
    r = mod.corr_test()
    assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Value'],
                        0.35039053, decimal=8)
    assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Value'],
                        0.67848151, decimal=8)
    assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace", 'Value'],
                        1.77194146, decimal=8)
    assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Value'],
                        1.72473874, decimal=8)
    assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'F Value'],
                        2.05, decimal=2)
    assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'F Value'],
                        1.56, decimal=2)
    assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
                                       'F Value'],
                        2.64, decimal=2)
    assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'F Value'],
                        9.20, decimal=2)
    assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Num DF'],
                        9, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Num DF'],
                        9, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
                                       'Num DF'],
                        9, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Num DF'],
                        3, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Den DF'],
                        34.223, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Den DF'],
                        48, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
                                       'Den DF'],
                        19.053, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Den DF'],
                        16, decimal=3)
    assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Pr > F'],
                        0.0635, decimal=4)
    assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Pr > F'],
                        0.1551, decimal=4)
    assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
                                       'Pr > F'],
                        0.0357, decimal=4)
    assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Pr > F'],
                        0.0009, decimal=4)
    assert_almost_equal(r.stats.loc[0, "Wilks' lambda"],
                        0.35039053, decimal=8)
    assert_almost_equal(r.stats.loc[1, "Wilks' lambda"],
                        0.95472266, decimal=8)
    assert_almost_equal(r.stats.loc[2, "Wilks' lambda"],
                        0.99473355, decimal=8)
    assert_almost_equal(r.stats.loc[0, 'F Value'],
                        2.05, decimal=2)
    assert_almost_equal(r.stats.loc[1, 'F Value'],
                        0.18, decimal=2)
    assert_almost_equal(r.stats.loc[2, 'F Value'],
                        0.08, decimal=2)
    assert_almost_equal(r.stats.loc[0, 'Num DF'],
                        9, decimal=2)
    assert_almost_equal(r.stats.loc[1, 'Num DF'],
                        4, decimal=2)
    assert_almost_equal(r.stats.loc[2, 'Num DF'],
                        1, decimal=2)
    assert_almost_equal(r.stats.loc[0, 'Den DF'],
                        34.223, decimal=3)
    assert_almost_equal(r.stats.loc[1, 'Den DF'],
                        30, decimal=2)
    assert_almost_equal(r.stats.loc[2, 'Den DF'],
                        16, decimal=2)
    assert_almost_equal(r.stats.loc[0, 'Pr > F'],
                        0.0635, decimal=4)
    assert_almost_equal(r.stats.loc[1, 'Pr > F'],
                        0.9491, decimal=4)
    assert_almost_equal(r.stats.loc[2, 'Pr > F'],
                        0.7748, decimal=4)
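Editor's note: a minimal interactive sketch, not part of the commit, of the fit the test above exercises; `data_fit`, `CanCorr`, `corr_test`, and the `stats_mv` table all come from the test file itself.

x1 = data_fit.iloc[:, :3]   # exog: the first three fitness variables
y1 = data_fit.iloc[:, 3:]   # endog: the remaining three
res = CanCorr(y1, x1).corr_test()
print(res.stats_mv.loc["Wilks' lambda", 'Value'])  # ~0.35039053, as asserted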
@ -0,0 +1,314 @@
|
||||
import warnings
|
||||
|
||||
from statsmodels.compat.pandas import PD_LT_1_4
|
||||
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from statsmodels.multivariate.factor import Factor
|
||||
from numpy.testing import (assert_equal, assert_array_almost_equal, assert_,
|
||||
assert_raises, assert_array_equal,
|
||||
assert_array_less, assert_allclose)
|
||||
import pytest
|
||||
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
missing_matplotlib = False
|
||||
plt.switch_backend('Agg')
|
||||
|
||||
except ImportError:
|
||||
missing_matplotlib = True
|
||||
|
||||
# Example data
|
||||
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
|
||||
# viewer.htm#statug_introreg_sect012.htm
|
||||
X = pd.DataFrame([['Minas Graes', 2.068, 2.070, 1.580, 1, 0],
|
||||
['Minas Graes', 2.068, 2.074, 1.602, 2, 1],
|
||||
['Minas Graes', 2.090, 2.090, 1.613, 3, 0],
|
||||
['Minas Graes', 2.097, 2.093, 1.613, 4, 1],
|
||||
['Minas Graes', 2.117, 2.125, 1.663, 5, 0],
|
||||
['Minas Graes', 2.140, 2.146, 1.681, 6, 1],
|
||||
['Matto Grosso', 2.045, 2.054, 1.580, 7, 0],
|
||||
['Matto Grosso', 2.076, 2.088, 1.602, 8, 1],
|
||||
['Matto Grosso', 2.090, 2.093, 1.643, 9, 0],
|
||||
['Matto Grosso', 2.111, 2.114, 1.643, 10, 1],
|
||||
['Santa Cruz', 2.093, 2.098, 1.653, 11, 0],
|
||||
['Santa Cruz', 2.100, 2.106, 1.623, 12, 1],
|
||||
['Santa Cruz', 2.104, 2.101, 1.653, 13, 0]],
|
||||
columns=['Loc', 'Basal', 'Occ', 'Max', 'id', 'alt'])
|
||||
|
||||
|
||||
def test_auto_col_name():
|
||||
# Test auto generated variable names when endog_names is None
|
||||
mod = Factor(None, 2, corr=np.eye(11), endog_names=None,
|
||||
smc=False)
|
||||
assert_array_equal(mod.endog_names,
|
||||
['var00', 'var01', 'var02', 'var03', 'var04', 'var05',
|
||||
'var06', 'var07', 'var08', 'var09', 'var10'])
|
||||
|
||||
|
||||
def test_direct_corr_matrix():
|
||||
# Test specifying the correlation matrix directly
|
||||
mod = Factor(None, 2, corr=np.corrcoef(X.iloc[:, 1:-1], rowvar=0),
|
||||
smc=False)
|
||||
results = mod.fit(tol=1e-10)
|
||||
a = np.array([[0.965392158864, 0.225880658666255],
|
||||
[0.967587154301, 0.212758741910989],
|
||||
[0.929891035996, -0.000603217967568],
|
||||
[0.486822656362, -0.869649573289374]])
|
||||
assert_array_almost_equal(results.loadings, a, decimal=8)
|
||||
# Test set and get endog_names
|
||||
mod.endog_names = X.iloc[:, 1:-1].columns
|
||||
assert_array_equal(mod.endog_names, ['Basal', 'Occ', 'Max', 'id'])
|
||||
|
||||
# Test set endog_names with the wrong number of elements
|
||||
assert_raises(ValueError, setattr, mod, 'endog_names',
|
||||
X.iloc[:, :1].columns)
|
||||
|
||||
|
||||
def test_unknown_fa_method_error():
|
||||
# Test raise error if an unkonwn FA method is specified in fa.method
|
||||
mod = Factor(X.iloc[:, 1:-1], 2, method='ab')
|
||||
assert_raises(ValueError, mod.fit)
|
||||
|
||||
|
||||
def test_example_compare_to_R_output():
|
||||
# Testing basic functions and compare to R output
|
||||
|
||||
# R code for producing the results:
|
||||
# library(psych)
|
||||
# library(GPArotation)
|
||||
# Basal = c(2.068, 2.068, 2.09, 2.097, 2.117, 2.14, 2.045, 2.076, 2.09, 2.111, 2.093, 2.1, 2.104)
|
||||
# Occ = c(2.07, 2.074, 2.09, 2.093, 2.125, 2.146, 2.054, 2.088, 2.093, 2.114, 2.098, 2.106, 2.101)
|
||||
# Max = c(1.58, 1.602, 1.613, 1.613, 1.663, 1.681, 1.58, 1.602, 1.643, 1.643, 1.653, 1.623, 1.653)
|
||||
# id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
|
||||
# Y <- cbind(Basal, Occ, Max, id)
|
||||
# a <- fa(Y, nfactors=2, fm="pa", rotate="none", SMC=FALSE, min.err=1e-10)
|
||||
# b <- cbind(a$loadings[,1], -a$loadings[,2])
|
||||
# b
|
||||
# a <- fa(Y, nfactors=2, fm="pa", rotate="Promax", SMC=TRUE, min.err=1e-10)
|
||||
# b <- cbind(a$loadings[,1], a$loadings[,2])
|
||||
# b
|
||||
# a <- fa(Y, nfactors=2, fm="pa", rotate="Varimax", SMC=TRUE, min.err=1e-10)
|
||||
# b <- cbind(a$loadings[,1], a$loadings[,2])
|
||||
# b
|
||||
# a <- fa(Y, nfactors=2, fm="pa", rotate="quartimax", SMC=TRUE, min.err=1e-10)
|
||||
# b <- cbind(a$loadings[,1], -a$loadings[,2])
|
||||
# b
|
||||
# a <- fa(Y, nfactors=2, fm="pa", rotate="oblimin", SMC=TRUE, min.err=1e-10)
|
||||
# b <- cbind(a$loadings[,1], a$loadings[,2])
|
||||
# b
|
||||
|
||||
# No rotation without squared multiple correlations prior
|
||||
# produce same results as in R `fa`
|
||||
mod = Factor(X.iloc[:, 1:-1], 2, smc=False)
|
||||
results = mod.fit(tol=1e-10)
|
||||
a = np.array([[0.965392158864, 0.225880658666255],
|
||||
[0.967587154301, 0.212758741910989],
|
||||
[0.929891035996, -0.000603217967568],
|
||||
[0.486822656362, -0.869649573289374]])
|
||||
assert_array_almost_equal(results.loadings, a, decimal=8)
|
||||
|
||||
# No rotation WITH squared multiple correlations prior
|
||||
# produce same results as in R `fa`
|
||||
    mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
    results = mod.fit()
    a = np.array([[0.97541115, 0.20280987],
                  [0.97113975, 0.17207499],
                  [0.9618705, -0.2004196],
                  [0.37570708, -0.45821379]])
    assert_array_almost_equal(results.loadings, a, decimal=8)

    # Same as R GRArotation
    results.rotate('varimax')
    a = np.array([[0.98828898, -0.12587155],
                  [0.97424206, -0.15354033],
                  [0.84418097, -0.502714],
                  [0.20601929, -0.55558235]])
    assert_array_almost_equal(results.loadings, a, decimal=8)

    results.rotate('quartimax')  # Same as R fa
    a = np.array([[0.98935598, 0.98242714, 0.94078972, 0.33442284],
                  [0.117190049, 0.086943252, -0.283332952, -0.489159543]])
    assert_array_almost_equal(results.loadings, a.T, decimal=8)

    results.rotate('equamax')  # Not the same as R fa

    results.rotate('promax')  # Not the same as R fa

    results.rotate('biquartimin')  # Not the same as R fa

    results.rotate('oblimin')  # Same as R fa
    a = np.array([[1.02834170170, 1.00178840104, 0.71824931384,
                   -0.00013510048],
                  [0.06563421, 0.03096076, -0.39658839, -0.59261944]])
    assert_array_almost_equal(results.loadings, a.T, decimal=8)

    # Testing result summary string
    results.rotate('varimax')
    desired = (
""" Factor analysis results
=============================
Eigenvalues
-----------------------------
Basal Occ Max id
-----------------------------
2.9609 0.3209 0.0000 -0.0000
-----------------------------

-----------------------------
Communality
-----------------------------
Basal Occ Max id
-----------------------------
0.9926 0.9727 0.9654 0.3511
-----------------------------

-----------------------------
Pre-rotated loadings
-----------------------------------
factor 0 factor 1
-----------------------------------
Basal 0.9754 0.2028
Occ 0.9711 0.1721
Max 0.9619 -0.2004
id 0.3757 -0.4582
-----------------------------

-----------------------------
varimax rotated loadings
-----------------------------------
factor 0 factor 1
-----------------------------------
Basal 0.9883 -0.1259
Occ 0.9742 -0.1535
Max 0.8442 -0.5027
id 0.2060 -0.5556
=============================
""")
    actual = results.summary().as_text()
    actual = "\n".join(line.rstrip() for line in actual.splitlines()) + "\n"
    assert_equal(actual, desired)


@pytest.mark.skipif(missing_matplotlib, reason='matplotlib not available')
def test_plots(close_figures):
    mod = Factor(X.iloc[:, 1:], 3)
    results = mod.fit()
    results.rotate('oblimin')
    fig = results.plot_scree()

    fig_loadings = results.plot_loadings()
    assert_equal(3, len(fig_loadings))


@pytest.mark.smoke
def test_getframe_smoke():
    # mostly smoke tests for now
    mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
    res = mod.fit()

    df = res.get_loadings_frame(style='raw')
    assert_(isinstance(df, pd.DataFrame))

    lds = res.get_loadings_frame(style='strings', decimals=3, threshold=0.3)


    # The Styler option requires jinja2; skip if not available
    try:
        from jinja2 import Template  # noqa:F401
    except ImportError:
        return
        # TODO: separate this and do pytest.skip?

    # Old implementation that warns
    if PD_LT_1_4:
        with warnings.catch_warnings():
            warnings.simplefilter("always")
            lds.to_latex()
    else:
        # Smoke test using new style to_latex
        lds.style.to_latex()
    try:
        from pandas.io import formats as pd_formats
    except ImportError:
        from pandas import formats as pd_formats

    ldf = res.get_loadings_frame(style='display')
    assert_(isinstance(ldf, pd_formats.style.Styler))
    assert_(isinstance(ldf.data, pd.DataFrame))

    res.get_loadings_frame(style='display', decimals=3, threshold=0.2)

    res.get_loadings_frame(style='display', decimals=3, color_max='GAINSBORO')

    res.get_loadings_frame(style='display', decimals=3, threshold=0.45,
                           highlight_max=False, sort_=False)


def test_factor_missing():
    xm = X.iloc[:, 1:-1].copy()
    nobs, k_endog = xm.shape
    xm.iloc[2, 2] = np.nan
    mod = Factor(xm, 2)
    assert_equal(mod.nobs, nobs - 1)
    assert_equal(mod.k_endog, k_endog)
    assert_equal(mod.endog.shape, (nobs - 1, k_endog))


def _zscore(x):
    # helper function
    return (x - x.mean(0)) / x.std(0)


@pytest.mark.smoke
def test_factor_scoring():
    path = os.path.abspath(__file__)
    dir_path = os.path.dirname(path)
    csv_path = os.path.join(dir_path, 'results', 'factor_data.csv')
    y = pd.read_csv(csv_path)
    csv_path = os.path.join(dir_path, 'results', 'factors_stata.csv')
    f_s = pd.read_csv(csv_path)
    # mostly smoke tests for now
    mod = Factor(y, 2)
    res = mod.fit(maxiter=1)
    res.rotate('varimax')
    f_reg = res.factor_scoring(method='reg')
    assert_allclose(f_reg * [1, -1], f_s[["f1", 'f2']].values,
                    atol=1e-4, rtol=1e-3)
    f_bart = res.factor_scoring()
    assert_allclose(f_bart * [1, -1], f_s[["f1b", 'f2b']].values,
                    atol=1e-4, rtol=1e-3)

    # check we have high correlation to ols and gls
    f_ols = res.factor_scoring(method='ols')
    f_gls = res.factor_scoring(method='gls')
    f_reg_z = _zscore(f_reg)
    f_ols_z = _zscore(f_ols)
    f_gls_z = _zscore(f_gls)
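    # for z-scored columns, the mean of the elementwise product equals the
    # per-column Pearson correlation, so the bounds below check that the
    # scores from the different methods are highly correlated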
    assert_array_less(0.98, (f_ols_z * f_reg_z).mean(0))
    assert_array_less(0.999, (f_gls_z * f_reg_z).mean(0))

    # with oblique rotation
    res.rotate('oblimin')
    # Note: Stata has second factor with flipped sign compared to statsmodels
    assert_allclose(res._corr_factors()[0, 1], (-1) * 0.25651037, rtol=1e-3)
    f_reg = res.factor_scoring(method='reg')
    assert_allclose(f_reg * [1, -1], f_s[["f1o", 'f2o']].values,
                    atol=1e-4, rtol=1e-3)
    f_bart = res.factor_scoring()
    assert_allclose(f_bart * [1, -1], f_s[["f1ob", 'f2ob']].values,
                    atol=1e-4, rtol=1e-3)

    # check we have high correlation to ols and gls
    f_ols = res.factor_scoring(method='ols')
    f_gls = res.factor_scoring(method='gls')
    f_reg_z = _zscore(f_reg)
    f_ols_z = _zscore(f_ols)
    f_gls_z = _zscore(f_gls)
    assert_array_less(0.97, (f_ols_z * f_reg_z).mean(0))
    assert_array_less(0.999, (f_gls_z * f_reg_z).mean(0))

    # check provided endog
    f_ols2 = res.factor_scoring(method='ols', endog=res.model.endog)
    assert_allclose(f_ols2, f_ols, rtol=1e-13)
@ -0,0 +1,197 @@
import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_almost_equal, assert_raises, assert_allclose

from statsmodels.multivariate.manova import MANOVA
from statsmodels.multivariate.multivariate_ols import MultivariateTestResults
from statsmodels.tools import add_constant

# Example data
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
# viewer.htm#statug_introreg_sect012.htm
X = pd.DataFrame([['Minas Graes', 2.068, 2.070, 1.580],
                  ['Minas Graes', 2.068, 2.074, 1.602],
                  ['Minas Graes', 2.090, 2.090, 1.613],
                  ['Minas Graes', 2.097, 2.093, 1.613],
                  ['Minas Graes', 2.117, 2.125, 1.663],
                  ['Minas Graes', 2.140, 2.146, 1.681],
                  ['Matto Grosso', 2.045, 2.054, 1.580],
                  ['Matto Grosso', 2.076, 2.088, 1.602],
                  ['Matto Grosso', 2.090, 2.093, 1.643],
                  ['Matto Grosso', 2.111, 2.114, 1.643],
                  ['Santa Cruz', 2.093, 2.098, 1.653],
                  ['Santa Cruz', 2.100, 2.106, 1.623],
                  ['Santa Cruz', 2.104, 2.101, 1.653]],
                 columns=['Loc', 'Basal', 'Occ', 'Max'])


def test_manova_sas_example():
    # Results should be the same as figure 4.5 of
    # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
    # viewer.htm#statug_introreg_sect012.htm
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    r = mod.mv_test()
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Value'],
                        0.60143661, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Value'],
                        0.44702843, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Value'],
                        0.58210348, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Value'],
                        0.35530890, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'F Value'],
                        0.77, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'F Value'],
                        0.86, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'F Value'],
                        0.75, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'F Value'],
                        1.07, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Num DF'],
                        3, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Den DF'],
                        16, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Den DF'],
                        18, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Den DF'],
                        9.0909, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Den DF'],
                        9, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Pr > F'],
                        0.6032, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Pr > F'],
                        0.5397, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Pr > F'],
                        0.6272, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Pr > F'],
                        0.4109, decimal=4)


def test_manova_no_formula():
    # Same as the previous test, but skipping the formula interface
    exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True,
                                       dtype=float))
    endog = X[['Basal', 'Occ', 'Max']]
    mod = MANOVA(endog, exog)
    intercept = np.zeros((1, 3))
    intercept[0, 0] = 1
    loc = np.zeros((2, 3))
    loc[0, 1] = loc[1, 2] = 1
    hypotheses = [('Intercept', intercept), ('Loc', loc)]
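    # each hypothesis is a (name, L) pair; the contrast matrix L has one
    # column per exog column (the constant plus the two Loc dummies), so
    # `loc` tests the two location effects jointly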
    r = mod.mv_test(hypotheses)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Value'],
                        0.60143661, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Value'],
                        0.44702843, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
                                             'Value'],
                        0.58210348, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Value'],
                        0.35530890, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'F Value'],
                        0.77, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'F Value'],
                        0.86, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
                                             'F Value'],
                        0.75, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'F Value'],
                        1.07, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
                                             'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Num DF'],
                        3, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Den DF'],
                        16, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Den DF'],
                        18, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
                                             'Den DF'],
                        9.0909, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Den DF'],
                        9, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Pr > F'],
                        0.6032, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Pr > F'],
                        0.5397, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
                                             'Pr > F'],
                        0.6272, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Pr > F'],
                        0.4109, decimal=4)


@pytest.mark.smoke
def test_manova_no_formula_no_hypothesis():
    # Same as the previous test, but skipping the formula interface
    exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True,
                                       dtype=float))
    endog = X[['Basal', 'Occ', 'Max']]
    mod = MANOVA(endog, exog)
    r = mod.mv_test()
    assert isinstance(r, MultivariateTestResults)


def test_manova_test_input_validation():
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    hypothesis = [('test', np.array([[1, 1, 1]]), None)]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1]]), None)]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    """
    assert_raises_regex(ValueError,
                        ('Contrast matrix L should have the same number of '
                         'columns as exog! 2 != 3'),
                        mod.mv_test, hypothesis)
    """
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    """
    assert_raises_regex(ValueError,
                        ('Transform matrix M should have the same number of '
                         'rows as the number of columns of endog! 2 != 3'),
                        mod.mv_test, hypothesis)
    """


def test_endog_1D_array():
    assert_raises(ValueError, MANOVA.from_formula, 'Basal ~ Loc', X)


def test_manova_demeaned():
    # see last example in #8713
    # If a term has no effect (all eigenvalues below the threshold), the
    # computation used to raise a numpy exception on empty arrays.
    # Currently there is an option to skip the intercept test, but empty
    # arrays are not handled directly.
    ng = 5
    loc = ["Basal", "Occ", "Max"] * ng
    y1 = (np.random.randn(ng, 3) + [0, 0.5, 1]).ravel()
    y2 = (np.random.randn(ng, 3) + [0.25, 0.75, 1]).ravel()
    y3 = (np.random.randn(ng, 3) + [0.3, 0.6, 1]).ravel()
    dta = pd.DataFrame(dict(Loc=loc, Basal=y1, Occ=y2, Max=y3))
    mod = MANOVA.from_formula('Basal + Occ + Max ~ C(Loc, Helmert)', data=dta)
    res1 = mod.mv_test()

    # subtract sample means to have insignificant intercept
    means = dta[["Basal", "Occ", "Max"]].mean()
    dta[["Basal", "Occ", "Max"]] = dta[["Basal", "Occ", "Max"]] - means
    mod = MANOVA.from_formula('Basal + Occ + Max ~ C(Loc, Helmert)', data=dta)
    res2 = mod.mv_test(skip_intercept_test=True)

    stat1 = res1.results["C(Loc, Helmert)"]["stat"].to_numpy(float)
    stat2 = res2.results["C(Loc, Helmert)"]["stat"].to_numpy(float)
    assert_allclose(stat1, stat2, rtol=1e-10)
@ -0,0 +1,206 @@
import numpy as np
from statsmodels.multivariate.factor import Factor
from numpy.testing import assert_allclose, assert_equal
from scipy.optimize import approx_fprime
import warnings


# A small model for basic testing
def _toy():
    uniq = np.r_[4, 9, 16]
    load = np.asarray([[3, 1, 2], [2, 5, 8]]).T
    par = np.r_[2, 3, 4, 3, 1, 2, 2, 5, 8]
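    # par packs the same model as (load, uniq): the square roots of uniq
    # (2, 3, 4) followed by the columns of load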
    corr = np.asarray([[1, .5, .25], [.5, 1, .5], [.25, .5, 1]])
    return uniq, load, corr, par


def test_loglike():

    uniq, load, corr, par = _toy()
    fa = Factor(n_factor=2, corr=corr)

    # Two ways of passing the parameters to loglike
    ll1 = fa.loglike((load, uniq))
    ll2 = fa.loglike(par)

    assert_allclose(ll1, ll2)


def test_score():

    uniq, load, corr, par = _toy()
    fa = Factor(n_factor=2, corr=corr)

    def f(par):
        return fa.loglike(par)

    par2 = np.r_[0.1, 0.2, 0.3, 0.4, 0.3, 0.1, 0.2, -0.2, 0, 0.8, 0.5, 0]

    for pt in (par, par2):
        g1 = approx_fprime(pt, f, 1e-8)
        g2 = fa.score(pt)
        assert_allclose(g1, g2, atol=1e-3)


def test_exact():
    # Test if we can recover exact factor-structured matrices with
    # default starting values.

    np.random.seed(23324)

    # Works for larger k_var but slow for routine testing.
    for k_var in 5, 10, 25:
        for n_factor in 1, 2, 3:
            load = np.random.normal(size=(k_var, n_factor))
            uniq = np.linspace(1, 2, k_var)
            c = np.dot(load, load.T)
            c.flat[::c.shape[0]+1] += uniq
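            # c.flat[::c.shape[0]+1] addresses the diagonal, so this adds
            # the uniquenesses to load.dot(load.T), giving an exactly
            # factor-structured covariance before rescaling to a correlation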
            s = np.sqrt(np.diag(c))
            c /= np.outer(s, s)
            fa = Factor(corr=c, n_factor=n_factor, method='ml')
            rslt = fa.fit()
            assert_allclose(rslt.fitted_cov, c, rtol=1e-4, atol=1e-4)
            rslt.summary()  # smoke test


def test_exact_em():
    # Test if we can recover exact factor-structured matrices with
    # default starting values using the EM algorithm.

    np.random.seed(23324)

    # Works for larger k_var but slow for routine testing.
    for k_var in 5, 10, 25:
        for n_factor in 1, 2, 3:
            load = np.random.normal(size=(k_var, n_factor))
            uniq = np.linspace(1, 2, k_var)
            c = np.dot(load, load.T)
            c.flat[::c.shape[0]+1] += uniq
            s = np.sqrt(np.diag(c))
            c /= np.outer(s, s)
            fa = Factor(corr=c, n_factor=n_factor, method='ml')
            load_e, uniq_e = fa._fit_ml_em(2000)
            c_e = np.dot(load_e, load_e.T)
            c_e.flat[::c_e.shape[0]+1] += uniq_e
            assert_allclose(c_e, c, rtol=1e-4, atol=1e-4)


def test_fit_ml_em_random_state():
    # Ensure Factor._fit_ml_em doesn't change numpy's singleton random state
    # see #7357

    T = 10
    epsilon = np.random.multivariate_normal(np.zeros(3), np.eye(3), size=T).T
    initial = np.random.get_state()
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", message='Fitting did not converge')
        Factor(endog=epsilon, n_factor=2, method='ml').fit()
    final = np.random.get_state()

    assert initial[0] == final[0]
    assert_equal(initial[1], final[1])
    assert initial[2:] == final[2:]


def test_em():

    n_factor = 1
    cor = np.asarray([[1, 0.5, 0.3], [0.5, 1, 0], [0.3, 0, 1]])

    fa = Factor(corr=cor, n_factor=n_factor, method='ml')
    rslt = fa.fit(opt={'gtol': 1e-3})
    load_opt = rslt.loadings
    uniq_opt = rslt.uniqueness

    load_em, uniq_em = fa._fit_ml_em(1000)
    cc = np.dot(load_em, load_em.T)
    cc.flat[::cc.shape[0]+1] += uniq_em
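    # cc is the covariance implied by the EM estimates: the common part
    # load_em.dot(load_em.T) plus the uniquenesses on the diagonal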

    assert_allclose(cc, rslt.fitted_cov, rtol=1e-2, atol=1e-2)


def test_1factor():
    """
    # R code:
    r = 0.4
    p = 4
    ii = seq(0, p-1)
    ii = outer(ii, ii, "-")
    ii = abs(ii)
    cm = r^ii
    fa = factanal(covmat=cm, factors=1)
    print(fa, digits=10)
    """

    r = 0.4
    p = 4
    ii = np.arange(p)
    cm = r ** np.abs(np.subtract.outer(ii, ii))
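    # cm has entries r**|i - j|, the AR(1)-style correlation matrix built
    # by the R code in the docstring above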

    fa = Factor(corr=cm, n_factor=1, method='ml')
    rslt = fa.fit()

    if rslt.loadings[0, 0] < 0:
        rslt.loadings[:, 0] *= -1

    # R solution, but our likelihood is higher
    # uniq = np.r_[0.8392472054, 0.5820958187, 0.5820958187, 0.8392472054]
    # load = np.asarray([[0.4009399224, 0.6464550935, 0.6464550935,
    #                     0.4009399224]]).T
    # l1 = fa.loglike(fa._pack(load, uniq))
    # l2 = fa.loglike(fa._pack(rslt.loadings, rslt.uniqueness))

    # So use a smoke test
    uniq = np.r_[0.85290232, 0.60916033, 0.55382266, 0.82610666]
    load = np.asarray([[0.38353316], [0.62517171], [0.66796508],
                       [0.4170052]])

    assert_allclose(load, rslt.loadings, rtol=1e-3, atol=1e-3)
    assert_allclose(uniq, rslt.uniqueness, rtol=1e-3, atol=1e-3)

    assert_equal(rslt.df, 2)


def test_2factor():
    """
    # R code:
    r = 0.4
    p = 6
    ii = seq(0, p-1)
    ii = outer(ii, ii, "-")
    ii = abs(ii)
    cm = r^ii
    factanal(covmat=cm, factors=2)
    """

    r = 0.4
    p = 6
    ii = np.arange(p)
    cm = r ** np.abs(np.subtract.outer(ii, ii))

    fa = Factor(corr=cm, n_factor=2, nobs=100, method='ml')
    rslt = fa.fit()

    for j in 0, 1:
        if rslt.loadings[0, j] < 0:
            rslt.loadings[:, j] *= -1

    uniq = np.r_[0.782, 0.367, 0.696, 0.696, 0.367, 0.782]
    assert_allclose(uniq, rslt.uniqueness, rtol=1e-3, atol=1e-3)

    loads = [np.r_[0.323, 0.586, 0.519, 0.519, 0.586, 0.323],
             np.r_[0.337, 0.538, 0.187, -0.187, -0.538, -0.337]]
    for k in 0, 1:
        if np.dot(loads[k], rslt.loadings[:, k]) < 0:
            loads[k] *= -1
        assert_allclose(loads[k], rslt.loadings[:, k], rtol=1e-3, atol=1e-3)

    assert_equal(rslt.df, 4)

    # Smoke test for standard errors
    e = np.asarray([0.11056836, 0.05191071, 0.09836349,
                    0.09836349, 0.05191071, 0.11056836])
    assert_allclose(rslt.uniq_stderr, e, atol=1e-4)
    e = np.asarray([[0.08842151, 0.08842151], [0.06058582, 0.06058582],
                    [0.08339874, 0.08339874], [0.08339874, 0.08339874],
                    [0.06058582, 0.06058582], [0.08842151, 0.08842151]])
    assert_allclose(rslt.load_stderr, e, atol=1e-4)
@ -0,0 +1,199 @@
import numpy as np
import pandas as pd
from statsmodels.multivariate.multivariate_ols import _MultivariateOLS
from numpy.testing import assert_array_almost_equal, assert_raises
import patsy

data = pd.DataFrame([['Morphine', 'N', .04, .20, .10, .08],
                     ['Morphine', 'N', .02, .06, .02, .02],
                     ['Morphine', 'N', .07, 1.40, .48, .24],
                     ['Morphine', 'N', .17, .57, .35, .24],
                     ['Morphine', 'Y', .10, .09, .13, .14],
                     ['placebo', 'Y', .07, .07, .06, .07],
                     ['placebo', 'Y', .05, .07, .06, .07],
                     ['placebo', 'N', .03, .62, .31, .22],
                     ['placebo', 'N', .03, 1.05, .73, .60],
                     ['placebo', 'N', .07, .83, 1.07, .80],
                     ['Trimethaphan', 'N', .09, 3.13, 2.06, 1.23],
                     ['Trimethaphan', 'Y', .10, .09, .09, .08],
                     ['Trimethaphan', 'Y', .08, .09, .09, .10],
                     ['Trimethaphan', 'Y', .13, .10, .12, .12],
                     ['Trimethaphan', 'Y', .06, .05, .05, .05]],
                    columns=['Drug', 'Depleted',
                             'Histamine0', 'Histamine1',
                             'Histamine3', 'Histamine5'])

for i in range(2, 6):
    data.iloc[:, i] = np.log(data.iloc[:, i])


def compare_r_output_dogs_data(method):
    '''Test a within-subject effect interacting with two
    between-subject effects

    Compares with the R car library Anova(, type=3) output.

    Note: The test statistics Pillai, Wilks, Hotelling-Lawley
    and Roy are the same as the R output, but the approximate F values
    and degrees of freedom can differ because this implementation is
    based on the SAS formulas [*]

    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
    '''

    # Repeated measures with orthogonal polynomial contrasts coding
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method=method)
    r = r.mv_test()
    a = [[2.68607660e-02, 4, 6, 5.43435304e+01, 7.59585610e-05],
         [9.73139234e-01, 4, 6, 5.43435304e+01, 7.59585610e-05],
         [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
         [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05]]
    assert_array_almost_equal(r['Intercept']['stat'].values, a, decimal=6)
    a = [[8.39646619e-02, 8, 1.20000000e+01, 3.67658068e+00, 2.12614444e-02],
         [1.18605382e+00, 8, 1.40000000e+01, 2.55003861e+00, 6.01270701e-02],
         [7.69391362e+00, 8, 6.63157895e+00, 5.50814270e+00, 2.07392260e-02],
         [7.25036952e+00, 4, 7.00000000e+00, 1.26881467e+01, 2.52669877e-03]]
    assert_array_almost_equal(r['Drug']['stat'].values, a, decimal=6)
    a = [[0.32048892, 4., 6., 3.18034906, 0.10002373],
         [0.67951108, 4., 6., 3.18034906, 0.10002373],
         [2.12023271, 4., 6., 3.18034906, 0.10002373],
         [2.12023271, 4., 6., 3.18034906, 0.10002373]]
    assert_array_almost_equal(r['Depleted']['stat'].values, a, decimal=6)
    a = [[0.15234366, 8., 12., 2.34307678, 0.08894239],
         [1.13013353, 8., 14., 2.27360606, 0.08553213],
         [3.70989596, 8., 6.63157895, 2.65594824, 0.11370285],
         [3.1145597, 4., 7., 5.45047947, 0.02582767]]
    assert_array_almost_equal(r['Drug:Depleted']['stat'].values, a, decimal=6)


def test_glm_dogs_example():
    compare_r_output_dogs_data(method='svd')
    compare_r_output_dogs_data(method='pinv')


def test_specify_L_M_by_string():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit()
    r1 = r.mv_test(hypotheses=[['Intercept', ['Intercept'], None]])
    a = [[2.68607660e-02, 4, 6, 5.43435304e+01, 7.59585610e-05],
         [9.73139234e-01, 4, 6, 5.43435304e+01, 7.59585610e-05],
         [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
         [3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05]]
    assert_array_almost_equal(r1['Intercept']['stat'].values, a, decimal=6)
    L = ['Intercept', 'Drug[T.Trimethaphan]', 'Drug[T.placebo]']
    M = ['Histamine1', 'Histamine3', 'Histamine5']
    r1 = r.mv_test(hypotheses=[['a', L, M]])
    a = [[1, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 0, 0],
         [0, 0, 1, 0, 0, 0]]
    assert_array_almost_equal(r1['a']['contrast_L'], a, decimal=10)
    a = [[0, 1, 0, 0],
         [0, 0, 1, 0],
         [0, 0, 0, 1]]
    assert_array_almost_equal(r1['a']['transform_M'].T, a, decimal=10)


def test_independent_variable_singular():
    data1 = data.copy()
    data1['dup'] = data1['Drug']
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * dup',
        data1)
    assert_raises(ValueError, mod.fit)
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * dup',
        data1)
    assert_raises(ValueError, mod.fit)


def test_from_formula_vs_no_formula():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method='svd')
    r0 = r.mv_test()
    endog, exog = patsy.dmatrices(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data, return_type="dataframe")
    L = np.array([[1, 0, 0, 0, 0, 0]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
    assert_array_almost_equal(r1['Intercept']['stat'].values,
                              r0['Intercept']['stat'].values, decimal=6)
    L = np.array([[0, 1, 0, 0, 0, 0],
                  [0, 0, 1, 0, 0, 0],
                  ])
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    # DataFrame input
    r = _MultivariateOLS(endog, exog).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6)
    # Numpy array input
    r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
    r1 = r.mv_test(hypotheses=[['Drug', L, None]])
    assert_array_almost_equal(r1['Drug']['stat'].values,
                              r0['Drug']['stat'].values, decimal=6)


def test_L_M_matrices_1D_array():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method='svd')
    L = np.array([1, 0, 0, 0, 0, 0])
    assert_raises(ValueError, r.mv_test, hypotheses=[['Drug', L, None]])
    L = np.array([[1, 0, 0, 0, 0, 0]])
    M = np.array([1, 0, 0, 0, 0, 0])
    assert_raises(ValueError, r.mv_test, hypotheses=[['Drug', L, M]])


def test_exog_1D_array():
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ 0 + Depleted',
        data)
    r = mod.fit(method='svd')
    r0 = r.mv_test()
    a = [[0.0019, 8.0000, 20.0000, 55.0013, 0.0000],
         [1.8112, 8.0000, 22.0000, 26.3796, 0.0000],
         [97.8858, 8.0000, 12.1818, 117.1133, 0.0000],
         [93.2742, 4.0000, 11.0000, 256.5041, 0.0000]]
    assert_array_almost_equal(r0['Depleted']['stat'].values, a, decimal=4)


def test_endog_1D_array():
    assert_raises(ValueError, _MultivariateOLS.from_formula,
                  'Histamine0 ~ 0 + Depleted', data)


def test_affine_hypothesis():
    # Test an affine hypothesis, compared with R car linearHypothesis.
    # Note: The test statistics Pillai, Wilks, Hotelling-Lawley and Roy
    # are the same as the R output, but the approximate F values and
    # degrees of freedom can differ because this implementation is based
    # on the SAS formulas.
    mod = _MultivariateOLS.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    r = mod.fit(method='svd')
    L = np.array([[0, 1.2, 1.1, 1.3, 1.5, 1.4],
                  [0, 3.2, 2.1, 3.3, 5.5, 4.4]])
    M = None
    C = np.array([[1, 2, 3, 4],
                  [5, 6, 7, 8]])
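    # with a constant matrix C the hypothesis is affine, L B M = C,
    # rather than the default L B M = 0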
    r0 = r.mv_test(hypotheses=[('test1', L, M, C)])
    a = [[0.0269, 8.0000, 12.0000, 7.6441, 0.0010],
         [1.4277, 8.0000, 14.0000, 4.3657, 0.0080],
         [19.2678, 8.0000, 6.6316, 13.7940, 0.0016],
         [18.3470, 4.0000, 7.0000, 32.1072, 0.0001]]
    assert_array_almost_equal(r0['test1']['stat'].values, a, decimal=4)
    r0.summary(show_contrast_L=True, show_transform_M=True,
               show_constant_C=True)
@ -0,0 +1,443 @@
from statsmodels.compat.platform import PLATFORM_WIN32

import warnings

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_allclose, assert_equal, assert_raises

from statsmodels.multivariate.pca import PCA, pca
from statsmodels.multivariate.tests.results.datamlw import (data, princomp1,
                                                            princomp2)
from statsmodels.tools.sm_exceptions import EstimationWarning

DECIMAL_5 = .00001


class TestPCA:
    @classmethod
    def setup_class(cls):
        rs = np.random.RandomState()
        rs.seed(1234)
        k = 3
        n = 100
        t = 200
        lam = 2

        norm_rng = rs.standard_normal
        e = norm_rng((t, n))
        f = norm_rng((t, k))
        b = rs.standard_gamma(lam, size=(k, n)) / lam
        cls.x = f.dot(b) + e
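        # x is generated from a k-factor model: a rank-k common component
        # f.dot(b) plus idiosyncratic normal noise e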
        cls.x_copy = cls.x + 0.0
        cls.rs = rs

        k = 3
        n = 300
        t = 200
        lam = 2

        norm_rng = rs.standard_normal
        e = norm_rng((t, n))
        f = norm_rng((t, k))
        b = rs.standard_gamma(lam, size=(k, n)) / lam
        cls.x_wide = f.dot(b) + e

    @pytest.mark.smoke
    @pytest.mark.matplotlib
    def test_smoke_plot_and_repr(self, close_figures):
        pc = PCA(self.x)
        fig = pc.plot_scree()
        fig = pc.plot_scree(ncomp=10)
        fig = pc.plot_scree(log_scale=False)
        fig = pc.plot_scree(cumulative=True)
        fig = pc.plot_rsquare()
        fig = pc.plot_rsquare(ncomp=5)
        # Additional smoke test
        pc.__repr__()
        pc = PCA(self.x, standardize=False)
        pc.__repr__()
        pc = PCA(self.x, standardize=False, demean=False)
        pc.__repr__()
        pc = PCA(self.x, ncomp=2, gls=True)
        assert "GLS" in pc.__repr__()
        # Check data for no changes
        assert_equal(self.x, pc.data)

    def test_eig_svd_equiv(self):
        # Test leading components since the tail end can differ
        pc_eig = PCA(self.x)
        pc_svd = PCA(self.x, method='svd')

        assert_allclose(pc_eig.projection, pc_svd.projection)
        assert_allclose(np.abs(pc_eig.factors[:, :2]),
                        np.abs(pc_svd.factors[:, :2]))
        assert_allclose(np.abs(pc_eig.coeff[:2, :]),
                        np.abs(pc_svd.coeff[:2, :]))
        assert_allclose(pc_eig.eigenvals,
                        pc_svd.eigenvals)
        assert_allclose(np.abs(pc_eig.eigenvecs[:, :2]),
                        np.abs(pc_svd.eigenvecs[:, :2]))

        pc_svd = PCA(self.x, method='svd', ncomp=2)
        pc_nipals = PCA(self.x, method='nipals', ncomp=2)
        assert_allclose(np.abs(pc_nipals.factors),
                        np.abs(pc_svd.factors),
                        atol=DECIMAL_5)
        assert_allclose(np.abs(pc_nipals.coeff),
                        np.abs(pc_svd.coeff),
                        atol=DECIMAL_5)
        assert_allclose(pc_nipals.eigenvals,
                        pc_svd.eigenvals,
                        atol=DECIMAL_5)
        assert_allclose(np.abs(pc_nipals.eigenvecs),
                        np.abs(pc_svd.eigenvecs),
                        atol=DECIMAL_5)
        # Check data for no changes
        assert_equal(self.x, pc_svd.data)
        # Check data for no changes
        assert_equal(self.x, pc_eig.data)
        # Check data for no changes
        assert_equal(self.x, pc_nipals.data)

    def test_options(self):
        pc = PCA(self.x)
        pc_no_norm = PCA(self.x, normalize=False)
        assert_allclose(pc.factors.dot(pc.coeff),
                        pc_no_norm.factors.dot(pc_no_norm.coeff))
        princomp = pc.factors
        assert_allclose(princomp.T.dot(princomp), np.eye(100), atol=1e-5)
        weights = pc_no_norm.coeff
        assert_allclose(weights.T.dot(weights), np.eye(100), atol=1e-5)

        pc_10 = PCA(self.x, ncomp=10)
        assert_allclose(pc.factors[:, :10], pc_10.factors)
        assert_allclose(pc.coeff[:10, :], pc_10.coeff)
        assert_allclose(pc.rsquare[:(10 + 1)], pc_10.rsquare)
        assert_allclose(pc.eigenvals[:10], pc_10.eigenvals)
        assert_allclose(pc.eigenvecs[:, :10], pc_10.eigenvecs)

        pc = PCA(self.x, standardize=False, normalize=False)
        mu = self.x.mean(0)
        xdm = self.x - mu
        xpx = xdm.T.dot(xdm)
        val, vec = np.linalg.eigh(xpx)
        ind = np.argsort(val)
        ind = ind[::-1]
        val = val[ind]
        vec = vec[:, ind]
        assert_allclose(xdm, pc.transformed_data)
        assert_allclose(val, pc.eigenvals)
        assert_allclose(np.abs(vec), np.abs(pc.eigenvecs))
        assert_allclose(np.abs(pc.factors), np.abs(xdm.dot(vec)))
        assert_allclose(pc.projection, xdm + mu)

        pc = PCA(self.x, standardize=False, demean=False, normalize=False)
        x = self.x
        xpx = x.T.dot(x)
        val, vec = np.linalg.eigh(xpx)
        ind = np.argsort(val)
        ind = ind[::-1]
        val = val[ind]
        vec = vec[:, ind]
        assert_allclose(x, pc.transformed_data)
        assert_allclose(val, pc.eigenvals)
        assert_allclose(np.abs(vec), np.abs(pc.eigenvecs))
        assert_allclose(np.abs(pc.factors), np.abs(x.dot(vec)))

    def test_against_reference(self):
        # Test against MATLAB, which by default demeans but does not
        # standardize
        x = data.xo / 1000.0
        pc = PCA(x, normalize=False, standardize=False)

        ref = princomp1
        assert_allclose(np.abs(pc.factors), np.abs(ref.factors))
        assert_allclose(pc.factors.dot(pc.coeff) + x.mean(0), x)
        assert_allclose(np.abs(pc.coeff), np.abs(ref.coef.T))
        assert_allclose(pc.factors.dot(pc.coeff),
                        ref.factors.dot(ref.coef.T))

        pc = PCA(x[:20], normalize=False, standardize=False)
        mu = x[:20].mean(0)
        ref = princomp2
        assert_allclose(np.abs(pc.factors), np.abs(ref.factors))
        assert_allclose(pc.factors.dot(pc.coeff) + mu, x[:20])
        assert_allclose(np.abs(pc.coeff), np.abs(ref.coef.T))
        assert_allclose(pc.factors.dot(pc.coeff),
                        ref.factors.dot(ref.coef.T))

    def test_warnings_and_errors(self):
        with warnings.catch_warnings(record=True) as w:
            pc = PCA(self.x, ncomp=300)
            assert_equal(len(w), 1)

        with warnings.catch_warnings(record=True) as w:
            rs = self.rs
            x = rs.standard_normal((200, 1)) * np.ones(200)
            pc = PCA(x, method='eig')
            assert_equal(len(w), 1)

        assert_raises(ValueError, PCA, self.x, method='unknown')
        assert_raises(ValueError, PCA, self.x, missing='unknown')
        assert_raises(ValueError, PCA, self.x, tol=2.0)
        assert_raises(ValueError, PCA, np.nan * np.ones((200, 100)), tol=2.0)

    @pytest.mark.matplotlib
    def test_pandas(self, close_figures):
        pc = PCA(pd.DataFrame(self.x))
        pc1 = PCA(self.x)
        assert_allclose(pc.factors.values, pc1.factors)
        fig = pc.plot_scree()
        fig = pc.plot_scree(ncomp=10)
        fig = pc.plot_scree(log_scale=False)
        fig = pc.plot_rsquare()
        fig = pc.plot_rsquare(ncomp=5)
        proj = pc.project(2)
        PCA(pd.DataFrame(self.x), ncomp=4, gls=True)
        PCA(pd.DataFrame(self.x), ncomp=4, standardize=False)

    def test_gls_and_weights(self):
        assert_raises(ValueError, PCA, self.x, gls=True)
        assert_raises(ValueError, PCA, self.x, weights=np.array([1.0, 1.0]))

        # Pre-standardize to make comparison simple
        x = (self.x - self.x.mean(0))
        x = x / (x ** 2.0).mean(0)
        pc_gls = PCA(x, ncomp=1, standardize=False, demean=False, gls=True)
        pc = PCA(x, ncomp=1, standardize=False, demean=False)
        errors = x - pc.projection
        var = (errors ** 2.0).mean(0)
        weights = 1.0 / var
        weights = weights / np.sqrt((weights ** 2.0).mean())
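        # the GLS weights should be proportional to the inverse
        # idiosyncratic variance, normalized to unit root-mean-square;
        # the assert below checks this against PCA's internal weights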

        assert_allclose(weights, pc_gls.weights)
        assert_equal(x, pc_gls.data)
        assert_equal(x, pc.data)

        pc_weights = PCA(x, ncomp=1, standardize=False, demean=False,
                         weights=weights)

        assert_allclose(weights, pc_weights.weights)
        assert_allclose(np.abs(pc_weights.factors), np.abs(pc_gls.factors))

    @pytest.mark.slow
    def test_wide(self):
        pc = PCA(self.x_wide)
        assert_equal(pc.factors.shape[1], self.x_wide.shape[0])
        assert_equal(pc.eigenvecs.shape[1], min(np.array(self.x_wide.shape)))

        pc = PCA(pd.DataFrame(self.x_wide))
        assert_equal(pc.factors.shape[1], self.x_wide.shape[0])
        assert_equal(pc.eigenvecs.shape[1], min(np.array(self.x_wide.shape)))

    def test_projection(self):
        pc = PCA(self.x, ncomp=5)
        mu = self.x.mean(0)
        demean_x = self.x - mu
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, ncomp=5)
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, demean=False, ncomp=5)
        coef = np.linalg.pinv(pc.factors).dot(self.x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct)

        pc = PCA(self.x, ncomp=5, gls=True)
        mu = self.x.mean(0)
        demean_x = self.x - mu
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, ncomp=5)
        coef = np.linalg.pinv(pc.factors).dot(demean_x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct + mu)

        pc = PCA(self.x, standardize=False, demean=False, ncomp=5, gls=True)
        coef = np.linalg.pinv(pc.factors).dot(self.x)
        direct = pc.factors.dot(coef)
        assert_allclose(pc.projection, direct)

        # Test error for too many factors
        project = pc.project
        assert_raises(ValueError, project, 6)

    @pytest.mark.skipif(PLATFORM_WIN32, reason='Windows 32-bit')
    def test_replace_missing(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan

        pc = PCA(x, missing='drop-row')
        x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
        pc_dropped = PCA(x_dropped_row)
        assert_allclose(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-col')
        x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
        pc_dropped = PCA(x_dropped_col)
        assert_allclose(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-min')
        if x_dropped_row.size > x_dropped_col.size:
            x_dropped_min = x_dropped_row
        else:
            x_dropped_min = x_dropped_col
        pc_dropped = PCA(x_dropped_min)
        assert_allclose(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, ncomp=3, missing='fill-em')
        missing = np.isnan(x)
        mu = np.nanmean(x, axis=0)
        errors = x - mu
        sigma = np.sqrt(np.nanmean(errors ** 2, axis=0))
        x_std = errors / sigma
        x_std[missing] = 0.0
        last = x_std[missing]
        delta = 1.0
        count = 0
        while delta > 5e-8:
            pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
            x_std[missing] = pc_temp.projection[missing]
            current = x_std[missing]
            diff = current - last
            delta = np.sqrt(np.sum(diff ** 2)) / np.sqrt(np.sum(current ** 2))
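            # delta is the relative change of the imputed entries; this
            # loop mirrors the 'fill-em' EM iteration, stopping once the
            # imputations stabilize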
            last = current
            count += 1
        x = self.x + 0.0
        projection = pc_temp.projection * sigma + mu
        x[missing] = projection[missing]
        assert_allclose(pc._adjusted_data, x)
        # Check data for no changes
        assert_equal(self.x, self.x_copy)

        x = self.x
        pc = PCA(x)
        pc_dropped = PCA(x, missing='drop-row')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-col')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-min')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc = PCA(x, ncomp=3)
        pc_dropped = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        # Test too many missing for missing='fill-em'
        x = self.x.copy()
        x[:, :] = np.nan
        assert_raises(ValueError, PCA, x, missing='drop-row')
        assert_raises(ValueError, PCA, x, missing='drop-col')
        assert_raises(ValueError, PCA, x, missing='drop-min')
        assert_raises(ValueError, PCA, x, missing='fill-em')

    def test_rsquare(self):
        x = self.x + 0.0
        mu = x.mean(0)
        x_demean = x - mu
        std = np.std(x, 0)
        x_std = x_demean / std

        pc = PCA(self.x)
        nvar = x.shape[1]
        rsquare = np.zeros(nvar + 1)
        tss = np.sum(x_std ** 2)
        for i in range(nvar + 1):
            errors = x_std - pc.project(i, transform=False, unweight=False)
            rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
        assert_allclose(rsquare, pc.rsquare)

        pc = PCA(self.x, standardize=False)
        tss = np.sum(x_demean ** 2)
        for i in range(nvar + 1):
            errors = x_demean - pc.project(i, transform=False, unweight=False)
            rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
        assert_allclose(rsquare, pc.rsquare)

        pc = PCA(self.x, standardize=False, demean=False)
        tss = np.sum(x ** 2)
        for i in range(nvar + 1):
            errors = x - pc.project(i, transform=False, unweight=False)
            rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
        assert_allclose(rsquare, pc.rsquare)

    @pytest.mark.slow
    def test_missing_dataframe(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan
        pc = PCA(x, ncomp=3, missing='fill-em')

        x = pd.DataFrame(x)
        pc_df = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

        pc_df_nomissing = PCA(pd.DataFrame(self.x.copy()), ncomp=3)
        assert isinstance(pc_df.coeff, type(pc_df_nomissing.coeff))
        assert isinstance(pc_df.data, type(pc_df_nomissing.data))
        assert isinstance(pc_df.eigenvals, type(pc_df_nomissing.eigenvals))
        assert isinstance(pc_df.eigenvecs, type(pc_df_nomissing.eigenvecs))

        x = self.x.copy()
        x[::5, ::7] = np.nan
        x_df = pd.DataFrame(x)
        pc = PCA(x, missing='drop-row')
        pc_df = PCA(x_df, missing='drop-row')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

        pc = PCA(x, missing='drop-col')
        pc_df = PCA(x_df, missing='drop-col')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

        pc = PCA(x, missing='drop-min')
        pc_df = PCA(x_df, missing='drop-min')
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

    def test_equivalence(self):
        x = self.x.copy()
        assert_allclose(PCA(x).factors, pca(x)[0])

    def test_equivalence_full_matrices(self):
        x = self.x.copy()
        svd_full_matrices_true = PCA(x, svd_full_matrices=True).factors
        svd_full_matrices_false = PCA(x).factors
        assert_allclose(svd_full_matrices_true, svd_full_matrices_false)


def test_missing():
    data = np.empty((200, 50))
    data[0, 0] = np.nan
    with pytest.raises(ValueError, match="data contains non-finite values"):
        PCA(data)


def test_too_many_missing(reset_randomstate):
    data = np.random.standard_normal((200, 50))
    data[0, :-3] = np.nan
    with pytest.raises(ValueError):
        PCA(data, ncomp=5, missing="drop-col")
    p = PCA(data, missing="drop-min")
    assert max(p.factors.shape) == max(data.shape) - 1


def test_gls_warning(reset_randomstate):
    data = np.random.standard_normal((400, 200))
    data[:, 1:] = data[:, :1] + .01 * data[:, 1:]
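    # all columns are nearly collinear with the first, so GLS should
    # down-weight most series and trigger the warning checked below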
    with pytest.warns(EstimationWarning,
                      match="Many series are being down weighted"):
        factors = PCA(data, ncomp=2, gls=True).factors
    assert factors.shape == (data.shape[0], 2)