reconnect moved files to git repo

root
2025-08-01 04:33:03 -04:00
commit 5d3c35492d
23190 changed files with 4750716 additions and 0 deletions


@@ -0,0 +1,3 @@
from statsmodels.tools._test_runner import PytestTester
test = PytestTester()


@@ -0,0 +1,10 @@
__all__ = [
"PCA", "MANOVA", "Factor", "FactorResults", "CanCorr",
"factor_rotation"
]
from .pca import PCA
from .manova import MANOVA
from .factor import Factor, FactorResults
from .cancorr import CanCorr
from . import factor_rotation


@@ -0,0 +1,176 @@
"""Canonical correlation analysis
author: Yichuan Liu
"""
import numpy as np
from numpy.linalg import svd
import scipy
import pandas as pd
from statsmodels.base.model import Model
from statsmodels.iolib import summary2
from .multivariate_ols import multivariate_stats
class CanCorr(Model):
"""
Canonical correlation analysis using singular value decomposition
For matrices exog=x and endog=y, find projections x_cancoef and y_cancoef
such that:
x1 = x * x_cancoef, x1' * x1 is identity matrix
y1 = y * y_cancoef, y1' * y1 is identity matrix
and the correlation between x1 and y1 is maximized.
Attributes
----------
endog : ndarray
See Parameters.
exog : ndarray
See Parameters.
cancorr : ndarray
The canonical correlation values
y_cancoef : ndarray
The canonical coefficients for endog
x_cancoef : ndarray
The canonical coefficients for exog
References
----------
.. [*] http://numerical.recipes/whp/notes/CanonCorrBySVD.pdf
.. [*] http://www.csun.edu/~ata20315/psy524/docs/Psy524%20Lecture%208%20CC.pdf
.. [*] http://www.mathematica-journal.com/2014/06/canonical-correlation-analysis/
""" # noqa:E501
def __init__(self, endog, exog, tolerance=1e-8, missing='none', hasconst=None, **kwargs):
super().__init__(endog, exog, missing=missing,
hasconst=hasconst, **kwargs)
self._fit(tolerance)
def _fit(self, tolerance=1e-8):
"""Fit the model
A ValueError is raised if there are singular values smaller than the
tolerance. The treatment of singular arrays might change in the future.
Parameters
----------
tolerance : float
singular value tolerance; values smaller than this are considered 0
"""
nobs, k_yvar = self.endog.shape
nobs, k_xvar = self.exog.shape
k = np.min([k_yvar, k_xvar])
x = np.array(self.exog)
x = x - x.mean(0)
y = np.array(self.endog)
y = y - y.mean(0)
ux, sx, vx = svd(x, 0)
# vx_ds = vx.T divided by sx
vx_ds = vx.T
mask = sx > tolerance
if mask.sum() < len(mask):
raise ValueError('exog is collinear.')
vx_ds[:, mask] /= sx[mask]
uy, sy, vy = svd(y, 0)
# vy_ds = vy.T divided by sy
vy_ds = vy.T
mask = sy > tolerance
if mask.sum() < len(mask):
raise ValueError('endog is collinear.')
vy_ds[:, mask] /= sy[mask]
u, s, v = svd(ux.T.dot(uy), 0)
# Correct any roundoff
self.cancorr = np.array([max(0, min(s[i], 1)) for i in range(len(s))])
self.x_cancoef = vx_ds.dot(u[:, :k])
self.y_cancoef = vy_ds.dot(v.T[:, :k])
def corr_test(self):
"""Approximate F test
Perform multivariate statistical tests of the hypothesis that
there is no canonical correlation between endog and exog.
The significance of each canonical correlation is tested using
Wilks' lambda.
Returns
-------
CanCorrTestResults instance
"""
nobs, k_yvar = self.endog.shape
nobs, k_xvar = self.exog.shape
eigenvals = np.power(self.cancorr, 2)
stats = pd.DataFrame(columns=['Canonical Correlation', "Wilks' lambda",
'Num DF', 'Den DF', 'F Value', 'Pr > F'],
index=list(range(len(eigenvals) - 1, -1, -1)))
prod = 1
for i in range(len(eigenvals) - 1, -1, -1):
prod *= 1 - eigenvals[i]
p = k_yvar - i
q = k_xvar - i
r = (nobs - k_yvar - 1) - (p - q + 1) / 2
u = (p * q - 2) / 4
df1 = p * q
if p ** 2 + q ** 2 - 5 > 0:
t = np.sqrt(((p * q) ** 2 - 4) / (p ** 2 + q ** 2 - 5))
else:
t = 1
df2 = r * t - 2 * u
lmd = np.power(prod, 1 / t)
F = (1 - lmd) / lmd * df2 / df1
stats.loc[i, 'Canonical Correlation'] = self.cancorr[i]
stats.loc[i, "Wilks' lambda"] = prod
stats.loc[i, 'Num DF'] = df1
stats.loc[i, 'Den DF'] = df2
stats.loc[i, 'F Value'] = F
pval = scipy.stats.f.sf(F, df1, df2)
stats.loc[i, 'Pr > F'] = pval
'''
# Wilks' chi-square test of each canonical correlation
df = (p - i + 1) * (q - i + 1)
chi2 = a * np.log(prod)
pval = stats.chi2.sf(chi2, df)
stats.loc[i, 'Canonical correlation'] = self.cancorr[i]
stats.loc[i, 'Chi-square'] = chi2
stats.loc[i, 'DF'] = df
stats.loc[i, 'Pr > ChiSq'] = pval
'''
ind = stats.index.values[::-1]
stats = stats.loc[ind, :]
# Multivariate tests (remember x has mean removed)
stats_mv = multivariate_stats(eigenvals,
k_yvar, k_xvar, nobs - k_xvar - 1)
return CanCorrTestResults(stats, stats_mv)
class CanCorrTestResults:
"""
Canonical correlation results class
Attributes
----------
stats : DataFrame
Contains the statistical test results for each canonical correlation
stats_mv : DataFrame
Contains the multivariate statistical test results
"""
def __init__(self, stats, stats_mv):
self.stats = stats
self.stats_mv = stats_mv
def __str__(self):
return self.summary().__str__()
def summary(self):
summ = summary2.Summary()
summ.add_title('Cancorr results')
summ.add_df(self.stats)
summ.add_dict({'': ''})
summ.add_dict({'Multivariate Statistics and F Approximations': ''})
summ.add_df(self.stats_mv)
return summ
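A minimal usage sketch (illustrative, not part of the commit; the data and variable names are made up). CanCorr fits on construction, so corr_test can be called immediately:

import numpy as np
from statsmodels.multivariate.cancorr import CanCorr

rng = np.random.default_rng(0)
x = rng.standard_normal((100, 3))                # exog
y = x @ rng.standard_normal((3, 2)) \
    + rng.standard_normal((100, 2))              # endog, correlated with x
mod = CanCorr(y, x)                              # _fit runs in __init__
print(mod.cancorr)                               # min(k_yvar, k_xvar) correlations
print(mod.corr_test().summary())                 # approximate F tests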

File diff suppressed because it is too large


@@ -0,0 +1,32 @@
"""
Package with factor rotation algorithms.
This file contains a Python version of the gradient projection rotation
algorithms (GPA) developed by Bernaards, C.A. and Jennrich, R.I.
The code is based on the Matlab version of the code developed by Bernaards, C.A.
and Jennrich, R.I. and is ported and made available with permission of the
authors.
Additionally, several analytic rotation methods are implemented.
References
----------
[1] Bernaards, C.A. and Jennrich, R.I. (2005) Gradient Projection Algorithms and Software for Arbitrary Rotation Criteria in Factor Analysis. Educational and Psychological Measurement, 65 (5), 676-696.
[2] Jennrich, R.I. (2001). A simple general procedure for orthogonal rotation. Psychometrika, 66, 289-306.
[3] Jennrich, R.I. (2002). A simple general method for oblique rotation. Psychometrika, 67, 7-19.
[4] http://www.stat.ucla.edu/research/gpa/matlab.net
[5] http://www.stat.ucla.edu/research/gpa/GPderfree.txt
"""
from ._wrappers import rotate_factors
from ._analytic_rotation import target_rotation, procrustes, promax
from statsmodels.tools._test_runner import PytestTester
__all__ = ['rotate_factors', 'target_rotation', 'procrustes', 'promax',
'test']
test = PytestTester()
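For orientation, a small sketch (random loading matrix): the package is normally driven through rotate_factors, while promax is exported separately because it is analytic:

import numpy as np
from statsmodels.multivariate.factor_rotation import (
    rotate_factors, promax)

A = np.random.randn(8, 2)                # unrotated loadings
L, T = rotate_factors(A, 'varimax')      # orthogonal: L = A T
L2, T2 = rotate_factors(A, 'quartimin')  # oblique: L = A (T')^{-1}
L3, T3 = promax(A, k=2)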


@@ -0,0 +1,152 @@
"""
This file contains analytic implementations of rotation methods.
"""
import numpy as np
import scipy as sp
def target_rotation(A, H, full_rank=False):
r"""
Analytically performs orthogonal rotations towards a target matrix,
i.e., we minimize:
.. math::
\phi(L) =\frac{1}{2}\|AT-H\|^2.
where :math:`T` is an orthogonal matrix. This problem is also known as
an orthogonal Procrustes problem.
Under the assumption that :math:`A^*H` has full rank, the analytical
solution :math:`T` is given by:
.. math::
T = (A^*HH^*A)^{-\frac{1}{2}}A^*H,
see Green (1952). In other cases the solution is given by :math:`T = UV`,
where :math:`U` and :math:`V` result from the singular value decomposition
of :math:`A^*H`:
.. math::
A^*H = U\Sigma V,
see Schonemann (1966).
Parameters
----------
A : numpy matrix
non rotated factors
H : numpy matrix
target matrix
full_rank : bool (default False)
if set to True, full rank is assumed
Returns
-------
The matrix :math:`T`.
References
----------
[1] Green (1952, Psychometrika) - The orthogonal approximation of an
oblique structure in factor analysis
[2] Schonemann (1966) - A generalized solution of the orthogonal
procrustes problem
[3] Gower, Dijksterhuis (2004) - Procrustes problems
"""
ATH = A.T.dot(H)
if full_rank or np.linalg.matrix_rank(ATH) == A.shape[1]:
T = sp.linalg.fractional_matrix_power(ATH.dot(ATH.T), -1/2).dot(ATH)
else:
U, D, V = np.linalg.svd(ATH, full_matrices=False)
T = U.dot(V)
return T
def procrustes(A, H):
r"""
Analytically solves the following Procrustes problem:
.. math::
\phi(L) =\frac{1}{2}\|AT-H\|^2.
(with no orthogonality constraint on :math:`T`).
Under the assumption that :math:`A` has full column rank, the analytical
solution :math:`T` is given by:
.. math::
T = (A^*A)^{-1}A^*H,
see Navarra, Simoncini (2010).
Parameters
----------
A : numpy matrix
non rotated factors
H : numpy matrix
target matrix
Returns
-------
The matrix :math:`T`.
References
----------
[1] Navarra, Simoncini (2010) - A guide to empirical orthogonal functions
for climate data analysis
"""
return np.linalg.inv(A.T.dot(A)).dot(A.T).dot(H)
def promax(A, k=2):
r"""
Performs promax rotation of the matrix :math:`A`.
This method was not very clear to me from the literature; this
implementation works as I understand it should.
Promax rotation is performed in the following steps:
* Determine varimax rotated patterns :math:`V`.
* Construct a rotation target matrix :math:`|V_{ij}|^k/V_{ij}`
* Perform procrustes rotation towards the target to obtain T
* Determine the patterns
First, a target matrix :math:`H` is determined with
orthogonal varimax rotation.
Then, oblique target rotation is performed towards the target.
Parameters
----------
A : numpy matrix
non rotated factors
k : float
parameter, should be positive
References
----------
[1] Browne (2001) - An overview of analytic rotation in exploratory
factor analysis
[2] Navarra, Simoncini (2010) - A guide to empirical orthogonal functions
for climate data analysis
"""
assert k > 0
# define rotation target using varimax rotation:
from ._wrappers import rotate_factors
V, T = rotate_factors(A, 'varimax')
H = np.abs(V)**k/V
# solve procrustes problem
S = procrustes(A, H) # np.linalg.inv(A.T.dot(A)).dot(A.T).dot(H);
# normalize
d = np.sqrt(np.diag(np.linalg.inv(S.T.dot(S))))
D = np.diag(d)
T = np.linalg.inv(S.dot(D)).T
return A.dot(T), T
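A short sketch of the analytic routines above (toy matrices; the target H is arbitrary). target_rotation returns an orthogonal T; procrustes returns the unconstrained least-squares transform:

import numpy as np
from statsmodels.multivariate.factor_rotation import (
    target_rotation, procrustes)

A = np.random.randn(8, 2)
H = np.abs(A)                             # hypothetical target
T = target_rotation(A, H)                 # orthogonal Procrustes solution
print(np.allclose(T.T @ T, np.eye(2)))    # True: T is orthogonal
S = procrustes(A, H)                      # minimizes ||A S - H|| over all S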


@@ -0,0 +1,592 @@
"""
This file contains a Python version of the gradient projection rotation
algorithms (GPA) developed by Bernaards, C.A. and Jennrich, R.I.
The code is based on code developed by Bernaards, C.A. and Jennrich, R.I.
and is ported and made available with permission of the authors.
References
----------
[1] Bernaards, C.A. and Jennrich, R.I. (2005) Gradient Projection Algorithms
and Software for Arbitrary Rotation Criteria in Factor Analysis. Educational
and Psychological Measurement, 65 (5), 676-696.
[2] Jennrich, R.I. (2001). A simple general procedure for orthogonal rotation.
Psychometrika, 66, 289-306.
[3] Jennrich, R.I. (2002). A simple general method for oblique rotation.
Psychometrika, 67, 7-19.
[4] http://www.stat.ucla.edu/research/gpa/matlab.net
[5] http://www.stat.ucla.edu/research/gpa/GPderfree.txt
"""
import numpy as np
def GPA(A, ff=None, vgQ=None, T=None, max_tries=501,
rotation_method='orthogonal', tol=1e-5):
r"""
The gradient projection algorithm (GPA) minimizes a target function
:math:`\phi(L)`, where :math:`L` is a matrix with rotated factors.
For orthogonal rotation methods :math:`L=AT`, where :math:`T` is an
orthogonal matrix. For oblique rotation matrices :math:`L=A(T^*)^{-1}`,
where :math:`T` is a normal matrix, i.e., :math:`TT^*=T^*T`. Oblique
rotations relax the orthogonality constraint in order to gain simplicity
in the interpretation.
Parameters
----------
A : numpy matrix
non rotated factors
T : numpy matrix (default identity matrix)
initial guess of rotation matrix
ff : function (default None)
criterion :math:`\phi` to optimize. Should have A, T, L as keyword
arguments
and map to a float. Only used (and required) if vgQ is not
provided.
vgQ : function (default None)
criterion :math:`\phi` to optimize and its derivative. Should have
A, T, L as keyword arguments and map to a tuple containing a
float and the gradient matrix. Can be omitted if ff is provided.
max_tries : int (default 501)
maximum number of iterations
rotation_method : str
should be one of {orthogonal, oblique}
tol : float
stop criterion, algorithm stops if Frobenius norm of gradient is
smaller than tol
"""
# pre processing
if rotation_method not in ['orthogonal', 'oblique']:
raise ValueError('rotation_method should be one of '
'{orthogonal, oblique}')
if vgQ is None:
if ff is None:
raise ValueError('ff should be provided if vgQ is not')
derivative_free = True
Gff = lambda x: Gf(x, lambda y: ff(T=y, A=A, L=None))
else:
derivative_free = False
if T is None:
T = np.eye(A.shape[1])
# pre processing for iteration
al = 1
table = []
# pre processing for iteration: initialize f and G
if derivative_free:
f = ff(T=T, A=A, L=None)
G = Gff(T)
elif rotation_method == 'orthogonal': # and not derivative_free
L = A.dot(T)
f, Gq = vgQ(L=L)
G = (A.T).dot(Gq)
else: # i.e. rotation_method == 'oblique' and not derivative_free
Ti = np.linalg.inv(T)
L = A.dot(Ti.T)
f, Gq = vgQ(L=L)
G = -((L.T).dot(Gq).dot(Ti)).T
# iteration
for i_try in range(0, max_tries):
# determine Gp
if rotation_method == 'orthogonal':
M = (T.T).dot(G)
S = (M + M.T)/2
Gp = G - T.dot(S)
else: # i.e. if rotation_method == 'oblique':
Gp = G-T.dot(np.diag(np.sum(T*G, axis=0)))
s = np.linalg.norm(Gp, 'fro')
table.append([i_try, f, np.log10(s), al])
# if we are close stop
if s < tol:
break
# update T
al = 2*al
for i in range(11):
# determine Tt
X = T - al*Gp
if rotation_method == 'orthogonal':
U, D, V = np.linalg.svd(X, full_matrices=False)
Tt = U.dot(V)
else: # i.e. if rotation_method == 'oblique':
v = 1/np.sqrt(np.sum(X**2, axis=0))
Tt = X.dot(np.diag(v))
# calculate objective using Tt
if derivative_free:
ft = ff(T=Tt, A=A, L=None)
elif rotation_method == 'orthogonal': # and not derivative_free
L = A.dot(Tt)
ft, Gq = vgQ(L=L)
else: # i.e. rotation_method == 'oblique' and not derivative_free
Ti = np.linalg.inv(Tt)
L = A.dot(Ti.T)
ft, Gq = vgQ(L=L)
# if sufficient improvement in objective -> use this T
if ft < f-.5*s**2*al:
break
al = al/2
# post processing for next iteration
T = Tt
f = ft
if derivative_free:
G = Gff(T)
elif rotation_method == 'orthogonal': # and not derivative_free
G = (A.T).dot(Gq)
else: # i.e. rotation_method == 'oblique' and not derivative_free
G = -((L.T).dot(Gq).dot(Ti)).T
# post processing
Th = T
Lh = rotateA(A, T, rotation_method=rotation_method)
Phi = (T.T).dot(T)
return Lh, Phi, Th, table
def Gf(T, ff):
"""
Subroutine for the gradient of f using numerical derivatives.
"""
k = T.shape[0]
ep = 1e-4
G = np.zeros((k, k))
for r in range(k):
for s in range(k):
dT = np.zeros((k, k))
dT[r, s] = ep
G[r, s] = (ff(T+dT)-ff(T-dT))/(2*ep)
return G
def rotateA(A, T, rotation_method='orthogonal'):
r"""
For orthogonal rotation methods :math:`L=AT`, where :math:`T` is an
orthogonal matrix. For oblique rotation matrices :math:`L=A(T^*)^{-1}`,
where :math:`T` is a normal matrix, i.e., :math:`TT^*=T^*T`. Oblique
rotations relax the orthogonality constraint in order to gain simplicity
in the interpretation.
"""
if rotation_method == 'orthogonal':
L = A.dot(T)
elif rotation_method == 'oblique':
L = A.dot(np.linalg.inv(T.T))
else:  # invalid rotation_method
raise ValueError('rotation_method should be one of '
'{orthogonal, oblique}')
return L
def oblimin_objective(L=None, A=None, T=None, gamma=0,
rotation_method='orthogonal',
return_gradient=True):
r"""
Objective function for the oblimin family for orthogonal or
oblique rotation which minimizes:
.. math::
\phi(L) = \frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)N),
where :math:`L` is a :math:`p\times k` matrix, :math:`N` is
:math:`k\times k`
matrix with zeros on the diagonal and ones elsewhere, :math:`C` is a
:math:`p\times p` matrix with elements equal to :math:`1/p`,
:math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius inner product and
:math:`\circ`
is the element-wise product or Hadamard product.
The gradient is given by
.. math::
L\circ\left[(I-\gamma C) (L \circ L)N\right].
Either :math:`L` should be provided or :math:`A` and :math:`T` should be
provided.
For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix. For oblique rotations :math:`L`
satisfies
.. math::
L = A(T^*)^{-1},
where :math:`T` is a normal matrix.
The oblimin family is parametrized by the parameter :math:`\gamma`. For
orthogonal rotations:
* :math:`\gamma=0` corresponds to quartimax,
* :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
* :math:`\gamma=1` corresponds to varimax,
* :math:`\gamma=\frac{1}{p}` corresponds to equamax.
For oblique rotations:
* :math:`\gamma=0` corresponds to quartimin,
* :math:`\gamma=\frac{1}{2}` corresponds to biquartimin.
Parameters
----------
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
gamma : float (default 0)
a parameter
rotation_method : str
should be one of {orthogonal, oblique}
return_gradient : bool (default True)
toggles return of gradient
"""
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method=rotation_method)
p, k = L.shape
L2 = L**2
N = np.ones((k, k))-np.eye(k)
if np.isclose(gamma, 0):
X = L2.dot(N)
else:
C = np.ones((p, p))/p
X = (np.eye(p) - gamma*C).dot(L2).dot(N)
phi = np.sum(L2*X)/4
if return_gradient:
Gphi = L*X
return phi, Gphi
else:
return phi
def orthomax_objective(L=None, A=None, T=None, gamma=0, return_gradient=True):
r"""
Objective function for the orthomax family for orthogonal
rotation which minimizes the following objective:
.. math::
\phi(L) = -\frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)),
where :math:`0\leq\gamma\leq1`, :math:`L` is a :math:`p\times k` matrix,
:math:`C` is a :math:`p\times p` matrix with elements equal to
:math:`1/p`,
:math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius inner product and
:math:`\circ` is the element-wise product or Hadamard product.
Either :math:`L` should be provided or :math:`A` and :math:`T` should be
provided.
For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix.
The orthomax family is parametrized by the parameter :math:`\gamma`:
* :math:`\gamma=0` corresponds to quartimax,
* :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
* :math:`\gamma=1` corresponds to varimax,
* :math:`\gamma=\frac{1}{p}` corresponds to equamax.
Parameters
----------
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
gamma : float (default 0)
a parameter
return_gradient : bool (default True)
toggles return of gradient
"""
assert 0 <= gamma <= 1, "Gamma should be between 0 and 1"
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method='orthogonal')
p, k = L.shape
L2 = L**2
if np.isclose(gamma, 0):
X = L2
else:
C = np.ones((p, p))/p
X = (np.eye(p)-gamma*C).dot(L2)
phi = -np.sum(L2*X)/4
if return_gradient:
Gphi = -L*X
return phi, Gphi
else:
return phi
def CF_objective(L=None, A=None, T=None, kappa=0,
rotation_method='orthogonal',
return_gradient=True):
r"""
Objective function for the Crawford-Ferguson family for orthogonal
and oblique rotation which minimizes the following objective:
.. math::
\phi(L) =\frac{1-\kappa}{4} (L\circ L,(L\circ L)N)
+\frac{\kappa}{4}(L\circ L,M(L\circ L)),
where :math:`0\leq\kappa\leq1`, :math:`L` is a :math:`p\times k` matrix,
:math:`N` is a :math:`k\times k` matrix with zeros on the diagonal and ones
elsewhere,
:math:`M` is a :math:`p\times p` matrix with zeros on the diagonal and ones
elsewhere,
:math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius inner product and
:math:`\circ` is the element-wise product or Hadamard product.
The gradient is given by
.. math::
d\phi(L) = (1-\kappa) L\circ\left[(L\circ L)N\right]
+\kappa L\circ \left[M(L\circ L)\right].
Either :math:`L` should be provided or :math:`A` and :math:`T` should be
provided.
For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix. For oblique rotations :math:`L`
satisfies
.. math::
L = A(T^*)^{-1},
where :math:`T` is a normal matrix.
For orthogonal rotations the oblimin (and orthomax) family of rotations is
equivalent to the Crawford-Ferguson family. To be more precise:
* :math:`\kappa=0` corresponds to quartimax,
* :math:`\kappa=\frac{1}{p}` corresponds to varimax,
* :math:`\kappa=\frac{k-1}{p+k-2}` corresponds to parsimax,
* :math:`\kappa=1` corresponds to factor parsimony.
Parameters
----------
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
kappa : float (default 0)
Crawford-Ferguson family parameter
rotation_method : str
should be one of {orthogonal, oblique}
return_gradient : bool (default True)
toggles return of gradient
"""
assert 0 <= kappa <= 1, "Kappa should be between 0 and 1"
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method=rotation_method)
p, k = L.shape
L2 = L**2
X = None
if not np.isclose(kappa, 1):
N = np.ones((k, k)) - np.eye(k)
X = (1 - kappa)*L2.dot(N)
if not np.isclose(kappa, 0):
M = np.ones((p, p)) - np.eye(p)
if X is None:
X = kappa*M.dot(L2)
else:
X += kappa*M.dot(L2)
phi = np.sum(L2 * X) / 4
if return_gradient:
Gphi = L*X
return phi, Gphi
else:
return phi
def vgQ_target(H, L=None, A=None, T=None, rotation_method='orthogonal'):
r"""
Subroutine for the value of vgQ using orthogonal or oblique rotation
towards a target matrix, i.e., we minimize:
.. math::
\phi(L) =\frac{1}{2}\|L-H\|^2
and the gradient is given by
.. math::
d\phi(L)=L-H.
Either :math:`L` should be provided or :math:`A` and :math:`T` should be
provided.
For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix. For oblique rotations :math:`L`
satisfies
.. math::
L = A(T^*)^{-1},
where :math:`T` is a normal matrix.
Parameters
----------
H : numpy matrix
target matrix
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
rotation_method : str
should be one of {orthogonal, oblique}
"""
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method=rotation_method)
q = np.linalg.norm(L-H, 'fro')**2
Gq = 2*(L-H)
return q, Gq
def ff_target(H, L=None, A=None, T=None, rotation_method='orthogonal'):
r"""
Subroutine for the value of f using (orthogonal or oblique) rotation
towards a target matrix, i.e., we minimize:
.. math::
\phi(L) =\frac{1}{2}\|L-H\|^2.
Either :math:`L` should be provided or :math:`A` and :math:`T` should be
provided. For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix. For oblique rotations
:math:`L` satisfies
.. math::
L = A(T^*)^{-1},
where :math:`T` is a normal matrix.
Parameters
----------
H : numpy matrix
target matrix
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
rotation_method : str
should be one of {orthogonal, oblique}
"""
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method=rotation_method)
return np.linalg.norm(L-H, 'fro')**2
def vgQ_partial_target(H, W=None, L=None, A=None, T=None):
r"""
Subroutine for the value of vgQ using orthogonal rotation towards a partial
target matrix, i.e., we minimize:
.. math::
\phi(L) =\frac{1}{2}\|W\circ(L-H)\|^2,
where :math:`\circ` is the element-wise product or Hadamard product and
:math:`W` is a matrix whose entries can only be one or zero. The gradient
is given by
.. math::
d\phi(L)=W\circ(L-H).
Either :math:`L` should be provided or :math:`A` and :math:`T` should be
provided.
For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix.
Parameters
----------
H : numpy matrix
target matrix
W : numpy matrix (default matrix with equal weight one for all entries)
matrix with weights, entries can either be one or zero
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
"""
if W is None:
return vgQ_target(H, L=L, A=A, T=T)
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method='orthogonal')
q = np.linalg.norm(W*(L-H), 'fro')**2
Gq = 2*W*(L-H)
return q, Gq
def ff_partial_target(H, W=None, L=None, A=None, T=None):
r"""
Subroutine for the value of f using orthogonal rotation towards a partial
target matrix, i.e., we minimize:
.. math::
\phi(L) =\frac{1}{2}\|W\circ(L-H)\|^2,
where :math:`\circ` is the element-wise product or Hadamard product and
:math:`W` is a matrix whose entries can only be one or zero. Either
:math:`L` should be provided or :math:`A` and :math:`T` should be provided.
For orthogonal rotations :math:`L` satisfies
.. math::
L = AT,
where :math:`T` is an orthogonal matrix.
Parameters
----------
H : numpy matrix
target matrix
W : numpy matrix (default matrix with equal weight one for all entries)
matrix with weights, entries can either be one or zero
L : numpy matrix (default None)
rotated factors, i.e., :math:`L=A(T^*)^{-1}=AT`
A : numpy matrix (default None)
non rotated factors
T : numpy matrix (default None)
rotation matrix
"""
if W is None:
return ff_target(H, L=L, A=A, T=T)
if L is None:
assert A is not None and T is not None
L = rotateA(A, T, rotation_method='orthogonal')
q = np.linalg.norm(W*(L-H), 'fro')**2
return q
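For reference, GPA can be driven directly with the objective functions defined above (this mirrors how the tests in the next file call it; the loading matrix is made up):

import numpy as np
from statsmodels.multivariate.factor_rotation._gpa_rotation import (
    GPA, oblimin_objective, orthomax_objective)

A = np.random.randn(8, 2)
# gradient-based quartimin (oblique): vgQ returns (value, gradient)
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
    L=L, A=A, T=T, gamma=0, rotation_method='oblique')
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
# derivative-free varimax (orthogonal): ff returns only the value
ff = lambda L=None, A=None, T=None: orthomax_objective(
    L=L, A=A, T=T, gamma=1, return_gradient=False)
L2, phi2, T2, table2 = GPA(A, ff=ff, rotation_method='orthogonal')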


@@ -0,0 +1,350 @@
from ._analytic_rotation import target_rotation
from ._gpa_rotation import oblimin_objective, orthomax_objective, CF_objective
from ._gpa_rotation import ff_partial_target, ff_target
from ._gpa_rotation import vgQ_partial_target, vgQ_target
from ._gpa_rotation import rotateA, GPA
__all__ = []
def rotate_factors(A, method, *method_args, **algorithm_kwargs):
r"""
Subroutine for orthogonal and oblique rotation of the matrix :math:`A`.
For orthogonal rotations :math:`A` is rotated to :math:`L` according to
.. math::
L = AT,
where :math:`T` is an orthogonal matrix. For oblique rotations,
:math:`A` is rotated to :math:`L` according to
.. math::
L = A(T^*)^{-1},
where :math:`T` is a normal matrix.
Parameters
----------
A : numpy matrix
non rotated factors
method : str
should be one of the methods listed below
method_args : list
additional arguments that should be provided with each method
algorithm_kwargs : dictionary
algorithm : str (default gpa)
should be one of:
* 'gpa': a numerical method
* 'gpa_der_free': a derivative free numerical method
* 'analytic' : an analytic method
Depending on the algorithm, there are algorithm specific keyword
arguments. For the gpa and gpa_der_free, the following
keyword arguments are available:
max_tries : int (default 501)
maximum number of iterations
tol : float
stop criterion, algorithm stops if Frobenius norm of gradient is
smaller than tol
For analytic, the supported arguments depend on the method, see above.
See the lower level functions for more details.
Returns
-------
The tuple :math:`(L,T)`
Notes
-----
What follows is a list of available methods. Depending on the method,
additional arguments are required and different algorithms
are available. The algorithm_kwargs are additional keyword arguments
passed to the selected algorithm (see the parameters section).
Unless stated otherwise, only the gpa and
gpa_der_free algorithms are available.
Below,
* :math:`L` is a :math:`p\times k` matrix;
* :math:`N` is a :math:`k\times k` matrix with zeros on the diagonal and ones
elsewhere;
* :math:`M` is a :math:`p\times p` matrix with zeros on the diagonal and ones
elsewhere;
* :math:`C` is a :math:`p\times p` matrix with elements equal to
:math:`1/p`;
* :math:`(X,Y)=\operatorname{Tr}(X^*Y)` is the Frobenius inner product;
* :math:`\circ` is the element-wise product or Hadamard product.
oblimin : orthogonal or oblique rotation that minimizes
.. math::
\phi(L) = \frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)N).
For orthogonal rotations:
* :math:`\gamma=0` corresponds to quartimax,
* :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
* :math:`\gamma=1` corresponds to varimax,
* :math:`\gamma=\frac{1}{p}` corresponds to equamax.
For oblique rotations:
* :math:`\gamma=0` corresponds to quartimin,
* :math:`\gamma=\frac{1}{2}` corresponds to biquartimin.
method_args:
gamma : float
oblimin family parameter
rotation_method : str
should be one of {orthogonal, oblique}
orthomax : orthogonal rotation that minimizes
.. math::
\phi(L) = -\frac{1}{4}(L\circ L,(I-\gamma C)(L\circ L)),
where :math:`0\leq\gamma\leq1`. The orthomax family is equivalent to
the oblimin family (when restricted to orthogonal rotations).
Furthermore,
* :math:`\gamma=0` corresponds to quartimax,
* :math:`\gamma=\frac{1}{2}` corresponds to biquartimax,
* :math:`\gamma=1` corresponds to varimax,
* :math:`\gamma=\frac{1}{p}` corresponds to equamax.
method_args:
gamma : float (between 0 and 1)
orthomax family parameter
CF : Crawford-Ferguson family for orthogonal and oblique rotation which
minimizes:
.. math::
\phi(L) =\frac{1-\kappa}{4} (L\circ L,(L\circ L)N)
-\frac{1}{4}(L\circ L,M(L\circ L)),
where :math:`0\leq\kappa\leq1`. For orthogonal rotations the oblimin
(and orthomax) family of rotations is equivalent to the
Crawford-Ferguson family.
To be more precise:
* :math:`\kappa=0` corresponds to quartimax,
* :math:`\kappa=\frac{1}{p}` corresponds to varimax,
* :math:`\kappa=\frac{k-1}{p+k-2}` corresponds to parsimax,
* :math:`\kappa=1` corresponds to factor parsimony.
method_args:
kappa : float (between 0 and 1)
Crawford-Ferguson family parameter
rotation_method : str
should be one of {orthogonal, oblique}
quartimax : orthogonal rotation method
minimizes the orthomax objective with :math:`\gamma=0`
biquartimax : orthogonal rotation method
minimizes the orthomax objective with :math:`\gamma=\frac{1}{2}`
varimax : orthogonal rotation method
minimizes the orthomax objective with :math:`\gamma=1`
equamax : orthogonal rotation method
minimizes the orthomax objective with :math:`\gamma=\frac{1}{p}`
parsimax : orthogonal rotation method
minimizes the Crawford-Ferguson family objective with
:math:`\kappa=\frac{k-1}{p+k-2}`
parsimony : orthogonal rotation method
minimizes the Crawford-Ferguson family objective with :math:`\kappa=1`
quartimin : oblique rotation method that
minimizes the oblimin objective with :math:`\gamma=0`
biquartimin : oblique rotation method that
minimizes the oblimin objective with :math:`\gamma=\frac{1}{2}`
target : orthogonal or oblique rotation that rotates towards a target
matrix :math:`H` by minimizing the objective
.. math::
\phi(L) =\frac{1}{2}\|L-H\|^2.
method_args:
H : numpy matrix
target matrix
rotation_method : str
should be one of {orthogonal, oblique}
For orthogonal rotations the algorithm can be set to analytic in which
case the following keyword arguments are available:
full_rank : bool (default False)
if set to true full rank is assumed
partial_target : orthogonal rotation that partially
rotates towards a target matrix :math:`H` by minimizing the objective:
.. math::
\phi(L) =\frac{1}{2}\|W\circ(L-H)\|^2.
method_args:
H : numpy matrix
target matrix
W : numpy matrix (default matrix with equal weight one for all entries)
matrix with weights, entries can either be one or zero
Examples
--------
>>> import numpy as np
>>> A = np.random.randn(8, 2)
>>> L, T = rotate_factors(A, 'varimax')
>>> np.allclose(L, A.dot(T))
True
>>> L, T = rotate_factors(A, 'orthomax', 0.5)
>>> np.allclose(L, A.dot(T))
True
>>> L, T = rotate_factors(A, 'quartimin')
>>> np.allclose(L, A.dot(np.linalg.inv(T.T)))
True
"""
if 'algorithm' in algorithm_kwargs:
algorithm = algorithm_kwargs['algorithm']
algorithm_kwargs.pop('algorithm')
else:
algorithm = 'gpa'
assert 'rotation_method' not in algorithm_kwargs, (
'rotation_method cannot be provided as keyword argument')
L = None
T = None
ff = None
vgQ = None
p, k = A.shape
# set ff or vgQ to appropriate objective function, compute solution using
# recursion or analytically compute solution
if method == 'orthomax':
assert len(method_args) == 1, ('Only %s family parameter should be '
'provided' % method)
rotation_method = 'orthogonal'
gamma = method_args[0]
if algorithm == 'gpa':
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=gamma, return_gradient=True)
elif algorithm == 'gpa_der_free':
ff = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=gamma, return_gradient=False)
else:
raise ValueError('Algorithm %s is not possible for %s '
'rotation' % (algorithm, method))
elif method == 'oblimin':
assert len(method_args) == 2, ('Both %s family parameter and '
'rotation_method should be '
'provided' % method)
rotation_method = method_args[1]
assert rotation_method in ['orthogonal', 'oblique'], (
'rotation_method should be one of {orthogonal, oblique}')
gamma = method_args[0]
if algorithm == 'gpa':
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=gamma, rotation_method=rotation_method,
return_gradient=True)
elif algorithm == 'gpa_der_free':
ff = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=gamma, rotation_method=rotation_method,
return_gradient=False)
else:
raise ValueError('Algorithm %s is not possible for %s '
'rotation' % (algorithm, method))
elif method == 'CF':
assert len(method_args) == 2, ('Both %s family parameter and '
'rotation_method should be provided'
% method)
rotation_method = method_args[1]
assert rotation_method in ['orthogonal', 'oblique'], (
'rotation_method should be one of {orthogonal, oblique}')
kappa = method_args[0]
if algorithm == 'gpa':
vgQ = lambda L=None, A=None, T=None: CF_objective(
L=L, A=A, T=T, kappa=kappa, rotation_method=rotation_method,
return_gradient=True)
elif algorithm == 'gpa_der_free':
ff = lambda L=None, A=None, T=None: CF_objective(
L=L, A=A, T=T, kappa=kappa, rotation_method=rotation_method,
return_gradient=False)
else:
raise ValueError('Algorithm %s is not possible for %s '
'rotation' % (algorithm, method))
elif method == 'quartimax':
return rotate_factors(A, 'orthomax', 0, **algorithm_kwargs)
elif method == 'biquartimax':
return rotate_factors(A, 'orthomax', 0.5, **algorithm_kwargs)
elif method == 'varimax':
return rotate_factors(A, 'orthomax', 1, **algorithm_kwargs)
elif method == 'equamax':
return rotate_factors(A, 'orthomax', 1/p, **algorithm_kwargs)
elif method == 'parsimax':
return rotate_factors(A, 'CF', (k-1)/(p+k-2),
'orthogonal', **algorithm_kwargs)
elif method == 'parsimony':
return rotate_factors(A, 'CF', 1, 'orthogonal', **algorithm_kwargs)
elif method == 'quartimin':
return rotate_factors(A, 'oblimin', 0, 'oblique', **algorithm_kwargs)
elif method == 'biquartimin':
return rotate_factors(A, 'oblimin', 0.5, 'oblique', **algorithm_kwargs)
elif method == 'target':
assert len(method_args) == 2, (
'only the rotation target and orthogonal/oblique should be provided'
' for %s rotation' % method)
H = method_args[0]
rotation_method = method_args[1]
assert rotation_method in ['orthogonal', 'oblique'], (
'rotation_method should be one of {orthogonal, oblique}')
if algorithm == 'gpa':
vgQ = lambda L=None, A=None, T=None: vgQ_target(
H, L=L, A=A, T=T, rotation_method=rotation_method)
elif algorithm == 'gpa_der_free':
ff = lambda L=None, A=None, T=None: ff_target(
H, L=L, A=A, T=T, rotation_method=rotation_method)
elif algorithm == 'analytic':
assert rotation_method == 'orthogonal', (
'For analytic %s rotation only orthogonal rotation is '
'supported' % method)
T = target_rotation(A, H, **algorithm_kwargs)
else:
raise ValueError('Algorithm %s is not possible for %s rotation'
% (algorithm, method))
elif method == 'partial_target':
assert len(method_args) == 2, ('2 additional arguments are expected '
'for %s rotation' % method)
H = method_args[0]
W = method_args[1]
rotation_method = 'orthogonal'
if algorithm == 'gpa':
vgQ = lambda L=None, A=None, T=None: vgQ_partial_target(
H, W=W, L=L, A=A, T=T)
elif algorithm == 'gpa_der_free':
ff = lambda L=None, A=None, T=None: ff_partial_target(
H, W=W, L=L, A=A, T=T)
else:
raise ValueError('Algorithm %s is not possible for %s '
'rotation' % (algorithm, method))
else:
raise ValueError('Invalid method')
# compute L and T if not already done
if T is None:
L, phi, T, table = GPA(A, vgQ=vgQ, ff=ff,
rotation_method=rotation_method,
**algorithm_kwargs)
if L is None:
assert T is not None, 'Cannot compute L without T'
L = rotateA(A, T, rotation_method=rotation_method)
return L, T
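A sketch exercising a few of the documented method/algorithm combinations (random A, arbitrary target H; all names as defined above):

import numpy as np
from statsmodels.multivariate.factor_rotation import rotate_factors

A = np.random.randn(8, 2)
H = np.abs(A)                                     # hypothetical target
L, T = rotate_factors(A, 'varimax', max_tries=1000, tol=1e-6)
L2, T2 = rotate_factors(A, 'CF', 0.5, 'oblique')  # Crawford-Ferguson
L3, T3 = rotate_factors(A, 'target', H, 'orthogonal',
                        algorithm='analytic')     # closed-form solution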


@@ -0,0 +1,584 @@
import unittest
import numpy as np
from statsmodels.multivariate.factor_rotation._wrappers import rotate_factors
from statsmodels.multivariate.factor_rotation._gpa_rotation import (
ff_partial_target, vgQ_partial_target, ff_target, vgQ_target, CF_objective,
orthomax_objective, oblimin_objective, GPA)
from statsmodels.multivariate.factor_rotation._analytic_rotation import (
target_rotation)
class TestAnalyticRotation(unittest.TestCase):
@staticmethod
def str2matrix(A):
A = A.lstrip().rstrip().split('\n')
A = np.array([row.split() for row in A]).astype(float)
return A
def test_target_rotation(self):
"""
Rotation towards target matrix example
http://www.stat.ucla.edu/research/gpa
"""
A = self.str2matrix("""
.830 -.396
.818 -.469
.777 -.470
.798 -.401
.786 .500
.672 .458
.594 .444
.647 .333
""")
H = self.str2matrix("""
.8 -.3
.8 -.4
.7 -.4
.9 -.4
.8 .5
.6 .4
.5 .4
.6 .3
""")
T = target_rotation(A, H)
L = A.dot(T)
L_required = self.str2matrix("""
0.84168 -0.37053
0.83191 -0.44386
0.79096 -0.44611
0.80985 -0.37650
0.77040 0.52371
0.65774 0.47826
0.58020 0.46189
0.63656 0.35255
""")
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
T = target_rotation(A, H, full_rank=True)
L = A.dot(T)
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
def test_orthogonal_target(self):
"""
Rotation towards target matrix example
http://www.stat.ucla.edu/research/gpa
"""
A = self.str2matrix("""
.830 -.396
.818 -.469
.777 -.470
.798 -.401
.786 .500
.672 .458
.594 .444
.647 .333
""")
H = self.str2matrix("""
.8 -.3
.8 -.4
.7 -.4
.9 -.4
.8 .5
.6 .4
.5 .4
.6 .3
""")
vgQ = lambda L=None, A=None, T=None: vgQ_target(H, L=L, A=A, T=T)
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
T_analytic = target_rotation(A, H)
self.assertTrue(np.allclose(T, T_analytic, atol=1e-05))
class TestGPARotation(unittest.TestCase):
@staticmethod
def str2matrix(A):
A = A.lstrip().rstrip().split('\n')
A = np.array([row.split() for row in A]).astype(float)
return A
@classmethod
def get_A(cls):
return cls.str2matrix("""
.830 -.396
.818 -.469
.777 -.470
.798 -.401
.786 .500
.672 .458
.594 .444
.647 .333
""")
@classmethod
def get_quartimin_example(cls):
A = cls.get_A()
table_required = cls.str2matrix("""
0.00000 0.42806 -0.46393 1.00000
1.00000 0.41311 -0.57313 0.25000
2.00000 0.38238 -0.36652 0.50000
3.00000 0.31850 -0.21011 0.50000
4.00000 0.20937 -0.13838 0.50000
5.00000 0.12379 -0.35583 0.25000
6.00000 0.04289 -0.53244 0.50000
7.00000 0.01098 -0.86649 0.50000
8.00000 0.00566 -1.65798 0.50000
9.00000 0.00558 -2.13212 0.25000
10.00000 0.00557 -2.49020 0.25000
11.00000 0.00557 -2.84585 0.25000
12.00000 0.00557 -3.20320 0.25000
13.00000 0.00557 -3.56143 0.25000
14.00000 0.00557 -3.92005 0.25000
15.00000 0.00557 -4.27885 0.25000
16.00000 0.00557 -4.63772 0.25000
17.00000 0.00557 -4.99663 0.25000
18.00000 0.00557 -5.35555 0.25000
""")
L_required = cls.str2matrix("""
0.891822 0.056015
0.953680 -0.023246
0.929150 -0.046503
0.876683 0.033658
0.013701 0.925000
-0.017265 0.821253
-0.052445 0.764953
0.085890 0.683115
""")
return A, table_required, L_required
@classmethod
def get_biquartimin_example(cls):
A = cls.get_A()
table_required = cls.str2matrix("""
0.00000 0.21632 -0.54955 1.00000
1.00000 0.19519 -0.46174 0.50000
2.00000 0.09479 -0.16365 1.00000
3.00000 -0.06302 -0.32096 0.50000
4.00000 -0.21304 -0.46562 1.00000
5.00000 -0.33199 -0.33287 1.00000
6.00000 -0.35108 -0.63990 0.12500
7.00000 -0.35543 -1.20916 0.12500
8.00000 -0.35568 -2.61213 0.12500
9.00000 -0.35568 -2.97910 0.06250
10.00000 -0.35568 -3.32645 0.06250
11.00000 -0.35568 -3.66021 0.06250
12.00000 -0.35568 -3.98564 0.06250
13.00000 -0.35568 -4.30635 0.06250
14.00000 -0.35568 -4.62451 0.06250
15.00000 -0.35568 -4.94133 0.06250
16.00000 -0.35568 -5.25745 0.06250
""")
L_required = cls.str2matrix("""
1.01753 -0.13657
1.11338 -0.24643
1.09200 -0.26890
1.00676 -0.16010
-0.26534 1.11371
-0.26972 0.99553
-0.29341 0.93561
-0.10806 0.80513
""")
return A, table_required, L_required
@classmethod
def get_biquartimin_example_derivative_free(cls):
A = cls.get_A()
table_required = cls.str2matrix("""
0.00000 0.21632 -0.54955 1.00000
1.00000 0.19519 -0.46174 0.50000
2.00000 0.09479 -0.16365 1.00000
3.00000 -0.06302 -0.32096 0.50000
4.00000 -0.21304 -0.46562 1.00000
5.00000 -0.33199 -0.33287 1.00000
6.00000 -0.35108 -0.63990 0.12500
7.00000 -0.35543 -1.20916 0.12500
8.00000 -0.35568 -2.61213 0.12500
9.00000 -0.35568 -2.97910 0.06250
10.00000 -0.35568 -3.32645 0.06250
11.00000 -0.35568 -3.66021 0.06250
12.00000 -0.35568 -3.98564 0.06250
13.00000 -0.35568 -4.30634 0.06250
14.00000 -0.35568 -4.62451 0.06250
15.00000 -0.35568 -4.94133 0.06250
16.00000 -0.35568 -6.32435 0.12500
""")
L_required = cls.str2matrix("""
1.01753 -0.13657
1.11338 -0.24643
1.09200 -0.26890
1.00676 -0.16010
-0.26534 1.11371
-0.26972 0.99553
-0.29342 0.93561
-0.10806 0.80513
""")
return A, table_required, L_required
@classmethod
def get_quartimax_example_derivative_free(cls):
A = cls.get_A()
table_required = cls.str2matrix("""
0.00000 -0.72073 -0.65498 1.00000
1.00000 -0.88561 -0.34614 2.00000
2.00000 -1.01992 -1.07152 1.00000
3.00000 -1.02237 -1.51373 0.50000
4.00000 -1.02269 -1.96205 0.50000
5.00000 -1.02273 -2.41116 0.50000
6.00000 -1.02273 -2.86037 0.50000
7.00000 -1.02273 -3.30959 0.50000
8.00000 -1.02273 -3.75881 0.50000
9.00000 -1.02273 -4.20804 0.50000
10.00000 -1.02273 -4.65726 0.50000
11.00000 -1.02273 -5.10648 0.50000
""")
L_required = cls.str2matrix("""
0.89876 0.19482
0.93394 0.12974
0.90213 0.10386
0.87651 0.17128
0.31558 0.87647
0.25113 0.77349
0.19801 0.71468
0.30786 0.65933
""")
return A, table_required, L_required
def test_orthomax(self):
"""
Quartimax example
http://www.stat.ucla.edu/research/gpa
"""
A = self.get_A()
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=0, return_gradient=True)
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
table_required = self.str2matrix("""
0.00000 -0.72073 -0.65498 1.00000
1.00000 -0.88561 -0.34614 2.00000
2.00000 -1.01992 -1.07152 1.00000
3.00000 -1.02237 -1.51373 0.50000
4.00000 -1.02269 -1.96205 0.50000
5.00000 -1.02273 -2.41116 0.50000
6.00000 -1.02273 -2.86037 0.50000
7.00000 -1.02273 -3.30959 0.50000
8.00000 -1.02273 -3.75881 0.50000
9.00000 -1.02273 -4.20804 0.50000
10.00000 -1.02273 -4.65726 0.50000
11.00000 -1.02273 -5.10648 0.50000
""")
L_required = self.str2matrix("""
0.89876 0.19482
0.93394 0.12974
0.90213 0.10386
0.87651 0.17128
0.31558 0.87647
0.25113 0.77349
0.19801 0.71468
0.30786 0.65933
""")
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
# oblimin criterion gives same result
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=0, rotation_method='orthogonal',
return_gradient=True)
L_oblimin, phi2, T2, table2 = GPA(A, vgQ=vgQ,
rotation_method='orthogonal')
self.assertTrue(np.allclose(L, L_oblimin, atol=1e-05))
# derivative free quartimax
out = self.get_quartimax_example_derivative_free()
A, table_required, L_required = out
ff = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=0, return_gradient=False)
L, phi, T, table = GPA(A, ff=ff, rotation_method='orthogonal')
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
def test_equivalence_orthomax_oblimin(self):
"""
These criteria should be equivalent when restricted to orthogonal
rotation.
See Harman (1976), page 299.
"""
A = self.get_A()
gamma = 0 # quartimax
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=gamma, return_gradient=True)
L_orthomax, phi, T, table = GPA(
A, vgQ=vgQ, rotation_method='orthogonal')
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=gamma, rotation_method='orthogonal',
return_gradient=True)
L_oblimin, phi2, T2, table2 = GPA(A, vgQ=vgQ,
rotation_method='orthogonal')
self.assertTrue(np.allclose(L_orthomax, L_oblimin, atol=1e-05))
gamma = 1 # varimax
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=gamma, return_gradient=True)
L_orthomax, phi, T, table = GPA(
A, vgQ=vgQ, rotation_method='orthogonal')
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=gamma, rotation_method='orthogonal',
return_gradient=True)
L_oblimin, phi2, T2, table2 = GPA(
A, vgQ=vgQ, rotation_method='orthogonal')
self.assertTrue(np.allclose(L_orthomax, L_oblimin, atol=1e-05))
def test_orthogonal_target(self):
"""
Rotation towards target matrix example
http://www.stat.ucla.edu/research/gpa
"""
A = self.get_A()
H = self.str2matrix("""
.8 -.3
.8 -.4
.7 -.4
.9 -.4
.8 .5
.6 .4
.5 .4
.6 .3
""")
vgQ = lambda L=None, A=None, T=None: vgQ_target(H, L=L, A=A, T=T)
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
table_required = self.str2matrix("""
0.00000 0.05925 -0.61244 1.00000
1.00000 0.05444 -1.14701 0.12500
2.00000 0.05403 -1.68194 0.12500
3.00000 0.05399 -2.21689 0.12500
4.00000 0.05399 -2.75185 0.12500
5.00000 0.05399 -3.28681 0.12500
6.00000 0.05399 -3.82176 0.12500
7.00000 0.05399 -4.35672 0.12500
8.00000 0.05399 -4.89168 0.12500
9.00000 0.05399 -5.42664 0.12500
""")
L_required = self.str2matrix("""
0.84168 -0.37053
0.83191 -0.44386
0.79096 -0.44611
0.80985 -0.37650
0.77040 0.52371
0.65774 0.47826
0.58020 0.46189
0.63656 0.35255
""")
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
ff = lambda L=None, A=None, T=None: ff_target(H, L=L, A=A, T=T)
L2, phi, T2, table = GPA(A, ff=ff, rotation_method='orthogonal')
self.assertTrue(np.allclose(L, L2, atol=1e-05))
self.assertTrue(np.allclose(T, T2, atol=1e-05))
vgQ = lambda L=None, A=None, T=None: vgQ_target(
H, L=L, A=A, T=T, rotation_method='oblique')
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
ff = lambda L=None, A=None, T=None: ff_target(
H, L=L, A=A, T=T, rotation_method='oblique')
L2, phi, T2, table = GPA(A, ff=ff, rotation_method='oblique')
self.assertTrue(np.allclose(L, L2, atol=1e-05))
self.assertTrue(np.allclose(T, T2, atol=1e-05))
def test_orthogonal_partial_target(self):
"""
Rotation towards target matrix example
http://www.stat.ucla.edu/research/gpa
"""
A = self.get_A()
H = self.str2matrix("""
.8 -.3
.8 -.4
.7 -.4
.9 -.4
.8 .5
.6 .4
.5 .4
.6 .3
""")
W = self.str2matrix("""
1 0
0 1
0 0
1 1
1 0
1 0
0 1
1 0
""")
vgQ = lambda L=None, A=None, T=None: vgQ_partial_target(
H, W, L=L, A=A, T=T)
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
table_required = self.str2matrix("""
0.00000 0.02559 -0.84194 1.00000
1.00000 0.02203 -1.27116 0.25000
2.00000 0.02154 -1.71198 0.25000
3.00000 0.02148 -2.15713 0.25000
4.00000 0.02147 -2.60385 0.25000
5.00000 0.02147 -3.05114 0.25000
6.00000 0.02147 -3.49863 0.25000
7.00000 0.02147 -3.94619 0.25000
8.00000 0.02147 -4.39377 0.25000
9.00000 0.02147 -4.84137 0.25000
10.00000 0.02147 -5.28897 0.25000
""")
L_required = self.str2matrix("""
0.84526 -0.36228
0.83621 -0.43571
0.79528 -0.43836
0.81349 -0.36857
0.76525 0.53122
0.65303 0.48467
0.57565 0.46754
0.63308 0.35876
""")
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
ff = lambda L=None, A=None, T=None: ff_partial_target(
H, W, L=L, A=A, T=T)
L2, phi, T2, table = GPA(A, ff=ff, rotation_method='orthogonal')
self.assertTrue(np.allclose(L, L2, atol=1e-05))
self.assertTrue(np.allclose(T, T2, atol=1e-05))
def test_oblimin(self):
# quartimin
A, table_required, L_required = self.get_quartimin_example()
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=0, rotation_method='oblique')
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
# quartimin derivative free
ff = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=0, rotation_method='oblique',
return_gradient=False)
L, phi, T, table = GPA(A, ff=ff, rotation_method='oblique')
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
# biquartimin
A, table_required, L_required = self.get_biquartimin_example()
vgQ = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=1/2, rotation_method='oblique')
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='oblique')
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
# biquartimin derivative free
out = self.get_biquartimin_example_derivative_free()
A, table_required, L_required = out
ff = lambda L=None, A=None, T=None: oblimin_objective(
L=L, A=A, T=T, gamma=1/2, rotation_method='oblique',
return_gradient=False)
L, phi, T, table = GPA(A, ff=ff, rotation_method='oblique')
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
self.assertTrue(np.allclose(table, table_required, atol=1e-05))
def test_CF(self):
# quartimax
out = self.get_quartimax_example_derivative_free()
A, table_required, L_required = out
vgQ = lambda L=None, A=None, T=None: CF_objective(
L=L, A=A, T=T, kappa=0, rotation_method='orthogonal',
return_gradient=True)
L, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
# quartimax derivative free
ff = lambda L=None, A=None, T=None: CF_objective(
L=L, A=A, T=T, kappa=0, rotation_method='orthogonal',
return_gradient=False)
L, phi, T, table = GPA(A, ff=ff, rotation_method='orthogonal')
self.assertTrue(np.allclose(L, L_required, atol=1e-05))
# varimax
p, k = A.shape
vgQ = lambda L=None, A=None, T=None: orthomax_objective(
L=L, A=A, T=T, gamma=1, return_gradient=True)
L_vm, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
vgQ = lambda L=None, A=None, T=None: CF_objective(
L=L, A=A, T=T, kappa=1/p, rotation_method='orthogonal',
return_gradient=True)
L_CF, phi, T, table = GPA(A, vgQ=vgQ, rotation_method='orthogonal')
ff = lambda L=None, A=None, T=None: CF_objective(
L=L, A=A, T=T, kappa=1/p, rotation_method='orthogonal',
return_gradient=False)
L_CF_df, phi, T, table = GPA(A, ff=ff, rotation_method='orthogonal')
self.assertTrue(np.allclose(L_vm, L_CF, atol=1e-05))
self.assertTrue(np.allclose(L_CF, L_CF_df, atol=1e-05))
class TestWrappers(unittest.TestCase):
@staticmethod
def str2matrix(A):
A = A.lstrip().rstrip().split('\n')
A = np.array([row.split() for row in A]).astype(float)
return A
def get_A(self):
return self.str2matrix("""
.830 -.396
.818 -.469
.777 -.470
.798 -.401
.786 .500
.672 .458
.594 .444
.647 .333
""")
def get_H(self):
return self.str2matrix("""
.8 -.3
.8 -.4
.7 -.4
.9 -.4
.8 .5
.6 .4
.5 .4
.6 .3
""")
def get_W(self):
return self.str2matrix("""
1 0
0 1
0 0
1 1
1 0
1 0
0 1
1 0
""")
def _test_template(self, method, *method_args, **algorithms):
A = self.get_A()
algorithm1 = 'gpa' if 'algorithm1' not in algorithms else algorithms[
'algorithm1']
if 'algorithm2' not in algorithms:
algorithm2 = 'gpa_der_free'
else:
algorithm2 = algorithms['algorithm2']
L1, T1 = rotate_factors(A, method, *method_args, algorithm=algorithm1)
L2, T2 = rotate_factors(A, method, *method_args, algorithm=algorithm2)
self.assertTrue(np.allclose(L1, L2, atol=1e-5))
self.assertTrue(np.allclose(T1, T2, atol=1e-5))
def test_methods(self):
"""
Check that the available algorithms agree for each method
http://www.stat.ucla.edu/research/gpa
"""
# orthomax, oblimin and CF are tested indirectly
methods = ['quartimin', 'biquartimin',
'quartimax', 'biquartimax', 'varimax', 'equamax',
'parsimax', 'parsimony',
'target', 'partial_target']
for method in methods:
method_args = []
if method == 'target':
method_args = [self.get_H(), 'orthogonal']
self._test_template(method, *method_args)
method_args = [self.get_H(), 'oblique']
self._test_template(method, *method_args)
method_args = [self.get_H(), 'orthogonal']
self._test_template(method, *method_args,
algorithm2='analytic')
elif method == 'partial_target':
method_args = [self.get_H(), self.get_W()]
self._test_template(method, *method_args)


@@ -0,0 +1,127 @@
"""Multivariate analysis of variance
author: Yichuan Liu
"""
import numpy as np
from statsmodels.compat.pandas import Substitution
from statsmodels.base.model import Model
from .multivariate_ols import MultivariateTestResults
from .multivariate_ols import _multivariate_ols_fit
from .multivariate_ols import _multivariate_ols_test, _hypotheses_doc
__docformat__ = 'restructuredtext en'
class MANOVA(Model):
"""
Multivariate Analysis of Variance
The implementation of MANOVA is based on multivariate regression and does
not assume that the explanatory variables are categorical. Any type of
variable that is allowed in a regression is allowed.
Parameters
----------
endog : array_like
Dependent variables. A nobs x k_endog array where nobs is
the number of observations and k_endog is the number of dependent
variables.
exog : array_like
Independent variables. A nobs x k_exog array where nobs is the
number of observations and k_exog is the number of independent
variables. An intercept is not included by default and should be added
by the user. Models specified using a formula include an intercept by
default.
Attributes
----------
endog : ndarray
See Parameters.
exog : ndarray
See Parameters.
Notes
-----
MANOVA is used through the `mv_test` function, and `fit` is not used.
The ``from_formula`` interface is the recommended method to specify
a model and simplifies testing without needing to manually configure
the contrast matrices.
References
----------
.. [*] ftp://public.dhe.ibm.com/software/analytics/spss/documentation/
statistics/20.0/en/client/Manuals/IBM_SPSS_Statistics_Algorithms.pdf
"""
_formula_max_endog = None
def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs):
if len(endog.shape) == 1 or endog.shape[1] == 1:
raise ValueError('There must be more than one dependent variable'
' to fit MANOVA!')
super().__init__(endog, exog, missing=missing,
hasconst=hasconst, **kwargs)
self._fittedmod = _multivariate_ols_fit(self.endog, self.exog)
def fit(self):
raise NotImplementedError('fit is not needed to use MANOVA. Call '
'mv_test directly on a MANOVA instance.')
@Substitution(hypotheses_doc=_hypotheses_doc)
def mv_test(self, hypotheses=None, skip_intercept_test=False):
"""
Linear hypotheses testing
Parameters
----------
%(hypotheses_doc)s
skip_intercept_test : bool
If True, testing the intercept is skipped and the model is not
changed.
Note: If a term has a numerically insignificant effect, then
an exception may be raised because of empty arrays. This can
happen for the intercept if the data has been demeaned.
Returns
-------
results: MultivariateTestResults
Notes
-----
Testing the linear hypotheses
L * params * M = 0
where `params` is the regression coefficient matrix for the
linear model y = x * params
If the model is not specified using the formula interface, then the
hypotheses test each included exogenous variable, one at a time. In
most applications with categorical variables, the ``from_formula``
interface should be preferred when specifying a model since it
provides knowledge about the model when specifying the hypotheses.
"""
if hypotheses is None:
if (hasattr(self, 'data') and self.data is not None and
hasattr(self.data, 'design_info')):
terms = self.data.design_info.term_name_slices
hypotheses = []
for key in terms:
if skip_intercept_test and key == 'Intercept':
continue
L_contrast = np.eye(self.exog.shape[1])[terms[key], :]
hypotheses.append([key, L_contrast, None])
else:
hypotheses = []
for i in range(self.exog.shape[1]):
name = 'x%d' % (i)
L = np.zeros([1, self.exog.shape[1]])
L[0, i] = 1
hypotheses.append([name, L, None])
results = _multivariate_ols_test(hypotheses, self._fittedmod,
self.exog_names, self.endog_names)
return MultivariateTestResults(results, self.endog_names,
self.exog_names)
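A usage sketch (hypothetical column names). As noted above, the formula interface is recommended because it builds the hypotheses from the design info:

import numpy as np
import pandas as pd
from statsmodels.multivariate.manova import MANOVA

rng = np.random.default_rng(0)
df = pd.DataFrame({'y1': rng.standard_normal(30),
                   'y2': rng.standard_normal(30),
                   'group': np.repeat(['a', 'b', 'c'], 10)})
mod = MANOVA.from_formula('y1 + y2 ~ group', data=df)
print(mod.mv_test())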


@@ -0,0 +1,590 @@
"""General linear model
author: Yichuan Liu
"""
import numpy as np
from numpy.linalg import eigvals, inv, solve, matrix_rank, pinv, svd
from scipy import stats
import pandas as pd
from patsy import DesignInfo
from statsmodels.compat.pandas import Substitution
from statsmodels.base.model import Model
from statsmodels.iolib import summary2
__docformat__ = 'restructuredtext en'
_hypotheses_doc = \
"""hypotheses : list[tuple]
Hypothesis `L*B*M = C` to be tested where B is the parameters in
regression Y = X*B. Each element is a tuple of length 2, 3, or 4:
* (name, contrast_L)
* (name, contrast_L, transform_M)
* (name, contrast_L, transform_M, constant_C)
containing a string `name`, the contrast matrix L, the transform
matrix M (for transforming dependent variables), and right-hand side
constant matrix constant_C, respectively.
    contrast_L : 2D array or an array of strings
        Left-hand side contrast matrix for hypotheses testing.
        If a 2D array, each row is a hypothesis and each column is an
        independent variable. At least 1 row
        (1 by k_exog, the number of independent variables) is required.
If an array of strings, it will be passed to
patsy.DesignInfo().linear_constraint.
    transform_M : 2D array or an array of strings or None, optional
        Transform matrix for the dependent variables (the matrix M in
        `L*B*M`).
If `None` or left out, it is set to a k_endog by k_endog
identity matrix (i.e. do not transform y matrix).
If an array of strings, it will be passed to
patsy.DesignInfo().linear_constraint.
    constant_C : 2D array or None, optional
        Right-hand side constant matrix.
        If `None` or left out, it is set to a matrix of zeros.
        Must have the same number of rows as contrast_L and the same
        number of columns as transform_M.
    If `hypotheses` is None: 1) the effect of each independent variable
    on the dependent variables will be tested. Or 2) if the model is
    created using a formula, `hypotheses` will be created according to
    `design_info`. 1) and 2) are equivalent if no additional variables
    are created by the formula (e.g. dummy variables for categorical
    variables and interaction terms).
"""
def _multivariate_ols_fit(endog, exog, method='svd', tolerance=1e-8):
"""
Solve multivariate linear model y = x * params
where y is dependent variables, x is independent variables
Parameters
----------
endog : array_like
each column is a dependent variable
exog : array_like
        each column is an independent variable
method : str
'svd' - Singular value decomposition
'pinv' - Moore-Penrose pseudoinverse
    tolerance : float, a small positive number
        Tolerance for eigenvalues. Values smaller than tolerance are
        considered zero.
    Returns
    -------
    a tuple of matrices or values necessary for hypotheses testing

    References
    ----------
    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
Notes
-----
Status: experimental and incomplete
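
    Examples
    --------
    A minimal sketch of this private helper (subject to change):

    >>> import numpy as np
    >>> y = np.random.randn(50, 3)
    >>> x = np.column_stack([np.ones(50), np.random.randn(50, 2)])
    >>> params, df_resid, inv_cov, sscpr = _multivariate_ols_fit(y, x)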
"""
y = endog
x = exog
nobs, k_endog = y.shape
    nobs1, k_exog = x.shape
if nobs != nobs1:
raise ValueError('x(n=%d) and y(n=%d) should have the same number of '
'rows!' % (nobs1, nobs))
# Calculate the matrices necessary for hypotheses testing
df_resid = nobs - k_exog
if method == 'pinv':
# Regression coefficients matrix
pinv_x = pinv(x)
params = pinv_x.dot(y)
# inverse of x'x
inv_cov = pinv_x.dot(pinv_x.T)
        if matrix_rank(inv_cov, tol=tolerance) < k_exog:
raise ValueError('Covariance of x singular!')
        # Sums of squares and cross-products of residuals
        # Y'Y - (X * params)'(X * params)
t = x.dot(params)
sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
return (params, df_resid, inv_cov, sscpr)
elif method == 'svd':
u, s, v = svd(x, 0)
if (s > tolerance).sum() < len(s):
raise ValueError('Covariance of x singular!')
invs = 1. / s
params = v.T.dot(np.diag(invs)).dot(u.T).dot(y)
inv_cov = v.T.dot(np.diag(np.power(invs, 2))).dot(v)
t = np.diag(s).dot(v).dot(params)
sscpr = np.subtract(y.T.dot(y), t.T.dot(t))
return (params, df_resid, inv_cov, sscpr)
else:
raise ValueError('%s is not a supported method!' % method)
def multivariate_stats(eigenvals,
r_err_sscp,
r_contrast, df_resid, tolerance=1e-8):
"""
For multivariate linear model Y = X * B
Testing hypotheses
L*B*M = 0
where L is contrast matrix, B is the parameters of the
multivariate linear model and M is dependent variable transform matrix.
T = L*inv(X'X)*L'
H = M'B'L'*inv(T)*LBM
E = M'(Y'Y - B'X'XB)M
Parameters
----------
eigenvals : ndarray
The eigenvalues of inv(E + H)*H
r_err_sscp : int
Rank of E + H
r_contrast : int
Rank of T matrix
    df_resid : int
        Residual degrees of freedom (n_samples minus n_variables of X)
    tolerance : float
        eigenvalues smaller than this are considered zero
Returns
-------
A DataFrame
References
----------
.. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
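
    Examples
    --------
    A minimal sketch with two nonzero eigenvalues of inv(E + H)*H:

    >>> import numpy as np
    >>> eigv = np.array([0.5, 0.2])
    >>> table = multivariate_stats(eigv, r_err_sscp=2, r_contrast=2,
    ...                            df_resid=20)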
"""
v = df_resid
p = r_err_sscp
q = r_contrast
s = np.min([p, q])
ind = eigenvals > tolerance
n_e = ind.sum()
eigv2 = eigenvals[ind]
eigv1 = np.array([i / (1 - i) for i in eigv2])
m = (np.abs(p - q) - 1) / 2
n = (v - p - 1) / 2
cols = ['Value', 'Num DF', 'Den DF', 'F Value', 'Pr > F']
index = ["Wilks' lambda", "Pillai's trace",
"Hotelling-Lawley trace", "Roy's greatest root"]
results = pd.DataFrame(columns=cols,
index=index)
def fn(x):
return np.real([x])[0]
results.loc["Wilks' lambda", 'Value'] = fn(np.prod(1 - eigv2))
results.loc["Pillai's trace", 'Value'] = fn(eigv2.sum())
results.loc["Hotelling-Lawley trace", 'Value'] = fn(eigv1.sum())
results.loc["Roy's greatest root", 'Value'] = fn(eigv1.max())
r = v - (p - q + 1)/2
u = (p*q - 2) / 4
df1 = p * q
if p*p + q*q - 5 > 0:
t = np.sqrt((p*p*q*q - 4) / (p*p + q*q - 5))
else:
t = 1
df2 = r*t - 2*u
lmd = results.loc["Wilks' lambda", 'Value']
lmd = np.power(lmd, 1 / t)
F = (1 - lmd) / lmd * df2 / df1
results.loc["Wilks' lambda", 'Num DF'] = df1
results.loc["Wilks' lambda", 'Den DF'] = df2
results.loc["Wilks' lambda", 'F Value'] = F
pval = stats.f.sf(F, df1, df2)
results.loc["Wilks' lambda", 'Pr > F'] = pval
V = results.loc["Pillai's trace", 'Value']
df1 = s * (2*m + s + 1)
df2 = s * (2*n + s + 1)
F = df2 / df1 * V / (s - V)
results.loc["Pillai's trace", 'Num DF'] = df1
results.loc["Pillai's trace", 'Den DF'] = df2
results.loc["Pillai's trace", 'F Value'] = F
pval = stats.f.sf(F, df1, df2)
results.loc["Pillai's trace", 'Pr > F'] = pval
U = results.loc["Hotelling-Lawley trace", 'Value']
if n > 0:
b = (p + 2*n) * (q + 2*n) / 2 / (2*n + 1) / (n - 1)
df1 = p * q
df2 = 4 + (p*q + 2) / (b - 1)
c = (df2 - 2) / 2 / n
F = df2 / df1 * U / c
else:
df1 = s * (2*m + s + 1)
df2 = s * (s*n + 1)
F = df2 / df1 / s * U
results.loc["Hotelling-Lawley trace", 'Num DF'] = df1
results.loc["Hotelling-Lawley trace", 'Den DF'] = df2
results.loc["Hotelling-Lawley trace", 'F Value'] = F
pval = stats.f.sf(F, df1, df2)
results.loc["Hotelling-Lawley trace", 'Pr > F'] = pval
sigma = results.loc["Roy's greatest root", 'Value']
r = np.max([p, q])
df1 = r
df2 = v - r + q
F = df2 / df1 * sigma
results.loc["Roy's greatest root", 'Num DF'] = df1
results.loc["Roy's greatest root", 'Den DF'] = df2
results.loc["Roy's greatest root", 'F Value'] = F
pval = stats.f.sf(F, df1, df2)
results.loc["Roy's greatest root", 'Pr > F'] = pval
return results
def _multivariate_ols_test(hypotheses, fit_results, exog_names,
endog_names):
def fn(L, M, C):
# .. [1] https://support.sas.com/documentation/cdl/en/statug/63033
# /HTML/default/viewer.htm#statug_introreg_sect012.htm
params, df_resid, inv_cov, sscpr = fit_results
        # t1 = L * params * M - C
        t1 = L.dot(params).dot(M) - C
        # T = L * inv(X'X) * L'
        t2 = L.dot(inv_cov).dot(L.T)
        q = matrix_rank(t2)
        # H = t1' * inv(T) * t1
        H = t1.T.dot(inv(t2)).dot(t1)
# E = M'(Y'Y - B'(X'X)B)M
E = M.T.dot(sscpr).dot(M)
return E, H, q, df_resid
return _multivariate_test(hypotheses, exog_names, endog_names, fn)
@Substitution(hypotheses_doc=_hypotheses_doc)
def _multivariate_test(hypotheses, exog_names, endog_names, fn):
"""
Multivariate linear model hypotheses testing
For y = x * params, where y are the dependent variables and x are the
independent variables, testing L * params * M = 0 where L is the contrast
matrix for hypotheses testing and M is the transformation matrix for
transforming the dependent variables in y.
Algorithm:
T = L*inv(X'X)*L'
H = M'B'L'*inv(T)*LBM
E = M'(Y'Y - B'X'XB)M
where H and E correspond to the numerator and denominator of a univariate
F-test. Then find the eigenvalues of inv(H + E)*H from which the
multivariate test statistics are calculated.
    References
    ----------
    .. [*] https://support.sas.com/documentation/cdl/en/statug/63033/HTML
           /default/viewer.htm#statug_introreg_sect012.htm
Parameters
----------
%(hypotheses_doc)s
    exog_names : sequence[str]
        Names of the independent variables
    endog_names : sequence[str]
        Names of the dependent variables
    fn : function
        a function fn(contrast_L, transform_M, constant_C) that returns
        E, H, q, df_resid where q is the rank of the T matrix
    Returns
    -------
    results : dict
        a dictionary mapping each hypothesis name to its test results
"""
k_xvar = len(exog_names)
k_yvar = len(endog_names)
results = {}
for hypo in hypotheses:
        if len(hypo) == 2:
name, L = hypo
M = None
C = None
elif len(hypo) == 3:
name, L, M = hypo
C = None
elif len(hypo) == 4:
name, L, M, C = hypo
else:
            raise ValueError('Each hypothesis must be a tuple of length 2, '
                             '3 or 4. len(hypothesis)=%d' % len(hypo))
if any(isinstance(j, str) for j in L):
L = DesignInfo(exog_names).linear_constraint(L).coefs
else:
if not isinstance(L, np.ndarray) or len(L.shape) != 2:
raise ValueError('Contrast matrix L must be a 2-d array!')
if L.shape[1] != k_xvar:
raise ValueError('Contrast matrix L should have the same '
'number of columns as exog! %d != %d' %
(L.shape[1], k_xvar))
if M is None:
M = np.eye(k_yvar)
elif any(isinstance(j, str) for j in M):
M = DesignInfo(endog_names).linear_constraint(M).coefs.T
        else:
            if not isinstance(M, np.ndarray) or len(M.shape) != 2:
                raise ValueError('Transform matrix M must be a 2-d array!')
            if M.shape[0] != k_yvar:
                raise ValueError('Transform matrix M should have the same '
                                 'number of rows as the number of columns '
                                 'of endog! %d != %d' %
                                 (M.shape[0], k_yvar))
if C is None:
C = np.zeros([L.shape[0], M.shape[1]])
elif not isinstance(C, np.ndarray):
raise ValueError('Constant matrix C must be a 2-d array!')
if C.shape[0] != L.shape[0]:
raise ValueError('contrast L and constant C must have the same '
'number of rows! %d!=%d'
% (L.shape[0], C.shape[0]))
if C.shape[1] != M.shape[1]:
raise ValueError('transform M and constant C must have the same '
'number of columns! %d!=%d'
% (M.shape[1], C.shape[1]))
E, H, q, df_resid = fn(L, M, C)
EH = np.add(E, H)
p = matrix_rank(EH)
# eigenvalues of inv(E + H)H
eigv2 = np.sort(eigvals(solve(EH, H)))
stat_table = multivariate_stats(eigv2, p, q, df_resid)
results[name] = {'stat': stat_table, 'contrast_L': L,
'transform_M': M, 'constant_C': C,
'E': E, 'H': H}
return results
class _MultivariateOLS(Model):
"""
Multivariate linear model via least squares
Parameters
----------
endog : array_like
Dependent variables. A nobs x k_endog array where nobs is
the number of observations and k_endog is the number of dependent
variables
exog : array_like
Independent variables. A nobs x k_exog array where nobs is the
number of observations and k_exog is the number of independent
variables. An intercept is not included by default and should be added
by the user (models specified using a formula include an intercept by
default)
Attributes
----------
endog : ndarray
See Parameters.
exog : ndarray
See Parameters.
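
    Examples
    --------
    A minimal sketch of this private estimator:

    >>> import numpy as np
    >>> y = np.random.randn(60, 2)
    >>> x = np.column_stack([np.ones(60), np.random.randn(60)])
    >>> res = _MultivariateOLS(y, x).fit()
    >>> table = res.mv_test()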
"""
_formula_max_endog = None
def __init__(self, endog, exog, missing='none', hasconst=None, **kwargs):
if len(endog.shape) == 1 or endog.shape[1] == 1:
raise ValueError('There must be more than one dependent variable'
' to fit multivariate OLS!')
super().__init__(endog, exog, missing=missing,
hasconst=hasconst, **kwargs)
def fit(self, method='svd'):
self._fittedmod = _multivariate_ols_fit(
self.endog, self.exog, method=method)
return _MultivariateOLSResults(self)
class _MultivariateOLSResults:
"""
_MultivariateOLS results class
"""
def __init__(self, fitted_mv_ols):
if (hasattr(fitted_mv_ols, 'data') and
hasattr(fitted_mv_ols.data, 'design_info')):
self.design_info = fitted_mv_ols.data.design_info
else:
self.design_info = None
self.exog_names = fitted_mv_ols.exog_names
self.endog_names = fitted_mv_ols.endog_names
self._fittedmod = fitted_mv_ols._fittedmod
def __str__(self):
return self.summary().__str__()
@Substitution(hypotheses_doc=_hypotheses_doc)
def mv_test(self, hypotheses=None, skip_intercept_test=False):
"""
Linear hypotheses testing
Parameters
----------
%(hypotheses_doc)s
        skip_intercept_test : bool
            If True, then testing the intercept is skipped and the model is
            not changed.
            Note: If a term has a numerically insignificant effect, then
            an exception may be raised because of empty arrays. This can
            happen for the intercept if the data has been demeaned.
        Returns
        -------
        results: MultivariateTestResults
Notes
-----
Tests hypotheses of the form
L * params * M = C
where `params` is the regression coefficient matrix for the
linear model y = x * params, `L` is the contrast matrix, `M` is the
dependent variable transform matrix and C is the constant matrix.
"""
k_xvar = len(self.exog_names)
if hypotheses is None:
if self.design_info is not None:
terms = self.design_info.term_name_slices
hypotheses = []
for key in terms:
if skip_intercept_test and key == 'Intercept':
continue
L_contrast = np.eye(k_xvar)[terms[key], :]
hypotheses.append([key, L_contrast, None])
else:
hypotheses = []
for i in range(k_xvar):
                name = 'x%d' % (i)
                L = np.zeros([1, k_xvar])
                L[0, i] = 1
hypotheses.append([name, L, None])
results = _multivariate_ols_test(hypotheses, self._fittedmod,
self.exog_names, self.endog_names)
return MultivariateTestResults(results,
self.endog_names,
self.exog_names)
def summary(self):
raise NotImplementedError
class MultivariateTestResults:
"""
Multivariate test results class
Returned by `mv_test` method of `_MultivariateOLSResults` class
Parameters
----------
results : dict[str, dict]
Dictionary containing test results. See the description
below for the expected format.
    endog_names : sequence[str]
        A list or other sequence of endogenous variable names
    exog_names : sequence[str]
        A list or other sequence of exogenous variable names
Attributes
----------
results : dict
        Each hypothesis is contained in a single `key`. Each test must
        have the following keys:
* 'stat' - contains the multivariate test results
* 'contrast_L' - contains the contrast_L matrix
* 'transform_M' - contains the transform_M matrix
* 'constant_C' - contains the constant_C matrix
* 'H' - contains an intermediate Hypothesis matrix,
or the between groups sums of squares and cross-products matrix,
corresponding to the numerator of the univariate F test.
* 'E' - contains an intermediate Error matrix,
corresponding to the denominator of the univariate F test.
The Hypotheses and Error matrices can be used to calculate
the same test statistics in 'stat', as well as to calculate
the discriminant function (canonical correlates) from the
eigenvectors of inv(E)H.
endog_names : list[str]
The endogenous names
exog_names : list[str]
The exogenous names
summary_frame : DataFrame
Returns results as a MultiIndex DataFrame
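
    Examples
    --------
    A hedged access sketch, assuming ``res`` was returned by ``mv_test``
    with the default hypotheses::

        res['x0']['stat']   # DataFrame with the four multivariate tests
        res.summary_frame   # all effects stacked in a MultiIndex DataFrame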
"""
def __init__(self, results, endog_names, exog_names):
self.results = results
self.endog_names = list(endog_names)
self.exog_names = list(exog_names)
def __str__(self):
return self.summary().__str__()
def __getitem__(self, item):
return self.results[item]
@property
def summary_frame(self):
"""
Return results as a multiindex dataframe
"""
df = []
for key in self.results:
tmp = self.results[key]['stat'].copy()
tmp.loc[:, 'Effect'] = key
df.append(tmp.reset_index())
df = pd.concat(df, axis=0)
df = df.set_index(['Effect', 'index'])
df.index.set_names(['Effect', 'Statistic'], inplace=True)
return df
def summary(self, show_contrast_L=False, show_transform_M=False,
show_constant_C=False):
"""
Summary of test results
Parameters
----------
show_contrast_L : bool
Whether to show contrast_L matrix
show_transform_M : bool
Whether to show transform_M matrix
show_constant_C : bool
Whether to show the constant_C
"""
summ = summary2.Summary()
summ.add_title('Multivariate linear model')
for key in self.results:
summ.add_dict({'': ''})
df = self.results[key]['stat'].copy()
df = df.reset_index()
c = list(df.columns)
c[0] = key
df.columns = c
df.index = ['', '', '', '']
summ.add_df(df)
if show_contrast_L:
summ.add_dict({key: ' contrast L='})
df = pd.DataFrame(self.results[key]['contrast_L'],
columns=self.exog_names)
summ.add_df(df)
if show_transform_M:
summ.add_dict({key: ' transform M='})
df = pd.DataFrame(self.results[key]['transform_M'],
index=self.endog_names)
summ.add_df(df)
if show_constant_C:
summ.add_dict({key: ' constant C='})
df = pd.DataFrame(self.results[key]['constant_C'])
summ.add_df(df)
return summ

View File

@ -0,0 +1,873 @@
"""Principal Component Analysis
Author: josef-pktd
Modified by Kevin Sheppard
"""
import numpy as np
import pandas as pd
from statsmodels.tools.sm_exceptions import (ValueWarning,
EstimationWarning)
from statsmodels.tools.validation import (string_like,
array_like,
bool_like,
float_like,
int_like,
)
def _norm(x):
return np.sqrt(np.sum(x * x))
class PCA:
"""
Principal Component Analysis
Parameters
----------
data : array_like
Variables in columns, observations in rows.
    ncomp : int, optional
        Number of components to return. If None, returns as many as the
        smaller of the number of rows or columns in data.
    standardize : bool, optional
        Flag indicating to use standardized data with mean 0 and unit
        variance. standardize being True implies demean. Using standardized
        data is equivalent to computing principal components from the
        correlation matrix of data.
demean : bool, optional
Flag indicating whether to demean data before computing principal
components. demean is ignored if standardize is True. Demeaning data
but not standardizing is equivalent to computing principal components
from the covariance matrix of data.
normalize : bool , optional
Indicates whether to normalize the factors to have unit inner product.
If False, the loadings will have unit inner product.
    gls : bool, optional
        Flag indicating to implement a two-step GLS estimator where
        in the first step principal components are used to estimate residuals,
        and then the inverse residual variance is used as a set of weights to
        estimate the final principal components. Setting gls to True requires
        ncomp to be less than the min of the number of rows or columns.
weights : ndarray, optional
Series weights to use after transforming data according to standardize
or demean when computing the principal components.
method : str, optional
Sets the linear algebra routine used to compute eigenvectors:
* 'svd' uses a singular value decomposition (default).
* 'eig' uses an eigenvalue decomposition of a quadratic form
* 'nipals' uses the NIPALS algorithm and can be faster than SVD when
ncomp is small and nvars is large. See notes about additional changes
when using NIPALS.
missing : {str, None}
Method for missing data. Choices are:
* 'drop-row' - drop rows with missing values.
* 'drop-col' - drop columns with missing values.
* 'drop-min' - drop either rows or columns, choosing by data retention.
        * 'fill-em' - use EM algorithm to fill missing values. ncomp should
          be set to the number of factors required.
* `None` raises if data contains NaN values.
tol : float, optional
Tolerance to use when checking for convergence when using NIPALS.
max_iter : int, optional
Maximum iterations when using NIPALS.
tol_em : float
Tolerance to use when checking for convergence of the EM algorithm.
max_em_iter : int
Maximum iterations for the EM algorithm.
    svd_full_matrices : bool, optional
        If the 'svd' method is selected, this flag is used to set the
        parameter 'full_matrices' in the singular value decomposition
        method. It is False by default.
Attributes
----------
factors : array or DataFrame
nobs by ncomp array of principal components (scores)
scores : array or DataFrame
nobs by ncomp array of principal components - identical to factors
loadings : array or DataFrame
ncomp by nvar array of principal component loadings for constructing
the factors
coeff : array or DataFrame
nvar by ncomp array of principal component loadings for constructing
the projections
projection : array or DataFrame
nobs by var array containing the projection of the data onto the ncomp
estimated factors
    rsquare : array or Series
        ncomp array where the element in the ith position is the R-square
        of including the first i principal components. Note: values are
        calculated on the transformed data, not the original data
    ic : array or DataFrame
        ncomp by 3 array containing the Bai and Ng (2002) Information
        criteria. Each column is a different criterion, and each row
        represents the number of included factors.
eigenvals : array or Series
nvar array of eigenvalues
eigenvecs : array or DataFrame
nvar by nvar array of eigenvectors
weights : ndarray
nvar array of weights used to compute the principal components,
normalized to unit length
transformed_data : ndarray
Standardized, demeaned and weighted data used to compute
principal components and related quantities
cols : ndarray
Array of indices indicating columns used in the PCA
rows : ndarray
Array of indices indicating rows used in the PCA
Notes
-----
    The default options perform principal component analysis on the
    demeaned, unit variance version of data. Setting standardize to False
    will instead only demean, and setting both standardize and
    demean to False will not alter the data.

    Once the data have been transformed, the following relationships hold
    when the number of components (ncomp) is the same as the minimum of
    the number of observations or the number of variables.
    .. math:: X' X = V \\Lambda V'

    .. math:: F = X V

    .. math:: X = F V'
where X is the `data`, F is the array of principal components (`factors`
or `scores`), and V is the array of eigenvectors (`loadings`) and V' is
the array of factor coefficients (`coeff`).
When weights are provided, the principal components are computed from the
modified data
    .. math:: \\Omega^{-\\frac{1}{2}} X
where :math:`\\Omega` is a diagonal matrix composed of the weights. For
example, when using the GLS version of PCA, the elements of :math:`\\Omega`
will be the inverse of the variances of the residuals from
    .. math:: X - F V'
where the number of factors is less than the rank of X
References
----------
.. [*] J. Bai and S. Ng, "Determining the number of factors in approximate
factor models," Econometrica, vol. 70, number 1, pp. 191-221, 2002
Examples
--------
Basic PCA using the correlation matrix of the data
>>> import numpy as np
>>> from statsmodels.multivariate.pca import PCA
>>> x = np.random.randn(100)[:, None]
>>> x = x + np.random.randn(100, 100)
>>> pc = PCA(x)
    Note that the principal components are computed using an SVD and so the
    correlation matrix is never constructed, unless method='eig'.
PCA using the covariance matrix of the data
>>> pc = PCA(x, standardize=False)
Limiting the number of factors returned to 1 computed using NIPALS
>>> pc = PCA(x, ncomp=1, method='nipals')
>>> pc.factors.shape
(100, 1)
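
    Filling missing values with the EM algorithm (a sketch continuing the
    example above; ncomp must be set)

    >>> x[10, 0] = np.nan
    >>> pc = PCA(x, ncomp=2, missing='fill-em')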
"""
def __init__(self, data, ncomp=None, standardize=True, demean=True,
normalize=True, gls=False, weights=None, method='svd',
missing=None, tol=5e-8, max_iter=1000, tol_em=5e-8,
max_em_iter=100, svd_full_matrices=False):
self._index = None
self._columns = []
if isinstance(data, pd.DataFrame):
self._index = data.index
self._columns = data.columns
self.data = array_like(data, "data", ndim=2)
# Store inputs
self._gls = bool_like(gls, "gls")
self._normalize = bool_like(normalize, "normalize")
self._svd_full_matrices = bool_like(svd_full_matrices, "svd_fm")
self._tol = float_like(tol, "tol")
if not 0 < self._tol < 1:
raise ValueError('tol must be strictly between 0 and 1')
        self._max_iter = int_like(max_iter, "max_iter")
self._max_em_iter = int_like(max_em_iter, "max_em_iter")
self._tol_em = float_like(tol_em, "tol_em")
# Prepare data
self._standardize = bool_like(standardize, "standardize")
self._demean = bool_like(demean, "demean")
self._nobs, self._nvar = self.data.shape
weights = array_like(weights, "weights", maxdim=1, optional=True)
if weights is None:
weights = np.ones(self._nvar)
else:
weights = np.array(weights).flatten()
if weights.shape[0] != self._nvar:
raise ValueError('weights should have nvar elements')
weights = weights / np.sqrt((weights ** 2.0).mean())
self.weights = weights
# Check ncomp against maximum
min_dim = min(self._nobs, self._nvar)
self._ncomp = min_dim if ncomp is None else ncomp
if self._ncomp > min_dim:
import warnings
warn = 'The requested number of components is more than can be ' \
'computed from data. The maximum number of components is ' \
'the minimum of the number of observations or variables'
warnings.warn(warn, ValueWarning)
self._ncomp = min_dim
self._method = method
if self._method not in ('eig', 'svd', 'nipals'):
raise ValueError(f'method {method} is not known.')
if self._method == 'svd':
self._svd_full_matrices = True
self.rows = np.arange(self._nobs)
self.cols = np.arange(self._nvar)
# Handle missing
self._missing = string_like(missing, "missing", optional=True)
self._adjusted_data = self.data
self._adjust_missing()
# Update size
self._nobs, self._nvar = self._adjusted_data.shape
if self._ncomp == np.min(self.data.shape):
self._ncomp = np.min(self._adjusted_data.shape)
elif self._ncomp > np.min(self._adjusted_data.shape):
raise ValueError('When adjusting for missing values, user '
'provided ncomp must be no larger than the '
'smallest dimension of the '
'missing-value-adjusted data size.')
# Attributes and internal values
self._tss = 0.0
self._ess = None
self.transformed_data = None
self._mu = None
self._sigma = None
self._ess_indiv = None
self._tss_indiv = None
self.scores = self.factors = None
self.loadings = None
self.coeff = None
self.eigenvals = None
self.eigenvecs = None
self.projection = None
self.rsquare = None
self.ic = None
# Prepare data
self.transformed_data = self._prepare_data()
# Perform the PCA
self._pca()
if gls:
self._compute_gls_weights()
self.transformed_data = self._prepare_data()
self._pca()
# Final calculations
self._compute_rsquare_and_ic()
if self._index is not None:
self._to_pandas()
def _adjust_missing(self):
"""
Implements alternatives for handling missing values
"""
def keep_col(x):
index = np.logical_not(np.any(np.isnan(x), 0))
return x[:, index], index
def keep_row(x):
index = np.logical_not(np.any(np.isnan(x), 1))
return x[index, :], index
if self._missing == 'drop-col':
self._adjusted_data, index = keep_col(self.data)
self.cols = np.where(index)[0]
self.weights = self.weights[index]
elif self._missing == 'drop-row':
self._adjusted_data, index = keep_row(self.data)
self.rows = np.where(index)[0]
elif self._missing == 'drop-min':
drop_col, drop_col_index = keep_col(self.data)
drop_col_size = drop_col.size
drop_row, drop_row_index = keep_row(self.data)
drop_row_size = drop_row.size
if drop_row_size > drop_col_size:
self._adjusted_data = drop_row
self.rows = np.where(drop_row_index)[0]
else:
self._adjusted_data = drop_col
self.weights = self.weights[drop_col_index]
self.cols = np.where(drop_col_index)[0]
elif self._missing == 'fill-em':
self._adjusted_data = self._fill_missing_em()
elif self._missing is None:
if not np.isfinite(self._adjusted_data).all():
raise ValueError("""\
data contains non-finite values (inf, NaN). You should drop these values or
use one of the methods for adjusting data for missing-values.""")
else:
raise ValueError('missing method is not known.')
if self._index is not None:
self._columns = self._columns[self.cols]
self._index = self._index[self.rows]
# Check adjusted data size
if self._adjusted_data.size == 0:
raise ValueError('Removal of missing values has eliminated '
'all data.')
def _compute_gls_weights(self):
"""
Computes GLS weights based on percentage of data fit
"""
projection = np.asarray(self.project(transform=False))
errors = self.transformed_data - projection
if self._ncomp == self._nvar:
raise ValueError('gls can only be used when ncomp < nvar '
'so that residuals have non-zero variance')
var = (errors ** 2.0).mean(0)
weights = 1.0 / var
weights = weights / np.sqrt((weights ** 2.0).mean())
nvar = self._nvar
eff_series_perc = (1.0 / sum((weights / weights.sum()) ** 2.0)) / nvar
if eff_series_perc < 0.1:
eff_series = int(np.round(eff_series_perc * nvar))
import warnings
warn = f"""\
Many series are being down weighted by GLS. Of the {nvar} series, the GLS
estimates are based on only {eff_series} (effective) series."""
warnings.warn(warn, EstimationWarning)
self.weights = weights
def _pca(self):
"""
Main PCA routine
"""
self._compute_eig()
self._compute_pca_from_eig()
self.projection = self.project()
def __repr__(self):
string = self.__str__()
string = string[:-1]
string += ', id: ' + hex(id(self)) + ')'
return string
def __str__(self):
string = 'Principal Component Analysis('
string += 'nobs: ' + str(self._nobs) + ', '
string += 'nvar: ' + str(self._nvar) + ', '
if self._standardize:
kind = 'Standardize (Correlation)'
elif self._demean:
kind = 'Demean (Covariance)'
else:
kind = 'None'
string += 'transformation: ' + kind + ', '
if self._gls:
string += 'GLS, '
string += 'normalization: ' + str(self._normalize) + ', '
string += 'number of components: ' + str(self._ncomp) + ', '
        method = 'Eigenvalue' if self._method == 'eig' else 'SVD'
        string += 'method: ' + method
string += ')'
return string
def _prepare_data(self):
"""
Standardize or demean data.
"""
adj_data = self._adjusted_data
        if np.all(np.isnan(adj_data)):
            return np.full(adj_data.shape, np.nan)
self._mu = np.nanmean(adj_data, axis=0)
self._sigma = np.sqrt(np.nanmean((adj_data - self._mu) ** 2.0, axis=0))
if self._standardize:
data = (adj_data - self._mu) / self._sigma
elif self._demean:
data = (adj_data - self._mu)
else:
data = adj_data
return data / np.sqrt(self.weights)
def _compute_eig(self):
"""
Wrapper for actual eigenvalue method
This is a workaround to avoid instance methods in __dict__
"""
if self._method == 'eig':
return self._compute_using_eig()
elif self._method == 'svd':
return self._compute_using_svd()
else: # self._method == 'nipals'
return self._compute_using_nipals()
def _compute_using_svd(self):
"""SVD method to compute eigenvalues and eigenvecs"""
x = self.transformed_data
u, s, v = np.linalg.svd(x, full_matrices=self._svd_full_matrices)
self.eigenvals = s ** 2.0
self.eigenvecs = v.T
def _compute_using_eig(self):
"""
Eigenvalue decomposition method to compute eigenvalues and eigenvectors
"""
x = self.transformed_data
self.eigenvals, self.eigenvecs = np.linalg.eigh(x.T.dot(x))
def _compute_using_nipals(self):
"""
NIPALS implementation to compute small number of eigenvalues
and eigenvectors
"""
x = self.transformed_data
if self._ncomp > 1:
x = x + 0.0 # Copy
tol, max_iter, ncomp = self._tol, self._max_iter, self._ncomp
vals = np.zeros(self._ncomp)
vecs = np.zeros((self._nvar, self._ncomp))
for i in range(ncomp):
max_var_ind = np.argmax(x.var(0))
factor = x[:, [max_var_ind]]
_iter = 0
diff = 1.0
while diff > tol and _iter < max_iter:
vec = x.T.dot(factor) / (factor.T.dot(factor))
vec = vec / np.sqrt(vec.T.dot(vec))
factor_last = factor
factor = x.dot(vec) / (vec.T.dot(vec))
diff = _norm(factor - factor_last) / _norm(factor)
_iter += 1
vals[i] = (factor ** 2).sum()
vecs[:, [i]] = vec
if ncomp > 1:
x -= factor.dot(vec.T)
self.eigenvals = vals
self.eigenvecs = vecs
def _fill_missing_em(self):
"""
EM algorithm to fill missing values
"""
non_missing = np.logical_not(np.isnan(self.data))
# If nothing missing, return without altering the data
if np.all(non_missing):
return self.data
# 1. Standardized data as needed
data = self.transformed_data = np.asarray(self._prepare_data())
ncomp = self._ncomp
# 2. Check for all nans
col_non_missing = np.sum(non_missing, 1)
row_non_missing = np.sum(non_missing, 0)
if np.any(col_non_missing < ncomp) or np.any(row_non_missing < ncomp):
raise ValueError('Implementation requires that all columns and '
'all rows have at least ncomp non-missing values')
# 3. Get mask
mask = np.isnan(data)
# 4. Compute mean
mu = np.nanmean(data, 0)
# 5. Replace missing with mean
projection = np.ones((self._nobs, 1)) * mu
projection_masked = projection[mask]
data[mask] = projection_masked
# 6. Compute eigenvalues and fit
diff = 1.0
_iter = 0
while diff > self._tol_em and _iter < self._max_em_iter:
last_projection_masked = projection_masked
# Set transformed data to compute eigenvalues
self.transformed_data = data
# Call correct eig function here
self._compute_eig()
# Call function to compute factors and projection
self._compute_pca_from_eig()
projection = np.asarray(self.project(transform=False,
unweight=False))
projection_masked = projection[mask]
data[mask] = projection_masked
delta = last_projection_masked - projection_masked
diff = _norm(delta) / _norm(projection_masked)
_iter += 1
# Must copy to avoid overwriting original data since replacing values
data = self._adjusted_data + 0.0
projection = np.asarray(self.project())
data[mask] = projection[mask]
return data
def _compute_pca_from_eig(self):
"""
Compute relevant statistics after eigenvalues have been computed
"""
# Ensure sorted largest to smallest
vals, vecs = self.eigenvals, self.eigenvecs
indices = np.argsort(vals)
indices = indices[::-1]
vals = vals[indices]
vecs = vecs[:, indices]
if (vals <= 0).any():
# Discard and warn
num_good = vals.shape[0] - (vals <= 0).sum()
if num_good < self._ncomp:
import warnings
warnings.warn('Only {num:d} eigenvalues are positive. '
'This is the maximum number of components '
'that can be extracted.'.format(num=num_good),
EstimationWarning)
self._ncomp = num_good
vals[num_good:] = np.finfo(np.float64).tiny
# Use ncomp for the remaining calculations
vals = vals[:self._ncomp]
vecs = vecs[:, :self._ncomp]
self.eigenvals, self.eigenvecs = vals, vecs
# Select correct number of components to return
self.scores = self.factors = self.transformed_data.dot(vecs)
self.loadings = vecs
self.coeff = vecs.T
if self._normalize:
self.coeff = (self.coeff.T * np.sqrt(vals)).T
self.factors /= np.sqrt(vals)
self.scores = self.factors
def _compute_rsquare_and_ic(self):
"""
Final statistics to compute
"""
# TSS and related calculations
# TODO: This needs careful testing, with and without weights,
# gls, standardized and demean
weights = self.weights
ss_data = self.transformed_data * np.sqrt(weights)
self._tss_indiv = np.sum(ss_data ** 2, 0)
self._tss = np.sum(self._tss_indiv)
self._ess = np.zeros(self._ncomp + 1)
self._ess_indiv = np.zeros((self._ncomp + 1, self._nvar))
for i in range(self._ncomp + 1):
# Projection in the same space as transformed_data
projection = self.project(ncomp=i, transform=False, unweight=False)
indiv_rss = (projection ** 2).sum(axis=0)
rss = indiv_rss.sum()
self._ess[i] = self._tss - rss
self._ess_indiv[i, :] = self._tss_indiv - indiv_rss
self.rsquare = 1.0 - self._ess / self._tss
# Information Criteria
ess = self._ess
        invalid = ess <= 0  # Prevent taking the log of 0
if invalid.any():
last_obs = (np.where(invalid)[0]).min()
ess = ess[:last_obs]
log_ess = np.log(ess)
r = np.arange(ess.shape[0])
nobs, nvar = self._nobs, self._nvar
sum_to_prod = (nobs + nvar) / (nobs * nvar)
min_dim = min(nobs, nvar)
penalties = np.array([sum_to_prod * np.log(1.0 / sum_to_prod),
sum_to_prod * np.log(min_dim),
np.log(min_dim) / min_dim])
penalties = penalties[:, None]
ic = log_ess + r * penalties
self.ic = ic.T
def project(self, ncomp=None, transform=True, unweight=True):
"""
Project series onto a specific number of factors.
Parameters
----------
ncomp : int, optional
Number of components to use. If omitted, all components
initially computed are used.
transform : bool, optional
Flag indicating whether to return the projection in the original
space of the data (True, default) or in the space of the
standardized/demeaned data.
unweight : bool, optional
Flag indicating whether to undo the effects of the estimation
weights.
Returns
-------
array_like
The nobs by nvar array of the projection onto ncomp factors.
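
        Examples
        --------
        A minimal sketch:

        >>> import numpy as np
        >>> from statsmodels.multivariate.pca import PCA
        >>> x = np.random.randn(100, 5)
        >>> pc = PCA(x, ncomp=2)
        >>> projection = pc.project(ncomp=1)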
"""
# Projection needs to be scaled/shifted based on inputs
ncomp = self._ncomp if ncomp is None else ncomp
        if ncomp > self._ncomp:
            raise ValueError('ncomp must not be larger than the number of '
                             'components computed.')
factors = np.asarray(self.factors)
coeff = np.asarray(self.coeff)
projection = factors[:, :ncomp].dot(coeff[:ncomp, :])
if transform or unweight:
projection *= np.sqrt(self.weights)
            if transform:
                # Undo the standardization and demeaning
                if self._standardize:
                    projection *= self._sigma
                if self._standardize or self._demean:
                    projection += self._mu
if self._index is not None:
projection = pd.DataFrame(projection,
columns=self._columns,
index=self._index)
return projection
def _to_pandas(self):
"""
Returns pandas DataFrames for all values
"""
index = self._index
# Principal Components
num_zeros = np.ceil(np.log10(self._ncomp))
comp_str = 'comp_{0:0' + str(int(num_zeros)) + 'd}'
cols = [comp_str.format(i) for i in range(self._ncomp)]
df = pd.DataFrame(self.factors, columns=cols, index=index)
self.scores = self.factors = df
# Projections
df = pd.DataFrame(self.projection,
columns=self._columns,
index=index)
self.projection = df
# Weights
df = pd.DataFrame(self.coeff, index=cols,
columns=self._columns)
self.coeff = df
# Loadings
df = pd.DataFrame(self.loadings,
index=self._columns, columns=cols)
self.loadings = df
# eigenvals
self.eigenvals = pd.Series(self.eigenvals)
self.eigenvals.name = 'eigenvals'
# eigenvecs
vec_str = comp_str.replace('comp', 'eigenvec')
cols = [vec_str.format(i) for i in range(self.eigenvecs.shape[1])]
self.eigenvecs = pd.DataFrame(self.eigenvecs, columns=cols)
# R2
self.rsquare = pd.Series(self.rsquare)
self.rsquare.index.name = 'ncomp'
self.rsquare.name = 'rsquare'
# IC
self.ic = pd.DataFrame(self.ic, columns=['IC_p1', 'IC_p2', 'IC_p3'])
self.ic.index.name = 'ncomp'
def plot_scree(self, ncomp=None, log_scale=True,
cumulative=False, ax=None):
"""
Plot of the ordered eigenvalues
Parameters
----------
        ncomp : int, optional
            Number of components to include in the plot. If None, all
            computed components are included.
        log_scale : bool, optional
            Flag indicating whether to use a log scale for the y-axis
cumulative : bool, optional
Flag indicating whether to plot the eigenvalues or cumulative
eigenvalues
        ax : AxesSubplot, optional
            An axes on which to draw the graph. If omitted, a new figure
            is created.
Returns
-------
matplotlib.figure.Figure
The handle to the figure.
"""
import statsmodels.graphics.utils as gutils
fig, ax = gutils.create_mpl_ax(ax)
ncomp = self._ncomp if ncomp is None else ncomp
vals = np.asarray(self.eigenvals)
vals = vals[:self._ncomp]
if cumulative:
vals = np.cumsum(vals)
if log_scale:
ax.set_yscale('log')
ax.plot(np.arange(ncomp), vals[: ncomp], 'bo')
ax.autoscale(tight=True)
xlim = np.array(ax.get_xlim())
sp = xlim[1] - xlim[0]
xlim += 0.02 * np.array([-sp, sp])
ax.set_xlim(xlim)
ylim = np.array(ax.get_ylim())
scale = 0.02
if log_scale:
sp = np.log(ylim[1] / ylim[0])
ylim = np.exp(np.array([np.log(ylim[0]) - scale * sp,
np.log(ylim[1]) + scale * sp]))
else:
sp = ylim[1] - ylim[0]
ylim += scale * np.array([-sp, sp])
ax.set_ylim(ylim)
ax.set_title('Scree Plot')
ax.set_ylabel('Eigenvalue')
ax.set_xlabel('Component Number')
fig.tight_layout()
return fig
def plot_rsquare(self, ncomp=None, ax=None):
"""
Box plots of the individual series R-square against the number of PCs.
Parameters
----------
        ncomp : int, optional
            Number of components to include in the plot. If None, will
            plot the minimum of 10 or the number of computed components.
        ax : AxesSubplot, optional
            An axes on which to draw the graph. If omitted, a new figure
            is created.
Returns
-------
matplotlib.figure.Figure
The handle to the figure.
"""
import statsmodels.graphics.utils as gutils
fig, ax = gutils.create_mpl_ax(ax)
ncomp = 10 if ncomp is None else ncomp
ncomp = min(ncomp, self._ncomp)
# R2s in rows, series in columns
r2s = 1.0 - self._ess_indiv / self._tss_indiv
r2s = r2s[1:]
r2s = r2s[:ncomp]
ax.boxplot(r2s.T)
ax.set_title('Individual Input $R^2$')
ax.set_ylabel('$R^2$')
ax.set_xlabel('Number of Included Principal Components')
return fig
def pca(data, ncomp=None, standardize=True, demean=True, normalize=True,
gls=False, weights=None, method='svd'):
"""
Perform Principal Component Analysis (PCA).
Parameters
----------
data : ndarray
Variables in columns, observations in rows.
    ncomp : int, optional
        Number of components to return. If None, returns as many as the
        smaller of the number of rows or columns of data.
standardize : bool, optional
Flag indicating to use standardized data with mean 0 and unit
variance. standardized being True implies demean.
demean : bool, optional
Flag indicating whether to demean data before computing principal
components. demean is ignored if standardize is True.
    normalize : bool, optional
        Indicates whether to normalize the factors to have unit inner
        product. If False, the loadings will have unit inner product.
gls : bool, optional
Flag indicating to implement a two-step GLS estimator where
in the first step principal components are used to estimate residuals,
and then the inverse residual variance is used as a set of weights to
estimate the final principal components
weights : ndarray, optional
Series weights to use after transforming data according to standardize
or demean when computing the principal components.
    method : str, optional
        Determines the linear algebra routine used. 'svd', the default,
        uses a singular value decomposition. 'eig' uses an eigenvalue
        decomposition.
Returns
-------
factors : {ndarray, DataFrame}
Array (nobs, ncomp) of principal components (also known as scores).
loadings : {ndarray, DataFrame}
Array (ncomp, nvar) of principal component loadings for constructing
the factors.
projection : {ndarray, DataFrame}
Array (nobs, nvar) containing the projection of the data onto the ncomp
estimated factors.
    rsquare : {ndarray, Series}
        Array (ncomp,) where the element in the ith position is the R-square
        of including the first i principal components. The values are
        calculated on the transformed data, not the original data.
    ic : {ndarray, DataFrame}
        Array (ncomp, 3) containing the Bai and Ng (2002) Information
        criteria. Each column is a different criterion, and each row
        represents the number of included factors.
eigenvals : {ndarray, Series}
Array of eigenvalues (nvar,).
eigenvecs : {ndarray, DataFrame}
Array of eigenvectors. (nvar, nvar).
Notes
-----
This is a simple function wrapper around the PCA class. See PCA for
more information and additional methods.
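
    Examples
    --------
    A minimal sketch:

    >>> import numpy as np
    >>> from statsmodels.multivariate.pca import pca
    >>> x = np.random.randn(100, 5)
    >>> out = pca(x, ncomp=2)
    >>> factors, loadings, projection, rsquare, ic, eigenvals, eigenvecs = out
    >>> factors.shape
    (100, 2)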
"""
pc = PCA(data, ncomp=ncomp, standardize=standardize, demean=demean,
normalize=normalize, gls=gls, weights=weights, method=method)
return (pc.factors, pc.loadings, pc.projection, pc.rsquare, pc.ic,
pc.eigenvals, pc.eigenvecs)

View File

@ -0,0 +1,140 @@
import matplotlib.pyplot as plt
import numpy as np
def plot_scree(eigenvals, total_var, ncomp=None, x_label='factor'):
"""
Plot of the ordered eigenvalues and variance explained for the loadings
Parameters
----------
eigenvals : array_like
The eigenvalues
total_var : float
the total variance (for plotting percent variance explained)
    ncomp : int, optional
        Number of factors to include in the plot. If None, all
        eigenvalues are included.
x_label : str
label of x-axis
Returns
-------
Figure
Handle to the figure.
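
    Examples
    --------
    A minimal sketch, assuming eigenvalues from a fitted factor model:

    >>> import numpy as np
    >>> vals = np.array([3.0, 1.2, 0.5, 0.3])
    >>> fig = plot_scree(vals, total_var=vals.sum(), ncomp=3)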
"""
fig = plt.figure()
ncomp = len(eigenvals) if ncomp is None else ncomp
vals = eigenvals
vals = vals[:ncomp]
ax = fig.add_subplot(121)
ax.plot(np.arange(ncomp), vals[: ncomp], 'b-o')
ax.autoscale(tight=True)
xlim = np.array(ax.get_xlim())
sp = xlim[1] - xlim[0]
xlim += 0.02 * np.array([-sp, sp])
ax.set_xticks(np.arange(ncomp))
ax.set_xlim(xlim)
ylim = np.array(ax.get_ylim())
scale = 0.02
sp = ylim[1] - ylim[0]
ylim += scale * np.array([-sp, sp])
ax.set_ylim(ylim)
ax.set_title('Scree Plot')
ax.set_ylabel('Eigenvalue')
ax.set_xlabel(x_label)
per_variance = vals / total_var
cumper_variance = np.cumsum(per_variance)
ax = fig.add_subplot(122)
ax.plot(np.arange(ncomp), per_variance[: ncomp], 'b-o')
ax.plot(np.arange(ncomp), cumper_variance[: ncomp], 'g--o')
ax.autoscale(tight=True)
xlim = np.array(ax.get_xlim())
sp = xlim[1] - xlim[0]
xlim += 0.02 * np.array([-sp, sp])
ax.set_xticks(np.arange(ncomp))
ax.set_xlim(xlim)
ylim = np.array(ax.get_ylim())
scale = 0.02
sp = ylim[1] - ylim[0]
ylim += scale * np.array([-sp, sp])
ax.set_ylim(ylim)
ax.set_title('Variance Explained')
ax.set_ylabel('Proportion')
ax.set_xlabel(x_label)
ax.legend(['Proportion', 'Cumulative'], loc=5)
fig.tight_layout()
return fig
def plot_loadings(loadings, col_names=None, row_names=None,
loading_pairs=None, percent_variance=None,
title='Factor patterns'):
"""
Plot factor loadings in 2-d plots
Parameters
----------
    loadings : array_like
        Each column is a component (or factor)
    col_names : list[str], optional
        column names of `loadings`
    row_names : list[str], optional
        row names of `loadings`
    loading_pairs : None or a list of tuples
        Specify plots. Each tuple (i, j) represents one figure; i and j are
        the loading indices for the x-axis and y-axis, respectively. If
        `None`, all combinations of the loadings will be plotted.
    percent_variance : array_like, optional
        The percent variance explained by each factor.
    title : str, optional
        The figure title.
Returns
-------
figs : a list of figure handles
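
    Examples
    --------
    A minimal sketch with random loadings:

    >>> import numpy as np
    >>> loadings = np.random.uniform(-1, 1, size=(6, 3))
    >>> figs = plot_loadings(loadings,
    ...                      percent_variance=[45.0, 30.0, 15.0])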
"""
k_var, n_factor = loadings.shape
if loading_pairs is None:
loading_pairs = []
for i in range(n_factor):
            for j in range(i + 1, n_factor):
loading_pairs.append([i, j])
if col_names is None:
col_names = ["factor %d" % i for i in range(n_factor)]
if row_names is None:
row_names = ["var %d" % i for i in range(k_var)]
figs = []
for item in loading_pairs:
i = item[0]
j = item[1]
fig = plt.figure(figsize=(7, 7))
figs.append(fig)
ax = fig.add_subplot(111)
for k in range(loadings.shape[0]):
plt.text(loadings[k, i], loadings[k, j],
row_names[k], fontsize=12)
ax.plot(loadings[:, i], loadings[:, j], 'bo')
ax.set_title(title)
if percent_variance is not None:
x_str = f'{col_names[i]} ({percent_variance[i]:.1f}%)'
y_str = f'{col_names[j]} ({percent_variance[j]:.1f}%)'
ax.set_xlabel(x_str)
ax.set_ylabel(y_str)
else:
ax.set_xlabel(col_names[i])
ax.set_ylabel(col_names[j])
v = 1.05
xlim = np.array([-v, v])
ylim = np.array([-v, v])
ax.plot(xlim, [0, 0], 'k--')
ax.plot([0, 0], ylim, 'k--')
ax.set_aspect('equal', 'datalim')
ax.set_xlim(xlim)
ax.set_ylim(ylim)
fig.tight_layout()
return figs

View File

@ -0,0 +1,294 @@
from numpy import array
from statsmodels.tools.testing import Holder
data = Holder()
data.comment = 'generated data, divide by 1000'
data.name = 'data'
data.xo = array([
[-419, -731, -1306, -1294],
[6, 529, -200, -437],
[-27, -833, -6, -564],
[-304, -273, -502, -739],
[1377, -912, 927, 280],
[-375, -517, -514, 49],
[247, -504, 123, -259],
[712, 534, -773, 286],
[195, -1080, 3256, -178],
[-854, 75, -706, -1084],
[-1219, -612, -15, -203],
[550, -628, -483, -2686],
[-365, 1376, -1266, 317],
[-489, 544, -195, 431],
[-656, 854, 840, -723],
[16, -1385, -880, -460],
[258, -2252, 96, 54],
[2049, -750, -1115, 381],
[-65, 280, -777, 416],
[755, 82, -806, 1027],
[-39, -170, -2134, 743],
[-859, 780, 746, -133],
[762, 252, -450, -459],
[-941, -202, 49, -202],
[-54, 115, 455, 388],
[-1348, 1246, 1430, -480],
[229, -535, -1831, 1524],
[-651, -167, 2116, 483],
[-1249, -1373, 888, -1092],
[-75, -2162, 486, -496],
[2436, -1627, -1069, 162],
[-63, 560, -601, 587],
[-60, 1051, -277, 1323],
[1329, -1294, 68, 5],
[1532, -633, -923, 696],
[669, 895, -1762, -375],
[1129, -548, 2064, 609],
[1320, 573, 2119, 270],
[-213, -412, -2517, 1685],
[73, -979, 1312, -1220],
[-1360, -2107, -237, 1522],
[-645, 205, -543, -169],
[-212, 1072, 543, -128],
[-352, -129, -605, -904],
[511, 85, 167, -1914],
[1515, 1862, 942, 1622],
[-465, 623, -495, -89],
[-1396, -979, 1758, 128],
[-255, -47, 980, 501],
[-1282, -58, -49, -610],
[-889, -1177, -492, 494],
[1415, 1146, 696, -722],
[1237, -224, -1609, -64],
[-528, -1625, 231, 883],
[-327, 1636, -476, -361],
[-781, 793, 1882, 234],
[-506, -561, 1988, -810],
[-1233, 1467, -261, 2164],
[53, 1069, 824, 2123],
[-1200, -441, -321, 339],
[1606, 298, -995, 1292],
[-1740, -672, -1628, -129],
[-1450, -354, 224, -657],
[-2556, 1006, -706, -1453],
[-717, -463, 345, -1821],
[1056, -38, -420, -455],
[-523, 565, 425, 1138],
[-1030, -187, 683, 78],
[-214, -312, -1171, -528],
[819, 736, -265, 423],
[1339, 351, 1142, 579],
[-387, -126, -1573, 2346],
[969, 2, 327, -134],
[163, 227, 90, 2021],
[1022, -1076, 174, 304],
[1042, 1317, 311, 880],
[2018, -840, 295, 2651],
[-277, 566, 1147, -189],
[20, 467, 1262, 263],
[-663, 1061, -1552, -1159],
[1830, 391, 2534, -199],
[-487, 752, -1061, 351],
[-2138, -556, -367, -457],
[-868, -411, -559, 726],
[1770, 819, -892, -363],
[553, -736, -169, -490],
[388, -503, 809, -821],
[-516, -1452, -192, 483],
[493, 2904, 1318, 2591],
[175, 584, -1001, 1675],
[1316, -1596, -460, 1500],
[1212, 214, -644, -696],
[-501, 338, 1197, -841],
[-587, -469, -1101, 24],
[-1205, 1910, 659, 1232],
[-150, 398, 594, 394],
[34, -663, 235, -334],
[-1580, 647, 239, -351],
[-2177, -345, 1215, -1494],
[1923, 329, -152, 1128]])
princomp1 = Holder()
princomp1.comment = 'mlab.princomp(x, nout=3)'
princomp1.factors = array([
[-.83487832815382, -1.75681522344645, -.50882660928949, -.59661466511045],
[-.18695786699253, -.10732909330422, .23971799542554, -.75468286946853],
[-.57403949255604, -.39667006607544, -.7927838094217, .02652621881328],
[-.60828125251513, -.75979035898754, -.20148864200404, -.40278856050237],
[.55997928601548, .88869370546643, -1.55474410845786, .23033958281961],
[-.18023239851961, -.72398923145328, -.07056264751117, .29292391015376],
[-.189029743271, -.05888596186903, -.63882208368513, -.05682951829677],
[.94694345324739, -.33448036234864, .16665867708366, -.67190948646953],
[-1.355171899399, 2.58899695901774, -1.53157119606928, .93743278678908],
[-1.06797676403358, -1.01894055566289, .29181722134698, -.65261957826524],
[-1.08919199915725, -.5395876105009, .18846579824378, .61935728909742],
[-1.36598849770841, -1.00986627679465, -1.6090477073157, -1.82708847399443], # noqa:E501
[.561511276285, -.74919011595195, 1.49872898209738, -.80588545345232],
[.04805787176428, -.05522267212748, .82943784435024, .01537039050312],
[-1.12006939155398, .73462770352006, .58868274831601, -.67786987413505],
[-.26087838474316, -1.33362289066951, -1.02932517860259, .24865839951801],
[-.24666198784909, -.58247196399204, -1.78971960966265, 1.18908143657302],
[1.80675592845666, -.73341258204636, -1.45012544705912, -.44875329121288],
[.4794281391435, -.57169295903913, .48557628591056, -.11638075289238],
[1.39425263398653, -.3665732682294, .06937942447187, .06683559082703],
[1.11015707065101, -1.87631329249852, .48914958604867, .11096926802212],
[-.85159530389901, .68543874135386, .86736021483251, -.17641002537865],
[.34109015314112, -.25431311542374, -.36804227540019, -.95824474920131],
[-.86253950274987, -.28796613689709, .30820634958709, .27228599921917],
[.01266190412089, .48559962017667, .14020630700546, .18517398749337],
[-1.56345869427724, 1.27917754070516, 1.25640847929385, -.36055181722313],
[1.62834293379132, -1.51923809467869, .27754976407182, .79362967384835],
[-.94400458067084, 1.77733054371289, .03595731772774, .96570688640992],
[-2.11906234438329, -.13226430948321, -.78992396115366, .66362103473975],
[-.94372331181891, -.37502966791165, -1.77907324401749, .97801542954941],
[1.76575198740032, -.92309597844861, -2.3872195277998, -.21817018301121],
[.57418226616373, -.2925257318724, .71180507312941, -.13937750314467],
[1.01654397566275, .28855305878842, 1.25119859389106, .11257524396004],
[.58979013567212, -.06866577243092, -1.74447546690995, .13917953157575],
[1.62072087150051, -.5835145063711, -.99029357957459, -.06334029436682],
[.893493925425, -1.23995040005948, .40058503790479, -1.49029669097391],
[.26990527585623, 2.03399854143898, -1.2335089890881, .54010061879979],
[.33504096277444, 2.42394994177782, -.6643863358332, -.42471161848557],
[1.69952476943058, -2.1707037237448, .79694026483866, .88177267205969],
[-1.41498253257895, .65248089992094, -1.40045976465378, -.12045332880702],
[-.22640706265253, -.94114558124915, -.18868114063537, 2.67652245892778],
[-.37493712386529, -.61985213642068, .5383582946365, -.17931524703276],
[-.30437796317839, .74252786648649, .73255373596822, -.64993745548429],
[-.68788283675831, -.84714762684627, -.10721753874211, -.59777382822281],
[-1.00667616522842, -.06670525233919, -.92973707141688, -1.60742284256649],
[1.95220512266515, 2.05751265066695, .79640648143073, -.59608004229343],
[-.15504464969388, -.3882079443045, .75049869361395, -.44163703260023],
[-1.6686863460652, .96325894557423, -.16453379247258, 1.4560996746313],
[-.25573631707529, .88265554068571, .08984550855664, .53561910563178],
[-1.29430028690793, -.48042359291447, .49318558750269, .03689178852848],
[-.34391235307349, -.95154811896716, -.09714022474353, 1.19792361047367],
[.34367523316975, 1.16641214447854, -.39528838072965, -1.72565643987406],
[1.23887392116229, -1.27474554996132, -.65859544264097, -.81757560038832],
[-.17739006831099, -.29057501559843, -.62533324788504, 1.7092669546224],
[-.08610919021307, -.06524996994257, 1.3018284944661, -1.28219607271255],
[-.95717735853496, 1.79841555744597, .75799149339397, .23542916575208],
[-1.70175078442029, 1.33831900642462, -.73979048943944, .26157699746442],
[.84631686421106, .32029666775009, 2.51638540556813, .90367536744335],
[1.22693220256582, 1.45665385966518, 1.27480662666555, .78786331120259],
[-.59251239046609, -.660398245535, .53258334042042, .81248748854679],
[2.22723057510913, -.22856960444805, -.15586801032885, -.26957090658609],
[-.83192612439183, -2.11983096548132, .75319973501664, .62196293266702],
[-1.577627210601, -.3747136286972, .31736538266249, .30187577548949],
[-2.28230005998543, -1.17283119424281, 1.83780755209602, -.75928026219594],
[-1.90574204329052, -.34197417196464, -.59978910354131, -.68240235236779],
[.48132729275936, -.2524965456322, -.75271273075, -.89651237903089],
[.26961427953002, .62968227134995, .99324664633985, .59917742452108],
[-.95910506784013, .31907970712369, .35568397653203, .60155535679072],
[-.18528259973205, -1.31831013869974, -.09749195643548, -.39885348684496],
[.9608404103702, .23727553971573, .20695289013955, -.65281918968052],
[.85302395609555, 1.5303724004181, -.56440186223081, -.27348033453255],
[1.72786301913767, -1.14859994931789, 1.16222121440674, 1.39284961909257],
[.37711527308989, .47231886947072, -.69423676772182, -.53515102147655],
[1.35642227654922, .53204130038923, .69844068787197, 1.04544871561741],
[.57797880484094, .08044525072063, -1.32634695941334, .35179408060132],
[1.29437232500619, 1.07461562326311, .54545226737269, -.6836610122092],
[2.74736726573105, .90881277479338, -.98342785084735, 1.38171127911719],
[-.67749479829901, 1.10093727650063, .28416704607992, -.24984509303044],
[-.24513961858774, 1.32098977907584, .16904762754153, .00886790270539],
[-.5392290825383, -1.43851802284774, 1.0064737206577, -1.52649870396689],
[.19486366400459, 2.77236000318994, -1.32201258472682, -.75922390642504],
[.33271229220962, -.78464273816827, 1.09930224781861, -.32184679755027],
[-1.72814706427698, -1.09275114767838, .7451569579997, .72871211772761],
[-.035506207751, -.72161367235521, .52828318684787, .87177739169758],
[1.31224955134141, -.22742530984642, -.44682270809773, -1.72769462581607],
[-.07125058353119, -.36850925227739, -1.01188688859296, -.24962251325969],
[-.69840680770104, .4925285516285, -1.0255829922787, -.36214090052941],
[-.2530614593082, -.68595709316063, -.56882710610856, 1.25787365685572],
[1.93782484285419, 2.67095706598253, 2.4023579082791, -.09112046819432],
[1.57782156817208, -.39819017512275, 1.01938038947667, .39718992194809],
[1.6839282738726, -.37808442385434, -1.36566197748227, 1.22029200163339],
[.54652714502605, -.38206797548206, -.70554510441189, -1.31224358889695],
[-1.30026063006148, .90642495630747, .02711437433058, -.44482098905042],
[-.1239033493518, -1.29112252171673, .18092802221218, .22673242779457],
[.01152882540055, 1.13242883415094, 2.34980443084773, .17712319903618],
[-.0505195424414, .6807219067402, .37771832345982, .0842510459176],
[-.44230076745505, -.07002728477811, -.6716520563439, .09637247949641],
[-1.31245480585229, -.01674966464909, 1.21063252882651, -.03927111631335],
[-2.94268586886381, .20925236551048, .30321714445262, .22027672852006],
[2.04121905977187, .58496246543101, -.5192457175416, -.37212298770116]])
princomp1.values = array([
[1.29489288337888],
[1.12722515391348],
[.94682423958163],
[.65890241090379]])
princomp1.name = 'princomp1'
princomp1.coef = array([
[.65989917631713, .22621848650964, -.5882833472413, -.40899997165748],
[.15824945056105, .3189419948895, .71689623797385, -.5994104597619],
[-.3488766362785, .90294049788532, -.17151017930575, .1832151967827],
[.64635538301471, .17832458477678, .33251578268108, .66321815082225]])
princomp2 = Holder()
princomp2.comment = 'mlab.princomp(x[:20,], nout=3)'
princomp2.factors = array([
[.74592631465403, -.92093638563647, 1.10020213969681, -.20234362115983],
[.40379773814409, -.23694214086306, -.53526599590626, .48048423978257],
[-.43826559396565, -.26267383420164, .35939862515391, -.15176605914773],
[.29427656853499, -.56363285386285, .19525662206552, -.0384830001072],
[-1.4327917748351, 1.18414191887856, .05435949672922, .46861687286613],
[.23033214569426, -.00452237842477, .00346120473054, -.61483888402985],
[-.40976419499281, .10137131352284, .02570805136468, .06798926306103],
[.83201287149759, .82736894861103, -.35298970920805, .49344802383821],
[-3.36634598435507, -.18324521714611, -1.12118215528184, .2057949493723],
[.70198992281665, -1.1856449495675, .02465727900177, -.08333428418838],
[-.13789069679894, -.79430992968357, -.33106496391047, -1.01808298459082],
[-.10779840884825, -1.41970796854378, 1.55590290358904, 1.34014813517248],
[1.8229340670437, .13065838030104, -1.06152350166072, .11456488463131],
[.51650051521229, .07999783864926, -1.08601194413786, -.28255247881905],
[-.24654203558433, -1.02895891025197, -1.34475655787845, .52240852619949],
[.03542169335227, -.01198903021187, 1.12649412049726, -.60518306798831],
[-1.23945075955452, .48778599927278, 1.11522465483282, -.994827967694],
[.30661562766349, 1.91993049714024, 1.08834307939522, .61608892787963],
[.8241280516035, .43533554216801, -.48261931874702, -.22391158066897],
[.6649139327178, 1.44597315984982, -.33359403032613, -.094219894409]])
princomp2.values = array([
[1.16965204468073],
[.77687367815155],
[.72297937656591],
[.32548581375971]])
princomp2.name = 'princomp2'
princomp2.coef = array([
[-.13957162231397, .6561182967648, .32256106777669, .66781951188167],
[.49534264552989, -.08241251099014, -.6919444767593, .51870674049413],
[-.85614372781797, -.11427402995055, -.47665923729502, .16357058078438],
[.04661912785591, .74138950947638, -.43584764555793, -.50813884128056]])
princomp3 = Holder()
princomp3.comment = 'mlab.princomp(x[:20,]-x[:20,].mean(0), nout=3)'
princomp3.factors = array([
[.74592631465403, -.92093638563647, 1.10020213969681, -.20234362115983],
[.40379773814409, -.23694214086306, -.53526599590626, .48048423978257],
[-.43826559396565, -.26267383420164, .35939862515391, -.15176605914773],
[.29427656853499, -.56363285386285, .19525662206552, -.0384830001072],
[-1.4327917748351, 1.18414191887856, .05435949672922, .46861687286613],
[.23033214569426, -.00452237842477, .00346120473054, -.61483888402985],
[-.40976419499281, .10137131352284, .02570805136468, .06798926306103],
[.83201287149759, .82736894861103, -.35298970920805, .49344802383821],
[-3.36634598435507, -.18324521714611, -1.12118215528184, .2057949493723],
[.70198992281665, -1.1856449495675, .02465727900177, -.08333428418838],
[-.13789069679894, -.79430992968357, -.33106496391047, -1.01808298459082],
[-.10779840884825, -1.41970796854378, 1.55590290358904, 1.34014813517248],
[1.8229340670437, .13065838030104, -1.06152350166072, .11456488463131],
[.51650051521229, .07999783864926, -1.08601194413786, -.28255247881905],
[-.24654203558433, -1.02895891025197, -1.34475655787845, .52240852619949],
[.03542169335227, -.01198903021187, 1.12649412049726, -.60518306798831],
[-1.23945075955452, .48778599927278, 1.11522465483282, -.994827967694],
[.30661562766349, 1.91993049714024, 1.08834307939522, .61608892787963],
[.8241280516035, .43533554216801, -.48261931874702, -.22391158066897],
[.6649139327178, 1.44597315984982, -.33359403032613, -.094219894409]])
princomp3.values = array([
[1.16965204468073],
[.77687367815155],
[.72297937656591],
[.32548581375971]])
princomp3.name = 'princomp3'
princomp3.coef = array([
[-.13957162231397, .6561182967648, .32256106777669, .66781951188167],
[.49534264552989, -.08241251099014, -.6919444767593, .51870674049413],
[-.85614372781797, -.11427402995055, -.47665923729502, .16357058078438],
[.04661912785591, .74138950947638, -.43584764555793, -.50813884128056]])
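The princomp1/princomp2/princomp3 fixtures above record MATLAB princomp output (factors = component scores, values = eigenvalues of the sample covariance, coef = loadings), as the mlab.princomp comments indicate. A minimal numpy sketch of how comparable quantities can be reproduced, assuming centered data and remembering that each component is determined only up to sign:

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((20, 4))   # stand-in for the fixture's raw data

xc = x - x.mean(0)                 # princomp centers the data first
u, s, vt = np.linalg.svd(xc, full_matrices=False)
coef = vt.T                        # loadings; columns are components
factors = xc @ coef                # component scores
values = s**2 / (len(x) - 1)       # eigenvalues of the sample covariance
# Sign-align columns before comparing against the recorded fixtures.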

View File

@ -0,0 +1,101 @@
var1,var2,var3,var4,var5
2.3358,0.0044,0.3163,0.8698,1.4817
3.1387,-0.1494,1.1793,2.1482,-0.2141
0.0501,0.6111,-0.892,1.0971,-2.6557
-0.029,-1.7519,-0.5098,-0.5294,0.2512
-0.0012,-0.8835,3.1745,3.6743,2.9339
1.3228,0.1419,0.6433,2.5167,0.9105
0.9066,-0.7031,-0.9839,-0.0551,0.049
-1.5817,-1.332,1.0518,-1.1584,-0.9183
2.9412,-1.9048,-1.328,0.3225,-0.2039
-1.5636,-1.506,1.6153,1.8511,0.9356
-0.5645,-0.7889,1.136,1.9609,2.5086
-0.802,-0.3495,-1.6138,-0.4514,-0.5998
0.7878,0.8931,0.3226,-1.0011,1.4319
-2.375,-0.6369,-0.5691,-1.3663,-1.7005
-0.2379,0.4552,-0.0294,-0.5558,1.4641
-1.3576,-0.1541,0.2684,-2.3244,-1.2963
0.9799,0.219,-2.0489,-3.1371,-1.0562
1.5379,2.7226,-0.0049,-3.8528,-0.4739
-0.8352,-0.8438,-0.4886,0.8641,-1.2653
1.3476,-0.0039,-0.8244,0.2143,0.0362
0.3315,-0.2731,-0.2188,-2.3388,-0.3962
-0.2723,0.6647,-0.2893,0.0999,-0.8111
-0.1344,0.695,0.6257,-0.283,-0.5913
-2.2572,-1.5057,1.3967,0.471,0.0997
1.0519,-1.3884,1.0226,-1.0947,1.3978
1.7878,1.8082,-0.694,0.6162,-0.9046
0.5601,0.8636,0.4449,0.6998,1.0791
-0.2156,1.4126,2.0064,0.3332,0.0751
-1.2607,-1.2132,-0.0598,-1.693,-1.0813
0.7107,1.9284,1.2093,-0.0175,1.0042
0.0362,1.7571,-0.0752,1.8337,2.6863
2.1175,0.8949,-1.765,0.6082,0.8375
-1.0219,0.2911,-0.727,0.2553,-1.6644
2.653,0.0148,0.4559,-0.0419,1.2743
-0.3103,0.4724,-0.6975,0.3755,3.4604
-1.7467,0.4565,-1.7263,0.9031,0.1875
-0.574,-2.3953,-0.8059,1.5461,-0.8906
-1.5758,1.8004,-1.3741,0.9648,0.0344
-1.2976,-0.6741,2.0647,2.1778,1.5391
0.6771,2.042,0.3806,-2.4027,-2.3492
-1.0357,0.5604,0.2532,-1.6972,-0.4285
-0.17,-0.0818,-2.5097,-1.4429,-0.8825
-0.9111,-0.5983,-1.3297,0.5678,2.5338
0.0865,2.3449,-1.9526,0.16,0.4645
0.7475,-0.5134,-0.598,0.5344,0.0727
-2.298,-0.8431,0.2371,-0.7896,-1.7017
3.008,-0.271,0.4868,0.4959,0.1369
0.376,1.0972,-1.4817,0.1465,0.8261
-0.2943,-1.9401,-0.4638,1.8092,0.9328
0.131,-0.8266,-1.4767,-0.5936,-2.0493
-0.1,0.265,0.4371,1.1967,1.8712
0.8886,0.945,-0.1471,-0.1363,-0.9092
0.1406,-0.5044,-1.3068,1.441,-3.8205
1.896,1.0309,1.1718,2.3715,1.6846
-2.3731,0.3547,-2.5275,0.3097,-1.4761
-0.5936,-1.5261,-1.0773,1.417,1.3027
-2.4798,-1.5857,-0.6344,-2.1682,-0.002
0.7588,0.0225,1.2982,0.01,1.1708
-0.0718,1.9237,1.3538,1.4318,1.4835
1.1017,-0.5897,-0.3399,1.2663,1.6784
-0.7308,0.6094,-0.7773,0.2373,1.013
1.0155,-0.2549,1.2958,0.6724,0.484
-0.4901,0.92,0.4208,0.2325,1.6677
0.6138,1.4609,0.3375,-0.8655,-1.2248
0.3232,-0.2704,2.8568,-0.7418,1.2925
1.1547,0.2841,0.3959,-0.2621,1.2498
-0.8148,-0.1754,-0.6326,-2.8309,-3.0651
-2.6977,-1.9161,-1.1292,-1.4923,0.3646
-3.1057,-0.2471,0.3585,-1.0263,-0.1043
0.666,0.368,0.0196,-1.1868,0.2599
-1.0735,-1.3328,-0.9537,-0.2594,-1.2733
-0.0316,2.3285,1.872,0.1398,3.1739
-0.495,-0.245,-2.0064,-1.315,-1.4454
-1.1888,-1.0905,1.0745,1.2094,1.4798
-2.7048,-0.9399,-1.1409,-1.3737,-1.2151
1.2275,2.3317,-1.3622,-0.9929,-1.5922
-2.659,-1.18,-1.6486,-0.2288,0.4164
-0.5639,2.0618,-1.9634,0.1514,1.6458
-1.8483,-0.4639,0.6209,-0.0183,2.4059
-0.4303,-0.1728,-0.3347,-0.3546,-0.7524
1.9564,-0.6527,0.4776,1.3519,-0.9619
-1.5531,-1.2717,1.4032,0.9843,0.3788
2.0049,-0.6503,0.0042,-0.3649,1.1627
-0.1315,0.5443,0.5422,0.8582,0.4374
0.5894,-0.2894,0.8457,0.641,0.3239
1.7067,-0.4797,-0.2498,1.1692,0.5081
-3.2533,1.3689,1.0815,1.6946,0.8739
2.8036,0.5355,0.0828,-0.7673,-1.0338
-1.0385,-0.6787,0.8265,-1.7571,-3.1357
-2.1853,0.2404,-0.5056,-1.7177,0.6123
2.2815,0.5445,1.2507,0.6492,-0.6182
0.247,0.1745,0.8681,-1.4099,1.3582
0.1303,0.9697,0.6633,0.3373,-0.5746
-0.6143,-0.3428,1.3671,-1.5012,-2.0953
3.2129,0.5585,0.0043,0.9622,-1.0555
-1.3977,-0.1699,-2.4553,-1.2764,-1.0301
-1.1966,-0.6408,-1.0887,-1.4875,-0.4743
-1.7013,0.2085,0.2438,-1.2822,-1.4098
-0.6957,-1.055,-0.6753,-0.3784,-1.9997
1.7702,1.1211,-0.6032,-0.6982,0.4066

View File

@ -0,0 +1,101 @@
f1,f2,f1b,f2b,f1o,f2o,f1ob,f2ob
.77409906,.5265066,1.2342164,1.5539443,.82174469,.64430356,1.3798437,1.7324318
.90211532,.5778448,1.4354716,1.691956,.95415644,.71558883,1.5935961,1.9010544
-.55152949,.10318112,-1.0868707,.62321661,-.53884179,.01300941,-1.0209297,.44003871
-.11934901,-.53549872,.01129338,-1.8973372,-.17099081,-.54772966,-.17374833,-1.8707504
2.3941179,-.26282474,4.6079809,-1.9774392,2.3570865,.12618302,4.3932292,-1.2095023
1.0927031,.30140322,1.9283693,.6165865,1.1168835,.47345041,1.9792983,.91910478
-.13079791,-.02496757,-.22584839,-.04664732,-.13260905,-.04570687,-.22932042,-.08241165
-.33812166,-.74795931,-.33393451,-2.5509963,-.40943578,-.7926505,-.58106234,-2.5714763
-.04786263,.01681279,-.0859222,.0515025,-.04599537,.00888495,-.08049141,.03699227
1.0480495,-.81266539,2.3144765,-3.3816091,.96382241,-.63326664,1.9737471,-2.964715
1.3937318,-.33515776,2.7734803,-1.8141489,1.3544142,-.10631967,2.5833894,-1.3437932
-.60405968,-.29122039,-.99811882,-.79446532,-.62957534,-.38470365,-1.0708228,-.94484311
.1768074,.51456466,.11159002,1.777466,.22613441,.53632266,.28435898,1.7722347
-.96740945,-.723384,-1.5150277,-2.1864758,-1.0333294,-.86974388,-1.7209881,-2.4019313
.20389441,.14744239,.3268983,.44172513,.21729843,.17835522,.36840847,.48860637
-.92590203,-.31839571,-1.6216858,-.74416985,-.95253387,-.46335749,-1.6865151,-.99563074
-1.467613,.3140052,-2.8897371,1.7524627,-1.4300057,.07354455,-2.7051065,1.2641888
-1.1479964,1.2582306,-2.7173854,5.041731,-1.0198511,1.0569188,-2.2128766,4.5382765
-.17961277,-.49664318,-.12436604,-1.7124124,-.22717909,-.51908693,-.29073149,-1.7100879
-.03173505,.28265502,-.17368455,1.0251028,-.00402536,.27385425,-.07291087,.98374888
-.73229133,.02886435,-1.397402,.41212285,-.7259882,-.08944924,-1.3505629,.18168897
-.24941449,.1269423,-.52647655,.57374862,-.23584948,.08511655,-.46802849,.48146893
-.10732968,.18432286,-.29391519,.7210821,-.08884709,.16463107,-.22221031,.66433352
.39880577,-.95391695,1.1482693,-3.6091908,.30390006,-.87723612,.79090718,-3.3771456
.35872658,-.13977614,.72766572,-.67702616,.34338949,-.08017796,.6581897,-.55099642
-.1661692,.91000156,-.70379606,3.3628839,-.07665355,.87136053,-.37256614,3.2056372
.55913446,.40910597,.87726935,1.2383401,.59635784,.49381506,.99382627,1.3634605
.4792189,.40196303,.70586224,1.2632806,.51612661,.47389483,.82566748,1.36047
-.73646072,-.63906787,-1.1203902,-1.9917138,-.79526024,-.74933373,-1.3092417,-2.1461547
.4795629,.79207339,.55137636,2.6632818,.55450422,.85896807,.80841591,2.7173154
1.1127571,.59389352,1.8580276,1.6695604,1.1653594,.76535228,2.0119553,1.9470045
.08956588,.73845254,-.12614275,2.6149467,.16113731,.7432375,.12941213,2.5604956
-.52911256,-.1793241,-.91998487,-.4085011,-.54407555,-.26219769,-.95543007,-.55133367
.51942396,.61292952,.71244766,1.9736557,.57670908,.68858245,.90148216,2.0626324
.8132702,.16897349,1.4862315,.24911189,.82587019,.29774627,1.5034386,.48522013
-.12857721,-.24986049,-.11127733,-.84082875,-.1523257,-.26730639,-.19272687,-.84777395
.08574189,-.93312063,.57059076,-3.4152487,-.00564465,-.90713076,.23488998,-3.2787713
-.10989362,.20354792,-.27660743,.79824474,-.0895244,.18319224,-.19746176,.74327632
1.3642833,-.48395053,2.7669022,-2.3267277,1.3105988,-.25791284,2.5268668,-1.8507401
-1.1672069,.77791884,-2.5579395,3.3272367,-1.0857999,.57978316,-2.2213513,2.8718422
-.55165952,-.01582468,-1.0412775,.18814338,-.55057411,-.10446383,-1.0179728,.0179878
-1.1024776,-.06132851,-2.0295797,.2446312,-1.1032044,-.23808385,-1.9960589,-.08542956
.51121121,-.32752081,1.1344589,-1.4159784,.47684277,-.24091389,.99099791,-1.2147876
-.2827674,.75734733,-.8347537,2.8623962,-.20757982,.70192071,-.55169675,2.6905919
.09876273,-.01059937,.19723165,-.08903476,.09725877,.00544491,.1876112,-.056108
-.6421798,-.78126,-.88585867,-2.5329525,-.71529206,-.87448557,-1.1285977,-2.6425563
.40709836,.55585896,.51952852,1.8181175,.45935433,.61416665,.69431739,1.8780547
-.05459569,.44056716,-.27210725,1.6111778,-.01138085,.42602325,-.1137231,1.5463221
.67929573,-.6847309,1.5879431,-2.7776557,.60929897,-.56639064,1.3095598,-2.4856551
-.92607077,-.27752489,-1.6212673,-.60797893,-.94871697,-.42304737,-1.6728202,-.86115026
.87850547,.09109933,1.6219634,-.05027553,.88320204,.23139495,1.609334,.21160078
-.26970441,.46623015,-.71459682,1.8042973,-.22296261,.4167076,-.5352756,1.6656569
-.79989201,-.26861592,-1.3954051,-.61625155,-.82227074,-.39393339,-1.4488407,-.83293939
1.3522459,.72936579,2.2300877,2.0517907,1.4169156,.93762617,2.4195097,2.384166
-.88674943,-.45168537,-1.4528825,-1.2385071,-.92656338,-.58860158,-1.5667133,-1.4563288
.5197045,-.60245723,1.2605819,-2.4123386,.45848966,-.51089342,1.0193762,-2.1778289
-.74788779,-.97828004,-.98283317,-3.2156233,-.83970562,-1.0859582,-1.2916697,-3.3319334
.60152168,.20957638,1.0322015,.49464574,.61908925,.30371668,1.0755111,.65442646
.98432058,.60450513,1.5874137,1.7740498,1.0385694,.75514043,1.7528183,2.006547
.76389179,.07029355,1.4222455,-.08934065,.76710587,.19240203,1.4067588,.14088071
.12791207,.05698825,.23093481,.15178897,.13285894,.07684478,.24463378,.18699996
.62420468,.14365152,1.1006233,.24788906,.63523658,.24230553,1.1195484,.42191039
.53308959,.22886039,.90944184,.60071557,.55286335,.31172784,.96367793,.739341
-.47080363,.57870828,-1.1505061,2.3032231,-.41213726,.49533006,-.92046348,2.0878657
.73660289,.05121274,1.3362788,-.13133352,.73808663,.16917538,1.3171075,.08559093
.3750162,.38087227,.54184582,1.2062216,.41036404,.43629736,.65686931,1.2777407
-1.6628303,-.25019609,-3.0399027,-.18082388,-1.6793018,-.51473173,-3.0430496,-.66804499
-.57518971,-1.1354505,-.58045463,-3.8604945,-.68315424,-1.2132636,-.95408231,-3.9035827
-.33048237,-.71724484,-.31867622,-2.4346249,-.39883828,-.76110666,-.55453068,-2.4541665
-.21711319,.29449836,-.53874819,1.1527255,-.18736559,.25568751,-.42379218,1.0509114
-.57134442,-.67766186,-.78074037,-2.2034566,-.63469348,-.76083165,-.9918548,-2.3004322
1.1568365,.81787262,1.8212026,2.4652365,1.2310664,.99350661,2.0528831,2.7263631
-1.1158728,-.19839769,-2.0042252,-.23966229,-1.1298999,-.37552105,-2.0180431,-.55931767
.90698836,-.57501394,1.9541201,-2.467368,.84660404,-.42143564,1.7042449,-2.1204445
-.96822116,-.87756275,-1.4404227,-2.7473047,-1.0491694,-1.0220407,-1.7014187,-2.9434239
-.93949139,.96862152,-2.1843695,3.9104251,-.840576,.80467039,-1.7927011,3.5075818
-.34872923,-.92561876,-.23562044,-3.1952854,-.43731439,-.96969915,-.54603402,-3.1915211
-.01008118,.56239284,-.22983374,2.0376015,.04479942,.55342776,-.03007525,1.9739874
.6753194,-.46834193,1.4803838,-1.98315,.62643921,-.35346682,1.2799762,-1.7188435
-.34822558,-.15758844,-.58978344,-.41711157,-.36193118,-.2116136,-.62764134,-.50665223
.32773507,.16164562,.53650026,.43747533,.34193386,.21231779,.57659746,.51816881
.62853751,-.73379458,1.4879734,-2.9151351,.55399893,-.62298855,1.1966623,-2.6374401
.30674611,.26827549,.46503444,.81759821,.33144117,.31417541,.54253362,.88181986
.44192818,.14021085,.7685949,.32232893,.45349306,.20955383,.79635969,.44190477
.46887837,.03616074,.85905645,-.07268136,.47017009,.11120237,.84787727,.06661995
.48475223,.20430528,.83041791,.51731057,.5023622,.27970846,.87689861,.64429782
.73545423,-.27367592,1.5001756,-1.2762381,.70526722,-.15165716,1.3685967,-1.0179721
-.33495813,.75918345,-.96995976,2.8784489,-.25934288,.69532746,-.68469352,2.6846599
-1.0941765,-.47592994,-1.8866643,-1.2349393,-1.135366,-.64593615,-1.9980805,-1.5226689
-.48718385,-.3357568,-.76630583,-.99841793,-.51759859,-.40983563,-.85999933,-1.1087994
.3617318,.6347137,.38779162,2.1372949,.42189215,.68468566,.5943275,2.1718491
.15777211,.18310759,.21152508,.5882512,.17487318,.20612677,.26787102,.61463863
.07296441,.31367206,-.00945973,1.1123917,.1031994,.32132842,.09904208,1.096347
-.64908777,-.25472472,-1.1440559,-.62922708,-.67083062,-.35593625,-1.199954,-.80526531
.12122439,.81412903,-.13059422,2.8820144,.20002335,.82302476,.15102062,2.82336
-1.1305481,-.35912583,-1.9541954,-.81215474,-1.160176,-.53651452,-2.0240689,-1.1162794
-.75096061,-.43799405,-1.2186468,-1.2618644,-.79008663,-.55321997,-1.3358709,-1.4416571
-.70483427,-.304975,-1.2091407,-.78438434,-.73121086,-.41450863,-1.2798564,-.96887917
-.7211226,-.52612824,-1.1360481,-1.5881501,-.76898374,-.63539819,-1.2854782,-1.7503809
-.15257555,.75718097,-.60854757,2.7933029,-.07802447,.72272415,-.33330485,2.6588313
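Judging from test_factor_scoring in the factor tests below, the column pairs here are regression scores (f1, f2), Bartlett scores (f1b, f2b), and their counterparts after an oblique rotation (f1o, f2o, f1ob, f2ob). A minimal sketch of the two classical scoring formulas, assuming standardized data z, loadings load and uniquenesses uniq (textbook versions, not necessarily byte-identical to the statsmodels internals):

import numpy as np

def score_factors(z, load, uniq, method="bartlett"):
    """Classical factor-score estimates for standardized data z."""
    if method == "regression":
        # Thomson/regression scores: z R^{-1} L with R = L L' + diag(u)
        corr = load @ load.T + np.diag(uniq)
        return z @ np.linalg.solve(corr, load)
    # Bartlett (weighted least squares) scores: z P L (L' P L)^{-1}, P = Psi^{-1}
    w = load / uniq[:, None]
    return z @ w @ np.linalg.inv(load.T @ w)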

View File

@ -0,0 +1,108 @@
import pandas as pd
from ..cancorr import CanCorr
from numpy.testing import assert_almost_equal
data_fit = pd.DataFrame([[191, 36, 50, 5, 162, 60],
[189, 37, 52, 2, 110, 60],
[193, 38, 58, 12, 101, 101],
[162, 35, 62, 12, 105, 37],
[189, 35, 46, 13, 155, 58],
[182, 36, 56, 4, 101, 42],
[211, 38, 56, 8, 101, 38],
[167, 34, 60, 6, 125, 40],
[176, 31, 74, 15, 200, 40],
[154, 33, 56, 17, 251, 250],
[169, 34, 50, 17, 120, 38],
[166, 33, 52, 13, 210, 115],
[154, 34, 64, 14, 215, 105],
[247, 46, 50, 1, 50, 50],
[193, 36, 46, 6, 70, 31],
[202, 37, 62, 12, 210, 120],
[176, 37, 54, 4, 60, 25],
[157, 32, 52, 11, 230, 80],
[156, 33, 54, 15, 225, 73],
[138, 33, 68, 2, 110, 43]])
def test_cancorr():
# Compare results to SAS example:
# https://support.sas.com/documentation/cdl/en/statug/63347/HTML/default/
# viewer.htm#statug_cancorr_sect020.htm
X1 = data_fit.iloc[:, :3]
Y1 = data_fit.iloc[:, 3:]
mod = CanCorr(Y1, X1)
r = mod.corr_test()
assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Value'],
0.35039053, decimal=8)
assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Value'],
0.67848151, decimal=8)
assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace", 'Value'],
1.77194146, decimal=8)
assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Value'],
1.72473874, decimal=8)
assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'F Value'],
2.05, decimal=2)
assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'F Value'],
1.56, decimal=2)
assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
'F Value'],
2.64, decimal=2)
assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'F Value'],
9.20, decimal=2)
assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Num DF'],
9, decimal=3)
assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Num DF'],
9, decimal=3)
assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
'Num DF'],
9, decimal=3)
assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Num DF'],
3, decimal=3)
assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Den DF'],
34.223, decimal=3)
assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Den DF'],
48, decimal=3)
assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
'Den DF'],
19.053, decimal=3)
assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Den DF'],
16, decimal=3)
assert_almost_equal(r.stats_mv.loc["Wilks' lambda", 'Pr > F'],
0.0635, decimal=4)
assert_almost_equal(r.stats_mv.loc["Pillai's trace", 'Pr > F'],
0.1551, decimal=4)
assert_almost_equal(r.stats_mv.loc["Hotelling-Lawley trace",
'Pr > F'],
0.0357, decimal=4)
assert_almost_equal(r.stats_mv.loc["Roy's greatest root", 'Pr > F'],
0.0009, decimal=4)
assert_almost_equal(r.stats.loc[0, "Wilks' lambda"],
0.35039053, decimal=8)
assert_almost_equal(r.stats.loc[1, "Wilks' lambda"],
0.95472266, decimal=8)
assert_almost_equal(r.stats.loc[2, "Wilks' lambda"],
0.99473355, decimal=8)
assert_almost_equal(r.stats.loc[0, 'F Value'],
2.05, decimal=2)
assert_almost_equal(r.stats.loc[1, 'F Value'],
0.18, decimal=2)
assert_almost_equal(r.stats.loc[2, 'F Value'],
0.08, decimal=2)
assert_almost_equal(r.stats.loc[0, 'Num DF'],
9, decimal=2)
assert_almost_equal(r.stats.loc[1, 'Num DF'],
4, decimal=2)
assert_almost_equal(r.stats.loc[2, 'Num DF'],
1, decimal=2)
assert_almost_equal(r.stats.loc[0, 'Den DF'],
34.223, decimal=3)
assert_almost_equal(r.stats.loc[1, 'Den DF'],
30, decimal=2)
assert_almost_equal(r.stats.loc[2, 'Den DF'],
16, decimal=2)
assert_almost_equal(r.stats.loc[0, 'Pr > F'],
0.0635, decimal=4)
assert_almost_equal(r.stats.loc[1, 'Pr > F'],
0.9491, decimal=4)
assert_almost_equal(r.stats.loc[2, 'Pr > F'],
0.7748, decimal=4)
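The assertions above exercise only the test statistics; the canonical correlations themselves can be cross-checked independently with the textbook eigenvalue formulation, in which the squared canonical correlations are the eigenvalues of Syy^{-1} Syx Sxx^{-1} Sxy. A minimal sketch, assuming centered data:

import numpy as np

def cancorr_direct(y, x):
    # Squared canonical correlations as eigenvalues of the
    # cross-product form Syy^{-1} Syx Sxx^{-1} Sxy.
    y = np.asarray(y, dtype=float)
    x = np.asarray(x, dtype=float)
    y = y - y.mean(0)
    x = x - x.mean(0)
    syx = y.T @ x
    m = np.linalg.solve(y.T @ y, syx) @ np.linalg.solve(x.T @ x, syx.T)
    ev = np.sort(np.linalg.eigvals(m).real)[::-1]
    return np.sqrt(np.clip(ev, 0.0, 1.0))

# e.g., with the data above (dimensions match since k_y = k_x = 3):
# assert_almost_equal(cancorr_direct(Y1, X1), CanCorr(Y1, X1).cancorr)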

View File

@ -0,0 +1,314 @@
import warnings
from statsmodels.compat.pandas import PD_LT_1_4
import os
import numpy as np
import pandas as pd
from statsmodels.multivariate.factor import Factor
from numpy.testing import (assert_equal, assert_array_almost_equal, assert_,
assert_raises, assert_array_equal,
assert_array_less, assert_allclose)
import pytest
try:
import matplotlib.pyplot as plt
missing_matplotlib = False
plt.switch_backend('Agg')
except ImportError:
missing_matplotlib = True
# Example data
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
# viewer.htm#statug_introreg_sect012.htm
X = pd.DataFrame([['Minas Graes', 2.068, 2.070, 1.580, 1, 0],
['Minas Graes', 2.068, 2.074, 1.602, 2, 1],
['Minas Graes', 2.090, 2.090, 1.613, 3, 0],
['Minas Graes', 2.097, 2.093, 1.613, 4, 1],
['Minas Graes', 2.117, 2.125, 1.663, 5, 0],
['Minas Graes', 2.140, 2.146, 1.681, 6, 1],
['Matto Grosso', 2.045, 2.054, 1.580, 7, 0],
['Matto Grosso', 2.076, 2.088, 1.602, 8, 1],
['Matto Grosso', 2.090, 2.093, 1.643, 9, 0],
['Matto Grosso', 2.111, 2.114, 1.643, 10, 1],
['Santa Cruz', 2.093, 2.098, 1.653, 11, 0],
['Santa Cruz', 2.100, 2.106, 1.623, 12, 1],
['Santa Cruz', 2.104, 2.101, 1.653, 13, 0]],
columns=['Loc', 'Basal', 'Occ', 'Max', 'id', 'alt'])
def test_auto_col_name():
# Test auto-generated variable names when endog_names is None
mod = Factor(None, 2, corr=np.eye(11), endog_names=None,
smc=False)
assert_array_equal(mod.endog_names,
['var00', 'var01', 'var02', 'var03', 'var04', 'var05',
'var06', 'var07', 'var08', 'var09', 'var10'])
def test_direct_corr_matrix():
# Test specifying the correlation matrix directly
mod = Factor(None, 2, corr=np.corrcoef(X.iloc[:, 1:-1], rowvar=0),
smc=False)
results = mod.fit(tol=1e-10)
a = np.array([[0.965392158864, 0.225880658666255],
[0.967587154301, 0.212758741910989],
[0.929891035996, -0.000603217967568],
[0.486822656362, -0.869649573289374]])
assert_array_almost_equal(results.loadings, a, decimal=8)
# Test set and get endog_names
mod.endog_names = X.iloc[:, 1:-1].columns
assert_array_equal(mod.endog_names, ['Basal', 'Occ', 'Max', 'id'])
# Test set endog_names with the wrong number of elements
assert_raises(ValueError, setattr, mod, 'endog_names',
X.iloc[:, :1].columns)
def test_unknown_fa_method_error():
# Test that an error is raised if an unknown FA method is specified in fa.method
mod = Factor(X.iloc[:, 1:-1], 2, method='ab')
assert_raises(ValueError, mod.fit)
def test_example_compare_to_R_output():
# Testing basic functions and compare to R output
# R code for producing the results:
# library(psych)
# library(GPArotation)
# Basal = c(2.068, 2.068, 2.09, 2.097, 2.117, 2.14, 2.045, 2.076, 2.09, 2.111, 2.093, 2.1, 2.104)
# Occ = c(2.07, 2.074, 2.09, 2.093, 2.125, 2.146, 2.054, 2.088, 2.093, 2.114, 2.098, 2.106, 2.101)
# Max = c(1.58, 1.602, 1.613, 1.613, 1.663, 1.681, 1.58, 1.602, 1.643, 1.643, 1.653, 1.623, 1.653)
# id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)
# Y <- cbind(Basal, Occ, Max, id)
# a <- fa(Y, nfactors=2, fm="pa", rotate="none", SMC=FALSE, min.err=1e-10)
# b <- cbind(a$loadings[,1], -a$loadings[,2])
# b
# a <- fa(Y, nfactors=2, fm="pa", rotate="Promax", SMC=TRUE, min.err=1e-10)
# b <- cbind(a$loadings[,1], a$loadings[,2])
# b
# a <- fa(Y, nfactors=2, fm="pa", rotate="Varimax", SMC=TRUE, min.err=1e-10)
# b <- cbind(a$loadings[,1], a$loadings[,2])
# b
# a <- fa(Y, nfactors=2, fm="pa", rotate="quartimax", SMC=TRUE, min.err=1e-10)
# b <- cbind(a$loadings[,1], -a$loadings[,2])
# b
# a <- fa(Y, nfactors=2, fm="pa", rotate="oblimin", SMC=TRUE, min.err=1e-10)
# b <- cbind(a$loadings[,1], a$loadings[,2])
# b
# No rotation, without a squared multiple correlations (SMC) prior;
# produces the same results as R `fa`
mod = Factor(X.iloc[:, 1:-1], 2, smc=False)
results = mod.fit(tol=1e-10)
a = np.array([[0.965392158864, 0.225880658666255],
[0.967587154301, 0.212758741910989],
[0.929891035996, -0.000603217967568],
[0.486822656362, -0.869649573289374]])
assert_array_almost_equal(results.loadings, a, decimal=8)
# No rotation, WITH a squared multiple correlations (SMC) prior;
# produces the same results as R `fa`
mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
results = mod.fit()
a = np.array([[0.97541115, 0.20280987],
[0.97113975, 0.17207499],
[0.9618705, -0.2004196],
[0.37570708, -0.45821379]])
assert_array_almost_equal(results.loadings, a, decimal=8)
# Same as R GPArotation
results.rotate('varimax')
a = np.array([[0.98828898, -0.12587155],
[0.97424206, -0.15354033],
[0.84418097, -0.502714],
[0.20601929, -0.55558235]])
assert_array_almost_equal(results.loadings, a, decimal=8)
results.rotate('quartimax') # Same as R fa
a = np.array([[0.98935598, 0.98242714, 0.94078972, 0.33442284],
[0.117190049, 0.086943252, -0.283332952, -0.489159543]])
assert_array_almost_equal(results.loadings, a.T, decimal=8)
results.rotate('equamax') # Not the same as R fa
results.rotate('promax') # Not the same as R fa
results.rotate('biquartimin') # Not the same as R fa
results.rotate('oblimin') # Same as R fa
a = np.array([[1.02834170170, 1.00178840104, 0.71824931384,
-0.00013510048],
[0.06563421, 0.03096076, -0.39658839, -0.59261944]])
assert_array_almost_equal(results.loadings, a.T, decimal=8)
# Testing result summary string
results.rotate('varimax')
desired = (
""" Factor analysis results
=============================
Eigenvalues
-----------------------------
Basal Occ Max id
-----------------------------
2.9609 0.3209 0.0000 -0.0000
-----------------------------
-----------------------------
Communality
-----------------------------
Basal Occ Max id
-----------------------------
0.9926 0.9727 0.9654 0.3511
-----------------------------
-----------------------------
Pre-rotated loadings
-----------------------------------
factor 0 factor 1
-----------------------------------
Basal 0.9754 0.2028
Occ 0.9711 0.1721
Max 0.9619 -0.2004
id 0.3757 -0.4582
-----------------------------
-----------------------------
varimax rotated loadings
-----------------------------------
factor 0 factor 1
-----------------------------------
Basal 0.9883 -0.1259
Occ 0.9742 -0.1535
Max 0.8442 -0.5027
id 0.2060 -0.5556
=============================
""")
actual = results.summary().as_text()
actual = "\n".join(line.rstrip() for line in actual.splitlines()) + "\n"
assert_equal(actual, desired)
@pytest.mark.skipif(missing_matplotlib, reason='matplotlib not available')
def test_plots(close_figures):
mod = Factor(X.iloc[:, 1:], 3)
results = mod.fit()
results.rotate('oblimin')
fig = results.plot_scree()
fig_loadings = results.plot_loadings()
assert_equal(3, len(fig_loadings))
@pytest.mark.smoke
def test_getframe_smoke():
# mostly smoke tests for now
mod = Factor(X.iloc[:, 1:-1], 2, smc=True)
res = mod.fit()
df = res.get_loadings_frame(style='raw')
assert_(isinstance(df, pd.DataFrame))
lds = res.get_loadings_frame(style='strings', decimals=3, threshold=0.3)
# The Styler option requires jinja2; skip if not available
try:
from jinja2 import Template # noqa:F401
except ImportError:
return
# TODO: separate this and do pytest.skip?
# Old implementation that warns
if PD_LT_1_4:
with warnings.catch_warnings():
warnings.simplefilter("always")
lds.to_latex()
else:
# Smoke test using new style to_latex
lds.style.to_latex()
try:
from pandas.io import formats as pd_formats
except ImportError:
from pandas import formats as pd_formats
ldf = res.get_loadings_frame(style='display')
assert_(isinstance(ldf, pd_formats.style.Styler))
assert_(isinstance(ldf.data, pd.DataFrame))
res.get_loadings_frame(style='display', decimals=3, threshold=0.2)
res.get_loadings_frame(style='display', decimals=3, color_max='GAINSBORO')
res.get_loadings_frame(style='display', decimals=3, threshold=0.45, highlight_max=False, sort_=False)
def test_factor_missing():
xm = X.iloc[:, 1:-1].copy()
nobs, k_endog = xm.shape
xm.iloc[2,2] = np.nan
mod = Factor(xm, 2)
assert_equal(mod.nobs, nobs - 1)
assert_equal(mod.k_endog, k_endog)
assert_equal(mod.endog.shape, (nobs - 1, k_endog))
def _zscore(x):
# helper function
return (x - x.mean(0)) / x.std(0)
@pytest.mark.smoke
def test_factor_scoring():
path = os.path.abspath(__file__)
dir_path = os.path.dirname(path)
csv_path = os.path.join(dir_path, 'results', 'factor_data.csv')
y = pd.read_csv(csv_path)
csv_path = os.path.join(dir_path, 'results', 'factors_stata.csv')
f_s = pd.read_csv(csv_path)
# mostly smoke tests for now
mod = Factor(y, 2)
res = mod.fit(maxiter=1)
res.rotate('varimax')
f_reg = res.factor_scoring(method='reg')
assert_allclose(f_reg * [1, -1], f_s[["f1", 'f2']].values,
atol=1e-4, rtol=1e-3)
f_bart = res.factor_scoring()
assert_allclose(f_bart * [1, -1], f_s[["f1b", 'f2b']].values,
atol=1e-4, rtol=1e-3)
# check we have high correlation to ols and gls
f_ols = res.factor_scoring(method='ols')
f_gls = res.factor_scoring(method='gls')
f_reg_z = _zscore(f_reg)
f_ols_z = _zscore(f_ols)
f_gls_z = _zscore(f_gls)
assert_array_less(0.98, (f_ols_z * f_reg_z).mean(0))
assert_array_less(0.999, (f_gls_z * f_reg_z).mean(0))
# with oblique rotation
res.rotate('oblimin')
# Note: Stata has second factor with flipped sign compared to statsmodels
assert_allclose(res._corr_factors()[0, 1], (-1) * 0.25651037, rtol=1e-3)
f_reg = res.factor_scoring(method='reg')
assert_allclose(f_reg * [1, -1], f_s[["f1o", 'f2o']].values,
atol=1e-4, rtol=1e-3)
f_bart = res.factor_scoring()
assert_allclose(f_bart * [1, -1], f_s[["f1ob", 'f2ob']].values,
atol=1e-4, rtol=1e-3)
# check we have high correlation to ols and gls
f_ols = res.factor_scoring(method='ols')
f_gls = res.factor_scoring(method='gls')
f_reg_z = _zscore(f_reg)
f_ols_z = _zscore(f_ols)
f_gls_z = _zscore(f_gls)
assert_array_less(0.97, (f_ols_z * f_reg_z).mean(0))
assert_array_less(0.999, (f_gls_z * f_reg_z).mean(0))
# check provided endog
f_ols2 = res.factor_scoring(method='ols', endog=res.model.endog)
assert_allclose(f_ols2, f_ols, rtol=1e-13)
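For reference, the scoring API exercised above reduces to a short sequence; a minimal usage sketch on synthetic data (maxiter=1 mirrors the smoke test, and the data and column names are arbitrary):

import numpy as np
import pandas as pd
from statsmodels.multivariate.factor import Factor

rng = np.random.default_rng(1234)
y = pd.DataFrame(rng.standard_normal((100, 5)),
                 columns=[f"var{i}" for i in range(1, 6)])
res = Factor(y, 2).fit(maxiter=1)          # one principal-axis iteration
res.rotate("varimax")                      # in-place rotation of the results
f_reg = res.factor_scoring(method="reg")   # regression scores, shape (100, 2)
f_bart = res.factor_scoring()              # Bartlett scores (the default)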

View File

@ -0,0 +1,197 @@
import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_almost_equal, assert_raises, assert_allclose
from statsmodels.multivariate.manova import MANOVA
from statsmodels.multivariate.multivariate_ols import MultivariateTestResults
from statsmodels.tools import add_constant
# Example data
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
# viewer.htm#statug_introreg_sect012.htm
X = pd.DataFrame([['Minas Graes', 2.068, 2.070, 1.580],
['Minas Graes', 2.068, 2.074, 1.602],
['Minas Graes', 2.090, 2.090, 1.613],
['Minas Graes', 2.097, 2.093, 1.613],
['Minas Graes', 2.117, 2.125, 1.663],
['Minas Graes', 2.140, 2.146, 1.681],
['Matto Grosso', 2.045, 2.054, 1.580],
['Matto Grosso', 2.076, 2.088, 1.602],
['Matto Grosso', 2.090, 2.093, 1.643],
['Matto Grosso', 2.111, 2.114, 1.643],
['Santa Cruz', 2.093, 2.098, 1.653],
['Santa Cruz', 2.100, 2.106, 1.623],
['Santa Cruz', 2.104, 2.101, 1.653]],
columns=['Loc', 'Basal', 'Occ', 'Max'])
def test_manova_sas_example():
# Results should be the same as figure 4.5 of
# https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
# viewer.htm#statug_introreg_sect012.htm
mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
r = mod.mv_test()
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Value'],
0.60143661, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Value'],
0.44702843, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Value'],
0.58210348, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Value'],
0.35530890, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'F Value'],
0.77, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'F Value'],
0.86, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'F Value'],
0.75, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'F Value'],
1.07, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Num DF'],
6, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Num DF'],
6, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Num DF'],
6, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Num DF'],
3, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Den DF'],
16, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Den DF'],
18, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Den DF'],
9.0909, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Den DF'],
9, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Pr > F'],
0.6032, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Pr > F'],
0.5397, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Pr > F'],
0.6272, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Pr > F'],
0.4109, decimal=4)
def test_manova_no_formula():
# Same as the previous test, only skipping the formula interface
exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True,
dtype=float))
endog = X[['Basal', 'Occ', 'Max']]
mod = MANOVA(endog, exog)
intercept = np.zeros((1, 3))
intercept[0, 0] = 1
loc = np.zeros((2, 3))
loc[0, 1] = loc[1, 2] = 1
hypotheses = [('Intercept', intercept), ('Loc', loc)]
r = mod.mv_test(hypotheses)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Value'],
0.60143661, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Value'],
0.44702843, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
'Value'],
0.58210348, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Value'],
0.35530890, decimal=8)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'F Value'],
0.77, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'F Value'],
0.86, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
'F Value'],
0.75, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'F Value'],
1.07, decimal=2)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Num DF'],
6, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Num DF'],
6, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
'Num DF'],
6, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Num DF'],
3, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Den DF'],
16, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Den DF'],
18, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
'Den DF'],
9.0909, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Den DF'],
9, decimal=3)
assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Pr > F'],
0.6032, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Pr > F'],
0.5397, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace",
'Pr > F'],
0.6272, decimal=4)
assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Pr > F'],
0.4109, decimal=4)
@pytest.mark.smoke
def test_manova_no_formula_no_hypothesis():
# Same as the previous test, only skipping the formula interface
exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True,
dtype=float))
endog = X[['Basal', 'Occ', 'Max']]
mod = MANOVA(endog, exog)
r = mod.mv_test()
assert isinstance(r, MultivariateTestResults)
def test_manova_test_input_validation():
mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
hypothesis = [('test', np.array([[1, 1, 1]]), None)]
mod.mv_test(hypothesis)
hypothesis = [('test', np.array([[1, 1]]), None)]
assert_raises(ValueError, mod.mv_test, hypothesis)
"""
assert_raises_regex(ValueError,
('Contrast matrix L should have the same number of '
'columns as exog! 2 != 3'),
mod.mv_test, hypothesis)
"""
hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
mod.mv_test(hypothesis)
hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
assert_raises(ValueError, mod.mv_test, hypothesis)
"""
assert_raises_regex(ValueError,
('Transform matrix M should have the same number of '
'rows as the number of columns of endog! 2 != 3'),
mod.mv_test, hypothesis)
"""
def test_endog_1D_array():
assert_raises(ValueError, MANOVA.from_formula, 'Basal ~ Loc', X)
def test_manova_demeaned():
# see last example in #8713
# If a term has no effect and all of its eigenvalues fall below the
# threshold, the computation used to raise a numpy exception on empty arrays.
# Currently there is an option to skip the intercept test, but empty arrays
# are not handled directly.
ng = 5
loc = ["Basal", "Occ", "Max"] * ng
y1 = (np.random.randn(ng, 3) + [0, 0.5, 1]).ravel()
y2 = (np.random.randn(ng, 3) + [0.25, 0.75, 1]).ravel()
y3 = (np.random.randn(ng, 3) + [0.3, 0.6, 1]).ravel()
dta = pd.DataFrame(dict(Loc=loc, Basal=y1, Occ=y2, Max=y3))
mod = MANOVA.from_formula('Basal + Occ + Max ~ C(Loc, Helmert)', data=dta)
res1 = mod.mv_test()
# subtract sample means to have insignificant intercept
means = dta[["Basal", "Occ", "Max"]].mean()
dta[["Basal", "Occ", "Max"]] = dta[["Basal", "Occ", "Max"]] - means
mod = MANOVA.from_formula('Basal + Occ + Max ~ C(Loc, Helmert)', data=dta)
res2 = mod.mv_test(skip_intercept_test=True)
stat1 = res1.results["C(Loc, Helmert)"]["stat"].to_numpy(float)
stat2 = res2.results["C(Loc, Helmert)"]["stat"].to_numpy(float)
assert_allclose(stat1, stat2, rtol=1e-10)
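For the array interface used in test_manova_no_formula, a hypothesis is a (name, contrast_L) pair, optionally (name, L, M) with a transform matrix as in test_manova_test_input_validation. A minimal sketch of a custom joint test, reusing the X frame defined at the top of this module:

import numpy as np
import pandas as pd
from statsmodels.multivariate.manova import MANOVA
from statsmodels.tools import add_constant

exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True, dtype=float))
mod = MANOVA(X[['Basal', 'Occ', 'Max']], exog)
L = np.array([[0., 1., 0.],
              [0., 0., 1.]])    # joint test of the two Loc dummy coefficients
r = mod.mv_test(hypotheses=[('Loc', L)])
print(r['Loc']['stat'])         # the four multivariate test statistics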

View File

@ -0,0 +1,206 @@
import numpy as np
from statsmodels.multivariate.factor import Factor
from numpy.testing import assert_allclose, assert_equal
from scipy.optimize import approx_fprime
import warnings
# A small model for basic testing
def _toy():
uniq = np.r_[4, 9, 16]
load = np.asarray([[3, 1, 2], [2, 5, 8]]).T
par = np.r_[2, 3, 4, 3, 1, 2, 2, 5, 8]
corr = np.asarray([[1, .5, .25], [.5, 1, .5], [.25, .5, 1]])
return uniq, load, corr, par
def test_loglike():
uniq, load, corr, par = _toy()
fa = Factor(n_factor=2, corr=corr)
# Two ways of passing the parameters to loglike
ll1 = fa.loglike((load, uniq))
ll2 = fa.loglike(par)
assert_allclose(ll1, ll2)
def test_score():
uniq, load, corr, par = _toy()
fa = Factor(n_factor=2, corr=corr)
def f(par):
return fa.loglike(par)
par2 = np.r_[0.1, 0.2, 0.3, 0.4, 0.3, 0.1, 0.2, -0.2, 0, 0.8, 0.5, 0]
for pt in (par, par2):
g1 = approx_fprime(pt, f, 1e-8)
g2 = fa.score(pt)
assert_allclose(g1, g2, atol=1e-3)
def test_exact():
# Test if we can recover exact factor-structured matrices with
# default starting values.
np.random.seed(23324)
# Works for larger k_var but slow for routine testing.
for k_var in 5, 10, 25:
for n_factor in 1, 2, 3:
load = np.random.normal(size=(k_var, n_factor))
uniq = np.linspace(1, 2, k_var)
c = np.dot(load, load.T)
c.flat[::c.shape[0]+1] += uniq
s = np.sqrt(np.diag(c))
c /= np.outer(s, s)
fa = Factor(corr=c, n_factor=n_factor, method='ml')
rslt = fa.fit()
assert_allclose(rslt.fitted_cov, c, rtol=1e-4, atol=1e-4)
rslt.summary() # smoke test
def test_exact_em():
# Test if we can recover exact factor-structured matrices with
# default starting values using the EM algorithm.
np.random.seed(23324)
# Works for larger k_var but slow for routine testing.
for k_var in 5, 10, 25:
for n_factor in 1, 2, 3:
load = np.random.normal(size=(k_var, n_factor))
uniq = np.linspace(1, 2, k_var)
c = np.dot(load, load.T)
c.flat[::c.shape[0]+1] += uniq
s = np.sqrt(np.diag(c))
c /= np.outer(s, s)
fa = Factor(corr=c, n_factor=n_factor, method='ml')
load_e, uniq_e = fa._fit_ml_em(2000)
c_e = np.dot(load_e, load_e.T)
c_e.flat[::c_e.shape[0]+1] += uniq_e
assert_allclose(c_e, c, rtol=1e-4, atol=1e-4)
def test_fit_ml_em_random_state():
# Ensure Factor._fit_ml_em doesn't change numpy's singleton random state
# see #7357
T = 10
epsilon = np.random.multivariate_normal(np.zeros(3), np.eye(3), size=T).T
initial = np.random.get_state()
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message='Fitting did not converge')
Factor(endog=epsilon, n_factor=2, method='ml').fit()
final = np.random.get_state()
assert initial[0] == final[0]
assert_equal(initial[1], final[1])
assert initial[2:] == final[2:]
def test_em():
n_factor = 1
cor = np.asarray([[1, 0.5, 0.3], [0.5, 1, 0], [0.3, 0, 1]])
fa = Factor(corr=cor, n_factor=n_factor, method='ml')
rslt = fa.fit(opt={'gtol': 1e-3})
load_opt = rslt.loadings
uniq_opt = rslt.uniqueness
load_em, uniq_em = fa._fit_ml_em(1000)
cc = np.dot(load_em, load_em.T)
cc.flat[::cc.shape[0]+1] += uniq_em
assert_allclose(cc, rslt.fitted_cov, rtol=1e-2, atol=1e-2)
def test_1factor():
"""
# R code:
r = 0.4
p = 4
ii = seq(0, p-1)
ii = outer(ii, ii, "-")
ii = abs(ii)
cm = r^ii
fa = factanal(covmat=cm, factors=1)
print(fa, digits=10)
"""
r = 0.4
p = 4
ii = np.arange(p)
cm = r ** np.abs(np.subtract.outer(ii, ii))
fa = Factor(corr=cm, n_factor=1, method='ml')
rslt = fa.fit()
if rslt.loadings[0, 0] < 0:
rslt.loadings[:, 0] *= -1
# R solution, but our likelihood is higher
# uniq = np.r_[0.8392472054, 0.5820958187, 0.5820958187, 0.8392472054]
# load = np.asarray([[0.4009399224, 0.6464550935, 0.6464550935,
# 0.4009399224]]).T
# l1 = fa.loglike(fa._pack(load, uniq))
# l2 = fa.loglike(fa._pack(rslt.loadings, rslt.uniqueness))
# So use a smoke test
uniq = np.r_[0.85290232, 0.60916033, 0.55382266, 0.82610666]
load = np.asarray([[0.38353316], [0.62517171], [0.66796508],
[0.4170052]])
assert_allclose(load, rslt.loadings, rtol=1e-3, atol=1e-3)
assert_allclose(uniq, rslt.uniqueness, rtol=1e-3, atol=1e-3)
assert_equal(rslt.df, 2)
def test_2factor():
"""
# R code:
r = 0.4
p = 6
ii = seq(0, p-1)
ii = outer(ii, ii, "-")
ii = abs(ii)
cm = r^ii
factanal(covmat=cm, factors=2)
"""
r = 0.4
p = 6
ii = np.arange(p)
cm = r ** np.abs(np.subtract.outer(ii, ii))
fa = Factor(corr=cm, n_factor=2, nobs=100, method='ml')
rslt = fa.fit()
for j in 0, 1:
if rslt.loadings[0, j] < 0:
rslt.loadings[:, j] *= -1
uniq = np.r_[0.782, 0.367, 0.696, 0.696, 0.367, 0.782]
assert_allclose(uniq, rslt.uniqueness, rtol=1e-3, atol=1e-3)
loads = [np.r_[0.323, 0.586, 0.519, 0.519, 0.586, 0.323],
np.r_[0.337, 0.538, 0.187, -0.187, -0.538, -0.337]]
for k in 0, 1:
if np.dot(loads[k], rslt.loadings[:, k]) < 0:
loads[k] *= -1
assert_allclose(loads[k], rslt.loadings[:, k], rtol=1e-3, atol=1e-3)
assert_equal(rslt.df, 4)
# Smoke test for standard errors
e = np.asarray([0.11056836, 0.05191071, 0.09836349,
0.09836349, 0.05191071, 0.11056836])
assert_allclose(rslt.uniq_stderr, e, atol=1e-4)
e = np.asarray([[0.08842151, 0.08842151], [0.06058582, 0.06058582],
[0.08339874, 0.08339874], [0.08339874, 0.08339874],
[0.06058582, 0.06058582], [0.08842151, 0.08842151]])
assert_allclose(rslt.load_stderr, e, atol=1e-4)
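The matrices constructed in test_exact and test_exact_em are exact instances of the decomposition the ML fitter targets; in the tests' notation (load = $\Lambda$, uniq = $\psi$):

$$\Sigma = \Lambda\Lambda^{\top} + \operatorname{diag}(\psi), \qquad R_{ij} = \frac{\Sigma_{ij}}{\sqrt{\Sigma_{ii}\,\Sigma_{jj}}}$$

Rescaling $\Sigma$ to the correlation matrix $R$ is what the division by np.outer(s, s) does, so a converged fit should reproduce $R$, which is what the assert_allclose checks verify.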

View File

@ -0,0 +1,199 @@
import numpy as np
import pandas as pd
from statsmodels.multivariate.multivariate_ols import _MultivariateOLS
from numpy.testing import assert_array_almost_equal, assert_raises
import patsy
data = pd.DataFrame([['Morphine', 'N', .04, .20, .10, .08],
['Morphine', 'N', .02, .06, .02, .02],
['Morphine', 'N', .07, 1.40, .48, .24],
['Morphine', 'N', .17, .57, .35, .24],
['Morphine', 'Y', .10, .09, .13, .14],
['placebo', 'Y', .07, .07, .06, .07],
['placebo', 'Y', .05, .07, .06, .07],
['placebo', 'N', .03, .62, .31, .22],
['placebo', 'N', .03, 1.05, .73, .60],
['placebo', 'N', .07, .83, 1.07, .80],
['Trimethaphan', 'N', .09, 3.13, 2.06, 1.23],
['Trimethaphan', 'Y', .10, .09, .09, .08],
['Trimethaphan', 'Y', .08, .09, .09, .10],
['Trimethaphan', 'Y', .13, .10, .12, .12],
['Trimethaphan', 'Y', .06, .05, .05, .05]],
columns=['Drug', 'Depleted',
'Histamine0', 'Histamine1',
'Histamine3', 'Histamine5'])
for i in range(2, 6):
data.iloc[:, i] = np.log(data.iloc[:, i])
def compare_r_output_dogs_data(method):
''' Testing within-subject effects interacting with two between-subject effects
Compares with the output of the R car library's Anova(..., type=3)
Note: The test statistics Pillai's trace, Wilks' lambda, Hotelling-Lawley
trace and Roy's greatest root are the same as the R output, but the
approximate F values and degrees of freedom can differ. This is because
this implementation is based on the SAS formulas [1]
.. [1] https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introreg_sect012.htm
'''
# Repeated measures with orthogonal polynomial contrasts coding
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
data)
r = mod.fit(method=method)
r = r.mv_test()
a = [[2.68607660e-02, 4, 6, 5.43435304e+01, 7.59585610e-05],
[9.73139234e-01, 4, 6, 5.43435304e+01, 7.59585610e-05],
[3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
[3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05]]
assert_array_almost_equal(r['Intercept']['stat'].values, a, decimal=6)
a = [[8.39646619e-02, 8, 1.20000000e+01, 3.67658068e+00, 2.12614444e-02],
[1.18605382e+00, 8, 1.40000000e+01, 2.55003861e+00, 6.01270701e-02],
[7.69391362e+00, 8, 6.63157895e+00, 5.50814270e+00, 2.07392260e-02],
[7.25036952e+00, 4, 7.00000000e+00, 1.26881467e+01, 2.52669877e-03]]
assert_array_almost_equal(r['Drug']['stat'].values, a, decimal=6)
a = [[0.32048892, 4., 6., 3.18034906, 0.10002373],
[0.67951108, 4., 6., 3.18034906, 0.10002373],
[2.12023271, 4., 6., 3.18034906, 0.10002373],
[2.12023271, 4., 6., 3.18034906, 0.10002373]]
assert_array_almost_equal(r['Depleted']['stat'].values, a, decimal=6)
a = [[0.15234366, 8., 12., 2.34307678, 0.08894239],
[1.13013353, 8., 14., 2.27360606, 0.08553213],
[3.70989596, 8., 6.63157895, 2.65594824, 0.11370285],
[3.1145597, 4., 7., 5.45047947, 0.02582767]]
assert_array_almost_equal(r['Drug:Depleted']['stat'].values, a, decimal=6)
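# A minimal sketch (not part of the original tests): the same per-term tables
# are available through the public MANOVA wrapper, which shares the mv_test
# machinery exercised above.
def _manova_sketch():
    from statsmodels.multivariate.manova import MANOVA
    mv = MANOVA.from_formula(
        'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
        data)
    return mv.mv_test()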
def test_glm_dogs_example():
compare_r_output_dogs_data(method='svd')
compare_r_output_dogs_data(method='pinv')
def test_specify_L_M_by_string():
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
data)
r = mod.fit()
r1 = r.mv_test(hypotheses=[['Intercept', ['Intercept'], None]])
a = [[2.68607660e-02, 4, 6, 5.43435304e+01, 7.59585610e-05],
[9.73139234e-01, 4, 6, 5.43435304e+01, 7.59585610e-05],
[3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05],
[3.62290202e+01, 4, 6, 5.43435304e+01, 7.59585610e-05]]
assert_array_almost_equal(r1['Intercept']['stat'].values, a, decimal=6)
L = ['Intercept', 'Drug[T.Trimethaphan]', 'Drug[T.placebo]']
M = ['Histamine1', 'Histamine3', 'Histamine5']
r1 = r.mv_test(hypotheses=[['a', L, M]])
a = [[1, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0]]
assert_array_almost_equal(r1['a']['contrast_L'], a, decimal=10)
a = [[0, 1, 0, 0],
[0, 0, 1, 0],
[0, 0, 0, 1]]
assert_array_almost_equal(r1['a']['transform_M'].T, a, decimal=10)
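# Sketch of what the string shortcut expands to, assuming the column order
# verified by the assertions above: each name in L selects the matching row
# of an identity contrast over the 6 exog terms, and each name in M selects
# the matching column of an identity transform over the 4 endog variables.
def _explicit_L_M_sketch(r):
    L = np.eye(6)[:3]     # Intercept, Drug[T.Trimethaphan], Drug[T.placebo]
    M = np.eye(4)[:, 1:]  # Histamine1, Histamine3, Histamine5
    return r.mv_test(hypotheses=[['a', L, M]])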
def test_independent_variable_singular():
data1 = data.copy()
data1['dup'] = data1['Drug']
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * dup',
data1)
assert_raises(ValueError, mod.fit)
def test_from_formula_vs_no_formula():
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
data)
r = mod.fit(method='svd')
r0 = r.mv_test()
endog, exog = patsy.dmatrices(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
data, return_type="dataframe")
L = np.array([[1, 0, 0, 0, 0, 0]])
# DataFrame input
r = _MultivariateOLS(endog, exog).fit(method='svd')
r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
assert_array_almost_equal(r1['Intercept']['stat'].values,
r0['Intercept']['stat'].values, decimal=6)
# Numpy array input
r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
r1 = r.mv_test(hypotheses=[['Intercept', L, None]])
assert_array_almost_equal(r1['Intercept']['stat'].values,
r0['Intercept']['stat'].values, decimal=6)
L = np.array([[0, 1, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0],
])
r1 = r.mv_test(hypotheses=[['Drug', L, None]])
# DataFrame input
r = _MultivariateOLS(endog, exog).fit(method='svd')
r1 = r.mv_test(hypotheses=[['Drug', L, None]])
assert_array_almost_equal(r1['Drug']['stat'].values,
r0['Drug']['stat'].values, decimal=6)
# Numpy array input
r = _MultivariateOLS(endog.values, exog.values).fit(method='svd')
r1 = r.mv_test(hypotheses=[['Drug', L, None]])
assert_array_almost_equal(r1['Drug']['stat'].values,
r0['Drug']['stat'].values, decimal=6)
def test_L_M_matrices_1D_array():
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
data)
r = mod.fit(method='svd')
L = np.array([1, 0, 0, 0, 0, 0])
assert_raises(ValueError, r.mv_test, hypotheses=[['Drug', L, None]])
L = np.array([[1, 0, 0, 0, 0, 0]])
M = np.array([1, 0, 0, 0, 0, 0])
assert_raises(ValueError, r.mv_test, hypotheses=[['Drug', L, M]])
def test_exog_1D_array():
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ 0 + Depleted',
data)
r = mod.fit(method='svd')
r0 = r.mv_test()
a = [[0.0019, 8.0000, 20.0000, 55.0013, 0.0000],
[1.8112, 8.0000, 22.0000, 26.3796, 0.0000],
[97.8858, 8.0000, 12.1818, 117.1133, 0.0000],
[93.2742, 4.0000, 11.0000, 256.5041, 0.0000]]
assert_array_almost_equal(r0['Depleted']['stat'].values, a, decimal=4)
def test_endog_1D_array():
assert_raises(ValueError, _MultivariateOLS.from_formula,
'Histamine0 ~ 0 + Depleted', data)
def test_affine_hypothesis():
    # Test an affine hypothesis; compared with R car linearHypothesis output.
    # Note: the Pillai, Wilks, Hotelling-Lawley and Roy test statistics match
    # the R output, but the approximate F values and degrees of freedom can
    # differ because this implementation follows the SAS formulas (see the
    # reference in compare_r_output_dogs_data above).
mod = _MultivariateOLS.from_formula(
'Histamine0 + Histamine1 + Histamine3 + Histamine5 ~ Drug * Depleted',
data)
r = mod.fit(method='svd')
L = np.array([[0, 1.2, 1.1, 1.3, 1.5, 1.4],
[0, 3.2, 2.1, 3.3, 5.5, 4.4]])
M = None
C = np.array([[1, 2, 3, 4],
[5, 6, 7, 8]])
r0 = r.mv_test(hypotheses=[('test1', L, M, C)])
a = [[0.0269, 8.0000, 12.0000, 7.6441, 0.0010],
[1.4277, 8.0000, 14.0000, 4.3657, 0.0080],
[19.2678, 8.0000, 6.6316, 13.7940, 0.0016],
[18.3470, 4.0000, 7.0000, 32.1072, 0.0001]]
assert_array_almost_equal(r0['test1']['stat'].values, a, decimal=4)
r0.summary(show_contrast_L=True, show_transform_M=True,
show_constant_C=True)
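# In the 4-tuple form ('test1', L, M, C), mv_test evaluates the affine
# hypothesis L @ B @ M = C (M=None meaning the identity over the four endog
# columns) instead of the default C = 0; here L is 2 x 6 over the exog terms,
# B is the 6 x 4 coefficient matrix, and C is the hypothesized 2 x 4 value.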

View File

@ -0,0 +1,443 @@
from statsmodels.compat.platform import PLATFORM_WIN32
import warnings
import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_allclose, assert_equal, assert_raises
from statsmodels.multivariate.pca import PCA, pca
from statsmodels.multivariate.tests.results.datamlw import (data, princomp1,
princomp2)
from statsmodels.tools.sm_exceptions import EstimationWarning
DECIMAL_5 = .00001
class TestPCA:
@classmethod
def setup_class(cls):
        rs = np.random.RandomState(1234)
k = 3
n = 100
t = 200
lam = 2
norm_rng = rs.standard_normal
e = norm_rng((t, n))
f = norm_rng((t, k))
b = rs.standard_gamma(lam, size=(k, n)) / lam
cls.x = f.dot(b) + e
cls.x_copy = cls.x + 0.0
cls.rs = rs
k = 3
n = 300
t = 200
lam = 2
norm_rng = rs.standard_normal
e = norm_rng((t, n))
f = norm_rng((t, k))
b = rs.standard_gamma(lam, size=(k, n)) / lam
cls.x_wide = f.dot(b) + e
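        # Both panels are draws from an approximate k=3 factor model,
        # x = f @ b + e, with gamma-distributed loadings; the second panel
        # is wide (n > t) to exercise the transposed code paths.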
@pytest.mark.smoke
@pytest.mark.matplotlib
def test_smoke_plot_and_repr(self, close_figures):
pc = PCA(self.x)
fig = pc.plot_scree()
fig = pc.plot_scree(ncomp=10)
fig = pc.plot_scree(log_scale=False)
fig = pc.plot_scree(cumulative=True)
fig = pc.plot_rsquare()
fig = pc.plot_rsquare(ncomp=5)
# Additional smoke test
pc.__repr__()
pc = PCA(self.x, standardize=False)
pc.__repr__()
pc = PCA(self.x, standardize=False, demean=False)
pc.__repr__()
pc = PCA(self.x, ncomp=2, gls=True)
assert "GLS" in pc.__repr__()
# Check data for no changes
assert_equal(self.x, pc.data)
def test_eig_svd_equiv(self):
# Test leading components since the tail end can differ
pc_eig = PCA(self.x)
pc_svd = PCA(self.x, method='svd')
assert_allclose(pc_eig.projection, pc_svd.projection)
assert_allclose(np.abs(pc_eig.factors[:, :2]),
np.abs(pc_svd.factors[:, :2]))
assert_allclose(np.abs(pc_eig.coeff[:2, :]),
np.abs(pc_svd.coeff[:2, :]))
assert_allclose(pc_eig.eigenvals,
pc_svd.eigenvals)
assert_allclose(np.abs(pc_eig.eigenvecs[:, :2]),
np.abs(pc_svd.eigenvecs[:, :2]))
pc_svd = PCA(self.x, method='svd', ncomp=2)
pc_nipals = PCA(self.x, method='nipals', ncomp=2)
assert_allclose(np.abs(pc_nipals.factors),
np.abs(pc_svd.factors),
atol=DECIMAL_5)
assert_allclose(np.abs(pc_nipals.coeff),
np.abs(pc_svd.coeff),
atol=DECIMAL_5)
assert_allclose(pc_nipals.eigenvals,
pc_svd.eigenvals,
atol=DECIMAL_5)
assert_allclose(np.abs(pc_nipals.eigenvecs),
np.abs(pc_svd.eigenvecs),
atol=DECIMAL_5)
# Check data for no changes
assert_equal(self.x, pc_svd.data)
# Check data for no changes
assert_equal(self.x, pc_eig.data)
# Check data for no changes
assert_equal(self.x, pc_nipals.data)
def test_options(self):
pc = PCA(self.x)
pc_no_norm = PCA(self.x, normalize=False)
assert_allclose(pc.factors.dot(pc.coeff),
pc_no_norm.factors.dot(pc_no_norm.coeff))
princomp = pc.factors
assert_allclose(princomp.T.dot(princomp), np.eye(100), atol=1e-5)
weights = pc_no_norm.coeff
assert_allclose(weights.T.dot(weights), np.eye(100), atol=1e-5)
pc_10 = PCA(self.x, ncomp=10)
assert_allclose(pc.factors[:, :10], pc_10.factors)
assert_allclose(pc.coeff[:10, :], pc_10.coeff)
assert_allclose(pc.rsquare[:(10 + 1)], pc_10.rsquare)
assert_allclose(pc.eigenvals[:10], pc_10.eigenvals)
assert_allclose(pc.eigenvecs[:, :10], pc_10.eigenvecs)
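        # Without standardization, PCA reduces to the eigendecomposition of
        # X'X for the demeaned data: eigenvalues sorted in decreasing order,
        # factors equal to xdm.dot(vec) up to column signs.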
pc = PCA(self.x, standardize=False, normalize=False)
mu = self.x.mean(0)
xdm = self.x - mu
xpx = xdm.T.dot(xdm)
val, vec = np.linalg.eigh(xpx)
ind = np.argsort(val)
ind = ind[::-1]
val = val[ind]
vec = vec[:, ind]
assert_allclose(xdm, pc.transformed_data)
assert_allclose(val, pc.eigenvals)
assert_allclose(np.abs(vec), np.abs(pc.eigenvecs))
assert_allclose(np.abs(pc.factors), np.abs(xdm.dot(vec)))
assert_allclose(pc.projection, xdm + mu)
pc = PCA(self.x, standardize=False, demean=False, normalize=False)
x = self.x
xpx = x.T.dot(x)
val, vec = np.linalg.eigh(xpx)
ind = np.argsort(val)
ind = ind[::-1]
val = val[ind]
vec = vec[:, ind]
assert_allclose(x, pc.transformed_data)
assert_allclose(val, pc.eigenvals)
assert_allclose(np.abs(vec), np.abs(pc.eigenvecs))
assert_allclose(np.abs(pc.factors), np.abs(x.dot(vec)))
def test_against_reference(self):
# Test against MATLAB, which by default demeans but does not standardize
x = data.xo / 1000.0
pc = PCA(x, normalize=False, standardize=False)
ref = princomp1
assert_allclose(np.abs(pc.factors), np.abs(ref.factors))
assert_allclose(pc.factors.dot(pc.coeff) + x.mean(0), x)
assert_allclose(np.abs(pc.coeff), np.abs(ref.coef.T))
assert_allclose(pc.factors.dot(pc.coeff),
ref.factors.dot(ref.coef.T))
pc = PCA(x[:20], normalize=False, standardize=False)
mu = x[:20].mean(0)
ref = princomp2
assert_allclose(np.abs(pc.factors), np.abs(ref.factors))
assert_allclose(pc.factors.dot(pc.coeff) + mu, x[:20])
assert_allclose(np.abs(pc.coeff), np.abs(ref.coef.T))
assert_allclose(pc.factors.dot(pc.coeff),
ref.factors.dot(ref.coef.T))
def test_warnings_and_errors(self):
with warnings.catch_warnings(record=True) as w:
pc = PCA(self.x, ncomp=300)
assert_equal(len(w), 1)
with warnings.catch_warnings(record=True) as w:
rs = self.rs
x = rs.standard_normal((200, 1)) * np.ones(200)
pc = PCA(x, method='eig')
assert_equal(len(w), 1)
assert_raises(ValueError, PCA, self.x, method='unknown')
assert_raises(ValueError, PCA, self.x, missing='unknown')
assert_raises(ValueError, PCA, self.x, tol=2.0)
assert_raises(ValueError, PCA, np.nan * np.ones((200, 100)), tol=2.0)
@pytest.mark.matplotlib
def test_pandas(self, close_figures):
pc = PCA(pd.DataFrame(self.x))
pc1 = PCA(self.x)
assert_allclose(pc.factors.values, pc1.factors)
fig = pc.plot_scree()
fig = pc.plot_scree(ncomp=10)
fig = pc.plot_scree(log_scale=False)
fig = pc.plot_rsquare()
fig = pc.plot_rsquare(ncomp=5)
proj = pc.project(2)
PCA(pd.DataFrame(self.x), ncomp=4, gls=True)
PCA(pd.DataFrame(self.x), ncomp=4, standardize=False)
def test_gls_and_weights(self):
assert_raises(ValueError, PCA, self.x, gls=True)
assert_raises(ValueError, PCA, self.x, weights=np.array([1.0, 1.0]))
# Pre-standardize to make comparison simple
x = (self.x - self.x.mean(0))
x = x / (x ** 2.0).mean(0)
pc_gls = PCA(x, ncomp=1, standardize=False, demean=False, gls=True)
pc = PCA(x, ncomp=1, standardize=False, demean=False)
errors = x - pc.projection
var = (errors ** 2.0).mean(0)
weights = 1.0 / var
weights = weights / np.sqrt((weights ** 2.0).mean())
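        # Reference computation: the GLS weights are the reciprocal residual
        # variances from the unweighted one-component fit, rescaled so that
        # the mean squared weight equals one.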
assert_allclose(weights, pc_gls.weights)
assert_equal(x, pc_gls.data)
assert_equal(x, pc.data)
pc_weights = PCA(x, ncomp=1, standardize=False, demean=False, weights=weights)
assert_allclose(weights, pc_weights.weights)
assert_allclose(np.abs(pc_weights.factors), np.abs(pc_gls.factors))
@pytest.mark.slow
def test_wide(self):
pc = PCA(self.x_wide)
assert_equal(pc.factors.shape[1], self.x_wide.shape[0])
assert_equal(pc.eigenvecs.shape[1], min(np.array(self.x_wide.shape)))
pc = PCA(pd.DataFrame(self.x_wide))
assert_equal(pc.factors.shape[1], self.x_wide.shape[0])
assert_equal(pc.eigenvecs.shape[1], min(np.array(self.x_wide.shape)))
def test_projection(self):
pc = PCA(self.x, ncomp=5)
mu = self.x.mean(0)
demean_x = self.x - mu
coef = np.linalg.pinv(pc.factors).dot(demean_x)
direct = pc.factors.dot(coef)
assert_allclose(pc.projection, direct + mu)
pc = PCA(self.x, standardize=False, ncomp=5)
coef = np.linalg.pinv(pc.factors).dot(demean_x)
direct = pc.factors.dot(coef)
assert_allclose(pc.projection, direct + mu)
pc = PCA(self.x, standardize=False, demean=False, ncomp=5)
coef = np.linalg.pinv(pc.factors).dot(self.x)
direct = pc.factors.dot(coef)
assert_allclose(pc.projection, direct)
pc = PCA(self.x, ncomp=5, gls=True)
mu = self.x.mean(0)
demean_x = self.x - mu
coef = np.linalg.pinv(pc.factors).dot(demean_x)
direct = pc.factors.dot(coef)
assert_allclose(pc.projection, direct + mu)
pc = PCA(self.x, standardize=False, ncomp=5)
coef = np.linalg.pinv(pc.factors).dot(demean_x)
direct = pc.factors.dot(coef)
assert_allclose(pc.projection, direct + mu)
pc = PCA(self.x, standardize=False, demean=False, ncomp=5, gls=True)
coef = np.linalg.pinv(pc.factors).dot(self.x)
direct = pc.factors.dot(coef)
assert_allclose(pc.projection, direct)
# Test error for too many factors
project = pc.project
assert_raises(ValueError, project, 6)
@pytest.mark.skipif(PLATFORM_WIN32, reason='Windows 32-bit')
def test_replace_missing(self):
x = self.x.copy()
x[::5, ::7] = np.nan
pc = PCA(x, missing='drop-row')
x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
pc_dropped = PCA(x_dropped_row)
assert_allclose(pc.projection, pc_dropped.projection)
assert_equal(x, pc.data)
pc = PCA(x, missing='drop-col')
x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
pc_dropped = PCA(x_dropped_col)
assert_allclose(pc.projection, pc_dropped.projection)
assert_equal(x, pc.data)
pc = PCA(x, missing='drop-min')
if x_dropped_row.size > x_dropped_col.size:
x_dropped_min = x_dropped_row
else:
x_dropped_min = x_dropped_col
pc_dropped = PCA(x_dropped_min)
assert_allclose(pc.projection, pc_dropped.projection)
assert_equal(x, pc.data)
pc = PCA(x, ncomp=3, missing='fill-em')
missing = np.isnan(x)
mu = np.nanmean(x, axis=0)
errors = x - mu
sigma = np.sqrt(np.nanmean(errors ** 2, axis=0))
x_std = errors / sigma
x_std[missing] = 0.0
last = x_std[missing]
delta = 1.0
count = 0
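        # Reference EM-style imputation: alternate between a 3-component PCA
        # on the filled, standardized data and replacing the missing cells
        # with their projections, until the relative change in the imputed
        # values falls below 5e-8.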
while delta > 5e-8:
pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
x_std[missing] = pc_temp.projection[missing]
current = x_std[missing]
diff = current - last
delta = np.sqrt(np.sum(diff ** 2)) / np.sqrt(np.sum(current ** 2))
last = current
count += 1
x = self.x + 0.0
projection = pc_temp.projection * sigma + mu
x[missing] = projection[missing]
assert_allclose(pc._adjusted_data, x)
# Check data for no changes
assert_equal(self.x, self.x_copy)
x = self.x
pc = PCA(x)
pc_dropped = PCA(x, missing='drop-row')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
pc_dropped = PCA(x, missing='drop-col')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
pc_dropped = PCA(x, missing='drop-min')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
pc = PCA(x, ncomp=3)
pc_dropped = PCA(x, ncomp=3, missing='fill-em')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
        # All-NaN data should raise for every missing-data option
x = self.x.copy()
x[:, :] = np.nan
assert_raises(ValueError, PCA, x, missing='drop-row')
assert_raises(ValueError, PCA, x, missing='drop-col')
assert_raises(ValueError, PCA, x, missing='drop-min')
assert_raises(ValueError, PCA, x, missing='fill-em')
def test_rsquare(self):
x = self.x + 0.0
mu = x.mean(0)
x_demean = x - mu
std = np.std(x, 0)
x_std = x_demean / std
pc = PCA(self.x)
nvar = x.shape[1]
rsquare = np.zeros(nvar + 1)
tss = np.sum(x_std ** 2)
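        # rsquare[i] = 1 - SSE_i / TSS, where SSE_i uses the projection on
        # the first i components and TSS is computed on the same transformed
        # (here standardized) data used in the fit.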
for i in range(nvar + 1):
errors = x_std - pc.project(i, transform=False, unweight=False)
rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
assert_allclose(rsquare, pc.rsquare)
pc = PCA(self.x, standardize=False)
tss = np.sum(x_demean ** 2)
for i in range(nvar + 1):
errors = x_demean - pc.project(i, transform=False, unweight=False)
rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
assert_allclose(rsquare, pc.rsquare)
pc = PCA(self.x, standardize=False, demean=False)
tss = np.sum(x ** 2)
for i in range(nvar + 1):
errors = x - pc.project(i, transform=False, unweight=False)
rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
assert_allclose(rsquare, pc.rsquare)
@pytest.mark.slow
def test_missing_dataframe(self):
x = self.x.copy()
x[::5, ::7] = np.nan
pc = PCA(x, ncomp=3, missing='fill-em')
x = pd.DataFrame(x)
pc_df = PCA(x, ncomp=3, missing='fill-em')
assert_allclose(pc.coeff, pc_df.coeff)
assert_allclose(pc.factors, pc_df.factors)
pc_df_nomissing = PCA(pd.DataFrame(self.x.copy()), ncomp=3)
assert isinstance(pc_df.coeff, type(pc_df_nomissing.coeff))
assert isinstance(pc_df.data, type(pc_df_nomissing.data))
assert isinstance(pc_df.eigenvals, type(pc_df_nomissing.eigenvals))
assert isinstance(pc_df.eigenvecs, type(pc_df_nomissing.eigenvecs))
x = self.x.copy()
x[::5, ::7] = np.nan
x_df = pd.DataFrame(x)
pc = PCA(x, missing='drop-row')
pc_df = PCA(x_df, missing='drop-row')
assert_allclose(pc.coeff, pc_df.coeff)
assert_allclose(pc.factors, pc_df.factors)
pc = PCA(x, missing='drop-col')
pc_df = PCA(x_df, missing='drop-col')
assert_allclose(pc.coeff, pc_df.coeff)
assert_allclose(pc.factors, pc_df.factors)
pc = PCA(x, missing='drop-min')
pc_df = PCA(x_df, missing='drop-min')
assert_allclose(pc.coeff, pc_df.coeff)
assert_allclose(pc.factors, pc_df.factors)
def test_equivalence(self):
x = self.x.copy()
assert_allclose(PCA(x).factors, pca(x)[0])
def test_equivalence_full_matrices(self):
x = self.x.copy()
svd_full_matrices_true = PCA(x, svd_full_matrices=True).factors
svd_full_matrices_false = PCA(x).factors
assert_allclose(svd_full_matrices_true, svd_full_matrices_false)
def test_missing():
data = np.empty((200, 50))
data[0, 0] = np.nan
with pytest.raises(ValueError, match="data contains non-finite values"):
PCA(data)
def test_too_many_missing(reset_randomstate):
data = np.random.standard_normal((200, 50))
data[0, :-3] = np.nan
with pytest.raises(ValueError):
PCA(data, ncomp=5, missing="drop-col")
p = PCA(data, missing="drop-min")
assert max(p.factors.shape) == max(data.shape) - 1
def test_gls_warning(reset_randomstate):
data = np.random.standard_normal((400, 200))
data[:, 1:] = data[:, :1] + .01 * data[:, 1:]
with pytest.warns(EstimationWarning, match="Many series are being down weighted"):
factors = PCA(data, ncomp=2, gls=True).factors
assert factors.shape == (data.shape[0], 2)