450 lines
17 KiB
Python
450 lines
17 KiB
Python
import numpy as np
|
|
from numpy.testing import assert_approx_equal
|
|
|
|
from sklearn.utils._testing import (assert_array_almost_equal,
|
|
assert_array_equal, assert_raise_message,
|
|
assert_warns)
|
|
from sklearn.datasets import load_linnerud
|
|
from sklearn.cross_decomposition import _pls as pls_
|
|
from sklearn.cross_decomposition import CCA
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.utils import check_random_state
|
|
from sklearn.exceptions import ConvergenceWarning
|
|
|
|
|
|
def test_pls():
|
|
d = load_linnerud()
|
|
X = d.data
|
|
Y = d.target
|
|
# 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
|
|
# ===========================================================
|
|
# Compare 2 algo.: nipals vs. svd
|
|
# ------------------------------
|
|
pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1])
|
|
pls_bynipals.fit(X, Y)
|
|
pls_bysvd = pls_.PLSCanonical(algorithm="svd", n_components=X.shape[1])
|
|
pls_bysvd.fit(X, Y)
|
|
# check equalities of loading (up to the sign of the second column)
|
|
assert_array_almost_equal(
|
|
pls_bynipals.x_loadings_,
|
|
pls_bysvd.x_loadings_, decimal=5,
|
|
err_msg="nipals and svd implementations lead to different x loadings")
|
|
|
|
assert_array_almost_equal(
|
|
pls_bynipals.y_loadings_,
|
|
pls_bysvd.y_loadings_, decimal=5,
|
|
err_msg="nipals and svd implementations lead to different y loadings")
|
|
|
|
# Check PLS properties (with n_components=X.shape[1])
|
|
# ---------------------------------------------------
|
|
plsca = pls_.PLSCanonical(n_components=X.shape[1])
|
|
plsca.fit(X, Y)
|
|
T = plsca.x_scores_
|
|
P = plsca.x_loadings_
|
|
Wx = plsca.x_weights_
|
|
U = plsca.y_scores_
|
|
Q = plsca.y_loadings_
|
|
Wy = plsca.y_weights_
|
|
|
|
def check_ortho(M, err_msg):
|
|
K = np.dot(M.T, M)
|
|
assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)
|
|
|
|
# Orthogonality of weights
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~
|
|
check_ortho(Wx, "x weights are not orthogonal")
|
|
check_ortho(Wy, "y weights are not orthogonal")
|
|
|
|
# Orthogonality of latent scores
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
check_ortho(T, "x scores are not orthogonal")
|
|
check_ortho(U, "y scores are not orthogonal")
|
|
|
|
# Check X = TP' and Y = UQ' (with (p == q) components)
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
# center scale X, Y
|
|
Xc, Yc, x_mean, y_mean, x_std, y_std =\
|
|
pls_._center_scale_xy(X.copy(), Y.copy(), scale=True)
|
|
assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
|
|
assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")
|
|
|
|
# Check that rotations on training data lead to scores
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
Xr = plsca.transform(X)
|
|
assert_array_almost_equal(Xr, plsca.x_scores_,
|
|
err_msg="rotation on X failed")
|
|
Xr, Yr = plsca.transform(X, Y)
|
|
assert_array_almost_equal(Xr, plsca.x_scores_,
|
|
err_msg="rotation on X failed")
|
|
assert_array_almost_equal(Yr, plsca.y_scores_,
|
|
err_msg="rotation on Y failed")
|
|
|
|
# Check that inverse_transform works
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
Xreconstructed = plsca.inverse_transform(Xr)
|
|
assert_array_almost_equal(Xreconstructed, X,
|
|
err_msg="inverse_transform failed")
|
|
|
|
# "Non regression test" on canonical PLS
|
|
# --------------------------------------
|
|
# The results were checked against the R-package plspm
|
|
pls_ca = pls_.PLSCanonical(n_components=X.shape[1])
|
|
pls_ca.fit(X, Y)
|
|
|
|
x_weights = np.array(
|
|
[[-0.61330704, 0.25616119, -0.74715187],
|
|
[-0.74697144, 0.11930791, 0.65406368],
|
|
[-0.25668686, -0.95924297, -0.11817271]])
|
|
# x_weights_sign_flip holds columns of 1 or -1, depending on sign flip
|
|
# between R and python
|
|
x_weights_sign_flip = pls_ca.x_weights_ / x_weights
|
|
|
|
x_rotations = np.array(
|
|
[[-0.61330704, 0.41591889, -0.62297525],
|
|
[-0.74697144, 0.31388326, 0.77368233],
|
|
[-0.25668686, -0.89237972, -0.24121788]])
|
|
x_rotations_sign_flip = pls_ca.x_rotations_ / x_rotations
|
|
|
|
y_weights = np.array(
|
|
[[+0.58989127, 0.7890047, 0.1717553],
|
|
[+0.77134053, -0.61351791, 0.16920272],
|
|
[-0.23887670, -0.03267062, 0.97050016]])
|
|
y_weights_sign_flip = pls_ca.y_weights_ / y_weights
|
|
|
|
y_rotations = np.array(
|
|
[[+0.58989127, 0.7168115, 0.30665872],
|
|
[+0.77134053, -0.70791757, 0.19786539],
|
|
[-0.23887670, -0.00343595, 0.94162826]])
|
|
y_rotations_sign_flip = pls_ca.y_rotations_ / y_rotations
|
|
|
|
# x_weights = X.dot(x_rotation)
|
|
# Hence R/python sign flip should be the same in x_weight and x_rotation
|
|
assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip)
|
|
# This test that R / python give the same result up to column
|
|
# sign indeterminacy
|
|
assert_array_almost_equal(np.abs(x_rotations_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
|
|
|
|
|
|
assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip)
|
|
assert_array_almost_equal(np.abs(y_rotations_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
|
|
|
|
# 2) Regression PLS (PLS2): "Non regression test"
|
|
# ===============================================
|
|
# The results were checked against the R-packages plspm, misOmics and pls
|
|
pls_2 = pls_.PLSRegression(n_components=X.shape[1])
|
|
pls_2.fit(X, Y)
|
|
|
|
x_weights = np.array(
|
|
[[-0.61330704, -0.00443647, 0.78983213],
|
|
[-0.74697144, -0.32172099, -0.58183269],
|
|
[-0.25668686, 0.94682413, -0.19399983]])
|
|
x_weights_sign_flip = pls_2.x_weights_ / x_weights
|
|
|
|
x_loadings = np.array(
|
|
[[-0.61470416, -0.24574278, 0.78983213],
|
|
[-0.65625755, -0.14396183, -0.58183269],
|
|
[-0.51733059, 1.00609417, -0.19399983]])
|
|
x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings
|
|
|
|
y_weights = np.array(
|
|
[[+0.32456184, 0.29892183, 0.20316322],
|
|
[+0.42439636, 0.61970543, 0.19320542],
|
|
[-0.13143144, -0.26348971, -0.17092916]])
|
|
y_weights_sign_flip = pls_2.y_weights_ / y_weights
|
|
|
|
y_loadings = np.array(
|
|
[[+0.32456184, 0.29892183, 0.20316322],
|
|
[+0.42439636, 0.61970543, 0.19320542],
|
|
[-0.13143144, -0.26348971, -0.17092916]])
|
|
y_loadings_sign_flip = pls_2.y_loadings_ / y_loadings
|
|
|
|
# x_loadings[:, i] = Xi.dot(x_weights[:, i]) \forall i
|
|
assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
|
|
assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
|
|
|
|
assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
|
|
assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
|
|
|
|
# 3) Another non-regression test of Canonical PLS on random dataset
|
|
# =================================================================
|
|
# The results were checked against the R-package plspm
|
|
n = 500
|
|
p_noise = 10
|
|
q_noise = 5
|
|
# 2 latents vars:
|
|
rng = check_random_state(11)
|
|
l1 = rng.normal(size=n)
|
|
l2 = rng.normal(size=n)
|
|
latents = np.array([l1, l1, l2, l2]).T
|
|
X = latents + rng.normal(size=4 * n).reshape((n, 4))
|
|
Y = latents + rng.normal(size=4 * n).reshape((n, 4))
|
|
X = np.concatenate(
|
|
(X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
|
|
Y = np.concatenate(
|
|
(Y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)
|
|
|
|
pls_ca = pls_.PLSCanonical(n_components=3)
|
|
pls_ca.fit(X, Y)
|
|
|
|
x_weights = np.array(
|
|
[[0.65803719, 0.19197924, 0.21769083],
|
|
[0.7009113, 0.13303969, -0.15376699],
|
|
[0.13528197, -0.68636408, 0.13856546],
|
|
[0.16854574, -0.66788088, -0.12485304],
|
|
[-0.03232333, -0.04189855, 0.40690153],
|
|
[0.1148816, -0.09643158, 0.1613305],
|
|
[0.04792138, -0.02384992, 0.17175319],
|
|
[-0.06781, -0.01666137, -0.18556747],
|
|
[-0.00266945, -0.00160224, 0.11893098],
|
|
[-0.00849528, -0.07706095, 0.1570547],
|
|
[-0.00949471, -0.02964127, 0.34657036],
|
|
[-0.03572177, 0.0945091, 0.3414855],
|
|
[0.05584937, -0.02028961, -0.57682568],
|
|
[0.05744254, -0.01482333, -0.17431274]])
|
|
x_weights_sign_flip = pls_ca.x_weights_ / x_weights
|
|
|
|
|
|
x_loadings = np.array(
|
|
[[0.65649254, 0.1847647, 0.15270699],
|
|
[0.67554234, 0.15237508, -0.09182247],
|
|
[0.19219925, -0.67750975, 0.08673128],
|
|
[0.2133631, -0.67034809, -0.08835483],
|
|
[-0.03178912, -0.06668336, 0.43395268],
|
|
[0.15684588, -0.13350241, 0.20578984],
|
|
[0.03337736, -0.03807306, 0.09871553],
|
|
[-0.06199844, 0.01559854, -0.1881785],
|
|
[0.00406146, -0.00587025, 0.16413253],
|
|
[-0.00374239, -0.05848466, 0.19140336],
|
|
[0.00139214, -0.01033161, 0.32239136],
|
|
[-0.05292828, 0.0953533, 0.31916881],
|
|
[0.04031924, -0.01961045, -0.65174036],
|
|
[0.06172484, -0.06597366, -0.1244497]])
|
|
x_loadings_sign_flip = pls_ca.x_loadings_ / x_loadings
|
|
|
|
y_weights = np.array(
|
|
[[0.66101097, 0.18672553, 0.22826092],
|
|
[0.69347861, 0.18463471, -0.23995597],
|
|
[0.14462724, -0.66504085, 0.17082434],
|
|
[0.22247955, -0.6932605, -0.09832993],
|
|
[0.07035859, 0.00714283, 0.67810124],
|
|
[0.07765351, -0.0105204, -0.44108074],
|
|
[-0.00917056, 0.04322147, 0.10062478],
|
|
[-0.01909512, 0.06182718, 0.28830475],
|
|
[0.01756709, 0.04797666, 0.32225745]])
|
|
y_weights_sign_flip = pls_ca.y_weights_ / y_weights
|
|
|
|
y_loadings = np.array(
|
|
[[0.68568625, 0.1674376, 0.0969508],
|
|
[0.68782064, 0.20375837, -0.1164448],
|
|
[0.11712173, -0.68046903, 0.12001505],
|
|
[0.17860457, -0.6798319, -0.05089681],
|
|
[0.06265739, -0.0277703, 0.74729584],
|
|
[0.0914178, 0.00403751, -0.5135078],
|
|
[-0.02196918, -0.01377169, 0.09564505],
|
|
[-0.03288952, 0.09039729, 0.31858973],
|
|
[0.04287624, 0.05254676, 0.27836841]])
|
|
y_loadings_sign_flip = pls_ca.y_loadings_ / y_loadings
|
|
|
|
assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
|
|
assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
|
|
|
|
assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
|
|
assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)
|
|
|
|
# Orthogonality of weights
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~
|
|
check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
|
|
check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")
|
|
|
|
# Orthogonality of latent scores
|
|
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
|
check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
|
|
check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")
|
|
|
|
# 4) Another "Non regression test" of PLS Regression (PLS2):
|
|
# Checking behavior when the first column of Y is constant
|
|
# ===============================================
|
|
# The results were compared against a modified version of plsreg2
|
|
# from the R-package plsdepot
|
|
X = d.data
|
|
Y = d.target
|
|
Y[:, 0] = 1
|
|
pls_2 = pls_.PLSRegression(n_components=X.shape[1])
|
|
pls_2.fit(X, Y)
|
|
|
|
x_weights = np.array(
|
|
[[-0.6273573, 0.007081799, 0.7786994],
|
|
[-0.7493417, -0.277612681, -0.6011807],
|
|
[-0.2119194, 0.960666981, -0.1794690]])
|
|
x_weights_sign_flip = pls_2.x_weights_ / x_weights
|
|
|
|
x_loadings = np.array(
|
|
[[-0.6273512, -0.22464538, 0.7786994],
|
|
[-0.6643156, -0.09871193, -0.6011807],
|
|
[-0.5125877, 1.01407380, -0.1794690]])
|
|
x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings
|
|
|
|
y_loadings = np.array(
|
|
[[0.0000000, 0.0000000, 0.0000000],
|
|
[0.4357300, 0.5828479, 0.2174802],
|
|
[-0.1353739, -0.2486423, -0.1810386]])
|
|
|
|
# R/python sign flip should be the same in x_weight and x_rotation
|
|
assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
|
|
|
|
# This test that R / python give the same result up to column
|
|
# sign indeterminacy
|
|
assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
|
|
assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
|
|
|
|
# For the PLSRegression with default parameters, it holds that
|
|
# y_loadings==y_weights. In this case we only test that R/python
|
|
# give the same result for the y_loadings irrespective of the sign
|
|
assert_array_almost_equal(np.abs(pls_2.y_loadings_), np.abs(y_loadings), 4)
|
|
|
|
|
|
def test_convergence_fail():
|
|
d = load_linnerud()
|
|
X = d.data
|
|
Y = d.target
|
|
pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1],
|
|
max_iter=2, tol=1e-10)
|
|
assert_warns(ConvergenceWarning, pls_bynipals.fit, X, Y)
|
|
|
|
|
|
def test_PLSSVD():
|
|
# Let's check the PLSSVD doesn't return all possible component but just
|
|
# the specified number
|
|
d = load_linnerud()
|
|
X = d.data
|
|
Y = d.target
|
|
n_components = 2
|
|
for clf in [pls_.PLSSVD, pls_.PLSRegression, pls_.PLSCanonical]:
|
|
pls = clf(n_components=n_components)
|
|
pls.fit(X, Y)
|
|
assert n_components == pls.y_scores_.shape[1]
|
|
|
|
|
|
def test_univariate_pls_regression():
|
|
# Ensure 1d Y is correctly interpreted
|
|
d = load_linnerud()
|
|
X = d.data
|
|
Y = d.target
|
|
|
|
clf = pls_.PLSRegression()
|
|
# Compare 1d to column vector
|
|
model1 = clf.fit(X, Y[:, 0]).coef_
|
|
model2 = clf.fit(X, Y[:, :1]).coef_
|
|
assert_array_almost_equal(model1, model2)
|
|
|
|
|
|
def test_predict_transform_copy():
|
|
# check that the "copy" keyword works
|
|
d = load_linnerud()
|
|
X = d.data
|
|
Y = d.target
|
|
clf = pls_.PLSCanonical()
|
|
X_copy = X.copy()
|
|
Y_copy = Y.copy()
|
|
clf.fit(X, Y)
|
|
# check that results are identical with copy
|
|
assert_array_almost_equal(clf.predict(X), clf.predict(X.copy(), copy=False))
|
|
assert_array_almost_equal(clf.transform(X), clf.transform(X.copy(), copy=False))
|
|
|
|
# check also if passing Y
|
|
assert_array_almost_equal(clf.transform(X, Y),
|
|
clf.transform(X.copy(), Y.copy(), copy=False))
|
|
# check that copy doesn't destroy
|
|
# we do want to check exact equality here
|
|
assert_array_equal(X_copy, X)
|
|
assert_array_equal(Y_copy, Y)
|
|
# also check that mean wasn't zero before (to make sure we didn't touch it)
|
|
assert np.all(X.mean(axis=0) != 0)
|
|
|
|
|
|
def test_scale_and_stability():
|
|
# We test scale=True parameter
|
|
# This allows to check numerical stability over platforms as well
|
|
|
|
d = load_linnerud()
|
|
X1 = d.data
|
|
Y1 = d.target
|
|
# causes X[:, -1].std() to be zero
|
|
X1[:, -1] = 1.0
|
|
|
|
# From bug #2821
|
|
# Test with X2, T2 s.t. clf.x_score[:, 1] == 0, clf.y_score[:, 1] == 0
|
|
# This test robustness of algorithm when dealing with value close to 0
|
|
X2 = np.array([[0., 0., 1.],
|
|
[1., 0., 0.],
|
|
[2., 2., 2.],
|
|
[3., 5., 4.]])
|
|
Y2 = np.array([[0.1, -0.2],
|
|
[0.9, 1.1],
|
|
[6.2, 5.9],
|
|
[11.9, 12.3]])
|
|
|
|
for (X, Y) in [(X1, Y1), (X2, Y2)]:
|
|
X_std = X.std(axis=0, ddof=1)
|
|
X_std[X_std == 0] = 1
|
|
Y_std = Y.std(axis=0, ddof=1)
|
|
Y_std[Y_std == 0] = 1
|
|
|
|
X_s = (X - X.mean(axis=0)) / X_std
|
|
Y_s = (Y - Y.mean(axis=0)) / Y_std
|
|
|
|
for clf in [CCA(), pls_.PLSCanonical(), pls_.PLSRegression(),
|
|
pls_.PLSSVD()]:
|
|
clf.set_params(scale=True)
|
|
X_score, Y_score = clf.fit_transform(X, Y)
|
|
clf.set_params(scale=False)
|
|
X_s_score, Y_s_score = clf.fit_transform(X_s, Y_s)
|
|
assert_array_almost_equal(X_s_score, X_score)
|
|
assert_array_almost_equal(Y_s_score, Y_score)
|
|
# Scaling should be idempotent
|
|
clf.set_params(scale=True)
|
|
X_score, Y_score = clf.fit_transform(X_s, Y_s)
|
|
assert_array_almost_equal(X_s_score, X_score)
|
|
assert_array_almost_equal(Y_s_score, Y_score)
|
|
|
|
|
|
def test_pls_errors():
|
|
d = load_linnerud()
|
|
X = d.data
|
|
Y = d.target
|
|
for clf in [pls_.PLSCanonical(), pls_.PLSRegression(),
|
|
pls_.PLSSVD()]:
|
|
clf.n_components = 4
|
|
assert_raise_message(ValueError, "Invalid number of components",
|
|
clf.fit, X, Y)
|
|
|
|
|
|
def test_pls_scaling():
|
|
# sanity check for scale=True
|
|
n_samples = 1000
|
|
n_targets = 5
|
|
n_features = 10
|
|
|
|
rng = check_random_state(0)
|
|
|
|
Q = rng.randn(n_targets, n_features)
|
|
Y = rng.randn(n_samples, n_targets)
|
|
X = np.dot(Y, Q) + 2 * rng.randn(n_samples, n_features) + 1
|
|
X *= 1000
|
|
X_scaled = StandardScaler().fit_transform(X)
|
|
|
|
pls = pls_.PLSRegression(n_components=5, scale=True)
|
|
|
|
pls.fit(X, Y)
|
|
score = pls.score(X, Y)
|
|
|
|
pls.fit(X_scaled, Y)
|
|
score_scaled = pls.score(X_scaled, Y)
|
|
|
|
assert_approx_equal(score, score_scaled)
|