Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

@@ -0,0 +1,523 @@
import pytest
import numpy as np
import itertools
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import check_array
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import ignore_warnings
from sklearn.utils._testing import TempMemmap
from sklearn.decomposition import DictionaryLearning
from sklearn.decomposition import MiniBatchDictionaryLearning
from sklearn.decomposition import SparseCoder
from sklearn.decomposition import dict_learning
from sklearn.decomposition import dict_learning_online
from sklearn.decomposition import sparse_encode
rng_global = np.random.RandomState(0)
n_samples, n_features = 10, 8
X = rng_global.randn(n_samples, n_features)
def test_sparse_encode_shapes_omp():
rng = np.random.RandomState(0)
algorithms = ['omp', 'lasso_lars', 'lasso_cd', 'lars', 'threshold']
for n_components, n_samples in itertools.product([1, 5], [1, 9]):
X_ = rng.randn(n_samples, n_features)
dictionary = rng.randn(n_components, n_features)
for algorithm, n_jobs in itertools.product(algorithms, [1, 3]):
code = sparse_encode(X_, dictionary, algorithm=algorithm,
n_jobs=n_jobs)
assert code.shape == (n_samples, n_components)
def test_dict_learning_shapes():
n_components = 5
dico = DictionaryLearning(n_components, random_state=0).fit(X)
assert dico.components_.shape == (n_components, n_features)
n_components = 1
dico = DictionaryLearning(n_components, random_state=0).fit(X)
assert dico.components_.shape == (n_components, n_features)
assert dico.transform(X).shape == (X.shape[0], n_components)
def test_dict_learning_overcomplete():
n_components = 12
dico = DictionaryLearning(n_components, random_state=0).fit(X)
assert dico.components_.shape == (n_components, n_features)
def test_max_iter():
def ricker_function(resolution, center, width):
"""Discrete sub-sampled Ricker (Mexican hat) wavelet"""
x = np.linspace(0, resolution - 1, resolution)
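# Ricker (Mexican hat) wavelet centred at `center` with width a = `width`:
#   psi(t) = (2 / (sqrt(3 * a) * pi ** 0.25)) * (1 - (t / a) ** 2)
#            * exp(-t ** 2 / (2 * a ** 2)), evaluated at t = x - center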
x = ((2 / (np.sqrt(3 * width) * np.pi ** .25))
* (1 - (x - center) ** 2 / width ** 2)
* np.exp(-(x - center) ** 2 / (2 * width ** 2)))
return x
def ricker_matrix(width, resolution, n_components):
"""Dictionary of Ricker (Mexican hat) wavelets"""
centers = np.linspace(0, resolution - 1, n_components)
D = np.empty((n_components, resolution))
for i, center in enumerate(centers):
D[i] = ricker_function(resolution, center, width)
D /= np.sqrt(np.sum(D ** 2, axis=1))[:, np.newaxis]
return D
transform_algorithm = 'lasso_cd'
resolution = 1024
subsampling = 3 # subsampling factor
n_components = resolution // subsampling
# Compute a wavelet dictionary
D_multi = np.r_[tuple(ricker_matrix(width=w, resolution=resolution,
n_components=n_components // 5)
for w in (10, 50, 100, 500, 1000))]
X = np.linspace(0, resolution - 1, resolution)
first_quarter = X < resolution / 4
X[first_quarter] = 3.
X[np.logical_not(first_quarter)] = -1.
X = X.reshape(1, -1)
# check that the underlying model fails to converge
with pytest.warns(ConvergenceWarning):
model = SparseCoder(D_multi, transform_algorithm=transform_algorithm,
transform_max_iter=1)
model.fit_transform(X)
# check that the underlying model converges w/o warnings
with pytest.warns(None) as record:
model = SparseCoder(D_multi, transform_algorithm=transform_algorithm,
transform_max_iter=2000)
model.fit_transform(X)
assert not record.list
def test_dict_learning_lars_positive_parameter():
n_components = 5
alpha = 1
err_msg = "Positive constraint not supported for 'lars' coding method."
with pytest.raises(ValueError, match=err_msg):
dict_learning(X, n_components, alpha=alpha, positive_code=True)
@pytest.mark.parametrize("transform_algorithm", [
"lasso_lars",
"lasso_cd",
"threshold",
])
@pytest.mark.parametrize("positive_code", [False, True])
@pytest.mark.parametrize("positive_dict", [False, True])
def test_dict_learning_positivity(transform_algorithm,
positive_code,
positive_dict):
n_components = 5
dico = DictionaryLearning(
n_components, transform_algorithm=transform_algorithm, random_state=0,
positive_code=positive_code, positive_dict=positive_dict,
fit_algorithm="cd").fit(X)
code = dico.transform(X)
if positive_dict:
assert (dico.components_ >= 0).all()
else:
assert (dico.components_ < 0).any()
if positive_code:
assert (code >= 0).all()
else:
assert (code < 0).any()
@pytest.mark.parametrize("positive_dict", [False, True])
def test_dict_learning_lars_dict_positivity(positive_dict):
n_components = 5
dico = DictionaryLearning(
n_components, transform_algorithm="lars", random_state=0,
positive_dict=positive_dict, fit_algorithm="cd").fit(X)
if positive_dict:
assert (dico.components_ >= 0).all()
else:
assert (dico.components_ < 0).any()
def test_dict_learning_lars_code_positivity():
n_components = 5
dico = DictionaryLearning(
n_components, transform_algorithm="lars", random_state=0,
positive_code=True, fit_algorithm="cd").fit(X)
err_msg = "Positive constraint not supported for '{}' coding method."
err_msg = err_msg.format("lars")
with pytest.raises(ValueError, match=err_msg):
dico.transform(X)
def test_dict_learning_reconstruction():
n_components = 12
dico = DictionaryLearning(n_components, transform_algorithm='omp',
transform_alpha=0.001, random_state=0)
code = dico.fit(X).transform(X)
assert_array_almost_equal(np.dot(code, dico.components_), X)
dico.set_params(transform_algorithm='lasso_lars')
code = dico.transform(X)
assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2)
# used to test lars here too, but there's no guarantee the number of
# nonzero atoms is right.
def test_dict_learning_reconstruction_parallel():
# regression test that parallel reconstruction works with n_jobs>1
n_components = 12
dico = DictionaryLearning(n_components, transform_algorithm='omp',
transform_alpha=0.001, random_state=0, n_jobs=4)
code = dico.fit(X).transform(X)
assert_array_almost_equal(np.dot(code, dico.components_), X)
dico.set_params(transform_algorithm='lasso_lars')
code = dico.transform(X)
assert_array_almost_equal(np.dot(code, dico.components_), X, decimal=2)
def test_dict_learning_lassocd_readonly_data():
n_components = 12
with TempMemmap(X) as X_read_only:
dico = DictionaryLearning(n_components, transform_algorithm='lasso_cd',
transform_alpha=0.001, random_state=0,
n_jobs=4)
with ignore_warnings(category=ConvergenceWarning):
code = dico.fit(X_read_only).transform(X_read_only)
assert_array_almost_equal(np.dot(code, dico.components_), X_read_only,
decimal=2)
def test_dict_learning_nonzero_coefs():
n_components = 4
dico = DictionaryLearning(n_components, transform_algorithm='lars',
transform_n_nonzero_coefs=3, random_state=0)
code = dico.fit(X).transform(X[np.newaxis, 1])
assert len(np.flatnonzero(code)) == 3
dico.set_params(transform_algorithm='omp')
code = dico.transform(X[np.newaxis, 1])
assert len(np.flatnonzero(code)) == 3
def test_dict_learning_unknown_fit_algorithm():
n_components = 5
dico = DictionaryLearning(n_components, fit_algorithm='<unknown>')
with pytest.raises(ValueError):
dico.fit(X)
def test_dict_learning_split():
n_components = 5
dico = DictionaryLearning(n_components, transform_algorithm='threshold',
random_state=0)
code = dico.fit(X).transform(X)
dico.split_sign = True
split_code = dico.transform(X)
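# With split_sign=True, transform() concatenates the positive part of the code
# and the magnitude of its negative part, so subtracting the second half from
# the first recovers the original signed code.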
assert_array_almost_equal(split_code[:, :n_components] -
split_code[:, n_components:], code)
def test_dict_learning_online_shapes():
rng = np.random.RandomState(0)
n_components = 8
code, dictionary = dict_learning_online(X, n_components=n_components,
alpha=1, random_state=rng)
assert code.shape == (n_samples, n_components)
assert dictionary.shape == (n_components, n_features)
assert np.dot(code, dictionary).shape == X.shape
def test_dict_learning_online_lars_positive_parameter():
alpha = 1
err_msg = "Positive constraint not supported for 'lars' coding method."
with pytest.raises(ValueError, match=err_msg):
dict_learning_online(X, alpha=alpha, positive_code=True)
@pytest.mark.parametrize("transform_algorithm", [
"lasso_lars",
"lasso_cd",
"threshold",
])
@pytest.mark.parametrize("positive_code", [False, True])
@pytest.mark.parametrize("positive_dict", [False, True])
def test_minibatch_dictionary_learning_positivity(transform_algorithm,
positive_code,
positive_dict):
n_components = 8
dico = MiniBatchDictionaryLearning(
n_components, transform_algorithm=transform_algorithm, random_state=0,
positive_code=positive_code, positive_dict=positive_dict,
fit_algorithm='cd').fit(X)
code = dico.transform(X)
if positive_dict:
assert (dico.components_ >= 0).all()
else:
assert (dico.components_ < 0).any()
if positive_code:
assert (code >= 0).all()
else:
assert (code < 0).any()
@pytest.mark.parametrize("positive_dict", [False, True])
def test_minibatch_dictionary_learning_lars(positive_dict):
n_components = 8
dico = MiniBatchDictionaryLearning(
n_components, transform_algorithm="lars", random_state=0,
positive_dict=positive_dict, fit_algorithm='cd').fit(X)
if positive_dict:
assert (dico.components_ >= 0).all()
else:
assert (dico.components_ < 0).any()
@pytest.mark.parametrize("positive_code", [False, True])
@pytest.mark.parametrize("positive_dict", [False, True])
def test_dict_learning_online_positivity(positive_code,
positive_dict):
rng = np.random.RandomState(0)
n_components = 8
code, dictionary = dict_learning_online(X, n_components=n_components,
method="cd",
alpha=1, random_state=rng,
positive_dict=positive_dict,
positive_code=positive_code)
if positive_dict:
assert (dictionary >= 0).all()
else:
assert (dictionary < 0).any()
if positive_code:
assert (code >= 0).all()
else:
assert (code < 0).any()
def test_dict_learning_online_verbosity():
n_components = 5
# test verbosity
from io import StringIO
import sys
old_stdout = sys.stdout
try:
sys.stdout = StringIO()
dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=1,
random_state=0)
dico.fit(X)
dico = MiniBatchDictionaryLearning(n_components, n_iter=20, verbose=2,
random_state=0)
dico.fit(X)
dict_learning_online(X, n_components=n_components, alpha=1, verbose=1,
random_state=0)
dict_learning_online(X, n_components=n_components, alpha=1, verbose=2,
random_state=0)
finally:
sys.stdout = old_stdout
assert dico.components_.shape == (n_components, n_features)
def test_dict_learning_online_estimator_shapes():
n_components = 5
dico = MiniBatchDictionaryLearning(n_components, n_iter=20, random_state=0)
dico.fit(X)
assert dico.components_.shape == (n_components, n_features)
def test_dict_learning_online_overcomplete():
n_components = 12
dico = MiniBatchDictionaryLearning(n_components, n_iter=20,
random_state=0).fit(X)
assert dico.components_.shape == (n_components, n_features)
def test_dict_learning_online_initialization():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features)
dico = MiniBatchDictionaryLearning(n_components, n_iter=0,
dict_init=V, random_state=0).fit(X)
assert_array_equal(dico.components_, V)
def test_dict_learning_online_readonly_initialization():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features)
V.setflags(write=False)
MiniBatchDictionaryLearning(n_components, n_iter=1, dict_init=V,
random_state=0, shuffle=False).fit(X)
def test_dict_learning_online_partial_fit():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
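# dict1: a single online fit with batch_size=1 over 10 passes of X (shuffle
# disabled); dict2: the equivalent sequence of per-sample partial_fit calls.
# Both should converge to approximately the same dictionary.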
dict1 = MiniBatchDictionaryLearning(n_components, n_iter=10 * len(X),
batch_size=1,
alpha=1, shuffle=False, dict_init=V,
random_state=0).fit(X)
dict2 = MiniBatchDictionaryLearning(n_components, alpha=1,
n_iter=1, dict_init=V,
random_state=0)
for i in range(10):
for sample in X:
dict2.partial_fit(sample[np.newaxis, :])
assert not np.all(sparse_encode(X, dict1.components_, alpha=1) == 0)
assert_array_almost_equal(dict1.components_, dict2.components_,
decimal=2)
def test_dict_learning_iter_offset():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features)
dict1 = MiniBatchDictionaryLearning(n_components, n_iter=10,
dict_init=V, random_state=0,
shuffle=False)
dict2 = MiniBatchDictionaryLearning(n_components, n_iter=10,
dict_init=V, random_state=0,
shuffle=False)
dict1.fit(X)
for sample in X:
dict2.partial_fit(sample[np.newaxis, :])
assert dict1.iter_offset_ == dict2.iter_offset_
def test_sparse_encode_shapes():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
for algo in ('lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'):
code = sparse_encode(X, V, algorithm=algo)
assert code.shape == (n_samples, n_components)
@pytest.mark.parametrize("algo", [
'lasso_lars',
'lasso_cd',
'threshold'
])
@pytest.mark.parametrize("positive", [False, True])
def test_sparse_encode_positivity(algo, positive):
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
code = sparse_encode(X, V, algorithm=algo, positive=positive)
if positive:
assert (code >= 0).all()
else:
assert (code < 0).any()
@pytest.mark.parametrize("algo", ['lars', 'omp'])
def test_sparse_encode_unavailable_positivity(algo):
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
err_msg = "Positive constraint not supported for '{}' coding method."
err_msg = err_msg.format(algo)
with pytest.raises(ValueError, match=err_msg):
sparse_encode(X, V, algorithm=algo, positive=True)
def test_sparse_encode_input():
n_components = 100
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
Xf = check_array(X, order='F')
for algo in ('lasso_lars', 'lasso_cd', 'lars', 'omp', 'threshold'):
a = sparse_encode(X, V, algorithm=algo)
b = sparse_encode(Xf, V, algorithm=algo)
assert_array_almost_equal(a, b)
def test_sparse_encode_error():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
code = sparse_encode(X, V, alpha=0.001)
assert not np.all(code == 0)
assert np.sqrt(np.sum((np.dot(code, V) - X) ** 2)) < 0.1
def test_sparse_encode_error_default_sparsity():
rng = np.random.RandomState(0)
X = rng.randn(100, 64)
D = rng.randn(2, 64)
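# With n_nonzero_coefs=None and no alpha given, OMP should fall back to a
# default sparsity level; only the output shape is checked here.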
code = ignore_warnings(sparse_encode)(X, D, algorithm='omp',
n_nonzero_coefs=None)
assert code.shape == (100, 2)
def test_unknown_method():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
with pytest.raises(ValueError):
sparse_encode(X, V, algorithm="<unknown>")
def test_sparse_coder_estimator():
n_components = 12
rng = np.random.RandomState(0)
V = rng.randn(n_components, n_features) # random init
V /= np.sum(V ** 2, axis=1)[:, np.newaxis]
code = SparseCoder(dictionary=V, transform_algorithm='lasso_lars',
transform_alpha=0.001).transform(X)
assert not np.all(code == 0)
assert np.sqrt(np.sum((np.dot(code, V) - X) ** 2)) < 0.1
def test_sparse_coder_parallel_mmap():
# Non-regression test for:
# https://github.com/scikit-learn/scikit-learn/issues/5956
# Test that SparseCoder does not error when passing read-only
# arrays to child processes
rng = np.random.RandomState(777)
n_components, n_features = 40, 64
init_dict = rng.rand(n_components, n_features)
# Ensure that `data` is larger than 2 MB: joblib memory-maps arrays
# larger than 1 MB when dispatching them to worker processes. The factor
# 4 is the float32 itemsize in bytes.
n_samples = int(2e6) // (4 * n_features)
data = rng.rand(n_samples, n_features).astype(np.float32)
sc = SparseCoder(init_dict, transform_algorithm='omp', n_jobs=2)
sc.fit_transform(data)
def test_sparse_coder_n_features_in():
d = np.array([[1, 2, 3], [1, 2, 3]])
sc = SparseCoder(d)
assert sc.n_features_in_ == d.shape[1]

@@ -0,0 +1,85 @@
# Author: Christian Osendorfer <osendorf@gmail.com>
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
# License: BSD3
import numpy as np
import pytest
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.exceptions import ConvergenceWarning
from sklearn.decomposition import FactorAnalysis
from sklearn.utils._testing import ignore_warnings
# Ignore warnings from switching to more power iterations in randomized_svd
@ignore_warnings
def test_factor_analysis():
# Test FactorAnalysis ability to recover the data covariance structure
rng = np.random.RandomState(0)
n_samples, n_features, n_components = 20, 5, 3
# Some random settings for the generative model
W = rng.randn(n_components, n_features)
# latent variable of dim 3, 20 samples of it
h = rng.randn(n_samples, n_components)
# using gamma to model different noise variance
# per component
noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)
# generate observations
# wlog, mean is 0
X = np.dot(h, W) + noise
with pytest.raises(ValueError):
FactorAnalysis(svd_method='foo')
fa_fail = FactorAnalysis()
fa_fail.svd_method = 'foo'
with pytest.raises(ValueError):
fa_fail.fit(X)
fas = []
for method in ['randomized', 'lapack']:
fa = FactorAnalysis(n_components=n_components, svd_method=method)
fa.fit(X)
fas.append(fa)
X_t = fa.transform(X)
assert X_t.shape == (n_samples, n_components)
assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))
diff = np.diff(fa.loglike_)
assert np.all(diff > 0.), 'Log likelihood did not increase'
# Sample Covariance
scov = np.cov(X, rowvar=0., bias=1.)
# Model Covariance
mcov = fa.get_covariance()
diff = np.sum(np.abs(scov - mcov)) / W.size
assert diff < 0.1, "Mean absolute difference is %f" % diff
fa = FactorAnalysis(n_components=n_components,
noise_variance_init=np.ones(n_features))
with pytest.raises(ValueError):
fa.fit(X[:, :2])
f = lambda x, y: np.abs(getattr(x, y)) # sign will not be equal
fa1, fa2 = fas
for attr in ['loglike_', 'components_', 'noise_variance_']:
assert_almost_equal(f(fa1, attr), f(fa2, attr))
fa1.max_iter = 1
fa1.verbose = True
assert_warns(ConvergenceWarning, fa1.fit, X)
# Test get_covariance and get_precision with n_components == n_features
# with n_components < n_features and with n_components == 0
for n_components in [0, 2, X.shape[1]]:
fa.n_components = n_components
fa.fit(X)
cov = fa.get_covariance()
precision = fa.get_precision()
assert_array_almost_equal(np.dot(cov, precision),
np.eye(X.shape[1]), 12)

@@ -0,0 +1,301 @@
"""
Test the fastica algorithm.
"""
import itertools
import warnings
import pytest
import numpy as np
from scipy import stats
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_warns
from sklearn.decomposition import FastICA, fastica, PCA
from sklearn.decomposition._fastica import _gs_decorrelation
from sklearn.exceptions import ConvergenceWarning
def center_and_norm(x, axis=-1):
""" Centers and norms x **in place**
Parameters
----------
x : ndarray
Array with an axis of observations (statistical units) measured on
random variables.
axis : int, optional
Axis along which the mean and variance are calculated.
"""
x = np.rollaxis(x, axis)
x -= x.mean(axis=0)
x /= x.std(axis=0)
def test_gs():
# Test gram schmidt orthonormalization
# generate a random orthogonal matrix
rng = np.random.RandomState(0)
W, _, _ = np.linalg.svd(rng.randn(10, 10))
w = rng.randn(10)
_gs_decorrelation(w, W, 10)
assert (w ** 2).sum() < 1.e-10
w = rng.randn(10)
u = _gs_decorrelation(w, W, 5)
tmp = np.dot(u, W.T)
assert (tmp[:5] ** 2).sum() < 1.e-10
@pytest.mark.parametrize("add_noise", [True, False])
@pytest.mark.parametrize("seed", range(1))
def test_fastica_simple(add_noise, seed):
# Test the FastICA algorithm on very simple data.
rng = np.random.RandomState(seed)
# scipy.stats uses the global RNG:
n_samples = 1000
# Generate two sources:
s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1
s2 = stats.t.rvs(1, size=n_samples)
s = np.c_[s1, s2].T
center_and_norm(s)
s1, s2 = s
# Mixing angle
phi = 0.6
mixing = np.array([[np.cos(phi), np.sin(phi)],
[np.sin(phi), -np.cos(phi)]])
m = np.dot(mixing, s)
if add_noise:
m += 0.1 * rng.randn(2, 1000)
center_and_norm(m)
# function as fun arg
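# A callable contrast function must return g(x) and the average of g'(x)
# over the last axis; here g(x) = x ** 3 and g'(x) = 3 * x ** 2.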
def g_test(x):
return x ** 3, (3 * x ** 2).mean(axis=-1)
algos = ['parallel', 'deflation']
nls = ['logcosh', 'exp', 'cube', g_test]
whitening = [True, False]
for algo, nl, whiten in itertools.product(algos, nls, whitening):
if whiten:
k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo,
random_state=rng)
with pytest.raises(ValueError):
fastica(m.T, fun=np.tanh, algorithm=algo)
else:
pca = PCA(n_components=2, whiten=True, random_state=rng)
X = pca.fit_transform(m.T)
k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False,
random_state=rng)
with pytest.raises(ValueError):
fastica(X, fun=np.tanh, algorithm=algo)
s_ = s_.T
# Check that the mixing model described in the docstring holds:
if whiten:
assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))
center_and_norm(s_)
s1_, s2_ = s_
# Check to see if the sources have been estimated
# in the wrong order
if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
s2_, s1_ = s_
s1_ *= np.sign(np.dot(s1_, s1))
s2_ *= np.sign(np.dot(s2_, s2))
# Check that we have estimated the original sources
if not add_noise:
assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2)
assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2)
else:
assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1)
assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1)
# Test FastICA class
_, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo,
random_state=seed)
ica = FastICA(fun=nl, algorithm=algo, random_state=seed)
sources = ica.fit_transform(m.T)
assert ica.components_.shape == (2, 2)
assert sources.shape == (1000, 2)
assert_array_almost_equal(sources_fun, sources)
assert_array_almost_equal(sources, ica.transform(m.T))
assert ica.mixing_.shape == (2, 2)
for fn in [np.tanh, "exp(-.5(x^2))"]:
ica = FastICA(fun=fn, algorithm=algo)
with pytest.raises(ValueError):
ica.fit(m.T)
with pytest.raises(TypeError):
FastICA(fun=range(10)).fit(m.T)
def test_fastica_nowhiten():
m = [[0, 1], [1, 0]]
# test for issue #697
ica = FastICA(n_components=1, whiten=False, random_state=0)
assert_warns(UserWarning, ica.fit, m)
assert hasattr(ica, 'mixing_')
def test_fastica_convergence_fail():
# Test the FastICA algorithm on very simple data
# (see test_non_square_fastica).
# Ensure a ConvergenceWarning is raised if the tolerance is sufficiently low.
rng = np.random.RandomState(0)
n_samples = 1000
# Generate two sources:
t = np.linspace(0, 100, n_samples)
s1 = np.sin(t)
s2 = np.ceil(np.sin(np.pi * t))
s = np.c_[s1, s2].T
center_and_norm(s)
# Mixing matrix
mixing = rng.randn(6, 2)
m = np.dot(mixing, s)
# Run FastICA with tol=0. so that convergence cannot be reached
ica = FastICA(algorithm="parallel", n_components=2, random_state=rng,
max_iter=2, tol=0.)
assert_warns(ConvergenceWarning, ica.fit, m.T)
@pytest.mark.parametrize('add_noise', [True, False])
def test_non_square_fastica(add_noise):
# Test the FastICA algorithm on very simple data.
rng = np.random.RandomState(0)
n_samples = 1000
# Generate two sources:
t = np.linspace(0, 100, n_samples)
s1 = np.sin(t)
s2 = np.ceil(np.sin(np.pi * t))
s = np.c_[s1, s2].T
center_and_norm(s)
s1, s2 = s
# Mixing matrix
mixing = rng.randn(6, 2)
m = np.dot(mixing, s)
if add_noise:
m += 0.1 * rng.randn(6, n_samples)
center_and_norm(m)
k_, mixing_, s_ = fastica(m.T, n_components=2, random_state=rng)
s_ = s_.T
# Check that the mixing model described in the docstring holds:
assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m))
center_and_norm(s_)
s1_, s2_ = s_
# Check to see if the sources have been estimated
# in the wrong order
if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)):
s2_, s1_ = s_
s1_ *= np.sign(np.dot(s1_, s1))
s2_ *= np.sign(np.dot(s2_, s2))
# Check that we have estimated the original sources
if not add_noise:
assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=3)
assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=3)
def test_fit_transform():
# Test FastICA.fit_transform
rng = np.random.RandomState(0)
X = rng.random_sample((100, 10))
for whiten, n_components in [[True, 5], [False, None]]:
n_components_ = (n_components if n_components is not None else
X.shape[1])
ica = FastICA(n_components=n_components, whiten=whiten, random_state=0)
Xt = ica.fit_transform(X)
assert ica.components_.shape == (n_components_, 10)
assert Xt.shape == (100, n_components_)
ica = FastICA(n_components=n_components, whiten=whiten, random_state=0)
ica.fit(X)
assert ica.components_.shape == (n_components_, 10)
Xt2 = ica.transform(X)
assert_array_almost_equal(Xt, Xt2)
def test_inverse_transform():
# Test FastICA.inverse_transform
n_features = 10
n_samples = 100
n1, n2 = 5, 10
rng = np.random.RandomState(0)
X = rng.random_sample((n_samples, n_features))
expected = {(True, n1): (n_features, n1),
(True, n2): (n_features, n2),
(False, n1): (n_features, n2),
(False, n2): (n_features, n2)}
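# When whiten=False, n_components is ignored (a "n_components ignored" warning
# is raised) and the mixing matrix is always square, (n_features, n_features);
# only the whitened cases depend on n_components.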
for whiten in [True, False]:
for n_components in [n1, n2]:
n_components_ = (n_components if n_components is not None else
X.shape[1])
ica = FastICA(n_components=n_components, random_state=rng,
whiten=whiten)
with warnings.catch_warnings(record=True):
# catch "n_components ignored" warning
Xt = ica.fit_transform(X)
expected_shape = expected[(whiten, n_components_)]
assert ica.mixing_.shape == expected_shape
X2 = ica.inverse_transform(Xt)
assert X.shape == X2.shape
# reversibility test in non-reduction case
if n_components == X.shape[1]:
assert_array_almost_equal(X, X2)
def test_fastica_errors():
n_features = 3
n_samples = 10
rng = np.random.RandomState(0)
X = rng.random_sample((n_samples, n_features))
w_init = rng.randn(n_features + 1, n_features + 1)
with pytest.raises(ValueError, match='max_iter should be greater than 1'):
FastICA(max_iter=0)
with pytest.raises(ValueError, match=r'alpha must be in \[1,2\]'):
fastica(X, fun_args={'alpha': 0})
with pytest.raises(ValueError, match='w_init has invalid shape.+'
r'should be \(3L?, 3L?\)'):
fastica(X, w_init=w_init)
with pytest.raises(ValueError, match='Invalid algorithm.+must '
'be.+parallel.+or.+deflation'):
fastica(X, algorithm='pizza')
@pytest.mark.parametrize('whiten', [True, False])
@pytest.mark.parametrize('return_X_mean', [True, False])
@pytest.mark.parametrize('return_n_iter', [True, False])
def test_fastica_output_shape(whiten, return_X_mean, return_n_iter):
n_features = 3
n_samples = 10
rng = np.random.RandomState(0)
X = rng.random_sample((n_samples, n_features))
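# fastica always returns (K, W, S); X_mean and n_iter are appended to the
# tuple only when requested, hence the expected output length below.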
expected_len = 3 + return_X_mean + return_n_iter
out = fastica(X, whiten=whiten, return_n_iter=return_n_iter,
return_X_mean=return_X_mean)
assert len(out) == expected_len
if not whiten:
assert out[0] is None

@@ -0,0 +1,401 @@
"""Tests for Incremental PCA."""
import numpy as np
import pytest
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_allclose_dense_sparse
from sklearn import datasets
from sklearn.decomposition import PCA, IncrementalPCA
from scipy import sparse
iris = datasets.load_iris()
def test_incremental_pca():
# Incremental PCA on dense arrays.
X = iris.data
batch_size = X.shape[0] // 3
ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
pca = PCA(n_components=2)
pca.fit_transform(X)
X_transformed = ipca.fit_transform(X)
assert X_transformed.shape == (X.shape[0], 2)
np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(),
pca.explained_variance_ratio_.sum(), rtol=1e-3)
for n_components in [1, 2, X.shape[1]]:
ipca = IncrementalPCA(n_components, batch_size=batch_size)
ipca.fit(X)
cov = ipca.get_covariance()
precision = ipca.get_precision()
np.testing.assert_allclose(np.dot(cov, precision),
np.eye(X.shape[1]), atol=1e-13)
@pytest.mark.parametrize(
"matrix_class",
[sparse.csc_matrix, sparse.csr_matrix, sparse.lil_matrix])
def test_incremental_pca_sparse(matrix_class):
# Incremental PCA on sparse arrays.
X = iris.data
pca = PCA(n_components=2)
pca.fit_transform(X)
X_sparse = matrix_class(X)
batch_size = X_sparse.shape[0] // 3
ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
X_transformed = ipca.fit_transform(X_sparse)
assert X_transformed.shape == (X_sparse.shape[0], 2)
np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(),
pca.explained_variance_ratio_.sum(), rtol=1e-3)
for n_components in [1, 2, X.shape[1]]:
ipca = IncrementalPCA(n_components, batch_size=batch_size)
ipca.fit(X_sparse)
cov = ipca.get_covariance()
precision = ipca.get_precision()
np.testing.assert_allclose(np.dot(cov, precision),
np.eye(X_sparse.shape[1]), atol=1e-13)
with pytest.raises(
TypeError,
match="IncrementalPCA.partial_fit does not support "
"sparse input. Either convert data to dense "
"or use IncrementalPCA.fit to do so in batches."):
ipca.partial_fit(X_sparse)
def test_incremental_pca_check_projection():
# Test that the projection of data is correct.
rng = np.random.RandomState(1999)
n, p = 100, 3
X = rng.randn(n, p) * .1
X[:10] += np.array([3, 4, 5])
Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])
# Get the reconstruction of the generated data X
# Note that Xt has the same "components" as X, just separated
# This is what we want to ensure is recreated correctly
Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt)
# Normalize
Yt /= np.sqrt((Yt ** 2).sum())
# Make sure that the first element of Yt is ~1, this means
# the reconstruction worked as expected
assert_almost_equal(np.abs(Yt[0][0]), 1., 1)
def test_incremental_pca_inverse():
# Test that the projection of data can be inverted.
rng = np.random.RandomState(1999)
n, p = 50, 3
X = rng.randn(n, p) # spherical data
X[:, 1] *= .00001 # make middle component relatively small
X += [5, 4, 3] # make a large mean
# same check that we can find the original data from the transformed
# signal (since the data is almost of rank n_components)
ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
Y = ipca.transform(X)
Y_inverse = ipca.inverse_transform(Y)
assert_almost_equal(X, Y_inverse, decimal=3)
def test_incremental_pca_validation():
# Test that n_components is >=1 and <= n_features.
X = np.array([[0, 1, 0], [1, 0, 0]])
n_samples, n_features = X.shape
for n_components in [-1, 0, .99, 4]:
with pytest.raises(ValueError, match="n_components={} invalid"
" for n_features={}, need more rows than"
" columns for IncrementalPCA"
" processing".format(n_components,
n_features)):
IncrementalPCA(n_components, batch_size=10).fit(X)
# Tests that n_components is also <= n_samples.
n_components = 3
with pytest.raises(ValueError, match="n_components={} must be"
" less or equal to the batch number of"
" samples {}".format(n_components, n_samples)):
IncrementalPCA(n_components=n_components).partial_fit(X)
def test_n_components_none():
# Ensures that n_components == None is handled correctly
rng = np.random.RandomState(1999)
for n_samples, n_features in [(50, 10), (10, 50)]:
X = rng.rand(n_samples, n_features)
ipca = IncrementalPCA(n_components=None)
# First partial_fit call, ipca.n_components_ is inferred from
# min(X.shape)
ipca.partial_fit(X)
assert ipca.n_components_ == min(X.shape)
# Second partial_fit call, ipca.n_components_ is inferred from
# ipca.components_ computed from the first partial_fit call
ipca.partial_fit(X)
assert ipca.n_components_ == ipca.components_.shape[0]
def test_incremental_pca_set_params():
# Test that changing n_components via set_params after fitting raises an error.
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 20
X = rng.randn(n_samples, n_features)
X2 = rng.randn(n_samples, n_features)
X3 = rng.randn(n_samples, n_features)
ipca = IncrementalPCA(n_components=20)
ipca.fit(X)
# Decreasing number of components
ipca.set_params(n_components=10)
with pytest.raises(ValueError):
ipca.partial_fit(X2)
# Increasing number of components
ipca.set_params(n_components=15)
with pytest.raises(ValueError):
ipca.partial_fit(X3)
# Returning to original setting
ipca.set_params(n_components=20)
ipca.partial_fit(X)
def test_incremental_pca_num_features_change():
# Test that changing the number of features between calls raises an error.
rng = np.random.RandomState(1999)
n_samples = 100
X = rng.randn(n_samples, 20)
X2 = rng.randn(n_samples, 50)
ipca = IncrementalPCA(n_components=None)
ipca.fit(X)
with pytest.raises(ValueError):
ipca.partial_fit(X2)
def test_incremental_pca_batch_signs():
# Test that components_ sign is stable over batch sizes.
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 3
X = rng.randn(n_samples, n_features)
all_components = []
batch_sizes = np.arange(10, 20)
for batch_size in batch_sizes:
ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
all_components.append(ipca.components_)
for i, j in zip(all_components[:-1], all_components[1:]):
assert_almost_equal(np.sign(i), np.sign(j), decimal=6)
def test_incremental_pca_batch_values():
# Test that components_ values are stable over batch sizes.
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 3
X = rng.randn(n_samples, n_features)
all_components = []
batch_sizes = np.arange(20, 40, 3)
for batch_size in batch_sizes:
ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
all_components.append(ipca.components_)
for i, j in zip(all_components[:-1], all_components[1:]):
assert_almost_equal(i, j, decimal=1)
def test_incremental_pca_batch_rank():
# Test that components_ are stable when the sample size in each batch is
# larger than or equal to n_components
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 20
X = rng.randn(n_samples, n_features)
all_components = []
batch_sizes = np.arange(20, 90, 3)
for batch_size in batch_sizes:
ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X)
all_components.append(ipca.components_)
for components_i, components_j in zip(all_components[:-1],
all_components[1:]):
assert_allclose_dense_sparse(components_i, components_j)
def test_incremental_pca_partial_fit():
# Test that fit and partial_fit get equivalent results.
rng = np.random.RandomState(1999)
n, p = 50, 3
X = rng.randn(n, p) # spherical data
X[:, 1] *= .00001 # make middle component relatively small
X += [5, 4, 3] # make a large mean
# same check that we can find the original data from the transformed
# signal (since the data is almost of rank n_components)
batch_size = 10
ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
# Add one to make sure endpoint is included
batch_itr = np.arange(0, n + 1, batch_size)
for i, j in zip(batch_itr[:-1], batch_itr[1:]):
pipca.partial_fit(X[i:j, :])
assert_almost_equal(ipca.components_, pipca.components_, decimal=3)
def test_incremental_pca_against_pca_iris():
# Test that IncrementalPCA and PCA are approximate (to a sign flip).
X = iris.data
Y_pca = PCA(n_components=2).fit_transform(X)
Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X)
assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def test_incremental_pca_against_pca_random_data():
# Test that IncrementalPCA and PCA are approximate (to a sign flip).
rng = np.random.RandomState(1999)
n_samples = 100
n_features = 3
X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)
Y_pca = PCA(n_components=3).fit_transform(X)
Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X)
assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)
def test_explained_variances():
# Test that PCA and IncrementalPCA calculations match
X = datasets.make_low_rank_matrix(1000, 100, tail_strength=0.,
effective_rank=10, random_state=1999)
prec = 3
n_samples, n_features = X.shape
for nc in [None, 99]:
pca = PCA(n_components=nc).fit(X)
ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X)
assert_almost_equal(pca.explained_variance_, ipca.explained_variance_,
decimal=prec)
assert_almost_equal(pca.explained_variance_ratio_,
ipca.explained_variance_ratio_, decimal=prec)
assert_almost_equal(pca.noise_variance_, ipca.noise_variance_,
decimal=prec)
def test_singular_values():
# Check that the IncrementalPCA output has the correct singular values
rng = np.random.RandomState(0)
n_samples = 1000
n_features = 100
X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
effective_rank=10, random_state=rng)
pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)
# Compare to the Frobenius norm
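# For exact PCA the scores equal U * diag(singular_values_), so the sum of the
# squared singular values equals the squared Frobenius norm of the scores.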
X_pca = pca.transform(X)
X_ipca = ipca.transform(X)
assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
np.linalg.norm(X_pca, "fro")**2.0, 12)
assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
np.linalg.norm(X_ipca, "fro")**2.0, 2)
# Compare to the 2-norms of the score vectors
assert_array_almost_equal(pca.singular_values_,
np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
assert_array_almost_equal(ipca.singular_values_,
np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)
# Set the singular values and see what we get back
rng = np.random.RandomState(0)
n_samples = 100
n_features = 110
X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
effective_rank=3, random_state=rng)
pca = PCA(n_components=3, svd_solver='full', random_state=rng)
ipca = IncrementalPCA(n_components=3, batch_size=100)
X_pca = pca.fit_transform(X)
X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
X_pca[:, 0] *= 3.142
X_pca[:, 1] *= 2.718
X_hat = np.dot(X_pca, pca.components_)
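# X_pca now has orthogonal columns with norms [3.142, 2.718, 1.0] and
# pca.components_ has orthonormal rows, so X_hat = X_pca @ components_ is an
# SVD by construction with exactly those singular values.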
pca.fit(X_hat)
ipca.fit(X_hat)
assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
def test_whitening():
# Test that PCA and IncrementalPCA transforms match to sign flip.
X = datasets.make_low_rank_matrix(1000, 10, tail_strength=0.,
effective_rank=2, random_state=1999)
prec = 3
n_samples, n_features = X.shape
for nc in [None, 9]:
pca = PCA(whiten=True, n_components=nc).fit(X)
ipca = IncrementalPCA(whiten=True, n_components=nc,
batch_size=250).fit(X)
Xt_pca = pca.transform(X)
Xt_ipca = ipca.transform(X)
assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)
Xinv_ipca = ipca.inverse_transform(Xt_ipca)
Xinv_pca = pca.inverse_transform(Xt_pca)
assert_almost_equal(X, Xinv_ipca, decimal=prec)
assert_almost_equal(X, Xinv_pca, decimal=prec)
assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
def test_incremental_pca_partial_fit_float_division():
# Test to ensure float division is used in all versions of Python
# (non-regression test for issue #9489)
rng = np.random.RandomState(0)
A = rng.randn(5, 3) + 2
B = rng.randn(7, 3) + 5
pca = IncrementalPCA(n_components=2)
pca.partial_fit(A)
# Set n_samples_seen_ to be a floating point number instead of an int
pca.n_samples_seen_ = float(pca.n_samples_seen_)
pca.partial_fit(B)
singular_vals_float_samples_seen = pca.singular_values_
pca2 = IncrementalPCA(n_components=2)
pca2.partial_fit(A)
pca2.partial_fit(B)
singular_vals_int_samples_seen = pca2.singular_values_
np.testing.assert_allclose(singular_vals_float_samples_seen,
singular_vals_int_samples_seen)
def test_incremental_pca_fit_overflow_error():
# Test for overflow error on Windows OS
# (non-regression test for issue #17693)
rng = np.random.RandomState(0)
A = rng.rand(500000, 2)
ipca = IncrementalPCA(n_components=2, batch_size=10000)
ipca.fit(A)
pca = PCA(n_components=2)
pca.fit(A)
np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_)

@@ -0,0 +1,297 @@
import numpy as np
import scipy.sparse as sp
import pytest
from sklearn.utils._testing import (assert_array_almost_equal,
assert_allclose)
from sklearn.decomposition import PCA, KernelPCA
from sklearn.datasets import make_circles
from sklearn.datasets import make_blobs
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.utils.validation import _check_psd_eigenvalues
def test_kernel_pca():
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
def histogram(x, y, **kwargs):
# Histogram kernel implemented as a callable.
assert kwargs == {} # no kernel_params that we didn't ask for
return np.minimum(x, y).sum()
for eigen_solver in ("auto", "dense", "arpack"):
for kernel in ("linear", "rbf", "poly", histogram):
# histogram kernel produces singular matrix inside linalg.solve
# XXX use a least-squares approximation?
inv = not callable(kernel)
# transform fit data
kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
fit_inverse_transform=inv)
X_fit_transformed = kpca.fit_transform(X_fit)
X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
assert_array_almost_equal(np.abs(X_fit_transformed),
np.abs(X_fit_transformed2))
# non-regression test: previously, gamma would be 0 by default,
# forcing all eigenvalues to 0 under the poly kernel
assert X_fit_transformed.size != 0
# transform new data
X_pred_transformed = kpca.transform(X_pred)
assert (X_pred_transformed.shape[1] ==
X_fit_transformed.shape[1])
# inverse transform
if inv:
X_pred2 = kpca.inverse_transform(X_pred_transformed)
assert X_pred2.shape == X_pred.shape
def test_kernel_pca_invalid_parameters():
with pytest.raises(ValueError):
KernelPCA(10, fit_inverse_transform=True, kernel='precomputed')
def test_kernel_pca_consistent_transform():
# X_fit_ needs to retain the old, unmodified copy of X
state = np.random.RandomState(0)
X = state.rand(10, 10)
kpca = KernelPCA(random_state=state).fit(X)
transformed1 = kpca.transform(X)
X_copy = X.copy()
X[:, 0] = 666
transformed2 = kpca.transform(X_copy)
assert_array_almost_equal(transformed1, transformed2)
def test_kernel_pca_deterministic_output():
rng = np.random.RandomState(0)
X = rng.rand(10, 10)
eigen_solver = ('arpack', 'dense')
for solver in eigen_solver:
transformed_X = np.zeros((20, 2))
for i in range(20):
kpca = KernelPCA(n_components=2, eigen_solver=solver,
random_state=rng)
transformed_X[i, :] = kpca.fit_transform(X)[0]
assert_allclose(
transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))
def test_kernel_pca_sparse():
rng = np.random.RandomState(0)
X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
X_pred = sp.csr_matrix(rng.random_sample((2, 4)))
for eigen_solver in ("auto", "arpack"):
for kernel in ("linear", "rbf", "poly"):
# transform fit data
kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
fit_inverse_transform=False)
X_fit_transformed = kpca.fit_transform(X_fit)
X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
assert_array_almost_equal(np.abs(X_fit_transformed),
np.abs(X_fit_transformed2))
# transform new data
X_pred_transformed = kpca.transform(X_pred)
assert (X_pred_transformed.shape[1] ==
X_fit_transformed.shape[1])
# inverse transform
# X_pred2 = kpca.inverse_transform(X_pred_transformed)
# assert X_pred2.shape == X_pred.shape)
def test_kernel_pca_linear_kernel():
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
# for a linear kernel, kernel PCA should find the same projection as PCA
# modulo the sign (direction)
# fit only the first four components: fifth is near zero eigenvalue, so
# can be trimmed due to roundoff error
assert_array_almost_equal(
np.abs(KernelPCA(4).fit(X_fit).transform(X_pred)),
np.abs(PCA(4).fit(X_fit).transform(X_pred)))
def test_kernel_pca_n_components():
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
for eigen_solver in ("dense", "arpack"):
for c in [1, 2, 4]:
kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
shape = kpca.fit(X_fit).transform(X_pred).shape
assert shape == (2, c)
def test_remove_zero_eig():
X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])
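# The three rows are numerically identical (the 1e-30 and 1e-20 offsets vanish
# in float64), so the centered kernel has only zero eigenvalues.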
# n_components=None (default) => remove_zero_eig is True
kpca = KernelPCA()
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 0)
kpca = KernelPCA(n_components=2)
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 2)
kpca = KernelPCA(n_components=2, remove_zero_eig=True)
Xt = kpca.fit_transform(X)
assert Xt.shape == (3, 0)
def test_leave_zero_eig():
"""This test checks that fit().transform() returns the same result as
fit_transform() in case of non-removed zero eigenvalue.
Non-regression test for issue #12141 (PR #12143)"""
X_fit = np.array([[1, 1], [0, 0]])
# Assert that even with all np warnings on, there is no div by zero warning
with pytest.warns(None) as record:
with np.errstate(all='warn'):
k = KernelPCA(n_components=2, remove_zero_eig=False,
eigen_solver="dense")
# Fit, then transform
A = k.fit(X_fit).transform(X_fit)
# Do both at once
B = k.fit_transform(X_fit)
# Compare
assert_array_almost_equal(np.abs(A), np.abs(B))
for w in record:
# There might be warnings about the kernel being badly conditioned,
# but there should not be warnings about division by zero.
# (Numpy division by zero warning can have many message variants, but
# at least we know that it is a RuntimeWarning, so let's check only for that)
assert not issubclass(w.category, RuntimeWarning)
def test_kernel_pca_precomputed():
rng = np.random.RandomState(0)
X_fit = rng.random_sample((5, 4))
X_pred = rng.random_sample((2, 4))
for eigen_solver in ("dense", "arpack"):
X_kpca = KernelPCA(4, eigen_solver=eigen_solver).\
fit(X_fit).transform(X_pred)
X_kpca2 = KernelPCA(
4, eigen_solver=eigen_solver, kernel='precomputed').fit(
np.dot(X_fit, X_fit.T)).transform(np.dot(X_pred, X_fit.T))
X_kpca_train = KernelPCA(
4, eigen_solver=eigen_solver,
kernel='precomputed').fit_transform(np.dot(X_fit, X_fit.T))
X_kpca_train2 = KernelPCA(
4, eigen_solver=eigen_solver, kernel='precomputed').fit(
np.dot(X_fit, X_fit.T)).transform(np.dot(X_fit, X_fit.T))
assert_array_almost_equal(np.abs(X_kpca),
np.abs(X_kpca2))
assert_array_almost_equal(np.abs(X_kpca_train),
np.abs(X_kpca_train2))
def test_kernel_pca_invalid_kernel():
rng = np.random.RandomState(0)
X_fit = rng.random_sample((2, 4))
kpca = KernelPCA(kernel="tototiti")
with pytest.raises(ValueError):
kpca.fit(X_fit)
def test_gridsearch_pipeline():
# Test if we can do a grid-search to find parameters to separate
# circles with a perceptron model.
X, y = make_circles(n_samples=400, factor=.3, noise=.05,
random_state=0)
kpca = KernelPCA(kernel="rbf", n_components=2)
pipeline = Pipeline([("kernel_pca", kpca),
("Perceptron", Perceptron(max_iter=5))])
param_grid = dict(kernel_pca__gamma=2. ** np.arange(-2, 2))
grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
grid_search.fit(X, y)
assert grid_search.best_score_ == 1
def test_gridsearch_pipeline_precomputed():
# Test if we can do a grid-search to find parameters to separate
# circles with a perceptron model using a precomputed kernel.
X, y = make_circles(n_samples=400, factor=.3, noise=.05,
random_state=0)
kpca = KernelPCA(kernel="precomputed", n_components=2)
pipeline = Pipeline([("kernel_pca", kpca),
("Perceptron", Perceptron(max_iter=5))])
param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
X_kernel = rbf_kernel(X, gamma=2.)
grid_search.fit(X_kernel, y)
assert grid_search.best_score_ == 1
def test_nested_circles():
# Test the linear separability of the first 2D KPCA transform
X, y = make_circles(n_samples=400, factor=.3, noise=.05,
random_state=0)
# 2D nested circles are not linearly separable
train_score = Perceptron(max_iter=5).fit(X, y).score(X, y)
assert train_score < 0.8
# Project the circles data into the first 2 components of a RBF Kernel
# PCA model.
# Note that the gamma value is data dependent. If this test breaks
# and the gamma value has to be updated, the Kernel PCA example will
# have to be updated too.
kpca = KernelPCA(kernel="rbf", n_components=2,
fit_inverse_transform=True, gamma=2.)
X_kpca = kpca.fit_transform(X)
# The data is perfectly linearly separable in that space
train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y)
assert train_score == 1.0
def test_kernel_conditioning():
""" Test that ``_check_psd_eigenvalues`` is correctly called
Non-regression test for issue #12140 (PR #12145)"""
# create a pathological X leading to small non-zero eigenvalue
X = [[5, 1],
[5+1e-8, 1e-8],
[5+1e-8, 0]]
kpca = KernelPCA(kernel="linear", n_components=2,
fit_inverse_transform=True)
kpca.fit(X)
# check that the small non-zero eigenvalue was correctly set to zero
assert kpca.lambdas_.min() == 0
assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
@pytest.mark.parametrize("kernel",
["linear", "poly", "rbf", "sigmoid", "cosine"])
def test_kernel_pca_inverse_transform(kernel):
X, *_ = make_blobs(n_samples=100, n_features=4, centers=[[1, 1, 1, 1]],
random_state=0)
kp = KernelPCA(n_components=2, kernel=kernel, fit_inverse_transform=True)
X_trans = kp.fit_transform(X)
X_inv = kp.inverse_transform(X_trans)
assert_allclose(X, X_inv)

@@ -0,0 +1,554 @@
import numpy as np
import scipy.sparse as sp
from scipy import linalg
from sklearn.decomposition import NMF, non_negative_factorization
from sklearn.decomposition import _nmf as nmf # For testing internals
from scipy.sparse import csc_matrix
import pytest
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import ignore_warnings
from sklearn.utils.extmath import squared_norm
from sklearn.base import clone
from sklearn.exceptions import ConvergenceWarning
@pytest.mark.parametrize('solver', ['cd', 'mu'])
def test_convergence_warning(solver):
convergence_warning = ("Maximum number of iterations 1 reached. "
"Increase it to improve convergence.")
A = np.ones((2, 2))
with pytest.warns(ConvergenceWarning, match=convergence_warning):
NMF(solver=solver, max_iter=1).fit(A)
def test_initialize_nn_output():
# Test that initialization does not return negative values
rng = np.random.mtrand.RandomState(42)
data = np.abs(rng.randn(10, 10))
for init in ('random', 'nndsvd', 'nndsvda', 'nndsvdar'):
W, H = nmf._initialize_nmf(data, 10, init=init, random_state=0)
assert not ((W < 0).any() or (H < 0).any())
def test_parameter_checking():
A = np.ones((2, 2))
name = 'spam'
msg = "Invalid solver parameter: got 'spam' instead of one of"
assert_raise_message(ValueError, msg, NMF(solver=name).fit, A)
msg = "Invalid init parameter: got 'spam' instead of one of"
assert_raise_message(ValueError, msg, NMF(init=name).fit, A)
msg = "Invalid beta_loss parameter: got 'spam' instead of one"
assert_raise_message(ValueError, msg, NMF(solver='mu',
beta_loss=name).fit, A)
msg = "Invalid beta_loss parameter: solver 'cd' does not handle "
msg += "beta_loss = 1.0"
assert_raise_message(ValueError, msg, NMF(solver='cd',
beta_loss=1.0).fit, A)
msg = "Negative values in data passed to"
assert_raise_message(ValueError, msg, NMF().fit, -A)
assert_raise_message(ValueError, msg, nmf._initialize_nmf, -A,
2, 'nndsvd')
clf = NMF(2, tol=0.1).fit(A)
assert_raise_message(ValueError, msg, clf.transform, -A)
for init in ['nndsvd', 'nndsvda', 'nndsvdar']:
msg = ("init = '{}' can only be used when "
"n_components <= min(n_samples, n_features)"
.format(init))
assert_raise_message(ValueError, msg, NMF(3, init=init).fit, A)
assert_raise_message(ValueError, msg, nmf._initialize_nmf, A,
3, init)
def test_initialize_close():
# Test NNDSVD error
# Test that _initialize_nmf error is less than the standard deviation of
# the entries in the matrix.
rng = np.random.mtrand.RandomState(42)
A = np.abs(rng.randn(10, 10))
W, H = nmf._initialize_nmf(A, 10, init='nndsvd')
error = linalg.norm(np.dot(W, H) - A)
sdev = linalg.norm(A - A.mean())
assert error <= sdev
def test_initialize_variants():
# Test NNDSVD variants correctness
# Test that the variants 'nndsvda' and 'nndsvdar' differ from basic
# 'nndsvd' only where the basic version has zeros.
rng = np.random.mtrand.RandomState(42)
data = np.abs(rng.randn(10, 10))
W0, H0 = nmf._initialize_nmf(data, 10, init='nndsvd')
Wa, Ha = nmf._initialize_nmf(data, 10, init='nndsvda')
War, Har = nmf._initialize_nmf(data, 10, init='nndsvdar',
random_state=0)
for ref, evl in ((W0, Wa), (W0, War), (H0, Ha), (H0, Har)):
assert_almost_equal(evl[ref != 0], ref[ref != 0])
# ignore UserWarning raised when both solver='mu' and init='nndsvd'
@ignore_warnings(category=UserWarning)
def test_nmf_fit_nn_output():
# Test that the decomposition does not contain negative values
A = np.c_[5. - np.arange(1, 6),
5. + np.arange(1, 6)]
for solver in ('cd', 'mu'):
for init in (None, 'nndsvd', 'nndsvda', 'nndsvdar', 'random'):
model = NMF(n_components=2, solver=solver, init=init,
random_state=0)
transf = model.fit_transform(A)
assert not((model.components_ < 0).any() or
(transf < 0).any())
@pytest.mark.parametrize('solver', ('cd', 'mu'))
def test_nmf_fit_close(solver):
rng = np.random.mtrand.RandomState(42)
# Test that the reconstruction error of the fit is small
pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0,
max_iter=600)
X = np.abs(rng.randn(6, 5))
assert pnmf.fit(X).reconstruction_err_ < 0.1
@pytest.mark.parametrize('solver', ('cd', 'mu'))
def test_nmf_transform(solver):
# Test that NMF.transform returns close values
rng = np.random.mtrand.RandomState(42)
A = np.abs(rng.randn(6, 5))
m = NMF(solver=solver, n_components=3, init='random',
random_state=0, tol=1e-5)
ft = m.fit_transform(A)
t = m.transform(A)
assert_array_almost_equal(ft, t, decimal=2)
def test_nmf_transform_custom_init():
# Smoke test that checks if NMF.transform works with custom initialization
random_state = np.random.RandomState(0)
A = np.abs(random_state.randn(6, 5))
n_components = 4
avg = np.sqrt(A.mean() / n_components)
H_init = np.abs(avg * random_state.randn(n_components, 5))
W_init = np.abs(avg * random_state.randn(6, n_components))
m = NMF(solver='cd', n_components=n_components, init='custom',
random_state=0)
m.fit_transform(A, W=W_init, H=H_init)
m.transform(A)
@pytest.mark.parametrize('solver', ('cd', 'mu'))
def test_nmf_inverse_transform(solver):
# Test that NMF.inverse_transform returns close values
random_state = np.random.RandomState(0)
A = np.abs(random_state.randn(6, 4))
m = NMF(solver=solver, n_components=4, init='random', random_state=0,
max_iter=1000)
ft = m.fit_transform(A)
A_new = m.inverse_transform(ft)
assert_array_almost_equal(A, A_new, decimal=2)
def test_n_components_greater_n_features():
# Smoke test for the case of more components than features.
rng = np.random.mtrand.RandomState(42)
A = np.abs(rng.randn(30, 10))
NMF(n_components=15, random_state=0, tol=1e-2).fit(A)
def test_nmf_sparse_input():
# Test that sparse matrices are accepted as input
from scipy.sparse import csc_matrix
rng = np.random.mtrand.RandomState(42)
A = np.abs(rng.randn(10, 10))
A[:, 2 * np.arange(5)] = 0
A_sparse = csc_matrix(A)
for solver in ('cd', 'mu'):
est1 = NMF(solver=solver, n_components=5, init='random',
random_state=0, tol=1e-2)
est2 = clone(est1)
W1 = est1.fit_transform(A)
W2 = est2.fit_transform(A_sparse)
H1 = est1.components_
H2 = est2.components_
assert_array_almost_equal(W1, W2)
assert_array_almost_equal(H1, H2)
def test_nmf_sparse_transform():
# Test that transform works on sparse data. Issue #2124
rng = np.random.mtrand.RandomState(42)
A = np.abs(rng.randn(3, 2))
A[1, 1] = 0
A = csc_matrix(A)
for solver in ('cd', 'mu'):
model = NMF(solver=solver, random_state=0, n_components=2,
max_iter=400)
A_fit_tr = model.fit_transform(A)
A_tr = model.transform(A)
assert_array_almost_equal(A_fit_tr, A_tr, decimal=1)
def test_non_negative_factorization_consistency():
# Test that the function gives the same result whether it is called
# directly or through the NMF class
rng = np.random.mtrand.RandomState(42)
A = np.abs(rng.randn(10, 10))
A[:, 2 * np.arange(5)] = 0
for init in ['random', 'nndsvd']:
for solver in ('cd', 'mu'):
W_nmf, H, _ = non_negative_factorization(
A, init=init, solver=solver, random_state=1, tol=1e-2)
W_nmf_2, _, _ = non_negative_factorization(
A, H=H, update_H=False, init=init, solver=solver,
random_state=1, tol=1e-2)
model_class = NMF(init=init, solver=solver, random_state=1,
tol=1e-2)
W_cls = model_class.fit_transform(A)
W_cls_2 = model_class.transform(A)
assert_array_almost_equal(W_nmf, W_cls, decimal=10)
assert_array_almost_equal(W_nmf_2, W_cls_2, decimal=10)
def test_non_negative_factorization_checking():
A = np.ones((2, 2))
# Test parameter checking in the public function
nnmf = non_negative_factorization
msg = ("Number of components must be a positive integer; "
"got (n_components=1.5)")
assert_raise_message(ValueError, msg, nnmf, A, A, A, 1.5, init='random')
msg = ("Number of components must be a positive integer; "
"got (n_components='2')")
assert_raise_message(ValueError, msg, nnmf, A, A, A, '2', init='random')
msg = "Negative values in data passed to NMF (input H)"
assert_raise_message(ValueError, msg, nnmf, A, A, -A, 2, init='custom')
msg = "Negative values in data passed to NMF (input W)"
assert_raise_message(ValueError, msg, nnmf, A, -A, A, 2, init='custom')
msg = "Array passed to NMF (input H) is full of zeros"
assert_raise_message(ValueError, msg, nnmf, A, A, 0 * A, 2, init='custom')
msg = "Invalid regularization parameter: got 'spam' instead of one of"
assert_raise_message(ValueError, msg, nnmf, A, A, 0 * A, 2, init='custom',
regularization='spam')
def _beta_divergence_dense(X, W, H, beta):
"""Compute the beta-divergence of X and W.H for dense array only.
Used as a reference for testing nmf._beta_divergence.
"""
WH = np.dot(W, H)
if beta == 2:
return squared_norm(X - WH) / 2
WH_Xnonzero = WH[X != 0]
X_nonzero = X[X != 0]
np.maximum(WH_Xnonzero, 1e-9, out=WH_Xnonzero)
if beta == 1:
res = np.sum(X_nonzero * np.log(X_nonzero / WH_Xnonzero))
res += WH.sum() - X.sum()
elif beta == 0:
div = X_nonzero / WH_Xnonzero
res = np.sum(div) - X.size - np.sum(np.log(div))
else:
res = (X_nonzero ** beta).sum()
res += (beta - 1) * (WH ** beta).sum()
res -= beta * (X_nonzero * (WH_Xnonzero ** (beta - 1))).sum()
res /= beta * (beta - 1)
return res
def test_beta_divergence():
# Compare _beta_divergence with the reference _beta_divergence_dense
n_samples = 20
n_features = 10
n_components = 5
beta_losses = [0., 0.5, 1., 1.5, 2.]
# initialization
rng = np.random.mtrand.RandomState(42)
X = rng.randn(n_samples, n_features)
np.clip(X, 0, None, out=X)
X_csr = sp.csr_matrix(X)
W, H = nmf._initialize_nmf(X, n_components, init='random', random_state=42)
for beta in beta_losses:
ref = _beta_divergence_dense(X, W, H, beta)
loss = nmf._beta_divergence(X, W, H, beta)
loss_csr = nmf._beta_divergence(X_csr, W, H, beta)
assert_almost_equal(ref, loss, decimal=7)
assert_almost_equal(ref, loss_csr, decimal=7)
def test_special_sparse_dot():
    # Test the function that computes np.dot(W, H) only where X is non-zero.
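    # For sparse X with a non-Frobenius beta_loss, the multiplicative updates
    # only need W @ H at the nonzero positions of X, so the full dense product
    # is never materialized.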
n_samples = 10
n_features = 5
n_components = 3
rng = np.random.mtrand.RandomState(42)
X = rng.randn(n_samples, n_features)
np.clip(X, 0, None, out=X)
X_csr = sp.csr_matrix(X)
W = np.abs(rng.randn(n_samples, n_components))
H = np.abs(rng.randn(n_components, n_features))
WH_safe = nmf._special_sparse_dot(W, H, X_csr)
WH = nmf._special_sparse_dot(W, H, X)
    # test that both results have the same values at the nonzero elements of X_csr
ii, jj = X_csr.nonzero()
WH_safe_data = np.asarray(WH_safe[ii, jj]).ravel()
assert_array_almost_equal(WH_safe_data, WH[ii, jj], decimal=10)
# test that WH_safe and X_csr have the same sparse structure
assert_array_equal(WH_safe.indices, X_csr.indices)
assert_array_equal(WH_safe.indptr, X_csr.indptr)
assert_array_equal(WH_safe.shape, X_csr.shape)
@ignore_warnings(category=ConvergenceWarning)
def test_nmf_multiplicative_update_sparse():
# Compare sparse and dense input in multiplicative update NMF
# Also test continuity of the results with respect to beta_loss parameter
n_samples = 20
n_features = 10
n_components = 5
alpha = 0.1
l1_ratio = 0.5
n_iter = 20
# initialization
rng = np.random.mtrand.RandomState(1337)
X = rng.randn(n_samples, n_features)
X = np.abs(X)
X_csr = sp.csr_matrix(X)
W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
random_state=42)
for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
# Reference with dense array X
W, H = W0.copy(), H0.copy()
W1, H1, _ = non_negative_factorization(
X, W, H, n_components, init='custom', update_H=True,
solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha,
l1_ratio=l1_ratio, regularization='both', random_state=42)
# Compare with sparse X
W, H = W0.copy(), H0.copy()
W2, H2, _ = non_negative_factorization(
X_csr, W, H, n_components, init='custom', update_H=True,
solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha,
l1_ratio=l1_ratio, regularization='both', random_state=42)
assert_array_almost_equal(W1, W2, decimal=7)
assert_array_almost_equal(H1, H2, decimal=7)
        # Compare with an almost identical beta_loss: some specific values follow
        # dedicated code paths, but the results should be continuous w.r.t. beta_loss
beta_loss -= 1.e-5
W, H = W0.copy(), H0.copy()
W3, H3, _ = non_negative_factorization(
X_csr, W, H, n_components, init='custom', update_H=True,
solver='mu', beta_loss=beta_loss, max_iter=n_iter, alpha=alpha,
l1_ratio=l1_ratio, regularization='both', random_state=42)
assert_array_almost_equal(W1, W3, decimal=4)
assert_array_almost_equal(H1, H3, decimal=4)
def test_nmf_negative_beta_loss():
    # Test that an error is raised if beta_loss <= 0 and X contains zeros.
    # Test that the output has no NaN values when the input contains zeros.
n_samples = 6
n_features = 5
n_components = 3
rng = np.random.mtrand.RandomState(42)
X = rng.randn(n_samples, n_features)
np.clip(X, 0, None, out=X)
X_csr = sp.csr_matrix(X)
def _assert_nmf_no_nan(X, beta_loss):
W, H, _ = non_negative_factorization(
X, init='random', n_components=n_components, solver='mu',
beta_loss=beta_loss, random_state=0, max_iter=1000)
assert not np.any(np.isnan(W))
assert not np.any(np.isnan(H))
msg = "When beta_loss <= 0 and X contains zeros, the solver may diverge."
for beta_loss in (-0.6, 0.):
assert_raise_message(ValueError, msg, _assert_nmf_no_nan, X, beta_loss)
_assert_nmf_no_nan(X + 1e-9, beta_loss)
for beta_loss in (0.2, 1., 1.2, 2., 2.5):
_assert_nmf_no_nan(X, beta_loss)
_assert_nmf_no_nan(X_csr, beta_loss)
def test_nmf_regularization():
# Test the effect of L1 and L2 regularizations
n_samples = 6
n_features = 5
n_components = 3
rng = np.random.mtrand.RandomState(42)
X = np.abs(rng.randn(n_samples, n_features))
# L1 regularization should increase the number of zeros
l1_ratio = 1.
for solver in ['cd', 'mu']:
regul = nmf.NMF(n_components=n_components, solver=solver,
alpha=0.5, l1_ratio=l1_ratio, random_state=42)
model = nmf.NMF(n_components=n_components, solver=solver,
alpha=0., l1_ratio=l1_ratio, random_state=42)
W_regul = regul.fit_transform(X)
W_model = model.fit_transform(X)
H_regul = regul.components_
H_model = model.components_
W_regul_n_zeros = W_regul[W_regul == 0].size
W_model_n_zeros = W_model[W_model == 0].size
H_regul_n_zeros = H_regul[H_regul == 0].size
H_model_n_zeros = H_model[H_model == 0].size
assert W_regul_n_zeros > W_model_n_zeros
assert H_regul_n_zeros > H_model_n_zeros
# L2 regularization should decrease the mean of the coefficients
l1_ratio = 0.
for solver in ['cd', 'mu']:
regul = nmf.NMF(n_components=n_components, solver=solver,
alpha=0.5, l1_ratio=l1_ratio, random_state=42)
model = nmf.NMF(n_components=n_components, solver=solver,
alpha=0., l1_ratio=l1_ratio, random_state=42)
W_regul = regul.fit_transform(X)
W_model = model.fit_transform(X)
H_regul = regul.components_
H_model = model.components_
assert W_model.mean() > W_regul.mean()
assert H_model.mean() > H_regul.mean()
@ignore_warnings(category=ConvergenceWarning)
def test_nmf_decreasing():
# test that the objective function is decreasing at each iteration
n_samples = 20
n_features = 15
n_components = 10
alpha = 0.1
l1_ratio = 0.5
tol = 0.
# initialization
rng = np.random.mtrand.RandomState(42)
X = rng.randn(n_samples, n_features)
np.abs(X, X)
W0, H0 = nmf._initialize_nmf(X, n_components, init='random',
random_state=42)
for beta_loss in (-1.2, 0, 0.2, 1., 2., 2.5):
for solver in ('cd', 'mu'):
if solver != 'mu' and beta_loss != 2:
# not implemented
continue
W, H = W0.copy(), H0.copy()
previous_loss = None
for _ in range(30):
# one more iteration starting from the previous results
W, H, _ = non_negative_factorization(
X, W, H, beta_loss=beta_loss, init='custom',
n_components=n_components, max_iter=1, alpha=alpha,
solver=solver, tol=tol, l1_ratio=l1_ratio, verbose=0,
regularization='both', random_state=0, update_H=True)
loss = nmf._beta_divergence(X, W, H, beta_loss)
if previous_loss is not None:
assert previous_loss > loss
previous_loss = loss
def test_nmf_underflow():
# Regression test for an underflow issue in _beta_divergence
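    # With beta=1 (KL divergence), a denormal entry such as 1e-323 should
    # contribute essentially nothing, i.e. the divergence should match the one
    # computed with an exact zero.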
rng = np.random.RandomState(0)
n_samples, n_features, n_components = 10, 2, 2
X = np.abs(rng.randn(n_samples, n_features)) * 10
W = np.abs(rng.randn(n_samples, n_components)) * 10
H = np.abs(rng.randn(n_components, n_features))
X[0, 0] = 0
ref = nmf._beta_divergence(X, W, H, beta=1.0)
X[0, 0] = 1e-323
res = nmf._beta_divergence(X, W, H, beta=1.0)
assert_almost_equal(res, ref)
@pytest.mark.parametrize("dtype_in, dtype_out", [
(np.float32, np.float32),
(np.float64, np.float64),
(np.int32, np.float64),
(np.int64, np.float64)])
@pytest.mark.parametrize("solver", ["cd", "mu"])
def test_nmf_dtype_match(dtype_in, dtype_out, solver):
# Check that NMF preserves dtype (float32 and float64)
X = np.random.RandomState(0).randn(20, 15).astype(dtype_in, copy=False)
np.abs(X, out=X)
nmf = NMF(solver=solver)
assert nmf.fit(X).transform(X).dtype == dtype_out
assert nmf.fit_transform(X).dtype == dtype_out
assert nmf.components_.dtype == dtype_out
@pytest.mark.parametrize("solver", ["cd", "mu"])
def test_nmf_float32_float64_consistency(solver):
# Check that the result of NMF is the same between float32 and float64
X = np.random.RandomState(0).randn(50, 7)
np.abs(X, out=X)
nmf32 = NMF(solver=solver, random_state=0)
W32 = nmf32.fit_transform(X.astype(np.float32))
nmf64 = NMF(solver=solver, random_state=0)
W64 = nmf64.fit_transform(X)
assert_allclose(W32, W64, rtol=1e-6, atol=1e-5)
def test_nmf_custom_init_dtype_error():
    # Check that an error is raised if custom H and/or W don't have the same
    # dtype as X.
rng = np.random.RandomState(0)
X = rng.random_sample((20, 15))
H = rng.random_sample((15, 15)).astype(np.float32)
W = rng.random_sample((20, 15))
with pytest.raises(TypeError, match="should have the same dtype as X"):
NMF(init='custom').fit(X, H=H, W=W)
with pytest.raises(TypeError, match="should have the same dtype as X"):
non_negative_factorization(X, H=H, update_H=False)

View file

@ -0,0 +1,401 @@
import sys
import numpy as np
from scipy.linalg import block_diag
from scipy.sparse import csr_matrix
from scipy.special import psi
import pytest
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition._lda import (_dirichlet_expectation_1d,
_dirichlet_expectation_2d)
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import if_safe_multiprocessing_with_blas
from sklearn.exceptions import NotFittedError
from io import StringIO
def _build_sparse_mtx():
# Create 3 topics and each topic has 3 distinct words.
# (Each word only belongs to a single topic.)
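    # The resulting 9x9 document-word matrix is block diagonal: documents 0-2
    # only use words 0-2, documents 3-5 only use words 3-5, and so on.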
n_components = 3
    block = np.full((3, 3), n_components, dtype=int)
blocks = [block] * n_components
X = block_diag(*blocks)
X = csr_matrix(X)
return (n_components, X)
def test_lda_default_prior_params():
# default prior parameter should be `1 / topics`
# and verbose params should not affect result
n_components, X = _build_sparse_mtx()
prior = 1. / n_components
lda_1 = LatentDirichletAllocation(n_components=n_components,
doc_topic_prior=prior,
topic_word_prior=prior, random_state=0)
lda_2 = LatentDirichletAllocation(n_components=n_components,
random_state=0)
topic_distr_1 = lda_1.fit_transform(X)
topic_distr_2 = lda_2.fit_transform(X)
assert_almost_equal(topic_distr_1, topic_distr_2)
def test_lda_fit_batch():
    # Test LDA batch learning (`fit` method with 'batch' learning)
rng = np.random.RandomState(0)
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components,
evaluate_every=1, learning_method='batch',
random_state=rng)
lda.fit(X)
correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
for component in lda.components_:
# Find top 3 words in each LDA component
top_idx = set(component.argsort()[-3:][::-1])
assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_fit_online():
# Test LDA online learning (`fit` method with 'online' learning)
rng = np.random.RandomState(0)
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components,
learning_offset=10., evaluate_every=1,
learning_method='online', random_state=rng)
lda.fit(X)
correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
for component in lda.components_:
# Find top 3 words in each LDA component
top_idx = set(component.argsort()[-3:][::-1])
assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_partial_fit():
    # Test LDA online learning (`partial_fit` method)
    # (same as test_lda_fit_batch)
rng = np.random.RandomState(0)
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components,
learning_offset=10., total_samples=100,
random_state=rng)
for i in range(3):
lda.partial_fit(X)
correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
for c in lda.components_:
top_idx = set(c.argsort()[-3:][::-1])
assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_dense_input():
# Test LDA with dense input.
rng = np.random.RandomState(0)
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components,
learning_method='batch', random_state=rng)
lda.fit(X.toarray())
correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
for component in lda.components_:
# Find top 3 words in each LDA component
top_idx = set(component.argsort()[-3:][::-1])
assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_transform():
# Test LDA transform.
# Transform result cannot be negative and should be normalized
rng = np.random.RandomState(0)
X = rng.randint(5, size=(20, 10))
n_components = 3
lda = LatentDirichletAllocation(n_components=n_components,
random_state=rng)
X_trans = lda.fit_transform(X)
assert (X_trans > 0.0).any()
assert_array_almost_equal(np.sum(X_trans, axis=1),
np.ones(X_trans.shape[0]))
@pytest.mark.parametrize('method', ('online', 'batch'))
def test_lda_fit_transform(method):
# Test LDA fit_transform & transform
# fit_transform and transform result should be the same
rng = np.random.RandomState(0)
X = rng.randint(10, size=(50, 20))
lda = LatentDirichletAllocation(n_components=5, learning_method=method,
random_state=rng)
X_fit = lda.fit_transform(X)
X_trans = lda.transform(X)
assert_array_almost_equal(X_fit, X_trans, 4)
def test_lda_partial_fit_dim_mismatch():
# test `n_features` mismatch in `partial_fit`
rng = np.random.RandomState(0)
n_components = rng.randint(3, 6)
n_col = rng.randint(6, 10)
X_1 = np.random.randint(4, size=(10, n_col))
X_2 = np.random.randint(4, size=(10, n_col + 1))
lda = LatentDirichletAllocation(n_components=n_components,
learning_offset=5., total_samples=20,
random_state=rng)
lda.partial_fit(X_1)
with pytest.raises(ValueError, match=r"^The provided data has"):
lda.partial_fit(X_2)
def test_invalid_params():
# test `_check_params` method
X = np.ones((5, 10))
invalid_models = (
('n_components', LatentDirichletAllocation(n_components=0)),
('learning_method',
LatentDirichletAllocation(learning_method='unknown')),
('total_samples', LatentDirichletAllocation(total_samples=0)),
('learning_offset', LatentDirichletAllocation(learning_offset=-1)),
)
for param, model in invalid_models:
regex = r"^Invalid %r parameter" % param
with pytest.raises(ValueError, match=regex):
model.fit(X)
def test_lda_negative_input():
    # test that passing a dense matrix with negative values raises an error.
X = np.full((5, 10), -1.)
lda = LatentDirichletAllocation()
regex = r"^Negative values in data passed"
with pytest.raises(ValueError, match=regex):
lda.fit(X)
def test_lda_no_component_error():
# test `perplexity` before `fit`
rng = np.random.RandomState(0)
X = rng.randint(4, size=(20, 10))
lda = LatentDirichletAllocation()
regex = ("This LatentDirichletAllocation instance is not fitted yet. "
"Call 'fit' with appropriate arguments before using this "
"estimator.")
with pytest.raises(NotFittedError, match=regex):
lda.perplexity(X)
def test_lda_transform_mismatch():
# test `n_features` mismatch in partial_fit and transform
rng = np.random.RandomState(0)
X = rng.randint(4, size=(20, 10))
X_2 = rng.randint(4, size=(10, 8))
n_components = rng.randint(3, 6)
lda = LatentDirichletAllocation(n_components=n_components,
random_state=rng)
lda.partial_fit(X)
with pytest.raises(ValueError, match=r"^The provided data has"):
lda.partial_fit(X_2)
@if_safe_multiprocessing_with_blas
@pytest.mark.parametrize('method', ('online', 'batch'))
def test_lda_multi_jobs(method):
n_components, X = _build_sparse_mtx()
    # Test LDA training with multiple CPUs
rng = np.random.RandomState(0)
lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
learning_method=method,
evaluate_every=1, random_state=rng)
lda.fit(X)
correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
for c in lda.components_:
top_idx = set(c.argsort()[-3:][::-1])
assert tuple(sorted(top_idx)) in correct_idx_grps
@if_safe_multiprocessing_with_blas
def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multiple CPUs
rng = np.random.RandomState(0)
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
learning_offset=5., total_samples=30,
random_state=rng)
for i in range(2):
lda.partial_fit(X)
correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
for c in lda.components_:
top_idx = set(c.argsort()[-3:][::-1])
assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_perplexity_mismatch():
# test dimension mismatch in `perplexity` method
rng = np.random.RandomState(0)
n_components = rng.randint(3, 6)
n_samples = rng.randint(6, 10)
X = np.random.randint(4, size=(n_samples, 10))
lda = LatentDirichletAllocation(n_components=n_components,
learning_offset=5., total_samples=20,
random_state=rng)
lda.fit(X)
# invalid samples
invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
with pytest.raises(ValueError, match=r'Number of samples'):
lda._perplexity_precomp_distr(X, invalid_n_samples)
# invalid topic number
invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
with pytest.raises(ValueError, match=r'Number of topics'):
lda._perplexity_precomp_distr(X, invalid_n_components)
@pytest.mark.parametrize('method', ('online', 'batch'))
def test_lda_perplexity(method):
    # Test LDA perplexity (batch and online training)
    # perplexity should be lower with more iterations
n_components, X = _build_sparse_mtx()
lda_1 = LatentDirichletAllocation(n_components=n_components,
max_iter=1, learning_method=method,
total_samples=100, random_state=0)
lda_2 = LatentDirichletAllocation(n_components=n_components,
max_iter=10, learning_method=method,
total_samples=100, random_state=0)
lda_1.fit(X)
perp_1 = lda_1.perplexity(X, sub_sampling=False)
lda_2.fit(X)
perp_2 = lda_2.perplexity(X, sub_sampling=False)
assert perp_1 >= perp_2
perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
assert perp_1_subsampling >= perp_2_subsampling
@pytest.mark.parametrize('method', ('online', 'batch'))
def test_lda_score(method):
    # Test LDA score (batch and online training)
    # score should be higher with more iterations
n_components, X = _build_sparse_mtx()
lda_1 = LatentDirichletAllocation(n_components=n_components,
max_iter=1, learning_method=method,
total_samples=100, random_state=0)
lda_2 = LatentDirichletAllocation(n_components=n_components,
max_iter=10, learning_method=method,
total_samples=100, random_state=0)
lda_1.fit_transform(X)
score_1 = lda_1.score(X)
lda_2.fit_transform(X)
score_2 = lda_2.score(X)
assert score_2 >= score_1
def test_perplexity_input_format():
# Test LDA perplexity for sparse and dense input
    # perplexity should be the same for both dense and sparse input
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
learning_method='batch',
total_samples=100, random_state=0)
lda.fit(X)
perp_1 = lda.perplexity(X)
perp_2 = lda.perplexity(X.toarray())
assert_almost_equal(perp_1, perp_2)
def test_lda_score_perplexity():
# Test the relationship between LDA score and perplexity
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
random_state=0)
lda.fit(X)
perplexity_1 = lda.perplexity(X, sub_sampling=False)
score = lda.score(X)
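    # perplexity is defined as exp(-score / total number of word occurrences),
    # where score is the (approximate) log-likelihood bound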
perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
assert_almost_equal(perplexity_1, perplexity_2)
def test_lda_fit_perplexity():
# Test that the perplexity computed during fit is consistent with what is
# returned by the perplexity method
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
learning_method='batch', random_state=0,
evaluate_every=1)
lda.fit(X)
# Perplexity computed at end of fit method
perplexity1 = lda.bound_
# Result of perplexity method on the train set
perplexity2 = lda.perplexity(X)
assert_almost_equal(perplexity1, perplexity2)
def test_lda_empty_docs():
"""Test LDA on empty document (all-zero rows)."""
Z = np.zeros((5, 4))
for X in [Z, csr_matrix(Z)]:
lda = LatentDirichletAllocation(max_iter=750).fit(X)
assert_almost_equal(lda.components_.sum(axis=0),
np.ones(lda.components_.shape[1]))
def test_dirichlet_expectation():
"""Test Cython version of Dirichlet expectation calculation."""
x = np.logspace(-100, 10, 10000)
expectation = np.empty_like(x)
_dirichlet_expectation_1d(x, 0, expectation)
assert_allclose(expectation, np.exp(psi(x) - psi(np.sum(x))),
atol=1e-19)
x = x.reshape(100, 100)
assert_allclose(_dirichlet_expectation_2d(x),
psi(x) - psi(np.sum(x, axis=1)[:, np.newaxis]),
rtol=1e-11, atol=3e-9)
def check_verbosity(verbose, evaluate_every, expected_lines,
expected_perplexities):
n_components, X = _build_sparse_mtx()
lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
learning_method='batch',
verbose=verbose,
evaluate_every=evaluate_every,
random_state=0)
out = StringIO()
old_out, sys.stdout = sys.stdout, out
try:
lda.fit(X)
finally:
sys.stdout = old_out
n_lines = out.getvalue().count('\n')
n_perplexity = out.getvalue().count('perplexity')
assert expected_lines == n_lines
assert expected_perplexities == n_perplexity
@pytest.mark.parametrize(
'verbose,evaluate_every,expected_lines,expected_perplexities',
[(False, 1, 0, 0),
(False, 0, 0, 0),
(True, 0, 3, 0),
(True, 1, 3, 3),
(True, 2, 3, 1)])
def test_verbosity(verbose, evaluate_every, expected_lines,
expected_perplexities):
check_verbosity(verbose, evaluate_every, expected_lines,
expected_perplexities)

View file

@ -0,0 +1,640 @@
import numpy as np
import scipy as sp
import pytest
from sklearn.utils._testing import assert_allclose
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.decomposition._pca import _assess_dimension
from sklearn.decomposition._pca import _infer_dimension
iris = datasets.load_iris()
PCA_SOLVERS = ['full', 'arpack', 'randomized', 'auto']
@pytest.mark.parametrize('svd_solver', PCA_SOLVERS)
@pytest.mark.parametrize('n_components', range(1, iris.data.shape[1]))
def test_pca(svd_solver, n_components):
X = iris.data
pca = PCA(n_components=n_components, svd_solver=svd_solver)
# check the shape of fit.transform
X_r = pca.fit(X).transform(X)
assert X_r.shape[1] == n_components
# check the equivalence of fit.transform and fit_transform
X_r2 = pca.fit_transform(X)
assert_allclose(X_r, X_r2)
X_r = pca.transform(X)
assert_allclose(X_r, X_r2)
# Test get_covariance and get_precision
cov = pca.get_covariance()
precision = pca.get_precision()
assert_allclose(np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-12)
def test_no_empty_slice_warning():
# test if we avoid numpy warnings for computing over empty arrays
n_components = 10
n_features = n_components + 2 # anything > n_comps triggered it in 0.16
X = np.random.uniform(-1, 1, size=(n_components, n_features))
pca = PCA(n_components=n_components)
with pytest.warns(None) as record:
pca.fit(X)
assert not record.list
@pytest.mark.parametrize('copy', [True, False])
@pytest.mark.parametrize('solver', PCA_SOLVERS)
def test_whitening(solver, copy):
# Check that PCA output has unit-variance
rng = np.random.RandomState(0)
n_samples = 100
n_features = 80
n_components = 30
rank = 50
# some low rank data with correlated features
X = np.dot(rng.randn(n_samples, rank),
np.dot(np.diag(np.linspace(10.0, 1.0, rank)),
rng.randn(rank, n_features)))
    # the component-wise standard deviation of the first 50 features is 3 times
    # larger than for the remaining 30 features
X[:, :50] *= 3
assert X.shape == (n_samples, n_features)
# the component-wise variance is thus highly varying:
assert X.std(axis=0).std() > 43.8
# whiten the data while projecting to the lower dim subspace
X_ = X.copy() # make sure we keep an original across iterations.
pca = PCA(n_components=n_components, whiten=True, copy=copy,
svd_solver=solver, random_state=0, iterated_power=7)
# test fit_transform
X_whitened = pca.fit_transform(X_.copy())
assert X_whitened.shape == (n_samples, n_components)
X_whitened2 = pca.transform(X_)
assert_allclose(X_whitened, X_whitened2, rtol=5e-4)
assert_allclose(X_whitened.std(ddof=1, axis=0), np.ones(n_components))
assert_allclose(
X_whitened.mean(axis=0), np.zeros(n_components), atol=1e-12
)
X_ = X.copy()
pca = PCA(n_components=n_components, whiten=False, copy=copy,
svd_solver=solver).fit(X_)
X_unwhitened = pca.transform(X_)
assert X_unwhitened.shape == (n_samples, n_components)
# in that case the output components still have varying variances
assert X_unwhitened.std(axis=0).std() == pytest.approx(74.1, rel=1e-1)
# we always center, so no test for non-centering.
@pytest.mark.parametrize('svd_solver', ['arpack', 'randomized'])
def test_pca_explained_variance_equivalence_solver(svd_solver):
rng = np.random.RandomState(0)
n_samples, n_features = 100, 80
X = rng.randn(n_samples, n_features)
pca_full = PCA(n_components=2, svd_solver='full')
pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
pca_full.fit(X)
pca_other.fit(X)
assert_allclose(
pca_full.explained_variance_,
pca_other.explained_variance_,
rtol=5e-2
)
assert_allclose(
pca_full.explained_variance_ratio_,
pca_other.explained_variance_ratio_,
rtol=5e-2
)
@pytest.mark.parametrize(
'X',
[np.random.RandomState(0).randn(100, 80),
datasets.make_classification(100, 80, n_informative=78,
random_state=0)[0]],
ids=['random-data', 'correlated-data']
)
@pytest.mark.parametrize('svd_solver', PCA_SOLVERS)
def test_pca_explained_variance_empirical(X, svd_solver):
pca = PCA(n_components=2, svd_solver=svd_solver, random_state=0)
X_pca = pca.fit_transform(X)
assert_allclose(pca.explained_variance_, np.var(X_pca, ddof=1, axis=0))
expected_result = np.linalg.eig(np.cov(X, rowvar=False))[0]
expected_result = sorted(expected_result, reverse=True)[:2]
assert_allclose(pca.explained_variance_, expected_result, rtol=5e-3)
@pytest.mark.parametrize("svd_solver", ['arpack', 'randomized'])
def test_pca_singular_values_consistency(svd_solver):
rng = np.random.RandomState(0)
n_samples, n_features = 100, 80
X = rng.randn(n_samples, n_features)
pca_full = PCA(n_components=2, svd_solver='full', random_state=rng)
pca_other = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
pca_full.fit(X)
pca_other.fit(X)
assert_allclose(
pca_full.singular_values_, pca_other.singular_values_, rtol=5e-3
)
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_singular_values(svd_solver):
rng = np.random.RandomState(0)
n_samples, n_features = 100, 80
X = rng.randn(n_samples, n_features)
pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
X_trans = pca.fit_transform(X)
# compare to the Frobenius norm
assert_allclose(
np.sum(pca.singular_values_ ** 2), np.linalg.norm(X_trans, "fro") ** 2
)
# Compare to the 2-norms of the score vectors
assert_allclose(
pca.singular_values_, np.sqrt(np.sum(X_trans ** 2, axis=0))
)
    # set the singular values and see what we get back
n_samples, n_features = 100, 110
X = rng.randn(n_samples, n_features)
pca = PCA(n_components=3, svd_solver=svd_solver, random_state=rng)
X_trans = pca.fit_transform(X)
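    # normalize the score vectors to unit norm, then rescale them: the matrix
    # rebuilt from these scores has exactly the chosen singular values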
X_trans /= np.sqrt(np.sum(X_trans ** 2, axis=0))
X_trans[:, 0] *= 3.142
X_trans[:, 1] *= 2.718
X_hat = np.dot(X_trans, pca.components_)
pca.fit(X_hat)
assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0])
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_check_projection(svd_solver):
# Test that the projection of data is correct
rng = np.random.RandomState(0)
n, p = 100, 3
X = rng.randn(n, p) * .1
X[:10] += np.array([3, 4, 5])
Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])
Yt = PCA(n_components=2, svd_solver=svd_solver).fit(X).transform(Xt)
Yt /= np.sqrt((Yt ** 2).sum())
assert_allclose(np.abs(Yt[0][0]), 1., rtol=5e-3)
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_check_projection_list(svd_solver):
# Test that the projection of data is correct
X = [[1.0, 0.0], [0.0, 1.0]]
pca = PCA(n_components=1, svd_solver=svd_solver, random_state=0)
X_trans = pca.fit_transform(X)
    assert X_trans.shape == (2, 1)
assert_allclose(X_trans.mean(), 0.00, atol=1e-12)
assert_allclose(X_trans.std(), 0.71, rtol=5e-3)
@pytest.mark.parametrize("svd_solver", ['full', 'arpack', 'randomized'])
@pytest.mark.parametrize("whiten", [False, True])
def test_pca_inverse(svd_solver, whiten):
# Test that the projection of data can be inverted
rng = np.random.RandomState(0)
n, p = 50, 3
X = rng.randn(n, p) # spherical data
X[:, 1] *= .00001 # make middle component relatively small
X += [5, 4, 3] # make a large mean
# same check that we can find the original data from the transformed
# signal (since the data is almost of rank n_components)
pca = PCA(n_components=2, svd_solver=svd_solver, whiten=whiten).fit(X)
Y = pca.transform(X)
Y_inverse = pca.inverse_transform(Y)
assert_allclose(X, Y_inverse, rtol=5e-6)
@pytest.mark.parametrize(
'data',
[np.array([[0, 1, 0], [1, 0, 0]]), np.array([[0, 1, 0], [1, 0, 0]]).T]
)
@pytest.mark.parametrize(
"svd_solver, n_components, err_msg",
[('arpack', 0, r'must be between 1 and min\(n_samples, n_features\)'),
('randomized', 0, r'must be between 1 and min\(n_samples, n_features\)'),
('arpack', 2, r'must be strictly less than min'),
('auto', -1, (r"n_components={}L? must be between {}L? and "
r"min\(n_samples, n_features\)={}L? with "
r"svd_solver=\'{}\'")),
('auto', 3, (r"n_components={}L? must be between {}L? and "
r"min\(n_samples, n_features\)={}L? with "
r"svd_solver=\'{}\'")),
('auto', 1.0, "must be of type int")]
)
def test_pca_validation(svd_solver, data, n_components, err_msg):
# Ensures that solver-specific extreme inputs for the n_components
# parameter raise errors
smallest_d = 2 # The smallest dimension
lower_limit = {'randomized': 1, 'arpack': 1, 'full': 0, 'auto': 0}
pca_fitted = PCA(n_components, svd_solver=svd_solver)
solver_reported = 'full' if svd_solver == 'auto' else svd_solver
err_msg = err_msg.format(
n_components, lower_limit[svd_solver], smallest_d, solver_reported
)
with pytest.raises(ValueError, match=err_msg):
pca_fitted.fit(data)
# Additional case for arpack
if svd_solver == 'arpack':
n_components = smallest_d
err_msg = ("n_components={}L? must be strictly less than "
r"min\(n_samples, n_features\)={}L? with "
"svd_solver=\'arpack\'".format(n_components, smallest_d))
with pytest.raises(ValueError, match=err_msg):
PCA(n_components, svd_solver=svd_solver).fit(data)
@pytest.mark.parametrize(
'solver, n_components_',
[('full', min(iris.data.shape)),
('arpack', min(iris.data.shape) - 1),
('randomized', min(iris.data.shape))]
)
@pytest.mark.parametrize("data", [iris.data, iris.data.T])
def test_n_components_none(data, solver, n_components_):
pca = PCA(svd_solver=solver)
pca.fit(data)
assert pca.n_components_ == n_components_
@pytest.mark.parametrize("svd_solver", ['auto', 'full'])
def test_n_components_mle(svd_solver):
# Ensure that n_components == 'mle' doesn't raise error for auto/full
rng = np.random.RandomState(0)
n_samples, n_features = 600, 10
X = rng.randn(n_samples, n_features)
pca = PCA(n_components='mle', svd_solver=svd_solver)
pca.fit(X)
assert pca.n_components_ == 1
@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
def test_n_components_mle_error(svd_solver):
# Ensure that n_components == 'mle' will raise an error for unsupported
# solvers
rng = np.random.RandomState(0)
n_samples, n_features = 600, 10
X = rng.randn(n_samples, n_features)
pca = PCA(n_components='mle', svd_solver=svd_solver)
err_msg = ("n_components='mle' cannot be a string with svd_solver='{}'"
.format(svd_solver))
with pytest.raises(ValueError, match=err_msg):
pca.fit(X)
def test_pca_dim():
# Check automated dimensionality setting
rng = np.random.RandomState(0)
n, p = 100, 5
X = rng.randn(n, p) * .1
X[:10] += np.array([3, 4, 5, 1, 2])
pca = PCA(n_components='mle', svd_solver='full').fit(X)
assert pca.n_components == 'mle'
assert pca.n_components_ == 1
def test_infer_dim_1():
# TODO: explain what this is testing
# Or at least use explicit variable names...
n, p = 1000, 5
rng = np.random.RandomState(0)
X = (rng.randn(n, p) * .1 + rng.randn(n, 1) * np.array([3, 4, 5, 1, 2]) +
np.array([1, 0, 7, 4, 6]))
pca = PCA(n_components=p, svd_solver='full')
pca.fit(X)
spect = pca.explained_variance_
ll = np.array([_assess_dimension(spect, k, n) for k in range(1, p)])
assert ll[1] > ll.max() - .01 * n
def test_infer_dim_2():
# TODO: explain what this is testing
# Or at least use explicit variable names...
n, p = 1000, 5
rng = np.random.RandomState(0)
X = rng.randn(n, p) * .1
X[:10] += np.array([3, 4, 5, 1, 2])
X[10:20] += np.array([6, 0, 7, 2, -1])
pca = PCA(n_components=p, svd_solver='full')
pca.fit(X)
spect = pca.explained_variance_
assert _infer_dimension(spect, n) > 1
def test_infer_dim_3():
n, p = 100, 5
rng = np.random.RandomState(0)
X = rng.randn(n, p) * .1
X[:10] += np.array([3, 4, 5, 1, 2])
X[10:20] += np.array([6, 0, 7, 2, -1])
X[30:40] += 2 * np.array([-1, 1, -1, 1, -1])
pca = PCA(n_components=p, svd_solver='full')
pca.fit(X)
spect = pca.explained_variance_
assert _infer_dimension(spect, n) > 2
@pytest.mark.parametrize(
"X, n_components, n_components_validated",
[(iris.data, 0.95, 2), # row > col
(iris.data, 0.01, 1), # row > col
(np.random.RandomState(0).rand(5, 20), 0.5, 2)] # row < col
)
def test_infer_dim_by_explained_variance(X, n_components,
n_components_validated):
pca = PCA(n_components=n_components, svd_solver='full')
pca.fit(X)
assert pca.n_components == pytest.approx(n_components)
assert pca.n_components_ == n_components_validated
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_score(svd_solver):
# Test that probabilistic PCA scoring yields a reasonable score
n, p = 1000, 3
rng = np.random.RandomState(0)
X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
pca = PCA(n_components=2, svd_solver=svd_solver)
pca.fit(X)
ll1 = pca.score(X)
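    # expected average log-likelihood per sample under the true isotropic
    # Gaussian model: -p / 2 * log(2 * pi * e * sigma ** 2) with sigma = 0.1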
h = -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1 ** 2) * p
assert_allclose(ll1 / h, 1, rtol=5e-2)
ll2 = pca.score(rng.randn(n, p) * .2 + np.array([3, 4, 5]))
assert ll1 > ll2
pca = PCA(n_components=2, whiten=True, svd_solver=svd_solver)
pca.fit(X)
ll2 = pca.score(X)
assert ll1 > ll2
def test_pca_score3():
# Check that probabilistic PCA selects the right model
n, p = 200, 3
rng = np.random.RandomState(0)
Xl = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
np.array([1, 0, 7]))
Xt = (rng.randn(n, p) + rng.randn(n, 1) * np.array([3, 4, 5]) +
np.array([1, 0, 7]))
ll = np.zeros(p)
for k in range(p):
pca = PCA(n_components=k, svd_solver='full')
pca.fit(Xl)
ll[k] = pca.score(Xt)
assert ll.argmax() == 1
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_sanity_noise_variance(svd_solver):
# Sanity check for the noise_variance_. For more details see
# https://github.com/scikit-learn/scikit-learn/issues/7568
# https://github.com/scikit-learn/scikit-learn/issues/8541
# https://github.com/scikit-learn/scikit-learn/issues/8544
X, _ = datasets.load_digits(return_X_y=True)
pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
pca.fit(X)
assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
@pytest.mark.parametrize("svd_solver", ["arpack", "randomized"])
def test_pca_score_consistency_solvers(svd_solver):
# Check the consistency of score between solvers
X, _ = datasets.load_digits(return_X_y=True)
pca_full = PCA(n_components=30, svd_solver='full', random_state=0)
pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0)
pca_full.fit(X)
pca_other.fit(X)
assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)
# arpack raises ValueError for n_components == min(n_samples, n_features)
@pytest.mark.parametrize("svd_solver", ["full", "randomized"])
def test_pca_zero_noise_variance_edge_cases(svd_solver):
# ensure that noise_variance_ is 0 in edge cases
# when n_components == min(n_samples, n_features)
n, p = 100, 3
rng = np.random.RandomState(0)
X = rng.randn(n, p) * .1 + np.array([3, 4, 5])
pca = PCA(n_components=p, svd_solver=svd_solver)
pca.fit(X)
assert pca.noise_variance_ == 0
pca.fit(X.T)
assert pca.noise_variance_ == 0
@pytest.mark.parametrize(
'data, n_components, expected_solver',
[ # case: n_components in (0,1) => 'full'
(np.random.RandomState(0).uniform(size=(1000, 50)), 0.5, 'full'),
# case: max(X.shape) <= 500 => 'full'
(np.random.RandomState(0).uniform(size=(10, 50)), 5, 'full'),
# case: n_components >= .8 * min(X.shape) => 'full'
(np.random.RandomState(0).uniform(size=(1000, 50)), 50, 'full'),
# n_components >= 1 and n_components < .8*min(X.shape) => 'randomized'
(np.random.RandomState(0).uniform(size=(1000, 50)), 10, 'randomized')
]
)
def test_pca_svd_solver_auto(data, n_components, expected_solver):
pca_auto = PCA(n_components=n_components, random_state=0)
pca_test = PCA(
n_components=n_components, svd_solver=expected_solver, random_state=0
)
pca_auto.fit(data)
pca_test.fit(data)
assert_allclose(pca_auto.components_, pca_test.components_)
@pytest.mark.parametrize('svd_solver', PCA_SOLVERS)
def test_pca_sparse_input(svd_solver):
X = np.random.RandomState(0).rand(5, 4)
X = sp.sparse.csr_matrix(X)
assert sp.sparse.issparse(X)
pca = PCA(n_components=3, svd_solver=svd_solver)
with pytest.raises(TypeError):
pca.fit(X)
def test_pca_bad_solver():
X = np.random.RandomState(0).rand(5, 4)
pca = PCA(n_components=3, svd_solver='bad_argument')
with pytest.raises(ValueError):
pca.fit(X)
@pytest.mark.parametrize("svd_solver", PCA_SOLVERS)
def test_pca_deterministic_output(svd_solver):
rng = np.random.RandomState(0)
X = rng.rand(10, 10)
transformed_X = np.zeros((20, 2))
for i in range(20):
pca = PCA(n_components=2, svd_solver=svd_solver, random_state=rng)
transformed_X[i, :] = pca.fit_transform(X)[0]
assert_allclose(
transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2)
)
@pytest.mark.parametrize('svd_solver', PCA_SOLVERS)
def test_pca_dtype_preservation(svd_solver):
check_pca_float_dtype_preservation(svd_solver)
check_pca_int_dtype_upcast_to_double(svd_solver)
def check_pca_float_dtype_preservation(svd_solver):
# Ensure that PCA does not upscale the dtype when input is float32
X_64 = np.random.RandomState(0).rand(1000, 4).astype(np.float64,
copy=False)
X_32 = X_64.astype(np.float32)
pca_64 = PCA(n_components=3, svd_solver=svd_solver,
random_state=0).fit(X_64)
pca_32 = PCA(n_components=3, svd_solver=svd_solver,
random_state=0).fit(X_32)
assert pca_64.components_.dtype == np.float64
assert pca_32.components_.dtype == np.float32
assert pca_64.transform(X_64).dtype == np.float64
assert pca_32.transform(X_32).dtype == np.float32
# the rtol is set such that the test passes on all platforms tested on
# conda-forge: PR#15775
# see: https://github.com/conda-forge/scikit-learn-feedstock/pull/113
assert_allclose(pca_64.components_, pca_32.components_, rtol=2e-4)
def check_pca_int_dtype_upcast_to_double(svd_solver):
# Ensure that all int types will be upcast to float64
X_i64 = np.random.RandomState(0).randint(0, 1000, (1000, 4))
X_i64 = X_i64.astype(np.int64, copy=False)
X_i32 = X_i64.astype(np.int32, copy=False)
pca_64 = PCA(n_components=3, svd_solver=svd_solver,
random_state=0).fit(X_i64)
pca_32 = PCA(n_components=3, svd_solver=svd_solver,
random_state=0).fit(X_i32)
assert pca_64.components_.dtype == np.float64
assert pca_32.components_.dtype == np.float64
assert pca_64.transform(X_i64).dtype == np.float64
assert pca_32.transform(X_i32).dtype == np.float64
assert_allclose(pca_64.components_, pca_32.components_, rtol=1e-4)
def test_pca_n_components_mostly_explained_variance_ratio():
# when n_components is the second highest cumulative sum of the
# explained_variance_ratio_, then n_components_ should equal the
# number of features in the dataset #15669
X, y = load_iris(return_X_y=True)
pca1 = PCA().fit(X, y)
n_components = pca1.explained_variance_ratio_.cumsum()[-2]
pca2 = PCA(n_components=n_components).fit(X, y)
assert pca2.n_components_ == X.shape[1]
def test_assess_dimension_bad_rank():
# Test error when tested rank not in [1, n_features - 1]
spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
n_samples = 10
for rank in (0, 5):
with pytest.raises(ValueError,
match=r"should be in \[1, n_features - 1\]"):
_assess_dimension(spectrum, rank, n_samples)
def test_small_eigenvalues_mle():
    # Test that ranks associated with tiny eigenvalues are given a
    # log-likelihood of -inf. The inferred rank will be 1
spectrum = np.array([1, 1e-30, 1e-30, 1e-30])
assert _assess_dimension(spectrum, rank=1, n_samples=10) > -np.inf
for rank in (2, 3):
assert _assess_dimension(spectrum, rank, 10) == -np.inf
assert _infer_dimension(spectrum, 10) == 1
def test_mle_redundant_data():
# Test 'mle' with pathological X: only one relevant feature should give a
# rank of 1
X, _ = datasets.make_classification(n_features=20,
n_informative=1, n_repeated=18,
n_redundant=1, n_clusters_per_class=1,
random_state=42)
pca = PCA(n_components='mle').fit(X)
assert pca.n_components_ == 1
def test_fit_mle_too_few_samples():
# Tests that an error is raised when the number of samples is smaller
# than the number of features during an mle fit
X, _ = datasets.make_classification(n_samples=20, n_features=21,
random_state=42)
pca = PCA(n_components='mle', svd_solver='full')
with pytest.raises(ValueError, match="n_components='mle' is only "
"supported if "
"n_samples >= n_features"):
pca.fit(X)
def test_mle_simple_case():
# non-regression test for issue
# https://github.com/scikit-learn/scikit-learn/issues/16730
n_samples, n_dim = 1000, 10
X = np.random.RandomState(0).randn(n_samples, n_dim)
X[:, -1] = np.mean(X[:, :-1], axis=-1) # true X dim is ndim - 1
pca_skl = PCA('mle', svd_solver='full')
pca_skl.fit(X)
assert pca_skl.n_components_ == n_dim - 1
def test_assess_dimension_rank_one():
# Make sure assess_dimension works properly on a matrix of rank 1
n_samples, n_features = 9, 6
X = np.ones((n_samples, n_features)) # rank 1 matrix
_, s, _ = np.linalg.svd(X, full_matrices=True)
    assert sum(s[1:]) == 0  # all singular values after the first are 0
assert np.isfinite(_assess_dimension(s, rank=1, n_samples=n_samples))
for rank in range(2, n_features):
assert _assess_dimension(s, rank, n_samples) == -np.inf

View file

@ -0,0 +1,224 @@
# Author: Vlad Niculae
# License: BSD 3 clause
import sys
import pytest
import numpy as np
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_allclose
from sklearn.utils._testing import if_safe_multiprocessing_with_blas
from sklearn.decomposition import SparsePCA, MiniBatchSparsePCA, PCA
from sklearn.utils import check_random_state
def generate_toy_data(n_components, n_samples, image_size, random_state=None):
n_features = image_size[0] * image_size[1]
rng = check_random_state(random_state)
U = rng.randn(n_samples, n_components)
V = rng.randn(n_components, n_features)
centers = [(3, 3), (6, 7), (8, 1)]
sz = [1, 2, 1]
for k in range(n_components):
img = np.zeros(image_size)
xmin, xmax = centers[k][0] - sz[k], centers[k][0] + sz[k]
ymin, ymax = centers[k][1] - sz[k], centers[k][1] + sz[k]
img[xmin:xmax][:, ymin:ymax] = 1.0
V[k, :] = img.ravel()
# Y is defined by : Y = UV + noise
Y = np.dot(U, V)
Y += 0.1 * rng.randn(Y.shape[0], Y.shape[1]) # Add noise
return Y, U, V
# SparsePCA can be a bit slow. To avoid having test times go up, we
# test different aspects of the code in the same test
def test_correct_shapes():
rng = np.random.RandomState(0)
X = rng.randn(12, 10)
spca = SparsePCA(n_components=8, random_state=rng)
U = spca.fit_transform(X)
assert spca.components_.shape == (8, 10)
assert U.shape == (12, 8)
# test overcomplete decomposition
spca = SparsePCA(n_components=13, random_state=rng)
U = spca.fit_transform(X)
assert spca.components_.shape == (13, 10)
assert U.shape == (12, 13)
def test_fit_transform():
alpha = 1
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
random_state=0)
spca_lars.fit(Y)
# Test that CD gives similar results
spca_lasso = SparsePCA(n_components=3, method='cd', random_state=0,
alpha=alpha)
spca_lasso.fit(Y)
assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
@if_safe_multiprocessing_with_blas
def test_fit_transform_parallel():
alpha = 1
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
random_state=0)
spca_lars.fit(Y)
U1 = spca_lars.transform(Y)
# Test multiple CPUs
spca = SparsePCA(n_components=3, n_jobs=2, method='lars', alpha=alpha,
random_state=0).fit(Y)
U2 = spca.transform(Y)
assert not np.all(spca_lars.components_ == 0)
assert_array_almost_equal(U1, U2)
def test_transform_nan():
    # Test that SparsePCA won't return NaN when a feature is zero across all
    # samples.
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
Y[:, 0] = 0
estimator = SparsePCA(n_components=8)
assert not np.any(np.isnan(estimator.fit_transform(Y)))
def test_fit_transform_tall():
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 65, (8, 8), random_state=rng) # tall array
spca_lars = SparsePCA(n_components=3, method='lars', random_state=rng)
U1 = spca_lars.fit_transform(Y)
spca_lasso = SparsePCA(n_components=3, method='cd', random_state=rng)
U2 = spca_lasso.fit(Y).transform(Y)
assert_array_almost_equal(U1, U2)
def test_initialization():
rng = np.random.RandomState(0)
U_init = rng.randn(5, 3)
V_init = rng.randn(3, 4)
model = SparsePCA(n_components=3, U_init=U_init, V_init=V_init, max_iter=0,
random_state=rng)
model.fit(rng.randn(5, 4))
assert_allclose(model.components_,
V_init / np.linalg.norm(V_init, axis=1)[:, None])
def test_mini_batch_correct_shapes():
rng = np.random.RandomState(0)
X = rng.randn(12, 10)
pca = MiniBatchSparsePCA(n_components=8, random_state=rng)
U = pca.fit_transform(X)
assert pca.components_.shape == (8, 10)
assert U.shape == (12, 8)
# test overcomplete decomposition
pca = MiniBatchSparsePCA(n_components=13, random_state=rng)
U = pca.fit_transform(X)
assert pca.components_.shape == (13, 10)
assert U.shape == (12, 13)
# XXX: test always skipped
@pytest.mark.skipif(True, reason="skipping mini_batch_fit_transform.")
def test_mini_batch_fit_transform():
alpha = 1
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng) # wide array
spca_lars = MiniBatchSparsePCA(n_components=3, random_state=0,
alpha=alpha).fit(Y)
U1 = spca_lars.transform(Y)
# Test multiple CPUs
if sys.platform == 'win32': # fake parallelism for win32
import joblib
_mp = joblib.parallel.multiprocessing
joblib.parallel.multiprocessing = None
try:
spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
random_state=0)
U2 = spca.fit(Y).transform(Y)
finally:
joblib.parallel.multiprocessing = _mp
else: # we can efficiently use parallelism
spca = MiniBatchSparsePCA(n_components=3, n_jobs=2, alpha=alpha,
random_state=0)
U2 = spca.fit(Y).transform(Y)
assert not np.all(spca_lars.components_ == 0)
assert_array_almost_equal(U1, U2)
# Test that CD gives similar results
spca_lasso = MiniBatchSparsePCA(n_components=3, method='cd', alpha=alpha,
random_state=0).fit(Y)
assert_array_almost_equal(spca_lasso.components_, spca_lars.components_)
def test_scaling_fit_transform():
alpha = 1
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
spca_lars = SparsePCA(n_components=3, method='lars', alpha=alpha,
random_state=rng)
results_train = spca_lars.fit_transform(Y)
results_test = spca_lars.transform(Y[:10])
assert_allclose(results_train[0], results_test[0])
def test_pca_vs_spca():
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 1000, (8, 8), random_state=rng)
Z, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
spca = SparsePCA(alpha=0, ridge_alpha=0, n_components=2)
pca = PCA(n_components=2)
pca.fit(Y)
spca.fit(Y)
results_test_pca = pca.transform(Z)
results_test_spca = spca.transform(Z)
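    # with alpha=0 and ridge_alpha=0, the SparsePCA components should match the
    # PCA components up to sign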
assert_allclose(np.abs(spca.components_.dot(pca.components_.T)),
np.eye(2), atol=1e-5)
results_test_pca *= np.sign(results_test_pca[0, :])
results_test_spca *= np.sign(results_test_spca[0, :])
assert_allclose(results_test_pca, results_test_spca)
@pytest.mark.parametrize("spca", [SparsePCA, MiniBatchSparsePCA])
def test_spca_deprecation_warning(spca):
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
warn_msg = "'normalize_components' has been deprecated in 0.22"
with pytest.warns(FutureWarning, match=warn_msg):
spca(normalize_components=True).fit(Y)
@pytest.mark.parametrize("spca", [SparsePCA, MiniBatchSparsePCA])
def test_spca_error_unormalized_components(spca):
rng = np.random.RandomState(0)
Y, _, _ = generate_toy_data(3, 10, (8, 8), random_state=rng)
err_msg = "normalize_components=False is not supported starting "
with pytest.raises(NotImplementedError, match=err_msg):
spca(normalize_components=False).fit(Y)
@pytest.mark.parametrize("SPCA", [SparsePCA, MiniBatchSparsePCA])
@pytest.mark.parametrize("n_components", [None, 3])
def test_spca_n_components_(SPCA, n_components):
rng = np.random.RandomState(0)
n_samples, n_features = 12, 10
X = rng.randn(n_samples, n_features)
model = SPCA(n_components=n_components).fit(X)
if n_components is not None:
assert model.n_components_ == n_components
else:
assert model.n_components_ == n_features

View file

@ -0,0 +1,193 @@
"""Test truncated SVD transformer."""
import numpy as np
import scipy.sparse as sp
import pytest
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.utils import check_random_state
from sklearn.utils._testing import assert_array_less, assert_allclose
SVD_SOLVERS = ['arpack', 'randomized']
@pytest.fixture(scope='module')
def X_sparse():
# Make an X that looks somewhat like a small tf-idf matrix.
rng = check_random_state(42)
X = sp.random(60, 55, density=0.2, format="csr", random_state=rng)
X.data[:] = 1 + np.log(X.data)
return X
@pytest.mark.parametrize("solver", ['randomized'])
@pytest.mark.parametrize('kind', ('dense', 'sparse'))
def test_solvers(X_sparse, solver, kind):
X = X_sparse if kind == 'sparse' else X_sparse.toarray()
svd_a = TruncatedSVD(30, algorithm="arpack")
svd = TruncatedSVD(30, algorithm=solver, random_state=42)
Xa = svd_a.fit_transform(X)[:, :6]
Xr = svd.fit_transform(X)[:, :6]
assert_allclose(Xa, Xr, rtol=2e-3)
comp_a = np.abs(svd_a.components_)
comp = np.abs(svd.components_)
# All elements are equal, but some elements are more equal than others.
assert_allclose(comp_a[:9], comp[:9], rtol=1e-3)
assert_allclose(comp_a[9:], comp[9:], atol=1e-2)
@pytest.mark.parametrize("n_components", (10, 25, 41))
def test_attributes(n_components, X_sparse):
n_features = X_sparse.shape[1]
tsvd = TruncatedSVD(n_components).fit(X_sparse)
assert tsvd.n_components == n_components
assert tsvd.components_.shape == (n_components, n_features)
@pytest.mark.parametrize('algorithm', SVD_SOLVERS)
def test_too_many_components(algorithm, X_sparse):
n_features = X_sparse.shape[1]
for n_components in (n_features, n_features + 1):
tsvd = TruncatedSVD(n_components=n_components, algorithm=algorithm)
with pytest.raises(ValueError):
tsvd.fit(X_sparse)
@pytest.mark.parametrize('fmt', ("array", "csr", "csc", "coo", "lil"))
def test_sparse_formats(fmt, X_sparse):
n_samples = X_sparse.shape[0]
Xfmt = (X_sparse.toarray()
if fmt == "dense" else getattr(X_sparse, "to" + fmt)())
tsvd = TruncatedSVD(n_components=11)
Xtrans = tsvd.fit_transform(Xfmt)
assert Xtrans.shape == (n_samples, 11)
Xtrans = tsvd.transform(Xfmt)
assert Xtrans.shape == (n_samples, 11)
@pytest.mark.parametrize('algo', SVD_SOLVERS)
def test_inverse_transform(algo, X_sparse):
# We need a lot of components for the reconstruction to be "almost
# equal" in all positions. XXX Test means or sums instead?
tsvd = TruncatedSVD(n_components=52, random_state=42, algorithm=algo)
Xt = tsvd.fit_transform(X_sparse)
Xinv = tsvd.inverse_transform(Xt)
assert_allclose(Xinv, X_sparse.toarray(), rtol=1e-1, atol=2e-1)
def test_integers(X_sparse):
n_samples = X_sparse.shape[0]
Xint = X_sparse.astype(np.int64)
tsvd = TruncatedSVD(n_components=6)
Xtrans = tsvd.fit_transform(Xint)
assert Xtrans.shape == (n_samples, tsvd.n_components)
@pytest.mark.parametrize('kind', ('dense', 'sparse'))
@pytest.mark.parametrize('n_components', [10, 20])
@pytest.mark.parametrize('solver', SVD_SOLVERS)
def test_explained_variance(X_sparse, kind, n_components, solver):
X = X_sparse if kind == 'sparse' else X_sparse.toarray()
svd = TruncatedSVD(n_components, algorithm=solver)
X_tr = svd.fit_transform(X)
# Assert that all the values are greater than 0
assert_array_less(0.0, svd.explained_variance_ratio_)
# Assert that total explained variance is less than 1
assert_array_less(svd.explained_variance_ratio_.sum(), 1.0)
# Test that explained_variance is correct
total_variance = np.var(X_sparse.toarray(), axis=0).sum()
variances = np.var(X_tr, axis=0)
true_explained_variance_ratio = variances / total_variance
assert_allclose(
svd.explained_variance_ratio_,
true_explained_variance_ratio,
)
@pytest.mark.parametrize('kind', ('dense', 'sparse'))
@pytest.mark.parametrize('solver', SVD_SOLVERS)
def test_explained_variance_components_10_20(X_sparse, kind, solver):
X = X_sparse if kind == 'sparse' else X_sparse.toarray()
svd_10 = TruncatedSVD(10, algorithm=solver, n_iter=10).fit(X)
svd_20 = TruncatedSVD(20, algorithm=solver, n_iter=10).fit(X)
    # Assert that the explained variance ratios of the first 10 components agree
assert_allclose(
svd_10.explained_variance_ratio_,
svd_20.explained_variance_ratio_[:10],
rtol=5e-3,
)
    # Assert that 20 components have higher total explained variance than 10
assert (
svd_20.explained_variance_ratio_.sum() >
svd_10.explained_variance_ratio_.sum()
)
@pytest.mark.parametrize('solver', SVD_SOLVERS)
def test_singular_values_consistency(solver):
# Check that the TruncatedSVD output has the correct singular values
rng = np.random.RandomState(0)
n_samples, n_features = 100, 80
X = rng.randn(n_samples, n_features)
pca = TruncatedSVD(n_components=2, algorithm=solver,
random_state=rng).fit(X)
# Compare to the Frobenius norm
X_pca = pca.transform(X)
assert_allclose(np.sum(pca.singular_values_**2.0),
np.linalg.norm(X_pca, "fro")**2.0, rtol=1e-2)
# Compare to the 2-norms of the score vectors
assert_allclose(pca.singular_values_,
np.sqrt(np.sum(X_pca**2.0, axis=0)), rtol=1e-2)
@pytest.mark.parametrize('solver', SVD_SOLVERS)
def test_singular_values_expected(solver):
# Set the singular values and see what we get back
rng = np.random.RandomState(0)
n_samples = 100
n_features = 110
X = rng.randn(n_samples, n_features)
pca = TruncatedSVD(n_components=3, algorithm=solver,
random_state=rng)
X_pca = pca.fit_transform(X)
X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
X_pca[:, 0] *= 3.142
X_pca[:, 1] *= 2.718
X_hat_pca = np.dot(X_pca, pca.components_)
pca.fit(X_hat_pca)
assert_allclose(pca.singular_values_, [3.142, 2.718, 1.0], rtol=1e-14)
def test_truncated_svd_eq_pca(X_sparse):
# TruncatedSVD should be equal to PCA on centered data
X_dense = X_sparse.toarray()
X_c = X_dense - X_dense.mean(axis=0)
params = dict(n_components=10, random_state=42)
svd = TruncatedSVD(algorithm='arpack', **params)
pca = PCA(svd_solver='arpack', **params)
Xt_svd = svd.fit_transform(X_c)
Xt_pca = pca.fit_transform(X_c)
assert_allclose(Xt_svd, Xt_pca, rtol=1e-9)
assert_allclose(pca.mean_, 0, atol=1e-9)
assert_allclose(svd.components_, pca.components_)