# Author: Wei Xue # Thierry Guillemot # License: BSD 3 clause import sys import copy import warnings import pytest import numpy as np from scipy import stats, linalg from sklearn.covariance import EmpiricalCovariance from sklearn.datasets import make_spd_matrix from io import StringIO from sklearn.metrics.cluster import adjusted_rand_score from sklearn.mixture import GaussianMixture from sklearn.mixture._gaussian_mixture import ( _estimate_gaussian_covariances_full, _estimate_gaussian_covariances_tied, _estimate_gaussian_covariances_diag, _estimate_gaussian_covariances_spherical, _compute_precision_cholesky, _compute_log_det_cholesky, ) from sklearn.exceptions import ConvergenceWarning, NotFittedError from sklearn.utils.extmath import fast_logdet from sklearn.utils._testing import assert_allclose from sklearn.utils._testing import assert_almost_equal from sklearn.utils._testing import assert_array_almost_equal from sklearn.utils._testing import assert_array_equal from sklearn.utils._testing import assert_raise_message from sklearn.utils._testing import assert_warns_message from sklearn.utils._testing import ignore_warnings COVARIANCE_TYPE = ['full', 'tied', 'diag', 'spherical'] def generate_data(n_samples, n_features, weights, means, precisions, covariance_type): rng = np.random.RandomState(0) X = [] if covariance_type == 'spherical': for _, (w, m, c) in enumerate(zip(weights, means, precisions['spherical'])): X.append(rng.multivariate_normal(m, c * np.eye(n_features), int(np.round(w * n_samples)))) if covariance_type == 'diag': for _, (w, m, c) in enumerate(zip(weights, means, precisions['diag'])): X.append(rng.multivariate_normal(m, np.diag(c), int(np.round(w * n_samples)))) if covariance_type == 'tied': for _, (w, m) in enumerate(zip(weights, means)): X.append(rng.multivariate_normal(m, precisions['tied'], int(np.round(w * n_samples)))) if covariance_type == 'full': for _, (w, m, c) in enumerate(zip(weights, means, precisions['full'])): X.append(rng.multivariate_normal(m, c, int(np.round(w * n_samples)))) X = np.vstack(X) return X class RandomData: def __init__(self, rng, n_samples=200, n_components=2, n_features=2, scale=50): self.n_samples = n_samples self.n_components = n_components self.n_features = n_features self.weights = rng.rand(n_components) self.weights = self.weights / self.weights.sum() self.means = rng.rand(n_components, n_features) * scale self.covariances = { 'spherical': .5 + rng.rand(n_components), 'diag': (.5 + rng.rand(n_components, n_features)) ** 2, 'tied': make_spd_matrix(n_features, random_state=rng), 'full': np.array([ make_spd_matrix(n_features, random_state=rng) * .5 for _ in range(n_components)])} self.precisions = { 'spherical': 1. / self.covariances['spherical'], 'diag': 1. / self.covariances['diag'], 'tied': linalg.inv(self.covariances['tied']), 'full': np.array([linalg.inv(covariance) for covariance in self.covariances['full']])} self.X = dict(zip(COVARIANCE_TYPE, [generate_data( n_samples, n_features, self.weights, self.means, self.covariances, covar_type) for covar_type in COVARIANCE_TYPE])) self.Y = np.hstack([np.full(int(np.round(w * n_samples)), k, dtype=np.int) for k, w in enumerate(self.weights)]) def test_gaussian_mixture_attributes(): # test bad parameters rng = np.random.RandomState(0) X = rng.rand(10, 2) n_components_bad = 0 gmm = GaussianMixture(n_components=n_components_bad) assert_raise_message(ValueError, "Invalid value for 'n_components': %d " "Estimation requires at least one component" % n_components_bad, gmm.fit, X) # covariance_type should be in [spherical, diag, tied, full] covariance_type_bad = 'bad_covariance_type' gmm = GaussianMixture(covariance_type=covariance_type_bad) assert_raise_message(ValueError, "Invalid value for 'covariance_type': %s " "'covariance_type' should be in " "['spherical', 'tied', 'diag', 'full']" % covariance_type_bad, gmm.fit, X) tol_bad = -1 gmm = GaussianMixture(tol=tol_bad) assert_raise_message(ValueError, "Invalid value for 'tol': %.5f " "Tolerance used by the EM must be non-negative" % tol_bad, gmm.fit, X) reg_covar_bad = -1 gmm = GaussianMixture(reg_covar=reg_covar_bad) assert_raise_message(ValueError, "Invalid value for 'reg_covar': %.5f " "regularization on covariance must be " "non-negative" % reg_covar_bad, gmm.fit, X) max_iter_bad = 0 gmm = GaussianMixture(max_iter=max_iter_bad) assert_raise_message(ValueError, "Invalid value for 'max_iter': %d " "Estimation requires at least one iteration" % max_iter_bad, gmm.fit, X) n_init_bad = 0 gmm = GaussianMixture(n_init=n_init_bad) assert_raise_message(ValueError, "Invalid value for 'n_init': %d " "Estimation requires at least one run" % n_init_bad, gmm.fit, X) init_params_bad = 'bad_method' gmm = GaussianMixture(init_params=init_params_bad) assert_raise_message(ValueError, "Unimplemented initialization method '%s'" % init_params_bad, gmm.fit, X) # test good parameters n_components, tol, n_init, max_iter, reg_covar = 2, 1e-4, 3, 30, 1e-1 covariance_type, init_params = 'full', 'random' gmm = GaussianMixture(n_components=n_components, tol=tol, n_init=n_init, max_iter=max_iter, reg_covar=reg_covar, covariance_type=covariance_type, init_params=init_params).fit(X) assert gmm.n_components == n_components assert gmm.covariance_type == covariance_type assert gmm.tol == tol assert gmm.reg_covar == reg_covar assert gmm.max_iter == max_iter assert gmm.n_init == n_init assert gmm.init_params == init_params def test_check_X(): from sklearn.mixture._base import _check_X rng = np.random.RandomState(0) n_samples, n_components, n_features = 10, 2, 2 X_bad_dim = rng.rand(n_components - 1, n_features) assert_raise_message(ValueError, 'Expected n_samples >= n_components ' 'but got n_components = %d, n_samples = %d' % (n_components, X_bad_dim.shape[0]), _check_X, X_bad_dim, n_components) X_bad_dim = rng.rand(n_components, n_features + 1) assert_raise_message(ValueError, 'Expected the input data X have %d features, ' 'but got %d features' % (n_features, X_bad_dim.shape[1]), _check_X, X_bad_dim, n_components, n_features) X = rng.rand(n_samples, n_features) assert_array_equal(X, _check_X(X, n_components, n_features)) def test_check_weights(): rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components = rand_data.n_components X = rand_data.X['full'] g = GaussianMixture(n_components=n_components) # Check bad shape weights_bad_shape = rng.rand(n_components, 1) g.weights_init = weights_bad_shape assert_raise_message(ValueError, "The parameter 'weights' should have the shape of " "(%d,), but got %s" % (n_components, str(weights_bad_shape.shape)), g.fit, X) # Check bad range weights_bad_range = rng.rand(n_components) + 1 g.weights_init = weights_bad_range assert_raise_message(ValueError, "The parameter 'weights' should be in the range " "[0, 1], but got max value %.5f, min value %.5f" % (np.min(weights_bad_range), np.max(weights_bad_range)), g.fit, X) # Check bad normalization weights_bad_norm = rng.rand(n_components) weights_bad_norm = weights_bad_norm / (weights_bad_norm.sum() + 1) g.weights_init = weights_bad_norm assert_raise_message(ValueError, "The parameter 'weights' should be normalized, " "but got sum(weights) = %.5f" % np.sum(weights_bad_norm), g.fit, X) # Check good weights matrix weights = rand_data.weights g = GaussianMixture(weights_init=weights, n_components=n_components) g.fit(X) assert_array_equal(weights, g.weights_init) def test_check_means(): rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components, n_features = rand_data.n_components, rand_data.n_features X = rand_data.X['full'] g = GaussianMixture(n_components=n_components) # Check means bad shape means_bad_shape = rng.rand(n_components + 1, n_features) g.means_init = means_bad_shape assert_raise_message(ValueError, "The parameter 'means' should have the shape of ", g.fit, X) # Check good means matrix means = rand_data.means g.means_init = means g.fit(X) assert_array_equal(means, g.means_init) def test_check_precisions(): rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components, n_features = rand_data.n_components, rand_data.n_features # Define the bad precisions for each covariance_type precisions_bad_shape = { 'full': np.ones((n_components + 1, n_features, n_features)), 'tied': np.ones((n_features + 1, n_features + 1)), 'diag': np.ones((n_components + 1, n_features)), 'spherical': np.ones((n_components + 1))} # Define not positive-definite precisions precisions_not_pos = np.ones((n_components, n_features, n_features)) precisions_not_pos[0] = np.eye(n_features) precisions_not_pos[0, 0, 0] = -1. precisions_not_positive = { 'full': precisions_not_pos, 'tied': precisions_not_pos[0], 'diag': np.full((n_components, n_features), -1.), 'spherical': np.full(n_components, -1.)} not_positive_errors = { 'full': 'symmetric, positive-definite', 'tied': 'symmetric, positive-definite', 'diag': 'positive', 'spherical': 'positive'} for covar_type in COVARIANCE_TYPE: X = RandomData(rng).X[covar_type] g = GaussianMixture(n_components=n_components, covariance_type=covar_type, random_state=rng) # Check precisions with bad shapes g.precisions_init = precisions_bad_shape[covar_type] assert_raise_message(ValueError, "The parameter '%s precision' should have " "the shape of" % covar_type, g.fit, X) # Check not positive precisions g.precisions_init = precisions_not_positive[covar_type] assert_raise_message(ValueError, "'%s precision' should be %s" % (covar_type, not_positive_errors[covar_type]), g.fit, X) # Check the correct init of precisions_init g.precisions_init = rand_data.precisions[covar_type] g.fit(X) assert_array_equal(rand_data.precisions[covar_type], g.precisions_init) def test_suffstat_sk_full(): # compare the precision matrix compute from the # EmpiricalCovariance.covariance fitted on X*sqrt(resp) # with _sufficient_sk_full, n_components=1 rng = np.random.RandomState(0) n_samples, n_features = 500, 2 # special case 1, assuming data is "centered" X = rng.rand(n_samples, n_features) resp = rng.rand(n_samples, 1) X_resp = np.sqrt(resp) * X nk = np.array([n_samples]) xk = np.zeros((1, n_features)) covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) ecov = EmpiricalCovariance(assume_centered=True) ecov.fit(X_resp) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0) # check the precision computation precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full') precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) assert_array_almost_equal(precs_est, precs_pred) # special case 2, assuming resp are all ones resp = np.ones((n_samples, 1)) nk = np.array([n_samples]) xk = X.mean(axis=0).reshape((1, -1)) covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) ecov = EmpiricalCovariance(assume_centered=False) ecov.fit(X) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0) assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0) # check the precision computation precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full') precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred]) precs_est = np.array([linalg.inv(cov) for cov in covars_pred]) assert_array_almost_equal(precs_est, precs_pred) def test_suffstat_sk_tied(): # use equation Nk * Sk / N = S_tied rng = np.random.RandomState(0) n_samples, n_features, n_components = 500, 2, 2 resp = rng.rand(n_samples, n_components) resp = resp / resp.sum(axis=1)[:, np.newaxis] X = rng.rand(n_samples, n_features) nk = resp.sum(axis=0) xk = np.dot(resp.T, X) / nk[:, np.newaxis] covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) covars_pred_full = np.sum(nk[:, np.newaxis, np.newaxis] * covars_pred_full, 0) / n_samples covars_pred_tied = _estimate_gaussian_covariances_tied(resp, X, nk, xk, 0) ecov = EmpiricalCovariance() ecov.covariance_ = covars_pred_full assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='frobenius'), 0) assert_almost_equal(ecov.error_norm(covars_pred_tied, norm='spectral'), 0) # check the precision computation precs_chol_pred = _compute_precision_cholesky(covars_pred_tied, 'tied') precs_pred = np.dot(precs_chol_pred, precs_chol_pred.T) precs_est = linalg.inv(covars_pred_tied) assert_array_almost_equal(precs_est, precs_pred) def test_suffstat_sk_diag(): # test against 'full' case rng = np.random.RandomState(0) n_samples, n_features, n_components = 500, 2, 2 resp = rng.rand(n_samples, n_components) resp = resp / resp.sum(axis=1)[:, np.newaxis] X = rng.rand(n_samples, n_features) nk = resp.sum(axis=0) xk = np.dot(resp.T, X) / nk[:, np.newaxis] covars_pred_full = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0) covars_pred_diag = _estimate_gaussian_covariances_diag(resp, X, nk, xk, 0) ecov = EmpiricalCovariance() for (cov_full, cov_diag) in zip(covars_pred_full, covars_pred_diag): ecov.covariance_ = np.diag(np.diag(cov_full)) cov_diag = np.diag(cov_diag) assert_almost_equal(ecov.error_norm(cov_diag, norm='frobenius'), 0) assert_almost_equal(ecov.error_norm(cov_diag, norm='spectral'), 0) # check the precision computation precs_chol_pred = _compute_precision_cholesky(covars_pred_diag, 'diag') assert_almost_equal(covars_pred_diag, 1. / precs_chol_pred ** 2) def test_gaussian_suffstat_sk_spherical(): # computing spherical covariance equals to the variance of one-dimension # data after flattening, n_components=1 rng = np.random.RandomState(0) n_samples, n_features = 500, 2 X = rng.rand(n_samples, n_features) X = X - X.mean() resp = np.ones((n_samples, 1)) nk = np.array([n_samples]) xk = X.mean() covars_pred_spherical = _estimate_gaussian_covariances_spherical(resp, X, nk, xk, 0) covars_pred_spherical2 = (np.dot(X.flatten().T, X.flatten()) / (n_features * n_samples)) assert_almost_equal(covars_pred_spherical, covars_pred_spherical2) # check the precision computation precs_chol_pred = _compute_precision_cholesky(covars_pred_spherical, 'spherical') assert_almost_equal(covars_pred_spherical, 1. / precs_chol_pred ** 2) def test_compute_log_det_cholesky(): n_features = 2 rand_data = RandomData(np.random.RandomState(0)) for covar_type in COVARIANCE_TYPE: covariance = rand_data.covariances[covar_type] if covar_type == 'full': predected_det = np.array([linalg.det(cov) for cov in covariance]) elif covar_type == 'tied': predected_det = linalg.det(covariance) elif covar_type == 'diag': predected_det = np.array([np.prod(cov) for cov in covariance]) elif covar_type == 'spherical': predected_det = covariance ** n_features # We compute the cholesky decomposition of the covariance matrix expected_det = _compute_log_det_cholesky(_compute_precision_cholesky( covariance, covar_type), covar_type, n_features=n_features) assert_array_almost_equal(expected_det, - .5 * np.log(predected_det)) def _naive_lmvnpdf_diag(X, means, covars): resp = np.empty((len(X), len(means))) stds = np.sqrt(covars) for i, (mean, std) in enumerate(zip(means, stds)): resp[:, i] = stats.norm.logpdf(X, mean, std).sum(axis=1) return resp def test_gaussian_mixture_log_probabilities(): from sklearn.mixture._gaussian_mixture import _estimate_log_gaussian_prob # test against with _naive_lmvnpdf_diag rng = np.random.RandomState(0) rand_data = RandomData(rng) n_samples = 500 n_features = rand_data.n_features n_components = rand_data.n_components means = rand_data.means covars_diag = rng.rand(n_components, n_features) X = rng.rand(n_samples, n_features) log_prob_naive = _naive_lmvnpdf_diag(X, means, covars_diag) # full covariances precs_full = np.array([np.diag(1. / np.sqrt(x)) for x in covars_diag]) log_prob = _estimate_log_gaussian_prob(X, means, precs_full, 'full') assert_array_almost_equal(log_prob, log_prob_naive) # diag covariances precs_chol_diag = 1. / np.sqrt(covars_diag) log_prob = _estimate_log_gaussian_prob(X, means, precs_chol_diag, 'diag') assert_array_almost_equal(log_prob, log_prob_naive) # tied covars_tied = np.array([x for x in covars_diag]).mean(axis=0) precs_tied = np.diag(np.sqrt(1. / covars_tied)) log_prob_naive = _naive_lmvnpdf_diag(X, means, [covars_tied] * n_components) log_prob = _estimate_log_gaussian_prob(X, means, precs_tied, 'tied') assert_array_almost_equal(log_prob, log_prob_naive) # spherical covars_spherical = covars_diag.mean(axis=1) precs_spherical = 1. / np.sqrt(covars_diag.mean(axis=1)) log_prob_naive = _naive_lmvnpdf_diag(X, means, [[k] * n_features for k in covars_spherical]) log_prob = _estimate_log_gaussian_prob(X, means, precs_spherical, 'spherical') assert_array_almost_equal(log_prob, log_prob_naive) # skip tests on weighted_log_probabilities, log_weights def test_gaussian_mixture_estimate_log_prob_resp(): # test whether responsibilities are normalized rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=5) n_samples = rand_data.n_samples n_features = rand_data.n_features n_components = rand_data.n_components X = rng.rand(n_samples, n_features) for covar_type in COVARIANCE_TYPE: weights = rand_data.weights means = rand_data.means precisions = rand_data.precisions[covar_type] g = GaussianMixture(n_components=n_components, random_state=rng, weights_init=weights, means_init=means, precisions_init=precisions, covariance_type=covar_type) g.fit(X) resp = g.predict_proba(X) assert_array_almost_equal(resp.sum(axis=1), np.ones(n_samples)) assert_array_equal(g.weights_init, weights) assert_array_equal(g.means_init, means) assert_array_equal(g.precisions_init, precisions) def test_gaussian_mixture_predict_predict_proba(): rng = np.random.RandomState(0) rand_data = RandomData(rng) for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] Y = rand_data.Y g = GaussianMixture(n_components=rand_data.n_components, random_state=rng, weights_init=rand_data.weights, means_init=rand_data.means, precisions_init=rand_data.precisions[covar_type], covariance_type=covar_type) # Check a warning message arrive if we don't do fit assert_raise_message(NotFittedError, "This GaussianMixture instance is not fitted " "yet. Call 'fit' with appropriate arguments " "before using this estimator.", g.predict, X) g.fit(X) Y_pred = g.predict(X) Y_pred_proba = g.predict_proba(X).argmax(axis=1) assert_array_equal(Y_pred, Y_pred_proba) assert adjusted_rand_score(Y, Y_pred) > .95 @pytest.mark.filterwarnings("ignore:.*did not converge.*") @pytest.mark.parametrize('seed, max_iter, tol', [ (0, 2, 1e-7), # strict non-convergence (1, 2, 1e-1), # loose non-convergence (3, 300, 1e-7), # strict convergence (4, 300, 1e-1), # loose convergence ]) def test_gaussian_mixture_fit_predict(seed, max_iter, tol): rng = np.random.RandomState(seed) rand_data = RandomData(rng) for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] Y = rand_data.Y g = GaussianMixture(n_components=rand_data.n_components, random_state=rng, weights_init=rand_data.weights, means_init=rand_data.means, precisions_init=rand_data.precisions[covar_type], covariance_type=covar_type, max_iter=max_iter, tol=tol) # check if fit_predict(X) is equivalent to fit(X).predict(X) f = copy.deepcopy(g) Y_pred1 = f.fit(X).predict(X) Y_pred2 = g.fit_predict(X) assert_array_equal(Y_pred1, Y_pred2) assert adjusted_rand_score(Y, Y_pred2) > .95 def test_gaussian_mixture_fit_predict_n_init(): # Check that fit_predict is equivalent to fit.predict, when n_init > 1 X = np.random.RandomState(0).randn(1000, 5) gm = GaussianMixture(n_components=5, n_init=5, random_state=0) y_pred1 = gm.fit_predict(X) y_pred2 = gm.predict(X) assert_array_equal(y_pred1, y_pred2) def test_gaussian_mixture_fit(): # recover the ground truth rng = np.random.RandomState(0) rand_data = RandomData(rng) n_features = rand_data.n_features n_components = rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] g = GaussianMixture(n_components=n_components, n_init=20, reg_covar=0, random_state=rng, covariance_type=covar_type) g.fit(X) # needs more data to pass the test with rtol=1e-7 assert_allclose(np.sort(g.weights_), np.sort(rand_data.weights), rtol=0.1, atol=1e-2) arg_idx1 = g.means_[:, 0].argsort() arg_idx2 = rand_data.means[:, 0].argsort() assert_allclose(g.means_[arg_idx1], rand_data.means[arg_idx2], rtol=0.1, atol=1e-2) if covar_type == 'full': prec_pred = g.precisions_ prec_test = rand_data.precisions['full'] elif covar_type == 'tied': prec_pred = np.array([g.precisions_] * n_components) prec_test = np.array([rand_data.precisions['tied']] * n_components) elif covar_type == 'spherical': prec_pred = np.array([np.eye(n_features) * c for c in g.precisions_]) prec_test = np.array([np.eye(n_features) * c for c in rand_data.precisions['spherical']]) elif covar_type == 'diag': prec_pred = np.array([np.diag(d) for d in g.precisions_]) prec_test = np.array([np.diag(d) for d in rand_data.precisions['diag']]) arg_idx1 = np.trace(prec_pred, axis1=1, axis2=2).argsort() arg_idx2 = np.trace(prec_test, axis1=1, axis2=2).argsort() for k, h in zip(arg_idx1, arg_idx2): ecov = EmpiricalCovariance() ecov.covariance_ = prec_test[h] # the accuracy depends on the number of data and randomness, rng assert_allclose(ecov.error_norm(prec_pred[k]), 0, atol=0.15) def test_gaussian_mixture_fit_best_params(): rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components = rand_data.n_components n_init = 10 for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] g = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0, random_state=rng, covariance_type=covar_type) ll = [] for _ in range(n_init): g.fit(X) ll.append(g.score(X)) ll = np.array(ll) g_best = GaussianMixture(n_components=n_components, n_init=n_init, reg_covar=0, random_state=rng, covariance_type=covar_type) g_best.fit(X) assert_almost_equal(ll.min(), g_best.score(X)) def test_gaussian_mixture_fit_convergence_warning(): rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=1) n_components = rand_data.n_components max_iter = 1 for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] g = GaussianMixture(n_components=n_components, n_init=1, max_iter=max_iter, reg_covar=0, random_state=rng, covariance_type=covar_type) assert_warns_message(ConvergenceWarning, 'Initialization %d did not converge. ' 'Try different init parameters, ' 'or increase max_iter, tol ' 'or check for degenerate data.' % max_iter, g.fit, X) def test_multiple_init(): # Test that multiple inits does not much worse than a single one rng = np.random.RandomState(0) n_samples, n_features, n_components = 50, 5, 2 X = rng.randn(n_samples, n_features) for cv_type in COVARIANCE_TYPE: train1 = GaussianMixture(n_components=n_components, covariance_type=cv_type, random_state=0).fit(X).score(X) train2 = GaussianMixture(n_components=n_components, covariance_type=cv_type, random_state=0, n_init=5).fit(X).score(X) assert train2 >= train1 def test_gaussian_mixture_n_parameters(): # Test that the right number of parameters is estimated rng = np.random.RandomState(0) n_samples, n_features, n_components = 50, 5, 2 X = rng.randn(n_samples, n_features) n_params = {'spherical': 13, 'diag': 21, 'tied': 26, 'full': 41} for cv_type in COVARIANCE_TYPE: g = GaussianMixture( n_components=n_components, covariance_type=cv_type, random_state=rng).fit(X) assert g._n_parameters() == n_params[cv_type] def test_bic_1d_1component(): # Test all of the covariance_types return the same BIC score for # 1-dimensional, 1 component fits. rng = np.random.RandomState(0) n_samples, n_dim, n_components = 100, 1, 1 X = rng.randn(n_samples, n_dim) bic_full = GaussianMixture(n_components=n_components, covariance_type='full', random_state=rng).fit(X).bic(X) for covariance_type in ['tied', 'diag', 'spherical']: bic = GaussianMixture(n_components=n_components, covariance_type=covariance_type, random_state=rng).fit(X).bic(X) assert_almost_equal(bic_full, bic) def test_gaussian_mixture_aic_bic(): # Test the aic and bic criteria rng = np.random.RandomState(0) n_samples, n_features, n_components = 50, 3, 2 X = rng.randn(n_samples, n_features) # standard gaussian entropy sgh = 0.5 * (fast_logdet(np.cov(X.T, bias=1)) + n_features * (1 + np.log(2 * np.pi))) for cv_type in COVARIANCE_TYPE: g = GaussianMixture( n_components=n_components, covariance_type=cv_type, random_state=rng, max_iter=200) g.fit(X) aic = 2 * n_samples * sgh + 2 * g._n_parameters() bic = (2 * n_samples * sgh + np.log(n_samples) * g._n_parameters()) bound = n_features / np.sqrt(n_samples) assert (g.aic(X) - aic) / n_samples < bound assert (g.bic(X) - bic) / n_samples < bound def test_gaussian_mixture_verbose(): rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components = rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] g = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0, random_state=rng, covariance_type=covar_type, verbose=1) h = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0, random_state=rng, covariance_type=covar_type, verbose=2) old_stdout = sys.stdout sys.stdout = StringIO() try: g.fit(X) h.fit(X) finally: sys.stdout = old_stdout @pytest.mark.filterwarnings('ignore:.*did not converge.*') @pytest.mark.parametrize("seed", (0, 1, 2)) def test_warm_start(seed): random_state = seed rng = np.random.RandomState(random_state) n_samples, n_features, n_components = 500, 2, 2 X = rng.rand(n_samples, n_features) # Assert the warm_start give the same result for the same number of iter g = GaussianMixture(n_components=n_components, n_init=1, max_iter=2, reg_covar=0, random_state=random_state, warm_start=False) h = GaussianMixture(n_components=n_components, n_init=1, max_iter=1, reg_covar=0, random_state=random_state, warm_start=True) g.fit(X) score1 = h.fit(X).score(X) score2 = h.fit(X).score(X) assert_almost_equal(g.weights_, h.weights_) assert_almost_equal(g.means_, h.means_) assert_almost_equal(g.precisions_, h.precisions_) assert score2 > score1 # Assert that by using warm_start we can converge to a good solution g = GaussianMixture(n_components=n_components, n_init=1, max_iter=5, reg_covar=0, random_state=random_state, warm_start=False, tol=1e-6) h = GaussianMixture(n_components=n_components, n_init=1, max_iter=5, reg_covar=0, random_state=random_state, warm_start=True, tol=1e-6) g.fit(X) assert not g.converged_ h.fit(X) # depending on the data there is large variability in the number of # refit necessary to converge due to the complete randomness of the # data for _ in range(1000): h.fit(X) if h.converged_: break assert h.converged_ @ignore_warnings(category=ConvergenceWarning) def test_convergence_detected_with_warm_start(): # We check that convergence is detected when warm_start=True rng = np.random.RandomState(0) rand_data = RandomData(rng) n_components = rand_data.n_components X = rand_data.X['full'] for max_iter in (1, 2, 50): gmm = GaussianMixture(n_components=n_components, warm_start=True, max_iter=max_iter, random_state=rng) for _ in range(100): gmm.fit(X) if gmm.converged_: break assert gmm.converged_ assert max_iter >= gmm.n_iter_ def test_score(): covar_type = 'full' rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components = rand_data.n_components X = rand_data.X[covar_type] # Check the error message if we don't call fit gmm1 = GaussianMixture(n_components=n_components, n_init=1, max_iter=1, reg_covar=0, random_state=rng, covariance_type=covar_type) assert_raise_message(NotFittedError, "This GaussianMixture instance is not fitted " "yet. Call 'fit' with appropriate arguments " "before using this estimator.", gmm1.score, X) # Check score value with warnings.catch_warnings(): warnings.simplefilter("ignore", ConvergenceWarning) gmm1.fit(X) gmm_score = gmm1.score(X) gmm_score_proba = gmm1.score_samples(X).mean() assert_almost_equal(gmm_score, gmm_score_proba) # Check if the score increase gmm2 = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0, random_state=rng, covariance_type=covar_type).fit(X) assert gmm2.score(X) > gmm1.score(X) def test_score_samples(): covar_type = 'full' rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components = rand_data.n_components X = rand_data.X[covar_type] # Check the error message if we don't call fit gmm = GaussianMixture(n_components=n_components, n_init=1, reg_covar=0, random_state=rng, covariance_type=covar_type) assert_raise_message(NotFittedError, "This GaussianMixture instance is not fitted " "yet. Call 'fit' with appropriate arguments " "before using this estimator.", gmm.score_samples, X) gmm_score_samples = gmm.fit(X).score_samples(X) assert gmm_score_samples.shape[0] == rand_data.n_samples def test_monotonic_likelihood(): # We check that each step of the EM without regularization improve # monotonically the training set likelihood rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components = rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] gmm = GaussianMixture(n_components=n_components, covariance_type=covar_type, reg_covar=0, warm_start=True, max_iter=1, random_state=rng, tol=1e-7) current_log_likelihood = -np.infty with warnings.catch_warnings(): warnings.simplefilter("ignore", ConvergenceWarning) # Do one training iteration at a time so we can make sure that the # training log likelihood increases after each iteration. for _ in range(600): prev_log_likelihood = current_log_likelihood current_log_likelihood = gmm.fit(X).score(X) assert current_log_likelihood >= prev_log_likelihood if gmm.converged_: break assert gmm.converged_ def test_regularisation(): # We train the GaussianMixture on degenerate data by defining two clusters # of a 0 covariance. rng = np.random.RandomState(0) n_samples, n_features = 10, 5 X = np.vstack((np.ones((n_samples // 2, n_features)), np.zeros((n_samples // 2, n_features)))) for covar_type in COVARIANCE_TYPE: gmm = GaussianMixture(n_components=n_samples, reg_covar=0, covariance_type=covar_type, random_state=rng) with warnings.catch_warnings(): warnings.simplefilter("ignore", RuntimeWarning) assert_raise_message(ValueError, "Fitting the mixture model failed because " "some components have ill-defined empirical " "covariance (for instance caused by " "singleton or collapsed samples). Try to " "decrease the number of components, or " "increase reg_covar.", gmm.fit, X) gmm.set_params(reg_covar=1e-6).fit(X) def test_property(): rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7) n_components = rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] gmm = GaussianMixture(n_components=n_components, covariance_type=covar_type, random_state=rng, n_init=5) gmm.fit(X) if covar_type == 'full': for prec, covar in zip(gmm.precisions_, gmm.covariances_): assert_array_almost_equal(linalg.inv(prec), covar) elif covar_type == 'tied': assert_array_almost_equal(linalg.inv(gmm.precisions_), gmm.covariances_) else: assert_array_almost_equal(gmm.precisions_, 1. / gmm.covariances_) def test_sample(): rng = np.random.RandomState(0) rand_data = RandomData(rng, scale=7, n_components=3) n_features, n_components = rand_data.n_features, rand_data.n_components for covar_type in COVARIANCE_TYPE: X = rand_data.X[covar_type] gmm = GaussianMixture(n_components=n_components, covariance_type=covar_type, random_state=rng) # To sample we need that GaussianMixture is fitted assert_raise_message(NotFittedError, "This GaussianMixture instance " "is not fitted", gmm.sample, 0) gmm.fit(X) assert_raise_message(ValueError, "Invalid value for 'n_samples", gmm.sample, 0) # Just to make sure the class samples correctly n_samples = 20000 X_s, y_s = gmm.sample(n_samples) for k in range(n_components): if covar_type == 'full': assert_array_almost_equal(gmm.covariances_[k], np.cov(X_s[y_s == k].T), decimal=1) elif covar_type == 'tied': assert_array_almost_equal(gmm.covariances_, np.cov(X_s[y_s == k].T), decimal=1) elif covar_type == 'diag': assert_array_almost_equal(gmm.covariances_[k], np.diag(np.cov(X_s[y_s == k].T)), decimal=1) else: assert_array_almost_equal( gmm.covariances_[k], np.var(X_s[y_s == k] - gmm.means_[k]), decimal=1) means_s = np.array([np.mean(X_s[y_s == k], 0) for k in range(n_components)]) assert_array_almost_equal(gmm.means_, means_s, decimal=1) # Check shapes of sampled data, see # https://github.com/scikit-learn/scikit-learn/issues/7701 assert X_s.shape == (n_samples, n_features) for sample_size in range(1, 100): X_s, _ = gmm.sample(sample_size) assert X_s.shape == (sample_size, n_features) @ignore_warnings(category=ConvergenceWarning) def test_init(): # We check that by increasing the n_init number we have a better solution for random_state in range(15): rand_data = RandomData(np.random.RandomState(random_state), n_samples=50, scale=1) n_components = rand_data.n_components X = rand_data.X['full'] gmm1 = GaussianMixture(n_components=n_components, n_init=1, max_iter=1, random_state=random_state).fit(X) gmm2 = GaussianMixture(n_components=n_components, n_init=10, max_iter=1, random_state=random_state).fit(X) assert gmm2.lower_bound_ >= gmm1.lower_bound_