Uploaded Test files

Batuhan Berk Başoğlu 2020-11-12 11:05:57 -05:00
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions


@@ -0,0 +1,35 @@
"""
The :mod:`sklearn.covariance` module includes methods and algorithms to
robustly estimate the covariance of features given a set of points. The
precision matrix defined as the inverse of the covariance is also estimated.
Covariance estimation is closely related to the theory of Gaussian Graphical
Models.
"""
from ._empirical_covariance import (empirical_covariance,
EmpiricalCovariance,
log_likelihood)
from ._shrunk_covariance import (shrunk_covariance, ShrunkCovariance,
ledoit_wolf, ledoit_wolf_shrinkage,
LedoitWolf, oas, OAS)
from ._robust_covariance import fast_mcd, MinCovDet
from ._graph_lasso import graphical_lasso, GraphicalLasso, GraphicalLassoCV
from ._elliptic_envelope import EllipticEnvelope
__all__ = ['EllipticEnvelope',
'EmpiricalCovariance',
'GraphicalLasso',
'GraphicalLassoCV',
'LedoitWolf',
'MinCovDet',
'OAS',
'ShrunkCovariance',
'empirical_covariance',
'fast_mcd',
'graphical_lasso',
'ledoit_wolf',
'ledoit_wolf_shrinkage',
'log_likelihood',
'oas',
'shrunk_covariance']
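
As a quick sanity check of the public API listed above, any of the exported
estimators can be fitted on a small sample. A minimal sketch follows; the toy
data and the choice of EmpiricalCovariance/LedoitWolf are illustrative only:

import numpy as np
from sklearn.covariance import EmpiricalCovariance, LedoitWolf

rng = np.random.RandomState(0)
# 100 samples of 3 mildly correlated features (made-up data)
X = rng.multivariate_normal(mean=[0., 0., 0.],
                            cov=[[1., .3, 0.], [.3, 1., .2], [0., .2, 1.]],
                            size=100)
emp = EmpiricalCovariance().fit(X)   # plain maximum-likelihood estimate
lw = LedoitWolf().fit(X)             # shrunk estimate, often better conditioned
print(emp.covariance_.shape)         # (3, 3)
print(lw.shrinkage_)                 # shrinkage coefficient selected by Ledoit-Wolf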


@@ -0,0 +1,230 @@
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import numpy as np
from . import MinCovDet
from ..utils.validation import check_is_fitted, check_array
from ..utils.validation import _deprecate_positional_args
from ..metrics import accuracy_score
from ..base import OutlierMixin
class EllipticEnvelope(OutlierMixin, MinCovDet):
"""An object for detecting outliers in a Gaussian distributed dataset.
Read more in the :ref:`User Guide <outlier_detection>`.
Parameters
----------
store_precision : bool, default=True
Specify if the estimated precision is stored.
assume_centered : bool, default=False
If True, the support of robust location and covariance estimates
is computed, and a covariance estimate is recomputed from it,
without centering the data.
Useful to work with data whose mean is almost, but not exactly,
zero.
If False, the robust location and covariance are directly computed
with the FastMCD algorithm without additional treatment.
support_fraction : float, default=None
The proportion of points to be included in the support of the raw
MCD estimate. If None, the minimum value of support_fraction will
be used within the algorithm: `[n_samples + n_features + 1] / 2`.
Range is (0, 1).
contamination : float, default=0.1
The amount of contamination of the data set, i.e. the proportion
of outliers in the data set. Range is (0, 0.5).
random_state : int or RandomState instance, default=None
Determines the pseudo random number generator for shuffling
the data. Pass an int for reproducible results across multiple function
calls. See :term:`Glossary <random_state>`.
Attributes
----------
location_ : ndarray of shape (n_features,)
Estimated robust location
covariance_ : ndarray of shape (n_features, n_features)
Estimated robust covariance matrix
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo inverse matrix.
(stored only if store_precision is True)
support_ : ndarray of shape (n_samples,)
A mask of the observations that have been used to compute the
robust estimates of location and shape.
offset_ : float
Offset used to define the decision function from the raw scores.
We have the relation: ``decision_function = score_samples - offset_``.
The offset depends on the contamination parameter and is defined in
such a way we obtain the expected number of outliers (samples with
decision function < 0) in training.
.. versionadded:: 0.20
raw_location_ : ndarray of shape (n_features,)
The raw robust estimated location before correction and re-weighting.
raw_covariance_ : ndarray of shape (n_features, n_features)
The raw robust estimated covariance before correction and re-weighting.
raw_support_ : ndarray of shape (n_samples,)
A mask of the observations that have been used to compute
the raw robust estimates of location and shape, before correction
and re-weighting.
dist_ : ndarray of shape (n_samples,)
Mahalanobis distances of the training set (on which :meth:`fit` is
called) observations.
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import EllipticEnvelope
>>> true_cov = np.array([[.8, .3],
... [.3, .4]])
>>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
... cov=true_cov,
... size=500)
>>> cov = EllipticEnvelope(random_state=0).fit(X)
>>> # predict returns 1 for an inlier and -1 for an outlier
>>> cov.predict([[0, 0],
... [3, 3]])
array([ 1, -1])
>>> cov.covariance_
array([[0.7411..., 0.2535...],
[0.2535..., 0.3053...]])
>>> cov.location_
array([0.0813... , 0.0427...])
See Also
--------
EmpiricalCovariance, MinCovDet
Notes
-----
Outlier detection from covariance estimation may break or not
perform well in high-dimensional settings. In particular, one should
always take care to work with ``n_samples > n_features ** 2``.
References
----------
.. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
minimum covariance determinant estimator" Technometrics 41(3), 212
(1999)
"""
@_deprecate_positional_args
def __init__(self, *, store_precision=True, assume_centered=False,
support_fraction=None, contamination=0.1,
random_state=None):
super().__init__(
store_precision=store_precision,
assume_centered=assume_centered,
support_fraction=support_fraction,
random_state=random_state)
self.contamination = contamination
def fit(self, X, y=None):
"""Fit the EllipticEnvelope model.
Parameters
----------
X : {array-like, sparse matrix} of shape (n_samples, n_features)
Training data.
y : Ignored
Not used, present for API consistency by convention.
"""
super().fit(X)
self.offset_ = np.percentile(-self.dist_, 100. * self.contamination)
return self
def decision_function(self, X):
"""Compute the decision function of the given observations.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix.
Returns
-------
decision : ndarray of shape (n_samples, )
Decision function of the samples.
It is equal to the shifted Mahalanobis distances.
The threshold for being an outlier is 0, which ensures a
compatibility with other outlier detection algorithms.
"""
check_is_fitted(self)
negative_mahal_dist = self.score_samples(X)
return negative_mahal_dist - self.offset_
def score_samples(self, X):
"""Compute the negative Mahalanobis distances.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix.
Returns
-------
negative_mahal_distances : array-like of shape (n_samples,)
Opposite of the Mahalanobis distances.
"""
check_is_fitted(self)
return -self.mahalanobis(X)
def predict(self, X):
"""
Predict the labels (1 inlier, -1 outlier) of X according to the
fitted model.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix.
Returns
-------
is_inlier : ndarray of shape (n_samples,)
Returns -1 for anomalies/outliers and +1 for inliers.
"""
X = check_array(X)
is_inlier = np.full(X.shape[0], -1, dtype=int)
values = self.decision_function(X)
is_inlier[values >= 0] = 1
return is_inlier
def score(self, X, y, sample_weight=None):
"""Returns the mean accuracy on the given test data and labels.
In multi-label classification, this is the subset accuracy
which is a harsh metric since you require for each sample that
each label set be correctly predicted.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Test samples.
y : array-like of shape (n_samples,) or (n_samples, n_outputs)
True labels for X.
sample_weight : array-like of shape (n_samples,), default=None
Sample weights.
Returns
-------
score : float
Mean accuracy of self.predict(X) w.r.t. y.
"""
return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
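
A short sketch of the ``offset_`` relation documented above
(``decision_function = score_samples - offset_``), on made-up Gaussian data;
with the default training-set percentile, roughly ``contamination * n_samples``
training points end up with a negative decision function:

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(42)
X = rng.multivariate_normal(mean=[0, 0], cov=[[1., .2], [.2, 1.]], size=300)
est = EllipticEnvelope(contamination=0.1, random_state=0).fit(X)
# score_samples returns the negative Mahalanobis distances; the decision
# function is the same quantity shifted by the fitted offset_
assert np.allclose(est.decision_function(X),
                   est.score_samples(X) - est.offset_)
# predict flags points with a negative decision function as outliers (-1)
print((est.predict(X) == -1).sum())  # close to 0.1 * 300 = 30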


@@ -0,0 +1,317 @@
"""
Maximum likelihood covariance estimator.
"""
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
# avoid division truncation
import warnings
import numpy as np
from scipy import linalg
from ..base import BaseEstimator
from ..utils import check_array
from ..utils.extmath import fast_logdet
from ..metrics.pairwise import pairwise_distances
from ..utils.validation import _deprecate_positional_args
def log_likelihood(emp_cov, precision):
"""Computes the sample mean of the log_likelihood under a covariance model
computes the empirical expected log-likelihood (accounting for the
normalization terms and scaling), allowing for universal comparison (beyond
this software package)
Parameters
----------
emp_cov : ndarray of shape (n_features, n_features)
Maximum Likelihood Estimator of covariance.
precision : ndarray of shape (n_features, n_features)
The precision matrix of the covariance model to be tested.
Returns
-------
log_likelihood_ : float
Sample mean of the log-likelihood.
"""
p = precision.shape[0]
log_likelihood_ = - np.sum(emp_cov * precision) + fast_logdet(precision)
log_likelihood_ -= p * np.log(2 * np.pi)
log_likelihood_ /= 2.
return log_likelihood_
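# Written out, for an empirical covariance S (emp_cov) and a tested
# precision K (precision) over p features, the value returned above is
#     (-trace(S @ K) + logdet(K) - p * log(2 * pi)) / 2,
# i.e. the sample mean of the Gaussian log-likelihood of the (centered)
# data whose empirical covariance is S, under the model with precision K.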
@_deprecate_positional_args
def empirical_covariance(X, *, assume_centered=False):
"""Computes the Maximum likelihood covariance estimator
Parameters
----------
X : ndarray of shape (n_samples, n_features)
Data from which to compute the covariance estimate
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False, data will be centered before computation.
Returns
-------
covariance : ndarray of shape (n_features, n_features)
Empirical covariance (Maximum Likelihood Estimator).
Examples
--------
>>> from sklearn.covariance import empirical_covariance
>>> X = [[1,1,1],[1,1,1],[1,1,1],
... [0,0,0],[0,0,0],[0,0,0]]
>>> empirical_covariance(X)
array([[0.25, 0.25, 0.25],
[0.25, 0.25, 0.25],
[0.25, 0.25, 0.25]])
"""
X = np.asarray(X)
if X.ndim == 1:
X = np.reshape(X, (1, -1))
if X.shape[0] == 1:
warnings.warn("Only one sample available. "
"You may want to reshape your data array")
if assume_centered:
covariance = np.dot(X.T, X) / X.shape[0]
else:
covariance = np.cov(X.T, bias=1)
if covariance.ndim == 0:
covariance = np.array([[covariance]])
return covariance
class EmpiricalCovariance(BaseEstimator):
"""Maximum likelihood covariance estimator
Read more in the :ref:`User Guide <covariance>`.
Parameters
----------
store_precision : bool, default=True
Specifies if the estimated precision is stored.
assume_centered : bool, default=False
If True, data are not centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False (default), data are centered before computation.
Attributes
----------
location_ : ndarray of shape (n_features,)
Estimated location, i.e. the estimated mean.
covariance_ : ndarray of shape (n_features, n_features)
Estimated covariance matrix
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo-inverse matrix.
(stored only if store_precision is True)
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import EmpiricalCovariance
>>> from sklearn.datasets import make_gaussian_quantiles
>>> real_cov = np.array([[.8, .3],
... [.3, .4]])
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=real_cov,
... size=500)
>>> cov = EmpiricalCovariance().fit(X)
>>> cov.covariance_
array([[0.7569..., 0.2818...],
[0.2818..., 0.3928...]])
>>> cov.location_
array([0.0622..., 0.0193...])
"""
@_deprecate_positional_args
def __init__(self, *, store_precision=True, assume_centered=False):
self.store_precision = store_precision
self.assume_centered = assume_centered
def _set_covariance(self, covariance):
"""Saves the covariance and precision estimates
Storage is done accordingly to `self.store_precision`.
Precision stored only if invertible.
Parameters
----------
covariance : array-like of shape (n_features, n_features)
Estimated covariance matrix to be stored, and from which precision
is computed.
"""
covariance = check_array(covariance)
# set covariance
self.covariance_ = covariance
# set precision
if self.store_precision:
self.precision_ = linalg.pinvh(covariance)
else:
self.precision_ = None
def get_precision(self):
"""Getter for the precision matrix.
Returns
-------
precision_ : array-like of shape (n_features, n_features)
The precision matrix associated to the current covariance object.
"""
if self.store_precision:
precision = self.precision_
else:
precision = linalg.pinvh(self.covariance_)
return precision
def fit(self, X, y=None):
"""Fits the Maximum Likelihood Estimator covariance model
according to the given training data and parameters.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data, where n_samples is the number of samples and
n_features is the number of features.
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
"""
X = self._validate_data(X)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
self.location_ = X.mean(0)
covariance = empirical_covariance(
X, assume_centered=self.assume_centered)
self._set_covariance(covariance)
return self
def score(self, X_test, y=None):
"""Computes the log-likelihood of a Gaussian data set with
`self.covariance_` as an estimator of its covariance matrix.
Parameters
----------
X_test : array-like of shape (n_samples, n_features)
Test data of which we compute the likelihood, where n_samples is
the number of samples and n_features is the number of features.
X_test is assumed to be drawn from the same distribution as
the data used in fit (including centering).
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
res : float
The likelihood of the data set with `self.covariance_` as an
estimator of its covariance matrix.
"""
# compute empirical covariance of the test set
test_cov = empirical_covariance(
X_test - self.location_, assume_centered=True)
# compute log likelihood
res = log_likelihood(test_cov, self.get_precision())
return res
def error_norm(self, comp_cov, norm='frobenius', scaling=True,
squared=True):
"""Computes the Mean Squared Error between two covariance estimators.
(In the sense of the Frobenius norm).
Parameters
----------
comp_cov : array-like of shape (n_features, n_features)
The covariance to compare with.
norm : {"frobenius", "spectral"}, default="frobenius"
The type of norm used to compute the error. Available error types:
- 'frobenius' (default): sqrt(tr(A^t.A))
- 'spectral': sqrt(max(eigenvalues(A^t.A)))
where A is the error ``(comp_cov - self.covariance_)``.
scaling : bool, default=True
If True (default), the squared error norm is divided by n_features.
If False, the squared error norm is not rescaled.
squared : bool, default=True
Whether to compute the squared error norm or the error norm.
If True (default), the squared error norm is returned.
If False, the error norm is returned.
Returns
-------
result : float
The Mean Squared Error (in the sense of the Frobenius norm) between
`self` and `comp_cov` covariance estimators.
"""
# compute the error
error = comp_cov - self.covariance_
# compute the error norm
if norm == "frobenius":
squared_norm = np.sum(error ** 2)
elif norm == "spectral":
squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error)))
else:
raise NotImplementedError(
"Only spectral and frobenius norms are implemented")
# optionally scale the error norm
if scaling:
squared_norm = squared_norm / error.shape[0]
# finally get either the squared norm or the norm
if squared:
result = squared_norm
else:
result = np.sqrt(squared_norm)
return result
def mahalanobis(self, X):
"""Computes the squared Mahalanobis distances of given observations.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The observations, the Mahalanobis distances of which we
compute. Observations are assumed to be drawn from the same
distribution as the data used in fit.
Returns
-------
dist : ndarray of shape (n_samples,)
Squared Mahalanobis distances of the observations.
"""
precision = self.get_precision()
# compute mahalanobis distances
dist = pairwise_distances(X, self.location_[np.newaxis, :],
metric='mahalanobis', VI=precision)
return np.reshape(dist, (len(X),)) ** 2
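
A small sketch tying this file together: ``score`` on held-out data is exactly
``log_likelihood`` of the test empirical covariance under the fitted precision,
and ``mahalanobis`` returns squared distances to the fitted location. The data
below is synthetic and for illustration only:

import numpy as np
from sklearn.covariance import (EmpiricalCovariance, empirical_covariance,
                                log_likelihood)

rng = np.random.RandomState(0)
X_train = rng.multivariate_normal([0, 0], [[.8, .3], [.3, .4]], size=500)
X_test = rng.multivariate_normal([0, 0], [[.8, .3], [.3, .4]], size=100)
cov = EmpiricalCovariance().fit(X_train)
# score() is the mean Gaussian log-likelihood of the test set under the model
test_cov = empirical_covariance(X_test - cov.location_, assume_centered=True)
assert np.isclose(cov.score(X_test),
                  log_likelihood(test_cov, cov.get_precision()))
# squared Mahalanobis distances of the first test points to the fitted location
print(cov.mahalanobis(X_test)[:3])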


@@ -0,0 +1,794 @@
"""GraphicalLasso: sparse inverse covariance estimation with an l1-penalized
estimator.
"""
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
# Copyright: INRIA
from collections.abc import Sequence
import warnings
import operator
import sys
import time
import numpy as np
from scipy import linalg
from joblib import Parallel, delayed
from . import empirical_covariance, EmpiricalCovariance, log_likelihood
from ..exceptions import ConvergenceWarning
from ..utils.validation import check_random_state
from ..utils.validation import _deprecate_positional_args
# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast'
from ..linear_model import _cd_fast as cd_fast # type: ignore
from ..linear_model import lars_path_gram
from ..model_selection import check_cv, cross_val_score
# Helper functions to compute the objective and dual objective functions
# of the l1-penalized estimator
def _objective(mle, precision_, alpha):
"""Evaluation of the graphical-lasso objective function
the objective function is made of a shifted scaled version of the
normalized log-likelihood (i.e. its empirical mean over the samples) and a
penalisation term to promote sparsity
"""
p = precision_.shape[0]
cost = - 2. * log_likelihood(mle, precision_) + p * np.log(2 * np.pi)
cost += alpha * (np.abs(precision_).sum()
- np.abs(np.diag(precision_)).sum())
return cost
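# Written out, the objective above is, up to an additive constant,
#     trace(S @ K) - logdet(K) + alpha * sum_{i != j} |K_ij|
# with S = mle and K = precision_: the negative log-likelihood plus an
# l1 penalty on the off-diagonal entries of the precision matrix.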
def _dual_gap(emp_cov, precision_, alpha):
"""Expression of the dual gap convergence criterion
The specific definition is given in Duchi "Projected Subgradient Methods
for Learning Sparse Gaussians".
"""
gap = np.sum(emp_cov * precision_)
gap -= precision_.shape[0]
gap += alpha * (np.abs(precision_).sum()
- np.abs(np.diag(precision_)).sum())
return gap
def alpha_max(emp_cov):
"""Find the maximum alpha for which there are some non-zeros off-diagonal.
Parameters
----------
emp_cov : ndarray of shape (n_features, n_features)
The sample covariance matrix.
Notes
-----
This results from the bound for all the Lasso problems that are solved
in GraphicalLasso: each time, the row of cov corresponds to Xy. As the
bound for alpha is given by `max(abs(Xy))`, the result follows.
"""
A = np.copy(emp_cov)
A.flat[::A.shape[0] + 1] = 0
return np.max(np.abs(A))
# The g-lasso algorithm
@_deprecate_positional_args
def graphical_lasso(emp_cov, alpha, *, cov_init=None, mode='cd', tol=1e-4,
enet_tol=1e-4, max_iter=100, verbose=False,
return_costs=False, eps=np.finfo(np.float64).eps,
return_n_iter=False):
"""l1-penalized covariance estimator
Read more in the :ref:`User Guide <sparse_inverse_covariance>`.
.. versionchanged:: v0.20
graph_lasso has been renamed to graphical_lasso
Parameters
----------
emp_cov : ndarray of shape (n_features, n_features)
Empirical covariance from which to compute the covariance estimate.
alpha : float
The regularization parameter: the higher alpha, the more
regularization, the sparser the inverse covariance.
Range is (0, inf].
cov_init : array of shape (n_features, n_features), default=None
The initial guess for the covariance.
mode : {'cd', 'lars'}, default='cd'
The Lasso solver to use: coordinate descent or LARS. Use LARS for
very sparse underlying graphs, where p > n. Elsewhere prefer cd
which is more numerically stable.
tol : float, default=1e-4
The tolerance to declare convergence: if the dual gap goes below
this value, iterations are stopped. Range is (0, inf].
enet_tol : float, default=1e-4
The tolerance for the elastic net solver used to calculate the descent
direction. This parameter controls the accuracy of the search direction
for a given column update, not of the overall parameter estimate. Only
used for mode='cd'. Range is (0, inf].
max_iter : int, default=100
The maximum number of iterations.
verbose : bool, default=False
If verbose is True, the objective function and dual gap are
printed at each iteration.
return_costs : bool, default=False
If return_costs is True, the objective function and dual gap
at each iteration are returned.
eps : float, default=eps
The machine-precision regularization in the computation of the
Cholesky diagonal factors. Increase this for very ill-conditioned
systems. Default is `np.finfo(np.float64).eps`.
return_n_iter : bool, default=False
Whether or not to return the number of iterations.
Returns
-------
covariance : ndarray of shape (n_features, n_features)
The estimated covariance matrix.
precision : ndarray of shape (n_features, n_features)
The estimated (sparse) precision matrix.
costs : list of (objective, dual_gap) pairs
The list of values of the objective function and the dual gap at
each iteration. Returned only if return_costs is True.
n_iter : int
Number of iterations. Returned only if `return_n_iter` is set to True.
See Also
--------
GraphicalLasso, GraphicalLassoCV
Notes
-----
The algorithm employed to solve this problem is the GLasso algorithm,
from the Friedman 2008 Biostatistics paper. It is the same algorithm
as in the R `glasso` package.
One possible difference with the `glasso` R package is that the
diagonal coefficients are not penalized.
"""
_, n_features = emp_cov.shape
if alpha == 0:
if return_costs:
precision_ = linalg.inv(emp_cov)
cost = - 2. * log_likelihood(emp_cov, precision_)
cost += n_features * np.log(2 * np.pi)
d_gap = np.sum(emp_cov * precision_) - n_features
if return_n_iter:
return emp_cov, precision_, (cost, d_gap), 0
else:
return emp_cov, precision_, (cost, d_gap)
else:
if return_n_iter:
return emp_cov, linalg.inv(emp_cov), 0
else:
return emp_cov, linalg.inv(emp_cov)
if cov_init is None:
covariance_ = emp_cov.copy()
else:
covariance_ = cov_init.copy()
# As a trivial regularization (Tikhonov like), we scale down the
# off-diagonal coefficients of our starting point: This is needed, as
# in the cross-validation the cov_init can easily be
# ill-conditioned, and the CV loop blows up. Besides, this takes a
# conservative standpoint on the initial conditions, and it tends to
# make the convergence go faster.
covariance_ *= 0.95
diagonal = emp_cov.flat[::n_features + 1]
covariance_.flat[::n_features + 1] = diagonal
precision_ = linalg.pinvh(covariance_)
indices = np.arange(n_features)
costs = list()
# The different l1 regression solvers have different numerical errors
if mode == 'cd':
errors = dict(over='raise', invalid='ignore')
else:
errors = dict(invalid='raise')
try:
# be robust to the max_iter=0 edge case, see:
# https://github.com/scikit-learn/scikit-learn/issues/4134
d_gap = np.inf
# set a sub_covariance buffer
sub_covariance = np.copy(covariance_[1:, 1:], order='C')
for i in range(max_iter):
for idx in range(n_features):
# To keep the contiguous matrix `sub_covariance` equal to
# covariance_[indices != idx].T[indices != idx]
# we only need to update 1 column and 1 line when idx changes
if idx > 0:
di = idx - 1
sub_covariance[di] = covariance_[di][indices != idx]
sub_covariance[:, di] = covariance_[:, di][indices != idx]
else:
sub_covariance[:] = covariance_[1:, 1:]
row = emp_cov[idx, indices != idx]
with np.errstate(**errors):
if mode == 'cd':
# Use coordinate descent
coefs = -(precision_[indices != idx, idx]
/ (precision_[idx, idx] + 1000 * eps))
coefs, _, _, _ = cd_fast.enet_coordinate_descent_gram(
coefs, alpha, 0, sub_covariance,
row, row, max_iter, enet_tol,
check_random_state(None), False)
else:
# Use LARS
_, _, coefs = lars_path_gram(
Xy=row, Gram=sub_covariance, n_samples=row.size,
alpha_min=alpha / (n_features - 1), copy_Gram=True,
eps=eps, method='lars', return_path=False)
# Update the precision matrix
precision_[idx, idx] = (
1. / (covariance_[idx, idx]
- np.dot(covariance_[indices != idx, idx], coefs)))
precision_[indices != idx, idx] = (- precision_[idx, idx]
* coefs)
precision_[idx, indices != idx] = (- precision_[idx, idx]
* coefs)
coefs = np.dot(sub_covariance, coefs)
covariance_[idx, indices != idx] = coefs
covariance_[indices != idx, idx] = coefs
if not np.isfinite(precision_.sum()):
raise FloatingPointError('The system is too ill-conditioned '
'for this solver')
d_gap = _dual_gap(emp_cov, precision_, alpha)
cost = _objective(emp_cov, precision_, alpha)
if verbose:
print('[graphical_lasso] Iteration '
'% 3i, cost % 3.2e, dual gap %.3e'
% (i, cost, d_gap))
if return_costs:
costs.append((cost, d_gap))
if np.abs(d_gap) < tol:
break
if not np.isfinite(cost) and i > 0:
raise FloatingPointError('Non SPD result: the system is '
'too ill-conditioned for this solver')
else:
warnings.warn('graphical_lasso: did not converge after '
'%i iterations: dual gap: %.3e'
% (max_iter, d_gap), ConvergenceWarning)
except FloatingPointError as e:
e.args = (e.args[0]
+ '. The system is too ill-conditioned for this solver',)
raise e
if return_costs:
if return_n_iter:
return covariance_, precision_, costs, i + 1
else:
return covariance_, precision_, costs
else:
if return_n_iter:
return covariance_, precision_, i + 1
else:
return covariance_, precision_
class GraphicalLasso(EmpiricalCovariance):
"""Sparse inverse covariance estimation with an l1-penalized estimator.
Read more in the :ref:`User Guide <sparse_inverse_covariance>`.
.. versionchanged:: v0.20
GraphLasso has been renamed to GraphicalLasso
Parameters
----------
alpha : float, default=0.01
The regularization parameter: the higher alpha, the more
regularization, the sparser the inverse covariance.
Range is (0, inf].
mode : {'cd', 'lars'}, default='cd'
The Lasso solver to use: coordinate descent or LARS. Use LARS for
very sparse underlying graphs, where p > n. Elsewhere prefer cd
which is more numerically stable.
tol : float, default=1e-4
The tolerance to declare convergence: if the dual gap goes below
this value, iterations are stopped. Range is (0, inf].
enet_tol : float, default=1e-4
The tolerance for the elastic net solver used to calculate the descent
direction. This parameter controls the accuracy of the search direction
for a given column update, not of the overall parameter estimate. Only
used for mode='cd'. Range is (0, inf].
max_iter : int, default=100
The maximum number of iterations.
verbose : bool, default=False
If verbose is True, the objective function and dual gap are
printed at each iteration.
assume_centered : bool, default=False
If True, data are not centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False, data are centered before computation.
Attributes
----------
location_ : ndarray of shape (n_features,)
Estimated location, i.e. the estimated mean.
covariance_ : ndarray of shape (n_features, n_features)
Estimated covariance matrix
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo inverse matrix.
n_iter_ : int
Number of iterations run.
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import GraphicalLasso
>>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
... [0.0, 0.4, 0.0, 0.0],
... [0.2, 0.0, 0.3, 0.1],
... [0.0, 0.0, 0.1, 0.7]])
>>> np.random.seed(0)
>>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],
... cov=true_cov,
... size=200)
>>> cov = GraphicalLasso().fit(X)
>>> np.around(cov.covariance_, decimals=3)
array([[0.816, 0.049, 0.218, 0.019],
[0.049, 0.364, 0.017, 0.034],
[0.218, 0.017, 0.322, 0.093],
[0.019, 0.034, 0.093, 0.69 ]])
>>> np.around(cov.location_, decimals=3)
array([0.073, 0.04 , 0.038, 0.143])
See Also
--------
graphical_lasso, GraphicalLassoCV
"""
@_deprecate_positional_args
def __init__(self, alpha=.01, *, mode='cd', tol=1e-4, enet_tol=1e-4,
max_iter=100, verbose=False, assume_centered=False):
super().__init__(assume_centered=assume_centered)
self.alpha = alpha
self.mode = mode
self.tol = tol
self.enet_tol = enet_tol
self.max_iter = max_iter
self.verbose = verbose
def fit(self, X, y=None):
"""Fits the GraphicalLasso model to X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data from which to compute the covariance estimate
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
"""
# Covariance does not make sense for a single feature
X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2,
estimator=self)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
self.location_ = X.mean(0)
emp_cov = empirical_covariance(
X, assume_centered=self.assume_centered)
self.covariance_, self.precision_, self.n_iter_ = graphical_lasso(
emp_cov, alpha=self.alpha, mode=self.mode, tol=self.tol,
enet_tol=self.enet_tol, max_iter=self.max_iter,
verbose=self.verbose, return_n_iter=True)
return self
# Cross-validation with GraphicalLasso
def graphical_lasso_path(X, alphas, cov_init=None, X_test=None, mode='cd',
tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False):
"""l1-penalized covariance estimator along a path of decreasing alphas
Read more in the :ref:`User Guide <sparse_inverse_covariance>`.
Parameters
----------
X : ndarray of shape (n_samples, n_features)
Data from which to compute the covariance estimate.
alphas : array-like of shape (n_alphas,)
The list of regularization parameters, decreasing order.
cov_init : array of shape (n_features, n_features), default=None
The initial guess for the covariance.
X_test : array of shape (n_test_samples, n_features), default=None
Optional test matrix to measure generalisation error.
mode : {'cd', 'lars'}, default='cd'
The Lasso solver to use: coordinate descent or LARS. Use LARS for
very sparse underlying graphs, where p > n. Elsewhere prefer cd
which is more numerically stable.
tol : float, default=1e-4
The tolerance to declare convergence: if the dual gap goes below
this value, iterations are stopped. The tolerance must be a positive
number.
enet_tol : float, default=1e-4
The tolerance for the elastic net solver used to calculate the descent
direction. This parameter controls the accuracy of the search direction
for a given column update, not of the overall parameter estimate. Only
used for mode='cd'. The tolerance must be a positive number.
max_iter : int, default=100
The maximum number of iterations. This parameter should be a strictly
positive integer.
verbose : int or bool, default=False
The higher the verbosity flag, the more information is printed
during the fitting.
Returns
-------
covariances_ : list of shape (n_alphas,) of ndarray of shape \
(n_features, n_features)
The estimated covariance matrices.
precisions_ : list of shape (n_alphas,) of ndarray of shape \
(n_features, n_features)
The estimated (sparse) precision matrices.
scores_ : list of shape (n_alphas,), dtype=float
The generalisation error (log-likelihood) on the test data.
Returned only if test data is passed.
"""
inner_verbose = max(0, verbose - 1)
emp_cov = empirical_covariance(X)
if cov_init is None:
covariance_ = emp_cov.copy()
else:
covariance_ = cov_init
covariances_ = list()
precisions_ = list()
scores_ = list()
if X_test is not None:
test_emp_cov = empirical_covariance(X_test)
for alpha in alphas:
try:
# Capture the errors, and move on
covariance_, precision_ = graphical_lasso(
emp_cov, alpha=alpha, cov_init=covariance_, mode=mode, tol=tol,
enet_tol=enet_tol, max_iter=max_iter, verbose=inner_verbose)
covariances_.append(covariance_)
precisions_.append(precision_)
if X_test is not None:
this_score = log_likelihood(test_emp_cov, precision_)
except FloatingPointError:
this_score = -np.inf
covariances_.append(np.nan)
precisions_.append(np.nan)
if X_test is not None:
if not np.isfinite(this_score):
this_score = -np.inf
scores_.append(this_score)
if verbose == 1:
sys.stderr.write('.')
elif verbose > 1:
if X_test is not None:
print('[graphical_lasso_path] alpha: %.2e, score: %.2e'
% (alpha, this_score))
else:
print('[graphical_lasso_path] alpha: %.2e' % alpha)
if X_test is not None:
return covariances_, precisions_, scores_
return covariances_, precisions_
class GraphicalLassoCV(GraphicalLasso):
"""Sparse inverse covariance w/ cross-validated choice of the l1 penalty.
See glossary entry for :term:`cross-validation estimator`.
Read more in the :ref:`User Guide <sparse_inverse_covariance>`.
.. versionchanged:: v0.20
GraphLassoCV has been renamed to GraphicalLassoCV
Parameters
----------
alphas : int or array-like of shape (n_alphas,), dtype=float, default=4
If an integer is given, it fixes the number of points on the
grids of alpha to be used. If a list is given, it gives the
grid to be used. See the notes in the class docstring for
more details. Range is (0, inf] when floats given.
n_refinements : int, default=4
The number of times the grid is refined. Not used if explicit
values of alphas are passed. Range is [1, inf).
cv : int, cross-validation generator or iterable, default=None
Determines the cross-validation splitting strategy.
Possible inputs for cv are:
- None, to use the default 5-fold cross-validation,
- integer, to specify the number of folds.
- :term:`CV splitter`,
- An iterable yielding (train, test) splits as arrays of indices.
For integer/None inputs :class:`KFold` is used.
Refer :ref:`User Guide <cross_validation>` for the various
cross-validation strategies that can be used here.
.. versionchanged:: 0.20
``cv`` default value if None changed from 3-fold to 5-fold.
tol : float, default=1e-4
The tolerance to declare convergence: if the dual gap goes below
this value, iterations are stopped. Range is (0, inf].
enet_tol : float, default=1e-4
The tolerance for the elastic net solver used to calculate the descent
direction. This parameter controls the accuracy of the search direction
for a given column update, not of the overall parameter estimate. Only
used for mode='cd'. Range is (0, inf].
max_iter : int, default=100
Maximum number of iterations.
mode : {'cd', 'lars'}, default='cd'
The Lasso solver to use: coordinate descent or LARS. Use LARS for
very sparse underlying graphs, where number of features is greater
than number of samples. Elsewhere prefer cd which is more numerically
stable.
n_jobs : int, default=None
number of jobs to run in parallel.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details.
.. versionchanged:: v0.20
`n_jobs` default changed from 1 to None
verbose : bool, default=False
If verbose is True, the objective function and duality gap are
printed at each iteration.
assume_centered : bool, default=False
If True, data are not centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False, data are centered before computation.
Attributes
----------
location_ : ndarray of shape (n_features,)
Estimated location, i.e. the estimated mean.
covariance_ : ndarray of shape (n_features, n_features)
Estimated covariance matrix.
precision_ : ndarray of shape (n_features, n_features)
Estimated precision matrix (inverse covariance).
alpha_ : float
Penalization parameter selected.
cv_alphas_ : list of shape (n_alphas,), dtype=float
All penalization parameters explored.
grid_scores_ : ndarray of shape (n_alphas, n_folds)
Log-likelihood score on left-out data across folds.
n_iter_ : int
Number of iterations run for the optimal alpha.
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import GraphicalLassoCV
>>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
... [0.0, 0.4, 0.0, 0.0],
... [0.2, 0.0, 0.3, 0.1],
... [0.0, 0.0, 0.1, 0.7]])
>>> np.random.seed(0)
>>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],
... cov=true_cov,
... size=200)
>>> cov = GraphicalLassoCV().fit(X)
>>> np.around(cov.covariance_, decimals=3)
array([[0.816, 0.051, 0.22 , 0.017],
[0.051, 0.364, 0.018, 0.036],
[0.22 , 0.018, 0.322, 0.094],
[0.017, 0.036, 0.094, 0.69 ]])
>>> np.around(cov.location_, decimals=3)
array([0.073, 0.04 , 0.038, 0.143])
See Also
--------
graphical_lasso, GraphicalLasso
Notes
-----
The search for the optimal penalization parameter (alpha) is done on an
iteratively refined grid: first the cross-validated scores on a grid are
computed, then a new refined grid is centered around the maximum, and so
on.
One of the challenges which is faced here is that the solvers can
fail to converge to a well-conditioned estimate. The corresponding
values of alpha then come out as missing values, but the optimum may
be close to these missing values.
"""
@_deprecate_positional_args
def __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=1e-4,
enet_tol=1e-4, max_iter=100, mode='cd', n_jobs=None,
verbose=False, assume_centered=False):
super().__init__(
mode=mode, tol=tol, verbose=verbose, enet_tol=enet_tol,
max_iter=max_iter, assume_centered=assume_centered)
self.alphas = alphas
self.n_refinements = n_refinements
self.cv = cv
self.n_jobs = n_jobs
def fit(self, X, y=None):
"""Fits the GraphicalLasso covariance model to X.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data from which to compute the covariance estimate
y : Ignored
Not used, present for API consistency by convention.
Returns
-------
self : object
"""
# Covariance does not make sense for a single feature
X = self._validate_data(X, ensure_min_features=2, estimator=self)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
self.location_ = X.mean(0)
emp_cov = empirical_covariance(
X, assume_centered=self.assume_centered)
cv = check_cv(self.cv, y, classifier=False)
# List of (alpha, scores, covs)
path = list()
n_alphas = self.alphas
inner_verbose = max(0, self.verbose - 1)
if isinstance(n_alphas, Sequence):
alphas = self.alphas
n_refinements = 1
else:
n_refinements = self.n_refinements
alpha_1 = alpha_max(emp_cov)
alpha_0 = 1e-2 * alpha_1
alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
n_alphas)[::-1]
t0 = time.time()
for i in range(n_refinements):
with warnings.catch_warnings():
# No need to see the convergence warnings on this grid:
# they will always be points that will not converge
# during the cross-validation
warnings.simplefilter('ignore', ConvergenceWarning)
# Compute the cross-validated loss on the current grid
# NOTE: Warm-restarting graphical_lasso_path has been tried,
# and this did not allow to gain anything
# (same execution time with or without).
this_path = Parallel(
n_jobs=self.n_jobs,
verbose=self.verbose
)(delayed(graphical_lasso_path)(X[train], alphas=alphas,
X_test=X[test], mode=self.mode,
tol=self.tol,
enet_tol=self.enet_tol,
max_iter=int(.1 *
self.max_iter),
verbose=inner_verbose)
for train, test in cv.split(X, y))
# Little dance to transform the list into what we need
covs, _, scores = zip(*this_path)
covs = zip(*covs)
scores = zip(*scores)
path.extend(zip(alphas, scores, covs))
path = sorted(path, key=operator.itemgetter(0), reverse=True)
# Find the maximum (avoid using built in 'max' function to
# have a fully-reproducible selection of the smallest alpha
# in case of equality)
best_score = -np.inf
last_finite_idx = 0
for index, (alpha, scores, _) in enumerate(path):
this_score = np.mean(scores)
if this_score >= .1 / np.finfo(np.float64).eps:
this_score = np.nan
if np.isfinite(this_score):
last_finite_idx = index
if this_score >= best_score:
best_score = this_score
best_index = index
# Refine the grid
if best_index == 0:
# We do not need to go back: we have chosen
# the highest value of alpha for which there are
# non-zero coefficients
alpha_1 = path[0][0]
alpha_0 = path[1][0]
elif (best_index == last_finite_idx
and not best_index == len(path) - 1):
# We have non-converged models on the upper bound of the
# grid, we need to refine the grid there
alpha_1 = path[best_index][0]
alpha_0 = path[best_index + 1][0]
elif best_index == len(path) - 1:
alpha_1 = path[best_index][0]
alpha_0 = 0.01 * path[best_index][0]
else:
alpha_1 = path[best_index - 1][0]
alpha_0 = path[best_index + 1][0]
if not isinstance(n_alphas, Sequence):
alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
n_alphas + 2)
alphas = alphas[1:-1]
if self.verbose and n_refinements > 1:
print('[GraphicalLassoCV] Done refinement % 2i out of'
' %i: % 3is' % (i + 1, n_refinements, time.time() - t0))
path = list(zip(*path))
grid_scores = list(path[1])
alphas = list(path[0])
# Finally, compute the score with alpha = 0
alphas.append(0)
grid_scores.append(cross_val_score(EmpiricalCovariance(), X,
cv=cv, n_jobs=self.n_jobs,
verbose=inner_verbose))
self.grid_scores_ = np.array(grid_scores)
best_alpha = alphas[best_index]
self.alpha_ = best_alpha
self.cv_alphas_ = alphas
# Finally fit the model with the selected alpha
self.covariance_, self.precision_, self.n_iter_ = graphical_lasso(
emp_cov, alpha=best_alpha, mode=self.mode, tol=self.tol,
enet_tol=self.enet_tol, max_iter=self.max_iter,
verbose=inner_verbose, return_n_iter=True)
return self
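
A short usage sketch for this file: fit the plain estimator at a fixed
``alpha``, or let the cross-validated variant select ``alpha`` on its
iteratively refined grid. The data and the alpha/cv values below are
illustrative only, mirroring the docstring examples above:

import numpy as np
from sklearn.covariance import GraphicalLasso, GraphicalLassoCV

true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
                     [0.0, 0.4, 0.0, 0.0],
                     [0.2, 0.0, 0.3, 0.1],
                     [0.0, 0.0, 0.1, 0.7]])
rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=np.zeros(4), cov=true_cov, size=200)
# Fixed penalty: a larger alpha yields a sparser precision_
gl = GraphicalLasso(alpha=0.05).fit(X)
print(np.sum(np.abs(gl.precision_) > 1e-8))
# Cross-validated penalty selection over an iteratively refined grid
glcv = GraphicalLassoCV(cv=3).fit(X)
print(glcv.alpha_, glcv.cv_alphas_[:3])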


@@ -0,0 +1,762 @@
"""
Robust location and covariance estimators.
Here are implemented estimators that are resistant to outliers.
"""
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import warnings
import numbers
import numpy as np
from scipy import linalg
from scipy.stats import chi2
from . import empirical_covariance, EmpiricalCovariance
from ..utils.extmath import fast_logdet
from ..utils import check_random_state, check_array
from ..utils.validation import _deprecate_positional_args
# Minimum Covariance Determinant
# Implementation of an algorithm by Rousseeuw & Van Driessen described in
# (A Fast Algorithm for the Minimum Covariance Determinant Estimator,
# 1999, American Statistical Association and the American Society
# for Quality, TECHNOMETRICS)
# XXX Is this really a public function? It's not listed in the docs or
# exported by sklearn.covariance. Deprecate?
def c_step(X, n_support, remaining_iterations=30, initial_estimates=None,
verbose=False, cov_computation_method=empirical_covariance,
random_state=None):
"""C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data set in which we look for the n_support observations whose
scatter matrix has minimum determinant.
n_support : int,
Number of observations to compute the robust estimates of location
and covariance from. This parameter must be greater than
`n_samples / 2`.
remaining_iterations : int, default=30
Number of iterations to perform.
According to [Rouseeuw1999]_, two iterations are sufficient to get
close to the minimum, and we never need more than 30 to reach
convergence.
initial_estimates : tuple of shape (2,), default=None
Initial estimates of location and shape from which to run the c_step
procedure:
- initial_estimates[0]: an initial location estimate
- initial_estimates[1]: an initial covariance estimate
verbose : bool, default=False
Verbose mode.
cov_computation_method : callable, \
default=:func:`sklearn.covariance.empirical_covariance`
The function which will be used to compute the covariance.
Must return array of shape (n_features, n_features).
random_state : int or RandomState instance, default=None
Determines the pseudo random number generator for shuffling the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Returns
-------
location : ndarray of shape (n_features,)
Robust location estimates.
covariance : ndarray of shape (n_features, n_features)
Robust covariance estimates.
support : ndarray of shape (n_samples,)
A mask for the `n_support` observations whose scatter matrix has
minimum determinant.
References
----------
.. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant
Estimator, 1999, American Statistical Association and the American
Society for Quality, TECHNOMETRICS
"""
X = np.asarray(X)
random_state = check_random_state(random_state)
return _c_step(X, n_support, remaining_iterations=remaining_iterations,
initial_estimates=initial_estimates, verbose=verbose,
cov_computation_method=cov_computation_method,
random_state=random_state)
def _c_step(X, n_support, random_state, remaining_iterations=30,
initial_estimates=None, verbose=False,
cov_computation_method=empirical_covariance):
n_samples, n_features = X.shape
dist = np.inf
# Initialisation
support = np.zeros(n_samples, dtype=bool)
if initial_estimates is None:
# compute initial robust estimates from a random subset
support[random_state.permutation(n_samples)[:n_support]] = True
else:
# get initial robust estimates from the function parameters
location = initial_estimates[0]
covariance = initial_estimates[1]
# run a special iteration for that case (to get an initial support)
precision = linalg.pinvh(covariance)
X_centered = X - location
dist = (np.dot(X_centered, precision) * X_centered).sum(1)
# compute new estimates
support[np.argsort(dist)[:n_support]] = True
X_support = X[support]
location = X_support.mean(0)
covariance = cov_computation_method(X_support)
# Iterative procedure for Minimum Covariance Determinant computation
det = fast_logdet(covariance)
# If the data already has singular covariance, calculate the precision,
# as the loop below will not be entered.
if np.isinf(det):
precision = linalg.pinvh(covariance)
previous_det = np.inf
while (det < previous_det and remaining_iterations > 0
and not np.isinf(det)):
# save old estimates values
previous_location = location
previous_covariance = covariance
previous_det = det
previous_support = support
# compute a new support from the full data set mahalanobis distances
precision = linalg.pinvh(covariance)
X_centered = X - location
dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)
# compute new estimates
support = np.zeros(n_samples, dtype=bool)
support[np.argsort(dist)[:n_support]] = True
X_support = X[support]
location = X_support.mean(axis=0)
covariance = cov_computation_method(X_support)
det = fast_logdet(covariance)
# update remaining iterations for early stopping
remaining_iterations -= 1
previous_dist = dist
dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)
# Check if best fit already found (det => 0, logdet => -inf)
if np.isinf(det):
results = location, covariance, det, support, dist
# Check convergence
if np.allclose(det, previous_det):
# c_step procedure converged
if verbose:
print("Optimal couple (location, covariance) found before"
" ending iterations (%d left)" % (remaining_iterations))
results = location, covariance, det, support, dist
elif det > previous_det:
# determinant has increased (should not happen)
warnings.warn("Determinant has increased; this should not happen: "
"log(det) > log(previous_det) (%.15f > %.15f). "
"You may want to try with a higher value of "
"support_fraction (current value: %.3f)."
% (det, previous_det, n_support / n_samples),
RuntimeWarning)
results = previous_location, previous_covariance, \
previous_det, previous_support, previous_dist
# Check early stopping
if remaining_iterations == 0:
if verbose:
print('Maximum number of iterations reached')
results = location, covariance, det, support, dist
return results
def select_candidates(X, n_support, n_trials, select=1, n_iter=30,
verbose=False,
cov_computation_method=empirical_covariance,
random_state=None):
"""Finds the best pure subset of observations to compute MCD from it.
The purpose of this function is to find the best sets of n_support
observations with respect to a minimization of their covariance
matrix determinant. Equivalently, it removes n_samples-n_support
observations to construct what we call a pure data set (i.e. not
containing outliers). The list of the observations of the pure
data set is referred to as the `support`.
Starting from a random support, the pure data set is found by the
c_step procedure introduced by Rousseeuw and Van Driessen in
[RV]_.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data (sub)set in which we look for the n_support purest observations.
n_support : int
The number of samples the pure data set must contain.
This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.
n_trials : int or tuple of shape (2,)
Number of different initial sets of observations from which to
run the algorithm. This parameter should be a strictly positive
integer.
Instead of giving a number of trials to perform, one can provide a
list of initial estimates that will be used to iteratively run
c_step procedures. In this case:
- n_trials[0]: array-like, shape (n_trials, n_features)
is the list of `n_trials` initial location estimates
- n_trials[1]: array-like, shape (n_trials, n_features, n_features)
is the list of `n_trials` initial covariances estimates
select : int, default=1
Number of best candidates results to return. This parameter must be
a strictly positive integer.
n_iter : int, default=30
Maximum number of iterations for the c_step procedure.
(2 is enough to be close to the final solution. "Never" exceeds 20).
This parameter must be a strictly positive integer.
verbose : bool, default=False
Control the output verbosity.
cov_computation_method : callable, \
default=:func:`sklearn.covariance.empirical_covariance`
The function which will be used to compute the covariance.
Must return an array of shape (n_features, n_features).
random_state : int or RandomState instance, default=None
Determines the pseudo random number generator for shuffling the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
See Also
--------
c_step
Returns
-------
best_locations : ndarray of shape (select, n_features)
The `select` location estimates computed from the `select` best
supports found in the data set (`X`).
best_covariances : ndarray of shape (select, n_features, n_features)
The `select` covariance estimates computed from the `select`
best supports found in the data set (`X`).
best_supports : ndarray of shape (select, n_samples)
The `select` best supports found in the data set (`X`).
References
----------
.. [RV] A Fast Algorithm for the Minimum Covariance Determinant
Estimator, 1999, American Statistical Association and the American
Society for Quality, TECHNOMETRICS
"""
random_state = check_random_state(random_state)
if isinstance(n_trials, numbers.Integral):
run_from_estimates = False
elif isinstance(n_trials, tuple):
run_from_estimates = True
estimates_list = n_trials
n_trials = estimates_list[0].shape[0]
else:
raise TypeError("Invalid 'n_trials' parameter, expected tuple or "
" integer, got %s (%s)" % (n_trials, type(n_trials)))
# compute `n_trials` location and shape estimates candidates in the subset
all_estimates = []
if not run_from_estimates:
# perform `n_trials` computations from random initial supports
for j in range(n_trials):
all_estimates.append(
_c_step(
X, n_support, remaining_iterations=n_iter, verbose=verbose,
cov_computation_method=cov_computation_method,
random_state=random_state))
else:
# perform computations from every given initial estimates
for j in range(n_trials):
initial_estimates = (estimates_list[0][j], estimates_list[1][j])
all_estimates.append(_c_step(
X, n_support, remaining_iterations=n_iter,
initial_estimates=initial_estimates, verbose=verbose,
cov_computation_method=cov_computation_method,
random_state=random_state))
all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub = \
zip(*all_estimates)
# find the `n_best` best results among the `n_trials` ones
index_best = np.argsort(all_dets_sub)[:select]
best_locations = np.asarray(all_locs_sub)[index_best]
best_covariances = np.asarray(all_covs_sub)[index_best]
best_supports = np.asarray(all_supports_sub)[index_best]
best_ds = np.asarray(all_ds_sub)[index_best]
return best_locations, best_covariances, best_supports, best_ds
def fast_mcd(X, support_fraction=None,
cov_computation_method=empirical_covariance,
random_state=None):
"""Estimates the Minimum Covariance Determinant matrix.
Read more in the :ref:`User Guide <robust_covariance>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
The data matrix, with p features and n samples.
support_fraction : float, default=None
The proportion of points to be included in the support of the raw
MCD estimate. Default is `None`, which implies that the minimum
value of `support_fraction` will be used within the algorithm:
`(n_samples + n_features + 1) / 2`. This parameter must be in the
range (0, 1).
cov_computation_method : callable, \
default=:func:`sklearn.covariance.empirical_covariance`
The function which will be used to compute the covariance.
Must return an array of shape (n_features, n_features).
random_state : int or RandomState instance, default=None
Determines the pseudo random number generator for shuffling the data.
Pass an int for reproducible results across multiple function calls.
See :term:`Glossary <random_state>`.
Returns
-------
location : ndarray of shape (n_features,)
Robust location of the data.
covariance : ndarray of shape (n_features, n_features)
Robust covariance of the features.
support : ndarray of shape (n_samples,), dtype=bool
A mask of the observations that have been used to compute
the robust location and covariance estimates of the data set.
Notes
-----
The FastMCD algorithm has been introduced by Rousseeuw and Van Driessen
in "A Fast Algorithm for the Minimum Covariance Determinant Estimator,
1999, American Statistical Association and the American Society
for Quality, TECHNOMETRICS".
The principle is to compute robust estimates on random subsets before
pooling them into larger subsets, and finally into the full data set.
Depending on the size of the initial sample, we have one, two or three
such computation levels.
Note that only raw estimates are returned. If one is interested in
the correction and reweighting steps described in [RouseeuwVan]_,
see the MinCovDet object.
References
----------
.. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance
Determinant Estimator, 1999, American Statistical Association
and the American Society for Quality, TECHNOMETRICS
.. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,
Asymptotics For The Minimum Covariance Determinant Estimator,
The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
"""
random_state = check_random_state(random_state)
X = check_array(X, ensure_min_samples=2, estimator='fast_mcd')
n_samples, n_features = X.shape
# minimum breakdown value
if support_fraction is None:
n_support = int(np.ceil(0.5 * (n_samples + n_features + 1)))
else:
n_support = int(support_fraction * n_samples)
# 1-dimensional case quick computation
# (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust
# Regression and Outlier Detection, John Wiley & Sons, chapter 4)
if n_features == 1:
if n_support < n_samples:
# find the sample shortest halves
X_sorted = np.sort(np.ravel(X))
diff = X_sorted[n_support:] - X_sorted[:(n_samples - n_support)]
halves_start = np.where(diff == np.min(diff))[0]
# take the middle points' mean to get the robust location estimate
location = 0.5 * (X_sorted[n_support + halves_start] +
X_sorted[halves_start]).mean()
support = np.zeros(n_samples, dtype=bool)
X_centered = X - location
support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True
covariance = np.asarray([[np.var(X[support])]])
location = np.array([location])
# get precision matrix in an optimized way
precision = linalg.pinvh(covariance)
dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
else:
support = np.ones(n_samples, dtype=bool)
covariance = np.asarray([[np.var(X)]])
location = np.asarray([np.mean(X)])
X_centered = X - location
# get precision matrix in an optimized way
precision = linalg.pinvh(covariance)
dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
# Starting FastMCD algorithm for p-dimensional case
if (n_samples > 500) and (n_features > 1):
# 1. Find candidate supports on subsets
# a. split the set in subsets of size ~ 300
n_subsets = n_samples // 300
n_samples_subsets = n_samples // n_subsets
samples_shuffle = random_state.permutation(n_samples)
h_subset = int(np.ceil(n_samples_subsets *
(n_support / float(n_samples))))
# b. perform a total of 500 trials
n_trials_tot = 500
# c. select 10 best (location, covariance) for each subset
n_best_sub = 10
n_trials = max(10, n_trials_tot // n_subsets)
n_best_tot = n_subsets * n_best_sub
all_best_locations = np.zeros((n_best_tot, n_features))
try:
all_best_covariances = np.zeros((n_best_tot, n_features,
n_features))
except MemoryError:
# The above is too big. Let's try with something much smaller
# (and less optimal)
n_best_tot = 10
all_best_covariances = np.zeros((n_best_tot, n_features,
n_features))
n_best_sub = 2
for i in range(n_subsets):
low_bound = i * n_samples_subsets
high_bound = low_bound + n_samples_subsets
current_subset = X[samples_shuffle[low_bound:high_bound]]
best_locations_sub, best_covariances_sub, _, _ = select_candidates(
current_subset, h_subset, n_trials,
select=n_best_sub, n_iter=2,
cov_computation_method=cov_computation_method,
random_state=random_state)
subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub)
all_best_locations[subset_slice] = best_locations_sub
all_best_covariances[subset_slice] = best_covariances_sub
# 2. Pool the candidate supports into a merged set
# (possibly the full dataset)
n_samples_merged = min(1500, n_samples)
h_merged = int(np.ceil(n_samples_merged *
(n_support / float(n_samples))))
if n_samples > 1500:
n_best_merged = 10
else:
n_best_merged = 1
# find the best couples (location, covariance) on the merged set
selection = random_state.permutation(n_samples)[:n_samples_merged]
locations_merged, covariances_merged, supports_merged, d = \
select_candidates(
X[selection], h_merged,
n_trials=(all_best_locations, all_best_covariances),
select=n_best_merged,
cov_computation_method=cov_computation_method,
random_state=random_state)
# 3. Finally get the overall best (locations, covariance) couple
if n_samples < 1500:
# directly get the best couple (location, covariance)
location = locations_merged[0]
covariance = covariances_merged[0]
support = np.zeros(n_samples, dtype=bool)
dist = np.zeros(n_samples)
support[selection] = supports_merged[0]
dist[selection] = d[0]
else:
# select the best couple on the full dataset
locations_full, covariances_full, supports_full, d = \
select_candidates(
X, n_support,
n_trials=(locations_merged, covariances_merged),
select=1,
cov_computation_method=cov_computation_method,
random_state=random_state)
location = locations_full[0]
covariance = covariances_full[0]
support = supports_full[0]
dist = d[0]
elif n_features > 1:
# 1. Find the 10 best couples (location, covariance)
# considering two iterations
n_trials = 30
n_best = 10
locations_best, covariances_best, _, _ = select_candidates(
X, n_support, n_trials=n_trials, select=n_best, n_iter=2,
cov_computation_method=cov_computation_method,
random_state=random_state)
# 2. Select the best couple on the full dataset amongst the 10
locations_full, covariances_full, supports_full, d = select_candidates(
X, n_support, n_trials=(locations_best, covariances_best),
select=1, cov_computation_method=cov_computation_method,
random_state=random_state)
location = locations_full[0]
covariance = covariances_full[0]
support = supports_full[0]
dist = d[0]
return location, covariance, support, dist
class MinCovDet(EmpiricalCovariance):
"""Minimum Covariance Determinant (MCD): robust estimator of covariance.
The Minimum Covariance Determinant covariance estimator is to be applied
on Gaussian-distributed data, but could still be relevant on data
drawn from a unimodal, symmetric distribution. It is not meant to be used
with multi-modal data (the algorithm used to fit a MinCovDet object is
likely to fail in such a case).
One should consider projection pursuit methods to deal with multi-modal
datasets.
Read more in the :ref:`User Guide <robust_covariance>`.
Parameters
----------
store_precision : bool, default=True
Specify if the estimated precision is stored.
assume_centered : bool, default=False
If True, the support of the robust location and the covariance
estimates is computed, and a covariance estimate is recomputed from
it, without centering the data.
Useful to work with data whose mean is close to, but not exactly, zero.
If False, the robust location and covariance are directly computed
with the FastMCD algorithm without additional treatment.
support_fraction : float, default=None
The proportion of points to be included in the support of the raw
MCD estimate. Default is None, which implies that the minimum
value of support_fraction will be used within the algorithm:
`(n_samples + n_features + 1) / 2`. The parameter must be in the range
(0, 1).
random_state : int or RandomState instance, default=None
Determines the pseudo random number generator for shuffling the data.
Pass an int for reproducible results across multiple function calls.
See :term: `Glossary <random_state>`.
Attributes
----------
raw_location_ : ndarray of shape (n_features,)
The raw robust estimated location before correction and re-weighting.
raw_covariance_ : ndarray of shape (n_features, n_features)
The raw robust estimated covariance before correction and re-weighting.
raw_support_ : ndarray of shape (n_samples,)
A mask of the observations that have been used to compute
the raw robust estimates of location and shape, before correction
and re-weighting.
location_ : ndarray of shape (n_features,)
Estimated robust location.
covariance_ : ndarray of shape (n_features, n_features)
Estimated robust covariance matrix.
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo inverse matrix.
(stored only if store_precision is True)
support_ : ndarray of shape (n_samples,)
A mask of the observations that have been used to compute
the robust estimates of location and shape.
dist_ : ndarray of shape (n_samples,)
Mahalanobis distances of the training set (on which :meth:`fit` is
called) observations.
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import MinCovDet
>>> from sklearn.datasets import make_gaussian_quantiles
>>> real_cov = np.array([[.8, .3],
... [.3, .4]])
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=real_cov,
... size=500)
>>> cov = MinCovDet(random_state=0).fit(X)
>>> cov.covariance_
array([[0.7411..., 0.2535...],
[0.2535..., 0.3053...]])
>>> cov.location_
array([0.0813... , 0.0427...])
References
----------
.. [Rousseeuw1984] P. J. Rousseeuw. Least median of squares regression.
J. Am Stat Ass, 79:871, 1984.
.. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant
Estimator, 1999, American Statistical Association and the American
Society for Quality, TECHNOMETRICS
.. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,
Asymptotics For The Minimum Covariance Determinant Estimator,
The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
"""
_nonrobust_covariance = staticmethod(empirical_covariance)
@_deprecate_positional_args
def __init__(self, *, store_precision=True, assume_centered=False,
support_fraction=None, random_state=None):
self.store_precision = store_precision
self.assume_centered = assume_centered
self.support_fraction = support_fraction
self.random_state = random_state
def fit(self, X, y=None):
"""Fits a Minimum Covariance Determinant with the FastMCD algorithm.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples
and `n_features` is the number of features.
y : Ignored
Not used, present for API consistency.
Returns
-------
self : object
"""
X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet')
random_state = check_random_state(self.random_state)
n_samples, n_features = X.shape
# check that the empirical covariance is full rank
if (linalg.svdvals(np.dot(X.T, X)) > 1e-8).sum() != n_features:
warnings.warn("The covariance matrix associated to your dataset "
"is not full rank")
# compute and store raw estimates
raw_location, raw_covariance, raw_support, raw_dist = fast_mcd(
X, support_fraction=self.support_fraction,
cov_computation_method=self._nonrobust_covariance,
random_state=random_state)
if self.assume_centered:
raw_location = np.zeros(n_features)
raw_covariance = self._nonrobust_covariance(X[raw_support],
assume_centered=True)
# get precision matrix in an optimized way
precision = linalg.pinvh(raw_covariance)
raw_dist = np.sum(np.dot(X, precision) * X, 1)
self.raw_location_ = raw_location
self.raw_covariance_ = raw_covariance
self.raw_support_ = raw_support
self.location_ = raw_location
self.support_ = raw_support
self.dist_ = raw_dist
# obtain consistency at normal models
self.correct_covariance(X)
# re-weight estimator
self.reweight_covariance(X)
return self
def correct_covariance(self, data):
"""Apply a correction to raw Minimum Covariance Determinant estimates.
Correction using the empirical correction factor suggested
by Rousseeuw and Van Driessen in [RVD]_.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The data matrix, with p features and n samples.
The data set must be the one which was used to compute
the raw estimates.
Returns
-------
covariance_corrected : ndarray of shape (n_features, n_features)
Corrected robust covariance estimate.
References
----------
.. [RVD] A Fast Algorithm for the Minimum Covariance
Determinant Estimator, 1999, American Statistical Association
and the American Society for Quality, TECHNOMETRICS
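Examples
--------
A minimal usage sketch on synthetic data (illustrative only; the toy
covariance and seed are arbitrary choices):
>>> import numpy as np
>>> from sklearn.covariance import MinCovDet
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=[[.8, .3], [.3, .4]],
... size=500)
>>> cov = MinCovDet(random_state=0).fit(X)
>>> cov.correct_covariance(X).shape
(2, 2)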
"""
# Check that the covariance of the support data is not equal to 0.
# Otherwise self.dist_ = 0 and thus correction = 0.
n_samples = len(self.dist_)
n_support = np.sum(self.support_)
if n_support < n_samples and np.allclose(self.raw_covariance_, 0):
raise ValueError('The covariance matrix of the support data '
'is equal to 0, try to increase support_fraction')
correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)
covariance_corrected = self.raw_covariance_ * correction
self.dist_ /= correction
return covariance_corrected
def reweight_covariance(self, data):
"""Re-weight raw Minimum Covariance Determinant estimates.
Re-weight observations using Rousseeuw's method (equivalent to
deleting outlying observations from the data set before
computing location and covariance estimates) described
in [RVDriessen]_.
Parameters
----------
data : array-like of shape (n_samples, n_features)
The data matrix, with p features and n samples.
The data set must be the one which was used to compute
the raw estimates.
Returns
-------
location_reweighted : ndarray of shape (n_features,)
Re-weighted robust location estimate.
covariance_reweighted : ndarray of shape (n_features, n_features)
Re-weighted robust covariance estimate.
support_reweighted : ndarray of shape (n_samples,), dtype=bool
A mask of the observations that have been used to compute
the re-weighted robust location and covariance estimates.
References
----------
.. [RVDriessen] A Fast Algorithm for the Minimum Covariance
Determinant Estimator, 1999, American Statistical Association
and the American Society for Quality, TECHNOMETRICS
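Examples
--------
A minimal usage sketch on synthetic data (illustrative only; the toy
covariance and seed are arbitrary choices):
>>> import numpy as np
>>> from sklearn.covariance import MinCovDet
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=[[.8, .3], [.3, .4]],
... size=500)
>>> cov = MinCovDet(random_state=0).fit(X)
>>> location, covariance, support = cov.reweight_covariance(X)
>>> location.shape, covariance.shape
((2,), (2, 2))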
"""
n_samples, n_features = data.shape
mask = self.dist_ < chi2(n_features).isf(0.025)
if self.assume_centered:
location_reweighted = np.zeros(n_features)
else:
location_reweighted = data[mask].mean(0)
covariance_reweighted = self._nonrobust_covariance(
data[mask], assume_centered=self.assume_centered)
support_reweighted = np.zeros(n_samples, dtype=bool)
support_reweighted[mask] = True
self._set_covariance(covariance_reweighted)
self.location_ = location_reweighted
self.support_ = support_reweighted
X_centered = data - self.location_
self.dist_ = np.sum(
np.dot(X_centered, self.get_precision()) * X_centered, 1)
return location_reweighted, covariance_reweighted, support_reweighted


@ -0,0 +1,605 @@
"""
Covariance estimators using shrinkage.
Shrinkage corresponds to regularising `cov` using a convex combination:
shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate.
"""
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
# avoid division truncation
import warnings
import numpy as np
from . import empirical_covariance, EmpiricalCovariance
from ..utils import check_array
from ..utils.validation import _deprecate_positional_args
# ShrunkCovariance estimator
def shrunk_covariance(emp_cov, shrinkage=0.1):
"""Calculates a covariance matrix shrunk on the diagonal
Read more in the :ref:`User Guide <shrunk_covariance>`.
Parameters
----------
emp_cov : array-like of shape (n_features, n_features)
Covariance matrix to be shrunk
shrinkage : float, default=0.1
Coefficient in the convex combination used for the computation
of the shrunk estimate. Range is [0, 1].
Returns
-------
shrunk_cov : ndarray of shape (n_features, n_features)
Shrunk covariance.
Notes
-----
The regularized (shrunk) covariance is given by:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
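Examples
--------
A small hand-checked sketch of the formula above (for this matrix,
mu = trace(cov) / n_features = 1, so shrinkage * mu = 0.5 is added back
to the diagonal):
>>> import numpy as np
>>> from sklearn.covariance import shrunk_covariance
>>> shrunk_covariance(np.array([[2., 0.], [0., 0.]]), shrinkage=0.5)
array([[1.5, 0. ],
[0. , 0.5]])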
"""
emp_cov = check_array(emp_cov)
n_features = emp_cov.shape[0]
mu = np.trace(emp_cov) / n_features
shrunk_cov = (1. - shrinkage) * emp_cov
shrunk_cov.flat[::n_features + 1] += shrinkage * mu
return shrunk_cov
class ShrunkCovariance(EmpiricalCovariance):
"""Covariance estimator with shrinkage
Read more in the :ref:`User Guide <shrunk_covariance>`.
Parameters
----------
store_precision : bool, default=True
Specify if the estimated precision is stored
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False, data will be centered before computation.
shrinkage : float, default=0.1
Coefficient in the convex combination used for the computation
of the shrunk estimate. Range is [0, 1].
Attributes
----------
covariance_ : ndarray of shape (n_features, n_features)
Estimated covariance matrix
location_ : ndarray of shape (n_features,)
Estimated location, i.e. the estimated mean.
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo inverse matrix.
(stored only if store_precision is True)
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import ShrunkCovariance
>>> from sklearn.datasets import make_gaussian_quantiles
>>> real_cov = np.array([[.8, .3],
... [.3, .4]])
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=real_cov,
... size=500)
>>> cov = ShrunkCovariance().fit(X)
>>> cov.covariance_
array([[0.7387..., 0.2536...],
[0.2536..., 0.4110...]])
>>> cov.location_
array([0.0622..., 0.0193...])
Notes
-----
The regularized covariance is given by:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
"""
@_deprecate_positional_args
def __init__(self, *, store_precision=True, assume_centered=False,
shrinkage=0.1):
super().__init__(store_precision=store_precision,
assume_centered=assume_centered)
self.shrinkage = shrinkage
def fit(self, X, y=None):
"""Fit the shrunk covariance model according to the given training data
and parameters.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data, where n_samples is the number of samples
and n_features is the number of features.
y : Ignored
Not used, present for API consistency.
Returns
-------
self : object
"""
X = self._validate_data(X)
# Not calling the parent object to fit, to avoid a potential
# matrix inversion when setting the precision
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
self.location_ = X.mean(0)
covariance = empirical_covariance(
X, assume_centered=self.assume_centered)
covariance = shrunk_covariance(covariance, self.shrinkage)
self._set_covariance(covariance)
return self
# Ledoit-Wolf estimator
def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):
"""Estimates the shrinkage coefficient of the Ledoit-Wolf shrunk covariance.
Read more in the :ref:`User Guide <shrunk_covariance>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data from which to compute the Ledoit-Wolf shrinkage coefficient.
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful to work with data whose mean is close to, but not exactly, zero.
If False, data will be centered before computation.
block_size : int, default=1000
Size of the blocks into which the covariance matrix will be split.
Returns
-------
shrinkage : float
Coefficient in the convex combination used for the computation
of the shrunk estimate.
Notes
-----
The regularized (shrunk) covariance is:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
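Examples
--------
A minimal sketch on synthetic data (illustrative only; the exact
coefficient depends on the sample):
>>> import numpy as np
>>> from sklearn.covariance import ledoit_wolf_shrinkage
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=[[.4, .2], [.2, .8]],
... size=50)
>>> shrinkage = ledoit_wolf_shrinkage(X)
>>> bool(0 <= shrinkage <= 1)
True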
"""
X = np.asarray(X)
# for only one feature, the result is the same whatever the shrinkage
if len(X.shape) == 2 and X.shape[1] == 1:
return 0.
if X.ndim == 1:
X = np.reshape(X, (1, -1))
if X.shape[0] == 1:
warnings.warn("Only one sample available. "
"You may want to reshape your data array")
n_samples, n_features = X.shape
# optionally center data
if not assume_centered:
X = X - X.mean(0)
# A non-blocked version of the computation is present in the tests
# in tests/test_covariance.py
# number of blocks to split the covariance matrix into
n_splits = int(n_features / block_size)
X2 = X ** 2
emp_cov_trace = np.sum(X2, axis=0) / n_samples
mu = np.sum(emp_cov_trace) / n_features
beta_ = 0. # sum of the coefficients of <X2.T, X2>
delta_ = 0. # sum of the *squared* coefficients of <X.T, X>
# starting block computation
for i in range(n_splits):
for j in range(n_splits):
rows = slice(block_size * i, block_size * (i + 1))
cols = slice(block_size * j, block_size * (j + 1))
beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols]))
delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2)
rows = slice(block_size * i, block_size * (i + 1))
beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits:]))
delta_ += np.sum(
np.dot(X.T[rows], X[:, block_size * n_splits:]) ** 2)
for j in range(n_splits):
cols = slice(block_size * j, block_size * (j + 1))
beta_ += np.sum(np.dot(X2.T[block_size * n_splits:], X2[:, cols]))
delta_ += np.sum(
np.dot(X.T[block_size * n_splits:], X[:, cols]) ** 2)
delta_ += np.sum(np.dot(X.T[block_size * n_splits:],
X[:, block_size * n_splits:]) ** 2)
delta_ /= n_samples ** 2
beta_ += np.sum(np.dot(X2.T[block_size * n_splits:],
X2[:, block_size * n_splits:]))
# use delta_ to compute beta
beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_)
# delta is the sum of the squared coefficients of (<X.T,X> - mu*Id) / p
delta = delta_ - 2. * mu * emp_cov_trace.sum() + n_features * mu ** 2
delta /= n_features
# get final beta as the min between beta and delta
# We do this to prevent shrinking more than "1", which would invert
# the value of covariances
beta = min(beta, delta)
# finally get shrinkage
shrinkage = 0 if beta == 0 else beta / delta
return shrinkage
@_deprecate_positional_args
def ledoit_wolf(X, *, assume_centered=False, block_size=1000):
"""Estimates the shrunk Ledoit-Wolf covariance matrix.
Read more in the :ref:`User Guide <shrunk_covariance>`.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data from which to compute the covariance estimate
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful to work with data whose mean is close to, but not exactly, zero.
If False, data will be centered before computation.
block_size : int, default=1000
Size of the blocks into which the covariance matrix will be split.
This is purely a memory optimization and does not affect results.
Returns
-------
shrunk_cov : ndarray of shape (n_features, n_features)
Shrunk covariance.
shrinkage : float
Coefficient in the convex combination used for the computation
of the shrunk estimate.
Notes
-----
The regularized (shrunk) covariance is:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
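Examples
--------
A minimal usage sketch on synthetic data (illustrative only; the toy
covariance and seed are arbitrary choices):
>>> import numpy as np
>>> from sklearn.covariance import ledoit_wolf
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=[[.4, .2], [.2, .8]],
... size=50)
>>> shrunk_cov, shrinkage = ledoit_wolf(X)
>>> shrunk_cov.shape, bool(0 <= shrinkage <= 1)
((2, 2), True)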
"""
X = np.asarray(X)
# for only one feature, the result is the same whatever the shrinkage
if len(X.shape) == 2 and X.shape[1] == 1:
if not assume_centered:
X = X - X.mean()
return np.atleast_2d((X ** 2).mean()), 0.
if X.ndim == 1:
X = np.reshape(X, (1, -1))
warnings.warn("Only one sample available. "
"You may want to reshape your data array")
n_features = X.size
else:
_, n_features = X.shape
# get Ledoit-Wolf shrinkage
shrinkage = ledoit_wolf_shrinkage(
X, assume_centered=assume_centered, block_size=block_size)
emp_cov = empirical_covariance(X, assume_centered=assume_centered)
mu = np.sum(np.trace(emp_cov)) / n_features
shrunk_cov = (1. - shrinkage) * emp_cov
shrunk_cov.flat[::n_features + 1] += shrinkage * mu
return shrunk_cov, shrinkage
class LedoitWolf(EmpiricalCovariance):
"""LedoitWolf Estimator
Ledoit-Wolf is a particular form of shrinkage, where the shrinkage
coefficient is computed using O. Ledoit and M. Wolf's formula as
described in "A Well-Conditioned Estimator for Large-Dimensional
Covariance Matrices", Ledoit and Wolf, Journal of Multivariate
Analysis, Volume 88, Issue 2, February 2004, pages 365-411.
Read more in the :ref:`User Guide <shrunk_covariance>`.
Parameters
----------
store_precision : bool, default=True
Specify if the estimated precision is stored.
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False (default), data will be centered before computation.
block_size : int, default=1000
Size of the blocks into which the covariance matrix will be split
during its Ledoit-Wolf estimation. This is purely a memory
optimization and does not affect results.
Attributes
----------
covariance_ : ndarray of shape (n_features, n_features)
Estimated covariance matrix.
location_ : ndarray of shape (n_features,)
Estimated location, i.e. the estimated mean.
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo inverse matrix.
(stored only if store_precision is True)
shrinkage_ : float
Coefficient in the convex combination used for the computation
of the shrunk estimate. Range is [0, 1].
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import LedoitWolf
>>> real_cov = np.array([[.4, .2],
... [.2, .8]])
>>> np.random.seed(0)
>>> X = np.random.multivariate_normal(mean=[0, 0],
... cov=real_cov,
... size=50)
>>> cov = LedoitWolf().fit(X)
>>> cov.covariance_
array([[0.4406..., 0.1616...],
[0.1616..., 0.8022...]])
>>> cov.location_
array([ 0.0595... , -0.0075...])
Notes
-----
The regularised covariance is:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
and shrinkage is given by the Ledoit and Wolf formula (see References)
References
----------
"A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices",
Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
February 2004, pages 365-411.
"""
@_deprecate_positional_args
def __init__(self, *, store_precision=True, assume_centered=False,
block_size=1000):
super().__init__(store_precision=store_precision,
assume_centered=assume_centered)
self.block_size = block_size
def fit(self, X, y=None):
"""Fit the Ledoit-Wolf shrunk covariance model according to the given
training data and parameters.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples
and `n_features` is the number of features.
y : Ignored
Not used, present for API consistency.
Returns
-------
self : object
"""
# Not calling the parent object to fit, to avoid computing the
# covariance matrix (and potentially the precision)
X = self._validate_data(X)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
self.location_ = X.mean(0)
covariance, shrinkage = ledoit_wolf(X - self.location_,
assume_centered=True,
block_size=self.block_size)
self.shrinkage_ = shrinkage
self._set_covariance(covariance)
return self
# OAS estimator
@_deprecate_positional_args
def oas(X, *, assume_centered=False):
"""Estimate covariance with the Oracle Approximating Shrinkage algorithm.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Data from which to compute the covariance estimate.
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful to work with data whose mean is close to, but not exactly, zero.
If False, data will be centered before computation.
Returns
-------
shrunk_cov : array-like of shape (n_features, n_features)
Shrunk covariance.
shrinkage : float
Coefficient in the convex combination used for the computation
of the shrunk estimate.
Notes
-----
The regularised (shrunk) covariance is:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
The formula we used to implement the OAS is slightly modified compared
to the one given in the article. See :class:`OAS` for more details.
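Examples
--------
A minimal usage sketch on synthetic data (illustrative only; the toy
covariance and seed are arbitrary choices):
>>> import numpy as np
>>> from sklearn.covariance import oas
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=[[.8, .3], [.3, .4]],
... size=500)
>>> shrunk_cov, shrinkage = oas(X)
>>> shrunk_cov.shape, bool(0 <= shrinkage <= 1)
((2, 2), True)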
"""
X = np.asarray(X)
# for only one feature, the result is the same whatever the shrinkage
if len(X.shape) == 2 and X.shape[1] == 1:
if not assume_centered:
X = X - X.mean()
return np.atleast_2d((X ** 2).mean()), 0.
if X.ndim == 1:
X = np.reshape(X, (1, -1))
warnings.warn("Only one sample available. "
"You may want to reshape your data array")
n_samples = 1
n_features = X.size
else:
n_samples, n_features = X.shape
emp_cov = empirical_covariance(X, assume_centered=assume_centered)
mu = np.trace(emp_cov) / n_features
# formula from Chen et al.'s **implementation**
alpha = np.mean(emp_cov ** 2)
num = alpha + mu ** 2
den = (n_samples + 1.) * (alpha - (mu ** 2) / n_features)
shrinkage = 1. if den == 0 else min(num / den, 1.)
shrunk_cov = (1. - shrinkage) * emp_cov
shrunk_cov.flat[::n_features + 1] += shrinkage * mu
return shrunk_cov, shrinkage
class OAS(EmpiricalCovariance):
"""Oracle Approximating Shrinkage Estimator
Read more in the :ref:`User Guide <shrunk_covariance>`.
OAS is a particular form of shrinkage described in
"Shrinkage Algorithms for MMSE Covariance Estimation"
Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.
The formula used here does not correspond to the one given in the
article. In the original article, formula (23) states that 2/p is
multiplied by Trace(cov*cov) in both the numerator and denominator, but
this operation is omitted because for a large p, the value of 2/p is
so small that it doesn't affect the value of the estimator.
Parameters
----------
store_precision : bool, default=True
Specify if the estimated precision is stored.
assume_centered : bool, default=False
If True, data will not be centered before computation.
Useful when working with data whose mean is almost, but not exactly
zero.
If False (default), data will be centered before computation.
Attributes
----------
covariance_ : ndarray of shape (n_features, n_features)
Estimated covariance matrix.
location_ : ndarray of shape (n_features,)
Estimated location, i.e. the estimated mean.
precision_ : ndarray of shape (n_features, n_features)
Estimated pseudo inverse matrix.
(stored only if store_precision is True)
shrinkage_ : float
Coefficient in the convex combination used for the computation
of the shrunk estimate. Range is [0, 1].
Examples
--------
>>> import numpy as np
>>> from sklearn.covariance import OAS
>>> from sklearn.datasets import make_gaussian_quantiles
>>> real_cov = np.array([[.8, .3],
... [.3, .4]])
>>> rng = np.random.RandomState(0)
>>> X = rng.multivariate_normal(mean=[0, 0],
... cov=real_cov,
... size=500)
>>> oas = OAS().fit(X)
>>> oas.covariance_
array([[0.7533..., 0.2763...],
[0.2763..., 0.3964...]])
>>> oas.precision_
array([[ 1.7833..., -1.2431... ],
[-1.2431..., 3.3889...]])
>>> oas.shrinkage_
0.0195...
Notes
-----
The regularised covariance is:
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
where mu = trace(cov) / n_features
and shrinkage is given by the OAS formula (see References)
References
----------
"Shrinkage Algorithms for MMSE Covariance Estimation"
Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.
"""
def fit(self, X, y=None):
"""Fit the Oracle Approximating Shrinkage covariance model
according to the given training data and parameters.
Parameters
----------
X : array-like of shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples
and `n_features` is the number of features.
y : Ignored
Not used, present for API consistency.
Returns
-------
self : object
"""
X = self._validate_data(X)
# Not calling the parent object to fit, to avoid computing the
# covariance matrix (and potentially the precision)
if self.assume_centered:
self.location_ = np.zeros(X.shape[1])
else:
self.location_ = X.mean(0)
covariance, shrinkage = oas(X - self.location_, assume_centered=True)
self.shrinkage_ = shrinkage
self._set_covariance(covariance)
return self


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _elliptic_envelope # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.covariance.elliptic_envelope'
correct_import_path = 'sklearn.covariance'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_elliptic_envelope, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _empirical_covariance # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.covariance.empirical_covariance_'
correct_import_path = 'sklearn.covariance'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_empirical_covariance, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _graph_lasso # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.covariance.graph_lasso_'
correct_import_path = 'sklearn.covariance'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_graph_lasso, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _robust_covariance # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.covariance.robust_covariance'
correct_import_path = 'sklearn.covariance'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_robust_covariance, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,18 @@
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _shrunk_covariance # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
deprecated_path = 'sklearn.covariance.shrunk_covariance_'
correct_import_path = 'sklearn.covariance'
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
def __getattr__(name):
return getattr(_shrunk_covariance, name)
if not sys.version_info >= (3, 7):
Pep562(__name__)


@ -0,0 +1,305 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import numpy as np
import pytest
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.utils._testing import assert_warns
from sklearn import datasets
from sklearn.covariance import empirical_covariance, EmpiricalCovariance, \
ShrunkCovariance, shrunk_covariance, \
LedoitWolf, ledoit_wolf, ledoit_wolf_shrinkage, OAS, oas
X, _ = datasets.load_diabetes(return_X_y=True)
X_1d = X[:, 0]
n_samples, n_features = X.shape
def test_covariance():
# Tests Covariance module on a simple dataset.
# test covariance fit from data
cov = EmpiricalCovariance()
cov.fit(X)
emp_cov = empirical_covariance(X)
assert_array_almost_equal(emp_cov, cov.covariance_, 4)
assert_almost_equal(cov.error_norm(emp_cov), 0)
assert_almost_equal(
cov.error_norm(emp_cov, norm='spectral'), 0)
assert_almost_equal(
cov.error_norm(emp_cov, norm='frobenius'), 0)
assert_almost_equal(
cov.error_norm(emp_cov, scaling=False), 0)
assert_almost_equal(
cov.error_norm(emp_cov, squared=False), 0)
with pytest.raises(NotImplementedError):
cov.error_norm(emp_cov, norm='foo')
# Mahalanobis distances computation test
mahal_dist = cov.mahalanobis(X)
assert np.amin(mahal_dist) > 0
# test with n_features = 1
X_1d = X[:, 0].reshape((-1, 1))
cov = EmpiricalCovariance()
cov.fit(X_1d)
assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
assert_almost_equal(
cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)
# test with one sample
# Create X with 1 sample and 5 features
X_1sample = np.arange(5).reshape(1, 5)
cov = EmpiricalCovariance()
assert_warns(UserWarning, cov.fit, X_1sample)
assert_array_almost_equal(cov.covariance_,
np.zeros(shape=(5, 5), dtype=np.float64))
# test integer type
X_integer = np.asarray([[0, 1], [1, 0]])
result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
assert_array_almost_equal(empirical_covariance(X_integer), result)
# test centered case
cov = EmpiricalCovariance(assume_centered=True)
cov.fit(X)
assert_array_equal(cov.location_, np.zeros(X.shape[1]))
def test_shrunk_covariance():
# Tests ShrunkCovariance module on a simple dataset.
# compare shrunk covariance obtained from data and from MLE estimate
cov = ShrunkCovariance(shrinkage=0.5)
cov.fit(X)
assert_array_almost_equal(
shrunk_covariance(empirical_covariance(X), shrinkage=0.5),
cov.covariance_, 4)
# same test with shrinkage not provided
cov = ShrunkCovariance()
cov.fit(X)
assert_array_almost_equal(
shrunk_covariance(empirical_covariance(X)), cov.covariance_, 4)
# same test with shrinkage = 0 (<==> empirical_covariance)
cov = ShrunkCovariance(shrinkage=0.)
cov.fit(X)
assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
# test with n_features = 1
X_1d = X[:, 0].reshape((-1, 1))
cov = ShrunkCovariance(shrinkage=0.3)
cov.fit(X_1d)
assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
# test shrinkage coeff on a simple data set (without saving precision)
cov = ShrunkCovariance(shrinkage=0.5, store_precision=False)
cov.fit(X)
assert(cov.precision_ is None)
def test_ledoit_wolf():
# Tests LedoitWolf module on a simple dataset.
# test shrinkage coeff on a simple data set
X_centered = X - X.mean(axis=0)
lw = LedoitWolf(assume_centered=True)
lw.fit(X_centered)
shrinkage_ = lw.shrinkage_
score_ = lw.score(X_centered)
assert_almost_equal(ledoit_wolf_shrinkage(X_centered,
assume_centered=True),
shrinkage_)
assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True,
block_size=6),
shrinkage_)
# compare shrunk covariance obtained from data and from MLE estimate
lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered,
assume_centered=True)
assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
# compare estimates given by LW and ShrunkCovariance
scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True)
scov.fit(X_centered)
assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)
# test with n_features = 1
X_1d = X[:, 0].reshape((-1, 1))
lw = LedoitWolf(assume_centered=True)
lw.fit(X_1d)
lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d,
assume_centered=True)
assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4)
# test shrinkage coeff on a simple data set (without saving precision)
lw = LedoitWolf(store_precision=False, assume_centered=True)
lw.fit(X_centered)
assert_almost_equal(lw.score(X_centered), score_, 4)
assert(lw.precision_ is None)
# Same tests without assuming centered data
# test shrinkage coeff on a simple data set
lw = LedoitWolf()
lw.fit(X)
assert_almost_equal(lw.shrinkage_, shrinkage_, 4)
assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X))
assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1])
assert_almost_equal(lw.score(X), score_, 4)
# compare shrunk covariance obtained from data and from MLE estimate
lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
# compare estimates given by LW and ShrunkCovariance
scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
scov.fit(X)
assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)
# test with n_features = 1
X_1d = X[:, 0].reshape((-1, 1))
lw = LedoitWolf()
lw.fit(X_1d)
lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)
# test with one sample
# warning should be raised when using only 1 sample
X_1sample = np.arange(5).reshape(1, 5)
lw = LedoitWolf()
assert_warns(UserWarning, lw.fit, X_1sample)
assert_array_almost_equal(lw.covariance_,
np.zeros(shape=(5, 5), dtype=np.float64))
# test shrinkage coeff on a simple data set (without saving precision)
lw = LedoitWolf(store_precision=False)
lw.fit(X)
assert_almost_equal(lw.score(X), score_, 4)
assert(lw.precision_ is None)
def _naive_ledoit_wolf_shrinkage(X):
# A simple implementation of the formulas from Ledoit & Wolf
# The computation below achieves the following computations of the
# "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for
# Large-Dimensional Covariance Matrices"
# beta and delta are given in the beginning of section 3.2
n_samples, n_features = X.shape
emp_cov = empirical_covariance(X, assume_centered=False)
mu = np.trace(emp_cov) / n_features
delta_ = emp_cov.copy()
delta_.flat[::n_features + 1] -= mu
delta = (delta_ ** 2).sum() / n_features
X2 = X ** 2
beta_ = 1. / (n_features * n_samples) \
* np.sum(np.dot(X2.T, X2) / n_samples - emp_cov ** 2)
beta = min(beta_, delta)
shrinkage = beta / delta
return shrinkage
def test_ledoit_wolf_small():
# Compare our blocked implementation to the naive implementation
X_small = X[:, :4]
lw = LedoitWolf()
lw.fit(X_small)
shrinkage_ = lw.shrinkage_
assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small))
def test_ledoit_wolf_large():
# test that ledoit_wolf doesn't error on data that is wider than block_size
rng = np.random.RandomState(0)
# use a number of features that is larger than the block-size
X = rng.normal(size=(10, 20))
lw = LedoitWolf(block_size=10).fit(X)
# check that covariance is about diagonal (random normal noise)
assert_almost_equal(lw.covariance_, np.eye(20), 0)
cov = lw.covariance_
# check that the result is consistent with not splitting data into blocks.
lw = LedoitWolf(block_size=25).fit(X)
assert_almost_equal(lw.covariance_, cov)
def test_oas():
# Tests OAS module on a simple dataset.
# test shrinkage coeff on a simple data set
X_centered = X - X.mean(axis=0)
oa = OAS(assume_centered=True)
oa.fit(X_centered)
shrinkage_ = oa.shrinkage_
score_ = oa.score(X_centered)
# compare shrunk covariance obtained from data and from MLE estimate
oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered,
assume_centered=True)
assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
# compare estimates given by OAS and ShrunkCovariance
scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True)
scov.fit(X_centered)
assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)
# test with n_features = 1
X_1d = X[:, 0:1]
oa = OAS(assume_centered=True)
oa.fit(X_1d)
oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True)
assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
assert_array_almost_equal((X_1d ** 2).sum() / n_samples, oa.covariance_, 4)
# test shrinkage coeff on a simple data set (without saving precision)
oa = OAS(store_precision=False, assume_centered=True)
oa.fit(X_centered)
assert_almost_equal(oa.score(X_centered), score_, 4)
assert(oa.precision_ is None)
# Same tests without assuming centered data
# test shrinkage coeff on a simple data set
oa = OAS()
oa.fit(X)
assert_almost_equal(oa.shrinkage_, shrinkage_, 4)
assert_almost_equal(oa.score(X), score_, 4)
# compare shrunk covariance obtained from data and from MLE estimate
oa_cov_from_mle, oa_shrinkage_from_mle = oas(X)
assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
# compare estimates given by OAS and ShrunkCovariance
scov = ShrunkCovariance(shrinkage=oa.shrinkage_)
scov.fit(X)
assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)
# test with n_features = 1
X_1d = X[:, 0].reshape((-1, 1))
oa = OAS()
oa.fit(X_1d)
oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d)
assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4)
# test with one sample
# warning should be raised when using only 1 sample
X_1sample = np.arange(5).reshape(1, 5)
oa = OAS()
assert_warns(UserWarning, oa.fit, X_1sample)
assert_array_almost_equal(oa.covariance_,
np.zeros(shape=(5, 5), dtype=np.float64))
# test shrinkage coeff on a simple data set (without saving precision)
oa = OAS(store_precision=False)
oa.fit(X)
assert_almost_equal(oa.score(X), score_, 4)
assert(oa.precision_ is None)


@ -0,0 +1,45 @@
"""
Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope).
"""
import numpy as np
import pytest
from sklearn.covariance import EllipticEnvelope
from sklearn.utils._testing import assert_almost_equal
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_equal
from sklearn.exceptions import NotFittedError
def test_elliptic_envelope():
rnd = np.random.RandomState(0)
X = rnd.randn(100, 10)
clf = EllipticEnvelope(contamination=0.1)
with pytest.raises(NotFittedError):
clf.predict(X)
with pytest.raises(NotFittedError):
clf.decision_function(X)
clf.fit(X)
y_pred = clf.predict(X)
scores = clf.score_samples(X)
decisions = clf.decision_function(X)
assert_array_almost_equal(
scores, -clf.mahalanobis(X))
assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
assert_almost_equal(clf.score(X, np.ones(100)),
(100 - y_pred[y_pred == -1].size) / 100.)
assert(sum(y_pred == -1) == sum(decisions < 0))
def test_score_samples():
X_train = [[1, 1], [1, 2], [2, 1]]
clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
clf2 = EllipticEnvelope().fit(X_train)
assert_array_equal(clf1.score_samples([[2., 2.]]),
clf1.decision_function([[2., 2.]]) + clf1.offset_)
assert_array_equal(clf2.score_samples([[2., 2.]]),
clf2.decision_function([[2., 2.]]) + clf2.offset_)
assert_array_equal(clf1.score_samples([[2., 2.]]),
clf2.score_samples([[2., 2.]]))


@ -0,0 +1,150 @@
""" Test the graphical_lasso module.
"""
import sys
import numpy as np
from scipy import linalg
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_array_less
from sklearn.covariance import (graphical_lasso, GraphicalLasso,
GraphicalLassoCV, empirical_covariance)
from sklearn.datasets import make_sparse_spd_matrix
from io import StringIO
from sklearn.utils import check_random_state
from sklearn import datasets
def test_graphical_lasso(random_state=0):
# Sample data from a sparse multivariate normal
dim = 20
n_samples = 100
random_state = check_random_state(random_state)
prec = make_sparse_spd_matrix(dim, alpha=.95,
random_state=random_state)
cov = linalg.inv(prec)
X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
emp_cov = empirical_covariance(X)
for alpha in (0., .1, .25):
covs = dict()
icovs = dict()
for method in ('cd', 'lars'):
cov_, icov_, costs = graphical_lasso(emp_cov, return_costs=True,
alpha=alpha, mode=method)
covs[method] = cov_
icovs[method] = icov_
costs, dual_gap = np.array(costs).T
# Check that the costs always decrease (doesn't hold if alpha == 0)
if not alpha == 0:
assert_array_less(np.diff(costs), 0)
# Check that the 2 approaches give similar results
assert_array_almost_equal(covs['cd'], covs['lars'], decimal=4)
assert_array_almost_equal(icovs['cd'], icovs['lars'], decimal=4)
# Smoke test the estimator
model = GraphicalLasso(alpha=.25).fit(X)
model.score(X)
assert_array_almost_equal(model.covariance_, covs['cd'], decimal=4)
assert_array_almost_equal(model.covariance_, covs['lars'], decimal=4)
# For a centered matrix, assume_centered could be chosen True or False
# Check that this returns indeed the same result for centered data
Z = X - X.mean(0)
precs = list()
for assume_centered in (False, True):
prec_ = GraphicalLasso(
assume_centered=assume_centered).fit(Z).precision_
precs.append(prec_)
assert_array_almost_equal(precs[0], precs[1])
def test_graphical_lasso_iris():
# Hard-coded solution from R glasso package for alpha=1.0
# (need to set penalize.diagonal to FALSE)
cov_R = np.array([
[0.68112222, 0.0000000, 0.265820, 0.02464314],
[0.00000000, 0.1887129, 0.000000, 0.00000000],
[0.26582000, 0.0000000, 3.095503, 0.28697200],
[0.02464314, 0.0000000, 0.286972, 0.57713289]
])
icov_R = np.array([
[1.5190747, 0.000000, -0.1304475, 0.0000000],
[0.0000000, 5.299055, 0.0000000, 0.0000000],
[-0.1304475, 0.000000, 0.3498624, -0.1683946],
[0.0000000, 0.000000, -0.1683946, 1.8164353]
])
X = datasets.load_iris().data
emp_cov = empirical_covariance(X)
for method in ('cd', 'lars'):
cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False,
mode=method)
assert_array_almost_equal(cov, cov_R)
assert_array_almost_equal(icov, icov_R)
def test_graph_lasso_2D():
# Hard-coded solution from Python skggm package
# obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)`
cov_skggm = np.array([[3.09550269, 1.186972],
[1.186972, 0.57713289]])
icov_skggm = np.array([[1.52836773, -3.14334831],
[-3.14334831, 8.19753385]])
X = datasets.load_iris().data[:, 2:]
emp_cov = empirical_covariance(X)
for method in ('cd', 'lars'):
cov, icov = graphical_lasso(emp_cov, alpha=.1, return_costs=False,
mode=method)
assert_array_almost_equal(cov, cov_skggm)
assert_array_almost_equal(icov, icov_skggm)
def test_graphical_lasso_iris_singular():
# Small subset of rows to test the rank-deficient case
# Need to choose samples such that none of the variances are zero
indices = np.arange(10, 13)
# Hard-coded solution from R glasso package for alpha=0.01
cov_R = np.array([
[0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
[0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
[0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
[0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222]
])
icov_R = np.array([
[24.42244057, -16.831679593, 0.0, 0.0],
[-16.83168201, 24.351841681, -6.206896552, -12.5],
[0.0, -6.206896171, 153.103448276, 0.0],
[0.0, -12.499999143, 0.0, 462.5]
])
X = datasets.load_iris().data[indices, :]
emp_cov = empirical_covariance(X)
for method in ('cd', 'lars'):
cov, icov = graphical_lasso(emp_cov, alpha=0.01, return_costs=False,
mode=method)
assert_array_almost_equal(cov, cov_R, decimal=5)
assert_array_almost_equal(icov, icov_R, decimal=5)
def test_graphical_lasso_cv(random_state=1):
# Sample data from a sparse multivariate normal
dim = 5
n_samples = 6
random_state = check_random_state(random_state)
prec = make_sparse_spd_matrix(dim, alpha=.96,
random_state=random_state)
cov = linalg.inv(prec)
X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
# Capture stdout, to smoke test the verbose mode
orig_stdout = sys.stdout
try:
sys.stdout = StringIO()
# We need verbose very high so that Parallel prints on stdout
GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X)
finally:
sys.stdout = orig_stdout
# Smoke test with specified alphas
GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X)


@ -0,0 +1,168 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
# Gael Varoquaux <gael.varoquaux@normalesup.org>
# Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause
import itertools
import numpy as np
from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_warns_message
from sklearn import datasets
from sklearn.covariance import empirical_covariance, MinCovDet
from sklearn.covariance import fast_mcd
X = datasets.load_iris().data
X_1d = X[:, 0]
n_samples, n_features = X.shape
def test_mcd():
# Tests the FastMCD algorithm implementation
# Small data set
# test without outliers (random independent normal data)
launch_mcd_on_dataset(100, 5, 0, 0.01, 0.1, 80)
# test with a contaminated data set (medium contamination)
launch_mcd_on_dataset(100, 5, 20, 0.01, 0.01, 70)
# test with a contaminated data set (strong contamination)
launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50)
# Medium data set
launch_mcd_on_dataset(1000, 5, 450, 0.1, 0.1, 540)
# Large data set
launch_mcd_on_dataset(1700, 5, 800, 0.1, 0.1, 870)
# 1D data set
launch_mcd_on_dataset(500, 1, 100, 0.001, 0.001, 350)
def test_fast_mcd_on_invalid_input():
X = np.arange(100)
assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead',
fast_mcd, X)
def test_mcd_class_on_invalid_input():
X = np.arange(100)
mcd = MinCovDet()
assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead',
mcd.fit, X)
def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
tol_support):
rand_gen = np.random.RandomState(0)
data = rand_gen.randn(n_samples, n_features)
# add some outliers
outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
outliers_offset = 10. * \
(rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
data[outliers_index] += outliers_offset
inliers_mask = np.ones(n_samples).astype(bool)
inliers_mask[outliers_index] = False
pure_data = data[inliers_mask]
# compute MCD by fitting an object
mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
T = mcd_fit.location_
S = mcd_fit.covariance_
H = mcd_fit.support_
# compare with the estimates learnt from the inliers
error_location = np.mean((pure_data.mean(0) - T) ** 2)
assert(error_location < tol_loc)
error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
assert(error_cov < tol_cov)
assert(np.sum(H) >= tol_support)
assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)
def test_mcd_issue1127():
# Check that the code does not break with X.shape = (3, 1)
# (i.e. n_support = n_samples)
rnd = np.random.RandomState(0)
X = rnd.normal(size=(3, 1))
mcd = MinCovDet()
mcd.fit(X)
def test_mcd_issue3367():
# Check that MCD completes when the covariance matrix is singular
# i.e. one of the rows and columns are all zeros
rand_gen = np.random.RandomState(0)
# Think of these as the values for X and Y -> 10 values between -5 and 5
data_values = np.linspace(-5, 5, 10).tolist()
# Get the cartesian product of all possible coordinate pairs from above set
data = np.array(list(itertools.product(data_values, data_values)))
# Add a third column that's all zeros to make our data a set of points
# within a plane, which means that the covariance matrix will be singular
data = np.hstack((data, np.zeros((data.shape[0], 1))))
# The below line of code should raise an exception if the covariance matrix
# is singular. As a further test, since we have points in XYZ, the
# principal components (eigenvectors) of these directly relate to the
# geometry of the points. Since it's a plane, we should be able to test
# that the Eigenvector that corresponds to the smallest Eigenvalue is the
# plane normal, specifically [0, 0, 1], since everything is in the XY plane
# (as I've set it up above). To do this one would start by:
#
# evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
# normal = evecs[:, np.argmin(evals)]
#
# After which we need to assert that our `normal` is equal to [0, 0, 1].
# Do note that there is floating point error associated with this, so it's
# best to subtract the two and then compare some small tolerance (e.g.
# 1e-12).
MinCovDet(random_state=rand_gen).fit(data)
def test_mcd_support_covariance_is_zero():
# Check that MCD returns a ValueError with informative message when the
# covariance of the support data is equal to 0.
X_1 = np.array([0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1])
X_1 = X_1.reshape(-1, 1)
X_2 = np.array([0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3])
X_2 = X_2.reshape(-1, 1)
msg = ('The covariance matrix of the support data is equal to 0, try to '
'increase support_fraction')
for X in [X_1, X_2]:
assert_raise_message(ValueError, msg, MinCovDet().fit, X)
def test_mcd_increasing_det_warning():
# Check that a warning is raised if we observe increasing determinants
# during the c_step. In theory the sequence of determinants should be
# decreasing. Increasing determinants are likely due to ill-conditioned
# covariance matrices that result in poor precision matrices.
X = [[5.1, 3.5, 1.4, 0.2],
[4.9, 3.0, 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5.0, 3.6, 1.4, 0.2],
[4.6, 3.4, 1.4, 0.3],
[5.0, 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3.0, 1.4, 0.1],
[4.3, 3.0, 1.1, 0.1],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.4, 3.4, 1.7, 0.2],
[4.6, 3.6, 1.0, 0.2],
[5.0, 3.0, 1.6, 0.2],
[5.2, 3.5, 1.5, 0.2]]
mcd = MinCovDet(random_state=1)
assert_warns_message(RuntimeWarning,
"Determinant has increased",
mcd.fit, X)