Uploaded Test files
This commit is contained in: parent f584ad9d97, commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions

35  venv/Lib/site-packages/sklearn/covariance/__init__.py  Normal file
@@ -0,0 +1,35 @@
"""
The :mod:`sklearn.covariance` module includes methods and algorithms to
robustly estimate the covariance of features given a set of points. The
precision matrix defined as the inverse of the covariance is also estimated.
Covariance estimation is closely related to the theory of Gaussian Graphical
Models.
"""

from ._empirical_covariance import (empirical_covariance,
                                    EmpiricalCovariance,
                                    log_likelihood)
from ._shrunk_covariance import (shrunk_covariance, ShrunkCovariance,
                                 ledoit_wolf, ledoit_wolf_shrinkage,
                                 LedoitWolf, oas, OAS)
from ._robust_covariance import fast_mcd, MinCovDet
from ._graph_lasso import graphical_lasso, GraphicalLasso, GraphicalLassoCV
from ._elliptic_envelope import EllipticEnvelope


__all__ = ['EllipticEnvelope',
           'EmpiricalCovariance',
           'GraphicalLasso',
           'GraphicalLassoCV',
           'LedoitWolf',
           'MinCovDet',
           'OAS',
           'ShrunkCovariance',
           'empirical_covariance',
           'fast_mcd',
           'graphical_lasso',
           'ledoit_wolf',
           'ledoit_wolf_shrinkage',
           'log_likelihood',
           'oas',
           'shrunk_covariance']
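The names re-exported above form the module's public API. A minimal usage sketch (editor's illustration only, synthetic data; not part of the committed diff) contrasting the plain maximum-likelihood estimator with a shrunk one:

# Sketch only: compares two of the exported estimators on synthetic data;
# the printed numbers depend entirely on the random seed.
import numpy as np
from sklearn.covariance import EmpiricalCovariance, LedoitWolf

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[.8, .3], [.3, .4]], size=50)

emp = EmpiricalCovariance().fit(X)   # plain maximum-likelihood estimate
lw = LedoitWolf().fit(X)             # shrunk estimate, better conditioned
print(emp.covariance_)
print(lw.covariance_, lw.shrinkage_)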
Binary files not shown.
230  venv/Lib/site-packages/sklearn/covariance/_elliptic_envelope.py  Normal file
@@ -0,0 +1,230 @@
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause

import numpy as np
from . import MinCovDet
from ..utils.validation import check_is_fitted, check_array
from ..utils.validation import _deprecate_positional_args
from ..metrics import accuracy_score
from ..base import OutlierMixin


class EllipticEnvelope(OutlierMixin, MinCovDet):
    """An object for detecting outliers in a Gaussian distributed dataset.

    Read more in the :ref:`User Guide <outlier_detection>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specify if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, the support of robust location and covariance estimates
        is computed, and a covariance estimate is recomputed from it,
        without centering the data.
        Useful to work with data whose mean is significantly equal to
        zero but is not exactly zero.
        If False, the robust location and covariance are directly computed
        with the FastMCD algorithm without additional treatment.

    support_fraction : float, default=None
        The proportion of points to be included in the support of the raw
        MCD estimate. If None, the minimum value of support_fraction will
        be used within the algorithm: `[n_sample + n_features + 1] / 2`.
        Range is (0, 1).

    contamination : float, default=0.1
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Range is (0, 0.5).

    random_state : int or RandomState instance, default=None
        Determines the pseudo random number generator for shuffling
        the data. Pass an int for reproducible results across multiple function
        calls. See :term: `Glossary <random_state>`.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated robust location

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated robust covariance matrix

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.
        (stored only if store_precision is True)

    support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute the
        robust estimates of location and shape.

    offset_ : float
        Offset used to define the decision function from the raw scores.
        We have the relation: ``decision_function = score_samples - offset_``.
        The offset depends on the contamination parameter and is defined in
        such a way we obtain the expected number of outliers (samples with
        decision function < 0) in training.

        .. versionadded:: 0.20

    raw_location_ : ndarray of shape (n_features,)
        The raw robust estimated location before correction and re-weighting.

    raw_covariance_ : ndarray of shape (n_features, n_features)
        The raw robust estimated covariance before correction and re-weighting.

    raw_support_ : ndarray of shape (n_samples,)
        A mask of the observations that have been used to compute
        the raw robust estimates of location and shape, before correction
        and re-weighting.

    dist_ : ndarray of shape (n_samples,)
        Mahalanobis distances of the training set (on which :meth:`fit` is
        called) observations.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EllipticEnvelope
    >>> true_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> X = np.random.RandomState(0).multivariate_normal(mean=[0, 0],
    ...                                                   cov=true_cov,
    ...                                                   size=500)
    >>> cov = EllipticEnvelope(random_state=0).fit(X)
    >>> # predict returns 1 for an inlier and -1 for an outlier
    >>> cov.predict([[0, 0],
    ...              [3, 3]])
    array([ 1, -1])
    >>> cov.covariance_
    array([[0.7411..., 0.2535...],
           [0.2535..., 0.3053...]])
    >>> cov.location_
    array([0.0813... , 0.0427...])

    See Also
    --------
    EmpiricalCovariance, MinCovDet

    Notes
    -----
    Outlier detection from covariance estimation may break or not
    perform well in high-dimensional settings. In particular, one will
    always take care to work with ``n_samples > n_features ** 2``.

    References
    ----------
    .. [1] Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
       minimum covariance determinant estimator" Technometrics 41(3), 212
       (1999)
    """
    @_deprecate_positional_args
    def __init__(self, *, store_precision=True, assume_centered=False,
                 support_fraction=None, contamination=0.1,
                 random_state=None):
        super().__init__(
            store_precision=store_precision,
            assume_centered=assume_centered,
            support_fraction=support_fraction,
            random_state=random_state)
        self.contamination = contamination

    def fit(self, X, y=None):
        """Fit the EllipticEnvelope model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.
        """
        super().fit(X)
        self.offset_ = np.percentile(-self.dist_, 100. * self.contamination)
        return self

    def decision_function(self, X):
        """Compute the decision function of the given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        decision : ndarray of shape (n_samples, )
            Decision function of the samples.
            It is equal to the shifted Mahalanobis distances.
            The threshold for being an outlier is 0, which ensures a
            compatibility with other outlier detection algorithms.
        """
        check_is_fitted(self)
        negative_mahal_dist = self.score_samples(X)
        return negative_mahal_dist - self.offset_

    def score_samples(self, X):
        """Compute the negative Mahalanobis distances.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        negative_mahal_distances : array-like of shape (n_samples,)
            Opposite of the Mahalanobis distances.
        """
        check_is_fitted(self)
        return -self.mahalanobis(X)

    def predict(self, X):
        """
        Predict the labels (1 inlier, -1 outlier) of X according to the
        fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        is_inlier : ndarray of shape (n_samples,)
            Returns -1 for anomalies/outliers and +1 for inliers.
        """
        X = check_array(X)
        is_inlier = np.full(X.shape[0], -1, dtype=int)
        values = self.decision_function(X)
        is_inlier[values >= 0] = 1

        return is_inlier

    def score(self, X, y, sample_weight=None):
        """Returns the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
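The ``offset_`` attribute documented above fixes the outlier threshold through ``decision_function = score_samples - offset_``. A small sketch (editor's illustration, synthetic Gaussian data; not part of the committed diff) checking that relation and the expected outlier fraction:

# Sketch only: verifies the documented relation between score_samples,
# decision_function and offset_ on synthetic data.
import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.multivariate_normal(mean=[0, 0], cov=[[.8, .3], [.3, .4]], size=500)

ee = EllipticEnvelope(contamination=0.1, random_state=0).fit(X)
scores = ee.score_samples(X)          # negative Mahalanobis distances
assert np.allclose(ee.decision_function(X), scores - ee.offset_)
# With contamination=0.1, roughly 10% of training points score below 0.
print(np.mean(ee.decision_function(X) < 0))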
317  venv/Lib/site-packages/sklearn/covariance/_empirical_covariance.py  Normal file
@@ -0,0 +1,317 @@
"""
Maximum likelihood covariance estimator.

"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Gael Varoquaux <gael.varoquaux@normalesup.org>
#         Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause

# avoid division truncation
import warnings
import numpy as np
from scipy import linalg

from ..base import BaseEstimator
from ..utils import check_array
from ..utils.extmath import fast_logdet
from ..metrics.pairwise import pairwise_distances
from ..utils.validation import _deprecate_positional_args


def log_likelihood(emp_cov, precision):
    """Computes the sample mean of the log_likelihood under a covariance model

    computes the empirical expected log-likelihood (accounting for the
    normalization terms and scaling), allowing for universal comparison (beyond
    this software package)

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        Maximum Likelihood Estimator of covariance.

    precision : ndarray of shape (n_features, n_features)
        The precision matrix of the covariance model to be tested.

    Returns
    -------
    log_likelihood_ : float
        Sample mean of the log-likelihood.
    """
    p = precision.shape[0]
    log_likelihood_ = - np.sum(emp_cov * precision) + fast_logdet(precision)
    log_likelihood_ -= p * np.log(2 * np.pi)
    log_likelihood_ /= 2.
    return log_likelihood_


@_deprecate_positional_args
def empirical_covariance(X, *, assume_centered=False):
    """Computes the Maximum likelihood covariance estimator


    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data from which to compute the covariance estimate

    assume_centered : bool, default=False
        If True, data will not be centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False, data will be centered before computation.

    Returns
    -------
    covariance : ndarray of shape (n_features, n_features)
        Empirical covariance (Maximum Likelihood Estimator).

    Examples
    --------
    >>> from sklearn.covariance import empirical_covariance
    >>> X = [[1,1,1],[1,1,1],[1,1,1],
    ...      [0,0,0],[0,0,0],[0,0,0]]
    >>> empirical_covariance(X)
    array([[0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25],
           [0.25, 0.25, 0.25]])
    """
    X = np.asarray(X)

    if X.ndim == 1:
        X = np.reshape(X, (1, -1))

    if X.shape[0] == 1:
        warnings.warn("Only one sample available. "
                      "You may want to reshape your data array")

    if assume_centered:
        covariance = np.dot(X.T, X) / X.shape[0]
    else:
        covariance = np.cov(X.T, bias=1)

    if covariance.ndim == 0:
        covariance = np.array([[covariance]])
    return covariance


class EmpiricalCovariance(BaseEstimator):
    """Maximum likelihood covariance estimator

    Read more in the :ref:`User Guide <covariance>`.

    Parameters
    ----------
    store_precision : bool, default=True
        Specifies if the estimated precision is stored.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False (default), data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo-inverse matrix.
        (stored only if store_precision is True)

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import EmpiricalCovariance
    >>> from sklearn.datasets import make_gaussian_quantiles
    >>> real_cov = np.array([[.8, .3],
    ...                      [.3, .4]])
    >>> rng = np.random.RandomState(0)
    >>> X = rng.multivariate_normal(mean=[0, 0],
    ...                             cov=real_cov,
    ...                             size=500)
    >>> cov = EmpiricalCovariance().fit(X)
    >>> cov.covariance_
    array([[0.7569..., 0.2818...],
           [0.2818..., 0.3928...]])
    >>> cov.location_
    array([0.0622..., 0.0193...])

    """
    @_deprecate_positional_args
    def __init__(self, *, store_precision=True, assume_centered=False):
        self.store_precision = store_precision
        self.assume_centered = assume_centered

    def _set_covariance(self, covariance):
        """Saves the covariance and precision estimates

        Storage is done accordingly to `self.store_precision`.
        Precision stored only if invertible.

        Parameters
        ----------
        covariance : array-like of shape (n_features, n_features)
            Estimated covariance matrix to be stored, and from which precision
            is computed.
        """
        covariance = check_array(covariance)
        # set covariance
        self.covariance_ = covariance
        # set precision
        if self.store_precision:
            self.precision_ = linalg.pinvh(covariance)
        else:
            self.precision_ = None

    def get_precision(self):
        """Getter for the precision matrix.

        Returns
        -------
        precision_ : array-like of shape (n_features, n_features)
            The precision matrix associated to the current covariance object.
        """
        if self.store_precision:
            precision = self.precision_
        else:
            precision = linalg.pinvh(self.covariance_)
        return precision

    def fit(self, X, y=None):
        """Fits the Maximum Likelihood Estimator covariance model
        according to the given training data and parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        y : Ignored
            Not used, present for API consistence purpose.

        Returns
        -------
        self : object
        """
        X = self._validate_data(X)
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        covariance = empirical_covariance(
            X, assume_centered=self.assume_centered)
        self._set_covariance(covariance)

        return self

    def score(self, X_test, y=None):
        """Computes the log-likelihood of a Gaussian data set with
        `self.covariance_` as an estimator of its covariance matrix.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Test data of which we compute the likelihood, where n_samples is
            the number of samples and n_features is the number of features.
            X_test is assumed to be drawn from the same distribution than
            the data used in fit (including centering).

        y : Ignored
            Not used, present for API consistence purpose.

        Returns
        -------
        res : float
            The likelihood of the data set with `self.covariance_` as an
            estimator of its covariance matrix.
        """
        # compute empirical covariance of the test set
        test_cov = empirical_covariance(
            X_test - self.location_, assume_centered=True)
        # compute log likelihood
        res = log_likelihood(test_cov, self.get_precision())

        return res

    def error_norm(self, comp_cov, norm='frobenius', scaling=True,
                   squared=True):
        """Computes the Mean Squared Error between two covariance estimators.
        (In the sense of the Frobenius norm).

        Parameters
        ----------
        comp_cov : array-like of shape (n_features, n_features)
            The covariance to compare with.

        norm : {"frobenius", "spectral"}, default="frobenius"
            The type of norm used to compute the error. Available error types:
            - 'frobenius' (default): sqrt(tr(A^t.A))
            - 'spectral': sqrt(max(eigenvalues(A^t.A))
            where A is the error ``(comp_cov - self.covariance_)``.

        scaling : bool, default=True
            If True (default), the squared error norm is divided by n_features.
            If False, the squared error norm is not rescaled.

        squared : bool, default=True
            Whether to compute the squared error norm or the error norm.
            If True (default), the squared error norm is returned.
            If False, the error norm is returned.

        Returns
        -------
        result : float
            The Mean Squared Error (in the sense of the Frobenius norm) between
            `self` and `comp_cov` covariance estimators.
        """
        # compute the error
        error = comp_cov - self.covariance_
        # compute the error norm
        if norm == "frobenius":
            squared_norm = np.sum(error ** 2)
        elif norm == "spectral":
            squared_norm = np.amax(linalg.svdvals(np.dot(error.T, error)))
        else:
            raise NotImplementedError(
                "Only spectral and frobenius norms are implemented")
        # optionally scale the error norm
        if scaling:
            squared_norm = squared_norm / error.shape[0]
        # finally get either the squared norm or the norm
        if squared:
            result = squared_norm
        else:
            result = np.sqrt(squared_norm)

        return result

    def mahalanobis(self, X):
        """Computes the squared Mahalanobis distances of given observations.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The observations, the Mahalanobis distances of the which we
            compute. Observations are assumed to be drawn from the same
            distribution than the data used in fit.

        Returns
        -------
        dist : ndarray of shape (n_samples,)
            Squared Mahalanobis distances of the observations.
        """
        precision = self.get_precision()
        # compute mahalanobis distances
        dist = pairwise_distances(X, self.location_[np.newaxis, :],
                                  metric='mahalanobis', VI=precision)

        return np.reshape(dist, (len(X),)) ** 2
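``EmpiricalCovariance.score`` above is just ``log_likelihood`` applied to the empirical covariance of the re-centered test data. A short sketch (editor's illustration, synthetic data; not part of the committed diff) recomputing it from the module-level helpers:

# Sketch only: reproduces EmpiricalCovariance.score by hand from the two
# module-level helpers, on synthetic data.
import numpy as np
from sklearn.covariance import (EmpiricalCovariance, empirical_covariance,
                                log_likelihood)

rng = np.random.RandomState(0)
X_train = rng.multivariate_normal([0, 0], [[.8, .3], [.3, .4]], size=300)
X_test = rng.multivariate_normal([0, 0], [[.8, .3], [.3, .4]], size=100)

cov = EmpiricalCovariance().fit(X_train)
test_cov = empirical_covariance(X_test - cov.location_, assume_centered=True)
manual = log_likelihood(test_cov, cov.get_precision())
assert np.isclose(manual, cov.score(X_test))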
794  venv/Lib/site-packages/sklearn/covariance/_graph_lasso.py  Normal file
@@ -0,0 +1,794 @@
"""GraphicalLasso: sparse inverse covariance estimation with an l1-penalized
estimator.
"""

# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
# Copyright: INRIA
from collections.abc import Sequence
import warnings
import operator
import sys
import time

import numpy as np
from scipy import linalg
from joblib import Parallel, delayed

from . import empirical_covariance, EmpiricalCovariance, log_likelihood

from ..exceptions import ConvergenceWarning
from ..utils.validation import check_random_state
from ..utils.validation import _deprecate_positional_args
# mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast'
from ..linear_model import _cd_fast as cd_fast  # type: ignore
from ..linear_model import lars_path_gram
from ..model_selection import check_cv, cross_val_score


# Helper functions to compute the objective and dual objective functions
# of the l1-penalized estimator
def _objective(mle, precision_, alpha):
    """Evaluation of the graphical-lasso objective function

    the objective function is made of a shifted scaled version of the
    normalized log-likelihood (i.e. its empirical mean over the samples) and a
    penalisation term to promote sparsity
    """
    p = precision_.shape[0]
    cost = - 2. * log_likelihood(mle, precision_) + p * np.log(2 * np.pi)
    cost += alpha * (np.abs(precision_).sum()
                     - np.abs(np.diag(precision_)).sum())
    return cost


def _dual_gap(emp_cov, precision_, alpha):
    """Expression of the dual gap convergence criterion

    The specific definition is given in Duchi "Projected Subgradient Methods
    for Learning Sparse Gaussians".
    """
    gap = np.sum(emp_cov * precision_)
    gap -= precision_.shape[0]
    gap += alpha * (np.abs(precision_).sum()
                    - np.abs(np.diag(precision_)).sum())
    return gap


def alpha_max(emp_cov):
    """Find the maximum alpha for which there are some non-zeros off-diagonal.

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        The sample covariance matrix.

    Notes
    -----
    This results from the bound for the all the Lasso that are solved
    in GraphicalLasso: each time, the row of cov corresponds to Xy. As the
    bound for alpha is given by `max(abs(Xy))`, the result follows.
    """
    A = np.copy(emp_cov)
    A.flat[::A.shape[0] + 1] = 0
    return np.max(np.abs(A))


# The g-lasso algorithm
@_deprecate_positional_args
def graphical_lasso(emp_cov, alpha, *, cov_init=None, mode='cd', tol=1e-4,
                    enet_tol=1e-4, max_iter=100, verbose=False,
                    return_costs=False, eps=np.finfo(np.float64).eps,
                    return_n_iter=False):
    """l1-penalized covariance estimator

    Read more in the :ref:`User Guide <sparse_inverse_covariance>`.

    .. versionchanged:: v0.20
        graph_lasso has been renamed to graphical_lasso

    Parameters
    ----------
    emp_cov : ndarray of shape (n_features, n_features)
        Empirical covariance from which to compute the covariance estimate.

    alpha : float
        The regularization parameter: the higher alpha, the more
        regularization, the sparser the inverse covariance.
        Range is (0, inf].

    cov_init : array of shape (n_features, n_features), default=None
        The initial guess for the covariance.

    mode : {'cd', 'lars'}, default='cd'
        The Lasso solver to use: coordinate descent or LARS. Use LARS for
        very sparse underlying graphs, where p > n. Elsewhere prefer cd
        which is more numerically stable.

    tol : float, default=1e-4
        The tolerance to declare convergence: if the dual gap goes below
        this value, iterations are stopped. Range is (0, inf].

    enet_tol : float, default=1e-4
        The tolerance for the elastic net solver used to calculate the descent
        direction. This parameter controls the accuracy of the search direction
        for a given column update, not of the overall parameter estimate. Only
        used for mode='cd'. Range is (0, inf].

    max_iter : int, default=100
        The maximum number of iterations.

    verbose : bool, default=False
        If verbose is True, the objective function and dual gap are
        printed at each iteration.

    return_costs : bool, default=False
        If return_costs is True, the objective function and dual gap
        at each iteration are returned.

    eps : float, default=eps
        The machine-precision regularization in the computation of the
        Cholesky diagonal factors. Increase this for very ill-conditioned
        systems. Default is `np.finfo(np.float64).eps`.

    return_n_iter : bool, default=False
        Whether or not to return the number of iterations.

    Returns
    -------
    covariance : ndarray of shape (n_features, n_features)
        The estimated covariance matrix.

    precision : ndarray of shape (n_features, n_features)
        The estimated (sparse) precision matrix.

    costs : list of (objective, dual_gap) pairs
        The list of values of the objective function and the dual gap at
        each iteration. Returned only if return_costs is True.

    n_iter : int
        Number of iterations. Returned only if `return_n_iter` is set to True.

    See Also
    --------
    GraphicalLasso, GraphicalLassoCV

    Notes
    -----
    The algorithm employed to solve this problem is the GLasso algorithm,
    from the Friedman 2008 Biostatistics paper. It is the same algorithm
    as in the R `glasso` package.

    One possible difference with the `glasso` R package is that the
    diagonal coefficients are not penalized.
    """
    _, n_features = emp_cov.shape
    if alpha == 0:
        if return_costs:
            precision_ = linalg.inv(emp_cov)
            cost = - 2. * log_likelihood(emp_cov, precision_)
            cost += n_features * np.log(2 * np.pi)
            d_gap = np.sum(emp_cov * precision_) - n_features
            if return_n_iter:
                return emp_cov, precision_, (cost, d_gap), 0
            else:
                return emp_cov, precision_, (cost, d_gap)
        else:
            if return_n_iter:
                return emp_cov, linalg.inv(emp_cov), 0
            else:
                return emp_cov, linalg.inv(emp_cov)
    if cov_init is None:
        covariance_ = emp_cov.copy()
    else:
        covariance_ = cov_init.copy()
    # As a trivial regularization (Tikhonov like), we scale down the
    # off-diagonal coefficients of our starting point: This is needed, as
    # in the cross-validation the cov_init can easily be
    # ill-conditioned, and the CV loop blows. Beside, this takes
    # conservative stand-point on the initial conditions, and it tends to
    # make the convergence go faster.
    covariance_ *= 0.95
    diagonal = emp_cov.flat[::n_features + 1]
    covariance_.flat[::n_features + 1] = diagonal
    precision_ = linalg.pinvh(covariance_)

    indices = np.arange(n_features)
    costs = list()
    # The different l1 regression solver have different numerical errors
    if mode == 'cd':
        errors = dict(over='raise', invalid='ignore')
    else:
        errors = dict(invalid='raise')
    try:
        # be robust to the max_iter=0 edge case, see:
        # https://github.com/scikit-learn/scikit-learn/issues/4134
        d_gap = np.inf
        # set a sub_covariance buffer
        sub_covariance = np.copy(covariance_[1:, 1:], order='C')
        for i in range(max_iter):
            for idx in range(n_features):
                # To keep the contiguous matrix `sub_covariance` equal to
                # covariance_[indices != idx].T[indices != idx]
                # we only need to update 1 column and 1 line when idx changes
                if idx > 0:
                    di = idx - 1
                    sub_covariance[di] = covariance_[di][indices != idx]
                    sub_covariance[:, di] = covariance_[:, di][indices != idx]
                else:
                    sub_covariance[:] = covariance_[1:, 1:]
                row = emp_cov[idx, indices != idx]
                with np.errstate(**errors):
                    if mode == 'cd':
                        # Use coordinate descent
                        coefs = -(precision_[indices != idx, idx]
                                  / (precision_[idx, idx] + 1000 * eps))
                        coefs, _, _, _ = cd_fast.enet_coordinate_descent_gram(
                            coefs, alpha, 0, sub_covariance,
                            row, row, max_iter, enet_tol,
                            check_random_state(None), False)
                    else:
                        # Use LARS
                        _, _, coefs = lars_path_gram(
                            Xy=row, Gram=sub_covariance, n_samples=row.size,
                            alpha_min=alpha / (n_features - 1), copy_Gram=True,
                            eps=eps, method='lars', return_path=False)
                # Update the precision matrix
                precision_[idx, idx] = (
                    1. / (covariance_[idx, idx]
                          - np.dot(covariance_[indices != idx, idx], coefs)))
                precision_[indices != idx, idx] = (- precision_[idx, idx]
                                                   * coefs)
                precision_[idx, indices != idx] = (- precision_[idx, idx]
                                                   * coefs)
                coefs = np.dot(sub_covariance, coefs)
                covariance_[idx, indices != idx] = coefs
                covariance_[indices != idx, idx] = coefs
            if not np.isfinite(precision_.sum()):
                raise FloatingPointError('The system is too ill-conditioned '
                                         'for this solver')
            d_gap = _dual_gap(emp_cov, precision_, alpha)
            cost = _objective(emp_cov, precision_, alpha)
            if verbose:
                print('[graphical_lasso] Iteration '
                      '% 3i, cost % 3.2e, dual gap %.3e'
                      % (i, cost, d_gap))
            if return_costs:
                costs.append((cost, d_gap))
            if np.abs(d_gap) < tol:
                break
            if not np.isfinite(cost) and i > 0:
                raise FloatingPointError('Non SPD result: the system is '
                                         'too ill-conditioned for this solver')
        else:
            warnings.warn('graphical_lasso: did not converge after '
                          '%i iteration: dual gap: %.3e'
                          % (max_iter, d_gap), ConvergenceWarning)
    except FloatingPointError as e:
        e.args = (e.args[0]
                  + '. The system is too ill-conditioned for this solver',)
        raise e

    if return_costs:
        if return_n_iter:
            return covariance_, precision_, costs, i + 1
        else:
            return covariance_, precision_, costs
    else:
        if return_n_iter:
            return covariance_, precision_, i + 1
        else:
            return covariance_, precision_


class GraphicalLasso(EmpiricalCovariance):
    """Sparse inverse covariance estimation with an l1-penalized estimator.

    Read more in the :ref:`User Guide <sparse_inverse_covariance>`.

    .. versionchanged:: v0.20
        GraphLasso has been renamed to GraphicalLasso

    Parameters
    ----------
    alpha : float, default=0.01
        The regularization parameter: the higher alpha, the more
        regularization, the sparser the inverse covariance.
        Range is (0, inf].

    mode : {'cd', 'lars'}, default='cd'
        The Lasso solver to use: coordinate descent or LARS. Use LARS for
        very sparse underlying graphs, where p > n. Elsewhere prefer cd
        which is more numerically stable.

    tol : float, default=1e-4
        The tolerance to declare convergence: if the dual gap goes below
        this value, iterations are stopped. Range is (0, inf].

    enet_tol : float, default=1e-4
        The tolerance for the elastic net solver used to calculate the descent
        direction. This parameter controls the accuracy of the search direction
        for a given column update, not of the overall parameter estimate. Only
        used for mode='cd'. Range is (0, inf].

    max_iter : int, default=100
        The maximum number of iterations.

    verbose : bool, default=False
        If verbose is True, the objective function and dual gap are
        plotted at each iteration.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False, data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix

    precision_ : ndarray of shape (n_features, n_features)
        Estimated pseudo inverse matrix.

    n_iter_ : int
        Number of iterations run.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import GraphicalLasso
    >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
    ...                      [0.0, 0.4, 0.0, 0.0],
    ...                      [0.2, 0.0, 0.3, 0.1],
    ...                      [0.0, 0.0, 0.1, 0.7]])
    >>> np.random.seed(0)
    >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],
    ...                                   cov=true_cov,
    ...                                   size=200)
    >>> cov = GraphicalLasso().fit(X)
    >>> np.around(cov.covariance_, decimals=3)
    array([[0.816, 0.049, 0.218, 0.019],
           [0.049, 0.364, 0.017, 0.034],
           [0.218, 0.017, 0.322, 0.093],
           [0.019, 0.034, 0.093, 0.69 ]])
    >>> np.around(cov.location_, decimals=3)
    array([0.073, 0.04 , 0.038, 0.143])

    See Also
    --------
    graphical_lasso, GraphicalLassoCV
    """
    @_deprecate_positional_args
    def __init__(self, alpha=.01, *, mode='cd', tol=1e-4, enet_tol=1e-4,
                 max_iter=100, verbose=False, assume_centered=False):
        super().__init__(assume_centered=assume_centered)
        self.alpha = alpha
        self.mode = mode
        self.tol = tol
        self.enet_tol = enet_tol
        self.max_iter = max_iter
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fits the GraphicalLasso model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data from which to compute the covariance estimate

        y : Ignored
            Not used, present for API consistence purpose.

        Returns
        -------
        self : object
        """
        # Covariance does not make sense for a single feature
        X = self._validate_data(X, ensure_min_features=2, ensure_min_samples=2,
                                estimator=self)

        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        emp_cov = empirical_covariance(
            X, assume_centered=self.assume_centered)
        self.covariance_, self.precision_, self.n_iter_ = graphical_lasso(
            emp_cov, alpha=self.alpha, mode=self.mode, tol=self.tol,
            enet_tol=self.enet_tol, max_iter=self.max_iter,
            verbose=self.verbose, return_n_iter=True)
        return self


# Cross-validation with GraphicalLasso
def graphical_lasso_path(X, alphas, cov_init=None, X_test=None, mode='cd',
                         tol=1e-4, enet_tol=1e-4, max_iter=100, verbose=False):
    """l1-penalized covariance estimator along a path of decreasing alphas

    Read more in the :ref:`User Guide <sparse_inverse_covariance>`.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Data from which to compute the covariance estimate.

    alphas : array-like of shape (n_alphas,)
        The list of regularization parameters, decreasing order.

    cov_init : array of shape (n_features, n_features), default=None
        The initial guess for the covariance.

    X_test : array of shape (n_test_samples, n_features), default=None
        Optional test matrix to measure generalisation error.

    mode : {'cd', 'lars'}, default='cd'
        The Lasso solver to use: coordinate descent or LARS. Use LARS for
        very sparse underlying graphs, where p > n. Elsewhere prefer cd
        which is more numerically stable.

    tol : float, default=1e-4
        The tolerance to declare convergence: if the dual gap goes below
        this value, iterations are stopped. The tolerance must be a positive
        number.

    enet_tol : float, default=1e-4
        The tolerance for the elastic net solver used to calculate the descent
        direction. This parameter controls the accuracy of the search direction
        for a given column update, not of the overall parameter estimate. Only
        used for mode='cd'. The tolerance must be a positive number.

    max_iter : int, default=100
        The maximum number of iterations. This parameter should be a strictly
        positive integer.

    verbose : int or bool, default=False
        The higher the verbosity flag, the more information is printed
        during the fitting.

    Returns
    -------
    covariances_ : list of shape (n_alphas,) of ndarray of shape \
            (n_features, n_features)
        The estimated covariance matrices.

    precisions_ : list of shape (n_alphas,) of ndarray of shape \
            (n_features, n_features)
        The estimated (sparse) precision matrices.

    scores_ : list of shape (n_alphas,), dtype=float
        The generalisation error (log-likelihood) on the test data.
        Returned only if test data is passed.
    """
    inner_verbose = max(0, verbose - 1)
    emp_cov = empirical_covariance(X)
    if cov_init is None:
        covariance_ = emp_cov.copy()
    else:
        covariance_ = cov_init
    covariances_ = list()
    precisions_ = list()
    scores_ = list()
    if X_test is not None:
        test_emp_cov = empirical_covariance(X_test)

    for alpha in alphas:
        try:
            # Capture the errors, and move on
            covariance_, precision_ = graphical_lasso(
                emp_cov, alpha=alpha, cov_init=covariance_, mode=mode, tol=tol,
                enet_tol=enet_tol, max_iter=max_iter, verbose=inner_verbose)
            covariances_.append(covariance_)
            precisions_.append(precision_)
            if X_test is not None:
                this_score = log_likelihood(test_emp_cov, precision_)
        except FloatingPointError:
            this_score = -np.inf
            covariances_.append(np.nan)
            precisions_.append(np.nan)
        if X_test is not None:
            if not np.isfinite(this_score):
                this_score = -np.inf
            scores_.append(this_score)
        if verbose == 1:
            sys.stderr.write('.')
        elif verbose > 1:
            if X_test is not None:
                print('[graphical_lasso_path] alpha: %.2e, score: %.2e'
                      % (alpha, this_score))
            else:
                print('[graphical_lasso_path] alpha: %.2e' % alpha)
    if X_test is not None:
        return covariances_, precisions_, scores_
    return covariances_, precisions_


class GraphicalLassoCV(GraphicalLasso):
    """Sparse inverse covariance w/ cross-validated choice of the l1 penalty.

    See glossary entry for :term:`cross-validation estimator`.

    Read more in the :ref:`User Guide <sparse_inverse_covariance>`.

    .. versionchanged:: v0.20
        GraphLassoCV has been renamed to GraphicalLassoCV

    Parameters
    ----------
    alphas : int or array-like of shape (n_alphas,), dtype=float, default=4
        If an integer is given, it fixes the number of points on the
        grids of alpha to be used. If a list is given, it gives the
        grid to be used. See the notes in the class docstring for
        more details. Range is (0, inf] when floats given.

    n_refinements : int, default=4
        The number of times the grid is refined. Not used if explicit
        values of alphas are passed. Range is [1, inf).

    cv : int, cross-validation generator or iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.20
            ``cv`` default value if None changed from 3-fold to 5-fold.

    tol : float, default=1e-4
        The tolerance to declare convergence: if the dual gap goes below
        this value, iterations are stopped. Range is (0, inf].

    enet_tol : float, default=1e-4
        The tolerance for the elastic net solver used to calculate the descent
        direction. This parameter controls the accuracy of the search direction
        for a given column update, not of the overall parameter estimate. Only
        used for mode='cd'. Range is (0, inf].

    max_iter : int, default=100
        Maximum number of iterations.

    mode : {'cd', 'lars'}, default='cd'
        The Lasso solver to use: coordinate descent or LARS. Use LARS for
        very sparse underlying graphs, where number of features is greater
        than number of samples. Elsewhere prefer cd which is more numerically
        stable.

    n_jobs : int, default=None
        number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionchanged:: v0.20
           `n_jobs` default changed from 1 to None

    verbose : bool, default=False
        If verbose is True, the objective function and duality gap are
        printed at each iteration.

    assume_centered : bool, default=False
        If True, data are not centered before computation.
        Useful when working with data whose mean is almost, but not exactly
        zero.
        If False, data are centered before computation.

    Attributes
    ----------
    location_ : ndarray of shape (n_features,)
        Estimated location, i.e. the estimated mean.

    covariance_ : ndarray of shape (n_features, n_features)
        Estimated covariance matrix.

    precision_ : ndarray of shape (n_features, n_features)
        Estimated precision matrix (inverse covariance).

    alpha_ : float
        Penalization parameter selected.

    cv_alphas_ : list of shape (n_alphas,), dtype=float
        All penalization parameters explored.

    grid_scores_ : ndarray of shape (n_alphas, n_folds)
        Log-likelihood score on left-out data across folds.

    n_iter_ : int
        Number of iterations run for the optimal alpha.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.covariance import GraphicalLassoCV
    >>> true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
    ...                      [0.0, 0.4, 0.0, 0.0],
    ...                      [0.2, 0.0, 0.3, 0.1],
    ...                      [0.0, 0.0, 0.1, 0.7]])
    >>> np.random.seed(0)
    >>> X = np.random.multivariate_normal(mean=[0, 0, 0, 0],
    ...                                   cov=true_cov,
    ...                                   size=200)
    >>> cov = GraphicalLassoCV().fit(X)
    >>> np.around(cov.covariance_, decimals=3)
    array([[0.816, 0.051, 0.22 , 0.017],
           [0.051, 0.364, 0.018, 0.036],
           [0.22 , 0.018, 0.322, 0.094],
           [0.017, 0.036, 0.094, 0.69 ]])
    >>> np.around(cov.location_, decimals=3)
    array([0.073, 0.04 , 0.038, 0.143])

    See Also
    --------
    graphical_lasso, GraphicalLasso

    Notes
    -----
    The search for the optimal penalization parameter (alpha) is done on an
    iteratively refined grid: first the cross-validated scores on a grid are
    computed, then a new refined grid is centered around the maximum, and so
    on.

    One of the challenges which is faced here is that the solvers can
    fail to converge to a well-conditioned estimate. The corresponding
    values of alpha then come out as missing values, but the optimum may
    be close to these missing values.
    """
    @_deprecate_positional_args
    def __init__(self, *, alphas=4, n_refinements=4, cv=None, tol=1e-4,
                 enet_tol=1e-4, max_iter=100, mode='cd', n_jobs=None,
                 verbose=False, assume_centered=False):
        super().__init__(
            mode=mode, tol=tol, verbose=verbose, enet_tol=enet_tol,
            max_iter=max_iter, assume_centered=assume_centered)
        self.alphas = alphas
        self.n_refinements = n_refinements
        self.cv = cv
        self.n_jobs = n_jobs

    def fit(self, X, y=None):
        """Fits the GraphicalLasso covariance model to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data from which to compute the covariance estimate

        y : Ignored
            Not used, present for API consistence purpose.

        Returns
        -------
        self : object
        """
        # Covariance does not make sense for a single feature
        X = self._validate_data(X, ensure_min_features=2, estimator=self)
        if self.assume_centered:
            self.location_ = np.zeros(X.shape[1])
        else:
            self.location_ = X.mean(0)
        emp_cov = empirical_covariance(
            X, assume_centered=self.assume_centered)

        cv = check_cv(self.cv, y, classifier=False)

        # List of (alpha, scores, covs)
        path = list()
        n_alphas = self.alphas
        inner_verbose = max(0, self.verbose - 1)

        if isinstance(n_alphas, Sequence):
            alphas = self.alphas
            n_refinements = 1
        else:
            n_refinements = self.n_refinements
            alpha_1 = alpha_max(emp_cov)
            alpha_0 = 1e-2 * alpha_1
            alphas = np.logspace(np.log10(alpha_0), np.log10(alpha_1),
                                 n_alphas)[::-1]

        t0 = time.time()
        for i in range(n_refinements):
            with warnings.catch_warnings():
                # No need to see the convergence warnings on this grid:
                # they will always be points that will not converge
                # during the cross-validation
                warnings.simplefilter('ignore', ConvergenceWarning)
                # Compute the cross-validated loss on the current grid

                # NOTE: Warm-restarting graphical_lasso_path has been tried,
                # and this did not allow to gain anything
                # (same execution time with or without).
                this_path = Parallel(
                    n_jobs=self.n_jobs,
                    verbose=self.verbose
                )(delayed(graphical_lasso_path)(X[train], alphas=alphas,
                                                X_test=X[test], mode=self.mode,
                                                tol=self.tol,
                                                enet_tol=self.enet_tol,
                                                max_iter=int(.1 *
                                                             self.max_iter),
                                                verbose=inner_verbose)
                  for train, test in cv.split(X, y))

            # Little danse to transform the list in what we need
            covs, _, scores = zip(*this_path)
            covs = zip(*covs)
            scores = zip(*scores)
            path.extend(zip(alphas, scores, covs))
            path = sorted(path, key=operator.itemgetter(0), reverse=True)

            # Find the maximum (avoid using built in 'max' function to
            # have a fully-reproducible selection of the smallest alpha
            # in case of equality)
            best_score = -np.inf
            last_finite_idx = 0
            for index, (alpha, scores, _) in enumerate(path):
                this_score = np.mean(scores)
                if this_score >= .1 / np.finfo(np.float64).eps:
                    this_score = np.nan
                if np.isfinite(this_score):
                    last_finite_idx = index
                if this_score >= best_score:
                    best_score = this_score
                    best_index = index

            # Refine the grid
            if best_index == 0:
                # We do not need to go back: we have chosen
                # the highest value of alpha for which there are
                # non-zero coefficients
                alpha_1 = path[0][0]
                alpha_0 = path[1][0]
            elif (best_index == last_finite_idx
                    and not best_index == len(path) - 1):
                # We have non-converged models on the upper bound of the
                # grid, we need to refine the grid there
                alpha_1 = path[best_index][0]
                alpha_0 = path[best_index + 1][0]
            elif best_index == len(path) - 1:
                alpha_1 = path[best_index][0]
                alpha_0 = 0.01 * path[best_index][0]
            else:
                alpha_1 = path[best_index - 1][0]
                alpha_0 = path[best_index + 1][0]

            if not isinstance(n_alphas, Sequence):
                alphas = np.logspace(np.log10(alpha_1), np.log10(alpha_0),
                                     n_alphas + 2)
                alphas = alphas[1:-1]

            if self.verbose and n_refinements > 1:
                print('[GraphicalLassoCV] Done refinement % 2i out of'
                      ' %i: % 3is' % (i + 1, n_refinements, time.time() - t0))

        path = list(zip(*path))
        grid_scores = list(path[1])
        alphas = list(path[0])
        # Finally, compute the score with alpha = 0
        alphas.append(0)
        grid_scores.append(cross_val_score(EmpiricalCovariance(), X,
                                           cv=cv, n_jobs=self.n_jobs,
                                           verbose=inner_verbose))
        self.grid_scores_ = np.array(grid_scores)
        best_alpha = alphas[best_index]
        self.alpha_ = best_alpha
        self.cv_alphas_ = alphas

        # Finally fit the model with the selected alpha
        self.covariance_, self.precision_, self.n_iter_ = graphical_lasso(
            emp_cov, alpha=best_alpha, mode=self.mode, tol=self.tol,
            enet_tol=self.enet_tol, max_iter=self.max_iter,
            verbose=inner_verbose, return_n_iter=True)
        return self
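``graphical_lasso`` above minimizes the negative penalized log-likelihood, so larger ``alpha`` drives more off-diagonal entries of the estimated precision matrix to zero. A brief sketch (editor's illustration, synthetic data; not part of the committed diff):

# Sketch only: larger alpha -> more (near-)zero off-diagonal entries in the
# precision matrix returned by graphical_lasso.
import numpy as np
from sklearn.covariance import empirical_covariance, graphical_lasso

rng = np.random.RandomState(0)
true_cov = np.array([[0.8, 0.0, 0.2, 0.0],
                     [0.0, 0.4, 0.0, 0.0],
                     [0.2, 0.0, 0.3, 0.1],
                     [0.0, 0.0, 0.1, 0.7]])
X = rng.multivariate_normal(mean=np.zeros(4), cov=true_cov, size=200)
emp_cov = empirical_covariance(X)

for alpha in (0.01, 0.1, 0.5):
    _, precision = graphical_lasso(emp_cov, alpha=alpha)
    n_zeros = np.sum(np.abs(precision) < 1e-10)   # count (near-)zero entries
    print(alpha, n_zeros)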
762  venv/Lib/site-packages/sklearn/covariance/_robust_covariance.py  Normal file
@@ -0,0 +1,762 @@
"""
Robust location and covariance estimators.

Here are implemented estimators that are resistant to outliers.

"""
# Author: Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause

import warnings
import numbers
import numpy as np
from scipy import linalg
from scipy.stats import chi2

from . import empirical_covariance, EmpiricalCovariance
from ..utils.extmath import fast_logdet
from ..utils import check_random_state, check_array
from ..utils.validation import _deprecate_positional_args


# Minimum Covariance Determinant
#   Implementing of an algorithm by Rousseeuw & Van Driessen described in
#   (A Fast Algorithm for the Minimum Covariance Determinant Estimator,
#   1999, American Statistical Association and the American Society
#   for Quality, TECHNOMETRICS)
# XXX Is this really a public function? It's not listed in the docs or
# exported by sklearn.covariance. Deprecate?
def c_step(X, n_support, remaining_iterations=30, initial_estimates=None,
           verbose=False, cov_computation_method=empirical_covariance,
           random_state=None):
    """C_step procedure described in [Rouseeuw1984]_ aiming at computing MCD.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data set in which we look for the n_support observations whose
        scatter matrix has minimum determinant.

    n_support : int,
        Number of observations to compute the robust estimates of location
        and covariance from. This parameter must be greater than
        `n_samples / 2`.

    remaining_iterations : int, default=30
        Number of iterations to perform.
        According to [Rouseeuw1999]_, two iterations are sufficient to get
        close to the minimum, and we never need more than 30 to reach
        convergence.

    initial_estimates : tuple of shape (2,), default=None
        Initial estimates of location and shape from which to run the c_step
        procedure:
        - initial_estimates[0]: an initial location estimate
        - initial_estimates[1]: an initial covariance estimate

    verbose : bool, default=False
|
||||
Verbose mode.
|
||||
|
||||
cov_computation_method : callable, \
|
||||
default=:func:`sklearn.covariance.empirical_covariance`
|
||||
The function which will be used to compute the covariance.
|
||||
Must return array of shape (n_features, n_features).
|
||||
|
||||
random_state : int or RandomState instance, default=None
|
||||
Determines the pseudo random number generator for shuffling the data.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
location : ndarray of shape (n_features,)
|
||||
Robust location estimates.
|
||||
|
||||
covariance : ndarray of shape (n_features, n_features)
|
||||
Robust covariance estimates.
|
||||
|
||||
support : ndarray of shape (n_samples,)
|
||||
A mask for the `n_support` observations whose scatter matrix has
|
||||
minimum determinant.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [Rouseeuw1999] A Fast Algorithm for the Minimum Covariance Determinant
|
||||
Estimator, 1999, American Statistical Association and the American
|
||||
Society for Quality, TECHNOMETRICS
|
||||
"""
|
||||
X = np.asarray(X)
|
||||
random_state = check_random_state(random_state)
|
||||
return _c_step(X, n_support, remaining_iterations=remaining_iterations,
|
||||
initial_estimates=initial_estimates, verbose=verbose,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state)
|
||||
|
||||
|
||||
def _c_step(X, n_support, random_state, remaining_iterations=30,
|
||||
initial_estimates=None, verbose=False,
|
||||
cov_computation_method=empirical_covariance):
|
||||
n_samples, n_features = X.shape
|
||||
dist = np.inf
|
||||
|
||||
# Initialisation
|
||||
support = np.zeros(n_samples, dtype=bool)
|
||||
if initial_estimates is None:
|
||||
# compute initial robust estimates from a random subset
|
||||
support[random_state.permutation(n_samples)[:n_support]] = True
|
||||
else:
|
||||
# get initial robust estimates from the function parameters
|
||||
location = initial_estimates[0]
|
||||
covariance = initial_estimates[1]
|
||||
# run a special iteration for that case (to get an initial support)
|
||||
precision = linalg.pinvh(covariance)
|
||||
X_centered = X - location
|
||||
dist = (np.dot(X_centered, precision) * X_centered).sum(1)
|
||||
# compute new estimates
|
||||
support[np.argsort(dist)[:n_support]] = True
|
||||
|
||||
X_support = X[support]
|
||||
location = X_support.mean(0)
|
||||
covariance = cov_computation_method(X_support)
|
||||
|
||||
# Iterative procedure for Minimum Covariance Determinant computation
|
||||
det = fast_logdet(covariance)
|
||||
# If the data already has singular covariance, calculate the precision,
|
||||
# as the loop below will not be entered.
|
||||
if np.isinf(det):
|
||||
precision = linalg.pinvh(covariance)
|
||||
|
||||
previous_det = np.inf
|
||||
while (det < previous_det and remaining_iterations > 0
|
||||
and not np.isinf(det)):
|
||||
# save old estimates values
|
||||
previous_location = location
|
||||
previous_covariance = covariance
|
||||
previous_det = det
|
||||
previous_support = support
|
||||
# compute a new support from the full data set mahalanobis distances
|
||||
precision = linalg.pinvh(covariance)
|
||||
X_centered = X - location
|
||||
dist = (np.dot(X_centered, precision) * X_centered).sum(axis=1)
|
||||
# compute new estimates
|
||||
support = np.zeros(n_samples, dtype=bool)
|
||||
support[np.argsort(dist)[:n_support]] = True
|
||||
X_support = X[support]
|
||||
location = X_support.mean(axis=0)
|
||||
covariance = cov_computation_method(X_support)
|
||||
det = fast_logdet(covariance)
|
||||
# update remaining iterations for early stopping
|
||||
remaining_iterations -= 1
|
||||
|
||||
previous_dist = dist
|
||||
dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1)
|
||||
# Check if best fit already found (det -> 0, logdet -> -inf)
|
||||
if np.isinf(det):
|
||||
results = location, covariance, det, support, dist
|
||||
# Check convergence
|
||||
if np.allclose(det, previous_det):
|
||||
# c_step procedure converged
|
||||
if verbose:
|
||||
print("Optimal couple (location, covariance) found before"
|
||||
" ending iterations (%d left)" % (remaining_iterations))
|
||||
results = location, covariance, det, support, dist
|
||||
elif det > previous_det:
|
||||
# determinant has increased (should not happen)
|
||||
warnings.warn("Determinant has increased; this should not happen: "
|
||||
"log(det) > log(previous_det) (%.15f > %.15f). "
|
||||
"You may want to try with a higher value of "
|
||||
"support_fraction (current value: %.3f)."
|
||||
% (det, previous_det, n_support / n_samples),
|
||||
RuntimeWarning)
|
||||
results = previous_location, previous_covariance, \
|
||||
previous_det, previous_support, previous_dist
|
||||
|
||||
# Check early stopping
|
||||
if remaining_iterations == 0:
|
||||
if verbose:
|
||||
print('Maximum number of iterations reached')
|
||||
results = location, covariance, det, support, dist
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def select_candidates(X, n_support, n_trials, select=1, n_iter=30,
|
||||
verbose=False,
|
||||
cov_computation_method=empirical_covariance,
|
||||
random_state=None):
|
||||
"""Finds the best pure subset of observations to compute MCD from it.
|
||||
|
||||
The purpose of this function is to find the best sets of n_support
|
||||
observations with respect to a minimization of their covariance
|
||||
matrix determinant. Equivalently, it removes n_samples-n_support
|
||||
observations to construct what we call a pure data set (i.e. not
|
||||
containing outliers). The list of the observations of the pure
|
||||
data set is referred to as the `support`.
|
||||
|
||||
Starting from a random support, the pure data set is found by the
|
||||
c_step procedure introduced by Rousseeuw and Van Driessen in
|
||||
[RV]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Data (sub)set in which we look for the n_support purest observations.
|
||||
|
||||
n_support : int
|
||||
The number of samples the pure data set must contain.
|
||||
This parameter must be in the range `[(n + p + 1)/2] < n_support < n`.
|
||||
|
||||
n_trials : int or tuple of shape (2,)
|
||||
Number of different initial sets of observations from which to
|
||||
run the algorithm. This parameter should be a strictly positive
|
||||
integer.
|
||||
Instead of giving a number of trials to perform, one can provide a
|
||||
list of initial estimates that will be used to iteratively run
|
||||
c_step procedures. In this case:
|
||||
- n_trials[0]: array-like, shape (n_trials, n_features)
|
||||
is the list of `n_trials` initial location estimates
|
||||
- n_trials[1]: array-like, shape (n_trials, n_features, n_features)
|
||||
is the list of `n_trials` initial covariances estimates
|
||||
|
||||
select : int, default=1
|
||||
Number of best candidates results to return. This parameter must be
|
||||
a strictly positive integer.
|
||||
|
||||
n_iter : int, default=30
|
||||
Maximum number of iterations for the c_step procedure.
|
||||
(2 is usually enough to get close to the final solution; in practice it
rarely exceeds 20). This parameter must be a strictly positive integer.
|
||||
|
||||
|
||||
verbose : bool, default=False
|
||||
Control the output verbosity.
|
||||
|
||||
cov_computation_method : callable, \
|
||||
default=:func:`sklearn.covariance.empirical_covariance`
|
||||
The function which will be used to compute the covariance.
|
||||
Must return an array of shape (n_features, n_features).
|
||||
|
||||
random_state : int or RandomState instance, default=None
|
||||
Determines the pseudo random number generator for shuffling the data.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
c_step
|
||||
|
||||
Returns
|
||||
-------
|
||||
best_locations : ndarray of shape (select, n_features)
|
||||
The `select` location estimates computed from the `select` best
|
||||
supports found in the data set (`X`).
|
||||
|
||||
best_covariances : ndarray of shape (select, n_features, n_features)
|
||||
The `select` covariance estimates computed from the `select`
|
||||
best supports found in the data set (`X`).
|
||||
|
||||
best_supports : ndarray of shape (select, n_samples)
|
||||
The `select` best supports found in the data set (`X`).
|
||||
|
||||
References
|
||||
----------
|
||||
.. [RV] A Fast Algorithm for the Minimum Covariance Determinant
|
||||
Estimator, 1999, American Statistical Association and the American
|
||||
Society for Quality, TECHNOMETRICS
|
||||
"""
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
if isinstance(n_trials, numbers.Integral):
|
||||
run_from_estimates = False
|
||||
elif isinstance(n_trials, tuple):
|
||||
run_from_estimates = True
|
||||
estimates_list = n_trials
|
||||
n_trials = estimates_list[0].shape[0]
|
||||
else:
|
||||
raise TypeError("Invalid 'n_trials' parameter, expected tuple or "
|
||||
" integer, got %s (%s)" % (n_trials, type(n_trials)))
|
||||
|
||||
# compute `n_trials` location and shape estimates candidates in the subset
|
||||
all_estimates = []
|
||||
if not run_from_estimates:
|
||||
# perform `n_trials` computations from random initial supports
|
||||
for j in range(n_trials):
|
||||
all_estimates.append(
|
||||
_c_step(
|
||||
X, n_support, remaining_iterations=n_iter, verbose=verbose,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state))
|
||||
else:
|
||||
# perform computations from every given initial estimates
|
||||
for j in range(n_trials):
|
||||
initial_estimates = (estimates_list[0][j], estimates_list[1][j])
|
||||
all_estimates.append(_c_step(
|
||||
X, n_support, remaining_iterations=n_iter,
|
||||
initial_estimates=initial_estimates, verbose=verbose,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state))
|
||||
all_locs_sub, all_covs_sub, all_dets_sub, all_supports_sub, all_ds_sub = \
|
||||
zip(*all_estimates)
|
||||
# find the `n_best` best results among the `n_trials` ones
|
||||
index_best = np.argsort(all_dets_sub)[:select]
|
||||
best_locations = np.asarray(all_locs_sub)[index_best]
|
||||
best_covariances = np.asarray(all_covs_sub)[index_best]
|
||||
best_supports = np.asarray(all_supports_sub)[index_best]
|
||||
best_ds = np.asarray(all_ds_sub)[index_best]
|
||||
|
||||
return best_locations, best_covariances, best_supports, best_ds
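# Illustrative sketch (editor's addition): the restart-and-select idea used
# above, written with plain numpy. Several random initial supports are
# evaluated and the candidate with the smallest covariance log-determinant
# wins (the refining C-steps are omitted here for brevity). `n_trials_toy`
# and the other names are example names only.
import numpy as np

rng = np.random.RandomState(42)
X_toy = rng.randn(80, 2)
n_support, n_trials_toy = 50, 5

candidates = []
for _ in range(n_trials_toy):
    idx = rng.permutation(len(X_toy))[:n_support]
    loc = X_toy[idx].mean(axis=0)
    cov = np.cov(X_toy[idx], rowvar=False)
    candidates.append((np.linalg.slogdet(cov)[1], loc, cov))

best_logdet, best_loc, best_cov = min(candidates, key=lambda c: c[0])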
|
||||
|
||||
|
||||
def fast_mcd(X, support_fraction=None,
|
||||
cov_computation_method=empirical_covariance,
|
||||
random_state=None):
|
||||
"""Estimates the Minimum Covariance Determinant matrix.
|
||||
|
||||
Read more in the :ref:`User Guide <robust_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The data matrix, with p features and n samples.
|
||||
|
||||
support_fraction : float, default=None
|
||||
The proportion of points to be included in the support of the raw
|
||||
MCD estimate. Default is `None`, which implies that the minimum
|
||||
value of `support_fraction` will be used within the algorithm:
|
||||
`(n_sample + n_features + 1) / 2`. This parameter must be in the
|
||||
range (0, 1).
|
||||
|
||||
cov_computation_method : callable, \
|
||||
default=:func:`sklearn.covariance.empirical_covariance`
|
||||
The function which will be used to compute the covariance.
|
||||
Must return an array of shape (n_features, n_features).
|
||||
|
||||
random_state : int or RandomState instance, default=None
|
||||
Determines the pseudo random number generator for shuffling the data.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
location : ndarray of shape (n_features,)
|
||||
Robust location of the data.
|
||||
|
||||
covariance : ndarray of shape (n_features, n_features)
|
||||
Robust covariance of the features.
|
||||
|
||||
support : ndarray of shape (n_samples,), dtype=bool
|
||||
A mask of the observations that have been used to compute
|
||||
the robust location and covariance estimates of the data set.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The FastMCD algorithm has been introduced by Rousseeuw and Van Driessen
|
||||
in "A Fast Algorithm for the Minimum Covariance Determinant Estimator,
|
||||
1999, American Statistical Association and the American Society
|
||||
for Quality, TECHNOMETRICS".
|
||||
The principle is to compute robust estimates on random subsets before
pooling them into larger subsets, and finally into the full data set.
|
||||
Depending on the size of the initial sample, we have one, two or three
|
||||
such computation levels.
|
||||
|
||||
Note that only raw estimates are returned. If one is interested in
|
||||
the correction and reweighting steps described in [RouseeuwVan]_,
|
||||
see the MinCovDet object.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
.. [RouseeuwVan] A Fast Algorithm for the Minimum Covariance
|
||||
Determinant Estimator, 1999, American Statistical Association
|
||||
and the American Society for Quality, TECHNOMETRICS
|
||||
|
||||
.. [Butler1993] R. W. Butler, P. L. Davies and M. Jhun,
|
||||
Asymptotics For The Minimum Covariance Determinant Estimator,
|
||||
The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
|
||||
"""
|
||||
random_state = check_random_state(random_state)
|
||||
|
||||
X = check_array(X, ensure_min_samples=2, estimator='fast_mcd')
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
# minimum breakdown value
|
||||
if support_fraction is None:
|
||||
n_support = int(np.ceil(0.5 * (n_samples + n_features + 1)))
|
||||
else:
|
||||
n_support = int(support_fraction * n_samples)
|
||||
|
||||
# 1-dimensional case quick computation
|
||||
# (Rousseeuw, P. J. and Leroy, A. M. (2005) References, in Robust
|
||||
# Regression and Outlier Detection, John Wiley & Sons, chapter 4)
|
||||
if n_features == 1:
|
||||
if n_support < n_samples:
|
||||
# find the sample shortest halves
|
||||
X_sorted = np.sort(np.ravel(X))
|
||||
diff = X_sorted[n_support:] - X_sorted[:(n_samples - n_support)]
|
||||
halves_start = np.where(diff == np.min(diff))[0]
|
||||
# take the middle points' mean to get the robust location estimate
|
||||
location = 0.5 * (X_sorted[n_support + halves_start] +
|
||||
X_sorted[halves_start]).mean()
|
||||
support = np.zeros(n_samples, dtype=bool)
|
||||
X_centered = X - location
|
||||
support[np.argsort(np.abs(X_centered), 0)[:n_support]] = True
|
||||
covariance = np.asarray([[np.var(X[support])]])
|
||||
location = np.array([location])
|
||||
# get precision matrix in an optimized way
|
||||
precision = linalg.pinvh(covariance)
|
||||
dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
|
||||
else:
|
||||
support = np.ones(n_samples, dtype=bool)
|
||||
covariance = np.asarray([[np.var(X)]])
|
||||
location = np.asarray([np.mean(X)])
|
||||
X_centered = X - location
|
||||
# get precision matrix in an optimized way
|
||||
precision = linalg.pinvh(covariance)
|
||||
dist = (np.dot(X_centered, precision) * (X_centered)).sum(axis=1)
|
||||
# Starting FastMCD algorithm for p-dimensional case
|
||||
if (n_samples > 500) and (n_features > 1):
|
||||
# 1. Find candidate supports on subsets
|
||||
# a. split the set in subsets of size ~ 300
|
||||
n_subsets = n_samples // 300
|
||||
n_samples_subsets = n_samples // n_subsets
|
||||
samples_shuffle = random_state.permutation(n_samples)
|
||||
h_subset = int(np.ceil(n_samples_subsets *
|
||||
(n_support / float(n_samples))))
|
||||
# b. perform a total of 500 trials
|
||||
n_trials_tot = 500
|
||||
# c. select 10 best (location, covariance) for each subset
|
||||
n_best_sub = 10
|
||||
n_trials = max(10, n_trials_tot // n_subsets)
|
||||
n_best_tot = n_subsets * n_best_sub
|
||||
all_best_locations = np.zeros((n_best_tot, n_features))
|
||||
try:
|
||||
all_best_covariances = np.zeros((n_best_tot, n_features,
|
||||
n_features))
|
||||
except MemoryError:
|
||||
# The above is too big. Let's try with something much smaller
|
||||
# (and less optimal)
|
||||
n_best_tot = 10
|
||||
all_best_covariances = np.zeros((n_best_tot, n_features,
|
||||
n_features))
|
||||
n_best_sub = 2
|
||||
for i in range(n_subsets):
|
||||
low_bound = i * n_samples_subsets
|
||||
high_bound = low_bound + n_samples_subsets
|
||||
current_subset = X[samples_shuffle[low_bound:high_bound]]
|
||||
best_locations_sub, best_covariances_sub, _, _ = select_candidates(
|
||||
current_subset, h_subset, n_trials,
|
||||
select=n_best_sub, n_iter=2,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state)
|
||||
subset_slice = np.arange(i * n_best_sub, (i + 1) * n_best_sub)
|
||||
all_best_locations[subset_slice] = best_locations_sub
|
||||
all_best_covariances[subset_slice] = best_covariances_sub
|
||||
# 2. Pool the candidate supports into a merged set
|
||||
# (possibly the full dataset)
|
||||
n_samples_merged = min(1500, n_samples)
|
||||
h_merged = int(np.ceil(n_samples_merged *
|
||||
(n_support / float(n_samples))))
|
||||
if n_samples > 1500:
|
||||
n_best_merged = 10
|
||||
else:
|
||||
n_best_merged = 1
|
||||
# find the best couples (location, covariance) on the merged set
|
||||
selection = random_state.permutation(n_samples)[:n_samples_merged]
|
||||
locations_merged, covariances_merged, supports_merged, d = \
|
||||
select_candidates(
|
||||
X[selection], h_merged,
|
||||
n_trials=(all_best_locations, all_best_covariances),
|
||||
select=n_best_merged,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state)
|
||||
# 3. Finally get the overall best (locations, covariance) couple
|
||||
if n_samples < 1500:
|
||||
# directly get the best couple (location, covariance)
|
||||
location = locations_merged[0]
|
||||
covariance = covariances_merged[0]
|
||||
support = np.zeros(n_samples, dtype=bool)
|
||||
dist = np.zeros(n_samples)
|
||||
support[selection] = supports_merged[0]
|
||||
dist[selection] = d[0]
|
||||
else:
|
||||
# select the best couple on the full dataset
|
||||
locations_full, covariances_full, supports_full, d = \
|
||||
select_candidates(
|
||||
X, n_support,
|
||||
n_trials=(locations_merged, covariances_merged),
|
||||
select=1,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state)
|
||||
location = locations_full[0]
|
||||
covariance = covariances_full[0]
|
||||
support = supports_full[0]
|
||||
dist = d[0]
|
||||
elif n_features > 1:
|
||||
# 1. Find the 10 best couples (location, covariance)
|
||||
# considering two iterations
|
||||
n_trials = 30
|
||||
n_best = 10
|
||||
locations_best, covariances_best, _, _ = select_candidates(
|
||||
X, n_support, n_trials=n_trials, select=n_best, n_iter=2,
|
||||
cov_computation_method=cov_computation_method,
|
||||
random_state=random_state)
|
||||
# 2. Select the best couple on the full dataset amongst the 10
|
||||
locations_full, covariances_full, supports_full, d = select_candidates(
|
||||
X, n_support, n_trials=(locations_best, covariances_best),
|
||||
select=1, cov_computation_method=cov_computation_method,
|
||||
random_state=random_state)
|
||||
location = locations_full[0]
|
||||
covariance = covariances_full[0]
|
||||
support = supports_full[0]
|
||||
dist = d[0]
|
||||
|
||||
return location, covariance, support, dist
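# Illustrative sketch (editor's addition): calling the public `fast_mcd`
# helper on toy data with a few injected outliers. The returned `support`
# mask should mostly exclude the outlying rows; exact values depend on the
# random_state. `X_toy` and related names are example names only.
import numpy as np
from sklearn.covariance import fast_mcd

rng = np.random.RandomState(0)
X_inliers = rng.multivariate_normal([0, 0], [[1., .3], [.3, 1.]], size=200)
X_outliers = rng.uniform(low=5, high=10, size=(20, 2))
X_toy = np.vstack([X_inliers, X_outliers])

location, covariance, support, dist = fast_mcd(X_toy, random_state=0)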
|
||||
|
||||
|
||||
class MinCovDet(EmpiricalCovariance):
|
||||
"""Minimum Covariance Determinant (MCD): robust estimator of covariance.
|
||||
|
||||
The Minimum Covariance Determinant covariance estimator is to be applied
|
||||
on Gaussian-distributed data, but could still be relevant on data
|
||||
drawn from a unimodal, symmetric distribution. It is not meant to be used
|
||||
with multi-modal data (the algorithm used to fit a MinCovDet object is
|
||||
likely to fail in such a case).
|
||||
One should consider projection pursuit methods to deal with multi-modal
|
||||
datasets.
|
||||
|
||||
Read more in the :ref:`User Guide <robust_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
store_precision : bool, default=True
|
||||
Specify if the estimated precision is stored.
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, the support of the robust location and the covariance
|
||||
estimates is computed, and a covariance estimate is recomputed from
|
||||
it, without centering the data.
|
||||
Useful to work with data whose mean is significantly equal to
|
||||
zero but is not exactly zero.
|
||||
If False, the robust location and covariance are directly computed
|
||||
with the FastMCD algorithm without additional treatment.
|
||||
|
||||
support_fraction : float, default=None
|
||||
The proportion of points to be included in the support of the raw
|
||||
MCD estimate. Default is None, which implies that the minimum
|
||||
value of support_fraction will be used within the algorithm:
|
||||
`(n_sample + n_features + 1) / 2`. The parameter must be in the range
|
||||
(0, 1).
|
||||
|
||||
random_state : int or RandomState instance, default=None
|
||||
Determines the pseudo random number generator for shuffling the data.
|
||||
Pass an int for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
raw_location_ : ndarray of shape (n_features,)
|
||||
The raw robust estimated location before correction and re-weighting.
|
||||
|
||||
raw_covariance_ : ndarray of shape (n_features, n_features)
|
||||
The raw robust estimated covariance before correction and re-weighting.
|
||||
|
||||
raw_support_ : ndarray of shape (n_samples,)
|
||||
A mask of the observations that have been used to compute
|
||||
the raw robust estimates of location and shape, before correction
|
||||
and re-weighting.
|
||||
|
||||
location_ : ndarray of shape (n_features,)
|
||||
Estimated robust location.
|
||||
|
||||
covariance_ : ndarray of shape (n_features, n_features)
|
||||
Estimated robust covariance matrix.
|
||||
|
||||
precision_ : ndarray of shape (n_features, n_features)
|
||||
Estimated pseudo inverse matrix.
|
||||
(stored only if store_precision is True)
|
||||
|
||||
support_ : ndarray of shape (n_samples,)
|
||||
A mask of the observations that have been used to compute
|
||||
the robust estimates of location and shape.
|
||||
|
||||
dist_ : ndarray of shape (n_samples,)
|
||||
Mahalanobis distances of the training set (on which :meth:`fit` is
|
||||
called) observations.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.covariance import MinCovDet
|
||||
>>> from sklearn.datasets import make_gaussian_quantiles
|
||||
>>> real_cov = np.array([[.8, .3],
|
||||
... [.3, .4]])
|
||||
>>> rng = np.random.RandomState(0)
|
||||
>>> X = rng.multivariate_normal(mean=[0, 0],
|
||||
... cov=real_cov,
|
||||
... size=500)
|
||||
>>> cov = MinCovDet(random_state=0).fit(X)
|
||||
>>> cov.covariance_
|
||||
array([[0.7411..., 0.2535...],
|
||||
[0.2535..., 0.3053...]])
|
||||
>>> cov.location_
|
||||
array([0.0813... , 0.0427...])
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
.. [Rouseeuw1984] P. J. Rousseeuw. Least median of squares regression.
|
||||
J. Am Stat Ass, 79:871, 1984.
|
||||
.. [Rousseeuw] A Fast Algorithm for the Minimum Covariance Determinant
|
||||
Estimator, 1999, American Statistical Association and the American
|
||||
Society for Quality, TECHNOMETRICS
|
||||
.. [ButlerDavies] R. W. Butler, P. L. Davies and M. Jhun,
|
||||
Asymptotics For The Minimum Covariance Determinant Estimator,
|
||||
The Annals of Statistics, 1993, Vol. 21, No. 3, 1385-1400
|
||||
"""
|
||||
_nonrobust_covariance = staticmethod(empirical_covariance)
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, store_precision=True, assume_centered=False,
|
||||
support_fraction=None, random_state=None):
|
||||
self.store_precision = store_precision
|
||||
self.assume_centered = assume_centered
|
||||
self.support_fraction = support_fraction
|
||||
self.random_state = random_state
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fits a Minimum Covariance Determinant with the FastMCD algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training data, where `n_samples` is the number of samples
|
||||
and `n_features` is the number of features.
|
||||
|
||||
y : Ignored
Not used, present for API consistency.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
"""
|
||||
X = self._validate_data(X, ensure_min_samples=2, estimator='MinCovDet')
|
||||
random_state = check_random_state(self.random_state)
|
||||
n_samples, n_features = X.shape
|
||||
# check that the empirical covariance is full rank
|
||||
if (linalg.svdvals(np.dot(X.T, X)) > 1e-8).sum() != n_features:
|
||||
warnings.warn("The covariance matrix associated to your dataset "
|
||||
"is not full rank")
|
||||
# compute and store raw estimates
|
||||
raw_location, raw_covariance, raw_support, raw_dist = fast_mcd(
|
||||
X, support_fraction=self.support_fraction,
|
||||
cov_computation_method=self._nonrobust_covariance,
|
||||
random_state=random_state)
|
||||
if self.assume_centered:
|
||||
raw_location = np.zeros(n_features)
|
||||
raw_covariance = self._nonrobust_covariance(X[raw_support],
|
||||
assume_centered=True)
|
||||
# get precision matrix in an optimized way
|
||||
precision = linalg.pinvh(raw_covariance)
|
||||
raw_dist = np.sum(np.dot(X, precision) * X, 1)
|
||||
self.raw_location_ = raw_location
|
||||
self.raw_covariance_ = raw_covariance
|
||||
self.raw_support_ = raw_support
|
||||
self.location_ = raw_location
|
||||
self.support_ = raw_support
|
||||
self.dist_ = raw_dist
|
||||
# obtain consistency at normal models
|
||||
self.correct_covariance(X)
|
||||
# re-weight estimator
|
||||
self.reweight_covariance(X)
|
||||
|
||||
return self
|
||||
|
||||
def correct_covariance(self, data):
|
||||
"""Apply a correction to raw Minimum Covariance Determinant estimates.
|
||||
|
||||
Correction using the empirical correction factor suggested
|
||||
by Rousseeuw and Van Driessen in [RVD]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like of shape (n_samples, n_features)
|
||||
The data matrix, with p features and n samples.
|
||||
The data set must be the one which was used to compute
|
||||
the raw estimates.
|
||||
|
||||
Returns
|
||||
-------
|
||||
covariance_corrected : ndarray of shape (n_features, n_features)
|
||||
Corrected robust covariance estimate.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
.. [RVD] A Fast Algorithm for the Minimum Covariance
|
||||
Determinant Estimator, 1999, American Statistical Association
|
||||
and the American Society for Quality, TECHNOMETRICS
|
||||
"""
|
||||
|
||||
# Check that the covariance of the support data is not equal to 0.
|
||||
# Otherwise self.dist_ = 0 and thus correction = 0.
|
||||
n_samples = len(self.dist_)
|
||||
n_support = np.sum(self.support_)
|
||||
if n_support < n_samples and np.allclose(self.raw_covariance_, 0):
|
||||
raise ValueError('The covariance matrix of the support data '
|
||||
'is equal to 0, try to increase support_fraction')
|
||||
correction = np.median(self.dist_) / chi2(data.shape[1]).isf(0.5)
|
||||
covariance_corrected = self.raw_covariance_ * correction
|
||||
self.dist_ /= correction
|
||||
return covariance_corrected
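# Illustrative sketch (editor's addition): the consistency factor used above,
# spelled out with scipy. Given raw squared Mahalanobis distances of the
# training points, the raw covariance is rescaled by
# median(raw_dist) / chi2(n_features).isf(0.5) so that it is consistent at
# the Gaussian model. The values below are toy numbers, not library output.
import numpy as np
from scipy.stats import chi2

n_features = 2
raw_dist = np.array([0.5, 1.2, 0.8, 3.0, 0.9, 1.1])    # toy distances
raw_covariance = np.array([[.9, .2], [.2, .5]])         # toy raw estimate

correction = np.median(raw_dist) / chi2(n_features).isf(0.5)
covariance_corrected = raw_covariance * correction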
|
||||
|
||||
def reweight_covariance(self, data):
|
||||
"""Re-weight raw Minimum Covariance Determinant estimates.
|
||||
|
||||
Re-weight observations using Rousseeuw's method (equivalent to
|
||||
deleting outlying observations from the data set before
|
||||
computing location and covariance estimates) described
|
||||
in [RVDriessen]_.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : array-like of shape (n_samples, n_features)
|
||||
The data matrix, with p features and n samples.
|
||||
The data set must be the one which was used to compute
|
||||
the raw estimates.
|
||||
|
||||
Returns
|
||||
-------
|
||||
location_reweighted : ndarray of shape (n_features,)
|
||||
Re-weighted robust location estimate.
|
||||
|
||||
covariance_reweighted : ndarray of shape (n_features, n_features)
|
||||
Re-weighted robust covariance estimate.
|
||||
|
||||
support_reweighted : ndarray of shape (n_samples,), dtype=bool
|
||||
A mask of the observations that have been used to compute
|
||||
the re-weighted robust location and covariance estimates.
|
||||
|
||||
References
|
||||
----------
|
||||
|
||||
.. [RVDriessen] A Fast Algorithm for the Minimum Covariance
|
||||
Determinant Estimator, 1999, American Statistical Association
|
||||
and the American Society for Quality, TECHNOMETRICS
|
||||
"""
|
||||
n_samples, n_features = data.shape
|
||||
mask = self.dist_ < chi2(n_features).isf(0.025)
|
||||
if self.assume_centered:
|
||||
location_reweighted = np.zeros(n_features)
|
||||
else:
|
||||
location_reweighted = data[mask].mean(0)
|
||||
covariance_reweighted = self._nonrobust_covariance(
|
||||
data[mask], assume_centered=self.assume_centered)
|
||||
support_reweighted = np.zeros(n_samples, dtype=bool)
|
||||
support_reweighted[mask] = True
|
||||
self._set_covariance(covariance_reweighted)
|
||||
self.location_ = location_reweighted
|
||||
self.support_ = support_reweighted
|
||||
X_centered = data - self.location_
|
||||
self.dist_ = np.sum(
|
||||
np.dot(X_centered, self.get_precision()) * X_centered, 1)
|
||||
return location_reweighted, covariance_reweighted, support_reweighted
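# Illustrative sketch (editor's addition): the re-weighting rule used above.
# Observations whose squared Mahalanobis distance exceeds the 97.5% quantile
# of a chi-squared distribution with n_features degrees of freedom are
# dropped before recomputing location and covariance. Toy values only.
import numpy as np
from scipy.stats import chi2

n_features = 2
dist = np.array([0.4, 1.0, 7.8, 0.6, 12.3, 1.5])        # toy distances
X_toy = np.arange(12, dtype=float).reshape(6, 2)         # toy data

mask = dist < chi2(n_features).isf(0.025)
location_reweighted = X_toy[mask].mean(axis=0)
covariance_reweighted = np.cov(X_toy[mask], rowvar=False)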
|
605
venv/Lib/site-packages/sklearn/covariance/_shrunk_covariance.py
Normal file
605
venv/Lib/site-packages/sklearn/covariance/_shrunk_covariance.py
Normal file
|
@@ -0,0 +1,605 @@
|
|||
"""
|
||||
Covariance estimators using shrinkage.
|
||||
|
||||
Shrinkage corresponds to regularising `cov` using a convex combination:
|
||||
shrunk_cov = (1-shrinkage)*cov + shrinkage*structured_estimate.
|
||||
|
||||
"""
|
||||
|
||||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Virgile Fritsch <virgile.fritsch@inria.fr>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
# avoid division truncation
|
||||
import warnings
|
||||
import numpy as np
|
||||
|
||||
from . import empirical_covariance, EmpiricalCovariance
|
||||
from ..utils import check_array
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
# ShrunkCovariance estimator
|
||||
|
||||
def shrunk_covariance(emp_cov, shrinkage=0.1):
|
||||
"""Calculates a covariance matrix shrunk on the diagonal
|
||||
|
||||
Read more in the :ref:`User Guide <shrunk_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
emp_cov : array-like of shape (n_features, n_features)
|
||||
Covariance matrix to be shrunk
|
||||
|
||||
shrinkage : float, default=0.1
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate. Range is [0, 1].
|
||||
|
||||
Returns
|
||||
-------
|
||||
shrunk_cov : ndarray of shape (n_features, n_features)
|
||||
Shrunk covariance.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularized (shrunk) covariance is given by:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
"""
|
||||
emp_cov = check_array(emp_cov)
|
||||
n_features = emp_cov.shape[0]
|
||||
|
||||
mu = np.trace(emp_cov) / n_features
|
||||
shrunk_cov = (1. - shrinkage) * emp_cov
|
||||
shrunk_cov.flat[::n_features + 1] += shrinkage * mu
|
||||
|
||||
return shrunk_cov
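# Illustrative sketch (editor's addition): applying `shrunk_covariance` to an
# empirical covariance matrix. With shrinkage=0 the input is returned
# unchanged; with shrinkage=1 the result is mu * identity. `X_toy` is an
# example name only.
import numpy as np
from sklearn.covariance import empirical_covariance, shrunk_covariance

rng = np.random.RandomState(0)
X_toy = rng.multivariate_normal([0, 0], [[.8, .3], [.3, .4]], size=200)

emp_cov = empirical_covariance(X_toy)
shrunk = shrunk_covariance(emp_cov, shrinkage=0.3)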
|
||||
|
||||
|
||||
class ShrunkCovariance(EmpiricalCovariance):
|
||||
"""Covariance estimator with shrinkage
|
||||
|
||||
Read more in the :ref:`User Guide <shrunk_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
store_precision : bool, default=True
|
||||
Specify if the estimated precision is stored
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, data will not be centered before computation.
|
||||
Useful when working with data whose mean is almost, but not exactly
|
||||
zero.
|
||||
If False, data will be centered before computation.
|
||||
|
||||
shrinkage : float, default=0.1
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate. Range is [0, 1].
|
||||
|
||||
Attributes
|
||||
----------
|
||||
covariance_ : ndarray of shape (n_features, n_features)
|
||||
Estimated covariance matrix
|
||||
|
||||
location_ : ndarray of shape (n_features,)
|
||||
Estimated location, i.e. the estimated mean.
|
||||
|
||||
precision_ : ndarray of shape (n_features, n_features)
|
||||
Estimated pseudo inverse matrix.
|
||||
(stored only if store_precision is True)
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.covariance import ShrunkCovariance
|
||||
>>> from sklearn.datasets import make_gaussian_quantiles
|
||||
>>> real_cov = np.array([[.8, .3],
|
||||
... [.3, .4]])
|
||||
>>> rng = np.random.RandomState(0)
|
||||
>>> X = rng.multivariate_normal(mean=[0, 0],
|
||||
... cov=real_cov,
|
||||
... size=500)
|
||||
>>> cov = ShrunkCovariance().fit(X)
|
||||
>>> cov.covariance_
|
||||
array([[0.7387..., 0.2536...],
|
||||
[0.2536..., 0.4110...]])
|
||||
>>> cov.location_
|
||||
array([0.0622..., 0.0193...])
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularized covariance is given by:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, store_precision=True, assume_centered=False,
|
||||
shrinkage=0.1):
|
||||
super().__init__(store_precision=store_precision,
|
||||
assume_centered=assume_centered)
|
||||
self.shrinkage = shrinkage
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the shrunk covariance model according to the given training data
|
||||
and parameters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training data, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
|
||||
y : Ignored
Not used, present for API consistency.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
"""
|
||||
X = self._validate_data(X)
|
||||
# Not calling the parent object to fit, to avoid a potential
|
||||
# matrix inversion when setting the precision
|
||||
if self.assume_centered:
|
||||
self.location_ = np.zeros(X.shape[1])
|
||||
else:
|
||||
self.location_ = X.mean(0)
|
||||
covariance = empirical_covariance(
|
||||
X, assume_centered=self.assume_centered)
|
||||
covariance = shrunk_covariance(covariance, self.shrinkage)
|
||||
self._set_covariance(covariance)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
# Ledoit-Wolf estimator
|
||||
|
||||
def ledoit_wolf_shrinkage(X, assume_centered=False, block_size=1000):
|
||||
"""Estimates the shrunk Ledoit-Wolf covariance matrix.
|
||||
|
||||
Read more in the :ref:`User Guide <shrunk_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Data from which to compute the Ledoit-Wolf shrinkage coefficient.
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, data will not be centered before computation.
|
||||
Useful to work with data whose mean is significantly equal to
|
||||
zero but is not exactly zero.
|
||||
If False, data will be centered before computation.
|
||||
|
||||
block_size : int, default=1000
|
||||
Size of the blocks into which the covariance matrix will be split.
|
||||
|
||||
Returns
|
||||
-------
|
||||
shrinkage : float
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularized (shrunk) covariance is:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
"""
|
||||
X = np.asarray(X)
|
||||
# for only one feature, the result is the same whatever the shrinkage
|
||||
if len(X.shape) == 2 and X.shape[1] == 1:
|
||||
return 0.
|
||||
if X.ndim == 1:
|
||||
X = np.reshape(X, (1, -1))
|
||||
|
||||
if X.shape[0] == 1:
|
||||
warnings.warn("Only one sample available. "
|
||||
"You may want to reshape your data array")
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
# optionally center data
|
||||
if not assume_centered:
|
||||
X = X - X.mean(0)
|
||||
|
||||
# A non-blocked version of the computation is present in the tests
|
||||
# in tests/test_covariance.py
|
||||
|
||||
# number of blocks to split the covariance matrix into
|
||||
n_splits = int(n_features / block_size)
|
||||
X2 = X ** 2
|
||||
emp_cov_trace = np.sum(X2, axis=0) / n_samples
|
||||
mu = np.sum(emp_cov_trace) / n_features
|
||||
beta_ = 0. # sum of the coefficients of <X2.T, X2>
|
||||
delta_ = 0. # sum of the *squared* coefficients of <X.T, X>
|
||||
# starting block computation
|
||||
for i in range(n_splits):
|
||||
for j in range(n_splits):
|
||||
rows = slice(block_size * i, block_size * (i + 1))
|
||||
cols = slice(block_size * j, block_size * (j + 1))
|
||||
beta_ += np.sum(np.dot(X2.T[rows], X2[:, cols]))
|
||||
delta_ += np.sum(np.dot(X.T[rows], X[:, cols]) ** 2)
|
||||
rows = slice(block_size * i, block_size * (i + 1))
|
||||
beta_ += np.sum(np.dot(X2.T[rows], X2[:, block_size * n_splits:]))
|
||||
delta_ += np.sum(
|
||||
np.dot(X.T[rows], X[:, block_size * n_splits:]) ** 2)
|
||||
for j in range(n_splits):
|
||||
cols = slice(block_size * j, block_size * (j + 1))
|
||||
beta_ += np.sum(np.dot(X2.T[block_size * n_splits:], X2[:, cols]))
|
||||
delta_ += np.sum(
|
||||
np.dot(X.T[block_size * n_splits:], X[:, cols]) ** 2)
|
||||
delta_ += np.sum(np.dot(X.T[block_size * n_splits:],
|
||||
X[:, block_size * n_splits:]) ** 2)
|
||||
delta_ /= n_samples ** 2
|
||||
beta_ += np.sum(np.dot(X2.T[block_size * n_splits:],
|
||||
X2[:, block_size * n_splits:]))
|
||||
# use delta_ to compute beta
|
||||
beta = 1. / (n_features * n_samples) * (beta_ / n_samples - delta_)
|
||||
# delta is the sum of the squared coefficients of (<X.T,X> - mu*Id) / p
|
||||
delta = delta_ - 2. * mu * emp_cov_trace.sum() + n_features * mu ** 2
|
||||
delta /= n_features
|
||||
# get final beta as the min between beta and delta
|
||||
# We do this to prevent shrinking more than "1", which would invert
|
||||
# the value of covariances
|
||||
beta = min(beta, delta)
|
||||
# finally get shrinkage
|
||||
shrinkage = 0 if beta == 0 else beta / delta
|
||||
return shrinkage
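# Illustrative sketch (editor's addition): the shrinkage coefficient returned
# above plugs into the usual convex combination. The result should match what
# the `ledoit_wolf` helper computes directly. `X_toy` is an example name only.
import numpy as np
from sklearn.covariance import (empirical_covariance, ledoit_wolf_shrinkage,
                                shrunk_covariance)

rng = np.random.RandomState(0)
X_toy = rng.multivariate_normal([0, 0], [[.4, .2], [.2, .8]], size=60)

shrinkage = ledoit_wolf_shrinkage(X_toy)
lw_cov = shrunk_covariance(empirical_covariance(X_toy), shrinkage)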
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def ledoit_wolf(X, *, assume_centered=False, block_size=1000):
|
||||
"""Estimates the shrunk Ledoit-Wolf covariance matrix.
|
||||
|
||||
Read more in the :ref:`User Guide <shrunk_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Data from which to compute the covariance estimate
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, data will not be centered before computation.
|
||||
Useful to work with data whose mean is significantly equal to
|
||||
zero but is not exactly zero.
|
||||
If False, data will be centered before computation.
|
||||
|
||||
block_size : int, default=1000
|
||||
Size of the blocks into which the covariance matrix will be split.
|
||||
This is purely a memory optimization and does not affect results.
|
||||
|
||||
Returns
|
||||
-------
|
||||
shrunk_cov : ndarray of shape (n_features, n_features)
|
||||
Shrunk covariance.
|
||||
|
||||
shrinkage : float
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularized (shrunk) covariance is:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
"""
|
||||
X = np.asarray(X)
|
||||
# for only one feature, the result is the same whatever the shrinkage
|
||||
if len(X.shape) == 2 and X.shape[1] == 1:
|
||||
if not assume_centered:
|
||||
X = X - X.mean()
|
||||
return np.atleast_2d((X ** 2).mean()), 0.
|
||||
if X.ndim == 1:
|
||||
X = np.reshape(X, (1, -1))
|
||||
warnings.warn("Only one sample available. "
|
||||
"You may want to reshape your data array")
|
||||
n_features = X.size
|
||||
else:
|
||||
_, n_features = X.shape
|
||||
|
||||
# get Ledoit-Wolf shrinkage
|
||||
shrinkage = ledoit_wolf_shrinkage(
|
||||
X, assume_centered=assume_centered, block_size=block_size)
|
||||
emp_cov = empirical_covariance(X, assume_centered=assume_centered)
|
||||
mu = np.sum(np.trace(emp_cov)) / n_features
|
||||
shrunk_cov = (1. - shrinkage) * emp_cov
|
||||
shrunk_cov.flat[::n_features + 1] += shrinkage * mu
|
||||
|
||||
return shrunk_cov, shrinkage
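# Illustrative sketch (editor's addition): the functional interface above and
# the LedoitWolf estimator are expected to agree on the same data. Example
# names only; values depend on the toy data.
import numpy as np
from sklearn.covariance import LedoitWolf, ledoit_wolf

rng = np.random.RandomState(0)
X_toy = rng.multivariate_normal([0, 0], [[.4, .2], [.2, .8]], size=50)

cov_fn, shrinkage_fn = ledoit_wolf(X_toy)
est = LedoitWolf().fit(X_toy)
# est.covariance_ and cov_fn should be very close, as should
# est.shrinkage_ and shrinkage_fn.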
|
||||
|
||||
|
||||
class LedoitWolf(EmpiricalCovariance):
|
||||
"""LedoitWolf Estimator
|
||||
|
||||
Ledoit-Wolf is a particular form of shrinkage, where the shrinkage
|
||||
coefficient is computed using O. Ledoit and M. Wolf's formula as
|
||||
described in "A Well-Conditioned Estimator for Large-Dimensional
|
||||
Covariance Matrices", Ledoit and Wolf, Journal of Multivariate
|
||||
Analysis, Volume 88, Issue 2, February 2004, pages 365-411.
|
||||
|
||||
Read more in the :ref:`User Guide <shrunk_covariance>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
store_precision : bool, default=True
|
||||
Specify if the estimated precision is stored.
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, data will not be centered before computation.
|
||||
Useful when working with data whose mean is almost, but not exactly
|
||||
zero.
|
||||
If False (default), data will be centered before computation.
|
||||
|
||||
block_size : int, default=1000
|
||||
Size of the blocks into which the covariance matrix will be split
|
||||
during its Ledoit-Wolf estimation. This is purely a memory
|
||||
optimization and does not affect results.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
covariance_ : ndarray of shape (n_features, n_features)
|
||||
Estimated covariance matrix.
|
||||
|
||||
location_ : ndarray of shape (n_features,)
|
||||
Estimated location, i.e. the estimated mean.
|
||||
|
||||
precision_ : ndarray of shape (n_features, n_features)
|
||||
Estimated pseudo inverse matrix.
|
||||
(stored only if store_precision is True)
|
||||
|
||||
shrinkage_ : float
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate. Range is [0, 1].
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.covariance import LedoitWolf
|
||||
>>> real_cov = np.array([[.4, .2],
|
||||
... [.2, .8]])
|
||||
>>> np.random.seed(0)
|
||||
>>> X = np.random.multivariate_normal(mean=[0, 0],
|
||||
... cov=real_cov,
|
||||
... size=50)
|
||||
>>> cov = LedoitWolf().fit(X)
|
||||
>>> cov.covariance_
|
||||
array([[0.4406..., 0.1616...],
|
||||
[0.1616..., 0.8022...]])
|
||||
>>> cov.location_
|
||||
array([ 0.0595... , -0.0075...])
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularised covariance is:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
and shrinkage is given by the Ledoit and Wolf formula (see References)
|
||||
|
||||
References
|
||||
----------
|
||||
"A Well-Conditioned Estimator for Large-Dimensional Covariance Matrices",
|
||||
Ledoit and Wolf, Journal of Multivariate Analysis, Volume 88, Issue 2,
|
||||
February 2004, pages 365-411.
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, store_precision=True, assume_centered=False,
|
||||
block_size=1000):
|
||||
super().__init__(store_precision=store_precision,
|
||||
assume_centered=assume_centered)
|
||||
self.block_size = block_size
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the Ledoit-Wolf shrunk covariance model according to the given
|
||||
training data and parameters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training data, where `n_samples` is the number of samples
|
||||
and `n_features` is the number of features.
|
||||
y : Ignored
|
||||
Not used, present for API consistency.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
"""
|
||||
# Not calling the parent object to fit, to avoid computing the
|
||||
# covariance matrix (and potentially the precision)
|
||||
X = self._validate_data(X)
|
||||
if self.assume_centered:
|
||||
self.location_ = np.zeros(X.shape[1])
|
||||
else:
|
||||
self.location_ = X.mean(0)
|
||||
covariance, shrinkage = ledoit_wolf(X - self.location_,
|
||||
assume_centered=True,
|
||||
block_size=self.block_size)
|
||||
self.shrinkage_ = shrinkage
|
||||
self._set_covariance(covariance)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
# OAS estimator
|
||||
@_deprecate_positional_args
|
||||
def oas(X, *, assume_centered=False):
|
||||
"""Estimate covariance with the Oracle Approximating Shrinkage algorithm.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Data from which to compute the covariance estimate.
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, data will not be centered before computation.
|
||||
Useful to work with data whose mean is significantly equal to
|
||||
zero but is not exactly zero.
|
||||
If False, data will be centered before computation.
|
||||
|
||||
Returns
|
||||
-------
|
||||
shrunk_cov : array-like of shape (n_features, n_features)
|
||||
Shrunk covariance.
|
||||
|
||||
shrinkage : float
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate.
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularised (shrunk) covariance is:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
|
||||
The formula we used to implement the OAS is slightly modified compared
|
||||
to the one given in the article. See :class:`OAS` for more details.
|
||||
"""
|
||||
X = np.asarray(X)
|
||||
# for only one feature, the result is the same whatever the shrinkage
|
||||
if len(X.shape) == 2 and X.shape[1] == 1:
|
||||
if not assume_centered:
|
||||
X = X - X.mean()
|
||||
return np.atleast_2d((X ** 2).mean()), 0.
|
||||
if X.ndim == 1:
|
||||
X = np.reshape(X, (1, -1))
|
||||
warnings.warn("Only one sample available. "
|
||||
"You may want to reshape your data array")
|
||||
n_samples = 1
|
||||
n_features = X.size
|
||||
else:
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
emp_cov = empirical_covariance(X, assume_centered=assume_centered)
|
||||
mu = np.trace(emp_cov) / n_features
|
||||
|
||||
# formula from Chen et al.'s **implementation**
|
||||
alpha = np.mean(emp_cov ** 2)
|
||||
num = alpha + mu ** 2
|
||||
den = (n_samples + 1.) * (alpha - (mu ** 2) / n_features)
|
||||
|
||||
shrinkage = 1. if den == 0 else min(num / den, 1.)
|
||||
shrunk_cov = (1. - shrinkage) * emp_cov
|
||||
shrunk_cov.flat[::n_features + 1] += shrinkage * mu
|
||||
|
||||
return shrunk_cov, shrinkage
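# Illustrative sketch (editor's addition): the OAS shrinkage computed by hand
# with the same quantities as in the implementation above, then compared with
# the helper's output. `X_toy` and `shrinkage_by_hand` are example names only.
import numpy as np
from sklearn.covariance import empirical_covariance, oas

rng = np.random.RandomState(0)
X_toy = rng.multivariate_normal([0, 0], [[.8, .3], [.3, .4]], size=100)
n_samples, n_features = X_toy.shape

emp_cov = empirical_covariance(X_toy)
mu = np.trace(emp_cov) / n_features
alpha = np.mean(emp_cov ** 2)
num = alpha + mu ** 2
den = (n_samples + 1.) * (alpha - (mu ** 2) / n_features)
shrinkage_by_hand = 1. if den == 0 else min(num / den, 1.)

shrunk_cov, shrinkage = oas(X_toy)   # `shrinkage` should match shrinkage_by_hand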
|
||||
|
||||
|
||||
class OAS(EmpiricalCovariance):
|
||||
"""Oracle Approximating Shrinkage Estimator
|
||||
|
||||
Read more in the :ref:`User Guide <shrunk_covariance>`.
|
||||
|
||||
OAS is a particular form of shrinkage described in
|
||||
"Shrinkage Algorithms for MMSE Covariance Estimation"
|
||||
Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.
|
||||
|
||||
The formula used here does not correspond to the one given in the
|
||||
article. In the original article, formula (23) states that 2/p is
|
||||
multiplied by Trace(cov*cov) in both the numerator and denominator, but
|
||||
this operation is omitted because for a large p, the value of 2/p is
|
||||
so small that it doesn't affect the value of the estimator.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
store_precision : bool, default=True
|
||||
Specify if the estimated precision is stored.
|
||||
|
||||
assume_centered : bool, default=False
|
||||
If True, data will not be centered before computation.
|
||||
Useful when working with data whose mean is almost, but not exactly
|
||||
zero.
|
||||
If False (default), data will be centered before computation.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
covariance_ : ndarray of shape (n_features, n_features)
|
||||
Estimated covariance matrix.
|
||||
|
||||
location_ : ndarray of shape (n_features,)
|
||||
Estimated location, i.e. the estimated mean.
|
||||
|
||||
precision_ : ndarray of shape (n_features, n_features)
|
||||
Estimated pseudo inverse matrix.
|
||||
(stored only if store_precision is True)
|
||||
|
||||
shrinkage_ : float
|
||||
Coefficient in the convex combination used for the computation
|
||||
of the shrunk estimate. Range is [0, 1].
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.covariance import OAS
|
||||
>>> from sklearn.datasets import make_gaussian_quantiles
|
||||
>>> real_cov = np.array([[.8, .3],
|
||||
... [.3, .4]])
|
||||
>>> rng = np.random.RandomState(0)
|
||||
>>> X = rng.multivariate_normal(mean=[0, 0],
|
||||
... cov=real_cov,
|
||||
... size=500)
|
||||
>>> oas = OAS().fit(X)
|
||||
>>> oas.covariance_
|
||||
array([[0.7533..., 0.2763...],
|
||||
[0.2763..., 0.3964...]])
|
||||
>>> oas.precision_
|
||||
array([[ 1.7833..., -1.2431... ],
|
||||
[-1.2431..., 3.3889...]])
|
||||
>>> oas.shrinkage_
|
||||
0.0195...
|
||||
|
||||
Notes
|
||||
-----
|
||||
The regularised covariance is:
|
||||
|
||||
(1 - shrinkage) * cov + shrinkage * mu * np.identity(n_features)
|
||||
|
||||
where mu = trace(cov) / n_features
|
||||
and shrinkage is given by the OAS formula (see References)
|
||||
|
||||
References
|
||||
----------
|
||||
"Shrinkage Algorithms for MMSE Covariance Estimation"
|
||||
Chen et al., IEEE Trans. on Sign. Proc., Volume 58, Issue 10, October 2010.
|
||||
"""
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the Oracle Approximating Shrinkage covariance model
|
||||
according to the given training data and parameters.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training data, where `n_samples` is the number of samples
|
||||
and `n_features` is the number of features.
|
||||
y : Ignored
|
||||
Not used, present for API consistency.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
"""
|
||||
X = self._validate_data(X)
|
||||
# Not calling the parent object to fit, to avoid computing the
|
||||
# covariance matrix (and potentially the precision)
|
||||
if self.assume_centered:
|
||||
self.location_ = np.zeros(X.shape[1])
|
||||
else:
|
||||
self.location_ = X.mean(0)
|
||||
|
||||
covariance, shrinkage = oas(X - self.location_, assume_centered=True)
|
||||
self.shrinkage_ = shrinkage
|
||||
self._set_covariance(covariance)
|
||||
|
||||
return self
|
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _elliptic_envelope # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.covariance.elliptic_envelope'
|
||||
correct_import_path = 'sklearn.covariance'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_elliptic_envelope, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
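# Illustrative sketch (editor's addition): the shim above keeps the old import
# path working while steering users to the canonical one. Importing from the
# deprecated path emits a deprecation-style warning (suppressed under pytest);
# either way, the same class object is returned.
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    from sklearn.covariance.elliptic_envelope import EllipticEnvelope as Old
from sklearn.covariance import EllipticEnvelope

assert Old is EllipticEnvelope   # both names refer to the same class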
|
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _empirical_covariance # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.covariance.empirical_covariance_'
|
||||
correct_import_path = 'sklearn.covariance'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_empirical_covariance, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/covariance/graph_lasso_.py
Normal file
18
venv/Lib/site-packages/sklearn/covariance/graph_lasso_.py
Normal file
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _graph_lasso # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.covariance.graph_lasso_'
|
||||
correct_import_path = 'sklearn.covariance'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_graph_lasso, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _robust_covariance # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.covariance.robust_covariance'
|
||||
correct_import_path = 'sklearn.covariance'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_robust_covariance, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _shrunk_covariance # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.covariance.shrunk_covariance_'
|
||||
correct_import_path = 'sklearn.covariance'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_shrunk_covariance, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@@ -0,0 +1,305 @@
|
|||
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Gael Varoquaux <gael.varoquaux@normalesup.org>
|
||||
# Virgile Fritsch <virgile.fritsch@inria.fr>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.utils._testing import assert_warns
|
||||
|
||||
from sklearn import datasets
|
||||
from sklearn.covariance import empirical_covariance, EmpiricalCovariance, \
|
||||
ShrunkCovariance, shrunk_covariance, \
|
||||
LedoitWolf, ledoit_wolf, ledoit_wolf_shrinkage, OAS, oas
|
||||
|
||||
X, _ = datasets.load_diabetes(return_X_y=True)
|
||||
X_1d = X[:, 0]
|
||||
n_samples, n_features = X.shape
|
||||
|
||||
|
||||
def test_covariance():
|
||||
# Tests Covariance module on a simple dataset.
|
||||
# test covariance fit from data
|
||||
cov = EmpiricalCovariance()
|
||||
cov.fit(X)
|
||||
emp_cov = empirical_covariance(X)
|
||||
assert_array_almost_equal(emp_cov, cov.covariance_, 4)
|
||||
assert_almost_equal(cov.error_norm(emp_cov), 0)
|
||||
assert_almost_equal(
|
||||
cov.error_norm(emp_cov, norm='spectral'), 0)
|
||||
assert_almost_equal(
|
||||
cov.error_norm(emp_cov, norm='frobenius'), 0)
|
||||
assert_almost_equal(
|
||||
cov.error_norm(emp_cov, scaling=False), 0)
|
||||
assert_almost_equal(
|
||||
cov.error_norm(emp_cov, squared=False), 0)
|
||||
with pytest.raises(NotImplementedError):
|
||||
cov.error_norm(emp_cov, norm='foo')
|
||||
# Mahalanobis distances computation test
|
||||
mahal_dist = cov.mahalanobis(X)
|
||||
assert np.amin(mahal_dist) > 0
|
||||
|
||||
# test with n_features = 1
|
||||
X_1d = X[:, 0].reshape((-1, 1))
|
||||
cov = EmpiricalCovariance()
|
||||
cov.fit(X_1d)
|
||||
assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
|
||||
assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
|
||||
assert_almost_equal(
|
||||
cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)
|
||||
|
||||
# test with one sample
|
||||
# Create X with 1 sample and 5 features
|
||||
X_1sample = np.arange(5).reshape(1, 5)
|
||||
cov = EmpiricalCovariance()
|
||||
assert_warns(UserWarning, cov.fit, X_1sample)
|
||||
assert_array_almost_equal(cov.covariance_,
|
||||
np.zeros(shape=(5, 5), dtype=np.float64))
|
||||
|
||||
# test integer type
|
||||
X_integer = np.asarray([[0, 1], [1, 0]])
|
||||
result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
|
||||
assert_array_almost_equal(empirical_covariance(X_integer), result)
|
||||
|
||||
# test centered case
|
||||
cov = EmpiricalCovariance(assume_centered=True)
|
||||
cov.fit(X)
|
||||
assert_array_equal(cov.location_, np.zeros(X.shape[1]))
|
||||
|
||||
|
||||
def test_shrunk_covariance():
|
||||
# Tests ShrunkCovariance module on a simple dataset.
|
||||
# compare shrunk covariance obtained from data and from MLE estimate
|
||||
cov = ShrunkCovariance(shrinkage=0.5)
|
||||
cov.fit(X)
|
||||
assert_array_almost_equal(
|
||||
shrunk_covariance(empirical_covariance(X), shrinkage=0.5),
|
||||
cov.covariance_, 4)
|
||||
|
||||
# same test with shrinkage not provided
|
||||
cov = ShrunkCovariance()
|
||||
cov.fit(X)
|
||||
assert_array_almost_equal(
|
||||
shrunk_covariance(empirical_covariance(X)), cov.covariance_, 4)
|
||||
|
||||
# same test with shrinkage = 0 (<==> empirical_covariance)
|
||||
cov = ShrunkCovariance(shrinkage=0.)
|
||||
cov.fit(X)
|
||||
assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
|
||||
|
||||
# test with n_features = 1
|
||||
X_1d = X[:, 0].reshape((-1, 1))
|
||||
cov = ShrunkCovariance(shrinkage=0.3)
|
||||
cov.fit(X_1d)
|
||||
assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
|
||||
|
||||
# test shrinkage coeff on a simple data set (without saving precision)
|
||||
cov = ShrunkCovariance(shrinkage=0.5, store_precision=False)
|
||||
cov.fit(X)
|
||||
assert cov.precision_ is None


def test_ledoit_wolf():
    # Tests LedoitWolf module on a simple dataset.
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_centered)
    shrinkage_ = lw.shrinkage_

    score_ = lw.score(X_centered)
    assert_almost_equal(ledoit_wolf_shrinkage(X_centered,
                                              assume_centered=True),
                        shrinkage_)
    assert_almost_equal(ledoit_wolf_shrinkage(X_centered, assume_centered=True,
                                              block_size=6),
                        shrinkage_)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_centered,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf(assume_centered=True)
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d,
                                                         assume_centered=True)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, lw.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False, assume_centered=True)
    lw.fit(X_centered)
    assert_almost_equal(lw.score(X_centered), score_, 4)
    assert(lw.precision_ is None)

    # Same tests without assuming centered data
    # test shrinkage coeff on a simple data set
    lw = LedoitWolf()
    lw.fit(X)
    assert_almost_equal(lw.shrinkage_, shrinkage_, 4)
    assert_almost_equal(lw.shrinkage_, ledoit_wolf_shrinkage(X))
    assert_almost_equal(lw.shrinkage_, ledoit_wolf(X)[1])
    assert_almost_equal(lw.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    # compare estimates given by LW and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=lw.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, lw.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    lw = LedoitWolf()
    lw.fit(X_1d)
    lw_cov_from_mle, lw_shrinkage_from_mle = ledoit_wolf(X_1d)
    assert_array_almost_equal(lw_cov_from_mle, lw.covariance_, 4)
    assert_almost_equal(lw_shrinkage_from_mle, lw.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), lw.covariance_, 4)

    # test with one sample
    # warning should be raised when using only 1 sample
    X_1sample = np.arange(5).reshape(1, 5)
    lw = LedoitWolf()
    assert_warns(UserWarning, lw.fit, X_1sample)
    assert_array_almost_equal(lw.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test shrinkage coeff on a simple data set (without saving precision)
    lw = LedoitWolf(store_precision=False)
    lw.fit(X)
    assert_almost_equal(lw.score(X), score_, 4)
    assert(lw.precision_ is None)


def _naive_ledoit_wolf_shrinkage(X):
    # A simple implementation of the formulas from Ledoit & Wolf

    # The computation below follows the formulas given in
    # "O. Ledoit and M. Wolf, A Well-Conditioned Estimator for
    # Large-Dimensional Covariance Matrices";
    # beta and delta are defined at the beginning of section 3.2
    n_samples, n_features = X.shape
    emp_cov = empirical_covariance(X, assume_centered=False)
    mu = np.trace(emp_cov) / n_features
    delta_ = emp_cov.copy()
    delta_.flat[::n_features + 1] -= mu
    delta = (delta_ ** 2).sum() / n_features
    X2 = X ** 2
    beta_ = 1. / (n_features * n_samples) \
        * np.sum(np.dot(X2.T, X2) / n_samples - emp_cov ** 2)

    beta = min(beta_, delta)
    shrinkage = beta / delta
    return shrinkage
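

# A minimal sketch, assuming LedoitWolf plugs the shrinkage coefficient above
# into the same convex combination used by ShrunkCovariance, of how the full
# naive estimate could be assembled.  The helper name is hypothetical.
def _naive_ledoit_wolf_estimate(X):
    emp_cov = empirical_covariance(X, assume_centered=False)
    n_features = emp_cov.shape[0]
    mu = np.trace(emp_cov) / n_features
    shrinkage = _naive_ledoit_wolf_shrinkage(X)
    lw_cov = (1. - shrinkage) * emp_cov
    lw_cov.flat[::n_features + 1] += shrinkage * mu
    return lw_cov, shrinkage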


def test_ledoit_wolf_small():
    # Compare our blocked implementation to the naive implementation
    X_small = X[:, :4]
    lw = LedoitWolf()
    lw.fit(X_small)
    shrinkage_ = lw.shrinkage_

    assert_almost_equal(shrinkage_, _naive_ledoit_wolf_shrinkage(X_small))


def test_ledoit_wolf_large():
    # test that ledoit_wolf doesn't error on data that is wider than block_size
    rng = np.random.RandomState(0)
    # use a number of features that is larger than the block-size
    X = rng.normal(size=(10, 20))
    lw = LedoitWolf(block_size=10).fit(X)
    # check that covariance is about diagonal (random normal noise)
    assert_almost_equal(lw.covariance_, np.eye(20), 0)
    cov = lw.covariance_

    # check that the result is consistent with not splitting data into blocks.
    lw = LedoitWolf(block_size=25).fit(X)
    assert_almost_equal(lw.covariance_, cov)


def test_oas():
    # Tests OAS module on a simple dataset.
    # test shrinkage coeff on a simple data set
    X_centered = X - X.mean(axis=0)
    oa = OAS(assume_centered=True)
    oa.fit(X_centered)
    shrinkage_ = oa.shrinkage_
    score_ = oa.score(X_centered)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_centered,
                                                 assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_, assume_centered=True)
    scov.fit(X_centered)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0:1]
    oa = OAS(assume_centered=True)
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d, assume_centered=True)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal((X_1d ** 2).sum() / n_samples, oa.covariance_, 4)

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False, assume_centered=True)
    oa.fit(X_centered)
    assert_almost_equal(oa.score(X_centered), score_, 4)
    assert(oa.precision_ is None)

    # Same tests without assuming centered data--------------------------------
    # test shrinkage coeff on a simple data set
    oa = OAS()
    oa.fit(X)
    assert_almost_equal(oa.shrinkage_, shrinkage_, 4)
    assert_almost_equal(oa.score(X), score_, 4)
    # compare shrunk covariance obtained from data and from MLE estimate
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    # compare estimates given by OAS and ShrunkCovariance
    scov = ShrunkCovariance(shrinkage=oa.shrinkage_)
    scov.fit(X)
    assert_array_almost_equal(scov.covariance_, oa.covariance_, 4)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    oa = OAS()
    oa.fit(X_1d)
    oa_cov_from_mle, oa_shrinkage_from_mle = oas(X_1d)
    assert_array_almost_equal(oa_cov_from_mle, oa.covariance_, 4)
    assert_almost_equal(oa_shrinkage_from_mle, oa.shrinkage_)
    assert_array_almost_equal(empirical_covariance(X_1d), oa.covariance_, 4)

    # test with one sample
    # warning should be raised when using only 1 sample
    X_1sample = np.arange(5).reshape(1, 5)
    oa = OAS()
    assert_warns(UserWarning, oa.fit, X_1sample)
    assert_array_almost_equal(oa.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test shrinkage coeff on a simple data set (without saving precision)
    oa = OAS(store_precision=False)
    oa.fit(X)
    assert_almost_equal(oa.score(X), score_, 4)
    assert(oa.precision_ is None)
@@ -0,0 +1,45 @@
"""
|
||||
Testing for Elliptic Envelope algorithm (sklearn.covariance.elliptic_envelope).
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.covariance import EllipticEnvelope
|
||||
from sklearn.utils._testing import assert_almost_equal
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_array_equal
|
||||
from sklearn.exceptions import NotFittedError
|
||||
|
||||
|
||||
def test_elliptic_envelope():
|
||||
rnd = np.random.RandomState(0)
|
||||
X = rnd.randn(100, 10)
|
||||
clf = EllipticEnvelope(contamination=0.1)
|
||||
with pytest.raises(NotFittedError):
|
||||
clf.predict(X)
|
||||
with pytest.raises(NotFittedError):
|
||||
clf.decision_function(X)
|
||||
clf.fit(X)
|
||||
y_pred = clf.predict(X)
|
||||
scores = clf.score_samples(X)
|
||||
decisions = clf.decision_function(X)
|
||||
|
||||
assert_array_almost_equal(
|
||||
scores, -clf.mahalanobis(X))
|
||||
assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
|
||||
assert_almost_equal(clf.score(X, np.ones(100)),
|
||||
(100 - y_pred[y_pred == -1].size) / 100.)
|
||||
assert(sum(y_pred == -1) == sum(decisions < 0))
|
||||
|
||||
|
||||
def test_score_samples():
|
||||
X_train = [[1, 1], [1, 2], [2, 1]]
|
||||
clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
|
||||
clf2 = EllipticEnvelope().fit(X_train)
|
||||
assert_array_equal(clf1.score_samples([[2., 2.]]),
|
||||
clf1.decision_function([[2., 2.]]) + clf1.offset_)
|
||||
assert_array_equal(clf2.score_samples([[2., 2.]]),
|
||||
clf2.decision_function([[2., 2.]]) + clf2.offset_)
|
||||
assert_array_equal(clf1.score_samples([[2., 2.]]),
|
||||
clf2.score_samples([[2., 2.]]))
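

# A minimal sketch of the scoring contract the two tests above pin down:
# score_samples(X) is the negated Mahalanobis distance to the robust fit, and
# decision_function(X) is score_samples(X) shifted by offset_ so that 0 is the
# inlier/outlier threshold.  The helper name is hypothetical.
def _scoring_contract_sketch(clf, X):
    assert np.allclose(clf.score_samples(X), -clf.mahalanobis(X))
    assert np.allclose(clf.decision_function(X),
                       clf.score_samples(X) - clf.offset_)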
@@ -0,0 +1,150 @@
""" Test the graphical_lasso module.
|
||||
"""
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_array_less
|
||||
|
||||
from sklearn.covariance import (graphical_lasso, GraphicalLasso,
|
||||
GraphicalLassoCV, empirical_covariance)
|
||||
from sklearn.datasets import make_sparse_spd_matrix
|
||||
from io import StringIO
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn import datasets
|
||||
|
||||
|
||||
def test_graphical_lasso(random_state=0):
|
||||
# Sample data from a sparse multivariate normal
|
||||
dim = 20
|
||||
n_samples = 100
|
||||
random_state = check_random_state(random_state)
|
||||
prec = make_sparse_spd_matrix(dim, alpha=.95,
|
||||
random_state=random_state)
|
||||
cov = linalg.inv(prec)
|
||||
X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
|
||||
emp_cov = empirical_covariance(X)
|
||||
|
||||
for alpha in (0., .1, .25):
|
||||
covs = dict()
|
||||
icovs = dict()
|
||||
for method in ('cd', 'lars'):
|
||||
cov_, icov_, costs = graphical_lasso(emp_cov, return_costs=True,
|
||||
alpha=alpha, mode=method)
|
||||
covs[method] = cov_
|
||||
icovs[method] = icov_
|
||||
costs, dual_gap = np.array(costs).T
|
||||
# Check that the costs always decrease (doesn't hold if alpha == 0)
|
||||
if not alpha == 0:
|
||||
assert_array_less(np.diff(costs), 0)
|
||||
# Check that the 2 approaches give similar results
|
||||
assert_array_almost_equal(covs['cd'], covs['lars'], decimal=4)
|
||||
assert_array_almost_equal(icovs['cd'], icovs['lars'], decimal=4)
|
||||
|
||||
# Smoke test the estimator
|
||||
model = GraphicalLasso(alpha=.25).fit(X)
|
||||
model.score(X)
|
||||
assert_array_almost_equal(model.covariance_, covs['cd'], decimal=4)
|
||||
assert_array_almost_equal(model.covariance_, covs['lars'], decimal=4)
|
||||
|
||||
# For a centered matrix, assume_centered could be chosen True or False
|
||||
# Check that this returns indeed the same result for centered data
|
||||
Z = X - X.mean(0)
|
||||
precs = list()
|
||||
for assume_centered in (False, True):
|
||||
prec_ = GraphicalLasso(
|
||||
assume_centered=assume_centered).fit(Z).precision_
|
||||
precs.append(prec_)
|
||||
assert_array_almost_equal(precs[0], precs[1])
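

# A minimal sketch, assuming graphical_lasso minimises the penalised negative
# Gaussian log-likelihood with an l1 penalty on the off-diagonal entries of
# the precision matrix, of the cost whose monotone decrease is asserted above
# (up to additive constants and scaling).  The helper name is hypothetical.
def _graphical_lasso_objective_sketch(emp_cov, precision, alpha):
    log_det = np.linalg.slogdet(precision)[1]
    gaussian_nll = -log_det + np.sum(emp_cov * precision)
    off_diag_l1 = np.abs(precision).sum() - np.abs(np.diag(precision)).sum()
    return gaussian_nll + alpha * off_diag_l1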


def test_graphical_lasso_iris():
    # Hard-coded solution from R glasso package for alpha=1.0
    # (need to set penalize.diagonal to FALSE)
    cov_R = np.array([
        [0.68112222, 0.0000000, 0.265820, 0.02464314],
        [0.00000000, 0.1887129, 0.000000, 0.00000000],
        [0.26582000, 0.0000000, 3.095503, 0.28697200],
        [0.02464314, 0.0000000, 0.286972, 0.57713289]
        ])
    icov_R = np.array([
        [1.5190747, 0.000000, -0.1304475, 0.0000000],
        [0.0000000, 5.299055, 0.0000000, 0.0000000],
        [-0.1304475, 0.000000, 0.3498624, -0.1683946],
        [0.0000000, 0.000000, -0.1683946, 1.8164353]
        ])
    X = datasets.load_iris().data
    emp_cov = empirical_covariance(X)
    for method in ('cd', 'lars'):
        cov, icov = graphical_lasso(emp_cov, alpha=1.0, return_costs=False,
                                    mode=method)
        assert_array_almost_equal(cov, cov_R)
        assert_array_almost_equal(icov, icov_R)


def test_graph_lasso_2D():
    # Hard-coded solution from Python skggm package
    # obtained by calling `quic(emp_cov, lam=.1, tol=1e-8)`
    cov_skggm = np.array([[3.09550269, 1.186972],
                          [1.186972, 0.57713289]])

    icov_skggm = np.array([[1.52836773, -3.14334831],
                           [-3.14334831, 8.19753385]])
    X = datasets.load_iris().data[:, 2:]
    emp_cov = empirical_covariance(X)
    for method in ('cd', 'lars'):
        cov, icov = graphical_lasso(emp_cov, alpha=.1, return_costs=False,
                                    mode=method)
        assert_array_almost_equal(cov, cov_skggm)
        assert_array_almost_equal(icov, icov_skggm)


def test_graphical_lasso_iris_singular():
    # Small subset of rows to test the rank-deficient case
    # Need to choose samples such that none of the variances are zero
    indices = np.arange(10, 13)

    # Hard-coded solution from R glasso package for alpha=0.01
    cov_R = np.array([
        [0.08, 0.056666662595, 0.00229729713223, 0.00153153142149],
        [0.056666662595, 0.082222222222, 0.00333333333333, 0.00222222222222],
        [0.002297297132, 0.003333333333, 0.00666666666667, 0.00009009009009],
        [0.001531531421, 0.002222222222, 0.00009009009009, 0.00222222222222]
        ])
    icov_R = np.array([
        [24.42244057, -16.831679593, 0.0, 0.0],
        [-16.83168201, 24.351841681, -6.206896552, -12.5],
        [0.0, -6.206896171, 153.103448276, 0.0],
        [0.0, -12.499999143, 0.0, 462.5]
        ])
    X = datasets.load_iris().data[indices, :]
    emp_cov = empirical_covariance(X)
    for method in ('cd', 'lars'):
        cov, icov = graphical_lasso(emp_cov, alpha=0.01, return_costs=False,
                                    mode=method)
        assert_array_almost_equal(cov, cov_R, decimal=5)
        assert_array_almost_equal(icov, icov_R, decimal=5)


def test_graphical_lasso_cv(random_state=1):
    # Sample data from a sparse multivariate normal
    dim = 5
    n_samples = 6
    random_state = check_random_state(random_state)
    prec = make_sparse_spd_matrix(dim, alpha=.96,
                                  random_state=random_state)
    cov = linalg.inv(prec)
    X = random_state.multivariate_normal(np.zeros(dim), cov, size=n_samples)
    # Capture stdout, to smoke test the verbose mode
    orig_stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        # We need verbose very high so that Parallel prints on stdout
        GraphicalLassoCV(verbose=100, alphas=5, tol=1e-1).fit(X)
    finally:
        sys.stdout = orig_stdout

    # Smoke test with specified alphas
    GraphicalLassoCV(alphas=[0.8, 0.5], tol=1e-1, n_jobs=1).fit(X)
@@ -0,0 +1,168 @@
# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Gael Varoquaux <gael.varoquaux@normalesup.org>
#         Virgile Fritsch <virgile.fritsch@inria.fr>
#
# License: BSD 3 clause

import itertools

import numpy as np

from sklearn.utils._testing import assert_array_almost_equal
from sklearn.utils._testing import assert_raise_message
from sklearn.utils._testing import assert_warns_message

from sklearn import datasets
from sklearn.covariance import empirical_covariance, MinCovDet
from sklearn.covariance import fast_mcd

X = datasets.load_iris().data
X_1d = X[:, 0]
n_samples, n_features = X.shape


def test_mcd():
    # Tests the FastMCD algorithm implementation
    # Small data set
    # test without outliers (random independent normal data)
    launch_mcd_on_dataset(100, 5, 0, 0.01, 0.1, 80)
    # test with a contaminated data set (medium contamination)
    launch_mcd_on_dataset(100, 5, 20, 0.01, 0.01, 70)
    # test with a contaminated data set (strong contamination)
    launch_mcd_on_dataset(100, 5, 40, 0.1, 0.1, 50)

    # Medium data set
    launch_mcd_on_dataset(1000, 5, 450, 0.1, 0.1, 540)

    # Large data set
    launch_mcd_on_dataset(1700, 5, 800, 0.1, 0.1, 870)

    # 1D data set
    launch_mcd_on_dataset(500, 1, 100, 0.001, 0.001, 350)


def test_fast_mcd_on_invalid_input():
    X = np.arange(100)
    assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead',
                         fast_mcd, X)


def test_mcd_class_on_invalid_input():
    X = np.arange(100)
    mcd = MinCovDet()
    assert_raise_message(ValueError, 'Expected 2D array, got 1D array instead',
                         mcd.fit, X)


def launch_mcd_on_dataset(n_samples, n_features, n_outliers, tol_loc, tol_cov,
                          tol_support):

    rand_gen = np.random.RandomState(0)
    data = rand_gen.randn(n_samples, n_features)
    # add some outliers
    outliers_index = rand_gen.permutation(n_samples)[:n_outliers]
    outliers_offset = 10. * \
        (rand_gen.randint(2, size=(n_outliers, n_features)) - 0.5)
    data[outliers_index] += outliers_offset
    inliers_mask = np.ones(n_samples).astype(bool)
    inliers_mask[outliers_index] = False

    pure_data = data[inliers_mask]
    # compute MCD by fitting an object
    mcd_fit = MinCovDet(random_state=rand_gen).fit(data)
    T = mcd_fit.location_
    S = mcd_fit.covariance_
    H = mcd_fit.support_
    # compare with the estimates learnt from the inliers
    error_location = np.mean((pure_data.mean(0) - T) ** 2)
    assert(error_location < tol_loc)
    error_cov = np.mean((empirical_covariance(pure_data) - S) ** 2)
    assert(error_cov < tol_cov)
    assert(np.sum(H) >= tol_support)
    assert_array_almost_equal(mcd_fit.mahalanobis(data), mcd_fit.dist_)


def test_mcd_issue1127():
    # Check that the code does not break with X.shape = (3, 1)
    # (i.e. n_support = n_samples)
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(3, 1))
    mcd = MinCovDet()
    mcd.fit(X)


def test_mcd_issue3367():
    # Check that MCD completes when the covariance matrix is singular
    # i.e. one of the rows and columns are all zeros
    rand_gen = np.random.RandomState(0)

    # Think of these as the values for X and Y -> 10 values between -5 and 5
    data_values = np.linspace(-5, 5, 10).tolist()
    # Get the cartesian product of all possible coordinate pairs from above set
    data = np.array(list(itertools.product(data_values, data_values)))

    # Add a third column that's all zeros to make our data a set of points
    # within a plane, which means that the covariance matrix will be singular
    data = np.hstack((data, np.zeros((data.shape[0], 1))))

    # The below line of code should raise an exception if the covariance matrix
    # is singular. As a further test, since we have points in XYZ, the
    # principal components (eigenvectors) of these directly relate to the
    # geometry of the points. Since it's a plane, we should be able to test
    # that the eigenvector that corresponds to the smallest eigenvalue is the
    # plane normal, specifically [0, 0, 1], since everything is in the XY plane
    # (as I've set it up above). To do this one would start by:
    #
    #     evals, evecs = np.linalg.eigh(mcd_fit.covariance_)
    #     normal = evecs[:, np.argmin(evals)]
    #
    # After which we need to assert that our `normal` is equal to [0, 0, 1].
    # Do note that there is floating point error associated with this, so it's
    # best to subtract the two and then compare against some small tolerance
    # (e.g. 1e-12).
    MinCovDet(random_state=rand_gen).fit(data)
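

# A minimal sketch of the follow-up check described, but not executed, in the
# comment above: with every point lying in the z = 0 plane, the eigenvector of
# the fitted covariance with the smallest eigenvalue should be the plane
# normal [0, 0, 1] (up to sign).  The helper name is hypothetical.
def _plane_normal_check_sketch(fitted_mcd):
    evals, evecs = np.linalg.eigh(fitted_mcd.covariance_)
    normal = evecs[:, np.argmin(evals)]
    # eigenvectors are defined only up to sign, so compare absolute values
    assert np.allclose(np.abs(normal), [0., 0., 1.], atol=1e-12)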


def test_mcd_support_covariance_is_zero():
    # Check that MCD returns a ValueError with informative message when the
    # covariance of the support data is equal to 0.
    X_1 = np.array([0.5, 0.1, 0.1, 0.1, 0.957, 0.1, 0.1, 0.1, 0.4285, 0.1])
    X_1 = X_1.reshape(-1, 1)
    X_2 = np.array([0.5, 0.3, 0.3, 0.3, 0.957, 0.3, 0.3, 0.3, 0.4285, 0.3])
    X_2 = X_2.reshape(-1, 1)
    msg = ('The covariance matrix of the support data is equal to 0, try to '
           'increase support_fraction')
    for X in [X_1, X_2]:
        assert_raise_message(ValueError, msg, MinCovDet().fit, X)


def test_mcd_increasing_det_warning():
    # Check that a warning is raised if we observe increasing determinants
    # during the c_step. In theory the sequence of determinants should be
    # decreasing. Increasing determinants are likely due to ill-conditioned
    # covariance matrices that result in poor precision matrices.

    X = [[5.1, 3.5, 1.4, 0.2],
         [4.9, 3.0, 1.4, 0.2],
         [4.7, 3.2, 1.3, 0.2],
         [4.6, 3.1, 1.5, 0.2],
         [5.0, 3.6, 1.4, 0.2],
         [4.6, 3.4, 1.4, 0.3],
         [5.0, 3.4, 1.5, 0.2],
         [4.4, 2.9, 1.4, 0.2],
         [4.9, 3.1, 1.5, 0.1],
         [5.4, 3.7, 1.5, 0.2],
         [4.8, 3.4, 1.6, 0.2],
         [4.8, 3.0, 1.4, 0.1],
         [4.3, 3.0, 1.1, 0.1],
         [5.1, 3.5, 1.4, 0.3],
         [5.7, 3.8, 1.7, 0.3],
         [5.4, 3.4, 1.7, 0.2],
         [4.6, 3.6, 1.0, 0.2],
         [5.0, 3.0, 1.6, 0.2],
         [5.2, 3.5, 1.5, 0.2]]

    mcd = MinCovDet(random_state=1)
    assert_warns_message(RuntimeWarning,
                         "Determinant has increased",
                         mcd.fit, X)