Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
37
venv/Lib/site-packages/sklearn/neighbors/__init__.py
Normal file
37
venv/Lib/site-packages/sklearn/neighbors/__init__.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
The :mod:`sklearn.neighbors` module implements the k-nearest neighbors
|
||||
algorithm.
|
||||
"""
|
||||
|
||||
from ._ball_tree import BallTree
|
||||
from ._kd_tree import KDTree
|
||||
from ._dist_metrics import DistanceMetric
|
||||
from ._graph import kneighbors_graph, radius_neighbors_graph
|
||||
from ._graph import KNeighborsTransformer, RadiusNeighborsTransformer
|
||||
from ._unsupervised import NearestNeighbors
|
||||
from ._classification import KNeighborsClassifier, RadiusNeighborsClassifier
|
||||
from ._regression import KNeighborsRegressor, RadiusNeighborsRegressor
|
||||
from ._nearest_centroid import NearestCentroid
|
||||
from ._kde import KernelDensity
|
||||
from ._lof import LocalOutlierFactor
|
||||
from ._nca import NeighborhoodComponentsAnalysis
|
||||
from ._base import VALID_METRICS, VALID_METRICS_SPARSE
|
||||
|
||||
__all__ = ['BallTree',
|
||||
'DistanceMetric',
|
||||
'KDTree',
|
||||
'KNeighborsClassifier',
|
||||
'KNeighborsRegressor',
|
||||
'KNeighborsTransformer',
|
||||
'NearestCentroid',
|
||||
'NearestNeighbors',
|
||||
'RadiusNeighborsClassifier',
|
||||
'RadiusNeighborsRegressor',
|
||||
'RadiusNeighborsTransformer',
|
||||
'kneighbors_graph',
|
||||
'radius_neighbors_graph',
|
||||
'KernelDensity',
|
||||
'LocalOutlierFactor',
|
||||
'NeighborhoodComponentsAnalysis',
|
||||
'VALID_METRICS',
|
||||
'VALID_METRICS_SPARSE']
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
1173
venv/Lib/site-packages/sklearn/neighbors/_base.py
Normal file
1173
venv/Lib/site-packages/sklearn/neighbors/_base.py
Normal file
File diff suppressed because it is too large
Load diff
583
venv/Lib/site-packages/sklearn/neighbors/_classification.py
Normal file
583
venv/Lib/site-packages/sklearn/neighbors/_classification.py
Normal file
|
@ -0,0 +1,583 @@
|
|||
"""Nearest Neighbor Classification"""
|
||||
|
||||
# Authors: Jake Vanderplas <vanderplas@astro.washington.edu>
|
||||
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Sparseness support by Lars Buitinck
|
||||
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
|
||||
#
|
||||
# License: BSD 3 clause (C) INRIA, University of Amsterdam
|
||||
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
from ..utils.extmath import weighted_mode
|
||||
from ..utils.validation import _is_arraylike, _num_samples
|
||||
|
||||
import warnings
|
||||
from ._base import \
|
||||
_check_weights, _get_weights, \
|
||||
NeighborsBase, KNeighborsMixin,\
|
||||
RadiusNeighborsMixin, SupervisedIntegerMixin
|
||||
from ..base import ClassifierMixin
|
||||
from ..utils import check_array
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
class KNeighborsClassifier(NeighborsBase, KNeighborsMixin,
|
||||
SupervisedIntegerMixin, ClassifierMixin):
|
||||
"""Classifier implementing the k-nearest neighbors vote.
|
||||
|
||||
Read more in the :ref:`User Guide <classification>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
|
||||
weights : {'uniform', 'distance'} or callable, default='uniform'
|
||||
weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
in this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : int, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
the distance metric to use for the tree. The default metric is
|
||||
minkowski, and with p=2 is equivalent to the standard Euclidean
|
||||
metric. See the documentation of :class:`DistanceMetric` for a
|
||||
list of available metrics.
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`,
|
||||
in which case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
Doesn't affect :meth:`fit` method.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
classes_ : array of shape (n_classes,)
|
||||
Class labels known to the classifier
|
||||
|
||||
effective_metric_ : str or callble
|
||||
The distance metric used. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
outputs_2d_ : bool
|
||||
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
|
||||
otherwise True.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import KNeighborsClassifier
|
||||
>>> neigh = KNeighborsClassifier(n_neighbors=3)
|
||||
>>> neigh.fit(X, y)
|
||||
KNeighborsClassifier(...)
|
||||
>>> print(neigh.predict([[1.1]]))
|
||||
[0]
|
||||
>>> print(neigh.predict_proba([[0.9]]))
|
||||
[[0.66666667 0.33333333]]
|
||||
|
||||
See also
|
||||
--------
|
||||
RadiusNeighborsClassifier
|
||||
KNeighborsRegressor
|
||||
RadiusNeighborsRegressor
|
||||
NearestNeighbors
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
.. warning::
|
||||
|
||||
Regarding the Nearest Neighbors algorithms, if it is found that two
|
||||
neighbors, neighbor `k+1` and `k`, have identical distances
|
||||
but different labels, the results will depend on the ordering of the
|
||||
training data.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_neighbors=5, *,
|
||||
weights='uniform', algorithm='auto', leaf_size=30,
|
||||
p=2, metric='minkowski', metric_params=None, n_jobs=None,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
n_neighbors=n_neighbors,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size, metric=metric, p=p,
|
||||
metric_params=metric_params,
|
||||
n_jobs=n_jobs, **kwargs)
|
||||
self.weights = _check_weights(weights)
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the class labels for the provided data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_queries, n_features), \
|
||||
or (n_queries, n_indexed) if metric == 'precomputed'
|
||||
Test samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
|
||||
Class labels for each data sample.
|
||||
"""
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
|
||||
neigh_dist, neigh_ind = self.kneighbors(X)
|
||||
classes_ = self.classes_
|
||||
_y = self._y
|
||||
if not self.outputs_2d_:
|
||||
_y = self._y.reshape((-1, 1))
|
||||
classes_ = [self.classes_]
|
||||
|
||||
n_outputs = len(classes_)
|
||||
n_queries = _num_samples(X)
|
||||
weights = _get_weights(neigh_dist, self.weights)
|
||||
|
||||
y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
|
||||
for k, classes_k in enumerate(classes_):
|
||||
if weights is None:
|
||||
mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
|
||||
else:
|
||||
mode, _ = weighted_mode(_y[neigh_ind, k], weights, axis=1)
|
||||
|
||||
mode = np.asarray(mode.ravel(), dtype=np.intp)
|
||||
y_pred[:, k] = classes_k.take(mode)
|
||||
|
||||
if not self.outputs_2d_:
|
||||
y_pred = y_pred.ravel()
|
||||
|
||||
return y_pred
|
||||
|
||||
def predict_proba(self, X):
|
||||
"""Return probability estimates for the test data X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_queries, n_features), \
|
||||
or (n_queries, n_indexed) if metric == 'precomputed'
|
||||
Test samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
p : ndarray of shape (n_queries, n_classes), or a list of n_outputs
|
||||
of such arrays if n_outputs > 1.
|
||||
The class probabilities of the input samples. Classes are ordered
|
||||
by lexicographic order.
|
||||
"""
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
|
||||
neigh_dist, neigh_ind = self.kneighbors(X)
|
||||
|
||||
classes_ = self.classes_
|
||||
_y = self._y
|
||||
if not self.outputs_2d_:
|
||||
_y = self._y.reshape((-1, 1))
|
||||
classes_ = [self.classes_]
|
||||
|
||||
n_queries = _num_samples(X)
|
||||
|
||||
weights = _get_weights(neigh_dist, self.weights)
|
||||
if weights is None:
|
||||
weights = np.ones_like(neigh_ind)
|
||||
|
||||
all_rows = np.arange(X.shape[0])
|
||||
probabilities = []
|
||||
for k, classes_k in enumerate(classes_):
|
||||
pred_labels = _y[:, k][neigh_ind]
|
||||
proba_k = np.zeros((n_queries, classes_k.size))
|
||||
|
||||
# a simple ':' index doesn't work right
|
||||
for i, idx in enumerate(pred_labels.T): # loop is O(n_neighbors)
|
||||
proba_k[all_rows, idx] += weights[:, i]
|
||||
|
||||
# normalize 'votes' into real [0,1] probabilities
|
||||
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
|
||||
normalizer[normalizer == 0.0] = 1.0
|
||||
proba_k /= normalizer
|
||||
|
||||
probabilities.append(proba_k)
|
||||
|
||||
if not self.outputs_2d_:
|
||||
probabilities = probabilities[0]
|
||||
|
||||
return probabilities
|
||||
|
||||
|
||||
class RadiusNeighborsClassifier(NeighborsBase, RadiusNeighborsMixin,
|
||||
SupervisedIntegerMixin, ClassifierMixin):
|
||||
"""Classifier implementing a vote among neighbors within a given radius
|
||||
|
||||
Read more in the :ref:`User Guide <classification>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
radius : float, default=1.0
|
||||
Range of parameter space to use by default for :meth:`radius_neighbors`
|
||||
queries.
|
||||
|
||||
weights : {'uniform', 'distance'} or callable, default='uniform'
|
||||
weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
in this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Uniform weights are used by default.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : int, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
the distance metric to use for the tree. The default metric is
|
||||
minkowski, and with p=2 is equivalent to the standard Euclidean
|
||||
metric. See the documentation of :class:`DistanceMetric` for a
|
||||
list of available metrics.
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`,
|
||||
in which case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
outlier_label : {manual label, 'most_frequent'}, default=None
|
||||
label for outlier samples (samples with no neighbors in given radius).
|
||||
|
||||
- manual label: str or int label (should be the same type as y)
|
||||
or list of manual labels if multi-output is used.
|
||||
- 'most_frequent' : assign the most frequent label of y to outliers.
|
||||
- None : when any outlier is detected, ValueError will be raised.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
classes_ : ndarray of shape (n_classes,)
|
||||
Class labels known to the classifier.
|
||||
|
||||
effective_metric_ : str or callble
|
||||
The distance metric used. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
outputs_2d_ : bool
|
||||
False when `y`'s shape is (n_samples, ) or (n_samples, 1) during fit
|
||||
otherwise True.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import RadiusNeighborsClassifier
|
||||
>>> neigh = RadiusNeighborsClassifier(radius=1.0)
|
||||
>>> neigh.fit(X, y)
|
||||
RadiusNeighborsClassifier(...)
|
||||
>>> print(neigh.predict([[1.5]]))
|
||||
[0]
|
||||
>>> print(neigh.predict_proba([[1.0]]))
|
||||
[[0.66666667 0.33333333]]
|
||||
|
||||
See also
|
||||
--------
|
||||
KNeighborsClassifier
|
||||
RadiusNeighborsRegressor
|
||||
KNeighborsRegressor
|
||||
NearestNeighbors
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, radius=1.0, *, weights='uniform',
|
||||
algorithm='auto', leaf_size=30, p=2, metric='minkowski',
|
||||
outlier_label=None, metric_params=None, n_jobs=None,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
radius=radius,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size,
|
||||
metric=metric, p=p, metric_params=metric_params,
|
||||
n_jobs=n_jobs, **kwargs)
|
||||
self.weights = _check_weights(weights)
|
||||
self.outlier_label = outlier_label
|
||||
|
||||
def fit(self, X, y):
|
||||
"""Fit the model using X as training data and y as target values
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : BallTree, KDTree or {array-like, sparse matrix} of shape \
|
||||
(n_samples, n_features) or (n_samples, n_samples)
|
||||
Training data. If array or matrix, the shape is (n_samples,
|
||||
n_features), or (n_samples, n_samples) if metric='precomputed'.
|
||||
|
||||
y : {array-like, sparse matrix} of shape (n_samples,) or \
|
||||
(n_samples, n_output)
|
||||
Target values.
|
||||
|
||||
"""
|
||||
|
||||
SupervisedIntegerMixin.fit(self, X, y)
|
||||
|
||||
classes_ = self.classes_
|
||||
_y = self._y
|
||||
if not self.outputs_2d_:
|
||||
_y = self._y.reshape((-1, 1))
|
||||
classes_ = [self.classes_]
|
||||
|
||||
if self.outlier_label is None:
|
||||
outlier_label_ = None
|
||||
|
||||
elif self.outlier_label == 'most_frequent':
|
||||
outlier_label_ = []
|
||||
# iterate over multi-output, get the most frequest label for each
|
||||
# output.
|
||||
for k, classes_k in enumerate(classes_):
|
||||
label_count = np.bincount(_y[:, k])
|
||||
outlier_label_.append(classes_k[label_count.argmax()])
|
||||
|
||||
else:
|
||||
if (_is_arraylike(self.outlier_label) and
|
||||
not isinstance(self.outlier_label, str)):
|
||||
if len(self.outlier_label) != len(classes_):
|
||||
raise ValueError("The length of outlier_label: {} is "
|
||||
"inconsistent with the output "
|
||||
"length: {}".format(self.outlier_label,
|
||||
len(classes_)))
|
||||
outlier_label_ = self.outlier_label
|
||||
else:
|
||||
outlier_label_ = [self.outlier_label] * len(classes_)
|
||||
|
||||
for classes, label in zip(classes_, outlier_label_):
|
||||
if (_is_arraylike(label) and
|
||||
not isinstance(label, str)):
|
||||
# ensure the outlier lable for each output is a scalar.
|
||||
raise TypeError("The outlier_label of classes {} is "
|
||||
"supposed to be a scalar, got "
|
||||
"{}.".format(classes, label))
|
||||
if np.append(classes, label).dtype != classes.dtype:
|
||||
# ensure the dtype of outlier label is consistent with y.
|
||||
raise TypeError("The dtype of outlier_label {} is "
|
||||
"inconsistent with classes {} in "
|
||||
"y.".format(label, classes))
|
||||
|
||||
self.outlier_label_ = outlier_label_
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the class labels for the provided data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_queries, n_features), \
|
||||
or (n_queries, n_indexed) if metric == 'precomputed'
|
||||
Test samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
|
||||
Class labels for each data sample.
|
||||
"""
|
||||
|
||||
probs = self.predict_proba(X)
|
||||
classes_ = self.classes_
|
||||
|
||||
if not self.outputs_2d_:
|
||||
probs = [probs]
|
||||
classes_ = [self.classes_]
|
||||
|
||||
n_outputs = len(classes_)
|
||||
n_queries = probs[0].shape[0]
|
||||
y_pred = np.empty((n_queries, n_outputs), dtype=classes_[0].dtype)
|
||||
|
||||
for k, prob in enumerate(probs):
|
||||
# iterate over multi-output, assign labels based on probabilities
|
||||
# of each output.
|
||||
max_prob_index = prob.argmax(axis=1)
|
||||
y_pred[:, k] = classes_[k].take(max_prob_index)
|
||||
|
||||
outlier_zero_probs = (prob == 0).all(axis=1)
|
||||
if outlier_zero_probs.any():
|
||||
zero_prob_index = np.flatnonzero(outlier_zero_probs)
|
||||
y_pred[zero_prob_index, k] = self.outlier_label_[k]
|
||||
|
||||
if not self.outputs_2d_:
|
||||
y_pred = y_pred.ravel()
|
||||
|
||||
return y_pred
|
||||
|
||||
def predict_proba(self, X):
|
||||
"""Return probability estimates for the test data X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_queries, n_features), \
|
||||
or (n_queries, n_indexed) if metric == 'precomputed'
|
||||
Test samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
p : ndarray of shape (n_queries, n_classes), or a list of n_outputs
|
||||
of such arrays if n_outputs > 1.
|
||||
The class probabilities of the input samples. Classes are ordered
|
||||
by lexicographic order.
|
||||
"""
|
||||
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
n_queries = _num_samples(X)
|
||||
|
||||
neigh_dist, neigh_ind = self.radius_neighbors(X)
|
||||
outlier_mask = np.zeros(n_queries, dtype=np.bool)
|
||||
outlier_mask[:] = [len(nind) == 0 for nind in neigh_ind]
|
||||
outliers = np.flatnonzero(outlier_mask)
|
||||
inliers = np.flatnonzero(~outlier_mask)
|
||||
|
||||
classes_ = self.classes_
|
||||
_y = self._y
|
||||
if not self.outputs_2d_:
|
||||
_y = self._y.reshape((-1, 1))
|
||||
classes_ = [self.classes_]
|
||||
|
||||
if self.outlier_label_ is None and outliers.size > 0:
|
||||
raise ValueError('No neighbors found for test samples %r, '
|
||||
'you can try using larger radius, '
|
||||
'giving a label for outliers, '
|
||||
'or considering removing them from your dataset.'
|
||||
% outliers)
|
||||
|
||||
weights = _get_weights(neigh_dist, self.weights)
|
||||
if weights is not None:
|
||||
weights = weights[inliers]
|
||||
|
||||
probabilities = []
|
||||
# iterate over multi-output, measure probabilities of the k-th output.
|
||||
for k, classes_k in enumerate(classes_):
|
||||
pred_labels = np.zeros(len(neigh_ind), dtype=object)
|
||||
pred_labels[:] = [_y[ind, k] for ind in neigh_ind]
|
||||
|
||||
proba_k = np.zeros((n_queries, classes_k.size))
|
||||
proba_inl = np.zeros((len(inliers), classes_k.size))
|
||||
|
||||
# samples have different size of neighbors within the same radius
|
||||
if weights is None:
|
||||
for i, idx in enumerate(pred_labels[inliers]):
|
||||
proba_inl[i, :] = np.bincount(idx,
|
||||
minlength=classes_k.size)
|
||||
else:
|
||||
for i, idx in enumerate(pred_labels[inliers]):
|
||||
proba_inl[i, :] = np.bincount(idx,
|
||||
weights[i],
|
||||
minlength=classes_k.size)
|
||||
proba_k[inliers, :] = proba_inl
|
||||
|
||||
if outliers.size > 0:
|
||||
_outlier_label = self.outlier_label_[k]
|
||||
label_index = np.flatnonzero(classes_k == _outlier_label)
|
||||
if label_index.size == 1:
|
||||
proba_k[outliers, label_index[0]] = 1.0
|
||||
else:
|
||||
warnings.warn('Outlier label {} is not in training '
|
||||
'classes. All class probabilities of '
|
||||
'outliers will be assigned with 0.'
|
||||
''.format(self.outlier_label_[k]))
|
||||
|
||||
# normalize 'votes' into real [0,1] probabilities
|
||||
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
|
||||
normalizer[normalizer == 0.0] = 1.0
|
||||
proba_k /= normalizer
|
||||
|
||||
probabilities.append(proba_k)
|
||||
|
||||
if not self.outputs_2d_:
|
||||
probabilities = probabilities[0]
|
||||
|
||||
return probabilities
|
Binary file not shown.
77
venv/Lib/site-packages/sklearn/neighbors/_dist_metrics.pxd
Normal file
77
venv/Lib/site-packages/sklearn/neighbors/_dist_metrics.pxd
Normal file
|
@ -0,0 +1,77 @@
|
|||
#!python
|
||||
#cython: boundscheck=False
|
||||
#cython: wraparound=False
|
||||
#cython: cdivision=True
|
||||
|
||||
cimport cython
|
||||
cimport numpy as np
|
||||
from libc.math cimport fabs, sqrt, exp, cos, pow
|
||||
|
||||
from ._typedefs cimport DTYPE_t, ITYPE_t, DITYPE_t
|
||||
from ._typedefs import DTYPE, ITYPE
|
||||
|
||||
######################################################################
|
||||
# Inline distance functions
|
||||
#
|
||||
# We use these for the default (euclidean) case so that they can be
|
||||
# inlined. This leads to faster computation for the most common case
|
||||
cdef inline DTYPE_t euclidean_dist(DTYPE_t* x1, DTYPE_t* x2,
|
||||
ITYPE_t size) nogil except -1:
|
||||
cdef DTYPE_t tmp, d=0
|
||||
cdef np.intp_t j
|
||||
for j in range(size):
|
||||
tmp = x1[j] - x2[j]
|
||||
d += tmp * tmp
|
||||
return sqrt(d)
|
||||
|
||||
|
||||
cdef inline DTYPE_t euclidean_rdist(DTYPE_t* x1, DTYPE_t* x2,
|
||||
ITYPE_t size) nogil except -1:
|
||||
cdef DTYPE_t tmp, d=0
|
||||
cdef np.intp_t j
|
||||
for j in range(size):
|
||||
tmp = x1[j] - x2[j]
|
||||
d += tmp * tmp
|
||||
return d
|
||||
|
||||
|
||||
cdef inline DTYPE_t euclidean_dist_to_rdist(DTYPE_t dist) nogil except -1:
|
||||
return dist * dist
|
||||
|
||||
|
||||
cdef inline DTYPE_t euclidean_rdist_to_dist(DTYPE_t dist) nogil except -1:
|
||||
return sqrt(dist)
|
||||
|
||||
|
||||
######################################################################
|
||||
# DistanceMetric base class
|
||||
cdef class DistanceMetric:
|
||||
# The following attributes are required for a few of the subclasses.
|
||||
# we must define them here so that cython's limited polymorphism will work.
|
||||
# Because we don't expect to instantiate a lot of these objects, the
|
||||
# extra memory overhead of this setup should not be an issue.
|
||||
cdef DTYPE_t p
|
||||
#cdef DTYPE_t[::1] vec
|
||||
#cdef DTYPE_t[:, ::1] mat
|
||||
cdef np.ndarray vec
|
||||
cdef np.ndarray mat
|
||||
cdef DTYPE_t* vec_ptr
|
||||
cdef DTYPE_t* mat_ptr
|
||||
cdef ITYPE_t size
|
||||
cdef object func
|
||||
cdef object kwargs
|
||||
|
||||
cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
|
||||
ITYPE_t size) nogil except -1
|
||||
|
||||
cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
|
||||
ITYPE_t size) nogil except -1
|
||||
|
||||
cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1
|
||||
|
||||
cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
|
||||
DTYPE_t[:, ::1] D) except -1
|
||||
|
||||
cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) nogil except -1
|
||||
|
||||
cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1
|
480
venv/Lib/site-packages/sklearn/neighbors/_graph.py
Normal file
480
venv/Lib/site-packages/sklearn/neighbors/_graph.py
Normal file
|
@ -0,0 +1,480 @@
|
|||
"""Nearest Neighbors graph functions"""
|
||||
|
||||
# Author: Jake Vanderplas <vanderplas@astro.washington.edu>
|
||||
# Tom Dupre la Tour
|
||||
#
|
||||
# License: BSD 3 clause (C) INRIA, University of Amsterdam
|
||||
from ._base import KNeighborsMixin, RadiusNeighborsMixin
|
||||
from ._base import NeighborsBase
|
||||
from ._base import UnsupervisedMixin
|
||||
from ._unsupervised import NearestNeighbors
|
||||
from ..base import TransformerMixin
|
||||
from ..utils.validation import check_is_fitted, _deprecate_positional_args
|
||||
|
||||
|
||||
def _check_params(X, metric, p, metric_params):
|
||||
"""Check the validity of the input parameters"""
|
||||
params = zip(['metric', 'p', 'metric_params'],
|
||||
[metric, p, metric_params])
|
||||
est_params = X.get_params()
|
||||
for param_name, func_param in params:
|
||||
if func_param != est_params[param_name]:
|
||||
raise ValueError(
|
||||
"Got %s for %s, while the estimator has %s for "
|
||||
"the same parameter." % (
|
||||
func_param, param_name, est_params[param_name]))
|
||||
|
||||
|
||||
def _query_include_self(X, include_self, mode):
|
||||
"""Return the query based on include_self param"""
|
||||
if include_self == 'auto':
|
||||
include_self = mode == 'connectivity'
|
||||
|
||||
# it does not include each sample as its own neighbors
|
||||
if not include_self:
|
||||
X = None
|
||||
|
||||
return X
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def kneighbors_graph(X, n_neighbors, *, mode='connectivity',
|
||||
metric='minkowski', p=2, metric_params=None,
|
||||
include_self=False, n_jobs=None):
|
||||
"""Computes the (weighted) graph of k-Neighbors for points in X
|
||||
|
||||
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or BallTree
|
||||
Sample data, in the form of a numpy array or a precomputed
|
||||
:class:`BallTree`.
|
||||
|
||||
n_neighbors : int
|
||||
Number of neighbors for each sample.
|
||||
|
||||
mode : {'connectivity', 'distance'}, default='connectivity'
|
||||
Type of returned matrix: 'connectivity' will return the connectivity
|
||||
matrix with ones and zeros, and 'distance' will return the distances
|
||||
between neighbors according to the given metric.
|
||||
|
||||
metric : str, default='minkowski'
|
||||
The distance metric used to calculate the k-Neighbors for each sample
|
||||
point. The DistanceMetric class gives a list of available metrics.
|
||||
The default distance is 'euclidean' ('minkowski' metric with the p
|
||||
param equal to 2.)
|
||||
|
||||
p : int, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
additional keyword arguments for the metric function.
|
||||
|
||||
include_self : bool or 'auto', default=False
|
||||
Whether or not to mark each sample as the first nearest neighbor to
|
||||
itself. If 'auto', then True is used for mode='connectivity' and False
|
||||
for mode='distance'.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A : sparse matrix of shape (n_samples, n_samples)
|
||||
Graph where A[i, j] is assigned the weight of edge that
|
||||
connects i to j. The matrix is of CSR format.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [3], [1]]
|
||||
>>> from sklearn.neighbors import kneighbors_graph
|
||||
>>> A = kneighbors_graph(X, 2, mode='connectivity', include_self=True)
|
||||
>>> A.toarray()
|
||||
array([[1., 0., 1.],
|
||||
[0., 1., 1.],
|
||||
[1., 0., 1.]])
|
||||
|
||||
See also
|
||||
--------
|
||||
radius_neighbors_graph
|
||||
"""
|
||||
if not isinstance(X, KNeighborsMixin):
|
||||
X = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs).fit(X)
|
||||
else:
|
||||
_check_params(X, metric, p, metric_params)
|
||||
|
||||
query = _query_include_self(X._fit_X, include_self, mode)
|
||||
return X.kneighbors_graph(X=query, n_neighbors=n_neighbors, mode=mode)
|
||||
|
||||
|
||||
@_deprecate_positional_args
|
||||
def radius_neighbors_graph(X, radius, *, mode='connectivity',
|
||||
metric='minkowski', p=2, metric_params=None,
|
||||
include_self=False, n_jobs=None):
|
||||
"""Computes the (weighted) graph of Neighbors for points in X
|
||||
|
||||
Neighborhoods are restricted the points at a distance lower than
|
||||
radius.
|
||||
|
||||
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features) or BallTree
|
||||
Sample data, in the form of a numpy array or a precomputed
|
||||
:class:`BallTree`.
|
||||
|
||||
radius : float
|
||||
Radius of neighborhoods.
|
||||
|
||||
mode : {'connectivity', 'distance'}, default='connectivity'
|
||||
Type of returned matrix: 'connectivity' will return the connectivity
|
||||
matrix with ones and zeros, and 'distance' will return the distances
|
||||
between neighbors according to the given metric.
|
||||
|
||||
metric : str, default='minkowski'
|
||||
The distance metric used to calculate the neighbors within a
|
||||
given radius for each sample point. The DistanceMetric class
|
||||
gives a list of available metrics. The default distance is
|
||||
'euclidean' ('minkowski' metric with the param equal to 2.)
|
||||
|
||||
p : int, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
additional keyword arguments for the metric function.
|
||||
|
||||
include_self : bool or 'auto', default=False
|
||||
Whether or not to mark each sample as the first nearest neighbor to
|
||||
itself. If 'auto', then True is used for mode='connectivity' and False
|
||||
for mode='distance'.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Returns
|
||||
-------
|
||||
A : sparse matrix of shape (n_samples, n_samples)
|
||||
Graph where A[i, j] is assigned the weight of edge that connects
|
||||
i to j. The matrix is of CSR format.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [3], [1]]
|
||||
>>> from sklearn.neighbors import radius_neighbors_graph
|
||||
>>> A = radius_neighbors_graph(X, 1.5, mode='connectivity',
|
||||
... include_self=True)
|
||||
>>> A.toarray()
|
||||
array([[1., 0., 1.],
|
||||
[0., 1., 0.],
|
||||
[1., 0., 1.]])
|
||||
|
||||
See also
|
||||
--------
|
||||
kneighbors_graph
|
||||
"""
|
||||
if not isinstance(X, RadiusNeighborsMixin):
|
||||
X = NearestNeighbors(radius=radius, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs).fit(X)
|
||||
else:
|
||||
_check_params(X, metric, p, metric_params)
|
||||
|
||||
query = _query_include_self(X._fit_X, include_self, mode)
|
||||
return X.radius_neighbors_graph(query, radius, mode)
|
||||
|
||||
|
||||
class KNeighborsTransformer(KNeighborsMixin, UnsupervisedMixin,
|
||||
TransformerMixin, NeighborsBase):
|
||||
"""Transform X into a (weighted) graph of k nearest neighbors
|
||||
|
||||
The transformed data is a sparse graph as returned by kneighbors_graph.
|
||||
|
||||
Read more in the :ref:`User Guide <neighbors_transformer>`.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mode : {'distance', 'connectivity'}, default='distance'
|
||||
Type of returned matrix: 'connectivity' will return the connectivity
|
||||
matrix with ones and zeros, and 'distance' will return the distances
|
||||
between neighbors according to the given metric.
|
||||
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors for each sample in the transformed sparse graph.
|
||||
For compatibility reasons, as each sample is considered as its own
|
||||
neighbor, one extra neighbor will be computed when mode == 'distance'.
|
||||
In this case, the sparse graph contains (n_neighbors + 1) neighbors.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
metric to use for distance computation. Any metric from scikit-learn
|
||||
or scipy.spatial.distance can be used.
|
||||
|
||||
If metric is a callable function, it is called on each
|
||||
pair of instances (rows) and the resulting value recorded. The callable
|
||||
should take two arrays as input and return one value indicating the
|
||||
distance between them. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
Distance matrices are not supported.
|
||||
|
||||
Valid values for metric are:
|
||||
|
||||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
||||
'manhattan']
|
||||
|
||||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
||||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
||||
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
||||
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
||||
'yule']
|
||||
|
||||
See the documentation for scipy.spatial.distance for details on these
|
||||
metrics.
|
||||
|
||||
p : int, default=2
|
||||
Parameter for the Minkowski metric from
|
||||
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=1
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
If ``-1``, then the number of jobs is set to the number of CPU cores.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.manifold import Isomap
|
||||
>>> from sklearn.neighbors import KNeighborsTransformer
|
||||
>>> from sklearn.pipeline import make_pipeline
|
||||
>>> estimator = make_pipeline(
|
||||
... KNeighborsTransformer(n_neighbors=5, mode='distance'),
|
||||
... Isomap(neighbors_algorithm='precomputed'))
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, mode='distance', n_neighbors=5, algorithm='auto',
|
||||
leaf_size=30, metric='minkowski', p=2, metric_params=None,
|
||||
n_jobs=1):
|
||||
super(KNeighborsTransformer, self).__init__(
|
||||
n_neighbors=n_neighbors, radius=None, algorithm=algorithm,
|
||||
leaf_size=leaf_size, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs)
|
||||
self.mode = mode
|
||||
|
||||
def transform(self, X):
|
||||
"""Computes the (weighted) graph of Neighbors for points in X
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples_transform, n_features)
|
||||
Sample data.
|
||||
|
||||
Returns
|
||||
-------
|
||||
Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)
|
||||
Xt[i, j] is assigned the weight of edge that connects i to j.
|
||||
Only the neighbors have an explicit value.
|
||||
The diagonal is always explicit.
|
||||
The matrix is of CSR format.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
add_one = self.mode == 'distance'
|
||||
return self.kneighbors_graph(X, mode=self.mode,
|
||||
n_neighbors=self.n_neighbors + add_one)
|
||||
|
||||
def fit_transform(self, X, y=None):
|
||||
"""Fit to data, then transform it.
|
||||
|
||||
Fits transformer to X and y with optional parameters fit_params
|
||||
and returns a transformed version of X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training set.
|
||||
|
||||
y : ignored
|
||||
|
||||
Returns
|
||||
-------
|
||||
Xt : sparse matrix of shape (n_samples, n_samples)
|
||||
Xt[i, j] is assigned the weight of edge that connects i to j.
|
||||
Only the neighbors have an explicit value.
|
||||
The diagonal is always explicit.
|
||||
The matrix is of CSR format.
|
||||
"""
|
||||
return self.fit(X).transform(X)
|
||||
|
||||
|
||||
class RadiusNeighborsTransformer(RadiusNeighborsMixin, UnsupervisedMixin,
|
||||
TransformerMixin, NeighborsBase):
|
||||
"""Transform X into a (weighted) graph of neighbors nearer than a radius
|
||||
|
||||
The transformed data is a sparse graph as returned by
|
||||
radius_neighbors_graph.
|
||||
|
||||
Read more in the :ref:`User Guide <neighbors_transformer>`.
|
||||
|
||||
.. versionadded:: 0.22
|
||||
|
||||
Parameters
|
||||
----------
|
||||
mode : {'distance', 'connectivity'}, default='distance'
|
||||
Type of returned matrix: 'connectivity' will return the connectivity
|
||||
matrix with ones and zeros, and 'distance' will return the distances
|
||||
between neighbors according to the given metric.
|
||||
|
||||
radius : float, default=1.
|
||||
Radius of neighborhood in the transformed sparse graph.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
metric to use for distance computation. Any metric from scikit-learn
|
||||
or scipy.spatial.distance can be used.
|
||||
|
||||
If metric is a callable function, it is called on each
|
||||
pair of instances (rows) and the resulting value recorded. The callable
|
||||
should take two arrays as input and return one value indicating the
|
||||
distance between them. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
Distance matrices are not supported.
|
||||
|
||||
Valid values for metric are:
|
||||
|
||||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
||||
'manhattan']
|
||||
|
||||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
||||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
||||
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
||||
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
||||
'yule']
|
||||
|
||||
See the documentation for scipy.spatial.distance for details on these
|
||||
metrics.
|
||||
|
||||
p : int, default=2
|
||||
Parameter for the Minkowski metric from
|
||||
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=1
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
If ``-1``, then the number of jobs is set to the number of CPU cores.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.cluster import DBSCAN
|
||||
>>> from sklearn.neighbors import RadiusNeighborsTransformer
|
||||
>>> from sklearn.pipeline import make_pipeline
|
||||
>>> estimator = make_pipeline(
|
||||
... RadiusNeighborsTransformer(radius=42.0, mode='distance'),
|
||||
... DBSCAN(min_samples=30, metric='precomputed'))
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, mode='distance', radius=1., algorithm='auto',
|
||||
leaf_size=30, metric='minkowski', p=2, metric_params=None,
|
||||
n_jobs=1):
|
||||
super(RadiusNeighborsTransformer, self).__init__(
|
||||
n_neighbors=None, radius=radius, algorithm=algorithm,
|
||||
leaf_size=leaf_size, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs)
|
||||
self.mode = mode
|
||||
|
||||
def transform(self, X):
|
||||
"""Computes the (weighted) graph of Neighbors for points in X
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples_transform, n_features)
|
||||
Sample data
|
||||
|
||||
Returns
|
||||
-------
|
||||
Xt : sparse matrix of shape (n_samples_transform, n_samples_fit)
|
||||
Xt[i, j] is assigned the weight of edge that connects i to j.
|
||||
Only the neighbors have an explicit value.
|
||||
The diagonal is always explicit.
|
||||
The matrix is of CSR format.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
return self.radius_neighbors_graph(X, mode=self.mode,
|
||||
sort_results=True)
|
||||
|
||||
def fit_transform(self, X, y=None):
|
||||
"""Fit to data, then transform it.
|
||||
|
||||
Fits transformer to X and y with optional parameters fit_params
|
||||
and returns a transformed version of X.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Training set.
|
||||
|
||||
y : ignored
|
||||
|
||||
Returns
|
||||
-------
|
||||
Xt : sparse matrix of shape (n_samples, n_samples)
|
||||
Xt[i, j] is assigned the weight of edge that connects i to j.
|
||||
Only the neighbors have an explicit value.
|
||||
The diagonal is always explicit.
|
||||
The matrix is of CSR format.
|
||||
"""
|
||||
return self.fit(X).transform(X)
|
BIN
venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp36-win32.pyd
Normal file
BIN
venv/Lib/site-packages/sklearn/neighbors/_kd_tree.cp36-win32.pyd
Normal file
Binary file not shown.
276
venv/Lib/site-packages/sklearn/neighbors/_kde.py
Normal file
276
venv/Lib/site-packages/sklearn/neighbors/_kde.py
Normal file
|
@ -0,0 +1,276 @@
|
|||
"""
|
||||
Kernel Density Estimation
|
||||
-------------------------
|
||||
"""
|
||||
# Author: Jake Vanderplas <jakevdp@cs.washington.edu>
|
||||
|
||||
import numpy as np
|
||||
from scipy.special import gammainc
|
||||
from ..base import BaseEstimator
|
||||
from ..utils import check_array, check_random_state
|
||||
from ..utils.validation import _check_sample_weight, check_is_fitted
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
from ..utils.extmath import row_norms
|
||||
from ._ball_tree import BallTree, DTYPE
|
||||
from ._kd_tree import KDTree
|
||||
|
||||
|
||||
VALID_KERNELS = ['gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
|
||||
'cosine']
|
||||
TREE_DICT = {'ball_tree': BallTree, 'kd_tree': KDTree}
|
||||
|
||||
|
||||
# TODO: implement a brute force version for testing purposes
|
||||
# TODO: bandwidth estimation
|
||||
# TODO: create a density estimation base class?
|
||||
class KernelDensity(BaseEstimator):
|
||||
"""Kernel Density Estimation.
|
||||
|
||||
Read more in the :ref:`User Guide <kernel_density>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
bandwidth : float
|
||||
The bandwidth of the kernel.
|
||||
|
||||
algorithm : str
|
||||
The tree algorithm to use. Valid options are
|
||||
['kd_tree'|'ball_tree'|'auto']. Default is 'auto'.
|
||||
|
||||
kernel : str
|
||||
The kernel to use. Valid kernels are
|
||||
['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']
|
||||
Default is 'gaussian'.
|
||||
|
||||
metric : str
|
||||
The distance metric to use. Note that not all metrics are
|
||||
valid with all algorithms. Refer to the documentation of
|
||||
:class:`BallTree` and :class:`KDTree` for a description of
|
||||
available algorithms. Note that the normalization of the density
|
||||
output is correct only for the Euclidean distance metric. Default
|
||||
is 'euclidean'.
|
||||
|
||||
atol : float
|
||||
The desired absolute tolerance of the result. A larger tolerance will
|
||||
generally lead to faster execution. Default is 0.
|
||||
|
||||
rtol : float
|
||||
The desired relative tolerance of the result. A larger tolerance will
|
||||
generally lead to faster execution. Default is 1E-8.
|
||||
|
||||
breadth_first : bool
|
||||
If true (default), use a breadth-first approach to the problem.
|
||||
Otherwise use a depth-first approach.
|
||||
|
||||
leaf_size : int
|
||||
Specify the leaf size of the underlying tree. See :class:`BallTree`
|
||||
or :class:`KDTree` for details. Default is 40.
|
||||
|
||||
metric_params : dict
|
||||
Additional parameters to be passed to the tree for use with the
|
||||
metric. For more information, see the documentation of
|
||||
:class:`BallTree` or :class:`KDTree`.
|
||||
|
||||
See Also
|
||||
--------
|
||||
sklearn.neighbors.KDTree : K-dimensional tree for fast generalized N-point
|
||||
problems.
|
||||
sklearn.neighbors.BallTree : Ball tree for fast generalized N-point
|
||||
problems.
|
||||
|
||||
Examples
|
||||
--------
|
||||
Compute a gaussian kernel density estimate with a fixed bandwidth.
|
||||
>>> import numpy as np
|
||||
>>> rng = np.random.RandomState(42)
|
||||
>>> X = rng.random_sample((100, 3))
|
||||
>>> kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
|
||||
>>> log_density = kde.score_samples(X[:3])
|
||||
>>> log_density
|
||||
array([-1.52955942, -1.51462041, -1.60244657])
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, bandwidth=1.0, algorithm='auto',
|
||||
kernel='gaussian', metric="euclidean", atol=0, rtol=0,
|
||||
breadth_first=True, leaf_size=40, metric_params=None):
|
||||
self.algorithm = algorithm
|
||||
self.bandwidth = bandwidth
|
||||
self.kernel = kernel
|
||||
self.metric = metric
|
||||
self.atol = atol
|
||||
self.rtol = rtol
|
||||
self.breadth_first = breadth_first
|
||||
self.leaf_size = leaf_size
|
||||
self.metric_params = metric_params
|
||||
|
||||
# run the choose algorithm code so that exceptions will happen here
|
||||
# we're using clone() in the GenerativeBayes classifier,
|
||||
# so we can't do this kind of logic in __init__
|
||||
self._choose_algorithm(self.algorithm, self.metric)
|
||||
|
||||
if bandwidth <= 0:
|
||||
raise ValueError("bandwidth must be positive")
|
||||
if kernel not in VALID_KERNELS:
|
||||
raise ValueError("invalid kernel: '{0}'".format(kernel))
|
||||
|
||||
def _choose_algorithm(self, algorithm, metric):
|
||||
# given the algorithm string + metric string, choose the optimal
|
||||
# algorithm to compute the result.
|
||||
if algorithm == 'auto':
|
||||
# use KD Tree if possible
|
||||
if metric in KDTree.valid_metrics:
|
||||
return 'kd_tree'
|
||||
elif metric in BallTree.valid_metrics:
|
||||
return 'ball_tree'
|
||||
else:
|
||||
raise ValueError("invalid metric: '{0}'".format(metric))
|
||||
elif algorithm in TREE_DICT:
|
||||
if metric not in TREE_DICT[algorithm].valid_metrics:
|
||||
raise ValueError("invalid metric for {0}: "
|
||||
"'{1}'".format(TREE_DICT[algorithm],
|
||||
metric))
|
||||
return algorithm
|
||||
else:
|
||||
raise ValueError("invalid algorithm: '{0}'".format(algorithm))
|
||||
|
||||
def fit(self, X, y=None, sample_weight=None):
|
||||
"""Fit the Kernel Density model on the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array_like, shape (n_samples, n_features)
|
||||
List of n_features-dimensional data points. Each row
|
||||
corresponds to a single data point.
|
||||
y : None
|
||||
Ignored. This parameter exists only for compatibility with
|
||||
:class:`sklearn.pipeline.Pipeline`.
|
||||
sample_weight : array_like, shape (n_samples,), optional
|
||||
List of sample weights attached to the data X.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
Returns instance of object.
|
||||
"""
|
||||
algorithm = self._choose_algorithm(self.algorithm, self.metric)
|
||||
X = self._validate_data(X, order='C', dtype=DTYPE)
|
||||
|
||||
if sample_weight is not None:
|
||||
sample_weight = _check_sample_weight(sample_weight, X, DTYPE)
|
||||
if sample_weight.min() <= 0:
|
||||
raise ValueError("sample_weight must have positive values")
|
||||
|
||||
kwargs = self.metric_params
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
self.tree_ = TREE_DICT[algorithm](X, metric=self.metric,
|
||||
leaf_size=self.leaf_size,
|
||||
sample_weight=sample_weight,
|
||||
**kwargs)
|
||||
return self
|
||||
|
||||
def score_samples(self, X):
|
||||
"""Evaluate the log density model on the data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array_like, shape (n_samples, n_features)
|
||||
An array of points to query. Last dimension should match dimension
|
||||
of training data (n_features).
|
||||
|
||||
Returns
|
||||
-------
|
||||
density : ndarray, shape (n_samples,)
|
||||
The array of log(density) evaluations. These are normalized to be
|
||||
probability densities, so values will be low for high-dimensional
|
||||
data.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
# The returned density is normalized to the number of points.
|
||||
# For it to be a probability, we must scale it. For this reason
|
||||
# we'll also scale atol.
|
||||
X = check_array(X, order='C', dtype=DTYPE)
|
||||
if self.tree_.sample_weight is None:
|
||||
N = self.tree_.data.shape[0]
|
||||
else:
|
||||
N = self.tree_.sum_weight
|
||||
atol_N = self.atol * N
|
||||
log_density = self.tree_.kernel_density(
|
||||
X, h=self.bandwidth, kernel=self.kernel, atol=atol_N,
|
||||
rtol=self.rtol, breadth_first=self.breadth_first, return_log=True)
|
||||
log_density -= np.log(N)
|
||||
return log_density
|
||||
|
||||
def score(self, X, y=None):
|
||||
"""Compute the total log probability density under the model.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array_like, shape (n_samples, n_features)
|
||||
List of n_features-dimensional data points. Each row
|
||||
corresponds to a single data point.
|
||||
y : None
|
||||
Ignored. This parameter exists only for compatibility with
|
||||
:class:`sklearn.pipeline.Pipeline`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
logprob : float
|
||||
Total log-likelihood of the data in X. This is normalized to be a
|
||||
probability density, so the value will be low for high-dimensional
|
||||
data.
|
||||
"""
|
||||
return np.sum(self.score_samples(X))
|
||||
|
||||
def sample(self, n_samples=1, random_state=None):
|
||||
"""Generate random samples from the model.
|
||||
|
||||
Currently, this is implemented only for gaussian and tophat kernels.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_samples : int, optional
|
||||
Number of samples to generate. Defaults to 1.
|
||||
|
||||
random_state : int, RandomState instance, default=None
|
||||
Determines random number generation used to generate
|
||||
random samples. Pass an int for reproducible results
|
||||
across multiple function calls.
|
||||
See :term: `Glossary <random_state>`.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X : array_like, shape (n_samples, n_features)
|
||||
List of samples.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
# TODO: implement sampling for other valid kernel shapes
|
||||
if self.kernel not in ['gaussian', 'tophat']:
|
||||
raise NotImplementedError()
|
||||
|
||||
data = np.asarray(self.tree_.data)
|
||||
|
||||
rng = check_random_state(random_state)
|
||||
u = rng.uniform(0, 1, size=n_samples)
|
||||
if self.tree_.sample_weight is None:
|
||||
i = (u * data.shape[0]).astype(np.int64)
|
||||
else:
|
||||
cumsum_weight = np.cumsum(np.asarray(self.tree_.sample_weight))
|
||||
sum_weight = cumsum_weight[-1]
|
||||
i = np.searchsorted(cumsum_weight, u * sum_weight)
|
||||
if self.kernel == 'gaussian':
|
||||
return np.atleast_2d(rng.normal(data[i], self.bandwidth))
|
||||
|
||||
elif self.kernel == 'tophat':
|
||||
# we first draw points from a d-dimensional normal distribution,
|
||||
# then use an incomplete gamma function to map them to a uniform
|
||||
# d-dimensional tophat distribution.
|
||||
dim = data.shape[1]
|
||||
X = rng.normal(size=(n_samples, dim))
|
||||
s_sq = row_norms(X, squared=True)
|
||||
correction = (gammainc(0.5 * dim, 0.5 * s_sq) ** (1. / dim)
|
||||
* self.bandwidth / np.sqrt(s_sq))
|
||||
return data[i] + X * correction[:, np.newaxis]
|
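# --- Illustrative usage (not part of the library source) ---
# A minimal sketch of the estimator whose methods are defined above, assuming
# the public ``sklearn.neighbors.KernelDensity`` entry point; the bandwidth and
# data values are arbitrary.
import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X = rng.normal(size=(100, 2))
kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X)
log_dens = kde.score_samples(X[:5])          # per-sample log density
total_log_prob = kde.score(X)                # sum of the log densities
new_points = kde.sample(10, random_state=0)  # gaussian and tophat kernels only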
517
venv/Lib/site-packages/sklearn/neighbors/_lof.py
Normal file
|
@@ -0,0 +1,517 @@
|
|||
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import numpy as np
|
||||
import warnings
|
||||
|
||||
from ._base import NeighborsBase
|
||||
from ._base import KNeighborsMixin
|
||||
from ._base import UnsupervisedMixin
|
||||
from ..base import OutlierMixin
|
||||
|
||||
from ..utils.validation import check_is_fitted
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
from ..utils import check_array
|
||||
|
||||
__all__ = ["LocalOutlierFactor"]
|
||||
|
||||
|
||||
class LocalOutlierFactor(KNeighborsMixin, UnsupervisedMixin,
|
||||
OutlierMixin, NeighborsBase):
|
||||
"""Unsupervised Outlier Detection using Local Outlier Factor (LOF)
|
||||
|
||||
The anomaly score of each sample is called Local Outlier Factor.
|
||||
It measures the local deviation of density of a given sample with
|
||||
respect to its neighbors.
|
||||
It is local in that the anomaly score depends on how isolated the object
|
||||
is with respect to the surrounding neighborhood.
|
||||
More precisely, locality is given by k-nearest neighbors, whose distance
|
||||
is used to estimate the local density.
|
||||
By comparing the local density of a sample to the local densities of
|
||||
its neighbors, one can identify samples that have a substantially lower
|
||||
density than their neighbors. These are considered outliers.
|
||||
|
||||
.. versionadded:: 0.19
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=20
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
If n_neighbors is larger than the number of samples provided,
|
||||
all samples will be used.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to :class:`BallTree` or :class:`KDTree`. This can
|
||||
affect the speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
metric used for the distance computation. Any metric from scikit-learn
|
||||
or scipy.spatial.distance can be used.
|
||||
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square. X may be a sparse matrix, in which case only "nonzero"
|
||||
elements may be considered neighbors.
|
||||
|
||||
If metric is a callable function, it is called on each
|
||||
pair of instances (rows) and the resulting value recorded. The callable
|
||||
should take two arrays as input and return one value indicating the
|
||||
distance between them. This works for Scipy's metrics, but is less
|
||||
efficient than passing the metric name as a string.
|
||||
|
||||
Valid values for metric are:
|
||||
|
||||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
|
||||
'manhattan']
|
||||
|
||||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
|
||||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
|
||||
'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao',
|
||||
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean',
|
||||
'yule']
|
||||
|
||||
See the documentation for scipy.spatial.distance for details on these
|
||||
metrics:
|
||||
https://docs.scipy.org/doc/scipy/reference/spatial.distance.html
|
||||
|
||||
p : int, default=2
|
||||
Parameter for the Minkowski metric from
|
||||
:func:`sklearn.metrics.pairwise.pairwise_distances`. When p = 1, this
|
||||
is equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
contamination : 'auto' or float, default='auto'
|
||||
The amount of contamination of the data set, i.e. the proportion
|
||||
of outliers in the data set. When fitting this is used to define the
|
||||
threshold on the scores of the samples.
|
||||
|
||||
- if 'auto', the threshold is determined as in the
|
||||
original paper,
|
||||
- if a float, the contamination should be in the range [0, 0.5].
|
||||
|
||||
.. versionchanged:: 0.22
|
||||
The default value of ``contamination`` changed from 0.1
|
||||
to ``'auto'``.
|
||||
|
||||
novelty : bool, default=False
|
||||
By default, LocalOutlierFactor is only meant to be used for outlier
|
||||
detection (novelty=False). Set novelty to True if you want to use
|
||||
LocalOutlierFactor for novelty detection. In this case be aware that
|
||||
you should only use predict, decision_function and score_samples
|
||||
on new unseen data and not on the training set.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
negative_outlier_factor_ : ndarray of shape (n_samples,)
|
||||
The opposite LOF of the training samples. The higher, the more normal.
|
||||
Inliers tend to have a LOF score close to 1
|
||||
(``negative_outlier_factor_`` close to -1), while outliers tend to have
|
||||
a larger LOF score.
|
||||
|
||||
The local outlier factor (LOF) of a sample captures its
|
||||
supposed 'degree of abnormality'.
|
||||
It is the average of the ratio of the local reachability density of
|
||||
a sample and those of its k-nearest neighbors.
|
||||
|
||||
n_neighbors_ : int
|
||||
The actual number of neighbors used for :meth:`kneighbors` queries.
|
||||
|
||||
offset_ : float
|
||||
Offset used to obtain binary labels from the raw scores.
|
||||
Observations having a negative_outlier_factor smaller than `offset_`
|
||||
are detected as abnormal.
|
||||
The offset is set to -1.5 (inliers score around -1), except when a
|
||||
contamination parameter different from "auto" is provided. In that
|
||||
case, the offset is defined in such a way that we obtain the expected
|
||||
number of outliers in training.
|
||||
|
||||
.. versionadded:: 0.20
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.neighbors import LocalOutlierFactor
|
||||
>>> X = [[-1.1], [0.2], [101.1], [0.3]]
|
||||
>>> clf = LocalOutlierFactor(n_neighbors=2)
|
||||
>>> clf.fit_predict(X)
|
||||
array([ 1, 1, -1, 1])
|
||||
>>> clf.negative_outlier_factor_
|
||||
array([ -0.9821..., -1.0370..., -73.3697..., -0.9821...])
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] Breunig, M. M., Kriegel, H. P., Ng, R. T., & Sander, J. (2000, May).
|
||||
LOF: identifying density-based local outliers. In ACM SIGMOD Record.
|
||||
"""
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_neighbors=20, *, algorithm='auto', leaf_size=30,
|
||||
metric='minkowski', p=2, metric_params=None,
|
||||
contamination="auto", novelty=False, n_jobs=None):
|
||||
super().__init__(
|
||||
n_neighbors=n_neighbors,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs)
|
||||
self.contamination = contamination
|
||||
self.novelty = novelty
|
||||
|
||||
@property
|
||||
def fit_predict(self):
|
||||
"""Fits the model to the training set X and returns the labels.
|
||||
|
||||
**Only available for outlier detection (when novelty is set to False).**
|
||||
Label is 1 for an inlier and -1 for an outlier according to the LOF
|
||||
score and the contamination parameter.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features), default=None
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. to the training samples.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and 1 for inliers.
|
||||
"""
|
||||
|
||||
# As fit_predict would be different from fit.predict, fit_predict is
|
||||
# only available for outlier detection (novelty=False)
|
||||
|
||||
if self.novelty:
|
||||
msg = ('fit_predict is not available when novelty=True. Use '
|
||||
'novelty=False if you want to predict on the training set.')
|
||||
raise AttributeError(msg)
|
||||
|
||||
return self._fit_predict
|
||||
|
||||
def _fit_predict(self, X, y=None):
|
||||
"""Fits the model to the training set X and returns the labels.
|
||||
|
||||
Label is 1 for an inlier and -1 for an outlier according to the LOF
|
||||
score and the contamination parameter.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features), default=None
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. to the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and 1 for inliers.
|
||||
"""
|
||||
|
||||
# As fit_predict would be different from fit.predict, fit_predict is
|
||||
# only available for outlier detection (novelty=False)
|
||||
|
||||
return self.fit(X)._predict()
|
||||
|
||||
def fit(self, X, y=None):
|
||||
"""Fit the model using X as training data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : BallTree, KDTree or {array-like, sparse matrix} of shape \
|
||||
(n_samples, n_features) or (n_samples, n_samples)
|
||||
Training data. If array or matrix, the shape is (n_samples,
|
||||
n_features), or (n_samples, n_samples) if metric='precomputed'.
|
||||
|
||||
y : Ignored
|
||||
Not used, present for API consistency by convention.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
"""
|
||||
if self.contamination != 'auto':
|
||||
if not(0. < self.contamination <= .5):
|
||||
raise ValueError("contamination must be in (0, 0.5], "
|
||||
"got: %f" % self.contamination)
|
||||
|
||||
super().fit(X)
|
||||
|
||||
n_samples = self.n_samples_fit_
|
||||
if self.n_neighbors > n_samples:
|
||||
warnings.warn("n_neighbors (%s) is greater than the "
|
||||
"total number of samples (%s). n_neighbors "
|
||||
"will be set to (n_samples - 1) for estimation."
|
||||
% (self.n_neighbors, n_samples))
|
||||
self.n_neighbors_ = max(1, min(self.n_neighbors, n_samples - 1))
|
||||
|
||||
self._distances_fit_X_, _neighbors_indices_fit_X_ = self.kneighbors(
|
||||
n_neighbors=self.n_neighbors_)
|
||||
|
||||
self._lrd = self._local_reachability_density(
|
||||
self._distances_fit_X_, _neighbors_indices_fit_X_)
|
||||
|
||||
# Compute lof score over training samples to define offset_:
|
||||
lrd_ratios_array = (self._lrd[_neighbors_indices_fit_X_] /
|
||||
self._lrd[:, np.newaxis])
|
||||
|
||||
self.negative_outlier_factor_ = -np.mean(lrd_ratios_array, axis=1)
|
||||
|
||||
if self.contamination == "auto":
|
||||
# inliers score around -1 (the higher, the less abnormal).
|
||||
self.offset_ = -1.5
|
||||
else:
|
||||
self.offset_ = np.percentile(self.negative_outlier_factor_,
|
||||
100. * self.contamination)
|
||||
|
||||
return self
|
||||
|
||||
@property
|
||||
def predict(self):
|
||||
"""Predict the labels (1 inlier, -1 outlier) of X according to LOF.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
This method makes it possible to generalize prediction to *new observations* (not
|
||||
in the training set).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. to the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and +1 for inliers.
|
||||
"""
|
||||
if not self.novelty:
|
||||
msg = ('predict is not available when novelty=False, use '
|
||||
'fit_predict if you want to predict on training data. Use '
|
||||
'novelty=True if you want to use LOF for novelty detection '
|
||||
'and predict on new unseen data.')
|
||||
raise AttributeError(msg)
|
||||
|
||||
return self._predict
|
||||
|
||||
def _predict(self, X=None):
|
||||
"""Predict the labels (1 inlier, -1 outlier) of X according to LOF.
|
||||
|
||||
If X is None, returns the same as fit_predict(X_train).
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features), default=None
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. to the training samples. If None, makes prediction on the
|
||||
training data without considering them as their own neighbors.
|
||||
|
||||
Returns
|
||||
-------
|
||||
is_inlier : ndarray of shape (n_samples,)
|
||||
Returns -1 for anomalies/outliers and +1 for inliers.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
if X is not None:
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
is_inlier = np.ones(X.shape[0], dtype=int)
|
||||
is_inlier[self.decision_function(X) < 0] = -1
|
||||
else:
|
||||
is_inlier = np.ones(self.n_samples_fit_, dtype=int)
|
||||
is_inlier[self.negative_outlier_factor_ < self.offset_] = -1
|
||||
|
||||
return is_inlier
|
||||
|
||||
@property
|
||||
def decision_function(self):
|
||||
"""Shifted opposite of the Local Outlier Factor of X.
|
||||
|
||||
Bigger is better, i.e. large values correspond to inliers.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
The shift offset allows a zero threshold for being an outlier.
|
||||
The argument X is supposed to contain *new data*: if X contains a
|
||||
point from training, it considers the latter in its own neighborhood.
|
||||
Also, the samples in X are not considered in the neighborhood of any
|
||||
point.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
shifted_opposite_lof_scores : ndarray of shape (n_samples,)
|
||||
The shifted opposite of the Local Outlier Factor of each input
|
||||
samples. The lower, the more abnormal. Negative scores represent
|
||||
outliers, positive scores represent inliers.
|
||||
"""
|
||||
if not self.novelty:
|
||||
msg = ('decision_function is not available when novelty=False. '
|
||||
'Use novelty=True if you want to use LOF for novelty '
|
||||
'detection and compute decision_function for new unseen '
|
||||
'data. Note that the opposite LOF of the training samples '
|
||||
'is always available by considering the '
|
||||
'negative_outlier_factor_ attribute.')
|
||||
raise AttributeError(msg)
|
||||
|
||||
return self._decision_function
|
||||
|
||||
def _decision_function(self, X):
|
||||
"""Shifted opposite of the Local Outlier Factor of X.
|
||||
|
||||
Bigger is better, i.e. large values correspond to inliers.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
The shift offset allows a zero threshold for being an outlier.
|
||||
The argument X is supposed to contain *new data*: if X contains a
|
||||
point from training, it considers the latter in its own neighborhood.
|
||||
Also, the samples in X are not considered in the neighborhood of any
|
||||
point.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
shifted_opposite_lof_scores : ndarray of shape (n_samples,)
|
||||
The shifted opposite of the Local Outlier Factor of each input
|
||||
samples. The lower, the more abnormal. Negative scores represent
|
||||
outliers, positive scores represent inliers.
|
||||
"""
|
||||
|
||||
return self._score_samples(X) - self.offset_
|
||||
|
||||
@property
|
||||
def score_samples(self):
|
||||
"""Opposite of the Local Outlier Factor of X.
|
||||
|
||||
It is the opposite as bigger is better, i.e. large values correspond
|
||||
to inliers.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
The argument X is supposed to contain *new data*: if X contains a
|
||||
point from training, it considers the latter in its own neighborhood.
|
||||
Also, the samples in X are not considered in the neighborhood of any
|
||||
point.
|
||||
The score_samples on training data is available by considering the
|
||||
``negative_outlier_factor_`` attribute.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
opposite_lof_scores : ndarray of shape (n_samples,)
|
||||
The opposite of the Local Outlier Factor of each input samples.
|
||||
The lower, the more abnormal.
|
||||
"""
|
||||
if not self.novelty:
|
||||
msg = ('score_samples is not available when novelty=False. The '
|
||||
'scores of the training samples are always available '
|
||||
'through the negative_outlier_factor_ attribute. Use '
|
||||
'novelty=True if you want to use LOF for novelty detection '
|
||||
'and compute score_samples for new unseen data.')
|
||||
raise AttributeError(msg)
|
||||
|
||||
return self._score_samples
|
||||
|
||||
def _score_samples(self, X):
|
||||
"""Opposite of the Local Outlier Factor of X.
|
||||
|
||||
It is the opposite as bigger is better, i.e. large values correspond
|
||||
to inliers.
|
||||
|
||||
**Only available for novelty detection (when novelty is set to True).**
|
||||
The argument X is supposed to contain *new data*: if X contains a
|
||||
point from training, it considers the latter in its own neighborhood.
|
||||
Also, the samples in X are not considered in the neighborhood of any
|
||||
point.
|
||||
The score_samples on training data is available by considering the
|
||||
``negative_outlier_factor_`` attribute.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The query sample or samples to compute the Local Outlier Factor
|
||||
w.r.t. the training samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
opposite_lof_scores : ndarray of shape (n_samples,)
|
||||
The opposite of the Local Outlier Factor of each input samples.
|
||||
The lower, the more abnormal.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
|
||||
distances_X, neighbors_indices_X = (
|
||||
self.kneighbors(X, n_neighbors=self.n_neighbors_))
|
||||
X_lrd = self._local_reachability_density(distances_X,
|
||||
neighbors_indices_X)
|
||||
|
||||
lrd_ratios_array = (self._lrd[neighbors_indices_X] /
|
||||
X_lrd[:, np.newaxis])
|
||||
|
||||
# as bigger is better:
|
||||
return -np.mean(lrd_ratios_array, axis=1)
|
||||
|
||||
def _local_reachability_density(self, distances_X, neighbors_indices):
|
||||
"""The local reachability density (LRD)
|
||||
|
||||
The LRD of a sample is the inverse of the average reachability
|
||||
distance of its k-nearest neighbors.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
distances_X : ndarray of shape (n_queries, self.n_neighbors)
|
||||
Distances to the neighbors (in the training samples `self._fit_X`)
|
||||
of each query point to compute the LRD.
|
||||
|
||||
neighbors_indices : ndarray of shape (n_queries, self.n_neighbors)
|
||||
Neighbors indices (of each query point) among training samples
|
||||
self._fit_X.
|
||||
|
||||
Returns
|
||||
-------
|
||||
local_reachability_density : ndarray of shape (n_queries,)
|
||||
The local reachability density of each sample.
|
||||
"""
|
||||
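# Reachability distance of a query point p w.r.t. a neighbor o is
# max(k-distance(o), d(p, o)); the LRD computed below is the inverse of its
# mean over the k nearest neighbors.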
dist_k = self._distances_fit_X_[neighbors_indices,
|
||||
self.n_neighbors_ - 1]
|
||||
reach_dist_array = np.maximum(distances_X, dist_k)
|
||||
|
||||
# 1e-10 to avoid `nan' when nb of duplicates > n_neighbors_:
|
||||
return 1. / (np.mean(reach_dist_array, axis=1) + 1e-10)
|
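# --- Illustrative usage (not part of the library source) ---
# A minimal sketch of the two workflows supported by the class above, assuming
# the public ``sklearn.neighbors.LocalOutlierFactor`` entry point; the data
# values are arbitrary (the first array mirrors the docstring example).
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# Outlier detection (default, novelty=False): fit_predict labels the training set.
X_train = np.array([[-1.1], [0.2], [101.1], [0.3]])
lof = LocalOutlierFactor(n_neighbors=2)
print(lof.fit_predict(X_train))         # [ 1  1 -1  1]: the point at 101.1 is flagged
print(lof.negative_outlier_factor_)     # close to -1 for inliers, much lower for the outlier

# Novelty detection (novelty=True): fit on clean data, then score unseen samples.
X_clean = np.array([[0.1], [0.2], [0.3], [0.4]])
novelty_lof = LocalOutlierFactor(n_neighbors=2, novelty=True).fit(X_clean)
X_new = np.array([[0.25], [50.0]])
print(novelty_lof.predict(X_new))             # 1 for inliers, -1 for outliers
print(novelty_lof.decision_function(X_new))   # negative values flag outliers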
527
venv/Lib/site-packages/sklearn/neighbors/_nca.py
Normal file
|
@@ -0,0 +1,527 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Neighborhood Component Analysis
|
||||
"""
|
||||
|
||||
# Authors: William de Vazelhes <wdevazelhes@gmail.com>
|
||||
# John Chiotellis <ioannis.chiotellis@in.tum.de>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from __future__ import print_function
|
||||
|
||||
from warnings import warn
|
||||
import numpy as np
|
||||
import sys
|
||||
import time
|
||||
import numbers
|
||||
from scipy.optimize import minimize
|
||||
from ..utils.extmath import softmax
|
||||
from ..metrics import pairwise_distances
|
||||
from ..base import BaseEstimator, TransformerMixin
|
||||
from ..preprocessing import LabelEncoder
|
||||
from ..decomposition import PCA
|
||||
from ..utils.multiclass import check_classification_targets
|
||||
from ..utils.random import check_random_state
|
||||
from ..utils.validation import check_is_fitted, check_array, check_scalar
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
from ..exceptions import ConvergenceWarning
|
||||
|
||||
|
||||
class NeighborhoodComponentsAnalysis(TransformerMixin, BaseEstimator):
|
||||
"""Neighborhood Components Analysis
|
||||
|
||||
Neighborhood Component Analysis (NCA) is a machine learning algorithm for
|
||||
metric learning. It learns a linear transformation in a supervised fashion
|
||||
to improve the classification accuracy of a stochastic nearest neighbors
|
||||
rule in the transformed space.
|
||||
|
||||
Read more in the :ref:`User Guide <nca>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_components : int, default=None
|
||||
Preferred dimensionality of the projected space.
|
||||
If None it will be set to ``n_features``.
|
||||
|
||||
init : {'auto', 'pca', 'lda', 'identity', 'random'} or ndarray of shape \
|
||||
(n_features_a, n_features_b), default='auto'
|
||||
Initialization of the linear transformation. Possible options are
|
||||
'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape
|
||||
(n_features_a, n_features_b).
|
||||
|
||||
'auto'
|
||||
Depending on ``n_components``, the most reasonable initialization
|
||||
will be chosen. If ``n_components <= n_classes`` we use 'lda', as
|
||||
it uses labels information. If not, but
|
||||
``n_components < min(n_features, n_samples)``, we use 'pca', as
|
||||
it projects data in meaningful directions (those of higher
|
||||
variance). Otherwise, we just use 'identity'.
|
||||
|
||||
'pca'
|
||||
``n_components`` principal components of the inputs passed
|
||||
to :meth:`fit` will be used to initialize the transformation.
|
||||
(See :class:`~sklearn.decomposition.PCA`)
|
||||
|
||||
'lda'
|
||||
``min(n_components, n_classes)`` most discriminative
|
||||
components of the inputs passed to :meth:`fit` will be used to
|
||||
initialize the transformation. (If ``n_components > n_classes``,
|
||||
the rest of the components will be zero.) (See
|
||||
:class:`~sklearn.discriminant_analysis.LinearDiscriminantAnalysis`)
|
||||
|
||||
'identity'
|
||||
If ``n_components`` is strictly smaller than the
|
||||
dimensionality of the inputs passed to :meth:`fit`, the identity
|
||||
matrix will be truncated to the first ``n_components`` rows.
|
||||
|
||||
'random'
|
||||
The initial transformation will be a random array of shape
|
||||
`(n_components, n_features)`. Each value is sampled from the
|
||||
standard normal distribution.
|
||||
|
||||
numpy array
|
||||
n_features_b must match the dimensionality of the inputs passed to
|
||||
:meth:`fit` and n_features_a must be less than or equal to that.
|
||||
If ``n_components`` is not None, n_features_a must match it.
|
||||
|
||||
warm_start : bool, default=False
|
||||
If True and :meth:`fit` has been called before, the solution of the
|
||||
previous call to :meth:`fit` is used as the initial linear
|
||||
transformation (``n_components`` and ``init`` will be ignored).
|
||||
|
||||
max_iter : int, default=50
|
||||
Maximum number of iterations in the optimization.
|
||||
|
||||
tol : float, default=1e-5
|
||||
Convergence tolerance for the optimization.
|
||||
|
||||
callback : callable, default=None
|
||||
If not None, this function is called after every iteration of the
|
||||
optimizer, taking as arguments the current solution (flattened
|
||||
transformation matrix) and the number of iterations. This might be
|
||||
useful in case one wants to examine or store the transformation
|
||||
found after each iteration.
|
||||
|
||||
verbose : int, default=0
|
||||
If 0, no progress messages will be printed.
|
||||
If 1, progress messages will be printed to stdout.
|
||||
If > 1, progress messages will be printed and the ``disp``
|
||||
parameter of :func:`scipy.optimize.minimize` will be set to
|
||||
``verbose - 2``.
|
||||
|
||||
random_state : int or numpy.RandomState, default=None
|
||||
A pseudo random number generator object or a seed for it if int. If
|
||||
``init='random'``, ``random_state`` is used to initialize the random
|
||||
transformation. If ``init='pca'``, ``random_state`` is passed as an
|
||||
argument to PCA when initializing the transformation. Pass an int
|
||||
for reproducible results across multiple function calls.
|
||||
See :term:`Glossary <random_state>`.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
components_ : ndarray of shape (n_components, n_features)
|
||||
The linear transformation learned during fitting.
|
||||
|
||||
n_iter_ : int
|
||||
Counts the number of iterations performed by the optimizer.
|
||||
|
||||
random_state_ : numpy.RandomState
|
||||
Pseudo random number generator object used during initialization.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.neighbors import NeighborhoodComponentsAnalysis
|
||||
>>> from sklearn.neighbors import KNeighborsClassifier
|
||||
>>> from sklearn.datasets import load_iris
|
||||
>>> from sklearn.model_selection import train_test_split
|
||||
>>> X, y = load_iris(return_X_y=True)
|
||||
>>> X_train, X_test, y_train, y_test = train_test_split(X, y,
|
||||
... stratify=y, test_size=0.7, random_state=42)
|
||||
>>> nca = NeighborhoodComponentsAnalysis(random_state=42)
|
||||
>>> nca.fit(X_train, y_train)
|
||||
NeighborhoodComponentsAnalysis(...)
|
||||
>>> knn = KNeighborsClassifier(n_neighbors=3)
|
||||
>>> knn.fit(X_train, y_train)
|
||||
KNeighborsClassifier(...)
|
||||
>>> print(knn.score(X_test, y_test))
|
||||
0.933333...
|
||||
>>> knn.fit(nca.transform(X_train), y_train)
|
||||
KNeighborsClassifier(...)
|
||||
>>> print(knn.score(nca.transform(X_test), y_test))
|
||||
0.961904...
|
||||
|
||||
References
|
||||
----------
|
||||
.. [1] J. Goldberger, G. Hinton, S. Roweis, R. Salakhutdinov.
|
||||
"Neighbourhood Components Analysis". Advances in Neural Information
|
||||
Processing Systems. 17, 513-520, 2005.
|
||||
http://www.cs.nyu.edu/~roweis/papers/ncanips.pdf
|
||||
|
||||
.. [2] Wikipedia entry on Neighborhood Components Analysis
|
||||
https://en.wikipedia.org/wiki/Neighbourhood_components_analysis
|
||||
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_components=None, *, init='auto', warm_start=False,
|
||||
max_iter=50, tol=1e-5, callback=None, verbose=0,
|
||||
random_state=None):
|
||||
self.n_components = n_components
|
||||
self.init = init
|
||||
self.warm_start = warm_start
|
||||
self.max_iter = max_iter
|
||||
self.tol = tol
|
||||
self.callback = callback
|
||||
self.verbose = verbose
|
||||
self.random_state = random_state
|
||||
|
||||
def fit(self, X, y):
|
||||
"""Fit the model according to the given training data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The training samples.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
The corresponding training labels.
|
||||
|
||||
Returns
|
||||
-------
|
||||
self : object
|
||||
returns a trained NeighborhoodComponentsAnalysis model.
|
||||
"""
|
||||
|
||||
# Verify inputs X and y and NCA parameters, and transform a copy if
|
||||
# needed
|
||||
X, y, init = self._validate_params(X, y)
|
||||
|
||||
# Initialize the random generator
|
||||
self.random_state_ = check_random_state(self.random_state)
|
||||
|
||||
# Measure the total training time
|
||||
t_train = time.time()
|
||||
|
||||
# Compute a mask that stays fixed during optimization:
|
||||
same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
|
||||
# (n_samples, n_samples)
|
||||
|
||||
# Initialize the transformation
|
||||
transformation = self._initialize(X, y, init)
|
||||
|
||||
# Create a dictionary of parameters to be passed to the optimizer
|
||||
disp = self.verbose - 2 if self.verbose > 1 else -1
|
||||
optimizer_params = {'method': 'L-BFGS-B',
|
||||
'fun': self._loss_grad_lbfgs,
|
||||
'args': (X, same_class_mask, -1.0),
|
||||
'jac': True,
|
||||
'x0': transformation,
|
||||
'tol': self.tol,
|
||||
'options': dict(maxiter=self.max_iter, disp=disp),
|
||||
'callback': self._callback
|
||||
}
|
||||
|
||||
# Call the optimizer
|
||||
self.n_iter_ = 0
|
||||
opt_result = minimize(**optimizer_params)
|
||||
|
||||
# Reshape the solution found by the optimizer
|
||||
self.components_ = opt_result.x.reshape(-1, X.shape[1])
|
||||
|
||||
# Stop timer
|
||||
t_train = time.time() - t_train
|
||||
if self.verbose:
|
||||
cls_name = self.__class__.__name__
|
||||
|
||||
# Warn the user if the algorithm did not converge
|
||||
if not opt_result.success:
|
||||
warn('[{}] NCA did not converge: {}'.format(
|
||||
cls_name, opt_result.message),
|
||||
ConvergenceWarning)
|
||||
|
||||
print('[{}] Training took {:8.2f}s.'.format(cls_name, t_train))
|
||||
|
||||
return self
|
||||
|
||||
def transform(self, X):
|
||||
"""Applies the learned transformation to the given data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
Data samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X_embedded: ndarray of shape (n_samples, n_components)
|
||||
The data samples transformed.
|
||||
|
||||
Raises
|
||||
------
|
||||
NotFittedError
|
||||
If :meth:`fit` has not been called before.
|
||||
"""
|
||||
|
||||
check_is_fitted(self)
|
||||
X = check_array(X)
|
||||
|
||||
return np.dot(X, self.components_.T)
|
||||
|
||||
def _validate_params(self, X, y):
|
||||
"""Validate parameters as soon as :meth:`fit` is called.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The training samples.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
The corresponding training labels.
|
||||
|
||||
Returns
|
||||
-------
|
||||
X : ndarray of shape (n_samples, n_features)
|
||||
The validated training samples.
|
||||
|
||||
y : ndarray of shape (n_samples,)
|
||||
The validated training labels, encoded to be integers in
|
||||
the range(0, n_classes).
|
||||
|
||||
init : str or ndarray of shape (n_features_a, n_features_b)
|
||||
The validated initialization of the linear transformation.
|
||||
|
||||
Raises
|
||||
-------
|
||||
TypeError
|
||||
If a parameter is not an instance of the desired type.
|
||||
|
||||
ValueError
|
||||
If a parameter's value violates its legal value range or if the
|
||||
combination of two or more given parameters is incompatible.
|
||||
"""
|
||||
|
||||
# Validate the inputs X and y, and convert y to numerical classes.
|
||||
X, y = self._validate_data(X, y, ensure_min_samples=2)
|
||||
check_classification_targets(y)
|
||||
y = LabelEncoder().fit_transform(y)
|
||||
|
||||
# Check the preferred dimensionality of the projected space
|
||||
if self.n_components is not None:
|
||||
check_scalar(
|
||||
self.n_components, 'n_components', numbers.Integral, min_val=1)
|
||||
|
||||
if self.n_components > X.shape[1]:
|
||||
raise ValueError('The preferred dimensionality of the '
|
||||
'projected space `n_components` ({}) cannot '
|
||||
'be greater than the given data '
|
||||
'dimensionality ({})!'
|
||||
.format(self.n_components, X.shape[1]))
|
||||
|
||||
# If warm_start is enabled, check that the inputs are consistent
|
||||
check_scalar(self.warm_start, 'warm_start', bool)
|
||||
if self.warm_start and hasattr(self, 'components_'):
|
||||
if self.components_.shape[1] != X.shape[1]:
|
||||
raise ValueError('The new inputs dimensionality ({}) does not '
|
||||
'match the input dimensionality of the '
|
||||
'previously learned transformation ({}).'
|
||||
.format(X.shape[1],
|
||||
self.components_.shape[1]))
|
||||
|
||||
check_scalar(self.max_iter, 'max_iter', numbers.Integral, min_val=1)
|
||||
check_scalar(self.tol, 'tol', numbers.Real, min_val=0.)
|
||||
check_scalar(self.verbose, 'verbose', numbers.Integral, min_val=0)
|
||||
|
||||
if self.callback is not None:
|
||||
if not callable(self.callback):
|
||||
raise ValueError('`callback` is not callable.')
|
||||
|
||||
# Check how the linear transformation should be initialized
|
||||
init = self.init
|
||||
|
||||
if isinstance(init, np.ndarray):
|
||||
init = check_array(init)
|
||||
|
||||
# Assert that init.shape[1] = X.shape[1]
|
||||
if init.shape[1] != X.shape[1]:
|
||||
raise ValueError(
|
||||
'The input dimensionality ({}) of the given '
|
||||
'linear transformation `init` must match the '
|
||||
'dimensionality of the given inputs `X` ({}).'
|
||||
.format(init.shape[1], X.shape[1]))
|
||||
|
||||
# Assert that init.shape[0] <= init.shape[1]
|
||||
if init.shape[0] > init.shape[1]:
|
||||
raise ValueError(
|
||||
'The output dimensionality ({}) of the given '
|
||||
'linear transformation `init` cannot be '
|
||||
'greater than its input dimensionality ({}).'
|
||||
.format(init.shape[0], init.shape[1]))
|
||||
|
||||
if self.n_components is not None:
|
||||
# Assert that self.n_components = init.shape[0]
|
||||
if self.n_components != init.shape[0]:
|
||||
raise ValueError('The preferred dimensionality of the '
|
||||
'projected space `n_components` ({}) does'
|
||||
' not match the output dimensionality of '
|
||||
'the given linear transformation '
|
||||
'`init` ({})!'
|
||||
.format(self.n_components,
|
||||
init.shape[0]))
|
||||
elif init in ['auto', 'pca', 'lda', 'identity', 'random']:
|
||||
pass
|
||||
else:
|
||||
raise ValueError(
|
||||
"`init` must be 'auto', 'pca', 'lda', 'identity', 'random' "
|
||||
"or a numpy array of shape (n_components, n_features).")
|
||||
|
||||
return X, y, init
|
||||
|
||||
def _initialize(self, X, y, init):
|
||||
"""Initialize the transformation.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
The training samples.
|
||||
|
||||
y : array-like of shape (n_samples,)
|
||||
The training labels.
|
||||
|
||||
init : str or ndarray of shape (n_features_a, n_features_b)
|
||||
The validated initialization of the linear transformation.
|
||||
|
||||
Returns
|
||||
-------
|
||||
transformation : ndarray of shape (n_components, n_features)
|
||||
The initialized linear transformation.
|
||||
|
||||
"""
|
||||
|
||||
transformation = init
|
||||
if self.warm_start and hasattr(self, 'components_'):
|
||||
transformation = self.components_
|
||||
elif isinstance(init, np.ndarray):
|
||||
pass
|
||||
else:
|
||||
n_samples, n_features = X.shape
|
||||
n_components = self.n_components or n_features
|
||||
if init == 'auto':
|
||||
n_classes = len(np.unique(y))
|
||||
if n_components <= min(n_features, n_classes - 1):
|
||||
init = 'lda'
|
||||
elif n_components < min(n_features, n_samples):
|
||||
init = 'pca'
|
||||
else:
|
||||
init = 'identity'
|
||||
if init == 'identity':
|
||||
transformation = np.eye(n_components, X.shape[1])
|
||||
elif init == 'random':
|
||||
transformation = self.random_state_.randn(n_components,
|
||||
X.shape[1])
|
||||
elif init in {'pca', 'lda'}:
|
||||
init_time = time.time()
|
||||
if init == 'pca':
|
||||
pca = PCA(n_components=n_components,
|
||||
random_state=self.random_state_)
|
||||
if self.verbose:
|
||||
print('Finding principal components... ', end='')
|
||||
sys.stdout.flush()
|
||||
pca.fit(X)
|
||||
transformation = pca.components_
|
||||
elif init == 'lda':
|
||||
from ..discriminant_analysis import (
|
||||
LinearDiscriminantAnalysis)
|
||||
lda = LinearDiscriminantAnalysis(n_components=n_components)
|
||||
if self.verbose:
|
||||
print('Finding most discriminative components... ',
|
||||
end='')
|
||||
sys.stdout.flush()
|
||||
lda.fit(X, y)
|
||||
transformation = lda.scalings_.T[:n_components]
|
||||
if self.verbose:
|
||||
print('done in {:5.2f}s'.format(time.time() - init_time))
|
||||
return transformation
|
||||
|
||||
def _callback(self, transformation):
|
||||
"""Called after each iteration of the optimizer.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transformation : ndarray of shape (n_components * n_features,)
|
||||
The solution computed by the optimizer in this iteration.
|
||||
"""
|
||||
if self.callback is not None:
|
||||
self.callback(transformation, self.n_iter_)
|
||||
|
||||
self.n_iter_ += 1
|
||||
|
||||
def _loss_grad_lbfgs(self, transformation, X, same_class_mask, sign=1.0):
|
||||
"""Compute the loss and the loss gradient w.r.t. ``transformation``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
transformation : ndarray of shape (n_components * n_features,)
|
||||
The raveled linear transformation on which to compute loss and
|
||||
evaluate gradient.
|
||||
|
||||
X : ndarray of shape (n_samples, n_features)
|
||||
The training samples.
|
||||
|
||||
same_class_mask : ndarray of shape (n_samples, n_samples)
|
||||
A mask where ``mask[i, j] == 1`` if ``X[i]`` and ``X[j]`` belong
|
||||
to the same class, and ``0`` otherwise.
|
||||
|
||||
Returns
|
||||
-------
|
||||
loss : float
|
||||
The loss computed for the given transformation.
|
||||
|
||||
gradient : ndarray of shape (n_components * n_features,)
|
||||
The new (flattened) gradient of the loss.
|
||||
"""
|
||||
|
||||
if self.n_iter_ == 0:
|
||||
self.n_iter_ += 1
|
||||
if self.verbose:
|
||||
header_fields = ['Iteration', 'Objective Value', 'Time(s)']
|
||||
header_fmt = '{:>10} {:>20} {:>10}'
|
||||
header = header_fmt.format(*header_fields)
|
||||
cls_name = self.__class__.__name__
|
||||
print('[{}]'.format(cls_name))
|
||||
print('[{}] {}\n[{}] {}'.format(cls_name, header,
|
||||
cls_name, '-' * len(header)))
|
||||
|
||||
t_funcall = time.time()
|
||||
|
||||
transformation = transformation.reshape(-1, X.shape[1])
|
||||
X_embedded = np.dot(X, transformation.T) # (n_samples, n_components)
|
||||
|
||||
# Compute softmax distances
|
||||
p_ij = pairwise_distances(X_embedded, squared=True)
|
||||
np.fill_diagonal(p_ij, np.inf)
|
||||
p_ij = softmax(-p_ij) # (n_samples, n_samples)
|
||||
|
||||
# Compute loss
|
||||
masked_p_ij = p_ij * same_class_mask
|
||||
p = np.sum(masked_p_ij, axis=1, keepdims=True) # (n_samples, 1)
|
||||
loss = np.sum(p)
|
||||
|
||||
# Compute gradient of loss w.r.t. `transform`
|
||||
weighted_p_ij = masked_p_ij - p_ij * p
|
||||
weighted_p_ij_sym = weighted_p_ij + weighted_p_ij.T
|
||||
np.fill_diagonal(weighted_p_ij_sym, -weighted_p_ij.sum(axis=0))
|
||||
gradient = 2 * X_embedded.T.dot(weighted_p_ij_sym).dot(X)
|
||||
# time complexity of the gradient: O(n_components x n_samples x (
|
||||
# n_samples + n_features))
|
||||
|
||||
if self.verbose:
|
||||
t_funcall = time.time() - t_funcall
|
||||
values_fmt = '[{}] {:>10} {:>20.6e} {:>10.2f}'
|
||||
print(values_fmt.format(self.__class__.__name__, self.n_iter_,
|
||||
loss, t_funcall))
|
||||
sys.stdout.flush()
|
||||
|
||||
return sign * loss, sign * gradient.ravel()
|
||||
|
||||
def _more_tags(self):
|
||||
return {'requires_y': True}
|
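# --- Illustrative sketch (not part of the library source) ---
# A small NumPy re-derivation of the objective maximized by _loss_grad_lbfgs
# above, written for a toy dataset; the helper name and values are ad hoc.
import numpy as np

def nca_objective(transformation, X, y):
    """Sum over samples of the probability of picking a same-class neighbor."""
    same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
    X_embedded = X @ transformation.T
    diff = X_embedded[:, np.newaxis, :] - X_embedded[np.newaxis, :, :]
    d2 = (diff ** 2).sum(axis=-1)        # squared pairwise distances
    np.fill_diagonal(d2, np.inf)         # a point never picks itself as neighbor
    p_ij = np.exp(-d2 - (-d2).max(axis=1, keepdims=True))
    p_ij /= p_ij.sum(axis=1, keepdims=True)   # row-wise softmax of -d2
    return (p_ij * same_class_mask).sum()

# Well-separated classes give an objective close to n_samples.
X_toy = np.array([[0.0, 0.0], [0.1, 0.0], [3.0, 3.0], [3.1, 3.0]])
y_toy = np.array([0, 0, 1, 1])
print(nca_objective(np.eye(2), X_toy, y_toy))  # approximately 4.0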
203
venv/Lib/site-packages/sklearn/neighbors/_nearest_centroid.py
Normal file
|
@@ -0,0 +1,203 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Nearest Centroid Classification
|
||||
"""
|
||||
|
||||
# Author: Robert Layton <robertlayton@gmail.com>
|
||||
# Olivier Grisel <olivier.grisel@ensta.org>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
import warnings
|
||||
import numpy as np
|
||||
from scipy import sparse as sp
|
||||
|
||||
from ..base import BaseEstimator, ClassifierMixin
|
||||
from ..metrics.pairwise import pairwise_distances
|
||||
from ..preprocessing import LabelEncoder
|
||||
from ..utils.validation import check_array, check_is_fitted
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
from ..utils.sparsefuncs import csc_median_axis_0
|
||||
from ..utils.multiclass import check_classification_targets
|
||||
|
||||
|
||||
class NearestCentroid(ClassifierMixin, BaseEstimator):
|
||||
"""Nearest centroid classifier.
|
||||
|
||||
Each class is represented by its centroid, with test samples classified to
|
||||
the class with the nearest centroid.
|
||||
|
||||
Read more in the :ref:`User Guide <nearest_centroid_classifier>`.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
metric : str or callable
|
||||
The metric to use when calculating distance between instances in a
|
||||
feature array. If metric is a string or callable, it must be one of
|
||||
the options allowed by metrics.pairwise.pairwise_distances for its
|
||||
metric parameter.
|
||||
The centroid for the samples corresponding to each class is the point
|
||||
from which the sum of the distances (according to the metric) of all
|
||||
samples that belong to that particular class is minimized.
|
||||
If the "manhattan" metric is provided, this centroid is the median and
|
||||
for all other metrics, the centroid is set to be the mean.
|
||||
|
||||
.. versionchanged:: 0.19
|
||||
``metric='precomputed'`` was deprecated and now raises an error
|
||||
|
||||
shrink_threshold : float, default=None
|
||||
Threshold for shrinking centroids to remove features.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
centroids_ : array-like of shape (n_classes, n_features)
|
||||
Centroid of each class.
|
||||
|
||||
classes_ : array of shape (n_classes,)
|
||||
The unique classes labels.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from sklearn.neighbors import NearestCentroid
|
||||
>>> import numpy as np
|
||||
>>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
|
||||
>>> y = np.array([1, 1, 1, 2, 2, 2])
|
||||
>>> clf = NearestCentroid()
|
||||
>>> clf.fit(X, y)
|
||||
NearestCentroid()
|
||||
>>> print(clf.predict([[-0.8, -1]]))
|
||||
[1]
|
||||
|
||||
See also
|
||||
--------
|
||||
sklearn.neighbors.KNeighborsClassifier: nearest neighbors classifier
|
||||
|
||||
Notes
|
||||
-----
|
||||
When used for text classification with tf-idf vectors, this classifier is
|
||||
also known as the Rocchio classifier.
|
||||
|
||||
References
|
||||
----------
|
||||
Tibshirani, R., Hastie, T., Narasimhan, B., & Chu, G. (2002). Diagnosis of
|
||||
multiple cancer types by shrunken centroids of gene expression. Proceedings
|
||||
of the National Academy of Sciences of the United States of America,
|
||||
99(10), 6567-6572. The National Academy of Sciences.
|
||||
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, metric='euclidean', *, shrink_threshold=None):
|
||||
self.metric = metric
|
||||
self.shrink_threshold = shrink_threshold
|
||||
|
||||
def fit(self, X, y):
|
||||
"""
|
||||
Fit the NearestCentroid model according to the given training data.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : {array-like, sparse matrix} of shape (n_samples, n_features)
|
||||
Training vector, where n_samples is the number of samples and
|
||||
n_features is the number of features.
|
||||
Note that centroid shrinking cannot be used with sparse matrices.
|
||||
y : array-like of shape (n_samples,)
|
||||
Target values (integers)
|
||||
"""
|
||||
if self.metric == 'precomputed':
|
||||
raise ValueError("Precomputed is not supported.")
|
||||
# If X is sparse and the metric is "manhattan", store it in a csc
|
||||
# format, which makes it easier to calculate the median.
|
||||
if self.metric == 'manhattan':
|
||||
X, y = self._validate_data(X, y, accept_sparse=['csc'])
|
||||
else:
|
||||
X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc'])
|
||||
is_X_sparse = sp.issparse(X)
|
||||
if is_X_sparse and self.shrink_threshold:
|
||||
raise ValueError("threshold shrinking not supported"
|
||||
" for sparse input")
|
||||
check_classification_targets(y)
|
||||
|
||||
n_samples, n_features = X.shape
|
||||
le = LabelEncoder()
|
||||
y_ind = le.fit_transform(y)
|
||||
self.classes_ = classes = le.classes_
|
||||
n_classes = classes.size
|
||||
if n_classes < 2:
|
||||
raise ValueError('The number of classes has to be greater than'
|
||||
' one; got %d class' % (n_classes))
|
||||
|
||||
# Compute the centroid of each class; center_mask below selects its members.
|
||||
self.centroids_ = np.empty((n_classes, n_features), dtype=np.float64)
|
||||
# Number of samples in each class.
|
||||
nk = np.zeros(n_classes)
|
||||
|
||||
for cur_class in range(n_classes):
|
||||
center_mask = y_ind == cur_class
|
||||
nk[cur_class] = np.sum(center_mask)
|
||||
if is_X_sparse:
|
||||
center_mask = np.where(center_mask)[0]
|
||||
|
||||
# XXX: Update other averaging methods according to the metrics.
|
||||
if self.metric == "manhattan":
|
||||
# NumPy does not calculate median of sparse matrices.
|
||||
if not is_X_sparse:
|
||||
self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
|
||||
else:
|
||||
self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
|
||||
else:
|
||||
if self.metric != 'euclidean':
|
||||
warnings.warn("Averaging for metrics other than "
|
||||
"euclidean and manhattan not supported. "
|
||||
"The average is set to be the mean."
|
||||
)
|
||||
self.centroids_[cur_class] = X[center_mask].mean(axis=0)
|
||||
|
||||
if self.shrink_threshold:
|
||||
dataset_centroid_ = np.mean(X, axis=0)
|
||||
|
||||
# m parameter for determining deviation
|
||||
m = np.sqrt((1. / nk) - (1. / n_samples))
|
||||
# Calculate deviation using the standard deviation of centroids.
|
||||
variance = (X - self.centroids_[y_ind]) ** 2
|
||||
variance = variance.sum(axis=0)
|
||||
s = np.sqrt(variance / (n_samples - n_classes))
|
||||
s += np.median(s) # To deter outliers from affecting the results.
|
||||
mm = m.reshape(len(m), 1) # Reshape to allow broadcasting.
|
||||
ms = mm * s
|
||||
deviation = ((self.centroids_ - dataset_centroid_) / ms)
|
||||
# Soft thresholding: if the deviation crosses 0 during shrinking,
|
||||
# it becomes zero.
|
||||
signs = np.sign(deviation)
|
||||
deviation = (np.abs(deviation) - self.shrink_threshold)
|
||||
np.clip(deviation, 0, None, out=deviation)
|
||||
deviation *= signs
|
||||
# Now adjust the centroids using the deviation
|
||||
msd = ms * deviation
|
||||
self.centroids_ = dataset_centroid_[np.newaxis, :] + msd
|
||||
return self
|
||||
|
||||
def predict(self, X):
|
||||
"""Perform classification on an array of test vectors X.
|
||||
|
||||
The predicted class C for each sample in X is returned.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_samples, n_features)
|
||||
|
||||
Returns
|
||||
-------
|
||||
C : ndarray of shape (n_samples,)
|
||||
|
||||
Notes
|
||||
-----
|
||||
If the metric constructor parameter is "precomputed", X is assumed to
|
||||
be the distance matrix between the data to be predicted and
|
||||
``self.centroids_``.
|
||||
"""
|
||||
check_is_fitted(self)
|
||||
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
return self.classes_[pairwise_distances(
|
||||
X, self.centroids_, metric=self.metric).argmin(axis=1)]
|
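# --- Illustrative usage (not part of the library source) ---
# A minimal sketch of the shrunken-centroid behaviour implemented in fit above,
# assuming the public ``sklearn.neighbors.NearestCentroid`` entry point; the
# threshold value is arbitrary.
import numpy as np
from sklearn.neighbors import NearestCentroid

X = np.array([[-2.0, 0.0], [-1.0, 0.1], [1.0, -0.1], [2.0, 0.0]])
y = np.array([0, 0, 1, 1])

plain = NearestCentroid().fit(X, y)
shrunk = NearestCentroid(shrink_threshold=0.5).fit(X, y)
print(plain.centroids_)     # per-class means of both features
print(shrunk.centroids_)    # the uninformative second feature is pulled to the overall mean
print(shrunk.predict([[0.8, 0.0]]))   # classification driven by the informative first feature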
Binary file not shown.
101
venv/Lib/site-packages/sklearn/neighbors/_quad_tree.pxd
Normal file
|
@@ -0,0 +1,101 @@
|
|||
# cython: boundscheck=False
|
||||
# cython: wraparound=False
|
||||
# cython: cdivision=True
|
||||
#
|
||||
# Author: Thomas Moreau <thomas.moreau.2010@gmail.com>
|
||||
# Author: Olivier Grisel <olivier.grisel@ensta.fr>
|
||||
|
||||
# See quad_tree.pyx for details.
|
||||
|
||||
import numpy as np
|
||||
cimport numpy as np
|
||||
|
||||
ctypedef np.npy_float32 DTYPE_t # Type of X
|
||||
ctypedef np.npy_intp SIZE_t # Type for indices and counters
|
||||
ctypedef np.npy_int32 INT32_t # Signed 32 bit integer
|
||||
ctypedef np.npy_uint32 UINT32_t # Unsigned 32 bit integer
|
||||
|
||||
# This is effectively an ifdef statement in Cython
|
||||
# It allows us to write printf debugging lines
|
||||
# and remove them at compile time
|
||||
cdef enum:
|
||||
DEBUGFLAG = 0
|
||||
|
||||
cdef float EPSILON = 1e-6
|
||||
|
||||
# XXX: Careful to not change the order of the arguments. It is important to
|
||||
# have is_leaf and max_width consecutive, as this avoids padding by
|
||||
# the compiler and keep the size coherent for both C and numpy data structures.
|
||||
cdef struct Cell:
|
||||
# Base storage structure for cells in a QuadTree object
|
||||
|
||||
# Tree structure
|
||||
SIZE_t parent # Parent cell of this cell
|
||||
SIZE_t[8] children # Array pointing to the children of this cell
|
||||
|
||||
# Cell description
|
||||
SIZE_t cell_id # Id of the cell in the cells array in the Tree
|
||||
SIZE_t point_index # Index of the point at this cell (only defined
|
||||
# in non empty leaf)
|
||||
bint is_leaf # Does this cell have children?
|
||||
DTYPE_t squared_max_width # Squared value of the maximum width w
|
||||
SIZE_t depth # Depth of the cell in the tree
|
||||
SIZE_t cumulative_size # Number of points included in the subtree with
|
||||
# this cell as a root.
|
||||
|
||||
# Internal constants
|
||||
DTYPE_t[3] center # Store the center for quick split of cells
|
||||
DTYPE_t[3] barycenter # Keep track of the center of mass of the cell
|
||||
|
||||
# Cell boundaries
|
||||
DTYPE_t[3] min_bounds # Inferior boundaries of this cell (inclusive)
|
||||
DTYPE_t[3] max_bounds # Superior boundaries of this cell (exclusive)
|
||||
|
||||
|
||||
cdef class _QuadTree:
|
||||
# The QuadTree object is a quad tree structure constructed by recursively
|
||||
# inserting points in the tree and splitting cells in 4 so that each
|
||||
# leaf cell contains at most one point.
|
||||
# This structure also handles 3D data, inserted in trees with 8 children
|
||||
# for each node.
|
||||
|
||||
# Parameters of the tree
|
||||
cdef public int n_dimensions # Number of dimensions in X
|
||||
cdef public int verbose # Verbosity of the output
|
||||
cdef SIZE_t n_cells_per_cell # Number of children per node. (2 ** n_dimension)
|
||||
|
||||
# Tree inner structure
|
||||
cdef public SIZE_t max_depth # Max depth of the tree
|
||||
cdef public SIZE_t cell_count # Counter for node IDs
|
||||
cdef public SIZE_t capacity # Capacity of tree, in terms of nodes
|
||||
cdef public SIZE_t n_points # Total number of points
|
||||
cdef Cell* cells # Array of nodes
|
||||
|
||||
# Point insertion methods
|
||||
cdef int insert_point(self, DTYPE_t[3] point, SIZE_t point_index,
|
||||
SIZE_t cell_id=*) nogil except -1
|
||||
cdef SIZE_t _insert_point_in_new_child(self, DTYPE_t[3] point, Cell* cell,
|
||||
SIZE_t point_index, SIZE_t size=*
|
||||
) nogil
|
||||
cdef SIZE_t _select_child(self, DTYPE_t[3] point, Cell* cell) nogil
|
||||
cdef bint _is_duplicate(self, DTYPE_t[3] point1, DTYPE_t[3] point2) nogil
|
||||
|
||||
# Create a summary of the Tree compare to a query point
|
||||
cdef long summarize(self, DTYPE_t[3] point, DTYPE_t* results,
|
||||
float squared_theta=*, int cell_id=*, long idx=*
|
||||
) nogil
|
||||
|
||||
# Internal cell initialization methods
|
||||
cdef void _init_cell(self, Cell* cell, SIZE_t parent, SIZE_t depth) nogil
|
||||
cdef void _init_root(self, DTYPE_t[3] min_bounds, DTYPE_t[3] max_bounds
|
||||
) nogil
|
||||
|
||||
# Private methods
|
||||
cdef int _check_point_in_cell(self, DTYPE_t[3] point, Cell* cell
|
||||
) nogil except -1
|
||||
|
||||
# Private array manipulation to manage the ``cells`` array
|
||||
cdef int _resize(self, SIZE_t capacity) nogil except -1
|
||||
cdef int _resize_c(self, SIZE_t capacity=*) nogil except -1
|
||||
cdef int _get_cell(self, DTYPE_t[3] point, SIZE_t cell_id=*) nogil except -1
|
||||
cdef np.ndarray _get_cell_ndarray(self)
|
371
venv/Lib/site-packages/sklearn/neighbors/_regression.py
Normal file
|
@@ -0,0 +1,371 @@
|
|||
"""Nearest Neighbor Regression"""
|
||||
|
||||
# Authors: Jake Vanderplas <vanderplas@astro.washington.edu>
|
||||
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||||
# Sparseness support by Lars Buitinck
|
||||
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
|
||||
# Empty radius support by Andreas Bjerre-Nielsen
|
||||
#
|
||||
# License: BSD 3 clause (C) INRIA, University of Amsterdam,
|
||||
# University of Copenhagen
|
||||
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ._base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin
|
||||
from ._base import RadiusNeighborsMixin, SupervisedFloatMixin
|
||||
from ..base import RegressorMixin
|
||||
from ..utils import check_array
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
|
||||
SupervisedFloatMixin,
|
||||
RegressorMixin):
|
||||
"""Regression based on k-nearest neighbors.
|
||||
|
||||
The target is predicted by local interpolation of the targets
|
||||
associated with the nearest neighbors in the training set.
|
||||
|
||||
Read more in the :ref:`User Guide <regression>`.
|
||||
|
||||
.. versionadded:: 0.9
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
|
||||
weights : {'uniform', 'distance'} or callable, default='uniform'
|
||||
weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
In this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Uniform weights are used by default.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : int, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
the distance metric to use for the tree. The default metric is
|
||||
minkowski, and with p=2 is equivalent to the standard Euclidean
|
||||
metric. See the documentation of :class:`DistanceMetric` for a
|
||||
list of available metrics.
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`,
|
||||
in which case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
Doesn't affect :meth:`fit` method.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str or callable
|
||||
The distance metric to use. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import KNeighborsRegressor
|
||||
>>> neigh = KNeighborsRegressor(n_neighbors=2)
|
||||
>>> neigh.fit(X, y)
|
||||
KNeighborsRegressor(...)
|
||||
>>> print(neigh.predict([[1.5]]))
|
||||
[0.5]
|
||||
|
||||
See also
|
||||
--------
|
||||
NearestNeighbors
|
||||
RadiusNeighborsRegressor
|
||||
KNeighborsClassifier
|
||||
RadiusNeighborsClassifier
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
.. warning::
|
||||
|
||||
Regarding the Nearest Neighbors algorithms, if it is found that two
|
||||
neighbors, neighbor `k+1` and `k`, have identical distances but
|
||||
different labels, the results will depend on the ordering of the
|
||||
training data.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, n_neighbors=5, *, weights='uniform',
|
||||
algorithm='auto', leaf_size=30,
|
||||
p=2, metric='minkowski', metric_params=None, n_jobs=None,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
n_neighbors=n_neighbors,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs, **kwargs)
|
||||
self.weights = _check_weights(weights)
|
||||
|
||||
@property
|
||||
def _pairwise(self):
|
||||
# For cross-validation routines to split data correctly
|
||||
return self.metric == 'precomputed'
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the target for the provided data
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_queries, n_features), \
|
||||
or (n_queries, n_indexed) if metric == 'precomputed'
|
||||
Test samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : ndarray of shape (n_queries,) or (n_queries, n_outputs), dtype=double
|
||||
Target values.
|
||||
"""
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
|
||||
neigh_dist, neigh_ind = self.kneighbors(X)
|
||||
|
||||
weights = _get_weights(neigh_dist, self.weights)
|
||||
|
||||
_y = self._y
|
||||
if _y.ndim == 1:
|
||||
_y = _y.reshape((-1, 1))
|
||||
|
||||
if weights is None:
|
||||
y_pred = np.mean(_y[neigh_ind], axis=1)
|
||||
else:
|
||||
y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
|
||||
denom = np.sum(weights, axis=1)
|
||||
|
||||
for j in range(_y.shape[1]):
|
||||
num = np.sum(_y[neigh_ind, j] * weights, axis=1)
|
||||
y_pred[:, j] = num / denom
|
||||
|
||||
if self._y.ndim == 1:
|
||||
y_pred = y_pred.ravel()
|
||||
|
||||
return y_pred
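To make the two weighting branches above concrete, here is a minimal illustrative sketch (not part of the diff) contrasting uniform and inverse-distance averaging on a toy 1-D dataset; it assumes a scikit-learn installation providing this estimator.

import numpy as np
from sklearn.neighbors import KNeighborsRegressor

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0.0, 0.0, 1.0, 1.0])

uniform = KNeighborsRegressor(n_neighbors=2, weights='uniform').fit(X, y)
weighted = KNeighborsRegressor(n_neighbors=2, weights='distance').fit(X, y)

# The query 1.8 has neighbors 2.0 (distance 0.2, target 1) and 1.0 (distance 0.8, target 0).
print(uniform.predict([[1.8]]))   # plain mean of [1, 0] -> [0.5]
print(weighted.predict([[1.8]]))  # inverse-distance weights 5 and 1.25 -> [0.8]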
|
||||
|
||||
|
||||
class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin,
|
||||
SupervisedFloatMixin,
|
||||
RegressorMixin):
|
||||
"""Regression based on neighbors within a fixed radius.
|
||||
|
||||
The target is predicted by local interpolation of the targets
|
||||
associated with the nearest neighbors in the training set.
|
||||
|
||||
Read more in the :ref:`User Guide <regression>`.
|
||||
|
||||
.. versionadded:: 0.9
|
||||
|
||||
Parameters
|
||||
----------
|
||||
radius : float, default=1.0
|
||||
Range of parameter space to use by default for :meth:`radius_neighbors`
|
||||
queries.
|
||||
|
||||
weights : {'uniform', 'distance'} or callable, default='uniform'
|
||||
Weight function used in prediction. Possible values:
|
||||
|
||||
- 'uniform' : uniform weights. All points in each neighborhood
|
||||
are weighted equally.
|
||||
- 'distance' : weight points by the inverse of their distance.
|
||||
in this case, closer neighbors of a query point will have a
|
||||
greater influence than neighbors which are further away.
|
||||
- [callable] : a user-defined function which accepts an
|
||||
array of distances, and returns an array of the same shape
|
||||
containing the weights.
|
||||
|
||||
Uniform weights are used by default.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
p : int, default=2
|
||||
Power parameter for the Minkowski metric. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
the distance metric to use for the tree. The default metric is
|
||||
minkowski, and with p=2 is equivalent to the standard Euclidean
|
||||
metric. See the documentation of :class:`DistanceMetric` for a
|
||||
list of available metrics.
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`,
|
||||
in which case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str or callable
|
||||
The distance metric to use. It will be same as the `metric` parameter
|
||||
or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
|
||||
'minkowski' and `p` parameter set to 2.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Additional keyword arguments for the metric function. For most metrics
|
||||
will be same with `metric_params` parameter, but may also contain the
|
||||
`p` parameter value if the `effective_metric_` attribute is set to
|
||||
'minkowski'.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> X = [[0], [1], [2], [3]]
|
||||
>>> y = [0, 0, 1, 1]
|
||||
>>> from sklearn.neighbors import RadiusNeighborsRegressor
|
||||
>>> neigh = RadiusNeighborsRegressor(radius=1.0)
|
||||
>>> neigh.fit(X, y)
|
||||
RadiusNeighborsRegressor(...)
|
||||
>>> print(neigh.predict([[1.5]]))
|
||||
[0.5]
|
||||
|
||||
See also
|
||||
--------
|
||||
NearestNeighbors
|
||||
KNeighborsRegressor
|
||||
KNeighborsClassifier
|
||||
RadiusNeighborsClassifier
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, radius=1.0, *, weights='uniform',
|
||||
algorithm='auto', leaf_size=30,
|
||||
p=2, metric='minkowski', metric_params=None, n_jobs=None,
|
||||
**kwargs):
|
||||
super().__init__(
|
||||
radius=radius,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size,
|
||||
p=p, metric=metric, metric_params=metric_params,
|
||||
n_jobs=n_jobs, **kwargs)
|
||||
self.weights = _check_weights(weights)
|
||||
|
||||
def predict(self, X):
|
||||
"""Predict the target for the provided data
|
||||
|
||||
Parameters
|
||||
----------
|
||||
X : array-like of shape (n_queries, n_features), \
|
||||
or (n_queries, n_indexed) if metric == 'precomputed'
|
||||
Test samples.
|
||||
|
||||
Returns
|
||||
-------
|
||||
y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \
|
||||
dtype=double
|
||||
Target values.
|
||||
"""
|
||||
X = check_array(X, accept_sparse='csr')
|
||||
|
||||
neigh_dist, neigh_ind = self.radius_neighbors(X)
|
||||
|
||||
weights = _get_weights(neigh_dist, self.weights)
|
||||
|
||||
_y = self._y
|
||||
if _y.ndim == 1:
|
||||
_y = _y.reshape((-1, 1))
|
||||
|
||||
empty_obs = np.full_like(_y[0], np.nan)
|
||||
|
||||
if weights is None:
|
||||
y_pred = np.array([np.mean(_y[ind, :], axis=0)
|
||||
if len(ind) else empty_obs
|
||||
for (i, ind) in enumerate(neigh_ind)])
|
||||
|
||||
else:
|
||||
y_pred = np.array([np.average(_y[ind, :], axis=0,
|
||||
weights=weights[i])
|
||||
if len(ind) else empty_obs
|
||||
for (i, ind) in enumerate(neigh_ind)])
|
||||
|
||||
if np.any(np.isnan(y_pred)):
|
||||
empty_warning_msg = ("One or more samples have no neighbors "
|
||||
"within specified radius; predicting NaN.")
|
||||
warnings.warn(empty_warning_msg)
|
||||
|
||||
if self._y.ndim == 1:
|
||||
y_pred = y_pred.ravel()
|
||||
|
||||
return y_pred
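The empty-neighborhood branch above is easiest to see on a toy example; the following sketch (not part of the diff) shows the NaN prediction and the warning for a query with no neighbors inside the radius.

import warnings
import numpy as np
from sklearn.neighbors import RadiusNeighborsRegressor

X = np.array([[0.0], [1.0], [2.0]])
y = np.array([0.0, 0.0, 1.0])

reg = RadiusNeighborsRegressor(radius=0.5).fit(X, y)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pred = reg.predict([[10.0]])  # nothing lies within radius 0.5 of 10.0
print(pred)              # [nan]
print(len(caught) >= 1)  # True: the empty-neighborhood warning fired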
|
Binary file not shown.
18
venv/Lib/site-packages/sklearn/neighbors/_typedefs.pxd
Normal file
@@ -0,0 +1,18 @@
#!python
|
||||
cimport numpy as np
|
||||
|
||||
# Floating point/data type
|
||||
ctypedef np.float64_t DTYPE_t # WARNING: should match DTYPE in typedefs.pyx
|
||||
|
||||
cdef enum:
|
||||
DTYPECODE = np.NPY_FLOAT64
|
||||
ITYPECODE = np.NPY_INTP
|
||||
|
||||
# Index/integer type.
|
||||
# WARNING: ITYPE_t must be a signed integer type or you will have a bad time!
|
||||
ctypedef np.intp_t ITYPE_t # WARNING: should match ITYPE in typedefs.pyx
|
||||
|
||||
# Fused type for certain operations
|
||||
ctypedef fused DITYPE_t:
|
||||
ITYPE_t
|
||||
DTYPE_t
|
118
venv/Lib/site-packages/sklearn/neighbors/_unsupervised.py
Normal file
@@ -0,0 +1,118 @@
"""Unsupervised nearest neighbors learner"""
|
||||
from ._base import NeighborsBase
|
||||
from ._base import KNeighborsMixin
|
||||
from ._base import RadiusNeighborsMixin
|
||||
from ._base import UnsupervisedMixin
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
class NearestNeighbors(KNeighborsMixin, RadiusNeighborsMixin,
|
||||
UnsupervisedMixin, NeighborsBase):
|
||||
"""Unsupervised learner for implementing neighbor searches.
|
||||
|
||||
Read more in the :ref:`User Guide <unsupervised_neighbors>`.
|
||||
|
||||
.. versionadded:: 0.9
|
||||
|
||||
Parameters
|
||||
----------
|
||||
n_neighbors : int, default=5
|
||||
Number of neighbors to use by default for :meth:`kneighbors` queries.
|
||||
|
||||
radius : float, default=1.0
|
||||
Range of parameter space to use by default for :meth:`radius_neighbors`
|
||||
queries.
|
||||
|
||||
algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
|
||||
Algorithm used to compute the nearest neighbors:
|
||||
|
||||
- 'ball_tree' will use :class:`BallTree`
|
||||
- 'kd_tree' will use :class:`KDTree`
|
||||
- 'brute' will use a brute-force search.
|
||||
- 'auto' will attempt to decide the most appropriate algorithm
|
||||
based on the values passed to :meth:`fit` method.
|
||||
|
||||
Note: fitting on sparse input will override the setting of
|
||||
this parameter, using brute force.
|
||||
|
||||
leaf_size : int, default=30
|
||||
Leaf size passed to BallTree or KDTree. This can affect the
|
||||
speed of the construction and query, as well as the memory
|
||||
required to store the tree. The optimal value depends on the
|
||||
nature of the problem.
|
||||
|
||||
metric : str or callable, default='minkowski'
|
||||
the distance metric to use for the tree. The default metric is
|
||||
minkowski, and with p=2 is equivalent to the standard Euclidean
|
||||
metric. See the documentation of :class:`DistanceMetric` for a
|
||||
list of available metrics.
|
||||
If metric is "precomputed", X is assumed to be a distance matrix and
|
||||
must be square during fit. X may be a :term:`sparse graph`,
|
||||
in which case only "nonzero" elements may be considered neighbors.
|
||||
|
||||
p : int, default=2
|
||||
Parameter for the Minkowski metric from
|
||||
sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is
|
||||
equivalent to using manhattan_distance (l1), and euclidean_distance
|
||||
(l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.
|
||||
|
||||
metric_params : dict, default=None
|
||||
Additional keyword arguments for the metric function.
|
||||
|
||||
n_jobs : int, default=None
|
||||
The number of parallel jobs to run for neighbors search.
|
||||
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
||||
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
||||
for more details.
|
||||
|
||||
Attributes
|
||||
----------
|
||||
effective_metric_ : str
|
||||
Metric used to compute distances to neighbors.
|
||||
|
||||
effective_metric_params_ : dict
|
||||
Parameters for the metric used to compute distances to neighbors.
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> import numpy as np
|
||||
>>> from sklearn.neighbors import NearestNeighbors
|
||||
>>> samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]
|
||||
|
||||
>>> neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
|
||||
>>> neigh.fit(samples)
|
||||
NearestNeighbors(...)
|
||||
|
||||
>>> neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)
|
||||
array([[2, 0]]...)
|
||||
|
||||
>>> nbrs = neigh.radius_neighbors([[0, 0, 1.3]], 0.4, return_distance=False)
|
||||
>>> np.asarray(nbrs[0][0])
|
||||
array(2)
|
||||
|
||||
See also
|
||||
--------
|
||||
KNeighborsClassifier
|
||||
RadiusNeighborsClassifier
|
||||
KNeighborsRegressor
|
||||
RadiusNeighborsRegressor
|
||||
BallTree
|
||||
|
||||
Notes
|
||||
-----
|
||||
See :ref:`Nearest Neighbors <neighbors>` in the online documentation
|
||||
for a discussion of the choice of ``algorithm`` and ``leaf_size``.
|
||||
|
||||
https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
|
||||
"""
|
||||
|
||||
@_deprecate_positional_args
|
||||
def __init__(self, *, n_neighbors=5, radius=1.0,
|
||||
algorithm='auto', leaf_size=30, metric='minkowski',
|
||||
p=2, metric_params=None, n_jobs=None):
|
||||
super().__init__(
|
||||
n_neighbors=n_neighbors,
|
||||
radius=radius,
|
||||
algorithm=algorithm,
|
||||
leaf_size=leaf_size, metric=metric, p=p,
|
||||
metric_params=metric_params, n_jobs=n_jobs)
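As a small usage note (not part of the diff), kneighbors on a fitted NearestNeighbors returns one row of distances and one row of indices per query point, which is what the docstring example above is slicing:

import numpy as np
from sklearn.neighbors import NearestNeighbors

samples = np.array([[0., 0., 2.], [1., 0., 0.], [0., 0., 1.]])
nn = NearestNeighbors(n_neighbors=2).fit(samples)
dist, ind = nn.kneighbors([[0., 0., 1.3]])
print(dist.shape, ind.shape)  # (1, 2) (1, 2)
print(ind)                    # [[2 0]] -- closest sample first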
|
18
venv/Lib/site-packages/sklearn/neighbors/ball_tree.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _ball_tree # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.ball_tree'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_ball_tree, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
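The shim above (and the near-identical ones that follow) relies on module-level __getattr__ from PEP 562. Below is a simplified standalone sketch of the idea, not taken from the diff: the real shims emit their deprecation warning once at import time via _raise_dep_warning_if_not_pytest, while __getattr__ only forwards attribute access to the private implementation module.

# hypothetical_shim.py -- illustrative only
import warnings
from sklearn.neighbors import _ball_tree  # the private implementation module

def __getattr__(name):
    # Called by Python >= 3.7 for any attribute not found in this module.
    warnings.warn("import from sklearn.neighbors instead", FutureWarning)
    return getattr(_ball_tree, name)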
|
18
venv/Lib/site-packages/sklearn/neighbors/base.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _base # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.base'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_base, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/classification.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _classification # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.classification'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_classification, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/dist_metrics.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _dist_metrics # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.dist_metrics'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_dist_metrics, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/graph.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _graph # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.graph'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_graph, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/kd_tree.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _kd_tree # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.kd_tree'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_kd_tree, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/kde.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _kde # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.kde'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_kde, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/lof.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _lof # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.lof'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_lof, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/nca.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _nca # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.nca'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_nca, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/nearest_centroid.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _nearest_centroid # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.nearest_centroid'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_nearest_centroid, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/quad_tree.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _quad_tree # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.quad_tree'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_quad_tree, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/regression.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _regression # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.regression'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_regression, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
41
venv/Lib/site-packages/sklearn/neighbors/setup.py
Normal file
@@ -0,0 +1,41 @@
import os
|
||||
|
||||
|
||||
def configuration(parent_package='', top_path=None):
|
||||
import numpy
|
||||
from numpy.distutils.misc_util import Configuration
|
||||
|
||||
config = Configuration('neighbors', parent_package, top_path)
|
||||
libraries = []
|
||||
if os.name == 'posix':
|
||||
libraries.append('m')
|
||||
|
||||
config.add_extension('_ball_tree',
|
||||
sources=['_ball_tree.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_extension('_kd_tree',
|
||||
sources=['_kd_tree.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_extension('_dist_metrics',
|
||||
sources=['_dist_metrics.pyx'],
|
||||
include_dirs=[numpy.get_include(),
|
||||
os.path.join(numpy.get_include(),
|
||||
'numpy')],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_extension('_typedefs',
|
||||
sources=['_typedefs.pyx'],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
config.add_extension("_quad_tree",
|
||||
sources=["_quad_tree.pyx"],
|
||||
include_dirs=[numpy.get_include()],
|
||||
libraries=libraries)
|
||||
|
||||
config.add_subpackage('tests')
|
||||
|
||||
return config
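For context (not part of the diff), a numpy.distutils configuration function like this one is usually consumed by a standalone setup call of the following form:

if __name__ == '__main__':
    from numpy.distutils.core import setup
    setup(**configuration(top_path='').todict())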
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,67 @@
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
from sklearn.neighbors._ball_tree import BallTree
|
||||
from sklearn.neighbors import DistanceMetric
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
rng = np.random.RandomState(10)
|
||||
V_mahalanobis = rng.rand(3, 3)
|
||||
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)
|
||||
|
||||
DIMENSION = 3
|
||||
|
||||
METRICS = {'euclidean': {},
|
||||
'manhattan': {},
|
||||
'minkowski': dict(p=3),
|
||||
'chebyshev': {},
|
||||
'seuclidean': dict(V=rng.random_sample(DIMENSION)),
|
||||
'wminkowski': dict(p=3, w=rng.random_sample(DIMENSION)),
|
||||
'mahalanobis': dict(V=V_mahalanobis)}
|
||||
|
||||
DISCRETE_METRICS = ['hamming',
|
||||
'canberra',
|
||||
'braycurtis']
|
||||
|
||||
BOOLEAN_METRICS = ['matching', 'jaccard', 'dice', 'kulsinski',
|
||||
'rogerstanimoto', 'russellrao', 'sokalmichener',
|
||||
'sokalsneath']
|
||||
|
||||
|
||||
def brute_force_neighbors(X, Y, k, metric, **kwargs):
|
||||
D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
|
||||
ind = np.argsort(D, axis=1)[:, :k]
|
||||
dist = D[np.arange(Y.shape[0])[:, None], ind]
|
||||
return dist, ind
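A quick illustration (not part of the diff) of what the argsort plus fancy indexing in brute_force_neighbors computes, on a tiny hand-written distance matrix:

import numpy as np

D = np.array([[0.9, 0.1, 0.5],
              [0.2, 0.8, 0.3]])
k = 2
ind = np.argsort(D, axis=1)[:, :k]             # [[1 2], [0 2]]  indices of the k smallest entries per row
dist = D[np.arange(D.shape[0])[:, None], ind]  # [[0.1 0.5], [0.2 0.3]]  the matching distances
print(ind)
print(dist)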
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric',
|
||||
itertools.chain(BOOLEAN_METRICS, DISCRETE_METRICS))
|
||||
def test_ball_tree_query_metrics(metric):
|
||||
rng = check_random_state(0)
|
||||
if metric in BOOLEAN_METRICS:
|
||||
X = rng.random_sample((40, 10)).round(0)
|
||||
Y = rng.random_sample((10, 10)).round(0)
|
||||
elif metric in DISCRETE_METRICS:
|
||||
X = (4 * rng.random_sample((40, 10))).round(0)
|
||||
Y = (4 * rng.random_sample((10, 10))).round(0)
|
||||
|
||||
k = 5
|
||||
|
||||
bt = BallTree(X, leaf_size=1, metric=metric)
|
||||
dist1, ind1 = bt.query(Y, k)
|
||||
dist2, ind2 = brute_force_neighbors(X, Y, k, metric)
|
||||
assert_array_almost_equal(dist1, dist2)
|
||||
|
||||
|
||||
def test_query_haversine():
|
||||
rng = check_random_state(0)
|
||||
X = 2 * np.pi * rng.random_sample((40, 2))
|
||||
bt = BallTree(X, leaf_size=1, metric='haversine')
|
||||
dist1, ind1 = bt.query(X, k=5)
|
||||
dist2, ind2 = brute_force_neighbors(X, X, k=5, metric='haversine')
|
||||
|
||||
assert_array_almost_equal(dist1, dist2)
|
||||
assert_array_almost_equal(ind1, ind2)
|
@@ -0,0 +1,203 @@
import itertools
|
||||
import pickle
|
||||
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_almost_equal
|
||||
|
||||
import pytest
|
||||
|
||||
from scipy.spatial.distance import cdist
|
||||
from sklearn.neighbors import DistanceMetric
|
||||
from sklearn.neighbors import BallTree
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_raises_regex
|
||||
from sklearn.utils.fixes import sp_version, parse_version
|
||||
|
||||
|
||||
def dist_func(x1, x2, p):
|
||||
return np.sum((x1 - x2) ** p) ** (1. / p)
|
||||
|
||||
|
||||
rng = check_random_state(0)
|
||||
d = 4
|
||||
n1 = 20
|
||||
n2 = 25
|
||||
X1 = rng.random_sample((n1, d)).astype('float64', copy=False)
|
||||
X2 = rng.random_sample((n2, d)).astype('float64', copy=False)
|
||||
|
||||
# make boolean arrays: ones and zeros
|
||||
X1_bool = X1.round(0)
|
||||
X2_bool = X2.round(0)
|
||||
|
||||
V = rng.random_sample((d, d))
|
||||
VI = np.dot(V, V.T)
|
||||
|
||||
BOOL_METRICS = ['matching', 'jaccard', 'dice',
|
||||
'kulsinski', 'rogerstanimoto', 'russellrao',
|
||||
'sokalmichener', 'sokalsneath']
|
||||
|
||||
METRICS_DEFAULT_PARAMS = {'euclidean': {},
|
||||
'cityblock': {},
|
||||
'minkowski': dict(p=(1, 1.5, 2, 3)),
|
||||
'chebyshev': {},
|
||||
'seuclidean': dict(V=(rng.random_sample(d),)),
|
||||
'wminkowski': dict(p=(1, 1.5, 3),
|
||||
w=(rng.random_sample(d),)),
|
||||
'mahalanobis': dict(VI=(VI,)),
|
||||
'hamming': {},
|
||||
'canberra': {},
|
||||
'braycurtis': {}}
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
|
||||
def test_cdist(metric):
|
||||
argdict = METRICS_DEFAULT_PARAMS[metric]
|
||||
keys = argdict.keys()
|
||||
for vals in itertools.product(*argdict.values()):
|
||||
kwargs = dict(zip(keys, vals))
|
||||
D_true = cdist(X1, X2, metric, **kwargs)
|
||||
check_cdist(metric, kwargs, D_true)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric', BOOL_METRICS)
|
||||
def test_cdist_bool_metric(metric):
|
||||
D_true = cdist(X1_bool, X2_bool, metric)
|
||||
check_cdist_bool(metric, D_true)
|
||||
|
||||
|
||||
def check_cdist(metric, kwargs, D_true):
|
||||
dm = DistanceMetric.get_metric(metric, **kwargs)
|
||||
D12 = dm.pairwise(X1, X2)
|
||||
assert_array_almost_equal(D12, D_true)
|
||||
|
||||
|
||||
def check_cdist_bool(metric, D_true):
|
||||
dm = DistanceMetric.get_metric(metric)
|
||||
D12 = dm.pairwise(X1_bool, X2_bool)
|
||||
assert_array_almost_equal(D12, D_true)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
|
||||
def test_pdist(metric):
|
||||
argdict = METRICS_DEFAULT_PARAMS[metric]
|
||||
keys = argdict.keys()
|
||||
for vals in itertools.product(*argdict.values()):
|
||||
kwargs = dict(zip(keys, vals))
|
||||
D_true = cdist(X1, X1, metric, **kwargs)
|
||||
check_pdist(metric, kwargs, D_true)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric', BOOL_METRICS)
|
||||
def test_pdist_bool_metrics(metric):
|
||||
D_true = cdist(X1_bool, X1_bool, metric)
|
||||
check_pdist_bool(metric, D_true)
|
||||
|
||||
|
||||
def check_pdist(metric, kwargs, D_true):
|
||||
dm = DistanceMetric.get_metric(metric, **kwargs)
|
||||
D12 = dm.pairwise(X1)
|
||||
assert_array_almost_equal(D12, D_true)
|
||||
|
||||
|
||||
def check_pdist_bool(metric, D_true):
|
||||
dm = DistanceMetric.get_metric(metric)
|
||||
D12 = dm.pairwise(X1_bool)
|
||||
# Based on https://github.com/scipy/scipy/pull/7373
|
||||
# When comparing two all-zero vectors, scipy>=1.2.0 jaccard metric
|
||||
# was changed to return 0, instead of nan.
|
||||
if metric == 'jaccard' and sp_version < parse_version('1.2.0'):
|
||||
D_true[np.isnan(D_true)] = 0
|
||||
assert_array_almost_equal(D12, D_true)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric', METRICS_DEFAULT_PARAMS)
|
||||
def test_pickle(metric):
|
||||
argdict = METRICS_DEFAULT_PARAMS[metric]
|
||||
keys = argdict.keys()
|
||||
for vals in itertools.product(*argdict.values()):
|
||||
kwargs = dict(zip(keys, vals))
|
||||
check_pickle(metric, kwargs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('metric', BOOL_METRICS)
|
||||
def test_pickle_bool_metrics(metric):
|
||||
dm = DistanceMetric.get_metric(metric)
|
||||
D1 = dm.pairwise(X1_bool)
|
||||
dm2 = pickle.loads(pickle.dumps(dm))
|
||||
D2 = dm2.pairwise(X1_bool)
|
||||
assert_array_almost_equal(D1, D2)
|
||||
|
||||
|
||||
def check_pickle(metric, kwargs):
|
||||
dm = DistanceMetric.get_metric(metric, **kwargs)
|
||||
D1 = dm.pairwise(X1)
|
||||
dm2 = pickle.loads(pickle.dumps(dm))
|
||||
D2 = dm2.pairwise(X1)
|
||||
assert_array_almost_equal(D1, D2)
|
||||
|
||||
|
||||
def test_haversine_metric():
|
||||
def haversine_slow(x1, x2):
|
||||
return 2 * np.arcsin(np.sqrt(np.sin(0.5 * (x1[0] - x2[0])) ** 2
|
||||
+ np.cos(x1[0]) * np.cos(x2[0]) *
|
||||
np.sin(0.5 * (x1[1] - x2[1])) ** 2))
|
||||
|
||||
X = np.random.random((10, 2))
|
||||
|
||||
haversine = DistanceMetric.get_metric("haversine")
|
||||
|
||||
D1 = haversine.pairwise(X)
|
||||
D2 = np.zeros_like(D1)
|
||||
for i, x1 in enumerate(X):
|
||||
for j, x2 in enumerate(X):
|
||||
D2[i, j] = haversine_slow(x1, x2)
|
||||
|
||||
assert_array_almost_equal(D1, D2)
|
||||
assert_array_almost_equal(haversine.dist_to_rdist(D1),
|
||||
np.sin(0.5 * D2) ** 2)
|
||||
|
||||
|
||||
def test_pyfunc_metric():
|
||||
X = np.random.random((10, 3))
|
||||
|
||||
euclidean = DistanceMetric.get_metric("euclidean")
|
||||
pyfunc = DistanceMetric.get_metric("pyfunc", func=dist_func, p=2)
|
||||
|
||||
# Check if both callable metric and predefined metric initialized
|
||||
# DistanceMetric object is picklable
|
||||
euclidean_pkl = pickle.loads(pickle.dumps(euclidean))
|
||||
pyfunc_pkl = pickle.loads(pickle.dumps(pyfunc))
|
||||
|
||||
D1 = euclidean.pairwise(X)
|
||||
D2 = pyfunc.pairwise(X)
|
||||
|
||||
D1_pkl = euclidean_pkl.pairwise(X)
|
||||
D2_pkl = pyfunc_pkl.pairwise(X)
|
||||
|
||||
assert_array_almost_equal(D1, D2)
|
||||
assert_array_almost_equal(D1_pkl, D2_pkl)
|
||||
|
||||
|
||||
def test_bad_pyfunc_metric():
|
||||
def wrong_distance(x, y):
|
||||
return "1"
|
||||
|
||||
X = np.ones((5, 2))
|
||||
assert_raises_regex(TypeError,
|
||||
"Custom distance function must accept two vectors",
|
||||
BallTree, X, metric=wrong_distance)
|
||||
|
||||
|
||||
def test_input_data_size():
|
||||
# Regression test for #6288
|
||||
# Previously, a metric requiring a particular input dimension would fail
|
||||
def custom_metric(x, y):
|
||||
assert x.shape[0] == 3
|
||||
return np.sum((x - y) ** 2)
|
||||
|
||||
rng = check_random_state(0)
|
||||
X = rng.rand(10, 3)
|
||||
|
||||
pyfunc = DistanceMetric.get_metric("pyfunc", func=custom_metric)
|
||||
eucl = DistanceMetric.get_metric("euclidean")
|
||||
assert_array_almost_equal(pyfunc.pairwise(X), eucl.pairwise(X) ** 2)
|
79
venv/Lib/site-packages/sklearn/neighbors/tests/test_graph.py
Normal file
@@ -0,0 +1,79 @@
import numpy as np
|
||||
|
||||
from sklearn.metrics import euclidean_distances
|
||||
from sklearn.neighbors import KNeighborsTransformer, RadiusNeighborsTransformer
|
||||
from sklearn.neighbors._base import _is_sorted_by_data
|
||||
|
||||
|
||||
def test_transformer_result():
|
||||
# Test the number of neighbors returned
|
||||
n_neighbors = 5
|
||||
n_samples_fit = 20
|
||||
n_queries = 18
|
||||
n_features = 10
|
||||
|
||||
rng = np.random.RandomState(42)
|
||||
X = rng.randn(n_samples_fit, n_features)
|
||||
X2 = rng.randn(n_queries, n_features)
|
||||
radius = np.percentile(euclidean_distances(X), 10)
|
||||
|
||||
# with n_neighbors
|
||||
for mode in ['distance', 'connectivity']:
|
||||
add_one = mode == 'distance'
|
||||
nnt = KNeighborsTransformer(n_neighbors=n_neighbors, mode=mode)
|
||||
Xt = nnt.fit_transform(X)
|
||||
assert Xt.shape == (n_samples_fit, n_samples_fit)
|
||||
assert Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), )
|
||||
assert Xt.format == 'csr'
|
||||
assert _is_sorted_by_data(Xt)
|
||||
|
||||
X2t = nnt.transform(X2)
|
||||
assert X2t.shape == (n_queries, n_samples_fit)
|
||||
assert X2t.data.shape == (n_queries * (n_neighbors + add_one), )
|
||||
assert X2t.format == 'csr'
|
||||
assert _is_sorted_by_data(X2t)
|
||||
|
||||
# with radius
|
||||
for mode in ['distance', 'connectivity']:
|
||||
add_one = mode == 'distance'
|
||||
nnt = RadiusNeighborsTransformer(radius=radius, mode=mode)
|
||||
Xt = nnt.fit_transform(X)
|
||||
assert Xt.shape == (n_samples_fit, n_samples_fit)
|
||||
assert not Xt.data.shape == (n_samples_fit * (n_neighbors + add_one), )
|
||||
assert Xt.format == 'csr'
|
||||
assert _is_sorted_by_data(Xt)
|
||||
|
||||
X2t = nnt.transform(X2)
|
||||
assert X2t.shape == (n_queries, n_samples_fit)
|
||||
assert not X2t.data.shape == (n_queries * (n_neighbors + add_one), )
|
||||
assert X2t.format == 'csr'
|
||||
assert _is_sorted_by_data(X2t)
|
||||
|
||||
|
||||
def _has_explicit_diagonal(X):
|
||||
"""Return True if the diagonal is explicitly stored"""
|
||||
X = X.tocoo()
|
||||
explicit = X.row[X.row == X.col]
|
||||
return len(explicit) == X.shape[0]
|
||||
|
||||
|
||||
def test_explicit_diagonal():
|
||||
# Test that the diagonal is explicitly stored in the sparse graph
|
||||
n_neighbors = 5
|
||||
n_samples_fit, n_samples_transform, n_features = 20, 18, 10
|
||||
rng = np.random.RandomState(42)
|
||||
X = rng.randn(n_samples_fit, n_features)
|
||||
X2 = rng.randn(n_samples_transform, n_features)
|
||||
|
||||
nnt = KNeighborsTransformer(n_neighbors=n_neighbors)
|
||||
Xt = nnt.fit_transform(X)
|
||||
assert _has_explicit_diagonal(Xt)
|
||||
assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)
|
||||
|
||||
Xt = nnt.transform(X)
|
||||
assert _has_explicit_diagonal(Xt)
|
||||
assert np.all(Xt.data.reshape(n_samples_fit, n_neighbors + 1)[:, 0] == 0)
|
||||
|
||||
# Using transform on new data should not always have zero diagonal
|
||||
X2t = nnt.transform(X2)
|
||||
assert not _has_explicit_diagonal(X2t)
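A typical downstream use of the transformers exercised above, shown as an illustrative sketch (not part of the diff): precompute a sparse distance graph once and feed it to an estimator with metric='precomputed'. The transformer's n_neighbors is chosen larger than the regressor's so the cached graph covers every query.

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsTransformer, KNeighborsRegressor

rng = np.random.RandomState(0)
X, y = rng.randn(50, 5), rng.randn(50)

pipe = make_pipeline(
    KNeighborsTransformer(n_neighbors=10, mode='distance'),
    KNeighborsRegressor(n_neighbors=5, metric='precomputed'))
pipe.fit(X, y)
print(pipe.predict(X[:3]).shape)  # (3,)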
|
@@ -0,0 +1,6 @@
DIMENSION = 3
|
||||
|
||||
METRICS = {'euclidean': {},
|
||||
'manhattan': {},
|
||||
'chebyshev': {},
|
||||
'minkowski': dict(p=3)}
|
250
venv/Lib/site-packages/sklearn/neighbors/tests/test_kde.py
Normal file
@@ -0,0 +1,250 @@
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.utils._testing import assert_allclose, assert_raises
|
||||
from sklearn.neighbors import KernelDensity, KDTree, NearestNeighbors
|
||||
from sklearn.neighbors._ball_tree import kernel_norm
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.exceptions import NotFittedError
|
||||
import joblib
|
||||
|
||||
|
||||
# XXX Duplicated in test_neighbors_tree, test_kde
|
||||
def compute_kernel_slow(Y, X, kernel, h):
|
||||
d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
|
||||
norm = kernel_norm(h, X.shape[1], kernel) / X.shape[0]
|
||||
|
||||
if kernel == 'gaussian':
|
||||
return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
|
||||
elif kernel == 'tophat':
|
||||
return norm * (d < h).sum(-1)
|
||||
elif kernel == 'epanechnikov':
|
||||
return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
|
||||
elif kernel == 'exponential':
|
||||
return norm * (np.exp(-d / h)).sum(-1)
|
||||
elif kernel == 'linear':
|
||||
return norm * ((1 - d / h) * (d < h)).sum(-1)
|
||||
elif kernel == 'cosine':
|
||||
return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
|
||||
else:
|
||||
raise ValueError('kernel not recognized')
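As a sanity check (not part of the diff, and assuming it runs in this test module where compute_kernel_slow and kernel_norm are available), KernelDensity with a gaussian kernel should reproduce the slow reference up to numerical tolerance:

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X_ref = rng.randn(20, 3)
Y_ref = rng.randn(5, 3)

kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X_ref)
dens_kde = np.exp(kde.score_samples(Y_ref))
dens_ref = compute_kernel_slow(Y_ref, X_ref, 'gaussian', 0.5)
print(np.allclose(dens_kde, dens_ref, rtol=1e-5))  # True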
|
||||
|
||||
|
||||
def check_results(kernel, bandwidth, atol, rtol, X, Y, dens_true):
|
||||
kde = KernelDensity(kernel=kernel, bandwidth=bandwidth,
|
||||
atol=atol, rtol=rtol)
|
||||
log_dens = kde.fit(X).score_samples(Y)
|
||||
assert_allclose(np.exp(log_dens), dens_true,
|
||||
atol=atol, rtol=max(1E-7, rtol))
|
||||
assert_allclose(np.exp(kde.score(Y)),
|
||||
np.prod(dens_true),
|
||||
atol=atol, rtol=max(1E-7, rtol))
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'kernel',
|
||||
['gaussian', 'tophat', 'epanechnikov',
|
||||
'exponential', 'linear', 'cosine'])
|
||||
@pytest.mark.parametrize('bandwidth', [0.01, 0.1, 1])
|
||||
def test_kernel_density(kernel, bandwidth):
|
||||
n_samples, n_features = (100, 3)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(n_samples, n_features)
|
||||
Y = rng.randn(n_samples, n_features)
|
||||
|
||||
dens_true = compute_kernel_slow(Y, X, kernel, bandwidth)
|
||||
|
||||
for rtol in [0, 1E-5]:
|
||||
for atol in [1E-6, 1E-2]:
|
||||
for breadth_first in (True, False):
|
||||
check_results(kernel, bandwidth, atol, rtol,
|
||||
X, Y, dens_true)
|
||||
|
||||
|
||||
def test_kernel_density_sampling(n_samples=100, n_features=3):
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(n_samples, n_features)
|
||||
|
||||
bandwidth = 0.2
|
||||
|
||||
for kernel in ['gaussian', 'tophat']:
|
||||
# draw a tophat sample
|
||||
kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
|
||||
samp = kde.sample(100)
|
||||
assert X.shape == samp.shape
|
||||
|
||||
# check that samples are in the right range
|
||||
nbrs = NearestNeighbors(n_neighbors=1).fit(X)
|
||||
dist, ind = nbrs.kneighbors(X, return_distance=True)
|
||||
|
||||
if kernel == 'tophat':
|
||||
assert np.all(dist < bandwidth)
|
||||
elif kernel == 'gaussian':
|
||||
# 5 standard deviations is safe for 100 samples, but there's a
|
||||
# very small chance this test could fail.
|
||||
assert np.all(dist < 5 * bandwidth)
|
||||
|
||||
# check unsupported kernels
|
||||
for kernel in ['epanechnikov', 'exponential', 'linear', 'cosine']:
|
||||
kde = KernelDensity(bandwidth=bandwidth, kernel=kernel).fit(X)
|
||||
assert_raises(NotImplementedError, kde.sample, 100)
|
||||
|
||||
# non-regression test: used to return a scalar
|
||||
X = rng.randn(4, 1)
|
||||
kde = KernelDensity(kernel="gaussian").fit(X)
|
||||
assert kde.sample().shape == (1, 1)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('algorithm', ['auto', 'ball_tree', 'kd_tree'])
|
||||
@pytest.mark.parametrize('metric',
|
||||
['euclidean', 'minkowski', 'manhattan',
|
||||
'chebyshev', 'haversine'])
|
||||
def test_kde_algorithm_metric_choice(algorithm, metric):
|
||||
# Smoke test for various metrics and algorithms
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(10, 2) # 2 features required for haversine dist.
|
||||
Y = rng.randn(10, 2)
|
||||
|
||||
if algorithm == 'kd_tree' and metric not in KDTree.valid_metrics:
|
||||
assert_raises(ValueError, KernelDensity,
|
||||
algorithm=algorithm, metric=metric)
|
||||
else:
|
||||
kde = KernelDensity(algorithm=algorithm, metric=metric)
|
||||
kde.fit(X)
|
||||
y_dens = kde.score_samples(Y)
|
||||
assert y_dens.shape == Y.shape[:1]
|
||||
|
||||
|
||||
def test_kde_score(n_samples=100, n_features=3):
|
||||
pass
|
||||
# FIXME
|
||||
# rng = np.random.RandomState(0)
|
||||
# X = rng.random_sample((n_samples, n_features))
|
||||
# Y = rng.random_sample((n_samples, n_features))
|
||||
|
||||
|
||||
def test_kde_badargs():
|
||||
assert_raises(ValueError, KernelDensity,
|
||||
algorithm='blah')
|
||||
assert_raises(ValueError, KernelDensity,
|
||||
bandwidth=0)
|
||||
assert_raises(ValueError, KernelDensity,
|
||||
kernel='blah')
|
||||
assert_raises(ValueError, KernelDensity,
|
||||
metric='blah')
|
||||
assert_raises(ValueError, KernelDensity,
|
||||
algorithm='kd_tree', metric='blah')
|
||||
kde = KernelDensity()
|
||||
assert_raises(ValueError, kde.fit, np.random.random((200, 10)),
|
||||
sample_weight=np.random.random((200, 10)))
|
||||
assert_raises(ValueError, kde.fit, np.random.random((200, 10)),
|
||||
sample_weight=-np.random.random(200))
|
||||
|
||||
|
||||
def test_kde_pipeline_gridsearch():
|
||||
# test that kde plays nice in pipelines and grid-searches
|
||||
X, _ = make_blobs(cluster_std=.1, random_state=1,
|
||||
centers=[[0, 1], [1, 0], [0, 0]])
|
||||
pipe1 = make_pipeline(StandardScaler(with_mean=False, with_std=False),
|
||||
KernelDensity(kernel="gaussian"))
|
||||
params = dict(kerneldensity__bandwidth=[0.001, 0.01, 0.1, 1, 10])
|
||||
search = GridSearchCV(pipe1, param_grid=params)
|
||||
search.fit(X)
|
||||
assert search.best_params_['kerneldensity__bandwidth'] == .1
|
||||
|
||||
|
||||
def test_kde_sample_weights():
|
||||
n_samples = 400
|
||||
size_test = 20
|
||||
weights_neutral = np.full(n_samples, 3.)
|
||||
for d in [1, 2, 10]:
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(n_samples, d)
|
||||
weights = 1 + (10 * X.sum(axis=1)).astype(np.int8)
|
||||
X_repetitions = np.repeat(X, weights, axis=0)
|
||||
n_samples_test = size_test // d
|
||||
test_points = rng.rand(n_samples_test, d)
|
||||
for algorithm in ['auto', 'ball_tree', 'kd_tree']:
|
||||
for metric in ['euclidean', 'minkowski', 'manhattan',
|
||||
'chebyshev']:
|
||||
if algorithm != 'kd_tree' or metric in KDTree.valid_metrics:
|
||||
kde = KernelDensity(algorithm=algorithm, metric=metric)
|
||||
|
||||
# Test that adding a constant sample weight has no effect
|
||||
kde.fit(X, sample_weight=weights_neutral)
|
||||
scores_const_weight = kde.score_samples(test_points)
|
||||
sample_const_weight = kde.sample(random_state=1234)
|
||||
kde.fit(X)
|
||||
scores_no_weight = kde.score_samples(test_points)
|
||||
sample_no_weight = kde.sample(random_state=1234)
|
||||
assert_allclose(scores_const_weight, scores_no_weight)
|
||||
assert_allclose(sample_const_weight, sample_no_weight)
|
||||
|
||||
# Test equivalence between sampling and (integer) weights
|
||||
kde.fit(X, sample_weight=weights)
|
||||
scores_weight = kde.score_samples(test_points)
|
||||
sample_weight = kde.sample(random_state=1234)
|
||||
kde.fit(X_repetitions)
|
||||
scores_ref_sampling = kde.score_samples(test_points)
|
||||
sample_ref_sampling = kde.sample(random_state=1234)
|
||||
assert_allclose(scores_weight, scores_ref_sampling)
|
||||
assert_allclose(sample_weight, sample_ref_sampling)
|
||||
|
||||
# Test that sample weights has a non-trivial effect
|
||||
diff = np.max(np.abs(scores_no_weight - scores_weight))
|
||||
assert diff > 0.001
|
||||
|
||||
# Test invariance with respect to arbitrary scaling
|
||||
scale_factor = rng.rand()
|
||||
kde.fit(X, sample_weight=(scale_factor * weights))
|
||||
scores_scaled_weight = kde.score_samples(test_points)
|
||||
assert_allclose(scores_scaled_weight, scores_weight)
|
||||
|
||||
|
||||
def test_sample_weight_invalid():
|
||||
# Check sample weighting raises errors.
|
||||
kde = KernelDensity()
|
||||
data = np.reshape([1., 2., 3.], (-1, 1))
|
||||
|
||||
sample_weight = [0.1, -0.2, 0.3]
|
||||
expected_err = "sample_weight must have positive values"
|
||||
with pytest.raises(ValueError, match=expected_err):
|
||||
kde.fit(data, sample_weight=sample_weight)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('sample_weight', [None, [0.1, 0.2, 0.3]])
|
||||
def test_pickling(tmpdir, sample_weight):
|
||||
# Make sure that predictions are the same before and after pickling. Used
|
||||
# to be a bug because sample_weights wasn't pickled and the resulting tree
|
||||
# would miss some info.
|
||||
|
||||
kde = KernelDensity()
|
||||
data = np.reshape([1., 2., 3.], (-1, 1))
|
||||
kde.fit(data, sample_weight=sample_weight)
|
||||
|
||||
X = np.reshape([1.1, 2.1], (-1, 1))
|
||||
scores = kde.score_samples(X)
|
||||
|
||||
file_path = str(tmpdir.join('dump.pkl'))
|
||||
joblib.dump(kde, file_path)
|
||||
kde = joblib.load(file_path)
|
||||
scores_pickled = kde.score_samples(X)
|
||||
|
||||
assert_allclose(scores, scores_pickled)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('method', ['score_samples', 'sample'])
|
||||
def test_check_is_fitted(method):
|
||||
# Check that predict raises an exception in an unfitted estimator.
|
||||
# Unfitted estimators should raise a NotFittedError.
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(10, 2)
|
||||
kde = KernelDensity()
|
||||
|
||||
with pytest.raises(NotFittedError):
|
||||
getattr(kde, method)(X)
|
232
venv/Lib/site-packages/sklearn/neighbors/tests/test_lof.py
Normal file
@@ -0,0 +1,232 @@
# Authors: Nicolas Goix <nicolas.goix@telecom-paristech.fr>
|
||||
# Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
|
||||
# License: BSD 3 clause
|
||||
|
||||
from math import sqrt
|
||||
|
||||
import numpy as np
|
||||
from sklearn import neighbors
|
||||
|
||||
import pytest
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn import metrics
|
||||
from sklearn.metrics import roc_auc_score
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.utils._testing import assert_warns_message
|
||||
from sklearn.utils._testing import assert_raises
|
||||
from sklearn.utils._testing import assert_raises_regex
|
||||
from sklearn.utils.estimator_checks import check_estimator
|
||||
from sklearn.utils.estimator_checks import check_outlier_corruption
|
||||
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
|
||||
# load the iris dataset
|
||||
# and randomly permute it
|
||||
rng = check_random_state(0)
|
||||
iris = load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data = iris.data[perm]
|
||||
iris.target = iris.target[perm]
|
||||
|
||||
|
||||
def test_lof():
|
||||
# Toy sample (the last two samples are outliers):
|
||||
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [5, 3], [-4, 2]]
|
||||
|
||||
# Test LocalOutlierFactor:
|
||||
clf = neighbors.LocalOutlierFactor(n_neighbors=5)
|
||||
score = clf.fit(X).negative_outlier_factor_
|
||||
assert_array_equal(clf._fit_X, X)
|
||||
|
||||
# Assert largest outlier score is smaller than smallest inlier score:
|
||||
assert np.min(score[:-2]) > np.max(score[-2:])
|
||||
|
||||
# Assert predict() works:
|
||||
clf = neighbors.LocalOutlierFactor(contamination=0.25,
|
||||
n_neighbors=5).fit(X)
|
||||
assert_array_equal(clf._predict(), 6 * [1] + 2 * [-1])
|
||||
assert_array_equal(clf.fit_predict(X), 6 * [1] + 2 * [-1])
|
||||
|
||||
|
||||
def test_lof_performance():
|
||||
# Generate train/test data
|
||||
rng = check_random_state(2)
|
||||
X = 0.3 * rng.randn(120, 2)
|
||||
X_train = X[:100]
|
||||
|
||||
# Generate some abnormal novel observations
|
||||
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
|
||||
X_test = np.r_[X[100:], X_outliers]
|
||||
y_test = np.array([0] * 20 + [1] * 20)
|
||||
|
||||
# fit the model for novelty detection
|
||||
clf = neighbors.LocalOutlierFactor(novelty=True).fit(X_train)
|
||||
|
||||
# predict scores (the lower, the more normal)
|
||||
y_pred = -clf.decision_function(X_test)
|
||||
|
||||
# check that roc_auc is good
|
||||
assert roc_auc_score(y_test, y_pred) > .99
|
||||
|
||||
|
||||
def test_lof_values():
|
||||
# toy samples:
|
||||
X_train = [[1, 1], [1, 2], [2, 1]]
|
||||
clf1 = neighbors.LocalOutlierFactor(n_neighbors=2,
|
||||
contamination=0.1,
|
||||
novelty=True).fit(X_train)
|
||||
clf2 = neighbors.LocalOutlierFactor(n_neighbors=2,
|
||||
novelty=True).fit(X_train)
|
||||
s_0 = 2. * sqrt(2.) / (1. + sqrt(2.))
|
||||
s_1 = (1. + sqrt(2)) * (1. / (4. * sqrt(2.)) + 1. / (2. + 2. * sqrt(2)))
|
||||
# check predict()
|
||||
assert_array_almost_equal(-clf1.negative_outlier_factor_, [s_0, s_1, s_1])
|
||||
assert_array_almost_equal(-clf2.negative_outlier_factor_, [s_0, s_1, s_1])
|
||||
# check predict(one sample not in train)
|
||||
assert_array_almost_equal(-clf1.score_samples([[2., 2.]]), [s_0])
|
||||
assert_array_almost_equal(-clf2.score_samples([[2., 2.]]), [s_0])
|
||||
# check predict(one sample already in train)
|
||||
assert_array_almost_equal(-clf1.score_samples([[1., 1.]]), [s_1])
|
||||
assert_array_almost_equal(-clf2.score_samples([[1., 1.]]), [s_1])
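For reference (not from the diff), the closed forms follow directly from the LOF definition with k=2 on the three training points A=(1,1), B=(1,2), C=(2,1): the pairwise distances are d(A,B)=d(A,C)=1 and d(B,C)=sqrt(2), so the 2-distance is 1 for A and sqrt(2) for B and C. The reachability-based local densities come out as lrd(A)=1/sqrt(2) and lrd(B)=lrd(C)=2/(1+sqrt(2)), and averaging the density ratios over each point's two neighbors gives LOF(A) = 2*sqrt(2)/(1+sqrt(2)) = s_0 and LOF(B) = LOF(C) = (1+sqrt(2))*(1/(4*sqrt(2)) + 1/(2+2*sqrt(2))) = s_1, matching the constants asserted above (negative_outlier_factor_ stores -LOF).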
|
||||
|
||||
|
||||
def test_lof_precomputed(random_state=42):
|
||||
"""Tests LOF with a distance matrix."""
|
||||
# Note: smaller samples may result in spurious test success
|
||||
rng = np.random.RandomState(random_state)
|
||||
X = rng.random_sample((10, 4))
|
||||
Y = rng.random_sample((3, 4))
|
||||
DXX = metrics.pairwise_distances(X, metric='euclidean')
|
||||
DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
|
||||
# As a feature matrix (n_samples by n_features)
|
||||
lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
|
||||
lof_X.fit(X)
|
||||
pred_X_X = lof_X._predict()
|
||||
pred_X_Y = lof_X.predict(Y)
|
||||
|
||||
# As a dense distance matrix (n_samples by n_samples)
|
||||
lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
|
||||
metric='precomputed', novelty=True)
|
||||
lof_D.fit(DXX)
|
||||
pred_D_X = lof_D._predict()
|
||||
pred_D_Y = lof_D.predict(DYX)
|
||||
|
||||
assert_array_almost_equal(pred_X_X, pred_D_X)
|
||||
assert_array_almost_equal(pred_X_Y, pred_D_Y)
|
||||
|
||||
|
||||
def test_n_neighbors_attribute():
|
||||
X = iris.data
|
||||
clf = neighbors.LocalOutlierFactor(n_neighbors=500).fit(X)
|
||||
assert clf.n_neighbors_ == X.shape[0] - 1
|
||||
|
||||
clf = neighbors.LocalOutlierFactor(n_neighbors=500)
|
||||
assert_warns_message(UserWarning,
|
||||
"n_neighbors will be set to (n_samples - 1)",
|
||||
clf.fit, X)
|
||||
assert clf.n_neighbors_ == X.shape[0] - 1
|
||||
|
||||
|
||||
def test_score_samples():
|
||||
X_train = [[1, 1], [1, 2], [2, 1]]
|
||||
clf1 = neighbors.LocalOutlierFactor(n_neighbors=2,
|
||||
contamination=0.1,
|
||||
novelty=True).fit(X_train)
|
||||
clf2 = neighbors.LocalOutlierFactor(n_neighbors=2,
|
||||
novelty=True).fit(X_train)
|
||||
assert_array_equal(clf1.score_samples([[2., 2.]]),
|
||||
clf1.decision_function([[2., 2.]]) + clf1.offset_)
|
||||
assert_array_equal(clf2.score_samples([[2., 2.]]),
|
||||
clf2.decision_function([[2., 2.]]) + clf2.offset_)
|
||||
assert_array_equal(clf1.score_samples([[2., 2.]]),
|
||||
clf2.score_samples([[2., 2.]]))
|
||||
|
||||
|
||||
def test_contamination():
|
||||
X = [[1, 1], [1, 0]]
|
||||
clf = neighbors.LocalOutlierFactor(contamination=0.6)
|
||||
assert_raises(ValueError, clf.fit, X)
|
||||
|
||||
|
||||
def test_novelty_errors():
|
||||
X = iris.data
|
||||
|
||||
# check errors for novelty=False
|
||||
clf = neighbors.LocalOutlierFactor()
|
||||
clf.fit(X)
|
||||
# predict, decision_function and score_samples raise ValueError
|
||||
for method in ['predict', 'decision_function', 'score_samples']:
|
||||
msg = ('{} is not available when novelty=False'.format(method))
|
||||
assert_raises_regex(AttributeError, msg, getattr, clf, method)
|
||||
|
||||
# check errors for novelty=True
|
||||
clf = neighbors.LocalOutlierFactor(novelty=True)
|
||||
msg = 'fit_predict is not available when novelty=True'
|
||||
assert_raises_regex(AttributeError, msg, getattr, clf, 'fit_predict')
|
||||
|
||||
|
||||
def test_novelty_training_scores():
|
||||
# check that the scores of the training samples are still accessible
|
||||
# when novelty=True through the negative_outlier_factor_ attribute
|
||||
X = iris.data
|
||||
|
||||
# fit with novelty=False
|
||||
clf_1 = neighbors.LocalOutlierFactor()
|
||||
clf_1.fit(X)
|
||||
scores_1 = clf_1.negative_outlier_factor_
|
||||
|
||||
# fit with novelty=True
|
||||
clf_2 = neighbors.LocalOutlierFactor(novelty=True)
|
||||
clf_2.fit(X)
|
||||
scores_2 = clf_2.negative_outlier_factor_
|
||||
|
||||
assert_array_almost_equal(scores_1, scores_2)
|
||||
|
||||
|
||||
def test_hasattr_prediction():
|
||||
# check availability of prediction methods depending on novelty value.
|
||||
X = [[1, 1], [1, 2], [2, 1]]
|
||||
|
||||
# when novelty=True
|
||||
clf = neighbors.LocalOutlierFactor(novelty=True)
|
||||
clf.fit(X)
|
||||
assert hasattr(clf, 'predict')
|
||||
assert hasattr(clf, 'decision_function')
|
||||
assert hasattr(clf, 'score_samples')
|
||||
assert not hasattr(clf, 'fit_predict')
|
||||
|
||||
# when novelty=False
|
||||
clf = neighbors.LocalOutlierFactor(novelty=False)
|
||||
clf.fit(X)
|
||||
assert hasattr(clf, 'fit_predict')
|
||||
assert not hasattr(clf, 'predict')
|
||||
assert not hasattr(clf, 'decision_function')
|
||||
assert not hasattr(clf, 'score_samples')
|
||||
|
||||
|
||||
def test_novelty_true_common_tests():
|
||||
|
||||
# the common tests are run for the default LOF (novelty=False).
|
||||
# here we run these common tests for LOF when novelty=True
|
||||
check_estimator(neighbors.LocalOutlierFactor(novelty=True))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('expected_outliers', [30, 53])
|
||||
def test_predicted_outlier_number(expected_outliers):
|
||||
# the number of predicted outliers should be equal to the number of
|
||||
# expected outliers unless there are ties in the abnormality scores.
|
||||
X = iris.data
|
||||
n_samples = X.shape[0]
|
||||
contamination = float(expected_outliers)/n_samples
|
||||
|
||||
clf = neighbors.LocalOutlierFactor(contamination=contamination)
|
||||
y_pred = clf.fit_predict(X)
|
||||
|
||||
num_outliers = np.sum(y_pred != 1)
|
||||
if num_outliers != expected_outliers:
|
||||
y_dec = clf.negative_outlier_factor_
|
||||
check_outlier_corruption(num_outliers, expected_outliers, y_dec)
|
534
venv/Lib/site-packages/sklearn/neighbors/tests/test_nca.py
Normal file

|
@@ -0,0 +1,534 @@
|
|||
# coding: utf-8
|
||||
"""
|
||||
Testing for Neighborhood Component Analysis module (sklearn.neighbors.nca)
|
||||
"""
|
||||
|
||||
# Authors: William de Vazelhes <wdevazelhes@gmail.com>
|
||||
# John Chiotellis <ioannis.chiotellis@in.tum.de>
|
||||
# License: BSD 3 clause
|
||||
|
||||
import pytest
|
||||
import re
|
||||
import numpy as np
|
||||
from numpy.testing import assert_array_equal, assert_array_almost_equal
|
||||
from scipy.optimize import check_grad
|
||||
from sklearn import clone
|
||||
from sklearn.exceptions import ConvergenceWarning
|
||||
from sklearn.utils import check_random_state
|
||||
from sklearn.utils._testing import (assert_raises,
|
||||
assert_raise_message, assert_warns_message)
|
||||
from sklearn.datasets import load_iris, make_classification, make_blobs
|
||||
from sklearn.neighbors import NeighborhoodComponentsAnalysis
|
||||
from sklearn.metrics import pairwise_distances
|
||||
|
||||
|
||||
rng = check_random_state(0)
|
||||
# load and shuffle iris dataset
|
||||
iris = load_iris()
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris_data = iris.data[perm]
|
||||
iris_target = iris.target[perm]
|
||||
EPS = np.finfo(float).eps
|
||||
|
||||
|
||||
def test_simple_example():
|
||||
"""Test on a simple example.
|
||||
|
||||
Puts four points in the input space where points with opposite labels are
|
||||
next to each other. After transform the samples from the same class
|
||||
should be next to each other.
|
||||
|
||||
"""
|
||||
X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
|
||||
y = np.array([1, 0, 1, 0])
|
||||
nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
|
||||
random_state=42)
|
||||
nca.fit(X, y)
|
||||
X_t = nca.transform(X)
|
||||
assert_array_equal(pairwise_distances(X_t).argsort()[:, 1],
|
||||
np.array([2, 3, 0, 1]))
|
||||
|
||||
|
||||
def test_toy_example_collapse_points():
|
||||
"""Test on a toy example of three points that should collapse
|
||||
|
||||
We build a simple example: two points from the same class and a point from
|
||||
a different class in the middle of them. On this simple example, the new
|
||||
(transformed) points should all collapse into one single point. Indeed, the
|
||||
objective is 2/(1 + exp(d/2)), with d the euclidean distance between the
|
||||
two samples from the same class. This is maximized for d=0 (because d>=0),
|
||||
with an objective equal to 1 (loss=-1.).
|
||||
|
||||
"""
|
||||
rng = np.random.RandomState(42)
|
||||
input_dim = 5
|
||||
two_points = rng.randn(2, input_dim)
|
||||
X = np.vstack([two_points, two_points.mean(axis=0)[np.newaxis, :]])
|
||||
y = [0, 0, 1]
|
||||
|
||||
class LossStorer:
|
||||
|
||||
def __init__(self, X, y):
|
||||
self.loss = np.inf # initialize the loss to very high
|
||||
# Initialize a fake NCA and variables needed to compute the loss:
|
||||
self.fake_nca = NeighborhoodComponentsAnalysis()
|
||||
self.fake_nca.n_iter_ = np.inf
|
||||
self.X, y, _ = self.fake_nca._validate_params(X, y)
|
||||
self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
|
||||
|
||||
def callback(self, transformation, n_iter):
|
||||
"""Stores the last value of the loss function"""
|
||||
self.loss, _ = self.fake_nca._loss_grad_lbfgs(transformation,
|
||||
self.X,
|
||||
self.same_class_mask,
|
||||
-1.0)
|
||||
|
||||
loss_storer = LossStorer(X, y)
|
||||
nca = NeighborhoodComponentsAnalysis(random_state=42,
|
||||
callback=loss_storer.callback)
|
||||
X_t = nca.fit_transform(X, y)
|
||||
print(X_t)
|
||||
# test that points are collapsed into one point
|
||||
assert_array_almost_equal(X_t - X_t[0], 0.)
|
||||
assert abs(loss_storer.loss + 1) < 1e-10
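As a quick numeric check of the objective quoted in the docstring (an illustration, not part of the test): 2 / (1 + exp(d/2)) decreases monotonically in the within-class distance d and equals 1 only at d == 0, which is why the optimizer drives the loss towards -1.

import numpy as np

d = np.array([0.0, 0.5, 1.0, 2.0, 4.0])       # within-class distances
objective = 2. / (1. + np.exp(d / 2.))
print(objective)   # starts at 1.0 for d == 0 and decreases monotonically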
|
||||
|
||||
|
||||
def test_finite_differences():
|
||||
"""Test gradient of loss function
|
||||
|
||||
Assert that the gradient is almost equal to its finite differences
|
||||
approximation.
|
||||
"""
|
||||
# Initialize the transformation `M`, as well as `X` and `y` and `NCA`
|
||||
rng = np.random.RandomState(42)
|
||||
X, y = make_classification()
|
||||
M = rng.randn(rng.randint(1, X.shape[1] + 1),
|
||||
X.shape[1])
|
||||
nca = NeighborhoodComponentsAnalysis()
|
||||
nca.n_iter_ = 0
|
||||
mask = y[:, np.newaxis] == y[np.newaxis, :]
|
||||
|
||||
def fun(M):
|
||||
return nca._loss_grad_lbfgs(M, X, mask)[0]
|
||||
|
||||
def grad(M):
|
||||
return nca._loss_grad_lbfgs(M, X, mask)[1]
|
||||
|
||||
# compute relative error
|
||||
rel_diff = check_grad(fun, grad, M.ravel()) / np.linalg.norm(grad(M))
|
||||
np.testing.assert_almost_equal(rel_diff, 0., decimal=5)
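The same gradient-checking idea in isolation (a self-contained illustration, not taken from scikit-learn): scipy.optimize.check_grad compares an analytic gradient with a finite-difference approximation and returns the norm of the difference.

import numpy as np
from scipy.optimize import check_grad

def f(x):
    return np.dot(x, x)      # f(x) = x.x, with analytic gradient 2x

def grad_f(x):
    return 2 * x

err = check_grad(f, grad_f, np.array([1.0, -2.0, 0.5]))
print(err)   # ~1e-8: the finite-difference estimate matches the analytic gradient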
|
||||
|
||||
|
||||
def test_params_validation():
|
||||
# Test that invalid parameters raise value error
|
||||
X = np.arange(12).reshape(4, 3)
|
||||
y = [1, 1, 2, 2]
|
||||
NCA = NeighborhoodComponentsAnalysis
|
||||
rng = np.random.RandomState(42)
|
||||
|
||||
# TypeError
|
||||
assert_raises(TypeError, NCA(max_iter='21').fit, X, y)
|
||||
assert_raises(TypeError, NCA(verbose='true').fit, X, y)
|
||||
assert_raises(TypeError, NCA(tol='1').fit, X, y)
|
||||
assert_raises(TypeError, NCA(n_components='invalid').fit, X, y)
|
||||
assert_raises(TypeError, NCA(warm_start=1).fit, X, y)
|
||||
|
||||
# ValueError
|
||||
assert_raise_message(ValueError,
|
||||
"`init` must be 'auto', 'pca', 'lda', 'identity', "
|
||||
"'random' or a numpy array of shape "
|
||||
"(n_components, n_features).",
|
||||
NCA(init=1).fit, X, y)
|
||||
assert_raise_message(ValueError,
|
||||
'`max_iter`= -1, must be >= 1.',
|
||||
NCA(max_iter=-1).fit, X, y)
|
||||
|
||||
init = rng.rand(5, 3)
|
||||
assert_raise_message(ValueError,
|
||||
'The output dimensionality ({}) of the given linear '
|
||||
'transformation `init` cannot be greater than its '
|
||||
'input dimensionality ({}).'
|
||||
.format(init.shape[0], init.shape[1]),
|
||||
NCA(init=init).fit, X, y)
|
||||
|
||||
n_components = 10
|
||||
assert_raise_message(ValueError,
|
||||
'The preferred dimensionality of the '
|
||||
'projected space `n_components` ({}) cannot '
|
||||
'be greater than the given data '
|
||||
'dimensionality ({})!'
|
||||
.format(n_components, X.shape[1]),
|
||||
NCA(n_components=n_components).fit, X, y)
|
||||
|
||||
|
||||
def test_transformation_dimensions():
|
||||
X = np.arange(12).reshape(4, 3)
|
||||
y = [1, 1, 2, 2]
|
||||
|
||||
# Fail if transformation input dimension does not match inputs dimensions
|
||||
transformation = np.array([[1, 2], [3, 4]])
|
||||
assert_raises(ValueError,
|
||||
NeighborhoodComponentsAnalysis(init=transformation).fit,
|
||||
X, y)
|
||||
|
||||
# Fail if transformation output dimension is larger than
|
||||
# transformation input dimension
|
||||
transformation = np.array([[1, 2], [3, 4], [5, 6]])
|
||||
# len(transformation) > len(transformation[0])
|
||||
assert_raises(ValueError,
|
||||
NeighborhoodComponentsAnalysis(init=transformation).fit,
|
||||
X, y)
|
||||
|
||||
# Pass otherwise
|
||||
transformation = np.arange(9).reshape(3, 3)
|
||||
NeighborhoodComponentsAnalysis(init=transformation).fit(X, y)
|
||||
|
||||
|
||||
def test_n_components():
|
||||
rng = np.random.RandomState(42)
|
||||
X = np.arange(12).reshape(4, 3)
|
||||
y = [1, 1, 2, 2]
|
||||
|
||||
init = rng.rand(X.shape[1] - 1, 3)
|
||||
|
||||
# n_components = X.shape[1] != transformation.shape[0]
|
||||
n_components = X.shape[1]
|
||||
nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
|
||||
assert_raise_message(ValueError,
|
||||
'The preferred dimensionality of the '
|
||||
'projected space `n_components` ({}) does not match '
|
||||
'the output dimensionality of the given '
|
||||
'linear transformation `init` ({})!'
|
||||
.format(n_components, init.shape[0]),
|
||||
nca.fit, X, y)
|
||||
|
||||
# n_components > X.shape[1]
|
||||
n_components = X.shape[1] + 2
|
||||
nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
|
||||
assert_raise_message(ValueError,
|
||||
'The preferred dimensionality of the '
|
||||
'projected space `n_components` ({}) cannot '
|
||||
'be greater than the given data '
|
||||
'dimensionality ({})!'
|
||||
.format(n_components, X.shape[1]),
|
||||
nca.fit, X, y)
|
||||
|
||||
# n_components < X.shape[1]
|
||||
nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity')
|
||||
nca.fit(X, y)
|
||||
|
||||
|
||||
def test_init_transformation():
|
||||
rng = np.random.RandomState(42)
|
||||
X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
|
||||
|
||||
# Start learning from scratch
|
||||
nca = NeighborhoodComponentsAnalysis(init='identity')
|
||||
nca.fit(X, y)
|
||||
|
||||
# Initialize with random
|
||||
nca_random = NeighborhoodComponentsAnalysis(init='random')
|
||||
nca_random.fit(X, y)
|
||||
|
||||
# Initialize with auto
|
||||
nca_auto = NeighborhoodComponentsAnalysis(init='auto')
|
||||
nca_auto.fit(X, y)
|
||||
|
||||
# Initialize with PCA
|
||||
nca_pca = NeighborhoodComponentsAnalysis(init='pca')
|
||||
nca_pca.fit(X, y)
|
||||
|
||||
# Initialize with LDA
|
||||
nca_lda = NeighborhoodComponentsAnalysis(init='lda')
|
||||
nca_lda.fit(X, y)
|
||||
|
||||
init = rng.rand(X.shape[1], X.shape[1])
|
||||
nca = NeighborhoodComponentsAnalysis(init=init)
|
||||
nca.fit(X, y)
|
||||
|
||||
# init.shape[1] must match X.shape[1]
|
||||
init = rng.rand(X.shape[1], X.shape[1] + 1)
|
||||
nca = NeighborhoodComponentsAnalysis(init=init)
|
||||
assert_raise_message(ValueError,
|
||||
'The input dimensionality ({}) of the given '
|
||||
'linear transformation `init` must match the '
|
||||
'dimensionality of the given inputs `X` ({}).'
|
||||
.format(init.shape[1], X.shape[1]),
|
||||
nca.fit, X, y)
|
||||
|
||||
# init.shape[0] must be <= init.shape[1]
|
||||
init = rng.rand(X.shape[1] + 1, X.shape[1])
|
||||
nca = NeighborhoodComponentsAnalysis(init=init)
|
||||
assert_raise_message(ValueError,
|
||||
'The output dimensionality ({}) of the given '
|
||||
'linear transformation `init` cannot be '
|
||||
'greater than its input dimensionality ({}).'
|
||||
.format(init.shape[0], init.shape[1]),
|
||||
nca.fit, X, y)
|
||||
|
||||
# init.shape[0] must match n_components
|
||||
init = rng.rand(X.shape[1], X.shape[1])
|
||||
n_components = X.shape[1] - 2
|
||||
nca = NeighborhoodComponentsAnalysis(init=init, n_components=n_components)
|
||||
assert_raise_message(ValueError,
|
||||
'The preferred dimensionality of the '
|
||||
'projected space `n_components` ({}) does not match '
|
||||
'the output dimensionality of the given '
|
||||
'linear transformation `init` ({})!'
|
||||
.format(n_components, init.shape[0]),
|
||||
nca.fit, X, y)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_samples', [3, 5, 7, 11])
|
||||
@pytest.mark.parametrize('n_features', [3, 5, 7, 11])
|
||||
@pytest.mark.parametrize('n_classes', [5, 7, 11])
|
||||
@pytest.mark.parametrize('n_components', [3, 5, 7, 11])
|
||||
def test_auto_init(n_samples, n_features, n_classes, n_components):
|
||||
# Test that auto choose the init as expected with every configuration
|
||||
# of order of n_samples, n_features, n_classes and n_components.
|
||||
rng = np.random.RandomState(42)
|
||||
nca_base = NeighborhoodComponentsAnalysis(init='auto',
|
||||
n_components=n_components,
|
||||
max_iter=1,
|
||||
random_state=rng)
|
||||
if n_classes >= n_samples:
|
||||
pass
|
||||
# n_classes > n_samples is impossible, and n_classes == n_samples
|
||||
# throws an error from lda but is an absurd case
|
||||
else:
|
||||
X = rng.randn(n_samples, n_features)
|
||||
y = np.tile(range(n_classes), n_samples // n_classes + 1)[:n_samples]
|
||||
if n_components > n_features:
|
||||
# this would return a ValueError, which is already tested in
|
||||
# test_params_validation
|
||||
pass
|
||||
else:
|
||||
nca = clone(nca_base)
|
||||
nca.fit(X, y)
|
||||
if n_components <= min(n_classes - 1, n_features):
|
||||
nca_other = clone(nca_base).set_params(init='lda')
|
||||
elif n_components < min(n_features, n_samples):
|
||||
nca_other = clone(nca_base).set_params(init='pca')
|
||||
else:
|
||||
nca_other = clone(nca_base).set_params(init='identity')
|
||||
nca_other.fit(X, y)
|
||||
assert_array_almost_equal(nca.components_, nca_other.components_)
|
||||
|
||||
|
||||
def test_warm_start_validation():
|
||||
X, y = make_classification(n_samples=30, n_features=5, n_classes=4,
|
||||
n_redundant=0, n_informative=5, random_state=0)
|
||||
|
||||
nca = NeighborhoodComponentsAnalysis(warm_start=True, max_iter=5)
|
||||
nca.fit(X, y)
|
||||
|
||||
X_less_features, y = make_classification(n_samples=30, n_features=4,
|
||||
n_classes=4, n_redundant=0,
|
||||
n_informative=4, random_state=0)
|
||||
assert_raise_message(ValueError,
|
||||
'The new inputs dimensionality ({}) does not '
|
||||
'match the input dimensionality of the '
|
||||
'previously learned transformation ({}).'
|
||||
.format(X_less_features.shape[1],
|
||||
nca.components_.shape[1]),
|
||||
nca.fit, X_less_features, y)
|
||||
|
||||
|
||||
def test_warm_start_effectiveness():
|
||||
# A 1-iteration second fit on same data should give almost same result
|
||||
# with warm starting, and quite different result without warm starting.
|
||||
|
||||
nca_warm = NeighborhoodComponentsAnalysis(warm_start=True, random_state=0)
|
||||
nca_warm.fit(iris_data, iris_target)
|
||||
transformation_warm = nca_warm.components_
|
||||
nca_warm.max_iter = 1
|
||||
nca_warm.fit(iris_data, iris_target)
|
||||
transformation_warm_plus_one = nca_warm.components_
|
||||
|
||||
nca_cold = NeighborhoodComponentsAnalysis(warm_start=False, random_state=0)
|
||||
nca_cold.fit(iris_data, iris_target)
|
||||
transformation_cold = nca_cold.components_
|
||||
nca_cold.max_iter = 1
|
||||
nca_cold.fit(iris_data, iris_target)
|
||||
transformation_cold_plus_one = nca_cold.components_
|
||||
|
||||
diff_warm = np.sum(np.abs(transformation_warm_plus_one -
|
||||
transformation_warm))
|
||||
diff_cold = np.sum(np.abs(transformation_cold_plus_one -
|
||||
transformation_cold))
|
||||
assert diff_warm < 3.0, ("Transformer changed significantly after one "
|
||||
"iteration even though it was warm-started.")
|
||||
|
||||
assert diff_cold > diff_warm, ("Cold-started transformer changed less "
|
||||
"significantly than warm-started "
|
||||
"transformer after one iteration.")
|
||||
|
||||
|
||||
@pytest.mark.parametrize('init_name', ['pca', 'lda', 'identity', 'random',
|
||||
'precomputed'])
|
||||
def test_verbose(init_name, capsys):
|
||||
# assert there is proper output when verbose = 1, for every initialization
|
||||
# except auto because auto will call one of the others
|
||||
rng = np.random.RandomState(42)
|
||||
X, y = make_blobs(n_samples=30, centers=6, n_features=5, random_state=0)
|
||||
regexp_init = r'... done in \ *\d+\.\d{2}s'
|
||||
msgs = {'pca': "Finding principal components" + regexp_init,
|
||||
'lda': "Finding most discriminative components" + regexp_init}
|
||||
if init_name == 'precomputed':
|
||||
init = rng.randn(X.shape[1], X.shape[1])
|
||||
else:
|
||||
init = init_name
|
||||
nca = NeighborhoodComponentsAnalysis(verbose=1, init=init)
|
||||
nca.fit(X, y)
|
||||
out, _ = capsys.readouterr()
|
||||
|
||||
# check output
|
||||
lines = re.split('\n+', out)
|
||||
# if pca or lda init, an additional line is printed, so we test
|
||||
# it and remove it to test the rest equally among initializations
|
||||
if init_name in ['pca', 'lda']:
|
||||
assert re.match(msgs[init_name], lines[0])
|
||||
lines = lines[1:]
|
||||
assert lines[0] == '[NeighborhoodComponentsAnalysis]'
|
||||
header = '{:>10} {:>20} {:>10}'.format('Iteration', 'Objective Value',
|
||||
'Time(s)')
|
||||
assert lines[1] == '[NeighborhoodComponentsAnalysis] {}'.format(header)
|
||||
assert lines[2] == ('[NeighborhoodComponentsAnalysis] {}'
|
||||
.format('-' * len(header)))
|
||||
for line in lines[3:-2]:
|
||||
# The following regex will match for instance:
|
||||
# '[NeighborhoodComponentsAnalysis] 0 6.988936e+01 0.01'
|
||||
assert re.match(r'\[NeighborhoodComponentsAnalysis\] *\d+ *\d\.\d{6}e'
|
||||
r'[+|-]\d+\ *\d+\.\d{2}', line)
|
||||
assert re.match(r'\[NeighborhoodComponentsAnalysis\] Training took\ *'
|
||||
r'\d+\.\d{2}s\.', lines[-2])
|
||||
assert lines[-1] == ''
|
||||
|
||||
|
||||
def test_no_verbose(capsys):
|
||||
# assert by default there is no output (verbose=0)
|
||||
nca = NeighborhoodComponentsAnalysis()
|
||||
nca.fit(iris_data, iris_target)
|
||||
out, _ = capsys.readouterr()
|
||||
# check output
|
||||
assert(out == '')
|
||||
|
||||
|
||||
def test_singleton_class():
|
||||
X = iris_data
|
||||
y = iris_target
|
||||
|
||||
# one singleton class
|
||||
singleton_class = 1
|
||||
ind_singleton, = np.where(y == singleton_class)
|
||||
y[ind_singleton] = 2
|
||||
y[ind_singleton[0]] = singleton_class
|
||||
|
||||
nca = NeighborhoodComponentsAnalysis(max_iter=30)
|
||||
nca.fit(X, y)
|
||||
|
||||
# One non-singleton class
|
||||
ind_1, = np.where(y == 1)
|
||||
ind_2, = np.where(y == 2)
|
||||
y[ind_1] = 0
|
||||
y[ind_1[0]] = 1
|
||||
y[ind_2] = 0
|
||||
y[ind_2[0]] = 2
|
||||
|
||||
nca = NeighborhoodComponentsAnalysis(max_iter=30)
|
||||
nca.fit(X, y)
|
||||
|
||||
# Only singleton classes
|
||||
ind_0, = np.where(y == 0)
|
||||
ind_1, = np.where(y == 1)
|
||||
ind_2, = np.where(y == 2)
|
||||
X = X[[ind_0[0], ind_1[0], ind_2[0]]]
|
||||
y = y[[ind_0[0], ind_1[0], ind_2[0]]]
|
||||
|
||||
nca = NeighborhoodComponentsAnalysis(init='identity', max_iter=30)
|
||||
nca.fit(X, y)
|
||||
assert_array_equal(X, nca.transform(X))
|
||||
|
||||
|
||||
def test_one_class():
|
||||
X = iris_data[iris_target == 0]
|
||||
y = iris_target[iris_target == 0]
|
||||
|
||||
nca = NeighborhoodComponentsAnalysis(max_iter=30,
|
||||
n_components=X.shape[1],
|
||||
init='identity')
|
||||
nca.fit(X, y)
|
||||
assert_array_equal(X, nca.transform(X))
|
||||
|
||||
|
||||
def test_callback(capsys):
|
||||
X = iris_data
|
||||
y = iris_target
|
||||
|
||||
nca = NeighborhoodComponentsAnalysis(callback='my_cb')
|
||||
assert_raises(ValueError, nca.fit, X, y)
|
||||
|
||||
max_iter = 10
|
||||
|
||||
def my_cb(transformation, n_iter):
|
||||
assert transformation.shape == (iris_data.shape[1]**2,)
|
||||
rem_iter = max_iter - n_iter
|
||||
print('{} iterations remaining...'.format(rem_iter))
|
||||
|
||||
# assert that my_cb is called
|
||||
nca = NeighborhoodComponentsAnalysis(max_iter=max_iter,
|
||||
callback=my_cb, verbose=1)
|
||||
nca.fit(iris_data, iris_target)
|
||||
out, _ = capsys.readouterr()
|
||||
|
||||
# check output
|
||||
assert('{} iterations remaining...'.format(max_iter - 1) in out)
|
||||
|
||||
|
||||
def test_expected_transformation_shape():
|
||||
"""Test that the transformation has the expected shape."""
|
||||
X = iris_data
|
||||
y = iris_target
|
||||
|
||||
class TransformationStorer:
|
||||
|
||||
def __init__(self, X, y):
|
||||
# Initialize a fake NCA and variables needed to call the loss
|
||||
# function:
|
||||
self.fake_nca = NeighborhoodComponentsAnalysis()
|
||||
self.fake_nca.n_iter_ = np.inf
|
||||
self.X, y, _ = self.fake_nca._validate_params(X, y)
|
||||
self.same_class_mask = y[:, np.newaxis] == y[np.newaxis, :]
|
||||
|
||||
def callback(self, transformation, n_iter):
|
||||
"""Stores the last value of the transformation taken as input by
|
||||
the optimizer"""
|
||||
self.transformation = transformation
|
||||
|
||||
transformation_storer = TransformationStorer(X, y)
|
||||
cb = transformation_storer.callback
|
||||
nca = NeighborhoodComponentsAnalysis(max_iter=5, callback=cb)
|
||||
nca.fit(X, y)
|
||||
assert transformation_storer.transformation.size == X.shape[1]**2
|
||||
|
||||
|
||||
def test_convergence_warning():
|
||||
nca = NeighborhoodComponentsAnalysis(max_iter=2, verbose=1)
|
||||
cls_name = nca.__class__.__name__
|
||||
assert_warns_message(ConvergenceWarning,
|
||||
'[{}] NCA did not converge'.format(cls_name),
|
||||
nca.fit, iris_data, iris_target)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('param, value', [('n_components', np.int32(3)),
|
||||
('max_iter', np.int32(100)),
|
||||
('tol', np.float32(0.0001))])
|
||||
def test_parameters_valid_types(param, value):
|
||||
# check that no error is raised when parameters have numpy integer or
|
||||
# floating types.
|
||||
nca = NeighborhoodComponentsAnalysis(**{param: value})
|
||||
|
||||
X = iris_data
|
||||
y = iris_target
|
||||
|
||||
nca.fit(X, y)
|
|
148
venv/Lib/site-packages/sklearn/neighbors/tests/test_nearest_centroid.py
Normal file
@@ -0,0 +1,148 @@
|
|||
"""
|
||||
Testing for the nearest centroid module.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from scipy import sparse as sp
|
||||
from numpy.testing import assert_array_equal
|
||||
|
||||
from sklearn.neighbors import NearestCentroid
|
||||
from sklearn import datasets
|
||||
from sklearn.utils._testing import assert_raises
|
||||
|
||||
# toy sample
|
||||
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
|
||||
X_csr = sp.csr_matrix(X) # Sparse matrix
|
||||
y = [-1, -1, -1, 1, 1, 1]
|
||||
T = [[-1, -1], [2, 2], [3, 2]]
|
||||
T_csr = sp.csr_matrix(T)
|
||||
true_result = [-1, 1, 1]
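Hand-checking the toy expectation (an aside, not part of the test file; it reuses the X, y and T defined just above): without shrinkage, NearestCentroid simply assigns each query point to the class whose per-class mean is closest.

import numpy as np

X_arr, y_arr, T_arr = np.asarray(X), np.asarray(y), np.asarray(T)
classes = np.array([-1, 1])
centroids = np.array([X_arr[y_arr == c].mean(axis=0) for c in classes])  # [[-4/3, -4/3], [4/3, 4/3]]
dists = np.linalg.norm(T_arr[:, None, :] - centroids[None, :, :], axis=-1)
print(classes[dists.argmin(axis=1)])   # [-1  1  1] == true_result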
|
||||
|
||||
# also load the iris dataset
|
||||
# and randomly permute it
|
||||
iris = datasets.load_iris()
|
||||
rng = np.random.RandomState(1)
|
||||
perm = rng.permutation(iris.target.size)
|
||||
iris.data = iris.data[perm]
|
||||
iris.target = iris.target[perm]
|
||||
|
||||
|
||||
def test_classification_toy():
|
||||
# Check classification on a toy dataset, including sparse versions.
|
||||
clf = NearestCentroid()
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(T), true_result)
|
||||
|
||||
# Same test, but with a sparse matrix to fit and test.
|
||||
clf = NearestCentroid()
|
||||
clf.fit(X_csr, y)
|
||||
assert_array_equal(clf.predict(T_csr), true_result)
|
||||
|
||||
# Fit with sparse, test with non-sparse
|
||||
clf = NearestCentroid()
|
||||
clf.fit(X_csr, y)
|
||||
assert_array_equal(clf.predict(T), true_result)
|
||||
|
||||
# Fit with non-sparse, test with sparse
|
||||
clf = NearestCentroid()
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(clf.predict(T_csr), true_result)
|
||||
|
||||
# Fit and predict with non-CSR sparse matrices
|
||||
clf = NearestCentroid()
|
||||
clf.fit(X_csr.tocoo(), y)
|
||||
assert_array_equal(clf.predict(T_csr.tolil()), true_result)
|
||||
|
||||
|
||||
def test_precomputed():
|
||||
clf = NearestCentroid(metric='precomputed')
|
||||
with assert_raises(ValueError):
|
||||
clf.fit(X, y)
|
||||
|
||||
|
||||
def test_iris():
|
||||
# Check consistency on dataset iris.
|
||||
for metric in ('euclidean', 'cosine'):
|
||||
clf = NearestCentroid(metric=metric).fit(iris.data, iris.target)
|
||||
score = np.mean(clf.predict(iris.data) == iris.target)
|
||||
assert score > 0.9, "Failed with score = " + str(score)
|
||||
|
||||
|
||||
def test_iris_shrinkage():
|
||||
# Check consistency on dataset iris, when using shrinkage.
|
||||
for metric in ('euclidean', 'cosine'):
|
||||
for shrink_threshold in [None, 0.1, 0.5]:
|
||||
clf = NearestCentroid(metric=metric,
|
||||
shrink_threshold=shrink_threshold)
|
||||
clf = clf.fit(iris.data, iris.target)
|
||||
score = np.mean(clf.predict(iris.data) == iris.target)
|
||||
assert score > 0.8, "Failed with score = " + str(score)
|
||||
|
||||
|
||||
def test_pickle():
|
||||
import pickle
|
||||
|
||||
# classification
|
||||
obj = NearestCentroid()
|
||||
obj.fit(iris.data, iris.target)
|
||||
score = obj.score(iris.data, iris.target)
|
||||
s = pickle.dumps(obj)
|
||||
|
||||
obj2 = pickle.loads(s)
|
||||
assert type(obj2) == obj.__class__
|
||||
score2 = obj2.score(iris.data, iris.target)
|
||||
assert_array_equal(score, score2,
|
||||
"Failed to generate same score"
|
||||
" after pickling (classification).")
|
||||
|
||||
|
||||
def test_shrinkage_correct():
|
||||
# Ensure that the shrinking is correct.
|
||||
# The expected result is calculated by R (pamr),
|
||||
# which is implemented by the author of the original paper.
|
||||
# (One needs to modify the code to output the new centroid in pamr.predict)
|
||||
|
||||
X = np.array([[0, 1], [1, 0], [1, 1], [2, 0], [6, 8]])
|
||||
y = np.array([1, 1, 2, 2, 2])
|
||||
clf = NearestCentroid(shrink_threshold=0.1)
|
||||
clf.fit(X, y)
|
||||
expected_result = np.array([[0.7787310, 0.8545292], [2.814179, 2.763647]])
|
||||
np.testing.assert_array_almost_equal(clf.centroids_, expected_result)
|
||||
|
||||
|
||||
def test_shrinkage_threshold_decoded_y():
|
||||
clf = NearestCentroid(shrink_threshold=0.01)
|
||||
y_ind = np.asarray(y)
|
||||
y_ind[y_ind == -1] = 0
|
||||
clf.fit(X, y_ind)
|
||||
centroid_encoded = clf.centroids_
|
||||
clf.fit(X, y)
|
||||
assert_array_equal(centroid_encoded, clf.centroids_)
|
||||
|
||||
|
||||
def test_predict_translated_data():
|
||||
# Test that NearestCentroid gives same results on translated data
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.rand(50, 50)
|
||||
y = rng.randint(0, 3, 50)
|
||||
noise = rng.rand(50)
|
||||
clf = NearestCentroid(shrink_threshold=0.1)
|
||||
clf.fit(X, y)
|
||||
y_init = clf.predict(X)
|
||||
clf = NearestCentroid(shrink_threshold=0.1)
|
||||
X_noise = X + noise
|
||||
clf.fit(X_noise, y)
|
||||
y_translate = clf.predict(X_noise)
|
||||
assert_array_equal(y_init, y_translate)
|
||||
|
||||
|
||||
def test_manhattan_metric():
|
||||
# Test the manhattan metric.
|
||||
|
||||
clf = NearestCentroid(metric='manhattan')
|
||||
clf.fit(X, y)
|
||||
dense_centroid = clf.centroids_
|
||||
clf.fit(X_csr, y)
|
||||
assert_array_equal(clf.centroids_, dense_centroid)
|
||||
assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
|
1676
venv/Lib/site-packages/sklearn/neighbors/tests/test_neighbors.py
Normal file
File diff suppressed because it is too large
|
221
venv/Lib/site-packages/sklearn/neighbors/tests/test_neighbors_pipeline.py
Normal file
@@ -0,0 +1,221 @@
|
|||
"""
|
||||
This is testing the equivalence between some estimators with internal nearest
|
||||
neighbors computations, and the corresponding pipeline versions with
|
||||
KNeighborsTransformer or RadiusNeighborsTransformer to precompute the
|
||||
neighbors.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
|
||||
from sklearn.utils._testing import assert_array_almost_equal
|
||||
from sklearn.cluster.tests.common import generate_clustered_data
|
||||
from sklearn.datasets import make_blobs
|
||||
from sklearn.pipeline import make_pipeline
|
||||
from sklearn.base import clone
|
||||
|
||||
from sklearn.neighbors import KNeighborsTransformer
|
||||
from sklearn.neighbors import RadiusNeighborsTransformer
|
||||
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.cluster import SpectralClustering
|
||||
from sklearn.neighbors import KNeighborsRegressor
|
||||
from sklearn.neighbors import RadiusNeighborsRegressor
|
||||
from sklearn.neighbors import LocalOutlierFactor
|
||||
from sklearn.manifold import SpectralEmbedding
|
||||
from sklearn.manifold import Isomap
|
||||
from sklearn.manifold import TSNE
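The pattern described in the module docstring can be summed up in a small standalone sketch (an illustration, not part of this test file, using only public scikit-learn APIs): the transformer precomputes a sparse neighbors graph once, and the downstream estimator consumes it through metric='precomputed' instead of recomputing neighbors itself.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsTransformer, KNeighborsRegressor

X, y = make_regression(n_samples=50, n_features=5, random_state=0)

pipe = make_pipeline(
    KNeighborsTransformer(n_neighbors=5, mode='distance'),
    KNeighborsRegressor(n_neighbors=5, metric='precomputed'))
pipe.fit(X, y)                 # the transformer output feeds the regressor's 'precomputed' metric
print(pipe.predict(X[:3]))     # predictions for the first three samples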
|
||||
|
||||
|
||||
def test_spectral_clustering():
|
||||
# Test chaining KNeighborsTransformer and SpectralClustering
|
||||
n_neighbors = 5
|
||||
X, _ = make_blobs(random_state=0)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
|
||||
SpectralClustering(n_neighbors=n_neighbors, affinity='precomputed',
|
||||
random_state=42))
|
||||
est_compact = SpectralClustering(
|
||||
n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42)
|
||||
labels_compact = est_compact.fit_predict(X)
|
||||
labels_chain = est_chain.fit_predict(X)
|
||||
assert_array_almost_equal(labels_chain, labels_compact)
|
||||
|
||||
|
||||
def test_spectral_embedding():
|
||||
# Test chaining KNeighborsTransformer and SpectralEmbedding
|
||||
n_neighbors = 5
|
||||
|
||||
n_samples = 1000
|
||||
centers = np.array([
|
||||
[0.0, 5.0, 0.0, 0.0, 0.0],
|
||||
[0.0, 0.0, 4.0, 0.0, 0.0],
|
||||
[1.0, 0.0, 0.0, 5.0, 1.0],
|
||||
])
|
||||
S, true_labels = make_blobs(n_samples=n_samples, centers=centers,
|
||||
cluster_std=1., random_state=42)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
KNeighborsTransformer(n_neighbors=n_neighbors, mode='connectivity'),
|
||||
SpectralEmbedding(n_neighbors=n_neighbors, affinity='precomputed',
|
||||
random_state=42))
|
||||
est_compact = SpectralEmbedding(
|
||||
n_neighbors=n_neighbors, affinity='nearest_neighbors', random_state=42)
|
||||
St_compact = est_compact.fit_transform(S)
|
||||
St_chain = est_chain.fit_transform(S)
|
||||
assert_array_almost_equal(St_chain, St_compact)
|
||||
|
||||
|
||||
def test_dbscan():
|
||||
# Test chaining RadiusNeighborsTransformer and DBSCAN
|
||||
radius = 0.3
|
||||
n_clusters = 3
|
||||
X = generate_clustered_data(n_clusters=n_clusters)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
RadiusNeighborsTransformer(radius=radius, mode='distance'),
|
||||
DBSCAN(metric='precomputed', eps=radius))
|
||||
est_compact = DBSCAN(eps=radius)
|
||||
|
||||
labels_chain = est_chain.fit_predict(X)
|
||||
labels_compact = est_compact.fit_predict(X)
|
||||
assert_array_almost_equal(labels_chain, labels_compact)
|
||||
|
||||
|
||||
def test_isomap():
|
||||
# Test chaining KNeighborsTransformer and Isomap with
|
||||
# neighbors_algorithm='precomputed'
|
||||
algorithm = 'auto'
|
||||
n_neighbors = 10
|
||||
|
||||
X, _ = make_blobs(random_state=0)
|
||||
X2, _ = make_blobs(random_state=1)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
KNeighborsTransformer(n_neighbors=n_neighbors, algorithm=algorithm,
|
||||
mode='distance'),
|
||||
Isomap(n_neighbors=n_neighbors, metric='precomputed'))
|
||||
est_compact = Isomap(n_neighbors=n_neighbors,
|
||||
neighbors_algorithm=algorithm)
|
||||
|
||||
Xt_chain = est_chain.fit_transform(X)
|
||||
Xt_compact = est_compact.fit_transform(X)
|
||||
assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
Xt_chain = est_chain.transform(X2)
|
||||
Xt_compact = est_compact.transform(X2)
|
||||
assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
def test_tsne():
|
||||
# Test chaining KNeighborsTransformer and TSNE
|
||||
n_iter = 250
|
||||
perplexity = 5
|
||||
n_neighbors = int(3. * perplexity + 1)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(20, 2)
|
||||
|
||||
for metric in ['minkowski', 'sqeuclidean']:
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance',
|
||||
metric=metric),
|
||||
TSNE(metric='precomputed', perplexity=perplexity,
|
||||
method="barnes_hut", random_state=42, n_iter=n_iter))
|
||||
est_compact = TSNE(metric=metric, perplexity=perplexity, n_iter=n_iter,
|
||||
method="barnes_hut", random_state=42)
|
||||
|
||||
Xt_chain = est_chain.fit_transform(X)
|
||||
Xt_compact = est_compact.fit_transform(X)
|
||||
assert_array_almost_equal(Xt_chain, Xt_compact)
|
||||
|
||||
|
||||
def test_lof_novelty_false():
|
||||
# Test chaining KNeighborsTransformer and LocalOutlierFactor
|
||||
n_neighbors = 4
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X = rng.randn(40, 2)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'),
|
||||
LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors,
|
||||
novelty=False, contamination="auto"))
|
||||
est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=False,
|
||||
contamination="auto")
|
||||
|
||||
pred_chain = est_chain.fit_predict(X)
|
||||
pred_compact = est_compact.fit_predict(X)
|
||||
assert_array_almost_equal(pred_chain, pred_compact)
|
||||
|
||||
|
||||
def test_lof_novelty_true():
|
||||
# Test chaining KNeighborsTransformer and LocalOutlierFactor
|
||||
n_neighbors = 4
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
X1 = rng.randn(40, 2)
|
||||
X2 = rng.randn(40, 2)
|
||||
|
||||
# compare the chained version and the compact version
|
||||
est_chain = make_pipeline(
|
||||
KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance'),
|
||||
LocalOutlierFactor(metric='precomputed', n_neighbors=n_neighbors,
|
||||
novelty=True, contamination="auto"))
|
||||
est_compact = LocalOutlierFactor(n_neighbors=n_neighbors, novelty=True,
|
||||
contamination="auto")
|
||||
|
||||
pred_chain = est_chain.fit(X1).predict(X2)
|
||||
pred_compact = est_compact.fit(X1).predict(X2)
|
||||
assert_array_almost_equal(pred_chain, pred_compact)
|
||||
|
||||
|
||||
def test_kneighbors_regressor():
|
||||
# Test chaining KNeighborsTransformer and classifiers/regressors
|
||||
rng = np.random.RandomState(0)
|
||||
X = 2 * rng.rand(40, 5) - 1
|
||||
X2 = 2 * rng.rand(40, 5) - 1
|
||||
y = rng.rand(40, 1)
|
||||
|
||||
n_neighbors = 12
|
||||
radius = 1.5
|
||||
# We precompute more neighbors than necessary, to have equivalence between
|
||||
# k-neighbors estimator after radius-neighbors transformer, and vice-versa.
|
||||
factor = 2
|
||||
|
||||
k_trans = KNeighborsTransformer(n_neighbors=n_neighbors, mode='distance')
|
||||
k_trans_factor = KNeighborsTransformer(n_neighbors=int(
|
||||
n_neighbors * factor), mode='distance')
|
||||
|
||||
r_trans = RadiusNeighborsTransformer(radius=radius, mode='distance')
|
||||
r_trans_factor = RadiusNeighborsTransformer(radius=int(
|
||||
radius * factor), mode='distance')
|
||||
|
||||
k_reg = KNeighborsRegressor(n_neighbors=n_neighbors)
|
||||
r_reg = RadiusNeighborsRegressor(radius=radius)
|
||||
|
||||
test_list = [
|
||||
(k_trans, k_reg),
|
||||
(k_trans_factor, r_reg),
|
||||
(r_trans, r_reg),
|
||||
(r_trans_factor, k_reg),
|
||||
]
|
||||
|
||||
for trans, reg in test_list:
|
||||
# compare the chained version and the compact version
|
||||
reg_compact = clone(reg)
|
||||
reg_precomp = clone(reg)
|
||||
reg_precomp.set_params(metric='precomputed')
|
||||
|
||||
reg_chain = make_pipeline(clone(trans), reg_precomp)
|
||||
|
||||
y_pred_chain = reg_chain.fit(X, y).predict(X2)
|
||||
y_pred_compact = reg_compact.fit(X, y).predict(X2)
|
||||
assert_array_almost_equal(y_pred_chain, y_pred_compact)
|
|
279
venv/Lib/site-packages/sklearn/neighbors/tests/test_neighbors_tree.py
Normal file
@@ -0,0 +1,279 @@
|
|||
# License: BSD 3 clause
|
||||
|
||||
import pickle
|
||||
import itertools
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from sklearn.neighbors import DistanceMetric
|
||||
from sklearn.neighbors._ball_tree import (
|
||||
BallTree, kernel_norm, DTYPE, ITYPE,
|
||||
NeighborsHeap as NeighborsHeapBT,
|
||||
simultaneous_sort as simultaneous_sort_bt,
|
||||
nodeheap_sort as nodeheap_sort_bt)
|
||||
from sklearn.neighbors._kd_tree import (
|
||||
KDTree, NeighborsHeap as NeighborsHeapKDT,
|
||||
simultaneous_sort as simultaneous_sort_kdt,
|
||||
nodeheap_sort as nodeheap_sort_kdt)
|
||||
|
||||
from sklearn.utils import check_random_state
|
||||
from numpy.testing import assert_array_almost_equal, assert_allclose
|
||||
|
||||
rng = np.random.RandomState(42)
|
||||
V_mahalanobis = rng.rand(3, 3)
|
||||
V_mahalanobis = np.dot(V_mahalanobis, V_mahalanobis.T)
|
||||
|
||||
DIMENSION = 3
|
||||
|
||||
METRICS = {'euclidean': {},
|
||||
'manhattan': {},
|
||||
'minkowski': dict(p=3),
|
||||
'chebyshev': {},
|
||||
'seuclidean': dict(V=rng.random_sample(DIMENSION)),
|
||||
'wminkowski': dict(p=3, w=rng.random_sample(DIMENSION)),
|
||||
'mahalanobis': dict(V=V_mahalanobis)}
|
||||
|
||||
KD_TREE_METRICS = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
|
||||
BALL_TREE_METRICS = list(METRICS)
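A small usage sketch of the DistanceMetric factory exercised throughout this file (illustrative only, not part of the test file): get_metric builds a metric object from a name plus the keyword arguments listed in METRICS, and pairwise evaluates the full distance matrix.

import numpy as np
from sklearn.neighbors import DistanceMetric

pts = np.array([[0., 0., 0.], [1., 2., 2.]])
minkowski3 = DistanceMetric.get_metric('minkowski', p=3)
print(minkowski3.pairwise(pts)[0, 1])   # (1**3 + 2**3 + 2**3) ** (1 / 3), roughly 2.571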
|
||||
|
||||
|
||||
def dist_func(x1, x2, p):
|
||||
return np.sum((x1 - x2) ** p) ** (1. / p)
|
||||
|
||||
|
||||
def compute_kernel_slow(Y, X, kernel, h):
|
||||
d = np.sqrt(((Y[:, None, :] - X) ** 2).sum(-1))
|
||||
norm = kernel_norm(h, X.shape[1], kernel)
|
||||
|
||||
if kernel == 'gaussian':
|
||||
return norm * np.exp(-0.5 * (d * d) / (h * h)).sum(-1)
|
||||
elif kernel == 'tophat':
|
||||
return norm * (d < h).sum(-1)
|
||||
elif kernel == 'epanechnikov':
|
||||
return norm * ((1.0 - (d * d) / (h * h)) * (d < h)).sum(-1)
|
||||
elif kernel == 'exponential':
|
||||
return norm * (np.exp(-d / h)).sum(-1)
|
||||
elif kernel == 'linear':
|
||||
return norm * ((1 - d / h) * (d < h)).sum(-1)
|
||||
elif kernel == 'cosine':
|
||||
return norm * (np.cos(0.5 * np.pi * d / h) * (d < h)).sum(-1)
|
||||
else:
|
||||
raise ValueError('kernel not recognized')
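For context (an aside, not part of this test file): the public KernelDensity estimator wraps these same trees and exposes the result as a per-sample log-density, i.e. the kernel sum above normalised by the number of training points. A minimal usage sketch:

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
X_train = rng.random_sample((100, 3))
X_query = rng.random_sample((5, 3))

kde = KernelDensity(kernel='gaussian', bandwidth=0.5).fit(X_train)
densities = np.exp(kde.score_samples(X_query))   # density estimates at the query points
print(densities.shape)                           # (5,)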
|
||||
|
||||
|
||||
def brute_force_neighbors(X, Y, k, metric, **kwargs):
|
||||
D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
|
||||
ind = np.argsort(D, axis=1)[:, :k]
|
||||
dist = D[np.arange(Y.shape[0])[:, None], ind]
|
||||
return dist, ind
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
|
||||
@pytest.mark.parametrize("kernel", ['gaussian', 'tophat', 'epanechnikov',
|
||||
'exponential', 'linear', 'cosine'])
|
||||
@pytest.mark.parametrize("h", [0.01, 0.1, 1])
|
||||
@pytest.mark.parametrize("rtol", [0, 1E-5])
|
||||
@pytest.mark.parametrize("atol", [1E-6, 1E-2])
|
||||
@pytest.mark.parametrize("breadth_first", [True, False])
|
||||
def test_kernel_density(Cls, kernel, h, rtol, atol, breadth_first,
|
||||
n_samples=100, n_features=3):
|
||||
rng = check_random_state(1)
|
||||
X = rng.random_sample((n_samples, n_features))
|
||||
Y = rng.random_sample((n_samples, n_features))
|
||||
dens_true = compute_kernel_slow(Y, X, kernel, h)
|
||||
|
||||
tree = Cls(X, leaf_size=10)
|
||||
dens = tree.kernel_density(Y, h, atol=atol, rtol=rtol,
|
||||
kernel=kernel,
|
||||
breadth_first=breadth_first)
|
||||
assert_allclose(dens, dens_true,
|
||||
atol=atol, rtol=max(rtol, 1e-7))
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
|
||||
def test_neighbor_tree_query_radius(Cls, n_samples=100, n_features=10):
|
||||
rng = check_random_state(0)
|
||||
X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
|
||||
query_pt = np.zeros(n_features, dtype=float)
|
||||
|
||||
eps = 1E-15 # roundoff error can cause test to fail
|
||||
tree = Cls(X, leaf_size=5)
|
||||
rad = np.sqrt(((X - query_pt) ** 2).sum(1))
|
||||
|
||||
for r in np.linspace(rad[0], rad[-1], 100):
|
||||
ind = tree.query_radius([query_pt], r + eps)[0]
|
||||
i = np.where(rad <= r + eps)[0]
|
||||
|
||||
ind.sort()
|
||||
i.sort()
|
||||
|
||||
assert_array_almost_equal(i, ind)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
|
||||
def test_neighbor_tree_query_radius_distance(Cls, n_samples=100,
|
||||
n_features=10):
|
||||
rng = check_random_state(0)
|
||||
X = 2 * rng.random_sample(size=(n_samples, n_features)) - 1
|
||||
query_pt = np.zeros(n_features, dtype=float)
|
||||
|
||||
eps = 1E-15 # roundoff error can cause test to fail
|
||||
tree = Cls(X, leaf_size=5)
|
||||
rad = np.sqrt(((X - query_pt) ** 2).sum(1))
|
||||
|
||||
for r in np.linspace(rad[0], rad[-1], 100):
|
||||
ind, dist = tree.query_radius([query_pt], r + eps,
|
||||
return_distance=True)
|
||||
|
||||
ind = ind[0]
|
||||
dist = dist[0]
|
||||
|
||||
d = np.sqrt(((query_pt - X[ind]) ** 2).sum(1))
|
||||
|
||||
assert_array_almost_equal(d, dist)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
|
||||
@pytest.mark.parametrize('dualtree', (True, False))
|
||||
def test_neighbor_tree_two_point(Cls, dualtree, n_samples=100, n_features=3):
|
||||
rng = check_random_state(0)
|
||||
X = rng.random_sample((n_samples, n_features))
|
||||
Y = rng.random_sample((n_samples, n_features))
|
||||
r = np.linspace(0, 1, 10)
|
||||
tree = Cls(X, leaf_size=10)
|
||||
|
||||
D = DistanceMetric.get_metric("euclidean").pairwise(Y, X)
|
||||
counts_true = [(D <= ri).sum() for ri in r]
|
||||
|
||||
counts = tree.two_point_correlation(Y, r=r, dualtree=dualtree)
|
||||
assert_array_almost_equal(counts, counts_true)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('NeighborsHeap', [NeighborsHeapBT, NeighborsHeapKDT])
|
||||
def test_neighbors_heap(NeighborsHeap, n_pts=5, n_nbrs=10):
|
||||
heap = NeighborsHeap(n_pts, n_nbrs)
|
||||
rng = check_random_state(0)
|
||||
|
||||
for row in range(n_pts):
|
||||
d_in = rng.random_sample(2 * n_nbrs).astype(DTYPE, copy=False)
|
||||
i_in = np.arange(2 * n_nbrs, dtype=ITYPE)
|
||||
for d, i in zip(d_in, i_in):
|
||||
heap.push(row, d, i)
|
||||
|
||||
ind = np.argsort(d_in)
|
||||
d_in = d_in[ind]
|
||||
i_in = i_in[ind]
|
||||
|
||||
d_heap, i_heap = heap.get_arrays(sort=True)
|
||||
|
||||
assert_array_almost_equal(d_in[:n_nbrs], d_heap[row])
|
||||
assert_array_almost_equal(i_in[:n_nbrs], i_heap[row])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('nodeheap_sort', [nodeheap_sort_bt,
|
||||
nodeheap_sort_kdt])
|
||||
def test_node_heap(nodeheap_sort, n_nodes=50):
|
||||
rng = check_random_state(0)
|
||||
vals = rng.random_sample(n_nodes).astype(DTYPE, copy=False)
|
||||
|
||||
i1 = np.argsort(vals)
|
||||
vals2, i2 = nodeheap_sort(vals)
|
||||
|
||||
assert_array_almost_equal(i1, i2)
|
||||
assert_array_almost_equal(vals[i1], vals2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('simultaneous_sort', [simultaneous_sort_bt,
|
||||
simultaneous_sort_kdt])
|
||||
def test_simultaneous_sort(simultaneous_sort, n_rows=10, n_pts=201):
|
||||
rng = check_random_state(0)
|
||||
dist = rng.random_sample((n_rows, n_pts)).astype(DTYPE, copy=False)
|
||||
ind = (np.arange(n_pts) + np.zeros((n_rows, 1))).astype(ITYPE, copy=False)
|
||||
|
||||
dist2 = dist.copy()
|
||||
ind2 = ind.copy()
|
||||
|
||||
# simultaneous sort rows using function
|
||||
simultaneous_sort(dist, ind)
|
||||
|
||||
# simultaneous sort rows using numpy
|
||||
i = np.argsort(dist2, axis=1)
|
||||
row_ind = np.arange(n_rows)[:, None]
|
||||
dist2 = dist2[row_ind, i]
|
||||
ind2 = ind2[row_ind, i]
|
||||
|
||||
assert_array_almost_equal(dist, dist2)
|
||||
assert_array_almost_equal(ind, ind2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('Cls', [KDTree, BallTree])
|
||||
def test_gaussian_kde(Cls, n_samples=1000):
|
||||
# Compare gaussian KDE results to scipy.stats.gaussian_kde
|
||||
from scipy.stats import gaussian_kde
|
||||
rng = check_random_state(0)
|
||||
x_in = rng.normal(0, 1, n_samples)
|
||||
x_out = np.linspace(-5, 5, 30)
|
||||
|
||||
for h in [0.01, 0.1, 1]:
|
||||
tree = Cls(x_in[:, None])
|
||||
gkde = gaussian_kde(x_in, bw_method=h / np.std(x_in))
|
||||
|
||||
dens_tree = tree.kernel_density(x_out[:, None], h) / n_samples
|
||||
dens_gkde = gkde.evaluate(x_out)
|
||||
|
||||
assert_array_almost_equal(dens_tree, dens_gkde, decimal=3)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
'Cls, metric',
|
||||
itertools.chain(
|
||||
[(KDTree, metric) for metric in KD_TREE_METRICS],
|
||||
[(BallTree, metric) for metric in BALL_TREE_METRICS]))
|
||||
@pytest.mark.parametrize('k', (1, 3, 5))
|
||||
@pytest.mark.parametrize('dualtree', (True, False))
|
||||
@pytest.mark.parametrize('breadth_first', (True, False))
|
||||
def test_nn_tree_query(Cls, metric, k, dualtree, breadth_first):
|
||||
rng = check_random_state(0)
|
||||
X = rng.random_sample((40, DIMENSION))
|
||||
Y = rng.random_sample((10, DIMENSION))
|
||||
|
||||
kwargs = METRICS[metric]
|
||||
|
||||
kdt = Cls(X, leaf_size=1, metric=metric, **kwargs)
|
||||
dist1, ind1 = kdt.query(Y, k, dualtree=dualtree,
|
||||
breadth_first=breadth_first)
|
||||
dist2, ind2 = brute_force_neighbors(X, Y, k, metric, **kwargs)
|
||||
|
||||
# don't check indices here: if there are any duplicate distances,
|
||||
# the indices may not match. Distances should not have this problem.
|
||||
assert_array_almost_equal(dist1, dist2)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"Cls, metric",
|
||||
[(KDTree, 'euclidean'), (BallTree, 'euclidean'),
|
||||
(BallTree, dist_func)])
|
||||
@pytest.mark.parametrize('protocol', (0, 1, 2))
|
||||
def test_pickle(Cls, metric, protocol):
|
||||
rng = check_random_state(0)
|
||||
X = rng.random_sample((10, 3))
|
||||
|
||||
if hasattr(metric, '__call__'):
|
||||
kwargs = {'p': 2}
|
||||
else:
|
||||
kwargs = {}
|
||||
|
||||
tree1 = Cls(X, leaf_size=1, metric=metric, **kwargs)
|
||||
|
||||
ind1, dist1 = tree1.query(X)
|
||||
|
||||
s = pickle.dumps(tree1, protocol=protocol)
|
||||
tree2 = pickle.loads(s)
|
||||
|
||||
ind2, dist2 = tree2.query(X)
|
||||
|
||||
assert_array_almost_equal(ind1, ind2)
|
||||
assert_array_almost_equal(dist1, dist2)
|
||||
|
||||
assert isinstance(tree2, Cls)
|
104
venv/Lib/site-packages/sklearn/neighbors/tests/test_quad_tree.py
Normal file
|
@@ -0,0 +1,104 @@
|
|||
import pickle
|
||||
import numpy as np
|
||||
|
||||
import pytest
|
||||
|
||||
from sklearn.neighbors._quad_tree import _QuadTree
|
||||
from sklearn.utils import check_random_state
|
||||
|
||||
|
||||
def test_quadtree_boundary_computation():
|
||||
# Introduce a point into a quad tree with boundaries not easy to compute.
|
||||
Xs = []
|
||||
|
||||
# check a random case
|
||||
Xs.append(np.array([[-1, 1], [-4, -1]], dtype=np.float32))
|
||||
# check the case where only 0 are inserted
|
||||
Xs.append(np.array([[0, 0], [0, 0]], dtype=np.float32))
|
||||
# check the case where only negative are inserted
|
||||
Xs.append(np.array([[-1, -2], [-4, 0]], dtype=np.float32))
|
||||
# check the case where only small numbers are inserted
|
||||
Xs.append(np.array([[-1e-6, 1e-6], [-4e-6, -1e-6]], dtype=np.float32))
|
||||
|
||||
for X in Xs:
|
||||
tree = _QuadTree(n_dimensions=2, verbose=0)
|
||||
tree.build_tree(X)
|
||||
tree._check_coherence()
|
||||
|
||||
|
||||
def test_quadtree_similar_point():
|
||||
# Introduce a point into a quad tree where a similar point already exists.
|
||||
# Test will hang if it doesn't complete.
|
||||
Xs = []
|
||||
|
||||
# check the case where points are actually different
|
||||
Xs.append(np.array([[1, 2], [3, 4]], dtype=np.float32))
|
||||
# check the case where points are the same on X axis
|
||||
Xs.append(np.array([[1.0, 2.0], [1.0, 3.0]], dtype=np.float32))
|
||||
# check the case where points are arbitrarily close on X axis
|
||||
Xs.append(np.array([[1.00001, 2.0], [1.00002, 3.0]], dtype=np.float32))
|
||||
# check the case where points are the same on Y axis
|
||||
Xs.append(np.array([[1.0, 2.0], [3.0, 2.0]], dtype=np.float32))
|
||||
# check the case where points are arbitrarily close on Y axis
|
||||
Xs.append(np.array([[1.0, 2.00001], [3.0, 2.00002]], dtype=np.float32))
|
||||
# check the case where points are arbitrarily close on both axes
|
||||
Xs.append(np.array([[1.00001, 2.00001], [1.00002, 2.00002]],
|
||||
dtype=np.float32))
|
||||
|
||||
# check the case where points are arbitrarily close on both axes
|
||||
# close to machine epsilon - x axis
|
||||
Xs.append(np.array([[1, 0.0003817754041], [2, 0.0003817753750]],
|
||||
dtype=np.float32))
|
||||
|
||||
# check the case where points are arbitrarily close on both axes
|
||||
# close to machine epsilon - y axis
|
||||
Xs.append(np.array([[0.0003817754041, 1.0], [0.0003817753750, 2.0]],
|
||||
dtype=np.float32))
|
||||
|
||||
for X in Xs:
|
||||
tree = _QuadTree(n_dimensions=2, verbose=0)
|
||||
tree.build_tree(X)
|
||||
tree._check_coherence()
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_dimensions', (2, 3))
|
||||
@pytest.mark.parametrize('protocol', (0, 1, 2))
|
||||
def test_quad_tree_pickle(n_dimensions, protocol):
|
||||
rng = check_random_state(0)
|
||||
|
||||
X = rng.random_sample((10, n_dimensions))
|
||||
|
||||
tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
|
||||
tree.build_tree(X)
|
||||
|
||||
s = pickle.dumps(tree, protocol=protocol)
|
||||
bt2 = pickle.loads(s)
|
||||
|
||||
for x in X:
|
||||
cell_x_tree = tree.get_cell(x)
|
||||
cell_x_bt2 = bt2.get_cell(x)
|
||||
assert cell_x_tree == cell_x_bt2
|
||||
|
||||
|
||||
@pytest.mark.parametrize('n_dimensions', (2, 3))
|
||||
def test_qt_insert_duplicate(n_dimensions):
|
||||
rng = check_random_state(0)
|
||||
|
||||
X = rng.random_sample((10, n_dimensions))
|
||||
Xd = np.r_[X, X[:5]]
|
||||
tree = _QuadTree(n_dimensions=n_dimensions, verbose=0)
|
||||
tree.build_tree(Xd)
|
||||
|
||||
cumulative_size = tree.cumulative_size
|
||||
leafs = tree.leafs
|
||||
|
||||
# Assert that the first 5 are indeed duplicated and that the next
|
||||
# ones are single point leaf
|
||||
for i, x in enumerate(X):
|
||||
cell_id = tree.get_cell(x)
|
||||
assert leafs[cell_id]
|
||||
assert cumulative_size[cell_id] == 1 + (i < 5)
|
||||
|
||||
|
||||
def test_summarize():
|
||||
_QuadTree.test_summarize()
|
18
venv/Lib/site-packages/sklearn/neighbors/typedefs.py
Normal file
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _typedefs # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.typedefs'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_typedefs, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|
18
venv/Lib/site-packages/sklearn/neighbors/unsupervised.py
Normal file
|
@@ -0,0 +1,18 @@
|
|||
|
||||
# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
|
||||
import sys
|
||||
# mypy error: Module X has no attribute y (typically for C extensions)
|
||||
from . import _unsupervised # type: ignore
|
||||
from ..externals._pep562 import Pep562
|
||||
from ..utils.deprecation import _raise_dep_warning_if_not_pytest
|
||||
|
||||
deprecated_path = 'sklearn.neighbors.unsupervised'
|
||||
correct_import_path = 'sklearn.neighbors'
|
||||
|
||||
_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)
|
||||
|
||||
def __getattr__(name):
|
||||
return getattr(_unsupervised, name)
|
||||
|
||||
if not sys.version_info >= (3, 7):
|
||||
Pep562(__name__)
|