Uploaded Test files
parent f584ad9d97
commit 2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
10
venv/Lib/site-packages/sklearn/semi_supervised/__init__.py
Normal file
@@ -0,0 +1,10 @@
"""
The :mod:`sklearn.semi_supervised` module implements semi-supervised learning
algorithms. These algorithms utilize small amounts of labeled data and large
amounts of unlabeled data for classification tasks. This module includes Label
Propagation.
"""

from ._label_propagation import LabelPropagation, LabelSpreading

__all__ = ['LabelPropagation', 'LabelSpreading']
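A minimal usage sketch of the two estimators re-exported above (toy data for
illustration; -1 marks unlabeled samples):

import numpy as np
from sklearn.semi_supervised import LabelPropagation

X = np.array([[1., 0.], [0., 2.], [1., 3.]])
y = np.array([0, 1, -1])              # the third sample is unlabeled
model = LabelPropagation().fit(X, y)
print(model.transduction_)            # transductive labels for all samples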
@@ -0,0 +1,520 @@
# coding=utf8
"""
Label propagation in the context of this module refers to a set of
semi-supervised classification algorithms. At a high level, these algorithms
work by forming a fully-connected graph between all points given and solving
for the steady-state distribution of labels at each point.

These algorithms perform very well in practice. Running them can be very
expensive, however, at approximately O(N^3) where N is the number of (labeled
and unlabeled) points. The theory (why they perform so well) is motivated by
intuitions from random walk algorithms and geometric relationships in the
data. For more information see the references below.

Model Features
--------------
Label clamping:
  The algorithm tries to learn distributions of labels over the dataset given
  label assignments over an initial subset. In one variant, the algorithm does
  not allow for any errors in the initial assignment (hard-clamping), while in
  another variant, the algorithm allows some wiggle room for the initial
  assignments, letting them change by a fraction alpha in each iteration
  (soft-clamping).

Kernel:
  A function which projects a vector into some higher dimensional space. This
  implementation supports RBF and KNN kernels. Using the RBF kernel generates
  a dense matrix of size O(N^2). The KNN kernel generates a sparse matrix of
  size O(k*N) and runs much faster. See the documentation for SVMs for more
  info on kernels.

Examples
--------
>>> import numpy as np
>>> from sklearn import datasets
>>> from sklearn.semi_supervised import LabelPropagation
>>> label_prop_model = LabelPropagation()
>>> iris = datasets.load_iris()
>>> rng = np.random.RandomState(42)
>>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
>>> labels = np.copy(iris.target)
>>> labels[random_unlabeled_points] = -1
>>> label_prop_model.fit(iris.data, labels)
LabelPropagation(...)

Notes
-----
References:
[1] Yoshua Bengio, Olivier Delalleau, Nicolas Le Roux. In Semi-Supervised
Learning (2006), pp. 193-216

[2] Olivier Delalleau, Yoshua Bengio, Nicolas Le Roux. Efficient
Non-Parametric Function Induction in Semi-Supervised Learning. AISTAT 2005
"""

# Authors: Clay Woolam <clay@woolam.org>
#          Utkarsh Upadhyay <mail@musicallyut.in>
# License: BSD
from abc import ABCMeta, abstractmethod

import warnings
import numpy as np
from scipy import sparse
from scipy.sparse import csgraph

from ..base import BaseEstimator, ClassifierMixin
from ..metrics.pairwise import rbf_kernel
from ..neighbors import NearestNeighbors
from ..utils.extmath import safe_sparse_dot
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_is_fitted, check_array
from ..utils.validation import _deprecate_positional_args
from ..exceptions import ConvergenceWarning


class BaseLabelPropagation(ClassifierMixin, BaseEstimator, metaclass=ABCMeta):
    """Base class for label propagation module.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable, default='rbf'
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape (n_samples, n_features),
        and return a (n_samples, n_samples) shaped weight matrix.

    gamma : float, default=20
        Parameter for rbf kernel.

    n_neighbors : int, default=7
        Parameter for knn kernel. Needs to be strictly positive.

    alpha : float, default=1.0
        Clamping factor.

    max_iter : int, default=30
        Maximum number of iterations allowed.

    tol : float, default=1e-3
        Convergence tolerance: threshold to consider the system at steady
        state.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
    """

    @_deprecate_positional_args
    def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7,
                 alpha=1, max_iter=30, tol=1e-3, n_jobs=None):

        self.max_iter = max_iter
        self.tol = tol

        # kernel parameters
        self.kernel = kernel
        self.gamma = gamma
        self.n_neighbors = n_neighbors

        # clamping factor
        self.alpha = alpha

        self.n_jobs = n_jobs

    def _get_kernel(self, X, y=None):
        if self.kernel == "rbf":
            if y is None:
                return rbf_kernel(X, X, gamma=self.gamma)
            else:
                return rbf_kernel(X, y, gamma=self.gamma)
        elif self.kernel == "knn":
            if self.nn_fit is None:
                self.nn_fit = NearestNeighbors(n_neighbors=self.n_neighbors,
                                               n_jobs=self.n_jobs).fit(X)
            if y is None:
                return self.nn_fit.kneighbors_graph(self.nn_fit._fit_X,
                                                    self.n_neighbors,
                                                    mode='connectivity')
            else:
                return self.nn_fit.kneighbors(y, return_distance=False)
        elif callable(self.kernel):
            if y is None:
                return self.kernel(X, X)
            else:
                return self.kernel(X, y)
        else:
            raise ValueError("%s is not a valid kernel. Only rbf and knn"
                             " or an explicit function are supported at"
                             " this time." % self.kernel)

    @abstractmethod
    def _build_graph(self):
        raise NotImplementedError("Graph construction must be implemented"
                                  " to fit a label propagation model.")

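# A small standalone sketch of what the two built-in kernel choices used by
# _get_kernel above return (same public helpers the method relies on).
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.neighbors import NearestNeighbors

X = np.random.RandomState(0).rand(100, 3)
W_dense = rbf_kernel(X, X, gamma=20)          # dense (100, 100) weight matrix
W_sparse = (NearestNeighbors(n_neighbors=7).fit(X)
            .kneighbors_graph(X, mode='connectivity'))  # sparse, ~7*100 nnz
print(W_dense.shape, W_sparse.nnz)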
    def predict(self, X):
        """Perform inductive inference across the model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Predictions for input data.
        """
        probas = self.predict_proba(X)
        return self.classes_[np.argmax(probas, axis=1)].ravel()

    def predict_proba(self, X):
        """Predict probability for each possible outcome.

        Compute the probability estimates for each single sample in X
        and each possible outcome seen during training (categorical
        distribution).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data matrix.

        Returns
        -------
        probabilities : ndarray of shape (n_samples, n_classes)
            Normalized probability distributions across
            class labels.
        """
        check_is_fitted(self)

        X_2d = check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                                             'bsr', 'lil', 'dia'])
        weight_matrices = self._get_kernel(self.X_, X_2d)
        if self.kernel == 'knn':
            probabilities = np.array([
                np.sum(self.label_distributions_[weight_matrix], axis=0)
                for weight_matrix in weight_matrices])
        else:
            weight_matrices = weight_matrices.T
            probabilities = safe_sparse_dot(
                weight_matrices, self.label_distributions_)
        normalizer = np.atleast_2d(np.sum(probabilities, axis=1)).T
        probabilities /= normalizer
        return probabilities

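# Quick standalone check of the invariant established by the final
# normalization in predict_proba above: each returned row sums to 1.
import numpy as np
from sklearn.semi_supervised import LabelSpreading

X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
y = np.array([0, 1, -1])
probs = LabelSpreading().fit(X, y).predict_proba(np.array([[1., 1.]]))
assert np.allclose(probs.sum(axis=1), 1.0)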
    def fit(self, X, y):
        """Fit a semi-supervised label propagation model to X.

        All the input data is provided as matrix X (labeled and unlabeled)
        and a corresponding label vector y with a dedicated marker value for
        unlabeled samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            A matrix of shape (n_samples, n_samples) will be created from this.

        y : array-like of shape (n_samples,)
            Target class values with unlabeled points marked as -1.
            All unlabeled samples will be transductively assigned labels.

        Returns
        -------
        self : object
        """
        X, y = self._validate_data(X, y)
        self.X_ = X
        check_classification_targets(y)

        # actual graph construction (implementations should override this)
        graph_matrix = self._build_graph()

        # label construction
        # construct a categorical distribution for classification only
        classes = np.unique(y)
        classes = (classes[classes != -1])
        self.classes_ = classes

        n_samples, n_classes = len(y), len(classes)

        alpha = self.alpha
        if self._variant == 'spreading' and \
                (alpha is None or alpha <= 0.0 or alpha >= 1.0):
            raise ValueError('alpha=%s is invalid: it must be inside '
                             'the open interval (0, 1)' % alpha)
        y = np.asarray(y)
        unlabeled = y == -1

        # initialize distributions
        self.label_distributions_ = np.zeros((n_samples, n_classes))
        for label in classes:
            self.label_distributions_[y == label, classes == label] = 1

        y_static = np.copy(self.label_distributions_)
        if self._variant == 'propagation':
            # LabelPropagation
            y_static[unlabeled] = 0
        else:
            # LabelSpreading
            y_static *= 1 - alpha

        l_previous = np.zeros((self.X_.shape[0], n_classes))

        unlabeled = unlabeled[:, np.newaxis]
        if sparse.isspmatrix(graph_matrix):
            graph_matrix = graph_matrix.tocsr()

        for self.n_iter_ in range(self.max_iter):
            if np.abs(self.label_distributions_ - l_previous).sum() < self.tol:
                break

            l_previous = self.label_distributions_
            self.label_distributions_ = safe_sparse_dot(
                graph_matrix, self.label_distributions_)

            if self._variant == 'propagation':
                normalizer = np.sum(
                    self.label_distributions_, axis=1)[:, np.newaxis]
                self.label_distributions_ /= normalizer
                self.label_distributions_ = np.where(unlabeled,
                                                     self.label_distributions_,
                                                     y_static)
            else:
                # clamp
                self.label_distributions_ = np.multiply(
                    alpha, self.label_distributions_) + y_static
        else:
            warnings.warn(
                'max_iter=%d was reached without convergence.' % self.max_iter,
                category=ConvergenceWarning
            )
            self.n_iter_ += 1

        normalizer = np.sum(self.label_distributions_, axis=1)[:, np.newaxis]
        normalizer[normalizer == 0] = 1
        self.label_distributions_ /= normalizer

        # set the transduction item
        transduction = self.classes_[np.argmax(self.label_distributions_,
                                               axis=1)]
        self.transduction_ = transduction.ravel()
        return self


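# In matrix form, the loop in BaseLabelPropagation.fit above iterates (a
# sketch reconstructed from the code, with G the matrix returned by
# _build_graph() and Y0 the initial one-hot label distributions):
#
#   LabelSpreading:    Y_{t+1} = alpha * (G @ Y_t) + (1 - alpha) * Y0
#   LabelPropagation:  Y_{t+1} = row_normalize(G @ Y_t),
#                      then rows of labeled samples are reset to Y0
#
# stopping as soon as the sum of absolute differences between Y_{t+1} and
# Y_t drops below tol.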
class LabelPropagation(BaseLabelPropagation):
    """Label Propagation classifier.

    Read more in the :ref:`User Guide <label_propagation>`.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable, default='rbf'
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape (n_samples, n_features),
        and return a (n_samples, n_samples) shaped weight matrix.

    gamma : float, default=20
        Parameter for rbf kernel.

    n_neighbors : int, default=7
        Parameter for knn kernel, which needs to be strictly positive.

    max_iter : int, default=1000
        Maximum number of iterations allowed.

    tol : float, default=1e-3
        Convergence tolerance: threshold to consider the system at steady
        state.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    X_ : ndarray of shape (n_samples, n_features)
        Input array.

    classes_ : ndarray of shape (n_classes,)
        The distinct labels used in classifying instances.

    label_distributions_ : ndarray of shape (n_samples, n_classes)
        Categorical distribution for each item.

    transduction_ : ndarray of shape (n_samples,)
        Label assigned to each item via the transduction.

    n_iter_ : int
        Number of iterations run.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets
    >>> from sklearn.semi_supervised import LabelPropagation
    >>> label_prop_model = LabelPropagation()
    >>> iris = datasets.load_iris()
    >>> rng = np.random.RandomState(42)
    >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
    >>> labels = np.copy(iris.target)
    >>> labels[random_unlabeled_points] = -1
    >>> label_prop_model.fit(iris.data, labels)
    LabelPropagation(...)

    References
    ----------
    Xiaojin Zhu and Zoubin Ghahramani. Learning from labeled and unlabeled
    data with label propagation. Technical Report CMU-CALD-02-107, Carnegie
    Mellon University, 2002
    http://pages.cs.wisc.edu/~jerryzhu/pub/CMU-CALD-02-107.pdf

    See Also
    --------
    LabelSpreading : Alternate label propagation strategy more robust to noise.
    """

    _variant = 'propagation'

    @_deprecate_positional_args
    def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7,
                 max_iter=1000, tol=1e-3, n_jobs=None):
        super().__init__(kernel=kernel, gamma=gamma,
                         n_neighbors=n_neighbors, max_iter=max_iter,
                         tol=tol, n_jobs=n_jobs, alpha=None)

    def _build_graph(self):
        """Matrix representing a fully connected graph between all samples.

        This basic implementation creates a non-stochastic affinity matrix, so
        class distributions will exceed 1 (normalization may be desired).
        """
        if self.kernel == 'knn':
            self.nn_fit = None
        affinity_matrix = self._get_kernel(self.X_)
        normalizer = affinity_matrix.sum(axis=0)
        if sparse.isspmatrix(affinity_matrix):
            affinity_matrix.data /= np.diag(np.array(normalizer))
        else:
            affinity_matrix /= normalizer[:, np.newaxis]
        return affinity_matrix

    def fit(self, X, y):
        return super().fit(X, y)


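# For a symmetric affinity W (the dense RBF case), the normalization in
# LabelPropagation._build_graph above amounts to row normalization by the
# degree (a sketch reconstructed from the code; D_ii = sum_j W_ij):
#
#   T = D^{-1} @ W,   i.e.   T_ij = W_ij / sum_k W_ik
#
# so in that case each row of the returned graph matrix sums to 1, matching
# the transition-matrix view in Zhu & Ghahramani (2002).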
class LabelSpreading(BaseLabelPropagation):
    """LabelSpreading model for semi-supervised learning.

    This model is similar to the basic Label Propagation algorithm,
    but uses an affinity matrix based on the normalized graph Laplacian
    and soft clamping across the labels.

    Read more in the :ref:`User Guide <label_propagation>`.

    Parameters
    ----------
    kernel : {'knn', 'rbf'} or callable, default='rbf'
        String identifier for kernel function to use or the kernel function
        itself. Only 'rbf' and 'knn' strings are valid inputs. The function
        passed should take two inputs, each of shape (n_samples, n_features),
        and return a (n_samples, n_samples) shaped weight matrix.

    gamma : float, default=20
        Parameter for rbf kernel.

    n_neighbors : int, default=7
        Parameter for knn kernel, which must be a strictly positive integer.

    alpha : float, default=0.2
        Clamping factor. A value in (0, 1) that specifies the relative amount
        of information an instance should adopt from its neighbors as opposed
        to its initial label.
        alpha=0 means keeping the initial label information; alpha=1 means
        replacing all initial information.

    max_iter : int, default=30
        Maximum number of iterations allowed.

    tol : float, default=1e-3
        Convergence tolerance: threshold to consider the system at steady
        state.

    n_jobs : int, default=None
        The number of parallel jobs to run.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    X_ : ndarray of shape (n_samples, n_features)
        Input array.

    classes_ : ndarray of shape (n_classes,)
        The distinct labels used in classifying instances.

    label_distributions_ : ndarray of shape (n_samples, n_classes)
        Categorical distribution for each item.

    transduction_ : ndarray of shape (n_samples,)
        Label assigned to each item via the transduction.

    n_iter_ : int
        Number of iterations run.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import datasets
    >>> from sklearn.semi_supervised import LabelSpreading
    >>> label_prop_model = LabelSpreading()
    >>> iris = datasets.load_iris()
    >>> rng = np.random.RandomState(42)
    >>> random_unlabeled_points = rng.rand(len(iris.target)) < 0.3
    >>> labels = np.copy(iris.target)
    >>> labels[random_unlabeled_points] = -1
    >>> label_prop_model.fit(iris.data, labels)
    LabelSpreading(...)

    References
    ----------
    Dengyong Zhou, Olivier Bousquet, Thomas Navin Lal, Jason Weston,
    Bernhard Schoelkopf. Learning with local and global consistency (2004)
    http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.115.3219

    See Also
    --------
    LabelPropagation : Unregularized graph-based semi-supervised learning.
    """

    _variant = 'spreading'

    @_deprecate_positional_args
    def __init__(self, kernel='rbf', *, gamma=20, n_neighbors=7, alpha=0.2,
                 max_iter=30, tol=1e-3, n_jobs=None):

        # this one has different base parameters
        super().__init__(kernel=kernel, gamma=gamma,
                         n_neighbors=n_neighbors, alpha=alpha,
                         max_iter=max_iter, tol=tol, n_jobs=n_jobs)

    def _build_graph(self):
        """Graph matrix for Label Spreading; computes the normalized graph Laplacian."""
        # compute affinity matrix (or gram matrix)
        if self.kernel == 'knn':
            self.nn_fit = None
        n_samples = self.X_.shape[0]
        affinity_matrix = self._get_kernel(self.X_)
        laplacian = csgraph.laplacian(affinity_matrix, normed=True)
        laplacian = -laplacian
        if sparse.isspmatrix(laplacian):
            # laplacian is in COO format here, exposing row/col indices
            diag_mask = (laplacian.row == laplacian.col)
            laplacian.data[diag_mask] = 0.0
        else:
            laplacian.flat[::n_samples + 1] = 0.0  # set diag to 0.0
        return laplacian
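In matrix form, the graph built by LabelSpreading._build_graph above is (a
sketch reconstructed from the code, with W the affinity matrix and D its
diagonal degree matrix):

    S = D^{-1/2} W D^{-1/2}, with the diagonal then zeroed,

since negating the normalized Laplacian gives D^{-1/2} W D^{-1/2} - I off
the diagonal, and the code sets the remaining diagonal entries to 0. This S
is the spreading operator of Zhou et al. (2004).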
@@ -0,0 +1,18 @@

# THIS FILE WAS AUTOMATICALLY GENERATED BY deprecated_modules.py
import sys
# mypy error: Module X has no attribute y (typically for C extensions)
from . import _label_propagation  # type: ignore
from ..externals._pep562 import Pep562
from ..utils.deprecation import _raise_dep_warning_if_not_pytest

deprecated_path = 'sklearn.semi_supervised.label_propagation'
correct_import_path = 'sklearn.semi_supervised'

_raise_dep_warning_if_not_pytest(deprecated_path, correct_import_path)


def __getattr__(name):
    return getattr(_label_propagation, name)


if not sys.version_info >= (3, 7):
    Pep562(__name__)
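A short sketch of the shim's effect, assuming a scikit-learn version that
still ships it: importing from the deprecated path warns (outside pytest,
per the helper above) and resolves names through the module-level
__getattr__:

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    from sklearn.semi_supervised.label_propagation import LabelSpreading

print(LabelSpreading)                         # class from _label_propagation
print([w.category.__name__ for w in caught])  # the deprecation warning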
@@ -0,0 +1,205 @@
""" test the label propagation module """

import numpy as np
import pytest

from scipy.sparse import issparse
from sklearn.utils._testing import assert_warns
from sklearn.utils._testing import assert_no_warnings
from sklearn.semi_supervised import _label_propagation as label_propagation
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.datasets import make_classification
from sklearn.exceptions import ConvergenceWarning
from numpy.testing import assert_array_almost_equal
from numpy.testing import assert_array_equal

ESTIMATORS = [
    (label_propagation.LabelPropagation, {'kernel': 'rbf'}),
    (label_propagation.LabelPropagation, {'kernel': 'knn', 'n_neighbors': 2}),
    (label_propagation.LabelPropagation, {
        'kernel': lambda x, y: rbf_kernel(x, y, gamma=20)
    }),
    (label_propagation.LabelSpreading, {'kernel': 'rbf'}),
    (label_propagation.LabelSpreading, {'kernel': 'knn', 'n_neighbors': 2}),
    (label_propagation.LabelSpreading, {
        'kernel': lambda x, y: rbf_kernel(x, y, gamma=20)
    }),
]


def test_fit_transduction():
    samples = [[1., 0.], [0., 2.], [1., 3.]]
    labels = [0, 1, -1]
    for estimator, parameters in ESTIMATORS:
        clf = estimator(**parameters).fit(samples, labels)
        assert clf.transduction_[2] == 1


def test_distribution():
    samples = [[1., 0.], [0., 1.], [1., 1.]]
    labels = [0, 1, -1]
    for estimator, parameters in ESTIMATORS:
        clf = estimator(**parameters).fit(samples, labels)
        if parameters['kernel'] == 'knn':
            continue  # unstable test; changes in k-NN ordering break it
            assert_array_almost_equal(clf.predict_proba([[1., 0.0]]),
                                      np.array([[1., 0.]]), 2)
        else:
            assert_array_almost_equal(np.asarray(clf.label_distributions_[2]),
                                      np.array([.5, .5]), 2)


def test_predict():
    samples = [[1., 0.], [0., 2.], [1., 3.]]
    labels = [0, 1, -1]
    for estimator, parameters in ESTIMATORS:
        clf = estimator(**parameters).fit(samples, labels)
        assert_array_equal(clf.predict([[0.5, 2.5]]), np.array([1]))


def test_predict_proba():
    samples = [[1., 0.], [0., 1.], [1., 2.5]]
    labels = [0, 1, -1]
    for estimator, parameters in ESTIMATORS:
        clf = estimator(**parameters).fit(samples, labels)
        assert_array_almost_equal(clf.predict_proba([[1., 1.]]),
                                  np.array([[0.5, 0.5]]))


def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4)


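# The fixed point checked above, in Zhou et al. (2004) notation (a sketch
# reconstructed from the test): with S = clf._build_graph() and Y the
# one-hot label matrix, the iteration converges to
#
#   F* = (1 - alpha) * (I - alpha * S)^{-1} @ Y
#
# and the scalar (1 - alpha) cancels under the row normalization applied to
# `expected`, which is why the test can drop it.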
def test_label_propagation_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    unlabelled_idx = Y[:, (-1,)].nonzero()[0]
    labelled_idx = (Y[:, (-1,)] == 0).nonzero()[0]

    clf = label_propagation.LabelPropagation(max_iter=10000,
                                             gamma=0.1)
    clf.fit(X, y)
    # adopting notation from Zhu et al 2002
    T_bar = clf._build_graph()
    Tuu = T_bar[tuple(np.meshgrid(unlabelled_idx, unlabelled_idx,
                                  indexing='ij'))]
    Tul = T_bar[tuple(np.meshgrid(unlabelled_idx, labelled_idx,
                                  indexing='ij'))]
    Y = Y[:, :-1]
    Y_l = Y[labelled_idx, :]
    Y_u = np.dot(np.dot(np.linalg.inv(np.eye(Tuu.shape[0]) - Tuu), Tul), Y_l)

    expected = Y.copy()
    expected[unlabelled_idx, :] = Y_u
    expected /= expected.sum(axis=1)[:, np.newaxis]

    assert_array_almost_equal(expected, clf.label_distributions_, 4)


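# The closed form verified above, in Zhu & Ghahramani (2002) notation (a
# sketch reconstructed from the test): splitting the row-normalized
# transition matrix T_bar into labeled (L) and unlabeled (U) blocks, the
# unlabeled distributions converge to
#
#   Y_U = (I - T_bar_UU)^{-1} @ T_bar_UL @ Y_L
#
# which the test compares against the iterative solution after row
# normalization.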
def test_valid_alpha():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    for alpha in [-0.1, 0, 1, 1.1, None]:
        with pytest.raises(ValueError):
            label_propagation.LabelSpreading(alpha=alpha).fit(X, y)


def test_convergence_speed():
    # This is a non-regression test for #5774
    X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=5000)
    mdl.fit(X, y)

    # this should converge quickly:
    assert mdl.n_iter_ < 10
    assert_array_equal(mdl.predict(X), [0, 1, 1])


def test_convergence_warning():
    # This is a non-regression test for #5774
    X = np.array([[1., 0.], [0., 1.], [1., 2.5]])
    y = np.array([0, 1, -1])
    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=1)
    assert_warns(ConvergenceWarning, mdl.fit, X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=1)
    assert_warns(ConvergenceWarning, mdl.fit, X, y)
    assert mdl.n_iter_ == mdl.max_iter

    mdl = label_propagation.LabelSpreading(kernel='rbf', max_iter=500)
    assert_no_warnings(mdl.fit, X, y)

    mdl = label_propagation.LabelPropagation(kernel='rbf', max_iter=500)
    assert_no_warnings(mdl.fit, X, y)


def test_label_propagation_non_zero_normalizer():
    # check that we don't divide by zero in case of null normalizer
    # non-regression test for
    # https://github.com/scikit-learn/scikit-learn/pull/15946
    X = np.array([[100., 100.], [100., 100.], [0., 0.], [0., 0.]])
    y = np.array([0, 1, -1, -1])
    mdl = label_propagation.LabelSpreading(kernel='knn',
                                           max_iter=100,
                                           n_neighbors=1)
    assert_no_warnings(mdl.fit, X, y)


def test_predict_sparse_callable_kernel():
    # This is a non-regression test for #15866

    # Custom sparse kernel (top-K RBF)
    def topk_rbf(X, Y=None, n_neighbors=10, gamma=1e-5):
        nn = NearestNeighbors(n_neighbors=n_neighbors, metric='euclidean',
                              n_jobs=-1)
        nn.fit(X)
        W = -1 * nn.kneighbors_graph(Y, mode='distance').power(2) * gamma
        np.exp(W.data, out=W.data)
        assert issparse(W)
        return W.T

    n_classes = 4
    n_samples = 500
    n_test = 10
    X, y = make_classification(n_classes=n_classes,
                               n_samples=n_samples,
                               n_features=20,
                               n_informative=20,
                               n_redundant=0,
                               n_repeated=0,
                               random_state=0)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=n_test,
                                                        random_state=0)

    model = label_propagation.LabelSpreading(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9

    model = label_propagation.LabelPropagation(kernel=topk_rbf)
    model.fit(X_train, y_train)
    assert model.score(X_test, y_test) >= 0.9