372 lines
13 KiB
Python
372 lines
13 KiB
Python
|
"""Nearest Neighbor Regression"""
|
||
|
|
||
|
# Authors: Jake Vanderplas <vanderplas@astro.washington.edu>
|
||
|
# Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
||
|
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
||
|
# Sparseness support by Lars Buitinck
|
||
|
# Multi-output support by Arnaud Joly <a.joly@ulg.ac.be>
|
||
|
# Empty radius support by Andreas Bjerre-Nielsen
|
||
|
#
|
||
|
# License: BSD 3 clause (C) INRIA, University of Amsterdam,
|
||
|
# University of Copenhagen
|
||
|
|
||
|
import warnings
|
||
|
|
||
|
import numpy as np
|
||
|
|
||
|
from ._base import _get_weights, _check_weights, NeighborsBase, KNeighborsMixin
|
||
|
from ._base import RadiusNeighborsMixin, SupervisedFloatMixin
|
||
|
from ..base import RegressorMixin
|
||
|
from ..utils import check_array
|
||
|
from ..utils.validation import _deprecate_positional_args
|
||
|
|
||
|
|
||
|
class KNeighborsRegressor(NeighborsBase, KNeighborsMixin,
                          SupervisedFloatMixin,
                          RegressorMixin):
    """Regression based on k-nearest neighbors.

    The prediction for a query point is a local interpolation of the
    targets associated with its nearest neighbors in the training set.

    Read more in the :ref:`User Guide <regression>`.

    .. versionadded:: 0.9

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to use by default for :meth:`kneighbors` queries.

    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Uniform weights are used by default.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    p : int, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric : str or callable, default='minkowski'
        The distance metric to use for the tree. The default metric is
        minkowski, and with p=2 is equivalent to the standard Euclidean
        metric. See the documentation of :class:`DistanceMetric` for a
        list of available metrics.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`,
        in which case only "nonzero" elements may be considered neighbors.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.
        Doesn't affect :meth:`fit` method.

    Attributes
    ----------
    effective_metric_ : str or callable
        The distance metric to use. It will be same as the `metric` parameter
        or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
        'minkowski' and `p` parameter set to 2.

    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most metrics
        will be same with `metric_params` parameter, but may also contain the
        `p` parameter value if the `effective_metric_` attribute is set to
        'minkowski'.

    Examples
    --------
    >>> X = [[0], [1], [2], [3]]
    >>> y = [0, 0, 1, 1]
    >>> from sklearn.neighbors import KNeighborsRegressor
    >>> neigh = KNeighborsRegressor(n_neighbors=2)
    >>> neigh.fit(X, y)
    KNeighborsRegressor(...)
    >>> print(neigh.predict([[1.5]]))
    [0.5]

    See also
    --------
    NearestNeighbors
    RadiusNeighborsRegressor
    KNeighborsClassifier
    RadiusNeighborsClassifier

    Notes
    -----
    See :ref:`Nearest Neighbors <neighbors>` in the online documentation
    for a discussion of the choice of ``algorithm`` and ``leaf_size``.

    .. warning::

       Regarding the Nearest Neighbors algorithms, if it is found that two
       neighbors, neighbor `k+1` and `k`, have identical distances but
       different labels, the results will depend on the ordering of the
       training data.

    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
    """

    @_deprecate_positional_args
    def __init__(self, n_neighbors=5, *, weights='uniform',
                 algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
                 **kwargs):
        # The neighbor-search configuration is handled by the base class;
        # only the prediction weighting scheme is stored here.
        super().__init__(
            n_neighbors=n_neighbors,
            algorithm=algorithm,
            leaf_size=leaf_size,
            metric=metric,
            p=p,
            metric_params=metric_params,
            n_jobs=n_jobs,
            **kwargs)
        self.weights = _check_weights(weights)

    @property
    def _pairwise(self):
        # Lets cross-validation routines split the data correctly when X is
        # a precomputed distance matrix.
        return self.metric == 'precomputed'

    def predict(self, X):
        """Predict the target for the provided data.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features), \
                or (n_queries, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y : ndarray of shape (n_queries,) or (n_queries, n_outputs)
            Predicted target values: the (optionally weighted) mean of the
            training targets of each query's nearest neighbors.
        """
        X = check_array(X, accept_sparse='csr')

        distances, indices = self.kneighbors(X)
        neighbor_weights = _get_weights(distances, self.weights)

        # Work on a 2D view of the targets so single- and multi-output
        # problems share one code path; the result is flattened at the end.
        single_output = self._y.ndim == 1
        targets = self._y.reshape((-1, 1)) if single_output else self._y

        if neighbor_weights is not None:
            n_outputs = targets.shape[1]
            y_pred = np.empty((X.shape[0], n_outputs), dtype=np.float64)
            weight_totals = np.sum(neighbor_weights, axis=1)

            # Weighted mean of the neighbors' targets, one output at a time.
            for col in range(n_outputs):
                weighted = targets[indices, col] * neighbor_weights
                y_pred[:, col] = np.sum(weighted, axis=1) / weight_totals
        else:
            # No weights: plain mean over each query's neighbors.
            y_pred = np.mean(targets[indices], axis=1)

        return y_pred.ravel() if single_output else y_pred
|
||
|
|
||
|
|
||
|
class RadiusNeighborsRegressor(NeighborsBase, RadiusNeighborsMixin,
                               SupervisedFloatMixin,
                               RegressorMixin):
    """Regression based on neighbors within a fixed radius.

    The target is predicted by local interpolation of the targets
    associated with the nearest neighbors in the training set.

    Read more in the :ref:`User Guide <regression>`.

    .. versionadded:: 0.9

    Parameters
    ----------
    radius : float, default=1.0
        Range of parameter space to use by default for :meth:`radius_neighbors`
        queries.

    weights : {'uniform', 'distance'} or callable, default='uniform'
        Weight function used in prediction. Possible values:

        - 'uniform' : uniform weights. All points in each neighborhood
          are weighted equally.
        - 'distance' : weight points by the inverse of their distance.
          In this case, closer neighbors of a query point will have a
          greater influence than neighbors which are further away.
        - [callable] : a user-defined function which accepts an
          array of distances, and returns an array of the same shape
          containing the weights.

        Uniform weights are used by default.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='auto'
        Algorithm used to compute the nearest neighbors:

        - 'ball_tree' will use :class:`BallTree`
        - 'kd_tree' will use :class:`KDTree`
        - 'brute' will use a brute-force search.
        - 'auto' will attempt to decide the most appropriate algorithm
          based on the values passed to :meth:`fit` method.

        Note: fitting on sparse input will override the setting of
        this parameter, using brute force.

    leaf_size : int, default=30
        Leaf size passed to BallTree or KDTree. This can affect the
        speed of the construction and query, as well as the memory
        required to store the tree. The optimal value depends on the
        nature of the problem.

    p : int, default=2
        Power parameter for the Minkowski metric. When p = 1, this is
        equivalent to using manhattan_distance (l1), and euclidean_distance
        (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used.

    metric : str or callable, default='minkowski'
        The distance metric to use for the tree. The default metric is
        minkowski, and with p=2 is equivalent to the standard Euclidean
        metric. See the documentation of :class:`DistanceMetric` for a
        list of available metrics.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square during fit. X may be a :term:`sparse graph`,
        in which case only "nonzero" elements may be considered neighbors.

    metric_params : dict, default=None
        Additional keyword arguments for the metric function.

    n_jobs : int, default=None
        The number of parallel jobs to run for neighbors search.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    effective_metric_ : str or callable
        The distance metric to use. It will be same as the `metric` parameter
        or a synonym of it, e.g. 'euclidean' if the `metric` parameter set to
        'minkowski' and `p` parameter set to 2.

    effective_metric_params_ : dict
        Additional keyword arguments for the metric function. For most metrics
        will be same with `metric_params` parameter, but may also contain the
        `p` parameter value if the `effective_metric_` attribute is set to
        'minkowski'.

    Examples
    --------
    >>> X = [[0], [1], [2], [3]]
    >>> y = [0, 0, 1, 1]
    >>> from sklearn.neighbors import RadiusNeighborsRegressor
    >>> neigh = RadiusNeighborsRegressor(radius=1.0)
    >>> neigh.fit(X, y)
    RadiusNeighborsRegressor(...)
    >>> print(neigh.predict([[1.5]]))
    [0.5]

    See also
    --------
    NearestNeighbors
    KNeighborsRegressor
    KNeighborsClassifier
    RadiusNeighborsClassifier

    Notes
    -----
    See :ref:`Nearest Neighbors <neighbors>` in the online documentation
    for a discussion of the choice of ``algorithm`` and ``leaf_size``.

    https://en.wikipedia.org/wiki/K-nearest_neighbor_algorithm
    """

    @_deprecate_positional_args
    def __init__(self, radius=1.0, *, weights='uniform',
                 algorithm='auto', leaf_size=30,
                 p=2, metric='minkowski', metric_params=None, n_jobs=None,
                 **kwargs):
        # The neighbor-search configuration is handled by the base class;
        # only the prediction weighting scheme is stored here.
        super().__init__(
              radius=radius,
              algorithm=algorithm,
              leaf_size=leaf_size,
              p=p, metric=metric, metric_params=metric_params,
              n_jobs=n_jobs, **kwargs)
        self.weights = _check_weights(weights)

    def predict(self, X):
        """Predict the target for the provided data.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features), \
                or (n_queries, n_indexed) if metric == 'precomputed'
            Test samples.

        Returns
        -------
        y : ndarray of shape (n_queries,) or (n_queries, n_outputs), \
                dtype=double
            Target values. A query with no training sample within the
            radius is predicted as NaN, and a warning is issued when any
            NaN is present in the predictions.
        """
        X = check_array(X, accept_sparse='csr')

        neigh_dist, neigh_ind = self.radius_neighbors(X)

        weights = _get_weights(neigh_dist, self.weights)

        _y = self._y
        if _y.ndim == 1:
            _y = _y.reshape((-1, 1))

        # Placeholder row for queries without any neighbor. It must have a
        # floating dtype: filling an integer array with NaN (as
        # ``np.full_like`` would do for integer targets) stores a
        # meaningless integer instead, which then escapes the NaN check
        # below and is returned as a prediction without any warning.
        if np.issubdtype(_y.dtype, np.inexact):
            nan_dtype = _y.dtype
        else:
            nan_dtype = np.float64
        empty_obs = np.full(_y.shape[1], np.nan, dtype=nan_dtype)

        if weights is None:
            y_pred = np.array([np.mean(_y[ind, :], axis=0)
                               if len(ind) else empty_obs
                               for ind in neigh_ind])

        else:
            # ``weights[i]`` holds the weights of the neighbors of query i.
            y_pred = np.array([np.average(_y[ind, :], axis=0,
                                          weights=weights[i])
                               if len(ind) else empty_obs
                               for (i, ind) in enumerate(neigh_ind)])

        if np.any(np.isnan(y_pred)):
            empty_warning_msg = ("One or more samples have no neighbors "
                                 "within specified radius; predicting NaN.")
            warnings.warn(empty_warning_msg)

        if self._y.ndim == 1:
            y_pred = y_pred.ravel()

        return y_pred
|