800 lines
34 KiB
Python
800 lines
34 KiB
Python
"""Gaussian processes classification."""
|
|
|
|
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
from operator import itemgetter
|
|
|
|
import numpy as np
|
|
from scipy.linalg import cholesky, cho_solve, solve
|
|
import scipy.optimize
|
|
from scipy.special import erf, expit
|
|
|
|
from ..base import BaseEstimator, ClassifierMixin, clone
|
|
from .kernels \
|
|
import RBF, CompoundKernel, ConstantKernel as C
|
|
from ..utils.validation import check_is_fitted, check_array
|
|
from ..utils import check_random_state
|
|
from ..utils.optimize import _check_optimize_result
|
|
from ..preprocessing import LabelEncoder
|
|
from ..multiclass import OneVsRestClassifier, OneVsOneClassifier
|
|
from ..utils.validation import _deprecate_positional_args
|
|
|
|
|
|
# Values required for approximating the logistic sigmoid by
|
|
# error functions. coefs are obtained via:
|
|
# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
|
|
# b = logistic(x)
|
|
# A = (erf(np.dot(x, self.lambdas)) + 1) / 2
|
|
# coefs = lstsq(A, b)[0]
|
|
LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
|
|
COEFS = np.array([-1854.8214151, 3516.89893646, 221.29346712,
|
|
128.12323805, -2010.49422654])[:, np.newaxis]
|
|
|
|
|
|
class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
|
|
"""Binary Gaussian process classification based on Laplace approximation.
|
|
|
|
The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
|
|
``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
|
|
Williams.
|
|
|
|
Internally, the Laplace approximation is used for approximating the
|
|
non-Gaussian posterior by a Gaussian.
|
|
|
|
Currently, the implementation is restricted to using the logistic link
|
|
function.
|
|
|
|
.. versionadded:: 0.18
|
|
|
|
Parameters
|
|
----------
|
|
kernel : kernel instance, default=None
|
|
The kernel specifying the covariance function of the GP. If None is
|
|
passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
|
|
the kernel's hyperparameters are optimized during fitting.
|
|
|
|
optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
|
|
Can either be one of the internally supported optimizers for optimizing
|
|
the kernel's parameters, specified by a string, or an externally
|
|
defined optimizer passed as a callable. If a callable is passed, it
|
|
must have the signature::
|
|
|
|
def optimizer(obj_func, initial_theta, bounds):
|
|
# * 'obj_func' is the objective function to be maximized, which
|
|
# takes the hyperparameters theta as parameter and an
|
|
# optional flag eval_gradient, which determines if the
|
|
# gradient is returned additionally to the function value
|
|
# * 'initial_theta': the initial value for theta, which can be
|
|
# used by local optimizers
|
|
# * 'bounds': the bounds on the values of theta
|
|
....
|
|
# Returned are the best found hyperparameters theta and
|
|
# the corresponding value of the target function.
|
|
return theta_opt, func_min
|
|
|
|
Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
|
|
is used. If None is passed, the kernel's parameters are kept fixed.
|
|
Available internal optimizers are::
|
|
|
|
'fmin_l_bfgs_b'
|
|
|
|
n_restarts_optimizer : int, default=0
|
|
The number of restarts of the optimizer for finding the kernel's
|
|
parameters which maximize the log-marginal likelihood. The first run
|
|
of the optimizer is performed from the kernel's initial parameters,
|
|
the remaining ones (if any) from thetas sampled log-uniform randomly
|
|
from the space of allowed theta-values. If greater than 0, all bounds
|
|
must be finite. Note that n_restarts_optimizer=0 implies that one
|
|
run is performed.
|
|
|
|
max_iter_predict : int, default=100
|
|
The maximum number of iterations in Newton's method for approximating
|
|
the posterior during predict. Smaller values will reduce computation
|
|
time at the cost of worse results.
|
|
|
|
warm_start : bool, default=False
|
|
If warm-starts are enabled, the solution of the last Newton iteration
|
|
on the Laplace approximation of the posterior mode is used as
|
|
initialization for the next call of _posterior_mode(). This can speed
|
|
up convergence when _posterior_mode is called several times on similar
|
|
problems as in hyperparameter optimization. See :term:`the Glossary
|
|
<warm_start>`.
|
|
|
|
copy_X_train : bool, default=True
|
|
If True, a persistent copy of the training data is stored in the
|
|
object. Otherwise, just a reference to the training data is stored,
|
|
which might cause predictions to change if the data is modified
|
|
externally.
|
|
|
|
random_state : int or RandomState, default=None
|
|
Determines random number generation used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term: `Glossary <random_state>`.
|
|
|
|
Attributes
|
|
----------
|
|
X_train_ : array-like of shape (n_samples, n_features) or list of object
|
|
Feature vectors or other representations of training data (also
|
|
required for prediction).
|
|
|
|
y_train_ : array-like of shape (n_samples,)
|
|
Target values in training data (also required for prediction)
|
|
|
|
classes_ : array-like of shape (n_classes,)
|
|
Unique class labels.
|
|
|
|
kernel_ : kernl instance
|
|
The kernel used for prediction. The structure of the kernel is the
|
|
same as the one passed as parameter but with optimized hyperparameters
|
|
|
|
L_ : array-like of shape (n_samples, n_samples)
|
|
Lower-triangular Cholesky decomposition of the kernel in X_train_
|
|
|
|
pi_ : array-like of shape (n_samples,)
|
|
The probabilities of the positive class for the training points
|
|
X_train_
|
|
|
|
W_sr_ : array-like of shape (n_samples,)
|
|
Square root of W, the Hessian of log-likelihood of the latent function
|
|
values for the observed labels. Since W is diagonal, only the diagonal
|
|
of sqrt(W) is stored.
|
|
|
|
log_marginal_likelihood_value_ : float
|
|
The log-marginal-likelihood of ``self.kernel_.theta``
|
|
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b",
|
|
n_restarts_optimizer=0, max_iter_predict=100,
|
|
warm_start=False, copy_X_train=True, random_state=None):
|
|
self.kernel = kernel
|
|
self.optimizer = optimizer
|
|
self.n_restarts_optimizer = n_restarts_optimizer
|
|
self.max_iter_predict = max_iter_predict
|
|
self.warm_start = warm_start
|
|
self.copy_X_train = copy_X_train
|
|
self.random_state = random_state
|
|
|
|
def fit(self, X, y):
|
|
"""Fit Gaussian process classification model
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or list of object
|
|
Feature vectors or other representations of training data.
|
|
|
|
y : array-like of shape (n_samples,)
|
|
Target values, must be binary
|
|
|
|
Returns
|
|
-------
|
|
self : returns an instance of self.
|
|
"""
|
|
if self.kernel is None: # Use an RBF kernel as default
|
|
self.kernel_ = C(1.0, constant_value_bounds="fixed") \
|
|
* RBF(1.0, length_scale_bounds="fixed")
|
|
else:
|
|
self.kernel_ = clone(self.kernel)
|
|
|
|
self.rng = check_random_state(self.random_state)
|
|
|
|
self.X_train_ = np.copy(X) if self.copy_X_train else X
|
|
|
|
# Encode class labels and check that it is a binary classification
|
|
# problem
|
|
label_encoder = LabelEncoder()
|
|
self.y_train_ = label_encoder.fit_transform(y)
|
|
self.classes_ = label_encoder.classes_
|
|
if self.classes_.size > 2:
|
|
raise ValueError("%s supports only binary classification. "
|
|
"y contains classes %s"
|
|
% (self.__class__.__name__, self.classes_))
|
|
elif self.classes_.size == 1:
|
|
raise ValueError("{0:s} requires 2 classes; got {1:d} class"
|
|
.format(self.__class__.__name__,
|
|
self.classes_.size))
|
|
|
|
if self.optimizer is not None and self.kernel_.n_dims > 0:
|
|
# Choose hyperparameters based on maximizing the log-marginal
|
|
# likelihood (potentially starting from several initial values)
|
|
def obj_func(theta, eval_gradient=True):
|
|
if eval_gradient:
|
|
lml, grad = self.log_marginal_likelihood(
|
|
theta, eval_gradient=True, clone_kernel=False)
|
|
return -lml, -grad
|
|
else:
|
|
return -self.log_marginal_likelihood(theta,
|
|
clone_kernel=False)
|
|
|
|
# First optimize starting from theta specified in kernel
|
|
optima = [self._constrained_optimization(obj_func,
|
|
self.kernel_.theta,
|
|
self.kernel_.bounds)]
|
|
|
|
# Additional runs are performed from log-uniform chosen initial
|
|
# theta
|
|
if self.n_restarts_optimizer > 0:
|
|
if not np.isfinite(self.kernel_.bounds).all():
|
|
raise ValueError(
|
|
"Multiple optimizer restarts (n_restarts_optimizer>0) "
|
|
"requires that all bounds are finite.")
|
|
bounds = self.kernel_.bounds
|
|
for iteration in range(self.n_restarts_optimizer):
|
|
theta_initial = np.exp(self.rng.uniform(bounds[:, 0],
|
|
bounds[:, 1]))
|
|
optima.append(
|
|
self._constrained_optimization(obj_func, theta_initial,
|
|
bounds))
|
|
# Select result from run with minimal (negative) log-marginal
|
|
# likelihood
|
|
lml_values = list(map(itemgetter(1), optima))
|
|
self.kernel_.theta = optima[np.argmin(lml_values)][0]
|
|
self.log_marginal_likelihood_value_ = -np.min(lml_values)
|
|
else:
|
|
self.log_marginal_likelihood_value_ = \
|
|
self.log_marginal_likelihood(self.kernel_.theta)
|
|
|
|
# Precompute quantities required for predictions which are independent
|
|
# of actual query points
|
|
K = self.kernel_(self.X_train_)
|
|
|
|
_, (self.pi_, self.W_sr_, self.L_, _, _) = \
|
|
self._posterior_mode(K, return_temporaries=True)
|
|
|
|
return self
|
|
|
|
def predict(self, X):
|
|
"""Perform classification on an array of test vectors X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or list of object
|
|
Query points where the GP is evaluated for classification.
|
|
|
|
Returns
|
|
-------
|
|
C : ndarray of shape (n_samples,)
|
|
Predicted target values for X, values are from ``classes_``
|
|
"""
|
|
check_is_fitted(self)
|
|
|
|
# As discussed on Section 3.4.2 of GPML, for making hard binary
|
|
# decisions, it is enough to compute the MAP of the posterior and
|
|
# pass it through the link function
|
|
K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star)
|
|
f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2,Line 4
|
|
|
|
return np.where(f_star > 0, self.classes_[1], self.classes_[0])
|
|
|
|
def predict_proba(self, X):
|
|
"""Return probability estimates for the test vector X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or list of object
|
|
Query points where the GP is evaluated for classification.
|
|
|
|
Returns
|
|
-------
|
|
C : array-like of shape (n_samples, n_classes)
|
|
Returns the probability of the samples for each class in
|
|
the model. The columns correspond to the classes in sorted
|
|
order, as they appear in the attribute ``classes_``.
|
|
"""
|
|
check_is_fitted(self)
|
|
|
|
# Based on Algorithm 3.2 of GPML
|
|
K_star = self.kernel_(self.X_train_, X) # K_star =k(x_star)
|
|
f_star = K_star.T.dot(self.y_train_ - self.pi_) # Line 4
|
|
v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5
|
|
# Line 6 (compute np.diag(v.T.dot(v)) via einsum)
|
|
var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)
|
|
|
|
# Line 7:
|
|
# Approximate \int log(z) * N(z | f_star, var_f_star)
|
|
# Approximation is due to Williams & Barber, "Bayesian Classification
|
|
# with Gaussian Processes", Appendix A: Approximate the logistic
|
|
# sigmoid by a linear combination of 5 error functions.
|
|
# For information on how this integral can be computed see
|
|
# blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
|
|
alpha = 1 / (2 * var_f_star)
|
|
gamma = LAMBDAS * f_star
|
|
integrals = np.sqrt(np.pi / alpha) \
|
|
* erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) \
|
|
/ (2 * np.sqrt(var_f_star * 2 * np.pi))
|
|
pi_star = (COEFS * integrals).sum(axis=0) + .5 * COEFS.sum()
|
|
|
|
return np.vstack((1 - pi_star, pi_star)).T
|
|
|
|
def log_marginal_likelihood(self, theta=None, eval_gradient=False,
|
|
clone_kernel=True):
|
|
"""Returns log-marginal likelihood of theta for training data.
|
|
|
|
Parameters
|
|
----------
|
|
theta : array-like of shape (n_kernel_params,), default=None
|
|
Kernel hyperparameters for which the log-marginal likelihood is
|
|
evaluated. If None, the precomputed log_marginal_likelihood
|
|
of ``self.kernel_.theta`` is returned.
|
|
|
|
eval_gradient : bool, default=False
|
|
If True, the gradient of the log-marginal likelihood with respect
|
|
to the kernel hyperparameters at position theta is returned
|
|
additionally. If True, theta must not be None.
|
|
|
|
clone_kernel : bool, default=True
|
|
If True, the kernel attribute is copied. If False, the kernel
|
|
attribute is modified, but may result in a performance improvement.
|
|
|
|
Returns
|
|
-------
|
|
log_likelihood : float
|
|
Log-marginal likelihood of theta for training data.
|
|
|
|
log_likelihood_gradient : ndarray of shape (n_kernel_params,), \
|
|
optional
|
|
Gradient of the log-marginal likelihood with respect to the kernel
|
|
hyperparameters at position theta.
|
|
Only returned when `eval_gradient` is True.
|
|
"""
|
|
if theta is None:
|
|
if eval_gradient:
|
|
raise ValueError(
|
|
"Gradient can only be evaluated for theta!=None")
|
|
return self.log_marginal_likelihood_value_
|
|
|
|
if clone_kernel:
|
|
kernel = self.kernel_.clone_with_theta(theta)
|
|
else:
|
|
kernel = self.kernel_
|
|
kernel.theta = theta
|
|
|
|
if eval_gradient:
|
|
K, K_gradient = kernel(self.X_train_, eval_gradient=True)
|
|
else:
|
|
K = kernel(self.X_train_)
|
|
|
|
# Compute log-marginal-likelihood Z and also store some temporaries
|
|
# which can be reused for computing Z's gradient
|
|
Z, (pi, W_sr, L, b, a) = \
|
|
self._posterior_mode(K, return_temporaries=True)
|
|
|
|
if not eval_gradient:
|
|
return Z
|
|
|
|
# Compute gradient based on Algorithm 5.1 of GPML
|
|
d_Z = np.empty(theta.shape[0])
|
|
# XXX: Get rid of the np.diag() in the next line
|
|
R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7
|
|
C = solve(L, W_sr[:, np.newaxis] * K) # Line 8
|
|
# Line 9: (use einsum to compute np.diag(C.T.dot(C))))
|
|
s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) \
|
|
* (pi * (1 - pi) * (1 - 2 * pi)) # third derivative
|
|
|
|
for j in range(d_Z.shape[0]):
|
|
C = K_gradient[:, :, j] # Line 11
|
|
# Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
|
|
s_1 = .5 * a.T.dot(C).dot(a) - .5 * R.T.ravel().dot(C.ravel())
|
|
|
|
b = C.dot(self.y_train_ - pi) # Line 13
|
|
s_3 = b - K.dot(R.dot(b)) # Line 14
|
|
|
|
d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15
|
|
|
|
return Z, d_Z
|
|
|
|
def _posterior_mode(self, K, return_temporaries=False):
|
|
"""Mode-finding for binary Laplace GPC and fixed kernel.
|
|
|
|
This approximates the posterior of the latent function values for given
|
|
inputs and target observations with a Gaussian approximation and uses
|
|
Newton's iteration to find the mode of this approximation.
|
|
"""
|
|
# Based on Algorithm 3.1 of GPML
|
|
|
|
# If warm_start are enabled, we reuse the last solution for the
|
|
# posterior mode as initialization; otherwise, we initialize with 0
|
|
if self.warm_start and hasattr(self, "f_cached") \
|
|
and self.f_cached.shape == self.y_train_.shape:
|
|
f = self.f_cached
|
|
else:
|
|
f = np.zeros_like(self.y_train_, dtype=np.float64)
|
|
|
|
# Use Newton's iteration method to find mode of Laplace approximation
|
|
log_marginal_likelihood = -np.inf
|
|
for _ in range(self.max_iter_predict):
|
|
# Line 4
|
|
pi = expit(f)
|
|
W = pi * (1 - pi)
|
|
# Line 5
|
|
W_sr = np.sqrt(W)
|
|
W_sr_K = W_sr[:, np.newaxis] * K
|
|
B = np.eye(W.shape[0]) + W_sr_K * W_sr
|
|
L = cholesky(B, lower=True)
|
|
# Line 6
|
|
b = W * f + (self.y_train_ - pi)
|
|
# Line 7
|
|
a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
|
|
# Line 8
|
|
f = K.dot(a)
|
|
|
|
# Line 10: Compute log marginal likelihood in loop and use as
|
|
# convergence criterion
|
|
lml = -0.5 * a.T.dot(f) \
|
|
- np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum() \
|
|
- np.log(np.diag(L)).sum()
|
|
# Check if we have converged (log marginal likelihood does
|
|
# not decrease)
|
|
# XXX: more complex convergence criterion
|
|
if lml - log_marginal_likelihood < 1e-10:
|
|
break
|
|
log_marginal_likelihood = lml
|
|
|
|
self.f_cached = f # Remember solution for later warm-starts
|
|
if return_temporaries:
|
|
return log_marginal_likelihood, (pi, W_sr, L, b, a)
|
|
else:
|
|
return log_marginal_likelihood
|
|
|
|
def _constrained_optimization(self, obj_func, initial_theta, bounds):
|
|
if self.optimizer == "fmin_l_bfgs_b":
|
|
opt_res = scipy.optimize.minimize(
|
|
obj_func, initial_theta, method="L-BFGS-B", jac=True,
|
|
bounds=bounds)
|
|
_check_optimize_result("lbfgs", opt_res)
|
|
theta_opt, func_min = opt_res.x, opt_res.fun
|
|
elif callable(self.optimizer):
|
|
theta_opt, func_min = \
|
|
self.optimizer(obj_func, initial_theta, bounds=bounds)
|
|
else:
|
|
raise ValueError("Unknown optimizer %s." % self.optimizer)
|
|
|
|
return theta_opt, func_min
|
|
|
|
|
|
class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
|
|
"""Gaussian process classification (GPC) based on Laplace approximation.
|
|
|
|
The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
|
|
Gaussian Processes for Machine Learning (GPML) by Rasmussen and
|
|
Williams.
|
|
|
|
Internally, the Laplace approximation is used for approximating the
|
|
non-Gaussian posterior by a Gaussian.
|
|
|
|
Currently, the implementation is restricted to using the logistic link
|
|
function. For multi-class classification, several binary one-versus rest
|
|
classifiers are fitted. Note that this class thus does not implement
|
|
a true multi-class Laplace approximation.
|
|
|
|
Read more in the :ref:`User Guide <gaussian_process>`.
|
|
|
|
Parameters
|
|
----------
|
|
kernel : kernel instance, default=None
|
|
The kernel specifying the covariance function of the GP. If None is
|
|
passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
|
|
the kernel's hyperparameters are optimized during fitting.
|
|
|
|
optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
|
|
Can either be one of the internally supported optimizers for optimizing
|
|
the kernel's parameters, specified by a string, or an externally
|
|
defined optimizer passed as a callable. If a callable is passed, it
|
|
must have the signature::
|
|
|
|
def optimizer(obj_func, initial_theta, bounds):
|
|
# * 'obj_func' is the objective function to be maximized, which
|
|
# takes the hyperparameters theta as parameter and an
|
|
# optional flag eval_gradient, which determines if the
|
|
# gradient is returned additionally to the function value
|
|
# * 'initial_theta': the initial value for theta, which can be
|
|
# used by local optimizers
|
|
# * 'bounds': the bounds on the values of theta
|
|
....
|
|
# Returned are the best found hyperparameters theta and
|
|
# the corresponding value of the target function.
|
|
return theta_opt, func_min
|
|
|
|
Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
|
|
is used. If None is passed, the kernel's parameters are kept fixed.
|
|
Available internal optimizers are::
|
|
|
|
'fmin_l_bfgs_b'
|
|
|
|
n_restarts_optimizer : int, default=0
|
|
The number of restarts of the optimizer for finding the kernel's
|
|
parameters which maximize the log-marginal likelihood. The first run
|
|
of the optimizer is performed from the kernel's initial parameters,
|
|
the remaining ones (if any) from thetas sampled log-uniform randomly
|
|
from the space of allowed theta-values. If greater than 0, all bounds
|
|
must be finite. Note that n_restarts_optimizer=0 implies that one
|
|
run is performed.
|
|
|
|
max_iter_predict : int, default=100
|
|
The maximum number of iterations in Newton's method for approximating
|
|
the posterior during predict. Smaller values will reduce computation
|
|
time at the cost of worse results.
|
|
|
|
warm_start : bool, default=False
|
|
If warm-starts are enabled, the solution of the last Newton iteration
|
|
on the Laplace approximation of the posterior mode is used as
|
|
initialization for the next call of _posterior_mode(). This can speed
|
|
up convergence when _posterior_mode is called several times on similar
|
|
problems as in hyperparameter optimization. See :term:`the Glossary
|
|
<warm_start>`.
|
|
|
|
copy_X_train : bool, default=True
|
|
If True, a persistent copy of the training data is stored in the
|
|
object. Otherwise, just a reference to the training data is stored,
|
|
which might cause predictions to change if the data is modified
|
|
externally.
|
|
|
|
random_state : int or RandomState, default=None
|
|
Determines random number generation used to initialize the centers.
|
|
Pass an int for reproducible results across multiple function calls.
|
|
See :term: `Glossary <random_state>`.
|
|
|
|
multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'
|
|
Specifies how multi-class classification problems are handled.
|
|
Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',
|
|
one binary Gaussian process classifier is fitted for each class, which
|
|
is trained to separate this class from the rest. In 'one_vs_one', one
|
|
binary Gaussian process classifier is fitted for each pair of classes,
|
|
which is trained to separate these two classes. The predictions of
|
|
these binary predictors are combined into multi-class predictions.
|
|
Note that 'one_vs_one' does not support predicting probability
|
|
estimates.
|
|
|
|
n_jobs : int, default=None
|
|
The number of jobs to use for the computation.
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
Attributes
|
|
----------
|
|
kernel_ : kernel instance
|
|
The kernel used for prediction. In case of binary classification,
|
|
the structure of the kernel is the same as the one passed as parameter
|
|
but with optimized hyperparameters. In case of multi-class
|
|
classification, a CompoundKernel is returned which consists of the
|
|
different kernels used in the one-versus-rest classifiers.
|
|
|
|
log_marginal_likelihood_value_ : float
|
|
The log-marginal-likelihood of ``self.kernel_.theta``
|
|
|
|
classes_ : array-like of shape (n_classes,)
|
|
Unique class labels.
|
|
|
|
n_classes_ : int
|
|
The number of classes in the training data
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.datasets import load_iris
|
|
>>> from sklearn.gaussian_process import GaussianProcessClassifier
|
|
>>> from sklearn.gaussian_process.kernels import RBF
|
|
>>> X, y = load_iris(return_X_y=True)
|
|
>>> kernel = 1.0 * RBF(1.0)
|
|
>>> gpc = GaussianProcessClassifier(kernel=kernel,
|
|
... random_state=0).fit(X, y)
|
|
>>> gpc.score(X, y)
|
|
0.9866...
|
|
>>> gpc.predict_proba(X[:2,:])
|
|
array([[0.83548752, 0.03228706, 0.13222543],
|
|
[0.79064206, 0.06525643, 0.14410151]])
|
|
|
|
.. versionadded:: 0.18
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b",
|
|
n_restarts_optimizer=0, max_iter_predict=100,
|
|
warm_start=False, copy_X_train=True, random_state=None,
|
|
multi_class="one_vs_rest", n_jobs=None):
|
|
self.kernel = kernel
|
|
self.optimizer = optimizer
|
|
self.n_restarts_optimizer = n_restarts_optimizer
|
|
self.max_iter_predict = max_iter_predict
|
|
self.warm_start = warm_start
|
|
self.copy_X_train = copy_X_train
|
|
self.random_state = random_state
|
|
self.multi_class = multi_class
|
|
self.n_jobs = n_jobs
|
|
|
|
def fit(self, X, y):
|
|
"""Fit Gaussian process classification model
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or list of object
|
|
Feature vectors or other representations of training data.
|
|
|
|
y : array-like of shape (n_samples,)
|
|
Target values, must be binary
|
|
|
|
Returns
|
|
-------
|
|
self : returns an instance of self.
|
|
"""
|
|
if self.kernel is None or self.kernel.requires_vector_input:
|
|
X, y = self._validate_data(X, y, multi_output=False,
|
|
ensure_2d=True, dtype="numeric")
|
|
else:
|
|
X, y = self._validate_data(X, y, multi_output=False,
|
|
ensure_2d=False, dtype=None)
|
|
|
|
self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
|
|
kernel=self.kernel,
|
|
optimizer=self.optimizer,
|
|
n_restarts_optimizer=self.n_restarts_optimizer,
|
|
max_iter_predict=self.max_iter_predict,
|
|
warm_start=self.warm_start,
|
|
copy_X_train=self.copy_X_train,
|
|
random_state=self.random_state)
|
|
|
|
self.classes_ = np.unique(y)
|
|
self.n_classes_ = self.classes_.size
|
|
if self.n_classes_ == 1:
|
|
raise ValueError("GaussianProcessClassifier requires 2 or more "
|
|
"distinct classes; got %d class (only class %s "
|
|
"is present)"
|
|
% (self.n_classes_, self.classes_[0]))
|
|
if self.n_classes_ > 2:
|
|
if self.multi_class == "one_vs_rest":
|
|
self.base_estimator_ = \
|
|
OneVsRestClassifier(self.base_estimator_,
|
|
n_jobs=self.n_jobs)
|
|
elif self.multi_class == "one_vs_one":
|
|
self.base_estimator_ = \
|
|
OneVsOneClassifier(self.base_estimator_,
|
|
n_jobs=self.n_jobs)
|
|
else:
|
|
raise ValueError("Unknown multi-class mode %s"
|
|
% self.multi_class)
|
|
|
|
self.base_estimator_.fit(X, y)
|
|
|
|
if self.n_classes_ > 2:
|
|
self.log_marginal_likelihood_value_ = np.mean(
|
|
[estimator.log_marginal_likelihood()
|
|
for estimator in self.base_estimator_.estimators_])
|
|
else:
|
|
self.log_marginal_likelihood_value_ = \
|
|
self.base_estimator_.log_marginal_likelihood()
|
|
|
|
return self
|
|
|
|
def predict(self, X):
|
|
"""Perform classification on an array of test vectors X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or list of object
|
|
Query points where the GP is evaluated for classification.
|
|
|
|
Returns
|
|
-------
|
|
C : ndarray of shape (n_samples,)
|
|
Predicted target values for X, values are from ``classes_``
|
|
"""
|
|
check_is_fitted(self)
|
|
|
|
if self.kernel is None or self.kernel.requires_vector_input:
|
|
X = check_array(X, ensure_2d=True, dtype="numeric")
|
|
else:
|
|
X = check_array(X, ensure_2d=False, dtype=None)
|
|
|
|
return self.base_estimator_.predict(X)
|
|
|
|
def predict_proba(self, X):
|
|
"""Return probability estimates for the test vector X.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features) or list of object
|
|
Query points where the GP is evaluated for classification.
|
|
|
|
Returns
|
|
-------
|
|
C : array-like of shape (n_samples, n_classes)
|
|
Returns the probability of the samples for each class in
|
|
the model. The columns correspond to the classes in sorted
|
|
order, as they appear in the attribute :term:`classes_`.
|
|
"""
|
|
check_is_fitted(self)
|
|
if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
|
|
raise ValueError("one_vs_one multi-class mode does not support "
|
|
"predicting probability estimates. Use "
|
|
"one_vs_rest mode instead.")
|
|
|
|
if self.kernel is None or self.kernel.requires_vector_input:
|
|
X = check_array(X, ensure_2d=True, dtype="numeric")
|
|
else:
|
|
X = check_array(X, ensure_2d=False, dtype=None)
|
|
|
|
return self.base_estimator_.predict_proba(X)
|
|
|
|
@property
|
|
def kernel_(self):
|
|
if self.n_classes_ == 2:
|
|
return self.base_estimator_.kernel_
|
|
else:
|
|
return CompoundKernel(
|
|
[estimator.kernel_
|
|
for estimator in self.base_estimator_.estimators_])
|
|
|
|
def log_marginal_likelihood(self, theta=None, eval_gradient=False,
|
|
clone_kernel=True):
|
|
"""Returns log-marginal likelihood of theta for training data.
|
|
|
|
In the case of multi-class classification, the mean log-marginal
|
|
likelihood of the one-versus-rest classifiers are returned.
|
|
|
|
Parameters
|
|
----------
|
|
theta : array-like of shape (n_kernel_params,), default=None
|
|
Kernel hyperparameters for which the log-marginal likelihood is
|
|
evaluated. In the case of multi-class classification, theta may
|
|
be the hyperparameters of the compound kernel or of an individual
|
|
kernel. In the latter case, all individual kernel get assigned the
|
|
same theta values. If None, the precomputed log_marginal_likelihood
|
|
of ``self.kernel_.theta`` is returned.
|
|
|
|
eval_gradient : bool, default=False
|
|
If True, the gradient of the log-marginal likelihood with respect
|
|
to the kernel hyperparameters at position theta is returned
|
|
additionally. Note that gradient computation is not supported
|
|
for non-binary classification. If True, theta must not be None.
|
|
|
|
clone_kernel : bool, default=True
|
|
If True, the kernel attribute is copied. If False, the kernel
|
|
attribute is modified, but may result in a performance improvement.
|
|
|
|
Returns
|
|
-------
|
|
log_likelihood : float
|
|
Log-marginal likelihood of theta for training data.
|
|
|
|
log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
|
|
Gradient of the log-marginal likelihood with respect to the kernel
|
|
hyperparameters at position theta.
|
|
Only returned when `eval_gradient` is True.
|
|
"""
|
|
check_is_fitted(self)
|
|
|
|
if theta is None:
|
|
if eval_gradient:
|
|
raise ValueError(
|
|
"Gradient can only be evaluated for theta!=None")
|
|
return self.log_marginal_likelihood_value_
|
|
|
|
theta = np.asarray(theta)
|
|
if self.n_classes_ == 2:
|
|
return self.base_estimator_.log_marginal_likelihood(
|
|
theta, eval_gradient, clone_kernel=clone_kernel)
|
|
else:
|
|
if eval_gradient:
|
|
raise NotImplementedError(
|
|
"Gradient of log-marginal-likelihood not implemented for "
|
|
"multi-class GPC.")
|
|
estimators = self.base_estimator_.estimators_
|
|
n_dims = estimators[0].kernel_.n_dims
|
|
if theta.shape[0] == n_dims: # use same theta for all sub-kernels
|
|
return np.mean(
|
|
[estimator.log_marginal_likelihood(
|
|
theta, clone_kernel=clone_kernel)
|
|
for i, estimator in enumerate(estimators)])
|
|
elif theta.shape[0] == n_dims * self.classes_.shape[0]:
|
|
# theta for compound kernel
|
|
return np.mean(
|
|
[estimator.log_marginal_likelihood(
|
|
theta[n_dims * i:n_dims * (i + 1)],
|
|
clone_kernel=clone_kernel)
|
|
for i, estimator in enumerate(estimators)])
|
|
else:
|
|
raise ValueError("Shape of theta must be either %d or %d. "
|
|
"Obtained theta with shape %d."
|
|
% (n_dims, n_dims * self.classes_.shape[0],
|
|
theta.shape[0]))
|