Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
800
venv/Lib/site-packages/sklearn/gaussian_process/_gpc.py
Normal file
@@ -0,0 +1,800 @@
"""Gaussian processes classification."""
|
||||
|
||||
# Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
|
||||
#
|
||||
# License: BSD 3 clause
|
||||
|
||||
from operator import itemgetter
|
||||
|
||||
import numpy as np
|
||||
from scipy.linalg import cholesky, cho_solve, solve
|
||||
import scipy.optimize
|
||||
from scipy.special import erf, expit
|
||||
|
||||
from ..base import BaseEstimator, ClassifierMixin, clone
|
||||
from .kernels \
|
||||
import RBF, CompoundKernel, ConstantKernel as C
|
||||
from ..utils.validation import check_is_fitted, check_array
|
||||
from ..utils import check_random_state
|
||||
from ..utils.optimize import _check_optimize_result
|
||||
from ..preprocessing import LabelEncoder
|
||||
from ..multiclass import OneVsRestClassifier, OneVsOneClassifier
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
# Values required for approximating the logistic sigmoid by
|
||||
# error functions. coefs are obtained via:
|
||||
# x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
|
||||
# b = logistic(x)
|
||||
# A = (erf(np.dot(x, self.lambdas)) + 1) / 2
|
||||
# coefs = lstsq(A, b)[0]
|
||||
LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
|
||||
COEFS = np.array([-1854.8214151, 3516.89893646, 221.29346712,
|
||||
128.12323805, -2010.49422654])[:, np.newaxis]
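

# NOTE: illustrative addition, not part of the upstream scikit-learn module.
# A minimal sanity check, assuming the least-squares fit described in the
# comment above: the logistic sigmoid is approximated by a five-term linear
# combination of error functions,
#     expit(x) ~= sum_i COEFS[i] * (erf(LAMBDAS[i] * x) + 1) / 2.
# The helper below returns the maximum absolute error of that approximation
# on a grid; it is never called by the estimators in this module.
def _check_sigmoid_approximation(x=None):
    """Return the max absolute error of the erf-based logistic sigmoid fit."""
    if x is None:
        x = np.linspace(-8.0, 8.0, 161)
    # Broadcast LAMBDAS (5, 1) against x (1, n) to evaluate all five terms.
    approx = (COEFS * (erf(LAMBDAS * x[np.newaxis, :]) + 1) / 2).sum(axis=0)
    return np.max(np.abs(approx - expit(x)))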


class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
    """Binary Gaussian process classification based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function.

    .. versionadded:: 0.18

    Parameters
    ----------
    kernel : kernel instance, default=None
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        By default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, default=0
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, default=100
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, default=False
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization. See :term:`the Glossary
        <warm_start>`.

    copy_X_train : bool, default=True
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : int or RandomState, default=None
        Determines random number generation used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    X_train_ : array-like of shape (n_samples, n_features) or list of object
        Feature vectors or other representations of training data (also
        required for prediction).

    y_train_ : array-like of shape (n_samples,)
        Target values in training data (also required for prediction)

    classes_ : array-like of shape (n_classes,)
        Unique class labels.

    kernel_ : kernel instance
        The kernel used for prediction. The structure of the kernel is the
        same as the one passed as parameter but with optimized hyperparameters

    L_ : array-like of shape (n_samples, n_samples)
        Lower-triangular Cholesky decomposition of the kernel in X_train_

    pi_ : array-like of shape (n_samples,)
        The probabilities of the positive class for the training points
        X_train_

    W_sr_ : array-like of shape (n_samples,)
        Square root of W, the Hessian of log-likelihood of the latent function
        values for the observed labels. Since W is diagonal, only the diagonal
        of sqrt(W) is stored.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    """
    @_deprecate_positional_args
    def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Feature vectors or other representations of training data.

        y : array-like of shape (n_samples,)
            Target values, must be binary

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") \
                * RBF(1.0, length_scale_bounds="fixed")
        else:
            self.kernel_ = clone(self.kernel)

        self.rng = check_random_state(self.random_state)

        self.X_train_ = np.copy(X) if self.copy_X_train else X

        # Encode class labels and check that it is a binary classification
        # problem
        label_encoder = LabelEncoder()
        self.y_train_ = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
        if self.classes_.size > 2:
            raise ValueError("%s supports only binary classification. "
                             "y contains classes %s"
                             % (self.__class__.__name__, self.classes_))
        elif self.classes_.size == 1:
            raise ValueError("{0:s} requires 2 classes; got {1:d} class"
                             .format(self.__class__.__name__,
                                     self.classes_.size))

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True, clone_kernel=False)
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta,
                                                         clone_kernel=False)

            # First optimize starting from theta specified in kernel
            optima = [self._constrained_optimization(obj_func,
                                                     self.kernel_.theta,
                                                     self.kernel_.bounds)]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite.")
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = np.exp(self.rng.uniform(bounds[:, 0],
                                                            bounds[:, 1]))
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial,
                                                       bounds))
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = \
                self.log_marginal_likelihood(self.kernel_.theta)

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)

        _, (self.pi_, self.W_sr_, self.L_, _, _) = \
            self._posterior_mode(K, return_temporaries=True)

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self)

        # As discussed on Section 3.4.2 of GPML, for making hard binary
        # decisions, it is enough to compute the MAP of the posterior and
        # pass it through the link function
        K_star = self.kernel_(self.X_train_, X)  # K_star = k(x_star)
        f_star = K_star.T.dot(self.y_train_ - self.pi_)  # Algorithm 3.2, Line 4

        return np.where(f_star > 0, self.classes_[1], self.classes_[0])

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute ``classes_``.
        """
        check_is_fitted(self)

        # Based on Algorithm 3.2 of GPML
        K_star = self.kernel_(self.X_train_, X)  # K_star = k(x_star)
        f_star = K_star.T.dot(self.y_train_ - self.pi_)  # Line 4
        v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)  # Line 5
        # Line 6 (compute np.diag(v.T.dot(v)) via einsum)
        var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)

        # Line 7:
        # Approximate \int log(z) * N(z | f_star, var_f_star)
        # Approximation is due to Williams & Barber, "Bayesian Classification
        # with Gaussian Processes", Appendix A: Approximate the logistic
        # sigmoid by a linear combination of 5 error functions.
        # For information on how this integral can be computed see
        # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
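        # Reference note (added for clarity; states the identity the code
        # below relies on):
        #     \int erf(lam * z) N(z | f, var) dz
        #         = erf(lam * f / sqrt(1 + 2 * lam**2 * var))
        # With alpha = 1 / (2 * var) and gamma = lam * f, the right-hand side
        # equals erf(gamma * sqrt(alpha / (alpha + lam**2))), so each entry of
        # `integrals` below works out to half of this Gaussian-smoothed error
        # function.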
        alpha = 1 / (2 * var_f_star)
        gamma = LAMBDAS * f_star
        integrals = np.sqrt(np.pi / alpha) \
            * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2))) \
            / (2 * np.sqrt(var_f_star * 2 * np.pi))
        pi_star = (COEFS * integrals).sum(axis=0) + .5 * COEFS.sum()

        return np.vstack((1 - pi_star, pi_star)).T

    def log_marginal_likelihood(self, theta=None, eval_gradient=False,
                                clone_kernel=True):
        """Returns log-marginal likelihood of theta for training data.

        Parameters
        ----------
        theta : array-like of shape (n_kernel_params,), default=None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default=False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. If True, theta must not be None.

        clone_kernel : bool, default=True
            If True, the kernel attribute is copied. If False, the kernel
            attribute is modified, but may result in a performance improvement.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : ndarray of shape (n_kernel_params,), \
                optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when `eval_gradient` is True.
        """
        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        if clone_kernel:
            kernel = self.kernel_.clone_with_theta(theta)
        else:
            kernel = self.kernel_
            kernel.theta = theta

        if eval_gradient:
            K, K_gradient = kernel(self.X_train_, eval_gradient=True)
        else:
            K = kernel(self.X_train_)

        # Compute log-marginal-likelihood Z and also store some temporaries
        # which can be reused for computing Z's gradient
        Z, (pi, W_sr, L, b, a) = \
            self._posterior_mode(K, return_temporaries=True)

        if not eval_gradient:
            return Z

        # Compute gradient based on Algorithm 5.1 of GPML
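        # Added note: the gradient combines an explicit and an implicit term,
        #     s_1 = 1/2 * a^T (dK/dtheta_j) a - 1/2 * tr(R dK/dtheta_j)
        #     s_3 = (I - K R) (dK/dtheta_j) (y - pi)
        #     dZ/dtheta_j = s_1 + s_2^T s_3
        # with R = W^(1/2) B^(-1) W^(1/2); s_2 (computed once below) carries
        # the third derivative of the log-likelihood at the mode.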
        d_Z = np.empty(theta.shape[0])
        # XXX: Get rid of the np.diag() in the next line
        R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))  # Line 7
        C = solve(L, W_sr[:, np.newaxis] * K)  # Line 8
        # Line 9: (use einsum to compute np.diag(C.T.dot(C)))
        s_2 = -0.5 * (np.diag(K) - np.einsum('ij, ij -> j', C, C)) \
            * (pi * (1 - pi) * (1 - 2 * pi))  # third derivative

        for j in range(d_Z.shape[0]):
            C = K_gradient[:, :, j]  # Line 11
            # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
            s_1 = .5 * a.T.dot(C).dot(a) - .5 * R.T.ravel().dot(C.ravel())

            b = C.dot(self.y_train_ - pi)  # Line 13
            s_3 = b - K.dot(R.dot(b))  # Line 14

            d_Z[j] = s_1 + s_2.T.dot(s_3)  # Line 15

        return Z, d_Z

    def _posterior_mode(self, K, return_temporaries=False):
        """Mode-finding for binary Laplace GPC and fixed kernel.

        This approximates the posterior of the latent function values for given
        inputs and target observations with a Gaussian approximation and uses
        Newton's iteration to find the mode of this approximation.
        """
        # Based on Algorithm 3.1 of GPML
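        # Added summary of the update performed in each Newton step:
        #     W     = diag(pi * (1 - pi))
        #     B     = I + W^(1/2) K W^(1/2)
        #     b     = W f + (y - pi)
        #     a     = b - W^(1/2) B^(-1) W^(1/2) K b
        #     f_new = K a
        # i.e. the standard mode-finding iteration for the Laplace
        # approximation with a logistic likelihood.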

        # If warm_start is enabled, we reuse the last solution for the
        # posterior mode as initialization; otherwise, we initialize with 0
        if self.warm_start and hasattr(self, "f_cached") \
                and self.f_cached.shape == self.y_train_.shape:
            f = self.f_cached
        else:
            f = np.zeros_like(self.y_train_, dtype=np.float64)

        # Use Newton's iteration method to find mode of Laplace approximation
        log_marginal_likelihood = -np.inf
        for _ in range(self.max_iter_predict):
            # Line 4
            pi = expit(f)
            W = pi * (1 - pi)
            # Line 5
            W_sr = np.sqrt(W)
            W_sr_K = W_sr[:, np.newaxis] * K
            B = np.eye(W.shape[0]) + W_sr_K * W_sr
            L = cholesky(B, lower=True)
            # Line 6
            b = W * f + (self.y_train_ - pi)
            # Line 7
            a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
            # Line 8
            f = K.dot(a)

            # Line 10: Compute log marginal likelihood in loop and use as
            #          convergence criterion
            lml = -0.5 * a.T.dot(f) \
                - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum() \
                - np.log(np.diag(L)).sum()
            # Check if we have converged (log marginal likelihood does
            # not decrease)
            # XXX: more complex convergence criterion
            if lml - log_marginal_likelihood < 1e-10:
                break
            log_marginal_likelihood = lml

        self.f_cached = f  # Remember solution for later warm-starts
        if return_temporaries:
            return log_marginal_likelihood, (pi, W_sr, L, b, a)
        else:
            return log_marginal_likelihood

    def _constrained_optimization(self, obj_func, initial_theta, bounds):
        if self.optimizer == "fmin_l_bfgs_b":
            opt_res = scipy.optimize.minimize(
                obj_func, initial_theta, method="L-BFGS-B", jac=True,
                bounds=bounds)
            _check_optimize_result("lbfgs", opt_res)
            theta_opt, func_min = opt_res.x, opt_res.fun
        elif callable(self.optimizer):
            theta_opt, func_min = \
                self.optimizer(obj_func, initial_theta, bounds=bounds)
        else:
            raise ValueError("Unknown optimizer %s." % self.optimizer)

        return theta_opt, func_min


class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Read more in the :ref:`User Guide <gaussian_process>`.

    Parameters
    ----------
    kernel : kernel instance, default=None
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        By default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, default=0
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, default=100
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, default=False
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization. See :term:`the Glossary
        <warm_start>`.

    copy_X_train : bool, default=True
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : int or RandomState, default=None
        Determines random number generation used to initialize the centers.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'
        Specifies how multi-class classification problems are handled.
        Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In 'one_vs_one', one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that 'one_vs_one' does not support predicting probability
        estimates.

    n_jobs : int, default=None
        The number of jobs to use for the computation.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    kernel_ : kernel instance
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like of shape (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.gaussian_process import GaussianProcessClassifier
    >>> from sklearn.gaussian_process.kernels import RBF
    >>> X, y = load_iris(return_X_y=True)
    >>> kernel = 1.0 * RBF(1.0)
    >>> gpc = GaussianProcessClassifier(kernel=kernel,
    ...         random_state=0).fit(X, y)
    >>> gpc.score(X, y)
    0.9866...
    >>> gpc.predict_proba(X[:2,:])
    array([[0.83548752, 0.03228706, 0.13222543],
           [0.79064206, 0.06525643, 0.14410151]])

    .. versionadded:: 0.18
    """
    @_deprecate_positional_args
    def __init__(self, kernel=None, *, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=None):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Feature vectors or other representations of training data.

        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None or self.kernel.requires_vector_input:
            X, y = self._validate_data(X, y, multi_output=False,
                                       ensure_2d=True, dtype="numeric")
        else:
            X, y = self._validate_data(X, y, multi_output=False,
                                       ensure_2d=False, dtype=None)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            kernel=self.kernel,
            optimizer=self.optimizer,
            n_restarts_optimizer=self.n_restarts_optimizer,
            max_iter_predict=self.max_iter_predict,
            warm_start=self.warm_start,
            copy_X_train=self.copy_X_train,
            random_state=self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes; got %d class (only class %s "
                             "is present)"
                             % (self.n_classes_, self.classes_[0]))
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self)

        if self.kernel is None or self.kernel.requires_vector_input:
            X = check_array(X, ensure_2d=True, dtype="numeric")
        else:
            X = check_array(X, ensure_2d=False, dtype=None)

        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")

        if self.kernel is None or self.kernel.requires_vector_input:
            X = check_array(X, ensure_2d=True, dtype="numeric")
        else:
            X = check_array(X, ensure_2d=False, dtype=None)

        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False,
                                clone_kernel=True):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers is returned.

        Parameters
        ----------
        theta : array-like of shape (n_kernel_params,), default=None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default=False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        clone_kernel : bool, default=True
            If True, the kernel attribute is copied. If False, the kernel
            attribute is modified, but may result in a performance improvement.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when `eval_gradient` is True.
        """
        check_is_fitted(self)

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient, clone_kernel=clone_kernel)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta, clone_kernel=clone_kernel)
                     for i, estimator in enumerate(estimators)])
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)],
                        clone_kernel=clone_kernel)
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * self.classes_.shape[0],
                                    theta.shape[0]))