Uploaded Test files
This commit is contained in:
parent
f584ad9d97
commit
2e81cb7d99
16627 changed files with 2065359 additions and 102444 deletions
674
venv/Lib/site-packages/sklearn/linear_model/_bayes.py
Normal file
674
venv/Lib/site-packages/sklearn/linear_model/_bayes.py
Normal file
|
@ -0,0 +1,674 @@
|
|||
"""
|
||||
Various bayesian regression
|
||||
"""
|
||||
|
||||
# Authors: V. Michel, F. Pedregosa, A. Gramfort
|
||||
# License: BSD 3 clause
|
||||
|
||||
from math import log
|
||||
import numpy as np
|
||||
from scipy import linalg
|
||||
|
||||
from ._base import LinearModel, _rescale_data
|
||||
from ..base import RegressorMixin
|
||||
from ..utils.extmath import fast_logdet
|
||||
from scipy.linalg import pinvh
|
||||
from ..utils.validation import _check_sample_weight
|
||||
from ..utils.validation import _deprecate_positional_args
|
||||
|
||||
|
||||
###############################################################################
|
||||
# BayesianRidge regression
|
||||
|
||||
class BayesianRidge(RegressorMixin, LinearModel):
    """Bayesian ridge regression.

    Fit a Bayesian ridge model. See the Notes section for details on this
    implementation and the optimization of the regularization parameters
    lambda (precision of the weights) and alpha (precision of the noise).

    Read more in the :ref:`User Guide <bayesian_regression>`.

    Parameters
    ----------
    n_iter : int, default=300
        Maximum number of iterations. Should be greater than or equal to 1.

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    alpha_init : float, default=None
        Initial value for alpha (precision of the noise).
        If not set, alpha_init is 1/Var(y).

            .. versionadded:: 0.22

    lambda_init : float, default=None
        Initial value for lambda (precision of the weights).
        If not set, lambda_init is 1.

            .. versionadded:: 0.22

    compute_score : bool, default=False
        If True, compute the log marginal likelihood at each iteration of the
        optimization.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model.
        The intercept is not treated as a probabilistic parameter
        and thus has no associated variance. If set
        to False, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.


    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    alpha_ : float
       Estimated precision of the noise.

    lambda_ : float
       Estimated precision of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        Estimated variance-covariance matrix of the weights

    scores_ : array-like of shape (n_iter_+1,)
        If compute_score is True, value of the log marginal likelihood (to be
        maximized) at each iteration of the optimization. The array starts
        with the value of the log marginal likelihood obtained for the initial
        values of alpha and lambda and ends with the value obtained for the
        estimated alpha and lambda.

    n_iter_ : int
        The actual number of iterations to reach the stopping criterion.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.BayesianRidge()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    BayesianRidge()
    >>> clf.predict([[1, 1]])
    array([1.])

    Notes
    -----
    There exist several strategies to perform Bayesian ridge regression. This
    implementation is based on the algorithm described in Appendix A of
    (Tipping, 2001) where updates of the regularization parameters are done as
    suggested in (MacKay, 1992). Note that according to A New
    View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
    update rules do not guarantee that the marginal likelihood is increasing
    between two consecutive iterations of the optimization.

    References
    ----------
    D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
    Vol. 4, No. 3, 1992.

    M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
    Journal of Machine Learning Research, Vol. 1, 2001.
    """
    @_deprecate_positional_args
    def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
                 lambda_1=1.e-6, lambda_2=1.e-6, alpha_init=None,
                 lambda_init=None, compute_score=False, fit_intercept=True,
                 normalize=False, copy_X=True, verbose=False):
        self.n_iter = n_iter
        self.tol = tol
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.alpha_init = alpha_init
        self.lambda_init = lambda_init
        self.compute_score = compute_score
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.copy_X = copy_X
        self.verbose = verbose

    def fit(self, X, y, sample_weight=None):
        """Fit the model

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data
        y : ndarray of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary

        sample_weight : ndarray of shape (n_samples,), default=None
            Individual weights for each sample

            .. versionadded:: 0.20
               parameter *sample_weight* support to BayesianRidge.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If ``n_iter`` is smaller than 1.
        """

        if self.n_iter < 1:
            raise ValueError('n_iter should be greater than or equal to 1.'
                             ' Got {!r}.'.format(self.n_iter))

        X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X,
                                                 dtype=X.dtype)

        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
            X, y, self.fit_intercept, self.normalize, self.copy_X,
            sample_weight=sample_weight)

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y = _rescale_data(X, y, sample_weight)

        # Kept on the estimator because `predict` re-applies them to new
        # data when `normalize` is True and `return_std` is requested.
        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_
        n_samples, n_features = X.shape

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        alpha_ = self.alpha_init
        lambda_ = self.lambda_init
        if alpha_ is None:
            # Add `eps` in the denominator to omit division by zero if
            # `np.var(y)` is zero
            alpha_ = 1. / (np.var(y) + eps)
        if lambda_ is None:
            lambda_ = 1.

        verbose = self.verbose
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2

        self.scores_ = list()
        coef_old_ = None

        # Quantities that do not depend on alpha_/lambda_ are computed once:
        # the thin SVD of X and the squared singular values.
        XT_y = np.dot(X.T, y)
        U, S, Vh = linalg.svd(X, full_matrices=False)
        eigen_vals_ = S ** 2

        # Convergence loop of the bayesian ridge regression
        for iter_ in range(self.n_iter):

            # update posterior mean coef_ based on alpha_ and lambda_ and
            # compute corresponding rmse
            coef_, rmse_ = self._update_coef_(X, y, n_samples, n_features,
                                              XT_y, U, Vh, eigen_vals_,
                                              alpha_, lambda_)
            if self.compute_score:
                # compute the log marginal likelihood
                s = self._log_marginal_likelihood(n_samples, n_features,
                                                  eigen_vals_,
                                                  alpha_, lambda_,
                                                  coef_, rmse_)
                self.scores_.append(s)

            # Update alpha and lambda according to (MacKay, 1992)
            gamma_ = np.sum((alpha_ * eigen_vals_) /
                            (lambda_ + alpha_ * eigen_vals_))
            lambda_ = ((gamma_ + 2 * lambda_1) /
                       (np.sum(coef_ ** 2) + 2 * lambda_2))
            alpha_ = ((n_samples - gamma_ + 2 * alpha_1) /
                      (rmse_ + 2 * alpha_2))

            # Check for convergence
            if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Convergence after ", str(iter_), " iterations")
                break
            coef_old_ = np.copy(coef_)

        self.n_iter_ = iter_ + 1

        # return regularization parameters and corresponding posterior mean,
        # log marginal likelihood and posterior covariance
        self.alpha_ = alpha_
        self.lambda_ = lambda_
        self.coef_, rmse_ = self._update_coef_(X, y, n_samples, n_features,
                                               XT_y, U, Vh, eigen_vals_,
                                               alpha_, lambda_)
        if self.compute_score:
            # compute the log marginal likelihood for the final alpha_ and
            # lambda_. The posterior mean just recomputed (self.coef_) must
            # be used here: `rmse_` above belongs to it, whereas the loop
            # variable `coef_` was obtained with the previous alpha_/lambda_.
            s = self._log_marginal_likelihood(n_samples, n_features,
                                              eigen_vals_,
                                              alpha_, lambda_,
                                              self.coef_, rmse_)
            self.scores_.append(s)
            self.scores_ = np.array(self.scores_)

        # posterior covariance is given by 1/alpha_ * scaled_sigma_
        scaled_sigma_ = np.dot(Vh.T,
                               Vh / (eigen_vals_ +
                                     lambda_ / alpha_)[:, np.newaxis])
        self.sigma_ = (1. / alpha_) * scaled_sigma_

        self._set_intercept(X_offset_, y_offset_, X_scale_)

        return self

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, also its
        standard deviation can be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
            Only returned when ``return_std`` is true.
        """
        y_mean = self._decision_function(X)
        # Truthiness test rather than `is False`, so that falsy values such
        # as 0 or numpy.False_ also select the mean-only return.
        if not return_std:
            return y_mean

        if self.normalize:
            # Apply the transformation stored by `fit`.
            X = (X - self.X_offset_) / self.X_scale_
        # Row-wise quadratic form x^T sigma_ x, plus the noise variance.
        sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
        y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
        return y_mean, y_std

    def _update_coef_(self, X, y, n_samples, n_features, XT_y, U, Vh,
                      eigen_vals_, alpha_, lambda_):
        """Update posterior mean and compute corresponding rmse.

        Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
        scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                         + np.dot(X.T, X))^-1

        ``U`` and ``Vh`` are the factors of the thin SVD of ``X`` and
        ``eigen_vals_`` the squared singular values, as computed in ``fit``;
        ``XT_y`` is ``np.dot(X.T, y)``.

        Returns
        -------
        coef_ : ndarray
            Posterior mean of the weights for the given alpha_ and lambda_.

        rmse_ : float
            Despite its name, the sum of squared residuals
            ``np.sum((y - np.dot(X, coef_)) ** 2)``.
        """

        # Two algebraically equivalent expressions built from the SVD; the
        # branch taken depends on whether there are more samples or features.
        if n_samples > n_features:
            coef_ = np.dot(Vh.T,
                           Vh / (eigen_vals_ +
                                 lambda_ / alpha_)[:, np.newaxis])
            coef_ = np.dot(coef_, XT_y)
        else:
            coef_ = np.dot(X.T, np.dot(
                U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T))
            coef_ = np.dot(coef_, y)

        rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)

        return coef_, rmse_

    def _log_marginal_likelihood(self, n_samples, n_features, eigen_vals,
                                 alpha_, lambda_, coef, rmse):
        """Log marginal likelihood.

        ``coef`` and ``rmse`` are expected to be the pair returned by
        ``_update_coef_`` for the same ``alpha_`` and ``lambda_``. The
        Gamma-prior hyper-parameters are read from the estimator.
        """
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2

        # compute the log of the determinant of the posterior covariance.
        # posterior covariance is given by
        # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
        if n_samples > n_features:
            logdet_sigma = - np.sum(np.log(lambda_ + alpha_ * eigen_vals))
        else:
            # Only n_samples squared singular values are available here; the
            # remaining n_features - n_samples entries keep the bare lambda_.
            logdet_sigma = np.full(n_features, lambda_,
                                   dtype=np.array(lambda_).dtype)
            logdet_sigma[:n_samples] += alpha_ * eigen_vals
            logdet_sigma = - np.sum(np.log(logdet_sigma))

        score = lambda_1 * log(lambda_) - lambda_2 * lambda_
        score += alpha_1 * log(alpha_) - alpha_2 * alpha_
        score += 0.5 * (n_features * log(lambda_) +
                        n_samples * log(alpha_) -
                        alpha_ * rmse -
                        lambda_ * np.sum(coef ** 2) +
                        logdet_sigma -
                        n_samples * log(2 * np.pi))

        return score
|
||||
|
||||
|
||||
###############################################################################
|
||||
# ARD (Automatic Relevance Determination) regression
|
||||
|
||||
|
||||
class ARDRegression(RegressorMixin, LinearModel):
    """Bayesian ARD regression.

    Fit the weights of a regression model, using an ARD prior. The weights of
    the regression model are assumed to be in Gaussian distributions.
    Also estimate the parameters lambda (precisions of the distributions of the
    weights) and alpha (precision of the distribution of the noise).
    The estimation is done by an iterative procedure (Evidence Maximization)

    Read more in the :ref:`User Guide <bayesian_regression>`.

    Parameters
    ----------
    n_iter : int, default=300
        Maximum number of iterations.

    tol : float, default=1e-3
        Stop the algorithm if w has converged.

    alpha_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the alpha parameter.

    alpha_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the alpha parameter.

    lambda_1 : float, default=1e-6
        Hyper-parameter : shape parameter for the Gamma distribution prior
        over the lambda parameter.

    lambda_2 : float, default=1e-6
        Hyper-parameter : inverse scale parameter (rate parameter) for the
        Gamma distribution prior over the lambda parameter.

    compute_score : bool, default=False
        If True, compute the objective function at each step of the model.

    threshold_lambda : float, default=10 000
        threshold for removing (pruning) weights with high precision from
        the computation.

    fit_intercept : bool, default=True
        whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    normalize : bool, default=False
        This parameter is ignored when ``fit_intercept`` is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm.
        If you wish to standardize, please use
        :class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
        on an estimator with ``normalize=False``.

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    verbose : bool, default=False
        Verbose mode when fitting the model.

    Attributes
    ----------
    coef_ : array-like of shape (n_features,)
        Coefficients of the regression model (mean of distribution)

    alpha_ : float
       estimated precision of the noise.

    lambda_ : array-like of shape (n_features,)
       estimated precisions of the weights.

    sigma_ : array-like of shape (n_features, n_features)
        estimated variance-covariance matrix of the weights. Only the
        features that were not pruned (``lambda_ < threshold_lambda``) are
        represented, so the matrix can be smaller than
        (n_features, n_features).

    scores_ : list of float
        if computed, value of the objective function (to be maximized) at
        each iteration. Empty when ``compute_score`` is False.

    intercept_ : float
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    X_offset_ : float
        Offset returned by the preprocessing step of ``fit``; re-applied to
        new data by ``predict`` when ``normalize=True`` and
        ``return_std=True``.

    X_scale_ : float
        Scale returned by the preprocessing step of ``fit``; re-applied to
        new data by ``predict`` when ``normalize=True`` and
        ``return_std=True``.

    Examples
    --------
    >>> from sklearn import linear_model
    >>> clf = linear_model.ARDRegression()
    >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
    ARDRegression()
    >>> clf.predict([[1, 1]])
    array([1.])

    Notes
    -----
    For an example, see :ref:`examples/linear_model/plot_ard.py
    <sphx_glr_auto_examples_linear_model_plot_ard.py>`.

    References
    ----------
    D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
    competition, ASHRAE Transactions, 1994.

    R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
    http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
    Their beta is our ``self.alpha_``
    Their alpha is our ``self.lambda_``
    ARD is a little different than the slide: only dimensions/features for
    which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
    discarded.
    """
    @_deprecate_positional_args
    def __init__(self, *, n_iter=300, tol=1.e-3, alpha_1=1.e-6, alpha_2=1.e-6,
                 lambda_1=1.e-6, lambda_2=1.e-6, compute_score=False,
                 threshold_lambda=1.e+4, fit_intercept=True, normalize=False,
                 copy_X=True, verbose=False):
        self.n_iter = n_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.compute_score = compute_score
        self.threshold_lambda = threshold_lambda
        self.copy_X = copy_X
        self.verbose = verbose

    def fit(self, X, y):
        """Fit the ARDRegression model according to the given training data
        and parameters.

        Iterative procedure to maximize the evidence

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where n_samples in the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary

        Returns
        -------
        self : returns an instance of self.
        """
        X, y = self._validate_data(X, y, dtype=np.float64, y_numeric=True,
                                   ensure_min_samples=2)

        n_samples, n_features = X.shape
        coef_ = np.zeros(n_features)

        X, y, X_offset_, y_offset_, X_scale_ = self._preprocess_data(
            X, y, self.fit_intercept, self.normalize, self.copy_X)

        # `predict(..., return_std=True)` reads these attributes when
        # `normalize` is True; without storing them here that call would
        # fail with an AttributeError.
        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_

        # Launch the convergence loop
        keep_lambda = np.ones(n_features, dtype=bool)

        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        verbose = self.verbose

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to omit division by zero if `np.var(y)`
        # is zero
        alpha_ = 1. / (np.var(y) + eps)
        lambda_ = np.ones(n_features)

        self.scores_ = list()
        coef_old_ = None

        def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
            # Only the kept (non-pruned) coefficients are refreshed; the
            # array is modified in place and returned.
            coef_[keep_lambda] = alpha_ * np.dot(
                sigma_, np.dot(X[:, keep_lambda].T, y))
            return coef_

        # Pick the formulation that inverts the smaller matrix.
        update_sigma = (self._update_sigma if n_samples >= n_features
                        else self._update_sigma_woodbury)
        # Iterative procedure of ARDRegression
        for iter_ in range(self.n_iter):
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)

            # Update alpha and lambda
            rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
            gamma_ = 1. - lambda_[keep_lambda] * np.diag(sigma_)
            lambda_[keep_lambda] = ((gamma_ + 2. * lambda_1) /
                                    ((coef_[keep_lambda]) ** 2 +
                                     2. * lambda_2))
            alpha_ = ((n_samples - gamma_.sum() + 2. * alpha_1) /
                      (rmse_ + 2. * alpha_2))

            # Prune the weights with a precision over a threshold
            keep_lambda = lambda_ < self.threshold_lambda
            coef_[~keep_lambda] = 0

            # Compute the objective function
            if self.compute_score:
                s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                s += 0.5 * (fast_logdet(sigma_) + n_samples * log(alpha_) +
                            np.sum(np.log(lambda_)))
                s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_ ** 2).sum())
                self.scores_.append(s)

            # Check for convergence
            if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Converged after %s iterations" % iter_)
                break
            coef_old_ = np.copy(coef_)

            # Nothing left to estimate once every weight has been pruned.
            if not keep_lambda.any():
                break

        if keep_lambda.any():
            # update sigma and mu using updated params from the last iteration
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
        else:
            sigma_ = np.array([]).reshape(0, 0)

        self.coef_ = coef_
        self.alpha_ = alpha_
        self.sigma_ = sigma_
        self.lambda_ = lambda_
        self._set_intercept(X_offset_, y_offset_, X_scale_)
        return self

    def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
        """Return the covariance of the kept weights via the Woodbury formula.

        Only the columns of ``X`` and entries of ``lambda_`` selected by the
        boolean mask ``keep_lambda`` take part in the computation.
        """
        # See slides as referenced in the docstring note
        # this function is used when n_samples < n_features and will invert
        # a matrix of shape (n_samples, n_samples) making use of the
        # woodbury formula:
        # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
        n_samples = X.shape[0]
        X_keep = X[:, keep_lambda]
        inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
        sigma_ = pinvh(
            np.eye(n_samples) / alpha_ + np.dot(X_keep * inv_lambda, X_keep.T)
        )
        sigma_ = np.dot(sigma_, X_keep * inv_lambda)
        sigma_ = - np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
        # Add 1 / lambda_ on the diagonal to complete the identity.
        sigma_[np.diag_indices(sigma_.shape[1])] += 1. / lambda_[keep_lambda]
        return sigma_

    def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
        """Return the covariance of the kept weights by direct inversion.

        Only the columns of ``X`` and entries of ``lambda_`` selected by the
        boolean mask ``keep_lambda`` take part in the computation.
        """
        # See slides as referenced in the docstring note
        # this function is used when n_samples >= n_features and will
        # invert a matrix of shape (n_features, n_features)
        X_keep = X[:, keep_lambda]
        gram = np.dot(X_keep.T, X_keep)
        eye = np.eye(gram.shape[0])
        # Broadcasting against the identity places lambda_ on the diagonal.
        sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
        sigma_ = pinvh(sigma_inv)
        return sigma_

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, also its
        standard deviation can be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
            Only returned when ``return_std`` is true.
        """
        y_mean = self._decision_function(X)
        # Truthiness test rather than `is False`, so that falsy values such
        # as 0 or numpy.False_ also select the mean-only return.
        if not return_std:
            return y_mean

        if isinstance(X, (list, tuple)):
            # Plain Python sequences do not support the 2-D column
            # selection below.
            X = np.asarray(X)
        if self.normalize:
            # Apply the transformation stored by `fit`.
            X = (X - self.X_offset_) / self.X_scale_
        # sigma_ only covers the features that were not pruned during fit.
        X = X[:, self.lambda_ < self.threshold_lambda]
        # Row-wise quadratic form x^T sigma_ x, plus the noise variance.
        sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
        y_std = np.sqrt(sigmas_squared_data + (1. / self.alpha_))
        return y_mean, y_std
|
Loading…
Add table
Add a link
Reference in a new issue