1838 lines
68 KiB
Python
1838 lines
68 KiB
Python
"""
|
|
Least Angle Regression algorithm. See the documentation on the
|
|
Generalized Linear Model for a complete discussion.
|
|
"""
|
|
# Author: Fabian Pedregosa <fabian.pedregosa@inria.fr>
|
|
# Alexandre Gramfort <alexandre.gramfort@inria.fr>
|
|
# Gael Varoquaux
|
|
#
|
|
# License: BSD 3 clause
|
|
|
|
from math import log
|
|
import sys
|
|
import warnings
|
|
|
|
import numpy as np
|
|
from scipy import linalg, interpolate
|
|
from scipy.linalg.lapack import get_lapack_funcs
|
|
from joblib import Parallel, delayed
|
|
|
|
from ._base import LinearModel
|
|
from ..base import RegressorMixin, MultiOutputMixin
|
|
# mypy error: Module 'sklearn.utils' has no attribute 'arrayfuncs'
|
|
from ..utils import arrayfuncs, as_float_array # type: ignore
|
|
from ..utils import check_random_state
|
|
from ..model_selection import check_cv
|
|
from ..exceptions import ConvergenceWarning
|
|
from ..utils.validation import _deprecate_positional_args
|
|
|
|
SOLVE_TRIANGULAR_ARGS = {'check_finite': False}
|
|
|
|
|
|
@_deprecate_positional_args
|
|
def lars_path(X, y, Xy=None, *, Gram=None, max_iter=500, alpha_min=0,
|
|
method='lar', copy_X=True, eps=np.finfo(np.float).eps,
|
|
copy_Gram=True, verbose=0, return_path=True,
|
|
return_n_iter=False, positive=False):
|
|
"""Compute Least Angle Regression or Lasso path using LARS algorithm [1]
|
|
|
|
The optimization objective for the case method='lasso' is::
|
|
|
|
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
|
|
|
|
in the case of method='lars', the objective function is only known in
|
|
the form of an implicit equation (see discussion in [1])
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
X : None or array-like of shape (n_samples, n_features)
|
|
Input data. Note that if X is None then the Gram matrix must be
|
|
specified, i.e., cannot be None or False.
|
|
|
|
y : None or array-like of shape (n_samples,)
|
|
Input targets.
|
|
|
|
Xy : array-like of shape (n_samples,) or (n_samples, n_targets), \
|
|
default=None
|
|
Xy = np.dot(X.T, y) that can be precomputed. It is useful
|
|
only when the Gram matrix is precomputed.
|
|
|
|
Gram : None, 'auto', array-like of shape (n_features, n_features), \
|
|
default=None
|
|
Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram
|
|
matrix is precomputed from the given X, if there are more samples
|
|
than features.
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform, set to infinity for no limit.
|
|
|
|
alpha_min : float, default=0
|
|
Minimum correlation along the path. It corresponds to the
|
|
regularization parameter alpha parameter in the Lasso.
|
|
|
|
method : {'lar', 'lasso'}, default='lar'
|
|
Specifies the returned model. Select ``'lar'`` for Least Angle
|
|
Regression, ``'lasso'`` for the Lasso.
|
|
|
|
copy_X : bool, default=True
|
|
If ``False``, ``X`` is overwritten.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. By default, ``np.finfo(np.float).eps`` is used.
|
|
|
|
copy_Gram : bool, default=True
|
|
If ``False``, ``Gram`` is overwritten.
|
|
|
|
verbose : int, default=0
|
|
Controls output verbosity.
|
|
|
|
return_path : bool, default=True
|
|
If ``return_path==True`` returns the entire path, else returns only the
|
|
last point of the path.
|
|
|
|
return_n_iter : bool, default=False
|
|
Whether to return the number of iterations.
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0.
|
|
This option is only allowed with method 'lasso'. Note that the model
|
|
coefficients will not converge to the ordinary-least-squares solution
|
|
for small values of alpha. Only coefficients up to the smallest alpha
|
|
value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
|
|
the stepwise Lars-Lasso algorithm are typically in congruence with the
|
|
solution of the coordinate descent lasso_path function.
|
|
|
|
Returns
|
|
-------
|
|
alphas : array-like of shape (n_alphas + 1,)
|
|
Maximum of covariances (in absolute value) at each iteration.
|
|
``n_alphas`` is either ``max_iter``, ``n_features`` or the
|
|
number of nodes in the path with ``alpha >= alpha_min``, whichever
|
|
is smaller.
|
|
|
|
active : array-like of shape (n_alphas,)
|
|
Indices of active variables at the end of the path.
|
|
|
|
coefs : array-like of shape (n_features, n_alphas + 1)
|
|
Coefficients along the path
|
|
|
|
n_iter : int
|
|
Number of iterations run. Returned only if return_n_iter is set
|
|
to True.
|
|
|
|
See also
|
|
--------
|
|
lars_path_gram
|
|
lasso_path
|
|
lasso_path_gram
|
|
LassoLars
|
|
Lars
|
|
LassoLarsCV
|
|
LarsCV
|
|
sklearn.decomposition.sparse_encode
|
|
|
|
References
|
|
----------
|
|
.. [1] "Least Angle Regression", Efron et al.
|
|
http://statweb.stanford.edu/~tibs/ftp/lars.pdf
|
|
|
|
.. [2] `Wikipedia entry on the Least-angle regression
|
|
<https://en.wikipedia.org/wiki/Least-angle_regression>`_
|
|
|
|
.. [3] `Wikipedia entry on the Lasso
|
|
<https://en.wikipedia.org/wiki/Lasso_(statistics)>`_
|
|
|
|
"""
|
|
if X is None and Gram is not None:
|
|
raise ValueError(
|
|
'X cannot be None if Gram is not None'
|
|
'Use lars_path_gram to avoid passing X and y.'
|
|
)
|
|
return _lars_path_solver(
|
|
X=X, y=y, Xy=Xy, Gram=Gram, n_samples=None, max_iter=max_iter,
|
|
alpha_min=alpha_min, method=method, copy_X=copy_X,
|
|
eps=eps, copy_Gram=copy_Gram, verbose=verbose, return_path=return_path,
|
|
return_n_iter=return_n_iter, positive=positive)
|
|
|
|
|
|
@_deprecate_positional_args
|
|
def lars_path_gram(Xy, Gram, *, n_samples, max_iter=500, alpha_min=0,
|
|
method='lar', copy_X=True, eps=np.finfo(np.float).eps,
|
|
copy_Gram=True, verbose=0, return_path=True,
|
|
return_n_iter=False, positive=False):
|
|
"""lars_path in the sufficient stats mode [1]
|
|
|
|
The optimization objective for the case method='lasso' is::
|
|
|
|
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
|
|
|
|
in the case of method='lars', the objective function is only known in
|
|
the form of an implicit equation (see discussion in [1])
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
Xy : array-like of shape (n_samples,) or (n_samples, n_targets)
|
|
Xy = np.dot(X.T, y).
|
|
|
|
Gram : array-like of shape (n_features, n_features)
|
|
Gram = np.dot(X.T * X).
|
|
|
|
n_samples : int or float
|
|
Equivalent size of sample.
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform, set to infinity for no limit.
|
|
|
|
alpha_min : float, default=0
|
|
Minimum correlation along the path. It corresponds to the
|
|
regularization parameter alpha parameter in the Lasso.
|
|
|
|
method : {'lar', 'lasso'}, default='lar'
|
|
Specifies the returned model. Select ``'lar'`` for Least Angle
|
|
Regression, ``'lasso'`` for the Lasso.
|
|
|
|
copy_X : bool, default=True
|
|
If ``False``, ``X`` is overwritten.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. By default, ``np.finfo(np.float).eps`` is used.
|
|
|
|
copy_Gram : bool, default=True
|
|
If ``False``, ``Gram`` is overwritten.
|
|
|
|
verbose : int, default=0
|
|
Controls output verbosity.
|
|
|
|
return_path : bool, default=True
|
|
If ``return_path==True`` returns the entire path, else returns only the
|
|
last point of the path.
|
|
|
|
return_n_iter : bool, default=False
|
|
Whether to return the number of iterations.
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0.
|
|
This option is only allowed with method 'lasso'. Note that the model
|
|
coefficients will not converge to the ordinary-least-squares solution
|
|
for small values of alpha. Only coefficients up to the smallest alpha
|
|
value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
|
|
the stepwise Lars-Lasso algorithm are typically in congruence with the
|
|
solution of the coordinate descent lasso_path function.
|
|
|
|
Returns
|
|
-------
|
|
alphas : array-like of shape (n_alphas + 1,)
|
|
Maximum of covariances (in absolute value) at each iteration.
|
|
``n_alphas`` is either ``max_iter``, ``n_features`` or the
|
|
number of nodes in the path with ``alpha >= alpha_min``, whichever
|
|
is smaller.
|
|
|
|
active : array-like of shape (n_alphas,)
|
|
Indices of active variables at the end of the path.
|
|
|
|
coefs : array-like of shape (n_features, n_alphas + 1)
|
|
Coefficients along the path
|
|
|
|
n_iter : int
|
|
Number of iterations run. Returned only if return_n_iter is set
|
|
to True.
|
|
|
|
See also
|
|
--------
|
|
lars_path
|
|
lasso_path
|
|
lasso_path_gram
|
|
LassoLars
|
|
Lars
|
|
LassoLarsCV
|
|
LarsCV
|
|
sklearn.decomposition.sparse_encode
|
|
|
|
References
|
|
----------
|
|
.. [1] "Least Angle Regression", Efron et al.
|
|
http://statweb.stanford.edu/~tibs/ftp/lars.pdf
|
|
|
|
.. [2] `Wikipedia entry on the Least-angle regression
|
|
<https://en.wikipedia.org/wiki/Least-angle_regression>`_
|
|
|
|
.. [3] `Wikipedia entry on the Lasso
|
|
<https://en.wikipedia.org/wiki/Lasso_(statistics)>`_
|
|
|
|
"""
|
|
return _lars_path_solver(
|
|
X=None, y=None, Xy=Xy, Gram=Gram, n_samples=n_samples,
|
|
max_iter=max_iter, alpha_min=alpha_min, method=method,
|
|
copy_X=copy_X, eps=eps, copy_Gram=copy_Gram,
|
|
verbose=verbose, return_path=return_path,
|
|
return_n_iter=return_n_iter, positive=positive)
|
|
|
|
|
|
def _lars_path_solver(X, y, Xy=None, Gram=None, n_samples=None, max_iter=500,
|
|
alpha_min=0, method='lar', copy_X=True,
|
|
eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,
|
|
return_path=True, return_n_iter=False, positive=False):
|
|
"""Compute Least Angle Regression or Lasso path using LARS algorithm [1]
|
|
|
|
The optimization objective for the case method='lasso' is::
|
|
|
|
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
|
|
|
|
in the case of method='lars', the objective function is only known in
|
|
the form of an implicit equation (see discussion in [1])
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
X : None or ndarray, of shape (n_samples, n_features)
|
|
Input data. Note that if X is None then Gram must be specified,
|
|
i.e., cannot be None or False.
|
|
|
|
y : None or ndarray, of shape (n_samples,)
|
|
Input targets.
|
|
|
|
Xy : array-like of shape (n_samples,) or (n_samples, n_targets), \
|
|
default=None
|
|
Xy = np.dot(X.T, y) that can be precomputed. It is useful
|
|
only when the Gram matrix is precomputed.
|
|
|
|
Gram : None, 'auto' or array-like of shape (n_features, n_features), \
|
|
default=None
|
|
Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram
|
|
matrix is precomputed from the given X, if there are more samples
|
|
than features.
|
|
|
|
n_samples : int or float, default=None
|
|
Equivalent size of sample.
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform, set to infinity for no limit.
|
|
|
|
alpha_min : float, default=0
|
|
Minimum correlation along the path. It corresponds to the
|
|
regularization parameter alpha parameter in the Lasso.
|
|
|
|
method : {'lar', 'lasso'}, default='lar'
|
|
Specifies the returned model. Select ``'lar'`` for Least Angle
|
|
Regression, ``'lasso'`` for the Lasso.
|
|
|
|
copy_X : bool, default=True
|
|
If ``False``, ``X`` is overwritten.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. By default, ``np.finfo(np.float).eps`` is used
|
|
|
|
copy_Gram : bool, default=True
|
|
If ``False``, ``Gram`` is overwritten.
|
|
|
|
verbose : int, default=0
|
|
Controls output verbosity.
|
|
|
|
return_path : bool, default=True
|
|
If ``return_path==True`` returns the entire path, else returns only the
|
|
last point of the path.
|
|
|
|
return_n_iter : bool, default=False
|
|
Whether to return the number of iterations.
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0.
|
|
This option is only allowed with method 'lasso'. Note that the model
|
|
coefficients will not converge to the ordinary-least-squares solution
|
|
for small values of alpha. Only coefficients up to the smallest alpha
|
|
value (``alphas_[alphas_ > 0.].min()`` when fit_path=True) reached by
|
|
the stepwise Lars-Lasso algorithm are typically in congruence with the
|
|
solution of the coordinate descent lasso_path function.
|
|
|
|
Returns
|
|
-------
|
|
alphas : array-like of shape (n_alphas + 1,)
|
|
Maximum of covariances (in absolute value) at each iteration.
|
|
``n_alphas`` is either ``max_iter``, ``n_features`` or the
|
|
number of nodes in the path with ``alpha >= alpha_min``, whichever
|
|
is smaller.
|
|
|
|
active : array-like of shape (n_alphas,)
|
|
Indices of active variables at the end of the path.
|
|
|
|
coefs : array-like of shape (n_features, n_alphas + 1)
|
|
Coefficients along the path
|
|
|
|
n_iter : int
|
|
Number of iterations run. Returned only if return_n_iter is set
|
|
to True.
|
|
|
|
See also
|
|
--------
|
|
lasso_path
|
|
LassoLars
|
|
Lars
|
|
LassoLarsCV
|
|
LarsCV
|
|
sklearn.decomposition.sparse_encode
|
|
|
|
References
|
|
----------
|
|
.. [1] "Least Angle Regression", Efron et al.
|
|
http://statweb.stanford.edu/~tibs/ftp/lars.pdf
|
|
|
|
.. [2] `Wikipedia entry on the Least-angle regression
|
|
<https://en.wikipedia.org/wiki/Least-angle_regression>`_
|
|
|
|
.. [3] `Wikipedia entry on the Lasso
|
|
<https://en.wikipedia.org/wiki/Lasso_(statistics)>`_
|
|
|
|
"""
|
|
if method == 'lar' and positive:
|
|
raise ValueError(
|
|
"Positive constraint not supported for 'lar' "
|
|
"coding method."
|
|
)
|
|
|
|
n_samples = n_samples if n_samples is not None else y.size
|
|
|
|
if Xy is None:
|
|
Cov = np.dot(X.T, y)
|
|
else:
|
|
Cov = Xy.copy()
|
|
|
|
if Gram is None or Gram is False:
|
|
Gram = None
|
|
if X is None:
|
|
raise ValueError('X and Gram cannot both be unspecified.')
|
|
elif isinstance(Gram, str) and Gram == 'auto' or Gram is True:
|
|
if Gram is True or X.shape[0] > X.shape[1]:
|
|
Gram = np.dot(X.T, X)
|
|
else:
|
|
Gram = None
|
|
elif copy_Gram:
|
|
Gram = Gram.copy()
|
|
|
|
if Gram is None:
|
|
n_features = X.shape[1]
|
|
else:
|
|
n_features = Cov.shape[0]
|
|
if Gram.shape != (n_features, n_features):
|
|
raise ValueError('The shapes of the inputs Gram and Xy'
|
|
' do not match.')
|
|
|
|
if copy_X and X is not None and Gram is None:
|
|
# force copy. setting the array to be fortran-ordered
|
|
# speeds up the calculation of the (partial) Gram matrix
|
|
# and allows to easily swap columns
|
|
X = X.copy('F')
|
|
|
|
max_features = min(max_iter, n_features)
|
|
|
|
if return_path:
|
|
coefs = np.zeros((max_features + 1, n_features))
|
|
alphas = np.zeros(max_features + 1)
|
|
else:
|
|
coef, prev_coef = np.zeros(n_features), np.zeros(n_features)
|
|
alpha, prev_alpha = np.array([0.]), np.array([0.]) # better ideas?
|
|
|
|
n_iter, n_active = 0, 0
|
|
active, indices = list(), np.arange(n_features)
|
|
# holds the sign of covariance
|
|
sign_active = np.empty(max_features, dtype=np.int8)
|
|
drop = False
|
|
|
|
# will hold the cholesky factorization. Only lower part is
|
|
# referenced.
|
|
if Gram is None:
|
|
L = np.empty((max_features, max_features), dtype=X.dtype)
|
|
swap, nrm2 = linalg.get_blas_funcs(('swap', 'nrm2'), (X,))
|
|
else:
|
|
L = np.empty((max_features, max_features), dtype=Gram.dtype)
|
|
swap, nrm2 = linalg.get_blas_funcs(('swap', 'nrm2'), (Cov,))
|
|
solve_cholesky, = get_lapack_funcs(('potrs',), (L,))
|
|
|
|
if verbose:
|
|
if verbose > 1:
|
|
print("Step\t\tAdded\t\tDropped\t\tActive set size\t\tC")
|
|
else:
|
|
sys.stdout.write('.')
|
|
sys.stdout.flush()
|
|
|
|
tiny32 = np.finfo(np.float32).tiny # to avoid division by 0 warning
|
|
equality_tolerance = np.finfo(np.float32).eps
|
|
|
|
if Gram is not None:
|
|
Gram_copy = Gram.copy()
|
|
Cov_copy = Cov.copy()
|
|
|
|
while True:
|
|
if Cov.size:
|
|
if positive:
|
|
C_idx = np.argmax(Cov)
|
|
else:
|
|
C_idx = np.argmax(np.abs(Cov))
|
|
|
|
C_ = Cov[C_idx]
|
|
|
|
if positive:
|
|
C = C_
|
|
else:
|
|
C = np.fabs(C_)
|
|
else:
|
|
C = 0.
|
|
|
|
if return_path:
|
|
alpha = alphas[n_iter, np.newaxis]
|
|
coef = coefs[n_iter]
|
|
prev_alpha = alphas[n_iter - 1, np.newaxis]
|
|
prev_coef = coefs[n_iter - 1]
|
|
|
|
alpha[0] = C / n_samples
|
|
if alpha[0] <= alpha_min + equality_tolerance: # early stopping
|
|
if abs(alpha[0] - alpha_min) > equality_tolerance:
|
|
# interpolation factor 0 <= ss < 1
|
|
if n_iter > 0:
|
|
# In the first iteration, all alphas are zero, the formula
|
|
# below would make ss a NaN
|
|
ss = ((prev_alpha[0] - alpha_min) /
|
|
(prev_alpha[0] - alpha[0]))
|
|
coef[:] = prev_coef + ss * (coef - prev_coef)
|
|
alpha[0] = alpha_min
|
|
if return_path:
|
|
coefs[n_iter] = coef
|
|
break
|
|
|
|
if n_iter >= max_iter or n_active >= n_features:
|
|
break
|
|
if not drop:
|
|
|
|
##########################################################
|
|
# Append x_j to the Cholesky factorization of (Xa * Xa') #
|
|
# #
|
|
# ( L 0 ) #
|
|
# L -> ( ) , where L * w = Xa' x_j #
|
|
# ( w z ) and z = ||x_j|| #
|
|
# #
|
|
##########################################################
|
|
|
|
if positive:
|
|
sign_active[n_active] = np.ones_like(C_)
|
|
else:
|
|
sign_active[n_active] = np.sign(C_)
|
|
m, n = n_active, C_idx + n_active
|
|
|
|
Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
|
|
indices[n], indices[m] = indices[m], indices[n]
|
|
Cov_not_shortened = Cov
|
|
Cov = Cov[1:] # remove Cov[0]
|
|
|
|
if Gram is None:
|
|
X.T[n], X.T[m] = swap(X.T[n], X.T[m])
|
|
c = nrm2(X.T[n_active]) ** 2
|
|
L[n_active, :n_active] = \
|
|
np.dot(X.T[n_active], X.T[:n_active].T)
|
|
else:
|
|
# swap does only work inplace if matrix is fortran
|
|
# contiguous ...
|
|
Gram[m], Gram[n] = swap(Gram[m], Gram[n])
|
|
Gram[:, m], Gram[:, n] = swap(Gram[:, m], Gram[:, n])
|
|
c = Gram[n_active, n_active]
|
|
L[n_active, :n_active] = Gram[n_active, :n_active]
|
|
|
|
# Update the cholesky decomposition for the Gram matrix
|
|
if n_active:
|
|
linalg.solve_triangular(L[:n_active, :n_active],
|
|
L[n_active, :n_active],
|
|
trans=0, lower=1,
|
|
overwrite_b=True,
|
|
**SOLVE_TRIANGULAR_ARGS)
|
|
|
|
v = np.dot(L[n_active, :n_active], L[n_active, :n_active])
|
|
diag = max(np.sqrt(np.abs(c - v)), eps)
|
|
L[n_active, n_active] = diag
|
|
|
|
if diag < 1e-7:
|
|
# The system is becoming too ill-conditioned.
|
|
# We have degenerate vectors in our active set.
|
|
# We'll 'drop for good' the last regressor added.
|
|
|
|
# Note: this case is very rare. It is no longer triggered by
|
|
# the test suite. The `equality_tolerance` margin added in 0.16
|
|
# to get early stopping to work consistently on all versions of
|
|
# Python including 32 bit Python under Windows seems to make it
|
|
# very difficult to trigger the 'drop for good' strategy.
|
|
warnings.warn('Regressors in active set degenerate. '
|
|
'Dropping a regressor, after %i iterations, '
|
|
'i.e. alpha=%.3e, '
|
|
'with an active set of %i regressors, and '
|
|
'the smallest cholesky pivot element being %.3e.'
|
|
' Reduce max_iter or increase eps parameters.'
|
|
% (n_iter, alpha, n_active, diag),
|
|
ConvergenceWarning)
|
|
|
|
# XXX: need to figure a 'drop for good' way
|
|
Cov = Cov_not_shortened
|
|
Cov[0] = 0
|
|
Cov[C_idx], Cov[0] = swap(Cov[C_idx], Cov[0])
|
|
continue
|
|
|
|
active.append(indices[n_active])
|
|
n_active += 1
|
|
|
|
if verbose > 1:
|
|
print("%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, active[-1], '',
|
|
n_active, C))
|
|
|
|
if method == 'lasso' and n_iter > 0 and prev_alpha[0] < alpha[0]:
|
|
# alpha is increasing. This is because the updates of Cov are
|
|
# bringing in too much numerical error that is greater than
|
|
# than the remaining correlation with the
|
|
# regressors. Time to bail out
|
|
warnings.warn('Early stopping the lars path, as the residues '
|
|
'are small and the current value of alpha is no '
|
|
'longer well controlled. %i iterations, alpha=%.3e, '
|
|
'previous alpha=%.3e, with an active set of %i '
|
|
'regressors.'
|
|
% (n_iter, alpha, prev_alpha, n_active),
|
|
ConvergenceWarning)
|
|
break
|
|
|
|
# least squares solution
|
|
least_squares, _ = solve_cholesky(L[:n_active, :n_active],
|
|
sign_active[:n_active],
|
|
lower=True)
|
|
|
|
if least_squares.size == 1 and least_squares == 0:
|
|
# This happens because sign_active[:n_active] = 0
|
|
least_squares[...] = 1
|
|
AA = 1.
|
|
else:
|
|
# is this really needed ?
|
|
AA = 1. / np.sqrt(np.sum(least_squares * sign_active[:n_active]))
|
|
|
|
if not np.isfinite(AA):
|
|
# L is too ill-conditioned
|
|
i = 0
|
|
L_ = L[:n_active, :n_active].copy()
|
|
while not np.isfinite(AA):
|
|
L_.flat[::n_active + 1] += (2 ** i) * eps
|
|
least_squares, _ = solve_cholesky(
|
|
L_, sign_active[:n_active], lower=True)
|
|
tmp = max(np.sum(least_squares * sign_active[:n_active]),
|
|
eps)
|
|
AA = 1. / np.sqrt(tmp)
|
|
i += 1
|
|
least_squares *= AA
|
|
|
|
if Gram is None:
|
|
# equiangular direction of variables in the active set
|
|
eq_dir = np.dot(X.T[:n_active].T, least_squares)
|
|
# correlation between each unactive variables and
|
|
# eqiangular vector
|
|
corr_eq_dir = np.dot(X.T[n_active:], eq_dir)
|
|
else:
|
|
# if huge number of features, this takes 50% of time, I
|
|
# think could be avoided if we just update it using an
|
|
# orthogonal (QR) decomposition of X
|
|
corr_eq_dir = np.dot(Gram[:n_active, n_active:].T,
|
|
least_squares)
|
|
|
|
g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny32))
|
|
if positive:
|
|
gamma_ = min(g1, C / AA)
|
|
else:
|
|
g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny32))
|
|
gamma_ = min(g1, g2, C / AA)
|
|
|
|
# TODO: better names for these variables: z
|
|
drop = False
|
|
z = -coef[active] / (least_squares + tiny32)
|
|
z_pos = arrayfuncs.min_pos(z)
|
|
if z_pos < gamma_:
|
|
# some coefficients have changed sign
|
|
idx = np.where(z == z_pos)[0][::-1]
|
|
|
|
# update the sign, important for LAR
|
|
sign_active[idx] = -sign_active[idx]
|
|
|
|
if method == 'lasso':
|
|
gamma_ = z_pos
|
|
drop = True
|
|
|
|
n_iter += 1
|
|
|
|
if return_path:
|
|
if n_iter >= coefs.shape[0]:
|
|
del coef, alpha, prev_alpha, prev_coef
|
|
# resize the coefs and alphas array
|
|
add_features = 2 * max(1, (max_features - n_active))
|
|
coefs = np.resize(coefs, (n_iter + add_features, n_features))
|
|
coefs[-add_features:] = 0
|
|
alphas = np.resize(alphas, n_iter + add_features)
|
|
alphas[-add_features:] = 0
|
|
coef = coefs[n_iter]
|
|
prev_coef = coefs[n_iter - 1]
|
|
else:
|
|
# mimic the effect of incrementing n_iter on the array references
|
|
prev_coef = coef
|
|
prev_alpha[0] = alpha[0]
|
|
coef = np.zeros_like(coef)
|
|
|
|
coef[active] = prev_coef[active] + gamma_ * least_squares
|
|
|
|
# update correlations
|
|
Cov -= gamma_ * corr_eq_dir
|
|
|
|
# See if any coefficient has changed sign
|
|
if drop and method == 'lasso':
|
|
|
|
# handle the case when idx is not length of 1
|
|
for ii in idx:
|
|
arrayfuncs.cholesky_delete(L[:n_active, :n_active], ii)
|
|
|
|
n_active -= 1
|
|
# handle the case when idx is not length of 1
|
|
drop_idx = [active.pop(ii) for ii in idx]
|
|
|
|
if Gram is None:
|
|
# propagate dropped variable
|
|
for ii in idx:
|
|
for i in range(ii, n_active):
|
|
X.T[i], X.T[i + 1] = swap(X.T[i], X.T[i + 1])
|
|
# yeah this is stupid
|
|
indices[i], indices[i + 1] = indices[i + 1], indices[i]
|
|
|
|
# TODO: this could be updated
|
|
residual = y - np.dot(X[:, :n_active], coef[active])
|
|
temp = np.dot(X.T[n_active], residual)
|
|
|
|
Cov = np.r_[temp, Cov]
|
|
else:
|
|
for ii in idx:
|
|
for i in range(ii, n_active):
|
|
indices[i], indices[i + 1] = indices[i + 1], indices[i]
|
|
Gram[i], Gram[i + 1] = swap(Gram[i], Gram[i + 1])
|
|
Gram[:, i], Gram[:, i + 1] = swap(Gram[:, i],
|
|
Gram[:, i + 1])
|
|
|
|
# Cov_n = Cov_j + x_j * X + increment(betas) TODO:
|
|
# will this still work with multiple drops ?
|
|
|
|
# recompute covariance. Probably could be done better
|
|
# wrong as Xy is not swapped with the rest of variables
|
|
|
|
# TODO: this could be updated
|
|
temp = Cov_copy[drop_idx] - np.dot(Gram_copy[drop_idx], coef)
|
|
Cov = np.r_[temp, Cov]
|
|
|
|
sign_active = np.delete(sign_active, idx)
|
|
sign_active = np.append(sign_active, 0.) # just to maintain size
|
|
if verbose > 1:
|
|
print("%s\t\t%s\t\t%s\t\t%s\t\t%s" % (n_iter, '', drop_idx,
|
|
n_active, abs(temp)))
|
|
|
|
if return_path:
|
|
# resize coefs in case of early stop
|
|
alphas = alphas[:n_iter + 1]
|
|
coefs = coefs[:n_iter + 1]
|
|
|
|
if return_n_iter:
|
|
return alphas, active, coefs.T, n_iter
|
|
else:
|
|
return alphas, active, coefs.T
|
|
else:
|
|
if return_n_iter:
|
|
return alpha, active, coef, n_iter
|
|
else:
|
|
return alpha, active, coef
|
|
|
|
|
|
###############################################################################
|
|
# Estimator classes
|
|
|
|
class Lars(MultiOutputMixin, RegressorMixin, LinearModel):
|
|
"""Least Angle Regression model a.k.a. LAR
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
fit_intercept : bool, default=True
|
|
Whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(i.e. data is expected to be centered).
|
|
|
|
verbose : bool or int, default=False
|
|
Sets the verbosity amount
|
|
|
|
normalize : bool, default=True
|
|
This parameter is ignored when ``fit_intercept`` is set to False.
|
|
If True, the regressors X will be normalized before regression by
|
|
subtracting the mean and dividing by the l2-norm.
|
|
If you wish to standardize, please use
|
|
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
|
|
on an estimator with ``normalize=False``.
|
|
|
|
precompute : bool, 'auto' or array-like , default='auto'
|
|
Whether to use a precomputed Gram matrix to speed up
|
|
calculations. If set to ``'auto'`` let us decide. The Gram
|
|
matrix can also be passed as argument.
|
|
|
|
n_nonzero_coefs : int, default=500
|
|
Target number of non-zero coefficients. Use ``np.inf`` for no limit.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. Unlike the ``tol`` parameter in some iterative
|
|
optimization-based algorithms, this parameter does not control
|
|
the tolerance of the optimization.
|
|
By default, ``np.finfo(np.float).eps`` is used.
|
|
|
|
copy_X : bool, default=True
|
|
If ``True``, X will be copied; else, it may be overwritten.
|
|
|
|
fit_path : bool, default=True
|
|
If True the full path is stored in the ``coef_path_`` attribute.
|
|
If you compute the solution for a large problem or many targets,
|
|
setting ``fit_path`` to ``False`` will lead to a speedup, especially
|
|
with a small alpha.
|
|
|
|
jitter : float, default=None
|
|
Upper bound on a uniform noise parameter to be added to the
|
|
`y` values, to satisfy the model's assumption of
|
|
one-at-a-time computations. Might help with stability.
|
|
|
|
random_state : int, RandomState instance or None (default)
|
|
Determines random number generation for jittering. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`. Ignored if `jitter` is None.
|
|
|
|
Attributes
|
|
----------
|
|
alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \
|
|
arrays
|
|
Maximum of covariances (in absolute value) at each iteration. \
|
|
``n_alphas`` is either ``n_nonzero_coefs`` or ``n_features``, \
|
|
whichever is smaller.
|
|
|
|
active_ : list, length = n_alphas | list of n_targets such lists
|
|
Indices of active variables at the end of the path.
|
|
|
|
coef_path_ : array-like of shape (n_features, n_alphas + 1) \
|
|
| list of n_targets such arrays
|
|
The varying values of the coefficients along the path. It is not
|
|
present if the ``fit_path`` parameter is ``False``.
|
|
|
|
coef_ : array-like of shape (n_features,) or (n_targets, n_features)
|
|
Parameter vector (w in the formulation formula).
|
|
|
|
intercept_ : float or array-like of shape (n_targets,)
|
|
Independent term in decision function.
|
|
|
|
n_iter_ : array-like or int
|
|
The number of iterations taken by lars_path to find the
|
|
grid of alphas for each target.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn import linear_model
|
|
>>> reg = linear_model.Lars(n_nonzero_coefs=1)
|
|
>>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])
|
|
Lars(n_nonzero_coefs=1)
|
|
>>> print(reg.coef_)
|
|
[ 0. -1.11...]
|
|
|
|
See also
|
|
--------
|
|
lars_path, LarsCV
|
|
sklearn.decomposition.sparse_encode
|
|
|
|
"""
|
|
method = 'lar'
|
|
positive = False
|
|
|
|
@_deprecate_positional_args
|
|
def __init__(self, *, fit_intercept=True, verbose=False, normalize=True,
|
|
precompute='auto', n_nonzero_coefs=500,
|
|
eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
|
|
jitter=None, random_state=None):
|
|
self.fit_intercept = fit_intercept
|
|
self.verbose = verbose
|
|
self.normalize = normalize
|
|
self.precompute = precompute
|
|
self.n_nonzero_coefs = n_nonzero_coefs
|
|
self.eps = eps
|
|
self.copy_X = copy_X
|
|
self.fit_path = fit_path
|
|
self.jitter = jitter
|
|
self.random_state = random_state
|
|
|
|
@staticmethod
|
|
def _get_gram(precompute, X, y):
|
|
if (not hasattr(precompute, '__array__')) and (
|
|
(precompute is True) or
|
|
(precompute == 'auto' and X.shape[0] > X.shape[1]) or
|
|
(precompute == 'auto' and y.shape[1] > 1)):
|
|
precompute = np.dot(X.T, X)
|
|
|
|
return precompute
|
|
|
|
def _fit(self, X, y, max_iter, alpha, fit_path, Xy=None):
|
|
"""Auxiliary method to fit the model using X, y as training data"""
|
|
n_features = X.shape[1]
|
|
|
|
X, y, X_offset, y_offset, X_scale = self._preprocess_data(
|
|
X, y, self.fit_intercept, self.normalize, self.copy_X)
|
|
|
|
if y.ndim == 1:
|
|
y = y[:, np.newaxis]
|
|
|
|
n_targets = y.shape[1]
|
|
|
|
Gram = self._get_gram(self.precompute, X, y)
|
|
|
|
self.alphas_ = []
|
|
self.n_iter_ = []
|
|
self.coef_ = np.empty((n_targets, n_features))
|
|
|
|
if fit_path:
|
|
self.active_ = []
|
|
self.coef_path_ = []
|
|
for k in range(n_targets):
|
|
this_Xy = None if Xy is None else Xy[:, k]
|
|
alphas, active, coef_path, n_iter_ = lars_path(
|
|
X, y[:, k], Gram=Gram, Xy=this_Xy, copy_X=self.copy_X,
|
|
copy_Gram=True, alpha_min=alpha, method=self.method,
|
|
verbose=max(0, self.verbose - 1), max_iter=max_iter,
|
|
eps=self.eps, return_path=True,
|
|
return_n_iter=True, positive=self.positive)
|
|
self.alphas_.append(alphas)
|
|
self.active_.append(active)
|
|
self.n_iter_.append(n_iter_)
|
|
self.coef_path_.append(coef_path)
|
|
self.coef_[k] = coef_path[:, -1]
|
|
|
|
if n_targets == 1:
|
|
self.alphas_, self.active_, self.coef_path_, self.coef_ = [
|
|
a[0] for a in (self.alphas_, self.active_, self.coef_path_,
|
|
self.coef_)]
|
|
self.n_iter_ = self.n_iter_[0]
|
|
else:
|
|
for k in range(n_targets):
|
|
this_Xy = None if Xy is None else Xy[:, k]
|
|
alphas, _, self.coef_[k], n_iter_ = lars_path(
|
|
X, y[:, k], Gram=Gram, Xy=this_Xy, copy_X=self.copy_X,
|
|
copy_Gram=True, alpha_min=alpha, method=self.method,
|
|
verbose=max(0, self.verbose - 1), max_iter=max_iter,
|
|
eps=self.eps, return_path=False, return_n_iter=True,
|
|
positive=self.positive)
|
|
self.alphas_.append(alphas)
|
|
self.n_iter_.append(n_iter_)
|
|
if n_targets == 1:
|
|
self.alphas_ = self.alphas_[0]
|
|
self.n_iter_ = self.n_iter_[0]
|
|
|
|
self._set_intercept(X_offset, y_offset, X_scale)
|
|
return self
|
|
|
|
def fit(self, X, y, Xy=None):
|
|
"""Fit the model using X, y as training data.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
y : array-like of shape (n_samples,) or (n_samples, n_targets)
|
|
Target values.
|
|
|
|
Xy : array-like of shape (n_samples,) or (n_samples, n_targets), \
|
|
default=None
|
|
Xy = np.dot(X.T, y) that can be precomputed. It is useful
|
|
only when the Gram matrix is precomputed.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
returns an instance of self.
|
|
"""
|
|
X, y = self._validate_data(X, y, y_numeric=True, multi_output=True)
|
|
|
|
alpha = getattr(self, 'alpha', 0.)
|
|
if hasattr(self, 'n_nonzero_coefs'):
|
|
alpha = 0. # n_nonzero_coefs parametrization takes priority
|
|
max_iter = self.n_nonzero_coefs
|
|
else:
|
|
max_iter = self.max_iter
|
|
|
|
if self.jitter is not None:
|
|
rng = check_random_state(self.random_state)
|
|
|
|
noise = rng.uniform(high=self.jitter, size=len(y))
|
|
y = y + noise
|
|
|
|
self._fit(X, y, max_iter=max_iter, alpha=alpha, fit_path=self.fit_path,
|
|
Xy=Xy)
|
|
|
|
return self
|
|
|
|
|
|
class LassoLars(Lars):
|
|
"""Lasso model fit with Least Angle Regression a.k.a. Lars
|
|
|
|
It is a Linear Model trained with an L1 prior as regularizer.
|
|
|
|
The optimization objective for Lasso is::
|
|
|
|
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
alpha : float, default=1.0
|
|
Constant that multiplies the penalty term. Defaults to 1.0.
|
|
``alpha = 0`` is equivalent to an ordinary least square, solved
|
|
by :class:`LinearRegression`. For numerical reasons, using
|
|
``alpha = 0`` with the LassoLars object is not advised and you
|
|
should prefer the LinearRegression object.
|
|
|
|
fit_intercept : bool, default=True
|
|
whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(i.e. data is expected to be centered).
|
|
|
|
verbose : bool or int, default=False
|
|
Sets the verbosity amount
|
|
|
|
normalize : bool, default=True
|
|
This parameter is ignored when ``fit_intercept`` is set to False.
|
|
If True, the regressors X will be normalized before regression by
|
|
subtracting the mean and dividing by the l2-norm.
|
|
If you wish to standardize, please use
|
|
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
|
|
on an estimator with ``normalize=False``.
|
|
|
|
precompute : bool, 'auto' or array-like, default='auto'
|
|
Whether to use a precomputed Gram matrix to speed up
|
|
calculations. If set to ``'auto'`` let us decide. The Gram
|
|
matrix can also be passed as argument.
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. Unlike the ``tol`` parameter in some iterative
|
|
optimization-based algorithms, this parameter does not control
|
|
the tolerance of the optimization.
|
|
By default, ``np.finfo(np.float).eps`` is used.
|
|
|
|
copy_X : bool, default=True
|
|
If True, X will be copied; else, it may be overwritten.
|
|
|
|
fit_path : bool, default=True
|
|
If ``True`` the full path is stored in the ``coef_path_`` attribute.
|
|
If you compute the solution for a large problem or many targets,
|
|
setting ``fit_path`` to ``False`` will lead to a speedup, especially
|
|
with a small alpha.
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0. Be aware that you might want to
|
|
remove fit_intercept which is set True by default.
|
|
Under the positive restriction the model coefficients will not converge
|
|
to the ordinary-least-squares solution for small values of alpha.
|
|
Only coefficients up to the smallest alpha value (``alphas_[alphas_ >
|
|
0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso
|
|
algorithm are typically in congruence with the solution of the
|
|
coordinate descent Lasso estimator.
|
|
|
|
jitter : float, default=None
|
|
Upper bound on a uniform noise parameter to be added to the
|
|
`y` values, to satisfy the model's assumption of
|
|
one-at-a-time computations. Might help with stability.
|
|
|
|
random_state : int, RandomState instance or None (default)
|
|
Determines random number generation for jittering. Pass an int
|
|
for reproducible output across multiple function calls.
|
|
See :term:`Glossary <random_state>`. Ignored if `jitter` is None.
|
|
|
|
Attributes
|
|
----------
|
|
alphas_ : array-like of shape (n_alphas + 1,) | list of n_targets such \
|
|
arrays
|
|
Maximum of covariances (in absolute value) at each iteration. \
|
|
``n_alphas`` is either ``max_iter``, ``n_features``, or the number of \
|
|
nodes in the path with correlation greater than ``alpha``, whichever \
|
|
is smaller.
|
|
|
|
active_ : list, length = n_alphas | list of n_targets such lists
|
|
Indices of active variables at the end of the path.
|
|
|
|
coef_path_ : array-like of shape (n_features, n_alphas + 1) or list
|
|
If a list is passed it's expected to be one of n_targets such arrays.
|
|
The varying values of the coefficients along the path. It is not
|
|
present if the ``fit_path`` parameter is ``False``.
|
|
|
|
coef_ : array-like of shape (n_features,) or (n_targets, n_features)
|
|
Parameter vector (w in the formulation formula).
|
|
|
|
intercept_ : float or array-like of shape (n_targets,)
|
|
Independent term in decision function.
|
|
|
|
n_iter_ : array-like or int.
|
|
The number of iterations taken by lars_path to find the
|
|
grid of alphas for each target.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn import linear_model
|
|
>>> reg = linear_model.LassoLars(alpha=0.01)
|
|
>>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1, 0, -1])
|
|
LassoLars(alpha=0.01)
|
|
>>> print(reg.coef_)
|
|
[ 0. -0.963257...]
|
|
|
|
See also
|
|
--------
|
|
lars_path
|
|
lasso_path
|
|
Lasso
|
|
LassoCV
|
|
LassoLarsCV
|
|
LassoLarsIC
|
|
sklearn.decomposition.sparse_encode
|
|
|
|
"""
|
|
method = 'lasso'
|
|
|
|
@_deprecate_positional_args
|
|
def __init__(self, alpha=1.0, *, fit_intercept=True, verbose=False,
|
|
normalize=True, precompute='auto', max_iter=500,
|
|
eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
|
|
positive=False, jitter=None, random_state=None):
|
|
self.alpha = alpha
|
|
self.fit_intercept = fit_intercept
|
|
self.max_iter = max_iter
|
|
self.verbose = verbose
|
|
self.normalize = normalize
|
|
self.positive = positive
|
|
self.precompute = precompute
|
|
self.copy_X = copy_X
|
|
self.eps = eps
|
|
self.fit_path = fit_path
|
|
self.jitter = jitter
|
|
self.random_state = random_state
|
|
|
|
|
|
###############################################################################
|
|
# Cross-validated estimator classes
|
|
|
|
def _check_copy_and_writeable(array, copy=False):
|
|
if copy or not array.flags.writeable:
|
|
return array.copy()
|
|
return array
|
|
|
|
|
|
def _lars_path_residues(X_train, y_train, X_test, y_test, Gram=None,
|
|
copy=True, method='lars', verbose=False,
|
|
fit_intercept=True, normalize=True, max_iter=500,
|
|
eps=np.finfo(np.float).eps, positive=False):
|
|
"""Compute the residues on left-out data for a full LARS path
|
|
|
|
Parameters
|
|
-----------
|
|
X_train : array-like of shape (n_samples, n_features)
|
|
The data to fit the LARS on
|
|
|
|
y_train : array-like of shape (n_samples,)
|
|
The target variable to fit LARS on
|
|
|
|
X_test : array-like of shape (n_samples, n_features)
|
|
The data to compute the residues on
|
|
|
|
y_test : array-like of shape (n_samples,)
|
|
The target variable to compute the residues on
|
|
|
|
Gram : None, 'auto' or array-like of shape (n_features, n_features), \
|
|
default=None
|
|
Precomputed Gram matrix (X' * X), if ``'auto'``, the Gram
|
|
matrix is precomputed from the given X, if there are more samples
|
|
than features
|
|
|
|
copy : bool, default=True
|
|
Whether X_train, X_test, y_train and y_test should be copied;
|
|
if False, they may be overwritten.
|
|
|
|
method : {'lar' , 'lasso'}, default='lar'
|
|
Specifies the returned model. Select ``'lar'`` for Least Angle
|
|
Regression, ``'lasso'`` for the Lasso.
|
|
|
|
verbose : bool or int, default=False
|
|
Sets the amount of verbosity
|
|
|
|
fit_intercept : bool, default=True
|
|
whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(i.e. data is expected to be centered).
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0. Be aware that you might want to
|
|
remove fit_intercept which is set True by default.
|
|
See reservations for using this option in combination with method
|
|
'lasso' for expected small values of alpha in the doc of LassoLarsCV
|
|
and LassoLarsIC.
|
|
|
|
normalize : bool, default=True
|
|
This parameter is ignored when ``fit_intercept`` is set to False.
|
|
If True, the regressors X will be normalized before regression by
|
|
subtracting the mean and dividing by the l2-norm.
|
|
If you wish to standardize, please use
|
|
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
|
|
on an estimator with ``normalize=False``.
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. Unlike the ``tol`` parameter in some iterative
|
|
optimization-based algorithms, this parameter does not control
|
|
the tolerance of the optimization.
|
|
By default, ``np.finfo(np.float).eps`` is used
|
|
|
|
|
|
Returns
|
|
--------
|
|
alphas : array-like of shape (n_alphas,)
|
|
Maximum of covariances (in absolute value) at each iteration.
|
|
``n_alphas`` is either ``max_iter`` or ``n_features``, whichever
|
|
is smaller.
|
|
|
|
active : list
|
|
Indices of active variables at the end of the path.
|
|
|
|
coefs : array-like of shape (n_features, n_alphas)
|
|
Coefficients along the path
|
|
|
|
residues : array-like of shape (n_alphas, n_samples)
|
|
Residues of the prediction on the test data
|
|
"""
|
|
X_train = _check_copy_and_writeable(X_train, copy)
|
|
y_train = _check_copy_and_writeable(y_train, copy)
|
|
X_test = _check_copy_and_writeable(X_test, copy)
|
|
y_test = _check_copy_and_writeable(y_test, copy)
|
|
|
|
if fit_intercept:
|
|
X_mean = X_train.mean(axis=0)
|
|
X_train -= X_mean
|
|
X_test -= X_mean
|
|
y_mean = y_train.mean(axis=0)
|
|
y_train = as_float_array(y_train, copy=False)
|
|
y_train -= y_mean
|
|
y_test = as_float_array(y_test, copy=False)
|
|
y_test -= y_mean
|
|
|
|
if normalize:
|
|
norms = np.sqrt(np.sum(X_train ** 2, axis=0))
|
|
nonzeros = np.flatnonzero(norms)
|
|
X_train[:, nonzeros] /= norms[nonzeros]
|
|
|
|
alphas, active, coefs = lars_path(
|
|
X_train, y_train, Gram=Gram, copy_X=False, copy_Gram=False,
|
|
method=method, verbose=max(0, verbose - 1), max_iter=max_iter, eps=eps,
|
|
positive=positive)
|
|
if normalize:
|
|
coefs[nonzeros] /= norms[nonzeros][:, np.newaxis]
|
|
residues = np.dot(X_test, coefs) - y_test[:, np.newaxis]
|
|
return alphas, active, coefs, residues.T
|
|
|
|
|
|
class LarsCV(Lars):
|
|
"""Cross-validated Least Angle Regression model.
|
|
|
|
See glossary entry for :term:`cross-validation estimator`.
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
fit_intercept : bool, default=True
|
|
whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(i.e. data is expected to be centered).
|
|
|
|
verbose : bool or int, default=False
|
|
Sets the verbosity amount
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform.
|
|
|
|
normalize : bool, default=True
|
|
This parameter is ignored when ``fit_intercept`` is set to False.
|
|
If True, the regressors X will be normalized before regression by
|
|
subtracting the mean and dividing by the l2-norm.
|
|
If you wish to standardize, please use
|
|
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
|
|
on an estimator with ``normalize=False``.
|
|
|
|
precompute : bool, 'auto' or array-like , default='auto'
|
|
Whether to use a precomputed Gram matrix to speed up
|
|
calculations. If set to ``'auto'`` let us decide. The Gram matrix
|
|
cannot be passed as argument since we will use only subsets of X.
|
|
|
|
cv : int, cross-validation generator or an iterable, default=None
|
|
Determines the cross-validation splitting strategy.
|
|
Possible inputs for cv are:
|
|
|
|
- None, to use the default 5-fold cross-validation,
|
|
- integer, to specify the number of folds.
|
|
- :term:`CV splitter`,
|
|
- An iterable yielding (train, test) splits as arrays of indices.
|
|
|
|
For integer/None inputs, :class:`KFold` is used.
|
|
|
|
Refer :ref:`User Guide <cross_validation>` for the various
|
|
cross-validation strategies that can be used here.
|
|
|
|
.. versionchanged:: 0.22
|
|
``cv`` default value if None changed from 3-fold to 5-fold.
|
|
|
|
max_n_alphas : int, default=1000
|
|
The maximum number of points on the path used to compute the
|
|
residuals in the cross-validation
|
|
|
|
n_jobs : int or None, default=None
|
|
Number of CPUs to use during the cross validation.
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. By default, ``np.finfo(np.float).eps`` is used.
|
|
|
|
copy_X : bool, default=True
|
|
If ``True``, X will be copied; else, it may be overwritten.
|
|
|
|
Attributes
|
|
----------
|
|
coef_ : array-like of shape (n_features,)
|
|
parameter vector (w in the formulation formula)
|
|
|
|
intercept_ : float
|
|
independent term in decision function
|
|
|
|
coef_path_ : array-like of shape (n_features, n_alphas)
|
|
the varying values of the coefficients along the path
|
|
|
|
alpha_ : float
|
|
the estimated regularization parameter alpha
|
|
|
|
alphas_ : array-like of shape (n_alphas,)
|
|
the different values of alpha along the path
|
|
|
|
cv_alphas_ : array-like of shape (n_cv_alphas,)
|
|
all the values of alpha along the path for the different folds
|
|
|
|
mse_path_ : array-like of shape (n_folds, n_cv_alphas)
|
|
the mean square error on left-out for each fold along the path
|
|
(alpha values given by ``cv_alphas``)
|
|
|
|
n_iter_ : array-like or int
|
|
the number of iterations run by Lars with the optimal alpha.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.linear_model import LarsCV
|
|
>>> from sklearn.datasets import make_regression
|
|
>>> X, y = make_regression(n_samples=200, noise=4.0, random_state=0)
|
|
>>> reg = LarsCV(cv=5).fit(X, y)
|
|
>>> reg.score(X, y)
|
|
0.9996...
|
|
>>> reg.alpha_
|
|
0.0254...
|
|
>>> reg.predict(X[:1,])
|
|
array([154.0842...])
|
|
|
|
See also
|
|
--------
|
|
lars_path, LassoLars, LassoLarsCV
|
|
"""
|
|
|
|
method = 'lar'
|
|
|
|
@_deprecate_positional_args
|
|
def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500,
|
|
normalize=True, precompute='auto', cv=None,
|
|
max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
|
|
copy_X=True):
|
|
self.max_iter = max_iter
|
|
self.cv = cv
|
|
self.max_n_alphas = max_n_alphas
|
|
self.n_jobs = n_jobs
|
|
super().__init__(fit_intercept=fit_intercept,
|
|
verbose=verbose, normalize=normalize,
|
|
precompute=precompute,
|
|
n_nonzero_coefs=500,
|
|
eps=eps, copy_X=copy_X, fit_path=True)
|
|
|
|
def _more_tags(self):
|
|
return {'multioutput': False}
|
|
|
|
def fit(self, X, y):
|
|
"""Fit the model using X, y as training data.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features)
|
|
Training data.
|
|
|
|
y : array-like of shape (n_samples,)
|
|
Target values.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
returns an instance of self.
|
|
"""
|
|
X, y = self._validate_data(X, y, y_numeric=True)
|
|
X = as_float_array(X, copy=self.copy_X)
|
|
y = as_float_array(y, copy=self.copy_X)
|
|
|
|
# init cross-validation generator
|
|
cv = check_cv(self.cv, classifier=False)
|
|
|
|
# As we use cross-validation, the Gram matrix is not precomputed here
|
|
Gram = self.precompute
|
|
if hasattr(Gram, '__array__'):
|
|
warnings.warn('Parameter "precompute" cannot be an array in '
|
|
'%s. Automatically switch to "auto" instead.'
|
|
% self.__class__.__name__)
|
|
Gram = 'auto'
|
|
|
|
cv_paths = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
|
|
delayed(_lars_path_residues)(
|
|
X[train], y[train], X[test], y[test], Gram=Gram, copy=False,
|
|
method=self.method, verbose=max(0, self.verbose - 1),
|
|
normalize=self.normalize, fit_intercept=self.fit_intercept,
|
|
max_iter=self.max_iter, eps=self.eps, positive=self.positive)
|
|
for train, test in cv.split(X, y))
|
|
all_alphas = np.concatenate(list(zip(*cv_paths))[0])
|
|
# Unique also sorts
|
|
all_alphas = np.unique(all_alphas)
|
|
# Take at most max_n_alphas values
|
|
stride = int(max(1, int(len(all_alphas) / float(self.max_n_alphas))))
|
|
all_alphas = all_alphas[::stride]
|
|
|
|
mse_path = np.empty((len(all_alphas), len(cv_paths)))
|
|
for index, (alphas, _, _, residues) in enumerate(cv_paths):
|
|
alphas = alphas[::-1]
|
|
residues = residues[::-1]
|
|
if alphas[0] != 0:
|
|
alphas = np.r_[0, alphas]
|
|
residues = np.r_[residues[0, np.newaxis], residues]
|
|
if alphas[-1] != all_alphas[-1]:
|
|
alphas = np.r_[alphas, all_alphas[-1]]
|
|
residues = np.r_[residues, residues[-1, np.newaxis]]
|
|
this_residues = interpolate.interp1d(alphas,
|
|
residues,
|
|
axis=0)(all_alphas)
|
|
this_residues **= 2
|
|
mse_path[:, index] = np.mean(this_residues, axis=-1)
|
|
|
|
mask = np.all(np.isfinite(mse_path), axis=-1)
|
|
all_alphas = all_alphas[mask]
|
|
mse_path = mse_path[mask]
|
|
# Select the alpha that minimizes left-out error
|
|
i_best_alpha = np.argmin(mse_path.mean(axis=-1))
|
|
best_alpha = all_alphas[i_best_alpha]
|
|
|
|
# Store our parameters
|
|
self.alpha_ = best_alpha
|
|
self.cv_alphas_ = all_alphas
|
|
self.mse_path_ = mse_path
|
|
|
|
# Now compute the full model
|
|
# it will call a lasso internally when self if LassoLarsCV
|
|
# as self.method == 'lasso'
|
|
self._fit(X, y, max_iter=self.max_iter, alpha=best_alpha,
|
|
Xy=None, fit_path=True)
|
|
return self
|
|
|
|
|
|
class LassoLarsCV(LarsCV):
|
|
"""Cross-validated Lasso, using the LARS algorithm.
|
|
|
|
See glossary entry for :term:`cross-validation estimator`.
|
|
|
|
The optimization objective for Lasso is::
|
|
|
|
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
fit_intercept : bool, default=True
|
|
whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(i.e. data is expected to be centered).
|
|
|
|
verbose : bool or int, default=False
|
|
Sets the verbosity amount
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform.
|
|
|
|
normalize : bool, default=True
|
|
This parameter is ignored when ``fit_intercept`` is set to False.
|
|
If True, the regressors X will be normalized before regression by
|
|
subtracting the mean and dividing by the l2-norm.
|
|
If you wish to standardize, please use
|
|
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
|
|
on an estimator with ``normalize=False``.
|
|
|
|
precompute : bool or 'auto' , default='auto'
|
|
Whether to use a precomputed Gram matrix to speed up
|
|
calculations. If set to ``'auto'`` let us decide. The Gram matrix
|
|
cannot be passed as argument since we will use only subsets of X.
|
|
|
|
cv : int, cross-validation generator or an iterable, default=None
|
|
Determines the cross-validation splitting strategy.
|
|
Possible inputs for cv are:
|
|
|
|
- None, to use the default 5-fold cross-validation,
|
|
- integer, to specify the number of folds.
|
|
- :term:`CV splitter`,
|
|
- An iterable yielding (train, test) splits as arrays of indices.
|
|
|
|
For integer/None inputs, :class:`KFold` is used.
|
|
|
|
Refer :ref:`User Guide <cross_validation>` for the various
|
|
cross-validation strategies that can be used here.
|
|
|
|
.. versionchanged:: 0.22
|
|
``cv`` default value if None changed from 3-fold to 5-fold.
|
|
|
|
max_n_alphas : int, default=1000
|
|
The maximum number of points on the path used to compute the
|
|
residuals in the cross-validation
|
|
|
|
n_jobs : int or None, default=None
|
|
Number of CPUs to use during the cross validation.
|
|
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
|
|
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
|
|
for more details.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. By default, ``np.finfo(np.float).eps`` is used.
|
|
|
|
copy_X : bool, default=True
|
|
If True, X will be copied; else, it may be overwritten.
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0. Be aware that you might want to
|
|
remove fit_intercept which is set True by default.
|
|
Under the positive restriction the model coefficients do not converge
|
|
to the ordinary-least-squares solution for small values of alpha.
|
|
Only coefficients up to the smallest alpha value (``alphas_[alphas_ >
|
|
0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso
|
|
algorithm are typically in congruence with the solution of the
|
|
coordinate descent Lasso estimator.
|
|
As a consequence using LassoLarsCV only makes sense for problems where
|
|
a sparse solution is expected and/or reached.
|
|
|
|
Attributes
|
|
----------
|
|
coef_ : array-like of shape (n_features,)
|
|
parameter vector (w in the formulation formula)
|
|
|
|
intercept_ : float
|
|
independent term in decision function.
|
|
|
|
coef_path_ : array-like of shape (n_features, n_alphas)
|
|
the varying values of the coefficients along the path
|
|
|
|
alpha_ : float
|
|
the estimated regularization parameter alpha
|
|
|
|
alphas_ : array-like of shape (n_alphas,)
|
|
the different values of alpha along the path
|
|
|
|
cv_alphas_ : array-like of shape (n_cv_alphas,)
|
|
all the values of alpha along the path for the different folds
|
|
|
|
mse_path_ : array-like of shape (n_folds, n_cv_alphas)
|
|
the mean square error on left-out for each fold along the path
|
|
(alpha values given by ``cv_alphas``)
|
|
|
|
n_iter_ : array-like or int
|
|
the number of iterations run by Lars with the optimal alpha.
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn.linear_model import LassoLarsCV
|
|
>>> from sklearn.datasets import make_regression
|
|
>>> X, y = make_regression(noise=4.0, random_state=0)
|
|
>>> reg = LassoLarsCV(cv=5).fit(X, y)
|
|
>>> reg.score(X, y)
|
|
0.9992...
|
|
>>> reg.alpha_
|
|
0.0484...
|
|
>>> reg.predict(X[:1,])
|
|
array([-77.8723...])
|
|
|
|
Notes
|
|
-----
|
|
|
|
The object solves the same problem as the LassoCV object. However,
|
|
unlike the LassoCV, it find the relevant alphas values by itself.
|
|
In general, because of this property, it will be more stable.
|
|
However, it is more fragile to heavily multicollinear datasets.
|
|
|
|
It is more efficient than the LassoCV if only a small number of
|
|
features are selected compared to the total number, for instance if
|
|
there are very few samples compared to the number of features.
|
|
|
|
See also
|
|
--------
|
|
lars_path, LassoLars, LarsCV, LassoCV
|
|
"""
|
|
|
|
method = 'lasso'
|
|
|
|
@_deprecate_positional_args
|
|
def __init__(self, *, fit_intercept=True, verbose=False, max_iter=500,
|
|
normalize=True, precompute='auto', cv=None,
|
|
max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
|
|
copy_X=True, positive=False):
|
|
self.fit_intercept = fit_intercept
|
|
self.verbose = verbose
|
|
self.max_iter = max_iter
|
|
self.normalize = normalize
|
|
self.precompute = precompute
|
|
self.cv = cv
|
|
self.max_n_alphas = max_n_alphas
|
|
self.n_jobs = n_jobs
|
|
self.eps = eps
|
|
self.copy_X = copy_X
|
|
self.positive = positive
|
|
# XXX : we don't use super().__init__
|
|
# to avoid setting n_nonzero_coefs
|
|
|
|
|
|
class LassoLarsIC(LassoLars):
|
|
"""Lasso model fit with Lars using BIC or AIC for model selection
|
|
|
|
The optimization objective for Lasso is::
|
|
|
|
(1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
|
|
|
|
AIC is the Akaike information criterion and BIC is the Bayes
|
|
Information criterion. Such criteria are useful to select the value
|
|
of the regularization parameter by making a trade-off between the
|
|
goodness of fit and the complexity of the model. A good model should
|
|
explain well the data while being simple.
|
|
|
|
Read more in the :ref:`User Guide <least_angle_regression>`.
|
|
|
|
Parameters
|
|
----------
|
|
criterion : {'bic' , 'aic'}, default='aic'
|
|
The type of criterion to use.
|
|
|
|
fit_intercept : bool, default=True
|
|
whether to calculate the intercept for this model. If set
|
|
to false, no intercept will be used in calculations
|
|
(i.e. data is expected to be centered).
|
|
|
|
verbose : bool or int, default=False
|
|
Sets the verbosity amount
|
|
|
|
normalize : bool, default=True
|
|
This parameter is ignored when ``fit_intercept`` is set to False.
|
|
If True, the regressors X will be normalized before regression by
|
|
subtracting the mean and dividing by the l2-norm.
|
|
If you wish to standardize, please use
|
|
:class:`sklearn.preprocessing.StandardScaler` before calling ``fit``
|
|
on an estimator with ``normalize=False``.
|
|
|
|
precompute : bool, 'auto' or array-like, default='auto'
|
|
Whether to use a precomputed Gram matrix to speed up
|
|
calculations. If set to ``'auto'`` let us decide. The Gram
|
|
matrix can also be passed as argument.
|
|
|
|
max_iter : int, default=500
|
|
Maximum number of iterations to perform. Can be used for
|
|
early stopping.
|
|
|
|
eps : float, optional
|
|
The machine-precision regularization in the computation of the
|
|
Cholesky diagonal factors. Increase this for very ill-conditioned
|
|
systems. Unlike the ``tol`` parameter in some iterative
|
|
optimization-based algorithms, this parameter does not control
|
|
the tolerance of the optimization.
|
|
By default, ``np.finfo(np.float).eps`` is used
|
|
|
|
copy_X : bool, default=True
|
|
If True, X will be copied; else, it may be overwritten.
|
|
|
|
positive : bool, default=False
|
|
Restrict coefficients to be >= 0. Be aware that you might want to
|
|
remove fit_intercept which is set True by default.
|
|
Under the positive restriction the model coefficients do not converge
|
|
to the ordinary-least-squares solution for small values of alpha.
|
|
Only coefficients up to the smallest alpha value (``alphas_[alphas_ >
|
|
0.].min()`` when fit_path=True) reached by the stepwise Lars-Lasso
|
|
algorithm are typically in congruence with the solution of the
|
|
coordinate descent Lasso estimator.
|
|
As a consequence using LassoLarsIC only makes sense for problems where
|
|
a sparse solution is expected and/or reached.
|
|
|
|
Attributes
|
|
----------
|
|
coef_ : array-like of shape (n_features,)
|
|
parameter vector (w in the formulation formula)
|
|
|
|
intercept_ : float
|
|
independent term in decision function.
|
|
|
|
alpha_ : float
|
|
the alpha parameter chosen by the information criterion
|
|
|
|
n_iter_ : int
|
|
number of iterations run by lars_path to find the grid of
|
|
alphas.
|
|
|
|
criterion_ : array-like of shape (n_alphas,)
|
|
The value of the information criteria ('aic', 'bic') across all
|
|
alphas. The alpha which has the smallest information criterion is
|
|
chosen. This value is larger by a factor of ``n_samples`` compared to
|
|
Eqns. 2.15 and 2.16 in (Zou et al, 2007).
|
|
|
|
|
|
Examples
|
|
--------
|
|
>>> from sklearn import linear_model
|
|
>>> reg = linear_model.LassoLarsIC(criterion='bic')
|
|
>>> reg.fit([[-1, 1], [0, 0], [1, 1]], [-1.1111, 0, -1.1111])
|
|
LassoLarsIC(criterion='bic')
|
|
>>> print(reg.coef_)
|
|
[ 0. -1.11...]
|
|
|
|
Notes
|
|
-----
|
|
The estimation of the number of degrees of freedom is given by:
|
|
|
|
"On the degrees of freedom of the lasso"
|
|
Hui Zou, Trevor Hastie, and Robert Tibshirani
|
|
Ann. Statist. Volume 35, Number 5 (2007), 2173-2192.
|
|
|
|
https://en.wikipedia.org/wiki/Akaike_information_criterion
|
|
https://en.wikipedia.org/wiki/Bayesian_information_criterion
|
|
|
|
See also
|
|
--------
|
|
lars_path, LassoLars, LassoLarsCV
|
|
"""
|
|
@_deprecate_positional_args
|
|
def __init__(self, criterion='aic', *, fit_intercept=True, verbose=False,
|
|
normalize=True, precompute='auto', max_iter=500,
|
|
eps=np.finfo(np.float).eps, copy_X=True, positive=False):
|
|
self.criterion = criterion
|
|
self.fit_intercept = fit_intercept
|
|
self.positive = positive
|
|
self.max_iter = max_iter
|
|
self.verbose = verbose
|
|
self.normalize = normalize
|
|
self.copy_X = copy_X
|
|
self.precompute = precompute
|
|
self.eps = eps
|
|
self.fit_path = True
|
|
|
|
def _more_tags(self):
|
|
return {'multioutput': False}
|
|
|
|
def fit(self, X, y, copy_X=None):
|
|
"""Fit the model using X, y as training data.
|
|
|
|
Parameters
|
|
----------
|
|
X : array-like of shape (n_samples, n_features)
|
|
training data.
|
|
|
|
y : array-like of shape (n_samples,)
|
|
target values. Will be cast to X's dtype if necessary
|
|
|
|
copy_X : bool, default=None
|
|
If provided, this parameter will override the choice
|
|
of copy_X made at instance creation.
|
|
If ``True``, X will be copied; else, it may be overwritten.
|
|
|
|
Returns
|
|
-------
|
|
self : object
|
|
returns an instance of self.
|
|
"""
|
|
if copy_X is None:
|
|
copy_X = self.copy_X
|
|
X, y = self._validate_data(X, y, y_numeric=True)
|
|
|
|
X, y, Xmean, ymean, Xstd = LinearModel._preprocess_data(
|
|
X, y, self.fit_intercept, self.normalize, copy_X)
|
|
max_iter = self.max_iter
|
|
|
|
Gram = self.precompute
|
|
|
|
alphas_, _, coef_path_, self.n_iter_ = lars_path(
|
|
X, y, Gram=Gram, copy_X=copy_X, copy_Gram=True, alpha_min=0.0,
|
|
method='lasso', verbose=self.verbose, max_iter=max_iter,
|
|
eps=self.eps, return_n_iter=True, positive=self.positive)
|
|
|
|
n_samples = X.shape[0]
|
|
|
|
if self.criterion == 'aic':
|
|
K = 2 # AIC
|
|
elif self.criterion == 'bic':
|
|
K = log(n_samples) # BIC
|
|
else:
|
|
raise ValueError('criterion should be either bic or aic')
|
|
|
|
R = y[:, np.newaxis] - np.dot(X, coef_path_) # residuals
|
|
mean_squared_error = np.mean(R ** 2, axis=0)
|
|
sigma2 = np.var(y)
|
|
|
|
df = np.zeros(coef_path_.shape[1], dtype=np.int) # Degrees of freedom
|
|
for k, coef in enumerate(coef_path_.T):
|
|
mask = np.abs(coef) > np.finfo(coef.dtype).eps
|
|
if not np.any(mask):
|
|
continue
|
|
# get the number of degrees of freedom equal to:
|
|
# Xc = X[:, mask]
|
|
# Trace(Xc * inv(Xc.T, Xc) * Xc.T) ie the number of non-zero coefs
|
|
df[k] = np.sum(mask)
|
|
|
|
self.alphas_ = alphas_
|
|
eps64 = np.finfo('float64').eps
|
|
self.criterion_ = (n_samples * mean_squared_error / (sigma2 + eps64) +
|
|
K * df) # Eqns. 2.15--16 in (Zou et al, 2007)
|
|
n_best = np.argmin(self.criterion_)
|
|
|
|
self.alpha_ = alphas_[n_best]
|
|
self.coef_ = coef_path_[:, n_best]
|
|
self._set_intercept(Xmean, ymean, Xstd)
|
|
return self
|