# Author: Alexandre Gramfort # Fabian Pedregosa # Olivier Grisel # Gael Varoquaux # # License: BSD 3 clause import numbers import sys import warnings from abc import ABC, abstractmethod from functools import partial from numbers import Integral, Real import numpy as np from joblib import effective_n_jobs from scipy import sparse from ..base import MultiOutputMixin, RegressorMixin, _fit_context from ..model_selection import check_cv from ..utils import Bunch, check_array, check_scalar from ..utils._metadata_requests import ( MetadataRouter, MethodMapping, _raise_for_params, get_routing_for_object, ) from ..utils._param_validation import Interval, StrOptions, validate_params from ..utils.extmath import safe_sparse_dot from ..utils.metadata_routing import ( _routing_enabled, process_routing, ) from ..utils.parallel import Parallel, delayed from ..utils.validation import ( _check_sample_weight, check_consistent_length, check_is_fitted, check_random_state, column_or_1d, has_fit_parameter, ) # mypy error: Module 'sklearn.linear_model' has no attribute '_cd_fast' from . import _cd_fast as cd_fast # type: ignore from ._base import LinearModel, _pre_fit, _preprocess_data def _set_order(X, y, order="C"): """Change the order of X and y if necessary. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. y : ndarray of shape (n_samples,) Target values. order : {None, 'C', 'F'} If 'C', dense arrays are returned as C-ordered, sparse matrices in csr format. If 'F', dense arrays are return as F-ordered, sparse matrices in csc format. Returns ------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data with guaranteed order. y : ndarray of shape (n_samples,) Target values with guaranteed order. """ if order not in [None, "C", "F"]: raise ValueError( "Unknown value for order. Got {} instead of None, 'C' or 'F'.".format(order) ) sparse_X = sparse.issparse(X) sparse_y = sparse.issparse(y) if order is not None: sparse_format = "csc" if order == "F" else "csr" if sparse_X: X = X.asformat(sparse_format, copy=False) else: X = np.asarray(X, order=order) if sparse_y: y = y.asformat(sparse_format) else: y = np.asarray(y, order=order) return X, y ############################################################################### # Paths functions def _alpha_grid( X, y, Xy=None, l1_ratio=1.0, fit_intercept=True, eps=1e-3, n_alphas=100, copy_X=True, ): """Compute the grid of alpha values for elastic net parameter search Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Pass directly as Fortran-contiguous data to avoid unnecessary memory duplication y : ndarray of shape (n_samples,) or (n_samples, n_outputs) Target values Xy : array-like of shape (n_features,) or (n_features, n_outputs),\ default=None Xy = np.dot(X.T, y) that can be precomputed. l1_ratio : float, default=1.0 The elastic net mixing parameter, with ``0 < l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. (currently not supported) ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio <1``, the penalty is a combination of L1 and L2. eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3`` n_alphas : int, default=100 Number of alphas along the regularization path fit_intercept : bool, default=True Whether to fit an intercept or not copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. """ if l1_ratio == 0: raise ValueError( "Automatic alpha grid generation is not supported for" " l1_ratio=0. Please supply a grid by providing " "your estimator with the appropriate `alphas=` " "argument." ) n_samples = len(y) sparse_center = False if Xy is None: X_sparse = sparse.issparse(X) sparse_center = X_sparse and fit_intercept X = check_array( X, accept_sparse="csc", copy=(copy_X and fit_intercept and not X_sparse) ) if not X_sparse: # X can be touched inplace thanks to the above line X, y, _, _, _ = _preprocess_data( X, y, fit_intercept=fit_intercept, copy=False ) Xy = safe_sparse_dot(X.T, y, dense_output=True) if sparse_center: # Workaround to find alpha_max for sparse matrices. # since we should not destroy the sparsity of such matrices. _, _, X_offset, _, X_scale = _preprocess_data( X, y, fit_intercept=fit_intercept ) mean_dot = X_offset * np.sum(y) if Xy.ndim == 1: Xy = Xy[:, np.newaxis] if sparse_center: if fit_intercept: Xy -= mean_dot[:, np.newaxis] alpha_max = np.sqrt(np.sum(Xy**2, axis=1)).max() / (n_samples * l1_ratio) if alpha_max <= np.finfo(float).resolution: alphas = np.empty(n_alphas) alphas.fill(np.finfo(float).resolution) return alphas return np.geomspace(alpha_max, alpha_max * eps, num=n_alphas) @validate_params( { "X": ["array-like", "sparse matrix"], "y": ["array-like", "sparse matrix"], "eps": [Interval(Real, 0, None, closed="neither")], "n_alphas": [Interval(Integral, 1, None, closed="left")], "alphas": ["array-like", None], "precompute": [StrOptions({"auto"}), "boolean", "array-like"], "Xy": ["array-like", None], "copy_X": ["boolean"], "coef_init": ["array-like", None], "verbose": ["verbose"], "return_n_iter": ["boolean"], "positive": ["boolean"], }, prefer_skip_nested_validation=True, ) def lasso_path( X, y, *, eps=1e-3, n_alphas=100, alphas=None, precompute="auto", Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, **params, ): """Compute Lasso path with coordinate descent. The Lasso optimization function varies for mono and multi-outputs. For mono-output tasks it is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 For multi-output tasks it is:: (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Pass directly as Fortran-contiguous data to avoid unnecessary memory duplication. If ``y`` is mono-output then ``X`` can be sparse. y : {array-like, sparse matrix} of shape (n_samples,) or \ (n_samples, n_targets) Target values. eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. n_alphas : int, default=100 Number of alphas along the regularization path. alphas : array-like, default=None List of alphas where to compute the models. If ``None`` alphas are set automatically. precompute : 'auto', bool or array-like of shape \ (n_features, n_features), default='auto' Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. Xy : array-like of shape (n_features,) or (n_features, n_targets),\ default=None Xy = np.dot(X.T, y) that can be precomputed. It is useful only when the Gram matrix is precomputed. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. coef_init : array-like of shape (n_features, ), default=None The initial values of the coefficients. verbose : bool or int, default=False Amount of verbosity. return_n_iter : bool, default=False Whether to return the number of iterations or not. positive : bool, default=False If set to True, forces coefficients to be positive. (Only allowed when ``y.ndim == 1``). **params : kwargs Keyword arguments passed to the coordinate descent solver. Returns ------- alphas : ndarray of shape (n_alphas,) The alphas along the path where models are computed. coefs : ndarray of shape (n_features, n_alphas) or \ (n_targets, n_features, n_alphas) Coefficients along the path. dual_gaps : ndarray of shape (n_alphas,) The dual gaps at the end of the optimization for each alpha. n_iters : list of int The number of iterations taken by the coordinate descent optimizer to reach the specified tolerance for each alpha. See Also -------- lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. Lasso : The Lasso is a linear model that estimates sparse coefficients. LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. LassoCV : Lasso linear model with iterative fitting along a regularization path. LassoLarsCV : Cross-validated Lasso using the LARS algorithm. sklearn.decomposition.sparse_encode : Estimator that can be used to transform signals into sparse linear combination of atoms from a fixed. Notes ----- For an example, see :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py `. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. Note that in certain cases, the Lars solver may be significantly faster to implement this functionality. In particular, linear interpolation can be used to retrieve model coefficients between the values output by lars_path Examples -------- Comparing lasso_path and lars_path with interpolation: >>> import numpy as np >>> from sklearn.linear_model import lasso_path >>> X = np.array([[1, 2, 3.1], [2.3, 5.4, 4.3]]).T >>> y = np.array([1, 2, 3.1]) >>> # Use lasso_path to compute a coefficient path >>> _, coef_path, _ = lasso_path(X, y, alphas=[5., 1., .5]) >>> print(coef_path) [[0. 0. 0.46874778] [0.2159048 0.4425765 0.23689075]] >>> # Now use lars_path and 1D linear interpolation to compute the >>> # same path >>> from sklearn.linear_model import lars_path >>> alphas, active, coef_path_lars = lars_path(X, y, method='lasso') >>> from scipy import interpolate >>> coef_path_continuous = interpolate.interp1d(alphas[::-1], ... coef_path_lars[:, ::-1]) >>> print(coef_path_continuous([5., 1., .5])) [[0. 0. 0.46915237] [0.2159048 0.4425765 0.23668876]] """ return enet_path( X, y, l1_ratio=1.0, eps=eps, n_alphas=n_alphas, alphas=alphas, precompute=precompute, Xy=Xy, copy_X=copy_X, coef_init=coef_init, verbose=verbose, positive=positive, return_n_iter=return_n_iter, **params, ) @validate_params( { "X": ["array-like", "sparse matrix"], "y": ["array-like", "sparse matrix"], "l1_ratio": [Interval(Real, 0.0, 1.0, closed="both")], "eps": [Interval(Real, 0.0, None, closed="neither")], "n_alphas": [Interval(Integral, 1, None, closed="left")], "alphas": ["array-like", None], "precompute": [StrOptions({"auto"}), "boolean", "array-like"], "Xy": ["array-like", None], "copy_X": ["boolean"], "coef_init": ["array-like", None], "verbose": ["verbose"], "return_n_iter": ["boolean"], "positive": ["boolean"], "check_input": ["boolean"], }, prefer_skip_nested_validation=True, ) def enet_path( X, y, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, precompute="auto", Xy=None, copy_X=True, coef_init=None, verbose=False, return_n_iter=False, positive=False, check_input=True, **params, ): """Compute elastic net path with coordinate descent. The elastic net optimization function varies for mono and multi-outputs. For mono-output tasks it is:: 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 For multi-output tasks it is:: (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Pass directly as Fortran-contiguous data to avoid unnecessary memory duplication. If ``y`` is mono-output then ``X`` can be sparse. y : {array-like, sparse matrix} of shape (n_samples,) or \ (n_samples, n_targets) Target values. l1_ratio : float, default=0.5 Number between 0 and 1 passed to elastic net (scaling between l1 and l2 penalties). ``l1_ratio=1`` corresponds to the Lasso. eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. n_alphas : int, default=100 Number of alphas along the regularization path. alphas : array-like, default=None List of alphas where to compute the models. If None alphas are set automatically. precompute : 'auto', bool or array-like of shape \ (n_features, n_features), default='auto' Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. Xy : array-like of shape (n_features,) or (n_features, n_targets),\ default=None Xy = np.dot(X.T, y) that can be precomputed. It is useful only when the Gram matrix is precomputed. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. coef_init : array-like of shape (n_features, ), default=None The initial values of the coefficients. verbose : bool or int, default=False Amount of verbosity. return_n_iter : bool, default=False Whether to return the number of iterations or not. positive : bool, default=False If set to True, forces coefficients to be positive. (Only allowed when ``y.ndim == 1``). check_input : bool, default=True If set to False, the input validation checks are skipped (including the Gram matrix when provided). It is assumed that they are handled by the caller. **params : kwargs Keyword arguments passed to the coordinate descent solver. Returns ------- alphas : ndarray of shape (n_alphas,) The alphas along the path where models are computed. coefs : ndarray of shape (n_features, n_alphas) or \ (n_targets, n_features, n_alphas) Coefficients along the path. dual_gaps : ndarray of shape (n_alphas,) The dual gaps at the end of the optimization for each alpha. n_iters : list of int The number of iterations taken by the coordinate descent optimizer to reach the specified tolerance for each alpha. (Is returned when ``return_n_iter`` is set to True). See Also -------- MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm \ as regularizer. MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation. ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. ElasticNetCV : Elastic Net model with iterative fitting along a regularization path. Notes ----- For an example, see :ref:`examples/linear_model/plot_lasso_coordinate_descent_path.py `. """ X_offset_param = params.pop("X_offset", None) X_scale_param = params.pop("X_scale", None) sample_weight = params.pop("sample_weight", None) tol = params.pop("tol", 1e-4) max_iter = params.pop("max_iter", 1000) random_state = params.pop("random_state", None) selection = params.pop("selection", "cyclic") if len(params) > 0: raise ValueError("Unexpected parameters in params", params.keys()) # We expect X and y to be already Fortran ordered when bypassing # checks if check_input: X = check_array( X, accept_sparse="csc", dtype=[np.float64, np.float32], order="F", copy=copy_X, ) y = check_array( y, accept_sparse="csc", dtype=X.dtype.type, order="F", copy=False, ensure_2d=False, ) if Xy is not None: # Xy should be a 1d contiguous array or a 2D C ordered array Xy = check_array( Xy, dtype=X.dtype.type, order="C", copy=False, ensure_2d=False ) n_samples, n_features = X.shape multi_output = False if y.ndim != 1: multi_output = True n_targets = y.shape[1] if multi_output and positive: raise ValueError("positive=True is not allowed for multi-output (y.ndim != 1)") # MultiTaskElasticNet does not support sparse matrices if not multi_output and sparse.issparse(X): if X_offset_param is not None: # As sparse matrices are not actually centered we need this to be passed to # the CD solver. X_sparse_scaling = X_offset_param / X_scale_param X_sparse_scaling = np.asarray(X_sparse_scaling, dtype=X.dtype) else: X_sparse_scaling = np.zeros(n_features, dtype=X.dtype) # X should have been passed through _pre_fit already if function is called # from ElasticNet.fit if check_input: X, y, _, _, _, precompute, Xy = _pre_fit( X, y, Xy, precompute, fit_intercept=False, copy=False, check_input=check_input, ) if alphas is None: # No need to normalize of fit_intercept: it has been done # above alphas = _alpha_grid( X, y, Xy=Xy, l1_ratio=l1_ratio, fit_intercept=False, eps=eps, n_alphas=n_alphas, copy_X=False, ) elif len(alphas) > 1: alphas = np.sort(alphas)[::-1] # make sure alphas are properly ordered n_alphas = len(alphas) dual_gaps = np.empty(n_alphas) n_iters = [] rng = check_random_state(random_state) if selection not in ["random", "cyclic"]: raise ValueError("selection should be either random or cyclic.") random = selection == "random" if not multi_output: coefs = np.empty((n_features, n_alphas), dtype=X.dtype) else: coefs = np.empty((n_targets, n_features, n_alphas), dtype=X.dtype) if coef_init is None: coef_ = np.zeros(coefs.shape[:-1], dtype=X.dtype, order="F") else: coef_ = np.asfortranarray(coef_init, dtype=X.dtype) for i, alpha in enumerate(alphas): # account for n_samples scaling in objectives between here and cd_fast l1_reg = alpha * l1_ratio * n_samples l2_reg = alpha * (1.0 - l1_ratio) * n_samples if not multi_output and sparse.issparse(X): model = cd_fast.sparse_enet_coordinate_descent( w=coef_, alpha=l1_reg, beta=l2_reg, X_data=X.data, X_indices=X.indices, X_indptr=X.indptr, y=y, sample_weight=sample_weight, X_mean=X_sparse_scaling, max_iter=max_iter, tol=tol, rng=rng, random=random, positive=positive, ) elif multi_output: model = cd_fast.enet_coordinate_descent_multi_task( coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random ) elif isinstance(precompute, np.ndarray): # We expect precompute to be already Fortran ordered when bypassing # checks if check_input: precompute = check_array(precompute, dtype=X.dtype.type, order="C") model = cd_fast.enet_coordinate_descent_gram( coef_, l1_reg, l2_reg, precompute, Xy, y, max_iter, tol, rng, random, positive, ) elif precompute is False: model = cd_fast.enet_coordinate_descent( coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive ) else: raise ValueError( "Precompute should be one of True, False, 'auto' or array-like. Got %r" % precompute ) coef_, dual_gap_, eps_, n_iter_ = model coefs[..., i] = coef_ # we correct the scale of the returned dual gap, as the objective # in cd_fast is n_samples * the objective in this docstring. dual_gaps[i] = dual_gap_ / n_samples n_iters.append(n_iter_) if verbose: if verbose > 2: print(model) elif verbose > 1: print("Path: %03i out of %03i" % (i, n_alphas)) else: sys.stderr.write(".") if return_n_iter: return alphas, coefs, dual_gaps, n_iters return alphas, coefs, dual_gaps ############################################################################### # ElasticNet model class ElasticNet(MultiOutputMixin, RegressorMixin, LinearModel): """Linear regression with combined L1 and L2 priors as regularizer. Minimizes the objective function:: 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:: a * ||w||_1 + 0.5 * b * ||w||_2^2 where:: alpha = a + b and l1_ratio = a / (a + b) The parameter l1_ratio corresponds to alpha in the glmnet R package while alpha corresponds to the lambda parameter in glmnet. Specifically, l1_ratio = 1 is the lasso penalty. Currently, l1_ratio <= 0.01 is not reliable, unless you supply your own sequence of alpha. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1.0 Constant that multiplies the penalty terms. Defaults to 1.0. See the notes for the exact mathematical meaning of this parameter. ``alpha = 0`` is equivalent to an ordinary least square, solved by the :class:`LinearRegression` object. For numerical reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised. Given this, you should use the :class:`LinearRegression` object. l1_ratio : float, default=0.5 The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``. For ``l1_ratio = 0`` the penalty is an L2 penalty. ``For l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. fit_intercept : bool, default=True Whether the intercept should be estimated or not. If ``False``, the data is assumed to be already centered. precompute : bool or array-like of shape (n_features, n_features),\ default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity. max_iter : int, default=1000 The maximum number of iterations. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``, see Notes below. warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. positive : bool, default=False When set to ``True``, forces the coefficients to be positive. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Parameter vector (w in the cost function formula). sparse_coef_ : sparse matrix of shape (n_features,) or \ (n_targets, n_features) Sparse representation of the `coef_`. intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. n_iter_ : list of int Number of iterations run by the coordinate descent solver to reach the specified tolerance. dual_gap_ : float or ndarray of shape (n_targets,) Given param alpha, the dual gaps at the end of the optimization, same shape as each observation of y. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- ElasticNetCV : Elastic net model with best model selection by cross-validation. SGDRegressor : Implements elastic net regression with incremental training. SGDClassifier : Implements logistic regression with elastic net penalty (``SGDClassifier(loss="log_loss", penalty="elasticnet")``). Notes ----- To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. The precise stopping criteria based on `tol` are the following: First, check that that maximum coordinate update, i.e. :math:`\\max_j |w_j^{new} - w_j^{old}|` is smaller than `tol` times the maximum absolute coefficient, :math:`\\max_j |w_j|`. If so, then additionally check whether the dual gap is smaller than `tol` times :math:`||y||_2^2 / n_{\text{samples}}`. Examples -------- >>> from sklearn.linear_model import ElasticNet >>> from sklearn.datasets import make_regression >>> X, y = make_regression(n_features=2, random_state=0) >>> regr = ElasticNet(random_state=0) >>> regr.fit(X, y) ElasticNet(random_state=0) >>> print(regr.coef_) [18.83816048 64.55968825] >>> print(regr.intercept_) 1.451... >>> print(regr.predict([[0, 0]])) [1.451...] """ _parameter_constraints: dict = { "alpha": [Interval(Real, 0, None, closed="left")], "l1_ratio": [Interval(Real, 0, 1, closed="both")], "fit_intercept": ["boolean"], "precompute": ["boolean", "array-like"], "max_iter": [Interval(Integral, 1, None, closed="left"), None], "copy_X": ["boolean"], "tol": [Interval(Real, 0, None, closed="left")], "warm_start": ["boolean"], "positive": ["boolean"], "random_state": ["random_state"], "selection": [StrOptions({"cyclic", "random"})], } path = staticmethod(enet_path) def __init__( self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, precompute=False, max_iter=1000, copy_X=True, tol=1e-4, warm_start=False, positive=False, random_state=None, selection="cyclic", ): self.alpha = alpha self.l1_ratio = l1_ratio self.fit_intercept = fit_intercept self.precompute = precompute self.max_iter = max_iter self.copy_X = copy_X self.tol = tol self.warm_start = warm_start self.positive = positive self.random_state = random_state self.selection = selection @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, check_input=True): """Fit model with coordinate descent. Parameters ---------- X : {ndarray, sparse matrix} of (n_samples, n_features) Data. y : ndarray of shape (n_samples,) or (n_samples, n_targets) Target. Will be cast to X's dtype if necessary. sample_weight : float or array-like of shape (n_samples,), default=None Sample weights. Internally, the `sample_weight` vector will be rescaled to sum to `n_samples`. .. versionadded:: 0.23 check_input : bool, default=True Allow to bypass several input checking. Don't use this parameter unless you know what you do. Returns ------- self : object Fitted estimator. Notes ----- Coordinate descent is an algorithm that considers each column of data at a time hence it will automatically convert the X input as a Fortran-contiguous numpy array if necessary. To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ if self.alpha == 0: warnings.warn( ( "With alpha=0, this algorithm does not converge " "well. You are advised to use the LinearRegression " "estimator" ), stacklevel=2, ) # Remember if X is copied X_copied = False # We expect X and y to be float64 or float32 Fortran ordered arrays # when bypassing checks if check_input: X_copied = self.copy_X and self.fit_intercept X, y = self._validate_data( X, y, accept_sparse="csc", order="F", dtype=[np.float64, np.float32], copy=X_copied, multi_output=True, y_numeric=True, ) y = check_array( y, order="F", copy=False, dtype=X.dtype.type, ensure_2d=False ) n_samples, n_features = X.shape alpha = self.alpha if isinstance(sample_weight, numbers.Number): sample_weight = None if sample_weight is not None: if check_input: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) # TLDR: Rescale sw to sum up to n_samples. # Long: The objective function of Enet # # 1/2 * np.average(squared error, weights=sw) # + alpha * penalty (1) # # is invariant under rescaling of sw. # But enet_path coordinate descent minimizes # # 1/2 * sum(squared error) + alpha' * penalty (2) # # and therefore sets # # alpha' = n_samples * alpha (3) # # inside its function body, which results in objective (2) being # equivalent to (1) in case of no sw. # With sw, however, enet_path should set # # alpha' = sum(sw) * alpha (4) # # Therefore, we use the freedom of Eq. (1) to rescale sw before # calling enet_path, i.e. # # sw *= n_samples / sum(sw) # # such that sum(sw) = n_samples. This way, (3) and (4) are the same. sample_weight = sample_weight * (n_samples / np.sum(sample_weight)) # Note: Alternatively, we could also have rescaled alpha instead # of sample_weight: # # alpha *= np.sum(sample_weight) / n_samples # Ensure copying happens only once, don't do it again if done above. # X and y will be rescaled if sample_weight is not None, order='F' # ensures that the returned X and y are still F-contiguous. should_copy = self.copy_X and not X_copied X, y, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( X, y, None, self.precompute, fit_intercept=self.fit_intercept, copy=should_copy, check_input=check_input, sample_weight=sample_weight, ) # coordinate descent needs F-ordered arrays and _pre_fit might have # called _rescale_data if check_input or sample_weight is not None: X, y = _set_order(X, y, order="F") if y.ndim == 1: y = y[:, np.newaxis] if Xy is not None and Xy.ndim == 1: Xy = Xy[:, np.newaxis] n_targets = y.shape[1] if not self.warm_start or not hasattr(self, "coef_"): coef_ = np.zeros((n_targets, n_features), dtype=X.dtype, order="F") else: coef_ = self.coef_ if coef_.ndim == 1: coef_ = coef_[np.newaxis, :] dual_gaps_ = np.zeros(n_targets, dtype=X.dtype) self.n_iter_ = [] for k in range(n_targets): if Xy is not None: this_Xy = Xy[:, k] else: this_Xy = None _, this_coef, this_dual_gap, this_iter = self.path( X, y[:, k], l1_ratio=self.l1_ratio, eps=None, n_alphas=None, alphas=[alpha], precompute=precompute, Xy=this_Xy, copy_X=True, coef_init=coef_[k], verbose=False, return_n_iter=True, positive=self.positive, check_input=False, # from here on **params tol=self.tol, X_offset=X_offset, X_scale=X_scale, max_iter=self.max_iter, random_state=self.random_state, selection=self.selection, sample_weight=sample_weight, ) coef_[k] = this_coef[:, 0] dual_gaps_[k] = this_dual_gap[0] self.n_iter_.append(this_iter[0]) if n_targets == 1: self.n_iter_ = self.n_iter_[0] self.coef_ = coef_[0] self.dual_gap_ = dual_gaps_[0] else: self.coef_ = coef_ self.dual_gap_ = dual_gaps_ self._set_intercept(X_offset, y_offset, X_scale) # check for finiteness of coefficients if not all(np.isfinite(w).all() for w in [self.coef_, self.intercept_]): raise ValueError( "Coordinate descent iterations resulted in non-finite parameter" " values. The input data may contain large values and need to" " be preprocessed." ) # return self for chaining fit and predict calls return self @property def sparse_coef_(self): """Sparse representation of the fitted `coef_`.""" return sparse.csr_matrix(self.coef_) def _decision_function(self, X): """Decision function of the linear model. Parameters ---------- X : numpy array or scipy.sparse matrix of shape (n_samples, n_features) Returns ------- T : ndarray of shape (n_samples,) The predicted decision function. """ check_is_fitted(self) if sparse.issparse(X): return safe_sparse_dot(X, self.coef_.T, dense_output=True) + self.intercept_ else: return super()._decision_function(X) ############################################################################### # Lasso model class Lasso(ElasticNet): """Linear Model trained with L1 prior as regularizer (aka the Lasso). The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Technically the Lasso model is optimizing the same objective function as the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty). Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1.0 Constant that multiplies the L1 term, controlling regularization strength. `alpha` must be a non-negative float i.e. in `[0, inf)`. When `alpha = 0`, the objective is equivalent to ordinary least squares, solved by the :class:`LinearRegression` object. For numerical reasons, using `alpha = 0` with the `Lasso` object is not advised. Instead, you should use the :class:`LinearRegression` object. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered). precompute : bool or array-like of shape (n_features, n_features),\ default=False Whether to use a precomputed Gram matrix to speed up calculations. The Gram matrix can also be passed as argument. For sparse input this option is always ``False`` to preserve sparsity. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``, see Notes below. warm_start : bool, default=False When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. positive : bool, default=False When set to ``True``, forces the coefficients to be positive. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Parameter vector (w in the cost function formula). dual_gap_ : float or ndarray of shape (n_targets,) Given param alpha, the dual gaps at the end of the optimization, same shape as each observation of y. sparse_coef_ : sparse matrix of shape (n_features, 1) or \ (n_targets, n_features) Readonly property derived from ``coef_``. intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. n_iter_ : int or list of int Number of iterations run by the coordinate descent solver to reach the specified tolerance. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- lars_path : Regularization path using LARS. lasso_path : Regularization path using Lasso. LassoLars : Lasso Path along the regularization parameter using LARS algorithm. LassoCV : Lasso alpha parameter by cross-validation. LassoLarsCV : Lasso least angle parameter algorithm by cross-validation. sklearn.decomposition.sparse_encode : Sparse coding array estimator. Notes ----- The algorithm used to fit the model is coordinate descent. To avoid unnecessary memory duplication the X argument of the fit method should be directly passed as a Fortran-contiguous numpy array. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to `1 / (2C)` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. The precise stopping criteria based on `tol` are the following: First, check that that maximum coordinate update, i.e. :math:`\\max_j |w_j^{new} - w_j^{old}|` is smaller than `tol` times the maximum absolute coefficient, :math:`\\max_j |w_j|`. If so, then additionally check whether the dual gap is smaller than `tol` times :math:`||y||_2^2 / n_{\\text{samples}}`. The target can be a 2-dimensional array, resulting in the optimization of the following objective:: (1 / (2 * n_samples)) * ||Y - XW||^2_F + alpha * ||W||_11 where :math:`||W||_{1,1}` is the sum of the magnitude of the matrix coefficients. It should not be confused with :class:`~sklearn.linear_model.MultiTaskLasso` which instead penalizes the :math:`L_{2,1}` norm of the coefficients, yielding row-wise sparsity in the coefficients. Examples -------- >>> from sklearn import linear_model >>> clf = linear_model.Lasso(alpha=0.1) >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2]) Lasso(alpha=0.1) >>> print(clf.coef_) [0.85 0. ] >>> print(clf.intercept_) 0.15... """ _parameter_constraints: dict = { **ElasticNet._parameter_constraints, } _parameter_constraints.pop("l1_ratio") path = staticmethod(enet_path) def __init__( self, alpha=1.0, *, fit_intercept=True, precompute=False, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, positive=False, random_state=None, selection="cyclic", ): super().__init__( alpha=alpha, l1_ratio=1.0, fit_intercept=fit_intercept, precompute=precompute, copy_X=copy_X, max_iter=max_iter, tol=tol, warm_start=warm_start, positive=positive, random_state=random_state, selection=selection, ) ############################################################################### # Functions for CV with paths functions def _path_residuals( X, y, sample_weight, train, test, fit_intercept, path, path_params, alphas=None, l1_ratio=1, X_order=None, dtype=None, ): """Returns the MSE for the models computed by 'path'. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. sample_weight : None or array-like of shape (n_samples,) Sample weights. train : list of indices The indices of the train set. test : list of indices The indices of the test set. path : callable Function returning a list of models on the path. See enet_path for an example of signature. path_params : dictionary Parameters passed to the path function. alphas : array-like, default=None Array of float that is used for cross-validation. If not provided, computed using 'path'. l1_ratio : float, default=1 float between 0 and 1 passed to ElasticNet (scaling between l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2. X_order : {'F', 'C'}, default=None The order of the arrays expected by the path function to avoid memory copies. dtype : a numpy dtype, default=None The dtype of the arrays expected by the path function to avoid memory copies. """ X_train = X[train] y_train = y[train] X_test = X[test] y_test = y[test] if sample_weight is None: sw_train, sw_test = None, None else: sw_train = sample_weight[train] sw_test = sample_weight[test] n_samples = X_train.shape[0] # TLDR: Rescale sw_train to sum up to n_samples on the training set. # See TLDR and long comment inside ElasticNet.fit. sw_train *= n_samples / np.sum(sw_train) # Note: Alternatively, we could also have rescaled alpha instead # of sample_weight: # # alpha *= np.sum(sample_weight) / n_samples if not sparse.issparse(X): for array, array_input in ( (X_train, X), (y_train, y), (X_test, X), (y_test, y), ): if array.base is not array_input and not array.flags["WRITEABLE"]: # fancy indexing should create a writable copy but it doesn't # for read-only memmaps (cf. numpy#14132). array.setflags(write=True) if y.ndim == 1: precompute = path_params["precompute"] else: # No Gram variant of multi-task exists right now. # Fall back to default enet_multitask precompute = False X_train, y_train, X_offset, y_offset, X_scale, precompute, Xy = _pre_fit( X_train, y_train, None, precompute, fit_intercept=fit_intercept, copy=False, sample_weight=sw_train, ) path_params = path_params.copy() path_params["Xy"] = Xy path_params["X_offset"] = X_offset path_params["X_scale"] = X_scale path_params["precompute"] = precompute path_params["copy_X"] = False path_params["alphas"] = alphas # needed for sparse cd solver path_params["sample_weight"] = sw_train if "l1_ratio" in path_params: path_params["l1_ratio"] = l1_ratio # Do the ordering and type casting here, as if it is done in the path, # X is copied and a reference is kept here X_train = check_array(X_train, accept_sparse="csc", dtype=dtype, order=X_order) alphas, coefs, _ = path(X_train, y_train, **path_params) del X_train, y_train if y.ndim == 1: # Doing this so that it becomes coherent with multioutput. coefs = coefs[np.newaxis, :, :] y_offset = np.atleast_1d(y_offset) y_test = y_test[:, np.newaxis] intercepts = y_offset[:, np.newaxis] - np.dot(X_offset, coefs) X_test_coefs = safe_sparse_dot(X_test, coefs) residues = X_test_coefs - y_test[:, :, np.newaxis] residues += intercepts if sample_weight is None: this_mse = (residues**2).mean(axis=0) else: this_mse = np.average(residues**2, weights=sw_test, axis=0) return this_mse.mean(axis=0) class LinearModelCV(MultiOutputMixin, LinearModel, ABC): """Base class for iterative model fitting along a regularization path.""" _parameter_constraints: dict = { "eps": [Interval(Real, 0, None, closed="neither")], "n_alphas": [Interval(Integral, 1, None, closed="left")], "alphas": ["array-like", None], "fit_intercept": ["boolean"], "precompute": [StrOptions({"auto"}), "array-like", "boolean"], "max_iter": [Interval(Integral, 1, None, closed="left")], "tol": [Interval(Real, 0, None, closed="left")], "copy_X": ["boolean"], "cv": ["cv_object"], "verbose": ["verbose"], "n_jobs": [Integral, None], "positive": ["boolean"], "random_state": ["random_state"], "selection": [StrOptions({"cyclic", "random"})], } @abstractmethod def __init__( self, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, precompute="auto", max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection="cyclic", ): self.eps = eps self.n_alphas = n_alphas self.alphas = alphas self.fit_intercept = fit_intercept self.precompute = precompute self.max_iter = max_iter self.tol = tol self.copy_X = copy_X self.cv = cv self.verbose = verbose self.n_jobs = n_jobs self.positive = positive self.random_state = random_state self.selection = selection @abstractmethod def _get_estimator(self): """Model to be fitted after the best alpha has been determined.""" @abstractmethod def _is_multitask(self): """Bool indicating if class is meant for multidimensional target.""" @staticmethod @abstractmethod def path(X, y, **kwargs): """Compute path with coordinate descent.""" @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None, **params): """Fit linear model with coordinate descent. Fit is on grid of alphas and best alpha estimated by cross-validation. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. Pass directly as Fortran-contiguous data to avoid unnecessary memory duplication. If y is mono-output, X can be sparse. y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. sample_weight : float or array-like of shape (n_samples,), \ default=None Sample weights used for fitting and evaluation of the weighted mean squared error of each cv-fold. Note that the cross validated MSE that is finally used to find the best model is the unweighted mean over the (weighted) MSEs of each test fold. **params : dict, default=None Parameters to be passed to the CV splitter. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`, which can be set by using ``sklearn.set_config(enable_metadata_routing=True)``. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- self : object Returns an instance of fitted model. """ _raise_for_params(params, self, "fit") # This makes sure that there is no duplication in memory. # Dealing right with copy_X is important in the following: # Multiple functions touch X and subsamples of X and can induce a # lot of duplication of memory copy_X = self.copy_X and self.fit_intercept check_y_params = dict( copy=False, dtype=[np.float64, np.float32], ensure_2d=False ) if isinstance(X, np.ndarray) or sparse.issparse(X): # Keep a reference to X reference_to_old_X = X # Let us not impose fortran ordering so far: it is # not useful for the cross-validation loop and will be done # by the model fitting itself # Need to validate separately here. # We can't pass multi_output=True because that would allow y to be # csr. We also want to allow y to be 64 or 32 but check_X_y only # allows to convert for 64. check_X_params = dict( accept_sparse="csc", dtype=[np.float64, np.float32], copy=False ) X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) ) if sparse.issparse(X): if hasattr(reference_to_old_X, "data") and not np.may_share_memory( reference_to_old_X.data, X.data ): # X is a sparse matrix and has been copied copy_X = False elif not np.may_share_memory(reference_to_old_X, X): # X has been copied copy_X = False del reference_to_old_X else: # Need to validate separately here. # We can't pass multi_output=True because that would allow y to be # csr. We also want to allow y to be 64 or 32 but check_X_y only # allows to convert for 64. check_X_params = dict( accept_sparse="csc", dtype=[np.float64, np.float32], order="F", copy=copy_X, ) X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) ) copy_X = False check_consistent_length(X, y) if not self._is_multitask(): if y.ndim > 1 and y.shape[1] > 1: raise ValueError( "For multi-task outputs, use MultiTask%s" % self.__class__.__name__ ) y = column_or_1d(y, warn=True) else: if sparse.issparse(X): raise TypeError("X should be dense but a sparse matrix waspassed") elif y.ndim == 1: raise ValueError( "For mono-task outputs, use %sCV" % self.__class__.__name__[9:] ) if isinstance(sample_weight, numbers.Number): sample_weight = None if sample_weight is not None: sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) model = self._get_estimator() # All LinearModelCV parameters except 'cv' are acceptable path_params = self.get_params() # Pop `intercept` that is not parameter of the path function path_params.pop("fit_intercept", None) if "l1_ratio" in path_params: l1_ratios = np.atleast_1d(path_params["l1_ratio"]) # For the first path, we need to set l1_ratio path_params["l1_ratio"] = l1_ratios[0] else: l1_ratios = [ 1, ] path_params.pop("cv", None) path_params.pop("n_jobs", None) alphas = self.alphas n_l1_ratio = len(l1_ratios) check_scalar_alpha = partial( check_scalar, target_type=Real, min_val=0.0, include_boundaries="left", ) if alphas is None: alphas = [ _alpha_grid( X, y, l1_ratio=l1_ratio, fit_intercept=self.fit_intercept, eps=self.eps, n_alphas=self.n_alphas, copy_X=self.copy_X, ) for l1_ratio in l1_ratios ] else: # Making sure alphas entries are scalars. for index, alpha in enumerate(alphas): check_scalar_alpha(alpha, f"alphas[{index}]") # Making sure alphas is properly ordered. alphas = np.tile(np.sort(alphas)[::-1], (n_l1_ratio, 1)) # We want n_alphas to be the number of alphas used for each l1_ratio. n_alphas = len(alphas[0]) path_params.update({"n_alphas": n_alphas}) path_params["copy_X"] = copy_X # We are not computing in parallel, we can modify X # inplace in the folds if effective_n_jobs(self.n_jobs) > 1: path_params["copy_X"] = False # init cross-validation generator cv = check_cv(self.cv) if _routing_enabled(): splitter_supports_sample_weight = get_routing_for_object(cv).consumes( method="split", params=["sample_weight"] ) if ( sample_weight is not None and not splitter_supports_sample_weight and not has_fit_parameter(self, "sample_weight") ): raise ValueError( "The CV splitter and underlying estimator do not support" " sample weights." ) if splitter_supports_sample_weight: params["sample_weight"] = sample_weight routed_params = process_routing(self, "fit", **params) if sample_weight is not None and not has_fit_parameter( self, "sample_weight" ): # MultiTaskElasticNetCV does not (yet) support sample_weight sample_weight = None else: routed_params = Bunch() routed_params.splitter = Bunch(split=Bunch()) # Compute path for all folds and compute MSE to get the best alpha folds = list(cv.split(X, y, **routed_params.splitter.split)) best_mse = np.inf # We do a double for loop folded in one, in order to be able to # iterate in parallel on l1_ratio and folds jobs = ( delayed(_path_residuals)( X, y, sample_weight, train, test, self.fit_intercept, self.path, path_params, alphas=this_alphas, l1_ratio=this_l1_ratio, X_order="F", dtype=X.dtype.type, ) for this_l1_ratio, this_alphas in zip(l1_ratios, alphas) for train, test in folds ) mse_paths = Parallel( n_jobs=self.n_jobs, verbose=self.verbose, prefer="threads", )(jobs) mse_paths = np.reshape(mse_paths, (n_l1_ratio, len(folds), -1)) # The mean is computed over folds. mean_mse = np.mean(mse_paths, axis=1) self.mse_path_ = np.squeeze(np.moveaxis(mse_paths, 2, 1)) for l1_ratio, l1_alphas, mse_alphas in zip(l1_ratios, alphas, mean_mse): i_best_alpha = np.argmin(mse_alphas) this_best_mse = mse_alphas[i_best_alpha] if this_best_mse < best_mse: best_alpha = l1_alphas[i_best_alpha] best_l1_ratio = l1_ratio best_mse = this_best_mse self.l1_ratio_ = best_l1_ratio self.alpha_ = best_alpha if self.alphas is None: self.alphas_ = np.asarray(alphas) if n_l1_ratio == 1: self.alphas_ = self.alphas_[0] # Remove duplicate alphas in case alphas is provided. else: self.alphas_ = np.asarray(alphas[0]) # Refit the model with the parameters selected common_params = { name: value for name, value in self.get_params().items() if name in model.get_params() } model.set_params(**common_params) model.alpha = best_alpha model.l1_ratio = best_l1_ratio model.copy_X = copy_X precompute = getattr(self, "precompute", None) if isinstance(precompute, str) and precompute == "auto": model.precompute = False if sample_weight is None: # MultiTaskElasticNetCV does not (yet) support sample_weight, even # not sample_weight=None. model.fit(X, y) else: model.fit(X, y, sample_weight=sample_weight) if not hasattr(self, "l1_ratio"): del self.l1_ratio_ self.coef_ = model.coef_ self.intercept_ = model.intercept_ self.dual_gap_ = model.dual_gap_ self.n_iter_ = model.n_iter_ return self def _more_tags(self): # Note: check_sample_weights_invariance(kind='ones') should work, but # currently we can only mark a whole test as xfail. return { "_xfail_checks": { "check_sample_weights_invariance": ( "zero sample_weight is not equivalent to removing samples" ), } } def get_metadata_routing(self): """Get metadata routing of this object. Please check :ref:`User Guide ` on how the routing mechanism works. .. versionadded:: 1.4 Returns ------- routing : MetadataRouter A :class:`~sklearn.utils.metadata_routing.MetadataRouter` encapsulating routing information. """ router = ( MetadataRouter(owner=self.__class__.__name__) .add_self_request(self) .add( splitter=check_cv(self.cv), method_mapping=MethodMapping().add(callee="split", caller="fit"), ) ) return router class LassoCV(RegressorMixin, LinearModelCV): """Lasso linear model with iterative fitting along a regularization path. See glossary entry for :term:`cross-validation estimator`. The best model is selected by cross-validation. The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1 Read more in the :ref:`User Guide `. Parameters ---------- eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. n_alphas : int, default=100 Number of alphas along the regularization path. alphas : array-like, default=None List of alphas where to compute the models. If ``None`` alphas are set automatically. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). precompute : 'auto', bool or array-like of shape \ (n_features, n_features), default='auto' Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. cv : int, cross-validation generator or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - int, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. verbose : bool or int, default=False Amount of verbosity. n_jobs : int, default=None Number of CPUs to use during the cross validation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. positive : bool, default=False If positive, restrict regression coefficients to be positive. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- alpha_ : float The amount of penalization chosen by cross validation. coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Parameter vector (w in the cost function formula). intercept_ : float or ndarray of shape (n_targets,) Independent term in decision function. mse_path_ : ndarray of shape (n_alphas, n_folds) Mean square error for the test set on each fold, varying alpha. alphas_ : ndarray of shape (n_alphas,) The grid of alphas used for fitting. dual_gap_ : float or ndarray of shape (n_targets,) The dual gap at the end of the optimization for the optimal alpha (``alpha_``). n_iter_ : int Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- lars_path : Compute Least Angle Regression or Lasso path using LARS algorithm. lasso_path : Compute Lasso path with coordinate descent. Lasso : The Lasso is a linear model that estimates sparse coefficients. LassoLars : Lasso model fit with Least Angle Regression a.k.a. Lars. LassoCV : Lasso linear model with iterative fitting along a regularization path. LassoLarsCV : Cross-validated Lasso using the LARS algorithm. Notes ----- In `fit`, once the best parameter `alpha` is found through cross-validation, the model is fit again using the entire training set. To avoid unnecessary memory duplication the `X` argument of the `fit` method should be directly passed as a Fortran-contiguous numpy array. For an example, see :ref:`examples/linear_model/plot_lasso_model_selection.py `. :class:`LassoCV` leads to different results than a hyperparameter search using :class:`~sklearn.model_selection.GridSearchCV` with a :class:`Lasso` model. In :class:`LassoCV`, a model for a given penalty `alpha` is warm started using the coefficients of the closest model (trained at the previous iteration) on the regularization path. It tends to speed up the hyperparameter search. Examples -------- >>> from sklearn.linear_model import LassoCV >>> from sklearn.datasets import make_regression >>> X, y = make_regression(noise=4, random_state=0) >>> reg = LassoCV(cv=5, random_state=0).fit(X, y) >>> reg.score(X, y) 0.9993... >>> reg.predict(X[:1,]) array([-78.4951...]) """ path = staticmethod(lasso_path) def __init__( self, *, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, precompute="auto", max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, positive=False, random_state=None, selection="cyclic", ): super().__init__( eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept=fit_intercept, precompute=precompute, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, positive=positive, random_state=random_state, selection=selection, ) def _get_estimator(self): return Lasso() def _is_multitask(self): return False def _more_tags(self): return {"multioutput": False} class ElasticNetCV(RegressorMixin, LinearModelCV): """Elastic Net model with iterative fitting along a regularization path. See glossary entry for :term:`cross-validation estimator`. Read more in the :ref:`User Guide `. Parameters ---------- l1_ratio : float or list of float, default=0.5 Float between 0 and 1 passed to ElasticNet (scaling between l1 and l2 penalties). For ``l1_ratio = 0`` the penalty is an L2 penalty. For ``l1_ratio = 1`` it is an L1 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1 and L2 This parameter can be a list, in which case the different values are tested by cross-validation and the one giving the best prediction score is used. Note that a good choice of list of values for l1_ratio is often to put more values close to 1 (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, .9, .95, .99, 1]``. eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. n_alphas : int, default=100 Number of alphas along the regularization path, used for each l1_ratio. alphas : array-like, default=None List of alphas where to compute the models. If None alphas are set automatically. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). precompute : 'auto', bool or array-like of shape \ (n_features, n_features), default='auto' Whether to use a precomputed Gram matrix to speed up calculations. If set to ``'auto'`` let us decide. The Gram matrix can also be passed as argument. max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. cv : int, cross-validation generator or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - int, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. verbose : bool or int, default=0 Amount of verbosity. n_jobs : int, default=None Number of CPUs to use during the cross validation. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. positive : bool, default=False When set to ``True``, forces the coefficients to be positive. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- alpha_ : float The amount of penalization chosen by cross validation. l1_ratio_ : float The compromise between l1 and l2 penalization chosen by cross validation. coef_ : ndarray of shape (n_features,) or (n_targets, n_features) Parameter vector (w in the cost function formula). intercept_ : float or ndarray of shape (n_targets, n_features) Independent term in the decision function. mse_path_ : ndarray of shape (n_l1_ratio, n_alpha, n_folds) Mean square error for the test set on each fold, varying l1_ratio and alpha. alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas) The grid of alphas used for fitting, for each l1_ratio. dual_gap_ : float The dual gaps at the end of the optimization for the optimal alpha. n_iter_ : int Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- enet_path : Compute elastic net path with coordinate descent. ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. Notes ----- In `fit`, once the best parameters `l1_ratio` and `alpha` are found through cross-validation, the model is fit again using the entire training set. To avoid unnecessary memory duplication the `X` argument of the `fit` method should be directly passed as a Fortran-contiguous numpy array. The parameter `l1_ratio` corresponds to alpha in the glmnet R package while alpha corresponds to the lambda parameter in glmnet. More specifically, the optimization objective is:: 1 / (2 * n_samples) * ||y - Xw||^2_2 + alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2 If you are interested in controlling the L1 and L2 penalty separately, keep in mind that this is equivalent to:: a * L1 + b * L2 for:: alpha = a + b and l1_ratio = a / (a + b). For an example, see :ref:`examples/linear_model/plot_lasso_model_selection.py `. Examples -------- >>> from sklearn.linear_model import ElasticNetCV >>> from sklearn.datasets import make_regression >>> X, y = make_regression(n_features=2, random_state=0) >>> regr = ElasticNetCV(cv=5, random_state=0) >>> regr.fit(X, y) ElasticNetCV(cv=5, random_state=0) >>> print(regr.alpha_) 0.199... >>> print(regr.intercept_) 0.398... >>> print(regr.predict([[0, 0]])) [0.398...] """ _parameter_constraints: dict = { **LinearModelCV._parameter_constraints, "l1_ratio": [Interval(Real, 0, 1, closed="both"), "array-like"], } path = staticmethod(enet_path) def __init__( self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, precompute="auto", max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, positive=False, random_state=None, selection="cyclic", ): self.l1_ratio = l1_ratio self.eps = eps self.n_alphas = n_alphas self.alphas = alphas self.fit_intercept = fit_intercept self.precompute = precompute self.max_iter = max_iter self.tol = tol self.cv = cv self.copy_X = copy_X self.verbose = verbose self.n_jobs = n_jobs self.positive = positive self.random_state = random_state self.selection = selection def _get_estimator(self): return ElasticNet() def _is_multitask(self): return False def _more_tags(self): return {"multioutput": False} ############################################################################### # Multi Task ElasticNet and Lasso models (with joint feature selection) class MultiTaskElasticNet(Lasso): """Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer. The optimization objective for MultiTaskElasticNet is:: (1 / (2 * n_samples)) * ||Y - XW||_Fro^2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 Where:: ||W||_21 = sum_i sqrt(sum_j W_ij ^ 2) i.e. the sum of norms of each row. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1.0 Constant that multiplies the L1/L2 term. Defaults to 1.0. l1_ratio : float, default=0.5 The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it is an L2 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- intercept_ : ndarray of shape (n_targets,) Independent term in decision function. coef_ : ndarray of shape (n_targets, n_features) Parameter vector (W in the cost function formula). If a 1D y is passed in at fit (non multi-task usage), ``coef_`` is then a 1D array. Note that ``coef_`` stores the transpose of ``W``, ``W.T``. n_iter_ : int Number of iterations run by the coordinate descent solver to reach the specified tolerance. dual_gap_ : float The dual gaps at the end of the optimization. eps_ : float The tolerance scaled scaled by the variance of the target `y`. sparse_coef_ : sparse matrix of shape (n_features,) or \ (n_targets, n_features) Sparse representation of the `coef_`. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation. ElasticNet : Linear regression with combined L1 and L2 priors as regularizer. MultiTaskLasso : Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. Notes ----- The algorithm used to fit the model is coordinate descent. To avoid unnecessary memory duplication the X and y arguments of the fit method should be directly passed as Fortran-contiguous numpy arrays. Examples -------- >>> from sklearn import linear_model >>> clf = linear_model.MultiTaskElasticNet(alpha=0.1) >>> clf.fit([[0,0], [1, 1], [2, 2]], [[0, 0], [1, 1], [2, 2]]) MultiTaskElasticNet(alpha=0.1) >>> print(clf.coef_) [[0.45663524 0.45612256] [0.45663524 0.45612256]] >>> print(clf.intercept_) [0.0872422 0.0872422] """ _parameter_constraints: dict = { **ElasticNet._parameter_constraints, } for param in ("precompute", "positive"): _parameter_constraints.pop(param) def __init__( self, alpha=1.0, *, l1_ratio=0.5, fit_intercept=True, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection="cyclic", ): self.l1_ratio = l1_ratio self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter self.copy_X = copy_X self.tol = tol self.warm_start = warm_start self.random_state = random_state self.selection = selection @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y): """Fit MultiTaskElasticNet model with coordinate descent. Parameters ---------- X : ndarray of shape (n_samples, n_features) Data. y : ndarray of shape (n_samples, n_targets) Target. Will be cast to X's dtype if necessary. Returns ------- self : object Fitted estimator. Notes ----- Coordinate descent is an algorithm that considers each column of data at a time hence it will automatically convert the X input as a Fortran-contiguous numpy array if necessary. To avoid memory re-allocation it is advised to allocate the initial data in memory directly using that format. """ # Need to validate separately here. # We can't pass multi_output=True because that would allow y to be csr. check_X_params = dict( dtype=[np.float64, np.float32], order="F", copy=self.copy_X and self.fit_intercept, ) check_y_params = dict(ensure_2d=False, order="F") X, y = self._validate_data( X, y, validate_separately=(check_X_params, check_y_params) ) check_consistent_length(X, y) y = y.astype(X.dtype) if hasattr(self, "l1_ratio"): model_str = "ElasticNet" else: model_str = "Lasso" if y.ndim == 1: raise ValueError("For mono-task outputs, use %s" % model_str) n_samples, n_features = X.shape n_targets = y.shape[1] X, y, X_offset, y_offset, X_scale = _preprocess_data( X, y, fit_intercept=self.fit_intercept, copy=False ) if not self.warm_start or not hasattr(self, "coef_"): self.coef_ = np.zeros( (n_targets, n_features), dtype=X.dtype.type, order="F" ) l1_reg = self.alpha * self.l1_ratio * n_samples l2_reg = self.alpha * (1.0 - self.l1_ratio) * n_samples self.coef_ = np.asfortranarray(self.coef_) # coef contiguous in memory random = self.selection == "random" ( self.coef_, self.dual_gap_, self.eps_, self.n_iter_, ) = cd_fast.enet_coordinate_descent_multi_task( self.coef_, l1_reg, l2_reg, X, y, self.max_iter, self.tol, check_random_state(self.random_state), random, ) # account for different objective scaling here and in cd_fast self.dual_gap_ /= n_samples self._set_intercept(X_offset, y_offset, X_scale) # return self for chaining fit and predict calls return self def _more_tags(self): return {"multioutput_only": True} class MultiTaskLasso(MultiTaskElasticNet): """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. The optimization objective for Lasso is:: (1 / (2 * n_samples)) * ||Y - XW||^2_Fro + alpha * ||W||_21 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. Parameters ---------- alpha : float, default=1.0 Constant that multiplies the L1/L2 term. Defaults to 1.0. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. warm_start : bool, default=False When set to ``True``, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. See :term:`the Glossary `. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- coef_ : ndarray of shape (n_targets, n_features) Parameter vector (W in the cost function formula). Note that ``coef_`` stores the transpose of ``W``, ``W.T``. intercept_ : ndarray of shape (n_targets,) Independent term in decision function. n_iter_ : int Number of iterations run by the coordinate descent solver to reach the specified tolerance. dual_gap_ : ndarray of shape (n_alphas,) The dual gaps at the end of the optimization for each alpha. eps_ : float The tolerance scaled scaled by the variance of the target `y`. sparse_coef_ : sparse matrix of shape (n_features,) or \ (n_targets, n_features) Sparse representation of the `coef_`. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- Lasso: Linear Model trained with L1 prior as regularizer (aka the Lasso). MultiTaskLassoCV: Multi-task L1 regularized linear model with built-in cross-validation. MultiTaskElasticNetCV: Multi-task L1/L2 ElasticNet with built-in cross-validation. Notes ----- The algorithm used to fit the model is coordinate descent. To avoid unnecessary memory duplication the X and y arguments of the fit method should be directly passed as Fortran-contiguous numpy arrays. Examples -------- >>> from sklearn import linear_model >>> clf = linear_model.MultiTaskLasso(alpha=0.1) >>> clf.fit([[0, 1], [1, 2], [2, 4]], [[0, 0], [1, 1], [2, 3]]) MultiTaskLasso(alpha=0.1) >>> print(clf.coef_) [[0. 0.60809415] [0. 0.94592424]] >>> print(clf.intercept_) [-0.41888636 -0.87382323] """ _parameter_constraints: dict = { **MultiTaskElasticNet._parameter_constraints, } _parameter_constraints.pop("l1_ratio") def __init__( self, alpha=1.0, *, fit_intercept=True, copy_X=True, max_iter=1000, tol=1e-4, warm_start=False, random_state=None, selection="cyclic", ): self.alpha = alpha self.fit_intercept = fit_intercept self.max_iter = max_iter self.copy_X = copy_X self.tol = tol self.warm_start = warm_start self.l1_ratio = 1.0 self.random_state = random_state self.selection = selection class MultiTaskElasticNetCV(RegressorMixin, LinearModelCV): """Multi-task L1/L2 ElasticNet with built-in cross-validation. See glossary entry for :term:`cross-validation estimator`. The optimization objective for MultiTaskElasticNet is:: (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * l1_ratio * ||W||_21 + 0.5 * alpha * (1 - l1_ratio) * ||W||_Fro^2 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. .. versionadded:: 0.15 Parameters ---------- l1_ratio : float or list of float, default=0.5 The ElasticNet mixing parameter, with 0 < l1_ratio <= 1. For l1_ratio = 1 the penalty is an L1/L2 penalty. For l1_ratio = 0 it is an L2 penalty. For ``0 < l1_ratio < 1``, the penalty is a combination of L1/L2 and L2. This parameter can be a list, in which case the different values are tested by cross-validation and the one giving the best prediction score is used. Note that a good choice of list of values for l1_ratio is often to put more values close to 1 (i.e. Lasso) and less close to 0 (i.e. Ridge), as in ``[.1, .5, .7, .9, .95, .99, 1]``. eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. n_alphas : int, default=100 Number of alphas along the regularization path. alphas : array-like, default=None List of alphas where to compute the models. If not provided, set automatically. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. cv : int, cross-validation generator or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - int, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. verbose : bool or int, default=0 Amount of verbosity. n_jobs : int, default=None Number of CPUs to use during the cross validation. Note that this is used only if multiple values for l1_ratio are given. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- intercept_ : ndarray of shape (n_targets,) Independent term in decision function. coef_ : ndarray of shape (n_targets, n_features) Parameter vector (W in the cost function formula). Note that ``coef_`` stores the transpose of ``W``, ``W.T``. alpha_ : float The amount of penalization chosen by cross validation. mse_path_ : ndarray of shape (n_alphas, n_folds) or \ (n_l1_ratio, n_alphas, n_folds) Mean square error for the test set on each fold, varying alpha. alphas_ : ndarray of shape (n_alphas,) or (n_l1_ratio, n_alphas) The grid of alphas used for fitting, for each l1_ratio. l1_ratio_ : float Best l1_ratio obtained by cross-validation. n_iter_ : int Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. dual_gap_ : float The dual gap at the end of the optimization for the optimal alpha. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- MultiTaskElasticNet : Multi-task L1/L2 ElasticNet with built-in cross-validation. ElasticNetCV : Elastic net model with best model selection by cross-validation. MultiTaskLassoCV : Multi-task Lasso model trained with L1 norm as regularizer and built-in cross-validation. Notes ----- The algorithm used to fit the model is coordinate descent. In `fit`, once the best parameters `l1_ratio` and `alpha` are found through cross-validation, the model is fit again using the entire training set. To avoid unnecessary memory duplication the `X` and `y` arguments of the `fit` method should be directly passed as Fortran-contiguous numpy arrays. Examples -------- >>> from sklearn import linear_model >>> clf = linear_model.MultiTaskElasticNetCV(cv=3) >>> clf.fit([[0,0], [1, 1], [2, 2]], ... [[0, 0], [1, 1], [2, 2]]) MultiTaskElasticNetCV(cv=3) >>> print(clf.coef_) [[0.52875032 0.46958558] [0.52875032 0.46958558]] >>> print(clf.intercept_) [0.00166409 0.00166409] """ _parameter_constraints: dict = { **LinearModelCV._parameter_constraints, "l1_ratio": [Interval(Real, 0, 1, closed="both"), "array-like"], } _parameter_constraints.pop("precompute") _parameter_constraints.pop("positive") path = staticmethod(enet_path) def __init__( self, *, l1_ratio=0.5, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, max_iter=1000, tol=1e-4, cv=None, copy_X=True, verbose=0, n_jobs=None, random_state=None, selection="cyclic", ): self.l1_ratio = l1_ratio self.eps = eps self.n_alphas = n_alphas self.alphas = alphas self.fit_intercept = fit_intercept self.max_iter = max_iter self.tol = tol self.cv = cv self.copy_X = copy_X self.verbose = verbose self.n_jobs = n_jobs self.random_state = random_state self.selection = selection def _get_estimator(self): return MultiTaskElasticNet() def _is_multitask(self): return True def _more_tags(self): return {"multioutput_only": True} # This is necessary as LinearModelCV now supports sample_weight while # MultiTaskElasticNet does not (yet). def fit(self, X, y, **params): """Fit MultiTaskElasticNet model with coordinate descent. Fit is on grid of alphas and best alpha estimated by cross-validation. Parameters ---------- X : ndarray of shape (n_samples, n_features) Training data. y : ndarray of shape (n_samples, n_targets) Training target variable. Will be cast to X's dtype if necessary. **params : dict, default=None Parameters to be passed to the CV splitter. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`, which can be set by using ``sklearn.set_config(enable_metadata_routing=True)``. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- self : object Returns MultiTaskElasticNet instance. """ return super().fit(X, y, **params) class MultiTaskLassoCV(RegressorMixin, LinearModelCV): """Multi-task Lasso model trained with L1/L2 mixed-norm as regularizer. See glossary entry for :term:`cross-validation estimator`. The optimization objective for MultiTaskLasso is:: (1 / (2 * n_samples)) * ||Y - XW||^Fro_2 + alpha * ||W||_21 Where:: ||W||_21 = \\sum_i \\sqrt{\\sum_j w_{ij}^2} i.e. the sum of norm of each row. Read more in the :ref:`User Guide `. .. versionadded:: 0.15 Parameters ---------- eps : float, default=1e-3 Length of the path. ``eps=1e-3`` means that ``alpha_min / alpha_max = 1e-3``. n_alphas : int, default=100 Number of alphas along the regularization path. alphas : array-like, default=None List of alphas where to compute the models. If not provided, set automatically. fit_intercept : bool, default=True Whether to calculate the intercept for this model. If set to false, no intercept will be used in calculations (i.e. data is expected to be centered). max_iter : int, default=1000 The maximum number of iterations. tol : float, default=1e-4 The tolerance for the optimization: if the updates are smaller than ``tol``, the optimization code checks the dual gap for optimality and continues until it is smaller than ``tol``. copy_X : bool, default=True If ``True``, X will be copied; else, it may be overwritten. cv : int, cross-validation generator or iterable, default=None Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - int, to specify the number of folds. - :term:`CV splitter`, - An iterable yielding (train, test) splits as arrays of indices. For int/None inputs, :class:`~sklearn.model_selection.KFold` is used. Refer :ref:`User Guide ` for the various cross-validation strategies that can be used here. .. versionchanged:: 0.22 ``cv`` default value if None changed from 3-fold to 5-fold. verbose : bool or int, default=False Amount of verbosity. n_jobs : int, default=None Number of CPUs to use during the cross validation. Note that this is used only if multiple values for l1_ratio are given. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details. random_state : int, RandomState instance, default=None The seed of the pseudo random number generator that selects a random feature to update. Used when ``selection`` == 'random'. Pass an int for reproducible output across multiple function calls. See :term:`Glossary `. selection : {'cyclic', 'random'}, default='cyclic' If set to 'random', a random coefficient is updated every iteration rather than looping over features sequentially by default. This (setting to 'random') often leads to significantly faster convergence especially when tol is higher than 1e-4. Attributes ---------- intercept_ : ndarray of shape (n_targets,) Independent term in decision function. coef_ : ndarray of shape (n_targets, n_features) Parameter vector (W in the cost function formula). Note that ``coef_`` stores the transpose of ``W``, ``W.T``. alpha_ : float The amount of penalization chosen by cross validation. mse_path_ : ndarray of shape (n_alphas, n_folds) Mean square error for the test set on each fold, varying alpha. alphas_ : ndarray of shape (n_alphas,) The grid of alphas used for fitting. n_iter_ : int Number of iterations run by the coordinate descent solver to reach the specified tolerance for the optimal alpha. dual_gap_ : float The dual gap at the end of the optimization for the optimal alpha. n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- MultiTaskElasticNet : Multi-task ElasticNet model trained with L1/L2 mixed-norm as regularizer. ElasticNetCV : Elastic net model with best model selection by cross-validation. MultiTaskElasticNetCV : Multi-task L1/L2 ElasticNet with built-in cross-validation. Notes ----- The algorithm used to fit the model is coordinate descent. In `fit`, once the best parameter `alpha` is found through cross-validation, the model is fit again using the entire training set. To avoid unnecessary memory duplication the `X` and `y` arguments of the `fit` method should be directly passed as Fortran-contiguous numpy arrays. Examples -------- >>> from sklearn.linear_model import MultiTaskLassoCV >>> from sklearn.datasets import make_regression >>> from sklearn.metrics import r2_score >>> X, y = make_regression(n_targets=2, noise=4, random_state=0) >>> reg = MultiTaskLassoCV(cv=5, random_state=0).fit(X, y) >>> r2_score(y, reg.predict(X)) 0.9994... >>> reg.alpha_ 0.5713... >>> reg.predict(X[:1,]) array([[153.7971..., 94.9015...]]) """ _parameter_constraints: dict = { **LinearModelCV._parameter_constraints, } _parameter_constraints.pop("precompute") _parameter_constraints.pop("positive") path = staticmethod(lasso_path) def __init__( self, *, eps=1e-3, n_alphas=100, alphas=None, fit_intercept=True, max_iter=1000, tol=1e-4, copy_X=True, cv=None, verbose=False, n_jobs=None, random_state=None, selection="cyclic", ): super().__init__( eps=eps, n_alphas=n_alphas, alphas=alphas, fit_intercept=fit_intercept, max_iter=max_iter, tol=tol, copy_X=copy_X, cv=cv, verbose=verbose, n_jobs=n_jobs, random_state=random_state, selection=selection, ) def _get_estimator(self): return MultiTaskLasso() def _is_multitask(self): return True def _more_tags(self): return {"multioutput_only": True} # This is necessary as LinearModelCV now supports sample_weight while # MultiTaskElasticNet does not (yet). def fit(self, X, y, **params): """Fit MultiTaskLasso model with coordinate descent. Fit is on grid of alphas and best alpha estimated by cross-validation. Parameters ---------- X : ndarray of shape (n_samples, n_features) Data. y : ndarray of shape (n_samples, n_targets) Target. Will be cast to X's dtype if necessary. **params : dict, default=None Parameters to be passed to the CV splitter. .. versionadded:: 1.4 Only available if `enable_metadata_routing=True`, which can be set by using ``sklearn.set_config(enable_metadata_routing=True)``. See :ref:`Metadata Routing User Guide ` for more details. Returns ------- self : object Returns an instance of fitted model. """ return super().fit(X, y, **params)