"""Module :mod:`sklearn.kernel_ridge` implements kernel ridge regression.""" # Authors: Mathieu Blondel # Jan Hendrik Metzen # License: BSD 3 clause from numbers import Real import numpy as np from .base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context from .linear_model._ridge import _solve_cholesky_kernel from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels from .utils._param_validation import Interval, StrOptions from .utils.validation import _check_sample_weight, check_is_fitted class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator): """Kernel ridge regression. Kernel ridge regression (KRR) combines ridge regression (linear least squares with l2-norm regularization) with the kernel trick. It thus learns a linear function in the space induced by the respective kernel and the data. For non-linear kernels, this corresponds to a non-linear function in the original space. The form of the model learned by KRR is identical to support vector regression (SVR). However, different loss functions are used: KRR uses squared error loss while support vector regression uses epsilon-insensitive loss, both combined with l2 regularization. In contrast to SVR, fitting a KRR model can be done in closed-form and is typically faster for medium-sized datasets. On the other hand, the learned model is non-sparse and thus slower than SVR, which learns a sparse model for epsilon > 0, at prediction-time. This estimator has built-in support for multi-variate regression (i.e., when y is a 2d-array of shape [n_samples, n_targets]). Read more in the :ref:`User Guide `. Parameters ---------- alpha : float or array-like of shape (n_targets,), default=1.0 Regularization strength; must be a positive float. Regularization improves the conditioning of the problem and reduces the variance of the estimates. Larger values specify stronger regularization. Alpha corresponds to ``1 / (2C)`` in other linear models such as :class:`~sklearn.linear_model.LogisticRegression` or :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are assumed to be specific to the targets. Hence they must correspond in number. See :ref:`ridge_regression` for formula. kernel : str or callable, default="linear" Kernel mapping used internally. This parameter is directly passed to :class:`~sklearn.metrics.pairwise.pairwise_kernels`. If `kernel` is a string, it must be one of the metrics in `pairwise.PAIRWISE_KERNEL_FUNCTIONS` or "precomputed". If `kernel` is "precomputed", X is assumed to be a kernel matrix. Alternatively, if `kernel` is a callable function, it is called on each pair of instances (rows) and the resulting value recorded. The callable should take two rows from X as input and return the corresponding kernel value as a single number. This means that callables from :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on matrices, not single samples. Use the string identifying the kernel instead. gamma : float, default=None Gamma parameter for the RBF, laplacian, polynomial, exponential chi2 and sigmoid kernels. Interpretation of the default value is left to the kernel; see the documentation for sklearn.metrics.pairwise. Ignored by other kernels. degree : float, default=3 Degree of the polynomial kernel. Ignored by other kernels. coef0 : float, default=1 Zero coefficient for polynomial and sigmoid kernels. Ignored by other kernels. kernel_params : dict, default=None Additional parameters (keyword arguments) for kernel function passed as callable object. Attributes ---------- dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets) Representation of weight vector(s) in kernel space X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features) Training data, which is also required for prediction. If kernel == "precomputed" this is instead the precomputed training matrix, of shape (n_samples, n_samples). n_features_in_ : int Number of features seen during :term:`fit`. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- sklearn.gaussian_process.GaussianProcessRegressor : Gaussian Process regressor providing automatic kernel hyperparameters tuning and predictions uncertainty. sklearn.linear_model.Ridge : Linear ridge regression. sklearn.linear_model.RidgeCV : Ridge regression with built-in cross-validation. sklearn.svm.SVR : Support Vector Regression accepting a large variety of kernels. References ---------- * Kevin P. Murphy "Machine Learning: A Probabilistic Perspective", The MIT Press chapter 14.4.3, pp. 492-493 Examples -------- >>> from sklearn.kernel_ridge import KernelRidge >>> import numpy as np >>> n_samples, n_features = 10, 5 >>> rng = np.random.RandomState(0) >>> y = rng.randn(n_samples) >>> X = rng.randn(n_samples, n_features) >>> krr = KernelRidge(alpha=1.0) >>> krr.fit(X, y) KernelRidge(alpha=1.0) """ _parameter_constraints: dict = { "alpha": [Interval(Real, 0, None, closed="left"), "array-like"], "kernel": [ StrOptions(set(PAIRWISE_KERNEL_FUNCTIONS.keys()) | {"precomputed"}), callable, ], "gamma": [Interval(Real, 0, None, closed="left"), None], "degree": [Interval(Real, 0, None, closed="left")], "coef0": [Interval(Real, None, None, closed="neither")], "kernel_params": [dict, None], } def __init__( self, alpha=1, *, kernel="linear", gamma=None, degree=3, coef0=1, kernel_params=None, ): self.alpha = alpha self.kernel = kernel self.gamma = gamma self.degree = degree self.coef0 = coef0 self.kernel_params = kernel_params def _get_kernel(self, X, Y=None): if callable(self.kernel): params = self.kernel_params or {} else: params = {"gamma": self.gamma, "degree": self.degree, "coef0": self.coef0} return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params) def _more_tags(self): return {"pairwise": self.kernel == "precomputed"} @_fit_context(prefer_skip_nested_validation=True) def fit(self, X, y, sample_weight=None): """Fit Kernel Ridge regression model. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training data. If kernel == "precomputed" this is instead a precomputed kernel matrix, of shape (n_samples, n_samples). y : array-like of shape (n_samples,) or (n_samples, n_targets) Target values. sample_weight : float or array-like of shape (n_samples,), default=None Individual weights for each sample, ignored if None is passed. Returns ------- self : object Returns the instance itself. """ # Convert data X, y = self._validate_data( X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True ) if sample_weight is not None and not isinstance(sample_weight, float): sample_weight = _check_sample_weight(sample_weight, X) K = self._get_kernel(X) alpha = np.atleast_1d(self.alpha) ravel = False if len(y.shape) == 1: y = y.reshape(-1, 1) ravel = True copy = self.kernel == "precomputed" self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha, sample_weight, copy) if ravel: self.dual_coef_ = self.dual_coef_.ravel() self.X_fit_ = X return self def predict(self, X): """Predict using the kernel ridge model. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Samples. If kernel == "precomputed" this is instead a precomputed kernel matrix, shape = [n_samples, n_samples_fitted], where n_samples_fitted is the number of samples used in the fitting for this estimator. Returns ------- C : ndarray of shape (n_samples,) or (n_samples, n_targets) Returns predicted values. """ check_is_fitted(self) X = self._validate_data(X, accept_sparse=("csr", "csc"), reset=False) K = self._get_kernel(X, self.X_fit_) return np.dot(K, self.dual_coef_)