# Authors: Andreas Mueller # Guillaume Lemaitre # License: BSD 3 clause import warnings import numpy as np from ..base import BaseEstimator, RegressorMixin, _fit_context, clone from ..exceptions import NotFittedError from ..preprocessing import FunctionTransformer from ..utils import _safe_indexing, check_array from ..utils._param_validation import HasMethods from ..utils._tags import _safe_tags from ..utils.metadata_routing import ( _raise_for_unsupported_routing, _RoutingNotSupportedMixin, ) from ..utils.validation import check_is_fitted __all__ = ["TransformedTargetRegressor"] class TransformedTargetRegressor( _RoutingNotSupportedMixin, RegressorMixin, BaseEstimator ): """Meta-estimator to regress on a transformed target. Useful for applying a non-linear transformation to the target `y` in regression problems. This transformation can be given as a Transformer such as the :class:`~sklearn.preprocessing.QuantileTransformer` or as a function and its inverse such as `np.log` and `np.exp`. The computation during :meth:`fit` is:: regressor.fit(X, func(y)) or:: regressor.fit(X, transformer.transform(y)) The computation during :meth:`predict` is:: inverse_func(regressor.predict(X)) or:: transformer.inverse_transform(regressor.predict(X)) Read more in the :ref:`User Guide `. .. versionadded:: 0.20 Parameters ---------- regressor : object, default=None Regressor object such as derived from :class:`~sklearn.base.RegressorMixin`. This regressor will automatically be cloned each time prior to fitting. If `regressor is None`, :class:`~sklearn.linear_model.LinearRegression` is created and used. transformer : object, default=None Estimator object such as derived from :class:`~sklearn.base.TransformerMixin`. Cannot be set at the same time as `func` and `inverse_func`. If `transformer is None` as well as `func` and `inverse_func`, the transformer will be an identity transformer. Note that the transformer will be cloned during fitting. Also, the transformer is restricting `y` to be a numpy array. func : function, default=None Function to apply to `y` before passing to :meth:`fit`. Cannot be set at the same time as `transformer`. The function needs to return a 2-dimensional array. If `func is None`, the function used will be the identity function. inverse_func : function, default=None Function to apply to the prediction of the regressor. Cannot be set at the same time as `transformer`. The function needs to return a 2-dimensional array. The inverse function is used to return predictions to the same space of the original training labels. check_inverse : bool, default=True Whether to check that `transform` followed by `inverse_transform` or `func` followed by `inverse_func` leads to the original targets. Attributes ---------- regressor_ : object Fitted regressor. transformer_ : object Transformer used in :meth:`fit` and :meth:`predict`. n_features_in_ : int Number of features seen during :term:`fit`. Only defined if the underlying regressor exposes such an attribute when fit. .. versionadded:: 0.24 feature_names_in_ : ndarray of shape (`n_features_in_`,) Names of features seen during :term:`fit`. Defined only when `X` has feature names that are all strings. .. versionadded:: 1.0 See Also -------- sklearn.preprocessing.FunctionTransformer : Construct a transformer from an arbitrary callable. Notes ----- Internally, the target `y` is always converted into a 2-dimensional array to be used by scikit-learn transformers. At the time of prediction, the output will be reshaped to a have the same number of dimensions as `y`. Examples -------- >>> import numpy as np >>> from sklearn.linear_model import LinearRegression >>> from sklearn.compose import TransformedTargetRegressor >>> tt = TransformedTargetRegressor(regressor=LinearRegression(), ... func=np.log, inverse_func=np.exp) >>> X = np.arange(4).reshape(-1, 1) >>> y = np.exp(2 * X).ravel() >>> tt.fit(X, y) TransformedTargetRegressor(...) >>> tt.score(X, y) 1.0 >>> tt.regressor_.coef_ array([2.]) For a more detailed example use case refer to :ref:`sphx_glr_auto_examples_compose_plot_transformed_target.py`. """ _parameter_constraints: dict = { "regressor": [HasMethods(["fit", "predict"]), None], "transformer": [HasMethods("transform"), None], "func": [callable, None], "inverse_func": [callable, None], "check_inverse": ["boolean"], } def __init__( self, regressor=None, *, transformer=None, func=None, inverse_func=None, check_inverse=True, ): self.regressor = regressor self.transformer = transformer self.func = func self.inverse_func = inverse_func self.check_inverse = check_inverse def _fit_transformer(self, y): """Check transformer and fit transformer. Create the default transformer, fit it and make additional inverse check on a subset (optional). """ if self.transformer is not None and ( self.func is not None or self.inverse_func is not None ): raise ValueError( "'transformer' and functions 'func'/'inverse_func' cannot both be set." ) elif self.transformer is not None: self.transformer_ = clone(self.transformer) else: if self.func is not None and self.inverse_func is None: raise ValueError( "When 'func' is provided, 'inverse_func' must also be provided" ) self.transformer_ = FunctionTransformer( func=self.func, inverse_func=self.inverse_func, validate=True, check_inverse=self.check_inverse, ) # XXX: sample_weight is not currently passed to the # transformer. However, if transformer starts using sample_weight, the # code should be modified accordingly. At the time to consider the # sample_prop feature, it is also a good use case to be considered. self.transformer_.fit(y) if self.check_inverse: idx_selected = slice(None, None, max(1, y.shape[0] // 10)) y_sel = _safe_indexing(y, idx_selected) y_sel_t = self.transformer_.transform(y_sel) if not np.allclose(y_sel, self.transformer_.inverse_transform(y_sel_t)): warnings.warn( ( "The provided functions or transformer are" " not strictly inverse of each other. If" " you are sure you want to proceed regardless" ", set 'check_inverse=False'" ), UserWarning, ) @_fit_context( # TransformedTargetRegressor.regressor/transformer are not validated yet. prefer_skip_nested_validation=False ) def fit(self, X, y, **fit_params): """Fit the model according to the given training data. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training vector, where `n_samples` is the number of samples and `n_features` is the number of features. y : array-like of shape (n_samples,) Target values. **fit_params : dict Parameters passed to the `fit` method of the underlying regressor. Returns ------- self : object Fitted estimator. """ _raise_for_unsupported_routing(self, "fit", **fit_params) if y is None: raise ValueError( f"This {self.__class__.__name__} estimator " "requires y to be passed, but the target y is None." ) y = check_array( y, input_name="y", accept_sparse=False, force_all_finite=True, ensure_2d=False, dtype="numeric", allow_nd=True, ) # store the number of dimension of the target to predict an array of # similar shape at predict self._training_dim = y.ndim # transformers are designed to modify X which is 2d dimensional, we # need to modify y accordingly. if y.ndim == 1: y_2d = y.reshape(-1, 1) else: y_2d = y self._fit_transformer(y_2d) # transform y and convert back to 1d array if needed y_trans = self.transformer_.transform(y_2d) # FIXME: a FunctionTransformer can return a 1D array even when validate # is set to True. Therefore, we need to check the number of dimension # first. if y_trans.ndim == 2 and y_trans.shape[1] == 1: y_trans = y_trans.squeeze(axis=1) if self.regressor is None: from ..linear_model import LinearRegression self.regressor_ = LinearRegression() else: self.regressor_ = clone(self.regressor) self.regressor_.fit(X, y_trans, **fit_params) if hasattr(self.regressor_, "feature_names_in_"): self.feature_names_in_ = self.regressor_.feature_names_in_ return self def predict(self, X, **predict_params): """Predict using the base regressor, applying inverse. The regressor is used to predict and the `inverse_func` or `inverse_transform` is applied before returning the prediction. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Samples. **predict_params : dict of str -> object Parameters passed to the `predict` method of the underlying regressor. Returns ------- y_hat : ndarray of shape (n_samples,) Predicted values. """ check_is_fitted(self) pred = self.regressor_.predict(X, **predict_params) if pred.ndim == 1: pred_trans = self.transformer_.inverse_transform(pred.reshape(-1, 1)) else: pred_trans = self.transformer_.inverse_transform(pred) if ( self._training_dim == 1 and pred_trans.ndim == 2 and pred_trans.shape[1] == 1 ): pred_trans = pred_trans.squeeze(axis=1) return pred_trans def _more_tags(self): regressor = self.regressor if regressor is None: from ..linear_model import LinearRegression regressor = LinearRegression() return { "poor_score": True, "multioutput": _safe_tags(regressor, key="multioutput"), } @property def n_features_in_(self): """Number of features seen during :term:`fit`.""" # For consistency with other estimators we raise a AttributeError so # that hasattr() returns False the estimator isn't fitted. try: check_is_fitted(self) except NotFittedError as nfe: raise AttributeError( "{} object has no n_features_in_ attribute.".format( self.__class__.__name__ ) ) from nfe return self.regressor_.n_features_in_