#===============================================================================
# Copyright 2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

import numpy as np
import warnings

from scipy import sparse as sp
from scipy.sparse import issparse, dok_matrix, lil_matrix
from scipy.sparse.base import spmatrix
from collections.abc import Sequence


class DataConversionWarning(UserWarning):
    """Warning used to notify implicit data conversions happening in the code."""


def _column_or_1d(y, warn=False):
    y = np.asarray(y)
    # TODO: Convert this kind of arrays to a table like in daal4py
    if not y.flags.aligned and not y.flags.writeable:
        y = np.array(y.tolist())
    shape = np.shape(y)
    if len(shape) == 1:
        return np.ravel(y)
    if len(shape) == 2 and shape[1] == 1:
        if warn:
            warnings.warn("A column-vector y was passed when a 1d array was"
                          " expected. Please change the shape of y to "
                          "(n_samples, ), for example using ravel().",
                          DataConversionWarning, stacklevel=2)
        return np.ravel(y)

    raise ValueError(
        "y should be a 1d array, "
        "got an array of shape {} instead.".format(shape))


def _compute_class_weight(class_weight, classes, y):
    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can "
                         "be in y")
    if class_weight is None or len(class_weight) == 0:
        weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
    elif class_weight == 'balanced':
        weight = None
    else:
        # user-defined dictionary
        weight = np.ones(classes.shape[0], dtype=np.float64, order='C')
        if not isinstance(class_weight, dict):
            raise ValueError("class_weight must be dict, 'balanced', or None,"
                             " got: %r" % class_weight)
        for c in class_weight:
            i = np.searchsorted(classes, c)
            if i >= len(classes) or classes[i] != c:
                raise ValueError("Class label {} not present.".format(c))
            weight[i] = class_weight[c]

    return weight


def _validate_targets(y, class_weight, dtype):
    y_ = _column_or_1d(y, warn=True)
    _check_classification_targets(y)
    classes, y = np.unique(y_, return_inverse=True)
    class_weight_res = _compute_class_weight(class_weight,
                                             classes=classes, y=y_)

    if len(classes) < 2:
        raise ValueError(
            "The number of classes has to be greater than one; got %d"
            " class" % len(classes))

    return np.asarray(y, dtype=dtype, order='C'), class_weight_res, classes


def _check_array(array, dtype="numeric", accept_sparse=False, order=None,
                 copy=False, force_all_finite=True, ensure_2d=True,
                 accept_large_sparse=True):
    # TODO
    from sklearn.utils.validation import check_array
    array = check_array(array=array, dtype=dtype,
                        accept_sparse=accept_sparse, order=order, copy=copy,
                        force_all_finite=force_all_finite,
                        ensure_2d=ensure_2d,
                        accept_large_sparse=accept_large_sparse)

    if sp.isspmatrix(array):
        return array

    # TODO: Convert this kind of arrays to a table like in daal4py
    if not array.flags.aligned and not array.flags.writeable:
        array = np.array(array.tolist())

    # TODO: If data is not contiguous, copy it to contiguous memory.
    # Needs an implemented numpy table in oneDAL.
    if not array.flags.c_contiguous and not array.flags.f_contiguous:
        array = np.ascontiguousarray(array, array.dtype)
    return array
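

# Illustrative usage sketch for the helpers above (not part of the module's
# public surface; the inputs below are hypothetical):
#
#     >>> import numpy as np
#     >>> _column_or_1d(np.array([[1], [2], [3]]), warn=True)  # warns, ravels
#     array([1, 2, 3])
#     >>> _compute_class_weight({0: 1.0, 1: 2.0},
#     ...                       classes=np.array([0, 1]), y=[0, 0, 1])
#     array([1., 2.])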


def _check_X_y(X, y, dtype="numeric", accept_sparse=False, order=None,
               copy=False, force_all_finite=True, ensure_2d=True,
               accept_large_sparse=True, y_numeric=False):
    if y is None:
        raise ValueError("y cannot be None")

    X = _check_array(X, accept_sparse=accept_sparse, dtype=dtype,
                     order=order, copy=copy,
                     force_all_finite=force_all_finite,
                     ensure_2d=ensure_2d,
                     accept_large_sparse=accept_large_sparse)

    y = _column_or_1d(y, warn=True)
    if y_numeric and y.dtype.kind == 'O':
        y = y.astype(np.float64)
    try:
        from daal4py.utils.validation import \
            _daal_assert_all_finite as assert_all_finite
    except ImportError:
        from sklearn.utils.validation import assert_all_finite
    assert_all_finite(y)

    lengths = [X.shape[0], y.shape[0]]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError("Found input variables with inconsistent numbers of"
                         " samples: %r" % [int(length) for length in lengths])

    return X, y


def _check_is_fitted(estimator, attributes=None, *, msg=None):
    if msg is None:
        msg = ("This %(name)s instance is not fitted yet. Call 'fit' with "
               "appropriate arguments before using this estimator.")

    if not hasattr(estimator, 'fit'):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if attributes is not None:
        if not isinstance(attributes, (list, tuple)):
            attributes = [attributes]
        attrs = all([hasattr(estimator, attr) for attr in attributes])
    else:
        attrs = [v for v in vars(estimator)
                 if v.endswith("_") and not v.startswith("__")]

    if not attrs:
        raise AttributeError(msg % {'name': type(estimator).__name__})


def _check_classification_targets(y):
    y_type = _type_of_target(y)
    if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
                      'multilabel-indicator', 'multilabel-sequences']:
        raise ValueError("Unknown label type: %r" % y_type)


def _type_of_target(y):
    valid = (isinstance(y, (Sequence, spmatrix)) or hasattr(y, '__array__')) \
        and not isinstance(y, str)

    if not valid:
        raise ValueError('Expected array-like (array or non-string sequence), '
                         'got %r' % y)

    sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray'])
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if _is_multilabel(y):
        return 'multilabel-indicator'

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    with warnings.catch_warnings():
        warnings.simplefilter('error', np.VisibleDeprecationWarning)
        try:
            y = np.asarray(y)
        except np.VisibleDeprecationWarning:
            # dtype=object should be provided explicitly for ragged arrays,
            # see NEP 34
            y = np.asarray(y, dtype=object)

    # The old sequence of sequences format
    try:
        if not hasattr(y[0], '__array__') and isinstance(y[0], Sequence) \
                and not isinstance(y[0], str):
            raise ValueError('You appear to be using a legacy multi-label data'
                             ' representation. Sequence of sequences are no'
                             ' longer supported; use a binary array or sparse'
                             ' matrix instead - the MultiLabelBinarizer'
                             ' transformer can convert to this format.')
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim > 2 or (y.dtype == object and len(y) and
                      not isinstance(y.flat[0], str)):
        return 'unknown'  # [[[1, 2]]] or [obj_1] and not ["label_1"]

    if y.ndim == 2 and y.shape[1] == 0:
        return 'unknown'  # [[]]

    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # check float and contains non-integer float values
    if y.dtype.kind == 'f' and np.any(y != y.astype(int)):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        # TODO: replace with the daal4py variant
        from sklearn.utils.validation import assert_all_finite
        assert_all_finite(y)
        return 'continuous' + suffix

    if (len(np.unique(y)) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
        return 'multiclass' + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]

    return 'binary'  # [1, 2] or [["a"], ["b"]]
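

# A quick sketch of how ``_type_of_target`` classifies labels (illustrative
# values, mirroring the inline examples in the comments above):
#
#     >>> _type_of_target([0, 1])
#     'binary'
#     >>> _type_of_target([1, 2, 3])
#     'multiclass'
#     >>> _type_of_target([0.5, 1.5, 2.5])
#     'continuous'
#     >>> _type_of_target([[1, 2], [3, 1]])
#     'multiclass-multioutput'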


def _is_integral_float(y):
    return y.dtype.kind == 'f' and np.all(y.astype(int) == y)


def _is_multilabel(y):
    if hasattr(y, '__array__') or isinstance(y, Sequence):
        # DeprecationWarning will be replaced by ValueError, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        with warnings.catch_warnings():
            warnings.simplefilter('error', np.VisibleDeprecationWarning)
            try:
                y = np.asarray(y)
            except np.VisibleDeprecationWarning:
                # dtype=object should be provided explicitly for ragged
                # arrays, see NEP 34
                y = np.array(y, dtype=object)

    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if isinstance(y, (dok_matrix, lil_matrix)):
            y = y.tocsr()
        return (len(y.data) == 0 or np.unique(y.data).size == 1 and
                (y.dtype.kind in 'biu' or  # bool, int, uint
                 _is_integral_float(np.unique(y.data))))

    labels = np.unique(y)
    return len(labels) < 3 and (y.dtype.kind in 'biu' or  # bool, int, uint
                                _is_integral_float(labels))


def _check_n_features(self, X, reset):
    try:
        n_features = _num_features(X)
    except TypeError as e:
        if not reset and hasattr(self, "n_features_in_"):
            raise ValueError(
                "X does not contain any features, but "
                f"{self.__class__.__name__} is expecting "
                f"{self.n_features_in_} features"
            ) from e
        # If the number of features is not defined and reset=True,
        # then we skip this check
        return

    if reset:
        self.n_features_in_ = n_features
        return

    if not hasattr(self, "n_features_in_"):
        # Skip this check if the expected number of input features
        # was not recorded by calling fit first. This is typically the case
        # for stateless transformers.
        return

    if n_features != self.n_features_in_:
        raise ValueError(
            f"X has {n_features} features, but {self.__class__.__name__} "
            f"is expecting {self.n_features_in_} features as input.")
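

# Sketch of the fit-time / predict-time contract enforced by
# ``_check_n_features``, plus the multilabel check above (``Stub`` is a
# hypothetical estimator-like object used only for illustration):
#
#     >>> _is_multilabel(np.array([[1, 0], [0, 1]]))  # binary indicator matrix
#     True
#     >>> _is_multilabel(np.array([[1, 2], [3, 1]]))  # more than two labels
#     False
#     >>> class Stub:
#     ...     pass
#     >>> est = Stub()
#     >>> _check_n_features(est, np.ones((10, 3)), reset=True)
#     >>> est.n_features_in_
#     3
#     >>> _check_n_features(est, np.ones((5, 4)), reset=False)
#     Traceback (most recent call last):
#     ...
#     ValueError: X has 4 features, but Stub is expecting 3 features as input.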


def _num_features(X):
    type_ = type(X)
    if type_.__module__ == "builtins":
        type_name = type_.__qualname__
    else:
        type_name = f"{type_.__module__}.{type_.__qualname__}"
    message = (
        "Unable to find the number of features from X of type "
        f"{type_name}"
    )
    if not hasattr(X, '__len__') and not hasattr(X, 'shape'):
        if not hasattr(X, '__array__'):
            raise TypeError(message)
        # Only convert X to a numpy array if there is no cheaper, heuristic
        # option.
        X = np.asarray(X)

    if hasattr(X, 'shape'):
        if not hasattr(X.shape, '__len__') or len(X.shape) <= 1:
            message += f" with shape {X.shape}"
            raise TypeError(message)
        return X.shape[1]

    first_sample = X[0]

    # Do not consider an array-like of strings or dicts to be a 2D array
    if isinstance(first_sample, (str, bytes, dict)):
        message += (f" where the samples are of type "
                    f"{type(first_sample).__qualname__}")
        raise TypeError(message)

    try:
        # If X is a list of lists, for instance, we assume that all nested
        # lists have the same length without checking or converting to
        # a numpy array to keep this function call as cheap as possible.
        return len(first_sample)
    except Exception as err:
        raise TypeError(message) from err
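

# Behavior sketch for ``_num_features`` (illustrative inputs):
#
#     >>> _num_features(np.ones((5, 2)))
#     2
#     >>> _num_features([[1, 2, 3], [4, 5, 6]])  # cheap path: len of first row
#     3
#     >>> _num_features(np.array([1, 2, 3]))     # 1-D input: raises TypeError
#     Traceback (most recent call last):
#     ...
#     TypeError: Unable to find the number of features from X of type numpy.ndarray with shape (3,)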