""" The :mod:`sklearn.utils` module includes various utilities. """ import math import numbers import platform import struct import timeit import warnings from collections.abc import Sequence from contextlib import contextmanager, suppress from itertools import compress, islice import numpy as np from scipy.sparse import issparse from .. import get_config from ..exceptions import DataConversionWarning from . import _joblib, metadata_routing from ._bunch import Bunch from ._estimator_html_repr import estimator_html_repr from ._param_validation import Integral, Interval, validate_params from .class_weight import compute_class_weight, compute_sample_weight from .deprecation import deprecated from .discovery import all_estimators from .fixes import parse_version, threadpool_info from .murmurhash import murmurhash3_32 from .validation import ( _is_arraylike_not_scalar, _is_pandas_df, _is_polars_df, _use_interchange_protocol, as_float_array, assert_all_finite, check_array, check_consistent_length, check_random_state, check_scalar, check_symmetric, check_X_y, column_or_1d, indexable, ) # Do not deprecate parallel_backend and register_parallel_backend as they are # needed to tune `scikit-learn` behavior and have different effect if called # from the vendored version or or the site-package version. The other are # utilities that are independent of scikit-learn so they are not part of # scikit-learn public API. parallel_backend = _joblib.parallel_backend register_parallel_backend = _joblib.register_parallel_backend __all__ = [ "murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", "check_random_state", "compute_class_weight", "compute_sample_weight", "column_or_1d", "check_consistent_length", "check_X_y", "check_scalar", "indexable", "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", "DataConversionWarning", "estimator_html_repr", "Bunch", "metadata_routing", ] IS_PYPY = platform.python_implementation() == "PyPy" _IS_32BIT = 8 * struct.calcsize("P") == 32 _IS_WASM = platform.machine() in ["wasm32", "wasm64"] def _in_unstable_openblas_configuration(): """Return True if in an unstable configuration for OpenBLAS""" # Import libraries which might load OpenBLAS. import numpy # noqa import scipy # noqa modules_info = threadpool_info() open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info) if not open_blas_used: return False # OpenBLAS 0.3.16 fixed instability for arm64, see: # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa openblas_arm64_stable_version = parse_version("0.3.16") for info in modules_info: if info["internal_api"] != "openblas": continue openblas_version = info.get("version") openblas_architecture = info.get("architecture") if openblas_version is None or openblas_architecture is None: # Cannot be sure that OpenBLAS is good enough. Assume unstable: return True if ( openblas_architecture == "neoversen1" and parse_version(openblas_version) < openblas_arm64_stable_version ): # See discussions in https://github.com/numpy/numpy/issues/19411 return True return False @validate_params( { "X": ["array-like", "sparse matrix"], "mask": ["array-like"], }, prefer_skip_nested_validation=True, ) def safe_mask(X, mask): """Return a mask which is safe to use on X. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : array-like Mask to be used on X. Returns ------- mask : ndarray Array that is safe to use on X. Examples -------- >>> from sklearn.utils import safe_mask >>> from scipy.sparse import csr_matrix >>> data = csr_matrix([[1], [2], [3], [4], [5]]) >>> condition = [False, True, True, False, True] >>> mask = safe_mask(data, condition) >>> data[mask].toarray() array([[2], [3], [5]]) """ mask = np.asarray(mask) if np.issubdtype(mask.dtype, np.signedinteger): return mask if hasattr(X, "toarray"): ind = np.arange(mask.shape[0]) mask = ind[mask] return mask def axis0_safe_slice(X, mask, len_mask): """Return a mask which is safer to use on X than safe_mask. This mask is safer than safe_mask since it returns an empty array, when a sparse matrix is sliced with a boolean mask with all False, instead of raising an unhelpful error in older versions of SciPy. See: https://github.com/scipy/scipy/issues/5361 Also note that we can avoid doing the dot product by checking if the len_mask is not zero in _huber_loss_and_gradient but this is not going to be the bottleneck, since the number of outliers and non_outliers are typically non-zero and it makes the code tougher to follow. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : ndarray Mask to be used on X. len_mask : int The length of the mask. Returns ------- mask : ndarray Array that is safe to use on X. """ if len_mask != 0: return X[safe_mask(X, mask), :] return np.zeros(shape=(0, X.shape[1])) def _array_indexing(array, key, key_dtype, axis): """Index an array or scipy.sparse consistently across NumPy version.""" if issparse(array) and key_dtype == "bool": key = np.asarray(key) if isinstance(key, tuple): key = list(key) return array[key, ...] if axis == 0 else array[:, key] def _pandas_indexing(X, key, key_dtype, axis): """Index a pandas dataframe or a series.""" if _is_arraylike_not_scalar(key): key = np.asarray(key) if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)): # using take() instead of iloc[] ensures the return value is a "proper" # copy that will not raise SettingWithCopyWarning return X.take(key, axis=axis) else: # check whether we should index with loc or iloc indexer = X.iloc if key_dtype == "int" else X.loc return indexer[:, key] if axis else indexer[key] def _list_indexing(X, key, key_dtype): """Index a Python list.""" if np.isscalar(key) or isinstance(key, slice): # key is a slice or a scalar return X[key] if key_dtype == "bool": # key is a boolean array-like return list(compress(X, key)) # key is a integer array-like of key return [X[idx] for idx in key] def _polars_indexing(X, key, key_dtype, axis): """Indexing X with polars interchange protocol.""" # Polars behavior is more consistent with lists if isinstance(key, np.ndarray): key = key.tolist() if axis == 1: return X[:, key] else: return X[key] def _determine_key_type(key, accept_slice=True): """Determine the data type of key. Parameters ---------- key : scalar, slice or array-like The key from which we want to infer the data type. accept_slice : bool, default=True Whether or not to raise an error if the key is a slice. Returns ------- dtype : {'int', 'str', 'bool', None} Returns the data type of key. """ err_msg = ( "No valid specification of the columns. Only a scalar, list or " "slice of all integers or all strings, or boolean mask is " "allowed" ) dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"} array_dtype_to_str = { "i": "int", "u": "int", "b": "bool", "O": "str", "U": "str", "S": "str", } if key is None: return None if isinstance(key, tuple(dtype_to_str.keys())): try: return dtype_to_str[type(key)] except KeyError: raise ValueError(err_msg) if isinstance(key, slice): if not accept_slice: raise TypeError( "Only array-like or scalar are supported. A Python slice was given." ) if key.start is None and key.stop is None: return None key_start_type = _determine_key_type(key.start) key_stop_type = _determine_key_type(key.stop) if key_start_type is not None and key_stop_type is not None: if key_start_type != key_stop_type: raise ValueError(err_msg) if key_start_type is not None: return key_start_type return key_stop_type if isinstance(key, (list, tuple)): unique_key = set(key) key_type = {_determine_key_type(elt) for elt in unique_key} if not key_type: return None if len(key_type) != 1: raise ValueError(err_msg) return key_type.pop() if hasattr(key, "dtype"): try: return array_dtype_to_str[key.dtype.kind] except KeyError: raise ValueError(err_msg) raise ValueError(err_msg) def _safe_indexing(X, indices, *, axis=0): """Return rows, items or columns of X using indices. .. warning:: This utility is documented, but **private**. This means that backward compatibility might be broken without any deprecation cycle. Parameters ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series Data from which to sample rows, items or columns. `list` are only supported when `axis=0`. indices : bool, int, str, slice, array-like - If `axis=0`, boolean and integer array-like, integer slice, and scalar integer are supported. - If `axis=1`: - to select a single column, `indices` can be of `int` type for all `X` types and `str` only for dataframe. The selected subset will be 1D, unless `X` is a sparse matrix in which case it will be 2D. - to select multiples columns, `indices` can be one of the following: `list`, `array`, `slice`. The type used in these containers can be one of the following: `int`, 'bool' and `str`. However, `str` is only supported when `X` is a dataframe. The selected subset will be 2D. axis : int, default=0 The axis along which `X` will be subsampled. `axis=0` will select rows while `axis=1` will select columns. Returns ------- subset Subset of X on axis 0 or 1. Notes ----- CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. Examples -------- >>> import numpy as np >>> from sklearn.utils import _safe_indexing >>> data = np.array([[1, 2], [3, 4], [5, 6]]) >>> _safe_indexing(data, 0, axis=0) # select the first row array([1, 2]) >>> _safe_indexing(data, 0, axis=1) # select the first column array([1, 3, 5]) """ if indices is None: return X if axis not in (0, 1): raise ValueError( "'axis' should be either 0 (to index rows) or 1 (to index " " column). Got {} instead.".format(axis) ) indices_dtype = _determine_key_type(indices) if axis == 0 and indices_dtype == "str": raise ValueError("String indexing is not supported with 'axis=0'") if axis == 1 and isinstance(X, list): raise ValueError("axis=1 is not supported for lists") if axis == 1 and hasattr(X, "ndim") and X.ndim != 2: raise ValueError( "'X' should be a 2D NumPy array, 2D sparse matrix or pandas " "dataframe when indexing the columns (i.e. 'axis=1'). " "Got {} instead with {} dimension(s).".format(type(X), X.ndim) ) if ( axis == 1 and indices_dtype == "str" and not (_is_pandas_df(X) or _use_interchange_protocol(X)) ): raise ValueError( "Specifying the columns using strings is only supported for dataframes." ) if hasattr(X, "iloc"): # TODO: we should probably use _is_pandas_df(X) instead but this would # require updating some tests such as test_train_test_split_mock_pandas. return _pandas_indexing(X, indices, indices_dtype, axis=axis) elif _is_polars_df(X): return _polars_indexing(X, indices, indices_dtype, axis=axis) elif hasattr(X, "shape"): return _array_indexing(X, indices, indices_dtype, axis=axis) else: return _list_indexing(X, indices, indices_dtype) def _safe_assign(X, values, *, row_indexer=None, column_indexer=None): """Safe assignment to a numpy array, sparse matrix, or pandas dataframe. Parameters ---------- X : {ndarray, sparse-matrix, dataframe} Array to be modified. It is expected to be 2-dimensional. values : ndarray The values to be assigned to `X`. row_indexer : array-like, dtype={int, bool}, default=None A 1-dimensional array to select the rows of interest. If `None`, all rows are selected. column_indexer : array-like, dtype={int, bool}, default=None A 1-dimensional array to select the columns of interest. If `None`, all columns are selected. """ row_indexer = slice(None, None, None) if row_indexer is None else row_indexer column_indexer = ( slice(None, None, None) if column_indexer is None else column_indexer ) if hasattr(X, "iloc"): # pandas dataframe with warnings.catch_warnings(): # pandas >= 1.5 raises a warning when using iloc to set values in a column # that does not have the same type as the column being set. It happens # for instance when setting a categorical column with a string. # In the future the behavior won't change and the warning should disappear. # TODO(1.3): check if the warning is still raised or remove the filter. warnings.simplefilter("ignore", FutureWarning) X.iloc[row_indexer, column_indexer] = values else: # numpy array or sparse matrix X[row_indexer, column_indexer] = values def _get_column_indices_for_bool_or_int(key, n_columns): # Convert key into list of positive integer indexes try: idx = _safe_indexing(np.arange(n_columns), key) except IndexError as e: raise ValueError( f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]" ) from e return np.atleast_1d(idx).tolist() def _get_column_indices(X, key): """Get feature column indices for input data X and key. For accepted values of `key`, see the docstring of :func:`_safe_indexing`. """ key_dtype = _determine_key_type(key) if _use_interchange_protocol(X): return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype) n_columns = X.shape[1] if isinstance(key, (list, tuple)) and not key: # we get an empty list return [] elif key_dtype in ("bool", "int"): return _get_column_indices_for_bool_or_int(key, n_columns) else: try: all_columns = X.columns except AttributeError: raise ValueError( "Specifying the columns using strings is only supported for dataframes." ) if isinstance(key, str): columns = [key] elif isinstance(key, slice): start, stop = key.start, key.stop if start is not None: start = all_columns.get_loc(start) if stop is not None: # pandas indexing with strings is endpoint included stop = all_columns.get_loc(stop) + 1 else: stop = n_columns + 1 return list(islice(range(n_columns), start, stop)) else: columns = list(key) try: column_indices = [] for col in columns: col_idx = all_columns.get_loc(col) if not isinstance(col_idx, numbers.Integral): raise ValueError( f"Selected columns, {columns}, are not unique in dataframe" ) column_indices.append(col_idx) except KeyError as e: raise ValueError("A given column is not a column of the dataframe") from e return column_indices def _get_column_indices_interchange(X_interchange, key, key_dtype): """Same as _get_column_indices but for X with __dataframe__ protocol.""" n_columns = X_interchange.num_columns() if isinstance(key, (list, tuple)) and not key: # we get an empty list return [] elif key_dtype in ("bool", "int"): return _get_column_indices_for_bool_or_int(key, n_columns) else: column_names = list(X_interchange.column_names()) if isinstance(key, slice): if key.step not in [1, None]: raise NotImplementedError("key.step must be 1 or None") start, stop = key.start, key.stop if start is not None: start = column_names.index(start) if stop is not None: stop = column_names.index(stop) + 1 else: stop = n_columns + 1 return list(islice(range(n_columns), start, stop)) selected_columns = [key] if np.isscalar(key) else key try: return [column_names.index(col) for col in selected_columns] except ValueError as e: raise ValueError("A given column is not a column of the dataframe") from e @validate_params( { "replace": ["boolean"], "n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None], "random_state": ["random_state"], "stratify": ["array-like", None], }, prefer_skip_nested_validation=True, ) def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None): """Resample arrays or sparse matrices in a consistent way. The default strategy implements one step of the bootstrapping procedure. Parameters ---------- *arrays : sequence of array-like of shape (n_samples,) or \ (n_samples, n_outputs) Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. replace : bool, default=True Implements resampling with replacement. If False, this will implement (sliced) random permutations. n_samples : int, default=None Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. If replace is False it should not be larger than the length of arrays. random_state : int, RandomState instance or None, default=None Determines random number generation for shuffling the data. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \ default=None If not None, data is split in a stratified fashion, using this as the class labels. Returns ------- resampled_arrays : sequence of array-like of shape (n_samples,) or \ (n_samples, n_outputs) Sequence of resampled copies of the collections. The original arrays are not impacted. See Also -------- shuffle : Shuffle arrays or sparse matrices in a consistent way. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> import numpy as np >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import resample >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0) >>> X array([[1., 0.], [2., 1.], [1., 0.]]) >>> X_sparse <3x2 sparse matrix of type '<... 'numpy.float64'>' with 4 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[1., 0.], [2., 1.], [1., 0.]]) >>> y array([0, 1, 0]) >>> resample(y, n_samples=2, random_state=0) array([0, 1]) Example using stratification:: >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1] >>> resample(y, n_samples=5, replace=False, stratify=y, ... random_state=0) [1, 1, 1, 0, 1] """ max_n_samples = n_samples random_state = check_random_state(random_state) if len(arrays) == 0: return None first = arrays[0] n_samples = first.shape[0] if hasattr(first, "shape") else len(first) if max_n_samples is None: max_n_samples = n_samples elif (max_n_samples > n_samples) and (not replace): raise ValueError( "Cannot sample %d out of arrays with dim %d when replace is False" % (max_n_samples, n_samples) ) check_consistent_length(*arrays) if stratify is None: if replace: indices = random_state.randint(0, n_samples, size=(max_n_samples,)) else: indices = np.arange(n_samples) random_state.shuffle(indices) indices = indices[:max_n_samples] else: # Code adapted from StratifiedShuffleSplit() y = check_array(stratify, ensure_2d=False, dtype=None) if y.ndim == 2: # for multi-label y, map each distinct row to a string repr # using join because str(row) uses an ellipsis if len(row) > 1000 y = np.array([" ".join(row.astype("str")) for row in y]) classes, y_indices = np.unique(y, return_inverse=True) n_classes = classes.shape[0] class_counts = np.bincount(y_indices) # Find the sorted list of instances for each class: # (np.unique above performs a sort, so code is O(n logn) already) class_indices = np.split( np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] ) n_i = _approximate_mode(class_counts, max_n_samples, random_state) indices = [] for i in range(n_classes): indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace) indices.extend(indices_i) indices = random_state.permutation(indices) # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] resampled_arrays = [_safe_indexing(a, indices) for a in arrays] if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] else: return resampled_arrays def shuffle(*arrays, random_state=None, n_samples=None): """Shuffle arrays or sparse matrices in a consistent way. This is a convenience alias to ``resample(*arrays, replace=False)`` to do random permutations of the collections. Parameters ---------- *arrays : sequence of indexable data-structures Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. random_state : int, RandomState instance or None, default=None Determines random number generation for shuffling the data. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. n_samples : int, default=None Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. It should not be larger than the length of arrays. Returns ------- shuffled_arrays : sequence of indexable data-structures Sequence of shuffled copies of the collections. The original arrays are not impacted. See Also -------- resample : Resample arrays or sparse matrices in a consistent way. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> import numpy as np >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import shuffle >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) >>> X array([[0., 0.], [2., 1.], [1., 0.]]) >>> X_sparse <3x2 sparse matrix of type '<... 'numpy.float64'>' with 3 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[0., 0.], [2., 1.], [1., 0.]]) >>> y array([2, 1, 0]) >>> shuffle(y, n_samples=2, random_state=0) array([0, 1]) """ return resample( *arrays, replace=False, n_samples=n_samples, random_state=random_state ) def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters ---------- X : {array-like, ndarray, sparse matrix} copy : bool, default=True Whether to create a copy of X and operate on it or to perform inplace computation (default behaviour). Returns ------- X ** 2 : element wise square Return the element-wise square of the input. Examples -------- >>> from sklearn.utils import safe_sqr >>> safe_sqr([1, 2, 3]) array([1, 4, 9]) """ X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False) if issparse(X): if copy: X = X.copy() X.data **= 2 else: if copy: X = X**2 else: X **= 2 return X def _chunk_generator(gen, chunksize): """Chunk generator, ``gen`` into lists of length ``chunksize``. The last chunk may have a length less than ``chunksize``.""" while True: chunk = list(islice(gen, chunksize)) if chunk: yield chunk else: return @validate_params( { "n": [Interval(numbers.Integral, 1, None, closed="left")], "batch_size": [Interval(numbers.Integral, 1, None, closed="left")], "min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")], }, prefer_skip_nested_validation=True, ) def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing `batch_size` elements from 0 to `n`. The last slice may contain less than `batch_size` elements, when `batch_size` does not divide `n`. Parameters ---------- n : int Size of the sequence. batch_size : int Number of elements in each batch. min_batch_size : int, default=0 Minimum number of elements in each batch. Yields ------ slice of `batch_size` elements See Also -------- gen_even_slices: Generator to create n_packs slices going up to n. Examples -------- >>> from sklearn.utils import gen_batches >>> list(gen_batches(7, 3)) [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] >>> list(gen_batches(6, 3)) [slice(0, 3, None), slice(3, 6, None)] >>> list(gen_batches(2, 3)) [slice(0, 2, None)] >>> list(gen_batches(7, 3, min_batch_size=0)) [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] >>> list(gen_batches(7, 3, min_batch_size=2)) [slice(0, 3, None), slice(3, 7, None)] """ start = 0 for _ in range(int(n // batch_size)): end = start + batch_size if end + min_batch_size > n: continue yield slice(start, end) start = end if start < n: yield slice(start, n) @validate_params( { "n": [Interval(Integral, 1, None, closed="left")], "n_packs": [Interval(Integral, 1, None, closed="left")], "n_samples": [Interval(Integral, 1, None, closed="left"), None], }, prefer_skip_nested_validation=True, ) def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create `n_packs` evenly spaced slices going up to `n`. If `n_packs` does not divide `n`, except for the first `n % n_packs` slices, remaining slices may contain fewer elements. Parameters ---------- n : int Size of the sequence. n_packs : int Number of slices to generate. n_samples : int, default=None Number of samples. Pass `n_samples` when the slices are to be used for sparse matrix indexing; slicing off-the-end raises an exception, while it works for NumPy arrays. Yields ------ `slice` representing a set of indices from 0 to n. See Also -------- gen_batches: Generator to create slices containing batch_size elements from 0 to n. Examples -------- >>> from sklearn.utils import gen_even_slices >>> list(gen_even_slices(10, 1)) [slice(0, 10, None)] >>> list(gen_even_slices(10, 10)) [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] >>> list(gen_even_slices(10, 5)) [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] >>> list(gen_even_slices(10, 3)) [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] """ start = 0 for pack_num in range(n_packs): this_n = n // n_packs if pack_num < n % n_packs: this_n += 1 if this_n > 0: end = start + this_n if n_samples is not None: end = min(n_samples, end) yield slice(start, end, None) start = end def tosequence(x): """Cast iterable x to a Sequence, avoiding a copy if possible. Parameters ---------- x : iterable The iterable to be converted. Returns ------- x : Sequence If `x` is a NumPy array, it returns it as a `ndarray`. If `x` is a `Sequence`, `x` is returned as-is. If `x` is from any other type, `x` is returned casted as a list. """ if isinstance(x, np.ndarray): return np.asarray(x) elif isinstance(x, Sequence): return x else: return list(x) def _to_object_array(sequence): """Convert sequence to a 1-D NumPy array of object dtype. numpy.array constructor has a similar use but it's output is ambiguous. It can be 1-D NumPy array of object dtype if the input is a ragged array, but if the input is a list of equal length arrays, then the output is a 2D numpy.array. _to_object_array solves this ambiguity by guarantying that the output is a 1-D NumPy array of objects for any input. Parameters ---------- sequence : array-like of shape (n_elements,) The sequence to be converted. Returns ------- out : ndarray of shape (n_elements,), dtype=object The converted sequence into a 1-D NumPy array of object dtype. Examples -------- >>> import numpy as np >>> from sklearn.utils import _to_object_array >>> _to_object_array([np.array([0]), np.array([1])]) array([array([0]), array([1])], dtype=object) >>> _to_object_array([np.array([0]), np.array([1, 2])]) array([array([0]), array([1, 2])], dtype=object) >>> _to_object_array([np.array([0]), np.array([1, 2])]) array([array([0]), array([1, 2])], dtype=object) """ out = np.empty(len(sequence), dtype=object) out[:] = sequence return out def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. Parameters ---------- indices : list-like List of integers treated as indices. mask_length : int Length of boolean mask to be generated. This parameter must be greater than max(indices). Returns ------- mask : 1d boolean nd-array Boolean array that is True where indices are present, else False. Examples -------- >>> from sklearn.utils import indices_to_mask >>> indices = [1, 2 , 3, 4] >>> indices_to_mask(indices, 5) array([False, True, True, True, True]) """ if mask_length <= np.max(indices): raise ValueError("mask_length must be greater than max(indices)") mask = np.zeros(mask_length, dtype=bool) mask[indices] = True return mask def _message_with_time(source, message, time): """Create one line message for logging purposes. Parameters ---------- source : str String indicating the source or the reference of the message. message : str Short message. time : int Time in seconds. """ start_message = "[%s] " % source # adapted from joblib.logger.short_format_time without the Windows -.1s # adjustment if time > 60: time_str = "%4.1fmin" % (time / 60) else: time_str = " %5.1fs" % time end_message = " %s, total=%s" % (message, time_str) dots_len = 70 - len(start_message) - len(end_message) return "%s%s%s" % (start_message, dots_len * ".", end_message) @contextmanager def _print_elapsed_time(source, message=None): """Log elapsed time to stdout when the context is exited. Parameters ---------- source : str String indicating the source or the reference of the message. message : str, default=None Short message. If None, nothing will be printed. Returns ------- context_manager Prints elapsed time upon exit if verbose. """ if message is None: yield else: start = timeit.default_timer() yield print(_message_with_time(source, message, timeit.default_timer() - start)) def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculate how many rows can be processed within `working_memory`. Parameters ---------- row_bytes : int The expected number of bytes of memory that will be consumed during the processing of each row. max_n_rows : int, default=None The maximum return value. working_memory : int or float, default=None The number of rows to fit inside this number of MiB will be returned. When None (default), the value of ``sklearn.get_config()['working_memory']`` is used. Returns ------- int The number of rows which can be processed within `working_memory`. Warns ----- Issues a UserWarning if `row_bytes exceeds `working_memory` MiB. """ if working_memory is None: working_memory = get_config()["working_memory"] chunk_n_rows = int(working_memory * (2**20) // row_bytes) if max_n_rows is not None: chunk_n_rows = min(chunk_n_rows, max_n_rows) if chunk_n_rows < 1: warnings.warn( "Could not adhere to working_memory config. " "Currently %.0fMiB, %.0fMiB required." % (working_memory, np.ceil(row_bytes * 2**-20)) ) chunk_n_rows = 1 return chunk_n_rows def _is_pandas_na(x): """Test if x is pandas.NA. We intentionally do not use this function to return `True` for `pd.NA` in `is_scalar_nan`, because estimators that support `pd.NA` are the exception rather than the rule at the moment. When `pd.NA` is more universally supported, we may reconsider this decision. Parameters ---------- x : any type Returns ------- boolean """ with suppress(ImportError): from pandas import NA return x is NA return False def is_scalar_nan(x): """Test if x is NaN. This function is meant to overcome the issue that np.isnan does not allow non-numerical types as input, and that np.nan is not float('nan'). Parameters ---------- x : any type Any scalar value. Returns ------- bool Returns true if x is NaN, and false otherwise. Examples -------- >>> import numpy as np >>> from sklearn.utils import is_scalar_nan >>> is_scalar_nan(np.nan) True >>> is_scalar_nan(float("nan")) True >>> is_scalar_nan(None) False >>> is_scalar_nan("") False >>> is_scalar_nan([np.nan]) False """ return ( not isinstance(x, numbers.Integral) and isinstance(x, numbers.Real) and math.isnan(x) ) def _approximate_mode(class_counts, n_draws, rng): """Computes approximate mode of multivariate hypergeometric. This is an approximation to the mode of the multivariate hypergeometric given by class_counts and n_draws. It shouldn't be off by more than one. It is the mostly likely outcome of drawing n_draws many samples from the population given by class_counts. Parameters ---------- class_counts : ndarray of int Population per class. n_draws : int Number of draws (samples to draw) from the overall population. rng : random state Used to break ties. Returns ------- sampled_classes : ndarray of int Number of samples drawn from each class. np.sum(sampled_classes) == n_draws Examples -------- >>> import numpy as np >>> from sklearn.utils import _approximate_mode >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0) array([2, 1]) >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0) array([3, 1]) >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), ... n_draws=2, rng=0) array([0, 1, 1, 0]) >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), ... n_draws=2, rng=42) array([1, 1, 0, 0]) """ rng = check_random_state(rng) # this computes a bad approximation to the mode of the # multivariate hypergeometric given by class_counts and n_draws continuous = class_counts / class_counts.sum() * n_draws # floored means we don't overshoot n_samples, but probably undershoot floored = np.floor(continuous) # we add samples according to how much "left over" probability # they had, until we arrive at n_samples need_to_add = int(n_draws - floored.sum()) if need_to_add > 0: remainder = continuous - floored values = np.sort(np.unique(remainder))[::-1] # add according to remainder, but break ties # randomly to avoid biases for value in values: (inds,) = np.where(remainder == value) # if we need_to_add less than what's in inds # we draw randomly from them. # if we need to add more, we add them all and # go to the next value add_now = min(len(inds), need_to_add) inds = rng.choice(inds, size=add_now, replace=False) floored[inds] += 1 need_to_add -= add_now if need_to_add == 0: break return floored.astype(int) def check_matplotlib_support(caller_name): """Raise ImportError with detailed error message if mpl is not installed. Plot utilities like any of the Display's plotting functions should lazily import matplotlib and call this helper before any computation. Parameters ---------- caller_name : str The name of the caller that requires matplotlib. """ try: import matplotlib # noqa except ImportError as e: raise ImportError( "{} requires matplotlib. You can install matplotlib with " "`pip install matplotlib`".format(caller_name) ) from e def check_pandas_support(caller_name): """Raise ImportError with detailed error message if pandas is not installed. Plot utilities like :func:`fetch_openml` should lazily import pandas and call this helper before any computation. Parameters ---------- caller_name : str The name of the caller that requires pandas. Returns ------- pandas The pandas package. """ try: import pandas # noqa return pandas except ImportError as e: raise ImportError("{} requires pandas.".format(caller_name)) from e