""" The :mod:`sklearn.utils` module includes various utilities. """ import pkgutil import inspect from importlib import import_module from operator import itemgetter from collections.abc import Sequence from contextlib import contextmanager from itertools import compress from itertools import islice import math import numbers import platform import struct import timeit from pathlib import Path import warnings import numpy as np from scipy.sparse import issparse from .murmurhash import murmurhash3_32 from .class_weight import compute_class_weight, compute_sample_weight from . import _joblib from ..exceptions import DataConversionWarning from .deprecation import deprecated from .fixes import np_version, parse_version from ._estimator_html_repr import estimator_html_repr from .validation import ( as_float_array, assert_all_finite, check_random_state, column_or_1d, check_array, check_consistent_length, check_X_y, indexable, check_symmetric, check_scalar, ) from .. import get_config # Do not deprecate parallel_backend and register_parallel_backend as they are # needed to tune `scikit-learn` behavior and have different effect if called # from the vendored version or or the site-package version. The other are # utilities that are independent of scikit-learn so they are not part of # scikit-learn public API. parallel_backend = _joblib.parallel_backend register_parallel_backend = _joblib.register_parallel_backend __all__ = [ "murmurhash3_32", "as_float_array", "assert_all_finite", "check_array", "check_random_state", "compute_class_weight", "compute_sample_weight", "column_or_1d", "check_consistent_length", "check_X_y", "check_scalar", "indexable", "check_symmetric", "indices_to_mask", "deprecated", "parallel_backend", "register_parallel_backend", "resample", "shuffle", "check_matplotlib_support", "all_estimators", "DataConversionWarning", "estimator_html_repr", ] IS_PYPY = platform.python_implementation() == "PyPy" _IS_32BIT = 8 * struct.calcsize("P") == 32 class Bunch(dict): """Container object exposing keys as attributes. Bunch objects are sometimes used as an output for functions and methods. They extend dictionaries by enabling values to be accessed by key, `bunch["value_key"]`, or by an attribute, `bunch.value_key`. Examples -------- >>> from sklearn.utils import Bunch >>> b = Bunch(a=1, b=2) >>> b['b'] 2 >>> b.b 2 >>> b.a = 3 >>> b['a'] 3 >>> b.c = 6 >>> b['c'] 6 """ def __init__(self, **kwargs): super().__init__(kwargs) def __setattr__(self, key, value): self[key] = value def __dir__(self): return self.keys() def __getattr__(self, key): try: return self[key] except KeyError: raise AttributeError(key) def __setstate__(self, state): # Bunch pickles generated with scikit-learn 0.16.* have an non # empty __dict__. This causes a surprising behaviour when # loading these pickles scikit-learn 0.17: reading bunch.key # uses __dict__ but assigning to bunch.key use __setattr__ and # only changes bunch['key']. More details can be found at: # https://github.com/scikit-learn/scikit-learn/issues/6196. # Overriding __setstate__ to be a noop has the effect of # ignoring the pickled __dict__ pass def safe_mask(X, mask): """Return a mask which is safe to use on X. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : ndarray Mask to be used on X. Returns ------- mask """ mask = np.asarray(mask) if np.issubdtype(mask.dtype, np.signedinteger): return mask if hasattr(X, "toarray"): ind = np.arange(mask.shape[0]) mask = ind[mask] return mask def axis0_safe_slice(X, mask, len_mask): """ This mask is safer than safe_mask since it returns an empty array, when a sparse matrix is sliced with a boolean mask with all False, instead of raising an unhelpful error in older versions of SciPy. See: https://github.com/scipy/scipy/issues/5361 Also note that we can avoid doing the dot product by checking if the len_mask is not zero in _huber_loss_and_gradient but this is not going to be the bottleneck, since the number of outliers and non_outliers are typically non-zero and it makes the code tougher to follow. Parameters ---------- X : {array-like, sparse matrix} Data on which to apply mask. mask : ndarray Mask to be used on X. len_mask : int The length of the mask. Returns ------- mask """ if len_mask != 0: return X[safe_mask(X, mask), :] return np.zeros(shape=(0, X.shape[1])) def _array_indexing(array, key, key_dtype, axis): """Index an array or scipy.sparse consistently across NumPy version.""" if np_version < parse_version("1.12") or issparse(array): # FIXME: Remove the check for NumPy when using >= 1.12 # check if we have an boolean array-likes to make the proper indexing if key_dtype == "bool": key = np.asarray(key) if isinstance(key, tuple): key = list(key) return array[key] if axis == 0 else array[:, key] def _pandas_indexing(X, key, key_dtype, axis): """Index a pandas dataframe or a series.""" if hasattr(key, "shape"): # Work-around for indexing with read-only key in pandas # FIXME: solved in pandas 0.25 key = np.asarray(key) key = key if key.flags.writeable else key.copy() elif isinstance(key, tuple): key = list(key) if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)): # using take() instead of iloc[] ensures the return value is a "proper" # copy that will not raise SettingWithCopyWarning return X.take(key, axis=axis) else: # check whether we should index with loc or iloc indexer = X.iloc if key_dtype == "int" else X.loc return indexer[:, key] if axis else indexer[key] def _list_indexing(X, key, key_dtype): """Index a Python list.""" if np.isscalar(key) or isinstance(key, slice): # key is a slice or a scalar return X[key] if key_dtype == "bool": # key is a boolean array-like return list(compress(X, key)) # key is a integer array-like of key return [X[idx] for idx in key] def _determine_key_type(key, accept_slice=True): """Determine the data type of key. Parameters ---------- key : scalar, slice or array-like The key from which we want to infer the data type. accept_slice : bool, default=True Whether or not to raise an error if the key is a slice. Returns ------- dtype : {'int', 'str', 'bool', None} Returns the data type of key. """ err_msg = ( "No valid specification of the columns. Only a scalar, list or " "slice of all integers or all strings, or boolean mask is " "allowed" ) dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"} array_dtype_to_str = { "i": "int", "u": "int", "b": "bool", "O": "str", "U": "str", "S": "str", } if key is None: return None if isinstance(key, tuple(dtype_to_str.keys())): try: return dtype_to_str[type(key)] except KeyError: raise ValueError(err_msg) if isinstance(key, slice): if not accept_slice: raise TypeError( "Only array-like or scalar are supported. A Python slice was given." ) if key.start is None and key.stop is None: return None key_start_type = _determine_key_type(key.start) key_stop_type = _determine_key_type(key.stop) if key_start_type is not None and key_stop_type is not None: if key_start_type != key_stop_type: raise ValueError(err_msg) if key_start_type is not None: return key_start_type return key_stop_type if isinstance(key, (list, tuple)): unique_key = set(key) key_type = {_determine_key_type(elt) for elt in unique_key} if not key_type: return None if len(key_type) != 1: raise ValueError(err_msg) return key_type.pop() if hasattr(key, "dtype"): try: return array_dtype_to_str[key.dtype.kind] except KeyError: raise ValueError(err_msg) raise ValueError(err_msg) def _safe_indexing(X, indices, *, axis=0): """Return rows, items or columns of X using indices. .. warning:: This utility is documented, but **private**. This means that backward compatibility might be broken without any deprecation cycle. Parameters ---------- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series Data from which to sample rows, items or columns. `list` are only supported when `axis=0`. indices : bool, int, str, slice, array-like - If `axis=0`, boolean and integer array-like, integer slice, and scalar integer are supported. - If `axis=1`: - to select a single column, `indices` can be of `int` type for all `X` types and `str` only for dataframe. The selected subset will be 1D, unless `X` is a sparse matrix in which case it will be 2D. - to select multiples columns, `indices` can be one of the following: `list`, `array`, `slice`. The type used in these containers can be one of the following: `int`, 'bool' and `str`. However, `str` is only supported when `X` is a dataframe. The selected subset will be 2D. axis : int, default=0 The axis along which `X` will be subsampled. `axis=0` will select rows while `axis=1` will select columns. Returns ------- subset Subset of X on axis 0 or 1. Notes ----- CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are not supported. """ if indices is None: return X if axis not in (0, 1): raise ValueError( "'axis' should be either 0 (to index rows) or 1 (to index " " column). Got {} instead.".format(axis) ) indices_dtype = _determine_key_type(indices) if axis == 0 and indices_dtype == "str": raise ValueError("String indexing is not supported with 'axis=0'") if axis == 1 and X.ndim != 2: raise ValueError( "'X' should be a 2D NumPy array, 2D sparse matrix or pandas " "dataframe when indexing the columns (i.e. 'axis=1'). " "Got {} instead with {} dimension(s).".format(type(X), X.ndim) ) if axis == 1 and indices_dtype == "str" and not hasattr(X, "loc"): raise ValueError( "Specifying the columns using strings is only supported for " "pandas DataFrames" ) if hasattr(X, "iloc"): return _pandas_indexing(X, indices, indices_dtype, axis=axis) elif hasattr(X, "shape"): return _array_indexing(X, indices, indices_dtype, axis=axis) else: return _list_indexing(X, indices, indices_dtype) def _get_column_indices(X, key): """Get feature column indices for input data X and key. For accepted values of `key`, see the docstring of :func:`_safe_indexing_column`. """ n_columns = X.shape[1] key_dtype = _determine_key_type(key) if isinstance(key, (list, tuple)) and not key: # we get an empty list return [] elif key_dtype in ("bool", "int"): # Convert key into positive indexes try: idx = _safe_indexing(np.arange(n_columns), key) except IndexError as e: raise ValueError( "all features must be in [0, {}] or [-{}, 0]".format( n_columns - 1, n_columns ) ) from e return np.atleast_1d(idx).tolist() elif key_dtype == "str": try: all_columns = X.columns except AttributeError: raise ValueError( "Specifying the columns using strings is only " "supported for pandas DataFrames" ) if isinstance(key, str): columns = [key] elif isinstance(key, slice): start, stop = key.start, key.stop if start is not None: start = all_columns.get_loc(start) if stop is not None: # pandas indexing with strings is endpoint included stop = all_columns.get_loc(stop) + 1 else: stop = n_columns + 1 return list(range(n_columns)[slice(start, stop)]) else: columns = list(key) try: column_indices = [] for col in columns: col_idx = all_columns.get_loc(col) if not isinstance(col_idx, numbers.Integral): raise ValueError( f"Selected columns, {columns}, are not unique in dataframe" ) column_indices.append(col_idx) except KeyError as e: raise ValueError("A given column is not a column of the dataframe") from e return column_indices else: raise ValueError( "No valid specification of the columns. Only a " "scalar, list or slice of all integers or all " "strings, or boolean mask is allowed" ) def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None): """Resample arrays or sparse matrices in a consistent way. The default strategy implements one step of the bootstrapping procedure. Parameters ---------- *arrays : sequence of array-like of shape (n_samples,) or \ (n_samples, n_outputs) Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. replace : bool, default=True Implements resampling with replacement. If False, this will implement (sliced) random permutations. n_samples : int, default=None Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. If replace is False it should not be larger than the length of arrays. random_state : int, RandomState instance or None, default=None Determines random number generation for shuffling the data. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \ default=None If not None, data is split in a stratified fashion, using this as the class labels. Returns ------- resampled_arrays : sequence of array-like of shape (n_samples,) or \ (n_samples, n_outputs) Sequence of resampled copies of the collections. The original arrays are not impacted. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> import numpy as np >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import resample >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0) >>> X array([[1., 0.], [2., 1.], [1., 0.]]) >>> X_sparse <3x2 sparse matrix of type '<... 'numpy.float64'>' with 4 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[1., 0.], [2., 1.], [1., 0.]]) >>> y array([0, 1, 0]) >>> resample(y, n_samples=2, random_state=0) array([0, 1]) Example using stratification:: >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1] >>> resample(y, n_samples=5, replace=False, stratify=y, ... random_state=0) [1, 1, 1, 0, 1] See Also -------- shuffle """ max_n_samples = n_samples random_state = check_random_state(random_state) if len(arrays) == 0: return None first = arrays[0] n_samples = first.shape[0] if hasattr(first, "shape") else len(first) if max_n_samples is None: max_n_samples = n_samples elif (max_n_samples > n_samples) and (not replace): raise ValueError( "Cannot sample %d out of arrays with dim %d when replace is False" % (max_n_samples, n_samples) ) check_consistent_length(*arrays) if stratify is None: if replace: indices = random_state.randint(0, n_samples, size=(max_n_samples,)) else: indices = np.arange(n_samples) random_state.shuffle(indices) indices = indices[:max_n_samples] else: # Code adapted from StratifiedShuffleSplit() y = check_array(stratify, ensure_2d=False, dtype=None) if y.ndim == 2: # for multi-label y, map each distinct row to a string repr # using join because str(row) uses an ellipsis if len(row) > 1000 y = np.array([" ".join(row.astype("str")) for row in y]) classes, y_indices = np.unique(y, return_inverse=True) n_classes = classes.shape[0] class_counts = np.bincount(y_indices) # Find the sorted list of instances for each class: # (np.unique above performs a sort, so code is O(n logn) already) class_indices = np.split( np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1] ) n_i = _approximate_mode(class_counts, max_n_samples, random_state) indices = [] for i in range(n_classes): indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace) indices.extend(indices_i) indices = random_state.permutation(indices) # convert sparse matrices to CSR for row-based indexing arrays = [a.tocsr() if issparse(a) else a for a in arrays] resampled_arrays = [_safe_indexing(a, indices) for a in arrays] if len(resampled_arrays) == 1: # syntactic sugar for the unit argument case return resampled_arrays[0] else: return resampled_arrays def shuffle(*arrays, random_state=None, n_samples=None): """Shuffle arrays or sparse matrices in a consistent way. This is a convenience alias to ``resample(*arrays, replace=False)`` to do random permutations of the collections. Parameters ---------- *arrays : sequence of indexable data-structures Indexable data-structures can be arrays, lists, dataframes or scipy sparse matrices with consistent first dimension. random_state : int, RandomState instance or None, default=None Determines random number generation for shuffling the data. Pass an int for reproducible results across multiple function calls. See :term:`Glossary `. n_samples : int, default=None Number of samples to generate. If left to None this is automatically set to the first dimension of the arrays. It should not be larger than the length of arrays. Returns ------- shuffled_arrays : sequence of indexable data-structures Sequence of shuffled copies of the collections. The original arrays are not impacted. Examples -------- It is possible to mix sparse and dense arrays in the same run:: >>> import numpy as np >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]]) >>> y = np.array([0, 1, 2]) >>> from scipy.sparse import coo_matrix >>> X_sparse = coo_matrix(X) >>> from sklearn.utils import shuffle >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0) >>> X array([[0., 0.], [2., 1.], [1., 0.]]) >>> X_sparse <3x2 sparse matrix of type '<... 'numpy.float64'>' with 3 stored elements in Compressed Sparse Row format> >>> X_sparse.toarray() array([[0., 0.], [2., 1.], [1., 0.]]) >>> y array([2, 1, 0]) >>> shuffle(y, n_samples=2, random_state=0) array([0, 1]) See Also -------- resample """ return resample( *arrays, replace=False, n_samples=n_samples, random_state=random_state ) def safe_sqr(X, *, copy=True): """Element wise squaring of array-likes and sparse matrices. Parameters ---------- X : {array-like, ndarray, sparse matrix} copy : bool, default=True Whether to create a copy of X and operate on it or to perform inplace computation (default behaviour). Returns ------- X ** 2 : element wise square """ X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False) if issparse(X): if copy: X = X.copy() X.data **= 2 else: if copy: X = X ** 2 else: X **= 2 return X def _chunk_generator(gen, chunksize): """Chunk generator, ``gen`` into lists of length ``chunksize``. The last chunk may have a length less than ``chunksize``.""" while True: chunk = list(islice(gen, chunksize)) if chunk: yield chunk else: return def gen_batches(n, batch_size, *, min_batch_size=0): """Generator to create slices containing batch_size elements, from 0 to n. The last slice may contain less than batch_size elements, when batch_size does not divide n. Parameters ---------- n : int batch_size : int Number of element in each batch. min_batch_size : int, default=0 Minimum batch size to produce. Yields ------ slice of batch_size elements See Also -------- gen_even_slices: Generator to create n_packs slices going up to n. Examples -------- >>> from sklearn.utils import gen_batches >>> list(gen_batches(7, 3)) [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] >>> list(gen_batches(6, 3)) [slice(0, 3, None), slice(3, 6, None)] >>> list(gen_batches(2, 3)) [slice(0, 2, None)] >>> list(gen_batches(7, 3, min_batch_size=0)) [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] >>> list(gen_batches(7, 3, min_batch_size=2)) [slice(0, 3, None), slice(3, 7, None)] """ if not isinstance(batch_size, numbers.Integral): raise TypeError( "gen_batches got batch_size=%s, must be an integer" % batch_size ) if batch_size <= 0: raise ValueError("gen_batches got batch_size=%s, must be positive" % batch_size) start = 0 for _ in range(int(n // batch_size)): end = start + batch_size if end + min_batch_size > n: continue yield slice(start, end) start = end if start < n: yield slice(start, n) def gen_even_slices(n, n_packs, *, n_samples=None): """Generator to create n_packs slices going up to n. Parameters ---------- n : int n_packs : int Number of slices to generate. n_samples : int, default=None Number of samples. Pass n_samples when the slices are to be used for sparse matrix indexing; slicing off-the-end raises an exception, while it works for NumPy arrays. Yields ------ slice See Also -------- gen_batches: Generator to create slices containing batch_size elements from 0 to n. Examples -------- >>> from sklearn.utils import gen_even_slices >>> list(gen_even_slices(10, 1)) [slice(0, 10, None)] >>> list(gen_even_slices(10, 10)) [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)] >>> list(gen_even_slices(10, 5)) [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)] >>> list(gen_even_slices(10, 3)) [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)] """ start = 0 if n_packs < 1: raise ValueError("gen_even_slices got n_packs=%s, must be >=1" % n_packs) for pack_num in range(n_packs): this_n = n // n_packs if pack_num < n % n_packs: this_n += 1 if this_n > 0: end = start + this_n if n_samples is not None: end = min(n_samples, end) yield slice(start, end, None) start = end def tosequence(x): """Cast iterable x to a Sequence, avoiding a copy if possible. Parameters ---------- x : iterable """ if isinstance(x, np.ndarray): return np.asarray(x) elif isinstance(x, Sequence): return x else: return list(x) def _to_object_array(sequence): """Convert sequence to a 1-D NumPy array of object dtype. numpy.array constructor has a similar use but it's output is ambiguous. It can be 1-D NumPy array of object dtype if the input is a ragged array, but if the input is a list of equal length arrays, then the output is a 2D numpy.array. _to_object_array solves this ambiguity by guarantying that the output is a 1-D NumPy array of objects for any input. Parameters ---------- sequence : array-like of shape (n_elements,) The sequence to be converted. Returns ------- out : ndarray of shape (n_elements,), dtype=object The converted sequence into a 1-D NumPy array of object dtype. Examples -------- >>> import numpy as np >>> from sklearn.utils import _to_object_array >>> _to_object_array([np.array([0]), np.array([1])]) array([array([0]), array([1])], dtype=object) >>> _to_object_array([np.array([0]), np.array([1, 2])]) array([array([0]), array([1, 2])], dtype=object) >>> _to_object_array([np.array([0]), np.array([1, 2])]) array([array([0]), array([1, 2])], dtype=object) """ out = np.empty(len(sequence), dtype=object) out[:] = sequence return out def indices_to_mask(indices, mask_length): """Convert list of indices to boolean mask. Parameters ---------- indices : list-like List of integers treated as indices. mask_length : int Length of boolean mask to be generated. This parameter must be greater than max(indices). Returns ------- mask : 1d boolean nd-array Boolean array that is True where indices are present, else False. Examples -------- >>> from sklearn.utils import indices_to_mask >>> indices = [1, 2 , 3, 4] >>> indices_to_mask(indices, 5) array([False, True, True, True, True]) """ if mask_length <= np.max(indices): raise ValueError("mask_length must be greater than max(indices)") mask = np.zeros(mask_length, dtype=bool) mask[indices] = True return mask def _message_with_time(source, message, time): """Create one line message for logging purposes. Parameters ---------- source : str String indicating the source or the reference of the message. message : str Short message. time : int Time in seconds. """ start_message = "[%s] " % source # adapted from joblib.logger.short_format_time without the Windows -.1s # adjustment if time > 60: time_str = "%4.1fmin" % (time / 60) else: time_str = " %5.1fs" % time end_message = " %s, total=%s" % (message, time_str) dots_len = 70 - len(start_message) - len(end_message) return "%s%s%s" % (start_message, dots_len * ".", end_message) @contextmanager def _print_elapsed_time(source, message=None): """Log elapsed time to stdout when the context is exited. Parameters ---------- source : str String indicating the source or the reference of the message. message : str, default=None Short message. If None, nothing will be printed. Returns ------- context_manager Prints elapsed time upon exit if verbose. """ if message is None: yield else: start = timeit.default_timer() yield print(_message_with_time(source, message, timeit.default_timer() - start)) def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None): """Calculates how many rows can be processed within working_memory. Parameters ---------- row_bytes : int The expected number of bytes of memory that will be consumed during the processing of each row. max_n_rows : int, default=None The maximum return value. working_memory : int or float, default=None The number of rows to fit inside this number of MiB will be returned. When None (default), the value of ``sklearn.get_config()['working_memory']`` is used. Returns ------- int or the value of n_samples Warns ----- Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB. """ if working_memory is None: working_memory = get_config()["working_memory"] chunk_n_rows = int(working_memory * (2 ** 20) // row_bytes) if max_n_rows is not None: chunk_n_rows = min(chunk_n_rows, max_n_rows) if chunk_n_rows < 1: warnings.warn( "Could not adhere to working_memory config. " "Currently %.0fMiB, %.0fMiB required." % (working_memory, np.ceil(row_bytes * 2 ** -20)) ) chunk_n_rows = 1 return chunk_n_rows def is_scalar_nan(x): """Tests if x is NaN. This function is meant to overcome the issue that np.isnan does not allow non-numerical types as input, and that np.nan is not float('nan'). Parameters ---------- x : any type Returns ------- boolean Examples -------- >>> import numpy as np >>> from sklearn.utils import is_scalar_nan >>> is_scalar_nan(np.nan) True >>> is_scalar_nan(float("nan")) True >>> is_scalar_nan(None) False >>> is_scalar_nan("") False >>> is_scalar_nan([np.nan]) False """ return isinstance(x, numbers.Real) and math.isnan(x) def _approximate_mode(class_counts, n_draws, rng): """Computes approximate mode of multivariate hypergeometric. This is an approximation to the mode of the multivariate hypergeometric given by class_counts and n_draws. It shouldn't be off by more than one. It is the mostly likely outcome of drawing n_draws many samples from the population given by class_counts. Parameters ---------- class_counts : ndarray of int Population per class. n_draws : int Number of draws (samples to draw) from the overall population. rng : random state Used to break ties. Returns ------- sampled_classes : ndarray of int Number of samples drawn from each class. np.sum(sampled_classes) == n_draws Examples -------- >>> import numpy as np >>> from sklearn.utils import _approximate_mode >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0) array([2, 1]) >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0) array([3, 1]) >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), ... n_draws=2, rng=0) array([0, 1, 1, 0]) >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]), ... n_draws=2, rng=42) array([1, 1, 0, 0]) """ rng = check_random_state(rng) # this computes a bad approximation to the mode of the # multivariate hypergeometric given by class_counts and n_draws continuous = class_counts / class_counts.sum() * n_draws # floored means we don't overshoot n_samples, but probably undershoot floored = np.floor(continuous) # we add samples according to how much "left over" probability # they had, until we arrive at n_samples need_to_add = int(n_draws - floored.sum()) if need_to_add > 0: remainder = continuous - floored values = np.sort(np.unique(remainder))[::-1] # add according to remainder, but break ties # randomly to avoid biases for value in values: (inds,) = np.where(remainder == value) # if we need_to_add less than what's in inds # we draw randomly from them. # if we need to add more, we add them all and # go to the next value add_now = min(len(inds), need_to_add) inds = rng.choice(inds, size=add_now, replace=False) floored[inds] += 1 need_to_add -= add_now if need_to_add == 0: break return floored.astype(int) def check_matplotlib_support(caller_name): """Raise ImportError with detailed error message if mpl is not installed. Plot utilities like any of the Display's plotting functions should lazily import matplotlib and call this helper before any computation. Parameters ---------- caller_name : str The name of the caller that requires matplotlib. """ try: import matplotlib # noqa except ImportError as e: raise ImportError( "{} requires matplotlib. You can install matplotlib with " "`pip install matplotlib`".format(caller_name) ) from e def check_pandas_support(caller_name): """Raise ImportError with detailed error message if pandas is not installed. Plot utilities like :func:`fetch_openml` should lazily import pandas and call this helper before any computation. Parameters ---------- caller_name : str The name of the caller that requires pandas. Returns ------- pandas The pandas package. """ try: import pandas # noqa return pandas except ImportError as e: raise ImportError("{} requires pandas.".format(caller_name)) from e def all_estimators(type_filter=None): """Get a list of all estimators from sklearn. This function crawls the module and gets all classes that inherit from BaseEstimator. Classes that are defined in test-modules are not included. Parameters ---------- type_filter : {"classifier", "regressor", "cluster", "transformer"} \ or list of such str, default=None Which kind of estimators should be returned. If None, no filter is applied and all estimators are returned. Possible values are 'classifier', 'regressor', 'cluster' and 'transformer' to get estimators only of these specific types, or a list of these to get the estimators that fit at least one of the types. Returns ------- estimators : list of tuples List of (name, class), where ``name`` is the class name as string and ``class`` is the actual type of the class. """ # lazy import to avoid circular imports from sklearn.base from ._testing import ignore_warnings from ..base import ( BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, ClusterMixin, ) def is_abstract(c): if not (hasattr(c, "__abstractmethods__")): return False if not len(c.__abstractmethods__): return False return True all_classes = [] modules_to_ignore = { "tests", "externals", "setup", "conftest", "enable_hist_gradient_boosting", } root = str(Path(__file__).parent.parent) # sklearn package # Ignore deprecation warnings triggered at import time and from walking # packages with ignore_warnings(category=FutureWarning): for importer, modname, ispkg in pkgutil.walk_packages( path=[root], prefix="sklearn." ): mod_parts = modname.split(".") if any(part in modules_to_ignore for part in mod_parts) or "._" in modname: continue module = import_module(modname) classes = inspect.getmembers(module, inspect.isclass) classes = [ (name, est_cls) for name, est_cls in classes if not name.startswith("_") ] # TODO: Remove when FeatureHasher is implemented in PYPY # Skips FeatureHasher for PYPY if IS_PYPY and "feature_extraction" in modname: classes = [ (name, est_cls) for name, est_cls in classes if name == "FeatureHasher" ] all_classes.extend(classes) all_classes = set(all_classes) estimators = [ c for c in all_classes if (issubclass(c[1], BaseEstimator) and c[0] != "BaseEstimator") ] # get rid of abstract base classes estimators = [c for c in estimators if not is_abstract(c[1])] if type_filter is not None: if not isinstance(type_filter, list): type_filter = [type_filter] else: type_filter = list(type_filter) # copy filtered_estimators = [] filters = { "classifier": ClassifierMixin, "regressor": RegressorMixin, "transformer": TransformerMixin, "cluster": ClusterMixin, } for name, mixin in filters.items(): if name in type_filter: type_filter.remove(name) filtered_estimators.extend( [est for est in estimators if issubclass(est[1], mixin)] ) estimators = filtered_estimators if type_filter: raise ValueError( "Parameter type_filter must be 'classifier', " "'regressor', 'transformer', 'cluster' or " "None, got" " %s." % repr(type_filter) ) # drop duplicates, sort for reproducibility # itemgetter is used to ensure the sort does not extend to the 2nd item of # the tuple return sorted(set(estimators), key=itemgetter(0))