from __future__ import annotations import itertools from typing import TYPE_CHECKING import warnings import numpy as np import pandas._libs.reshape as libreshape from pandas._libs.sparse import IntIndex from pandas._typing import ( Dtype, npt, ) from pandas.errors import PerformanceWarning from pandas.util._decorators import cache_readonly from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( ensure_platform_int, is_1d_only_ea_dtype, is_extension_array_dtype, is_integer, is_integer_dtype, is_list_like, is_object_dtype, needs_i8_conversion, ) from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, MultiIndex, ) from pandas.core.indexes.frozen import FrozenList from pandas.core.series import Series from pandas.core.sorting import ( compress_group_index, decons_obs_group_ids, get_compressed_ids, get_group_index, get_group_index_sorter, ) if TYPE_CHECKING: from pandas.core.arrays import ExtensionArray class _Unstacker: """ Helper class to unstack data / pivot with multi-level index Parameters ---------- index : MultiIndex level : int or str, default last level Level to "unstack". Accepts a name for the level. fill_value : scalar, optional Default value to fill in missing values if subgroups do not have the same set of labels. By default, missing values will be replaced with the default fill value for that data type, NaN for float, NaT for datetimelike, etc. For integer types, by default data will converted to float and missing values will be set to NaN. constructor : object Pandas ``DataFrame`` or subclass used to create unstacked response. If None, DataFrame will be used. Examples -------- >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ... ('two', 'a'), ('two', 'b')]) >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index) >>> s one a 1 b 2 two a 3 b 4 dtype: int64 >>> s.unstack(level=-1) a b one 1 2 two 3 4 >>> s.unstack(level=0) one two a 1 3 b 2 4 Returns ------- unstacked : DataFrame """ def __init__(self, index: MultiIndex, level=-1, constructor=None): if constructor is None: constructor = DataFrame self.constructor = constructor self.index = index.remove_unused_levels() self.level = self.index._get_level_number(level) # when index includes `nan`, need to lift levels/strides by 1 self.lift = 1 if -1 in self.index.codes[self.level] else 0 # Note: the "pop" below alters these in-place. self.new_index_levels = list(self.index.levels) self.new_index_names = list(self.index.names) self.removed_name = self.new_index_names.pop(self.level) self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] # Bug fix GH 20601 # If the data frame is too big, the number of unique index combination # will cause int32 overflow on windows environments. # We want to check and raise an error before this happens num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size # GH20601: This forces an overflow if the number of cells is too high. num_cells = num_rows * num_columns # GH 26314: Previous ValueError raised was too restrictive for many users. if num_cells > np.iinfo(np.int32).max: warnings.warn( f"The following operation may generate {num_cells} cells " f"in the resulting pandas object.", PerformanceWarning, ) self._make_selectors() @cache_readonly def _indexer_and_to_sort( self, ) -> tuple[ npt.NDArray[np.intp], list[np.ndarray], # each has _some_ signed integer dtype ]: v = self.level codes = list(self.index.codes) levs = list(self.index.levels) to_sort = codes[:v] + codes[v + 1 :] + [codes[v]] sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]]) comp_index, obs_ids = get_compressed_ids(to_sort, sizes) ngroups = len(obs_ids) indexer = get_group_index_sorter(comp_index, ngroups) return indexer, to_sort @cache_readonly def sorted_labels(self): indexer, to_sort = self._indexer_and_to_sort return [line.take(indexer) for line in to_sort] def _make_sorted_values(self, values: np.ndarray) -> np.ndarray: indexer, _ = self._indexer_and_to_sort sorted_values = algos.take_nd(values, indexer, axis=0) return sorted_values def _make_selectors(self): new_levels = self.new_index_levels # make the mask remaining_labels = self.sorted_labels[:-1] level_sizes = tuple(len(x) for x in new_levels) comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes) ngroups = len(obs_ids) comp_index = ensure_platform_int(comp_index) stride = self.index.levshape[self.level] + self.lift self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) if mask.sum() < len(self.index): raise ValueError("Index contains duplicate entries, cannot reshape") self.group_index = comp_index self.mask = mask self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups)) @cache_readonly def mask_all(self) -> bool: return bool(self.mask.all()) @cache_readonly def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]: # We cache this for re-use in ExtensionBlock._unstack dummy_arr = np.arange(len(self.index), dtype=np.intp) new_values, mask = self.get_new_values(dummy_arr, fill_value=-1) return new_values, mask.any(0) # TODO: in all tests we have mask.any(0).all(); can we rely on that? def get_result(self, values, value_columns, fill_value): if values.ndim == 1: values = values[:, np.newaxis] if value_columns is None and values.shape[1] != 1: # pragma: no cover raise ValueError("must pass column labels for multi-column data") values, _ = self.get_new_values(values, fill_value) columns = self.get_new_columns(value_columns) index = self.new_index return self.constructor( values, index=index, columns=columns, dtype=values.dtype ) def get_new_values(self, values, fill_value=None): if values.ndim == 1: values = values[:, np.newaxis] sorted_values = self._make_sorted_values(values) # place the values length, width = self.full_shape stride = values.shape[1] result_width = width * stride result_shape = (length, result_width) mask = self.mask mask_all = self.mask_all # we can simply reshape if we don't have a mask if mask_all and len(values): # TODO: Under what circumstances can we rely on sorted_values # matching values? When that holds, we can slice instead # of take (in particular for EAs) new_values = ( sorted_values.reshape(length, width, stride) .swapaxes(1, 2) .reshape(result_shape) ) new_mask = np.ones(result_shape, dtype=bool) return new_values, new_mask dtype = values.dtype # if our mask is all True, then we can use our existing dtype if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) else: if isinstance(dtype, ExtensionDtype): # GH#41875 cls = dtype.construct_array_type() new_values = cls._empty(result_shape, dtype=dtype) new_values[:] = fill_value else: dtype, fill_value = maybe_promote(dtype, fill_value) new_values = np.empty(result_shape, dtype=dtype) new_values.fill(fill_value) name = dtype.name new_mask = np.zeros(result_shape, dtype=bool) # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. ints -> floats if needs_i8_conversion(values.dtype): sorted_values = sorted_values.view("i8") new_values = new_values.view("i8") else: sorted_values = sorted_values.astype(name, copy=False) # fill in our values & mask libreshape.unstack( sorted_values, mask.view("u1"), stride, length, width, new_values, new_mask.view("u1"), ) # reconstruct dtype if needed if needs_i8_conversion(values.dtype): # view as datetime64 so we can wrap in DatetimeArray and use # DTA's view method new_values = new_values.view("M8[ns]") new_values = ensure_wrapped_if_datetimelike(new_values) new_values = new_values.view(values.dtype) return new_values, new_mask def get_new_columns(self, value_columns: Index | None): if value_columns is None: if self.lift == 0: return self.removed_level._rename(name=self.removed_name) lev = self.removed_level.insert(0, item=self.removed_level._na_value) return lev.rename(self.removed_name) stride = len(self.removed_level) + self.lift width = len(value_columns) propagator = np.repeat(np.arange(width), stride) new_levels: FrozenList | list[Index] if isinstance(value_columns, MultiIndex): new_levels = value_columns.levels + (self.removed_level_full,) new_names = value_columns.names + (self.removed_name,) new_codes = [lab.take(propagator) for lab in value_columns.codes] else: new_levels = [ value_columns, self.removed_level_full, ] new_names = [value_columns.name, self.removed_name] new_codes = [propagator] repeater = self._repeater # The entire level is then just a repetition of the single chunk: new_codes.append(np.tile(repeater, width)) return MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) @cache_readonly def _repeater(self) -> np.ndarray: # The two indices differ only if the unstacked level had unused items: if len(self.removed_level_full) != len(self.removed_level): # In this case, we remap the new codes to the original level: repeater = self.removed_level_full.get_indexer(self.removed_level) if self.lift: repeater = np.insert(repeater, 0, -1) else: # Otherwise, we just use each level item exactly once: stride = len(self.removed_level) + self.lift repeater = np.arange(stride) - self.lift return repeater @cache_readonly def new_index(self): # Does not depend on values or value_columns result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]] # construct the new index if len(self.new_index_levels) == 1: level, level_codes = self.new_index_levels[0], result_codes[0] if (level_codes == -1).any(): level = level.insert(len(level), level._na_value) return level.take(level_codes).rename(self.new_index_names[0]) return MultiIndex( levels=self.new_index_levels, codes=result_codes, names=self.new_index_names, verify_integrity=False, ) def _unstack_multiple(data, clocs, fill_value=None): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index # GH 19966 Make sure if MultiIndexed index has tuple name, they will be # recognised as a whole if clocs in index.names: clocs = [clocs] clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] ccodes = [index.codes[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rcodes = [index.codes[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = tuple(len(x) for x in clevels) group_index = get_group_index(ccodes, shape, sort=False, xnull=False) comp_ids, obs_ids = compress_group_index(group_index, sort=False) recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False) if not rlocs: # Everything is in clocs, so the dummy df has a regular index dummy_index = Index(obs_ids, name="__placeholder__") else: dummy_index = MultiIndex( levels=rlevels + [obs_ids], codes=rcodes + [comp_ids], names=rnames + ["__placeholder__"], verify_integrity=False, ) if isinstance(data, Series): dummy = data.copy() dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) new_levels = clevels new_names = cnames new_codes = recons_codes else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val, fill_value=fill_value) clocs = [v if v < val else v - 1 for v in clocs] return result # GH#42579 deep=False to avoid consolidating dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns assert isinstance(unstcols, MultiIndex) # for mypy new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_codes = [unstcols.codes[0]] for rec in recons_codes: new_codes.append(rec.take(unstcols.codes[-1])) new_columns = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked def unstack(obj, level, fill_value=None): if isinstance(level, (tuple, list)): if len(level) != 1: # _unstack_multiple only handles MultiIndexes, # and isn't needed for a single level return _unstack_multiple(obj, level, fill_value=fill_value) else: level = level[0] # Prioritize integer interpretation (GH #21677): if not is_integer(level) and not level == "__placeholder__": level = obj.index._get_level_number(level) if isinstance(obj, DataFrame): if isinstance(obj.index, MultiIndex): return _unstack_frame(obj, level, fill_value=fill_value) else: return obj.T.stack(dropna=False) elif not isinstance(obj.index, MultiIndex): # GH 36113 # Give nicer error messages when unstack a Series whose # Index is not a MultiIndex. raise ValueError( f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" ) else: if is_1d_only_ea_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim ) return unstacker.get_result( obj._values, value_columns=None, fill_value=fill_value ) def _unstack_frame(obj, level, fill_value=None): if not obj._can_fast_transpose: unstacker = _Unstacker(obj.index, level=level) mgr = obj._mgr.unstack(unstacker, fill_value=fill_value) return obj._constructor(mgr) else: unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor) return unstacker.get_result( obj._values, value_columns=obj.columns, fill_value=fill_value ) def _unstack_extension_series(series, level, fill_value): """ Unstack an ExtensionArray-backed Series. The ExtensionDtype is preserved. Parameters ---------- series : Series A Series with an ExtensionArray for values level : Any The level name or number. fill_value : Any The user-level (not physical storage) fill value to use for missing values introduced by the reshape. Passed to ``series.values.take``. Returns ------- DataFrame Each column of the DataFrame will have the same dtype as the input Series. """ # Defer to the logic in ExtensionBlock._unstack df = series.to_frame() result = df.unstack(level=level, fill_value=fill_value) # equiv: result.droplevel(level=0, axis=1) # but this avoids an extra copy result.columns = result.columns.droplevel(0) return result def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the second level of the resulting hierarchical index Returns ------- stacked : Series """ def factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = factorize_from_iterable(index) return categories, codes N, K = frame.shape # Will also convert negative level numbers and check if out of bounds. level_num = frame.columns._get_level_number(level) if isinstance(frame.columns, MultiIndex): return _stack_multi_columns(frame, level_num=level_num, dropna=dropna) elif isinstance(frame.index, MultiIndex): new_levels = list(frame.index.levels) new_codes = [lab.repeat(K) for lab in frame.index.codes] clev, clab = factorize(frame.columns) new_levels.append(clev) new_codes.append(np.tile(clab, N).ravel()) new_names = list(frame.index.names) new_names.append(frame.columns.name) new_index = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) else: levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns))) codes = ilab.repeat(K), np.tile(clab, N).ravel() new_index = MultiIndex( levels=levels, codes=codes, names=[frame.index.name, frame.columns.name], verify_integrity=False, ) if not frame.empty and frame._is_homogeneous_type: # For homogeneous EAs, frame._values will coerce to object. So # we concatenate instead. dtypes = list(frame.dtypes._values) dtype = dtypes[0] if is_extension_array_dtype(dtype): arr = dtype.construct_array_type() new_values = arr._concat_same_type( [col._values for _, col in frame.items()] ) new_values = _reorder_for_extension_array_stack(new_values, N, K) else: # homogeneous, non-EA new_values = frame._values.ravel() else: # non-homogeneous new_values = frame._values.ravel() if dropna: mask = notna(new_values) new_values = new_values[mask] new_index = new_index[mask] return frame._constructor_sliced(new_values, index=new_index) def stack_multiple(frame, level, dropna=True): # If all passed levels match up to column names, no # ambiguity about what to do if all(lev in frame.columns.names for lev in level): result = frame for lev in level: result = stack(result, lev, dropna=dropna) # Otherwise, level numbers may change as each successive level is stacked elif all(isinstance(lev, int) for lev in level): # As each stack is done, the level numbers decrease, so we need # to account for that when level is a sequence of ints result = frame # _get_level_number() checks level numbers are in range and converts # negative numbers to positive level = [frame.columns._get_level_number(lev) for lev in level] # Can't iterate directly through level as we might need to change # values as we go for index in range(len(level)): lev = level[index] result = stack(result, lev, dropna=dropna) # Decrement all level numbers greater than current, as these # have now shifted down by one updated_level = [] for other in level: if other > lev: updated_level.append(other - 1) else: updated_level.append(other) level = updated_level else: raise ValueError( "level should contain all level names or all level " "numbers, not a mixture of the two." ) return result def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex: """Creates a MultiIndex from the first N-1 levels of this MultiIndex.""" if len(columns.levels) <= 2: return columns.levels[0]._rename(name=columns.names[0]) levs = [ [lev[c] if c >= 0 else None for c in codes] for lev, codes in zip(columns.levels[:-1], columns.codes[:-1]) ] # Remove duplicate tuples in the MultiIndex. tuples = zip(*levs) unique_tuples = (key for key, _ in itertools.groupby(tuples)) new_levs = zip(*unique_tuples) # The dtype of each level must be explicitly set to avoid inferring the wrong type. # See GH-36991. return MultiIndex.from_arrays( [ # Not all indices can accept None values. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev for new_lev, lev in zip(new_levs, columns.levels) ], names=columns.names[:-1], ) def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num: int, columns): """ Logic for converting the level number to something we can safely pass to swaplevel. If `level_num` matches a column name return the name from position `level_num`, otherwise return `level_num`. """ if level_num in columns.names: return columns.names[level_num] return level_num this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns._is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) new_columns = _stack_multi_column_index(this.columns) # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) level_vals_nan = level_vals.insert(len(level_vals), None) level_vals_used = np.take(level_vals_nan, level_codes) levsize = len(level_codes) drop_cols = [] for key in new_columns: try: loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals_nan.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( frame.dtypes.iloc[0] ): # TODO(EA2D): won't need special case, can go through .values # paths below (might change to ._values) dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.items()] ) N, K = subset.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) elif frame._is_mixed_type: value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] if value_slice.ndim > 1: # i.e. not extension value_slice = value_slice.ravel() new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: old_codes, old_levels = factorize_from_iterable(this.index) new_levels = [old_levels] new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex( levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False ) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how="all") return result def get_dummies( data, prefix=None, prefix_sep="_", dummy_na: bool = False, columns=None, sparse: bool = False, drop_first: bool = False, dtype: Dtype | None = None, ) -> DataFrame: """ Convert categorical variable into dummy/indicator variables. Parameters ---------- data : array-like, Series, or DataFrame Data of which to get dummy indicators. prefix : str, list of str, or dict of str, default None String to append DataFrame column names. Pass a list with length equal to the number of columns when calling get_dummies on a DataFrame. Alternatively, `prefix` can be a dictionary mapping column names to prefixes. prefix_sep : str, default '_' If appending prefix, separator/delimiter to use. Or pass a list or dictionary as with `prefix`. dummy_na : bool, default False Add a column to indicate NaNs, if False NaNs are ignored. columns : list-like, default None Column names in the DataFrame to be encoded. If `columns` is None then all the columns with `object` or `category` dtype will be converted. sparse : bool, default False Whether the dummy-encoded columns should be backed by a :class:`SparseArray` (True) or a regular NumPy array (False). drop_first : bool, default False Whether to get k-1 dummies out of k categorical levels by removing the first level. dtype : dtype, default np.uint8 Data type for new columns. Only a single dtype is allowed. Returns ------- DataFrame Dummy-coded data. See Also -------- Series.str.get_dummies : Convert Series to dummy codes. Notes ----- Reference :ref:`the user guide ` for more examples. Examples -------- >>> s = pd.Series(list('abca')) >>> pd.get_dummies(s) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 >>> s1 = ['a', 'b', np.nan] >>> pd.get_dummies(s1) a b 0 1 0 1 0 1 2 0 0 >>> pd.get_dummies(s1, dummy_na=True) a b NaN 0 1 0 0 1 0 1 0 2 0 0 1 >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'], ... 'C': [1, 2, 3]}) >>> pd.get_dummies(df, prefix=['col1', 'col2']) C col1_a col1_b col2_a col2_b col2_c 0 1 1 0 0 1 0 1 2 0 1 1 0 0 2 3 1 0 0 0 1 >>> pd.get_dummies(pd.Series(list('abcaa'))) a b c 0 1 0 0 1 0 1 0 2 0 0 1 3 1 0 0 4 1 0 0 >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True) b c 0 0 0 1 1 0 2 0 1 3 0 0 4 0 0 >>> pd.get_dummies(pd.Series(list('abc')), dtype=float) a b c 0 1.0 0.0 0.0 1 0.0 1.0 0.0 2 0.0 0.0 1.0 """ from pandas.core.reshape.concat import concat dtypes_to_encode = ["object", "category"] if isinstance(data, DataFrame): # determine columns being encoded if columns is None: data_to_encode = data.select_dtypes(include=dtypes_to_encode) elif not is_list_like(columns): raise TypeError("Input must be a list-like for parameter `columns`") else: data_to_encode = data[columns] # validate prefixes and separator to avoid silently dropping cols def check_len(item, name): if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = ( f"Length of '{name}' ({len(item)}) did not match the " "length of the columns being encoded " f"({data_to_encode.shape[1]})." ) raise ValueError(len_msg) check_len(prefix, "prefix") check_len(prefix_sep, "prefix_sep") if isinstance(prefix, str): prefix = itertools.cycle([prefix]) if isinstance(prefix, dict): prefix = [prefix[col] for col in data_to_encode.columns] if prefix is None: prefix = data_to_encode.columns # validate separators if isinstance(prefix_sep, str): prefix_sep = itertools.cycle([prefix_sep]) elif isinstance(prefix_sep, dict): prefix_sep = [prefix_sep[col] for col in data_to_encode.columns] with_dummies: list[DataFrame] if data_to_encode.shape == data.shape: # Encoding the entire df, do not prepend any dropped columns with_dummies = [] elif columns is not None: # Encoding only cols specified in columns. Get all cols not in # columns to prepend to result. with_dummies = [data.drop(columns, axis=1)] else: # Encoding only object and category dtype columns. Get remaining # columns to prepend to result. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)] for (col, pre, sep) in zip(data_to_encode.items(), prefix, prefix_sep): # col is (column_name, column), use just column data here dummy = _get_dummies_1d( col[1], prefix=pre, prefix_sep=sep, dummy_na=dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype, ) with_dummies.append(dummy) result = concat(with_dummies, axis=1) else: result = _get_dummies_1d( data, prefix, prefix_sep, dummy_na, sparse=sparse, drop_first=drop_first, dtype=dtype, ) return result def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na: bool = False, sparse: bool = False, drop_first: bool = False, dtype: Dtype | None = None, ) -> DataFrame: from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = factorize_from_iterable(Series(data)) if dtype is None: dtype = np.dtype(np.uint8) # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str, # dtype[Any], Type[object]]"; expected "Type[Any]" dtype = np.dtype(dtype) # type: ignore[arg-type] if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data) -> DataFrame: index: Index | np.ndarray if isinstance(data, Series): index = data.index else: index = Index(range(len(data))) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = levels.insert(len(levels), np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels]) index: Index | None if isinstance(data, Series): index = data.index else: index = None if sparse: fill_value: bool | float | int if is_integer_dtype(dtype): fill_value = 0 elif dtype == np.dtype(bool): fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices: list[list] = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) return concat(sparse_series, axis=1, copy=False) else: # take on axis=1 + transpose to ensure ndarray layout is column-major dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols) def _reorder_for_extension_array_stack( arr: ExtensionArray, n_rows: int, n_columns: int ) -> ExtensionArray: """ Re-orders the values when stacking multiple extension-arrays. The indirect stacking method used for EAs requires a followup take to get the order correct. Parameters ---------- arr : ExtensionArray n_rows, n_columns : int The number of rows and columns in the original DataFrame. Returns ------- taken : ExtensionArray The original `arr` with elements re-ordered appropriately Examples -------- >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f']) >>> _reorder_for_extension_array_stack(arr, 2, 3) array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='>> _reorder_for_extension_array_stack(arr, 3, 2) array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='