
    h$,fC                         d dl Z d dlmZ d dlZddlmZmZmZ ddl	m
Z
 ddlmZmZmZmZ ddlmZ ddlmZmZmZmZmZ d	d
lmZ  G d dee      Zy)    N)Integral   )BaseEstimatorTransformerMixin_fit_context)_safe_indexing)HiddenIntervalOptions
StrOptions)_weighted_percentile)_check_feature_names_in_check_sample_weightcheck_arraycheck_is_fittedcheck_random_state   )OneHotEncoderc                   0   e Zd ZU dZ eeddd      dg eh d      g eh d      g eee	j                  e	j                  h      dg eed	dd      d e ed
h            gdgdZeed<   	 ddddd
dddZ ed      dd       Zd Zd Zd ZddZy)KBinsDiscretizera  
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X` and
        that sorting has an `n log(n)` time complexity,
        it is recommended to use subsampling on datasets with a
        very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
        Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----

    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
    r   Nleft)closedz
array-like>   onehot-denseonehotordinal>   kmeansuniformquantiler   warnrandom_staten_binsencodestrategydtype	subsampler    _parameter_constraintsr   r   )r#   r$   r%   r&   r    c                X    || _         || _        || _        || _        || _        || _        y Nr!   )selfr"   r#   r$   r%   r&   r    s          Elib/python3.12/site-packages/sklearn/preprocessing/_discretization.py__init__zKBinsDiscretizer.__init__   s/      
"(    T)prefer_skip_nested_validationc                 	   | j                  |d      }| j                  t        j                  t        j                  fv r| j                  }n|j                  }|j
                  \  }}|(| j                  dk(  rt        d| j                  d      | j                  dv r)| j                  dk(  rt        j                  d	t               | j                  }|dk(  r| j                  d
k(  rdnd}|:||kD  r5t        | j                        }|j                  ||d      }	t        ||	      }|j
                  d   }| j!                  |      }
|t#        |||j                        }t        j$                  |t&              }t)        |      D ]  }|dd|f   }|j+                         |j-                         }}||k(  rUt        j                  d|z         d|
|<   t        j.                  t        j0                   t        j0                  g      ||<   | j                  dk(  r"t        j2                  |||
|   dz         ||<   nZ| j                  d
k(  rt        j2                  dd|
|   dz         }|-t        j4                  t        j6                  ||            ||<   nt        j4                  |D cg c]  }t9        |||       c}t        j                        ||<   n| j                  dk(  rddlm} t        j2                  |||
|   dz         }|dd |dd z   dddf   dz  } ||
|   |d      }|j?                  |dddf   |      j@                  dddf   }|jC                          |dd |dd z   dz  ||<   t        jD                  |||   |f   ||<   | j                  dv s"t        jF                  ||   t        j0                        dkD  }||   |   ||<   tI        ||         dz
  |
|   k7  sqt        j                  d|z         tI        ||         dz
  |
|<    || _%        |
| _&        d| jN                  v rtQ        | jL                  D cg c]  }t        jR                  |       c}| jN                  dk(  |      | _*        | jT                  j?                  t        j$                  dtI        | jL                        f             | S c c}w c c}w )as  
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.
            Only possible when `strategy` is set to `"quantile"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
        numericr%   Nr   zY`sample_weight` was provided but it cannot be used with strategy='uniform'. Got strategy=z	 instead.)r   r   r   zIn version 1.5 onwards, subsample=200_000 will be used by default. Set subsample explicitly to silence this warning in the mean time. Set subsample=None to disable subsampling explicitly.r   i@ F)sizereplacer   z3Feature %d is constant and will be replaced with 0.r   d   r   r   )KMeans      ?)
n_clustersinitn_init)sample_weight)r   r   )to_beging:0yE>zqBins whose width are too small (i.e., <= 1e-8) in feature %d are removed. Consider decreasing the number of bins.r   )
categoriessparse_outputr%   )+_validate_datar%   npfloat64float32shaper$   
ValueErrorr&   warningsr   FutureWarningr   r    choicer   _validate_n_binsr   zerosobjectrangeminmaxarrayinflinspaceasarray
percentiler   clusterr5   fitcluster_centers_sortr_ediff1dlen
bin_edges_n_bins_r#   r   arange_encoder)r*   Xyr;   output_dtype	n_samples
n_featuresr&   rngsubsample_idxr"   	bin_edgesjjcolumncol_mincol_max	quantilesqr5   uniform_edgesr9   kmcentersmaskis                            r+   rT   zKBinsDiscretizer.fit   s   2 3::"**bjj11::L77L !	:$))C>==#9.  ==11dnn6NMMH
  NN	"&--:"=4I Y%:$T%6%67CJJyy%JPMq-0AWWQZ
&&z2$0QMHHZv6	
# 6	8Bq"uXF%zz|VZZ\WG'!IBN r
 "266'266): ;	"}}	) "GWfRj1n M	"*,KK3r
Q?	 ($&JJr}}VY/O$PIbM$&JJ &/ ! 1J !jj%IbM (*, !#GWfRj1n M%ab)M#2,>>4H3N vbzQG&&1d7O= ! ""1a4) !(ws|!;s B	" "gy}g&E F	" }} 66zz)B-"&&ADH )"d 3	"y}%)VBZ7MM9;=>
 "%Yr]!3a!7F2Jm6	8p $t{{")26,,?QBIIaL?"kkX5"DM MMbhh3t||+<'=>?aP @s   /S
Sc                    | j                   }t        |t              rt        j                  ||t
              S t        |t
        dd      }|j                  dkD  s|j                  d   |k7  rt        d      |dk  ||k7  z  }t        j                  |      d   }|j                  d   dkD  rAd	j                  d
 |D              }t        dj                  t        j                  |            |S )z0Returns n_bins_, the number of bins per feature.r1   TF)r%   copy	ensure_2dr   r   z8n_bins must be a scalar or array of shape (n_features,).r   z, c              3   2   K   | ]  }t        |        y wr)   )str).0rp   s     r+   	<genexpr>z4KBinsDiscretizer._validate_n_bins.<locals>.<genexpr>g  s     B1ABs   zk{} received an invalid number of bins at indices {}. Number of bins must be at least 2, and must be an int.)r"   
isinstancer   r@   fullintr   ndimrC   rD   wherejoinformatr   __name__)r*   rb   	orig_binsr"   bad_nbins_valueviolating_indicesindicess          r+   rH   z!KBinsDiscretizer._validate_n_binsX  s    KK	i*77:y<<YcN;;?fll1o;WXX!A:&I*=>HH_5a8""1%)iiB0ABBG::@&$--w;  r-   c                    t        |        | j                   t        j                  t        j                  fn| j                  }| j                  |d|d      }| j                  }t        |j                  d         D ].  }t        j                  ||   dd |dd|f   d      |dd|f<   0 | j                  d	k(  r|S d}d
| j                  v r1| j                  j                  }|j                  | j                  _        	 | j                  j                  |      }|| j                  _        |S # || j                  _        w xY w)a  
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        NTF)rr   r%   resetr   r6   right)sider   r   )r   r%   r@   rA   rB   r?   rZ   rK   rC   searchsortedr#   r]   	transform)r*   r^   r%   Xtre   rf   
dtype_initXt_encs           r+   r   zKBinsDiscretizer.transformq  s    	 -1JJ,>RZZ(DJJ  U% HOO	$ 	VB	"a(;R2YWUBq"uI	V ;;)#I
t{{",,J"$((DMM	-]],,R0F #-DMM #-DMMs   D/ /Ec                 &   t        |        d| j                  v r| j                  j                  |      }t	        |dt
        j                  t
        j                  f      }| j                  j                  d   }|j                  d   |k7  r(t        dj                  ||j                  d               t        |      D ]O  }| j                  |   }|dd |dd z   d	z  }||dd|f   j                  t
        j                           |dd|f<   Q |S )
a  
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        r   T)rr   r%   r   r   z8Incorrect number of features. Expecting {}, received {}.Nr6   r7   )r   r#   r]   inverse_transformr   r@   rA   rB   r[   rC   rD   r~   rK   rZ   astypeint64)r*   r   Xinvrb   rf   re   bin_centerss          r+   r   z"KBinsDiscretizer.inverse_transform  s   " 	t{{"004B2DRZZ0HI\\''*
::a=J&JQQ

1  
# 	FB+I$QR=9Sb>9S@K%tArE{&:&:288&DEDBK	F
 r-   c                     t        | d       t        | |      }t        | d      r| j                  j	                  |      S |S )a  Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        n_features_in_r]   )r   r   hasattrr]   get_feature_names_out)r*   input_featuress     r+   r   z&KBinsDiscretizer.get_feature_names_out  sB    ( 	./0~F4$==66~FF r-   )   )NNr)   )r   
__module____qualname____doc__r
   r   r   r   typer@   rA   rB   r	   r'   dict__annotations__r,   r   rT   rH   r   r   r    r-   r+   r   r      s    Sl Haf=|LCDE ABC$RZZ 894@Xq$v6:vh'(

 (($D  ) )" 5G 6GR2%N$Lr-   r   )rE   numbersr   numpyr@   baser   r   r   utilsr   utils._param_validationr	   r
   r   r   utils.statsr   utils.validationr   r   r   r   r   	_encodersr   r   r   r-   r+   <module>r      sB       @ @ " K K .  %~' ~r-   