a
    ;Zaa8                     @   sn   d dl Z d dlZd dlZddlmZ ddlmZmZ ddl	m
Z
 ddl	mZ ddl	mZ G d	d
 d
eeZdS )    N   )OneHotEncoder   )BaseEstimatorTransformerMixin)check_array)check_is_fitted)_check_feature_names_inc                   @   sP   e Zd ZdZdddddddZdd	d
Zdd Zdd Zdd ZdddZ	dS )KBinsDiscretizera  
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )``
        Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int_
        Number of bins per feature. Bins whose width are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
       onehotquantileN)encodestrategydtypec                C   s   || _ || _|| _|| _d S N)n_binsr   r   r   )selfr   r   r   r    r   Dlib/python3.9/site-packages/sklearn/preprocessing/_discretization.py__init__   s    zKBinsDiscretizer.__init__c                 C   s   | j |dd}tjtjf}| j|v r,| j}n.| jdu r>|j}ntd|d  d| j dd}| j|vrztd	|| jd
}| j|vrtd|| j|j	d }| 
|}tj|td}	t|D ]}
|dd|
f }| |  }}||kr&td|
  d||
< ttj tjg|	|
< q| jdkrPt||||
 d |	|
< n| jdkrtdd||
 d }tt|||	|
< n| jdkrXddlm} t||||
 d }|dd |dd  dddf d }|||
 |ddd}||dddf jdddf }|  |dd |dd  d |	|
< tj||	|
 |f |	|
< | jdv rtj|	|
 tjddk}|	|
 | |	|
< t|	|
 d ||
 krtd|
  t|	|
 d ||
< q|	| _|| _ d| jv rt!dd | j D | jdk|d | _"| j"tdt| j f | S )!a  
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Returns the instance itself.
        Znumericr   NzValid options for 'dtype' are r   z. Got dtype=z
  instead.)r   zonehot-denseordinalz;Valid options for 'encode' are {}. Got encode={!r} instead.)uniformr   kmeansz?Valid options for 'strategy' are {}. Got strategy={!r} instead.r   z3Feature %d is constant and will be replaced with 0.r   r   r   d   r   r   )KMeans      ?full)Z
n_clustersinitZn_init	algorithm)r   r   )Zto_begin:0yE>zqBins whose width are too small (i.e., <= 1e-8) in feature %d are removed. Consider decreasing the number of bins.r   c                 S   s   g | ]}t |qS r   )npZarange.0ir   r   r   
<listcomp>       z(KBinsDiscretizer.fit.<locals>.<listcomp>)Z
categoriesZsparser   )#_validate_datar#   float64float32r   
ValueErrorr   formatr   shape_validate_n_binsZzerosobjectrangeminmaxwarningswarnZarrayinfZlinspaceZasarrayZ
percentileZclusterr   fitZcluster_centers_sortZr_Zediff1dlen
bin_edges_n_bins_r   _encoder)r   XyZsupported_dtypeZoutput_dtypeZvalid_encodeZvalid_strategy
n_featuresr   	bin_edgesjjcolumnZcol_minZcol_maxZ	quantilesr   Zuniform_edgesr    ZkmZcentersmaskr   r   r   r7      s    






($ 
zKBinsDiscretizer.fitc                 C   s   | j }t|tjr`t|tjs6tdtjt	|j|dk rPtdtj|t
j||tdS t|tddd}|jdks|jd	 |krtd
|dk ||kB }t
|d	 }|jd	 d	krddd |D }tdtj||S )z0Returns n_bins_, the number of bins per feature.z>{} received an invalid n_bins type. Received {}, expected int.r   zH{} received an invalid number of bins. Received {}, expected at least 2.r   TF)r   copyZ	ensure_2dr   r   z8n_bins must be a scalar or array of shape (n_features,).z, c                 s   s   | ]}t |V  qd S r   )strr$   r   r   r   	<genexpr>  r(   z4KBinsDiscretizer._validate_n_bins.<locals>.<genexpr>zk{} received an invalid number of bins at indices {}. Number of bins must be at least 2, and must be an int.)r   
isinstancenumbersNumberZIntegralr,   r-   r
   __name__typer#   r   intr   ndimr.   wherejoin)r   r?   Z	orig_binsr   Zbad_nbins_valueZviolating_indicesindicesr   r   r   r/      s8    z!KBinsDiscretizer._validate_n_binsc                 C   s  t |  | jdu rtjtjfn| j}| j|d|dd}| j}t|jd D ]^}d}d}||t	|dd|f   }t
|dd|f | || dd |dd|f< qJtj|d| jd |d	 | jd
kr|S d}	d| jv r| jj}	|j| j_z| j|}
W |	| j_n
|	| j_0 |
S )a  
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        NTF)rD   r   resetr   gh㈵>r"   r   )outr   r   )r   r   r#   r*   r+   r)   r:   r1   r.   absZdigitizeZclipr;   r   r<   	transform)r   r=   r   Xtr@   rA   ZrtolZatolZepsZ
dtype_initZXt_encr   r   r   rT   #  s(    6


zKBinsDiscretizer.transformc                 C   s   t |  d| jv r| j|}t|dtjtjfd}| jj	d }|j	d |krdt
d||j	d t|D ]P}| j| }|dd |dd  d	 }|t|dd|f  |dd|f< ql|S )
a  
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        r   T)rD   r   r   r   z8Incorrect number of features. Expecting {}, received {}.Nr   r   )r   r   r<   inverse_transformr   r#   r*   r+   r;   r.   r,   r-   r1   r:   Zint_)r   rU   ZXinvr?   rA   r@   Zbin_centersr   r   r   rV   R  s     


(z"KBinsDiscretizer.inverse_transformc                 C   s   t | |}| j|S )a  Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then names are generated: `[x0, x1, ..., x(n_features_in_)]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        )r	   r<   get_feature_names_out)r   Zinput_featuresr   r   r   rW   x  s    
z&KBinsDiscretizer.get_feature_names_out)r   )N)N)
rJ   
__module____qualname____doc__r   r7   r/   rT   rV   rW   r   r   r   r   r
      s   o
q'/&r
   )rH   Znumpyr#   r4    r   baser   r   Zutils.validationr   r   r	   r
   r   r   r   r   <module>	   s   