#===============================================================================
# Copyright 2014-2021 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

import numpy as np
import numbers
from scipy import sparse as sp

from sklearn.utils import check_random_state, check_array
from sklearn.utils.sparsefuncs import mean_variance_axis
from sklearn.utils.validation import (
    check_is_fitted,
    _num_samples,
    _deprecate_positional_args)
from sklearn.cluster._kmeans import _labels_inertia
from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils.extmath import row_norms
import warnings

from sklearn.cluster import KMeans as KMeans_original

import daal4py

from .._utils import (
    getFPType,
    get_patch_message,
    sklearn_check_version,
    PatchingConditionsChain)
from .._device_offload import support_usm_ndarray
import logging


def _validate_center_shape(X, n_centers, centers):
    """Check that centers is compatible with X and n_centers."""
    if centers.shape[0] != n_centers:
        raise ValueError(
            f"The shape of the initial centers {centers.shape} does not "
            f"match the number of clusters {n_centers}.")
    if centers.shape[1] != X.shape[1]:
        raise ValueError(
            f"The shape of the initial centers {centers.shape} does not "
            f"match the number of features of the data {X.shape[1]}.")


def _tolerance(X, rtol):
    """Compute the absolute tolerance from the relative tolerance."""
    if rtol == 0.0:
        return rtol
    if sp.issparse(X):
        variances = mean_variance_axis(X, axis=0)[1]
        mean_var = np.mean(variances)
    else:
        mean_var = np.var(X, axis=0).mean()
    return mean_var * rtol
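# Worked example for _tolerance (illustrative comment only, not executed):
# for a dense X = [[0., 0.], [2., 4.], [1., 2.]] the per-feature variances
# are [2/3, 8/3], their mean is 5/3, so _tolerance(X, 1e-4) returns
# 5/3 * 1e-4 ~= 1.67e-4. This absolute value is what is later passed to
# daal4py as accuracyThreshold.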
def _daal4py_compute_starting_centroids(
        X, X_fptype, nClusters, cluster_centers_0, verbose, random_state):

    def is_string(s, target_str):
        return isinstance(s, str) and s == target_str

    is_sparse = sp.isspmatrix(X)

    deterministic = False
    if is_string(cluster_centers_0, 'k-means++'):
        _seed = random_state.randint(np.iinfo('i').max)
        plus_plus_method = "plusPlusCSR" if is_sparse else "plusPlusDense"
        daal_engine = daal4py.engines_mt19937(
            fptype=X_fptype, method="defaultDense", seed=_seed)
        # same heuristic as stock k-means++: 2 + log(n_clusters) local trials
        _n_local_trials = 2 + int(np.log(nClusters))
        kmeans_init = daal4py.kmeans_init(
            nClusters,
            fptype=X_fptype,
            nTrials=_n_local_trials,
            method=plus_plus_method,
            engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif is_string(cluster_centers_0, 'random'):
        _seed = random_state.randint(np.iinfo('i').max)
        random_method = "randomCSR" if is_sparse else "randomDense"
        daal_engine = daal4py.engines_mt19937(
            seed=_seed, fptype=X_fptype, method="defaultDense")
        kmeans_init = daal4py.kmeans_init(
            nClusters,
            fptype=X_fptype,
            method=random_method,
            engine=daal_engine)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    elif hasattr(cluster_centers_0, '__array__'):
        deterministic = True
        cc_arr = np.ascontiguousarray(cluster_centers_0, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif callable(cluster_centers_0):
        cc_arr = cluster_centers_0(X, nClusters, random_state)
        cc_arr = np.ascontiguousarray(cc_arr, dtype=X.dtype)
        _validate_center_shape(X, nClusters, cc_arr)
        centroids_ = cc_arr
    elif is_string(cluster_centers_0, 'deterministic'):
        deterministic = True
        default_method = "lloydCSR" if is_sparse else "defaultDense"
        kmeans_init = daal4py.kmeans_init(
            nClusters, fptype=X_fptype, method=default_method)
        kmeans_init_res = kmeans_init.compute(X)
        centroids_ = kmeans_init_res.centroids
    else:
        raise ValueError(
            f"init should be either 'k-means++', 'random', an ndarray or a "
            f"callable, got '{cluster_centers_0}' instead.")
    if verbose:
        print("Initialization complete")
    return deterministic, centroids_


def _daal4py_kmeans_compatibility(nClusters, maxIterations, fptype="double",
                                  method="lloydDense", accuracyThreshold=0.0,
                                  resultsToEvaluate="computeCentroids"):
    kmeans_algo = daal4py.kmeans(
        nClusters=nClusters,
        maxIterations=maxIterations,
        fptype=fptype,
        resultsToEvaluate=resultsToEvaluate,
        accuracyThreshold=accuracyThreshold,
        method=method)
    return kmeans_algo


def _daal4py_k_means_predict(X, nClusters, centroids,
                             resultsToEvaluate='computeAssignments'):
    X_fptype = getFPType(X)
    is_sparse = sp.isspmatrix(X)
    method = "lloydCSR" if is_sparse else "defaultDense"
    kmeans_algo = _daal4py_kmeans_compatibility(
        nClusters=nClusters,
        maxIterations=0,
        fptype=X_fptype,
        resultsToEvaluate=resultsToEvaluate,
        method=method)

    res = kmeans_algo.compute(X, centroids)

    return res.assignments[:, 0], res.objectiveFunction[0, 0]


def _daal4py_k_means_fit(X, nClusters, numIterations, tol,
                         cluster_centers_0, n_init, verbose, random_state):
    if numIterations < 0:
        raise ValueError(
            f"Wrong number of iterations: expected a non-negative value, "
            f"got {numIterations}.")

    X_fptype = getFPType(X)
    abs_tol = _tolerance(X, tol)  # tol is a relative tolerance
    is_sparse = sp.isspmatrix(X)
    method = "lloydCSR" if is_sparse else "defaultDense"
    best_inertia, best_cluster_centers = None, None
    best_n_iter = -1

    kmeans_algo = _daal4py_kmeans_compatibility(
        nClusters=nClusters,
        maxIterations=numIterations,
        accuracyThreshold=abs_tol,
        fptype=X_fptype,
        resultsToEvaluate='computeCentroids',
        method=method)

    for k in range(n_init):
        deterministic, starting_centroids_ = _daal4py_compute_starting_centroids(
            X, X_fptype, nClusters, cluster_centers_0, verbose, random_state)

        res = kmeans_algo.compute(X, starting_centroids_)

        inertia = res.objectiveFunction[0, 0]
        if verbose:
            print(f"Initialization run {k}, inertia {inertia}.")

        if best_inertia is None or inertia < best_inertia:
            best_cluster_centers = res.centroids
            if n_init > 1:
                best_cluster_centers = best_cluster_centers.copy()
            best_inertia = inertia
            best_n_iter = int(res.nIterations[0, 0])
        if deterministic and n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in k-means instead of n_init=%d'
                % n_init, RuntimeWarning, stacklevel=2)
            break

    flag_compute = 'computeAssignments|computeExactObjectiveFunction'
    best_labels, best_inertia = _daal4py_k_means_predict(
        X, nClusters, best_cluster_centers, flag_compute)

    distinct_clusters = np.unique(best_labels).size
    if distinct_clusters < nClusters:
        warnings.warn(
            "Number of distinct clusters ({}) found smaller than "
            "n_clusters ({}). Possibly due to duplicate points "
            "in X.".format(distinct_clusters, nClusters),
            ConvergenceWarning, stacklevel=2)
        # Matches stock scikit-learn behavior; covered by the test case
        # "test_kmeans_warns_less_centers_than_unique_points".

    return best_cluster_centers, best_labels, best_inertia, best_n_iter
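# Minimal usage sketch of the helper above (illustrative comment only; assumes
# a dense, C-contiguous float array). The helper returns the best centers over
# n_init runs, plus labels and inertia recomputed exactly via
# _daal4py_k_means_predict:
#
#     rs = check_random_state(0)
#     X_demo = np.ascontiguousarray(rs.rand(50, 3))
#     centers, labels, inertia, n_iter = _daal4py_k_means_fit(
#         X_demo, nClusters=3, numIterations=300, tol=1e-4,
#         cluster_centers_0='k-means++', n_init=10, verbose=False,
#         random_state=rs)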
def _fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        Not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).
    """
    if hasattr(self, 'precompute_distances'):
        if self.precompute_distances != 'deprecated':
            if sklearn_check_version('0.24'):
                warnings.warn("'precompute_distances' was deprecated in version "
                              "0.23 and will be removed in 1.0 (renaming of 0.25)."
                              " It has no effect", FutureWarning)
            elif sklearn_check_version('0.23'):
                warnings.warn("'precompute_distances' was deprecated in version "
                              "0.23 and will be removed in 0.25. It has no "
                              "effect", FutureWarning)

    self._n_threads = None
    if hasattr(self, 'n_jobs'):
        if self.n_jobs != 'deprecated':
            if sklearn_check_version('0.24'):
                warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                              " removed in 1.0 (renaming of 0.25).", FutureWarning)
            elif sklearn_check_version('0.23'):
                warnings.warn("'n_jobs' was deprecated in version 0.23 and will be"
                              " removed in 0.25.", FutureWarning)
            self._n_threads = self.n_jobs
    self._n_threads = _openmp_effective_n_threads(self._n_threads)

    if self.n_init <= 0:
        raise ValueError(
            f"n_init should be > 0, got {self.n_init} instead.")

    random_state = check_random_state(self.random_state)

    if sklearn_check_version("1.0"):
        self._check_feature_names(X, reset=True)

    if self.max_iter <= 0:
        raise ValueError(
            f"max_iter should be > 0, got {self.max_iter} instead.")

    algorithm = self.algorithm
    if algorithm == "elkan" and self.n_clusters == 1:
        warnings.warn("algorithm='elkan' doesn't make sense for a single "
                      "cluster. Using 'full' instead.", RuntimeWarning)
        algorithm = "full"
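    # Note: "auto" resolves below as in the scikit-learn versions this module
    # targets: "full" (plain Lloyd) for a single cluster, "elkan" otherwise.
    # The daal4py backend itself always runs Lloyd's algorithm, so the
    # resolved value only matters for the stock scikit-learn fallback.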
    if algorithm == "auto":
        algorithm = "full" if self.n_clusters == 1 else "elkan"

    if algorithm not in ["full", "elkan"]:
        raise ValueError("Algorithm must be 'auto', 'full' or 'elkan', got"
                         " {}".format(str(algorithm)))

    X_len = _num_samples(X)

    _patching_status = PatchingConditionsChain(
        "sklearn.cluster.KMeans.fit")
    _dal_ready = _patching_status.and_conditions([
        (self.n_clusters <= X_len,
         "The number of clusters is larger than the number of samples in X.")])

    if _dal_ready and sample_weight is not None:
        if isinstance(sample_weight, numbers.Number):
            sample_weight = np.full(X_len, sample_weight, dtype=np.float64)
        else:
            sample_weight = np.asarray(sample_weight)
        _dal_ready = _patching_status.and_conditions([
            (sample_weight.shape == (X_len,),
             "Sample weights do not have the same length as X."),
            (np.allclose(sample_weight, np.ones_like(sample_weight)),
             "Sample weights are not ones.")])

    _patching_status.write_log()
    if _dal_ready:
        X = check_array(X, accept_sparse='csr', dtype=[np.float64, np.float32])
        self.n_features_in_ = X.shape[1]
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_fit(
                X, self.n_clusters, self.max_iter, self.tol, self.init,
                self.n_init, self.verbose, random_state)
    else:
        super(KMeans, self).fit(X, y=y, sample_weight=sample_weight)
    return self


def _daal4py_check_test_data(self, X):
    if sklearn_check_version("1.0"):
        self._check_feature_names(X, reset=False)
    X = check_array(
        X, accept_sparse='csr', dtype=[np.float64, np.float32],
        accept_large_sparse=False)
    if self.n_features_in_ != X.shape[1]:
        raise ValueError(
            (f'X has {X.shape[1]} features, '
             f'but KMeans is expecting {self.n_features_in_} features as input'))
    return X


def _predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
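
    Notes
    -----
    When the daal4py patching conditions hold (no sample weights, and input
    that either exposes `__array__` or is CSR sparse), prediction is
    delegated to `_daal4py_k_means_predict`; otherwise the stock
    scikit-learn `_labels_inertia` path is used.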
""" check_is_fitted(self) X = _daal4py_check_test_data(self, X) _patching_status = PatchingConditionsChain( "sklearn.cluster.KMeans.predict") _patching_status.and_conditions([ (sample_weight is None, "Sample weights are not supported."), (hasattr(X, '__array__'), "X does not have '__array__' attribute.") ]) _dal_ready = _patching_status.or_conditions([ (sp.isspmatrix_csr(X), "X is not sparse.") ]) _patching_status.write_log() if _dal_ready: return _daal4py_k_means_predict( X, self.n_clusters, self.cluster_centers_)[0] x_squared_norms = row_norms(X, squared=True) return _labels_inertia(X, sample_weight, x_squared_norms, self.cluster_centers_)[0] class KMeans(KMeans_original): __doc__ = KMeans_original.__doc__ if sklearn_check_version('1.0'): @_deprecate_positional_args def __init__( self, n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=1e-4, verbose=0, random_state=None, copy_x=True, algorithm='auto', ): super(KMeans, self).__init__( n_clusters=n_clusters, init=init, max_iter=max_iter, tol=tol, n_init=n_init, verbose=verbose, random_state=random_state, copy_x=copy_x, algorithm=algorithm, ) else: @_deprecate_positional_args def __init__( self, n_clusters=8, *, init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='deprecated', verbose=0, random_state=None, copy_x=True, n_jobs='deprecated', algorithm='auto', ): super(KMeans, self).__init__( n_clusters=n_clusters, init=init, max_iter=max_iter, tol=tol, precompute_distances=precompute_distances, n_init=n_init, verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs, algorithm=algorithm, ) @support_usm_ndarray() def fit(self, X, y=None, sample_weight=None): """ Compute k-means clustering. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) Training instances to cluster. It must be noted that the data will be converted to C ordering, which will cause a memory copy if the given data is not C-contiguous. If a sparse matrix is passed, a copy will be made if it's not in CSR format. y : Ignored Not used, present here for API consistency by convention. sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations are assigned equal weight. .. versionadded:: 0.20 Returns ------- self : object Fitted estimator. """ return _fit(self, X, y=y, sample_weight=sample_weight) @support_usm_ndarray() def predict(self, X, sample_weight=None): """ Predict the closest cluster each sample in X belongs to. In the vector quantization literature, `cluster_centers_` is called the code book and each value returned by `predict` is the index of the closest code in the code book. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) New data to predict. sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations are assigned equal weight. Returns ------- labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ return _predict(self, X, sample_weight=sample_weight) @support_usm_ndarray() def fit_predict(self, X, y=None, sample_weight=None): """ Compute cluster centers and predict cluster index for each sample. Convenience method; equivalent to calling fit(X) followed by predict(X). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) New data to transform. y : Ignored Not used, present here for API consistency by convention. 
sample_weight : array-like of shape (n_samples,), default=None The weights for each observation in X. If None, all observations are assigned equal weight. Returns ------- labels : ndarray of shape (n_samples,) Index of the cluster each sample belongs to. """ return super().fit_predict(X, y, sample_weight)
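
# ---------------------------------------------------------------------------
# Minimal end-to-end sketch (illustrative comment only; assumes this module is
# importable as daal4py.sklearn.cluster and that daal4py is installed):
#
#     import numpy as np
#     from daal4py.sklearn.cluster import KMeans
#
#     X = np.random.RandomState(0).rand(100, 4)
#     km = KMeans(n_clusters=3, random_state=0).fit(X)
#     print(km.labels_[:10], km.inertia_, km.n_iter_)
#
# When the conditions logged by PatchingConditionsChain are met, fit and
# predict above run through daal4py; otherwise they fall back to the stock
# scikit-learn implementation.
# ---------------------------------------------------------------------------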