#=============================================================================== # Copyright 2020-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. #=============================================================================== import pytest from sklearn.neighbors \ import KNeighborsClassifier as ScikitKNeighborsClassifier from daal4py.sklearn.neighbors \ import KNeighborsClassifier as DaalKNeighborsClassifier from sklearn.datasets import load_iris from sklearn.metrics import (accuracy_score, log_loss, roc_auc_score) from sklearn.model_selection import train_test_split from daal4py.sklearn._utils import daal_check_version DISTANCES = ['minkowski'] ALGORITHMS = ['brute', 'kd_tree', 'auto'] WEIGHTS = ['uniform', 'distance'] KS = [1, 3, 7, 15, 31] N_TRIES = 10 ACCURACY_RATIO = 1.0 if daal_check_version(((2020, 'P', 300))) else 0.9 LOG_LOSS_RATIO = 1.02 ROC_AUC_RATIO = 0.999 IRIS = load_iris() def _test_determenistic(distance, algorithm, weight, k): x_train, x_test, y_train, y_test = \ train_test_split(IRIS.data, IRIS.target, test_size=0.33, random_state=31) alg_results = [] for _ in range(N_TRIES): # models scikit_model = ScikitKNeighborsClassifier(n_neighbors=k, weights=weight, algorithm=algorithm, leaf_size=30, p=2, metric=distance) daal_model = DaalKNeighborsClassifier(n_neighbors=k, weights=weight, algorithm=algorithm, leaf_size=30, p=2, metric=distance) # training scikit_model.fit(x_train, y_train) daal_model.fit(x_train, y_train) # predict scikit_predict = scikit_model.predict(x_test) daal_predict = daal_model.predict(x_test) distances, indices = daal_model.kneighbors(x_test) alg_results.append((distances, indices, daal_predict)) # accuracy scikit_accuracy = accuracy_score(y_test, scikit_predict) daal_accuracy = accuracy_score(y_test, daal_predict) ratio = daal_accuracy / scikit_accuracy reason = ("kNN accuracy: scikit_accuracy={},daal_accuracy={}, ratio={}".format( scikit_accuracy, daal_accuracy, ratio)) assert ratio >= ACCURACY_RATIO, reason # predict proba scikit_predict_proba = scikit_model.predict_proba(x_test) daal_predict_proba = daal_model.predict_proba(x_test) # log loss scikit_log_loss = log_loss(y_test, scikit_predict_proba) daal_log_loss = log_loss(y_test, daal_predict_proba) ratio = daal_log_loss / scikit_log_loss reason = "kNN log_loss: scikit_log_loss={},daal_log_loss={}, ratio={}".format( scikit_log_loss, daal_log_loss, ratio) assert ratio <= LOG_LOSS_RATIO, reason # ROC AUC scikit_roc_auc = roc_auc_score( y_test, scikit_predict_proba, multi_class='ovr') daal_roc_auc = roc_auc_score( y_test, daal_predict_proba, multi_class='ovr') ratio = daal_roc_auc / scikit_roc_auc reason = "kNN roc_auc: scikit_roc_auc={}, daal_roc_auc={}, ratio={}".format( scikit_roc_auc, daal_roc_auc, ratio) assert ratio >= ROC_AUC_RATIO, reason for i in range(1, N_TRIES): for j, res in enumerate(alg_results[i]): reason = 'Results are different between runs for {}, {}, {}, k={}'.format( algorithm, weight, distance, k) assert (res == alg_results[0][j]).mean() == 1, reason @pytest.mark.parametrize('distance', DISTANCES) @pytest.mark.parametrize('algorithm', ALGORITHMS) @pytest.mark.parametrize('weight', WEIGHTS) @pytest.mark.parametrize('k', KS) def test_determenistic(distance, algorithm, weight, k): _test_determenistic(distance, algorithm, weight, k)