diff --git a/notebooks/development/029-deterministic-test.ipynb b/notebooks/development/029-deterministic-test.ipynb new file mode 100644 index 0000000..0329ace --- /dev/null +++ b/notebooks/development/029-deterministic-test.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import RepeatedStratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "from smote_variants.oversampling import SMOTE\n", + "from common_datasets.binary_classification import get_filtered_data_loaders\n", + "import common_datasets.binary_classification as binclas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger('smote_variants')\n", + "logger.setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(498, 21)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "binclas.load_cm1()['data'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "classifiers = {\n", + "DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in range(2, 10, 2)],\n", + "RandomForestClassifier: [{'max_depth': md, 'random_state': 5} for md in range(2, 10, 2)],\n", + "KNeighborsClassifier: [{'n_neighbors': nn} for nn in range(1, 10, 2)],\n", + "SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = get_filtered_data_loaders(n_col_bounds=(2, 40),\n", + " n_bounds=(10, 500),\n", + " n_minority_bounds=(10, 500),\n", + " n_from_phenotypes=1,\n", + " n_smallest=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "smote_params = [\n", + " {'n_neighbors': 3, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 5, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 7, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 3, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 5, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 7, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 3, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 5, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 7, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 3, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 5, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 7, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 3, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 5, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 7, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 3, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 5, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 7, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-14 21:06:31.104964 appendicitis\n", + "2023-12-14 21:23:36.600311 bupa\n", + "2023-12-14 21:47:51.281804 cleveland-0_vs_4\n", + "2023-12-14 22:07:33.462380 CM1\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 28\u001b[0m\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m param \u001b[38;5;129;01min\u001b[39;00m cparams:\n\u001b[1;32m 27\u001b[0m classifier_obj \u001b[38;5;241m=\u001b[39m classifier(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mparam)\n\u001b[0;32m---> 28\u001b[0m \u001b[43mclassifier_obj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_samp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_samp\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 29\u001b[0m y_pred \u001b[38;5;241m=\u001b[39m classifier_obj\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)\n\u001b[1;32m 30\u001b[0m auc \u001b[38;5;241m=\u001b[39m roc_auc_score(y_test, y_pred[:, \u001b[38;5;241m1\u001b[39m])\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/sklearn/base.py:1152\u001b[0m, in \u001b[0;36m_fit_context..decorator..wrapper\u001b[0;34m(estimator, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1145\u001b[0m estimator\u001b[38;5;241m.\u001b[39m_validate_params()\n\u001b[1;32m 1147\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m config_context(\n\u001b[1;32m 1148\u001b[0m skip_parameter_validation\u001b[38;5;241m=\u001b[39m(\n\u001b[1;32m 1149\u001b[0m prefer_skip_nested_validation \u001b[38;5;129;01mor\u001b[39;00m global_skip_validation\n\u001b[1;32m 1150\u001b[0m )\n\u001b[1;32m 1151\u001b[0m ):\n\u001b[0;32m-> 1152\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfit_method\u001b[49m\u001b[43m(\u001b[49m\u001b[43mestimator\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/sklearn/svm/_base.py:250\u001b[0m, in \u001b[0;36mBaseLibSVM.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m[LibSVM]\u001b[39m\u001b[38;5;124m\"\u001b[39m, end\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 249\u001b[0m seed \u001b[38;5;241m=\u001b[39m rnd\u001b[38;5;241m.\u001b[39mrandint(np\u001b[38;5;241m.\u001b[39miinfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mi\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mmax)\n\u001b[0;32m--> 250\u001b[0m \u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msolver_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mrandom_seed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mseed\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 251\u001b[0m \u001b[38;5;66;03m# see comment on the other call to np.iinfo in this file\u001b[39;00m\n\u001b[1;32m 253\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mshape_fit_ \u001b[38;5;241m=\u001b[39m X\u001b[38;5;241m.\u001b[39mshape \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(X, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mshape\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01melse\u001b[39;00m (n_samples,)\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/sklearn/svm/_base.py:329\u001b[0m, in \u001b[0;36mBaseLibSVM._dense_fit\u001b[0;34m(self, X, y, sample_weight, solver_type, kernel, random_seed)\u001b[0m\n\u001b[1;32m 315\u001b[0m libsvm\u001b[38;5;241m.\u001b[39mset_verbosity_wrap(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mverbose)\n\u001b[1;32m 317\u001b[0m \u001b[38;5;66;03m# we don't pass **self.get_params() to allow subclasses to\u001b[39;00m\n\u001b[1;32m 318\u001b[0m \u001b[38;5;66;03m# add other parameters to __init__\u001b[39;00m\n\u001b[1;32m 319\u001b[0m (\n\u001b[1;32m 320\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msupport_,\n\u001b[1;32m 321\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msupport_vectors_,\n\u001b[1;32m 322\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_n_support,\n\u001b[1;32m 323\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdual_coef_,\n\u001b[1;32m 324\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mintercept_,\n\u001b[1;32m 325\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_probA,\n\u001b[1;32m 326\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_probB,\n\u001b[1;32m 327\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mfit_status_,\n\u001b[1;32m 328\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_num_iter,\n\u001b[0;32m--> 329\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[43mlibsvm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 330\u001b[0m \u001b[43m \u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 331\u001b[0m \u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 332\u001b[0m \u001b[43m \u001b[49m\u001b[43msvm_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msolver_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 333\u001b[0m \u001b[43m \u001b[49m\u001b[43msample_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msample_weight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 334\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;66;43;03m# TODO(1.4): Replace \"_class_weight\" with \"class_weight_\"\u001b[39;49;00m\n\u001b[1;32m 335\u001b[0m \u001b[43m \u001b[49m\u001b[43mclass_weight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m_class_weight\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mempty\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 336\u001b[0m \u001b[43m \u001b[49m\u001b[43mkernel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkernel\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 337\u001b[0m \u001b[43m \u001b[49m\u001b[43mC\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mC\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 338\u001b[0m \u001b[43m \u001b[49m\u001b[43mnu\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnu\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 339\u001b[0m \u001b[43m \u001b[49m\u001b[43mprobability\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mprobability\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 340\u001b[0m \u001b[43m \u001b[49m\u001b[43mdegree\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdegree\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 341\u001b[0m \u001b[43m \u001b[49m\u001b[43mshrinking\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mshrinking\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 342\u001b[0m \u001b[43m \u001b[49m\u001b[43mtol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtol\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 343\u001b[0m \u001b[43m \u001b[49m\u001b[43mcache_size\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcache_size\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 344\u001b[0m \u001b[43m \u001b[49m\u001b[43mcoef0\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcoef0\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 345\u001b[0m \u001b[43m \u001b[49m\u001b[43mgamma\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_gamma\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 346\u001b[0m \u001b[43m \u001b[49m\u001b[43mepsilon\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mepsilon\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 347\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_iter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_iter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 348\u001b[0m \u001b[43m \u001b[49m\u001b[43mrandom_seed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrandom_seed\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 349\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_warn_from_fit_status()\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for data_loader in datasets:\n", + " results = []\n", + " dataset = data_loader()\n", + " print(datetime.datetime.now(), dataset['name'])\n", + " X = dataset['data']\n", + " y = dataset['target']\n", + "\n", + " validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=5)\n", + "\n", + " for fidx, (train, test) in enumerate(validator.split(X, y, y)):\n", + " X_train = X[train]\n", + " X_test = X[test]\n", + " y_train = y[train]\n", + " y_test = y[test]\n", + "\n", + " ss = StandardScaler()\n", + " ss.fit(X_train)\n", + " X_train = ss.transform(X_train)\n", + " X_test = ss.transform(X_test)\n", + "\n", + " for sparam in smote_params:\n", + " oversampling = SMOTE(**sparam)\n", + " X_samp, y_samp = oversampling.sample(X_train, y_train)\n", + "\n", + " for classifier, cparams in classifiers.items():\n", + " for param in cparams:\n", + " classifier_obj = classifier(**param)\n", + " classifier_obj.fit(X_samp, y_samp)\n", + " y_pred = classifier_obj.predict_proba(X_test)\n", + " auc = roc_auc_score(y_test, y_pred[:, 1])\n", + " results.append({'name': dataset['name'],\n", + " 'fold': fidx,\n", + " 'sparam': sparam,\n", + " 'classifier': classifier.__name__,\n", + " 'cparam': param,\n", + " 'auc': auc})\n", + " data = pd.DataFrame.from_dict(results)\n", + " data.to_csv(f'{dataset[\"name\"]}.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namefoldsparamclassifiercparamauc
0appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.582353
1appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.723529
2appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.764706
3appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.782353
4appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.976471
.....................
441appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.647059
442appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.661765
443appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.661765
444appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.647059
445appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.676471
\n", + "

446 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " name fold sparam \\\n", + "0 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "1 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "2 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "3 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "4 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + ".. ... ... ... \n", + "441 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "442 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "443 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "444 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "445 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "\n", + " classifier \\\n", + "0 DecisionTreeClassifier \n", + "1 DecisionTreeClassifier \n", + "2 DecisionTreeClassifier \n", + "3 DecisionTreeClassifier \n", + "4 RandomForestClassifier \n", + ".. ... \n", + "441 DecisionTreeClassifier \n", + "442 DecisionTreeClassifier \n", + "443 DecisionTreeClassifier \n", + "444 RandomForestClassifier \n", + "445 RandomForestClassifier \n", + "\n", + " cparam auc \n", + "0 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.582353 \n", + "1 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.723529 \n", + "2 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.764706 \n", + "3 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.782353 \n", + "4 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.976471 \n", + ".. ... ... \n", + "441 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.647059 \n", + "442 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.661765 \n", + "443 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.661765 \n", + "444 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.647059 \n", + "445 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.676471 \n", + "\n", + "[446 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = pd.DataFrame.from_dict(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/030-analysis.ipynb b/notebooks/development/030-analysis.ipynb new file mode 100644 index 0000000..7c0b46c --- /dev/null +++ b/notebooks/development/030-analysis.ipynb @@ -0,0 +1,1290 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 316, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import wilcoxon\n", + "import common_datasets.binary_classification as binclas" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('haberman.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0namefoldsparamclassifiercparamauc
00haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 2, 'random_state': 5}0.604575
11haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}0.594771
22haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 6, 'random_state': 5}0.639216
33haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 8, 'random_state': 5}0.666667
44haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...RandomForestClassifier{'max_depth': 2, 'random_state': 5}0.601961
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 name fold \\\n", + "0 0 haberman 0 \n", + "1 1 haberman 0 \n", + "2 2 haberman 0 \n", + "3 3 haberman 0 \n", + "4 4 haberman 0 \n", + "\n", + " sparam classifier \\\n", + "0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "1 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "2 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "3 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "4 {'n_neighbors': 3, 'proportion': 0.5, 'random_... RandomForestClassifier \n", + "\n", + " cparam auc \n", + "0 {'max_depth': 2, 'random_state': 5} 0.604575 \n", + "1 {'max_depth': 4, 'random_state': 5} 0.594771 \n", + "2 {'max_depth': 6, 'random_state': 5} 0.639216 \n", + "3 {'max_depth': 8, 'random_state': 5} 0.666667 \n", + "4 {'max_depth': 2, 'random_state': 5} 0.601961 " + ] + }, + "execution_count": 318, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 319, + "metadata": {}, + "outputs": [], + "source": [ + "data['sparam'] = data['sparam'].apply(eval)\n", + "data['cparam'] = data['cparam'].apply(eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_key(dict, key):\n", + " del dict[key]\n", + " return dict" + ] + }, + { + "cell_type": "code", + "execution_count": 321, + "metadata": {}, + "outputs": [], + "source": [ + "data['deterministic'] = data['sparam'].apply(lambda x: x['ss_params']['within_simplex_sampling'])\n", + "data['sparam'] = data['sparam'].apply(lambda x: remove_key(x, 'ss_params'))\n", + "data['cparam'] = data['cparam'].apply(str)\n", + "data['sparam'] = data['sparam'].apply(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 322, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'name', 'fold', 'sparam', 'classifier', 'cparam', 'auc',\n", + " 'deterministic'],\n", + " dtype='object')" + ] + }, + "execution_count": 322, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 323, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0namefoldsparamclassifiercparamaucdeterministic
00haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 2, 'random_state': 5}0.604575random
11haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}0.594771random
22haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 6, 'random_state': 5}0.639216random
33haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier{'max_depth': 8, 'random_state': 5}0.666667random
44haberman0{'n_neighbors': 3, 'proportion': 0.5, 'random_...RandomForestClassifier{'max_depth': 2, 'random_state': 5}0.601961random
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 name fold \\\n", + "0 0 haberman 0 \n", + "1 1 haberman 0 \n", + "2 2 haberman 0 \n", + "3 3 haberman 0 \n", + "4 4 haberman 0 \n", + "\n", + " sparam classifier \\\n", + "0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "1 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "2 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "3 {'n_neighbors': 3, 'proportion': 0.5, 'random_... DecisionTreeClassifier \n", + "4 {'n_neighbors': 3, 'proportion': 0.5, 'random_... RandomForestClassifier \n", + "\n", + " cparam auc deterministic \n", + "0 {'max_depth': 2, 'random_state': 5} 0.604575 random \n", + "1 {'max_depth': 4, 'random_state': 5} 0.594771 random \n", + "2 {'max_depth': 6, 'random_state': 5} 0.639216 random \n", + "3 {'max_depth': 8, 'random_state': 5} 0.666667 random \n", + "4 {'max_depth': 2, 'random_state': 5} 0.601961 random " + ] + }, + "execution_count": 323, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 324, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = data.groupby(['name', 'sparam', 'classifier', 'cparam', 'deterministic']).apply(lambda pdf: pdf.sort_values('fold')['auc'].values.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": 325, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = grouped.reset_index(drop=False)\n", + "grouped = grouped.rename(columns={0: 'auc'})\n", + "determ = grouped[grouped['deterministic'] == 'deterministic'].drop(columns=['deterministic'])\n", + "rand = grouped[grouped['deterministic'] == 'random'].drop(columns=['deterministic'])\n", + "merged = pd.merge(determ.rename(columns={'auc': 'auc_det'}), rand, on=['name', 'sparam', 'classifier', 'cparam'])" + ] + }, + { + "cell_type": "code", + "execution_count": 326, + "metadata": {}, + "outputs": [], + "source": [ + "merged['auc_mean_det'] = merged['auc_det'].apply(np.mean)\n", + "merged['auc_std_det'] = merged['auc_det'].apply(np.std)\n", + "merged['auc_min_det'] = merged['auc_det'].apply(np.min)\n", + "merged['auc_max_det'] = merged['auc_det'].apply(np.max)\n", + "merged['auc_mean'] = merged['auc'].apply(np.mean)\n", + "merged['auc_std'] = merged['auc'].apply(np.std)\n", + "merged['auc_min'] = merged['auc'].apply(np.min)\n", + "merged['auc_max'] = merged['auc'].apply(np.max)\n", + "merged['p_l'] = merged.apply(lambda row: wilcoxon(row['auc_det'], row['auc'], zero_method='zsplit', alternative='less').pvalue, axis=1)\n", + "merged['p_g'] = merged.apply(lambda row: wilcoxon(row['auc_det'], row['auc'], zero_method='zsplit', alternative='greater').pvalue, axis=1)\n", + "merged['f_l'] = merged['p_l'] < 0.05\n", + "merged['f_g'] = merged['p_g'] < 0.05" + ] + }, + { + "cell_type": "code", + "execution_count": 327, + "metadata": {}, + "outputs": [], + "source": [ + "def model_selection(pdf):\n", + " max_det = pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()].iloc[0]\n", + " max_ran = pdf[pdf['auc_mean'] == pdf['auc_mean'].max()].iloc[0]\n", + " return pd.Series({'auc_mean_det': max_det['auc_mean_det'],\n", + " 'auc_mean': max_ran['auc_mean'],\n", + " 'auc_std_det': max_det['auc_std_det'],\n", + " 'auc_std': max_ran['auc_std'],\n", + " 'p_l': wilcoxon(max_det['auc_det'], max_ran['auc'], zero_method='zsplit', alternative='less').pvalue,\n", + " 'p_g': wilcoxon(max_det['auc_det'], max_ran['auc'], zero_method='zsplit', alternative='greater').pvalue,\n", + " 'auc_median_det': np.median(max_det['auc_det']),\n", + " 'auc_median': np.median(max_ran['auc'])})" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
auc_mean_detauc_meanauc_std_detauc_stdp_lp_gauc_median_detauc_median
classifier
DecisionTreeClassifier0.6655360.6655140.0657660.0608710.6277070.3722930.6663190.670139
KNeighborsClassifier0.6430670.6430930.0714460.0699340.4468550.5531450.6464460.649163
RandomForestClassifier0.7103250.7086440.0605500.0626520.9097470.0902530.7114580.711111
SVC0.7192210.7189380.0709950.0708850.8101470.1898530.7238970.722917
\n", + "
" + ], + "text/plain": [ + " auc_mean_det auc_mean auc_std_det auc_std \\\n", + "classifier \n", + "DecisionTreeClassifier 0.665536 0.665514 0.065766 0.060871 \n", + "KNeighborsClassifier 0.643067 0.643093 0.071446 0.069934 \n", + "RandomForestClassifier 0.710325 0.708644 0.060550 0.062652 \n", + "SVC 0.719221 0.718938 0.070995 0.070885 \n", + "\n", + " p_l p_g auc_median_det auc_median \n", + "classifier \n", + "DecisionTreeClassifier 0.627707 0.372293 0.666319 0.670139 \n", + "KNeighborsClassifier 0.446855 0.553145 0.646446 0.649163 \n", + "RandomForestClassifier 0.909747 0.090253 0.711458 0.711111 \n", + "SVC 0.810147 0.189853 0.723897 0.722917 " + ] + }, + "execution_count": 328, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(model_selection)" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
classifier
DecisionTreeClassifier155haberman{'n_neighbors': 7, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}[0.5908496732026143, 0.7548611111111111, 0.653...[0.630718954248366, 0.8020833333333333, 0.7270...0.6655360.0657660.4743060.8076390.6606990.0733280.4347220.8263890.9152680.084732FalseFalse
KNeighborsClassifier116haberman{'n_neighbors': 5, 'proportion': 1.5, 'random_...KNeighborsClassifier{'n_neighbors': 5}[0.550326797385621, 0.6277777777777778, 0.5958...[0.565359477124183, 0.6069444444444444, 0.5687...0.6430670.0714460.4194440.7965280.6430930.0699340.3902780.7979170.4468550.553145FalseFalse
RandomForestClassifier120haberman{'n_neighbors': 5, 'proportion': 1.5, 'random_...RandomForestClassifier{'max_depth': 4, 'random_state': 5}[0.5947712418300654, 0.7388888888888889, 0.75,...[0.5934640522875817, 0.7486111111111111, 0.730...0.7103250.0605500.4915030.8736110.7078790.0596070.5163400.8666670.9849350.015065FalseTrue
SVC150haberman{'n_neighbors': 7, 'proportion': 0.5, 'random_...SVC{'C': 0.01, 'probability': True, 'random_state...[0.6261437908496732, 0.751388888888889, 0.6930...[0.615686274509804, 0.75, 0.6958333333333333, ...0.7192210.0709950.4627450.8516340.7187000.0709600.4588240.8490200.9289570.071043FalseFalse
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "classifier \n", + "DecisionTreeClassifier 155 haberman \n", + "KNeighborsClassifier 116 haberman \n", + "RandomForestClassifier 120 haberman \n", + "SVC 150 haberman \n", + "\n", + " sparam \\\n", + "classifier \n", + "DecisionTreeClassifier 155 {'n_neighbors': 7, 'proportion': 1.0, 'random_... \n", + "KNeighborsClassifier 116 {'n_neighbors': 5, 'proportion': 1.5, 'random_... \n", + "RandomForestClassifier 120 {'n_neighbors': 5, 'proportion': 1.5, 'random_... \n", + "SVC 150 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "\n", + " classifier \\\n", + "classifier \n", + "DecisionTreeClassifier 155 DecisionTreeClassifier \n", + "KNeighborsClassifier 116 KNeighborsClassifier \n", + "RandomForestClassifier 120 RandomForestClassifier \n", + "SVC 150 SVC \n", + "\n", + " cparam \\\n", + "classifier \n", + "DecisionTreeClassifier 155 {'max_depth': 4, 'random_state': 5} \n", + "KNeighborsClassifier 116 {'n_neighbors': 5} \n", + "RandomForestClassifier 120 {'max_depth': 4, 'random_state': 5} \n", + "SVC 150 {'C': 0.01, 'probability': True, 'random_state... \n", + "\n", + " auc_det \\\n", + "classifier \n", + "DecisionTreeClassifier 155 [0.5908496732026143, 0.7548611111111111, 0.653... \n", + "KNeighborsClassifier 116 [0.550326797385621, 0.6277777777777778, 0.5958... \n", + "RandomForestClassifier 120 [0.5947712418300654, 0.7388888888888889, 0.75,... \n", + "SVC 150 [0.6261437908496732, 0.751388888888889, 0.6930... \n", + "\n", + " auc \\\n", + "classifier \n", + "DecisionTreeClassifier 155 [0.630718954248366, 0.8020833333333333, 0.7270... \n", + "KNeighborsClassifier 116 [0.565359477124183, 0.6069444444444444, 0.5687... \n", + "RandomForestClassifier 120 [0.5934640522875817, 0.7486111111111111, 0.730... \n", + "SVC 150 [0.615686274509804, 0.75, 0.6958333333333333, ... \n", + "\n", + " auc_mean_det auc_std_det auc_min_det \\\n", + "classifier \n", + "DecisionTreeClassifier 155 0.665536 0.065766 0.474306 \n", + "KNeighborsClassifier 116 0.643067 0.071446 0.419444 \n", + "RandomForestClassifier 120 0.710325 0.060550 0.491503 \n", + "SVC 150 0.719221 0.070995 0.462745 \n", + "\n", + " auc_max_det auc_mean auc_std auc_min \\\n", + "classifier \n", + "DecisionTreeClassifier 155 0.807639 0.660699 0.073328 0.434722 \n", + "KNeighborsClassifier 116 0.796528 0.643093 0.069934 0.390278 \n", + "RandomForestClassifier 120 0.873611 0.707879 0.059607 0.516340 \n", + "SVC 150 0.851634 0.718700 0.070960 0.458824 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "classifier \n", + "DecisionTreeClassifier 155 0.826389 0.915268 0.084732 False False \n", + "KNeighborsClassifier 116 0.797917 0.446855 0.553145 False False \n", + "RandomForestClassifier 120 0.866667 0.984935 0.015065 False True \n", + "SVC 150 0.849020 0.928957 0.071043 False False " + ] + }, + "execution_count": 329, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()])" + ] + }, + { + "cell_type": "code", + "execution_count": 330, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
150haberman{'n_neighbors': 7, 'proportion': 0.5, 'random_...SVC{'C': 0.01, 'probability': True, 'random_state...[0.6261437908496732, 0.751388888888889, 0.6930...[0.615686274509804, 0.75, 0.6958333333333333, ...0.7192210.0709950.4627450.8516340.71870.070960.4588240.849020.9289570.071043FalseFalse
\n", + "
" + ], + "text/plain": [ + " name sparam classifier \\\n", + "150 haberman {'n_neighbors': 7, 'proportion': 0.5, 'random_... SVC \n", + "\n", + " cparam \\\n", + "150 {'C': 0.01, 'probability': True, 'random_state... \n", + "\n", + " auc_det \\\n", + "150 [0.6261437908496732, 0.751388888888889, 0.6930... \n", + "\n", + " auc auc_mean_det \\\n", + "150 [0.615686274509804, 0.75, 0.6958333333333333, ... 0.719221 \n", + "\n", + " auc_std_det auc_min_det auc_max_det auc_mean auc_std auc_min \\\n", + "150 0.070995 0.462745 0.851634 0.7187 0.07096 0.458824 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "150 0.84902 0.928957 0.071043 False False " + ] + }, + "execution_count": 330, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['auc_mean_det'] == merged['auc_mean_det'].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 331, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
classifier
DecisionTreeClassifier176haberman{'n_neighbors': 7, 'proportion': 1.5, 'random_...DecisionTreeClassifier{'max_depth': 2, 'random_state': 5}[0.5215686274509804, 0.8076388888888888, 0.655...[0.5875816993464053, 0.8083333333333332, 0.672...0.6654760.0635160.4908500.8500000.6655140.0608710.4908500.8222220.5837730.416227FalseFalse
KNeighborsClassifier116haberman{'n_neighbors': 5, 'proportion': 1.5, 'random_...KNeighborsClassifier{'n_neighbors': 5}[0.550326797385621, 0.6277777777777778, 0.5958...[0.565359477124183, 0.6069444444444444, 0.5687...0.6430670.0714460.4194440.7965280.6430930.0699340.3902780.7979170.4468550.553145FalseFalse
RandomForestClassifier164haberman{'n_neighbors': 7, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 4, 'random_state': 5}[0.6052287581699347, 0.75, 0.7305555555555555,...[0.5830065359477125, 0.7361111111111112, 0.773...0.7089540.0618710.5006540.8791670.7086440.0626520.5163400.8791670.6954200.304580FalseFalse
SVC153haberman{'n_neighbors': 7, 'proportion': 0.5, 'random_...SVC{'C': 0.1, 'probability': True, 'random_state'...[0.6235294117647059, 0.7486111111111112, 0.691...[0.6143790849673203, 0.75, 0.701388888888889, ...0.7191170.0712290.4653590.8542480.7189380.0708850.4588240.8490200.6654000.334600FalseFalse
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "classifier \n", + "DecisionTreeClassifier 176 haberman \n", + "KNeighborsClassifier 116 haberman \n", + "RandomForestClassifier 164 haberman \n", + "SVC 153 haberman \n", + "\n", + " sparam \\\n", + "classifier \n", + "DecisionTreeClassifier 176 {'n_neighbors': 7, 'proportion': 1.5, 'random_... \n", + "KNeighborsClassifier 116 {'n_neighbors': 5, 'proportion': 1.5, 'random_... \n", + "RandomForestClassifier 164 {'n_neighbors': 7, 'proportion': 1.0, 'random_... \n", + "SVC 153 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "\n", + " classifier \\\n", + "classifier \n", + "DecisionTreeClassifier 176 DecisionTreeClassifier \n", + "KNeighborsClassifier 116 KNeighborsClassifier \n", + "RandomForestClassifier 164 RandomForestClassifier \n", + "SVC 153 SVC \n", + "\n", + " cparam \\\n", + "classifier \n", + "DecisionTreeClassifier 176 {'max_depth': 2, 'random_state': 5} \n", + "KNeighborsClassifier 116 {'n_neighbors': 5} \n", + "RandomForestClassifier 164 {'max_depth': 4, 'random_state': 5} \n", + "SVC 153 {'C': 0.1, 'probability': True, 'random_state'... \n", + "\n", + " auc_det \\\n", + "classifier \n", + "DecisionTreeClassifier 176 [0.5215686274509804, 0.8076388888888888, 0.655... \n", + "KNeighborsClassifier 116 [0.550326797385621, 0.6277777777777778, 0.5958... \n", + "RandomForestClassifier 164 [0.6052287581699347, 0.75, 0.7305555555555555,... \n", + "SVC 153 [0.6235294117647059, 0.7486111111111112, 0.691... \n", + "\n", + " auc \\\n", + "classifier \n", + "DecisionTreeClassifier 176 [0.5875816993464053, 0.8083333333333332, 0.672... \n", + "KNeighborsClassifier 116 [0.565359477124183, 0.6069444444444444, 0.5687... \n", + "RandomForestClassifier 164 [0.5830065359477125, 0.7361111111111112, 0.773... \n", + "SVC 153 [0.6143790849673203, 0.75, 0.701388888888889, ... \n", + "\n", + " auc_mean_det auc_std_det auc_min_det \\\n", + "classifier \n", + "DecisionTreeClassifier 176 0.665476 0.063516 0.490850 \n", + "KNeighborsClassifier 116 0.643067 0.071446 0.419444 \n", + "RandomForestClassifier 164 0.708954 0.061871 0.500654 \n", + "SVC 153 0.719117 0.071229 0.465359 \n", + "\n", + " auc_max_det auc_mean auc_std auc_min \\\n", + "classifier \n", + "DecisionTreeClassifier 176 0.850000 0.665514 0.060871 0.490850 \n", + "KNeighborsClassifier 116 0.796528 0.643093 0.069934 0.390278 \n", + "RandomForestClassifier 164 0.879167 0.708644 0.062652 0.516340 \n", + "SVC 153 0.854248 0.718938 0.070885 0.458824 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "classifier \n", + "DecisionTreeClassifier 176 0.822222 0.583773 0.416227 False False \n", + "KNeighborsClassifier 116 0.797917 0.446855 0.553145 False False \n", + "RandomForestClassifier 164 0.879167 0.695420 0.304580 False False \n", + "SVC 153 0.849020 0.665400 0.334600 False False " + ] + }, + "execution_count": 331, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])" + ] + }, + { + "cell_type": "code", + "execution_count": 332, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
153haberman{'n_neighbors': 7, 'proportion': 0.5, 'random_...SVC{'C': 0.1, 'probability': True, 'random_state'...[0.6235294117647059, 0.7486111111111112, 0.691...[0.6143790849673203, 0.75, 0.701388888888889, ...0.7191170.0712290.4653590.8542480.7189380.0708850.4588240.849020.66540.3346FalseFalse
\n", + "
" + ], + "text/plain": [ + " name sparam classifier \\\n", + "153 haberman {'n_neighbors': 7, 'proportion': 0.5, 'random_... SVC \n", + "\n", + " cparam \\\n", + "153 {'C': 0.1, 'probability': True, 'random_state'... \n", + "\n", + " auc_det \\\n", + "153 [0.6235294117647059, 0.7486111111111112, 0.691... \n", + "\n", + " auc auc_mean_det \\\n", + "153 [0.6143790849673203, 0.75, 0.701388888888889, ... 0.719117 \n", + "\n", + " auc_std_det auc_min_det auc_max_det auc_mean auc_std auc_min \\\n", + "153 0.071229 0.465359 0.854248 0.718938 0.070885 0.458824 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "153 0.84902 0.6654 0.3346 False False " + ] + }, + "execution_count": 332, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['auc_mean'] == merged['auc_mean'].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 333, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_mean 0.608057\n", + "auc_std 0.077244\n", + "auc_mean_det 0.607343\n", + "auc_std_det 0.077029\n", + "p_l 0.431314\n", + "p_g 0.568686\n", + "f_l 0.141414\n", + "f_g 0.055556\n", + "dtype: float64" + ] + }, + "execution_count": 333, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det', 'p_l', 'p_g', 'f_l', 'f_g']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 334, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_min 0.381989\n", + "auc_max 0.802708\n", + "auc_min_det 0.380450\n", + "auc_max_det 0.799888\n", + "dtype: float64" + ] + }, + "execution_count": 334, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_min', 'auc_max', 'auc_min_det', 'auc_max_det']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 335, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_mean 0.632868\n", + "auc_std 0.070529\n", + "auc_mean_det 0.631631\n", + "auc_std_det 0.070056\n", + "dtype: float64" + ] + }, + "execution_count": 335, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det']].median()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/031-deterministic-test-joblib-ml.ipynb b/notebooks/development/031-deterministic-test-joblib-ml.ipynb new file mode 100644 index 0000000..a2fdfbd --- /dev/null +++ b/notebooks/development/031-deterministic-test-joblib-ml.ipynb @@ -0,0 +1,476 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import RepeatedStratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "from smote_variants.oversampling import SMOTE\n", + "from common_datasets.binary_classification import get_filtered_data_loaders" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger('smote_variants')\n", + "logger.setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "classifiers = {\n", + "DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in range(4, 10, 2)],\n", + "RandomForestClassifier: [{'max_depth': md, 'random_state': 5} for md in range(4, 10, 2)],\n", + "KNeighborsClassifier: [{'n_neighbors': nn} for nn in range(1, 10, 2)],\n", + "SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = get_filtered_data_loaders(n_col_bounds=(2, 40),\n", + " n_bounds=(10, 500),\n", + " n_minority_bounds=(10, 500),\n", + " n_from_phenotypes=1,\n", + " n_smallest=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "smote_params = [\n", + " #{'n_neighbors': 3, 'proportion': 0.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 5, 'proportion': 0.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 7, 'proportion': 0.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 3, 'proportion': 1.0, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " {'n_neighbors': 5, 'proportion': 1.0, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 7, 'proportion': 1.0, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 3, 'proportion': 1.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 5, 'proportion': 1.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 7, 'proportion': 1.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'id'}},\n", + " #{'n_neighbors': 3, 'proportion': 0.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " #{'n_neighbors': 5, 'proportion': 0.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " #{'n_neighbors': 7, 'proportion': 0.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " #{'n_neighbors': 3, 'proportion': 1.0, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " {'n_neighbors': 5, 'proportion': 1.0, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'MI_weighted'}},\n", + " #{'n_neighbors': 7, 'proportion': 1.0, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " #{'n_neighbors': 3, 'proportion': 1.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " #{'n_neighbors': 5, 'proportion': 1.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + " #{'n_neighbors': 7, 'proportion': 1.5, 'random_state': 5, 'nn_params': {'metric': 'precomputed', 'metric_learning_method': 'n_unique_inv'}},\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "for data_loader in datasets:\n", + " results = []" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "def job_generator(data_loader):\n", + "\n", + " dataset = data_loader()\n", + " print(datetime.datetime.now(), dataset['name'])\n", + "\n", + " X = dataset['data']\n", + " y = dataset['target']\n", + "\n", + " validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=200, random_state=5)\n", + "\n", + " for fidx, (train, test) in enumerate(validator.split(X, y, y)):\n", + " X_train = X[train]\n", + " X_test = X[test]\n", + " y_train = y[train]\n", + " y_test = y[test]\n", + "\n", + " ss = StandardScaler()\n", + " ss.fit(X_train)\n", + " X_train = ss.transform(X_train)\n", + " X_test = ss.transform(X_test)\n", + "\n", + " for sparam in smote_params:\n", + " oversampling = SMOTE(**sparam)\n", + " X_samp, y_samp = oversampling.sample(X_train, y_train)\n", + "\n", + " for classifier, cparams in classifiers.items():\n", + " for param in cparams:\n", + " job = {\n", + " 'X_samp': X_samp,\n", + " 'y_samp': y_samp,\n", + " 'X_test': X_test,\n", + " 'y_test': y_test,\n", + " 'classifier': classifier,\n", + " 'param': param\n", + " }\n", + " description = {\n", + " 'name': dataset['name'],\n", + " 'fold': fidx,\n", + " 'sparam': sparam,\n", + " 'classifier': classifier.__name__,\n", + " 'cparam': param\n", + " }\n", + " yield job, description" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def do_job(job, description):\n", + " classifier_obj = job['classifier'](**job['param'])\n", + " classifier_obj.fit(job['X_samp'], job['y_samp'])\n", + " y_pred = classifier_obj.predict_proba(job['X_test'])\n", + " auc = roc_auc_score(job['y_test'], y_pred[:, 1])\n", + " return description | {'auc': auc}" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-16 11:14:50.048117 appendicitis\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-16 11:21:45.349117 bupa\n", + "2023-12-16 11:31:09.550091 cleveland-0_vs_4\n", + "2023-12-16 11:42:26.786704 CM1\n" + ] + } + ], + "source": [ + "for data_loader in datasets:\n", + " dataset = data_loader()\n", + "\n", + " if dataset['name'] in ['iris0', 'dermatology-6']:\n", + " continue\n", + "\n", + " results = Parallel(n_jobs=3)(delayed(do_job)(*x) for x in job_generator(data_loader))\n", + " results = pd.DataFrame.from_dict(results)\n", + " results.to_csv(f\"{dataset['name']}-ml.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results.to_csv(f\"{dataset['name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-14 21:42:48.609799 appendicitis\n" + ] + }, + { + "data": { + "text/plain": [ + "{'name': 'appendicitis',\n", + " 'fold': 0,\n", + " 'sparam': {'n_neighbors': 3,\n", + " 'proportion': 0.5,\n", + " 'random_state': 5,\n", + " 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " 'classifier': 'DecisionTreeClassifier',\n", + " 'cparam': {'max_depth': 2, 'random_state': 5},\n", + " 'auc': 0.5823529411764707}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "do_job(*next(job_generator(datasets[0])))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namefoldsparamclassifiercparamauc
0appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.582353
1appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.723529
2appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.764706
3appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.782353
4appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.976471
.....................
441appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.647059
442appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.661765
443appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.661765
444appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.647059
445appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.676471
\n", + "

446 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " name fold sparam \\\n", + "0 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "1 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "2 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "3 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "4 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + ".. ... ... ... \n", + "441 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "442 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "443 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "444 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "445 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "\n", + " classifier \\\n", + "0 DecisionTreeClassifier \n", + "1 DecisionTreeClassifier \n", + "2 DecisionTreeClassifier \n", + "3 DecisionTreeClassifier \n", + "4 RandomForestClassifier \n", + ".. ... \n", + "441 DecisionTreeClassifier \n", + "442 DecisionTreeClassifier \n", + "443 DecisionTreeClassifier \n", + "444 RandomForestClassifier \n", + "445 RandomForestClassifier \n", + "\n", + " cparam auc \n", + "0 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.582353 \n", + "1 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.723529 \n", + "2 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.764706 \n", + "3 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.782353 \n", + "4 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.976471 \n", + ".. ... ... \n", + "441 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.647059 \n", + "442 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.661765 \n", + "443 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.661765 \n", + "444 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.647059 \n", + "445 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.676471 \n", + "\n", + "[446 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = pd.DataFrame.from_dict(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/031-deterministic-test-joblib.ipynb b/notebooks/development/031-deterministic-test-joblib.ipynb new file mode 100644 index 0000000..35f4e05 --- /dev/null +++ b/notebooks/development/031-deterministic-test-joblib.ipynb @@ -0,0 +1,480 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import RepeatedStratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "from smote_variants.oversampling import SMOTE\n", + "from common_datasets.binary_classification import get_filtered_data_loaders" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger('smote_variants')\n", + "logger.setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "classifiers = {\n", + "DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in range(4, 10, 2)],\n", + "RandomForestClassifier: [{'max_depth': md, 'random_state': 5} for md in range(4, 10, 2)],\n", + "KNeighborsClassifier: [{'n_neighbors': nn} for nn in range(1, 10, 2)],\n", + "SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.01, 0.1]]\\\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = get_filtered_data_loaders(n_col_bounds=(2, 40),\n", + " n_bounds=(10, 500),\n", + " n_minority_bounds=(10, 500),\n", + " n_from_phenotypes=1,\n", + " n_smallest=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "smote_params = [\n", + " {'n_neighbors': 3, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 5, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 7, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 3, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 5, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 7, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 3, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 5, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 7, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " {'n_neighbors': 3, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 5, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 7, 'proportion': 0.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 3, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 5, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 7, 'proportion': 1.0, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 3, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 5, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + " {'n_neighbors': 7, 'proportion': 1.5, 'random_state': 5, 'ss_params': {'within_simplex_sampling': 'deterministic'}},\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "for data_loader in datasets:\n", + " results = []" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "def job_generator(data_loader):\n", + "\n", + " dataset = data_loader()\n", + " print(datetime.datetime.now(), dataset['name'])\n", + " if dataset['name'] in ['iris0', 'dermatology-6']:\n", + " continue\n", + " X = dataset['data']\n", + " y = dataset['target']\n", + "\n", + " validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=40, random_state=5)\n", + "\n", + " for fidx, (train, test) in enumerate(validator.split(X, y, y)):\n", + " X_train = X[train]\n", + " X_test = X[test]\n", + " y_train = y[train]\n", + " y_test = y[test]\n", + "\n", + " ss = StandardScaler()\n", + " ss.fit(X_train)\n", + " X_train = ss.transform(X_train)\n", + " X_test = ss.transform(X_test)\n", + "\n", + " for sparam in smote_params:\n", + " oversampling = SMOTE(**sparam)\n", + " X_samp, y_samp = oversampling.sample(X_train, y_train)\n", + "\n", + " for classifier, cparams in classifiers.items():\n", + " for param in cparams:\n", + " job = {\n", + " 'X_samp': X_samp,\n", + " 'y_samp': y_samp,\n", + " 'X_test': X_test,\n", + " 'y_test': y_test,\n", + " 'classifier': classifier,\n", + " 'param': param\n", + " }\n", + " description = {\n", + " 'name': dataset['name'],\n", + " 'fold': fidx,\n", + " 'sparam': sparam,\n", + " 'classifier': classifier.__name__,\n", + " 'cparam': param\n", + " }\n", + " yield job, description" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "def do_job(job, description):\n", + " classifier_obj = job['classifier'](**job['param'])\n", + " classifier_obj.fit(job['X_samp'], job['y_samp'])\n", + " y_pred = classifier_obj.predict_proba(job['X_test'])\n", + " auc = roc_auc_score(job['y_test'], y_pred[:, 1])\n", + " return description | {'auc': auc}" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-14 22:50:59.240523 bupa\n", + "2023-12-14 23:21:28.186055 cleveland-0_vs_4\n", + "2023-12-14 23:53:18.753317 CM1\n", + "2023-12-15 00:54:19.832280 dermatology-6\n", + "2023-12-15 01:33:56.591067 ecoli1\n", + "2023-12-15 02:09:46.543042 glass0\n", + "2023-12-15 02:39:14.222835 haberman\n", + "2023-12-15 03:12:29.604080 hepatitis\n", + "2023-12-15 03:41:29.622140 ionosphere\n", + "2023-12-15 04:13:36.586725 iris0\n", + "2023-12-15 04:29:00.888280 led7digit-0-2-4-6-7-8-9_vs_1\n", + "2023-12-15 05:02:23.042133 monk-2\n", + "2023-12-15 05:38:21.101381 new_thyroid1\n", + "2023-12-15 06:05:34.747387 page-blocks-1-3_vs_4\n", + "2023-12-15 06:53:06.746426 saheart\n", + "2023-12-15 07:30:53.274009 shuttle-6_vs_2-3\n", + "2023-12-15 07:58:28.826586 yeast-1_vs_7\n" + ] + } + ], + "source": [ + "for data_loader in datasets:\n", + " dataset = data_loader()\n", + " results = Parallel(n_jobs=3)(delayed(do_job)(*x) for x in job_generator(data_loader))\n", + " results = pd.DataFrame.from_dict(results)\n", + " results.to_csv(f\"{dataset['name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "results.to_csv(f\"{dataset['name']}.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-14 21:42:48.609799 appendicitis\n" + ] + }, + { + "data": { + "text/plain": [ + "{'name': 'appendicitis',\n", + " 'fold': 0,\n", + " 'sparam': {'n_neighbors': 3,\n", + " 'proportion': 0.5,\n", + " 'random_state': 5,\n", + " 'ss_params': {'within_simplex_sampling': 'random'}},\n", + " 'classifier': 'DecisionTreeClassifier',\n", + " 'cparam': {'max_depth': 2, 'random_state': 5},\n", + " 'auc': 0.5823529411764707}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "do_job(*next(job_generator(datasets[0])))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namefoldsparamclassifiercparamauc
0appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.582353
1appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.723529
2appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.764706
3appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.782353
4appendicitis0{'n_neighbors': 3, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.976471
.....................
441appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.647059
442appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.661765
443appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...DecisionTreeClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.661765
444appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.647059
445appendicitis1{'n_neighbors': 7, 'proportion': 0.5, 'random_...RandomForestClassifier[{'max_depth': 2, 'random_state': 5}, {'max_de...0.676471
\n", + "

446 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " name fold sparam \\\n", + "0 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "1 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "2 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "3 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + "4 appendicitis 0 {'n_neighbors': 3, 'proportion': 0.5, 'random_... \n", + ".. ... ... ... \n", + "441 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "442 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "443 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "444 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "445 appendicitis 1 {'n_neighbors': 7, 'proportion': 0.5, 'random_... \n", + "\n", + " classifier \\\n", + "0 DecisionTreeClassifier \n", + "1 DecisionTreeClassifier \n", + "2 DecisionTreeClassifier \n", + "3 DecisionTreeClassifier \n", + "4 RandomForestClassifier \n", + ".. ... \n", + "441 DecisionTreeClassifier \n", + "442 DecisionTreeClassifier \n", + "443 DecisionTreeClassifier \n", + "444 RandomForestClassifier \n", + "445 RandomForestClassifier \n", + "\n", + " cparam auc \n", + "0 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.582353 \n", + "1 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.723529 \n", + "2 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.764706 \n", + "3 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.782353 \n", + "4 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.976471 \n", + ".. ... ... \n", + "441 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.647059 \n", + "442 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.661765 \n", + "443 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.661765 \n", + "444 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.647059 \n", + "445 [{'max_depth': 2, 'random_state': 5}, {'max_de... 0.676471 \n", + "\n", + "[446 rows x 6 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = pd.DataFrame.from_dict(results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/032-smote-rf.ipynb b/notebooks/development/032-smote-rf.ipynb new file mode 100644 index 0000000..7d9aed8 --- /dev/null +++ b/notebooks/development/032-smote-rf.ipynb @@ -0,0 +1,507 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import wilcoxon\n", + "\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import roc_auc_score\n", + "from sklearn.model_selection import RepeatedStratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from conditioning_bias import OperatorRandomForestClassifier, OperatorDecisionTreeClassifier\n", + "\n", + "from smote_variants.oversampling import SMOTE\n", + "\n", + "import common_datasets.binary_classification as binclas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "dataset = binclas.load_haberman()\n", + "X = dataset['data']\n", + "y = dataset['target']" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "X = StandardScaler().fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAiMAAAGdCAYAAADAAnMpAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAABRe0lEQVR4nO3de3wU9b0//tfsJpsEyUYSCUm4BLS0cYlQUKNB8HisaBBBaY96tAhVSzXCqdRjf0hbjdRq5KttqR6Nl1oV+aKi/aF4C021RTmEYkUsGLFCuZUkhCaQDZfcdub7x7Ihm+xcNju3nXk9Hw9qk53MfGYS2Hfm83nNW5AkSQIRERGRRTxWD4CIiIjcjcUIERERWYrFCBEREVmKxQgRERFZisUIERERWYrFCBEREVmKxQgRERFZisUIERERWSrF6gEoEUUR9fX1yMzMhCAIVg+HiIiINJAkCW1tbSgoKIDHo37fw9bFSH19PUaOHGn1MIiIiGgA9u/fjxEjRqhuZ2gxUlVVhaqqKuzZswcAMG7cONx3332YPn26pq/PzMwEED4Zv99v1DCJiIhIR8FgECNHjux5H1djaDEyYsQIPPzwwxg7diwkScKLL76Iq6++Gp9++inGjRun+vWRqRm/389ihIiIKMloXWIhmN0oLzs7G4888ghuvfVW1W2DwSCysrLQ2trKYoSIiChJxPv+bdqakVAohNdeew3Hjh1DaWlpzG06OjrQ0dHR83EwGDRreERERGQRw6O927Ztw+DBg5GWlobbb78da9asQSAQiLltZWUlsrKyev5w8SoREZHzGT5N09nZiX379qG1tRWvv/46fvvb32L9+vUxC5JYd0ZGjhzJaRoiIqIkEu80jelrRi677DKcddZZePrpp1W35ZoRIiKi5BPv+7fpT2AVRTHq7gcRERG5m6ELWJcsWYLp06dj1KhRaGtrw6pVq/DnP/8Z69atM/KwRERElEQMLUaampowd+5cNDQ0ICsrC+PHj8e6deswbdo0Iw9LREREScTQYuS5554zcvfWEkPA3o3A0YPA4GFA4WTA47V6VEREREnH1r1pbKtuLVC9GAjWn/qcvwAoWwYEZlk3LiIioiRk+gLWpFe3Flg9N7oQAYBgQ/jzdWutGRcREVGSYjESDzEUviOCWGnok5+rvie8HREREWnCYiQeezf2vyMSRQKCB8LbERERkSYsRuJx9KC+2xERERGLkbgMHqbvdkRERMRiJC6Fk8OpGQgyGwiAf3h4OyIiItKExUg8PN5wfBdA/4Lk5MdlD/N5I0RERHFgMRKvwCzguhWAPz/68/6C8Of5nBEiIqK48KFnAxGYBRTN4BNYiYiIdMBiZKA8XmDMVKtHQURElPQ4TUNERESWYjFCRERElmIxQkRERJZiMUJERESWYjFCRERElmIxQkRERJZiMUJERESWYjFCRERElmIxQkRERJZiMUJERESWYjFCRERElmIxQkRERJZiMUJERESWYjFCRERElmIxQkRERJZiMUJERESWYjFCRERElmIxQkRERJZiMUJERESWYjFCRERElkqxegC2JIaAvRuBoweBwcOAwsmAx2v1qIiIiByJxUhfdWuB6sVAsP7U5/wFQNkyIDDLunERERE5FKdpeqtbC6yeG12IAECwIfz5urXWjIuIiMjBWIxEiKHwHRFIMV48+bnqe8LbERERkW5YjETs3dj/jkgUCQgeCG9HREREumExEnH0oL7bERERkSYsRiIGD9N3OyIiItKEaZqIwsnh1EywAbHXjQjh1wsna95lSJSweXcLmtrakZuZjpIx2fB6BN2GTERE5AQsRiI83nB8d/VcAAKiC5KTBUTZw5qfN1K9vQFL36pDQ2t7z+fys9JRMTOAsuJ83YZNRESU7DhN01tgFnDdCsDfp1jwF4Q/r/E5I9XbG1C+cktUIQIAja3tKF+5BdXbG/QaMRERUdLjnZG+ArOAohkDfgJrSJSw9K062YCwAGDpW3WYFsjjlA0RERFYjMTm8QJjpg7oSzfvbul3R6Q3CUBDazs2725B6Vk5AxwgERGRc3CaRmdNbfKFyEC2IyIicjpX3hkxMuWSm5mu63ZERERO57pixOiUS8mYbORnpaOxtV0uIIy8rHABRERERC6bpjEj5eL1CKiYGQDQEwjuEfm4YmaAi1eJiIhOck0xopZyAcIpl5AYa4v4lBXno2rOJORlRU/F5GWlo2rOJD5nhIiIqBfXTNOYnXIpK87HtEAen8BKRESkwjXFiBUpF69HYHyXiIhIhWuKkXhSLuwpQ0REZB7XFCNaUy6Hj3ViyrIP2FOGiIjIJK5ZwKol5TJrQj4WrGJPGSIiIjO5phgBlFMuT9w4EWs/azAlbUNERESnuGaaJkIu5cKeMkRERNZwXTECxE65sKcMERGRNVw1TaOEPWWIiIisYWgxUllZifPPPx+ZmZnIzc3FNddcgy+//NLIQ2oSEiXU7mrGm1sPoHZXM0Ki1JO2kQvwCginauLqKSOGgN0fAdteD/9XDOkxfCIiIkcxdJpm/fr1WLBgAc4//3x0d3fjJz/5CS6//HLU1dXhtNNOM/LQspQa5VXMDKB85RYIQNRC1gH1lKlbC1QvBoL1pz7nLwDKlgGBWYmeBhERkWMIkiSZFg85dOgQcnNzsX79elx88cWq2weDQWRlZaG1tRV+vz/h40ca5fU94Uh5UTVnEgAk3tW3bi2wei4gd6TrVrAgISIix4r3/dvUBaytra0AgOzs2FMdHR0d6Ojo6Pk4GAzqdmy1RnkCwkXIhsWXJtZTRgyF74goHan6HqBoBuDxDvR0iIiIHMO0BayiKGLRokW46KKLUFxcHHObyspKZGVl9fwZOXKkbsePJ7obSdtc/c3hKD0rJ75Hwe/dGD01E+tIwQPh7YiIiMi8YmTBggXYvn07XnnlFdltlixZgtbW1p4/+/fv1+34pkV3jx7UdzsiIiKHM2WaZuHChXj77bfx4YcfYsSIEbLbpaWlIS0tzZAxmBbdHTxM3+2IiIgcztA7I5IkYeHChVizZg0++OADjBkzxsjDKYpEd5XEHd2NpXByODWjFBL2Dw9vR0RERMYWIwsWLMDKlSuxatUqZGZmorGxEY2NjThx4oSRh43J6xFQPFx5RW/xcH9860Ni8XjD8V0Asi35yh7m4lUiIqKTDC1Gqqqq0NraiksuuQT5+fk9f1599VUjDxtTZ7eI979oUtzm/S+a0NktJn6wwKxwfNffJwrsL2Csl4iIqA9D14yY+AgTVS/V7oFaw11RCm9369QzEz9gYFY4vrt3Y3ix6uBh4akZ3hEhIiKK4ppGeXtbjuu6nSYeLzBmqn77IyIiciDXFCOF2YN6/r8HIko8O5CLI2jC6dgsFkE8OWPVe7tEhURp4A9PIyIicgnXFCM3lY7Gg+9+gWnCZlSkrkCB0NLzWr2UjaVdc1EjleCm0tG6HE+pB47mx8oTERG5gGkPPbOaL8WD+WdsR1XqcuShJeq1PLSgKnU55p+xHb6UxC9JpAdO3ye+Nra2o3zlFlRvb0j4GERERE7hmmKks7ML3ws+BQDoO1MS+Xhe8Gl0dnYldBy1HjhAuAdOSG01LRERkUu4phipqV6DfKGlXyES4RGAAqEZNdVrEjpOPD1wiIiIyEXFyPHmA7puJ8e0HjhEREQO4ZpiZFDOcF23k2NaDxwiIiKHcE0xMq1sNhqkbNkHn4kSUC/lYFrZbE37C4kSanc1482tB1C7q7lnDUikB45CZxp9euAQERE5hGuivT5fKjaO/TFmf7UEohS9iDVSoNSOvRvf8aWq7ksttlsxM4DylVsgAFELWSOHrJgZ4PNGiIiITnLNnREA+M6cO7BmbCUOIvquRCNysGZsJb4z5w7VfWiJ7ZYV56NqziTk9ekSnJeVjqo5k/icESIiol4EyU4NZPoIBoPIyspCa2sr/H7ljrvx6OzsQk31GhxvPoBBOcMxrWw2fBruiIRECVOWfSCblhEQLjg2LL4UXo/AJ7ASEZErxfv+7Zppmt58vlTMmHVd3F8XT2y39KwceD0CSs/KSWCkREREzueqaZpEMbZLRESkP1feGTnR3olVr72M9sMNSB+SjxuvvQEZ6T7Vr9M9tiuGgL0bgaMHgcHDgMLJ4U6/FnDalJLTzoeIyMlcV4w8WfUrXNP4OG6NNMprAeor78fzef+FO8rvUvzaSGy3sbU95uPeI2tGNMV269YC1YuBYP2pz/kLgLJlQGCW5vPRg9Oa+jntfIiInM5V0zRPVv0Ktzcujdko7/bGpXiy6leKX+/1CKiYGQCAfs8RiSu2W7cWWD03uhABgGBD+PN1a1XORD9Oa+rntPMhInID1xQjJ9o7cU3j4wDkG+Vd0/g/ONHeqbifhGO7Yih8R0SplV71PeHtDOa0pn5OOx8iIrdwzTTNqtdePjU1E4NHAArQjOdeexm33jRPcV9lxfmYFsgb2JqEvRv73xGJIgHBA+HtxkxV318C4k0H2Z3TzoeIyC1cU4y0H9Z2e17rdgOO7R49qO92CXBaOshp50NE5BauKUbSh+QD8jdGorcz0uBh+m6XgN6pHw9ElHh2IBdH0ITTsVksgnhyFi9ZmvqxSSERUXJyTTFy47U3oL7yfuShpd+aESDcn6YRObjx2huMHUjh5HBqJtiA2OtGhPDrhZONHQdOpYMmtH2I+1JXoKDXNFa9lI2fd83FZ5kXJ01TP13TTkREZBrXLGDNSPdhufcWAOjXuTfy8XLvzZqeN5IQjzcc3wUgm8kpe9iU5414PQKenPRPPJm6PGbC6MnU5Xhy0j+T5vkcuqWdiIjIVK4pRk50hrD6+CSUdy1CY4xGeeVdi7D6+CSc6DQ+xYLALOC6FYC/z5SQvyD8ebOeMyKGMPHzhyEIsRNGgiBg4ufLTEn26IVNComIko9rGuXd+8Y2vLRpHwDl9RE3XTgKD1xzTsJj18TqJ7Du/gh48Sr17ea9bXiyR298AisRkXXYKE/GnubjPf9fhAebxIDqdobzeK19k7dRskdvbFJIRJQ8XDNNMzpnkK7bOYKNkj1ERORerilGfnJl7DshA93OESLJnn7LPSMEwD/clGQPERG5l2uKkQyfF9MCuYrbTAvkIsNnTddcS9go2UNERO7lmmIEAM4847SEXnckuyR7iIjItVyTpunsFlF073v9njHSm0cAdjwwHb4UV9VoYVYne4iIyDGYppHxUu0exUIECD/87KXaPbh16pnmDMpOrE72EBGRa7nmFsDeFm2RXa3bERERkT5cc2ekMFtbZLcwe5A7pyzceM5ERGQLrilGbiodjQff/UJ1zcjcIduA5TOAYP2pF/wF4dSJUxdz1q0Fqhe765yJiMg2XDNN40vxIOc05SZ4/5GxBamvz4t+UwbCHXZXzw2/aTtN3drwubnpnImIyFZcU4wcbe/GoaOdsq97IGJR6HeQYjafP/m56nuSqmmcKjEUviPipnMmIiLbcU0x8qNXP1V8vcSzAwVCi+yzSAEJCB4Ir6twir0b+98RieLAcyYiIttxTTGy7/AJxddzcUTbjpKwaZwsBzfKIyKi5OGaBayjhmTgy8Y2AOEpmRLPDuTiCJpwOjaLRWjC6dp25KSmccnaKI/JHyIiR3FNMfLr6yei+P51uMKzGRWpK1AgtPS8Vi9l4+ddc1AvZSNfOAwhxhoKCQIEf4GzmsZFGuUFGxB73YgQft1O58zkDxGR47hmmmZwegquG7QFVanLkYeWqNfy0IInUx9DtTAFkiT1i/+KEiBJEj4dt9hZv4EnW6M8Jn+IiBzJNcVIZ2cXfhT6HYDw80R6i3xcJv0vFnTdiUZkR73eiBzc0bUId2wZgZDaM+WTTbI0ymPyh4jIsVwzTVNTvQYzhBbZ1z0CUIBmHEYmpnQ81m9NiQgP0NqOzbtbUHpWjokjN0FgFlA0w97rMOJJ/rDHDhFRUnFNMXK8+YCm7XJxBCI82CQGYr7e1Nau57Dsw+6N8pj8ISJyLNdM0wzKGa5pO7VUTW5mug6jobgla/KHiIhUuebOyLSy2Wj4ZDGGoaXfmhEgvEi1ETn4WCyK+fUCgLysdJSMCa8nCXV3Y8df1uHE4QPIGDIcRRdcAW+Kay6n+YxI/rgxIuzGcyYi23PNu6fPl4qNY3+M2V8tgSQBQq+CRDr53vZG3kKE9sa+WSQBqJgZgNcj4NN1L6KgdinGobnn9YM1OagvrcDEK+YZeBYuFkn+rJ6LcGnYuyAZQPLHjRFhN54zESUF10zTAMDfD7bJPu5dAFB/RPkprQDw6boXMWHjDzFUao76/FCpGRM2/hCfrnsx8YFSbHolf9wYEXbjORNR0hAkSbJtVjUYDCIrKwutra3w+/0J7evo8Q4ElxUhT2WaZkrHb8LJmT4EAAX+VPz/nbdjqNQsu48mIQdDf/Z3TtkYKZGpBjEELC9WSOacnO5ZtM050xduPGcislS879+uuTNStSL81NVYRQRwMtorNKPEsyPm6xKAkUc/wzDELkQi+8hDM3b8ZZ0+g6bYIsmfc/4j/N943kDd2BzQjedMREnFNcVId7BR03ZKDfO0NtM7cVhbjJgs4MaIsBvPmYiSimuKkRR/nqbtlKK9WpvpZQzRFiMmC7gxIuzGcyaipOKaYqR87lzUS9n9+s5EiBJQL+Vgs0K0d//gCTiIHMV9NCIHRRdcoc+gSX+RiLDSUmb/cHs1B0zUyXOWZM5ZcuI5E1FScU0xMnhQGh4WvwcAMRvhAcBDobmQ4JFrGYd7Z52D+tIKxX00lFZw8aqdJVtzQD14vPh03D3uagJJREnFNcXI0fZurO06D+Vdi2I2wivvWoS3u8/Hr66bgLys6Kes5mWlo2rOJJQV52PiFfPw2eTHcEiI7k/TJOTgs8mP8TkjySBZmgPqJCRKuGPLCNmffcc2gSSipOGaaO/8Fz9GzRdNAAAPxNiN8ABMOzsXT910HjbvbkFTWztyM8NPXfX2idDwCawO4JKnkdbuasYNz24CoPyz//L8C53XBJKILBHv+7dr3j33HT71QDOlRnj7Dp+A1yOo/qPsTUnBuItm6DpGMpndmwPqpHdzR1c2gSQi2zN0mubDDz/EzJkzUVBQAEEQ8MYbbxh5OEWjhmRo304MAbs/Ara9Hv6vGDJ4dMkhJEqo3dWMN7ceQO2uZt7WTxJamzsmYxNI/kwSOYOhd0aOHTuGCRMm4JZbbsG3v/1tIw+l6tfXT0Tx/eoPI3vsm/8Els9l/44+qrc3YOlbdWhoPfXbc35WOipmBlBWnK/wlWS1kjHZyM9KR2Nru1yLwagmkMmCP5NEzmHonZHp06fjF7/4BWbPnm3kYTQZnJ4Cn1cuzhl2VcrHyFhzM/t39FG9vQHlK7dE/aMPAI2t7ShfuQXV2xssGhlp4fUIqJgZnpqRS4pFmkAmC/5MEjmLrdI0HR0dCAaDUX/00nq8C50h+Vu4Hoj4ifdFSDF/dzz5uep7XDdlExIlLH2rTumqYOlbdbw9bnNlxfmomjNJMSmWLPgzSeQ8tlrAWllZiaVLlxqy71te2Kz4eolnBwqEFoUtevXvcMGix4jNu1v6/fbZmwSgobUdm3e3MIlhc2XF+ZgWyFNNitkdfyaJnMdWxciSJUtw11139XwcDAYxcuRIXfZdr/CPF6C974zb+ndoTVgwiZEctCTF7I4/k0TOY6tiJC0tDWlpaYbsuyArXfG3Ka19Z9zWv8PJSQxKTvyZJHIeW60ZMdLvvlfS8/89EHGhpw6zPBtxoacOHojYLBahXsqW7d9h554lRsYbI0kMhauCfLOTGE6MXjvxnAxiy59JIkqIoXdGjh49ip07d/Z8vHv3bmzduhXZ2dkYNWqUkYfuJ2tQKgpzMlB0eD0qUldErQ+pl7KxtGsunkz/Pn7R8QjC/5z1fkO3b88So+ONkSRG+cotclfF3CRG3VqgerGzotdOPCcD2e5nkogSZuidkb/+9a+YOHEiJk6cCAC46667MHHiRNx3331GHlbW9YO3oip1OfIQvVA1Dy2oSl2OgtMzkqpniVnxRtskMerWhiPWTopeO/GcTGCbn0ki0oVretOcaO/E4cpvIA8tiPULkyiFm4YNWbIDGT6v7XuWhEQJU5Z9ILsOJvIgqw2LL9XtN8SQKFmXxBBDwPLi/m/aPYRw0bhom+2+V7KceE4ms/RnkohksTeNjFWvvYxbFaK7HgEoQDOee+1l3HrTPNvHd62IN1qaxNi7UeFNG0jK6LUTz8lkTkgHEZGLFrC2H9Y2ZaF1O6u5Lt6oNVKdTNFrJ54TEdEAuObOSPqQfEDpmWa9t7NTa3mZsegdb7T97W6tkepkil478ZyIiAbANcXIjdfegPrK+1XXjMwZn9l/Ht+qZINCyqKkaKZuzc+SouFY4eTwuQcbALkz9hfYMnoty4nnREQ0AK6ZpslI9+Fd8aJwFLDPv/uSFH7zrhPHIG3NLfZINqikLLw73tKl+VnSNBzzeMMFIQDZM7Zh9FqRE8+JiGgAXFOMtB5tx5We/wUACH3+3Y98fInnE3s0yhND4TsiKmMpC+QmFG9MuoZjgVlJFb3WxInnREQUJ9dM0/zyt8/j5wppGkEAUmK+LUeYmGyII2VRVjx1wM3PkrLhWGAWUDTDPmt69ODEcyIiioNrihHdEglmJBviTFkMNN6YtIkcj9d5UVcnnhMRkUbuKUYGD4PWxryq+4HB6ROTUhZsOKYDOyWvbMT26SwishXXFCP//f2bUf/I/0E+WvqtGQHCi1hDEOAVYrfKkyBAOJlsMDx9YlLKItJwTI9Ejiuxp0xMSZHOIiJbcc0C1qzB6fijZwqA2GkaAFgvnAtJCsd8exMlQJIkfDpuMarrmoxPn5iUsog0HFM4ChuOyWFPmZiSJp1FRLbimmIk1N2NK6T/hYTYaRoJwNnSbizouhONiL4T0Igc3NG1CHdsGYH7135uTvrEpJQFG44NgMa0kynJKxtJunQWEdmGa6ZpdvxlHcahuf8tgJMivWkOIxNTOh5DiWcHcnEETTgdm8UiiPAACskTwID0iUkpi7Li/AEnclyJPWViSsp0FhHZgmuKkROHD2jaLhdHIMKDTWJgwMfSNX1iUsqCDcfiwJ4yMSVtOouILOeaaZqMIcM1bdeE0xM+FtMnDseeMjExnUVEA+WaOyNFF1yBgzU5GCo1K/am+Vgsivn1kWSJJEk4GOzQlD5RizeaFX9kzFJn7CkTE9NZJmCUnBzKNcWINyUF9aUVGLrxhxAlRBUkkfV0tWPvRmh77JtFEtCTPClfuSXc46bX633TJ2rxRrPij4xZGiCSdlo9F5D7SXBhT5lIOkvL3w8aAEbJycFcM00DAE8eDKC8a1HMtEx51yK8eOQc1X1oSZ+oxRsr360zJf7ImKWB2FMmJqazDMIoOTmcIEl9n7phH8FgEFlZWWhtbYXf709oXyc6Qzj7vmoAgAdi7LSMgsgt5g2LL4XXI8hOfYRECVOWfSCbKhAQjhLLpRv7HmegtIxDj+O4Hm+bx8SpQR2JIWB5sUKC6+S04KJt/Nkj24j3/ds10zQPvVvX8/8HkpbpG0uUS59oiTcqlX96xR8ZszQJe8rExHSWjhglJxdwzTTNnubjuuxHLZaoV2wx0f0wZknkEIySkwu4phgZnTNIl/1EYokhUULtrma8ufUAanc19zxVUq/YYqL7YcySyCEYJScXcM00zU+uDOClTfsS2kf+yViiUkJlWiAP+VnpilMkHg1rRhKNPzJmSeQQjJKTC7jmzkiGz4uhg33K26QqX45ZE/JRU9eomFCpqWtE8XDlxTrFw/3hhax9Pq9n/JFN8IgcwqTGmURWck0xcqIzhENHO5W36RIVX39zaz3uX6vcCKzize14/4smxf1sPxDE4//5TcPjj4xZEjkEo+TkcK6ZpumdphmoxmCH4usSgINtygUPEJ6iOdjWgQ2LLzU8/sgmeEQOYVLjTCIruKYY0StNo5e9LcdNiz8yZknkEIySk0O5phgZnTMIH32lbduBPBQtXoXZGtM9NnqoVlI9yMpG142IiJS5phjRmqa5wrMZFakrUCC09HyuXsrGz7vmYuvgqQAEHAzKJ1RyM304dLRTNi0DhNM0N5WOVh+0jXpRJFWPGxtdNyIiUueaBawZPi9SVM52unczqlKXIw8tUZ/PQwueTF2OqnMP4P5ZygmVpVcXY/7UMYrHmT91DHxqg7FRL4qk6nFjo+tGRETauKYYORTsQLdCWMYDEfemrIAgRHf0BcIfC4KAiZ8vQ1kgVzWhsuTKAG67eEzM/dx28RgsuVLlUfRiKPybvVJup/qe8HYGC4kSlr6lnCBa+lZdz0PfLGWj60ZERNq5Zppm9pMbFF8v8eyImprpS+jV/6GseKpqQmXJlQH89+VFeKl2D/a2HEdh9iDcVDpa/Y4IYKteFEnV48ZG142IiLRzTTHScqxL8fVcHNG2o5P9H7QkVHwpHtw69Uxt+41xDN22S0BS9bix0XUjIiLtXDNNk31aquLrTThd247M6P9go14USdXjxkbXjYiItHNNMbLmjimKr28Wi1AvZUPqtzQ1QgD8w3v6P8g1yosihoDdHwHbXg//V+tahUgvCo1jMVKkx42A8LqaCz11mOXZiAs9dfBAhIBTPXssZ6PrRkRE2rlmmmaoPw3+9BQE27tjvi7Cg0eEm/Er/Aoioqs0EeG3N+Fk/wdNMddE4qWRXhSr5548cu9Cx9xeFJEeN2+segr3yUSer5l5uz2eN2Kj60ZERNq55s4IAFxwpvJv77vO+HeUd96JRil6u0YpB+Wdd6JaPF9bzFWPeKmNelGUeT5Gle83yOuzwDdPaEGV7zco83xs2lhU2ei6ERGRNoIkSTbIZMYWDAaRlZWF1tZW+P3KnXDVnOgM4ez7qjVtG+sJrBI8yMtKhyRJsj1qBAAF/lRsSL8Tgmyq42S770XbtP2GbvWTRMUQsLxYIaUS5/mYxerrRkTkYvG+f7tmmiaeRnkiPNgk9n8WiFLEFQhPCow8+hmETh3jpVb3okjWuKzV142IiDRzzTSNWY3y4o0I2x7jskREZDDX3BmJp1FeIiyJCBs5JeHkuKxJUzl2ajBop7EQEUW4phjR2ihPSf7JNSMHgx2yjfL2D54AKb0ACDaEn9rahwQBgr9Av3ip0U3hInHZYANiP2b95JqRZIvLmtRMz04NBu00FiKi3lwzTZPh86IwJ0NxG59X+TfEoZk+3D9rHAD5Rnn3zjoHW8fdA0mS+nXuFSVAkiR8Om6xPr+Bm9EULhKXBSB71skWlzWpmZ6dGgzaaSxERH25phjp7Baxv+WE8jYh5WDR3/4ZxL99XblR3rRAHu7YMgLlXYvQiD4RYeTgjq5FuGPLiMQby5nZFM5JcVmTrpudGgzaaSxERLG4Zprmpdo9/e5UDMRD79bhgWvOkW2UV7urGQ2t7WhACWo6zusXERbhAfRoLGd2yiUwCyiakfxxWZOum50aDNppLEREsbimGNnbok+aJpLKkWuU17thnFxEuO92A2JFysUJcVmTrpudGgzaaSxERLG4phgpzB6ky35G5yjvR/fGcnKJjzhSLp2dXaipXoPjzQcwKGc4ppXNhs+n3DjQMFY/jMykdFC8PwdGplziGovV3x8iG2MazTiuKUZuKh2NB9/9IuGpmp9cGftOR0SksZzSbXHNjeWUEh9FMzSlXH6/4TNM3jkXMyKPct8LNHyyGBvH/hjfmXOH+hj0ZFKCRZFJ6aDIz0Fja7ts8irv5M+B0SkXzWNp3wAsv8fa7w+RTTGNZizXLGD1pXgwMjuxNM34EX5k+JR/S/R6BMyaoPyDOWtCvno1rZb42PGOasqldtAlmL3zpxiG6J4yw9CC2V8twe9XPqk8Bj2ZlGBRZVI6KNJgUOEoqJgZQE1do+EpFy1jeXLSP+F9bZ713x8iG2IazXiuKUZOdIawtzmxNM2htk7VxEFIlLD2M+UfzLWfNSjvR2vio2iGbMql69vPY3TDewCAvnVP5OPSrx5FZ2eX4lh1YWbyRwuT0kFlxfmqySuzUi6KY/nuBEz8/GHY5vtDZCNMo5nDNdM08fSmkaMlcaCWXNC0n3gSHzIplz+8/ftTUzMxeASgAM14p3oNZsy6TnG8CbNjfxuT0kFlxfmqySs5eqdcZMeyd4P9vj9ENsE0mjlcU4zo1ZtGLXGgS3Ih3sRHjJTL8eYDmnahdbuE2LW/jUnpIC3JKyV6plxijsWu3x8iG2AazRyumaZRS8FopZZM0CVNo0PiY1DOcE270LpdQpzc3yYBuievBorfHyJZtvl76nCuKUbUUjARAgAPRFzoqcMsz0Zc6KmDByIERKdgQqKE2l3NeHPrAdTuau6ZL4wkFzTtRwwBuz8Ctr0e/m9kTj6S+Oi33LDXKP3DFRMf08pmo0HKlk0PiRJQL+VgWtlsTdcl/EUy41UT7/kM9DhJpvfPSix9f+YMw+8PkSzb/D11ONdM02T4vJgWyEVNXZPsNuNH+JFf/0dUpK5AQa/1FvVSNpZ2zcXsmbfD6xFUI14VMwN4Y9VTuC/Gfn7eNRfXzLwd3h1vKcdcy5aFUwwQEL2wUFviw+dLxcaxP8bsr5ZAlKIXsUYKlNqxd+M7Wp83kkgsN5Jg0XI+doj/miSScilfuUXuqqBiZsD45xjw+0MkyzZ/Tx1OkCTJtkuAg8EgsrKy0NraCr/fn/D+qrc34PaVW2Rfv23odiwOPgQg9pv3U3kVOHPqDShfuaXfyurI5lVzJqHM8zGk1XMhQYq69SQCECBAmPxfwMbH0T+9cHIvkURHzH/4h4ffGDT+w//7lU9i8lePID+qKMoJFyJanzMSieWqjVfLfpTOR6/jJBnbPL+A3x8iWbb5e5ok4n3/dk0xEhIlTFn2geyqaA9EbEj7IfLQ0i8KC4QLkkbk4NspT6DxmBhzHwKAAn8qNqTfCUE2nSAAggBIsffR89CtRdvCv4nq8ETMhJ7AKoaA5cUKaYs+49Wyv1jno/dxkoxtnuzI7w+RLNv8PU0C8b5/mzJN88QTT+CRRx5BY2MjJkyYgMcffxwlJSVmHLqHWjyrxLMjakqlr0gUdvSJ7WhE7PUnEoCRRz+D0KkSk1Ss//rEKHVIfPh8qQOP7+ody5U7HzvGf00kl7gxHb8/RLJs8/fUgQxfwPrqq6/irrvuQkVFBbZs2YIJEybgiiuuQFOT/NoNI6jFrnJxRNN+1LbTuh9VdolRmhX7ZLzU3vj9ISIDGV6M/OpXv8L8+fNx8803IxAI4KmnnsKgQYPwu9/9zuhDR1GLXTXhdE37iWwXKykTz35UnYxRyqV2TGNW7JPxUnvj94eIDGToNE1nZyc++eQTLFmypOdzHo8Hl112GWpra/tt39HRgY6Ojp6Pg8GgbmNRa2C3WSxCvZStumZkT0Yxyk5skk3KbBt8EaT0AoU1IwAEDyRJjBkVkwAIJ2OUtlgwZVJjOdOOQwPD7w8RGcjQOyP/+te/EAqFMGxY9G9Lw4YNQ2NjY7/tKysrkZWV1fNn5MiRuo3F6xFwrKNb9nURHjzQPTf8//v8Wxv5+EX/bagqacKTqcuR16f5XB5a8GTqcvzPuQ0Q8r+pOJbm1PyYS0ckCYAUbnBXXddkj8ZMJjWWM+04NDD8/hCRgWz10LMlS5agtbW158/+/ft123fL0U4E2+WLEQB4L1SC8q5FaET0w2sakYPyrkV4u2Mivvn5wxCE2M3nBEHAxG2VwN+rZY8hAcjuCD+CXeizj8jHhQ3VWPrG3+zTmMmkxnKmHYcGht8fIjKIodM0Z5xxBrxeLw4ejF7UdvDgQeTl5fXbPi0tDWlpaYaM5T+f2ahpu3ViCWo6zkOJZwdycQRNOB2bxSKI8OBClaSMAAk4qnzHQuj5H5nXT6Z2Co//DQ0KqR3TGzOZ1FjOtOPQwPD7Q0QGMLQY8fl8OPfcc/H+++/jmmuuAQCIooj3338fCxcuNPLQ/TS1dWreVoQHm8T+hYBuSRkNtBzL9MZMJjWWM+04NDD8/hCRzgx/zshdd92FefPm4bzzzkNJSQmWL1+OY8eO4eabbzb60FFyM304cqIroX3olpTR6VhaGzM57kE9OjwIjoiI7MPwYuT666/HoUOHcN9996GxsRHf/OY3UV1d3W9Rq9Fe+cFkTPpFTUL72D94wsmkjEKiYHAecOyg7BNWpZP/I6H/uhPgVGpn76DxEI52yx0FeRobM9kikaMn9kYhInIcUxawLly4EHv37kVHRwf+8pe/4IILLjDjsFGyB/vgT1euvVJUrkZgxBAIaomCK/8P8PUy2X0IAOpPOxuAfGqnduzdqLhmvNJRNDVmqt7eYI9Ejl4ivVH6xqaDDeHP1621ZlxERJQQW6VpjBQSJZyWplyMdMu1iznp/S+a0Pn1q5QTBUUzgIativsZnhLEG197EAdjpHbWjK3Ed+bcgbLifFTNmYS8rOipmLys9HAzPpW7GiFRwtK36uyTyEmUGArfEVE6o+p72M6eiCgJmdKbxg7UetNoIUrAS7V7cOtUhUTB7o9UengACB7At2dPQOf1f8c7fRrYfadXA7uy4nxMC+QNaL2H2vlakshJBHujEBE5lmuKEb2SJ3tbjof/j1yiII4eHloa2A20MZPW8zU9kTNQ7I1CRORYrpmm0Zo8UVOYPUh5A5v08NB6vnpdF8PZ5LoSEZH+XHNnJNKbprG1HQLEfg81k+CJuRqhN48A3FQ6GoBCXDaOHh56RW5j7af3+SaayLEF9kYhInIs1xQjXo+AipkBrFn1FCpiNLlb2jUXYtFVqKlrkt3H/Klj4EvxqMdly5aF0x0QEP3GeaqHR3Vdky6RW6WxVMwMoHzlFrlRaErk2EakN8rqm2Q2kNgbhYgoSblmmgYAPl23AlUyTe6qUpdj8D/eU/z6f/zrmLa4rEoPj2rxfF0it2pjAZBQIoeIiMgMgiT17R1rH8FgEFlZWWhtbYXf709oX0ePdyC4rAh5aFF82NiUjt9AVKjRhmWm4WBbR8zXIlMfGxZfGr7jEONJoSF4MGXZB7JJl377kBESJc37AZD8T2AVQ8DyYoVEzclpmkXbeHeEiMhi8b5/u+bOSNWK8NSM3HuwRwAKhGaUeHYo7keuEAGi47LhnZ5M3JzzH+H/erxxRW6VxLOfSCLn6m8OR+lZOclXiADxRXuJiCipuKYY6Q42atpOj2Z4SnFZvSK3jovuqmG0l4jIsVyzgDXFnwccV99Oj2Z4kbhsrJSLXpFbW0Z3jWxgx2gvEZFjuaYYKZ87F/XLHlJdM7JZLFLcz7DMNDS1dajGZeVSLvfOCOgSubVddNfoBnaM9hIROZZrpmkGD0pDtTAlHHPt814mSeE377XdpYqLV/3pKVh69TgAyg3sauoaZVMuC1ZtwawJ+ar7UFvXEYkqJ7ofXZjRwC4S7QUge8aM9hIRJSXXFCMn2jtRJm0AAAh93ssiH89KqYUH8t3ygu3dmPK1oYpx2WmBPNUGdWs/a8ATN05MOHKbaDM9XZjZwE4lMq3LHRgiIjKda6ZpVr32Mm4V5BMqggAUIJym2SQGZLf70auf4tl558s2sKvd1awp5TLktDRsWHxpwpHbRJrp6cLsBnYBhSaFRESUlFxTjLQf1vYgMbU0zb7DJwDIN7CLJ+Uy0CZ4fem1nwGxIuUi16SQiIiSkmuKkfQh+YDyozsAqKdpRg3JACDfmyZZUy4D7pMTb8qluxP4+Fng8B5gyGjg/PlAii+u0yEiImdxTTFy47U3oL7y/oTTNL++fqJiP5hpgbykS7mo9tpREk/K5Q/3ArX/A0i91uX84WdA6ULg8gcGfo5ERJTUXLOANSPdhyfTvw8gXHj0Fvn4odBcxTTN+BF+bNh5SLEfTE1dY1KlXDT12lGiNeXyx/uBjY9FFyJA+OONj4ULFSIiciXXFCOd3SJWBb+J8q5FaET0XYlG5KC8axHeDZ2P4oLBMb9+/Ag/1twxRTUps/StOkwL5CVFykWqvgcPrN2mej6hvtVbX2opl6+Xhe+IKKl9IjyFQ0REruOaaZqXavdAlIB1UglqOs5DiWcHcnEETTgdm8WinjsisyeOxCs/GIUfvfop9h0+gVFDMvDr6ydicHqK5qTM5t0tSZFyEYIHMLLzMxxA7PRQ7/NRXSCrlHKpfaL/HZF+BwuF15KULlDejoiIHMc1xcjellPPghfhkY3v7m05jsHpKXh23vn9Xou3H0wypFy09OLR3N9GLuVyeI+2r9e6HREROYprpmkKswclvJ0tkzJyNKZctPTiSfh8hozWdzsiInIU19wZual0NB589wuIEpCCbsz1/gGjhCbsk3KxInQ5upECjxDeTk68/WDU4rKd3SJeqt2DvS3HUZg9CDeVjoYvJf76MOZxNKRcJH8B9rdPgBDsMjb5c/78cGpGaapG8Ia3kzsfs6a33M7IZodERDJcU4z4UjyYP3UMhvzvg5if8g68wqm335+m/F882z0Dhy/6qWIxEOkHU75yS7jHTa/X+iZl1OKyle/W4dmPdkclex589wvMnzoGS66UfwJsX4rHKVsWTs3IjFYoexj3iudoOp+EpPjC8d2Nj8lvU7oASPElFjOmxBjd7JCISIYgSX3bxtlHMBhEVlYWWltb4ff7E95f9a/n44ojqwFE96eJXIF1p1+Hsh89q74flTfMSFy274WNHPKyQC5q6ppk93/bxdoKErXjVM2ZhDLPxzHeYIaH47Z6PGckHrGeMyJ4w4XI5Q9oOx8WJMaIxMDlrj57/xBRHOJ9/3ZNMXLi+An4luXDA6lfozwgXJCE4EHX4npkDMpQ3Z/cVEJIlDBl2QeKqRs1HgHY8cB0xbs0aseJTLFsWHwpvBCNewJrvGSewBrX+XDKRl9iCFherJC+OvngukXbOGVDRJrE+/7tmmmaD1Y+iBmCfN0lCEAKRKxb+SBm/OAXqvuTS8ps3t2SUCEChB/C9lLtHtw69UzZbdSO0y+Wq9LLxbTkT4ovZnw37vMh/Zjd7JCIqA/XpGlSW/fqup0czTFYFb2jyIkcR6/xGM1p55NUrGh2SETUi2vujHRlFQLHNG6nhUzqoHcM1gNR9uFqanoixhqOo8QWMWMNnHY+ujMy5RJvs0MiIp25phi5dM5PEVr2P6prRi6d81P1nSmkDkqKZiI/Kx3j2z5EReoKFAinWgXXS9lY2jUX68QSxd33RIw1HMc2DfkSFG9s2lWMTrnE0+yQiMgArpmmyRiUgVdTrwZwKj0TEfl4deos9cWrKs3nvDvewt0jv0RV6nLkoSVqkzy0oCp1OW4bul3xEPOnjoHv72+rHsc2Dfl0EIlNA844H91oaHaYMK3NDrl4lYgM4ppipLNbxM+OXYenu6+C2Ocf3BA8eLr7Kvzs2HXo7FZ4MJeW5nPvLcaUnY8CCN/h6C3y8c1tT+MHU0bFfP22i8dgSdk3VI+D6ntQFsi1viGfjsqK8x11PgnT8POG6nvC2yVKrdkhY71EZCDXRHuf++gfeOCdLwDIP4EVAO6dcbZ8imX3R8CLVyU0jojPp63C2Aumx34Cq9bjzHsbGDPVcU8sddr5DFicPwe64BNYiUgHjPbK6J1O6UYKfhe6UnW7fnRME5w4fAC+FE/swifOdIOlDfkM4LTzGTArUi5yzQ6JiAzkmmKkdwM8pZRLYfYghLq7seMv63Di8AFkDBmOoguugDclRdc0QcaQ4fIvxptu4G+z/Tji7gpTLkTkEq4pRiKN8qYJm2VTLjVSCca3fYh//eLnGIfmntcP1uSgvrQCE6fNUW8+l5mPprYODJWa+60JAcIPNGsSclB0wRXyg40n3cB+Iv04pr8NUy5E5BKuWcDqS/HgPzK2KKZcfpb2Cs79y50YKjVHvT5UasaEjT/EpzUrVVMHwvRlqC+tAICoJni9P24orQjfaZGjNd2w4x3jkxZJJtLfpu/TXBtb21G+cguqtzdYNLIBYMqFiFzCNcXI0eMdWBT6HQD5lMs86W3F1/NrlyL09StVUwcTr5iHzyY/hkNC9LqHJiEHn01+DBOvmKc+YLV0Q9EM85IWSSIkSlj6Vp3SFcHSt+oQ6lsl2hlTLkTkAq5J0zzy1LP4cePdCY/p82mrMO6iGZrWaciuPYmH3HGsSFrYXO2uZtzw7CbV7V6ef2HyLZDluiAiSiJM08joDjbqsp8Thw+E/4+G1IE3JSVcuCRC7jjsJ9KPo/vbMOVCRA7mmmmaFH+eLvtRTMGYiUmLftjfhogoObmmGCmfOxf1Una/RaURogSEJI/i641QScGYKZK06LewMUIA/MP1TVqIofD00LbXw/+12XqUSH8bhSuC/Dj724RECbW7mvHm1gOo3dVs3HoTm19bIiIjuWaaZvCgNDzuvx2Lgw9BlKIXqUbeX15NnYX/7HpD9vWG0grkxbvmwyiRpMXqm2Q2kPRNWiRBhDjS36Z85RYIiF7aO5D+NqZFhJPg2hIRGck1d0YAAGfPRHnXIjQi+jfjRuSgvGsR/jRqoeLrB4dfbuZo7cOMZm060au/jWkR4SS6tkRERnFNmqazW0TRve+F73ooPIEViP26BA/ystKxYfGl9niSpxgClhf3fxPrcfKBWIu2JXZ3xKzj6CyRJ7CGRAlTln3QrxCJEAB9fhaS9NoSEalhmkbGS7V7eqZbRHiwSQzIbiv3ekNrOzbvbrFHLHTvRoU3MQCQgOCB8HaJpDDMOo7OEulvs3l3i2whAoSnf3T5WUjSa0tEpDfXTNMoNsCLg21ioWZFe10YITYtIuzCa0tEFItr7oz0bpSXgm7M9f4Bo4Qm7JNysSJ0Obo1XopILLSzsws11WtwvPkABuUMx7Sy2fD5UqO2VZsqSKiZm1nR3iSNECdybU2LCCfptSUih7DRwxRdU4xEGuX9f55VmJ/yDrzCqaUyP035v3i2ewYeDt0o+/WRdQIlY7Lx+5VPYvJXj2BGpNneXqDhk8XYOPbH+M6cOwCoJzESTmqY1UQtCZu1JXptIxHhxtZ2uTPu+VlISBJeWyJyCJul+FwzTeNL8eCRrNdxW8rb8PT5h98DCbelvI2HBq+GANmWZKiYGcAbq6ow+6slGNan2d4wtGD2V0vw+5VPqiYxKt+tSzypYVYTtSRr1qZHCiYSEQaUfxYSXsicZNeWiBzChik+1xQjne3tuObEGgCA0Off/cjH13WtxePXBWRjoZd+/QxM/uoRAPLN9Eq/ehRL3/ibYrO2Zz/arU8zN7OaqCVJszY9G+XpFRFWlSTXlogcQgzZssmqa6ZpPvn9IygV5N+EBAFIgYicL1Ziw+KfxVxv8M7a1aemZmLwCEABmlF4/G9oQOy0jgRAKUwdd1IjMCvcwdfoeT+zjpMAvVMwZcX5mBbIG/i6Hq2S4NoSkUPYNMXnmmJEOLxb83ZysdDjzQc07SMXR+IZWkxxJTXMaqJm82ZtRqRgEokIx8Xm15aIHMKmKT7XFCPSkDHAv7RtJ5eUGZQzHNirvo8mnJ7weNnMLX5ObpSXUPKKiCjCpik+1xQj537nxwhV/hIeSP3WjADhqZMQPPhndyZGP/j1mEmZmdfNR8MnizEMLf3WjACnmuntHTQewtFu2SSGIEC2IR8QfzM3CjMtBWMy03rkEJHz2TTF55oFrL70dGz2lQDov2Yj8vEOYQy+848K2aTMW6ufxZq8/wLQv5iIfPxG3kJUXDMegHwS41tn5yqOddaEfP7WOwCmpWBMZFqPHCJyB5um+FxTjHR2dmF051eQEDtNIwE4WwqvK1FKyqxsKVZspvfSkQmYFsiTTWI8ceNEbD8QVBzr2s8ajGtV73CmpWBMoGc6iIiohw1TfIZN0zz44IN45513sHXrVvh8Phw5csSoQ2lSU71GNQkDiIqvF6AZo459hnViCWo6zovdbO9kWkMuiaGW+ABs1gMnCZmWgjGYaT1yiMh9bJbiM6wY6ezsxLXXXovS0lI899xzRh1GM61JGDWRpIxSs71IWiNWEsO0vicuZ1oKxkD8WSEiQ9koxWdYMbJ06VIAwAsvvKD5azo6OtDR0dHzcTCoPJ0RD61JGDVakjJKaQ0nJz5IX/xZISK3sNWakcrKSmRlZfX8GTlypG77nlY2Gw1StmyKRZSAkORRfL1eysG+0yb0W/ITIUA9CRNJfCSyDyN0dot47qN/4L43t+O5j/6Bzm75KSsyhyU/K2II2P0RsO318H9NfgojEbmTrYqRJUuWoLW1tefP/v37ddu3z5eKjWN/DEA+TfNGxjUA5JMytWPvxn1XnwNg4GkNOyY+Kt+tQ9G97+GBd77Aitq9eOCdL1B073uofLfOtDFQf6b/rNStBZYXAy9eBfz+1vB/lxdb0qeCiNwlrmLknnvugSAIin927Ngx4MGkpaXB7/dH/dFTx56/KP6W2dEtYs3YShyMkZRZM7YS35lzhy5pDTslPirfrcPTH+6OWYA9/eFuFiQWM+1nxYaNs4jIPQRJUuqUEu3QoUNobm5W3ObMM8+Ez+fr+fiFF17AokWLBpSmCQaDyMrKQmtra8KFSWvwGAb/crjqQ8+O/fc/kZHui/kE1t70eCKm1U/V7OwWUXTve4oPYPMIwI4HpsOXYqubaK5j6M+KGArfAZHtV3HyIUiLtrFfDhFpEu/7d1wLWIcOHYqhQ4cOeHBW+v3TFbhFQ6O83z9dgVt+/ChmzLpOcX96pDWsTny8VLtHsRABwndIXqrdg1unnmnOoCgmQ39WbNo4i4jcw7A0zb59+9DS0oJ9+/YhFAph69atAICvfe1rGDx4sFGHlZV54p+6bucEe1uO67odJSmbNs4iIvcwrBi577778OKLL/Z8PHHiRADAn/70J1xyySVGHVZWW8YI4JjG7TSweopFD4XZg3TdjpKUTRtnEZF7xLVmxGxWrRnJ8p+muC+nNC7jmhEC0GvNiErjLK4ZISKN4n3/ds07TJb/NLwozAQgH+1dIVylqRBxSuMyX4oH86eOUdxm/tQxLESczqaNs4jIPVzzLtPZLeIXHf+Jp7uvgtjnH9wQPHi6+yr8ouM/FR/25cTGZUuuDOC2i8fEbA5428VjsOTK2I+8J4exYeMsInIP10zTPPfRP/DAO18AAFLQjbneP2CU0IR9Ui5WhC5H98nlM/fOOFs2OVK7qxk3PLtJ9Vgvz78w6fqidHaLeKl2D/a2HEdh9iDcVDqad0TcSAzZpnEWESUvQ6O9yax3IqQbKfhd6ErV7fpycuMyX4qH8V2yVeMsInIP1xQjvRMhHogo8exALo6gCadjs1gE8eSMlVJyRO/GZU5I5BARESXKNcXITaWj8eC7X2CasBkVqStQILT0vFYvZWNp11zUSCW4qXS07D4ijcv6Ll7tTWvjMqckcoiIiBLlmkUBvhQPbvRvRVXqcuShJeq1PLSgKnU5bvRvVVwn4fUImDVBuVCYNSFf9e6GkxI5REREiXJNMXKivRN3tP8WAGImRwDgjvbncKK9U3YfIVHC2s+UC4W1nzUopmmcmMghIiJKhGuKkVWvvYwCoaVfIRLhEYACoRmrXntZdh+bd7coTtEAQENrOzbvbpF9XW0fkoZ9EBEROYlripH2w9qmPpS20yNN4+REDhER0UC4phhJH6JtUajSdnqkafRO5BARESU716Rpbrz2BtRX3o88xJ6qESWgETm48dobZB8AFknTNLa2y3XwQF6vNE2s6K4e+0jm+K/TzsdW+MAyIkpSrilGMtJ9eD7vv3B741KIUvQi1sha0TfyFqL1g5149qPdUc3jHnz3C8yfGn40esXMAG5fuSXmMSQAFTMD8HoExehuxcwAyldugYDotmSRIWnZRzLGf512PrZStxaoXgwE6099zl8Q7jnDR7kTkc25ZpoGAD49bSrKuxahEdHPAWlEDsq7FuHVo9/E0x/u7tfFVpSApz/cjcp36zQdRy26CwBVcyYhLyt6KiYvKx1VcyahrDjfcfFfp52PrdStBVbPjS5EgHAX3tVzw68TEdmYa3rTnOgM4ez7qgEoP4FViUcAhg724WBb7PivAGCYPw2AgMZg7AWokWmYDYsvBYCYUxYhUcKUZR/Ipm567yMZpjicdj62IoaA5cX9C5EeQvgOyaJtnLIhItOwN42Mh3rd1RDhwSYx/m60ogTZQgQIT7k0BjsU99E7ult6Vk7MhnrxxH+ToSGf087HVvZuVChEAEACggfC27HnDBHZlGumafY0yzfAs4Kb4r9OOx9bOXpQ3+2IiCzgmjsjo3MG4aOvrB7FKXrGf+2eULFrnNnu102TwcP03Y6IyAKuKUZ+cmUAL23al9A+ImtGmto6ZWO5kTUjB4PaoruxxBP/TYaESrxxZjMkw3XTpHByeE1IsAGQu7r+gvB2REQ25ZppmgyfF/505drL51X+rXj+1DFYenUxgFMx3IjIx/fPGof7ZwUUt4lEd+V4PQIqZqrvo6auMSkSKlrPx6y7Eo5K9ni84fguANmrW/YwF68Ska25phg52t6NYHu34jadIQnfm1wYs5HebReHnzNSVpyvGsvVso0atX1MC+QlVcM9Pa6JHhzZqDAwC7huBeDvcw39BeHP8zkjRGRzron2zn/xY9R80aS63bSzc/HEd8+N+QTW3rSsN9BjTYLcPmp3NeOGZzepfv3L8y+0VULF6nUayXrdNOETWInIJhjtlbHv8AnN2/lSPLh16pmK23k9guqblZZt1MjtI1kTKnpck0Qk63XTxONlfJeIkpJripFRQzLwZWObpu2s/u1dC7smVOyO142IyH5cU4z8+vqJKL5/nep2M8bl93taqB1TFnZMqCQDXjciIvtxzQLWwekpKMzJUNxm6GAffvT6Z0mRsrBbQiVZ8LoREdmPa4qRkCihs1t5rW7zsdjPD7FrysIuCZVkw+tGRGQvrpmmUeuPAqBft97e7No/paw4H9MCebZf42I3vG5ERPbhmmJEr3SEHVMWVidUkhWvGxGRPbhmmkavdARTFkRERPpyzZ0RLSkKQZCfqmHKwj6SIXpNRETauaYYiaQobl+5JebrEoAfTB2DZz7c3fNxBFMW9uGYBndERNTDNdM0WkwcNYQpCxtzVIM7IiLq4Zo7I5EGaXIEhKO7GxZfypSFDak1uIt8/6YF8vi9IiJKMq4pRtSivX2ju0xZ2Eu83z8iIkoerpmmcXSDNBfg94+IyLlcc2ekdyTXAxElnh3IxRE04XRsFosgnqzLcjPTHZfW0O18VFrUG3nd2OCOjOK0v+9Eycg1xUgk2ju+7UNUpK5AgdDS81q9lI2lXXPxt8yLcfhYZ1I0ytNKt/RJ3VqgejEQrD/1OX8BULYMCMwyPOXCBndkBKaziOxBkCTJPs1W+ggGg8jKykJrayv8fn/C+/v9yicx+6slAIDev/hEni3yVF4FHtn7jX5vdpFNky1RE0mfJHw+dWuB1XMBmT19WvobfPtPZxh+3SLnA8SOXifb94espdvfDyLqJ973b9esGQl1d2PKzkcBRBcivT++pvF/IEDs97V2bZSnRC19Amg8HzEUviMisycJQH7tUlOuGxvckV50+/tBRLpwzTTNjr+swzg09+8bf5JHAArQjBLPDmwSA/1eT7a0hm7pk70bo6dm+hAgIc/E68YGd6QHprOI7MU1xciJwwc0bZeLI4qvJ0taQ7f0ydGDmvZj5nVjgztKFNNZRPbimmmajCHDNW3XhNMVX0+WtIZu6ZPBwzTtxynXjdyB6Swie3FNMVJ0wRU4iBzZRniiBNRLOfhYLIr5uoDwKvtkSWtE0idykxeaz6dwcjg1I7MnCQIakYPNMtcNWo9DZCLd/n4QkS5cU4x4U1Kw4Wt3A+jfmTfy8Rt5CyHC0+8fqGRslBdpDAj0LyPiOh+PNxzfldmTAOB/v3Z3z3NaYpk1IT9prhu5g25/P4hIF64pRkKihEf3fwPlXYvQiOjfdhqRg/KuRXjpyAQ8ceNEx6Q1dEufBGYB160A/H229xcgdO2LeHT/NxS/fO1nDUwlkO0wnUVkH655zkjtrmbc8OwmAMpPYH15/oUoGZPtqLSGkU9grd19pOe6Knl5/oVcdEq2xCewEukv3vdv16Rpeq+KF+GJGUONbOe0tIZu5+PxAmOmRn2KqQRKdk77+06UjFwzTcPV88bgdSUiokS5phjh6nlj8LoSEVGiXFOMcPW8MXhdiYgoUa4pRgCunjcKrysRESXCNWma3rh63hi8rkREBDBNowlXzxuD15WIiAbCVdM0REREZD+uvDOiBz2mJDitQURExGJkQKq3N2DpW3VoaD31IK/8rHRUzAxoXqypxz6IiIicwLBpmj179uDWW2/FmDFjkJGRgbPOOgsVFRXo7Ow06pCmqN7egPKVW6KKCABobG1H+cotqN7eYMo+iIiInMKwYmTHjh0QRRFPP/00Pv/8c/z617/GU089hZ/85CdGHdJwIVHC0rfqECt+FPnc0rfqFJvC6bEPIiIiJzFsmqasrAxlZWU9H5955pn48ssvUVVVhUcffdSowxpq8+6WfnczepMANLS2Y/PuFtlUiR77ICIichJT14y0trYiO1v+seAdHR3o6Ojo+TgYDJoxLM30aArHxnJERETRTCtGdu7ciccff1zxrkhlZSWWLl1q+FjUUixyr8fbFC7WfthYjoiIKFrcxcg999yDZcuWKW7zxRdfoKioqOfjAwcOoKysDNdeey3mz58v+3VLlizBXXfd1fNxMBjEyJEj4x2iIrUUi9Lr0wJ5yM9KR2Nre8w1HwLCj0AvGZMtu597Z5yteR9ERERuEPfj4A8dOoTm5mbFbc4880z4fD4AQH19PS655BJceOGFeOGFF+DxaF8zq/fj4CMplr4nHLkn8oOLx+CZD3fLvl41ZxIAoHzlFgCI2i7WNmrHUdoH471ERJSs4n3/NrQ3zYEDB/Dv//7vOPfcc7Fy5Up4vd64vl7PYiQkSpiy7APFxaMeAZALsUTuWGxYfClq6hoV754oHSeyn3tnBPDAO3zOCBEROY9tetMcOHAAl1xyCQoLC/Hoo4/i0KFDPa/l5eUZdVhZaikWQL4QAaJTLmXF+ZgWyIu5rqR2V7OmtMyQ03zYsPhSPoGViIhcz7BipKamBjt37sTOnTsxYsSIqNesaBSsVzolsh+5pnDxpGXYWI6IiMjAh55973vfgyRJMf9YQa90itp+mJYhIiKKj2u69paMyUZ+VjqUJkE8AmRfFxBe06GWclE7jtb9EBERuYVrihGvR0DFzACA/gWHcPLP/KljZF8HgIqZAdU1HWrH0bofIiIit3BNMQIAZcX5qJozCXlZ0VMkeVnpqJozCUuuDCi+rjXlonYcpmWIiIhOMTTamyi9nzMSMdAnsOp9HCIiIieyTbTXztRSLHqlXJiWISIiUueqaRoiIiKyHxYjREREZCkWI0RERGQpFiNERERkKRYjREREZCkWI0RERGQpFiNERERkKRYjREREZCkWI0RERGQpWz+BNfKk+mAwaPFIiIiISKvI+7bWjjO2Lkba2toAACNHjrR4JERERBSvtrY2ZGVlqW5n60Z5oiiivr4emZmZEATnNpgLBoMYOXIk9u/fr2tDQLvjefO83cKt587zdu95Z2Zmoq2tDQUFBfB41FeE2PrOiMfjwYgRI6wehmn8fr+rfnAjeN7u4tbzBtx77jxvd4mct5Y7IhFcwEpERESWYjFCRERElmIxYgNpaWmoqKhAWlqa1UMxFc+b5+0Wbj13njfPWytbL2AlIiIi5+OdESIiIrIUixEiIiKyFIsRIiIishSLESIiIrIUixEiIiKyFIsRiz3xxBMYPXo00tPTccEFF2Dz5s1WD8lwH374IWbOnImCggIIgoA33njD6iGZorKyEueffz4yMzORm5uLa665Bl9++aXVwzJcVVUVxo8f3/NUxtLSUrz33ntWD8t0Dz/8MARBwKJFi6weiuHuv/9+CIIQ9aeoqMjqYZniwIEDmDNnDnJycpCRkYFzzjkHf/3rX60elqFGjx7d7/stCAIWLFigeR8sRiz06quv4q677kJFRQW2bNmCCRMm4IorrkBTU5PVQzPUsWPHMGHCBDzxxBNWD8VU69evx4IFC7Bp0ybU1NSgq6sLl19+OY4dO2b10Aw1YsQIPPzww/jkk0/w17/+FZdeeimuvvpqfP7551YPzTQff/wxnn76aYwfP97qoZhm3LhxaGho6PmzYcMGq4dkuMOHD+Oiiy5Camoq3nvvPdTV1eGXv/wlhgwZYvXQDPXxxx9Hfa9ramoAANdee632nUhkmZKSEmnBggU9H4dCIamgoECqrKy0cFTmAiCtWbPG6mFYoqmpSQIgrV+/3uqhmG7IkCHSb3/7W6uHYYq2tjZp7NixUk1NjfRv//Zv0p133mn1kAxXUVEhTZgwwephmG7x4sXSlClTrB6G5e68807prLPOkkRR1Pw1vDNikc7OTnzyySe47LLLej7n8Xhw2WWXoba21sKRkVlaW1sBANnZ2RaPxDyhUAivvPIKjh07htLSUquHY4oFCxZgxowZUX/X3eCrr75CQUEBzjzzTHz3u9/Fvn37rB6S4dauXYvzzjsP1157LXJzczFx4kQ8++yzVg/LVJ2dnVi5ciVuueUWCIKg+etYjFjkX//6F0KhEIYNGxb1+WHDhqGxsdGiUZFZRFHEokWLcNFFF6G4uNjq4Rhu27ZtGDx4MNLS0nD77bdjzZo1CAQCVg/LcK+88gq2bNmCyspKq4diqgsuuAAvvPACqqurUVVVhd27d2Pq1Kloa2uzemiG+sc//oGqqiqMHTsW69atQ3l5OX74wx/ixRdftHpopnnjjTdw5MgRfO9734vr61KMGQ4RKVmwYAG2b9/uinl0APjGN76BrVu3orW1Fa+//jrmzZuH9evXO7og2b9/P+68807U1NQgPT3d6uGYavr06T3/f/z48bjgggtQWFiI1atX49Zbb7VwZMYSRRHnnXceHnroIQDAxIkTsX37djz11FOYN2+exaMzx3PPPYfp06ejoKAgrq/jnRGLnHHGGfB6vTh48GDU5w8ePIi8vDyLRkVmWLhwId5++2386U9/wogRI6wejil8Ph++9rWv4dxzz0VlZSUmTJiA3/zmN1YPy1CffPIJmpqaMGnSJKSkpCAlJQXr16/HY489hpSUFIRCIauHaJrTTz8dX//617Fz506rh2Ko/Pz8fgX22Wef7YopKgDYu3cv/vjHP+L73/9+3F/LYsQiPp8P5557Lt5///2ez4miiPfff981c+luI0kSFi5ciDVr1uCDDz7AmDFjrB6SZURRREdHh9XDMNS3vvUtbNu2DVu3bu35c9555+G73/0utm7dCq/Xa/UQTXP06FHs2rUL+fn5Vg/FUBdddFG/uP7f//53FBYWWjQicz3//PPIzc3FjBkz4v5aTtNY6K677sK8efNw3nnnoaSkBMuXL8exY8dw8803Wz00Qx09ejTqN6Tdu3dj69atyM7OxqhRoywcmbEWLFiAVatW4c0330RmZmbP2qCsrCxkZGRYPDrjLFmyBNOnT8eoUaPQ1taGVatW4c9//jPWrVtn9dAMlZmZ2W890GmnnYacnBzHrxO6++67MXPmTBQWFqK+vh4VFRXwer244YYbrB6aoX70ox9h8uTJeOihh3Dddddh8+bNeOaZZ/DMM89YPTTDiaKI559/HvPmzUNKygBKC+PCPaTF448/Lo0aNUry+XxSSUmJtGnTJquHZLg//elPEoB+f+bNm2f10AwV65wBSM8//7zVQzPULbfcIhUWFko+n08aOnSo9K1vfUv6wx/+YPWwLOGWaO/1118v5efnSz6fTxo+fLh0/fXXSzt37rR6WKZ46623pOLiYiktLU0qKiqSnnnmGauHZIp169ZJAKQvv/xyQF8vSJIk6VMXEREREcWPa0aIiIjIUixGiIiIyFIsRoiIiMhSLEaIiIjIUixGiIiIyFIsRoiIiMhSLEaIiIjIUixGiIiIyFIsRoiIiMhSLEaIiIjIUixGiIiIyFL/D5nm2vHYVXmpAAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(X[y==0, 0], X[y==0, 1])\n", + "plt.scatter(X[y==1, 0], X[y==1, 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.neighbors import NearestNeighbors" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "X_min = X[y == 1]\n", + "X_min = StandardScaler().fit_transform(X_min)\n", + "indices = NearestNeighbors(n_neighbors=5).fit(X_min).kneighbors(X_min, return_distance=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def joint_neighborhood_ratios(X, y):\n", + "\n", + " X_min = X[y == 1]\n", + " X_min = StandardScaler().fit_transform(X_min)\n", + " indices = NearestNeighbors(n_neighbors=5).fit(X_min).kneighbors(X_min, return_distance=False)\n", + " count = 0\n", + " count_none = 0\n", + " for idx, inds in enumerate(indices):\n", + " flag = False\n", + " for ind in inds[1:]:\n", + " if idx in indices[ind]:\n", + " count += 1\n", + " flag = True\n", + " if not flag:\n", + " count_none += 1\n", + " return count/len(X_min), count_none/len(X_min)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def comparison(X, y):\n", + " X = StandardScaler().fit_transform(X)\n", + " X_min = X[y == 1]\n", + " X_maj = X[y == 0]\n", + "\n", + " indices = NearestNeighbors(n_neighbors=5).fit(X_min).kneighbors(X_min, return_distance=False)\n", + " all_points = (X_min[:, None] + X_min[indices[:, 1:]]) / 2\n", + " all_points = all_points.reshape((all_points.shape[0]*all_points.shape[1], all_points.shape[2]))\n", + "\n", + " indices = NearestNeighbors(n_neighbors=5).fit(X).kneighbors(all_points, return_distance=False)\n", + " return np.bincount(y[indices.ravel()])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([776, 844])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comparison(X[:, :], y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger('smote_variants')\n", + "logger.setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "class RandomForestClassifierImproved:\n", + " def __init__(self, smote, min_samples_leaf=5, random_state=None, bootstrap=True, n_estimators=200):\n", + " self.smote = smote\n", + " self.min_samples_leaf = min_samples_leaf\n", + " self.random_state = random_state\n", + "\n", + " def fit(self, X, y, sample_weight=None):\n", + " self.estimators = []\n", + " self.masks = []\n", + " for idx in range(1000):\n", + " estimator = OperatorDecisionTreeClassifier(max_features='sqrt', min_weight_fraction_leaf=2/len(X), mode='avg')\n", + " bootstrap = np.random.randint(0, X.shape[0], X.shape[0])\n", + " X_train = X[bootstrap]\n", + " y_train = y[bootstrap]\n", + "\n", + " \"\"\"X0 = X_train[y_train == 0]\n", + " X1 = np.unique(X_train[y_train == 1], axis=0)\n", + " #X1 = X[y == 1]\n", + "\n", + " #bootstrap = np.random.choice(np.arange(X1.shape[0]), X1.shape[0], replace=True)\n", + " #X1 = X1[bootstrap]\n", + "\n", + " X_train = np.vstack([X0, X1])\n", + " y_train = np.hstack([np.repeat(0, X0.shape[0]), np.repeat(1, X1.shape[0])])\"\"\"\n", + "\n", + " smote = SMOTE(\n", + " random_state=5,\n", + " n_neighbors=1,\n", + " #proportion=2.0,\n", + " #proportion=1.0 + (np.random.random_sample()-1)*0.5,\n", + " ss_params={'gaussian_component': {'sigma': 0.0, 'fraction': 1.0}, 'n_dim': 2},\n", + " nn_params={'metric_learning_method': 'n_unique_inv', 'metric': 'precomputed', 'random_state': 5}\n", + " )\n", + " #X_train = X_train + np.random.random_sample(X_train.shape)\n", + "\n", + " X_train, y_train = smote.sample(X_train, y_train)\n", + " \"\"\"X0 = X_train[y_train == 0]\n", + " X1 = X_train[y_train == 1]\n", + " X_train = np.vstack([X0, X1, X1])\n", + " y_train = np.hstack([np.repeat(0, X0.shape[0]), np.repeat(1, X1.shape[0]*2)])\"\"\"\n", + " sample_weight = np.hstack([np.repeat(1.0, X.shape[0]), np.repeat(0.5, X_train.shape[0] - X.shape[0])])\n", + " estimator.fit(X_train, y_train, sample_weight=sample_weight)\n", + " self.estimators.append(estimator)\n", + " return self\n", + "\n", + " def predict_proba(self, X):\n", + " return np.mean([estimator.predict_proba(X) for estimator in self.estimators], axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "306 283\n", + "[[ 2. 38. 59.]\n", + " [ 4. 39. 63.]\n", + " [ 1. 49. 62.]]\n", + "[31, 49, 12]\n", + "[[ 0. 45. 66.]\n", + " [ 1. 63. 60.]\n", + " [ 8. 69. 67.]\n", + " [ 5. 61. 62.]\n", + " [ 4. 53. 58.]]\n", + "[225 81]\n", + "(2.8395061728395063, 0.09876543209876543)\n", + "[772 848]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.603921568627451 0.6261437908496732 0.5986928104575164 0.6366013071895424 0.6431372549019607\n", + "0.7048499702911467 0.6802473262032085 0.7060606060606062 0.6983437314319668 0.6980912061794414\n", + "0.7140153283535637 0.6938920012449424 0.7196992685963274 0.714513305322129 0.7132878151260504\n", + "0.7095601412608056 0.6882827324478178 0.7127358739194604 0.7096787370862323 0.7104298439806029\n", + "0.7152807667782561 0.6921907380838515 0.718140243902439 0.7166596923322175 0.7168121313566077\n", + "0.7145577021658337 0.6937107522747661 0.7161163975394079 0.7154820261437908 0.7155180699730873\n", + "0.7156635058394943 0.6917838583520841 0.7169358727097397 0.7166552823315118 0.7171568627450982\n", + "0.7167403111479335 0.6933558869557213 0.7181556660222777 0.7167011875172605 0.7172713568995674\n", + "0.7176147825385298 0.6952004155571694 0.7183490680222706 0.718486242233519 0.7190576333414024\n", + "0.7159538174244058 0.6963720103425987 0.7170500251382607 0.7179047259929612 0.7188595309918838\n", + "0.7169400763605773 0.6975158545266289 0.718616935222934 0.7197368633922214 0.7209190772018379\n", + "0.7176750279691456 0.6972615998351293 0.7192898781134074 0.7188335394217749 0.7200505652711533\n", + "0.7181578485388647 0.6974392994112245 0.7200254550856154 0.7189937476367958 0.7202823691460055\n", + "0.7182694706381281 0.6967959512049094 0.7200749638277703 0.718701666417203 0.7203041585590979\n", + "0.7178423260557178 0.6960790108005377 0.7195139758030873 0.7177704769851204 0.7193691188059147\n", + "0.7194279422585812 0.6972403475739082 0.7211566138596719 0.7191076375362507 0.7207600203436783\n", + "0.7183610400682012 0.6962301587301588 0.7205182072829132 0.7178206065034709 0.7194951893800999\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[13], line 91\u001b[0m\n\u001b[1;32m 88\u001b[0m res0\u001b[38;5;241m.\u001b[39mappend(roc_auc_score(y_test, pred))\n\u001b[1;32m 90\u001b[0m classifier \u001b[38;5;241m=\u001b[39m RandomForestClassifierImproved(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m(rfs_params \u001b[38;5;241m|\u001b[39m {\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmin_samples_leaf\u001b[39m\u001b[38;5;124m'\u001b[39m: msl}), smote\u001b[38;5;241m=\u001b[39msmote3)\n\u001b[0;32m---> 91\u001b[0m \u001b[43mclassifier\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfit\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 92\u001b[0m pred \u001b[38;5;241m=\u001b[39m classifier\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)[:, \u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 93\u001b[0m res1\u001b[38;5;241m.\u001b[39mappend(roc_auc_score(y_test, pred))\n", + "Cell \u001b[0;32mIn[12], line 36\u001b[0m, in \u001b[0;36mRandomForestClassifierImproved.fit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 26\u001b[0m smote \u001b[38;5;241m=\u001b[39m SMOTE(\n\u001b[1;32m 27\u001b[0m random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m,\n\u001b[1;32m 28\u001b[0m n_neighbors\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 32\u001b[0m nn_params\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric_learning_method\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mn_unique_inv\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mprecomputed\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrandom_state\u001b[39m\u001b[38;5;124m'\u001b[39m: \u001b[38;5;241m5\u001b[39m}\n\u001b[1;32m 33\u001b[0m )\n\u001b[1;32m 34\u001b[0m \u001b[38;5;66;03m#X_train = X_train + np.random.random_sample(X_train.shape)\u001b[39;00m\n\u001b[0;32m---> 36\u001b[0m X_train, y_train \u001b[38;5;241m=\u001b[39m \u001b[43msmote\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_train\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my_train\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 37\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"X0 = X_train[y_train == 0]\u001b[39;00m\n\u001b[1;32m 38\u001b[0m \u001b[38;5;124;03mX1 = X_train[y_train == 1]\u001b[39;00m\n\u001b[1;32m 39\u001b[0m \u001b[38;5;124;03mX_train = np.vstack([X0, X1, X1])\u001b[39;00m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;124;03my_train = np.hstack([np.repeat(0, X0.shape[0]), np.repeat(1, X1.shape[0]*2)])\"\"\"\u001b[39;00m\n\u001b[1;32m 41\u001b[0m sample_weight \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mhstack([np\u001b[38;5;241m.\u001b[39mrepeat(\u001b[38;5;241m1.0\u001b[39m, X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m]), np\u001b[38;5;241m.\u001b[39mrepeat(\u001b[38;5;241m0.5\u001b[39m, X_train\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m-\u001b[39m X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m0\u001b[39m])])\n", + "File \u001b[0;32m~/workspaces/smote_variants/smote_variants/base/_oversampling.py:234\u001b[0m, in \u001b[0;36mOverSamplingBase.sample\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 230\u001b[0m _logger\u001b[38;5;241m.\u001b[39minfo(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m%s\u001b[39;00m\u001b[38;5;124m: not enough dimensions \u001b[39m\u001b[38;5;132;01m%d\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 231\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__class__\u001b[39m\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m, X\u001b[38;5;241m.\u001b[39mshape[\u001b[38;5;241m1\u001b[39m])\n\u001b[1;32m 232\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m X\u001b[38;5;241m.\u001b[39mcopy(), y\u001b[38;5;241m.\u001b[39mcopy()\n\u001b[0;32m--> 234\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msampling_algorithm\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43my\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/workspaces/smote_variants/smote_variants/oversampling/_smote.py:122\u001b[0m, in \u001b[0;36mSMOTE.sampling_algorithm\u001b[0;34m(self, X, y)\u001b[0m\n\u001b[1;32m 118\u001b[0m nn_mt\u001b[38;5;241m=\u001b[39m NearestNeighborsWithMetricTensor(n_neighbors\u001b[38;5;241m=\u001b[39mn_neighbors,\n\u001b[1;32m 119\u001b[0m n_jobs\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_jobs,\n\u001b[1;32m 120\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mnn_params)\n\u001b[1;32m 121\u001b[0m nn_mt\u001b[38;5;241m.\u001b[39mfit(X_min)\n\u001b[0;32m--> 122\u001b[0m _, ind_min \u001b[38;5;241m=\u001b[39m \u001b[43mnn_mt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mkneighbors\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_min\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreturn_distance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m)\u001b[49m\n\u001b[1;32m 124\u001b[0m samples \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msample_simplex(X\u001b[38;5;241m=\u001b[39mX_min,\n\u001b[1;32m 125\u001b[0m indices\u001b[38;5;241m=\u001b[39mind_min,\n\u001b[1;32m 126\u001b[0m n_to_sample\u001b[38;5;241m=\u001b[39mn_to_sample)\n\u001b[1;32m 128\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (np\u001b[38;5;241m.\u001b[39mvstack([X, samples]),\n\u001b[1;32m 129\u001b[0m np\u001b[38;5;241m.\u001b[39mhstack([y, np\u001b[38;5;241m.\u001b[39mhstack([\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmin_label]\u001b[38;5;241m*\u001b[39mn_to_sample)]))\n", + "File \u001b[0;32m~/workspaces/smote_variants/smote_variants/base/_metrictensor.py:769\u001b[0m, in \u001b[0;36mNearestNeighborsWithMetricTensor.kneighbors\u001b[0;34m(self, X, n_neighbors, return_distance)\u001b[0m\n\u001b[1;32m 766\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mnearestn\u001b[38;5;241m.\u001b[39mkneighbors(X, n_neighbors, return_distance)\n\u001b[1;32m 768\u001b[0m n_neighbors \u001b[38;5;241m=\u001b[39m coalesce(n_neighbors, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mn_neighbors)\n\u001b[0;32m--> 769\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mn_neighbors_func\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mX_fitted\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 770\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetric_tensor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmetric_tensor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 771\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_neighbors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_neighbors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 772\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_distance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_distance\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/workspaces/smote_variants/smote_variants/base/_metrictensor.py:485\u001b[0m, in \u001b[0;36mn_neighbors_func\u001b[0;34m(X_base, X_neighbors, n_neighbors, metric_tensor, return_distance)\u001b[0m\n\u001b[1;32m 479\u001b[0m X_neighbors\u001b[38;5;241m=\u001b[39m X_neighbors \u001b[38;5;28;01mif\u001b[39;00m X_neighbors \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m X_base\n\u001b[1;32m 481\u001b[0m distm \u001b[38;5;241m=\u001b[39m pairwise_distances_mahalanobis(X_base,\n\u001b[1;32m 482\u001b[0m Y\u001b[38;5;241m=\u001b[39mX_neighbors,\n\u001b[1;32m 483\u001b[0m tensor\u001b[38;5;241m=\u001b[39mmetric_tensor)\n\u001b[0;32m--> 485\u001b[0m results_ind\u001b[38;5;241m=\u001b[39m \u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_along_axis\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43margsort\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 486\u001b[0m \u001b[43m \u001b[49m\u001b[43maxis\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 487\u001b[0m \u001b[43m \u001b[49m\u001b[43marr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdistm\u001b[49m\u001b[43m)\u001b[49m[:,:(n_neighbors)]\n\u001b[1;32m 489\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m return_distance:\n\u001b[1;32m 490\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m results_ind\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/numpy/lib/shape_base.py:402\u001b[0m, in \u001b[0;36mapply_along_axis\u001b[0;34m(func1d, axis, arr, *args, **kwargs)\u001b[0m\n\u001b[1;32m 400\u001b[0m buff[ind0] \u001b[38;5;241m=\u001b[39m res\n\u001b[1;32m 401\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m ind \u001b[38;5;129;01min\u001b[39;00m inds:\n\u001b[0;32m--> 402\u001b[0m buff[ind] \u001b[38;5;241m=\u001b[39m asanyarray(func1d(inarr_view[ind], \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs))\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(res, matrix):\n\u001b[1;32m 405\u001b[0m \u001b[38;5;66;03m# wrap the array, to preserve subclasses\u001b[39;00m\n\u001b[1;32m 406\u001b[0m buff \u001b[38;5;241m=\u001b[39m res\u001b[38;5;241m.\u001b[39m__array_wrap__(buff)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "dataset = binclas.load_yeast_1_vs_7()\n", + "dataset = binclas.load_saheart()\n", + "dataset = binclas.load_haberman()\n", + "X = dataset['data']\n", + "y = dataset['target']\n", + "\n", + "print(len(X), len(np.unique(X, axis=0)))\n", + "print(X[:3])\n", + "print([len(np.unique(X[:, idx])) for idx in range(X.shape[1])])\n", + "print(X[y == 1][:5])\n", + "print(np.bincount(y))\n", + "print(joint_neighborhood_ratios(X, y))\n", + "print(comparison(X, y))\n", + "\n", + "validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=5)\n", + "\n", + "res0 = []\n", + "res1 = []\n", + "res2 = []\n", + "res3 = []\n", + "res4 = []\n", + "\n", + "#X = X + np.random.random_sample(size=X.shape) * 0.001\n", + "\n", + "for idx, (train, test) in enumerate(validator.split(X, y, y)):\n", + "\n", + " X_train = X[train]\n", + " X_test = X[test]\n", + " y_train = y[train]\n", + " y_test = y[test]\n", + "\n", + "\n", + "\n", + " \"\"\"pca = PCA(n_components=X_train.shape[1]).fit(X_train)\n", + " X_train = pca.transform(X_train)\n", + " X_test = pca.transform(X_test)\"\"\"\n", + "\n", + " ss = StandardScaler()\n", + " X_train = ss.fit_transform(X_train)\n", + " X_test = ss.transform(X_test)\n", + "\n", + " X1_mean = np.mean(X_train[y_train == 1], axis=0)\n", + " dists = np.sum((X_train[y_train==1] - X1_mean)**2, axis=1)\n", + " mask = dists < np.sort(dists)[-5]\n", + " X0 = X_train[y_train == 0]\n", + " X1 = X_train[y_train == 1][mask]\n", + " X_train = np.vstack([X0, X1])\n", + " y_train = np.hstack([np.repeat(0, X0.shape[0]), np.repeat(1, X1.shape[0])])\n", + "\n", + " smote0 = SMOTE(random_state=5)\n", + " smote1 = SMOTE(\n", + " random_state=5,\n", + " n_neighbors=5,\n", + " #proportion=2.0,\n", + " ss_params={'gaussian_component': {'sigma': 0.0, 'fraction': 1.0}, 'n_dim': 2},\n", + " #nn_params={'metric_learning_method': 'MI_weighted', 'metric': 'precomputed', 'random_state': 5}\n", + " )\n", + " smote2 = SMOTE(\n", + " random_state=5,\n", + " #n_neighbors=1,\n", + " #proportion=2.0,\n", + " ss_params={'gaussian_component': {'sigma': 0.01, 'fraction': 1.0}, 'n_dim': 2},\n", + " #nn_params={'metric_learning_method': 'id', 'metric': 'precomputed', 'random_state': 5}\n", + " )\n", + " smote3 = SMOTE(\n", + " random_state=5,\n", + " #n_neighbors=1,\n", + " #proportion=2.0,\n", + " ss_params={'gaussian_component': {'sigma': 0.0, 'fraction': 1.0}, 'n_dim': 2},\n", + " #nn_params={'metric_learning_method': 'n_unique_inv', 'metric': 'precomputed', 'random_state': 5}\n", + " )\n", + "\n", + " X_samp0, y_samp0 = smote0.sample(X_train, y_train)\n", + " X_samp1, y_samp1 = smote1.sample(X_train, y_train)\n", + " X_samp2, y_samp2 = smote2.sample(X_train, y_train)\n", + " X_samp3, y_samp3 = smote3.sample(X_train, y_train)\n", + "\n", + " sample_weight=np.hstack([np.repeat(1.0, X_train.shape[0]), np.repeat(0.5, X_samp1.shape[0] - X_train.shape[0])])\n", + "\n", + " msl = 13\n", + "\n", + " rf_params = {'min_samples_leaf': 13, 'random_state': 5, 'bootstrap': True, 'n_estimators': 1000}\n", + " rfs_params = {'min_samples_leaf': msl, 'random_state': 6, 'bootstrap': True, 'n_estimators': 1000}\n", + "\n", + " classifier = OperatorRandomForestClassifier(**(rfs_params | {'bootstrap': True, 'min_weight_fraction_leaf': 2.0/len(X_samp0)}), mode='avg_half')\n", + " classifier.fit(X_samp0, y_samp0)\n", + " pred = classifier.predict_proba(X_test)[:, 1]\n", + " res0.append(roc_auc_score(y_test, pred))\n", + "\n", + " classifier = RandomForestClassifierImproved(**(rfs_params | {'min_samples_leaf': msl}), smote=smote3)\n", + " classifier.fit(X_train, y_train)\n", + " pred = classifier.predict_proba(X_test)[:, 1]\n", + " res1.append(roc_auc_score(y_test, pred))\n", + "\n", + " classifier = OperatorRandomForestClassifier(**rfs_params, mode='avg_half')\n", + " classifier.fit(X_samp2, y_samp2)\n", + " pred = classifier.predict_proba(X_test)[:, 1]\n", + " res2.append(roc_auc_score(y_test, pred))\n", + "\n", + " \"\"\"classifier = RandomForestClassifier(**rfs_params)\n", + " classifier.fit(X_samp3, y_samp3, sample_weight=sample_weight)\n", + " pred = classifier.predict_proba(X_test)[:, 1]\n", + " res3.append(roc_auc_score(y_test, pred))\"\"\"\n", + "\n", + "\n", + " classifier = OperatorRandomForestClassifier(**rfs_params#, class_weight={0: 1.0, 1: np.sum(1 - y_train)/np.sum(y_train)}\n", + " ,mode='avg_half')\n", + " classifier.fit(X_train, y_train)\n", + " pred = classifier.predict_proba(X_test)[:, 1]\n", + " res3.append(roc_auc_score(y_test, pred))\n", + "\n", + "\n", + " classifier = OperatorRandomForestClassifier(**(rf_params | {'bootstrap': True}), mode='avg_half')\n", + " classifier.fit(X_train, y_train)\n", + " pred = classifier.predict_proba(X_test)[:, 1]\n", + " res4.append(roc_auc_score(y_test, pred))\n", + "\n", + " if idx % 10 == 0:\n", + " print(np.mean(res0), np.mean(res1), np.mean(res2), np.mean(res3), np.mean(res4))\n", + " #break\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.9252715517241379,\n", + " 0.9300951794510908,\n", + " 0.9287995249824067,\n", + " 0.9308575826882477,\n", + " 0.9308951442646023)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(res0), np.mean(res1), np.mean(res2), np.mean(res3), np.mean(res4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0.9285714285714286,\n", + " 0.935960591133005,\n", + " 0.9334975369458128,\n", + " 0.93481703026038,\n", + " 0.9359605911330049)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.median(res0), np.median(res1), np.median(res2), np.median(res3), np.median(res4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/033-analysis-ml.ipynb b/notebooks/development/033-analysis-ml.ipynb new file mode 100644 index 0000000..684fc56 --- /dev/null +++ b/notebooks/development/033-analysis-ml.ipynb @@ -0,0 +1,1304 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import wilcoxon\n", + "import common_datasets.binary_classification as binclas" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('bupa-ml.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0namefoldsparamclassifiercparamauc
00bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}0.737500
11bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 6, 'random_state': 5}0.675431
22bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 8, 'random_state': 5}0.681034
33bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 4, 'random_state': 5}0.859483
44bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 6, 'random_state': 5}0.858621
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 name fold sparam \\\n", + "0 0 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "1 1 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "2 2 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "3 3 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "4 4 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam auc \n", + "0 DecisionTreeClassifier {'max_depth': 4, 'random_state': 5} 0.737500 \n", + "1 DecisionTreeClassifier {'max_depth': 6, 'random_state': 5} 0.675431 \n", + "2 DecisionTreeClassifier {'max_depth': 8, 'random_state': 5} 0.681034 \n", + "3 RandomForestClassifier {'max_depth': 4, 'random_state': 5} 0.859483 \n", + "4 RandomForestClassifier {'max_depth': 6, 'random_state': 5} 0.858621 " + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "data['sparam'] = data['sparam'].apply(eval)\n", + "data['cparam'] = data['cparam'].apply(eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_key(dict, key):\n", + " del dict[key]\n", + " return dict" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "data['metric'] = data['sparam'].apply(lambda x: x['nn_params']['metric_learning_method'])\n", + "data['sparam'] = data['sparam'].apply(lambda x: remove_key(x, 'nn_params'))\n", + "data['cparam'] = data['cparam'].apply(str)\n", + "data['sparam'] = data['sparam'].apply(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'name', 'fold', 'sparam', 'classifier', 'cparam', 'auc',\n", + " 'metric'],\n", + " dtype='object')" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0namefoldsparamclassifiercparamaucmetric
00bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}0.737500id
11bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 6, 'random_state': 5}0.675431id
22bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 8, 'random_state': 5}0.681034id
33bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 4, 'random_state': 5}0.859483id
44bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 6, 'random_state': 5}0.858621id
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 name fold sparam \\\n", + "0 0 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "1 1 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "2 2 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "3 3 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "4 4 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam auc \\\n", + "0 DecisionTreeClassifier {'max_depth': 4, 'random_state': 5} 0.737500 \n", + "1 DecisionTreeClassifier {'max_depth': 6, 'random_state': 5} 0.675431 \n", + "2 DecisionTreeClassifier {'max_depth': 8, 'random_state': 5} 0.681034 \n", + "3 RandomForestClassifier {'max_depth': 4, 'random_state': 5} 0.859483 \n", + "4 RandomForestClassifier {'max_depth': 6, 'random_state': 5} 0.858621 \n", + "\n", + " metric \n", + "0 id \n", + "1 id \n", + "2 id \n", + "3 id \n", + "4 id " + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = data.groupby(['name', 'sparam', 'classifier', 'cparam', 'metric']).apply(lambda pdf: pdf.sort_values('fold')['auc'].values.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = grouped.reset_index(drop=False)\n", + "grouped = grouped.rename(columns={0: 'auc'})\n", + "determ = grouped[grouped['metric'] == 'MI_weighted'].drop(columns=['metric'])\n", + "rand = grouped[grouped['metric'] == 'id'].drop(columns=['metric'])\n", + "merged = pd.merge(determ.rename(columns={'auc': 'auc_det'}), rand, on=['name', 'sparam', 'classifier', 'cparam'])" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "merged['auc_mean_det'] = merged['auc_det'].apply(np.mean)\n", + "merged['auc_std_det'] = merged['auc_det'].apply(np.std)\n", + "merged['auc_min_det'] = merged['auc_det'].apply(np.min)\n", + "merged['auc_max_det'] = merged['auc_det'].apply(np.max)\n", + "merged['auc_mean'] = merged['auc'].apply(np.mean)\n", + "merged['auc_std'] = merged['auc'].apply(np.std)\n", + "merged['auc_min'] = merged['auc'].apply(np.min)\n", + "merged['auc_max'] = merged['auc'].apply(np.max)\n", + "merged['p_l'] = merged.apply(lambda row: wilcoxon(row['auc_det'], row['auc'], zero_method='zsplit', alternative='less').pvalue, axis=1)\n", + "merged['p_g'] = merged.apply(lambda row: wilcoxon(row['auc_det'], row['auc'], zero_method='zsplit', alternative='greater').pvalue, axis=1)\n", + "merged['f_l'] = merged['p_l'] < 0.05\n", + "merged['f_g'] = merged['p_g'] < 0.05" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "def model_selection(pdf):\n", + " max_det = pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()].iloc[0]\n", + " max_ran = pdf[pdf['auc_mean'] == pdf['auc_mean'].max()].iloc[0]\n", + " return pd.Series({'auc_mean_det': max_det['auc_mean_det'],\n", + " 'auc_mean': max_ran['auc_mean'],\n", + " 'auc_std_det': max_det['auc_std_det'],\n", + " 'auc_std': max_ran['auc_std'],\n", + " 'p_l': wilcoxon(max_det['auc_det'], max_ran['auc'], zero_method='zsplit', alternative='less').pvalue,\n", + " 'p_g': wilcoxon(max_det['auc_det'], max_ran['auc'], zero_method='zsplit', alternative='greater').pvalue,\n", + " 'auc_median_det': np.median(max_det['auc_det']),\n", + " 'auc_median': np.median(max_ran['auc'])})" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
auc_mean_detauc_meanauc_std_detauc_stdp_lp_gauc_median_detauc_median
classifier
DecisionTreeClassifier0.6611720.6611090.0603840.0597220.4433130.5566870.6629310.664224
KNeighborsClassifier0.6591950.6587670.0552520.0552620.6658740.3341260.6612070.657543
RandomForestClassifier0.7625950.7645980.0526910.0524730.0024840.9975160.7655170.766810
SVC0.6501300.6508330.0658290.0659790.0017340.9982660.6517240.652155
\n", + "
" + ], + "text/plain": [ + " auc_mean_det auc_mean auc_std_det auc_std \\\n", + "classifier \n", + "DecisionTreeClassifier 0.661172 0.661109 0.060384 0.059722 \n", + "KNeighborsClassifier 0.659195 0.658767 0.055252 0.055262 \n", + "RandomForestClassifier 0.762595 0.764598 0.052691 0.052473 \n", + "SVC 0.650130 0.650833 0.065829 0.065979 \n", + "\n", + " p_l p_g auc_median_det auc_median \n", + "classifier \n", + "DecisionTreeClassifier 0.443313 0.556687 0.662931 0.664224 \n", + "KNeighborsClassifier 0.665874 0.334126 0.661207 0.657543 \n", + "RandomForestClassifier 0.002484 0.997516 0.765517 0.766810 \n", + "SVC 0.001734 0.998266 0.651724 0.652155 " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(model_selection)" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
classifier
DecisionTreeClassifier0bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}[0.6857758620689656, 0.5900862068965518, 0.638...[0.7375, 0.6668103448275862, 0.7, 0.6504310344...0.6611720.0603840.4581900.8556030.6611090.0597220.4750000.8336210.4433130.556687FalseFalse
KNeighborsClassifier4bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...KNeighborsClassifier{'n_neighbors': 3}[0.6133620689655173, 0.6340517241379311, 0.630...[0.6051724137931035, 0.6189655172413793, 0.637...0.6591950.0552520.4827590.8534480.6587670.0552620.4922410.8534480.6658740.334126FalseFalse
RandomForestClassifier10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
SVC19bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...SVC{'C': 0.1, 'probability': True, 'random_state'...[0.746551724137931, 0.6741379310344828, 0.6362...[0.7543103448275862, 0.6715517241379311, 0.634...0.6501300.0658290.4370690.8508620.6508330.0659790.4344830.8508620.0017340.998266TrueFalse
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "classifier \n", + "DecisionTreeClassifier 0 bupa \n", + "KNeighborsClassifier 4 bupa \n", + "RandomForestClassifier 10 bupa \n", + "SVC 19 bupa \n", + "\n", + " sparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "KNeighborsClassifier 4 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "RandomForestClassifier 10 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "SVC 19 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier \\\n", + "classifier \n", + "DecisionTreeClassifier 0 DecisionTreeClassifier \n", + "KNeighborsClassifier 4 KNeighborsClassifier \n", + "RandomForestClassifier 10 RandomForestClassifier \n", + "SVC 19 SVC \n", + "\n", + " cparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'max_depth': 4, 'random_state': 5} \n", + "KNeighborsClassifier 4 {'n_neighbors': 3} \n", + "RandomForestClassifier 10 {'max_depth': 8, 'random_state': 5} \n", + "SVC 19 {'C': 0.1, 'probability': True, 'random_state'... \n", + "\n", + " auc_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.6857758620689656, 0.5900862068965518, 0.638... \n", + "KNeighborsClassifier 4 [0.6133620689655173, 0.6340517241379311, 0.630... \n", + "RandomForestClassifier 10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "SVC 19 [0.746551724137931, 0.6741379310344828, 0.6362... \n", + "\n", + " auc \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.7375, 0.6668103448275862, 0.7, 0.6504310344... \n", + "KNeighborsClassifier 4 [0.6051724137931035, 0.6189655172413793, 0.637... \n", + "RandomForestClassifier 10 [0.8551724137931035, 0.7422413793103448, 0.755... \n", + "SVC 19 [0.7543103448275862, 0.6715517241379311, 0.634... \n", + "\n", + " auc_mean_det auc_std_det auc_min_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.661172 0.060384 0.458190 \n", + "KNeighborsClassifier 4 0.659195 0.055252 0.482759 \n", + "RandomForestClassifier 10 0.762595 0.052691 0.603448 \n", + "SVC 19 0.650130 0.065829 0.437069 \n", + "\n", + " auc_max_det auc_mean auc_std auc_min \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.855603 0.661109 0.059722 0.475000 \n", + "KNeighborsClassifier 4 0.853448 0.658767 0.055262 0.492241 \n", + "RandomForestClassifier 10 0.928448 0.764598 0.052473 0.606034 \n", + "SVC 19 0.850862 0.650833 0.065979 0.434483 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "classifier \n", + "DecisionTreeClassifier 0 0.833621 0.443313 0.556687 False False \n", + "KNeighborsClassifier 4 0.853448 0.665874 0.334126 False False \n", + "RandomForestClassifier 10 0.923276 0.002484 0.997516 True False \n", + "SVC 19 0.850862 0.001734 0.998266 True False " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()])" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
\n", + "
" + ], + "text/plain": [ + " name sparam \\\n", + "10 bupa {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam \\\n", + "10 RandomForestClassifier {'max_depth': 8, 'random_state': 5} \n", + "\n", + " auc_det \\\n", + "10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "\n", + " auc auc_mean_det \\\n", + "10 [0.8551724137931035, 0.7422413793103448, 0.755... 0.762595 \n", + "\n", + " auc_std_det auc_min_det auc_max_det auc_mean auc_std auc_min \\\n", + "10 0.052691 0.603448 0.928448 0.764598 0.052473 0.606034 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "10 0.923276 0.002484 0.997516 True False " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['auc_mean_det'] == merged['auc_mean_det'].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
classifier
DecisionTreeClassifier0bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}[0.6857758620689656, 0.5900862068965518, 0.638...[0.7375, 0.6668103448275862, 0.7, 0.6504310344...0.6611720.0603840.4581900.8556030.6611090.0597220.4750000.8336210.4433130.556687FalseFalse
KNeighborsClassifier4bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...KNeighborsClassifier{'n_neighbors': 3}[0.6133620689655173, 0.6340517241379311, 0.630...[0.6051724137931035, 0.6189655172413793, 0.637...0.6591950.0552520.4827590.8534480.6587670.0552620.4922410.8534480.6658740.334126FalseFalse
RandomForestClassifier10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
SVC19bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...SVC{'C': 0.1, 'probability': True, 'random_state'...[0.746551724137931, 0.6741379310344828, 0.6362...[0.7543103448275862, 0.6715517241379311, 0.634...0.6501300.0658290.4370690.8508620.6508330.0659790.4344830.8508620.0017340.998266TrueFalse
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "classifier \n", + "DecisionTreeClassifier 0 bupa \n", + "KNeighborsClassifier 4 bupa \n", + "RandomForestClassifier 10 bupa \n", + "SVC 19 bupa \n", + "\n", + " sparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "KNeighborsClassifier 4 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "RandomForestClassifier 10 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "SVC 19 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier \\\n", + "classifier \n", + "DecisionTreeClassifier 0 DecisionTreeClassifier \n", + "KNeighborsClassifier 4 KNeighborsClassifier \n", + "RandomForestClassifier 10 RandomForestClassifier \n", + "SVC 19 SVC \n", + "\n", + " cparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'max_depth': 4, 'random_state': 5} \n", + "KNeighborsClassifier 4 {'n_neighbors': 3} \n", + "RandomForestClassifier 10 {'max_depth': 8, 'random_state': 5} \n", + "SVC 19 {'C': 0.1, 'probability': True, 'random_state'... \n", + "\n", + " auc_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.6857758620689656, 0.5900862068965518, 0.638... \n", + "KNeighborsClassifier 4 [0.6133620689655173, 0.6340517241379311, 0.630... \n", + "RandomForestClassifier 10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "SVC 19 [0.746551724137931, 0.6741379310344828, 0.6362... \n", + "\n", + " auc \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.7375, 0.6668103448275862, 0.7, 0.6504310344... \n", + "KNeighborsClassifier 4 [0.6051724137931035, 0.6189655172413793, 0.637... \n", + "RandomForestClassifier 10 [0.8551724137931035, 0.7422413793103448, 0.755... \n", + "SVC 19 [0.7543103448275862, 0.6715517241379311, 0.634... \n", + "\n", + " auc_mean_det auc_std_det auc_min_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.661172 0.060384 0.458190 \n", + "KNeighborsClassifier 4 0.659195 0.055252 0.482759 \n", + "RandomForestClassifier 10 0.762595 0.052691 0.603448 \n", + "SVC 19 0.650130 0.065829 0.437069 \n", + "\n", + " auc_max_det auc_mean auc_std auc_min \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.855603 0.661109 0.059722 0.475000 \n", + "KNeighborsClassifier 4 0.853448 0.658767 0.055262 0.492241 \n", + "RandomForestClassifier 10 0.928448 0.764598 0.052473 0.606034 \n", + "SVC 19 0.850862 0.650833 0.065979 0.434483 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "classifier \n", + "DecisionTreeClassifier 0 0.833621 0.443313 0.556687 False False \n", + "KNeighborsClassifier 4 0.853448 0.665874 0.334126 False False \n", + "RandomForestClassifier 10 0.923276 0.002484 0.997516 True False \n", + "SVC 19 0.850862 0.001734 0.998266 True False " + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"{'C': 0.1, 'probability': True, 'random_state': 5}\"" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])\n", + "tmp.iloc[-1]['cparam']" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
\n", + "
" + ], + "text/plain": [ + " name sparam \\\n", + "10 bupa {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam \\\n", + "10 RandomForestClassifier {'max_depth': 8, 'random_state': 5} \n", + "\n", + " auc_det \\\n", + "10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "\n", + " auc auc_mean_det \\\n", + "10 [0.8551724137931035, 0.7422413793103448, 0.755... 0.762595 \n", + "\n", + " auc_std_det auc_min_det auc_max_det auc_mean auc_std auc_min \\\n", + "10 0.052691 0.603448 0.928448 0.764598 0.052473 0.606034 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "10 0.923276 0.002484 0.997516 True False " + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['auc_mean'] == merged['auc_mean'].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_mean 0.610534\n", + "auc_std 0.052280\n", + "auc_mean_det 0.610252\n", + "auc_std_det 0.052778\n", + "p_l 0.405746\n", + "p_g 0.594254\n", + "f_l 0.250000\n", + "f_g 0.100000\n", + "dtype: float64" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det', 'p_l', 'p_g', 'f_l', 'f_g']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_min 0.429138\n", + "auc_max 0.770366\n", + "auc_min_det 0.425280\n", + "auc_max_det 0.773621\n", + "dtype: float64" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_min', 'auc_max', 'auc_min_det', 'auc_max_det']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_mean 0.630787\n", + "auc_std 0.057362\n", + "auc_mean_det 0.630810\n", + "auc_std_det 0.057514\n", + "dtype: float64" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det']].median()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/034-test-regularization.ipynb b/notebooks/development/034-test-regularization.ipynb new file mode 100644 index 0000000..5b61d45 --- /dev/null +++ b/notebooks/development/034-test-regularization.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import tqdm\n", + "\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.model_selection import RepeatedStratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import roc_auc_score\n", + "\n", + "from smote_variants.oversampling import SMOTE, NoSMOTE, ADASYN, Borderline_SMOTE1, ProWSyn, SMOTE_IPF, Lee, SMOBD\n", + "from common_datasets.binary_classification import get_filtered_data_loaders\n", + "import common_datasets.binary_classification as binclas" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger('smote_variants')\n", + "logger.setLevel(logging.ERROR)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "classifiers = {\n", + "DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in [1, 2] + list(range(3, 18, 2))],\n", + "RandomForestClassifier: [{'max_depth': md, 'random_state': 5, 'n_jobs': 1} for md in [1, 2] + list(range(3, 18, 2))],\n", + "KNeighborsClassifier: [{'n_neighbors': nn, 'n_jobs': 1} for nn in range(1, 70, 4)],\n", + "SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n", + " + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = get_filtered_data_loaders(n_col_bounds=(2, 50),\n", + " n_bounds=(10, 700),\n", + " n_minority_bounds=(10, 500),\n", + " n_from_phenotypes=1,\n", + " n_smallest=40)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "datasets = [loader for loader in datasets if loader not in [binclas.load_iris0, binclas.load_dermatology_6, binclas.load_shuttle_6_vs_2_3, binclas.load_monk_2, binclas.load_new_thyroid1]]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(datasets)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "oversampler_classes = [SMOTE, Borderline_SMOTE1, ADASYN, ProWSyn, SMOTE_IPF, Lee, SMOBD, NoSMOTE]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "oversamplers = {}\n", + "for oversampler in oversampler_classes:\n", + " random_state = np.random.RandomState(5)\n", + " params = oversampler.parameter_combinations()\n", + " params = [comb for comb in params if comb.get('proportion', 1.0) == 1.0]\n", + " n_params = min(10, len(params))\n", + " oversamplers[oversampler] = random_state.choice(params, n_params, replace=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "def job_generator(data_loader):\n", + "\n", + " dataset = data_loader()\n", + "\n", + " X = dataset['data']\n", + " y = dataset['target']\n", + "\n", + " validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=5)\n", + "\n", + " for fidx, (train, test) in enumerate(validator.split(X, y, y)):\n", + " X_train = X[train]\n", + " X_test = X[test]\n", + " y_train = y[train]\n", + " y_test = y[test]\n", + "\n", + " ss = StandardScaler()\n", + " ss.fit(X_train)\n", + " X_train = ss.transform(X_train)\n", + " X_test = ss.transform(X_test)\n", + "\n", + " for oversampler, oparam in oversamplers.items():\n", + " for sparam in oparam:\n", + " oversampling = oversampler(**sparam)\n", + " X_samp, y_samp = oversampling.sample(X_train, y_train)\n", + "\n", + " job = {\n", + " 'X_samp': X_samp,\n", + " 'y_samp': y_samp,\n", + " 'X_test': X_test,\n", + " 'y_test': y_test,\n", + " }\n", + "\n", + " description = {\n", + " 'name': dataset['name'],\n", + " 'fold': fidx,\n", + " 'oversampler': oversampler.__name__,\n", + " 'sparam': sparam,\n", + " }\n", + "\n", + " yield job, description" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def do_job(job, description):\n", + " results = []\n", + " for classifier, cparams in classifiers.items():\n", + " for cparam in cparams:\n", + " tmp = description.copy()\n", + " classifier_obj = classifier(**cparam)\n", + " classifier_obj.fit(job['X_samp'], job['y_samp'])\n", + " y_pred = classifier_obj.predict_proba(job['X_test'])\n", + " auc = roc_auc_score(job['y_test'], y_pred[:, 1])\n", + "\n", + " tmp['classifier'] = classifier.__name__\n", + " tmp['cparam'] = cparam\n", + " tmp['auc'] = auc\n", + " results.append(tmp)\n", + "\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-16 17:39:43.203396 appendicitis\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "1it [00:00, 6.81it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2023-12-16 17:39:43.262076 appendicitis\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "5it [00:00, 10.71it/s]" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "620it [08:17, 1.25it/s]\n" + ] + } + ], + "source": [ + "for data_loader in datasets:\n", + " if data_loader != binclas.load_appendicitis:\n", + " continue\n", + " dataset = data_loader()\n", + "\n", + " print(datetime.datetime.now(), dataset['name'])\n", + "\n", + " results = Parallel(n_jobs=3)(delayed(do_job)(*x) for x in tqdm.tqdm(job_generator(data_loader)))\n", + "\n", + " results = [\n", + " x\n", + " for xs in results\n", + " for x in xs\n", + " ]\n", + "\n", + " results = pd.DataFrame.from_dict(results)\n", + " results.to_csv(f\"{dataset['name']}-reg.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/034-test-regularization.py b/notebooks/development/034-test-regularization.py new file mode 100644 index 0000000..dcc8219 --- /dev/null +++ b/notebooks/development/034-test-regularization.py @@ -0,0 +1,142 @@ +# %% +import datetime + +from joblib import Parallel, delayed + +import numpy as np +import pandas as pd + +import tqdm + +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import SVC +from sklearn.neighbors import KNeighborsClassifier +from sklearn.model_selection import RepeatedStratifiedKFold +from sklearn.preprocessing import StandardScaler +from sklearn.metrics import roc_auc_score + +from smote_variants.oversampling import SMOTE, NoSMOTE, ADASYN, Borderline_SMOTE1, ProWSyn, SMOTE_IPF, Lee, SMOBD +from common_datasets.binary_classification import get_filtered_data_loaders +import common_datasets.binary_classification as binclas + +# %% +import logging +logger = logging.getLogger('smote_variants') +logger.setLevel(logging.ERROR) + +# %% +classifiers = { +DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in [1, 2] + list(range(3, 18, 2))], +RandomForestClassifier: [{'max_depth': md, 'random_state': 5, 'n_jobs': 1} for md in [1, 2] + list(range(3, 18, 2))], +KNeighborsClassifier: [{'n_neighbors': nn, 'n_jobs': 1} for nn in range(1, 70, 4)], +SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\ + + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\ + + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\ +} + +# %% +datasets = get_filtered_data_loaders(n_col_bounds=(2, 50), + n_bounds=(10, 700), + n_minority_bounds=(10, 500), + n_from_phenotypes=1, + n_smallest=40) + +# %% +datasets = [loader for loader in datasets if loader not in [binclas.load_iris0, binclas.load_dermatology_6, binclas.load_shuttle_6_vs_2_3, binclas.load_monk_2, binclas.load_new_thyroid1]] + +# %% +len(datasets) + +# %% +oversampler_classes = [SMOTE, Borderline_SMOTE1, ADASYN, ProWSyn, SMOTE_IPF, Lee, SMOBD, NoSMOTE] + +# %% +oversamplers = {} +for oversampler in oversampler_classes: + random_state = np.random.RandomState(5) + params = oversampler.parameter_combinations() + params = [comb for comb in params if comb.get('proportion', 1.0) == 1.0] + n_params = min(10, len(params)) + oversamplers[oversampler] = random_state.choice(params, n_params, replace=False) + +# %% +def job_generator(data_loader): + + dataset = data_loader() + + X = dataset['data'] + y = dataset['target'] + + validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=5) + + for fidx, (train, test) in enumerate(validator.split(X, y, y)): + X_train = X[train] + X_test = X[test] + y_train = y[train] + y_test = y[test] + + ss = StandardScaler() + ss.fit(X_train) + X_train = ss.transform(X_train) + X_test = ss.transform(X_test) + + for oversampler, oparam in oversamplers.items(): + for sparam in oparam: + oversampling = oversampler(**sparam) + X_samp, y_samp = oversampling.sample(X_train, y_train) + + job = { + 'X_samp': X_samp, + 'y_samp': y_samp, + 'X_test': X_test, + 'y_test': y_test, + } + + description = { + 'name': dataset['name'], + 'fold': fidx, + 'oversampler': oversampler.__name__, + 'sparam': sparam, + } + + yield job, description + +# %% +def do_job(job, description): + results = [] + for classifier, cparams in classifiers.items(): + for cparam in cparams: + tmp = description.copy() + classifier_obj = classifier(**cparam) + classifier_obj.fit(job['X_samp'], job['y_samp']) + y_pred = classifier_obj.predict_proba(job['X_test']) + auc = roc_auc_score(job['y_test'], y_pred[:, 1]) + + tmp['classifier'] = classifier.__name__ + tmp['cparam'] = cparam + tmp['auc'] = auc + results.append(tmp) + + return results + +# %% +for data_loader in datasets: + if data_loader != binclas.load_appendicitis: + continue + dataset = data_loader() + + print(datetime.datetime.now(), dataset['name']) + + results = Parallel(n_jobs=3)(delayed(do_job)(*x) for x in tqdm.tqdm(job_generator(data_loader))) + + results = [ + x + for xs in results + for x in xs + ] + + results = pd.DataFrame.from_dict(results) + results.to_csv(f"{dataset['name']}-reg.csv") + + diff --git a/notebooks/development/035-analysis-regularization.ipynb b/notebooks/development/035-analysis-regularization.ipynb new file mode 100644 index 0000000..4482a42 --- /dev/null +++ b/notebooks/development/035-analysis-regularization.ipynb @@ -0,0 +1,1670 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from scipy.stats import wilcoxon\n", + "import common_datasets.binary_classification as binclas\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "data = pd.read_csv('appendicitis-reg.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = data\\\n", + " .groupby(['name', 'oversampler', 'sparam', 'classifier', 'cparam'])\\\n", + " .apply(lambda pdf: pd.Series({'auc': pdf.sort_values('fold')['auc'].values.tolist()}))\\\n", + " .reset_index(drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "grouped['auc_mean'] = grouped['auc'].apply(np.mean)" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_reg_param(row):\n", + " if row['classifier'] == 'SVC':\n", + " return eval(row['cparam'])['C']\n", + " if row['classifier'] == 'DecisionTreeClassifier':\n", + " return eval(row['cparam'])['max_depth']\n", + " if row['classifier'] == 'RandomForestClassifier':\n", + " return eval(row['cparam'])['max_depth']\n", + " if row['classifier'] == 'KNeighborsClassifier':\n", + " return eval(row['cparam'])['n_neighbors']\n", + "\n", + "def extract_classifier_subparam(row):\n", + " if row['classifier'] == 'SVC':\n", + " kernel = eval(row['cparam']).get('kernel', 'rbf')\n", + " degree = str(eval(row['cparam']).get('degree', ''))\n", + " return kernel + degree\n", + " return ''" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "grouped['reg_param'] = grouped.apply(extract_reg_param, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " classifier kernel reg_param auc_mean auc_mean_smote\n", + "0 SVC poly2 0.001 0.711359 0.250122\n", + "3 SVC poly2 0.002 0.713850 0.250703\n", + "6 SVC poly2 0.005 0.713638 0.261456\n", + "9 SVC poly2 0.010 0.710585 0.657913\n", + "12 SVC poly2 0.020 0.710629 0.746131\n", + "15 SVC poly2 0.050 0.710712 0.750371\n", + "18 SVC poly2 0.100 0.713918 0.751929\n", + "21 SVC poly2 0.200 0.723368 0.752891\n", + "24 SVC poly2 0.500 0.725665 0.755069\n", + "27 SVC poly2 1.000 0.720097 0.755713\n", + "30 SVC poly2 2.000 0.718424 0.746403\n", + " classifier kernel reg_param auc_mean auc_mean_smote\n", + "1 SVC poly3 0.001 0.821631 0.155966\n", + "4 SVC poly3 0.002 0.821949 0.402381\n", + "7 SVC poly3 0.005 0.820960 0.857509\n", + "10 SVC poly3 0.010 0.820712 0.848819\n", + "13 SVC poly3 0.020 0.819429 0.839507\n", + "16 SVC poly3 0.050 0.809549 0.829846\n", + "19 SVC poly3 0.100 0.800191 0.823304\n", + "22 SVC poly3 0.200 0.792416 0.818324\n", + "25 SVC poly3 0.500 0.784059 0.811646\n", + "28 SVC poly3 1.000 0.778493 0.804654\n", + "31 SVC poly3 2.000 0.762454 0.796621\n", + " classifier kernel reg_param auc_mean auc_mean_smote\n", + "2 SVC rbf 0.001 0.800432 0.134971\n", + "5 SVC rbf 0.002 0.794282 0.134971\n", + "8 SVC rbf 0.005 0.794037 0.135968\n", + "11 SVC rbf 0.010 0.795526 0.138422\n", + "14 SVC rbf 0.020 0.795288 0.754365\n", + "17 SVC rbf 0.050 0.795529 0.868762\n", + "20 SVC rbf 0.100 0.795488 0.872091\n", + "23 SVC rbf 0.200 0.795541 0.867499\n", + "26 SVC rbf 0.500 0.791965 0.855222\n", + "29 SVC rbf 1.000 0.797488 0.844791\n", + "32 SVC rbf 2.000 0.813850 0.828971\n", + " classifier kernel reg_param auc_mean auc_mean_smote\n", + "4 DecisionTreeClassifier 3.0 0.750621 0.733844\n", + "5 DecisionTreeClassifier 5.0 0.737157 0.721297\n", + "6 DecisionTreeClassifier 7.0 0.716087 0.713525\n", + "7 DecisionTreeClassifier 9.0 0.713234 0.711046\n", + "0 DecisionTreeClassifier 11.0 0.712991 0.710782\n", + "1 DecisionTreeClassifier 13.0 0.712991 0.710768\n", + "2 DecisionTreeClassifier 15.0 0.712991 0.710768\n", + "3 DecisionTreeClassifier 17.0 0.712991 0.710768\n", + " classifier kernel reg_param auc_mean auc_mean_smote\n", + "4 RandomForestClassifier 3.0 0.838971 0.839129\n", + "5 RandomForestClassifier 5.0 0.828831 0.825497\n", + "6 RandomForestClassifier 7.0 0.825096 0.820869\n", + "7 RandomForestClassifier 9.0 0.824181 0.820987\n", + "0 RandomForestClassifier 11.0 0.824119 0.820993\n", + "1 RandomForestClassifier 13.0 0.824119 0.821001\n", + "2 RandomForestClassifier 15.0 0.824119 0.820988\n", + "3 RandomForestClassifier 17.0 0.824119 0.820988\n", + " classifier kernel reg_param auc_mean auc_mean_smote\n", + "2 KNeighborsClassifier 1.0 0.733643 0.709665\n", + "11 KNeighborsClassifier 5.0 0.779324 0.790846\n", + "12 KNeighborsClassifier 9.0 0.804013 0.809659\n", + "0 KNeighborsClassifier 13.0 0.815140 0.817610\n", + "1 KNeighborsClassifier 17.0 0.834174 0.827690\n", + "3 KNeighborsClassifier 21.0 0.830619 0.835022\n", + "4 KNeighborsClassifier 25.0 0.829724 0.834254\n", + "5 KNeighborsClassifier 29.0 0.843328 0.837459\n", + "6 KNeighborsClassifier 33.0 0.849118 0.842318\n", + "7 KNeighborsClassifier 37.0 0.849035 0.847653\n", + "8 KNeighborsClassifier 41.0 0.848688 0.851491\n", + "9 KNeighborsClassifier 45.0 0.841378 0.853685\n", + "10 KNeighborsClassifier 49.0 0.829066 0.854934\n" + ] + } + ], + "source": [ + "for classifier in ['SVC', 'DecisionTreeClassifier', 'RandomForestClassifier', 'KNeighborsClassifier']:\n", + " filtered = grouped[grouped['classifier'] == classifier]\n", + " nosmote = filtered[filtered['oversampler'] == 'NoSMOTE']\n", + " smote = filtered[filtered['oversampler'] == 'SMOTE']\n", + " merged = pd.merge(nosmote,\n", + " smote[['reg_param', 'cparam', 'auc_mean']].rename(columns={'auc_mean': 'auc_mean_smote'}),\n", + " on=['reg_param', 'cparam'])\n", + " merged['kernel'] = merged.apply(extract_classifier_subparam, axis=1)\n", + "\n", + " kernels = merged['kernel'].drop_duplicates().values\n", + "\n", + " for kernel in kernels:\n", + " print(merged[merged['kernel'] == kernel].sort_values('reg_param')[['classifier', 'kernel', 'reg_param', 'auc_mean', 'auc_mean_smote']])" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
classifierreg_paramauc_meanauc_mean_smote
2KNeighborsClassifier1.00.7336430.709665
11KNeighborsClassifier5.00.7793240.790846
12KNeighborsClassifier9.00.8040130.809659
0KNeighborsClassifier13.00.8151400.817610
1KNeighborsClassifier17.00.8341740.827690
3KNeighborsClassifier21.00.8306190.835022
4KNeighborsClassifier25.00.8297240.834254
5KNeighborsClassifier29.00.8433280.837459
6KNeighborsClassifier33.00.8491180.842318
7KNeighborsClassifier37.00.8490350.847653
8KNeighborsClassifier41.00.8486880.851491
9KNeighborsClassifier45.00.8413780.853685
10KNeighborsClassifier49.00.8290660.854934
\n", + "
" + ], + "text/plain": [ + " classifier reg_param auc_mean auc_mean_smote\n", + "2 KNeighborsClassifier 1.0 0.733643 0.709665\n", + "11 KNeighborsClassifier 5.0 0.779324 0.790846\n", + "12 KNeighborsClassifier 9.0 0.804013 0.809659\n", + "0 KNeighborsClassifier 13.0 0.815140 0.817610\n", + "1 KNeighborsClassifier 17.0 0.834174 0.827690\n", + "3 KNeighborsClassifier 21.0 0.830619 0.835022\n", + "4 KNeighborsClassifier 25.0 0.829724 0.834254\n", + "5 KNeighborsClassifier 29.0 0.843328 0.837459\n", + "6 KNeighborsClassifier 33.0 0.849118 0.842318\n", + "7 KNeighborsClassifier 37.0 0.849035 0.847653\n", + "8 KNeighborsClassifier 41.0 0.848688 0.851491\n", + "9 KNeighborsClassifier 45.0 0.841378 0.853685\n", + "10 KNeighborsClassifier 49.0 0.829066 0.854934" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['kernel'] == ''].sort_values('reg_param')[['classifier', 'reg_param', 'auc_mean', 'auc_mean_smote']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "data['reg_param'] = data.apply(extract_reg_param, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0namefoldoversamplersparamclassifiercparamaucreg_param
00appendicitis0SMOTE{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 3, 'random_state': 5}0.5647063.0
11appendicitis0SMOTE{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 5, 'random_state': 5}0.6000005.0
22appendicitis0SMOTE{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 7, 'random_state': 5}0.6705887.0
33appendicitis0SMOTE{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 9, 'random_state': 5}0.6705889.0
44appendicitis0SMOTE{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 11, 'random_state': 5}0.67058811.0
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 name fold oversampler \\\n", + "0 0 appendicitis 0 SMOTE \n", + "1 1 appendicitis 0 SMOTE \n", + "2 2 appendicitis 0 SMOTE \n", + "3 3 appendicitis 0 SMOTE \n", + "4 4 appendicitis 0 SMOTE \n", + "\n", + " sparam classifier \\\n", + "0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... DecisionTreeClassifier \n", + "1 {'n_neighbors': 5, 'proportion': 1.0, 'random_... DecisionTreeClassifier \n", + "2 {'n_neighbors': 5, 'proportion': 1.0, 'random_... DecisionTreeClassifier \n", + "3 {'n_neighbors': 5, 'proportion': 1.0, 'random_... DecisionTreeClassifier \n", + "4 {'n_neighbors': 5, 'proportion': 1.0, 'random_... DecisionTreeClassifier \n", + "\n", + " cparam auc reg_param \n", + "0 {'max_depth': 3, 'random_state': 5} 0.564706 3.0 \n", + "1 {'max_depth': 5, 'random_state': 5} 0.600000 5.0 \n", + "2 {'max_depth': 7, 'random_state': 5} 0.670588 7.0 \n", + "3 {'max_depth': 9, 'random_state': 5} 0.670588 9.0 \n", + "4 {'max_depth': 11, 'random_state': 5} 0.670588 11.0 " + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "data['sparam'] = data['sparam'].apply(eval)\n", + "data['cparam'] = data['cparam'].apply(eval)" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def remove_key(dict, key):\n", + " del dict[key]\n", + " return dict" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'nn_params'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[60], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msparam\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mlambda\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mx\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnn_params\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mmetric_learning_method\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msparam\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msparam\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: remove_key(x, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnn_params\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 3\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcparam\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcparam\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28mstr\u001b[39m)\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/pandas/core/series.py:4760\u001b[0m, in \u001b[0;36mSeries.apply\u001b[0;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[1;32m 4625\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[1;32m 4626\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 4627\u001b[0m func: AggFuncType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4632\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 4633\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[1;32m 4634\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 4635\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[1;32m 4636\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 4751\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[1;32m 4752\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[1;32m 4753\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 4754\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4755\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4756\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4757\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4758\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 4759\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m-> 4760\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/pandas/core/apply.py:1207\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1204\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[1;32m 1206\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[0;32m-> 1207\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/pandas/core/apply.py:1287\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1281\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[1;32m 1282\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat reasons\u001b[39;00m\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[1;32m 1284\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[1;32m 1285\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[1;32m 1286\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1287\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1288\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[1;32m 1289\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1291\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[1;32m 1292\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[1;32m 1293\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[1;32m 1294\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/pandas/core/base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[0;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[1;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[0;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/anaconda3/envs/smote-variants/lib/python3.10/site-packages/pandas/core/algorithms.py:1814\u001b[0m, in \u001b[0;36mmap_array\u001b[0;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[1;32m 1812\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[1;32m 1813\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1814\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1815\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1816\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[1;32m 1817\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[1;32m 1818\u001b[0m )\n", + "File \u001b[0;32mlib.pyx:2920\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[0;34m()\u001b[0m\n", + "Cell \u001b[0;32mIn[60], line 1\u001b[0m, in \u001b[0;36m\u001b[0;34m(x)\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msparam\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: \u001b[43mx\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mnn_params\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmetric_learning_method\u001b[39m\u001b[38;5;124m'\u001b[39m])\n\u001b[1;32m 2\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msparam\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124msparam\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28;01mlambda\u001b[39;00m x: remove_key(x, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mnn_params\u001b[39m\u001b[38;5;124m'\u001b[39m))\n\u001b[1;32m 3\u001b[0m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcparam\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m data[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mcparam\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m.\u001b[39mapply(\u001b[38;5;28mstr\u001b[39m)\n", + "\u001b[0;31mKeyError\u001b[0m: 'nn_params'" + ] + } + ], + "source": [ + "data['metric'] = data['sparam'].apply(lambda x: x['nn_params']['metric_learning_method'])\n", + "data['sparam'] = data['sparam'].apply(lambda x: remove_key(x, 'nn_params'))\n", + "data['cparam'] = data['cparam'].apply(str)\n", + "data['sparam'] = data['sparam'].apply(str)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['Unnamed: 0', 'name', 'fold', 'sparam', 'classifier', 'cparam', 'auc',\n", + " 'metric'],\n", + " dtype='object')" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0namefoldsparamclassifiercparamaucmetric
00bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}0.737500id
11bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 6, 'random_state': 5}0.675431id
22bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 8, 'random_state': 5}0.681034id
33bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 4, 'random_state': 5}0.859483id
44bupa0{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 6, 'random_state': 5}0.858621id
\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 name fold sparam \\\n", + "0 0 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "1 1 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "2 2 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "3 3 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "4 4 bupa 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam auc \\\n", + "0 DecisionTreeClassifier {'max_depth': 4, 'random_state': 5} 0.737500 \n", + "1 DecisionTreeClassifier {'max_depth': 6, 'random_state': 5} 0.675431 \n", + "2 DecisionTreeClassifier {'max_depth': 8, 'random_state': 5} 0.681034 \n", + "3 RandomForestClassifier {'max_depth': 4, 'random_state': 5} 0.859483 \n", + "4 RandomForestClassifier {'max_depth': 6, 'random_state': 5} 0.858621 \n", + "\n", + " metric \n", + "0 id \n", + "1 id \n", + "2 id \n", + "3 id \n", + "4 id " + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = data.groupby(['name', 'sparam', 'classifier', 'cparam', 'metric']).apply(lambda pdf: pdf.sort_values('fold')['auc'].values.tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "grouped = grouped.reset_index(drop=False)\n", + "grouped = grouped.rename(columns={0: 'auc'})\n", + "determ = grouped[grouped['metric'] == 'MI_weighted'].drop(columns=['metric'])\n", + "rand = grouped[grouped['metric'] == 'id'].drop(columns=['metric'])\n", + "merged = pd.merge(determ.rename(columns={'auc': 'auc_det'}), rand, on=['name', 'sparam', 'classifier', 'cparam'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "merged['auc_mean_det'] = merged['auc_det'].apply(np.mean)\n", + "merged['auc_std_det'] = merged['auc_det'].apply(np.std)\n", + "merged['auc_min_det'] = merged['auc_det'].apply(np.min)\n", + "merged['auc_max_det'] = merged['auc_det'].apply(np.max)\n", + "merged['auc_mean'] = merged['auc'].apply(np.mean)\n", + "merged['auc_std'] = merged['auc'].apply(np.std)\n", + "merged['auc_min'] = merged['auc'].apply(np.min)\n", + "merged['auc_max'] = merged['auc'].apply(np.max)\n", + "merged['p_l'] = merged.apply(lambda row: wilcoxon(row['auc_det'], row['auc'], zero_method='zsplit', alternative='less').pvalue, axis=1)\n", + "merged['p_g'] = merged.apply(lambda row: wilcoxon(row['auc_det'], row['auc'], zero_method='zsplit', alternative='greater').pvalue, axis=1)\n", + "merged['f_l'] = merged['p_l'] < 0.05\n", + "merged['f_g'] = merged['p_g'] < 0.05" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def model_selection(pdf):\n", + " max_det = pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()].iloc[0]\n", + " max_ran = pdf[pdf['auc_mean'] == pdf['auc_mean'].max()].iloc[0]\n", + " return pd.Series({'auc_mean_det': max_det['auc_mean_det'],\n", + " 'auc_mean': max_ran['auc_mean'],\n", + " 'auc_std_det': max_det['auc_std_det'],\n", + " 'auc_std': max_ran['auc_std'],\n", + " 'p_l': wilcoxon(max_det['auc_det'], max_ran['auc'], zero_method='zsplit', alternative='less').pvalue,\n", + " 'p_g': wilcoxon(max_det['auc_det'], max_ran['auc'], zero_method='zsplit', alternative='greater').pvalue,\n", + " 'auc_median_det': np.median(max_det['auc_det']),\n", + " 'auc_median': np.median(max_ran['auc'])})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
auc_mean_detauc_meanauc_std_detauc_stdp_lp_gauc_median_detauc_median
classifier
DecisionTreeClassifier0.6611720.6611090.0603840.0597220.4433130.5566870.6629310.664224
KNeighborsClassifier0.6591950.6587670.0552520.0552620.6658740.3341260.6612070.657543
RandomForestClassifier0.7625950.7645980.0526910.0524730.0024840.9975160.7655170.766810
SVC0.6501300.6508330.0658290.0659790.0017340.9982660.6517240.652155
\n", + "
" + ], + "text/plain": [ + " auc_mean_det auc_mean auc_std_det auc_std \\\n", + "classifier \n", + "DecisionTreeClassifier 0.661172 0.661109 0.060384 0.059722 \n", + "KNeighborsClassifier 0.659195 0.658767 0.055252 0.055262 \n", + "RandomForestClassifier 0.762595 0.764598 0.052691 0.052473 \n", + "SVC 0.650130 0.650833 0.065829 0.065979 \n", + "\n", + " p_l p_g auc_median_det auc_median \n", + "classifier \n", + "DecisionTreeClassifier 0.443313 0.556687 0.662931 0.664224 \n", + "KNeighborsClassifier 0.665874 0.334126 0.661207 0.657543 \n", + "RandomForestClassifier 0.002484 0.997516 0.765517 0.766810 \n", + "SVC 0.001734 0.998266 0.651724 0.652155 " + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(model_selection)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
classifier
DecisionTreeClassifier0bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}[0.6857758620689656, 0.5900862068965518, 0.638...[0.7375, 0.6668103448275862, 0.7, 0.6504310344...0.6611720.0603840.4581900.8556030.6611090.0597220.4750000.8336210.4433130.556687FalseFalse
KNeighborsClassifier4bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...KNeighborsClassifier{'n_neighbors': 3}[0.6133620689655173, 0.6340517241379311, 0.630...[0.6051724137931035, 0.6189655172413793, 0.637...0.6591950.0552520.4827590.8534480.6587670.0552620.4922410.8534480.6658740.334126FalseFalse
RandomForestClassifier10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
SVC19bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...SVC{'C': 0.1, 'probability': True, 'random_state'...[0.746551724137931, 0.6741379310344828, 0.6362...[0.7543103448275862, 0.6715517241379311, 0.634...0.6501300.0658290.4370690.8508620.6508330.0659790.4344830.8508620.0017340.998266TrueFalse
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "classifier \n", + "DecisionTreeClassifier 0 bupa \n", + "KNeighborsClassifier 4 bupa \n", + "RandomForestClassifier 10 bupa \n", + "SVC 19 bupa \n", + "\n", + " sparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "KNeighborsClassifier 4 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "RandomForestClassifier 10 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "SVC 19 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier \\\n", + "classifier \n", + "DecisionTreeClassifier 0 DecisionTreeClassifier \n", + "KNeighborsClassifier 4 KNeighborsClassifier \n", + "RandomForestClassifier 10 RandomForestClassifier \n", + "SVC 19 SVC \n", + "\n", + " cparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'max_depth': 4, 'random_state': 5} \n", + "KNeighborsClassifier 4 {'n_neighbors': 3} \n", + "RandomForestClassifier 10 {'max_depth': 8, 'random_state': 5} \n", + "SVC 19 {'C': 0.1, 'probability': True, 'random_state'... \n", + "\n", + " auc_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.6857758620689656, 0.5900862068965518, 0.638... \n", + "KNeighborsClassifier 4 [0.6133620689655173, 0.6340517241379311, 0.630... \n", + "RandomForestClassifier 10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "SVC 19 [0.746551724137931, 0.6741379310344828, 0.6362... \n", + "\n", + " auc \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.7375, 0.6668103448275862, 0.7, 0.6504310344... \n", + "KNeighborsClassifier 4 [0.6051724137931035, 0.6189655172413793, 0.637... \n", + "RandomForestClassifier 10 [0.8551724137931035, 0.7422413793103448, 0.755... \n", + "SVC 19 [0.7543103448275862, 0.6715517241379311, 0.634... \n", + "\n", + " auc_mean_det auc_std_det auc_min_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.661172 0.060384 0.458190 \n", + "KNeighborsClassifier 4 0.659195 0.055252 0.482759 \n", + "RandomForestClassifier 10 0.762595 0.052691 0.603448 \n", + "SVC 19 0.650130 0.065829 0.437069 \n", + "\n", + " auc_max_det auc_mean auc_std auc_min \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.855603 0.661109 0.059722 0.475000 \n", + "KNeighborsClassifier 4 0.853448 0.658767 0.055262 0.492241 \n", + "RandomForestClassifier 10 0.928448 0.764598 0.052473 0.606034 \n", + "SVC 19 0.850862 0.650833 0.065979 0.434483 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "classifier \n", + "DecisionTreeClassifier 0 0.833621 0.443313 0.556687 False False \n", + "KNeighborsClassifier 4 0.853448 0.665874 0.334126 False False \n", + "RandomForestClassifier 10 0.923276 0.002484 0.997516 True False \n", + "SVC 19 0.850862 0.001734 0.998266 True False " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean_det'] == pdf['auc_mean_det'].max()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
\n", + "
" + ], + "text/plain": [ + " name sparam \\\n", + "10 bupa {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam \\\n", + "10 RandomForestClassifier {'max_depth': 8, 'random_state': 5} \n", + "\n", + " auc_det \\\n", + "10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "\n", + " auc auc_mean_det \\\n", + "10 [0.8551724137931035, 0.7422413793103448, 0.755... 0.762595 \n", + "\n", + " auc_std_det auc_min_det auc_max_det auc_mean auc_std auc_min \\\n", + "10 0.052691 0.603448 0.928448 0.764598 0.052473 0.606034 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "10 0.923276 0.002484 0.997516 True False " + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['auc_mean_det'] == merged['auc_mean_det'].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
classifier
DecisionTreeClassifier0bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...DecisionTreeClassifier{'max_depth': 4, 'random_state': 5}[0.6857758620689656, 0.5900862068965518, 0.638...[0.7375, 0.6668103448275862, 0.7, 0.6504310344...0.6611720.0603840.4581900.8556030.6611090.0597220.4750000.8336210.4433130.556687FalseFalse
KNeighborsClassifier4bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...KNeighborsClassifier{'n_neighbors': 3}[0.6133620689655173, 0.6340517241379311, 0.630...[0.6051724137931035, 0.6189655172413793, 0.637...0.6591950.0552520.4827590.8534480.6587670.0552620.4922410.8534480.6658740.334126FalseFalse
RandomForestClassifier10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
SVC19bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...SVC{'C': 0.1, 'probability': True, 'random_state'...[0.746551724137931, 0.6741379310344828, 0.6362...[0.7543103448275862, 0.6715517241379311, 0.634...0.6501300.0658290.4370690.8508620.6508330.0659790.4344830.8508620.0017340.998266TrueFalse
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "classifier \n", + "DecisionTreeClassifier 0 bupa \n", + "KNeighborsClassifier 4 bupa \n", + "RandomForestClassifier 10 bupa \n", + "SVC 19 bupa \n", + "\n", + " sparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "KNeighborsClassifier 4 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "RandomForestClassifier 10 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "SVC 19 {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier \\\n", + "classifier \n", + "DecisionTreeClassifier 0 DecisionTreeClassifier \n", + "KNeighborsClassifier 4 KNeighborsClassifier \n", + "RandomForestClassifier 10 RandomForestClassifier \n", + "SVC 19 SVC \n", + "\n", + " cparam \\\n", + "classifier \n", + "DecisionTreeClassifier 0 {'max_depth': 4, 'random_state': 5} \n", + "KNeighborsClassifier 4 {'n_neighbors': 3} \n", + "RandomForestClassifier 10 {'max_depth': 8, 'random_state': 5} \n", + "SVC 19 {'C': 0.1, 'probability': True, 'random_state'... \n", + "\n", + " auc_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.6857758620689656, 0.5900862068965518, 0.638... \n", + "KNeighborsClassifier 4 [0.6133620689655173, 0.6340517241379311, 0.630... \n", + "RandomForestClassifier 10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "SVC 19 [0.746551724137931, 0.6741379310344828, 0.6362... \n", + "\n", + " auc \\\n", + "classifier \n", + "DecisionTreeClassifier 0 [0.7375, 0.6668103448275862, 0.7, 0.6504310344... \n", + "KNeighborsClassifier 4 [0.6051724137931035, 0.6189655172413793, 0.637... \n", + "RandomForestClassifier 10 [0.8551724137931035, 0.7422413793103448, 0.755... \n", + "SVC 19 [0.7543103448275862, 0.6715517241379311, 0.634... \n", + "\n", + " auc_mean_det auc_std_det auc_min_det \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.661172 0.060384 0.458190 \n", + "KNeighborsClassifier 4 0.659195 0.055252 0.482759 \n", + "RandomForestClassifier 10 0.762595 0.052691 0.603448 \n", + "SVC 19 0.650130 0.065829 0.437069 \n", + "\n", + " auc_max_det auc_mean auc_std auc_min \\\n", + "classifier \n", + "DecisionTreeClassifier 0 0.855603 0.661109 0.059722 0.475000 \n", + "KNeighborsClassifier 4 0.853448 0.658767 0.055262 0.492241 \n", + "RandomForestClassifier 10 0.928448 0.764598 0.052473 0.606034 \n", + "SVC 19 0.850862 0.650833 0.065979 0.434483 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "classifier \n", + "DecisionTreeClassifier 0 0.833621 0.443313 0.556687 False False \n", + "KNeighborsClassifier 4 0.853448 0.665874 0.334126 False False \n", + "RandomForestClassifier 10 0.923276 0.002484 0.997516 True False \n", + "SVC 19 0.850862 0.001734 0.998266 True False " + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"{'C': 0.1, 'probability': True, 'random_state': 5}\"" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tmp = merged.groupby('classifier').apply(lambda pdf: pdf[pdf['auc_mean'] == pdf['auc_mean'].max()])\n", + "tmp.iloc[-1]['cparam']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesparamclassifiercparamauc_detaucauc_mean_detauc_std_detauc_min_detauc_max_detauc_meanauc_stdauc_minauc_maxp_lp_gf_lf_g
10bupa{'n_neighbors': 5, 'proportion': 1.0, 'random_...RandomForestClassifier{'max_depth': 8, 'random_state': 5}[0.8543103448275862, 0.7698275862068966, 0.755...[0.8551724137931035, 0.7422413793103448, 0.755...0.7625950.0526910.6034480.9284480.7645980.0524730.6060340.9232760.0024840.997516TrueFalse
\n", + "
" + ], + "text/plain": [ + " name sparam \\\n", + "10 bupa {'n_neighbors': 5, 'proportion': 1.0, 'random_... \n", + "\n", + " classifier cparam \\\n", + "10 RandomForestClassifier {'max_depth': 8, 'random_state': 5} \n", + "\n", + " auc_det \\\n", + "10 [0.8543103448275862, 0.7698275862068966, 0.755... \n", + "\n", + " auc auc_mean_det \\\n", + "10 [0.8551724137931035, 0.7422413793103448, 0.755... 0.762595 \n", + "\n", + " auc_std_det auc_min_det auc_max_det auc_mean auc_std auc_min \\\n", + "10 0.052691 0.603448 0.928448 0.764598 0.052473 0.606034 \n", + "\n", + " auc_max p_l p_g f_l f_g \n", + "10 0.923276 0.002484 0.997516 True False " + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[merged['auc_mean'] == merged['auc_mean'].max()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_mean 0.610534\n", + "auc_std 0.052280\n", + "auc_mean_det 0.610252\n", + "auc_std_det 0.052778\n", + "p_l 0.405746\n", + "p_g 0.594254\n", + "f_l 0.250000\n", + "f_g 0.100000\n", + "dtype: float64" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det', 'p_l', 'p_g', 'f_l', 'f_g']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_min 0.429138\n", + "auc_max 0.770366\n", + "auc_min_det 0.425280\n", + "auc_max_det 0.773621\n", + "dtype: float64" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_min', 'auc_max', 'auc_min_det', 'auc_max_det']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "auc_mean 0.630787\n", + "auc_std 0.057362\n", + "auc_mean_det 0.630810\n", + "auc_std_det 0.057514\n", + "dtype: float64" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged[['auc_mean', 'auc_std', 'auc_mean_det', 'auc_std_det']].median()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/development/tmp.ipynb b/notebooks/development/tmp.ipynb new file mode 100644 index 0000000..9d6f2ba --- /dev/null +++ b/notebooks/development/tmp.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "from mlscorecheck.aggregated import fold_partitioning_generator" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 50\n", + "3 1633\n", + "4 3581\n", + "5 14090\n", + "6 483176\n", + "7 2040776\n", + "8 139143\n", + "9 4342190\n", + "10 246448\n", + "11 7138953\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[27], line 4\u001b[0m\n\u001b[1;32m 2\u001b[0m count \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m fold_partitioning_generator(p\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m, n\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m100\u001b[39m, k\u001b[38;5;241m=\u001b[39mk):\n\u001b[0;32m----> 4\u001b[0m count \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 5\u001b[0m \u001b[38;5;28mprint\u001b[39m(k, count)\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + ] + } + ], + "source": [ + "for k in range(2, 100):\n", + " count = 0\n", + " for _ in fold_partitioning_generator(p=100, n=100, k=k):\n", + " count += 1\n", + " print(k, count)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "def factorial(n):\n", + " res = 1.0\n", + " for idx in range(1, n+1):\n", + " res*= idx\n", + " return res\n", + "\n", + "def partitions(n, k):\n", + " return factorial(n)/(factorial(k)**(n/k)*factorial(int(n/k)))" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2 3.5621427058240887e+162\n", + "3 3.626801540867747e+197\n", + "4 2.572363629860447e+206\n", + "5 1.2282742117030945e+205\n", + "6 1.5826523735488407e+202\n", + "7 1.1103610594874628e+198\n", + "8 2.7433529171014324e+193\n", + "9 1.9492961457735544e+188\n", + "10 7.063610910334588e+181\n", + "11 7.2886230189539895e+177\n", + "12 3.2146769992502e+172\n", + "13 1.5115610004254664e+168\n", + "14 4.0785530300903386e+163\n", + "15 2.2321627264014143e+158\n", + "16 2.4791858123335247e+154\n", + "17 1.394364796130934e+151\n", + "18 5.098077439183687e+146\n", + "19 2.4740386235690717e+143\n", + "20 2.3728024522013827e+139\n", + "21 3.451392153010874e+135\n", + "22 5.140441315924067e+132\n", + "23 1.550064934282854e+129\n", + "24 6.442561010058904e+125\n", + "25 3.601142943963805e+122\n", + "26 3.178184745685334e+120\n", + "27 3.016011757262844e+117\n", + "28 3.628951497682093e+114\n", + "29 5.449810242010832e+111\n", + "30 1.0070550335094754e+109\n", + "31 1.1302251970828393e+107\n", + "32 3.045675448556612e+104\n", + "33 9.748430425758596e+101\n", + "34 3.6700428573340127e+99\n", + "35 1.610642116094514e+97\n", + "36 8.172210536057132e+94\n", + "37 4.757683690007036e+92\n", + "38 1.2623592927902406e+91\n", + "39 9.479061849256144e+88\n", + "40 8.00924709229139e+86\n", + "41 7.572458670181864e+84\n", + "42 7.969747815064958e+82\n", + "43 9.292051051329801e+80\n", + "44 1.1947431387602131e+79\n", + "45 1.6869322862438707e+77\n", + "46 2.6053246405122032e+75\n", + "47 4.3848537730394063e+73\n", + "48 8.014265931916633e+71\n", + "49 1.585493661024941e+70\n" + ] + } + ], + "source": [ + "p = 150\n", + "n = 50\n", + "\n", + "for k in range(2, min(p, n)):\n", + "\n", + " print(k, partitions(n, k)*partitions(p,k))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "smote-variants", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/smote_variants/base/_metrictensor.py b/smote_variants/base/_metrictensor.py index 45a54a0..bddecf6 100755 --- a/smote_variants/base/_metrictensor.py +++ b/smote_variants/base/_metrictensor.py @@ -584,7 +584,8 @@ def tensor(self, X, y): if self.metric_learning_method == 'ITML': self.metric_tensor = self._train_metric_learning(X_mod, y_mod, - self.metric_learning_method) + self.metric_learning_method, + random_state=5) elif self.metric_learning_method == 'rf': dissim= ClassifierImpliedDissimilarityMatrix().fit(X, y)\ .dissimilarity_matrix(X) @@ -592,7 +593,8 @@ def tensor(self, X, y): elif self.metric_learning_method == 'LSML': self.metric_tensor = self._train_metric_learning(X_mod, y_mod, - self.metric_learning_method) + self.metric_learning_method, + random_state=5) elif self.metric_learning_method == 'cov': self.metric_tensor = np.linalg.inv(fix_pd_matrix(np.cov(X.T))) elif self.metric_learning_method == 'cov_min': @@ -606,7 +608,8 @@ def tensor(self, X, y): elif self.metric_learning_method == 'ITML_mi': self.metric_tensor = self._train_metric_learning(X_mod, y_mod, - self.metric_learning_method) + self.metric_learning_method, + random_state=5) mutuali= estimate_mutual_information(X, y) self.metric_tensor= np.matmul(self.metric_tensor, np.diag(mutuali)) elif self.metric_learning_method == 'NCA': @@ -616,9 +619,16 @@ def tensor(self, X, y): matrices = [self._train_metric_learning(X_mod, y_mod, self.metric_learning_method, + random_state=5, prior='random') for i in range(2)] self.metric_tensor= psd_mean(matrices) + elif self.metric_learning_method == 'n_unique': + n_uniques = np.array([len(np.unique(X_mod[:, idx])) for idx in range(X.shape[1])]) + self.metric_tensor = np.diag(np.sqrt(n_uniques)) + elif self.metric_learning_method == 'n_unique_inv': + n_uniques = np.array([len(np.unique(X_mod[:, idx])) for idx in range(X.shape[1])]) + self.metric_tensor = np.diag(np.sqrt(1.0/n_uniques)) return self.metric_tensor diff --git a/smote_variants/base/_simplexsampling.py b/smote_variants/base/_simplexsampling.py index 6d04d3b..f33df7c 100644 --- a/smote_variants/base/_simplexsampling.py +++ b/smote_variants/base/_simplexsampling.py @@ -400,6 +400,8 @@ def determine_simplex_distribution(self, X, simplices): return np.repeat(1.0/len(simplices), len(simplices)) if self.simplex_sampling == 'volume': return simplex_volumes(X[simplices]) + if self.simplex_sampling == 'volume_inv': + return 1.0 / (simplex_volumes(X[simplices]) + 0.001) raise ValueError(f"simplex sampling with weighting "\ f"{self.simplex_sampling} not implemented yet") @@ -541,8 +543,13 @@ def add_gaussian_noise(self, samples): """ if 'sigma' in self.gaussian_component: - sigma = self.gaussian_component['sigma'] - return samples + self.random_state.normal(size=samples.shape) * sigma + if 'fraction' not in self.gaussian_component: + sigma = self.gaussian_component['sigma'] + return samples + self.random_state.normal(size=samples.shape) * sigma + else: + sigma = self.gaussian_component['sigma'] + fraction = self.gaussian_component['fraction'] + return samples + self.random_state.normal(size=samples.shape) * sigma * self.random_state.choice([0, 1], p=[1.0 - fraction, fraction], size=samples.shape) if 'sigmas' in self.gaussian_component: sigmas = self.gaussian_component['sigmas'] return samples + self.random_state.normal(size=samples.shape) * sigmas diff --git a/smote_variants/oversampling/_adasyn.py b/smote_variants/oversampling/_adasyn.py index c3f4758..95da8ff 100755 --- a/smote_variants/oversampling/_adasyn.py +++ b/smote_variants/oversampling/_adasyn.py @@ -95,7 +95,7 @@ def parameter_combinations(cls, raw=False): Returns: list(dict): a list of meaningful parameter combinations """ - parameter_combinations = {'n_neighbors': [3, 5, 7, 9], + parameter_combinations = {'n_neighbors': [3, 5, 7, 9, 11, 17], 'd_th': [0.9], 'proportion': [2.0, 1.5, 1.0, 0.75, 0.5, 0.25]} return cls.generate_parameter_combinations(parameter_combinations, raw) diff --git a/smote_variants/oversampling/_borderline_smote.py b/smote_variants/oversampling/_borderline_smote.py index 9ae4054..82ec9c1 100755 --- a/smote_variants/oversampling/_borderline_smote.py +++ b/smote_variants/oversampling/_borderline_smote.py @@ -145,8 +145,8 @@ def parameter_combinations(cls, raw=False): """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], - 'n_neighbors': [3, 5, 7], - 'k_neighbors': [3, 5, 7]} + 'n_neighbors': [3, 5, 7, 11, 17], + 'k_neighbors': [3, 5, 7, 11, 17]} return cls.generate_parameter_combinations(parameter_combinations, raw) diff --git a/smote_variants/oversampling/_lee.py b/smote_variants/oversampling/_lee.py index e655e4c..6a48f49 100755 --- a/smote_variants/oversampling/_lee.py +++ b/smote_variants/oversampling/_lee.py @@ -113,7 +113,7 @@ def parameter_combinations(cls, raw=False): """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], - 'n_neighbors': [3, 5, 7], + 'n_neighbors': [3, 5, 7, 11, 17], 'rejection_level': [0.3, 0.5, 0.7]} return cls.generate_parameter_combinations(parameter_combinations, raw) diff --git a/smote_variants/oversampling/_prowsyn.py b/smote_variants/oversampling/_prowsyn.py index bb24c14..4d61914 100755 --- a/smote_variants/oversampling/_prowsyn.py +++ b/smote_variants/oversampling/_prowsyn.py @@ -109,7 +109,7 @@ def parameter_combinations(cls, raw=False): """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], - 'n_neighbors': [3, 5, 7], + 'n_neighbors': [3, 5, 7, 11, 17], 'L': [3, 5, 7], 'theta': [0.1, 1.0, 2.0]} return cls.generate_parameter_combinations(parameter_combinations, raw) diff --git a/smote_variants/oversampling/_smote.py b/smote_variants/oversampling/_smote.py index aa237b0..b3e9843 100755 --- a/smote_variants/oversampling/_smote.py +++ b/smote_variants/oversampling/_smote.py @@ -87,7 +87,7 @@ def parameter_combinations(cls, raw=False): """ parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], - 'n_neighbors': [3, 5, 7]} + 'n_neighbors': [3, 5, 7, 11, 17]} return cls.generate_parameter_combinations(parameter_combinations, raw) diff --git a/smote_variants/oversampling/_smote_ipf.py b/smote_variants/oversampling/_smote_ipf.py index bedd0c0..485f2e9 100755 --- a/smote_variants/oversampling/_smote_ipf.py +++ b/smote_variants/oversampling/_smote_ipf.py @@ -122,7 +122,7 @@ def parameter_combinations(cls, raw=False): classifiers = [('sklearn.tree', 'DecisionTreeClassifier', {'random_state': 2})] parameter_combinations = {'proportion': [0.1, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0], - 'n_neighbors': [3, 5, 7], + 'n_neighbors': [3, 5, 7, 11, 17], 'n_folds': [9], 'k': [3], 'p': [0.01],