-
Notifications
You must be signed in to change notification settings - Fork 138
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
6,751 additions
and
12 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
476 changes: 476 additions & 0 deletions
476
notebooks/development/031-deterministic-test-joblib-ml.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
480 changes: 480 additions & 0 deletions
480
notebooks/development/031-deterministic-test-joblib.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,279 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import datetime\n", | ||
"\n", | ||
"from joblib import Parallel, delayed\n", | ||
"\n", | ||
"import numpy as np\n", | ||
"import pandas as pd\n", | ||
"\n", | ||
"import tqdm\n", | ||
"\n", | ||
"from sklearn.tree import DecisionTreeClassifier\n", | ||
"from sklearn.ensemble import RandomForestClassifier\n", | ||
"from sklearn.svm import SVC\n", | ||
"from sklearn.neighbors import KNeighborsClassifier\n", | ||
"from sklearn.model_selection import RepeatedStratifiedKFold\n", | ||
"from sklearn.preprocessing import StandardScaler\n", | ||
"from sklearn.metrics import roc_auc_score\n", | ||
"\n", | ||
"from smote_variants.oversampling import SMOTE, NoSMOTE, ADASYN, Borderline_SMOTE1, ProWSyn, SMOTE_IPF, Lee, SMOBD\n", | ||
"from common_datasets.binary_classification import get_filtered_data_loaders\n", | ||
"import common_datasets.binary_classification as binclas" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import logging\n", | ||
"logger = logging.getLogger('smote_variants')\n", | ||
"logger.setLevel(logging.ERROR)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"classifiers = {\n", | ||
"DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in [1, 2] + list(range(3, 18, 2))],\n", | ||
"RandomForestClassifier: [{'max_depth': md, 'random_state': 5, 'n_jobs': 1} for md in [1, 2] + list(range(3, 18, 2))],\n", | ||
"KNeighborsClassifier: [{'n_neighbors': nn, 'n_jobs': 1} for nn in range(1, 70, 4)],\n", | ||
"SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n", | ||
" + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n", | ||
" + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n", | ||
"}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"datasets = get_filtered_data_loaders(n_col_bounds=(2, 50),\n", | ||
" n_bounds=(10, 700),\n", | ||
" n_minority_bounds=(10, 500),\n", | ||
" n_from_phenotypes=1,\n", | ||
" n_smallest=40)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"datasets = [loader for loader in datasets if loader not in [binclas.load_iris0, binclas.load_dermatology_6, binclas.load_shuttle_6_vs_2_3, binclas.load_monk_2, binclas.load_new_thyroid1]]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"20" | ||
] | ||
}, | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"len(datasets)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"oversampler_classes = [SMOTE, Borderline_SMOTE1, ADASYN, ProWSyn, SMOTE_IPF, Lee, SMOBD, NoSMOTE]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"oversamplers = {}\n", | ||
"for oversampler in oversampler_classes:\n", | ||
" random_state = np.random.RandomState(5)\n", | ||
" params = oversampler.parameter_combinations()\n", | ||
" params = [comb for comb in params if comb.get('proportion', 1.0) == 1.0]\n", | ||
" n_params = min(10, len(params))\n", | ||
" oversamplers[oversampler] = random_state.choice(params, n_params, replace=False)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def job_generator(data_loader):\n", | ||
"\n", | ||
" dataset = data_loader()\n", | ||
"\n", | ||
" X = dataset['data']\n", | ||
" y = dataset['target']\n", | ||
"\n", | ||
" validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=5)\n", | ||
"\n", | ||
" for fidx, (train, test) in enumerate(validator.split(X, y, y)):\n", | ||
" X_train = X[train]\n", | ||
" X_test = X[test]\n", | ||
" y_train = y[train]\n", | ||
" y_test = y[test]\n", | ||
"\n", | ||
" ss = StandardScaler()\n", | ||
" ss.fit(X_train)\n", | ||
" X_train = ss.transform(X_train)\n", | ||
" X_test = ss.transform(X_test)\n", | ||
"\n", | ||
" for oversampler, oparam in oversamplers.items():\n", | ||
" for sparam in oparam:\n", | ||
" oversampling = oversampler(**sparam)\n", | ||
" X_samp, y_samp = oversampling.sample(X_train, y_train)\n", | ||
"\n", | ||
" job = {\n", | ||
" 'X_samp': X_samp,\n", | ||
" 'y_samp': y_samp,\n", | ||
" 'X_test': X_test,\n", | ||
" 'y_test': y_test,\n", | ||
" }\n", | ||
"\n", | ||
" description = {\n", | ||
" 'name': dataset['name'],\n", | ||
" 'fold': fidx,\n", | ||
" 'oversampler': oversampler.__name__,\n", | ||
" 'sparam': sparam,\n", | ||
" }\n", | ||
"\n", | ||
" yield job, description" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 10, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def do_job(job, description):\n", | ||
" results = []\n", | ||
" for classifier, cparams in classifiers.items():\n", | ||
" for cparam in cparams:\n", | ||
" tmp = description.copy()\n", | ||
" classifier_obj = classifier(**cparam)\n", | ||
" classifier_obj.fit(job['X_samp'], job['y_samp'])\n", | ||
" y_pred = classifier_obj.predict_proba(job['X_test'])\n", | ||
" auc = roc_auc_score(job['y_test'], y_pred[:, 1])\n", | ||
"\n", | ||
" tmp['classifier'] = classifier.__name__\n", | ||
" tmp['cparam'] = cparam\n", | ||
" tmp['auc'] = auc\n", | ||
" results.append(tmp)\n", | ||
"\n", | ||
" return results" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"2023-12-16 17:39:43.203396 appendicitis\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"1it [00:00, 6.81it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"2023-12-16 17:39:43.262076 appendicitis\n" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"5it [00:00, 10.71it/s]" | ||
] | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"620it [08:17, 1.25it/s]\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"for data_loader in datasets:\n", | ||
" if data_loader != binclas.load_appendicitis:\n", | ||
" continue\n", | ||
" dataset = data_loader()\n", | ||
"\n", | ||
" print(datetime.datetime.now(), dataset['name'])\n", | ||
"\n", | ||
" results = Parallel(n_jobs=3)(delayed(do_job)(*x) for x in tqdm.tqdm(job_generator(data_loader)))\n", | ||
"\n", | ||
" results = [\n", | ||
" x\n", | ||
" for xs in results\n", | ||
" for x in xs\n", | ||
" ]\n", | ||
"\n", | ||
" results = pd.DataFrame.from_dict(results)\n", | ||
" results.to_csv(f\"{dataset['name']}-reg.csv\")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "smote-variants", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.13" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.