Commit

publish branch
gykovacs committed Dec 16, 2023
1 parent ca2d4e1 commit e7b04f0
Showing 18 changed files with 6,751 additions and 12 deletions.
406 changes: 406 additions & 0 deletions notebooks/development/029-deterministic-test.ipynb

1,290 changes: 1,290 additions & 0 deletions notebooks/development/030-analysis.ipynb

476 changes: 476 additions & 0 deletions notebooks/development/031-deterministic-test-joblib-ml.ipynb

480 changes: 480 additions & 0 deletions notebooks/development/031-deterministic-test-joblib.ipynb

507 changes: 507 additions & 0 deletions notebooks/development/032-smote-rf.ipynb

1,304 changes: 1,304 additions & 0 deletions notebooks/development/033-analysis-ml.ipynb

279 changes: 279 additions & 0 deletions notebooks/development/034-test-regularization.ipynb
@@ -0,0 +1,279 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import datetime\n",
"\n",
"from joblib import Parallel, delayed\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import tqdm\n",
"\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.svm import SVC\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import RepeatedStratifiedKFold\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"from smote_variants.oversampling import SMOTE, NoSMOTE, ADASYN, Borderline_SMOTE1, ProWSyn, SMOTE_IPF, Lee, SMOBD\n",
"from common_datasets.binary_classification import get_filtered_data_loaders\n",
"import common_datasets.binary_classification as binclas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"logger = logging.getLogger('smote_variants')\n",
"logger.setLevel(logging.ERROR)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"classifiers = {\n",
"DecisionTreeClassifier: [{'max_depth': md, 'random_state': 5} for md in [1, 2] + list(range(3, 18, 2))],\n",
"RandomForestClassifier: [{'max_depth': md, 'random_state': 5, 'n_jobs': 1} for md in [1, 2] + list(range(3, 18, 2))],\n",
"KNeighborsClassifier: [{'n_neighbors': nn, 'n_jobs': 1} for nn in range(1, 70, 4)],\n",
"SVC: [{'C': c, 'probability': True, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n",
" + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 2, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n",
" + [{'C': c, 'probability': True, 'kernel': 'poly', 'degree': 3, 'random_state': 5} for c in [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0, 2.0, 5.0]]\\\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"datasets = get_filtered_data_loaders(n_col_bounds=(2, 50),\n",
" n_bounds=(10, 700),\n",
" n_minority_bounds=(10, 500),\n",
" n_from_phenotypes=1,\n",
" n_smallest=40)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"datasets = [loader for loader in datasets if loader not in [binclas.load_iris0, binclas.load_dermatology_6, binclas.load_shuttle_6_vs_2_3, binclas.load_monk_2, binclas.load_new_thyroid1]]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(datasets)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"oversampler_classes = [SMOTE, Borderline_SMOTE1, ADASYN, ProWSyn, SMOTE_IPF, Lee, SMOBD, NoSMOTE]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"oversamplers = {}\n",
"for oversampler in oversampler_classes:\n",
" random_state = np.random.RandomState(5)\n",
" params = oversampler.parameter_combinations()\n",
" params = [comb for comb in params if comb.get('proportion', 1.0) == 1.0]\n",
" n_params = min(10, len(params))\n",
" oversamplers[oversampler] = random_state.choice(params, n_params, replace=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def job_generator(data_loader):\n",
"\n",
" dataset = data_loader()\n",
"\n",
" X = dataset['data']\n",
" y = dataset['target']\n",
"\n",
" validator = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=5)\n",
"\n",
" for fidx, (train, test) in enumerate(validator.split(X, y, y)):\n",
" X_train = X[train]\n",
" X_test = X[test]\n",
" y_train = y[train]\n",
" y_test = y[test]\n",
"\n",
" ss = StandardScaler()\n",
" ss.fit(X_train)\n",
" X_train = ss.transform(X_train)\n",
" X_test = ss.transform(X_test)\n",
"\n",
" for oversampler, oparam in oversamplers.items():\n",
" for sparam in oparam:\n",
" oversampling = oversampler(**sparam)\n",
" X_samp, y_samp = oversampling.sample(X_train, y_train)\n",
"\n",
" job = {\n",
" 'X_samp': X_samp,\n",
" 'y_samp': y_samp,\n",
" 'X_test': X_test,\n",
" 'y_test': y_test,\n",
" }\n",
"\n",
" description = {\n",
" 'name': dataset['name'],\n",
" 'fold': fidx,\n",
" 'oversampler': oversampler.__name__,\n",
" 'sparam': sparam,\n",
" }\n",
"\n",
" yield job, description"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def do_job(job, description):\n",
" results = []\n",
" for classifier, cparams in classifiers.items():\n",
" for cparam in cparams:\n",
" tmp = description.copy()\n",
" classifier_obj = classifier(**cparam)\n",
" classifier_obj.fit(job['X_samp'], job['y_samp'])\n",
" y_pred = classifier_obj.predict_proba(job['X_test'])\n",
" auc = roc_auc_score(job['y_test'], y_pred[:, 1])\n",
"\n",
" tmp['classifier'] = classifier.__name__\n",
" tmp['cparam'] = cparam\n",
" tmp['auc'] = auc\n",
" results.append(tmp)\n",
"\n",
" return results"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-12-16 17:39:43.203396 appendicitis\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"1it [00:00, 6.81it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"2023-12-16 17:39:43.262076 appendicitis\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"5it [00:00, 10.71it/s]"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"620it [08:17, 1.25it/s]\n"
]
}
],
"source": [
"for data_loader in datasets:\n",
" if data_loader != binclas.load_appendicitis:\n",
" continue\n",
" dataset = data_loader()\n",
"\n",
" print(datetime.datetime.now(), dataset['name'])\n",
"\n",
" results = Parallel(n_jobs=3)(delayed(do_job)(*x) for x in tqdm.tqdm(job_generator(data_loader)))\n",
"\n",
" results = [\n",
" x\n",
" for xs in results\n",
" for x in xs\n",
" ]\n",
"\n",
" results = pd.DataFrame.from_dict(results)\n",
" results.to_csv(f\"{dataset['name']}-reg.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "smote-variants",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
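
The run above leaves one CSV per processed dataset (`appendicitis-reg.csv` in this case). A minimal follow-up sketch, not part of this commit, for summarizing such a file; it assumes the column names recorded in `do_job` (`oversampler`, `classifier`, `auc`):

```python
# Hypothetical post-processing sketch; not part of this commit.
import pandas as pd

# the CSV written by the last cell for the appendicitis run
results = pd.read_csv("appendicitis-reg.csv", index_col=0)

# mean ROC AUC per (oversampler, classifier) pair, averaged over the
# 2x5 CV folds and all sampled parameter combinations
summary = (
    results.groupby(["oversampler", "classifier"])["auc"]
    .mean()
    .unstack()
)
print(summary.round(4))
```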