diff --git a/docs/experiments.rst b/docs/experiments.rst
index 37973cdaf7..b02e1c920c 100644
--- a/docs/experiments.rst
+++ b/docs/experiments.rst
@@ -16,6 +16,7 @@ The following experiments illustrate specific tests using the ``ProgLearn`` pack
    experiments/recruitment_across_datasets
    experiments/spiral_exp
    experiments/spoken_digit_exp
+   experiments/xor_rxor_bootstrap_exp
    experiments/xor_rxor_exp
    experiments/xor_rxor_with_cpd
    experiments/xor_rxor_with_icp
diff --git a/docs/experiments/functions/xor_rxor_bootstrap_fns.py b/docs/experiments/functions/xor_rxor_bootstrap_fns.py
new file mode 100644
index 0000000000..428a21cf20
--- /dev/null
+++ b/docs/experiments/functions/xor_rxor_bootstrap_fns.py
@@ -0,0 +1,193 @@
+import numpy as np
+from scipy.spatial import distance
+import sklearn.ensemble
+from proglearn.sims import generate_gaussian_parity
+import random
+import math
+
+
+def _make_forest(n_samples):
+    """
+    Builds the untrained random forest used for every fit in this experiment.
+    """
+    return sklearn.ensemble.RandomForestClassifier(
+        n_estimators=10, min_samples_leaf=max(1, int(n_samples / 7))
+    )
+
+
+def _sample_parity(n_samples, angle, n_per_class):
+    """
+    Generates gaussian parity samples until both labels have at least n_per_class points.
+
+    Returns the whole sample (X, y), then the points labelled 0 and the points labelled 1.
+    """
+    while True:
+        X, y = generate_gaussian_parity(n_samples, angle_params=angle)
+        X_0 = X[y == 0]
+        X_1 = X[y == 1]
+        # generate again when a label is too rare to fill its part of the training set
+        if len(X_0) >= n_per_class and len(X_1) >= n_per_class:
+            return X, y, X_0, X_1
+
+
+def bootstrap(angle_sweep=range(0, 90, 5), n_samples=100, reps=1000):
+    """
+    Runs getPval once per angle to perform a bootstrap experiment.
+
+    Parameters
+    ----------
+    angle_sweep : iterable of int or float
+        Rotation angles of rxor, in degrees.
+    n_samples : int
+        Number of points generated for xor and for every rxor.
+    reps : int
+        Number of shuffles getPval performs for every angle.
+
+    Returns
+    -------
+    p_vals : list of float
+        One p-value per angle, in the order of angle_sweep.
+
+    Raises
+    ------
+    ValueError
+        If n_samples is too small to put one point per label in the training set.
+    """
+    # 35 training points per label for every 100 samples (70 in total)
+    n_train = (7 * n_samples) // 20
+    if n_train < 1:
+        raise ValueError("n_samples must be at least 3")
+
+    # xor and rxor share the training labels: n_train zeros followed by n_train ones
+    y_train = np.concatenate((np.zeros(n_train), np.ones(n_train)))
+
+    # generate xor once, it is reused for every angle
+    X_xor, y_xor, X_xor_0, X_xor_1 = _sample_parity(n_samples, 0, n_train)
+    X_xor_train = np.concatenate((X_xor_0[:n_train], X_xor_1[:n_train]))
+
+    p_vals = []
+    for angle in angle_sweep:
+        # generate a new rxor for this angle
+        X_rxor, y_rxor, X_rxor_0, X_rxor_1 = _sample_parity(
+            n_samples, math.radians(angle), n_train
+        )
+        X_rxor_train = np.concatenate((X_rxor_0[:n_train], X_rxor_1[:n_train]))
+
+        # init one rf per distribution, then fit each on its own training data
+        clf_xor = _make_forest(n_samples)
+        clf_rxor = _make_forest(n_samples)
+        clf_xor.fit(X_xor_train, y_train)
+        clf_rxor.fit(X_rxor_train, y_train)
+
+        # concat the samples that were not used for training into one test set
+        X_xor_rxor_test = np.concatenate(
+            (
+                X_xor_0[n_train:],
+                X_rxor_0[n_train:],
+                X_xor_1[n_train:],
+                X_rxor_1[n_train:],
+            )
+        )
+
+        # calc the l2 distance between the probas from xor and rxor rfs
+        d1 = calcL2(
+            clf_xor.predict_proba(X_xor_rxor_test),
+            clf_rxor.predict_proba(X_xor_rxor_test),
+        )
+
+        # concat all xor and rxor samples and append the pval
+        X_xor_rxor_all = np.concatenate((X_xor, X_rxor))
+        y_xor_rxor_all = np.concatenate((y_xor, y_rxor))
+        p_vals.append(
+            getPval(X_xor_rxor_all, y_xor_rxor_all, d1, reps, n_samples=n_samples)
+        )
+
+    return p_vals
+
+
+def getPval(X_xor_rxor_all, y_xor_rxor_all, d1, reps=1000, n_samples=100):
+    """
+    Shuffles xor and rxor, trains trees, predicts, calculates L2 between probas, and calculates p-val to determine whether the 2 distributions are different.
+
+    Parameters
+    ----------
+    X_xor_rxor_all : ndarray
+        All xor and rxor samples concatenated.
+    y_xor_rxor_all : ndarray
+        Labels of X_xor_rxor_all.
+    d1 : float
+        L2 distance measured on the unshuffled xor and rxor.
+    reps : int
+        Number of times the shuffle is repeated.
+    n_samples : int
+        min_samples_leaf of the forests is int(n_samples / 7), with a minimum of 1.
+
+    Returns
+    -------
+    float
+        1 - (number of reps where d1 > d2) / reps.
+
+    Raises
+    ------
+    ValueError
+        If reps is smaller than 1.
+    """
+    if reps < 1:
+        raise ValueError("reps must be at least 1")
+
+    # sizes follow the data: 200 samples give 2 groups of 100, each split 70 train / 30 test
+    n_total = len(X_xor_rxor_all)
+    n_group = n_total // 2
+    n_train = (7 * n_group) // 10
+
+    d1_greater_count = 0
+    for _ in range(reps):
+        # shuffle all indices, call the first half xor and the second half rxor
+        random_idxs = random.sample(range(n_total), n_total)
+        xor_idxs = random_idxs[:n_group]
+        rxor_idxs = random_idxs[n_group:]
+
+        # train a new forest on the first n_train samples of each group
+        clf_xor_new = _make_forest(n_samples)
+        clf_xor_new.fit(
+            X_xor_rxor_all[xor_idxs[:n_train]], y_xor_rxor_all[xor_idxs[:n_train]]
+        )
+        clf_rxor_new = _make_forest(n_samples)
+        clf_rxor_new.fit(
+            X_xor_rxor_all[rxor_idxs[:n_train]], y_xor_rxor_all[rxor_idxs[:n_train]]
+        )
+
+        # take the remaining samples of both groups and call those test
+        X_xor_rxor_new_test = X_xor_rxor_all[xor_idxs[n_train:] + rxor_idxs[n_train:]]
+
+        # predict proba using the new xor and rxor rf's and calc l2
+        d2 = calcL2(
+            clf_xor_new.predict_proba(X_xor_rxor_new_test),
+            clf_rxor_new.predict_proba(X_xor_rxor_new_test),
+        )
+
+        if d1 > d2:
+            d1_greater_count += 1
+
+    return 1 - (d1_greater_count / reps)
+
+
+def calcL2(xorRF_probas, rxorRF_probas):
+    """
+    Returns L2 distance between 2 outputs from clf.predict_proba().
+
+    Only the first proba of every row is used since we only need one of the probas to calc L2.
+
+    Raises
+    ------
+    ValueError
+        If the 2 outputs do not have the same number of rows.
+    """
+    xors = [proba[0] for proba in xorRF_probas]
+    rxors = [proba[0] for proba in rxorRF_probas]
+
+    # a row-by-row distance only makes sense when the row counts match
+    if len(xors) != len(rxors):
+        raise ValueError("both outputs must have the same number of rows")
+
+    return distance.euclidean(xors, rxors)
diff --git a/docs/experiments/xor_rxor_bootstrap_exp.ipynb b/docs/experiments/xor_rxor_bootstrap_exp.ipynb
new file mode 100644
index 0000000000..2c2e50592b
--- /dev/null
+++ b/docs/experiments/xor_rxor_bootstrap_exp.ipynb
@@ -0,0 +1,215 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fbec258d-c58d-4218-ae9b-14bc01295e26",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "# Gaussian XOR and Gaussian R-XOR Random Forest Bootstrap Experiment"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fa57f1bb",
+   "metadata": {},
+   "source": [
+    "## Overview"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d51addd4-5e62-4121-b09a-4b9b3624f359",
+   "metadata": {},
+   "source": [
+    "In this experiment, we are interested in learning at which angles RXOR is significantly different from XOR to warrant training a new Random Forest. We will do this for each angle in the angle sweep by:\n",
+    "1. Generating 100 XOR and 100 RXOR samples and training their respective trees on randomly selected 70 samples from each.\n",
+    "2. Concatenating the remaining 30 samples from both distributions (60 samples total) and pushing them through both XOR and RXOR random forests to get an array of probabilities for each sample. \n",
+    "3. Calculate L2 distance between the 2 arrays of probabilities. We will call this d1.\n",
+    "4. Concatenate ALL XOR and RXOR samples (200 total) and randomly select 70 samples to be XOR_new and 70 samples to be RXOR_new (bootstrap).\n",
+    "5. Train 2 new trees with XOR_new and RXOR_new.\n",
+    "6. 
Use the remaining 60 samples to calculate probabilities from both new trees.\n", + "7. Calculate L2 distance between the new probabilities (d2).\n", + "8. Repeat steps 4-7 1000 times and calculate p-value by 1 - ((# of times d1 > d2)/1000).\n", + "9. This entire experiment is then repeated 100 times to account for randomness.\n", + "\n", + "Finally, we take the mean of the p-values across each 100 tests for each angle and plot." + ] + }, + { + "cell_type": "markdown", + "id": "5f10072f", + "metadata": {}, + "source": [ + "## Running the Experiment" + ] + }, + { + "cell_type": "markdown", + "id": "699c54f3-2eee-45f7-93a2-3fa2ed07852c", + "metadata": {}, + "source": [ + "We will start by importing dependencies and running the experiment outlined above. This will take quite a while." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a76c3187-3074-4eab-b82a-7212693acd1d", + "metadata": {}, + "outputs": [], + "source": [ + "# import\n", + "import time\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import xor_rxor_bootstrap_fns as fn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f1606f8-4a2c-4dd7-9f9c-ba873f873267", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# set angle sweep\n", + "angle_sweep = range(0, 90, 5)\n", + "# data frame to store p values from each run\n", + "p_val_df = pd.DataFrame()\n", + "\n", + "# time experiment\n", + "start = time.time()\n", + "\n", + "# run the experiment for 100 repetitions, bootstrap each experiment for 1000 reps\n", + "for i in range(100):\n", + " p_val_df[i] = fn.bootstrap(angle_sweep=angle_sweep, n_samples=100, reps=1000)\n", + "end = time.time()\n", + "\n", + "# entire experiment run time\n", + "print(\"\\nThe function took {:.2f} s to compute.\".format(end - start))\n", + "# The function took 26973.70 s to compute." 
+ ] + }, + { + "cell_type": "markdown", + "id": "65ebd653", + "metadata": {}, + "source": [ + "## Visualizing the Results" + ] + }, + { + "cell_type": "markdown", + "id": "68b85968-6853-4c3c-9fc3-d97c3bff8f73", + "metadata": { + "tags": [] + }, + "source": [ + "Next, we compute the mean across each test for each angle. We will use this to plot the mean p-value for each angle with error bars for the 25th and 75th percentiles. " + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "29c30725-cae7-42a0-80f9-ed1336ecde13", + "metadata": {}, + "outputs": [], + "source": [ + "# compute mean across each test for each angle\n", + "p_val_df[\"mean\"] = p_val_df.mean(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "496fca00-c10b-4396-ad0e-b6ed188f04ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 1.0, 'Angle of Rotation vs mean P-Value')" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# plot with error bars\n", + "qunatiles = np.nanquantile(p_val_df.iloc[:, :-1], [0.25, 0.75], axis=1)\n", + "plt.fill_between(angle_sweep, qunatiles[0], qunatiles[1], facecolor=\"r\", alpha=0.3)\n", + "plt.plot(angle_sweep, p_val_df[\"mean\"])\n", + "plt.xlabel(\"Angle of Rotation RXOR\")\n", + "plt.ylabel(\"P-Value\")\n", + "plt.title(\"Angle of Rotation vs mean P-Value\")" + ] + }, + { + "cell_type": "markdown", + "id": "cc990f97", + "metadata": {}, + "source": [ + "## Saving our Results" + ] + }, + { + "cell_type": "markdown", + "id": "3aeae5ba-dc05-4454-8bf5-947f6b51cac8", + "metadata": {}, + "source": [ + "Finally, we can write this dataframe to csv to avoid rerunning the experiment." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b21328ff-eb61-405f-9e01-406ecfe7f7e8", + "metadata": {}, + "outputs": [], + "source": [ + "# optional write to csv\n", + "p_val_df.to_csv(\"p_val_df_with_mean.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}