diff --git a/1_experiment_train.ipynb b/1_experiment_train.ipynb index 34ac4d8..7b0feda 100644 --- a/1_experiment_train.ipynb +++ b/1_experiment_train.ipynb @@ -20,45 +20,19 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "is_executing": true, "ExecuteTime": { "end_time": "2024-08-19T15:45:05.830869Z", "start_time": "2024-08-19T15:45:04.819700Z" - } + }, + "is_executing": true, + "scrolled": true }, + "outputs": [], "source": [ "!pip install onnx onnxruntime tf2onnx" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: onnx in ./venv/lib/python3.9/site-packages (1.16.2)\r\n", - "Requirement already satisfied: onnxruntime in ./venv/lib/python3.9/site-packages (1.19.0)\r\n", - "Requirement already satisfied: tf2onnx in ./venv/lib/python3.9/site-packages (1.16.1)\r\n", - "Requirement already satisfied: numpy>=1.20 in ./venv/lib/python3.9/site-packages (from onnx) (1.26.4)\r\n", - "Requirement already satisfied: protobuf>=3.20.2 in ./venv/lib/python3.9/site-packages (from onnx) (3.20.3)\r\n", - "Requirement already satisfied: coloredlogs in ./venv/lib/python3.9/site-packages (from onnxruntime) (15.0.1)\r\n", - "Requirement already satisfied: flatbuffers in ./venv/lib/python3.9/site-packages (from onnxruntime) (24.3.25)\r\n", - "Requirement already satisfied: packaging in ./venv/lib/python3.9/site-packages (from onnxruntime) (24.1)\r\n", - "Requirement already satisfied: sympy in ./venv/lib/python3.9/site-packages (from onnxruntime) (1.13.2)\r\n", - "Requirement already satisfied: requests in ./venv/lib/python3.9/site-packages (from tf2onnx) (2.32.3)\r\n", - "Requirement already satisfied: six in ./venv/lib/python3.9/site-packages (from tf2onnx) (1.16.0)\r\n", - "Requirement already satisfied: humanfriendly>=9.1 in ./venv/lib/python3.9/site-packages (from coloredlogs->onnxruntime) (10.0)\r\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in 
./venv/lib/python3.9/site-packages (from requests->tf2onnx) (3.3.2)\r\n", - "Requirement already satisfied: idna<4,>=2.5 in ./venv/lib/python3.9/site-packages (from requests->tf2onnx) (3.7)\r\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in ./venv/lib/python3.9/site-packages (from requests->tf2onnx) (1.26.19)\r\n", - "Requirement already satisfied: certifi>=2017.4.17 in ./venv/lib/python3.9/site-packages (from requests->tf2onnx) (2024.7.4)\r\n", - "Requirement already satisfied: mpmath<1.4,>=1.1.0 in ./venv/lib/python3.9/site-packages (from sympy->onnxruntime) (1.3.0)\r\n", - "\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m A new release of pip is available: \u001B[0m\u001B[31;49m23.2.1\u001B[0m\u001B[39;49m -> \u001B[0m\u001B[32;49m24.2\u001B[0m\r\n", - "\u001B[1m[\u001B[0m\u001B[34;49mnotice\u001B[0m\u001B[1;39;49m]\u001B[0m\u001B[39;49m To update, run: \u001B[0m\u001B[32;49mpip install --upgrade pip\u001B[0m\r\n" - ] - } - ], - "execution_count": 1 + ] }, { "cell_type": "markdown", @@ -69,12 +43,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:08.983925Z", "start_time": "2024-08-19T15:45:05.835311Z" } }, + "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", @@ -88,9 +64,7 @@ "import onnx\n", "import pickle\n", "from pathlib import Path" - ], - "outputs": [], - "execution_count": 2 + ] }, { "cell_type": "markdown", @@ -119,12 +93,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:09.394745Z", "start_time": "2024-08-19T15:45:09.051361Z" } }, + "outputs": [], "source": [ "# Set the input (X) and output (Y) data. \n", "# The only output data is whether it's fraudulent. 
All other fields are inputs to the model.\n", @@ -141,17 +117,17 @@ " 7 # fraud\n", "]\n", "\n", - "X_train = pd.read_csv('data/train.csv')\n", - "y_train = X_train.iloc[:, label_indexes]\n", - "X_train = X_train.iloc[:, feature_indexes]\n", + "df = pd.read_csv('data/train.csv')\n", + "X_train = df.iloc[:, feature_indexes].values\n", + "y_train = df.iloc[:, label_indexes].values\n", "\n", - "X_val = pd.read_csv('data/validate.csv')\n", - "y_val = X_val.iloc[:, label_indexes]\n", - "X_val = X_val.iloc[:, feature_indexes]\n", + "df = pd.read_csv('data/validate.csv')\n", + "X_val = df.iloc[:, feature_indexes].values\n", + "y_val = df.iloc[:, label_indexes].values\n", "\n", - "X_test = pd.read_csv('data/test.csv')\n", - "y_test = X_test.iloc[:, label_indexes]\n", - "X_test = X_test.iloc[:, feature_indexes]\n", + "df = pd.read_csv('data/test.csv')\n", + "X_test = df.iloc[:, feature_indexes].values\n", + "y_test = df.iloc[:, label_indexes].values\n", "\n", "\n", "# Scale the data to remove mean and have unit variance. 
The data will be between -1 and 1, which makes it a lot easier for the model to learn than random (and potentially large) values.\n", @@ -159,7 +135,9 @@ "\n", "scaler = StandardScaler()\n", "\n", - "X_train = scaler.fit_transform(X_train.values)\n", + "X_train = scaler.fit_transform(X_train)\n", + "X_val = scaler.transform(X_val)\n", + "X_test = scaler.transform(X_test)\n", "\n", "Path(\"artifact\").mkdir(parents=True, exist_ok=True)\n", "with open(\"artifact/test_data.pkl\", \"wb\") as handle:\n", @@ -168,11 +146,9 @@ " pickle.dump(scaler, handle)\n", "\n", "# Since the dataset is unbalanced (it has many more non-fraud transactions than fraudulent ones), set a class weight to weight the few fraudulent transactions higher than the many non-fraud transactions.\n", - "class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.values.ravel())\n", + "class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train.ravel())\n", "class_weights = {i : class_weights[i] for i in range(len(class_weights))}" - ], - "outputs": [], - "execution_count": 3 + ] }, { "cell_type": "markdown", @@ -185,15 +161,17 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:09.489856Z", "start_time": "2024-08-19T15:45:09.419813Z" } }, + "outputs": [], "source": [ "model = Sequential()\n", - "model.add(Dense(32, activation = 'relu', input_dim = len(feature_indexes)))\n", + "model.add(Dense(32, activation='relu', input_dim=len(feature_indexes)))\n", "model.add(Dropout(0.2))\n", "model.add(Dense(32))\n", "model.add(BatchNormalization())\n", @@ -203,52 +181,16 @@ "model.add(BatchNormalization())\n", "model.add(Activation('relu'))\n", "model.add(Dropout(0.2))\n", - "model.add(Dense(1, activation = 'sigmoid'))\n", - "model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])\n", + "model.add(Dense(1, activation='sigmoid'))\n", + "\n", + 
"model.compile(\n", + " optimizer='adam',\n", + " loss='binary_crossentropy',\n", + " metrics=['accuracy']\n", + ")\n", + "\n", "model.summary()" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Model: \"sequential\"\n", - "_________________________________________________________________\n", - " Layer (type) Output Shape Param # \n", - "=================================================================\n", - " dense (Dense) (None, 32) 192 \n", - " \n", - " dropout (Dropout) (None, 32) 0 \n", - " \n", - " dense_1 (Dense) (None, 32) 1056 \n", - " \n", - " batch_normalization (Batch (None, 32) 128 \n", - " Normalization) \n", - " \n", - " activation (Activation) (None, 32) 0 \n", - " \n", - " dropout_1 (Dropout) (None, 32) 0 \n", - " \n", - " dense_2 (Dense) (None, 32) 1056 \n", - " \n", - " batch_normalization_1 (Bat (None, 32) 128 \n", - " chNormalization) \n", - " \n", - " activation_1 (Activation) (None, 32) 0 \n", - " \n", - " dropout_2 (Dropout) (None, 32) 0 \n", - " \n", - " dense_3 (Dense) (None, 1) 33 \n", - " \n", - "=================================================================\n", - "Total params: 2593 (10.13 KB)\n", - "Trainable params: 2465 (9.63 KB)\n", - "Non-trainable params: 128 (512.00 Byte)\n", - "_________________________________________________________________\n" - ] - } - ], - "execution_count": 4 + ] }, { "cell_type": "markdown", @@ -261,12 +203,14 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:29.664796Z", "start_time": "2024-08-19T15:45:09.496686Z" } }, + "outputs": [], "source": [ "# Train the model and get performance\n", "import os\n", @@ -274,26 +218,17 @@ "\n", "start = time.time()\n", "epochs = 2\n", - "history = model.fit(X_train, y_train, epochs=epochs, \\\n", - " validation_data=(scaler.transform(X_val.values),y_val), \\\n", - " verbose = True, class_weight = class_weights)\n", + "history = model.fit(\n", + " X_train,\n", + " 
y_train,\n", + " epochs=epochs,\n", + " validation_data=(X_val, y_val),\n", + " verbose=True,\n", + " class_weight=class_weights\n", + ")\n", "end = time.time()\n", "print(f\"Training of model is complete. Took {end-start} seconds\")" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Epoch 1/2\n", - "18750/18750 [==============================] - 10s 509us/step - loss: 0.2619 - accuracy: 0.9263 - val_loss: 0.2423 - val_accuracy: 0.9425\n", - "Epoch 2/2\n", - "18750/18750 [==============================] - 10s 537us/step - loss: 0.2371 - accuracy: 0.9474 - val_loss: 0.1983 - val_accuracy: 0.9564\n", - "Training of model is complete. Took 20.165951013565063 seconds\n" - ] - } - ], - "execution_count": 5 + ] }, { "cell_type": "markdown", @@ -304,20 +239,35 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:29.845680Z", "start_time": "2024-08-19T15:45:29.674230Z" } }, + "outputs": [], "source": [ + "import tensorflow as tf\n", + "\n", + "# Normally we would use tf2onnx.convert.from_keras;\n", + "# from_function is used here as a workaround for tf2onnx bug https://github.com/onnx/tensorflow-onnx/issues/2348\n", + "\n", + "# Wrap the model in a `tf.function`\n", + "@tf.function(input_signature=[tf.TensorSpec([None, X_train.shape[1]], tf.float32, name='dense_input')])\n", + "def model_fn(x):\n", + " return model(x)\n", + "\n", + "# Convert the Keras model to ONNX\n", + "model_proto, _ = tf2onnx.convert.from_function(\n", + " model_fn,\n", + " input_signature=[tf.TensorSpec([None, X_train.shape[1]], tf.float32, name='dense_input')]\n", + ")\n", + "\n", "# Save the model as ONNX for easy use of ModelMesh\n", - "model_proto, _ = tf2onnx.convert.from_keras(model)\n", "os.makedirs(\"models/fraud/1\", exist_ok=True)\n", "onnx.save(model_proto, \"models/fraud/1/model.onnx\")" - ], - "outputs": [], - "execution_count": 6 + ] }, { "cell_type": "markdown", @@ -337,40 
+287,17 @@ }, { "cell_type": "code", + 
"execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:30.012353Z", "start_time": "2024-08-19T15:45:29.856416Z" } }, + "outputs": [], "source": [ "! ls -alRh ./models/" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 0\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34m.\u001B[m\u001B[m\r\n", - "drwxr-xr-x 30 cchase staff 960B Aug 19 11:45 \u001B[34m..\u001B[m\u001B[m\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34mfraud\u001B[m\u001B[m\r\n", - "\r\n", - "./models//fraud:\r\n", - "total 0\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34m.\u001B[m\u001B[m\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34m..\u001B[m\u001B[m\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34m1\u001B[m\u001B[m\r\n", - "\r\n", - "./models//fraud/1:\r\n", - "total 32\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34m.\u001B[m\u001B[m\r\n", - "drwxr-xr-x@ 3 cchase staff 96B Aug 19 11:42 \u001B[34m..\u001B[m\u001B[m\r\n", - "-rw-r--r--@ 1 cchase staff 13K Aug 19 11:45 model.onnx\r\n" - ] - } - ], - "execution_count": 7 + ] }, { "cell_type": "markdown", @@ -381,20 +308,20 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:30.047040Z", "start_time": "2024-08-19T15:45:30.029773Z" } }, + "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n", "import numpy as np\n", "import pickle\n", "import onnxruntime as rt" - ], - "outputs": [], - "execution_count": 8 + ] }, { "cell_type": "markdown", @@ -405,20 +332,20 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:30.062713Z", "start_time": "2024-08-19T15:45:30.058023Z" } }, + "outputs": [], "source": [ "with open('artifact/scaler.pkl', 'rb') as handle:\n", " scaler = pickle.load(handle)\n", "with 
open('artifact/test_data.pkl', 'rb') as handle:\n", " (X_test, y_test) = pickle.load(handle)" - ], - "outputs": [], - "execution_count": 9 + ] }, { "cell_type": "markdown", @@ -429,23 +356,23 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2024-08-19T15:45:30.210272Z", "start_time": "2024-08-19T15:45:30.073900Z" } }, + "outputs": [], "source": [ "sess = rt.InferenceSession(\"models/fraud/1/model.onnx\", providers=rt.get_available_providers())\n", "input_name = sess.get_inputs()[0].name\n", "output_name = sess.get_outputs()[0].name\n", - "y_pred_temp = sess.run([output_name], {input_name: scaler.transform(X_test.values).astype(np.float32)}) \n", + "y_pred_temp = sess.run([output_name], {input_name: X_test.astype(np.float32)}) \n", "y_pred_temp = np.asarray(np.squeeze(y_pred_temp[0]))\n", "threshold = 0.95\n", "y_pred = np.where(y_pred_temp > threshold, 1, 0)" - ], - "outputs": [], - "execution_count": 10 + ] }, { "cell_type": "markdown", @@ -456,18 +383,20 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "tags": [], "ExecuteTime": { "end_time": "2024-08-19T15:45:30.644142Z", "start_time": "2024-08-19T15:45:30.221686Z" - } + }, + "tags": [] }, + "outputs": [], "source": [ "from sklearn.metrics import precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay\n", "import numpy as np\n", "\n", - "y_test_arr = y_test.to_numpy().squeeze()\n", + "y_test_arr = y_test.squeeze()\n", "correct = np.equal(y_pred, y_test_arr).sum().item()\n", "acc = (correct / len(y_pred)) * 100\n", "precision = precision_score(y_test_arr, np.round(y_pred))\n", @@ -478,39 +407,7 @@ "\n", "c_matrix = confusion_matrix(y_test_arr, y_pred)\n", "ConfusionMatrixDisplay(c_matrix).plot()" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Eval Metrics: \n", - " Accuracy: 97.4%, Precision: 0.9938, Recall: 0.6999 \n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "" - ] - }, - 
"execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - }, - { - "data": { - "text/plain": [ - "
" - ], - "image/png": "" - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "execution_count": 11 + ] }, { "cell_type": "markdown", @@ -528,19 +425,21 @@ }, { "cell_type": "code", + "execution_count": null, "metadata": { - "tags": [], "ExecuteTime": { "end_time": "2024-08-19T15:45:30.679688Z", "start_time": "2024-08-19T15:45:30.669086Z" - } + }, + "tags": [] }, + "outputs": [], "source": [ "sally_transaction_details = [\n", " [0.3111400080477545,\n", - " 1.9459399775518593, \n", - " 1.0, \n", - " 0.0, \n", + " 1.9459399775518593,\n", + " 1.0,\n", + " 0.0,\n", " 0.0]\n", " ]\n", "\n", @@ -550,50 +449,20 @@ "print(np.squeeze(prediction) > threshold)\n", "\n", "print(\"How likely was Sally's transaction to be fraudulent? \")\n", - "print(\"{:.5f}\".format(np.squeeze(prediction)) + \"%\")" - ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Is Sally's transaction predicted to be fraudulent? (true = YES, false = NO) \n", - "False\n", - "How likely was Sally's transaction to be fraudulent? \n", - "0.00002%\n" - ] - } - ], - "execution_count": 12 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-19T15:45:30.722273Z", - "start_time": "2024-08-19T15:45:30.719926Z" - } - }, - "source": [], - "outputs": [], - "execution_count": null + "print(\"{:.5f}\".format(100 * np.squeeze(prediction)) + \"%\")" + ] }, { "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-08-19T15:45:30.756156Z", - "start_time": "2024-08-19T15:45:30.750131Z" - } - }, - "source": [], + "execution_count": null, + "metadata": {}, "outputs": [], - "execution_count": null + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3.9", + "display_name": "Python 3.11", "language": "python", "name": "python3" }, @@ -607,7 +476,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.11.7" }, "vscode": { "interpreter": {