From 36a76a47f35b745152b95d6721fd5a6b8e5aaa15 Mon Sep 17 00:00:00 2001
From: FemkeBakker <femke.bakker02@gmail.com>
Date: Fri, 12 Apr 2024 16:28:53 +0200
Subject: [PATCH] data insights

---
 .gitignore                    |  2 +-
 config.py                     |  1 +
 notebooks/data_insights.ipynb | 52 +++++++++++++++++++++++++++++++----
 3 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index c8672d9..fcbb971 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 # Adapt to your needs. This version includes typical data files we work with, along with the default python template.
 
 # big files
-big_data/.
+local_data/.
 
 # Text / html
 *.csv
diff --git a/config.py b/config.py
index 845dd32..74aea31 100644
--- a/config.py
+++ b/config.py
@@ -1 +1,2 @@
 """Paths when running locally"""
+output_path = "../local_data"
\ No newline at end of file
diff --git a/notebooks/data_insights.ipynb b/notebooks/data_insights.ipynb
index c9daaf7..7aaef06 100644
--- a/notebooks/data_insights.ipynb
+++ b/notebooks/data_insights.ipynb
@@ -11,7 +11,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -19,9 +19,9 @@
     "sys.path.append(\"..\")\n",
     "\n",
     "# Select where to run notebook: \"azure\" or \"local\"\n",
-    "my_run = \"azure\"\n",
+    "my_run = \"local\"\n",
     "\n",
-    "import my_secrets as sc\n",
+    "# import my_secrets as sc\n",
     "import settings as st\n",
     "\n",
     "if my_run == \"azure\":\n",
@@ -41,6 +41,46 @@
     "df = pd.read_pickle(path)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Methodology: runtime estimation\n",
+    "Below is an estimate of the runtime, based on GEITje-chat with a simple zero-shot prompt; each document is represented by its first 100 tokens (not cleaned)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count    5254.000000\n",
+       "mean       13.007999\n",
+       "std        21.493117\n",
+       "min         5.720006\n",
+       "25%         9.065459\n",
+       "50%        10.906800\n",
+       "75%        12.262889\n",
+       "max       439.354736\n",
+       "Name: runtime, dtype: float64"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "predictions = pd.read_pickle(f\"{cf.output_path}/ICgeitje_predictions.pkl\")\n",
+    "                             \n",
+    "# Select the predictions with run_id 2: the first run on the validation set with the simple prompt for GEITje-chat\n",
+    "predictions = predictions.loc[predictions['run_id']==2]\n",
+    "display(predictions['runtime'].describe())"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -51,9 +91,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "ThesisAmsterdamEnvironment19",
+   "display_name": "LocalFinetuning",
    "language": "python",
-   "name": "thesisamsterdamenvironment19"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -65,7 +105,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.18"
+   "version": "3.9.19"
   }
  },
  "nbformat": 4,