From 36a76a47f35b745152b95d6721fd5a6b8e5aaa15 Mon Sep 17 00:00:00 2001 From: FemkeBakker Date: Fri, 12 Apr 2024 16:28:53 +0200 Subject: [PATCH] data insights --- .gitignore | 2 +- config.py | 1 + notebooks/data_insights.ipynb | 52 +++++++++++++++++++++++++++++++---- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index c8672d9..fcbb971 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Adapt to your needs. This version includes typical data files we work with, along with the default python template. # big files -big_data/. +local_data/. # Text / html *.csv diff --git a/config.py b/config.py index 845dd32..74aea31 100644 --- a/config.py +++ b/config.py @@ -1 +1,2 @@ """Paths when running locally""" +output_path = "../local_data" \ No newline at end of file diff --git a/notebooks/data_insights.ipynb b/notebooks/data_insights.ipynb index c9daaf7..7aaef06 100644 --- a/notebooks/data_insights.ipynb +++ b/notebooks/data_insights.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -19,9 +19,9 @@ "sys.path.append(\"..\")\n", "\n", "# Select where to run notebook: \"azure\" or \"local\"\n", - "my_run = \"azure\"\n", + "my_run = \"local\"\n", "\n", - "import my_secrets as sc\n", + "# import my_secrets as sc\n", "import settings as st\n", "\n", "if my_run == \"azure\":\n", @@ -41,6 +41,46 @@ "df = pd.read_pickle(path)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Methodology: estimation runtime\n", + "Below an estimation of the runtime, based on GEITje-chat using a simple zeroshot prompt, docs represented with first 100 tokens (not cleaned)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 5254.000000\n", + "mean 13.007999\n", + "std 21.493117\n", + "min 5.720006\n", + "25% 9.065459\n", + "50% 10.906800\n", + "75% 12.262889\n", + "max 439.354736\n", + "Name: runtime, dtype: float64" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "predictions = pd.read_pickle(f\"{cf.output_path}/ICgeitje_predictions.pkl\")\n", + " \n", + "#select predictions of run 2 == first run on validation set for simple prompt of Geitje-Chat\n", + "predictions = predictions.loc[predictions['run_id']==2]\n", + "display(predictions['runtime'].describe())" + ] + }, { "cell_type": "code", "execution_count": null, @@ -51,9 +91,9 @@ ], "metadata": { "kernelspec": { - "display_name": "ThesisAmsterdamEnvironment19", + "display_name": "LocalFinetuning", "language": "python", - "name": "thesisamsterdamenvironment19" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -65,7 +105,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.18" + "version": "3.9.19" } }, "nbformat": 4,