Skip to content

Commit

Permalink
data insights
Browse files Browse the repository at this point in the history
  • Loading branch information
FemkeBakker committed Apr 12, 2024
1 parent 5c0672f commit 36a76a4
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 7 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Adapt to your needs. This version includes typical data files we work with, along with the default python template.

# big files
big_data/.
local_data/.

# Text / html
*.csv
Expand Down
1 change: 1 addition & 0 deletions config.py
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
"""Paths when running locally"""
output_path = "../local_data"
52 changes: 46 additions & 6 deletions notebooks/data_insights.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,17 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append(\"..\")\n",
"\n",
"# Select where to run notebook: \"azure\" or \"local\"\n",
"my_run = \"azure\"\n",
"my_run = \"local\"\n",
"\n",
"import my_secrets as sc\n",
"# import my_secrets as sc\n",
"import settings as st\n",
"\n",
"if my_run == \"azure\":\n",
Expand All @@ -41,6 +41,46 @@
"df = pd.read_pickle(path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Methodology: runtime estimation\n",
"Below is an estimate of the runtime, based on GEITje-chat with a simple zero-shot prompt; documents are represented by their first 100 tokens (not cleaned)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 5254.000000\n",
"mean 13.007999\n",
"std 21.493117\n",
"min 5.720006\n",
"25% 9.065459\n",
"50% 10.906800\n",
"75% 12.262889\n",
"max 439.354736\n",
"Name: runtime, dtype: float64"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"predictions = pd.read_pickle(f\"{cf.output_path}/ICgeitje_predictions.pkl\")\n",
" \n",
"# Select the predictions of run 2, i.e. the first run on the validation set with the simple prompt for GEITje-chat\n",
"predictions = predictions.loc[predictions['run_id']==2]\n",
"display(predictions['runtime'].describe())"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -51,9 +91,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "ThesisAmsterdamEnvironment19",
"display_name": "LocalFinetuning",
"language": "python",
"name": "thesisamsterdamenvironment19"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -65,7 +105,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
"version": "3.9.19"
}
},
"nbformat": 4,
Expand Down

0 comments on commit 36a76a4

Please sign in to comment.