-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4fb68fc
commit 57d931b
Showing
4 changed files
with
707 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,266 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Run the project's blobfuse shell script (presumably mounts the raadsinformatie blob storage; TODO confirm against the script).\n", | ||
"!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import sys\n", | ||
"# Make modules in the parent directory importable from this notebook.\n", | ||
"sys.path.append(\"..\")\n", | ||
"\n", | ||
"# Select where to run notebook: \"azure\" or \"local\"\n", | ||
"my_run = \"azure\"\n", | ||
"\n", | ||
"import my_secrets as sc\n", | ||
"import settings as st\n", | ||
"\n", | ||
"# Bind `cf` to the configuration module that matches the chosen environment.\n", | ||
"if my_run == \"azure\":\n", | ||
"    import config_azure as cf\n", | ||
"elif my_run == \"local\":\n", | ||
"    import config as cf\n", | ||
"else:\n", | ||
"    # Fail here with a clear message instead of a NameError on `cf` in a later cell.\n", | ||
"    raise ValueError(f\"Unknown my_run {my_run!r}; expected 'azure' or 'local'\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Load data: only the first 2 documents of the training set" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>label</th>\n", | ||
" <th>path</th>\n", | ||
" <th>id</th>\n", | ||
" <th>set</th>\n", | ||
" <th>text</th>\n", | ||
" <th>tokens</th>\n", | ||
" <th>token_count</th>\n", | ||
" <th>clean_tokens</th>\n", | ||
" <th>clean_tokens_count</th>\n", | ||
" <th>pdf_path</th>\n", | ||
" <th>num_pages</th>\n", | ||
" <th>clean_text</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <th>0</th>\n", | ||
" <td>Motie</td>\n", | ||
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n", | ||
" <td>0</td>\n", | ||
" <td>train</td>\n", | ||
" <td>Gemeente Amsterdam\\n% Gemeenteraad R\\n% Gemeen...</td>\n", | ||
" <td>[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...</td>\n", | ||
" <td>395</td>\n", | ||
" <td>[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...</td>\n", | ||
" <td>205</td>\n", | ||
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n", | ||
" <td>2.0</td>\n", | ||
" <td>Gemeente Amsterdam Gemeenteraad Gemeenteblad M...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <th>1</th>\n", | ||
" <td>Motie</td>\n", | ||
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n", | ||
" <td>1</td>\n", | ||
" <td>train</td>\n", | ||
" <td>Gemeente Amsterdam\\n\\n% Gemeenteraad R\\n\\n% Ge...</td>\n", | ||
" <td>[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...</td>\n", | ||
" <td>390</td>\n", | ||
" <td>[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...</td>\n", | ||
" <td>197</td>\n", | ||
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n", | ||
" <td>2.0</td>\n", | ||
" <td>Gemeente Amsterdam Gemeenteraad Gemeenteblad M...</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" label path id set \\\n", | ||
"0 Motie /home/azureuser/cloudfiles/code/blobfuse/raads... 0 train \n", | ||
"1 Motie /home/azureuser/cloudfiles/code/blobfuse/raads... 1 train \n", | ||
"\n", | ||
" text \\\n", | ||
"0 Gemeente Amsterdam\\n% Gemeenteraad R\\n% Gemeen... \n", | ||
"1 Gemeente Amsterdam\\n\\n% Gemeenteraad R\\n\\n% Ge... \n", | ||
"\n", | ||
" tokens token_count \\\n", | ||
"0 [Gemeente, Amsterdam, %, Gemeenteraad, R, %, G... 395 \n", | ||
"1 [Gemeente, Amsterdam, %, Gemeenteraad, R, %, G... 390 \n", | ||
"\n", | ||
" clean_tokens clean_tokens_count \\\n", | ||
"0 [Gemeente, Amsterdam, Gemeenteraad, Gemeentebl... 205 \n", | ||
"1 [Gemeente, Amsterdam, Gemeenteraad, Gemeentebl... 197 \n", | ||
"\n", | ||
" pdf_path num_pages \\\n", | ||
"0 /home/azureuser/cloudfiles/code/blobfuse/raads... 2.0 \n", | ||
"1 /home/azureuser/cloudfiles/code/blobfuse/raads... 2.0 \n", | ||
"\n", | ||
" clean_text \n", | ||
"0 Gemeente Amsterdam Gemeenteraad Gemeenteblad M... \n", | ||
"1 Gemeente Amsterdam Gemeenteraad Gemeenteblad M... " | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
} | ||
], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"\n", | ||
"# Load the pickled document table and keep the first two rows whose 'set' is 'train'.\n", | ||
"# NOTE(review): unpickling can execute arbitrary code; only load pickle files you trust.\n", | ||
"df = (\n", | ||
"    pd.read_pickle(f\"{cf.output_path}/txtfiles.pkl\")\n", | ||
"    .loc[lambda docs: docs['set'] == 'train']\n", | ||
"    .head(2)\n", | ||
")\n", | ||
"display(df)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Try out GEITje\n", | ||
"Load the chatbot" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"/anaconda/envs/ThesisAmsterdamEnvironment19/lib/python3.9/site-packages/accelerate/utils/modeling.py:1341: UserWarning: Current model requires 1073750016 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.\n", | ||
" warnings.warn(\n" | ||
] | ||
}, | ||
{ | ||
"data": { | ||
"application/vnd.jupyter.widget-view+json": { | ||
"model_id": "89031b8ea8de40e3adaa0ba64da30641", | ||
"version_major": 2, | ||
"version_minor": 0 | ||
}, | ||
"text/plain": [ | ||
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]" | ||
] | ||
}, | ||
"metadata": {}, | ||
"output_type": "display_data" | ||
}, | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu.\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from transformers import Conversation, pipeline\n", | ||
"\n", | ||
"# Build a conversational pipeline around the GEITje chat model.\n", | ||
"chatbot = pipeline(\n", | ||
"    task='conversational',\n", | ||
"    model='Rijgersberg/GEITje-7B-chat-v2',\n", | ||
"    device_map='auto',\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Simple query" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n", | ||
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n" | ||
] | ||
}, | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Conversation id: c7f94172-c476-4fc4-a1cf-27485289b9d1\n", | ||
"user: Welk woord hoort er niet in dit rijtje thuis: \"auto, vliegtuig, geitje, bus\"?\n", | ||
"assistant: Geitje hoort er niet in thuis.\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"# Wrap a single user question in a Conversation and print what the chatbot returns.\n", | ||
"conversation = Conversation('Welk woord hoort er niet in dit rijtje thuis: \"auto, vliegtuig, geitje, bus\"?')\n", | ||
"print(chatbot(conversation))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "ThesisAmsterdamEnvironment19", | ||
"language": "python", | ||
"name": "thesisamsterdamenvironment19" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.18" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.