Skip to content

Commit

Permalink
first time prompting GEITje
Browse files Browse the repository at this point in the history
  • Loading branch information
FemkeBakker committed Apr 2, 2024
1 parent 4fb68fc commit 57d931b
Show file tree
Hide file tree
Showing 4 changed files with 707 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,11 @@ This repo is for code related to the project Document Classification under the W
* [`tests`](./tests) Unit tests
* ...






## Installation

1) Clone this repository:
Expand Down
266 changes: 266 additions & 0 deletions notebooks/CLgeitje.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"!bash /home/azureuser/cloudfiles/code/blobfuse/blobfuse_raadsinformatie.sh"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"# Add the parent directory (relative to the working directory) to the module\n",
"# search path so the project modules imported below can be found.\n",
"sys.path.append(\"..\")\n",
"\n",
"# Select where to run notebook: \"azure\" or \"local\"\n",
"my_run = \"azure\"\n",
"\n",
"import my_secrets as sc\n",
"import settings as st\n",
"\n",
"# Bind `cf` to the configuration module that matches the selected environment.\n",
"if my_run == \"azure\":\n",
"    import config_azure as cf\n",
"elif my_run == \"local\":\n",
"    import config as cf\n",
"else:\n",
"    # Fail here instead of with a NameError on `cf` in a later cell.\n",
"    raise ValueError(f\"my_run must be 'azure' or 'local', got {my_run!r}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load data: only the first two documents of the training set."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>label</th>\n",
" <th>path</th>\n",
" <th>id</th>\n",
" <th>set</th>\n",
" <th>text</th>\n",
" <th>tokens</th>\n",
" <th>token_count</th>\n",
" <th>clean_tokens</th>\n",
" <th>clean_tokens_count</th>\n",
" <th>pdf_path</th>\n",
" <th>num_pages</th>\n",
" <th>clean_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Motie</td>\n",
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n",
" <td>0</td>\n",
" <td>train</td>\n",
" <td>Gemeente Amsterdam\\n% Gemeenteraad R\\n% Gemeen...</td>\n",
" <td>[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...</td>\n",
" <td>395</td>\n",
" <td>[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...</td>\n",
" <td>205</td>\n",
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n",
" <td>2.0</td>\n",
" <td>Gemeente Amsterdam Gemeenteraad Gemeenteblad M...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Motie</td>\n",
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n",
" <td>1</td>\n",
" <td>train</td>\n",
" <td>Gemeente Amsterdam\\n\\n% Gemeenteraad R\\n\\n% Ge...</td>\n",
" <td>[Gemeente, Amsterdam, %, Gemeenteraad, R, %, G...</td>\n",
" <td>390</td>\n",
" <td>[Gemeente, Amsterdam, Gemeenteraad, Gemeentebl...</td>\n",
" <td>197</td>\n",
" <td>/home/azureuser/cloudfiles/code/blobfuse/raads...</td>\n",
" <td>2.0</td>\n",
" <td>Gemeente Amsterdam Gemeenteraad Gemeenteblad M...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" label path id set \\\n",
"0 Motie /home/azureuser/cloudfiles/code/blobfuse/raads... 0 train \n",
"1 Motie /home/azureuser/cloudfiles/code/blobfuse/raads... 1 train \n",
"\n",
" text \\\n",
"0 Gemeente Amsterdam\\n% Gemeenteraad R\\n% Gemeen... \n",
"1 Gemeente Amsterdam\\n\\n% Gemeenteraad R\\n\\n% Ge... \n",
"\n",
" tokens token_count \\\n",
"0 [Gemeente, Amsterdam, %, Gemeenteraad, R, %, G... 395 \n",
"1 [Gemeente, Amsterdam, %, Gemeenteraad, R, %, G... 390 \n",
"\n",
" clean_tokens clean_tokens_count \\\n",
"0 [Gemeente, Amsterdam, Gemeenteraad, Gemeentebl... 205 \n",
"1 [Gemeente, Amsterdam, Gemeenteraad, Gemeentebl... 197 \n",
"\n",
" pdf_path num_pages \\\n",
"0 /home/azureuser/cloudfiles/code/blobfuse/raads... 2.0 \n",
"1 /home/azureuser/cloudfiles/code/blobfuse/raads... 2.0 \n",
"\n",
" clean_text \n",
"0 Gemeente Amsterdam Gemeenteraad Gemeenteblad M... \n",
"1 Gemeente Amsterdam Gemeenteraad Gemeenteblad M... "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import pandas as pd\n",
"\n",
"# Read the pickled document table and keep only the first two training rows.\n",
"df = (\n",
"    pd.read_pickle(f\"{cf.output_path}/txtfiles.pkl\")\n",
"    .loc[lambda docs: docs['set'] == 'train']\n",
"    .head(2)\n",
")\n",
"display(df)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Try out GEITje\n",
"Load the chatbot."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda/envs/ThesisAmsterdamEnvironment19/lib/python3.9/site-packages/accelerate/utils/modeling.py:1341: UserWarning: Current model requires 1073750016 bytes of buffer for offloaded layers, which seems does not fit any GPU's remaining memory. If you are experiencing a OOM later, please consider using offload_buffers=True.\n",
" warnings.warn(\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "89031b8ea8de40e3adaa0ba64da30641",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu.\n"
]
}
],
"source": [
"from transformers import Conversation, pipeline\n",
"\n",
"# Build the conversational pipeline for the GEITje chat model;\n",
"# device_map='auto' leaves device placement of the weights to the library.\n",
"chatbot = pipeline(\n",
"    task='conversational',\n",
"    model='Rijgersberg/GEITje-7B-chat-v2',\n",
"    device_map='auto',\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Simple query"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
"Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Conversation id: c7f94172-c476-4fc4-a1cf-27485289b9d1\n",
"user: Welk woord hoort er niet in dit rijtje thuis: \"auto, vliegtuig, geitje, bus\"?\n",
"assistant: Geitje hoort er niet in thuis.\n",
"\n"
]
}
],
"source": [
"# Send one single-turn question to the chatbot and print the resulting conversation.\n",
"question = 'Welk woord hoort er niet in dit rijtje thuis: \"auto, vliegtuig, geitje, bus\"?'\n",
"print(chatbot(Conversation(question)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "ThesisAmsterdamEnvironment19",
"language": "python",
"name": "thesisamsterdamenvironment19"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 57d931b

Please sign in to comment.