Skip to content

Commit

Permalink
Fix wrong method
Browse files Browse the repository at this point in the history
  • Loading branch information
dizys committed Feb 17, 2022
1 parent f09c297 commit c532df7
Show file tree
Hide file tree
Showing 4 changed files with 397 additions and 69 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ GRADE.txt
*_out.txt
submission.pos
/submission
*.pkl
2 changes: 1 addition & 1 deletion README.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ How to run:

optional arguments:
-h, --help show this help message and exit
-s STATEFILE path for storing/reading trained state file, default is states.json
-s STATEFILE path for storing/reading trained state file, default is states.pkl
-o OUTPUTFILE path for storing tagged output file, default is output.txt

Examples:
Expand Down
285 changes: 285 additions & 0 deletions src/exp.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,285 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"from argparse import Namespace\n",
"from pathlib import Path\n",
"\n",
"from typing import Dict, List, Tuple, Set"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/Users/ziyang/Projects/aca/nyu-nlp-homework-3/src/..')"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"PROJECT_DIR = Path('..').absolute()\n",
"PROJECT_DIR"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"args = Namespace(\n",
" train_file_path=PROJECT_DIR / 'data' / 'WSJ_02-21.pos',\n",
" test_file_path=PROJECT_DIR / 'data' / 'WSJ_24.words',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"lines = [line.strip() for line in open(args.train_file_path, 'r')]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"lines = [\n",
" 'the\\tDT',\n",
" 'cat\\tNN',\n",
" 'sat\\tVBD',\n",
" 'on\\tIN',\n",
" 'the\\tDT',\n",
" 'mat\\tNN',\n",
" '.\\t.',\n",
" '',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"word_tag_count: Dict[Tuple[str, str], int] = {}\n",
"word_set: Set[str] = set()\n",
"tag_count: Dict[str, int] = {}\n",
"tag_tag_count: Dict[Tuple[str, str], int] = {}\n",
"\n",
"last_tag = 'B'\n",
"for line in lines:\n",
" if last_tag == 'B':\n",
" tag_count['B'] = tag_count.get('B', 0) + 1\n",
" \n",
" line = line.strip()\n",
" if line:\n",
" word, tag = [x.strip() for x in line.split(\"\\t\")]\n",
" word = word.lower()\n",
" else:\n",
" word = ''\n",
" tag = 'E'\n",
" \n",
" tag_count[tag] = tag_count.get(tag, 0) + 1\n",
" tag_tag_count[(last_tag, tag)] = tag_tag_count.get((last_tag, tag), 0) + 1\n",
" if word:\n",
" word_set.add(word)\n",
" word_tag_count[(word, tag)] = word_tag_count.get((word, tag), 0) + 1\n",
"\n",
" if tag != 'E':\n",
" last_tag = tag\n",
" else:\n",
" last_tag = 'B'\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('the', 'DT'): 2,\n",
" ('cat', 'NN'): 1,\n",
" ('sat', 'VBD'): 1,\n",
" ('on', 'IN'): 1,\n",
" ('mat', 'NN'): 1,\n",
" ('.', '.'): 1}"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_tag_count"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"trans_prob: Dict[Tuple[str, str], float] = {} # (tag1, tag2) -> prob\n",
"for tag1 in tag_count.keys():\n",
" for tag2 in tag_count.keys():\n",
" total_tag1 = tag_count.get(tag1, 0)\n",
" tag1_followed_by_tag2 = tag_tag_count.get((tag1, tag2), 0)\n",
" if total_tag1 == 0:\n",
" prob = 0\n",
" else:\n",
" prob = tag1_followed_by_tag2 / total_tag1\n",
" trans_prob[(tag1, tag2)] = prob"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trans_prob"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"emission_prob: Dict[Tuple[str, str], float] = {} # (word, tag) -> prob\n",
"for word in word_set:\n",
" for tag in tag_count.keys():\n",
" word_tag = word_tag_count.get((word, tag), 0)\n",
" tag_total = tag_count.get(tag, 0)\n",
" if tag_total == 0:\n",
" prob = 0\n",
" else:\n",
" prob = word_tag / tag_total\n",
" emission_prob[(word, tag)] = prob"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{('sat', 'B'): 0.0,\n",
" ('sat', 'DT'): 0.0,\n",
" ('sat', 'NN'): 0.0,\n",
" ('sat', 'VBD'): 1.0,\n",
" ('sat', 'IN'): 0.0,\n",
" ('sat', '.'): 0.0,\n",
" ('sat', 'E'): 0.0,\n",
" ('mat', 'B'): 0.0,\n",
" ('mat', 'DT'): 0.0,\n",
" ('mat', 'NN'): 0.5,\n",
" ('mat', 'VBD'): 0.0,\n",
" ('mat', 'IN'): 0.0,\n",
" ('mat', '.'): 0.0,\n",
" ('mat', 'E'): 0.0,\n",
" ('cat', 'B'): 0.0,\n",
" ('cat', 'DT'): 0.0,\n",
" ('cat', 'NN'): 0.5,\n",
" ('cat', 'VBD'): 0.0,\n",
" ('cat', 'IN'): 0.0,\n",
" ('cat', '.'): 0.0,\n",
" ('cat', 'E'): 0.0,\n",
" ('.', 'B'): 0.0,\n",
" ('.', 'DT'): 0.0,\n",
" ('.', 'NN'): 0.0,\n",
" ('.', 'VBD'): 0.0,\n",
" ('.', 'IN'): 0.0,\n",
" ('.', '.'): 1.0,\n",
" ('.', 'E'): 0.0,\n",
" ('the', 'B'): 0.0,\n",
" ('the', 'DT'): 1.0,\n",
" ('the', 'NN'): 0.0,\n",
" ('the', 'VBD'): 0.0,\n",
" ('the', 'IN'): 0.0,\n",
" ('the', '.'): 0.0,\n",
" ('the', 'E'): 0.0,\n",
" ('on', 'B'): 0.0,\n",
" ('on', 'DT'): 0.0,\n",
" ('on', 'NN'): 0.0,\n",
" ('on', 'VBD'): 0.0,\n",
" ('on', 'IN'): 1.0,\n",
" ('on', '.'): 0.0,\n",
" ('on', 'E'): 0.0}"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"emission_prob"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"lines = [line.strip() for line in open(args.test_file_path, 'r')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "5edc29c2ed010d6458d71a83433b383a96a8cbd3efe8531bc90c4b8a5b8bcec9"
},
"kernelspec": {
"display_name": "Python 3.8.9 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.9"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit c532df7

Please sign in to comment.