Skip to content

Commit

Permalink
Merge pull request #65 from tmu-nlp/okabe
Browse files Browse the repository at this point in the history
Okabe
  • Loading branch information
kiyama-hajime authored May 20, 2024
2 parents 2903beb + 105bdd4 commit 9333d7e
Show file tree
Hide file tree
Showing 12 changed files with 216,755 additions and 0 deletions.
54 changes: 54 additions & 0 deletions okabe/chapter04/knock30.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## 形態素解析結果(neko.txt.mecab)を読み込むプログラムを実装せよ.\n",
"## ただし,各形態素は表層形(surface),基本形(base),品詞(pos),品詞細分類1(pos1)をキーとするマッピング型に格納し,\n",
"## 1文を形態素(マッピング型)のリストとして表現せよ.\n",
"## 第4章の残りの問題では,ここで作ったプログラムを活用せよ.\n",
"\n",
"import re\n",
"import numpy as np\n",
"\n",
"with open('neko2.txt.mecab','r') as f:\n",
" neko_data = f.read()\n",
"split_neko = neko_data.split(\"\\n\")\n",
"\n",
"sentence_list = list()\n",
"dict_line = dict()\n",
"sentence = list()\n",
"\n",
"for line in split_neko:\n",
" split_line = re.split('[\\t,]',line)\n",
" #print(split_line)\n",
" if len(split_line) == 1 and split_line[0] == \"\":\n",
" continue\n",
" if len(split_line) == 1 and split_line[0] == \"EOS\":\n",
" sentence_list.append(sentence)\n",
" sentence = list()\n",
" continue\n",
" #print(split_line[0],split_line[7],split_line[1],split_line[2])\n",
" dict_line[\"surface\"] = split_line[0]\n",
" dict_line[\"base\"] = split_line[7]\n",
" dict_line[\"pos\"] = split_line[1]\n",
" dict_line[\"pos1\"] = split_line[2]\n",
" sentence.append(dict_line)\n",
" dict_line = dict()\n",
"\n",
"#for line in sentence_list:\n",
" #print(line)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
27 changes: 27 additions & 0 deletions okabe/chapter04/knock30.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import re
import numpy as np

# Load the MeCab analysis result and build ``sentence_list``: a list of
# sentences, where each sentence is a list of morpheme dicts with the keys
# "surface", "base", "pos" and "pos1".  Other scripts import this module and
# read ``sentence_list`` directly, so it is built at import time.
#
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm it matches the encoding of the .mecab file.
with open('neko2.txt.mecab', 'r') as f:
    neko_data = f.read()
split_neko = neko_data.split("\n")

sentence_list = list()
sentence = list()

for line in split_neko:
    # Blank lines (e.g. the one after the trailing newline) carry no data.
    if line == "":
        continue
    # "EOS" closes the current sentence; start collecting the next one.
    if line == "EOS":
        sentence_list.append(sentence)
        sentence = list()
        continue
    # A morpheme line is "<surface>\t<feature1>,<feature2>,...".  Split off
    # the surface at the first tab only, so that a surface which is itself
    # a comma is not mistaken for a field separator.
    surface, _, feature_text = line.partition("\t")
    features = feature_text.split(",")
    # Feature positions used here: 0 = pos, 1 = pos1, 6 = base form
    # (the same columns the comma/tab split previously read as 1, 2 and 7).
    sentence.append({
        "surface": surface,
        "base": features[6],
        "pos": features[0],
        "pos1": features[1],
    })
29 changes: 29 additions & 0 deletions okabe/chapter04/knock31.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#動詞の表層形をすべて抽出せよ.\n",
"import knock30\n",
"\n",
"verbs = list()\n",
"\n",
"for line in knock30.sentence_list:\n",
" for morph in line:\n",
" if morph['pos'] == \"動詞\":\n",
" verbs.append(morph['surface'])\n",
"print(verbs)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
30 changes: 30 additions & 0 deletions okabe/chapter04/knock32.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#動詞の基本形をすべて抽出せよ\n",
"\n",
"import knock30\n",
"\n",
"verbs_base = list()\n",
"\n",
"for line in knock30.sentence_list:\n",
" for morph in line:\n",
" if morph['pos'] == \"動詞\":\n",
" verbs_base.append(morph['base'])\n",
"print(verbs_base)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
34 changes: 34 additions & 0 deletions okabe/chapter04/knock33.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#2つの名詞が「の」で連結されている名詞句を抽出せよ\n",
"\n",
"import knock30\n",
"\n",
"nouns = list()\n",
"\n",
"for line in knock30.sentence_list:\n",
" for i in range(len(line)):\n",
" if line[i]['base'] == \"の\" and line[i]['pos'] == \"助詞\":\n",
" #print(line)\n",
" if i<len(line)-1:\n",
" if line[i-1]['pos'] == \"名詞\" and line[i+1]['pos'] == \"名詞\":\n",
" nouns.append(str(line[i-1]['surface']+\"の\"+line[i+1]['surface']))\n",
"\n",
"print(nouns)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
46 changes: 46 additions & 0 deletions okabe/chapter04/knock34.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#名詞の連接(連続して出現する名詞)を最長一致で抽出せよ\n",
"\n",
"import knock30\n",
"\n",
"ct_noun = list()\n",
"\n",
"for line in knock30.sentence_list:\n",
" temp_noun = \"\"\n",
" cnt = 0\n",
" for morph in line:\n",
" if morph[\"pos\"] == \"名詞\":\n",
" temp_noun += morph[\"surface\"]\n",
" cnt += 1\n",
" else:\n",
" if cnt > 1:\n",
" ct_noun.append(temp_noun)\n",
" temp_noun = \"\"\n",
" cnt = 0 \n",
"\n",
"length = 0\n",
"longest = list()\n",
"for elm in ct_noun:\n",
" if len(elm) > length:\n",
" longest.append(elm)\n",
" length = len(elm)\n",
"\n",
"print(longest)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
37 changes: 37 additions & 0 deletions okabe/chapter04/knock35.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#文章中に出現する単語とその出現頻度を求め,出現頻度の高い順に並べよ\n",
"import knock30\n",
"\n",
"word_dict = dict()\n",
"\n",
"for line in knock30.sentence_list:\n",
" for morph in line:\n",
" if morph[\"pos\"] == \"記号\":\n",
" continue\n",
" base = str(morph[\"base\"])\n",
" if base in word_dict:\n",
" word_dict[base] += 1\n",
" else:\n",
" word_dict[base] = 1\n",
"\n",
"sort_word_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)\n",
"\n",
"print(sort_word_list)"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
49 changes: 49 additions & 0 deletions okabe/chapter04/knock36.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#出現頻度が高い10語とその出現頻度をグラフ(例えば棒グラフなど)で表示せよ.\n",
"import knock30\n",
"import matplotlib.pyplot as plt\n",
"import japanize_matplotlib #日本語化matplotlib\n",
"import seaborn as sns\n",
"sns.set(font=\"IPAexGothic\")\n",
"\n",
"word_dict = dict()\n",
"\n",
"for line in knock30.sentence_list:\n",
" for morph in line:\n",
" if morph[\"pos\"] == \"記号\":\n",
" continue\n",
" base = str(morph[\"base\"])\n",
" if base in word_dict:\n",
" word_dict[base] += 1\n",
" else:\n",
" word_dict[base] = 1\n",
"\n",
"sort_word_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)\n",
"\n",
"top_10 = sort_word_list[:10]\n",
"\n",
"words, frequencies = zip(*top_10)\n",
"\n",
"plt.bar(words, frequencies)\n",
"plt.xlabel('語')\n",
"plt.ylabel('頻度')\n",
"plt.title('出現頻度上位10単語')\n",
"plt.show()"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
45 changes: 45 additions & 0 deletions okabe/chapter04/knock37.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#「猫」とよく共起する(共起頻度が高い)10語とその出現頻度をグラフ(例えば棒グラフなど)で表示せよ.\n",
"import knock30\n",
"import matplotlib.pyplot as plt\n",
"import japanize_matplotlib #日本語化matplotlib\n",
"import seaborn as sns\n",
"from collections import defaultdict\n",
"sns.set(font=\"IPAexGothic\")\n",
"\n",
"word_dict = defaultdict(int)\n",
"\n",
"for line in knock30.sentence_list:\n",
" if any(morph[\"base\"] == \"猫\" for morph in line):\n",
" for morph in line:\n",
" if morph[\"base\"] != \"猫\" and morph[\"pos\"] != \"記号\":\n",
" word_dict[morph[\"base\"]] += 1\n",
"\n",
"sort_word_list = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)\n",
"top_10 = sort_word_list[:10]\n",
"\n",
"words, frequencies = zip(*top_10)\n",
"\n",
"plt.bar(words, frequencies)\n",
"plt.xlabel('語')\n",
"plt.ylabel('頻度')\n",
"plt.title('「猫」との共起頻度が高い10単語')\n",
"plt.show()"
]
}
],
"metadata": {
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 9333d7e

Please sign in to comment.