Merge pull request #65 from tmu-nlp/okabe

Okabe
tmu-nlp · May 20, 2024 · 9333d7e · 9333d7e
2 parents 2903beb + 105bdd4
commit 9333d7e
Show file tree

Hide file tree

Showing 12 changed files with 216,755 additions and 0 deletions.
diff --git a/okabe/chapter04/knock30.ipynb b/okabe/chapter04/knock30.ipynb
@@ -0,0 +1,54 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## Task: implement a program that reads the morphological analysis result (neko.txt.mecab).\n",
+    "## Store each morpheme in a mapping keyed by surface form (surface), base form (base),\n",
+    "## part of speech (pos) and POS subcategory 1 (pos1), and represent one sentence as a list\n",
+    "## of such mappings. The remaining Chapter 4 problems reuse this program.\n",
+    "\n",
+    "import re\n",
+    "import numpy as np\n",
+    "\n",
+    "# Explicit encoding instead of the platform default (assumes the MeCab output is UTF-8).\n",
+    "with open('neko2.txt.mecab', 'r', encoding='utf-8') as f:\n",
+    "    neko_data = f.read()\n",
+    "split_neko = neko_data.split(\"\\n\")\n",
+    "\n",
+    "sentence_list = list()  # all sentences; each one is a list of morpheme dicts\n",
+    "sentence = list()       # morphemes of the sentence currently being read\n",
+    "\n",
+    "for line in split_neko:\n",
+    "    if line == \"\":\n",
+    "        continue  # blank line (e.g. after the final newline)\n",
+    "    if line == \"EOS\":\n",
+    "        # EOS ends a sentence: store it and start collecting the next one.\n",
+    "        sentence_list.append(sentence)\n",
+    "        sentence = list()\n",
+    "        continue\n",
+    "    # Separate the surface at the first tab before splitting on commas, so a\n",
+    "    # surface that is itself \",\" cannot shift the feature columns.\n",
+    "    surface, _, feature = line.partition(\"\\t\")\n",
+    "    split_line = [surface] + feature.split(\",\")\n",
+    "    # Columns: 0 surface, 1 pos, 2 pos1, 7 base form.\n",
+    "    dict_line = {\"surface\": split_line[0], \"base\": split_line[7],\n",
+    "                 \"pos\": split_line[1], \"pos1\": split_line[2]}\n",
+    "    sentence.append(dict_line)\n",
+    "\n",
+    "#for line in sentence_list:\n",
+    "    #print(line)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock30.py b/okabe/chapter04/knock30.py
@@ -0,0 +1,27 @@
+import re
+import numpy as np
+
+# Explicit encoding instead of the platform default (assumes the MeCab output is UTF-8).
+with open('neko2.txt.mecab', 'r', encoding='utf-8') as f:
+    neko_data = f.read()
+split_neko = neko_data.split("\n")
+
+sentence_list = list()  # all sentences; each one is a list of morpheme dicts
+sentence = list()       # morphemes of the sentence currently being read
+
+for line in split_neko:
+    if line == "":
+        continue  # blank line (e.g. after the final newline)
+    if line == "EOS":
+        # EOS ends a sentence: store it and start collecting the next one.
+        sentence_list.append(sentence)
+        sentence = list()
+        continue
+    # Separate the surface at the first tab before splitting on commas, so a
+    # surface that is itself "," cannot shift the feature columns.
+    surface, _, feature = line.partition("\t")
+    split_line = [surface] + feature.split(",")
+    # Columns: 0 surface, 1 pos, 2 pos1, 7 base form.
+    dict_line = {"surface": split_line[0], "base": split_line[7],
+                 "pos": split_line[1], "pos1": split_line[2]}
+    sentence.append(dict_line)
diff --git a/okabe/chapter04/knock31.ipynb b/okabe/chapter04/knock31.ipynb
@@ -0,0 +1,29 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: extract the surface forms of all verbs.\n",
+    "import knock30\n",
+    "\n",
+    "# Walk every sentence and keep the surface of each morpheme whose pos is the verb tag.\n",
+    "verbs = [\n",
+    "    token['surface']\n",
+    "    for sent in knock30.sentence_list\n",
+    "    for token in sent if token['pos'] == \"動詞\"\n",
+    "]\n",
+    "print(verbs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock32.ipynb b/okabe/chapter04/knock32.ipynb
@@ -0,0 +1,30 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: extract the base forms of all verbs.\n",
+    "\n",
+    "import knock30\n",
+    "\n",
+    "# Walk every sentence and keep the base form of each morpheme whose pos is the verb tag.\n",
+    "verbs_base = [\n",
+    "    token['base']\n",
+    "    for sent in knock30.sentence_list\n",
+    "    for token in sent if token['pos'] == \"動詞\"\n",
+    "]\n",
+    "print(verbs_base)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock33.ipynb b/okabe/chapter04/knock33.ipynb
@@ -0,0 +1,34 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: extract noun phrases in which two nouns are joined by the particle \"の\".\n",
+    "\n",
+    "import knock30\n",
+    "\n",
+    "nouns = list()\n",
+    "\n",
+    "for line in knock30.sentence_list:\n",
+    "    # Start at 1 and stop before the last index so both neighbours exist; starting\n",
+    "    # at 0 would make line[i-1] wrap around to the last morpheme of the sentence.\n",
+    "    for i in range(1, len(line) - 1):\n",
+    "        if line[i]['base'] == \"の\" and line[i]['pos'] == \"助詞\":\n",
+    "            if line[i-1]['pos'] == \"名詞\" and line[i+1]['pos'] == \"名詞\":\n",
+    "                nouns.append(line[i-1]['surface'] + \"の\" + line[i+1]['surface'])\n",
+    "\n",
+    "print(nouns)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock34.ipynb b/okabe/chapter04/knock34.ipynb
@@ -0,0 +1,46 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: extract noun sequences (nouns that appear consecutively) by longest match.\n",
+    "\n",
+    "import knock30\n",
+    "\n",
+    "# Every maximal run of two or more consecutive nouns, joined into one string.\n",
+    "ct_noun = list()\n",
+    "\n",
+    "for line in knock30.sentence_list:\n",
+    "    # Surfaces of the noun run currently being read.\n",
+    "    run = []\n",
+    "    # The trailing None acts as a non-noun sentinel so that a run ending the\n",
+    "    # sentence is flushed too; otherwise such a run would never be stored.\n",
+    "    for morph in list(line) + [None]:\n",
+    "        if morph is not None and morph[\"pos\"] == \"名詞\":\n",
+    "            run.append(morph[\"surface\"])\n",
+    "            continue\n",
+    "        # A non-noun (or the sentinel) closes the current run.\n",
+    "        if len(run) > 1:\n",
+    "            ct_noun.append(\"\".join(run))\n",
+    "        run = []\n",
+    "\n",
+    "# Longest match means each run is extended as far as it goes, so every maximal\n",
+    "# run belongs to the answer. Keeping only runs longer than all earlier ones\n",
+    "# would discard most of them.\n",
+    "longest = ct_noun\n",
+    "\n",
+    "print(longest)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock35.ipynb b/okabe/chapter04/knock35.ipynb
@@ -0,0 +1,37 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: count how often each word occurs in the text and list the words from most to least frequent.\n",
+    "import knock30\n",
+    "\n",
+    "word_dict = dict()\n",
+    "\n",
+    "for sent in knock30.sentence_list:\n",
+    "    for token in sent:\n",
+    "        # Symbols are not counted as words.\n",
+    "        if token[\"pos\"] != \"記号\":\n",
+    "            key = str(token[\"base\"])\n",
+    "            # Words are keyed by base form; a first occurrence starts from 0.\n",
+    "            word_dict[key] = word_dict.get(key, 0) + 1\n",
+    "\n",
+    "# (base form, count) pairs ordered by count, highest first.\n",
+    "sort_word_list = list(word_dict.items())\n",
+    "sort_word_list.sort(key=lambda pair: pair[1], reverse=True)\n",
+    "\n",
+    "print(sort_word_list)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock36.ipynb b/okabe/chapter04/knock36.ipynb
@@ -0,0 +1,49 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: show the 10 most frequent words and their frequencies in a graph (e.g. a bar chart).\n",
+    "import knock30\n",
+    "import matplotlib.pyplot as plt\n",
+    "import japanize_matplotlib  # Japanese localization for matplotlib\n",
+    "import seaborn as sns\n",
+    "sns.set(font=\"IPAexGothic\")\n",
+    "\n",
+    "word_dict = dict()\n",
+    "\n",
+    "for sent in knock30.sentence_list:\n",
+    "    for token in sent:\n",
+    "        # Symbols are not counted as words.\n",
+    "        if token[\"pos\"] != \"記号\":\n",
+    "            key = str(token[\"base\"])\n",
+    "            # Words are keyed by base form; a first occurrence starts from 0.\n",
+    "            word_dict[key] = word_dict.get(key, 0) + 1\n",
+    "\n",
+    "# (base form, count) pairs ordered by count, highest first.\n",
+    "sort_word_list = list(word_dict.items())\n",
+    "sort_word_list.sort(key=lambda pair: pair[1], reverse=True)\n",
+    "\n",
+    "top_10 = sort_word_list[:10]\n",
+    "\n",
+    "# Transpose the pairs into one tuple of words and one tuple of counts.\n",
+    "words, frequencies = zip(*top_10)\n",
+    "plt.bar(words, frequencies)\n",
+    "plt.xlabel('語')\n",
+    "plt.ylabel('頻度')\n",
+    "plt.title('出現頻度上位10単語')\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/okabe/chapter04/knock37.ipynb b/okabe/chapter04/knock37.ipynb
@@ -0,0 +1,45 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Task: show the 10 words that co-occur most often with \"猫\" and their frequencies in a graph (e.g. a bar chart).\n",
+    "import knock30\n",
+    "import matplotlib.pyplot as plt\n",
+    "import japanize_matplotlib  # Japanese localization for matplotlib\n",
+    "import seaborn as sns\n",
+    "from collections import defaultdict\n",
+    "sns.set(font=\"IPAexGothic\")\n",
+    "\n",
+    "word_dict = defaultdict(int)\n",
+    "\n",
+    "for sent in knock30.sentence_list:\n",
+    "    # Only sentences that contain \"猫\" contribute co-occurrence counts.\n",
+    "    if \"猫\" not in [token[\"base\"] for token in sent]:\n",
+    "        continue\n",
+    "    for token in sent:\n",
+    "        if not (token[\"base\"] == \"猫\" or token[\"pos\"] == \"記号\"):\n",
+    "            word_dict[token[\"base\"]] += 1\n",
+    "\n",
+    "sort_word_list = sorted(word_dict.items(), key=lambda pair: pair[1], reverse=True)\n",
+    "top_10 = sort_word_list[:10]\n",
+    "words, frequencies = zip(*top_10)\n",
+    "plt.bar(words, frequencies)\n",
+    "plt.xlabel('語')\n",
+    "plt.ylabel('頻度')\n",
+    "plt.title('「猫」との共起頻度が高い10単語')\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}