Merge pull request #120 from tmu-nlp/megumi

chapter07
tmu-nlp · Jun 17, 2024 · 605e103 · 605e103
2 parents 56d7b6a + 757233d
commit 605e103
Show file tree

Hide file tree

Showing 19 changed files with 256,316 additions and 1 deletion.
diff --git a/megumi/chapter04/knock30.py b/megumi/chapter04/knock30.py
@@ -1,7 +1,38 @@
-#形態素解析結果の読み込み
+#30.形態素解析結果の読み込み
 """
 形態素解析結果（neko.txt.mecab）を読み込むプログラムを実装せよ．
 ただし，各形態素は表層形（surface），基本形（base），品詞（pos），品詞細分類1（pos1）をキーとするマッピング型に格納し，
 1文を形態素（マッピング型）のリストとして表現せよ．第4章の残りの問題では，ここで作ったプログラムを活用せよ．
 """
 
+# Parser shared by the remaining chapter-04 exercises.
+def parse_neko(path="neko.txt.mecab"):
+    """Read MeCab output and return it as a list of sentences.
+
+    Each sentence is a list of morpheme dicts with the keys ``surface``,
+    ``base``, ``pos`` and ``pos1``.  A sentence is closed whenever a
+    morpheme whose ``pos1`` is "句点" has been read.
+
+    Args:
+        path: MeCab output file to read.  Defaults to "neko.txt.mecab" in
+            the current working directory.
+
+    Returns:
+        A list of sentences in file order.  Morphemes that follow the last
+        "句点" are returned as a final sentence instead of being dropped.
+
+    Raises:
+        IndexError: if a morpheme line has fewer than seven comma-separated
+            features.
+    """
+    result = []
+    sentence = []
+
+    # Pin the encoding (the copy in knock34.py does the same) so decoding
+    # does not depend on the platform's locale.
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            # Strip the newline first: on a line with exactly seven features
+            # the base form is the last field and would otherwise end in "\n".
+            l1 = line.rstrip("\n").split("\t")
+            # Only "surface<TAB>features" lines split into two parts; lines
+            # without a tab (e.g. "EOS") are skipped.
+            if len(l1) == 2:
+                l2 = l1[1].split(",")
+                sentence.append({"surface": l1[0], "base": l2[6], "pos": l2[0], "pos1": l2[1]})
+                # A full stop ends the current sentence.
+                if l2[1] == "句点":
+                    result.append(sentence)
+                    sentence = []
+
+    # Keep a trailing fragment that was not terminated by a full stop.
+    if sentence:
+        result.append(sentence)
+
+    return result
+
+# Run the parser only when this file is executed as a script.  The other
+# exercises do `import knock30`; without this guard every such import would
+# parse the corpus and print the whole result as an import side effect.
+if __name__ == "__main__":
+    result = parse_neko()
+    print(result)
+"""
+出力結果
+ {'surface': 'ある', 'base': 'ある', 'pos': '動詞', 'pos1': '自立'},
+ {'surface': 'から', 'base': 'から', 'pos': '助詞', 'pos1': '接続助詞'}, 
+ {'surface': '」', 'base': '」', 'pos': '記号', 'pos1': '括弧閉'},
+"""
diff --git a/megumi/chapter04/knock31.py b/megumi/chapter04/knock31.py
@@ -0,0 +1,26 @@
+#31.動詞
+#動詞の表層形をすべて抽出せよ．
+
+import knock30
+result = knock30.parse_neko()
+
+# Collect the surface form of every verb across all sentences.
+# A set is used so that each form is kept only once.
+se = {
+    dic["surface"]
+    for lis in result
+    for dic in lis
+    if dic["pos"] == "動詞"
+}
+
+print(se)
+
+"""
+出力結果
+{'解せ', '抜こ', 'やい', 'ふくれ', 
+'察せ', '生かし', 'ときゃ', '気に入ら', 
+'引き立た', '磨る', '横切っ', 'さまし', 
+'洩らし', 'ゆるん', '怒っ', '死に', 
+'いらっしゃれ', '誘い出す', '凌い'
+"""
diff --git a/megumi/chapter04/knock32.py b/megumi/chapter04/knock32.py
@@ -0,0 +1,24 @@
+#32.動詞の基本形
+#動詞の基本形をすべて抽出せよ
+
+import knock30
+result = knock30.parse_neko()
+
+# Collect the base (dictionary) form of every verb; the set removes
+# duplicates.
+se = set()
+se.update(
+    dic["base"]
+    for lis in result
+    for dic in lis
+    if dic["pos"] == "動詞"
+)
+
+print(se)
+
+"""
+出力結果
+{'載せる', '供する', '試みる', 
+'引きあげる', 'くばる', '取り違える', 
+'漬ける', '振れる', '割る', '観る', 
+'砕ける', '見くびる', '払う', '知る', 
+'引き取る', 'あばれる', '感ずる', 
+'つとめる', '褒める', '喰う'
+"""
diff --git a/megumi/chapter04/knock33.py b/megumi/chapter04/knock33.py
@@ -0,0 +1,30 @@
+#33.「AのB」
+#2つの名詞が「の」で連結されている名詞句を抽出せよ．
+
+import knock30
+result = knock30.parse_neko()
+
+# Noun phrases of the form "A の B"; a set removes duplicate phrases.
+se = set()
+
+for line in result:
+    # Slide a three-morpheme window over the sentence.  zip() stops at the
+    # shortest slice, so the window never indexes past the end of the
+    # sentence, whatever its last morphemes are.
+    for first, particle, second in zip(line, line[1:], line[2:]):
+        if (
+            first["pos"] == "名詞"
+            and particle["surface"] == "の"
+            and second["pos"] == "名詞"
+        ):
+            se.add(first["surface"] + particle["surface"] + second["surface"])
+
+print(se)
+
+"""
+出力結果
+'初対面の人', '事蹟の三', '天地の間', 
+'下女の顔', 'ここの細君', '得意のよう', 
+'貧乏性の男', '君の悪口', '人の所有', 
+'屋の大将', '窮措大の家', '鼻の在所', 
+'馬鹿の相談', '吾輩のため'
+"""
diff --git a/megumi/chapter04/knock34.py b/megumi/chapter04/knock34.py
@@ -0,0 +1,67 @@
+#34.名詞の連接
+#名詞の連接（連続して出現する名詞）を最長一致で抽出せよ．
+#最長一致：ここでは、最も連続する名詞を指す。
+
+
+
+def parse_neko():
+    """Read "neko.txt.mecab" (UTF-8) and return a list of sentences.
+
+    Each sentence is a list of morpheme dicts with the keys ``surface``,
+    ``base``, ``pos`` and ``pos1``.  A sentence is closed when a morpheme
+    whose ``pos1`` is "句点" is read; morphemes after the last such
+    morpheme are not returned.
+    """
+    sentences = []
+    current = []
+
+    with open("neko.txt.mecab", encoding='utf-8') as f:
+        for raw in f:
+            columns = raw.split("\t")
+            # Only "surface<TAB>features" lines have exactly two columns.
+            if len(columns) != 2:
+                continue
+            surface, feature_text = columns
+            features = feature_text.split(",")
+            current.append({
+                "surface": surface,
+                "base": features[6],
+                "pos": features[0],
+                "pos1": features[1],
+            })
+            # A full stop ends the sentence being collected.
+            if features[1] == "句点":
+                sentences.append(current)
+                current = []
+
+    return sentences
+
+# Extract runs of consecutive nouns (longest match).
+def extract_longest_noun_sequences(parsed_text):
+    """Return every maximal run of two or more consecutive nouns.
+
+    Args:
+        parsed_text: list of sentences, each a list of morpheme dicts with
+            at least the keys ``pos`` and ``surface``.
+
+    Returns:
+        A list of strings, one per run, each being the concatenated surface
+        forms of the run.  Runs never span two sentences, and duplicates
+        are kept.
+    """
+    found = []
+
+    def flush(run):
+        # A single noun is not a "connection"; only keep runs of length >= 2.
+        if len(run) >= 2:
+            found.append("".join(run))
+
+    for sentence in parsed_text:
+        run = []
+        for morph in sentence:
+            if morph["pos"] != "名詞":
+                # A non-noun ends the current run.
+                flush(run)
+                run = []
+                continue
+            run.append(morph["surface"])
+        # The sentence may end while a run is still open.
+        flush(run)
+
+    return found
+
+# Load the parsed corpus, then pull out the noun runs.
+parsed_text = parse_neko()
+longest_noun_sequences = extract_longest_noun_sequences(parsed_text)
+
+# Print one noun run per line.
+for noun_run in longest_noun_sequences:
+    print(noun_run)
+
+"""
+出力結果
+——おい苦沙弥先生
+独仙君
+万年漬
+後ろ向
+迷亭君
+独仙君
+東風君
+寒月君
+"""
diff --git a/megumi/chapter04/knock35.py b/megumi/chapter04/knock35.py
@@ -0,0 +1,26 @@
+#35.単語の出現頻度
+#文章中に出現する単語とその出現頻度を求め，出現頻度の高い順に並べよ．
+
+import knock30
+from collections import Counter
+
+# Parsed sentences produced by the knock30 parser.
+result = knock30.parse_neko()
+
+# Flatten all sentences into one list of surface forms.
+words = [dic["surface"] for line in result for dic in line]
+
+# Count the occurrences and order them from most to least frequent.
+word_counter = Counter(words)
+sorted_word_freq = word_counter.most_common()
+
+# Only the first five entries of the sorted list are printed.
+print("出現頻度の高い単語トップ5:")
+for word, freq in sorted_word_freq[:5]:
+    print(f"{word}: {freq}")
diff --git a/megumi/chapter04/knock36.py b/megumi/chapter04/knock36.py
@@ -0,0 +1,34 @@
+#36.頻度上位10語
+#出現頻度が高い10語とその出現頻度をグラフ（例えば棒グラフなど）で表示せよ．
+
+import knock30
+from collections import Counter
+import matplotlib.pyplot as plt
+
+# Parsed sentences produced by the knock30 parser.
+result = knock30.parse_neko()
+
+# Flatten all sentences into one list of surface forms.
+words = [dic["surface"] for line in result for dic in line]
+
+# Count the occurrences and keep the ten most frequent entries.
+word_counter = Counter(words)
+sorted_word_freq = word_counter.most_common(10)
+
+# Split the (word, count) pairs into two parallel tuples for plotting.
+# Note that this rebinds `words` to the top-10 words.
+words, freqs = zip(*sorted_word_freq)
+
+# Bar chart: one bar per word, bar height = frequency.
+plt.figure(figsize=(10, 6))
+plt.bar(words, freqs, color='skyblue')
+plt.xlabel('単語')
+plt.ylabel('出現頻度')
+plt.title('出現頻度が高い単語トップ10')
+plt.xticks(rotation=45)
+plt.show()
diff --git a/megumi/chapter04/knock37.py b/megumi/chapter04/knock37.py
@@ -0,0 +1,46 @@
+#37.「猫」と共起頻度の高い上位10語
+#「猫」とよく共起する（共起頻度が高い）10語とその出現頻度をグラフ（例えば棒グラフなど）で表示せよ．
+
+import MeCab
+from collections import Counter
+import matplotlib.pyplot as plt
+
+# Create the morphological analyzer.
+mecab = MeCab.Tagger()
+
+# Read the raw text.
+with open('neko.txt', 'r', encoding='utf-8') as f:
+    text = f.read()
+
+# Walk the node list returned by the analyzer and collect the surface form
+# of every node except the BOS/EOS markers.
+words = []
+node = mecab.parseToNode(text)
+while node:
+    features = node.feature.split(',')
+    if features[0] != 'BOS/EOS':
+        words.append(node.surface)
+    node = node.next
+
+# Co-occurrence here means "directly adjacent": for every occurrence of
+# 「猫」, count the word just before it and the word just after it.
+co_occurrence_counter = Counter()
+last_index = len(words) - 1
+for i, word in enumerate(words):
+    if word != "猫":
+        continue
+    if i > 0:
+        co_occurrence_counter[words[i-1]] += 1
+    if i < last_index:
+        co_occurrence_counter[words[i+1]] += 1
+
+# Keep the ten most frequent neighbours.
+sorted_co_occurrence = co_occurrence_counter.most_common(10)
+
+# Split the (word, count) pairs into two parallel tuples for plotting.
+co_words, co_freqs = zip(*sorted_co_occurrence)
+
+# Bar chart: one bar per neighbouring word.
+plt.figure(figsize=(10, 6))
+plt.bar(co_words, co_freqs, color='skyblue')
+plt.xlabel('単語')
+plt.ylabel('共起頻度')
+plt.title('「猫」と共起頻度が高い単語トップ10')
+plt.xticks(rotation=45)
+plt.show()
diff --git a/megumi/chapter04/knock38.py b/megumi/chapter04/knock38.py
@@ -0,0 +1,42 @@
+#38.ヒストグラム
+#単語の出現頻度のヒストグラムを描け．
+# ただし，横軸は出現頻度を表し，1から単語の出現頻度の最大値までの線形目盛とする．
+# 縦軸はx軸で示される出現頻度となった単語の異なり数（種類数）である．
+
+import matplotlib.pyplot as plt
+from collections import Counter
+import knock30  # knock30.pyファイルをインポート
+
+# Parsed sentences produced by the knock30 parser.
+result = knock30.parse_neko()
+
+# Flatten all sentences into one list of surface forms.
+words = [dic["surface"] for line in result for dic in line]
+
+# Count how often each word occurs.
+word_counter = Counter(words)
+
+# One frequency value per distinct word.
+frequencies = list(word_counter.values())
+
+# Largest frequency; used as the right end of the x axis.
+max_frequency = max(frequencies)
+
+# For each frequency value, the number of distinct words that have it.
+frequency_counts = Counter(frequencies)
+
+# Materialize the data as sorted lists instead of passing dict views.
+xs = sorted(frequency_counts)
+ys = [frequency_counts[x] for x in xs]
+
+# Bar per frequency value: x = frequency, height = number of distinct words.
+plt.figure(figsize=(10, 6))
+plt.bar(xs, ys, edgecolor='black')
+plt.xlabel('出現頻度')
+plt.ylabel('単語の種類数')
+plt.title('単語の出現頻度のヒストグラム')
+# Linear x axis covering 1 .. max_frequency.  Only the limits are set and
+# matplotlib picks the tick positions: forcing one labelled tick per
+# integer (range(1, max_frequency + 1)) makes the axis unreadable and the
+# draw very slow when the maximum frequency is large.
+plt.xlim(0.5, max_frequency + 0.5)
+plt.show()
+
+
+
+
+
+
diff --git a/megumi/chapter04/knock39.py b/megumi/chapter04/knock39.py
@@ -0,0 +1,36 @@
+#39. Zipfの法則
+#単語の出現頻度順位を横軸，
+# その出現頻度を縦軸として，両対数グラフをプロットせよ
+
+import matplotlib.pyplot as plt
+from collections import Counter
+import knock30  # knock30.pyファイルをインポート
+
+# Parsed sentences produced by the knock30 parser.
+result = knock30.parse_neko()
+
+# Flatten all sentences into one list of surface forms.
+words = [dic["surface"] for line in result for dic in line]
+
+# Count how often each word occurs.
+word_counter = Counter(words)
+
+# Frequencies in descending order, so position 0 belongs to rank 1.
+frequencies = sorted(word_counter.values(), reverse=True)
+
+# Ranks 1..N matching the sorted frequencies.
+ranks = range(1, len(frequencies) + 1)
+
+# Log-log scatter of frequency against rank (Zipf plot).
+plt.figure(figsize=(10, 6))
+plt.loglog(ranks, frequencies, marker="o", linestyle="none")
+plt.xlabel('出現頻度順位')
+plt.ylabel('出現頻度')
+plt.title('単語の出現頻度順位と出現頻度の両対数グラフ')
+plt.grid(True, which="both", ls="--", lw=0.5)
+plt.show()