Merge pull request #120 from tmu-nlp/megumi
chapter07
Showing 19 changed files with 256,316 additions and 1 deletion.
knock30.py
@@ -1,7 +1,38 @@
# 30. Reading the morphological analysis result
"""
Implement a program that reads the morphological analysis result (neko.txt.mecab).
Store each morpheme in a mapping whose keys are surface form (surface), base form (base),
part of speech (pos), and part-of-speech subdivision 1 (pos1), and represent one sentence
as a list of morphemes (mappings). Use this program for the remaining problems of Chapter 4.
"""

# Define the function
def parse_neko():
    result = []
    sentence = []

    # Open the morphologically analyzed file and read it line by line
    with open("neko.txt.mecab", encoding="utf-8") as f:
        for line in f:
            # Split the line on the tab (a list with two elements is returned)
            l1 = line.split("\t")
            # If l1 has two elements, split l1[1] on commas
            if len(l1) == 2:
                l2 = l1[1].split(",")
                # As the problem requires, build a dict with the four keys and append it to sentence
                sentence.append({"surface": l1[0], "base": l2[6], "pos": l2[0], "pos1": l2[1]})
                # When a full stop (。) appears, append the dicts collected in sentence to result
                if l2[1] == "句点":
                    result.append(sentence)
                    sentence = []

    return result

# Call the function (guarded so the print does not run when later problems import this module)
if __name__ == "__main__":
    result = parse_neko()
    print(result)
"""
Output (excerpt):
{'surface': 'ある', 'base': 'ある', 'pos': '動詞', 'pos1': '自立'},
{'surface': 'から', 'base': 'から', 'pos': '助詞', 'pos1': '接続助詞'},
{'surface': '」', 'base': '」', 'pos': '記号', 'pos1': '括弧閉'},
"""
knock31.py
@@ -0,0 +1,26 @@
# 31. Verbs
# Extract every surface form of a verb.

import knock30
result = knock30.parse_neko()

# Create an empty set
se = set()

# Iterate over the result built in problem 30
for lis in result:
    for dic in lis:
        # If the part of speech is a verb, add its surface form to the set
        if dic["pos"] == "動詞":
            se.add(dic["surface"])

print(se)

"""
Output (excerpt):
{'解せ', '抜こ', 'やい', 'ふくれ',
'察せ', '生かし', 'ときゃ', '気に入ら',
'引き立た', '磨る', '横切っ', 'さまし',
'洩らし', 'ゆるん', '怒っ', '死に',
'いらっしゃれ', '誘い出す', '凌い'
"""
knock32.py
@@ -0,0 +1,24 @@
# 32. Base forms of verbs
# Extract every base form of a verb.

import knock30
result = knock30.parse_neko()

se = set()

for lis in result:
    for dic in lis:
        if dic["pos"] == "動詞":
            se.add(dic["base"])

print(se)

"""
Output (excerpt):
{'載せる', '供する', '試みる',
'引きあげる', 'くばる', '取り違える',
'漬ける', '振れる', '割る', '観る',
'砕ける', '見くびる', '払う', '知る',
'引き取る', 'あばれる', '感ずる',
'つとめる', '褒める', '喰う
"""
knock33.py
@@ -0,0 +1,30 @@
# 33. 「AのB」
# Extract all noun phrases in which two nouns are joined by 「の」.

import knock30
result = knock30.parse_neko()

# Create an empty set
se = set()

# Iterate over the result built in problem 30
for line in result:
    # Stop two short of the end so that line[i + 2] never runs past the sentence
    for i in range(len(line) - 2):
        # Use boolean operators (not, and, or) for the condition
        if line[i]["pos"] == "名詞" and line[i + 1]["surface"] == "の" and line[i + 2]["pos"] == "名詞":
            # Add matching phrases to the set to avoid duplicates
            se.add(line[i]["surface"] + line[i + 1]["surface"] + line[i + 2]["surface"])

print(se)

"""
Output (excerpt):
'初対面の人', '事蹟の三', '天地の間',
'下女の顔', 'ここの細君', '得意のよう',
'貧乏性の男', '君の悪口', '人の所有',
'屋の大将', '窮措大の家', '鼻の在所',
'馬鹿の相談', '吾輩のため'
"""
knock34.py
@@ -0,0 +1,67 @@
# 34. Noun concatenation
# Extract the longest runs of nouns (nouns that appear consecutively).
# "Longest match" here means the longest consecutive run of nouns.

# Same parser as in knock30 (re-defined here instead of imported)
def parse_neko():
    result = []
    sentence = []

    with open("neko.txt.mecab", encoding='utf-8') as f:
        for line in f:
            l1 = line.split("\t")
            if len(l1) == 2:
                l2 = l1[1].split(",")
                morph = {
                    "surface": l1[0],
                    "base": l2[6],
                    "pos": l2[0],
                    "pos1": l2[1]
                }
                sentence.append(morph)
                if l2[1] == "句点":
                    result.append(sentence)
                    sentence = []

    return result

# Define a function that extracts the longest runs of nouns
def extract_longest_noun_sequences(parsed_text):
    longest_sequences = []

    for sentence in parsed_text:
        current_sequence = []
        for morph in sentence:
            if morph["pos"] == "名詞":
                current_sequence.append(morph["surface"])
            else:
                if len(current_sequence) > 1:
                    longest_sequences.append("".join(current_sequence))
                current_sequence = []
        # Flush a run that reaches the end of the sentence
        if len(current_sequence) > 1:
            longest_sequences.append("".join(current_sequence))

    return longest_sequences

# Run the morphological analysis
parsed_text = parse_neko()

# Extract the longest runs of nouns
longest_noun_sequences = extract_longest_noun_sequences(parsed_text)

# Display the extracted results
for sequence in longest_noun_sequences:
    print(sequence)

"""
Output (excerpt):
——おい苦沙弥先生
独仙君
万年漬
後ろ向
迷亭君
独仙君
東風君
寒月君
"""
knock35.py
@@ -0,0 +1,26 @@
# 35. Word frequencies
# Compute the words that appear in the text and their frequencies, and list them in descending order of frequency.

import knock30
from collections import Counter

# Get the morphological analysis result
result = knock30.parse_neko()

# Collect all surface forms into a list for counting
words = []

for line in result:
    for dic in line:
        words.append(dic["surface"])

# Count the word frequencies
word_counter = Counter(words)

# Sort in descending order of frequency
sorted_word_freq = word_counter.most_common()

# Display the words and their frequencies in descending order
print("出現頻度の高い単語トップ5:")
for word, freq in sorted_word_freq[:5]:
    print(f"{word}: {freq}")
knock36.py
@@ -0,0 +1,34 @@
# 36. Top 10 words by frequency
# Display the 10 most frequent words and their frequencies in a graph (e.g. a bar chart).

import knock30
from collections import Counter
import matplotlib.pyplot as plt

# Get the morphological analysis result
result = knock30.parse_neko()

# Collect all surface forms into a list for counting
words = []

for line in result:
    for dic in line:
        words.append(dic["surface"])

# Count the word frequencies
word_counter = Counter(words)

# Take the 10 most frequent words
sorted_word_freq = word_counter.most_common(10)

# Unzip into the top 10 words and their frequencies
words, freqs = zip(*sorted_word_freq)

# Draw the graph (the Japanese labels need a Japanese-capable font; see the note below)
plt.figure(figsize=(10, 6))
plt.bar(words, freqs, color='skyblue')
plt.xlabel('単語')
plt.ylabel('出現頻度')
plt.title('出現頻度が高い単語トップ10')
plt.xticks(rotation=45)
plt.show()
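One caveat: matplotlib's default fonts usually lack Japanese glyphs, so the labels above may render as empty boxes. A sketch of one common workaround, assuming the third-party japanize-matplotlib package is installed (pip install japanize-matplotlib); the toy data exists only to show that Japanese text renders:

import japanize_matplotlib  # assumption: registers a Japanese-capable font on import
import matplotlib.pyplot as plt

plt.bar(["例"], [1])  # toy data, not from neko.txt
plt.title("日本語ラベルの表示テスト")
plt.show()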
knock37.py
@@ -0,0 +1,46 @@
# 37. Top 10 words co-occurring with 「猫」
# Display the 10 words that co-occur most frequently with 「猫」 and their
# frequencies in a graph (e.g. a bar chart).

import MeCab
from collections import Counter
import matplotlib.pyplot as plt

# Initialize the morphological analyzer
mecab = MeCab.Tagger()

# Read the text data
with open('neko.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Run morphological analysis and split the text into words
node = mecab.parseToNode(text)
words = []
while node:
    if node.feature.split(',')[0] != 'BOS/EOS':
        words.append(node.surface)
    node = node.next

# Count words that co-occur with 「猫」
# (co-occurrence is taken here as the immediately adjacent words)
co_occurrence_counter = Counter()
for i, word in enumerate(words):
    if word == "猫":
        # Count the words just before and just after 「猫」
        if i > 0:
            co_occurrence_counter[words[i-1]] += 1
        if i < len(words) - 1:
            co_occurrence_counter[words[i+1]] += 1

# Sort by co-occurrence frequency
sorted_co_occurrence = co_occurrence_counter.most_common(10)

# Get the top 10 co-occurring words and their frequencies
co_words, co_freqs = zip(*sorted_co_occurrence)

# Draw the graph
plt.figure(figsize=(10, 6))
plt.bar(co_words, co_freqs, color='skyblue')
plt.xlabel('単語')
plt.ylabel('共起頻度')
plt.title('「猫」と共起頻度が高い単語トップ10')
plt.xticks(rotation=45)
plt.show()
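The adjacent-word window above is only one reading of "co-occurrence"; a more common reading for this exercise is sentence-level co-occurrence. A sketch under that interpretation, reusing the knock30 parser so the counts stay comparable with the other problems:

import knock30
from collections import Counter

counter = Counter()
for sentence in knock30.parse_neko():
    surfaces = [m["surface"] for m in sentence]
    if "猫" in surfaces:
        # count every other token in each sentence that contains 「猫」
        counter.update(s for s in surfaces if s != "猫")
print(counter.most_common(10))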
knock38.py
@@ -0,0 +1,42 @@
# 38. Histogram
# Draw a histogram of word frequencies.
# The x axis shows frequency, as a linear scale from 1 to the maximum word frequency.
# The y axis is the number of distinct word types that have the frequency shown on the x axis.

import matplotlib.pyplot as plt
from collections import Counter
import knock30  # import the knock30.py file

# Get the morphological analysis result
result = knock30.parse_neko()

# Collect all surface forms into a list for counting
words = []

for line in result:
    for dic in line:
        words.append(dic["surface"])

# Count the word frequencies
word_counter = Counter(words)

# Build the list of frequencies
frequencies = list(word_counter.values())

# Get the maximum frequency
max_frequency = max(frequencies)

# Count the number of word types for each frequency
frequency_counts = Counter(frequencies)

# Draw the histogram
plt.figure(figsize=(10, 6))
plt.bar(list(frequency_counts.keys()), list(frequency_counts.values()), edgecolor='black')
plt.xlabel('出現頻度')
plt.ylabel('単語の種類数')
plt.title('単語の出現頻度のヒストグラム')
# Keep the x axis linear from 1 to the maximum, but let matplotlib pick the ticks:
# plt.xticks(range(1, max_frequency + 1)) would try to draw thousands of tick labels.
plt.xlim(1, max_frequency)
plt.show()
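Because most word types occur only once, the bar at frequency 1 dwarfs everything else. A sketch of the same histogram with a log-scaled y axis, which keeps the linear x axis the problem asks for while making the tail visible:

import knock30
from collections import Counter
import matplotlib.pyplot as plt

word_counter = Counter(d["surface"] for line in knock30.parse_neko() for d in line)
frequency_counts = Counter(word_counter.values())

plt.bar(list(frequency_counts.keys()), list(frequency_counts.values()), edgecolor='black')
plt.yscale('log')  # most types occur once, so log y reveals the long tail
plt.xlabel('出現頻度')
plt.ylabel('単語の種類数')
plt.show()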
knock39.py
@@ -0,0 +1,36 @@
# 39. Zipf's law
# Plot a log-log graph with word frequency rank on the x axis
# and word frequency on the y axis.

import matplotlib.pyplot as plt
from collections import Counter
import knock30  # import the knock30.py file

# Get the morphological analysis result
result = knock30.parse_neko()

# Collect all surface forms into a list for counting
words = []

for line in result:
    for dic in line:
        words.append(dic["surface"])

# Count the word frequencies
word_counter = Counter(words)

# Build the list of frequencies and sort in descending order
frequencies = list(word_counter.values())
frequencies.sort(reverse=True)

# Build the frequency ranks (starting from 1)
ranks = range(1, len(frequencies) + 1)

# Draw the log-log plot
plt.figure(figsize=(10, 6))
plt.loglog(ranks, frequencies, marker="o", linestyle="none")
plt.xlabel('出現頻度順位')
plt.ylabel('出現頻度')
plt.title('単語の出現頻度順位と出現頻度の両対数グラフ')
plt.grid(True, which="both", ls="--", lw=0.5)
plt.show()
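Zipf's law predicts frequency proportional to 1 / rank^s with s close to 1, i.e. a straight line of slope -s on the log-log plot. A sketch of estimating that slope by least squares on the log-transformed data, assuming numpy is available:

import numpy as np
from collections import Counter
import knock30

word_counter = Counter(d["surface"] for line in knock30.parse_neko() for d in line)
frequencies = sorted(word_counter.values(), reverse=True)
ranks = np.arange(1, len(frequencies) + 1)

# Fit log(freq) = intercept + slope * log(rank); Zipf predicts slope near -1
slope, intercept = np.polyfit(np.log(ranks), np.log(frequencies), 1)
print(f"estimated Zipf exponent: {-slope:.2f}")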