-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #66 from tmu-nlp/ning
add chapter04
- Loading branch information
Showing
10 changed files
with
412 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
file_path = 'neko.txt.mecab'
output_path = 'neko_morpheme.txt'


def parse_sentences(lines):
    """Parse MeCab output lines into sentences of morpheme dicts.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        A list of sentences; each sentence is a list of dicts with keys
        'surface', 'base', 'pos', 'pos1'.
    """
    sentences = []
    sentence = []
    for line in lines:
        line = line.strip()
        if line == 'EOS':
            # Sentence boundary: flush the morphemes collected so far.
            if sentence:
                sentences.append(sentence)
                sentence = []
            continue
        if '\t' not in line:
            # Skip lines without the surface/feature separator.
            continue
        # Extract and store the morpheme information.
        # maxsplit=1 tolerates a stray tab inside the feature string.
        surface, details = line.split('\t', 1)
        details = details.split(',')
        sentence.append({
            'surface': surface,
            'base': details[6] if len(details) > 6 else '*',
            'pos': details[0],
            'pos1': details[1],
        })
    return sentences


def main():
    """Read the MeCab file and write one formatted line per morpheme."""
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = parse_sentences(file)

    # Output
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for sentence in sentences:
            for morph in sentence:
                out_file.write(f"surface: {morph['surface']}, base: {morph['base']}, pos: {morph['pos']}, pos1: {morph['pos1']}\n")
            out_file.write("\n")


if __name__ == '__main__':
    main()
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
file_path = 'neko.txt.mecab'
output_file_path = 'verbs.txt'


def extract_verb_surfaces(lines):
    """Collect the surface forms of all verbs from MeCab output lines.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        List of verb surface forms in order of appearance.
    """
    # NOTE: the original kept a per-sentence list that was never populated;
    # no sentence state is needed for this extraction, so it was removed.
    verbs = []
    for line in lines:
        line = line.strip()
        # Skip sentence boundaries and lines without a surface/feature separator.
        if line == 'EOS' or '\t' not in line:
            continue
        surface, details = line.split('\t', 1)
        pos = details.split(',')[0]
        # Extract the surface form of verbs.
        if pos == '動詞':
            verbs.append(surface)
    return verbs


def main():
    """Extract verb surfaces from the corpus and write them one per line."""
    with open(file_path, 'r', encoding='utf-8') as file:
        verbs = extract_verb_surfaces(file)

    # Write to a file for easier viewing.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        for verb in verbs:
            out_file.write(verb + '\n')


if __name__ == '__main__':
    main()
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
file_path = 'neko.txt.mecab'
output_file_path = 'verbs_base.txt'


def extract_verb_bases(lines):
    """Collect the base (dictionary) forms of all verbs from MeCab output.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        List of verb base forms in order of appearance ('*' when the
        feature string has no base-form field).
    """
    # NOTE: the original kept a per-sentence list that was never populated;
    # no sentence state is needed for this extraction, so it was removed.
    verbs_base = []
    for line in lines:
        line = line.strip()
        # Skip sentence boundaries and lines without a surface/feature separator.
        if line == 'EOS' or '\t' not in line:
            continue
        surface, details = line.split('\t', 1)
        details = details.split(',')
        # Extract the base form of verbs.
        if details[0] == '動詞':
            verbs_base.append(details[6] if len(details) > 6 else '*')
    return verbs_base


def main():
    """Extract verb base forms from the corpus and write them one per line."""
    with open(file_path, 'r', encoding='utf-8') as file:
        verbs_base = extract_verb_bases(file)

    # Write the results to a file.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        for verb_base in verbs_base:
            out_file.write(verb_base + '\n')


if __name__ == '__main__':
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
file_path = 'neko.txt.mecab'
output_file_path = 'noun_phrases.txt'


def extract_noun_phrases(lines):
    """Extract "noun + の + noun" phrases from MeCab output lines.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        List of concatenated phrases (e.g. "彼の掌") in order of appearance.
    """
    phrases = []
    sentence = []
    for line in lines:
        line = line.strip()
        if line == 'EOS':
            # Sentence boundary: scan the finished sentence for noun-の-noun triples.
            for prev_m, cur_m, next_m in zip(sentence, sentence[1:], sentence[2:]):
                if (prev_m['pos'] == '名詞'
                        and cur_m['surface'] == 'の'
                        and next_m['pos'] == '名詞'):
                    phrases.append(prev_m['surface'] + cur_m['surface'] + next_m['surface'])
            sentence = []
            continue
        if '\t' not in line:
            # Skip lines without the surface/feature separator.
            continue
        # Extract the morpheme information (only surface and POS are needed here).
        surface, details = line.split('\t', 1)
        sentence.append({'surface': surface, 'pos': details.split(',')[0]})
    return phrases


def main():
    """Extract noun-の-noun phrases from the corpus and write them one per line."""
    with open(file_path, 'r', encoding='utf-8') as file:
        phrases = extract_noun_phrases(file)

    # Write the results to a file.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        for phrase in phrases:
            out_file.write(phrase + '\n')


if __name__ == '__main__':
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
def extract_noun_sequences(lines):
    """Extract maximal runs of two or more consecutive nouns from MeCab output.

    A run never crosses a sentence boundary (EOS) and is only kept when it
    contains more than one noun, matching the original script's behavior.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        List of concatenated noun runs in order of appearance.
    """
    sequences = []
    current = []  # surfaces of the noun run currently being built
    for line in lines:
        line = line.strip()
        if line == 'EOS':
            # A sentence boundary also terminates any open noun run.
            if len(current) > 1:
                sequences.append(''.join(current))
            current = []
            continue
        if '\t' not in line:
            # Skip lines without the surface/feature separator.
            continue
        surface, details = line.split('\t', 1)
        if details.split(',')[0] == '名詞':
            current.append(surface)
        else:
            # Non-noun ends the run; keep it only if it has 2+ nouns.
            if len(current) > 1:
                sequences.append(''.join(current))
            current = []
    return sequences


def main():
    """Extract noun runs from the corpus and write them one per line."""
    with open("neko.txt.mecab", 'r', encoding='utf-8') as file:
        noun_sequences = extract_noun_sequences(file)

    # Write the results to a file.
    with open("longest_noun_sequences.txt", 'w', encoding='utf-8') as out_file:
        for sequence in noun_sequences:
            out_file.write(sequence + '\n')


if __name__ == '__main__':
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
from collections import Counter

file_path = 'neko.txt.mecab'
output_file_path = 'word_frequencies.txt'


def collect_base_forms(lines):
    """Return one word per morpheme from MeCab output lines.

    Uses the base (dictionary) form when the feature string provides one,
    otherwise falls back to the surface form.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        List of words in order of appearance.
    """
    words = []
    for line in lines:
        line = line.strip()
        # Skip sentence boundaries and lines without a surface/feature separator.
        if line == 'EOS' or '\t' not in line:
            continue
        surface, details = line.split('\t', 1)
        details = details.split(',')
        # Base form if present, surface form otherwise.
        words.append(details[6] if len(details) > 6 else surface)
    return words


def main():
    """Count word frequencies in the corpus and write them sorted descending."""
    with open(file_path, 'r', encoding='utf-8') as file:
        words = collect_base_forms(file)

    # Count occurrences and order from most to least frequent.
    sorted_word_counts = Counter(words).most_common()

    # Write the results to a file.
    with open(output_file_path, 'w', encoding='utf-8') as out_file:
        for word, count in sorted_word_counts:
            out_file.write(f"{word}\t{count}\n")


if __name__ == '__main__':
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Exclude punctuation and other symbols, and also particles and
# auxiliary verbs, from the frequency count.

from collections import Counter

file_path = 'neko.txt.mecab'

# POS tags excluded from the count.
EXCLUDED_POS = ('記号', '助詞', '助動詞')


def collect_words(lines):
    """Return one word per content morpheme from MeCab output lines.

    BUG FIX: the original counted every morpheme even though its header
    comment promised to exclude symbols, particles and auxiliary verbs;
    the EXCLUDED_POS filter below implements that promise.

    Args:
        lines: iterable of MeCab-format lines ("surface\\tfeatures" or "EOS").

    Returns:
        List of base forms (surface form when no base is available).
    """
    words = []
    for line in lines:
        line = line.strip()
        # Skip sentence boundaries and lines without a surface/feature separator.
        if line == 'EOS' or '\t' not in line:
            continue
        surface, details = line.split('\t', 1)
        details = details.split(',')
        if details[0] in EXCLUDED_POS:
            continue
        words.append(details[6] if len(details) > 6 else surface)
    return words


def main():
    """Plot a bar chart of the ten most frequent content words."""
    # Import plotting libraries lazily so the parsing helper stays importable
    # even where matplotlib / japanize_matplotlib are not installed.
    import matplotlib.pyplot as plt
    import japanize_matplotlib  # noqa: F401  (registers Japanese fonts)

    with open(file_path, 'r', encoding='utf-8') as file:
        words = collect_words(file)

    # Ten most frequent words, in descending order.
    top_words = Counter(words).most_common(10)
    labels, counts = zip(*top_words)

    # Build the bar chart.
    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts, color='skyblue')
    plt.xlabel('単語')
    plt.ylabel('出現頻度')
    plt.title('出現頻度が高い10語')
    plt.xticks(rotation=45)
    plt.tight_layout()

    # Show the chart.
    plt.show()


if __name__ == '__main__':
    main()
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
import matplotlib.pyplot as plt | ||
from collections import Counter | ||
import re | ||
import japanize_matplotlib | ||
|
||
def read_mecab(file_path):
    """Read a MeCab-format file and return a list of sentences.

    Each sentence is a list of morpheme dicts with keys
    'surface', 'base', 'pos', 'pos1'.

    BUG FIX: the original split every non-EOS line on '\\t' and indexed
    features[6] unconditionally, raising ValueError/IndexError on blank or
    malformed lines; such lines are now skipped, and a missing base-form
    field falls back to '*' (matching the sibling scripts).

    Args:
        file_path: path to the MeCab output file (EOS-terminated sentences).

    Returns:
        list[list[dict]]: parsed sentences.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line == 'EOS\n':
                # Sentence boundary: flush the accumulated morphemes.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif '\t' in line:  # skip malformed lines instead of crashing
                surface, feature = line.split('\t', 1)
                features = feature.split(',')
                morph = {
                    'surface': surface,
                    'base': features[6] if len(features) > 6 else '*',
                    'pos': features[0],
                    'pos1': features[1],
                }
                sentence.append(morph)
    return sentences
|
||
# Exclude symbols, particles and auxiliary verbs, and count the words
# that co-occur with the cat (target word).
def count_cooccurrences(sentences, target_word):
    """Count surfaces appearing in sentences that contain *target_word*.

    The target word itself and morphemes tagged 記号/助詞/助動詞 are excluded.

    Args:
        sentences: list of sentences (lists of morph dicts with 'surface'/'pos').
        target_word: surface form whose co-occurring words are counted.

    Returns:
        collections.Counter mapping surface -> co-occurrence count.
    """
    excluded = ('記号', '助詞', '助動詞')
    counts = Counter()
    for sent in sentences:
        # Only sentences that actually contain the target word contribute.
        if not any(morph['surface'] == target_word for morph in sent):
            continue
        counts.update(
            morph['surface']
            for morph in sent
            if morph['surface'] != target_word and morph['pos'] not in excluded
        )
    return counts
|
||
def main():
    """Load the corpus, count words co-occurring with 「猫」, and plot the top 10."""
    file_path = 'neko.txt.mecab'
    sentences = read_mecab(file_path)
    cooccurrence_counter = count_cooccurrences(sentences, '猫')

    # Extract the ten most frequent co-occurring words.
    most_common_words = cooccurrence_counter.most_common(10)
    words, counts = zip(*most_common_words)

    plt.figure(figsize=(10, 5))
    plt.bar(words, counts)
    plt.xlabel('単語')
    plt.ylabel('出現頻度')
    plt.title('「猫」と共起する単語の出現頻度上位10')
    plt.xticks(rotation=45)
    plt.show()


# Entry guard so importing this module no longer triggers file I/O and plotting.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Only words with frequency below 30 are plotted: including everything
# would hide the differences in the distribution.

from collections import Counter

file_path = 'neko.txt.mecab'

# POS tags excluded from the count.
EXCLUDED_POS = ('記号', '助詞', '助動詞')


def read_sentences(path):
    """Read a MeCab-format file into a list of sentences of morph dicts.

    BUG FIX: the original split every non-EOS line on '\\t' and indexed
    features[6] unconditionally, raising ValueError/IndexError on blank or
    malformed lines; such lines are now skipped and a missing base-form
    field falls back to '*'.
    """
    sentences = []
    sentence = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if line == 'EOS\n':
                # Sentence boundary: flush the accumulated morphemes.
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif '\t' in line:  # skip malformed lines instead of crashing
                surface, feature = line.split('\t', 1)
                features = feature.split(',')
                sentence.append({
                    'surface': surface,
                    'base': features[6] if len(features) > 6 else '*',
                    'pos': features[0],
                    'pos1': features[1],
                })
    return sentences


def count_content_words(sentences):
    """Count surface frequencies, skipping symbols, particles and auxiliary verbs."""
    counter = Counter()
    for sentence in sentences:
        for morph in sentence:
            if morph['pos'] not in EXCLUDED_POS:
                counter[morph['surface']] += 1
    return counter


def main():
    """Plot a histogram of word frequencies below 30."""
    # Import plotting libraries lazily so the parsing helpers stay importable
    # even where matplotlib / japanize_matplotlib are not installed.
    import matplotlib.pyplot as plt
    import japanize_matplotlib  # noqa: F401  (registers Japanese fonts)

    word_counter = count_content_words(read_sentences(file_path))

    # Keep only the words whose frequency is below 30.
    frequencies = [freq for freq in word_counter.values() if freq < 30]

    # Build the histogram.
    plt.figure(figsize=(10, 5))
    plt.hist(frequencies, bins=range(1, 30), edgecolor='black', align='left')
    plt.xlabel('出現頻度')
    plt.ylabel('単語の種類数')
    plt.title('出現頻度ヒストグラム')
    plt.show()


if __name__ == '__main__':
    main()
|
Oops, something went wrong.