Skip to content

Commit

Permalink
Merge pull request #66 from tmu-nlp/ning
Browse files Browse the repository at this point in the history
add chapter04
  • Loading branch information
d36n authored May 20, 2024
2 parents 03ae7a2 + fcf94aa commit 78263df
Show file tree
Hide file tree
Showing 10 changed files with 412 additions and 0 deletions.
39 changes: 39 additions & 0 deletions ning/chapter04/knock30.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# NLP100 knock 30: parse MeCab output (neko.txt.mecab) into morpheme
# dicts and dump them to a readable text file.


def parse_mecab(file_path):
    """Parse a MeCab-formatted file into sentences.

    Each sentence is a list of morpheme dicts with keys
    'surface', 'base', 'pos' and 'pos1'.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'EOS':  # sentence boundary marker emitted by MeCab
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if '\t' not in line:  # skip blank or irregular lines
                continue
            # Extract and store the morpheme information.
            surface, details = line.split('\t', 1)  # maxsplit guards extra tabs
            fields = details.split(',')
            sentence.append({
                'surface': surface,
                'base': fields[6] if len(fields) > 6 else '*',  # unknown words may lack a base form
                'pos': fields[0],
                'pos1': fields[1],
            })
    return sentences


def write_morphemes(sentences, output_path):
    """Write one line per morpheme, with a blank line between sentences."""
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for sentence in sentences:
            for morph in sentence:
                out_file.write(f"surface: {morph['surface']}, base: {morph['base']}, pos: {morph['pos']}, pos1: {morph['pos1']}\n")
            out_file.write("\n")


if __name__ == '__main__':
    write_morphemes(parse_mecab('neko.txt.mecab'), 'neko_morpheme.txt')





35 changes: 35 additions & 0 deletions ning/chapter04/knock31.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# NLP100 knock 31: extract the surface forms of all verbs.
# Note: the original kept a `sentence` list that was never appended to
# (its `if sentence:` branch was unreachable) — that dead bookkeeping is
# removed; the produced output is unchanged.


def extract_verb_surfaces(file_path):
    """Return the surface forms of all verbs (動詞) found in *file_path*."""
    verbs = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # EOS markers and irregular lines carry no morpheme info.
            if line == 'EOS' or '\t' not in line:
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            # Collect the surface form of each verb.
            if fields[0] == '動詞':
                verbs.append(surface)
    return verbs


if __name__ == '__main__':
    # Write one verb per line for easy reading.
    with open('verbs.txt', 'w', encoding='utf-8') as out_file:
        for verb in extract_verb_surfaces('neko.txt.mecab'):
            out_file.write(verb + '\n')


34 changes: 34 additions & 0 deletions ning/chapter04/knock32.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# NLP100 knock 32: extract the base forms of all verbs.
# Note: the original kept a `sentence` list that was never appended to
# (its `if sentence:` branch was unreachable) — that dead bookkeeping is
# removed; the produced output is unchanged.


def extract_verb_bases(file_path):
    """Return the base forms of all verbs (動詞); '*' when absent."""
    bases = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # EOS markers and irregular lines carry no morpheme info.
            if line == 'EOS' or '\t' not in line:
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            # Collect the base form of each verb.
            if fields[0] == '動詞':
                bases.append(fields[6] if len(fields) > 6 else '*')
    return bases


if __name__ == '__main__':
    # Write one base form per line.
    with open('verbs_base.txt', 'w', encoding='utf-8') as out_file:
        for verb_base in extract_verb_bases('neko.txt.mecab'):
            out_file.write(verb_base + '\n')

38 changes: 38 additions & 0 deletions ning/chapter04/knock33.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# NLP100 knock 33: extract noun phrases of the form "A の B"
# (noun + の + noun) and write them to a file.


def _no_phrases(sentence):
    """Yield 'AのB' strings where A and B are nouns joined by 'の'."""
    for i in range(1, len(sentence) - 1):
        if (sentence[i - 1]['pos'] == '名詞'
                and sentence[i]['surface'] == 'の'
                and sentence[i + 1]['pos'] == '名詞'):
            yield (sentence[i - 1]['surface']
                   + sentence[i]['surface']
                   + sentence[i + 1]['surface'])


def extract_no_phrases(file_path):
    """Return every 'AのB' noun phrase found in *file_path*."""
    phrases = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'EOS':  # sentence boundary: scan the buffered sentence
                phrases.extend(_no_phrases(sentence))
                sentence = []
                continue
            if '\t' not in line:  # skip blank or irregular lines
                continue
            # Only the surface form and POS are needed for this task.
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            sentence.append({'surface': surface, 'pos': fields[0]})
    return phrases


if __name__ == '__main__':
    with open('noun_phrases.txt', 'w', encoding='utf-8') as out_file:
        for phrase in extract_no_phrases('neko.txt.mecab'):
            out_file.write(phrase + '\n')

40 changes: 40 additions & 0 deletions ning/chapter04/knock34.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# NLP100 knock 34: extract the longest runs of consecutive nouns
# (two or more) from each sentence.


def _noun_runs(sentence):
    """Yield concatenations of 2+ consecutive nouns in *sentence*."""
    run = []
    for morph in sentence:
        if morph['pos'] == '名詞':
            run.append(morph['surface'])
        else:
            if len(run) > 1:
                yield ''.join(run)
            run = []
    if len(run) > 1:  # a run that reaches the end of the sentence
        yield ''.join(run)


def extract_noun_sequences(file_path):
    """Return all noun sequences (length >= 2) found in *file_path*."""
    sequences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'EOS':  # sentence boundary: harvest the runs
                sequences.extend(_noun_runs(sentence))
                sentence = []
                continue
            if '\t' not in line:  # skip blank or irregular lines
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            # Only the surface form and POS are needed for this task.
            sentence.append({'surface': surface, 'pos': fields[0]})
    return sequences


if __name__ == '__main__':
    with open('longest_noun_sequences.txt', 'w', encoding='utf-8') as out_file:
        for sequence in extract_noun_sequences('neko.txt.mecab'):
            out_file.write(sequence + '\n')

31 changes: 31 additions & 0 deletions ning/chapter04/knock35.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# NLP100 knock 35: word frequencies, sorted from most to least frequent.

from collections import Counter


def count_word_frequencies(file_path):
    """Return a Counter of word frequencies from MeCab output.

    Uses the base form when MeCab provides one, otherwise the surface
    form. Bug fix: MeCab marks a *missing* base form with the literal
    '*' (e.g. for unknown words); the original counted that '*' as a
    word instead of falling back to the surface form.
    """
    counter = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # EOS markers and irregular lines carry no morpheme info.
            if line == 'EOS' or '\t' not in line:
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            if len(fields) > 6 and fields[6] != '*':
                counter[fields[6]] += 1  # base form exists: count it
            else:
                counter[surface] += 1  # no base form: count the surface
    return counter


if __name__ == '__main__':
    # Write "word<TAB>count" lines, most frequent first.
    with open('word_frequencies.txt', 'w', encoding='utf-8') as out_file:
        for word, count in count_word_frequencies('neko.txt.mecab').most_common():
            out_file.write(f"{word}\t{count}\n")

45 changes: 45 additions & 0 deletions ning/chapter04/knock36.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# NLP100 knock 36: plot the 10 most frequent words.
# Symbols (記号), particles (助詞) and auxiliary verbs (助動詞) are excluded.
# Bug fix: the original header promised this exclusion but the code never
# applied it — every token was counted. The POS filter below implements it.

from collections import Counter
import matplotlib.pyplot as plt
import japanize_matplotlib

file_path = 'neko.txt.mecab'
words = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # EOS markers and irregular lines carry no morpheme info.
        if line == 'EOS' or '\t' not in line:
            continue
        surface, details = line.split('\t', 1)
        fields = details.split(',')
        # Actually exclude the parts of speech named above.
        if fields[0] in ('記号', '助詞', '助動詞'):
            continue
        # Prefer the base form when MeCab provides one ('*' means absent).
        if len(fields) > 6 and fields[6] != '*':
            words.append(fields[6])
        else:
            words.append(surface)

# Count word frequencies.
word_counter = Counter(words)

# The 10 most frequent words, highest first.
top_words = word_counter.most_common(10)
labels, counts = zip(*top_words)

# Build the bar chart.
plt.figure(figsize=(10, 6))
plt.bar(labels, counts, color='skyblue')
plt.xlabel('単語')
plt.ylabel('出現頻度')
plt.title('出現頻度が高い10語')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the chart.
plt.show()


52 changes: 52 additions & 0 deletions ning/chapter04/knock37.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import matplotlib.pyplot as plt
from collections import Counter
import re
import japanize_matplotlib

def read_mecab(file_path):
    """Read MeCab output and group morphemes into sentences.

    Returns a list of sentences; each sentence is a list of morpheme
    dicts with keys 'surface', 'base', 'pos' and 'pos1'.

    Fixes over the original: strips the newline so a final 'EOS'
    without a trailing newline is recognized (and feature fields no
    longer carry a stray '\\n'); skips blank/tab-less lines instead of
    raising ValueError; guards the base-form index for entries with
    fewer feature fields, matching knock30's behavior.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if line == 'EOS':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif '\t' in line:
                surface, feature = line.split('\t', 1)
                features = feature.split(',')
                morph = {
                    'surface': surface,
                    'base': features[6] if len(features) > 6 else '*',
                    'pos': features[0],
                    'pos1': features[1],
                }
                sentence.append(morph)
    return sentences

# Count words co-occurring with the target, excluding symbols, particles
# and auxiliary verbs (記号 / 助詞 / 助動詞).
def count_cooccurrences(sentences, target_word):
    """Count surface forms that appear in the same sentence as *target_word*.

    Occurrences of *target_word* itself and of morphemes whose POS is
    記号, 助詞 or 助動詞 are not counted. Returns a Counter keyed by
    surface form.
    """
    excluded_pos = ('記号', '助詞', '助動詞')
    counter = Counter()
    for sentence in sentences:
        surfaces = [morph['surface'] for morph in sentence]
        if target_word not in surfaces:
            continue  # sentence does not mention the target at all
        for morph in sentence:
            if morph['surface'] == target_word:
                continue
            if morph['pos'] in excluded_pos:
                continue
            counter[morph['surface']] += 1
    return counter

# Driver: parse the corpus, count co-occurrences with 「猫」 and plot
# the top-10 words as a bar chart.
file_path = 'neko.txt.mecab'
sentences = read_mecab(file_path)
cooccurrence_counter = count_cooccurrences(sentences, '猫')

# Ten most frequent co-occurring words.
top_words = cooccurrence_counter.most_common(10)
labels, freqs = zip(*top_words)

plt.figure(figsize=(10, 5))
plt.bar(labels, freqs)
plt.xlabel('単語')
plt.ylabel('出現頻度')
plt.title('「猫」と共起する単語の出現頻度上位10')
plt.xticks(rotation=45)
plt.show()
46 changes: 46 additions & 0 deletions ning/chapter04/knock38.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# NLP100 knock 38: histogram of word frequencies.
# Only words with frequency < 30 are shown — including everything makes
# the differences invisible.

import matplotlib.pyplot as plt
from collections import Counter
import japanize_matplotlib

file_path = 'neko.txt.mecab'

# Read the MeCab analysis result (neko.txt.mecab).
sentences = []
with open(file_path, 'r', encoding='utf-8') as f:
    sentence = []
    for line in f:
        # strip() also recognizes a final EOS without a trailing newline.
        line = line.strip()
        if line == 'EOS':
            if sentence:
                sentences.append(sentence)
                sentence = []
        elif '\t' in line:  # guard: blank/tab-less lines would crash split()
            surface, feature = line.split('\t', 1)
            features = feature.split(',')
            morph = {
                'surface': surface,
                # Guard short feature lists instead of raising IndexError.
                'base': features[6] if len(features) > 6 else '*',
                'pos': features[0],
                'pos1': features[1],
            }
            sentence.append(morph)

# Count each word, skipping 記号 / 助詞 / 助動詞.
word_counter = Counter()
for sentence in sentences:
    for morph in sentence:
        if morph['pos'] not in ('記号', '助詞', '助動詞'):
            word_counter[morph['surface']] += 1

# Keep only frequencies below 30.
frequencies = [freq for freq in word_counter.values() if freq < 30]

# Draw the histogram.
plt.figure(figsize=(10, 5))
plt.hist(frequencies, bins=range(1, 30), edgecolor='black', align='left')
plt.xlabel('出現頻度')
plt.ylabel('単語の種類数')
plt.title('出現頻度ヒストグラム')
plt.show()

Loading

0 comments on commit 78263df

Please sign in to comment.