Skip to content

Commit

Permalink
Merge pull request #66 from tmu-nlp/ning
Browse files Browse the repository at this point in the history
add chapter04
  • Loading branch information
d36n authored May 20, 2024
2 parents 03ae7a2 + fcf94aa commit 78263df
Show file tree
Hide file tree
Showing 10 changed files with 412 additions and 0 deletions.
39 changes: 39 additions & 0 deletions ning/chapter04/knock30.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# NLP100 knock 30: parse MeCab output (neko.txt.mecab) into morpheme
# dicts and dump them to a readable text file.


def parse_mecab(file_path):
    """Parse a MeCab-formatted file into sentences.

    Each sentence is a list of morpheme dicts with keys
    'surface', 'base', 'pos' and 'pos1'.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'EOS':  # sentence boundary marker emitted by MeCab
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            if '\t' not in line:  # skip blank or irregular lines
                continue
            # Extract and store the morpheme information.
            surface, details = line.split('\t', 1)  # maxsplit guards extra tabs
            fields = details.split(',')
            sentence.append({
                'surface': surface,
                'base': fields[6] if len(fields) > 6 else '*',  # unknown words may lack a base form
                'pos': fields[0],
                'pos1': fields[1],
            })
    return sentences


def write_morphemes(sentences, output_path):
    """Write one line per morpheme, with a blank line between sentences."""
    with open(output_path, 'w', encoding='utf-8') as out_file:
        for sentence in sentences:
            for morph in sentence:
                out_file.write(f"surface: {morph['surface']}, base: {morph['base']}, pos: {morph['pos']}, pos1: {morph['pos1']}\n")
            out_file.write("\n")


if __name__ == '__main__':
    write_morphemes(parse_mecab('neko.txt.mecab'), 'neko_morpheme.txt')





35 changes: 35 additions & 0 deletions ning/chapter04/knock31.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# NLP100 knock 31: extract the surface forms of all verbs.
# Note: the original kept a `sentence` list that was never appended to
# (its `if sentence:` branch was unreachable) — that dead bookkeeping is
# removed; the produced output is unchanged.


def extract_verb_surfaces(file_path):
    """Return the surface forms of all verbs (動詞) found in *file_path*."""
    verbs = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # EOS markers and irregular lines carry no morpheme info.
            if line == 'EOS' or '\t' not in line:
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            # Collect the surface form of each verb.
            if fields[0] == '動詞':
                verbs.append(surface)
    return verbs


if __name__ == '__main__':
    # Write one verb per line for easy reading.
    with open('verbs.txt', 'w', encoding='utf-8') as out_file:
        for verb in extract_verb_surfaces('neko.txt.mecab'):
            out_file.write(verb + '\n')


34 changes: 34 additions & 0 deletions ning/chapter04/knock32.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# NLP100 knock 32: extract the base forms of all verbs.
# Note: the original kept a `sentence` list that was never appended to
# (its `if sentence:` branch was unreachable) — that dead bookkeeping is
# removed; the produced output is unchanged.


def extract_verb_bases(file_path):
    """Return the base forms of all verbs (動詞); '*' when absent."""
    bases = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # EOS markers and irregular lines carry no morpheme info.
            if line == 'EOS' or '\t' not in line:
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            # Collect the base form of each verb.
            if fields[0] == '動詞':
                bases.append(fields[6] if len(fields) > 6 else '*')
    return bases


if __name__ == '__main__':
    # Write one base form per line.
    with open('verbs_base.txt', 'w', encoding='utf-8') as out_file:
        for verb_base in extract_verb_bases('neko.txt.mecab'):
            out_file.write(verb_base + '\n')

38 changes: 38 additions & 0 deletions ning/chapter04/knock33.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# NLP100 knock 33: extract noun phrases of the form "A の B"
# (noun + の + noun) and write them to a file.


def _no_phrases(sentence):
    """Yield 'AのB' strings where A and B are nouns joined by 'の'."""
    for i in range(1, len(sentence) - 1):
        if (sentence[i - 1]['pos'] == '名詞'
                and sentence[i]['surface'] == 'の'
                and sentence[i + 1]['pos'] == '名詞'):
            yield (sentence[i - 1]['surface']
                   + sentence[i]['surface']
                   + sentence[i + 1]['surface'])


def extract_no_phrases(file_path):
    """Return every 'AのB' noun phrase found in *file_path*."""
    phrases = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'EOS':  # sentence boundary: scan the buffered sentence
                phrases.extend(_no_phrases(sentence))
                sentence = []
                continue
            if '\t' not in line:  # skip blank or irregular lines
                continue
            # Only the surface form and POS are needed for this task.
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            sentence.append({'surface': surface, 'pos': fields[0]})
    return phrases


if __name__ == '__main__':
    with open('noun_phrases.txt', 'w', encoding='utf-8') as out_file:
        for phrase in extract_no_phrases('neko.txt.mecab'):
            out_file.write(phrase + '\n')

40 changes: 40 additions & 0 deletions ning/chapter04/knock34.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# NLP100 knock 34: extract the longest runs of consecutive nouns
# (two or more) from each sentence.


def _noun_runs(sentence):
    """Yield concatenations of 2+ consecutive nouns in *sentence*."""
    run = []
    for morph in sentence:
        if morph['pos'] == '名詞':
            run.append(morph['surface'])
        else:
            if len(run) > 1:
                yield ''.join(run)
            run = []
    if len(run) > 1:  # a run that reaches the end of the sentence
        yield ''.join(run)


def extract_noun_sequences(file_path):
    """Return all noun sequences (length >= 2) found in *file_path*."""
    sequences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if line == 'EOS':  # sentence boundary: harvest the runs
                sequences.extend(_noun_runs(sentence))
                sentence = []
                continue
            if '\t' not in line:  # skip blank or irregular lines
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            # Only the surface form and POS are needed for this task.
            sentence.append({'surface': surface, 'pos': fields[0]})
    return sequences


if __name__ == '__main__':
    with open('longest_noun_sequences.txt', 'w', encoding='utf-8') as out_file:
        for sequence in extract_noun_sequences('neko.txt.mecab'):
            out_file.write(sequence + '\n')

31 changes: 31 additions & 0 deletions ning/chapter04/knock35.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# NLP100 knock 35: word frequencies, sorted from most to least frequent.

from collections import Counter


def count_word_frequencies(file_path):
    """Return a Counter of word frequencies from MeCab output.

    Uses the base form when MeCab provides one, otherwise the surface
    form. Bug fix: MeCab marks a *missing* base form with the literal
    '*' (e.g. for unknown words); the original counted that '*' as a
    word instead of falling back to the surface form.
    """
    counter = Counter()
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            # EOS markers and irregular lines carry no morpheme info.
            if line == 'EOS' or '\t' not in line:
                continue
            surface, details = line.split('\t', 1)
            fields = details.split(',')
            if len(fields) > 6 and fields[6] != '*':
                counter[fields[6]] += 1  # base form exists: count it
            else:
                counter[surface] += 1  # no base form: count the surface
    return counter


if __name__ == '__main__':
    # Write "word<TAB>count" lines, most frequent first.
    with open('word_frequencies.txt', 'w', encoding='utf-8') as out_file:
        for word, count in count_word_frequencies('neko.txt.mecab').most_common():
            out_file.write(f"{word}\t{count}\n")

45 changes: 45 additions & 0 deletions ning/chapter04/knock36.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# NLP100 knock 36: plot the 10 most frequent words.
# Symbols (記号), particles (助詞) and auxiliary verbs (助動詞) are excluded.
# Bug fix: the original header promised this exclusion but the code never
# applied it — every token was counted. The POS filter below implements it.

from collections import Counter
import matplotlib.pyplot as plt
import japanize_matplotlib

file_path = 'neko.txt.mecab'
words = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # EOS markers and irregular lines carry no morpheme info.
        if line == 'EOS' or '\t' not in line:
            continue
        surface, details = line.split('\t', 1)
        fields = details.split(',')
        # Actually exclude the parts of speech named above.
        if fields[0] in ('記号', '助詞', '助動詞'):
            continue
        # Prefer the base form when MeCab provides one ('*' means absent).
        if len(fields) > 6 and fields[6] != '*':
            words.append(fields[6])
        else:
            words.append(surface)

# Count word frequencies.
word_counter = Counter(words)

# The 10 most frequent words, highest first.
top_words = word_counter.most_common(10)
labels, counts = zip(*top_words)

# Build the bar chart.
plt.figure(figsize=(10, 6))
plt.bar(labels, counts, color='skyblue')
plt.xlabel('単語')
plt.ylabel('出現頻度')
plt.title('出現頻度が高い10語')
plt.xticks(rotation=45)
plt.tight_layout()

# Display the chart.
plt.show()


52 changes: 52 additions & 0 deletions ning/chapter04/knock37.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import matplotlib.pyplot as plt
from collections import Counter
import re
import japanize_matplotlib

def read_mecab(file_path):
    """Read MeCab output and group morphemes into sentences.

    Returns a list of sentences; each sentence is a list of morpheme
    dicts with keys 'surface', 'base', 'pos' and 'pos1'.

    Fixes over the original: strips the newline so a final 'EOS'
    without a trailing newline is recognized (and feature fields no
    longer carry a stray '\\n'); skips blank/tab-less lines instead of
    raising ValueError; guards the base-form index for entries with
    fewer feature fields, matching knock30's behavior.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if line == 'EOS':
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            elif '\t' in line:
                surface, feature = line.split('\t', 1)
                features = feature.split(',')
                morph = {
                    'surface': surface,
                    'base': features[6] if len(features) > 6 else '*',
                    'pos': features[0],
                    'pos1': features[1],
                }
                sentence.append(morph)
    return sentences

# Count words co-occurring with the target, excluding symbols, particles
# and auxiliary verbs (記号 / 助詞 / 助動詞).
def count_cooccurrences(sentences, target_word):
    """Count surface forms that appear in the same sentence as *target_word*.

    Occurrences of *target_word* itself and of morphemes whose POS is
    記号, 助詞 or 助動詞 are not counted. Returns a Counter keyed by
    surface form.
    """
    excluded_pos = ('記号', '助詞', '助動詞')
    counter = Counter()
    for sentence in sentences:
        surfaces = [morph['surface'] for morph in sentence]
        if target_word not in surfaces:
            continue  # sentence does not mention the target at all
        for morph in sentence:
            if morph['surface'] == target_word:
                continue
            if morph['pos'] in excluded_pos:
                continue
            counter[morph['surface']] += 1
    return counter

# Driver: parse the corpus, count co-occurrences with 「猫」 and plot
# the top-10 words as a bar chart.
file_path = 'neko.txt.mecab'
sentences = read_mecab(file_path)
cooccurrence_counter = count_cooccurrences(sentences, '猫')

# Ten most frequent co-occurring words.
top_words = cooccurrence_counter.most_common(10)
labels, freqs = zip(*top_words)

plt.figure(figsize=(10, 5))
plt.bar(labels, freqs)
plt.xlabel('単語')
plt.ylabel('出現頻度')
plt.title('「猫」と共起する単語の出現頻度上位10')
plt.xticks(rotation=45)
plt.show()
46 changes: 46 additions & 0 deletions ning/chapter04/knock38.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# NLP100 knock 38: histogram of word frequencies.
# Only words with frequency < 30 are shown — including everything makes
# the differences invisible.

import matplotlib.pyplot as plt
from collections import Counter
import japanize_matplotlib

file_path = 'neko.txt.mecab'

# Read the MeCab analysis result (neko.txt.mecab).
sentences = []
with open(file_path, 'r', encoding='utf-8') as f:
    sentence = []
    for line in f:
        # strip() also recognizes a final EOS without a trailing newline.
        line = line.strip()
        if line == 'EOS':
            if sentence:
                sentences.append(sentence)
                sentence = []
        elif '\t' in line:  # guard: blank/tab-less lines would crash split()
            surface, feature = line.split('\t', 1)
            features = feature.split(',')
            morph = {
                'surface': surface,
                # Guard short feature lists instead of raising IndexError.
                'base': features[6] if len(features) > 6 else '*',
                'pos': features[0],
                'pos1': features[1],
            }
            sentence.append(morph)

# Count each word, skipping 記号 / 助詞 / 助動詞.
word_counter = Counter()
for sentence in sentences:
    for morph in sentence:
        if morph['pos'] not in ('記号', '助詞', '助動詞'):
            word_counter[morph['surface']] += 1

# Keep only frequencies below 30.
frequencies = [freq for freq in word_counter.values() if freq < 30]

# Draw the histogram.
plt.figure(figsize=(10, 5))
plt.hist(frequencies, bins=range(1, 30), edgecolor='black', align='left')
plt.xlabel('出現頻度')
plt.ylabel('単語の種類数')
plt.title('出現頻度ヒストグラム')
plt.show()

Loading

0 comments on commit 78263df

Please sign in to comment.