Merge pull request #72 from tmu-nlp/naoki

Naoki
tmu-nlp · May 21, 2024 · cfd8ecf · cfd8ecf
2 parents 13af179 + 945d999
commit cfd8ecf
Show file tree

Hide file tree

Showing 12 changed files with 10,097 additions and 0 deletions.
diff --git a/naoki/chapter04/knock30.py b/naoki/chapter04/knock30.py
@@ -0,0 +1,28 @@
+# Read the MeCab analysis of neko.txt and build `morphemes`: a list of
+# sentences, each a list of dicts with the keys surface / base / pos / pos1.
+# NOTE(review): no encoding is passed to open(), so the platform default is
+# used -- confirm that it matches how the .mecab file was written.
+morphemes = []
+neko_list = []
+with open("C:/Users/shish_sf301y1/Desktop/pyファイル/neko.txt.mecab", "r") as f:
+    for line in f:
+        # Drop the line terminator first so it never leaks into a field.
+        suf = line.rstrip("\n").split("\t")
+        # "EOS" markers and blank lines have no tab-separated feature column.
+        if len(suf) < 2:
+            continue
+        # suf[1] holds the comma-separated features, e.g.
+        # 名詞,普通名詞,副詞可能,,,,トキドキ,時々,時々,...
+        temp = suf[1].split(",")
+        # In the example above index 7 is the dictionary form (時々); entries
+        # with 7 or fewer fields do not have it, so fall back to the surface.
+        base = temp[7] if len(temp) > 7 else suf[0]
+        neko_list.append({"surface": suf[0], "base": base, "pos": temp[0], "pos1": temp[1]})
+        # A full stop closes the current sentence.
+        if suf[0] == "。":
+            morphemes.append(neko_list)
+            neko_list = []
+# Keep morphemes that follow the last full stop instead of dropping them.
+if neko_list:
+    morphemes.append(neko_list)
+morphemes
diff --git a/naoki/chapter04/knock31.py b/naoki/chapter04/knock31.py
@@ -0,0 +1,30 @@
+# Read the MeCab analysis of neko.txt and build `morphemes`: a list of
+# sentences, each a list of dicts with the keys surface / base / pos / pos1.
+# NOTE(review): this repeats the morpheme-loading step of knock30.py --
+# confirm whether knock31 was meant to contain something else.
+# NOTE(review): no encoding is passed to open(), so the platform default is
+# used -- confirm that it matches how the .mecab file was written.
+morphemes = []
+neko_list = []
+with open("C:/Users/shish_sf301y1/Desktop/pyファイル/neko.txt.mecab", "r") as f:
+    for line in f:
+        # Drop the line terminator first so it never leaks into a field.
+        suf = line.rstrip("\n").split("\t")
+        # "EOS" markers and blank lines have no tab-separated feature column.
+        if len(suf) < 2:
+            continue
+        # suf[1] holds the comma-separated features, e.g.
+        # 名詞,普通名詞,副詞可能,,,,トキドキ,時々,時々,...
+        temp = suf[1].split(",")
+        # In the example above index 7 is the dictionary form (時々); entries
+        # with 7 or fewer fields do not have it, so fall back to the surface.
+        base = temp[7] if len(temp) > 7 else suf[0]
+        neko_list.append({"surface": suf[0], "base": base, "pos": temp[0], "pos1": temp[1]})
+        # A full stop closes the current sentence.
+        if suf[0] == "。":
+            morphemes.append(neko_list)
+            neko_list = []
+# Keep morphemes that follow the last full stop instead of dropping them.
+if neko_list:
+    morphemes.append(neko_list)
+morphemes
diff --git a/naoki/chapter04/knock32.py b/naoki/chapter04/knock32.py
@@ -0,0 +1,10 @@
+# Gather the base form of every verb token, then deduplicate.
+suf_list = [
+    token["base"]
+    for sentence in morphemes
+    for token in sentence
+    if token["pos"] == "動詞"
+]
+base_verb = set(suf_list)
+# Bare expression: only shows the set in an interactive session.
+base_verb
diff --git a/naoki/chapter04/knock33.py b/naoki/chapter04/knock33.py
@@ -0,0 +1,9 @@
+# Collect "A の B" phrases: two nouns joined by の.
+suf_list = []
+for sentence in morphemes:
+    # Slide a three-token window over the sentence; zip stops at the
+    # shortest slice, so sentences with fewer than three tokens yield nothing.
+    for left, middle, right in zip(sentence, sentence[1:], sentence[2:]):
+        if left['pos'] == '名詞' and middle['base'] == 'の' and right['pos'] == '名詞':
+            suf_list.append(left['surface'] + middle['surface'] + right['surface'])
+suf_list
diff --git a/naoki/chapter04/knock34.py b/naoki/chapter04/knock34.py
@@ -0,0 +1,12 @@
+import itertools
+# Extract runs of two or more consecutive nouns, joined into one string
+# (compound nouns such as 自然言語処理100本ノック), without duplicates.
+suf_list = set()
+for sentence in morphemes:
+    # groupby splits the sentence into maximal noun / non-noun runs, so a noun
+    # run that ends the sentence is collected as well.
+    for is_noun, run in itertools.groupby(sentence, key=lambda m: m['pos'] == '名詞'):
+        surfaces = [m['surface'] for m in run]
+        if is_noun and len(surfaces) >= 2:
+            suf_list.add(''.join(surfaces))
+suf_list
diff --git a/naoki/chapter04/knock35.py b/naoki/chapter04/knock35.py
@@ -0,0 +1,12 @@
+import collections
+# Rank surface forms by frequency, leaving out tokens tagged 補助記号.
+word_list = [
+    token['surface']
+    for sentence in morphemes
+    for token in sentence
+    if token['pos'] != '補助記号'
+]
+word_list_count = collections.Counter(word_list)
+# most_common() without an argument lists every (word, count) pair, most frequent first.
+word_list_rank = word_list_count.most_common()
+word_list_rank
diff --git a/naoki/chapter04/knock36.py b/naoki/chapter04/knock36.py
@@ -0,0 +1,14 @@
+import matplotlib.pyplot as plt
+# NOTE(review): japanize_matplotlib is left disabled; the Japanese tick
+# labels may need a Japanese-capable font to render -- confirm.
+#import japanize_matplotlib
+import collections
+# IPython magic: only valid inside a notebook / IPython session.
+%matplotlib inline
+# Bar chart of the most frequent words. Slicing (rather than indexing 0..9)
+# also works when word_list_rank has fewer than ten entries.
+top_entries = word_list_rank[:10]
+word_list_top10 = [word for word, _ in top_entries]
+# NOTE: rebinds word_list_count to a plain list of frequencies.
+word_list_count = [freq for _, freq in top_entries]
+plt.bar(x=word_list_top10, height=word_list_count)
diff --git a/naoki/chapter04/knock37.py b/naoki/chapter04/knock37.py
@@ -0,0 +1,23 @@
+import itertools
+import collections
+import matplotlib.pyplot as plt
+# IPython magic: only valid inside a notebook / IPython session.
+%matplotlib inline
+
+# Words that directly follow 猫, skipping tokens tagged 補助記号, 助詞 or 助動詞.
+related_list = []
+for sentence in morphemes:
+    for current, following in zip(sentence, sentence[1:]):
+        if current['surface'] == '猫' and following['pos'] not in ('補助記号', '助詞', '助動詞'):
+            related_list.append(following['surface'])
+# related_list already holds whole words; flattening it with
+# itertools.chain.from_iterable would split each word into characters.
+all_neko = list(related_list)
+count_list = collections.Counter(all_neko)
+print(count_list)
+# most_common(10) returns at most ten (word, count) pairs, so having fewer
+# than ten distinct words is fine.
+top_entries = count_list.most_common(10)
+word_list = [word for word, _ in top_entries]
+height_list = [freq for _, freq in top_entries]
+plt.bar(x=word_list, height=height_list)
diff --git a/naoki/chapter04/knock38.py b/naoki/chapter04/knock38.py
@@ -0,0 +1,8 @@
+import collections
+import matplotlib.pyplot as plt
+# Count how often each surface form occurs (all tokens, no filtering).
+word_list = [token['surface'] for sentence in morphemes for token in sentence]
+hist = collections.Counter(word_list)
+# Histogram of the per-word frequencies with bin edges 1..29; words that
+# occur more often than that fall outside the bins and are not drawn.
+plt.hist(list(hist.values()), range(1, 30))
diff --git a/naoki/chapter04/knock39.py b/naoki/chapter04/knock39.py
@@ -0,0 +1,10 @@
+# NOTE(review): this hunk repeats knock38.py -- confirm whether knock39 was
+# meant to plot something different.
+import collections
+import matplotlib.pyplot as plt
+# Count how often each surface form occurs (all tokens, no filtering).
+word_list = [token['surface'] for sentence in morphemes for token in sentence]
+hist = collections.Counter(word_list)
+# Histogram of the per-word frequencies with bin edges 1..29; words that
+# occur more often than that fall outside the bins and are not drawn.
+plt.hist(list(hist.values()), range(1, 30))