-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #72 from tmu-nlp/naoki
Naoki
- Loading branch information
Showing
12 changed files
with
10,097 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Parse MeCab output into `morphemes`: a list of sentences, where each
# sentence is a list of dicts with the keys "surface", "base", "pos" and
# "pos1". A sentence is closed whenever the surface "。" is seen.
morphemes = []
neko_list = []  # morphemes of the sentence currently being collected
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how the .mecab file was written.
with open("C:/Users/shish_sf301y1/Desktop/pyファイル/neko.txt.mecab", "r") as f:
    for line in f:
        # Drop the line terminator so it cannot leak into the last feature.
        line = line.rstrip("\n")
        # Morpheme lines look like "surface<TAB>features". "EOS" markers and
        # blank lines contain no tab and carry no morpheme, so skip them
        # instead of failing on a missing feature column.
        if "\t" not in line:
            continue
        suf = line.split("\t")
        # suf[1] holds the comma-separated features, e.g.
        # 名詞,普通名詞,副詞可能,,,,トキドキ,時々,時々,...
        temp = suf[1].split(',')
        neko_dic = {
            "surface": suf[0],
            # In the example above, index 7 is the dictionary form
            # (presumably the lemma column of the dictionary in use --
            # confirm). Fall back to the surface when that column is absent.
            "base": temp[7] if len(temp) > 7 else suf[0],
            "pos": temp[0],
            # Guard against feature strings that carry only one field.
            "pos1": temp[1] if len(temp) > 1 else "",
        }
        neko_list.append(neko_dic)
        if suf[0] == "。":
            morphemes.append(neko_list)
            neko_list = []
# Keep any morphemes that follow the last "。" instead of dropping them.
if neko_list:
    morphemes.append(neko_list)
    neko_list = []
morphemes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Parse MeCab output into `morphemes`: a list of sentences, where each
# sentence is a list of dicts with the keys "surface", "base", "pos" and
# "pos1". A sentence is closed whenever the surface "。" is seen.
morphemes = []
neko_list = []  # morphemes of the sentence currently being collected
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how the .mecab file was written.
with open("C:/Users/shish_sf301y1/Desktop/pyファイル/neko.txt.mecab", "r") as f:
    for line in f:
        # Drop the line terminator so it cannot leak into the last feature.
        line = line.rstrip("\n")
        # Morpheme lines look like "surface<TAB>features". "EOS" markers and
        # blank lines contain no tab and carry no morpheme, so skip them
        # instead of failing on a missing feature column.
        if "\t" not in line:
            continue
        suf = line.split("\t")
        # suf[1] holds the comma-separated features, e.g.
        # 名詞,普通名詞,副詞可能,,,,トキドキ,時々,時々,...
        temp = suf[1].split(',')
        neko_dic = {
            "surface": suf[0],
            # In the example above, index 7 is the dictionary form
            # (presumably the lemma column of the dictionary in use --
            # confirm). Fall back to the surface when that column is absent.
            "base": temp[7] if len(temp) > 7 else suf[0],
            "pos": temp[0],
            # Guard against feature strings that carry only one field.
            "pos1": temp[1] if len(temp) > 1 else "",
        }
        neko_list.append(neko_dic)
        if suf[0] == "。":
            morphemes.append(neko_list)
            neko_list = []
# Keep any morphemes that follow the last "。" instead of dropping them.
if neko_list:
    morphemes.append(neko_list)
    neko_list = []
morphemes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Collect the base form of every morpheme tagged as a verb ("動詞"), in
# order of appearance, then reduce them to the set of distinct base forms.
suf_list = [
    token["base"]
    for sent in morphemes
    for token in sent
    if token["pos"] == "動詞"
]
base_verb = set(suf_list)
base_verb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Gather every "noun + の + noun" phrase as one concatenated surface string.
suf_list = []
for sent in morphemes:
    # Slide a three-token window over the sentence; zipping the shifted
    # slices stops at the last position where all three tokens exist.
    for left, middle, right in zip(sent, sent[1:], sent[2:]):
        joined_by_no = middle['base'] == 'の'
        if joined_by_no and left['pos'] == '名詞' and right['pos'] == '名詞':
            phrase = left['surface'] + middle['surface'] + right['surface']
            suf_list.append(phrase)
suf_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Extract runs of two or more consecutive nouns ("名詞"), each run joined
# into a single string, and keep the distinct runs as a set.
suf_list = []
for sentense in morphemes:
    run = []  # surfaces of the noun run currently being collected
    for token in sentense:
        if token['pos'] == '名詞':
            run.append(token['surface'])
            continue
        # A non-noun ends the run; a single noun is not a run.
        if len(run) >= 2:
            suf_list.append(''.join(run))
        run = []
    # A sentence may end while a noun run is still open; flush it as well
    # so that a trailing run is not lost.
    if len(run) >= 2:
        suf_list.append(''.join(run))
suf_list = set(suf_list)
suf_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import collections | ||
# Count how often each surface form occurs, leaving out morphemes tagged
# as supplementary symbols ("補助記号").
word_list = [
    token['surface']
    for sent in morphemes
    for token in sent
    if token['pos'] != '補助記号'
]
word_list_count = collections.Counter(word_list)
# With no argument, most_common() returns every (word, count) pair ordered
# from the most frequent to the least frequent.
word_list_rank = word_list_count.most_common()
word_list_rank
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
import matplotlib.pyplot as plt | ||
#import japanize_matplotlib | ||
import collections | ||
%matplotlib inline | ||
# Split the ten most frequent (word, count) pairs of word_list_rank into
# two parallel lists and draw them as a bar chart.
# Slicing once tolerates rankings with fewer than ten entries, where
# indexing positions 0..9 would raise IndexError.
top10 = word_list_rank[:10]
word_list_top10 = [word for word, _ in top10]
# NOTE: this rebinds word_list_count to a plain list of counts.
word_list_count = [count for _, count in top10]
plt.bar(x = word_list_top10,height = word_list_count)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import itertools | ||
import matplotlib.pyplot as plt | ||
%matplotlib inline | ||
|
||
import collections  # Counter is used below but was never imported in this script

# Collect the word that immediately follows each occurrence of "猫",
# skipping supplementary symbols, particles and auxiliary verbs.
excluded_pos = {"補助記号", '助詞', '助動詞'}
related_list = []
for sentense in morphemes:
    # Pair every token with its successor; the last token has none.
    for current, following in zip(sentense, sentense[1:]):
        if current['surface'] == '猫' and following["pos"] not in excluded_pos:
            related_list.append(following['surface'])
# Count whole words. related_list is a flat list of strings, so flattening
# it with itertools.chain.from_iterable would split every word into single
# characters and the counts would be per character, not per word.
all_neko = list(related_list)
count_list = collections.Counter(all_neko)
print(count_list)
# most_common(10) returns at most ten pairs, so fewer than ten distinct
# words no longer raises IndexError.
top10 = count_list.most_common(10)
word_list = [word for word, _ in top10]
height_list = [count for _, count in top10]
plt.bar(x = word_list, height = height_list)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
import matplotlib.pyplot as plt | ||
# Tally every surface form, then plot the distribution of those tallies:
# how many distinct words occur once, twice, and so on.
word_list = [token['surface'] for sent in morphemes for token in sent]
hist = collections.Counter(word_list)
# Bin edges run from 1 to 29, so counts above 29 are not drawn.
plt.hist(hist.values(), bins=range(1, 30))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
import matplotlib.pyplot as plt | ||
# Build the flat list of all surface forms, count each one, and draw a
# histogram of the per-word occurrence counts.
word_list = []
for sent in morphemes:
    word_list.extend(token['surface'] for token in sent)
hist = collections.Counter(word_list)
# Bin edges run from 1 to 29, so counts above 29 are not drawn.
plt.hist(hist.values(), bins=range(1, 30))
Oops, something went wrong.