Merge pull request #116 from tmu-nlp/naoki
06
kiyama-hajime authored Jun 17, 2024
2 parents f99a1ee + 5d7894a commit 167e5f2
Showing 22 changed files with 361 additions and 0 deletions.
33 changes: 33 additions & 0 deletions naoki/chapter06/knock50.py
@@ -0,0 +1,33 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split

# Assign column names to the data
header_name = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df = pd.read_csv('newsCorpora.csv', header=None, sep='\t', names=header_name)
"""
情報源(publisher)が
”Reuters”, “Huffington Post”, “Businessweek”, “Contactmusic.com”, “Daily Mail”
の事例(記事)のみを抽出
"""
#[]の使い分けがよくわからない
df_new = df[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail'])]

# Split the data: 80% train, then split the remainder in half into valid and test (10% each)
df_train, df_other = train_test_split(
df_new, test_size=0.2, random_state=777)
df_valid, df_test = train_test_split(
df_other, test_size=0.5, random_state=777)

# Write out as tab-separated text files
df_train.to_csv('df_train.txt',sep='\t', index=False,header=False)
df_valid.to_csv('df_valid.txt',sep='\t', index=False,header=False)
df_test.to_csv('df_test.txt',sep='\t', index=False,header=False)

# Check the number of examples per category; .value_counts() gives the count of each distinct value in a column, which is handy here
print(df_train['CATEGORY'].value_counts())
print(df_valid['CATEGORY'].value_counts())
print(df_test['CATEGORY'].value_counts())
71 changes: 71 additions & 0 deletions naoki/chapter06/knock51.py
@@ -0,0 +1,71 @@
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# df_train / df_valid / df_test are the data splits created in knock50.py

df_train['TMP'] = 'train'
df_valid['TMP'] = 'valid'
df_test['TMP'] = 'test'

#Concatenate the data splits
#pd.concat([df1,df2]) stacks DataFrames; axis=0 stacks them vertically
#df[['a','b']] takes a list of column labels and returns those columns
data = pd.concat([df_train[['TITLE','CATEGORY','TMP']],df_valid[['TITLE','CATEGORY','TMP']],df_test[['TITLE','CATEGORY','TMP']]],axis=0)
#reset_index renumbers data's rows; drop=True discards the old index and inplace=True modifies the object in place
data.reset_index(drop=True,inplace=True)
"""
CountVectorizerとTfidfVectorizerは、テキストデータをベクトル化するための手法です。
どちらを選ぶかは、具体的なタスクやデータによりますが、以下のポイントを考慮して選択することが一般的です。
CountVectorizer:
単語の出現回数をカウントする手法です。
文書内での単語の頻度を考慮しますが、文書全体の重要性は考慮しません。
テキスト分類やクラスタリングなど、単語の出現頻度が重要なタスクに適しています。
TfidfVectorizer:
TF-IDF (Term Frequency-Inverse Document Frequency) を計算する手法です。
単語の出現頻度と逆文書頻度を組み合わせて、単語の重要性を評価します。
"""
"""
(?u) は Unicode 文字列を扱うためのフラグで、正規表現内での文字列の解釈を Unicode モードに切り替えます。
\\b は単語の境界を表すメタキャラクタです。単語の先頭や末尾にマッチします。
\\w+ は一つ以上の単語文字(英数字やアンダースコア)にマッチします。例えば、apple や 123 などが該当します。
"""
vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
bag = vectorizer.fit_transform(data['TITLE'])

"""
bag の正体
CountVectorizerを使ってテキストデータをベクトル化した結果、
bagは疎行列 (sparse matrix) 形式になっています。
疎行列は非ゼロ要素が少ない行列で、メモリ効率のためにこの形式で保存されます。
疎行列を配列に変換する理由
疎行列 (sparse matrix) 形式は、scipy.sparse モジュールの csr_matrix(Compressed Sparse Row)などの形式で保存されています。
この形式はメモリ効率が良いですが、PandasのDataFrameには直接結合できません。
そのため、疎行列を通常の配列(dense array)に変換する必要があります。
"""
data = pd.concat([data, pd.DataFrame(bag.toarray())], axis=1)

"""
joblib.dumpは、Pythonのオブジェクトをファイルに保存するための関数
vectorizer.vocabulary_について
CountVectorizerのvocabulary_属性は、テキストデータ内の全単語と、それらの単語に割り当てられたインデックスの対応関係を保持する辞書です。
例えば、{'word1': 0, 'word2': 1, 'word3': 2, ...} のような形式です。
この情報を保存することで、将来の予測や解析時に同じ単語辞書を使うことができます。
保存の目的
再利用性: 一度作成した単語辞書を再利用できるため、新たに辞書を生成する手間が省けます。
一貫性の保持: モデルのトレーニングと予測で同じ辞書を使用することで、一貫性を保持できます。異なる辞書を使うと、単語のインデックスがずれて予測結果が正しくなくなる可能性があります。
効率性: 辞書の生成は計算コストがかかるため、一度生成した辞書を保存しておくことで効率的に処理が行えます。
"""
joblib.dump(vectorizer.vocabulary_, 'vocabulary_.joblib')
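# The saved vocabulary can later be reloaded to rebuild an identical vectorizer,
# so that new text is mapped to the same feature indices (illustrative sketch;
# the variable names below are hypothetical):
loaded_vocab = joblib.load('vocabulary_.joblib')
vectorizer_reloaded = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b', vocabulary=loaded_vocab)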


data_train = data.query('TMP=="train"').drop(['TITLE','TMP'], axis=1)
data_valid = data.query('TMP=="valid"').drop(['TITLE','TMP'], axis=1)
data_test = data.query('TMP=="test"').drop(['TITLE','TMP'], axis=1)

# Write out the feature matrices as tab-separated text files
data_train.to_csv('data_train.feature.txt',sep='\t', index=False,header=False)
data_valid.to_csv('data_valid.feature.txt',sep='\t', index=False,header=False)
data_test.to_csv('data_test.feature.txt',sep='\t', index=False,header=False)
data_train.head()
11 changes: 11 additions & 0 deletions naoki/chapter06/knock52.py
@@ -0,0 +1,11 @@
from sklearn.feature_extraction.text import CountVectorizer
import joblib
from sklearn.linear_model import LogisticRegression

# data_train comes from knock51.py: column 0 is CATEGORY, the remaining columns are the bag-of-words features
X_train = data_train.iloc[:,1:]
y_train = data_train.iloc[:,0]

LR = LogisticRegression(penalty='l1', solver='saga', random_state=777)
LR.fit(X_train, y_train)
y_pred_train = LR.predict(X_train)
joblib.dump(LR, 'model.joblib')
9 changes: 9 additions & 0 deletions naoki/chapter06/knock53.py
@@ -0,0 +1,9 @@
from sklearn.feature_extraction.text import CountVectorizer
import joblib
# Load the saved model
LR = joblib.load('model.joblib')

X_test = data_test.iloc[:,1:]
y_test = data_test.iloc[:,0]

y_pred = LR.predict(X_test)
8 changes: 8 additions & 0 deletions naoki/chapter06/knock54.py
@@ -0,0 +1,8 @@
from sklearn.metrics import accuracy_score
import joblib

LR = joblib.load('model.joblib')

X_test = data_test.iloc[:,1:]
y_test = data_test.iloc[:,0]

y_pred_test = LR.predict(X_test)
print(accuracy_score(y_test, y_pred_test))
7 changes: 7 additions & 0 deletions naoki/chapter06/knock55.py
@@ -0,0 +1,7 @@
from sklearn.metrics import confusion_matrix
train_con = confusion_matrix(y_train, y_pred_train)
test_con = confusion_matrix(y_test, y_pred_test)
print('Confusion matrix for the training data')
print(train_con)
print('Confusion matrix for the test data')
print(test_con)
5 changes: 5 additions & 0 deletions naoki/chapter06/knock56.py
@@ -0,0 +1,5 @@
from sklearn.metrics import recall_score, precision_score, f1_score

print(recall_score(y_test, y_pred_test, average='macro'))
print(precision_score(y_test, y_pred_test, average='macro'))
print(f1_score(y_test, y_pred_test, average='macro'))
6 changes: 6 additions & 0 deletions naoki/chapter06/knock57.py
@@ -0,0 +1,6 @@
import numpy as np
import joblib

# Reuse the model trained in knock52
LR = joblib.load('model.joblib')
c = LR.coef_
"""
Slice syntax: [start:stop:step]; [::-1] below reverses the sorted array.
"""
c0 = np.sort(abs(c[0]))[::-1]
print(c0[:10])
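# The magnitudes above do not show which words they belong to. A sketch of how the
# top-10 feature names for the first class (LR.classes_[0]) could be recovered with
# the vocabulary saved in knock51 (assumes 'vocabulary_.joblib' exists; variable
# names are illustrative):
vocab = joblib.load('vocabulary_.joblib')
index_to_word = {idx: word for word, idx in vocab.items()}
top10_idx = np.argsort(abs(c[0]))[::-1][:10]
print([(index_to_word[i], c[0][i]) for i in top10_idx])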
48 changes: 48 additions & 0 deletions naoki/chapter06/knock58.py
@@ -0,0 +1,48 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Candidate values of the inverse regularisation strength C
para = [0.01, 0.1, 1, 10]

X_train = data_train.iloc[:,1:]
y_train = data_train.iloc[:,0]

X_valid = data_valid.iloc[:,1:]
y_valid = data_valid.iloc[:,0]

X_test = data_test.iloc[:,1:]
y_test = data_test.iloc[:,0]

train_accuracy = []
valid_accuracy = []
test_accuracy = []

# I appended to the wrong list and hit an error, which wasted the time already spent on training. Is there a way to avoid this?
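# One possible remedy for the note above (an illustrative sketch, not part of the
# original run): persist every fitted model inside the loop, e.g. with
# joblib.dump(LR, f'model_C{c}.joblib'), so a later mistake does not force
# retraining; the accuracy lists could then be rebuilt from the saved models.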

for c in para:
    LR = LogisticRegression(penalty='l1', C=c, solver='saga', random_state=777)
    LR.fit(X_train, y_train)
    # train
    y_pred_train = LR.predict(X_train)
    train_accuracy.append(accuracy_score(y_train, y_pred_train))
    # valid
    y_pred_valid = LR.predict(X_valid)
    valid_accuracy.append(accuracy_score(y_valid, y_pred_valid))
    # test
    y_pred_test = LR.predict(X_test)
    test_accuracy.append(accuracy_score(y_test, y_pred_test))


plt.plot(para, train_accuracy, label='train')
plt.plot(para, valid_accuracy, label='valid')
plt.plot(para, test_accuracy, label='test')
plt.legend()
plt.show()




Binary file added naoki/chapter06/knock59_pic.png
Empty file.
11 changes: 11 additions & 0 deletions naoki/chapter07/knock60.py
@@ -0,0 +1,11 @@
from gensim.models import KeyedVectors
import pickle
# Note to self: what is joblib?
# CBoW & negative sampling

model = KeyedVectors.load_word2vec_format("drive/MyDrive/GoogleNews-vectors-negative300.bin.gz", binary=True)

with open("drive/MyDrive/word2vec.pkl", "wb") as f:
    pickle.dump(model, f)

print(model["United_States"])
10 changes: 10 additions & 0 deletions naoki/chapter07/knock61.py
@@ -0,0 +1,10 @@
from gensim.models import KeyedVectors
import pickle
# Note to self: what is joblib?
# CBoW & negative sampling

model = KeyedVectors.load_word2vec_format("drive/MyDrive/GoogleNews-vectors-negative300.bin.gz", binary=True)

with open("drive/MyDrive/word2vec.pkl", "wb") as f:
    pickle.dump(model, f)
print(model.similarity("United_States", "U.S."))
10 changes: 10 additions & 0 deletions naoki/chapter07/knock62.py
@@ -0,0 +1,10 @@
from gensim.models import KeyedVectors
import pickle
# Note to self: what is joblib?
# CBoW & negative sampling

model = KeyedVectors.load_word2vec_format("drive/MyDrive/GoogleNews-vectors-negative300.bin.gz", binary=True)

with open("drive/MyDrive/word2vec.pkl", "wb") as f:
    pickle.dump(model, f)
print(model.most_similar("United_States"))
10 changes: 10 additions & 0 deletions naoki/chapter07/knock63.py
@@ -0,0 +1,10 @@
from gensim.models import KeyedVectors
import pickle
# Note to self: what is joblib?
# CBoW & negative sampling

model = KeyedVectors.load_word2vec_format("drive/MyDrive/GoogleNews-vectors-negative300.bin.gz", binary=True)

with open("drive/MyDrive/word2vec.pkl", "wb") as f:
    pickle.dump(model, f)
print(model.most_similar_cosmul(positive=["Spain", "Athens"], negative=["Madrid"], topn=10))
22 changes: 22 additions & 0 deletions naoki/chapter07/knock64.py
@@ -0,0 +1,22 @@
from gensim.models import KeyedVectors
import pickle

with open("drive/MyDrive/knock64.txt", "w") as o_file:
with open("drive/MyDrive/questions-words.txt", "r") as t_file:
for line in t_file:
words = line.strip().split(" ")
if len(words) != 4:
continue
word,value = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0]
o_file.write(f"{words[0]} {words[1]} {words[2]} {words[3]} {word} {value}\n")

# Lines up to 8869 are the semantic analogy examples
# Lines from 8870 onward are the syntactic analogy examples
with open("drive/MyDrive/knock64.txt", "r") as f:
    with open("drive/MyDrive/knock64-semantic.txt", 'w') as o_file1:
        with open("drive/MyDrive/knock64-syntactic.txt", 'w') as o_file2:
            for i, line in enumerate(f):
                if i < 8869:
                    o_file1.write(line)
                else:
                    o_file2.write(line)
8 changes: 8 additions & 0 deletions naoki/chapter07/knock65.py
@@ -0,0 +1,8 @@
import pandas as pd

df = pd.read_csv('drive/MyDrive/knock64.txt', sep=' ', header=None)
df.head()
#df[3] holds the gold-standard answer word provided with the dataset
#df[4] holds the word predicted in knock64
#(what was the similarity score in the last column for? it is not needed to compute accuracy)
print((df[3] == df[4]).sum() / len(df))
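# A sketch of the same accuracy computed separately for the two files written in
# knock64 (assumes knock64-semantic.txt and knock64-syntactic.txt exist):
for path in ['drive/MyDrive/knock64-semantic.txt', 'drive/MyDrive/knock64-syntactic.txt']:
    part = pd.read_csv(path, sep=' ', header=None)
    print(path, (part[3] == part[4]).sum() / len(part))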
23 changes: 23 additions & 0 deletions naoki/chapter07/knock66.py
@@ -0,0 +1,23 @@
import pickle
import pandas as pd
from scipy.stats import spearmanr

with open("drive/MyDrive/word2vec.pkl", "rb") as f:
    model = pickle.load(f)
word1 = []
word2 = []
human_rank = []
model_rank = []

df = pd.read_csv("drive/MyDrive/combined.csv",header=0)
df = df.dropna()
for i in range(len(df)):
    word1.append(df.iloc[i,0])
    word2.append(df.iloc[i,1])
    human_rank.append(df.iloc[i,2])
    cos = model.similarity(df.iloc[i,0], df.iloc[i,1])
    model_rank.append(cos)
#Convert the scores in model_rank into rank data
#(scipy's spearmanr ranks its inputs internally, so passing the raw similarities would give the same correlation)
model_rank = pd.Series(model_rank).rank(ascending=True, method='min')
correlation, pvalue = spearmanr(human_rank,model_rank)
print(correlation)
print(pvalue)
28 changes: 28 additions & 0 deletions naoki/chapter07/knock67.py
@@ -0,0 +1,28 @@
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.cluster import KMeans
import pickle


with open("drive/MyDrive/word2vec.pkl", "rb") as f:
model = pickle.load(f)
df = pd.read_csv('drive/MyDrive/questions-words.txt',sep=' ',skiprows=1)
df = df.dropna()
df = df.reset_index()
# The first 5030 rows (the capital-common-countries and capital-world sections) involve country names
df_country = df.iloc[:5030]
df_country.columns = ['index','word1','word2','word3','word4']
country = list(set(df_country.loc[:,'word4'].values))
countryVec = []
countryName = []
for c in country:
    countryVec.append(model[c])
    countryName.append(c)
#Converting the list to an ndarray makes the array computations below possible
X = np.array(countryVec)
km = KMeans(n_clusters=5, random_state=777)
y_km = km.fit_predict(X)
dic = {}
for num, name in zip(y_km, countryName):
    dic.setdefault(num, []).append(name)
print(dic)
16 changes: 16 additions & 0 deletions naoki/chapter07/knock68.py
@@ -0,0 +1,16 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from scipy.cluster.hierarchy import dendrogram, linkage
# country (the list of country names) and model are reused from knock67.py
countryVec = []
countryName = []
for c in country:
    countryVec.append(model[c])
    countryName.append(c)

X = np.array(countryVec)
linkage_result = linkage(X, method='ward', metric='euclidean')
plt.figure(num=None, figsize=(16, 9), dpi=200, facecolor='w', edgecolor='k')
dendrogram(linkage_result, labels=countryName)
plt.show()
Binary file added naoki/chapter07/knock69.png
25 changes: 25 additions & 0 deletions naoki/chapter07/knock69.py
@@ -0,0 +1,25 @@
"""
t-SNEは機械学習の教師なし学習の中のひとつで、次元削減を行うアルゴリズム
t-SNEはPCAなどの可視化手法とは異なり、線形では表現できない関係も学習して次元削減を行える利点がある
t-SNEではあるデータ点とあるデータ点の近さを同時確立として表現
元データと次元削減後のデータの近さをKLダイバージェンスを最小化することで次元削減の学習を行います。
KLダイバージェンスとは2つの確率分布の間の異なり具合を測るものになっている
実際には削減後のデータを乱数で初期化し、KL divergenceを勾配法で最小化
"""
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# countryVec and countryName are reused from knock67/knock68
X = np.array(countryVec)
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
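# Note: the TSNE call above is not seeded, so (as described in the note at the top,
# it starts from a random initialisation) the layout can differ between runs;
# passing e.g. TSNE(n_components=2, random_state=777) would make it reproducible
# (the seed value here is only illustrative).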
pred = KMeans(n_clusters = 5, random_state=777).fit_predict(X)

plt.figure(figsize=(15, 13))
# One colour per cluster (5 clusters, so 5 colours are needed)
col_list = ["Blue", "Red", "Green", "Black", "Orange"]
for x, name, km in zip(X_tsne, countryName, pred):
    plt.plot(x[0], x[1], color=col_list[km], marker="o")
    # plt.annotate adds the country name at each data point; xy=(x[0], x[1]) sets the annotation position
    plt.annotate(name, xy=(x[0], x[1]))
plt.title("T-SNE")
plt.savefig("TSNE.png")
plt.show()
