Commit
Merge pull request #117 from tmu-nlp/ning
add chapter06
Showing 10 changed files with 502 additions and 0 deletions.
@@ -0,0 +1,59 @@
"""
Using the News Aggregator Data Set, create the training data (train.txt),
validation data (valid.txt), and evaluation data (test.txt) as follows:
1. Load the data.
2. Extract only the examples (articles) whose publisher is "Reuters",
   "Huffington Post", "Businessweek", "Contactmusic.com", or "Daily Mail".
3. Shuffle the extracted examples randomly.
4. Split them 80% / 10% / 10% into training, validation, and evaluation data.
5. Write one example per line to train.txt, valid.txt, and test.txt,
   with the category name and the article headline separated by a tab.
6. Check the number of examples per category.
"""

import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data.
# The file has no header row, so header=None prevents the first line from
# being used as column names; every row is read as data.
data = pd.read_csv('newsCorpora.csv', sep='\t', header=None)
# Assign column names.
data.columns = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']

# Extract only the examples from the specified publishers.
publishers = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
filtered_data = data[data['PUBLISHER'].isin(publishers)]

# Shuffle the data.
# frac specifies the fraction of rows to sample; frac=1 samples every row.
# random_state seeds the random number generator for reproducibility.
# reset_index() would keep the old index as a new column by default;
# drop=True discards it so only the new index remains.
shuffled_data = filtered_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data.
# test_size=0.2 keeps 80% for training and puts the remaining 20% in temp;
# temp is then split half-and-half between valid and test.
# Use the same random_state as above.
train, temp = train_test_split(shuffled_data, test_size=0.2, random_state=42)
valid, test = train_test_split(temp, test_size=0.5, random_state=42)

# Keep only the columns we need (category name and article headline).
train_data = train[['CATEGORY', 'TITLE']]
valid_data = valid[['CATEGORY', 'TITLE']]
test_data = test[['CATEGORY', 'TITLE']]

# Save to files, with category and headline separated by a tab.
train_data.to_csv('train.txt', sep='\t', index=False, header=False)
valid_data.to_csv('valid.txt', sep='\t', index=False, header=False)
test_data.to_csv('test.txt', sep='\t', index=False, header=False)

# Check the number of examples per category.
train_counts = train['CATEGORY'].value_counts()
valid_counts = valid['CATEGORY'].value_counts()
test_counts = test['CATEGORY'].value_counts()

print("Training Data Counts:\n", train_counts)
print("Validation Data Counts:\n", valid_counts)
print("Test Data Counts:\n", test_counts)
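One caveat: the split above is purely random, so the category proportions in the three files can drift slightly apart. A minimal sketch of a stratified alternative (an assumption, not part of the commit), reusing shuffled_data from the script above:

# Hedged alternative: stratify both splits so each file keeps the same
# CATEGORY proportions (assumes shuffled_data from the script above).
from sklearn.model_selection import train_test_split

train_s, temp_s = train_test_split(
    shuffled_data, test_size=0.2, random_state=42,
    stratify=shuffled_data['CATEGORY'])
valid_s, test_s = train_test_split(
    temp_s, test_size=0.5, random_state=42,
    stratify=temp_s['CATEGORY'])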
@@ -0,0 +1,69 @@
"""
Feature extraction:
Extract features from the training, validation, and evaluation data and save
them as train.feature.txt, valid.feature.txt, and test.feature.txt.
Design whatever features seem useful for category classification;
the headline converted to a word sequence is a minimal baseline.
"""

"""
TfidfVectorizer converts text data into numeric vectors.
TF (Term Frequency): how often a word appears within a document.
    TF(t, d) = (count of t in d) / (total word count of d)
IDF (Inverse Document Frequency): how informative a word is across the whole
collection; words that appear in many documents get a low value.
    IDF(t) = log(total number of documents / number of documents containing t)
TF-IDF(t, d) = TF(t, d) x IDF(t)
(Note: the code below actually builds binary one-hot word features;
a TF-IDF variant is sketched after this file.)
"""
import collections
import re
import pandas as pd

# Preprocessing: strip punctuation, separate digits from letters,
# split on whitespace, and lowercase.
def Process(lines):
    # [, \, and ] are escaped so the character class is not closed early.
    sign_regex = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]')
    lines = sign_regex.sub("", lines)         # remove punctuation
    lines = re.sub(r"(\d+)", r" \1 ", lines)  # separate digits from letters
    texts = lines.split()                     # split on whitespace
    word_list = [word.lower() for word in texts]  # lowercase
    return word_list

# Build a word-to-index dictionary from words that occur more than once.
def MakeDict(name):
    with open(f"{name}.txt", "r") as f:
        lines = f.readlines()
    word_list = []
    for line in lines:
        word_list.extend(Process(line))
    counts = collections.Counter(word_list).most_common()
    word_dic = {}
    i = 0  # start indices at 0
    for word, count in counts:
        if count > 1:
            word_dic[word] = i
            i += 1
    return word_dic

# One-hot encoding: mark which dictionary words appear in the text.
def MakeOneHot(text):
    word_list = Process(text)
    base_list = [0] * (len(GlobalWordDict) + 1)  # dictionary size + 1 slots (the last slot stays 0)
    for word in word_list:
        if word in GlobalWordDict:
            base_list[GlobalWordDict[word]] = 1
    return base_list

# Feature extraction: one-hot encode every headline and save the result.
def MakeFeatureText(name):
    df = pd.read_csv(f"{name}.txt", sep='\t', header=None, names=['Category', 'Title'])
    df_2 = pd.DataFrame([MakeOneHot(title) for title in df["Title"]])
    df_3 = pd.concat([df, df_2], axis=1)
    df_3.to_csv(f"{name}.feature.txt", sep='\t', index=False)

# The dictionary is built from the training data only and shared globally.
GlobalWordDict = MakeDict("train")

# Extract features and save each split.
MakeFeatureText("train")
MakeFeatureText("test")
MakeFeatureText("valid")
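The docstring above explains TF-IDF, but the script builds binary one-hot features. As a hedged alternative (a minimal sketch, not the committed code), the same three files could be vectorized with scikit-learn's TfidfVectorizer, fit on the training headlines only:

# Hedged TF-IDF variant of the feature extraction above; assumes the
# tab-separated train/valid/test files produced by the first script.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def load_titles(name):
    return pd.read_csv(f"{name}.txt", sep='\t', header=None, names=['Category', 'Title'])

train_df = load_titles("train")
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['Title'])  # fit on training data only
X_valid = vectorizer.transform(load_titles("valid")['Title'])
X_test = vectorizer.transform(load_titles("test")['Title'])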
@@ -0,0 +1,25 @@
"""
1. Load the data.
2. Split features and labels:
   the Category column becomes the labels (y_train) and the remaining
   columns become the features (X_train).
3. Train a logistic regression model on the training data (X_train, y_train)
   using scikit-learn's LogisticRegression class.
4. Save the trained model with joblib.
"""
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression

# Load the data.
train_df = pd.read_csv('train.feature.txt', sep='\t')

# Split features and labels.
X_train = train_df.drop(columns=['Category', 'Title'])
y_train = train_df['Category']

# Train the logistic regression model.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Save the trained model.
joblib.dump(model, 'logistic_regression_model.pkl')
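The one-hot features are stored as dense 0/1 columns, which gets memory-hungry as the vocabulary grows. A hedged sketch (assuming X_train and y_train from the script above) converts them to a SciPy sparse matrix before fitting; LogisticRegression accepts sparse input directly:

# Hedged memory optimization: fit on a sparse matrix instead of the dense
# DataFrame (assumes X_train and y_train from the script above).
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression

X_train_sparse = csr_matrix(X_train.values)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_sparse, y_train)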
@@ -0,0 +1,62 @@
import collections
import re
import pandas as pd
import joblib

# Preprocessing: strip punctuation, separate digits from letters,
# split on whitespace, and lowercase.
def Process(lines):
    # [, \, and ] are escaped so the character class is not closed early.
    sign_regex = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]')
    lines = sign_regex.sub("", lines)         # remove punctuation
    lines = re.sub(r"(\d+)", r" \1 ", lines)  # separate digits from letters
    texts = lines.split()                     # split on whitespace
    return [word.lower() for word in texts]   # lowercase

# One-hot encoding against a given dictionary.
def MakeOneHot(text, word_dict):
    word_list = Process(text)
    base_list = [0] * (len(word_dict) + 1)  # dictionary size + 1 slots (the last slot stays 0)
    for word in word_list:
        if word in word_dict:
            base_list[word_dict[word]] = 1
    return base_list

# Rebuild the word dictionary the same way the feature extraction script did.
def LoadDict(filename):
    with open(filename, "r") as f:
        lines = f.readlines()
    word_list = []
    for line in lines:
        word_list.extend(Process(line))
    counts = collections.Counter(word_list).most_common()
    word_dict = {}
    i = 0
    for word, count in counts:
        if count > 1:
            word_dict[word] = i
            i += 1
    return word_dict

# Predict the category of a headline and the prediction probabilities.
def PredictCategoryAndProbability(headline, model, word_dict):
    one_hot_vector = MakeOneHot(headline, word_dict)
    one_hot_vector_df = pd.DataFrame([one_hot_vector])
    prediction = model.predict(one_hot_vector_df)
    prediction_proba = model.predict_proba(one_hot_vector_df)
    return prediction[0], prediction_proba[0]

# Load the dictionary.
GlobalWordDict = LoadDict("train.txt")

# Load the trained model.
model = joblib.load('logistic_regression_model.pkl')

# Example headline.
headline = "Example headline for testing"

# Predict.
category, probability = PredictCategoryAndProbability(headline, model, GlobalWordDict)
print(f"Predicted Category: {category}")
print(f"Prediction Probability: {probability}")
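predict_proba returns one probability per class, ordered like model.classes_, so zipping the two makes the output readable. A small sketch reusing model and probability from the script above:

# Pair each class label with its predicted probability (assumes the
# model and probability variables from the script above).
for label, p in zip(model.classes_, probability):
    print(f"{label}: {p:.4f}")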
@@ -0,0 +1,31 @@
import pandas as pd
from sklearn.metrics import accuracy_score
import joblib

# Load the data.
train_df = pd.read_csv('train.feature.txt', sep='\t')
valid_df = pd.read_csv('valid.feature.txt', sep='\t')

# Split features and labels.
X_train = train_df.drop(columns=['Category', 'Title'])
y_train = train_df['Category']

X_valid = valid_df.drop(columns=['Category', 'Title'])
y_valid = valid_df['Category']

# Load the trained model.
model = joblib.load('logistic_regression_model.pkl')

# Measure accuracy on the training data.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Training Accuracy: {train_accuracy:.4f}')

# Measure accuracy on the validation data.
y_valid_pred = model.predict(X_valid)
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f'Validation Accuracy: {valid_accuracy:.4f}')

# Training Accuracy: 0.9898
# Validation Accuracy: 0.8988
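Equivalently, the estimator's built-in scorer computes accuracy in one call; a sketch reusing the variables from the script above:

# One-liners via the estimator's default accuracy scorer
# (assumes model, X_train, y_train, X_valid, y_valid from above).
print(f"Training Accuracy: {model.score(X_train, y_train):.4f}")
print(f"Validation Accuracy: {model.score(X_valid, y_valid):.4f}")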
@@ -0,0 +1,49 @@
import pandas as pd
from sklearn.metrics import confusion_matrix
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data.
train_df = pd.read_csv('train.feature.txt', sep='\t')
valid_df = pd.read_csv('valid.feature.txt', sep='\t')

# Split features and labels.
X_train = train_df.drop(columns=['Category', 'Title'])
y_train = train_df['Category']

X_valid = valid_df.drop(columns=['Category', 'Title'])
y_valid = valid_df['Category']

# Load the trained model.
model = joblib.load('logistic_regression_model.pkl')

# Predict on the training and validation data.
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)

# Build the confusion matrices.
labels = y_train.unique()
conf_matrix_train = confusion_matrix(y_train, y_train_pred, labels=labels)
conf_matrix_valid = confusion_matrix(y_valid, y_valid_pred, labels=labels)

# Convert the confusion matrices to DataFrames.
conf_matrix_train_df = pd.DataFrame(data=conf_matrix_train, index=labels, columns=labels)
conf_matrix_valid_df = pd.DataFrame(data=conf_matrix_valid, index=labels, columns=labels)

# Plot and save a confusion matrix heatmap.
def plot_confusion_matrix(conf_matrix, title, filename):
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, square=True, cbar=True, annot=True, cmap='Blues', fmt='d')
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(title)
    plt.savefig(filename)
    plt.close()

# Confusion matrix on the training data.
plot_confusion_matrix(conf_matrix_train_df, 'Confusion Matrix - Training Data', 'Train_confusion.png')

# Confusion matrix on the validation data.
plot_confusion_matrix(conf_matrix_valid_df, 'Confusion Matrix - Validation Data', 'Validation_confusion.png')
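With imbalanced categories, row-normalized counts make per-class recall easier to read. A hedged variant of the call above (reusing y_valid, y_valid_pred, and labels); plotting the result would need fmt='.2f' instead of fmt='d' in the heatmap call:

# Hedged variant: normalize each row so cells show per-class recall
# (assumes y_valid, y_valid_pred, and labels from the script above).
conf_matrix_valid_norm = confusion_matrix(
    y_valid, y_valid_pred, labels=labels, normalize='true')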
@@ -0,0 +1,33 @@
import pandas as pd
import joblib
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Load the feature data and labels.
# (Artifact names are aligned with the earlier scripts in this chapter,
# which produce test.feature.txt and logistic_regression_model.pkl.)
test_df = pd.read_csv('test.feature.txt', sep='\t')
X_test = test_df.drop(columns=['Category', 'Title'])
y_test = test_df['Category']

# Load the model.
clf = joblib.load('logistic_regression_model.pkl')

# Predictions on the evaluation data.
y_test_pred = clf.predict(X_test)

# Measure and display precision, recall, and F1 score per category.
report = classification_report(y_test, y_test_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print(report_df)

# Micro and macro averages of precision, recall, and F1 score.
# (classification_report reports 'accuracy' instead of a 'micro avg' entry
# when every label appears, so the micro average is computed directly.)
micro_p, micro_r, micro_f, _ = precision_recall_fscore_support(y_test, y_test_pred, average='micro')
print("Micro Average:")
print(f"Precision: {micro_p:.4f}")
print(f"Recall: {micro_r:.4f}")
print(f"F1 Score: {micro_f:.4f}")

print("\nMacro Average:")
print(f"Precision: {report['macro avg']['precision']:.4f}")
print(f"Recall: {report['macro avg']['recall']:.4f}")
print(f"F1 Score: {report['macro avg']['f1-score']:.4f}")
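For single-label multiclass data, the micro-averaged precision, recall, and F1 all coincide with plain accuracy, which gives a quick consistency check (a sketch reusing y_test, y_test_pred, and micro_f from the script above):

# Sanity check: micro-averaged F1 equals accuracy for single-label
# multiclass data (assumes y_test, y_test_pred, micro_f from above).
from sklearn.metrics import accuracy_score
assert abs(micro_f - accuracy_score(y_test, y_test_pred)) < 1e-12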
@@ -0,0 +1,33 @@
import joblib
import numpy as np

# Load the saved model and vectorizer.
# (This script assumes a TF-IDF pipeline in which the model and the fitted
# vectorizer were saved as model.joblib and tfidf_vectorizer.pkl; with the
# one-hot features used earlier in this chapter, the feature "names" would
# instead be the words of the training dictionary.)
clf = joblib.load('model.joblib')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Get the feature weights for each category.
feature_names = vectorizer.get_feature_names_out()
coefs = clf.coef_

# For each category, show the ten features with the highest weights and
# the ten with the lowest weights.
for i, category in enumerate(clf.classes_):
    print(f"Category: {category}")

    # Top 10 features with the highest weights (largest first).
    top10_indices = np.argsort(coefs[i])[-10:][::-1]
    top10_features = feature_names[top10_indices]
    top10_weights = coefs[i][top10_indices]
    print("Top 10 positive features:")
    for feature, weight in zip(top10_features, top10_weights):
        print(f"{feature}: {weight:.4f}")

    # Top 10 features with the lowest weights.
    bottom10_indices = np.argsort(coefs[i])[:10]
    bottom10_features = feature_names[bottom10_indices]
    bottom10_weights = coefs[i][bottom10_indices]
    print("Top 10 negative features:")
    for feature, weight in zip(bottom10_features, bottom10_weights):
        print(f"{feature}: {weight:.4f}")

    print("\n")