Showing 11 changed files with 518 additions and 0 deletions.
# .gitignore: keep the downloaded data and the generated output out of version control
/data/
/output/
# task 50. Obtaining and shaping the data
'''
The task is to classify a given news headline into one of the following categories:
"Business", "Science", "Entertainment", and "Health".
Download the News Aggregator Data Set and build the training data (train.txt),
validation data (valid.txt), and evaluation data (test.txt) as follows:
1. Unzip the downloaded zip file and read the description in readme.txt.
2. Extract only the cases (articles) whose publisher is "Reuters", "Huffington Post",
   "Businessweek", "Contactmusic.com", or "Daily Mail".
3. Shuffle the extracted cases randomly.
4. Split the extracted cases 80% / 10% / 10% into training, validation, and evaluation
   data, and save them as train.txt, valid.txt, and test.txt, respectively.
   Write one case per line, with the category name and the article headline separated
   by a tab (these files will be reused in problem 70).
After creating the training and evaluation data, check the number of cases per category.
'''
import os

import pandas as pd
from sklearn.model_selection import train_test_split

header_name = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
extract_list = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']

df = pd.read_csv('data/newsCorpora.csv', header=None, sep='\t',
                 names=header_name)

# Keep only the publishers listed in the task, and only the columns we need
df_ex = df.loc[df['PUBLISHER'].isin(extract_list), ['TITLE', 'CATEGORY']]

# 80% train, then split the remaining 20% evenly into validation and test;
# train_test_split shuffles by default, which covers the "shuffle randomly" step
data_train, data_other = train_test_split(df_ex, test_size=0.2, random_state=42)
data_valid, data_test = train_test_split(data_other, test_size=0.5, random_state=42)

os.makedirs("output/ch6", exist_ok=True)  # make sure the output directory exists
data_train.to_csv("output/ch6/train.txt", sep="\t", index=False, header=False)
data_valid.to_csv("output/ch6/valid.txt", sep="\t", index=False, header=False)
data_test.to_csv("output/ch6/test.txt", sep="\t", index=False, header=False)

if __name__ == "__main__":
    print("train_data")
    print(data_train['CATEGORY'].value_counts())
    print("valid_data")
    print(data_valid['CATEGORY'].value_counts())
    print("test_data")
    print(data_test['CATEGORY'].value_counts())
'''
train_data
CATEGORY
b    4538
e    4228
t    1205
m     701
Name: count, dtype: int64
valid_data
CATEGORY
b    531
e    529
t    155
m    119
Name: count, dtype: int64
test_data
CATEGORY
b    558
e    522
t    164
m     90
Name: count, dtype: int64
'''
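The category counts above are noticeably skewed toward b and e. If you want every split to preserve these proportions, train_test_split accepts a stratify argument; a minimal sketch of that variant, assuming the same df_ex as above:

# Stratify on the category column so each split keeps the same class proportions
data_train, data_other = train_test_split(
    df_ex, test_size=0.2, random_state=42, stratify=df_ex['CATEGORY'])
data_valid, data_test = train_test_split(
    data_other, test_size=0.5, random_state=42, stratify=data_other['CATEGORY'])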
# task 51. Feature extraction
'''
Extract features from the training, validation, and evaluation data, and save them as
train.feature.txt, valid.feature.txt, and test.feature.txt, respectively.
Feel free to design any features that seem useful for category classification;
the article headline converted into a sequence of words would be the minimal baseline.
'''
import string

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer  # CountVectorizer would give plain BoW instead


def preprocess(text):
    """Strip punctuation and digits, and lowercase the headline."""
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.lower()
    text = ''.join([i for i in text if not i.isdigit()])
    return text


# Load the splits produced in task 50
header_name = ['TITLE', 'CATEGORY']
train = pd.read_csv('output/ch6/train.txt', header=None, sep='\t', names=header_name)
valid = pd.read_csv('output/ch6/valid.txt', header=None, sep='\t', names=header_name)
test = pd.read_csv('output/ch6/test.txt', header=None, sep='\t', names=header_name)

# Concatenate the splits so preprocessing is applied uniformly
df = pd.concat([train, valid, test], axis=0).reset_index(drop=True)
df['TITLE'] = df['TITLE'].apply(preprocess)

# Split back: train+valid are used to fit the vectorizer, test is transform-only
train_valid_d = df[:len(train) + len(valid)]
test_d = df[len(train) + len(valid):]

# Unigram + bigram TF-IDF, ignoring terms that appear in fewer than 10 headlines
vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1, 2))

# Fit on train+valid, then transform test with the same vocabulary
train_valid_f = vectorizer.fit_transform(train_valid_d["TITLE"])
test_f = vectorizer.transform(test_d["TITLE"])

# Convert to DataFrames and save with the vocabulary terms as headers
train_valid_vec = pd.DataFrame(train_valid_f.toarray(), columns=vectorizer.get_feature_names_out())
test_vec = pd.DataFrame(test_f.toarray(), columns=vectorizer.get_feature_names_out())

train_vec = train_valid_vec[:len(train)]
valid_vec = train_valid_vec[len(train):]

train_vec.to_csv("output/ch6/train.feature.txt", sep="\t", index=False)
valid_vec.to_csv("output/ch6/valid.feature.txt", sep="\t", index=False)
test_vec.to_csv("output/ch6/test.feature.txt", sep="\t", index=False)
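One thing this script does not persist is the fitted vectorizer itself, so the later tasks can only score the precomputed feature files. A minimal sketch of saving it alongside the features, assuming pickle and the vectorizer above (the path output/ch6/vectorizer.pkl is an arbitrary choice):

import pickle

# Persist the fitted vectorizer so task 53 can featurize arbitrary new headlines
with open("output/ch6/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)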
# task 52. Training
# Train a logistic regression model on the training data built in task 51.
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model; the default 100 iterations may not be
# enough to converge on this many TF-IDF features, hence max_iter=1000
lr = LogisticRegression(random_state=42, max_iter=1000)

# Load the training features (with headers) and the gold labels
X_train = pd.read_csv("output/ch6/train.feature.txt", sep='\t')
Y_train = pd.read_csv("output/ch6/train.txt", sep='\t', header=None,
                      names=['TITLE', 'CATEGORY'])['CATEGORY']

# Train the model
lr.fit(X_train, Y_train)

# Save the trained model
with open("output/ch6/logreg.pkl", "wb") as f:
    pickle.dump(lr, f)
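A quick sanity check on the saved model is to load it back and inspect the classes it learned; a minimal sketch, assuming the logreg.pkl written above:

import pickle

with open("output/ch6/logreg.pkl", "rb") as f:
    lr = pickle.load(f)

# The dataset's four categories: b = business, e = entertainment,
# m = health, t = science and technology
print(lr.classes_)  # e.g. ['b' 'e' 'm' 't']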
# task 53. Prediction
# Using the logistic regression model trained in task 52, implement a program that
# computes the category and its prediction probability for a given article headline.
import pickle

import numpy as np
import pandas as pd


def score_lr(lr, x):
    """Return [max probability per sample, predicted label per sample]."""
    return [np.max(lr.predict_proba(x), axis=1), lr.predict(x)]


# Load the trained model
with open("output/ch6/logreg.pkl", "rb") as f:
    lr = pickle.load(f)

X_train = pd.read_table("output/ch6/train.feature.txt")
X_test = pd.read_table("output/ch6/test.feature.txt")

train_pred = score_lr(lr, X_train)
test_pred = score_lr(lr, X_test)

if __name__ == "__main__":
    print(train_pred)

# [array([0.91230316, 0.4089398 , 0.65182876, ..., 0.88564153, 0.9431732 ,
#         0.90615187]), array(['b', 'e', 'e', ..., 'e', 'b', 'b'], dtype=object)]
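Strictly, the task asks for predictions on a given headline, not only on the precomputed feature files; that needs the fitted vectorizer from task 51 (see the persistence sketch there). A minimal sketch, assuming vectorizer.pkl was saved and reusing the same preprocessing as task 51 (the example headline is arbitrary):

import pickle
import string

def preprocess(text):
    # Same preprocessing as task 51: drop punctuation and digits, lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))
    return ''.join(c for c in text.lower() if not c.isdigit())

with open("output/ch6/vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

headline = "Fed official says weak data caused by weather"
features = vectorizer.transform([preprocess(headline)])
prob, label = score_lr(lr, features)  # score_lr and lr as defined above
print(label[0], prob[0])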
# task 54. Measuring accuracy
# Measure the accuracy of the logistic regression model trained in task 52
# on the training data and on the evaluation data.
import pandas as pd
from sklearn.metrics import accuracy_score

from knock53 import train_pred, test_pred  # [max.prob, pred.label]

train = pd.read_csv("output/ch6/train.txt", sep='\t', header=None,
                    names=['TITLE', 'CATEGORY'])
test = pd.read_csv("output/ch6/test.txt", sep='\t', header=None,
                   names=['TITLE', 'CATEGORY'])

'''
accuracy_score: fraction of correctly classified samples (float)
(normalize=False -> number of correctly classified samples (int))
'''
train_acc = accuracy_score(train["CATEGORY"], train_pred[1])
test_acc = accuracy_score(test["CATEGORY"], test_pred[1])

if __name__ == "__main__":
    print(f"Training Accuracy: {train_acc:.3f}")
    print(f"Test Accuracy: {test_acc:.3f}")

# Training Accuracy: 0.919
# Test Accuracy: 0.862
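As a sanity check on what accuracy_score computes, the same number falls out of a plain element-wise comparison; a minimal sketch, assuming the arrays above:

import numpy as np

# Accuracy is just the mean of the per-sample correctness indicator
manual_test_acc = np.mean(test["CATEGORY"].to_numpy() == test_pred[1])
print(f"Manual Test Accuracy: {manual_test_acc:.3f}")  # matches accuracy_score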
# task 55. Creating the confusion matrix
# Create the confusion matrix of the logistic regression model trained in task 52,
# on both the training data and the evaluation data.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

from knock53 import train_pred, test_pred

train = pd.read_csv("output/ch6/train.txt", sep='\t', header=None,
                    names=['TITLE', 'CATEGORY'])
test = pd.read_csv("output/ch6/test.txt", sep='\t', header=None,
                   names=['TITLE', 'CATEGORY'])

# Rows are gold labels, columns are predictions, in sorted label order (b, e, m, t)
train_con = confusion_matrix(train["CATEGORY"], train_pred[1])
test_con = confusion_matrix(test["CATEGORY"], test_pred[1])

if __name__ == "__main__":
    print("Confusion Matrix (Train)")
    print(train_con)
    print("Confusion Matrix (Test)")
    print(test_con)

'''
Confusion Matrix (Train)
[[4368  101    9   60]
 [  64 4153    2    9]
 [  95  140  455   11]
 [ 208  154    8  835]]
Confusion Matrix (Test)
[[521  18   4  15]
 [ 19 497   0   6]
 [ 24  20  44   2]
 [ 46  28   2  88]]
'''

# annot=True writes the counts into the cells; fmt='d' keeps them as plain
# integers instead of the default scientific notation for large counts
sns.heatmap(train_con, annot=True, fmt='d', cmap="Greens")
plt.savefig("output/ch6/train_confusion_matrix.png")
plt.clf()
sns.heatmap(test_con, annot=True, fmt='d', cmap="Greens")
plt.savefig("output/ch6/test_confusion_matrix.png")
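The heatmaps above label the axes 0-3 rather than with the category letters. A minimal sketch of a labeled variant, assuming the matrices above and that the labels follow confusion_matrix's sorted order (the output filename is arbitrary):

labels = sorted(train["CATEGORY"].unique())  # ['b', 'e', 'm', 't']

plt.clf()
sns.heatmap(test_con, annot=True, fmt='d', cmap="Greens",
            xticklabels=labels, yticklabels=labels)
plt.xlabel("Predicted")
plt.ylabel("Gold")
plt.savefig("output/ch6/test_confusion_matrix_labeled.png")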
# task 56. Measuring precision, recall, and F1 score
# Measure the precision, recall, and F1 score of the logistic regression model
# trained in task 52 on the evaluation data. Compute precision, recall, and F1
# for each category, then aggregate the per-category results with the
# micro-average and the macro-average.
from sklearn.metrics import classification_report

from knock54 import *  # brings in test and test_pred via task 54

if __name__ == "__main__":
    print(classification_report(test["CATEGORY"], test_pred[1]))

'''
Training Accuracy: 0.919
Test Accuracy: 0.862
              precision    recall  f1-score   support
           b       0.85      0.93      0.89       558
           e       0.88      0.95      0.92       522
           m       0.88      0.49      0.63        90
           t       0.79      0.54      0.64       164
    accuracy                           0.86      1334
   macro avg       0.85      0.73      0.77      1334
weighted avg       0.86      0.86      0.85      1334
'''
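classification_report shows the macro and weighted averages, but for multi-class single-label data it reports the micro-average row as accuracy (the two coincide). To compute the micro- and macro-averaged scores explicitly, as the task asks, precision_recall_fscore_support can be called directly; a minimal sketch, assuming test and test_pred from above:

from sklearn.metrics import precision_recall_fscore_support

for avg in ("micro", "macro"):
    # With average set, the support element of the returned tuple is None
    p, r, f1, _ = precision_recall_fscore_support(
        test["CATEGORY"], test_pred[1], average=avg)
    print(f"{avg}: precision={p:.3f} recall={r:.3f} f1={f1:.3f}")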