Skip to content

Commit

Permalink
added ch06
Browse files Browse the repository at this point in the history
  • Loading branch information
kexinb426 committed Jun 10, 2024
1 parent 00cd55d commit 4daf09a
Show file tree
Hide file tree
Showing 11 changed files with 518 additions and 0 deletions.
2 changes: 2 additions & 0 deletions kexinb/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
/data/
/output/
63 changes: 63 additions & 0 deletions kexinb/chapter06/knock50.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# task50. データの入手・整形
'''
The task is to classify a given news headline to one of the following categories:
“Business”, “Science”, “Entertainment” and “Health”
News Aggregator Data Setをダウンロードし、以下の要領で学習データ(train.txt), 検証データ(valid.txt),評価データ(test.txt)を作成せよ:
ダウンロードしたzipファイルを解凍し,readme.txtの説明を読む.
情報源(publisher)が”Reuters”, “Huffington Post”, “Businessweek”, “Contactmusic.com”,
“Daily Mail”の事例(記事)のみを抽出する.
抽出された事例をランダムに並び替える.
抽出された事例の80%を学習データ,残りの10%ずつを検証データと評価データに分割し,
それぞれtrain.txt,valid.txt,test.txtというファイル名で保存する.ファイルには,1行に1事例を書き出すこととし,
カテゴリ名と記事見出しのタブ区切り形式とせよ(このファイルは後に問題70で再利用する).
学習データと評価データを作成したら,各カテゴリの事例数を確認せよ.
'''
import pandas as pd
from sklearn.model_selection import train_test_split

import os  # local to this script section: only needed to create the output directory

# Column names for newsCorpora.csv; the file is read with header=None, so
# these are assigned positionally.
header_name = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
# Only articles from these publishers are kept.
extract_list = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']

# NOTE(review): with pandas' default quote handling, headlines containing an
# unbalanced double quote may swallow following rows; if the extracted row
# count looks low, consider quoting=csv.QUOTE_NONE -- confirm against the raw file.
df = pd.read_csv('data/newsCorpora.csv', header=None, sep='\t',
                 names=header_name)

# Keep only the selected publishers and the two columns written out below.
# NOTE(review): the task text asks for "category, headline" order; the files
# are written as headline, category here -- readers of these files must use
# the same order.
df_ex = df.loc[df['PUBLISHER'].isin(extract_list), ['TITLE', 'CATEGORY']]

# 80% train, then the remaining 20% is halved into validation and test.
# train_test_split shuffles the rows itself; random_state makes it repeatable.
data_train, data_other = train_test_split(df_ex, test_size=0.2, random_state=42)
data_valid, data_test = train_test_split(data_other, test_size=0.5, random_state=42)

# The output tree is git-ignored, so it does not exist on a fresh checkout;
# create it before writing, otherwise to_csv fails on the missing directory.
os.makedirs("output/ch6", exist_ok=True)

# One example per line, tab-separated, no header row and no index column.
data_train.to_csv("output/ch6/train.txt", sep="\t", index=False, header=False)
data_valid.to_csv("output/ch6/valid.txt", sep="\t", index=False, header=False)
data_test.to_csv("output/ch6/test.txt", sep="\t", index=False, header=False)

if __name__ == "__main__":
    # Report the number of examples per category in each split.
    print("train_data")
    print(data_train['CATEGORY'].value_counts())
    print("valid_data")
    print(data_valid['CATEGORY'].value_counts())
    print("test_data")
    print(data_test['CATEGORY'].value_counts())

'''
train_data
CATEGORY
b 4538
e 4228
t 1205
m 701
Name: count, dtype: int64
valid_data
CATEGORY
b 531
e 529
t 155
m 119
Name: count, dtype: int64
test_data
CATEGORY
b 558
e 522
t 164
m 90
Name: count, dtype: int64
'''
52 changes: 52 additions & 0 deletions kexinb/chapter06/knock51.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# task51. 特徴量抽出
'''
学習データ, 検証データ, 評価データから特徴量を抽出し,
それぞれtrain.feature.txt, valid.feature.txt, test.feature.txtというファイル名で保存せよ.
なお, カテゴリ分類に有用そうな特徴量は各自で自由に設計せよ.
記事の見出しを単語列に変換したものが最低限のベースラインとなるであろう.
'''

import pandas as pd
import string

from sklearn.feature_extraction.text import CountVectorizer # BoW
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf

def preprocess(text):
    """Normalise a headline for feature extraction.

    Removes every character listed in ``string.punctuation``, lower-cases
    the result, and then drops every character for which ``str.isdigit()``
    is true. Whitespace is left untouched.

    Args:
        text: The raw headline string.

    Returns:
        The cleaned string.
    """
    # Mapping a code point to None in a translation table deletes it.
    drop_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered = text.translate(drop_punctuation).lower()
    return ''.join(ch for ch in lowered if not ch.isdigit())

# Load the three splits (tab-separated, no header row: headline, category).
header_name = ['TITLE', 'CATEGORY']
train = pd.read_csv('output/ch6/train.txt', header=None, sep='\t', names=header_name)
valid = pd.read_csv('output/ch6/valid.txt', header=None, sep='\t', names=header_name)
test = pd.read_csv('output/ch6/test.txt', header=None, sep='\t', names=header_name)

# Apply the same text normalisation to the headlines of every split.
train_titles = train['TITLE'].apply(preprocess)
valid_titles = valid['TITLE'].apply(preprocess)
test_titles = test['TITLE'].apply(preprocess)

# TF-IDF over unigrams and bigrams; terms seen in fewer than 10 documents
# of the fitting data are dropped.
vectorizer = TfidfVectorizer(min_df=10, ngram_range=(1, 2))

# Fit the vocabulary and IDF weights on the training split ONLY, then apply
# the fitted vectorizer to the held-out splits. Fitting on train+valid would
# leak validation headlines into the features the model is later tuned and
# evaluated on.
train_f = vectorizer.fit_transform(train_titles)
valid_f = vectorizer.transform(valid_titles)
test_f = vectorizer.transform(test_titles)

# Densify and label the columns with the learned feature names so the saved
# files carry a header row.
feature_names = vectorizer.get_feature_names_out()
train_vec = pd.DataFrame(train_f.toarray(), columns=feature_names)
valid_vec = pd.DataFrame(valid_f.toarray(), columns=feature_names)
test_vec = pd.DataFrame(test_f.toarray(), columns=feature_names)

# Tab-separated, header row of feature names, no index column.
train_vec.to_csv("output/ch6/train.feature.txt", sep="\t", index=False)
valid_vec.to_csv("output/ch6/valid.feature.txt", sep="\t", index=False)
test_vec.to_csv("output/ch6/test.feature.txt", sep="\t", index=False)
22 changes: 22 additions & 0 deletions kexinb/chapter06/knock52.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# task52. 学習
# 51で構築した学習データを用いて,ロジスティック回帰モデルを学習せよ.

import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle

# Feature matrix: tab-separated, first row holds the feature names.
X_train = pd.read_table("output/ch6/train.feature.txt")

# Gold labels: the CATEGORY column of the headerless headline/category file.
train_examples = pd.read_table("output/ch6/train.txt", header=None,
                               names=['TITLE', 'CATEGORY'])
Y_train = train_examples['CATEGORY']

# Logistic regression with a fixed seed and a raised iteration cap.
lr = LogisticRegression(random_state=42, max_iter=1000)
lr.fit(X_train, Y_train)

# Persist the fitted model for the later prediction/evaluation scripts.
with open("output/ch6/logreg.pkl", "wb") as model_file:
    pickle.dump(lr, model_file)

25 changes: 25 additions & 0 deletions kexinb/chapter06/knock53.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# task53. 予測
# 52で学習したロジスティック回帰モデルを用い,
# 与えられた記事見出しからカテゴリとその予測確率を計算するプログラムを実装せよ.

import pickle
import numpy as np
import pandas as pd

def score_lr(lr, x):  # -> [max.prob, pred.label]
    """Score samples with a fitted classifier.

    Args:
        lr: Object exposing ``predict_proba(x)`` and ``predict(x)``.
        x: Feature rows, passed unchanged to both methods.

    Returns:
        A two-element list: the row-wise maximum of ``predict_proba(x)``
        followed by the result of ``predict(x)``.
    """
    class_probabilities = lr.predict_proba(x)
    top_probability = np.max(class_probabilities, axis=1)
    predicted_label = lr.predict(x)
    return [top_probability, predicted_label]

# Load the pickled model. A context manager guarantees the file handle is
# closed (the bare open() previously passed to pickle.load was never closed).
# NOTE: unpickling can execute arbitrary code -- only load model files that
# were produced locally by the training script.
with open("output/ch6/logreg.pkl", 'rb') as model_file:
    lr = pickle.load(model_file)

# Feature tables are tab-separated with a header row of feature names.
X_train = pd.read_table("output/ch6/train.feature.txt")
X_test = pd.read_table("output/ch6/test.feature.txt")

# Each result is [max probability per row, predicted label per row].
train_pred = score_lr(lr, X_train)
test_pred = score_lr(lr, X_test)

if __name__ == "__main__":
    print(train_pred)

# [array([0.91230316, 0.4089398 , 0.65182876, ..., 0.88564153, 0.9431732 ,
# 0.90615187]), array(['b', 'e', 'e', ..., 'e', 'b', 'b'], dtype=object)]
26 changes: 26 additions & 0 deletions kexinb/chapter06/knock54.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# task54. 正解率の計測
# 52で学習したロジスティック回帰モデルの正解率を,学習データおよび評価データ上で計測せよ.

import pandas as pd
from sklearn.metrics import accuracy_score
from knock53 import train_pred, test_pred # [max.prob, pred.label]


# Column names for the headerless, tab-separated headline/category files.
# Underscore-prefixed so a star-import of this module does not pick it up.
_COLUMNS = ['TITLE', 'CATEGORY']

train = pd.read_table("output/ch6/train.txt", header=None, names=_COLUMNS)
test = pd.read_table("output/ch6/test.txt", header=None, names=_COLUMNS)

# accuracy_score: fraction of correctly classified samples (float)
# (normalize=False -> number of correctly classified samples (int))
# Index 1 of each prediction pair holds the predicted labels.
train_acc = accuracy_score(y_true=train["CATEGORY"], y_pred=train_pred[1])
test_acc = accuracy_score(y_true=test["CATEGORY"], y_pred=test_pred[1])

if __name__ == "__main__":
    print(f"Training Accuracy: {train_acc:.3f}")
    print(f"Test Accuracy: {test_acc:.3f}")

# Training Accuracy: 0.919
# Test Accuracy: 0.862
44 changes: 44 additions & 0 deletions kexinb/chapter06/knock55.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# task 55. 混同行列の作成
# 52で学習したロジスティック回帰モデルの混同行列(confusion matrix)を
# 学習データおよび評価データ上で作成せよ

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from knock53 import train_pred, test_pred


# Gold labels for the train and test splits (headerless, tab-separated).
train = pd.read_csv("output/ch6/train.txt", sep='\t', header=None,
                    names=['TITLE', 'CATEGORY'])
test = pd.read_csv("output/ch6/test.txt", sep='\t', header=None,
                   names=['TITLE', 'CATEGORY'])

# Fix the category order once so both matrices, and the tick labels drawn on
# the heatmaps, are guaranteed to line up.
labels = sorted(train["CATEGORY"].unique())

# Rows are true categories, columns are predicted categories.
# Index 1 of each prediction pair holds the predicted labels.
train_con = confusion_matrix(train["CATEGORY"], train_pred[1], labels=labels)
test_con = confusion_matrix(test["CATEGORY"], test_pred[1], labels=labels)

if __name__ == "__main__":
    print("Confusion Matrix (Train)")
    print(train_con)
    print("Confusion Matrix (Test)")
    print(test_con)

# Recorded output of an earlier run:
#
# Confusion Matrix (Train)
# [[4368 101 9 60]
# [ 64 4153 2 9]
# [ 95 140 455 11]
# [ 208 154 8 835]]
# Confusion Matrix (Test)
# [[521 18 4 15]
# [ 19 497 0 6]
# [ 24 20 44 2]
# [ 46 28 2 88]]

# Render each matrix as an annotated heatmap. fmt="d" prints the integer
# counts as-is; seaborn's default ".2g" format would show e.g. 4368 as
# "4.4e+03". Tick labels name the categories instead of showing 0..3.
for matrix, image_path in (
    (train_con, "output/ch6/train_confusion_matrix.png"),
    (test_con, "output/ch6/test_confusion_matrix.png"),
):
    plt.clf()  # start from an empty figure so the plots do not overlap
    ax = sns.heatmap(matrix, annot=True, fmt="d", cmap="Greens",
                     xticklabels=labels, yticklabels=labels)
    ax.set_xlabel("Predicted category")
    ax.set_ylabel("True category")
    plt.savefig(image_path)
25 changes: 25 additions & 0 deletions kexinb/chapter06/knock56.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# task 56. 適合率,再現率,F1スコアの計測
# 52で学習したロジスティック回帰モデルの適合率,再現率,F1スコアを,評価データ上で計測せよ.
# カテゴリごとに適合率,再現率,F1スコアを求め,
# カテゴリごとの性能をマイクロ平均(micro-average)とマクロ平均(macro-average)で統合せよ

from sklearn.metrics import classification_report
from knock54 import *

if __name__ == "__main__":
    # Per-category precision / recall / F1 on the test split, plus the
    # averaged rows produced by classification_report. Index 1 of the
    # prediction pair holds the predicted labels.
    report = classification_report(test["CATEGORY"], test_pred[1])
    print(report)

'''
Training Accuracy: 0.919
Test Accuracy: 0.862
precision recall f1-score support
b 0.85 0.93 0.89 558
e 0.88 0.95 0.92 522
m 0.88 0.49 0.63 90
t 0.79 0.54 0.64 164
accuracy 0.86 1334
macro avg 0.85 0.73 0.77 1334
weighted avg 0.86 0.86 0.85 1334
'''
Loading

0 comments on commit 4daf09a

Please sign in to comment.