Showing 16 changed files with 14,425 additions and 0 deletions.
@@ -0,0 +1,104 @@
New notebook (one code cell): load the News Aggregator corpus, filter it to five publishers, shuffle, split it 80/10/10 into train/validation/test sets, and write each split to a tab-separated file.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the data
data_path = '/Users/wenda/Desktop/news+aggregator/newsCorpora.csv'
columns = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df = pd.read_csv(data_path, sep='\t', names=columns)

# Keep only the specified news sources
publishers = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
df = df[df['PUBLISHER'].isin(publishers)]

# Shuffle the data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the dataset: 80% train, then split the remainder evenly into validation and test, stratified by category
train, temp = train_test_split(df, test_size=0.2, random_state=42, stratify=df['CATEGORY'])
valid, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['CATEGORY'])

# Save each split to a file (category and title only, tab-separated, no header)
def save_to_file(df, file_path):
    df[['CATEGORY', 'TITLE']].to_csv(file_path, sep='\t', index=False, header=False)

save_to_file(train, 'train.txt')
save_to_file(valid, 'valid.txt')
save_to_file(test, 'test.txt')

# Number of examples per category in each split
print("Training data category distribution:")
print(train['CATEGORY'].value_counts())
print("\nValidation data category distribution:")
print(valid['CATEGORY'].value_counts())
print("\nTest data category distribution:")
print(test['CATEGORY'].value_counts())

Output:

Training data category distribution:
b    4502
e    4223
t    1219
m     728
Name: CATEGORY, dtype: int64

Validation data category distribution:
b    562
e    528
t    153
m     91
Name: CATEGORY, dtype: int64

Test data category distribution:
b    563
e    528
t    152
m     91
Name: CATEGORY, dtype: int64
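A quick sanity check on the split sizes is to reload the three files just written and compare row counts. This is a minimal sketch, not part of the commit, assuming train.txt, valid.txt, and test.txt sit in the working directory.

import pandas as pd

# Reload the saved splits (category and title, tab-separated, no header)
splits = {name: pd.read_csv(f'{name}.txt', sep='\t', header=None, names=['Category', 'Title'])
          for name in ['train', 'valid', 'test']}

# Report each split's size and its share of the filtered corpus
total = sum(len(df) for df in splits.values())
for name, df in splits.items():
    print(f"{name}: {len(df)} rows ({len(df) / total:.1%})")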
@@ -0,0 +1,70 @@
New notebook (one code cell): convert the saved titles into Bag-of-Words features with CountVectorizer and write the feature matrices out as tab-separated files.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the data
train_df = pd.read_csv('train.txt', sep='\t', header=None, names=['Category', 'Title'])
valid_df = pd.read_csv('valid.txt', sep='\t', header=None, names=['Category', 'Title'])
test_df = pd.read_csv('test.txt', sep='\t', header=None, names=['Category', 'Title'])

# Convert titles to Bag-of-Words features with CountVectorizer (fit on the training data only)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['Title'])
X_valid = vectorizer.transform(valid_df['Title'])
X_test = vectorizer.transform(test_df['Title'])

# Convert the feature matrices to DataFrames
train_features = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())
valid_features = pd.DataFrame(X_valid.toarray(), columns=vectorizer.get_feature_names_out())
test_features = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())

# Prepend the category column to the feature data
train_features.insert(0, 'Category', train_df['Category'])
valid_features.insert(0, 'Category', valid_df['Category'])
test_features.insert(0, 'Category', test_df['Category'])

# Save the data
train_features.to_csv('train.feature.txt', sep='\t', index=False)
valid_features.to_csv('valid.feature.txt', sep='\t', index=False)
test_features.to_csv('test.feature.txt', sep='\t', index=False)
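One caveat with this cell: calling toarray() and writing the matrices as dense TSV makes the feature files large, since the vocabulary can easily run into thousands of columns. A sketch of an alternative, not part of this commit, is to keep the matrices sparse and persist them with scipy, reusing X_train, X_valid, X_test, and vectorizer from the cell above.

from scipy import sparse
import joblib

# Persist the sparse feature matrices and the fitted vectorizer instead of dense TSV files
sparse.save_npz('train.feature.npz', X_train)
sparse.save_npz('valid.feature.npz', X_valid)
sparse.save_npz('test.feature.npz', X_test)
joblib.dump(vectorizer, 'vectorizer.pkl')

# Later, reload without re-fitting the vocabulary
X_train_loaded = sparse.load_npz('train.feature.npz')
vectorizer_loaded = joblib.load('vectorizer.pkl')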
@@ -0,0 +1,88 @@
New notebook (markdown cell plus one code cell): train a logistic regression classifier on the Bag-of-Words features and persist the model, label encoder, and vectorizer.

53. Prediction

Using the logistic regression model trained in problem 52, implement a program that computes the category and its prediction probability for a given article headline.

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Load the data
train_df = pd.read_csv('train.txt', sep='\t', header=None, names=['Category', 'Title'])

# Initialize the vectorizer and fit it to the training data
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['Title'])

# Encode the category labels as integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train_df['Category'])

# Train the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train_encoded)

# Save the model, label encoder, and vectorizer
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

Output:

['vectorizer.pkl']
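The notebook stops at saving the artifacts; a natural follow-up is to score the model on the held-out validation split. This is a minimal sketch, not part of the commit, assuming valid.txt and the three .pkl files written above exist:

import pandas as pd
import joblib
from sklearn.metrics import accuracy_score

# Load the saved artifacts and the validation split
model = joblib.load('logistic_regression_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
vectorizer = joblib.load('vectorizer.pkl')
valid_df = pd.read_csv('valid.txt', sep='\t', header=None, names=['Category', 'Title'])

# Vectorize the validation titles with the training vocabulary and compute accuracy
X_valid = vectorizer.transform(valid_df['Title'])
y_valid = label_encoder.transform(valid_df['Category'])
print("Validation accuracy:", accuracy_score(y_valid, model.predict(X_valid)))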
@@ -0,0 +1,84 @@
New notebook (one code cell): load the saved model, label encoder, and vectorizer, then predict the category and class probabilities for a given headline.

import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Load the model, label encoder, and vectorizer
model = joblib.load('logistic_regression_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Define the prediction function
def predict_category(title):
    X = vectorizer.transform([title])
    probabilities = model.predict_proba(X)[0]
    predicted_index = model.predict(X)[0]
    predicted_category = label_encoder.inverse_transform([predicted_index])[0]
    return predicted_category, probabilities

# Try it on a sample headline
title = "The stock market is experiencing unprecedented growth."
predicted_category, probabilities = predict_category(title)

print(f"Title: {title}")
print(f"Predicted Category: {predicted_category}")
print(f"Probabilities: {probabilities}")

# Show the probability for each category
categories = label_encoder.classes_
for category, probability in zip(categories, probabilities):
    print(f"{category}: {probability:.4f}")

Output:

Title: The stock market is experiencing unprecedented growth.
Predicted Category: b
Probabilities: [0.94157397 0.02711228 0.00761151 0.02370224]
b: 0.9416
e: 0.0271
m: 0.0076
t: 0.0237
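With predict_category defined, classifying more headlines is a short loop; the sample titles below are made up purely for illustration.

# Hypothetical headlines, for illustration only
sample_titles = [
    "New smartphone chip promises faster on-device AI",
    "Pop star announces world tour after surprise album release",
    "Researchers report progress toward a universal flu vaccine",
]

for t in sample_titles:
    category, probs = predict_category(t)
    prob_by_class = dict(zip(label_encoder.classes_, probs))
    print(f"{t} -> {category} ({prob_by_class[category]:.2%})")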