Showing 16 changed files with 14,425 additions and 0 deletions.
@@ -0,0 +1,104 @@
New notebook (one code cell): load the News Aggregator corpus, filter it to five publishers, shuffle, split it 80/10/10 into train/validation/test sets, and write each split to a tab-separated file.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Read the data
data_path = '/Users/wenda/Desktop/news+aggregator/newsCorpora.csv'
columns = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df = pd.read_csv(data_path, sep='\t', names=columns)

# Keep only the specified news sources
publishers = ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"]
df = df[df['PUBLISHER'].isin(publishers)]

# Shuffle the data
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the dataset: 80% train, then split the remainder evenly into validation and test, stratified by category
train, temp = train_test_split(df, test_size=0.2, random_state=42, stratify=df['CATEGORY'])
valid, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['CATEGORY'])

# Save each split to a file (category and title only, tab-separated, no header)
def save_to_file(df, file_path):
    df[['CATEGORY', 'TITLE']].to_csv(file_path, sep='\t', index=False, header=False)

save_to_file(train, 'train.txt')
save_to_file(valid, 'valid.txt')
save_to_file(test, 'test.txt')

# Number of examples per category in each split
print("Training data category distribution:")
print(train['CATEGORY'].value_counts())
print("\nValidation data category distribution:")
print(valid['CATEGORY'].value_counts())
print("\nTest data category distribution:")
print(test['CATEGORY'].value_counts())

Output:

Training data category distribution:
b    4502
e    4223
t    1219
m     728
Name: CATEGORY, dtype: int64

Validation data category distribution:
b    562
e    528
t    153
m     91
Name: CATEGORY, dtype: int64

Test data category distribution:
b    563
e    528
t    152
m     91
Name: CATEGORY, dtype: int64
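A quick sanity check on the split sizes is to reload the three files just written and compare row counts. This is a minimal sketch, not part of the commit, assuming train.txt, valid.txt, and test.txt sit in the working directory.

import pandas as pd

# Reload the saved splits (category and title, tab-separated, no header)
splits = {name: pd.read_csv(f'{name}.txt', sep='\t', header=None, names=['Category', 'Title'])
          for name in ['train', 'valid', 'test']}

# Report each split's size and its share of the filtered corpus
total = sum(len(df) for df in splits.values())
for name, df in splits.items():
    print(f"{name}: {len(df)} rows ({len(df) / total:.1%})")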
@@ -0,0 +1,70 @@
New notebook (one code cell): convert the saved titles into Bag-of-Words features with CountVectorizer and write the feature matrices out as tab-separated files.

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Load the data
train_df = pd.read_csv('train.txt', sep='\t', header=None, names=['Category', 'Title'])
valid_df = pd.read_csv('valid.txt', sep='\t', header=None, names=['Category', 'Title'])
test_df = pd.read_csv('test.txt', sep='\t', header=None, names=['Category', 'Title'])

# Convert titles to Bag-of-Words features with CountVectorizer (fit on the training data only)
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['Title'])
X_valid = vectorizer.transform(valid_df['Title'])
X_test = vectorizer.transform(test_df['Title'])

# Convert the feature matrices to DataFrames
train_features = pd.DataFrame(X_train.toarray(), columns=vectorizer.get_feature_names_out())
valid_features = pd.DataFrame(X_valid.toarray(), columns=vectorizer.get_feature_names_out())
test_features = pd.DataFrame(X_test.toarray(), columns=vectorizer.get_feature_names_out())

# Prepend the category column to the feature data
train_features.insert(0, 'Category', train_df['Category'])
valid_features.insert(0, 'Category', valid_df['Category'])
test_features.insert(0, 'Category', test_df['Category'])

# Save the data
train_features.to_csv('train.feature.txt', sep='\t', index=False)
valid_features.to_csv('valid.feature.txt', sep='\t', index=False)
test_features.to_csv('test.feature.txt', sep='\t', index=False)
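One caveat with this cell: calling toarray() and writing the matrices as dense TSV makes the feature files large, since the vocabulary can easily run into thousands of columns. A sketch of an alternative, not part of this commit, is to keep the matrices sparse and persist them with scipy, reusing X_train, X_valid, X_test, and vectorizer from the cell above.

from scipy import sparse
import joblib

# Persist the sparse feature matrices and the fitted vectorizer instead of dense TSV files
sparse.save_npz('train.feature.npz', X_train)
sparse.save_npz('valid.feature.npz', X_valid)
sparse.save_npz('test.feature.npz', X_test)
joblib.dump(vectorizer, 'vectorizer.pkl')

# Later, reload without re-fitting the vocabulary
X_train_loaded = sparse.load_npz('train.feature.npz')
vectorizer_loaded = joblib.load('vectorizer.pkl')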
@@ -0,0 +1,88 @@
New notebook (markdown cell plus one code cell): train a logistic regression classifier on the Bag-of-Words features and persist the model, label encoder, and vectorizer.

53. Prediction

Using the logistic regression model trained in problem 52, implement a program that computes the category and its prediction probability for a given article headline.

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import joblib

# Load the data
train_df = pd.read_csv('train.txt', sep='\t', header=None, names=['Category', 'Title'])

# Initialize the vectorizer and fit it to the training data
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['Title'])

# Encode the category labels as integers
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(train_df['Category'])

# Train the logistic regression model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train_encoded)

# Save the model, label encoder, and vectorizer
joblib.dump(model, 'logistic_regression_model.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

Output:

['vectorizer.pkl']
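The notebook stops at saving the artifacts; a natural follow-up is to score the model on the held-out validation split. This is a minimal sketch, not part of the commit, assuming valid.txt and the three .pkl files written above exist:

import pandas as pd
import joblib
from sklearn.metrics import accuracy_score

# Load the saved artifacts and the validation split
model = joblib.load('logistic_regression_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
vectorizer = joblib.load('vectorizer.pkl')
valid_df = pd.read_csv('valid.txt', sep='\t', header=None, names=['Category', 'Title'])

# Vectorize the validation titles with the training vocabulary and compute accuracy
X_valid = vectorizer.transform(valid_df['Title'])
y_valid = label_encoder.transform(valid_df['Category'])
print("Validation accuracy:", accuracy_score(y_valid, model.predict(X_valid)))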
@@ -0,0 +1,84 @@
New notebook (one code cell): load the saved model, label encoder, and vectorizer, then predict the category and class probabilities for a given headline.

import joblib
from sklearn.feature_extraction.text import CountVectorizer

# Load the model, label encoder, and vectorizer
model = joblib.load('logistic_regression_model.pkl')
label_encoder = joblib.load('label_encoder.pkl')
vectorizer = joblib.load('vectorizer.pkl')

# Define the prediction function
def predict_category(title):
    X = vectorizer.transform([title])
    probabilities = model.predict_proba(X)[0]
    predicted_index = model.predict(X)[0]
    predicted_category = label_encoder.inverse_transform([predicted_index])[0]
    return predicted_category, probabilities

# Try it on a sample headline
title = "The stock market is experiencing unprecedented growth."
predicted_category, probabilities = predict_category(title)

print(f"Title: {title}")
print(f"Predicted Category: {predicted_category}")
print(f"Probabilities: {probabilities}")

# Show the probability for each category
categories = label_encoder.classes_
for category, probability in zip(categories, probabilities):
    print(f"{category}: {probability:.4f}")

Output:

Title: The stock market is experiencing unprecedented growth.
Predicted Category: b
Probabilities: [0.94157397 0.02711228 0.00761151 0.02370224]
b: 0.9416
e: 0.0271
m: 0.0076
t: 0.0237
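With predict_category defined, classifying more headlines is a short loop; the sample titles below are made up purely for illustration.

# Hypothetical headlines, for illustration only
sample_titles = [
    "New smartphone chip promises faster on-device AI",
    "Pop star announces world tour after surprise album release",
    "Researchers report progress toward a universal flu vaccine",
]

for t in sample_titles:
    category, probs = predict_category(t)
    prob_by_class = dict(zip(label_encoder.classes_, probs))
    print(f"{t} -> {category} ({prob_by_class[category]:.2%})")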