Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
0301Wenda authored Jun 8, 2024
1 parent 1ab5002 commit a6b1b0e
Showing 1 changed file with 104 additions and 0 deletions.
104 changes: 104 additions & 0 deletions wangche/chapter06/knock50.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "9c327100",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training data category distribution:\n",
"b 4502\n",
"e 4223\n",
"t 1219\n",
"m 728\n",
"Name: CATEGORY, dtype: int64\n",
"\n",
"Validation data category distribution:\n",
"b 562\n",
"e 528\n",
"t 153\n",
"m 91\n",
"Name: CATEGORY, dtype: int64\n",
"\n",
"Test data category distribution:\n",
"b 563\n",
"e 528\n",
"t 152\n",
"m 91\n",
"Name: CATEGORY, dtype: int64\n"
]
}
],
"source": [
"import csv\n",
"import os\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Location of the News Aggregator corpus. Set the NEWS_CORPORA_PATH environment\n",
"# variable to point at your own copy; the fallback is the path this notebook\n",
"# was originally run with.\n",
"data_path = os.environ.get(\n",
"    'NEWS_CORPORA_PATH',\n",
"    '/Users/wenda/Desktop/news+aggregator/newsCorpora.csv',\n",
")\n",
"columns = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']\n",
"\n",
"# Load the tab-separated corpus. The file is plain TSV, not quoted CSV, so quote\n",
"# handling is switched off: with pandas' default quoting a headline containing a\n",
"# literal double-quote character is read as the start of a quoted field and the\n",
"# following lines are silently merged into it, dropping rows.\n",
"df = pd.read_csv(data_path, sep='\\t', names=columns, quoting=csv.QUOTE_NONE)\n",
"\n",
"# Keep only articles from the selected publishers.\n",
"publishers = [\"Reuters\", \"Huffington Post\", \"Businessweek\", \"Contactmusic.com\", \"Daily Mail\"]\n",
"df = df[df['PUBLISHER'].isin(publishers)]\n",
"\n",
"# Shuffle the rows with a fixed seed so the result is reproducible.\n",
"df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
"\n",
"# Split 80% / 10% / 10% into train / validation / test, stratified by category.\n",
"train, temp = train_test_split(df, test_size=0.2, random_state=42, stratify=df['CATEGORY'])\n",
"valid, test = train_test_split(temp, test_size=0.5, random_state=42, stratify=temp['CATEGORY'])\n",
"\n",
"# Sanity check: every filtered article lands in exactly one split.\n",
"assert len(train) + len(valid) + len(test) == len(df)\n",
"\n",
"\n",
"def save_to_file(df, file_path):\n",
"    \"\"\"Write the CATEGORY and TITLE columns of df to file_path.\n",
"\n",
"    The output is tab-separated, one row per line, with no header row and\n",
"    no index column. to_csv keeps its default quoting here, so a title that\n",
"    contains a double quote is written as a quoted field.\n",
"    \"\"\"\n",
"    df[['CATEGORY', 'TITLE']].to_csv(file_path, sep='\\t', index=False, header=False)\n",
"\n",
"\n",
"save_to_file(train, 'train.txt')\n",
"save_to_file(valid, 'valid.txt')\n",
"save_to_file(test, 'test.txt')\n",
"\n",
"# Number of examples per category in each split.\n",
"print(\"Training data category distribution:\")\n",
"print(train['CATEGORY'].value_counts())\n",
"print(\"\\nValidation data category distribution:\")\n",
"print(valid['CATEGORY'].value_counts())\n",
"print(\"\\nTest data category distribution:\")\n",
"print(test['CATEGORY'].value_counts())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "81afc9f1",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit a6b1b0e

Please sign in to comment.