Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
0301Wenda authored Jun 23, 2024
1 parent a0211fd commit 98c6f4f
Showing 1 changed file with 97 additions and 0 deletions.
97 changes: 97 additions & 0 deletions wangche/knock70.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"id": "17b26993",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/cf/glx3v3t12q7fcsb3x4qwt0r80000gn/T/ipykernel_67153/1001280089.py:37: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:278.)\n",
" X_torch = torch.tensor(df[\"TITLE\"].apply(Text2Vec))\n"
]
}
],
"source": [
"import re\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"import torch\n",
"from gensim.models import KeyedVectors\n",
"\n",
"def EncoderNN(sign):\n",
"    \"\"\"Map a one-letter category code to its integer class index.\n",
"\n",
"    Args:\n",
"        sign: One of \"b\", \"t\", \"e\" or \"m\".\n",
"\n",
"    Returns:\n",
"        0 for \"b\", 1 for \"t\", 2 for \"e\", 3 for \"m\".\n",
"\n",
"    Raises:\n",
"        ValueError: If ``sign`` is not one of the four known codes.\n",
"    \"\"\"\n",
"    codes = {\"b\": 0, \"t\": 1, \"e\": 2, \"m\": 3}\n",
"    if sign not in codes:\n",
"        # An unknown code used to print \"Error\" and then crash with\n",
"        # UnboundLocalError on the return; fail with a descriptive error instead.\n",
"        raise ValueError(\"Unknown category code: {!r}\".format(sign))\n",
"    return codes[sign]\n",
"\n",
"def Text2Vec(text):\n",
"    \"\"\"Average the word vectors of the space-separated tokens in ``text``.\n",
"\n",
"    Each token is looked up in the module-level ``model``; tokens whose lookup\n",
"    raises ``KeyError`` are skipped.\n",
"\n",
"    Args:\n",
"        text: String to embed; it is split on single spaces.\n",
"\n",
"    Returns:\n",
"        The element-wise mean of the vectors that were found, or a zero vector\n",
"        of length ``model.vector_size`` when no token was found.\n",
"    \"\"\"\n",
"    vectors = []\n",
"    for word in text.split(\" \"):\n",
"        try:\n",
"            vectors.append(model.get_vector(word))\n",
"        except KeyError:\n",
"            # Token has no vector. Only KeyError is skipped, so unrelated\n",
"            # failures and KeyboardInterrupt are no longer silently swallowed.\n",
"            continue\n",
"    if not vectors:\n",
"        # This case used to divide 0 by 0 and raise ZeroDivisionError.\n",
"        # float32 is assumed to match the loaded vectors - TODO confirm.\n",
"        return np.zeros(model.vector_size, dtype=np.float32)\n",
"    # Built-in sum starts from 0 and adds left to right, as the old loop did.\n",
"    return sum(vectors) / len(vectors)\n",
"\n",
"def TorchData(data):\n",
"    \"\"\"Vectorize one dataset split and save its features and labels as tensors.\n",
"\n",
"    Reads ``<data>.txt`` with ``pd.read_table``, removes the characters matched\n",
"    by the punctuation regex from the ``TITLE`` column, converts each title with\n",
"    ``Text2Vec`` and each ``CATEGORY`` value with ``EncoderNN``, then writes\n",
"    ``X_<data>.pt`` and ``Y_<data>.pt`` with ``torch.save``.\n",
"\n",
"    Args:\n",
"        data: Split name used as the file stem, e.g. \"train\".\n",
"    \"\"\"\n",
"    df = pd.read_table(\"{}.txt\".format(data))\n",
"    sign_regrex = re.compile('[!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`|$#@£â€™]')\n",
"    df[\"TITLE\"] = df[\"TITLE\"].map(lambda x: sign_regrex.sub(\"\", x))\n",
"    # Collect the per-title vectors into a single ndarray before handing them to\n",
"    # torch: building a tensor directly from a collection of ndarrays is what\n",
"    # triggered PyTorch's \"extremely slow\" UserWarning.\n",
"    X_torch = torch.tensor(np.array(df[\"TITLE\"].apply(Text2Vec).to_list()))\n",
"    torch.save(X_torch, \"X_{}.pt\".format(data))\n",
"    df[\"CATEGORY\"] = df[\"CATEGORY\"].map(EncoderNN)\n",
"    Y_torch = torch.tensor(df[\"CATEGORY\"].to_numpy())\n",
"    torch.save(Y_torch, \"Y_{}.pt\".format(data))\n",
"\n",
"# Load the word2vec vectors once; Text2Vec reads them through the global `model`.\n",
"model = KeyedVectors.load_word2vec_format(\"GoogleNews-vectors-negative300.bin.gz\", binary=True)\n",
"# Convert every split in the same order as before: train, test, valid.\n",
"for split_name in (\"train\", \"test\", \"valid\"):\n",
"    TorchData(split_name)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0095348d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 98c6f4f

Please sign in to comment.