Add files via upload

tmu-nlp · Jun 23, 2024 · 98c6f4f · 98c6f4f
1 parent a0211fd
commit 98c6f4f
Showing 1 changed file with 97 additions and 0 deletions.
diff --git a/wangche/knock70.ipynb b/wangche/knock70.ipynb
@@ -0,0 +1,97 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "17b26993",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/cf/glx3v3t12q7fcsb3x4qwt0r80000gn/T/ipykernel_67153/1001280089.py:37: UserWarning: Creating a tensor from a list of numpy.ndarrays is extremely slow. Please consider converting the list to a single numpy.ndarray with numpy.array() before converting to a tensor. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_new.cpp:278.)\n",
+      "  X_torch = torch.tensor(df[\"TITLE\"].apply(Text2Vec))\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "\n",
+    "from gensim.models import KeyedVectors\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import torch\n",
+    "\n",
+    "def EncoderNN(sign):\n",
+    "    if sign == \"b\":\n",
+    "        code = 0\n",
+    "    elif sign == \"t\":\n",
+    "        code = 1\n",
+    "    elif sign == \"e\":\n",
+    "        code = 2\n",
+    "    elif sign == \"m\":\n",
+    "        code = 3\n",
+    "    else:\n",
+    "        print(\"Error\")\n",
+    "    return code\n",
+    "\n",
+    "def Text2Vec(text):\n",
+    "    lines = text.split(\" \")\n",
+    "    vec_sum = 0\n",
+    "    length = 0\n",
+    "    for line in lines:\n",
+    "        try:\n",
+    "            temp = model.get_vector(line)\n",
+    "            vec_sum += temp\n",
+    "            length += 1\n",
+    "        except:\n",
+    "            pass\n",
+    "    return vec_sum/length\n",
+    "\n",
+    "def TorchData(data):\n",
+    "    \"\"\"Build and save the feature and label tensors for one dataset split.\n",
+    "\n",
+    "    Reads ``<data>.txt`` with ``pd.read_table``, strips the listed symbols\n",
+    "    from every ``TITLE``, embeds each title with ``Text2Vec`` and encodes\n",
+    "    each ``CATEGORY`` with ``EncoderNN``. Features are written to\n",
+    "    ``X_<data>.pt`` and labels to ``Y_<data>.pt`` in the working directory.\n",
+    "\n",
+    "    Args:\n",
+    "        data: Split name, used as the stem of the input and output files.\n",
+    "    \"\"\"\n",
+    "    df = pd.read_table(\"{}.txt\".format(data))\n",
+    "    sign_regrex = re.compile('[!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`|＄＃＠£â€™]')\n",
+    "    titles = df[\"TITLE\"].map(lambda title: sign_regrex.sub(\"\", title))\n",
+    "    # Combine the per-title vectors into one ndarray before handing them to\n",
+    "    # torch: building a tensor from a sequence of separate ndarrays is what\n",
+    "    # triggered the \"extremely slow\" UserWarning.\n",
+    "    features = np.array([Text2Vec(title) for title in titles])\n",
+    "    X_torch = torch.from_numpy(features)\n",
+    "    torch.save(X_torch, \"X_{}.pt\".format(data))\n",
+    "    labels = df[\"CATEGORY\"].map(EncoderNN).to_numpy()\n",
+    "    Y_torch = torch.tensor(labels)\n",
+    "    torch.save(Y_torch, \"Y_{}.pt\".format(data))\n",
+    "\n",
+    "# Load the pretrained word2vec embeddings once; Text2Vec looks up this global.\n",
+    "model = KeyedVectors.load_word2vec_format(\n",
+    "    \"GoogleNews-vectors-negative300.bin.gz\",\n",
+    "    binary=True,\n",
+    ")\n",
+    "\n",
+    "# Export tensors for every split, in the same order as before.\n",
+    "for split_name in (\"train\", \"test\", \"valid\"):\n",
+    "    TorchData(split_name)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0095348d",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}