Merge pull request #4 from PyThaiNLP/tokenization-1000-samples
1000 samples tokenization
bact authored Aug 22, 2019
2 parents a2b5c41 + 9e00e9b commit cffd227
Showing 4 changed files with 2,048 additions and 10 deletions.
10 changes: 8 additions & 2 deletions word-tokenization/README.md
@@ -1,9 +1,15 @@
# WiseSight Samples with Word Tokenization Label

This directory contains WiseSight samples tokenized by humans. These samples are randomly drawn from the corpus, with 40 samples for each label.
This directory contains WiseSight samples tokenized by humans. These samples are randomly drawn from the corpus.

For wisesight-160, we drew 40 samples for each label; for wisesight-1000, we drew 250 samples for each label.

**Remark:** We removed a couple of samples from wisesight-1000 because they looked like spam.

Although we have two sets of data, we recommend using **wisesight-1000** because it contains more samples.
Hence, its evaluation is more representative and reliable.

Because these samples are representative of real-world content, we believe having these annotated samples will allow the community to robustly evaluate tokenization algorithms.
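
To illustrate (a minimal sketch, not code from this repository), a tokenizer can be scored against the pipe-separated gold tokenization in the `.label` files with a boundary-level F1; the helper names `boundary_set` and `boundary_f1` are hypothetical:

```python
def boundary_set(tokens):
    """Character offsets at which each token ends."""
    offsets, pos = set(), 0
    for token in tokens:
        pos += len(token)
        offsets.add(pos)
    return offsets

def boundary_f1(gold_line, predicted_tokens):
    """F1 over token-end offsets; gold_line uses '|' as the token separator."""
    gold = boundary_set(gold_line.strip().split("|"))
    pred = boundary_set(predicted_tokens)
    true_positives = len(gold & pred)
    if true_positives == 0:
        return 0.0
    precision = true_positives / len(pred)
    recall = true_positives / len(gold)
    return 2 * precision * recall / (precision + recall)
```

Averaging this score over all lines gives a corpus-level figure; any tokenizer that returns a list of strings can be plugged in as `predicted_tokens`.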

## Acknowledgement

62 changes: 54 additions & 8 deletions word-tokenization/data-preparation-and-post-processing.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -132,30 +132,76 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Postprocessing"
"# Postprocessing\n",
"\n",
"Google Spreadsheet: https://docs.google.com/spreadsheets/d/1F_qT33T2iy0tKbflnVC8Ma-EoWEHimV3NmNRgLjN00o/edit#gid=1302375309"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"filepath = \"https://docs.google.com/spreadsheets/d/e/2PACX-1vRm-f8qstNhxICHzEfhbCacJNQSAZptP-6ockKwsxyck5vtl7e1-A2726Qj2hgp4Oht7WfcbdivQNPT/pub?gid=1302375309&single=true&output=csv\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we have 160 samples\n"
"we have 1000 samples\n"
]
}
],
"source": [
"df = pd.read_csv(\"./wisesight-tokenised.csv\")\n",
"df = pd.read_csv(filepath)\n",
"print(\"we have %d samples\" % len(df))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"should_removed = ~df.label.apply(lambda x: len(x.split(\"-\")) > 1)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"df_filtered = df[should_removed]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"we have 993 after samples\n"
]
}
],
"source": [
"print(\"we have %d after samples\" % len(df_filtered))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -164,12 +210,12 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"with open(filename, \"w\") as ft, open(filename.replace(\".txt\", \".label\"), \"w\") as fl:\n",
" for l in df.tokenised.values:\n",
" for l in df_filtered.tokenised.values:\n",
" l = l.strip()\n",
" ft.write(\"%s\\n\" % l.replace(\"|\", \"\"))\n",
" fl.write(\"%s\\n\" % l)"