[datasets] Add IIIT HWS dataset (#1199)

mindee · May 17, 2023 · be4e8a2 · be4e8a2
1 parent cc0ceee
commit be4e8a2
Show file tree

Hide file tree

Showing 8 changed files with 133 additions and 2 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -64,6 +64,7 @@ Supported datasets
 * IC13 from `ICDAR 2013 <http://dagdata.cvc.uab.es/icdar2013competition/>`_.
 * IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" <https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset>`_.
 * MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" <https://www.robots.ox.ac.uk/~vgg/data/text/>`_.
+* IIITHWS from `"Generating Synthetic Data for Text Recognition" <https://github.com/kris314/hwnet>`_.
 
 
 .. toctree::

diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst
@@ -30,6 +30,8 @@ doctr.datasets
 
 .. autoclass:: MJSynth
 
+.. autoclass:: IIITHWS
+
 .. autoclass:: DocArtefacts
 
 Synthetic dataset generator

diff --git a/docs/source/using_doctr/using_datasets.rst b/docs/source/using_doctr/using_datasets.rst
@@ -58,7 +58,7 @@ Recognition
 This datasets contains the information to train or validate a text recognition model.
 
 +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
-|        **Dataset**          |        **Train Samples**        |        **Test Samples**         |       **Information**                       |
+|        **Dataset**          |        **Train Samples**        |        **Test Samples**         |               **Information**               |
 +=============================+=================================+=================================+=============================================+
 | FUNSD                       | 21888                           | 8707                            | english                                     |
 +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
@@ -80,7 +80,9 @@ This datasets contains the information to train or validate a text recognition m
 +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
 | IMGUR5K                     | 207901                          | 22672                           | english / handwritten / external resources  |
 +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
-| MJSynth                     | 7581382                         | 1337891                         | english                                     |
+| MJSynth                     | 7581382                         | 1337891                         | english / external resources                |
++-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
+| IIITHWS                     | 7141797                         | 793533                          | english / handwritten / external resources  |
 +-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
 
 .. code:: python3

diff --git a/doctr/datasets/__init__.py b/doctr/datasets/__init__.py
@@ -8,6 +8,7 @@
 from .ic03 import *
 from .ic13 import *
 from .iiit5k import *
+from .iiithws import *
 from .imgur5k import *
 from .mjsynth import *
 from .ocr import *

diff --git a/doctr/datasets/iiithws.py b/doctr/datasets/iiithws.py
@@ -0,0 +1,74 @@
+# Copyright (C) 2021-2023, Mindee.
+
+# This program is licensed under the Apache License 2.0.
+# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.
+
+import os
+from random import sample
+from typing import Any, List, Tuple
+
+from tqdm import tqdm
+
+from .datasets import AbstractDataset
+
+__all__ = ["IIITHWS"]
+
+
+class IIITHWS(AbstractDataset):
+    """IIITHWS dataset from `"Generating Synthetic Data for Text Recognition"
+    <https://arxiv.org/pdf/1608.04224.pdf>`_ | `"repository" <https://github.com/kris314/hwnet>`_ |
+    `"website" <https://cvit.iiit.ac.in/research/projects/cvit-projects/matchdocimgs>`_.
+
+    >>> # NOTE: This is a pure recognition dataset without bounding box labels.
+    >>> # NOTE: You need to download the dataset.
+    >>> from doctr.datasets import IIITHWS
+    >>> train_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
+    >>>                     label_path="/path/to/IIIT-HWS-90K.txt",
+    >>>                     train=True)
+    >>> img, target = train_set[0]
+    >>> test_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
+    >>>                    label_path="/path/to/IIIT-HWS-90K.txt",
+    >>>                    train=False)
+    >>> img, target = test_set[0]
+
+    Args:
+        img_folder: folder with all the images of the dataset
+        label_path: path to the file with the labels
+        train: whether the subset should be the training one
+        **kwargs: keyword arguments from `AbstractDataset`.
+    """
+
+    def __init__(
+        self,
+        img_folder: str,
+        label_path: str,
+        train: bool = True,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(img_folder, **kwargs)
+
+        # File existence check
+        if not os.path.exists(label_path) or not os.path.exists(img_folder):
+            raise FileNotFoundError(f"unable to locate {label_path if not os.path.exists(label_path) else img_folder}")
+
+        self.data: List[Tuple[str, str]] = []
+        self.train = train
+
+        with open(label_path) as f:
+            annotations = f.readlines()
+
+        # Shuffle the dataset (unseeded), otherwise the test set will contain the same labels n times
+        annotations = sample(annotations, len(annotations))
+        train_samples = int(len(annotations) * 0.9)
+        set_slice = slice(train_samples) if self.train else slice(train_samples, None)
+
+        for annotation in tqdm(
+            iterable=annotations[set_slice], desc="Unpacking IIITHWS", total=len(annotations[set_slice])
+        ):
+            img_path, label = annotation.split()[0:2]
+            img_path = os.path.join(img_folder, img_path)
+
+            self.data.append((img_path, label))
+
+    def extra_repr(self) -> str:
+        return f"train={self.train}"
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -627,3 +627,30 @@ def mock_mjsynth_dataset(tmpdir_factory, mock_image_stream):
         with open(fn, "wb") as f:
             f.write(file.getbuffer())
     return str(root), str(label_file)
+
+
+@pytest.fixture(scope="session")
+def mock_iiithws_dataset(tmpdir_factory, mock_image_stream):
+    root = tmpdir_factory.mktemp("datasets")
+    iiithws_root = root.mkdir("iiit-hws")
+    image_folder = iiithws_root.mkdir("Images_90K_Normalized")
+    image_sub_folder = image_folder.mkdir("1")
+    label_file = iiithws_root.join("IIIT-HWS-90K.txt")
+    labels = [
+        "./iiit-hws/Images_90K_Normalized/1/499_5_3_0_0.png I 1 0\n",
+        "./iiit-hws/Images_90K_Normalized/1/117_1_3_0_0.png am 1 0\n",
+        "./iiit-hws/Images_90K_Normalized/1/80_7_3_0_0.png a 1 0\n",
+        "./iiit-hws/Images_90K_Normalized/1/585_3_2_0_0.png Jedi 1 0\n",
+        "./iiit-hws/Images_90K_Normalized/1/222_5_3_0_0.png ! 1 0\n",
+    ]
+
+    with open(label_file, "w") as f:
+        for label in labels:
+            f.write(label)
+
+    file = BytesIO(mock_image_stream)
+    for label in labels:
+        fn = image_sub_folder.join(label.split()[0].split("/")[-1])
+        with open(fn, "wb") as f:
+            f.write(file.getbuffer())
+    return str(root), str(label_file)
diff --git a/tests/pytorch/test_datasets_pt.py b/tests/pytorch/test_datasets_pt.py
@@ -574,3 +574,15 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
     assert len(ds) == 4  # Actual set has 7581382 train and 1337891 test samples
     assert repr(ds) == f"MJSynth(train={True})"
     _validate_dataset_recognition_part(ds, input_size)
+
+
+def test_iiithws_dataset(mock_iiithws_dataset):
+    input_size = (32, 128)
+    ds = datasets.IIITHWS(
+        *mock_iiithws_dataset,
+        img_transforms=Resize(input_size, preserve_aspect_ratio=True),
+    )
+
+    assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
+    assert repr(ds) == f"IIITHWS(train={True})"
+    _validate_dataset_recognition_part(ds, input_size)
diff --git a/tests/tensorflow/test_datasets_tf.py b/tests/tensorflow/test_datasets_tf.py
@@ -548,3 +548,15 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
     assert len(ds) == 4  # Actual set has 7581382 train and 1337891 test samples
     assert repr(ds) == f"MJSynth(train={True})"
     _validate_dataset_recognition_part(ds, input_size)
+
+
+def test_iiithws_dataset(mock_iiithws_dataset):
+    input_size = (32, 128)
+    ds = datasets.IIITHWS(
+        *mock_iiithws_dataset,
+        img_transforms=Resize(input_size, preserve_aspect_ratio=True),
+    )
+
+    assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
+    assert repr(ds) == f"IIITHWS(train={True})"
+    _validate_dataset_recognition_part(ds, input_size)