Skip to content

Commit

Permalink
[datasets] Add IIIT HWS dataset (#1199)
Browse files Browse the repository at this point in the history
  • Loading branch information
felixT2K authored May 17, 2023
1 parent cc0ceee commit be4e8a2
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 2 deletions.
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ Supported datasets
* IC13 from `ICDAR 2013 <http://dagdata.cvc.uab.es/icdar2013competition/>`_.
* IMGUR5K from `"TextStyleBrush: Transfer of Text Aesthetics from a Single Example" <https://github.com/facebookresearch/IMGUR5K-Handwriting-Dataset>`_.
* MJSynth from `"Synthetic Data and Artificial Neural Networks for Natural Scene Text Recognition" <https://www.robots.ox.ac.uk/~vgg/data/text/>`_.
* IIITHWS from `"Generating Synthetic Data for Text Recognition" <https://github.com/kris314/hwnet>`_.


.. toctree::
Expand Down
2 changes: 2 additions & 0 deletions docs/source/modules/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ doctr.datasets

.. autoclass:: MJSynth

.. autoclass:: IIITHWS

.. autoclass:: DocArtefacts

Synthetic dataset generator
Expand Down
6 changes: 4 additions & 2 deletions docs/source/using_doctr/using_datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ Recognition
These datasets contain the information to train or validate a text recognition model.

+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| **Dataset** | **Train Samples** | **Test Samples** | **Information** |
| **Dataset** | **Train Samples** | **Test Samples** | **Information** |
+=============================+=================================+=================================+=============================================+
| FUNSD | 21888 | 8707 | english |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
Expand All @@ -80,7 +80,9 @@ This datasets contains the information to train or validate a text recognition m
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| IMGUR5K | 207901 | 22672 | english / handwritten / external resources |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| MJSynth | 7581382 | 1337891 | english |
| MJSynth | 7581382 | 1337891 | english / external resources |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+
| IIITHWS | 7141797 | 793533 | english / handwritten / external resources |
+-----------------------------+---------------------------------+---------------------------------+---------------------------------------------+

.. code:: python3
Expand Down
1 change: 1 addition & 0 deletions doctr/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .ic03 import *
from .ic13 import *
from .iiit5k import *
from .iiithws import *
from .imgur5k import *
from .mjsynth import *
from .ocr import *
Expand Down
74 changes: 74 additions & 0 deletions doctr/datasets/iiithws.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright (C) 2021-2023, Mindee.

# This program is licensed under the Apache License 2.0.
# See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details.

import os
from random import sample
from typing import Any, List, Tuple

from tqdm import tqdm

from .datasets import AbstractDataset

__all__ = ["IIITHWS"]


class IIITHWS(AbstractDataset):
    """IIITHWS dataset from `"Generating Synthetic Data for Text Recognition"
    <https://arxiv.org/pdf/1608.04224.pdf>`_ | `"repository" <https://github.com/kris314/hwnet>`_ |
    `"website" <https://cvit.iiit.ac.in/research/projects/cvit-projects/matchdocimgs>`_.

    The label file is read, shuffled, and split 90% / 10%: the first 90% of the
    shuffled lines form the training subset, the remainder the test subset.

    >>> # NOTE: This is a pure recognition dataset without bounding box labels.
    >>> # NOTE: You need to download the dataset.
    >>> from doctr.datasets import IIITHWS
    >>> train_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
    >>>                     label_path="/path/to/IIIT-HWS-90K.txt",
    >>>                     train=True)
    >>> img, target = train_set[0]
    >>> test_set = IIITHWS(img_folder="/path/to/iiit-hws/Images_90K_Normalized",
    >>>                    label_path="/path/to/IIIT-HWS-90K.txt",
    >>>                    train=False)
    >>> img, target = test_set[0]

    Args:
        img_folder: folder with all the images of the dataset
        label_path: path to the file with the labels
        train: whether the subset should be the training one
        **kwargs: keyword arguments from `AbstractDataset`.

    Raises:
        FileNotFoundError: if `label_path` or `img_folder` does not exist.
        ValueError: if a non-blank annotation line has fewer than two
            whitespace-separated fields.
    """

    def __init__(
        self,
        img_folder: str,
        label_path: str,
        train: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(img_folder, **kwargs)

        # File existence check (the label file is reported first when both are missing)
        for path in (label_path, img_folder):
            if not os.path.exists(path):
                raise FileNotFoundError(f"unable to locate {path}")

        self.data: List[Tuple[str, str]] = []
        self.train = train

        with open(label_path) as f:
            # Drop blank lines up front: they carry no sample and would otherwise
            # make the two-field unpacking below fail.
            annotations = [line for line in f if line.strip()]

        # Shuffle the dataset otherwise the test set will contain the same labels n times
        # NOTE: the shuffle draws from the global `random` state, so a train and a test
        # instance built separately are only guaranteed to be disjoint when the RNG is
        # seeded identically before each construction.
        annotations = sample(annotations, len(annotations))
        train_samples = int(len(annotations) * 0.9)
        set_slice = slice(train_samples) if self.train else slice(train_samples, None)
        subset = annotations[set_slice]

        for annotation in tqdm(iterable=subset, desc="Unpacking IIITHWS", total=len(subset)):
            # Each line starts with "<relative image path> <label>"; trailing fields are ignored
            img_path, label = annotation.split()[0:2]
            img_path = os.path.join(img_folder, img_path)

            self.data.append((img_path, label))

    def extra_repr(self) -> str:
        return f"train={self.train}"
27 changes: 27 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,3 +627,30 @@ def mock_mjsynth_dataset(tmpdir_factory, mock_image_stream):
with open(fn, "wb") as f:
f.write(file.getbuffer())
return str(root), str(label_file)


@pytest.fixture(scope="session")
def mock_iiithws_dataset(tmpdir_factory, mock_image_stream):
    """Create a minimal on-disk IIIT-HWS layout for the dataset tests.

    Five annotation lines are written to ``iiit-hws/IIIT-HWS-90K.txt`` and one
    copy of the mock image is stored under ``iiit-hws/Images_90K_Normalized/1``
    for each of them.

    Returns:
        a tuple ``(root folder, label file)``, both as strings
    """
    root = tmpdir_factory.mktemp("datasets")
    iiithws_root = root.mkdir("iiit-hws")
    image_sub_folder = iiithws_root.mkdir("Images_90K_Normalized").mkdir("1")
    label_file = iiithws_root.join("IIIT-HWS-90K.txt")

    # Annotation format: "<image path relative to root> <label> <extra> <extra>"
    labels = [
        "./iiit-hws/Images_90K_Normalized/1/499_5_3_0_0.png I 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/117_1_3_0_0.png am 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/80_7_3_0_0.png a 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/585_3_2_0_0.png Jedi 1 0\n",
        "./iiit-hws/Images_90K_Normalized/1/222_5_3_0_0.png ! 1 0\n",
    ]

    with open(label_file, "w") as label_handle:
        label_handle.writelines(labels)

    # Every mock sample shares the same image payload
    stream = BytesIO(mock_image_stream)
    for entry in labels:
        relative_path = entry.split()[0]
        file_name = relative_path.rsplit("/", 1)[-1]
        with open(image_sub_folder.join(file_name), "wb") as image_handle:
            image_handle.write(stream.getbuffer())

    return str(root), str(label_file)
12 changes: 12 additions & 0 deletions tests/pytorch/test_datasets_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -574,3 +574,15 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
assert len(ds) == 4 # Actual set has 7581382 train and 1337891 test samples
assert repr(ds) == f"MJSynth(train={True})"
_validate_dataset_recognition_part(ds, input_size)


def test_iiithws_dataset(mock_iiithws_dataset):
    """Check size, repr and sample validity of the default (train) IIITHWS subset."""
    img_folder, label_path = mock_iiithws_dataset
    input_size = (32, 128)
    resize = Resize(input_size, preserve_aspect_ratio=True)

    ds = datasets.IIITHWS(img_folder, label_path, img_transforms=resize)

    assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
    assert repr(ds) == f"IIITHWS(train={True})"
    _validate_dataset_recognition_part(ds, input_size)
12 changes: 12 additions & 0 deletions tests/tensorflow/test_datasets_tf.py
Original file line number Diff line number Diff line change
Expand Up @@ -548,3 +548,15 @@ def test_mjsynth_dataset(mock_mjsynth_dataset):
assert len(ds) == 4 # Actual set has 7581382 train and 1337891 test samples
assert repr(ds) == f"MJSynth(train={True})"
_validate_dataset_recognition_part(ds, input_size)


def test_iiithws_dataset(mock_iiithws_dataset):
    """Check size, repr and sample validity of the default (train) IIITHWS subset."""
    img_folder, label_path = mock_iiithws_dataset
    input_size = (32, 128)
    resize = Resize(input_size, preserve_aspect_ratio=True)

    ds = datasets.IIITHWS(img_folder, label_path, img_transforms=resize)

    assert len(ds) == 4  # Actual set has 7141797 train and 793533 test samples
    assert repr(ds) == f"IIITHWS(train={True})"
    _validate_dataset_recognition_part(ds, input_size)

0 comments on commit be4e8a2

Please sign in to comment.