From 69e267f2109b4883b00b4a8fd66a5ec3f4bfff64 Mon Sep 17 00:00:00 2001 From: charlesmindee Date: Tue, 21 Sep 2021 18:43:49 +0200 Subject: [PATCH 1/2] feat: add pytorch ckpts for crnn & mobilenet_v3_large --- .../models/detection/differentiable_binarization/pytorch.py | 6 +++--- doctr/models/recognition/crnn/pytorch.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doctr/models/detection/differentiable_binarization/pytorch.py b/doctr/models/detection/differentiable_binarization/pytorch.py index 605eda5991..f52ace152b 100644 --- a/doctr/models/detection/differentiable_binarization/pytorch.py +++ b/doctr/models/detection/differentiable_binarization/pytorch.py @@ -42,9 +42,9 @@ 'backbone_submodule': 'features', 'fpn_layers': ['3', '6', '12', '16'], 'input_shape': (3, 1024, 1024), - 'mean': (.5, .5, .5), - 'std': (1., 1., 1.), - 'url': None, + 'mean': (0.798, 0.785, 0.772), + 'std': (0.264, 0.2749, 0.287), + 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/db_mobilenet_v3_large-fd62154b.pt', }, } diff --git a/doctr/models/recognition/crnn/pytorch.py b/doctr/models/recognition/crnn/pytorch.py index f3b2570b18..18318ef360 100644 --- a/doctr/models/recognition/crnn/pytorch.py +++ b/doctr/models/recognition/crnn/pytorch.py @@ -20,12 +20,12 @@ default_cfgs: Dict[str, Dict[str, Any]] = { 'crnn_vgg16_bn': { - 'mean': (.5, .5, .5), - 'std': (1., 1., 1.), + 'mean': (0.694, 0.695, 0.693), + 'std': (0.299, 0.296, 0.301), 'backbone': vgg16_bn, 'rnn_units': 128, 'lstm_features': 512, 'input_shape': (3, 32, 128), 'vocab': VOCABS['legacy_french'], - 'url': None, + 'url': 'https://github.com/mindee/doctr/releases/download/v0.3.1/crnn_vgg16_bn-9762b0b0.pt', }, 'crnn_mobilenet_v3_small': { 'mean': (.5, .5, .5), From 96e2442f3259ea05f3181386047c25265a02d404 Mon Sep 17 00:00:00 2001 From: charlesmindee Date: Mon, 4 Oct 2021 10:25:50 +0200 Subject: [PATCH 2/2] feat: add crnn_mobilenet tf bench --- docs/source/using_models.rst | 42 ++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/docs/source/using_models.rst b/docs/source/using_models.rst index 17b2be0d4d..6ea485ec05 100644 --- a/docs/source/using_models.rst +++ b/docs/source/using_models.rst @@ -100,8 +100,8 @@ For a comprehensive comparison, we have compiled a detailed benchmark on publicl * - crnn_mobilenet_v3_small - (32, 128, 3) - 2.1M - - - - + - 86.21 + - 90.56 - * - crnn_mobilenet_v3_large - (32, 128, 3) @@ -171,6 +171,8 @@ For a comprehensive comparison, we have compiled a detailed benchmark on publicl +----------------------------------------+------------+---------------+---------+------------+---------------+---------+ | db_resnet50 + sar_resnet31 | 71.25 | 76.29 | 0.27 | 84.50 | **81.96** | 0.83 | +----------------------------------------+------------+---------------+---------+------------+---------------+---------+ +| db_resnet50 + crnn_mobilenet_v3_small | 69.85 | 74.80 | | 80.85 | 78.42 | 0.83 | ++----------------------------------------+------------+---------------+---------+------------+---------------+---------+ | db_mobilenet_v3_large + crnn_vgg16_bn | 67.73 | 71.73 | | 71.65 | 59.03 | | +----------------------------------------+------------+---------------+---------+------------+---------------+---------+ | Gvision text detection | 59.50 | 62.50 | | 75.30 | 70.00 | | @@ -190,23 +192,25 @@ FPS (Frames per second) is computed after a warmup phase of 100 tensors (where t Since you may be looking for specific use cases, we also performed this benchmark on private datasets with various document types below. Unfortunately, we are not able to share those at the moment since they contain sensitive information. -+----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ -| | Receipts | Invoices | IDs | US Tax Forms | -+==============================================+============+===============+============+===============+============+===============+============+===============+ -| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ -| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | -+----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ ++----------------------------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+----------------------------+ +| | Receipts | Invoices | IDs | US Tax Forms | Resumes | Road Fines | ++==============================================+============+===============+============+===============+============+===============+============+===============+============+===============+============+===============+ +| **Architecture** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | **Recall** | **Precision** | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_vgg16_bn (ours) | 78.70 | 81.12 | 65.80 | 70.70 | 50.25 | 51.78 | 79.08 | 92.83 | | | | | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + master (ours) | **79.00** | **81.42** | 65.57 | 69.86 | 51.34 | 52.90 | 78.86 | 92.57 | | | | | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + sar_resnet31 (ours) | 78.94 | 81.37 | 65.89 | **70.79** | **51.78** | **53.35** | 79.04 | 92.78 | | | | | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| db_resnet50 + crnn_mobilenet_v3_small (ours) | 76.81 | 79.15 | 64.89 | 69.61 | 45.03 | 46.38 | 78.96 | 92.11 | 85.91 | 87.20 | 84.85 | 85.86 | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| db_mobilenet_v3_large + crnn_vgg16_bn (ours) | 78.36 | 74.93 | 63.04 | 68.41 | 39.36 | 41.75 | 72.14 | 89.97 | | | | | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| Gvision doc. text detection | 68.91 | 59.89 | 63.20 | 52.85 | 43.70 | 29.21 | 69.79 | 65.68 | | | | | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ +| AWS textract | 75.77 | 77.70 | **70.47** | 69.13 | 46.39 | 43.32 | **84.31** | **98.11** | | | | | ++----------------------------------------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+------------+---------------+ Two-stage approaches