From 837d29b9c35aa0e89640e40407b2aab6174040f0 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 16 Jan 2022 13:51:53 +0100 Subject: [PATCH] Handle image and line regions in output formats ALTO, hOCR and text Signed-off-by: Stefan Weil --- src/api/altorenderer.cpp | 32 ++++++++++++++++++++++++++++++- src/api/baseapi.cpp | 16 ++++++++++++++++ src/api/hocrrenderer.cpp | 41 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 84 insertions(+), 5 deletions(-) diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 089189ef5b..83fe251b1a 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -13,6 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "errcode.h" // for ASSERT_HOST #ifdef _WIN32 # include "host.h" // windows.h for MultiByteToWideChar, ... #endif @@ -174,6 +175,36 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { continue; } + int left, top, right, bottom; + auto block_type = res_it->BlockType(); + + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + // TODO: optionally add TYPE, for example TYPE="photo". + alto_str << "\t\t\t\t\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + alto_str << "\t\t\t\t\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { alto_str << "\t\t\t\tIsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - int left, top, right, bottom; res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); do { diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 540e224bbb..ebc8c2941a 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -1357,6 +1357,22 @@ char *TessBaseAPI::GetUTF8Text() { if (it->Empty(RIL_PARA)) { continue; } + auto block_type = it->BlockType(); + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Ignore images and lines for text output. + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + const std::unique_ptr para_text(it->GetUTF8Text(RIL_PARA)); text += para_text.get(); } while (it->Next(RIL_PARA)); diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index a3b042a317..33928adbd3 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -189,6 +189,36 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { + int left, top, right, bottom; + auto block_type = res_it->BlockType(); + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + hocr_str << "
\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + hocr_str << "
\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; @@ -218,7 +248,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { hocr_str << "\n BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - font_name = + const char *font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); hocr_str << " title='bbox " << left << " " << top << " " << right << " "