diff --git a/src/api/altorenderer.cpp b/src/api/altorenderer.cpp index 089189ef5b..c2905277fd 100644 --- a/src/api/altorenderer.cpp +++ b/src/api/altorenderer.cpp @@ -13,9 +13,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "errcode.h" // for ASSERT_HOST #ifdef _WIN32 -# include "host.h" // windows.h for MultiByteToWideChar, ... +# include "host.h" // windows.h for MultiByteToWideChar, ... #endif +#include "tprintf.h" // for tprintf #include #include @@ -174,6 +176,36 @@ char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { continue; } + int left, top, right, bottom; + auto block_type = res_it->BlockType(); + + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + // TODO: optionally add TYPE, for example TYPE="photo". + alto_str << "\t\t\t\t\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + alto_str << "\t\t\t\t\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { alto_str << "\t\t\t\tIsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_cblock = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - int left, top, right, bottom; res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); do { diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 114c678750..f78894ce74 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -1371,6 +1371,22 @@ char *TessBaseAPI::GetUTF8Text() { if (it->Empty(RIL_PARA)) { continue; } + auto block_type = it->BlockType(); + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Ignore images and lines for text output. + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + const std::unique_ptr para_text(it->GetUTF8Text(RIL_PARA)); text += para_text.get(); } while (it->Next(RIL_PARA)); diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index a3b042a317..33928adbd3 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -189,6 +189,36 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { std::unique_ptr res_it(GetIterator()); while (!res_it->Empty(RIL_BLOCK)) { + int left, top, right, bottom; + auto block_type = res_it->BlockType(); + switch (block_type) { + case PT_FLOWING_IMAGE: + case PT_HEADING_IMAGE: + case PT_PULLOUT_IMAGE: { + // Handle all kinds of images. + res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + hocr_str << "
\n"; + res_it->Next(RIL_BLOCK); + continue; + } + case PT_HORZ_LINE: + case PT_VERT_LINE: + // Handle horizontal and vertical lines. + res_it.get()->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom); + hocr_str << "
\n"; + res_it->Next(RIL_BLOCK); + continue; + case PT_NOISE: + tprintf("TODO: Please report image which triggers the noise case.\n"); + ASSERT_HOST(false); + default: + break; + } + if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); continue; @@ -218,7 +248,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) { } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { hocr_str << "\n BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - font_name = + const char *font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace, &serif, &smallcaps, &pointsize, &font_id); hocr_str << " title='bbox " << left << " " << top << " " << right << " "