Skip to content

Commit

Permalink
Added option to include images in hOCR output
Browse files Browse the repository at this point in the history
Signed-off-by: Merlijn Wajer <merlijn@wizzup.org>
  • Loading branch information
Aram Verstegen authored and MerlijnWajer committed Jan 5, 2022
1 parent 890cf3e commit dafe4ff
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 0 deletions.
19 changes: 19 additions & 0 deletions src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,10 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
const char *paragraph_lang = nullptr;
bool font_info = false;
bool hocr_boxes = false;
bool hocr_images = false;
GetBoolVariable("hocr_font_info", &font_info);
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
GetBoolVariable("hocr_images", &hocr_images);

if (input_file_.empty()) {
SetInputName(nullptr);
Expand Down Expand Up @@ -189,6 +191,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {

std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
bool skipword = false;
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
Expand Down Expand Up @@ -228,12 +231,27 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
case PT_CAPTION_TEXT:
hocr_str << "ocr_caption";
break;
case PT_FLOWING_IMAGE:
case PT_HEADING_IMAGE:
case PT_PULLOUT_IMAGE:
{
if (hocr_images) {
hocr_str << "ocr_photo";
skipword = true;
break;
}
}
// Fall through if hocr_images is false, because image blocks used to be
// emitted as ocr_line in the past.
default:
hocr_str << "ocr_line";
}
hocr_str << "' id='"
<< "line_" << page_id << "_" << lcnt << "'";
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
if (skipword) {
goto word_end;
}
}

// Now, process the word...
Expand Down Expand Up @@ -445,6 +463,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
bcnt++;
}
}
word_end:
hocr_str << " </div>\n";

const std::string &text = hocr_str.str();
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ Tesseract::Tesseract()
, BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
, BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
this->params())
, BOOL_MEMBER(hocr_images, false, "Add images to hocr output", this->params())
, BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
, BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
, double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(unlv_tilde_crunching);
BOOL_VAR_H(hocr_font_info);
BOOL_VAR_H(hocr_char_boxes);
BOOL_VAR_H(hocr_images);
BOOL_VAR_H(crunch_early_merge_tess_fails);
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs);
double_VAR_H(crunch_terrible_rating);
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/hocr
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
tessedit_create_hocr 1
hocr_font_info 0
hocr_images 1

0 comments on commit dafe4ff

Please sign in to comment.