Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: Add option to include images in hOCR output #3710

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,10 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
const char *paragraph_lang = nullptr;
bool font_info = false;
bool hocr_boxes = false;
bool hocr_images = false;
GetBoolVariable("hocr_font_info", &font_info);
GetBoolVariable("hocr_char_boxes", &hocr_boxes);
GetBoolVariable("hocr_images", &hocr_images);

if (input_file_.empty()) {
SetInputName(nullptr);
Expand Down Expand Up @@ -189,6 +191,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {

std::unique_ptr<ResultIterator> res_it(GetIterator());
while (!res_it->Empty(RIL_BLOCK)) {
bool skipword = false;
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
Expand Down Expand Up @@ -228,12 +231,27 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
case PT_CAPTION_TEXT:
hocr_str << "ocr_caption";
break;
case PT_FLOWING_IMAGE:
case PT_HEADING_IMAGE:
case PT_PULLOUT_IMAGE:
{
if (hocr_images) {
hocr_str << "ocr_photo";
skipword = true;
break;
}
}
// Fall through if hocr_images is false, because image blocks were emitted
// as ocr_line in the past.
default:
hocr_str << "ocr_line";
}
hocr_str << "' id='"
<< "line_" << page_id << "_" << lcnt << "'";
AddBoxTohOCR(res_it.get(), RIL_TEXTLINE, hocr_str);
if (skipword) {
goto word_end;
}
}

// Now, process the word...
Expand Down Expand Up @@ -445,6 +463,7 @@ char *TessBaseAPI::GetHOCRText(ETEXT_DESC *monitor, int page_number) {
bcnt++;
}
}
word_end:
hocr_str << " </div>\n";

const std::string &text = hocr_str.str();
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ Tesseract::Tesseract()
, BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", this->params())
, BOOL_MEMBER(hocr_char_boxes, false, "Add coordinates for each character to hocr output",
this->params())
, BOOL_MEMBER(hocr_images, false, "Add images to hocr output", this->params())
, BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params())
, BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?", this->params())
, double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", this->params())
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(unlv_tilde_crunching);
BOOL_VAR_H(hocr_font_info);
BOOL_VAR_H(hocr_char_boxes);
BOOL_VAR_H(hocr_images);
BOOL_VAR_H(crunch_early_merge_tess_fails);
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs);
double_VAR_H(crunch_terrible_rating);
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/hocr
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
tessedit_create_hocr 1
hocr_font_info 0
hocr_images 1