Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Allow all oem modes with tesseract v4 #1267

Merged
merged 4 commits into from
May 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/CHANGES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
- Fix: General code clean up / reformatting
- Fix: Fix multiple definitions with new -fno-common default in GCC 10
- Fix: Mac now builds reproducibly again without errors on the date command (#1230)
- Fix: Allow all oem modes with tesseract v4 (#1264)
- Doc: Updated ccextractor.cnf.sample.

0.88 (2019-05-21)
Expand Down
2 changes: 1 addition & 1 deletion src/lib_ccx/ccx_common_option.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ void init_options(struct ccx_s_options *options)
options->hardsubx = 0; // By default, don't try to extract hard subtitles
options->dvblang = NULL; // By default, autodetect DVB language
options->ocrlang = NULL; // By default, autodetect .traineddata file
options->ocr_oem = 0; // By default, set Tesseract OEM mode OEM_TESSERACT_ONLY (0)
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
options->ocr_quantmode = 1; // CCExtractor's internal
options->mkvlang = NULL; // By default, all the languages are extracted
options->ignore_pts_jumps = 1;
Expand Down
7 changes: 5 additions & 2 deletions src/lib_ccx/hardsubx.c
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,16 @@ struct lib_hardsubx_ctx *_init_hardsubx(struct ccx_s_options *options)
if (!strncmp("4.", TessVersion(), 2))
{
char tess_path[1024];
if (ccx_options.ocr_oem < 0)
ccx_options.ocr_oem = 1;
snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata");
//ccx_options.ocr_oem are deprecated and only supported mode is OEM_LSTM_ONLY
ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, 1, NULL, 0, &pars_vec,
ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
&pars_values, 1, false);
}
else
{
if (ccx_options.ocr_oem < 0)
ccx_options.ocr_oem = 0;
ret = TessBaseAPIInit4(ctx->tess_handle, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
&pars_values, 1, false);
}
Expand Down
7 changes: 5 additions & 2 deletions src/lib_ccx/ocr.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,12 +178,15 @@ void *init_ocr(int lang_index)
{
char tess_path[1024];
snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata");
//ccx_options.ocr_oem are deprecated and only supported mode is OEM_LSTM_ONLY
ret = TessBaseAPIInit4(ctx->api, tess_path, lang, 1, NULL, 0, &pars_vec,
if (ccx_options.ocr_oem < 0)
ccx_options.ocr_oem = 1;
ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
&pars_values, 1, false);
}
else
{
if (ccx_options.ocr_oem < 0)
ccx_options.ocr_oem = 0;
ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec,
&pars_values, 1, false);
}
Expand Down
8 changes: 6 additions & 2 deletions src/lib_ccx/params.c
Original file line number Diff line number Diff line change
Expand Up @@ -681,10 +681,14 @@ void print_usage(void)
mprint(" 0: Don't quantize at all.\n");
mprint(" 1: Use CCExtractor's internal function (default).\n");
mprint(" 2: Reduce distinct color count in image for faster results.\n");
mprint(" -oem: Select the OEM mode for Tesseract, could be 0, 1 or 2.\n");
mprint(" 0: OEM_TESSERACT_ONLY - default value, the fastest mode.\n");
mprint(" -oem: Select the OEM mode for Tesseract.\n");
mprint(" Available modes :\n");
mprint(" 0: OEM_TESSERACT_ONLY - the fastest mode.\n");
mprint(" 1: OEM_LSTM_ONLY - use LSTM algorithm for recognition.\n");
mprint(" 2: OEM_TESSERACT_LSTM_COMBINED - both algorithms.\n");
mprint(" Default value depends on the tesseract version linked :\n");
mprint(" Tesseract v3 : default mode is 0,\n");
mprint(" Tesseract v4 : default mode is 1.\n");
mprint(" -mkvlang: For MKV subtitles, select which language's caption\n");
mprint(" stream will be processed. e.g. 'eng' for English.\n");
mprint(" Language codes can be either the 3 letters bibliographic\n");
Expand Down