From ced1f2888961b26d3cb0c57ff80943ffd5f1fa67 Mon Sep 17 00:00:00 2001 From: Gilles Hamel Date: Fri, 8 May 2020 20:57:21 +0200 Subject: [PATCH 1/3] Allow all oem modes with tesseract v4 --- src/lib_ccx/ccx_common_option.c | 2 +- src/lib_ccx/hardsubx.c | 5 +++-- src/lib_ccx/ocr.c | 5 +++-- src/lib_ccx/params.c | 8 ++++++-- 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/src/lib_ccx/ccx_common_option.c b/src/lib_ccx/ccx_common_option.c index 96e7322ab..1fc501dab 100644 --- a/src/lib_ccx/ccx_common_option.c +++ b/src/lib_ccx/ccx_common_option.c @@ -69,7 +69,7 @@ void init_options(struct ccx_s_options *options) options->hardsubx = 0; // By default, don't try to extract hard subtitles options->dvblang = NULL; // By default, autodetect DVB language options->ocrlang = NULL; // By default, autodetect .traineddata file - options->ocr_oem = 0; // By default, set Tesseract OEM mode OEM_TESSERACT_ONLY (0) + options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version options->ocr_quantmode = 1; // CCExtractor's internal options->mkvlang = NULL; // By default, all the languages are extracted options->ignore_pts_jumps = 1; diff --git a/src/lib_ccx/hardsubx.c b/src/lib_ccx/hardsubx.c index 6fa51fe07..2d2721540 100644 --- a/src/lib_ccx/hardsubx.c +++ b/src/lib_ccx/hardsubx.c @@ -248,13 +248,14 @@ struct lib_hardsubx_ctx *_init_hardsubx(struct ccx_s_options *options) if (!strncmp("4.", TessVersion(), 2)) { char tess_path[1024]; + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 1; snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); - //ccx_options.ocr_oem are deprecated and only supported mode is OEM_LSTM_ONLY - ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, 1, NULL, 0, &pars_vec, + ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 0; ret = TessBaseAPIInit4(ctx->tess_handle, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c index f951f4a93..f75d0d0e4 100644 --- a/src/lib_ccx/ocr.c +++ b/src/lib_ccx/ocr.c @@ -178,12 +178,13 @@ void *init_ocr(int lang_index) { char tess_path[1024]; snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); - //ccx_options.ocr_oem are deprecated and only supported mode is OEM_LSTM_ONLY - ret = TessBaseAPIInit4(ctx->api, tess_path, lang, 1, NULL, 0, &pars_vec, + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 1; + ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 0; ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } diff --git a/src/lib_ccx/params.c b/src/lib_ccx/params.c index bb3952212..bc80c0488 100644 --- a/src/lib_ccx/params.c +++ b/src/lib_ccx/params.c @@ -681,10 +681,14 @@ void print_usage(void) mprint(" 0: Don't quantize at all.\n"); mprint(" 1: Use CCExtractor's internal function (default).\n"); mprint(" 2: Reduce distinct color count in image for faster results.\n"); - mprint(" -oem: Select the OEM mode for Tesseract, could be 0, 1 or 2.\n"); - mprint(" 0: OEM_TESSERACT_ONLY - default value, the fastest mode.\n"); + mprint(" -oem: Select the OEM mode for Tesseract.\n"); + mprint(" Available modes :\n"); + mprint(" 0: OEM_TESSERACT_ONLY - the fastest mode.\n"); mprint(" 1: OEM_LSTM_ONLY - use LSTM algorithm for recognition.\n"); mprint(" 2: OEM_TESSERACT_LSTM_COMBINED - both algorithms.\n"); + mprint(" Default value depends on the tesseract version linked :\n"); + mprint(" Tesseract v3 : default mode is 0,\n"); + mprint(" Tesseract v4 : default mode is 1.\n"); mprint(" -mkvlang: For MKV subtitles, select which language's caption\n"); mprint(" stream will be processed. e.g. 'eng' for English.\n"); mprint(" Language codes can be either the 3 letters bibliographic\n"); From 212c1bc9c0f58a04eeaf3d0ba8bcd2e6301547c5 Mon Sep 17 00:00:00 2001 From: Gilles Hamel Date: Fri, 8 May 2020 20:57:21 +0200 Subject: [PATCH 2/3] Allow all oem modes with tesseract v4 --- docs/CHANGES.TXT | 1 + src/lib_ccx/ccx_common_option.c | 2 +- src/lib_ccx/hardsubx.c | 5 +++-- src/lib_ccx/ocr.c | 5 +++-- src/lib_ccx/params.c | 8 ++++++-- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/docs/CHANGES.TXT b/docs/CHANGES.TXT index d2129deb6..33f54ff92 100644 --- a/docs/CHANGES.TXT +++ b/docs/CHANGES.TXT @@ -19,6 +19,7 @@ - Fix: General code clean up / reformatting - Fix: Fix multiple definitions with new -fno-common default in GCC 10 - Fix: Mac now builds reproducibly again without errors on the date command (#1230) +- Fix: Allow all oem modes with tesseract v4 (#1264) - Doc: Updated ccextractor.cnf.sample. 0.88 (2019-05-21) diff --git a/src/lib_ccx/ccx_common_option.c b/src/lib_ccx/ccx_common_option.c index 96e7322ab..1fc501dab 100644 --- a/src/lib_ccx/ccx_common_option.c +++ b/src/lib_ccx/ccx_common_option.c @@ -69,7 +69,7 @@ void init_options(struct ccx_s_options *options) options->hardsubx = 0; // By default, don't try to extract hard subtitles options->dvblang = NULL; // By default, autodetect DVB language options->ocrlang = NULL; // By default, autodetect .traineddata file - options->ocr_oem = 0; // By default, set Tesseract OEM mode OEM_TESSERACT_ONLY (0) + options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version options->ocr_quantmode = 1; // CCExtractor's internal options->mkvlang = NULL; // By default, all the languages are extracted options->ignore_pts_jumps = 1; diff --git a/src/lib_ccx/hardsubx.c b/src/lib_ccx/hardsubx.c index 6fa51fe07..2d2721540 100644 --- a/src/lib_ccx/hardsubx.c +++ b/src/lib_ccx/hardsubx.c @@ -248,13 +248,14 @@ struct lib_hardsubx_ctx *_init_hardsubx(struct ccx_s_options *options) if (!strncmp("4.", TessVersion(), 2)) { char tess_path[1024]; + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 1; snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); - //ccx_options.ocr_oem are deprecated and only supported mode is OEM_LSTM_ONLY - ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, 1, NULL, 0, &pars_vec, + ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 0; ret = TessBaseAPIInit4(ctx->tess_handle, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c index f951f4a93..f75d0d0e4 100644 --- a/src/lib_ccx/ocr.c +++ b/src/lib_ccx/ocr.c @@ -178,12 +178,13 @@ void *init_ocr(int lang_index) { char tess_path[1024]; snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); - //ccx_options.ocr_oem are deprecated and only supported mode is OEM_LSTM_ONLY - ret = TessBaseAPIInit4(ctx->api, tess_path, lang, 1, NULL, 0, &pars_vec, + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 1; + ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { + if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 0; ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } diff --git a/src/lib_ccx/params.c b/src/lib_ccx/params.c index bb3952212..bc80c0488 100644 --- a/src/lib_ccx/params.c +++ b/src/lib_ccx/params.c @@ -681,10 +681,14 @@ void print_usage(void) mprint(" 0: Don't quantize at all.\n"); mprint(" 1: Use CCExtractor's internal function (default).\n"); mprint(" 2: Reduce distinct color count in image for faster results.\n"); - mprint(" -oem: Select the OEM mode for Tesseract, could be 0, 1 or 2.\n"); - mprint(" 0: OEM_TESSERACT_ONLY - default value, the fastest mode.\n"); + mprint(" -oem: Select the OEM mode for Tesseract.\n"); + mprint(" Available modes :\n"); + mprint(" 0: OEM_TESSERACT_ONLY - the fastest mode.\n"); mprint(" 1: OEM_LSTM_ONLY - use LSTM algorithm for recognition.\n"); mprint(" 2: OEM_TESSERACT_LSTM_COMBINED - both algorithms.\n"); + mprint(" Default value depends on the tesseract version linked :\n"); + mprint(" Tesseract v3 : default mode is 0,\n"); + mprint(" Tesseract v4 : default mode is 1.\n"); mprint(" -mkvlang: For MKV subtitles, select which language's caption\n"); mprint(" stream will be processed. e.g. 'eng' for English.\n"); mprint(" Language codes can be either the 3 letters bibliographic\n"); From a3e84b46f63b91d99d12a727c1de053f468741fc Mon Sep 17 00:00:00 2001 From: Gilles Hamel Date: Fri, 8 May 2020 21:24:59 +0200 Subject: [PATCH 3/3] Fix formatting --- src/lib_ccx/hardsubx.c | 6 ++++-- src/lib_ccx/ocr.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/lib_ccx/hardsubx.c b/src/lib_ccx/hardsubx.c index 2d2721540..f58994bf6 100644 --- a/src/lib_ccx/hardsubx.c +++ b/src/lib_ccx/hardsubx.c @@ -248,14 +248,16 @@ struct lib_hardsubx_ctx *_init_hardsubx(struct ccx_s_options *options) if (!strncmp("4.", TessVersion(), 2)) { char tess_path[1024]; - if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 1; + if (ccx_options.ocr_oem < 0) + ccx_options.ocr_oem = 1; snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); ret = TessBaseAPIInit4(ctx->tess_handle, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { - if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 0; + if (ccx_options.ocr_oem < 0) + ccx_options.ocr_oem = 0; ret = TessBaseAPIInit4(ctx->tess_handle, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } diff --git a/src/lib_ccx/ocr.c b/src/lib_ccx/ocr.c index f75d0d0e4..78d020cfc 100644 --- a/src/lib_ccx/ocr.c +++ b/src/lib_ccx/ocr.c @@ -178,13 +178,15 @@ void *init_ocr(int lang_index) { char tess_path[1024]; snprintf(tess_path, 1024, "%s%s%s", tessdata_path, "/", "tessdata"); - if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 1; + if (ccx_options.ocr_oem < 0) + ccx_options.ocr_oem = 1; ret = TessBaseAPIInit4(ctx->api, tess_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); } else { - if (ccx_options.ocr_oem < 0) ccx_options.ocr_oem = 0; + if (ccx_options.ocr_oem < 0) + ccx_options.ocr_oem = 0; ret = TessBaseAPIInit4(ctx->api, tessdata_path, lang, ccx_options.ocr_oem, NULL, 0, &pars_vec, &pars_values, 1, false); }