Skip to content

Commit

Permalink
Add flag for Page Segmentation Modes control (#1601)
Browse files Browse the repository at this point in the history
* Add flag for Page Segmentation Modes control

I added a `--psm` flag for controlling the PSM (Page Segmentation Mode) in Tesseract. The default mode (3) gives me quite bad results. When I use 6, 11, or 12 for Bulgarian, I get much better OCR results. I haven't tested other languages yet, but I expect improvements there as well if a different mode is used.

* feat: add psm for rust parser

* fix: add psm to options

* fix: add default value of psm to 3

* fix: correct type of ocr oem

* fix(rust): use fatal! instead of exit

---------

Co-authored-by: Prateek Sunal <prtksunal@gmail.com>
  • Loading branch information
Neo2SHYAlien and prateekmedia authored Sep 3, 2024
1 parent 1a13bbb commit 349020e
Show file tree
Hide file tree
Showing 9 changed files with 78 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/CHANGES.TXT
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
- Fix: infinite loop in MP4 file type detector.
- Improvement: Use Corrosion to build Rust code
- Improvement: Ignore MXF Caption Essence Container version byte to enhance SRT subtitle extraction compatibility
- New: Add tesseract page segmentation modes control with `--psm` flag

0.94 (2021-12-14)
-----------------
Expand Down
1 change: 1 addition & 0 deletions src/lib_ccx/ccx_common_option.c
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ void init_options(struct ccx_s_options *options)
options->dvblang = NULL; // By default, autodetect DVB language
options->ocrlang = NULL; // By default, autodetect .traineddata file
options->ocr_oem = -1; // By default, OEM mode depends on the tesseract version
options->psm = 3; // Default PSM mode (3 is the default tesseract as well)
options->ocr_quantmode = 1; // CCExtractor's internal
options->mkvlang = NULL; // By default, all the languages are extracted
options->ignore_pts_jumps = 1;
Expand Down
1 change: 1 addition & 0 deletions src/lib_ccx/ccx_common_option.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ struct ccx_s_options // Options from user parameters
char *dvblang; // The name of the language stream for DVB
const char *ocrlang; // The name of the .traineddata file to be loaded with tesseract
int ocr_oem; // The Tesseract OEM mode, could be 0 (default), 1 or 2
int psm; // The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
int ocr_quantmode; // How to quantize the bitmap before passing to to tesseract (0=no quantization at all, 1=CCExtractor's internal)
char *mkvlang; // The name of the language stream for MKV
int analyze_video_stream; // If 1, the video stream will be processed even if we're using a different one for subtitles.
Expand Down
3 changes: 3 additions & 0 deletions src/lib_ccx/ocr.c
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,9 @@ void *init_ocr(int lang_index)
&pars_values, 1, false);
}

// set PSM mode
TessBaseAPISetPageSegMode(ctx->api, ccx_options.psm);

free(pars_vec);
free(pars_values);

Expand Down
38 changes: 38 additions & 0 deletions src/lib_ccx/params.c
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,23 @@ void print_usage(void)
mprint(" Default value depends on the tesseract version linked :\n");
mprint(" Tesseract v3 : default mode is 0,\n");
mprint(" Tesseract v4 : default mode is 1.\n");
mprint(" --psm: Select the PSM mode for Tesseract.\n");
mprint(" Available Page segmentation modes:\n");
mprint(" 0 Orientation and script detection (OSD) only.\n");
mprint(" 1 Automatic page segmentation with OSD.\n");
mprint(" 2 Automatic page segmentation, but no OSD, or OCR.\n");
mprint(" 3 Fully automatic page segmentation, but no OSD. (Default)\n");
mprint(" 4 Assume a single column of text of variable sizes.\n");
mprint(" 5 Assume a single uniform block of vertically aligned text.\n");
mprint(" 6 Assume a single uniform block of text.\n");
mprint(" 7 Treat the image as a single text line.\n");
mprint(" 8 Treat the image as a single word.\n");
mprint(" 9 Treat the image as a single word in a circle.\n");
mprint(" 10 Treat the image as a single character.\n");
mprint(" 11 Sparse text. Find as much text as possible in no particular order.\n");
mprint(" 12 Sparse text with OSD.\n");
mprint(" 13 Raw line. Treat the image as a single text line,\n");
mprint(" bypassing hacks that are Tesseract-specific.\n");
mprint(" --mkvlang: For MKV subtitles, select which language's caption\n");
mprint(" stream will be processed. e.g. 'eng' for English.\n");
mprint(" Language codes can be either the 3 letters bibliographic\n");
Expand Down Expand Up @@ -1696,6 +1713,27 @@ int parse_parameters(struct ccx_s_options *opt, int argc, char *argv[])
fatal(EXIT_MALFORMED_PARAMETER, "--oem has no argument.\n");
}
}
// --psm <mode>: select the Tesseract page segmentation mode (0..13).
if (strcmp(argv[i], "--psm") == 0)
{
	if (i < argc - 1)
	{
		i++;

		// Parse the argument in place. No temporary copy is needed, which
		// avoids sizing a heap buffer from sizeof(char *) and leaking it.
		char *psm_end = NULL;
		long psm_value = strtol(argv[i], &psm_end, 10);

		// Reject empty or non-numeric input and trailing garbage (atoi would
		// silently turn e.g. "abc" into mode 0). The range check runs on the
		// long value, so out-of-range input that strtol clamps to
		// LONG_MIN/LONG_MAX is rejected before narrowing to int.
		if (psm_end == argv[i] || *psm_end != '\0' || psm_value < 0 || psm_value > 13)
		{
			fatal(EXIT_MALFORMED_PARAMETER, "--psm must be between 0 and 13\n");
		}
		opt->psm = (int)psm_value;

		continue;
	}
	else
	{
		fatal(EXIT_MALFORMED_PARAMETER, "--psm has no argument.\n");
	}
}
if (strcmp(argv[i], "--mkvlang") == 0)
{
if (i < argc - 1)
Expand Down
2 changes: 2 additions & 0 deletions src/lib_ccx/params_dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,8 @@ void params_dump(struct lib_ccx_ctx *ctx)
mprint("Reduced color palette]\n");
break;
}

mprint("[Tesseract PSM: %d]\n", ccx_options.psm);
}

#define Y_N(cond) ((cond) ? "Yes" : "No")
Expand Down
3 changes: 3 additions & 0 deletions src/rust/lib_ccxr/src/common/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,8 @@ pub struct Options {
pub ocrlang: PathBuf,
/// The Tesseract OEM mode, could be 0 (default), 1 or 2
pub ocr_oem: i8,
/// The Tesseract PSM mode, could be between 0 and 13. 3 is tesseract default
pub psm: i32,
/// How to quantize the bitmap before passing to to tesseract
/// (0 = no quantization at all, 1 = CCExtractor's internal,
/// 2 = reduce distinct color count in image for faster results.)
Expand Down Expand Up @@ -589,6 +591,7 @@ impl Default for Options {
dvblang: Default::default(),
ocrlang: Default::default(),
ocr_oem: -1,
psm: 3,
ocr_quantmode: 1,
mkvlang: Default::default(),
analyze_video_stream: Default::default(),
Expand Down
19 changes: 19 additions & 0 deletions src/rust/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -600,6 +600,25 @@ pub struct Args {
/// Tesseract v4 : default mode is 1.
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub oem: Option<u8>,
/// Select the PSM mode for Tesseract.
/// Available Page segmentation modes:
/// 0 Orientation and script detection (OSD) only.
/// 1 Automatic page segmentation with OSD.
/// 2 Automatic page segmentation, but no OSD, or OCR.
/// 3 Fully automatic page segmentation, but no OSD. (Default)
/// 4 Assume a single column of text of variable sizes.
/// 5 Assume a single uniform block of vertically aligned text.
/// 6 Assume a single uniform block of text.
/// 7 Treat the image as a single text line.
/// 8 Treat the image as a single word.
/// 9 Treat the image as a single word in a circle.
/// 10 Treat the image as a single character.
/// 11 Sparse text. Find as much text as possible in no particular order.
/// 12 Sparse text with OSD.
/// 13 Raw line. Treat the image as a single text line,
/// bypassing hacks that are Tesseract-specific.
#[arg(long, verbatim_doc_comment, value_name="mode", help_heading=OUTPUT_AFFECTING_OUTPUT_FILES)]
pub psm: Option<u8>,
/// For MKV subtitles, select which language's caption
/// stream will be processed. e.g. 'eng' for English.
/// Language codes can be either the 3 letters bibliographic
Expand Down
10 changes: 10 additions & 0 deletions src/rust/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -801,6 +801,16 @@ impl OptionsExt for Options {
self.ocr_oem = *oem as _;
}

if let Some(ref psm) = args.psm {
if !(0..=13).contains(psm) {
fatal!(
cause = ExitCause::MalformedParameter;
"--psm must be between 0 and 13"
);
}
self.psm = *psm as _;
}

if let Some(ref lang) = args.mkvlang {
self.mkvlang = Some(Language::from_str(lang.as_str()).unwrap());
let str = lang.as_str();
Expand Down

0 comments on commit 349020e

Please sign in to comment.