From ce976106850aded17558af14116d5d89c54e37a4 Mon Sep 17 00:00:00 2001 From: Maxim Poliakovski Date: Wed, 20 Apr 2022 22:08:54 +0200 Subject: [PATCH] Switch to Tesseract 4.1.1 and Javacpp-presets 1.5.6. --- build.gradle | 18 +++++++++--------- .../omr/text/tesseract/TesseractOCR.java | 16 +++++++++++----- .../omr/text/tesseract/TesseractOrder.java | 11 +++++++++-- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/build.gradle b/build.gradle index 9615fb41d..b225cf70c 100644 --- a/build.gradle +++ b/build.gradle @@ -10,9 +10,9 @@ ext.programVersion = "$project.version" ext.companyName = "$programName Ltd." ext.companyId = "${programName}Ltd" -ext.jcppVersion = '1.4.4' -ext.leptVersion = '1.77.0' -ext.tessVersion = '4.0.0' +ext.jcppVersion = '1.5.6' +ext.leptVersion = '1.81.1' +ext.tessVersion = '4.1.1' // this code is required in order to adapt values of os.name and os.arch to the // conventions used by Javacpp's dependencies @@ -114,8 +114,8 @@ dependencies { [group: 'gov.nist.math', name: 'jama', version: '1.0.3'], [group: 'org.reflections', name: 'reflections', version: '0.10.2'], [group: 'org.bytedeco', name: 'javacpp', version: jcppVersion], - [group: 'org.bytedeco.javacpp-presets', name: 'leptonica', version: "${leptVersion}-${jcppVersion}"], - [group: 'org.bytedeco.javacpp-presets', name: 'tesseract', version: "${tessVersion}-${jcppVersion}"], + [group: 'org.bytedeco', name: 'leptonica', version: "${leptVersion}-${jcppVersion}"], + [group: 'org.bytedeco', name: 'tesseract', version: "${tessVersion}-${jcppVersion}"], [group: 'com.github.jai-imageio', name: 'jai-imageio-core', version: '1.4.0'], [group: 'org.apache.directory.studio', name: 'org.apache.commons.io', version: '2.4'], [group: 'javax.xml.bind', name: 'jaxb-api', version: '2.3.1'], @@ -124,8 +124,8 @@ dependencies { ) runtimeOnly( - [group: 'org.bytedeco.javacpp-presets', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"], - [group: 
'org.bytedeco.javacpp-presets', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"] + [group: 'org.bytedeco', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"], + [group: 'org.bytedeco', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"] ) testImplementation( @@ -137,8 +137,8 @@ dependencies { // Specific configurations for specific OS dependencies ['windows-x86', 'windows-x86_64'].each { os -> configurations.create("runtime-$os") - dependencies.add("runtime-$os", [group: 'org.bytedeco.javacpp-presets', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "$os"]) - dependencies.add("runtime-$os", [group: 'org.bytedeco.javacpp-presets', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "$os"]) + dependencies.add("runtime-$os", [group: 'org.bytedeco', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "$os"]) + dependencies.add("runtime-$os", [group: 'org.bytedeco', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "$os"]) } jar { diff --git a/src/main/org/audiveris/omr/text/tesseract/TesseractOCR.java b/src/main/org/audiveris/omr/text/tesseract/TesseractOCR.java index 83d8ff482..c70b94c2d 100644 --- a/src/main/org/audiveris/omr/text/tesseract/TesseractOCR.java +++ b/src/main/org/audiveris/omr/text/tesseract/TesseractOCR.java @@ -28,9 +28,10 @@ import org.audiveris.omr.text.TextLine; import org.audiveris.omr.text.TextWord; -import org.bytedeco.javacpp.tesseract; -import org.bytedeco.javacpp.tesseract.StringGenericVector; -import org.bytedeco.javacpp.tesseract.TessBaseAPI; +import org.bytedeco.tesseract.global.tesseract; +import org.bytedeco.tesseract.StringGenericVector; +import org.bytedeco.tesseract.TessBaseAPI; +import static org.bytedeco.tesseract.global.tesseract.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -102,6 
+103,11 @@ public Set getLanguages () final Path ocrFolder = getOcrFolder(); final TreeSet set = new TreeSet<>(); + // we need to call this to avoid a crash + // in native code on non-English systems + // https://github.com/bytedeco/javacpp-presets/issues/694 + setlocale(LC_ALL(), "C"); + try { final TessBaseAPI api = new TessBaseAPI(); @@ -305,9 +311,9 @@ private int getMode (LayoutMode layoutMode) private Path scanOcrLocations (String[] locations) { for (String loc : locations) { - final Path path = Paths.get(loc); + final Path path = Paths.get(loc).resolve("tessdata"); - if (Files.exists(path.resolve("tessdata"))) { + if (Files.exists(path)) { return path; } } diff --git a/src/main/org/audiveris/omr/text/tesseract/TesseractOrder.java b/src/main/org/audiveris/omr/text/tesseract/TesseractOrder.java index 4591adb6d..53dd1f190 100644 --- a/src/main/org/audiveris/omr/text/tesseract/TesseractOrder.java +++ b/src/main/org/audiveris/omr/text/tesseract/TesseractOrder.java @@ -31,8 +31,12 @@ import org.audiveris.omr.text.TextWord; import org.bytedeco.javacpp.*; -import static org.bytedeco.javacpp.lept.*; -import static org.bytedeco.javacpp.tesseract.*; +import org.bytedeco.leptonica.PIX; +import org.bytedeco.tesseract.PageIterator; +import org.bytedeco.tesseract.ResultIterator; +import org.bytedeco.tesseract.TessBaseAPI; +import static org.bytedeco.leptonica.global.lept.*; +import static org.bytedeco.tesseract.global.tesseract.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -182,6 +186,9 @@ public List process () } } + // prevent JVM crashes on non-English systems + setlocale(LC_ALL(), "C"); + if (api.Init(ocrFolder.toString(), lang, OEM_TESSERACT_ONLY) != 0) { logger.warn("Could not initialize Tesseract with lang {}", lang);