Skip to content

Commit

Permalink
Switch to Tesseract 4.1.1 and Javacpp-presets 1.5.6.
Browse files Browse the repository at this point in the history
  • Loading branch information
maximumspatium committed Apr 20, 2022
1 parent 1e34322 commit ce97610
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 16 deletions.
18 changes: 9 additions & 9 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ ext.programVersion = "$project.version"
ext.companyName = "$programName Ltd."
ext.companyId = "${programName}Ltd"

ext.jcppVersion = '1.4.4'
ext.leptVersion = '1.77.0'
ext.tessVersion = '4.0.0'
ext.jcppVersion = '1.5.6'
ext.leptVersion = '1.81.1'
ext.tessVersion = '4.1.1'

// this code is required in order to adapt values of os.name and os.arch to the
// conventions used by Javacpp's dependencies
Expand Down Expand Up @@ -114,8 +114,8 @@ dependencies {
[group: 'gov.nist.math', name: 'jama', version: '1.0.3'],
[group: 'org.reflections', name: 'reflections', version: '0.10.2'],
[group: 'org.bytedeco', name: 'javacpp', version: jcppVersion],
[group: 'org.bytedeco.javacpp-presets', name: 'leptonica', version: "${leptVersion}-${jcppVersion}"],
[group: 'org.bytedeco.javacpp-presets', name: 'tesseract', version: "${tessVersion}-${jcppVersion}"],
[group: 'org.bytedeco', name: 'leptonica', version: "${leptVersion}-${jcppVersion}"],
[group: 'org.bytedeco', name: 'tesseract', version: "${tessVersion}-${jcppVersion}"],
[group: 'com.github.jai-imageio', name: 'jai-imageio-core', version: '1.4.0'],
[group: 'org.apache.directory.studio', name: 'org.apache.commons.io', version: '2.4'],
[group: 'javax.xml.bind', name: 'jaxb-api', version: '2.3.1'],
Expand All @@ -124,8 +124,8 @@ dependencies {
)

runtimeOnly(
[group: 'org.bytedeco.javacpp-presets', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"],
[group: 'org.bytedeco.javacpp-presets', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"]
[group: 'org.bytedeco', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"],
[group: 'org.bytedeco', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "${project.ext.targetOS}"]
)

testImplementation(
Expand All @@ -137,8 +137,8 @@ dependencies {
// Specific configurations for specific OS dependencies
['windows-x86', 'windows-x86_64'].each { os ->
configurations.create("runtime-$os")
dependencies.add("runtime-$os", [group: 'org.bytedeco.javacpp-presets', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "$os"])
dependencies.add("runtime-$os", [group: 'org.bytedeco.javacpp-presets', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "$os"])
dependencies.add("runtime-$os", [group: 'org.bytedeco', name: 'leptonica', version: "${leptVersion}-${jcppVersion}", classifier: "$os"])
dependencies.add("runtime-$os", [group: 'org.bytedeco', name: 'tesseract', version: "${tessVersion}-${jcppVersion}", classifier: "$os"])
}

jar {
Expand Down
16 changes: 11 additions & 5 deletions src/main/org/audiveris/omr/text/tesseract/TesseractOCR.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
import org.audiveris.omr.text.TextLine;
import org.audiveris.omr.text.TextWord;

import org.bytedeco.javacpp.tesseract;
import org.bytedeco.javacpp.tesseract.StringGenericVector;
import org.bytedeco.javacpp.tesseract.TessBaseAPI;
import org.bytedeco.tesseract.global.tesseract;
import org.bytedeco.tesseract.StringGenericVector;
import org.bytedeco.tesseract.TessBaseAPI;
import static org.bytedeco.tesseract.global.tesseract.*;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -102,6 +103,11 @@ public Set<String> getLanguages ()
final Path ocrFolder = getOcrFolder();
final TreeSet<String> set = new TreeSet<>();

// We need to call this to avoid a crash
// in native code on non-English systems, see
// https://github.com/bytedeco/javacpp-presets/issues/694
setlocale(LC_ALL(), "C");

This comment has been minimized.

Copy link
@stweil

stweil Apr 21, 2022

Contributor

Is this line really still needed? The Tesseract code should run with any locale settings (if it does not, that would be a bug).

This comment has been minimized.

Copy link
@maximumspatium

maximumspatium Apr 21, 2022

Author Contributor

@stweil Ja, das ist ein Bug, der uns seit Tesseract 3 plagt! Dieses Problem wurde in der Vergangenheit mehrfach berichtet, z.B. hier. Jedes Mal hieß es: das ist nicht unser Problem oder wir unterstützen Java nicht usw. usf. Irgendwann hatten wir die Schnauze voll und entwickelten eine Lösung, die darin bestand, ein Java-Interface für setlocale bereitzustellen, das wir in unserem Code aufrufen können.

This comment has been minimized.

Copy link
@maximumspatium

maximumspatium Apr 21, 2022

Author Contributor

This comment has been minimized.

Copy link
@stweil

stweil Apr 22, 2022

Contributor

Z.K. bytedeco/javacpp-presets#694

That issue was fixed three years ago, see tesseract-ocr/tesseract@5d92fbf.

This comment has been minimized.

Copy link
@stweil

stweil Apr 22, 2022

Contributor

das ist ein Bug, der uns seit Tesseract 3 plagt!

Richtig, alte Versionen hatten dieses Problem. Mit Tesseract 4.1 oder Tesseract 5 ist es aber (soweit ich weiß, vollständig) gelöst.


try {
final TessBaseAPI api = new TessBaseAPI();

Expand Down Expand Up @@ -305,9 +311,9 @@ private int getMode (LayoutMode layoutMode)
private Path scanOcrLocations (String[] locations)
{
for (String loc : locations) {
final Path path = Paths.get(loc);
final Path path = Paths.get(loc).resolve("tessdata");

if (Files.exists(path.resolve("tessdata"))) {
if (Files.exists(path)) {
return path;
}
}
Expand Down
11 changes: 9 additions & 2 deletions src/main/org/audiveris/omr/text/tesseract/TesseractOrder.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,12 @@
import org.audiveris.omr.text.TextWord;

import org.bytedeco.javacpp.*;
import static org.bytedeco.javacpp.lept.*;
import static org.bytedeco.javacpp.tesseract.*;
import org.bytedeco.leptonica.PIX;
import org.bytedeco.tesseract.PageIterator;
import org.bytedeco.tesseract.ResultIterator;
import org.bytedeco.tesseract.TessBaseAPI;
import static org.bytedeco.leptonica.global.lept.*;
import static org.bytedeco.tesseract.global.tesseract.*;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -182,6 +186,9 @@ public List<TextLine> process ()
}
}

// prevent JVM crashes on non-English systems
setlocale(LC_ALL(), "C");

if (api.Init(ocrFolder.toString(), lang, OEM_TESSERACT_ONLY) != 0) {
logger.warn("Could not initialize Tesseract with lang {}", lang);

Expand Down

0 comments on commit ce97610

Please sign in to comment.