From 9922364aa845ae41453dddc94d92537c60828bc5 Mon Sep 17 00:00:00 2001 From: tenpai Date: Tue, 16 Apr 2024 18:24:57 +0000 Subject: [PATCH] Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#2505) * Add Japanese Vertical Support * Adds Japanese Vertical mappings to default configuration. --- docker/dockerfiles/joex.dockerfile | 2 +- .../docspell/analysis/date/DateFind.scala | 1 + .../docspell/analysis/date/MonthName.scala | 2 + .../main/scala/docspell/common/Language.scala | 6 +++ .../docspell/ftspsql/FtsRepository.scala | 1 + .../joex/src/main/resources/reference.conf | 48 +++++++++++++++++-- modules/webapp/package.json | 3 ++ modules/webapp/src/main/elm/Data/Language.elm | 8 ++++ .../src/main/elm/Messages/Data/Language.elm | 9 ++++ 9 files changed, 75 insertions(+), 5 deletions(-) diff --git a/docker/dockerfiles/joex.dockerfile b/docker/dockerfiles/joex.dockerfile index df9c90c69d..6d996521c7 100644 --- a/docker/dockerfiles/joex.dockerfile +++ b/docker/dockerfiles/joex.dockerfile @@ -77,7 +77,7 @@ RUN \ wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \ mv khm.traineddata /usr/share/tessdata -# Using these data files for japanese, because they work better. See #973 +# Using these data files for japanese, because they work better. Includes vertical data. See #973 and #2445. 
RUN \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \ wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \ diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala index 1260a3dbf2..c78bc81bf6 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/DateFind.scala @@ -125,6 +125,7 @@ object DateFind { case Language.Dutch => dmy.or(ymd).or(mdy) case Language.Latvian => dmy.or(lavLong).or(ymd) case Language.Japanese => ymd + case Language.JpnVert => ymd case Language.Hebrew => dmy case Language.Lithuanian => ymd case Language.Polish => dmy diff --git a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala index b646b46bb5..4693440e48 100644 --- a/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala +++ b/modules/analysis/src/main/scala/docspell/analysis/date/MonthName.scala @@ -54,6 +54,8 @@ object MonthName { latvian case Language.Japanese => japanese + case Language.JpnVert => + japanese case Language.Hebrew => hebrew case Language.Lithuanian => diff --git a/modules/common/src/main/scala/docspell/common/Language.scala b/modules/common/src/main/scala/docspell/common/Language.scala index 7b7ef8156b..d1ae6e06be 100644 --- a/modules/common/src/main/scala/docspell/common/Language.scala +++ b/modules/common/src/main/scala/docspell/common/Language.scala @@ -123,6 +123,11 @@ object Language { val iso3 = "jpn" } + /* This is not an ISO value, but it must be unique; tesseract needs jpn_vert for its scan, as configured in /etc/docspell-joex/docspell-joex.conf. */ + case object JpnVert extends Language { + val iso2 = "ja_vert" + val iso3 = "jpn_vert" + } case object Hebrew extends
Language { val iso2 = "he" val iso3 = "heb" @@ -172,6 +177,7 @@ object Language { Romanian, Latvian, Japanese, + JpnVert, Hebrew, Lithuanian, Polish, diff --git a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala index eecacbc93f..1be93be77e 100644 --- a/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala +++ b/modules/fts-psql/src/main/scala/docspell/ftspsql/FtsRepository.scala @@ -201,6 +201,7 @@ object FtsRepository extends DoobieMeta { case Language.Czech => "simple" case Language.Latvian => "simple" case Language.Japanese => "simple" + case Language.JpnVert => "simple" case Language.Hebrew => "simple" case Language.Lithuanian => "simple" case Language.Polish => "simple" diff --git a/modules/joex/src/main/resources/reference.conf b/modules/joex/src/main/resources/reference.conf index 3f740c7de9..fc54bae4b5 100644 --- a/modules/joex/src/main/resources/reference.conf +++ b/modules/joex/src/main/resources/reference.conf @@ -593,13 +593,32 @@ Docpell Update Check # To convert image files to PDF files, tesseract is used. This # also extracts the text in one go. tesseract = { + # Custom Language Mappings Below + # Japanese Vertical Mapping + arg-mappings = { + "tesseract_lang" = { + value = "{{lang}}" + mappings = [ + { + matches = "jpn_vert" + args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ] + }, + # Start Other Custom Language Mappings Here + # Default Mapping Below + { + matches = ".*" + args = [ "-l", "{{lang}}" ] + } + ] + } + } command = { program = "tesseract" + # Default arguments for all processing go below. args = [ "{{infile}}", "out", - "-l", - "{{lang}}", + "{{tesseract_lang}}", "pdf", "txt" ] @@ -648,11 +667,32 @@ Docpell Update Check # (where ocr is not necessary). In this case, the pdf will be # converted to PDF/A. ocrmypdf = { + # Custom argument mappings for this program. 
+ arg-mappings = { + "ocr_lang" = { + value = "{{lang}}" + # Custom Language Mappings Below + # Japanese Vertical Mapping + mappings = [ + { + matches = "jpn_vert" + args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ] + }, + # Start Other Custom Language Mappings Here + # Default Mapping Below + { + matches = ".*" + args = [ "-l", "{{lang}}" ] + } + ] + } + } enabled = true command = { program = "ocrmypdf" + # Default arguments for all processing go below. args = [ - "-l", "{{lang}}", + "{{ocr_lang}}", "--skip-text", "--deskew", "-j", "1", @@ -893,4 +933,4 @@ Docpell Update Check } } } -} \ No newline at end of file +} diff --git a/modules/webapp/package.json b/modules/webapp/package.json index cb22d3df51..689843cf22 100644 --- a/modules/webapp/package.json +++ b/modules/webapp/package.json @@ -6,5 +6,8 @@ "@fortawesome/fontawesome-free": "^6.0.0", "@tailwindcss/forms": "^0.5.0", "flag-icons": "^7.2.0" + }, + "dependencies": { + "tailwindcss": "^3.4.1" } } diff --git a/modules/webapp/src/main/elm/Data/Language.elm b/modules/webapp/src/main/elm/Data/Language.elm index 0d863f380d..56a6f69b49 100644 --- a/modules/webapp/src/main/elm/Data/Language.elm +++ b/modules/webapp/src/main/elm/Data/Language.elm @@ -30,6 +30,7 @@ type Language | Dutch | Latvian | Japanese + | JpnVert | Hebrew | Hungarian | Lithuanian @@ -90,6 +91,9 @@ fromString str = else if str == "jpn" || str == "ja" || str == "japanese" then Just Japanese + else if str == "jpn_vert" || str == "ja_vert" || str == "jpnvert" then + Just JpnVert + else if str == "heb" || str == "he" || str == "hebrew" then Just Hebrew @@ -169,6 +173,9 @@ toIso3 lang = Japanese -> "jpn" + JpnVert -> + "jpn_vert" + Hebrew -> "heb" @@ -212,6 +219,7 @@ all = , Romanian , Latvian , Japanese + , JpnVert , Hebrew , Hungarian , Lithuanian diff --git a/modules/webapp/src/main/elm/Messages/Data/Language.elm b/modules/webapp/src/main/elm/Messages/Data/Language.elm index 2369a5a475..3bba9d7ea6 100644 
--- a/modules/webapp/src/main/elm/Messages/Data/Language.elm +++ b/modules/webapp/src/main/elm/Messages/Data/Language.elm @@ -65,6 +65,9 @@ gb lang = Japanese -> "Japanese" + JpnVert -> + "JpnVert" + Hebrew -> "Hebrew" @@ -141,6 +144,9 @@ de lang = Japanese -> "Japanisch" + JpnVert -> + "JpnVert" + Hebrew -> "Hebräisch" @@ -217,6 +223,9 @@ fr lang = Japanese -> "Japonnais" + JpnVert -> + "JpnVert" + Hebrew -> "Hébreu"