Skip to content

Commit

Permalink
Add Japanese Vertical Support Branch for Tesseract and Ocrmypdf OCR (#…
Browse files Browse the repository at this point in the history
…2505)

* Add Japanese Vertical Support 
* Adds Japanese Vertical mappings to default configuration.
  • Loading branch information
tenpai-git authored Apr 16, 2024
1 parent 36c00cc commit e731d82
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 5 deletions.
2 changes: 1 addition & 1 deletion docker/dockerfiles/joex.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ RUN \
wget https://github.com/tesseract-ocr/tessdata/raw/main/khm.traineddata && \
mv khm.traineddata /usr/share/tessdata

# Using these data files for japanese, because they work better. See #973
# Using these data files for japanese, because they work better. Includes vertical data. See #973 and #2445.
RUN \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn_vert.traineddata && \
wget https://raw.githubusercontent.com/tesseract-ocr/tessdata_fast/master/jpn.traineddata && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ object DateFind {
case Language.Dutch => dmy.or(ymd).or(mdy)
case Language.Latvian => dmy.or(lavLong).or(ymd)
case Language.Japanese => ymd
case Language.JpnVert => ymd
case Language.Hebrew => dmy
case Language.Lithuanian => ymd
case Language.Polish => dmy
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ object MonthName {
latvian
case Language.Japanese =>
japanese
case Language.JpnVert =>
japanese
case Language.Hebrew =>
hebrew
case Language.Lithuanian =>
Expand Down
6 changes: 6 additions & 0 deletions modules/common/src/main/scala/docspell/common/Language.scala
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,11 @@ object Language {
val iso3 = "jpn"
}

/* Japanese written vertically.
 *
 * Neither code below is a real ISO language code. They only need to be
 * unique among the languages defined here. The iso3 value is "jpn_vert"
 * because that is the language name tesseract needs for its scan; it is
 * taken from the joex configuration
 * (/etc/docspell-joex/docspell-joex.conf).
 */
case object JpnVert extends Language {
val iso2 = "ja_vert"
val iso3 = "jpn_vert"
}
case object Hebrew extends Language {
val iso2 = "he"
val iso3 = "heb"
Expand Down Expand Up @@ -172,6 +177,7 @@ object Language {
Romanian,
Latvian,
Japanese,
JpnVert,
Hebrew,
Lithuanian,
Polish,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ object FtsRepository extends DoobieMeta {
case Language.Czech => "simple"
case Language.Latvian => "simple"
case Language.Japanese => "simple"
case Language.JpnVert => "simple"
case Language.Hebrew => "simple"
case Language.Lithuanian => "simple"
case Language.Polish => "simple"
Expand Down
48 changes: 44 additions & 4 deletions modules/joex/src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -593,13 +593,32 @@ Docpell Update Check
# To convert image files to PDF files, tesseract is used. This
# also extracts the text in one go.
tesseract = {
# Custom argument mappings for this program, selected by the value of {{lang}}.
# The first entry below is the Japanese vertical (jpn_vert) mapping.
arg-mappings = {
"tesseract_lang" = {
value = "{{lang}}"
mappings = [
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "-c", "preserve_interword_spaces=1" ]
},
# Start Other Custom Language Mappings Here
# Default Mapping Below
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
command = {
program = "tesseract"
# Default arguments for all processing go below.
args = [
"{{infile}}",
"out",
"-l",
"{{lang}}",
"{{tesseract_lang}}",
"pdf",
"txt"
]
Expand Down Expand Up @@ -648,11 +667,32 @@ Docpell Update Check
# (where ocr is not necessary). In this case, the pdf will be
# converted to PDF/A.
ocrmypdf = {
# Custom argument mappings for this program.
arg-mappings = {
"ocr_lang" = {
value = "{{lang}}"
# Custom Language Mappings Below
# Japanese Vertical Mapping
mappings = [
{
matches = "jpn_vert"
args = [ "-l", "jpn_vert", "--pdf-renderer", "sandwich", "--tesseract-pagesegmode", "5" ]
},
# Start Other Custom Language Mappings Here
# Default Mapping Below
{
matches = ".*"
args = [ "-l", "{{lang}}" ]
}
]
}
}
enabled = true
command = {
program = "ocrmypdf"
# Default arguments for all processing go below.
args = [
"-l", "{{lang}}",
"{{ocr_lang}}",
"--skip-text",
"--deskew",
"-j", "1",
Expand Down Expand Up @@ -893,4 +933,4 @@ Docpell Update Check
}
}
}
}
}
3 changes: 3 additions & 0 deletions modules/webapp/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,8 @@
"@fortawesome/fontawesome-free": "^6.0.0",
"@tailwindcss/forms": "^0.5.0",
"flag-icons": "^7.2.0"
},
"dependencies": {
"tailwindcss": "^3.4.1"
}
}
8 changes: 8 additions & 0 deletions modules/webapp/src/main/elm/Data/Language.elm
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ type Language
| Dutch
| Latvian
| Japanese
| JpnVert
| Hebrew
| Hungarian
| Lithuanian
Expand Down Expand Up @@ -90,6 +91,9 @@ fromString str =
else if str == "jpn" || str == "ja" || str == "japanese" then
Just Japanese

else if str == "jpn_vert" || str == "ja_vert" || str == "jpnvert" then
Just JpnVert

else if str == "heb" || str == "he" || str == "hebrew" then
Just Hebrew

Expand Down Expand Up @@ -169,6 +173,9 @@ toIso3 lang =
Japanese ->
"jpn"

JpnVert ->
"jpn_vert"

Hebrew ->
"heb"

Expand Down Expand Up @@ -212,6 +219,7 @@ all =
, Romanian
, Latvian
, Japanese
, JpnVert
, Hebrew
, Hungarian
, Lithuanian
Expand Down
9 changes: 9 additions & 0 deletions modules/webapp/src/main/elm/Messages/Data/Language.elm
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ gb lang =
Japanese ->
"Japanese"

JpnVert ->
"JpnVert"

Hebrew ->
"Hebrew"

Expand Down Expand Up @@ -141,6 +144,9 @@ de lang =
Japanese ->
"Japanisch"

JpnVert ->
"JpnVert"

Hebrew ->
"Hebräisch"

Expand Down Expand Up @@ -217,6 +223,9 @@ fr lang =
Japanese ->
"Japonnais"

JpnVert ->
"JpnVert"

Hebrew ->
"Hébreu"

Expand Down

0 comments on commit e731d82

Please sign in to comment.