From 56aef7265aef9ecb9872fa1faeb7e671ce050f86 Mon Sep 17 00:00:00 2001
From: Peter Gromov <peter@jetbrains.com>
Date: Mon, 3 Apr 2023 20:06:58 +0200
Subject: [PATCH] hunspell: disallow hidden title-case entries from compound
 middle/end (#12220)

If the dictionary only has custom-case "uART" and upper-case "UART", we shouldn't accept "StandUart" as a compound (although we keep hidden title-case "Uart" dictionary entries for internal purposes).
---
 .../lucene/analysis/hunspell/Hunspell.java    |  4 +-
 .../analysis/hunspell/TestSpellChecking.java  |  4 ++
 .../analysis/hunspell/germanManualCase.aff    | 51 +++++++++++++++++++
 .../analysis/hunspell/germanManualCase.dic    |  5 ++
 .../analysis/hunspell/germanManualCase.good   |  3 ++
 .../analysis/hunspell/germanManualCase.wrong  |  3 ++
 6 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good
 create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong
diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
index 998bfc774291..1e2a1add13cd 100644
--- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
+++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java
@@ -164,7 +164,7 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) {
   Root<CharsRef> findStem(
       char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) {
     checkCanceled.run();
-    boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END;
+    WordCase toCheck = context != COMPOUND_MIDDLE && context != COMPOUND_END ? originalCase : null;
     @SuppressWarnings({"rawtypes", "unchecked"})
     Root<CharsRef>[] result = new Root[1];
     stemmer.doStem(
@@ -173,7 +173,7 @@ Root<CharsRef> findStem(
         length,
         context,
         (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> {
-          if (checkCase && !acceptCase(originalCase, formID, stem)) {
+          if (!acceptCase(toCheck, formID, stem)) {
             return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG);
           }
           if (acceptsStem(formID)) {
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
index d1b6bea03b39..cbb6f21f0cf2 100644
--- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java
@@ -205,6 +205,10 @@ public void testGermanCompounding() throws Exception {
     doTest("germancompounding");
   }
 
+  public void testGermanManualCase() throws Exception {
+    doTest("germanManualCase");
+  }
+
   public void testApplyOconvToSuggestions() throws Exception {
     doTest("oconv");
   }
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff
new file mode 100644
index 000000000000..85274dfec8dd
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff
@@ -0,0 +1,51 @@
+# no CHECKCOMPOUNDCASE
+
+# compound flags
+
+COMPOUNDBEGIN U
+COMPOUNDMIDDLE V
+COMPOUNDEND W
+
+ONLYINCOMPOUND X
+COMPOUNDPERMITFLAG P
+
+COMPOUNDMIN 1
+WORDCHARS -
+
+# dash prefix for compounds with dash (Arbeits-Computer)
+
+PFX - Y 1
+PFX - 0 -/P .
+
+# decapitalizing prefix
+
+PFX D Y 29
+PFX D A a/PX A
+PFX D Ä ä/PX Ä
+PFX D B b/PX B
+PFX D C c/PX C
+PFX D D d/PX D
+PFX D E e/PX E
+PFX D F f/PX F
+PFX D G g/PX G
+PFX D H h/PX H
+PFX D I i/PX I
+PFX D J j/PX J
+PFX D K k/PX K
+PFX D L l/PX L
+PFX D M m/PX M
+PFX D N n/PX N
+PFX D O o/PX O
+PFX D Ö ö/PX Ö
+PFX D P p/PX P
+PFX D Q q/PX Q
+PFX D R r/PX R
+PFX D S s/PX S
+PFX D T t/PX T
+PFX D U u/PX U
+PFX D Ü ü/PX Ü
+PFX D V v/PX V
+PFX D W w/PX W
+PFX D X x/PX X
+PFX D Y y/PX Y
+PFX D Z z/PX Z
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic
new file mode 100644
index 000000000000..5e075003c9a9
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic
@@ -0,0 +1,5 @@
+4
+uART/XW-
+bein/XW-
+Stand/UX
+UART/-
\ No newline at end of file
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good
new file mode 100644
index 000000000000..27c6941024fc
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good
@@ -0,0 +1,3 @@
+UART
+Standbein
+Stand-uART
diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong
new file mode 100644
index 000000000000..c3ce031400c8
--- /dev/null
+++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong
@@ -0,0 +1,3 @@
+StandUart
+uART
+Uart