From 56aef7265aef9ecb9872fa1faeb7e671ce050f86 Mon Sep 17 00:00:00 2001 From: Peter Gromov Date: Mon, 3 Apr 2023 20:06:58 +0200 Subject: [PATCH] hunspell: disallow hidden title-case entries from compound middle/end (#12220) if we only have custom-case uART and capitalized UART, we shouldn't accept StandUart as a compound (although we keep hidden "Uart" dictionary entries for internal purposes) --- .../lucene/analysis/hunspell/Hunspell.java | 4 +- .../analysis/hunspell/TestSpellChecking.java | 4 ++ .../analysis/hunspell/germanManualCase.aff | 51 +++++++++++++++++++ .../analysis/hunspell/germanManualCase.dic | 5 ++ .../analysis/hunspell/germanManualCase.good | 3 ++ .../analysis/hunspell/germanManualCase.wrong | 3 ++ 6 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good create mode 100644 lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java index 998bfc774291..1e2a1add13cd 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/hunspell/Hunspell.java @@ -164,7 +164,7 @@ && checkCompoundRules(wordChars, 0, length, new ArrayList<>())) { Root findStem( char[] wordChars, int offset, int length, WordCase originalCase, WordContext context) { checkCanceled.run(); - boolean checkCase = context != COMPOUND_MIDDLE && context != COMPOUND_END; + WordCase toCheck = context != COMPOUND_MIDDLE && context != COMPOUND_END ? 
originalCase : null; @SuppressWarnings({"rawtypes", "unchecked"}) Root[] result = new Root[1]; stemmer.doStem( @@ -173,7 +173,7 @@ Root findStem( length, context, (stem, formID, morphDataId, outerPrefix, innerPrefix, outerSuffix, innerSuffix) -> { - if (checkCase && !acceptCase(originalCase, formID, stem)) { + if (!acceptCase(toCheck, formID, stem)) { return dictionary.hasFlag(formID, Dictionary.HIDDEN_FLAG); } if (acceptsStem(formID)) { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java index d1b6bea03b39..cbb6f21f0cf2 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/TestSpellChecking.java @@ -205,6 +205,10 @@ public void testGermanCompounding() throws Exception { doTest("germancompounding"); } + public void testGermanManualCase() throws Exception { + doTest("germanManualCase"); + } + public void testApplyOconvToSuggestions() throws Exception { doTest("oconv"); } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff new file mode 100644 index 000000000000..85274dfec8dd --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.aff @@ -0,0 +1,51 @@ +# no CHECKCOMPOUNDCASE + +# compound flags + +COMPOUNDBEGIN U +COMPOUNDMIDDLE V +COMPOUNDEND W + +ONLYINCOMPOUND X +COMPOUNDPERMITFLAG P + +COMPOUNDMIN 1 +WORDCHARS - + +# dash prefix for compounds with dash (Arbeits-Computer) + +PFX - Y 1 +PFX - 0 -/P . 
+ +# decapitalizing prefix + +PFX D Y 29 +PFX D A a/PX A +PFX D Ä ä/PX Ä +PFX D B b/PX B +PFX D C c/PX C +PFX D D d/PX D +PFX D E e/PX E +PFX D F f/PX F +PFX D G g/PX G +PFX D H h/PX H +PFX D I i/PX I +PFX D J j/PX J +PFX D K k/PX K +PFX D L l/PX L +PFX D M m/PX M +PFX D N n/PX N +PFX D O o/PX O +PFX D Ö ö/PX Ö +PFX D P p/PX P +PFX D Q q/PX Q +PFX D R r/PX R +PFX D S s/PX S +PFX D T t/PX T +PFX D U u/PX U +PFX D Ü ü/PX Ü +PFX D V v/PX V +PFX D W w/PX W +PFX D X x/PX X +PFX D Y y/PX Y +PFX D Z z/PX Z diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic new file mode 100644 index 000000000000..5e075003c9a9 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.dic @@ -0,0 +1,5 @@ +4 +uART/XW- +bein/XW- +Stand/UX +UART/- \ No newline at end of file diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good new file mode 100644 index 000000000000..27c6941024fc --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.good @@ -0,0 +1,3 @@ +UART +Standbein +Stand-uART diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong new file mode 100644 index 000000000000..c3ce031400c8 --- /dev/null +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/hunspell/germanManualCase.wrong @@ -0,0 +1,3 @@ +StandUart +uART +Uart