-
Notifications
You must be signed in to change notification settings - Fork 25k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Ukrainian language plugin can fill up heap (#71998)
The Lucene Ukrainian analyzer has a bug where a large in-memory dictionary is loaded and stored in a thread local for every TokenStream generated in a new thread (for more details see https://issues.apache.org/jira/browse/LUCENE-9930). Due to checks added in #50908, we create a TokenStream for every registered analyzer in every shard, which means that any node with the Ukrainian plugin installed will leak one copy of this dictionary per shard, whether or not the Ukrainian analyzer is actually being used. This commit makes the plugin use a fixed version of the UkrainianMorfologikAnalyzer until we merge a version of Lucene that contains the upstream fix.
- Loading branch information
1 parent
81017be
commit d6038a3
Showing
3 changed files
with
165 additions
and
7 deletions.
There are no files selected for viewing
158 changes: 158 additions & 0 deletions
158
...s-ukrainian/src/main/java/org/apache/lucene/analysis/uk/XUkrainianMorfologikAnalyzer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
/*@notice | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.lucene.analysis.uk; | ||
|
||
import morfologik.stemming.Dictionary; | ||
import org.apache.lucene.analysis.Analyzer; | ||
import org.apache.lucene.analysis.CharArraySet; | ||
import org.apache.lucene.analysis.LowerCaseFilter; | ||
import org.apache.lucene.analysis.StopFilter; | ||
import org.apache.lucene.analysis.StopwordAnalyzerBase; | ||
import org.apache.lucene.analysis.TokenStream; | ||
import org.apache.lucene.analysis.Tokenizer; | ||
import org.apache.lucene.analysis.WordlistLoader; | ||
import org.apache.lucene.analysis.charfilter.MappingCharFilter; | ||
import org.apache.lucene.analysis.charfilter.NormalizeCharMap; | ||
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; | ||
import org.apache.lucene.analysis.morfologik.MorfologikFilter; | ||
import org.apache.lucene.analysis.standard.StandardTokenizer; | ||
import org.apache.lucene.util.IOUtils; | ||
import org.elasticsearch.common.SuppressForbidden; | ||
|
||
import java.io.IOException; | ||
import java.io.Reader; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
/** | ||
* A dictionary-based {@link Analyzer} for Ukrainian. | ||
* | ||
* Modified from lucene 8.8.0 sources to incorporate a bugfix for | ||
* https://issues.apache.org/jira/browse/LUCENE-9930 | ||
*/ | ||
public final class XUkrainianMorfologikAnalyzer extends StopwordAnalyzerBase { | ||
private final CharArraySet stemExclusionSet; | ||
|
||
/** File containing default Ukrainian stopwords. */ | ||
public static final String DEFAULT_STOPWORD_FILE = "stopwords.txt"; | ||
|
||
/** | ||
* Returns an unmodifiable instance of the default stop words set. | ||
* @return default stop words set. | ||
*/ | ||
public static CharArraySet getDefaultStopSet() { | ||
return DefaultSetHolder.DEFAULT_STOP_SET; | ||
} | ||
|
||
/** | ||
* Atomically loads the DEFAULT_STOP_SET and DICTIONARY in a lazy fashion once the outer class | ||
* accesses the static final set the first time.; | ||
*/ | ||
@SuppressForbidden(reason="Lucene uses IOUtils") | ||
private static class DefaultSetHolder { | ||
static final CharArraySet DEFAULT_STOP_SET; | ||
static final Dictionary DICTIONARY; | ||
|
||
static { | ||
try { | ||
DEFAULT_STOP_SET = WordlistLoader.getSnowballWordSet(IOUtils.getDecodingReader(UkrainianMorfologikAnalyzer.class, | ||
DEFAULT_STOPWORD_FILE, StandardCharsets.UTF_8)); | ||
DICTIONARY = Dictionary.read( | ||
UkrainianMorfologikAnalyzer.class.getClassLoader().getResource("ua/net/nlp/ukrainian.dict")); | ||
} catch (IOException ex) { | ||
// default set should always be present as it is part of the | ||
// distribution (JAR) | ||
throw new RuntimeException("Unable to load resources", ex); | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Builds an analyzer with the default stop words: {@link #DEFAULT_STOPWORD_FILE}. | ||
*/ | ||
public XUkrainianMorfologikAnalyzer() { | ||
this(DefaultSetHolder.DEFAULT_STOP_SET); | ||
} | ||
|
||
/** | ||
* Builds an analyzer with the given stop words. | ||
* | ||
* @param stopwords a stopword set | ||
*/ | ||
public XUkrainianMorfologikAnalyzer(CharArraySet stopwords) { | ||
this(stopwords, CharArraySet.EMPTY_SET); | ||
} | ||
|
||
/** | ||
* Builds an analyzer with the given stop words. If a non-empty stem exclusion set is | ||
* provided this analyzer will add a {@link SetKeywordMarkerFilter} before | ||
* stemming. | ||
* | ||
* @param stopwords a stopword set | ||
* @param stemExclusionSet a set of terms not to be stemmed | ||
*/ | ||
public XUkrainianMorfologikAnalyzer(CharArraySet stopwords, CharArraySet stemExclusionSet) { | ||
super(stopwords); | ||
this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(stemExclusionSet)); | ||
} | ||
|
||
@Override | ||
protected Reader initReader(String fieldName, Reader reader) { | ||
NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder(); | ||
// different apostrophes | ||
builder.add("\u2019", "'"); | ||
builder.add("\u2018", "'"); | ||
builder.add("\u02BC", "'"); | ||
builder.add("`", "'"); | ||
builder.add("´", "'"); | ||
// ignored characters | ||
builder.add("\u0301", ""); | ||
builder.add("\u00AD", ""); | ||
builder.add("ґ", "г"); | ||
builder.add("Ґ", "Г"); | ||
|
||
NormalizeCharMap normMap = builder.build(); | ||
reader = new MappingCharFilter(normMap, reader); | ||
return reader; | ||
} | ||
|
||
/** | ||
* Creates a | ||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} | ||
* which tokenizes all the text in the provided {@link Reader}. | ||
* | ||
* @return A | ||
* {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} | ||
* built from an {@link StandardTokenizer} filtered with | ||
* {@link LowerCaseFilter}, {@link StopFilter} | ||
* , {@link SetKeywordMarkerFilter} if a stem exclusion set is | ||
* provided and {@link MorfologikFilter} on the Ukrainian dictionary. | ||
*/ | ||
@Override | ||
protected TokenStreamComponents createComponents(String fieldName) { | ||
Tokenizer source = new StandardTokenizer(); | ||
TokenStream result = new LowerCaseFilter(source); | ||
result = new StopFilter(result, stopwords); | ||
|
||
if (stemExclusionSet.isEmpty() == false) { | ||
result = new SetKeywordMarkerFilter(result, stemExclusionSet); | ||
} | ||
|
||
result = new MorfologikFilter(result, DefaultSetHolder.DICTIONARY); | ||
return new TokenStreamComponents(source, result); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters