From b03c1e4430dbd55702d246a671daacbed3420331 Mon Sep 17 00:00:00 2001 From: tangdonghai Date: Thu, 11 May 2023 13:19:03 +0800 Subject: [PATCH 1/8] toposort use iterator to avoid stackoverflow --- .../lucene/util/automaton/Operations.java | 44 ++++++++++++------- .../lucene/util/automaton/TestOperations.java | 13 ------ .../analyzing/TestAnalyzingSuggester.java | 16 ------- 3 files changed, 28 insertions(+), 45 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 268744203154..7165e5ec4010 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -39,6 +39,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.Stack; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -1284,7 +1285,7 @@ public static int[] topoSortStates(Automaton a) { int numStates = a.getNumStates(); int[] states = new int[numStates]; final BitSet visited = new BitSet(numStates); - int upto = topoSortStatesRecurse(a, visited, states, 0, 0, 0); + int upto = topoSortStates(a, visited, states); if (upto < states.length) { // There were dead states @@ -1303,24 +1304,35 @@ public static int[] topoSortStates(Automaton a) { return states; } - // TODO: not great that this is recursive... in theory a - // large automata could exceed java's stack so the maximum level of recursion is bounded to 1000 - private static int topoSortStatesRecurse( - Automaton a, BitSet visited, int[] states, int upto, int state, int level) { - if (level > MAX_RECURSION_LEVEL) { - throw new IllegalArgumentException("input automaton is too large: " + level); - } + private static int topoSortStates(Automaton a, BitSet visited, int[] states) { + + Stack stack = new Stack<>(); + stack.push(0); // Assuming that the initial state is 0. + int upto = 0; Transition t = new Transition(); - int count = a.initTransition(state, t); - for (int i = 0; i < count; i++) { - a.getNextTransition(t); - if (!visited.get(t.dest)) { - visited.set(t.dest); - upto = topoSortStatesRecurse(a, visited, states, upto, t.dest, level + 1); + + while (!stack.empty()) { + int state = stack.peek(); // Just peek, don't remove the state yet + + int count = a.initTransition(state, t); + boolean pushed = false; + for (int i = 0; i < count; i++) { + a.getNextTransition(t); + if (!visited.get(t.dest)) { + visited.set(t.dest); + stack.push(t.dest); // Push the next unvisited state onto the stack + pushed = true; + break; // Exit the loop, we'll continue from here in the next iteration + } + } + + // If we haven't pushed any new state onto the stack, we're done with this state + if (!pushed) { + stack.pop(); + states[upto] = state; + upto++; } } - states[upto] = state; - upto++; return upto; } } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index 8875bf8c7718..880d6c77fea0 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -136,19 +136,6 @@ public void testIsFiniteEatsStack() { assertTrue(exc.getMessage().contains("input automaton is too large")); } - public void testTopoSortEatsStack() { - char[] chars = new char[50000]; - TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length); - String bigString1 = new String(chars); - TestUtil.randomFixedLengthUnicodeString(random(), chars, 0, chars.length); - String bigString2 = new String(chars); - Automaton a = - Operations.union(Automata.makeString(bigString1), Automata.makeString(bigString2)); - IllegalArgumentException exc = - expectThrows(IllegalArgumentException.class, () -> Operations.topoSortStates(a)); - assertTrue(exc.getMessage().contains("input automaton is too large")); - } - /** * Returns the set of all accepted strings. * diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java index b5e49c5ec79b..cb14d983716f 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/TestAnalyzingSuggester.java @@ -1325,22 +1325,6 @@ static final Iterable shuffle(Input... values) { return asList; } - // TODO: we need BaseSuggesterTestCase? - public void testTooLongSuggestion() throws Exception { - Analyzer a = new MockAnalyzer(random()); - Directory tempDir = getDirectory(); - AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a); - String bigString = TestUtil.randomSimpleString(random(), 30000, 30000); - IllegalArgumentException ex = - expectThrows( - IllegalArgumentException.class, - () -> { - suggester.build(new InputArrayIterator(new Input[] {new Input(bigString, 7)})); - }); - assertTrue(ex.getMessage().contains("input automaton is too large")); - IOUtils.close(a, tempDir); - } - private Directory getDirectory() { return newDirectory(); } From 1e7020a62730ba810b2cc664f9dd1836f9a6b219 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Fri, 12 May 2023 00:55:24 +0800 Subject: [PATCH 2/8] detect cycle when topsort --- .../lucene/util/automaton/Operations.java | 39 ++++++++++++------- .../lucene/util/automaton/TestOperations.java | 31 +++++++++++++++ 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 7165e5ec4010..f05fb52e9c8f 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -29,17 +29,7 @@ package org.apache.lucene.util.automaton; -import java.util.ArrayDeque; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.BitSet; -import java.util.Collection; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.Stack; +import java.util.*; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -1274,9 +1264,14 @@ static Automaton totalize(Automaton a) { } /** - * Returns the topological sort of all states reachable from the initial state. Behavior is - * undefined if this automaton has cycles. CPU cost is O(numTransitions), and the implementation - * is recursive so an automaton matching long strings may exhaust the java stack. + * Returns the topological sort of all states reachable from the initial state. This method + * assumes that the automaton does not contain cycles, and will throw an IllegalArgumentException + * if a cycle is detected. The CPU cost is O(numTransitions), and the implementation is + * non-recursive, so it will not exhaust the java stack for automaton matching long strings. If + * there are dead states in the automaton, they will be removed from the returned array. + * + * @param a the Automaton to be sorted + * @return the topologically sorted array of state ids */ public static int[] topoSortStates(Automaton a) { if (a.getNumStates() == 0) { @@ -1304,8 +1299,17 @@ public static int[] topoSortStates(Automaton a) { return states; } + /** + * Performs a topological sort on the states of the given Automaton. + * + * @param a The automaton whose states are to be topologically sorted. + * @param visited A BitSet representing the visited states during the sorting process. + * @param states An int array which stores the reversed topological order of the states. + * @return the topologically sorted array of state ids + * @throws IllegalArgumentException if the input automaton has a cycle. + */ private static int topoSortStates(Automaton a, BitSet visited, int[] states) { - + BitSet onStack = new BitSet(a.getNumStates()); Stack stack = new Stack<>(); stack.push(0); // Assuming that the initial state is 0. int upto = 0; @@ -1321,13 +1325,18 @@ private static int topoSortStates(Automaton a, BitSet visited, int[] states) { if (!visited.get(t.dest)) { visited.set(t.dest); stack.push(t.dest); // Push the next unvisited state onto the stack + onStack.set(state); pushed = true; break; // Exit the loop, we'll continue from here in the next iteration + } else if (onStack.get(t.dest)) { + // If the state is on the current recursion stack, we have detected a cycle + throw new IllegalArgumentException("Input automaton has a cycle."); } } // If we haven't pushed any new state onto the stack, we're done with this state if (!pushed) { + onStack.clear(state); // remove the node from the current recursion stack stack.pop(); states[upto] = state; upto++; diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index 880d6c77fea0..4417869aeb6e 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -17,6 +17,7 @@ package org.apache.lucene.util.automaton; import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT; +import static org.apache.lucene.util.automaton.Operations.topoSortStates; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.util.ArrayList; @@ -69,6 +70,36 @@ public void testEmptyLanguageConcatenate() { assertTrue(Operations.isEmpty(concat)); } + /** + * Test case for the topoSortStates method when the input Automaton contains a cycle. This test + * case constructs an Automaton with two disjoint sets of states—one without a cycle and one with + * a cycle. The topoSortStates method should detect the presence of a cycle and throw an + * IllegalArgumentException. + */ + public void testCycledAutomaton() { + Automaton a = new Automaton(); + // Create three states (s1, s2, s3) for the non-cycled part of the Automaton. + int s1 = a.createState(); + int s2 = a.createState(); + int s3 = a.createState(); + + // Create three states (c1, c2, c3) for the cycled part of the Automaton. + int c1 = a.createState(); + int c2 = a.createState(); + int c3 = a.createState(); + + a.addTransition(s1, s2, 'a'); + a.addTransition(s2, s3, 'b'); + a.addTransition(s3, c1, 'c'); + a.addTransition(c1, c2, 'd'); + a.addTransition(c2, c3, 'e'); + a.addTransition(c3, c1, 'f'); + a.finishState(); + IllegalArgumentException exc = + expectThrows(IllegalArgumentException.class, () -> topoSortStates(a)); + assertTrue(exc.getMessage().contains("Input automaton has a cycle")); + } + /** Test optimization to concatenate() with empty String to an NFA */ public void testEmptySingletonNFAConcatenate() { Automaton singleton = Automata.makeString(""); From f5267fec5f4c9e005f2819b822a26708faf31af5 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Fri, 12 May 2023 01:14:00 +0800 Subject: [PATCH 3/8] don't merge import --- .../org/apache/lucene/util/automaton/Operations.java | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index f05fb52e9c8f..76cef04d8ff8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -29,7 +29,17 @@ package org.apache.lucene.util.automaton; -import java.util.*; +import java.util.ArrayDeque; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.Stack; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; From 88641e592459a2c85de549c7d743ecd4bd50927f Mon Sep 17 00:00:00 2001 From: tang donghai Date: Fri, 12 May 2023 01:34:19 +0800 Subject: [PATCH 4/8] fix comment error --- .../src/java/org/apache/lucene/util/automaton/Operations.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 76cef04d8ff8..cfa4fafc27a8 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -1314,8 +1314,8 @@ public static int[] topoSortStates(Automaton a) { * * @param a The automaton whose states are to be topologically sorted. * @param visited A BitSet representing the visited states during the sorting process. - * @param states An int array which stores the reversed topological order of the states. - * @return the topologically sorted array of state ids + * @param states An int array which stores the states. + * @return the reversed topologically sorted array of state ids * @throws IllegalArgumentException if the input automaton has a cycle. */ private static int topoSortStates(Automaton a, BitSet visited, int[] states) { From 04ceda8f2937837cf320c1ef801a657164009e34 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Fri, 12 May 2023 22:12:39 +0800 Subject: [PATCH 5/8] use deque instead of stack and add unit test for toposort --- .../lucene/util/automaton/Operations.java | 13 ++- .../lucene/util/automaton/TestOperations.java | 90 +++++++++++++++---- 2 files changed, 78 insertions(+), 25 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index cfa4fafc27a8..fdca49315447 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -34,12 +34,12 @@ import java.util.Arrays; import java.util.BitSet; import java.util.Collection; +import java.util.Deque; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.Stack; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; @@ -1289,8 +1289,7 @@ public static int[] topoSortStates(Automaton a) { } int numStates = a.getNumStates(); int[] states = new int[numStates]; - final BitSet visited = new BitSet(numStates); - int upto = topoSortStates(a, visited, states); + int upto = topoSortStates(a, states); if (upto < states.length) { // There were dead states @@ -1313,19 +1312,19 @@ public static int[] topoSortStates(Automaton a) { * Performs a topological sort on the states of the given Automaton. * * @param a The automaton whose states are to be topologically sorted. - * @param visited A BitSet representing the visited states during the sorting process. * @param states An int array which stores the states. * @return the reversed topologically sorted array of state ids * @throws IllegalArgumentException if the input automaton has a cycle. */ - private static int topoSortStates(Automaton a, BitSet visited, int[] states) { + private static int topoSortStates(Automaton a, int[] states) { BitSet onStack = new BitSet(a.getNumStates()); - Stack stack = new Stack<>(); + BitSet visited = new BitSet(a.getNumStates()); + Deque stack = new ArrayDeque<>(); stack.push(0); // Assuming that the initial state is 0. int upto = 0; Transition t = new Transition(); - while (!stack.empty()) { + while (!stack.isEmpty()) { int state = stack.peek(); // Just peek, don't remove the state yet int count = a.initTransition(state, t); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java index 4417869aeb6e..ec38eafe0ced 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestOperations.java @@ -77,29 +77,35 @@ public void testEmptyLanguageConcatenate() { * IllegalArgumentException. */ public void testCycledAutomaton() { - Automaton a = new Automaton(); - // Create three states (s1, s2, s3) for the non-cycled part of the Automaton. - int s1 = a.createState(); - int s2 = a.createState(); - int s3 = a.createState(); - - // Create three states (c1, c2, c3) for the cycled part of the Automaton. - int c1 = a.createState(); - int c2 = a.createState(); - int c3 = a.createState(); - - a.addTransition(s1, s2, 'a'); - a.addTransition(s2, s3, 'b'); - a.addTransition(s3, c1, 'c'); - a.addTransition(c1, c2, 'd'); - a.addTransition(c2, c3, 'e'); - a.addTransition(c3, c1, 'f'); - a.finishState(); + Automaton a = generateRandomAutomaton(true); IllegalArgumentException exc = expectThrows(IllegalArgumentException.class, () -> topoSortStates(a)); assertTrue(exc.getMessage().contains("Input automaton has a cycle")); } + public void testTopoSortStates() { + Automaton a = generateRandomAutomaton(false); + + int[] sorted = topoSortStates(a); + int[] stateMap = new int[a.getNumStates()]; + Arrays.fill(stateMap, -1); + int order = 0; + for (int state : sorted) { + assertEquals(-1, stateMap[state]); + stateMap[state] = (order++); + } + + Transition transition = new Transition(); + for (int state : sorted) { + int count = a.initTransition(state, transition); + for (int i = 0; i < count; i++) { + a.getNextTransition(transition); + // ensure dest's order is higher than current state + assertTrue(stateMap[transition.dest] > stateMap[state]); + } + } + } + /** Test optimization to concatenate() with empty String to an NFA */ public void testEmptySingletonNFAConcatenate() { Automaton singleton = Automata.makeString(""); @@ -200,4 +206,52 @@ private static Set getFiniteStrings(FiniteStringsIterator iterator) { return result; } + + /** + * This method creates a random Automaton by generating states at multiple levels. At each level, + * a random number of states are created, and transitions are added between the states of the + * current and the previous level randomly, If the 'hasCycle' parameter is true, a transition is + * added from the first state of the last level back to the initial state to create a cycle in the + * Automaton.. + * + * @param hasCycle if true, the generated Automaton will have a cycle; if false, it won't have a + * cycle. + * @return a randomly generated Automaton instance. + */ + private Automaton generateRandomAutomaton(boolean hasCycle) { + Automaton a = new Automaton(); + List lastLevelStates = new ArrayList<>(); + int initialState = a.createState(); + int maxLevel = random().nextInt(4, 10); + lastLevelStates.add(initialState); + + for (int level = 1; level < maxLevel; level++) { + int numStates = random().nextInt(3, 10); + List nextLevelStates = new ArrayList<>(); + + for (int i = 0; i < numStates; i++) { + int nextState = a.createState(); + nextLevelStates.add(nextState); + } + + for (int lastState : lastLevelStates) { + for (int nextState : nextLevelStates) { + // if hasCycle is enabled, we will always add a transition, so we could make sure the + // generated Automaton has a cycle. + if (hasCycle || random().nextInt(7) >= 1) { + a.addTransition(lastState, nextState, random().nextInt(10)); + } + } + } + lastLevelStates = nextLevelStates; + } + + if (hasCycle) { + int lastState = lastLevelStates.get(0); + a.addTransition(lastState, initialState, random().nextInt(10)); + } + + a.finishState(); + return a; + } } From 745e6417fb6e08a729edbd120c3ec813095a3c48 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Sat, 13 May 2023 20:37:32 +0800 Subject: [PATCH 6/8] update CHANGES.txt --- lucene/CHANGES.txt | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a7016627cd78..dedd3039ae59 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -106,6 +106,36 @@ Other * GITHUB#12239: Hunspell: reduced suggestion set dependency on the hash table order (Peter Gromov) +======================== Lucene 9.7.0 ======================= + +API Changes +--------------------- +(No changes) + +New Features +--------------------- +(No changes) + +Improvements +--------------------- + +GITHUB#12245: Add support for Score Mode to `ToParentBlockJoinQuery` explain. (Marcus Eagan via Mikhail Khludnev) + +Optimizations +--------------------- + +* GITHUB#12270 Don't generate stacktrace in CollectionTerminatedException. (Armin Braun) + +* GITHUB#12286 Toposort use iterator to avoid stackoverflow. (Tang Donghai) + +Bug Fixes +--------------------- +(No changes) + +Other +--------------------- +(No changes) + ======================== Lucene 9.6.0 ======================= API Changes From 92977103d910ab19ac3b0897eaba2e2296e3ef31 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Sat, 13 May 2023 21:48:11 +0800 Subject: [PATCH 7/8] use var to avoid import Deque --- .../src/java/org/apache/lucene/util/automaton/Operations.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index fdca49315447..037abffa7244 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -34,7 +34,6 @@ import java.util.Arrays; import java.util.BitSet; import java.util.Collection; -import java.util.Deque; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -1319,7 +1318,7 @@ public static int[] topoSortStates(Automaton a) { private static int topoSortStates(Automaton a, int[] states) { BitSet onStack = new BitSet(a.getNumStates()); BitSet visited = new BitSet(a.getNumStates()); - Deque stack = new ArrayDeque<>(); + var stack = new ArrayDeque(); stack.push(0); // Assuming that the initial state is 0. int upto = 0; Transition t = new Transition(); From 9fb22d97246a61307d6b771f72969178404075c5 Mon Sep 17 00:00:00 2001 From: tangdonghai Date: Mon, 15 May 2023 18:56:25 +0800 Subject: [PATCH 8/8] fix comment error --- .../src/java/org/apache/lucene/util/automaton/Operations.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java index 037abffa7244..1384e1e992b4 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/Operations.java @@ -1312,7 +1312,7 @@ public static int[] topoSortStates(Automaton a) { * * @param a The automaton whose states are to be topologically sorted. * @param states An int array which stores the states. - * @return the reversed topologically sorted array of state ids + * @return the number of states in the final sorted list. * @throws IllegalArgumentException if the input automaton has a cycle. */ private static int topoSortStates(Automaton a, int[] states) {