From f63cf2c9373d9c8f2b93ff87ee187560810c0686 Mon Sep 17 00:00:00 2001 From: David Beaumont Date: Wed, 15 Feb 2023 18:44:20 +0100 Subject: [PATCH] Project import generated by Copybara. (#2890) PiperOrigin-RevId: 509849832 --- .../phonenumbers/metadata/LengthsParser.java | 71 +++ .../i18n/phonenumbers/metadata/Types.java | 2 +- .../DigitSequenceMatcher.java | 317 +++++++++ .../metadata/finitestatematcher/OpCode.java | 262 ++++++++ .../compiler/MatcherBytes.java | 247 +++++++ .../compiler/MatcherCompiler.java | 299 +++++++++ .../compiler/Operation.java | 600 ++++++++++++++++++ .../compiler/Statistics.java | 44 ++ .../metadata/model/ExamplesTableSchema.java | 4 +- .../metadata/model/FileBasedCsvLoader.java | 2 +- .../metadata/model/FormatsTableSchema.java | 2 +- .../metadata/model/RangesTableSchema.java | 163 +++-- .../phonenumbers/metadata/regex/AnyPath.java | 181 ++++++ .../phonenumbers/metadata/regex/Edge.java | 351 ++++++++++ .../metadata/regex/EdgeWriter.java | 343 ++++++++++ .../metadata/regex/NfaFlattener.java | 195 ++++++ .../phonenumbers/metadata/regex/Node.java | 51 ++ .../metadata/regex/RangeTreeConverter.java | 123 ++++ .../metadata/regex/RegexFormatter.java | 118 ++++ .../metadata/regex/RegexGenerator.java | 171 +++++ .../metadata/regex/SubgroupOptimizer.java | 190 ++++++ .../metadata/regex/TrailingPathOptimizer.java | 206 ++++++ .../metadata/table/CsvParser.java | 30 +- .../phonenumbers/metadata/table/CsvTable.java | 2 +- .../metadata/table/RangeTable.java | 2 +- .../phonenumbers/metadata/table/Schema.java | 2 +- .../metadata/LengthsParserTest.java | 76 +++ .../metadata/RangeSpecificationTest.java | 2 +- .../DigitSequenceMatcherTest.java | 210 ++++++ .../compiler/CompilerRegressionTest.java | 317 +++++++++ .../compiler/MatcherCompilerTest.java | 144 +++++ .../compiler/OperationTest.java | 60 ++ .../compiler/regression_test_data.textpb | 295 +++++++++ .../metadata/regex/AnyPathTest.java | 106 ++++ .../phonenumbers/metadata/regex/EdgeTest.java | 224 
+++++++ .../metadata/regex/EdgeWriterTest.java | 154 +++++ .../metadata/regex/NfaBuilder.java | 98 +++ .../metadata/regex/NfaFlattenerTest.java | 229 +++++++ .../phonenumbers/metadata/regex/NodeTest.java | 68 ++ .../regex/RangeTreeConverterTest.java | 154 +++++ .../metadata/regex/RegexFormatterTest.java | 107 ++++ .../metadata/regex/RegexGeneratorTest.java | 197 ++++++ .../metadata/regex/SubgraphOptimizerTest.java | 80 +++ .../regex/TrailingPathOptimizerTest.java | 122 ++++ metadata/src/test/proto/regression_test.proto | 49 ++ 45 files changed, 6559 insertions(+), 111 deletions(-) create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/OpCode.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherBytes.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompiler.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Operation.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Statistics.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/AnyPath.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Edge.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriter.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattener.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Node.java create mode 100644 
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverter.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatter.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexGenerator.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/SubgroupOptimizer.java create mode 100644 metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizer.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/LengthsParserTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcherTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/CompilerRegressionTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompilerTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/OperationTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/regression_test_data.textpb create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/AnyPathTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriterTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaBuilder.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattenerTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NodeTest.java create mode 100644 
metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverterTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatterTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexGeneratorTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/SubgraphOptimizerTest.java create mode 100644 metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizerTest.java create mode 100644 metadata/src/test/proto/regression_test.proto diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java new file mode 100644 index 0000000000..d68cf9bac3 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2022 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.base.CharMatcher.whitespace; +import static com.google.common.base.Preconditions.checkArgument; +import static java.lang.Integer.parseUnsignedInt; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Splitter; +import com.google.common.collect.ContiguousSet; +import com.google.common.collect.ImmutableSortedSet; +import java.util.List; +import java.util.NavigableSet; +import java.util.TreeSet; + +/** Parses strings of form "4,7-9,11" which are used as length specifiers across LPN metadata */ +public final class LengthsParser { + + private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults(whitespace()); + private static final Splitter RANGE_SPLITTER = + Splitter.on('-').trimResults(whitespace()).limit(2); + private static final CharMatcher ALLOWED_CHARACTERS = + CharMatcher.inRange('0', '9').or(CharMatcher.anyOf("-,")).or(whitespace()); + + /** Returns the set of integers specified by this string. 
*/ + public static ImmutableSortedSet parseLengths(String s) { + checkArgument( + ALLOWED_CHARACTERS.matchesAllOf(s), + "Length specifier contains forbidden characters: %s", + s); + NavigableSet lengths = new TreeSet<>(); + for (String lengthOrRange : COMMA_SPLITTER.split(s)) { + if (lengthOrRange.contains("-")) { + List lohi = RANGE_SPLITTER.splitToList(lengthOrRange); + int lo = parseUnsignedInt(lohi.get(0)); + int hi = parseUnsignedInt(lohi.get(1)); + checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi); + checkArgument( + lengths.isEmpty() || lo > lengths.last(), + "Numbers in length specifier are out of order: %s", + s); + lengths.addAll(ContiguousSet.closed(lo, hi)); + } else { + int length = parseUnsignedInt(lengthOrRange); + checkArgument( + lengths.isEmpty() || length > lengths.last(), + "Numbers in length specifier are out of order: %s", + s); + lengths.add(length); + } + } + return ImmutableSortedSet.copyOf(lengths); + } + + private LengthsParser() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java index 392e62db4c..24c81db8e2 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/Types.java @@ -71,7 +71,7 @@ public final class Types { .put(XML_VOIP, VOIP) .put(XML_UAN, UAN) .put(XML_VOICEMAIL, VOICEMAIL) - .build(); + .buildOrThrow(); /** Returns the set of valid XML type names. 
*/ public static ImmutableSet getXmlNames() { diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java new file mode 100644 index 0000000000..589ffab924 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java @@ -0,0 +1,317 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher; + +import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode.State; + +/** + * Matches phone number regular expressions based on compact compiled data generated by + * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler + * MatcherCompiler}. Typically the phone number regular expression will be compiled at build time + * and the resulting matcher data will be packaged into the binary which needs it, or downloaded at + * run time. + *

+ * This class is designed to be lightweight and fast, and should be simple to implement in many + * different languages (C++, Python, JS, etc.). + * + * TODO: Consider UnsignedBytes.toInt(x) to avoid lots of (x & 0xFF). + */ +public abstract class DigitSequenceMatcher { + + /** Possible result types returned by a match operation. */ + public enum Result { + /** The match operation was a success and the input was matched. */ + MATCHED, + /** The match operation failed because unexpected input was encountered. */ + INVALID, + /** + * The match operation failed because the input terminated too soon (ie, the input was a + * valid prefix for the matcher). + */ + TOO_SHORT, + /** + * The match operation failed due to the existence of additional input after matching had + * completed (ie, the input would have matched if it were shorter). + */ + TOO_LONG; + } + + /** An iterator of {@code int}, used to supply the matcher with a sequence of input digits. */ + public interface DigitSequence { + /** Returns true if there are more digits available. */ + boolean hasNext(); + + /** + * Return the next digit value (from 0 to 9 inclusive, not a char value). The matcher does not + * test for invalid digits, so returning values outside this range will have undefined results, + * including false positive results. + */ + int next(); + } + + /** Internal abstraction to allow matching over either byte arrays or strings. */ + interface DataView { + /** Return the unsigned byte value at the given offset from the current position. */ + int peekByte(int offset); + + /** Return the unsigned byte value at the current position and move ahead 1 byte. */ + int readByte(); + + /** Return the unsigned short value at the current position and move ahead 2 bytes. */ + int readShort(); + + /** Return the unsigned int value at the current position and move ahead 4 bytes. */ + int readInt(); + + /** Adjust the current position by the given (non-negative) offset. 
*/ + State branch(int offset); + + /** + * Adjust the current position by the unsigned byte offset value read from the current + * position plus the given index. This is used to implement maps and branching ranges. + */ + State jumpTable(int index); + } + + /** + * Creates a new matcher which reads instructions directly from the given byte array. Typically + * it is expected that this method will consume byte arrays packaged into a binary at build time + * (the MatcherCompiler is not suitable for direct parsing of regular expressions at run time). + *

+ * See {@code MatcherCompiler.compile(...)}. + */ + public static DigitSequenceMatcher create(byte[] data) { + if (data.length == 0) { + throw new IllegalArgumentException("matcher data cannot be empty"); + } + return new ByteArrayMatcher(data); + } + + /** + * Creates a new matcher which reads instructions from the given string. Typically it is expected + * that this method will be used when matcher data is packaged as literal Java string constants + * in (auto-generated) source files. + *

+ * See {@code MatcherCompiler.compileToUnquotedJavaSourceString(...)}. + */ + public static DigitSequenceMatcher create(String data) { + if (data.isEmpty()) { + throw new IllegalArgumentException("matcher data cannot be empty"); + } + return new StringMatcher(data); + } + + abstract DataView newDataView(); + + abstract int size(); + + /** Matches the input against this matcher, returning a result code. */ + public Result match(DigitSequence in) { + State state = runMatcher(in); + switch (state) { + case TERMINAL: + return !in.hasNext() ? Result.MATCHED : Result.TOO_LONG; + case TRUNCATED: + return Result.TOO_SHORT; + case INVALID: + return Result.INVALID; + default: + throw new AssertionError("unexpected state: " + state); + } + } + + private State runMatcher(DigitSequence in) { + DataView data = newDataView(); + State state; + do { + state = OpCode.decode(data.peekByte(0)).execute(data, in); + } while (state == State.CONTINUE); + return state; + } + + @Override + public String toString() { + int size = size(); + StringBuilder out = new StringBuilder(size + " :: [ "); + DataView data = newDataView(); + while (size-- > 0) { + out.append(Integer.toHexString(data.readByte())).append(", "); + } + out.setLength(out.length() - 2); + out.append(" ]"); + return out.toString(); + } + + /** A matcher for reading instructions from a byte array. */ + private static final class ByteArrayMatcher extends DigitSequenceMatcher { + + private class ByteArrayData implements DataView { + int position = 0; + + @Override public int peekByte(int offset) { + return bytes[position + offset] & 0xFF; + } + + @Override public int readByte() { + return bytes[position++] & 0xFF; + } + + @Override public int readShort() { + return (readByte() << 8) | readByte(); + } + + @Override public int readInt() { + return (readShort() << 16) | readShort(); + } + + @Override public State branch(int offset) { + position += offset; + return offset != 0 ? 
State.CONTINUE : State.TERMINAL; + } + + @Override public State jumpTable(int index) { + return branch(peekByte(index)); + } + } + + private final byte[] bytes; + + private ByteArrayMatcher(byte[] data) { + this.bytes = data; + } + + @Override + DataView newDataView() { + return new ByteArrayData(); + } + + @Override + int size() { + return bytes.length; + } + } + + /** A matcher for reading instructions from a String. */ + private static final class StringMatcher extends DigitSequenceMatcher { + + /* + * Note: Using unsigned shift "x >>> 1" is more likely to be free as part of a data load + * instruction than "x / 2". + */ + + private class StringData implements DataView { + int position = 0; + + @Override public int peekByte(int offset) { + offset += position; + int data = bytes.charAt(offset >>> 1); + // char := hi [ even-byte | odd-byte ] lo + return (offset & 1) != 0 ? data & 0xFF : data >>> 8; + } + + @Override public int readByte() { + int data = bytes.charAt(position >>> 1); + // char := hi [ even-byte | odd-byte ] lo + data = (position & 1) != 0 ? data & 0xFF : data >>> 8; + position += 1; + return data; + } + + @Override public int readShort() { + int data = bytes.charAt(position >>> 1); + // Adding 2 early does not affect odd/even (but does reference next char). + position += 2; + if ((position & 1) != 0) { + data = ((data & 0xFF) << 8) | (bytes.charAt(position >>> 1) >>> 8); + } + return data; + } + + @Override public int readInt() { + return (readShort() << 16) | readShort(); + } + + @Override public State branch(int offset) { + position += offset; + return offset != 0 ? 
State.CONTINUE : State.TERMINAL; + } + + @Override public State jumpTable(int index) { + return branch(peekByte(index)); + } + } + + private final String bytes; + + private StringMatcher(String bytes) { + this.bytes = bytes; + } + + @Override + DataView newDataView() { + return new StringData(); + } + + @Override + int size() { + int size = 2 * bytes.length(); + if ((bytes.charAt(bytes.length() - 1) & 0xFF) == 0xFF) { + size -= 1; + } + return size; + } + } + + /** An iterator of {@code int} that yields a sequence of input digits from a string. */ + private static final class StringDigits implements DigitSequence { + private final CharSequence number; + private int n = 0; + + private StringDigits(CharSequence number) { + this.number = number; + } + + @Override public int next() { + if (n < 0 || n >= number.length()) { + throw new IndexOutOfBoundsException( + "index '" + n + "' out of bounds for input: " + number); + } + char c = number.charAt(n); + if (c < '0' || c > '9') { + throw new IllegalArgumentException( + "non-digit character '" + c + "' [" + ((int) c) + "] at index " + n + " in: " + number); + } + n++; + return c - '0'; + } + + @Override public boolean hasNext() { + return n < number.length(); + } + } + + /** + * Returns an instance of DigitSequence based on the input string. The input string may only + * contain digits. + */ + public static DigitSequence digitsFromString(CharSequence number) { + return new StringDigits(number); + } + + /** A matcher has no internal state and is just a factory for data specific implementations. 
*/ + private DigitSequenceMatcher() { } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/OpCode.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/OpCode.java new file mode 100644 index 0000000000..6d59a0fb9d --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/OpCode.java @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher; + +import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DataView; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence; + +/** + * Implementation of instructions for the phone number matcher state machine. + *

+ *

Jump Tables

+ * + * Several instructions use a "jump table" concept which is simply a contiguous region of bytes + * containing offsets from which a new position is calculated. The new position is the current + * position (at the start of the jump table) plus the value of the chosen jump offset. + * + *
{@code
+ * [    ...    | JUMP_0 | JUMP_1 | ... | JUMP_N |    ...    |  DEST  |  ...
+ *  position --^            ^                               ^
+ *             `---index ---'                               |
+ *  offset     `----------------  [ position + index ] -----'
+ *
+ *  position = position + unsignedByteValueAt(position + index)
+ * }
+ * + * A jump offset of zero signifies that the state jumped to is terminal (this avoids having to jump + * to a termination byte). A jump table will always occur immediately after an associated + * instruction and the instruction's stated size includes the number of bytes in the jump table. + */ +public enum OpCode { + /** + * Jumps ahead by between 1 and 4095 bytes from the end of this opcode. This opcode does not + * consume any input. + *

+ * This is a variable length instruction, taking one byte for offsets up to 15 and (if EXT is set) + * two bytes for larger offsets up to 4095. The jump offset signifies how many bytes to skip after + * this instruction. + *

+ * As a special case, a single byte branch with a jump offset of zero (represented by a single + * zero byte) can be used to signify that the current state is terminal and the state machine + * should exit (a zero jump offset never makes sense in any instruction). + * + *

{@code
+   * [ 0 | 0 |  JUMP   ]
+   * [ 0 | 1 |  JUMP   |  EXT_JUMP   ]
+   *  <3>.<1>.<-- 4 -->.<---- 8 ---->
+   * }
+ */ + BRANCH(0) { + @Override + State execute(DataView data, DigitSequence ignored) { + int op = data.readByte(); + int offset = op & 0xF; + if ((op & (1 << 4)) != 0) { + offset = (offset << 8) + data.readByte(); + } + return data.branch(offset); + } + }, + /** + * Accepts a single input (and transition to a single state). Inputs not matching "VAL" are + * invalid from the current state. If "TRM" is set then the state being transitioned from may + * terminate. + * + *
{@code
+   * [ 1 |TRM|  VAL  ]
+   *  <3>.<1>.<- 4 ->
+   * }
+ */ + SINGLE(1) { + @Override + State execute(DataView data, DigitSequence in) { + int op = data.readByte(); + if (!in.hasNext()) { + return ((op & (1 << 4)) != 0) ? State.TERMINAL : State.TRUNCATED; + } + int n = in.next(); + return ((op & 0xF) == n) ? State.CONTINUE : State.INVALID; + } + }, + /** + * Accept any input to transition to a single state one or more times. + *

+ * If "TRM" is set then every state that is transitioned from may terminate. + * + *

{@code
+   * [ 2 |TRM| NUM-1 ]
+   *  <3>.<1>.<- 4 ->
+   * }
+ */ + ANY(2) { + @Override + State execute(DataView data, DigitSequence in) { + int op = data.readByte(); + int num = (op & 0xF) + 1; + boolean isTerminating = (op & (1 << 4)) != 0; + while (num-- > 0) { + if (!in.hasNext()) { + return isTerminating ? State.TERMINAL : State.TRUNCATED; + } + in.next(); + } + return State.CONTINUE; + } + }, + /** + * Accepts multiple inputs to transition to one or two states. The bit-set has the Nth bit set if + * we should accept digit N (bit-0 is the lowest bit of the 2 byte form of the instruction). + *

+ * This is a variable length instruction which either treats non-matched inputs as invalid + * (2 byte form) or branches to one of two states via a 2-entry jump table (4 byte form). + *

+ * If "TRM" is set then the state being transitioned from may terminate. + * + *

{@code
+   * [ 3 |TRM| 0 |---|   BIT SET  ]
+   * [ 3 |TRM| 1 |---|   BIT SET  |  JUMP_IN  | JUMP_OUT  ]
+   *  <3>.<1>.<1>.<1>.<--- 10 --->.<--- 8 --->.<--- 8 --->
+   * }
+ */ + RANGE(3) { + @Override + State execute(DataView data, DigitSequence in) { + int op = data.readShort(); + if (!in.hasNext()) { + return ((op & (1 << 12)) != 0) ? State.TERMINAL : State.TRUNCATED; + } + int n = in.next(); + if ((op & (1 << 11)) == 0) { + // 2 byte form, non-matched input is invalid. + return ((op & (1 << n)) != 0) ? State.CONTINUE : State.INVALID; + } + // 4 byte form uses jump table (use bitwise negation so a set bit becomes a 0 index). + return data.jumpTable((~op >>> n) & 1); + } + }, + /** + * Accept multiple inputs to transition to between one and ten states via jump offsets. Inputs + * not encoded in "CODED MAP" are invalid from the current state. + *

+ * Because there is no room for a termination bit in this instruction, there is an alternate + * version, {@code TMAP}, which should be used when transitioning from a terminating state. + *

+ * TODO: Figure out if we can save one bit here and merge MAP and TMAP. + * + *

{@code
+   * [ 4 |      CODED MAP       |  JUMP_1   |  ... |  JUMP_N   ]
+   *  <3>.<-------- 29 -------->.<--- 8 --->.  ... .<--- 8 --->
+   * }
+ */ + MAP(4) { + @Override + State execute(DataView data, DigitSequence in) { + return map(data, in, State.TRUNCATED); + } + }, + /** + * Like {@code MAP} but transitions from a terminating state. + */ + TMAP(5) { + @Override + State execute(DataView data, DigitSequence in) { + return map(data, in, State.TERMINAL); + } + }; + + /** The types of states that the state-machine can be in. */ + public enum State { + CONTINUE, TERMINAL, INVALID, TRUNCATED; + } + + private static final OpCode[] VALUES = values(); + + /** + * Encode maps as 29 bits where each digit takes a different number of bits to encode its offset. + * Specifically: + *
    + *
  • The first entry (matching 0) has only two possible values (it is either not present or maps + * to the first entry in the jump table), so takes only 1 bit. + *
  • The second entry (matching 1) has three possible values (not present or maps to either the + * first or second entry in the jump table), so it takes 2 bits. + *
  • In general the entry matching digit N has (N+1) possible states and takes log2(N+1) bits. + *
+ */ + private static final long MAP_SHIFT_BITS = 0L << 0 | // 1 bit (1x, mask=1) + 1L << 5 | 3L << 10 | // 2 bits (2x, mask=3) + 5L << 15 | 8L << 20 | 11L << 25 | 14L << 30 | // 3 bits (4x, mask=7) + 17L << 35 | 21L << 40 | 25L << 45; // 4 bits (3x, mask=F) + + /** + * A table of values with which to mask the coded jump table map, after shifting it. Each nibble + * is a mask of up to 4 bits to extract the encoded index from a map instruction after it has + * been shifted. + */ + private static final long MAP_MASK_BITS = 0xFFF7777331L; + + /** + * Returns the number of bits we must shift the coded jump table map for a digit with value + * {@code n} such that the jump index is in the lowest bits. + */ + public static int getMapShift(int n) { + return (int) (MAP_SHIFT_BITS >>> (5 * n)) & 0x1F; + } + + /** + * Returns a mask we must apply to the shifted jump table map to extract only the jump index from + * the lowest bits. + */ + public static int getMapMask(int n) { + return (int) (MAP_MASK_BITS >>> (4 * n)) & 0xF; + } + + /** + * Executes a map instruction by decoding the map data and selecting a jump offset to apply. + */ + private static State map(DataView data, DigitSequence in, State noInputState) { + int op = data.readInt(); + if (!in.hasNext()) { + return noInputState; + } + int n = in.next(); + // Coded indices are 1-to-10 (0 is the "invalid" state). + int index = ((op >>> getMapShift(n)) & getMapMask(n)); + if (index == 0) { + return State.INVALID; + } + // Jump offsets are zero based. + return data.jumpTable(index - 1); + } + + /** + * Returns the opcode associated with the given unsigned byte value (the first byte of any + * instruction). + */ + static OpCode decode(int unsignedByte) { + return VALUES[unsignedByte >>> 5]; + } + + private OpCode(int code) { + // Assertion checks during enum creation. Opcodes must be 3 bits and match the ordinal of the + // enum (this prevents issues if reordering enums occurs). 
+ if ((code & ~0x7) != 0 || code != ordinal()) { + throw new AssertionError("bad opcode value: " + code); + } + } + + abstract State execute(DataView data, DigitSequence in); +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherBytes.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherBytes.java new file mode 100644 index 0000000000..cd35b394fe --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherBytes.java @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; + +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.io.ByteArrayDataOutput; +import com.google.common.io.ByteStreams; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler.Sequence; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * Renders the final bytecode representation for the matcher by connecting sequences of operations + * together and fixing-up offsets and branch instructions. This is essentially the higher-level + * aspect of matcher bytecode compilation. + *

+ * Unlike {@link MatcherCompiler} in which a lot of the data is immutable (because sequences can + * be defined in isolation), there's a lot of mutable state in this class due to the need to build + * and manage offsets between the sequences, which relies on the order in which other sequences + * have been rendered. + */ +class MatcherBytes { + /** + * A partial order on byte sequences based on their size. This is not "equivalent to equals" and + * must not be used to construct an ordered set. + */ + private static final Comparator DECREASING_BY_SIZE = + new Comparator() { + @Override public int compare(SequenceBytes lhs, SequenceBytes rhs) { + return Integer.compare(rhs.size(), lhs.size()); + } + }; + + /** + * Sequences we have not considered for rendering yet. + */ + private final List remainingSequences; + /** + * Candidate sequences whose dependent sequences have all been rendered, and which may themselves + * now be rendered. + */ + private final Set canditiateSequences = new LinkedHashSet<>(); + /** + * Sequences which have been rendered (used to determine when other sequences become renderable). + */ + private final Set compiledSequences = new HashSet<>(); + /** + * A map from which are final nodes of a sequence to the sequence they belong to. The key set of + * this map is a subset of all nodes. + */ + private final Map sequenceMap = new HashMap<>(); + /** + * A list of compiled byte sequences in reverse order (ie, the sequence with the terminal node + * in it is first in this list and the sequence with the initial node is last). Compilation + * occurs in reverse order to allow offsets between sequences to be calculated as we go. + */ + private final List reverseOrder = new ArrayList<>(); + /** Statistics instance for collecting information about the compilation. */ + private final Statistics stats; + + MatcherBytes(Iterable allSequences, Statistics stats) { + // Our set of remaining sequences just starts out as all the sequences. 
+ // Sequences are processed in reverse order, so reverse the sorted sequences before beginning. + remainingSequences = Lists.reverse(Lists.newArrayList(allSequences)); + this.stats = Preconditions.checkNotNull(stats); + } + + /** + * Compiles all sequences into a single byte buffer suitable for use by a + * {@code DigitSequenceMatcher}. + */ + byte[] compile() { + int totalSequenceCount = remainingSequences.size(); + // Sequences with no dependent sequences are compiled first. + compileFinalSequences(); + // Determine new candidate sequences. + while (compiledSequences.size() < totalSequenceCount) { + // We won't always add a new candidate sequence each time around the loop, but the set + // should never be emptied until the final sequence is processed. + for (Iterator it = remainingSequences.iterator(); it.hasNext();) { + Sequence s = it.next(); + if (compiledSequences.containsAll(s.unorderedOutSequences())) { + canditiateSequences.add(s); + it.remove(); + } + } + // Compile the next candidate sequence. + Sequence toCompile = Iterables.get(canditiateSequences, 0); + reverseOrder.add(compile(toCompile)); + compiledSequences.add(toCompile); + canditiateSequences.remove(toCompile); + } + // We should have always exhausted the candidate sequences when we've finished rendering. + Preconditions.checkState(remainingSequences.isEmpty()); + Preconditions.checkState(canditiateSequences.isEmpty()); + return concatSequenceBytesInForwardOrder(); + } + + /** + * Compiles any sequences which have no dependencies and orders them by size to heuristically + * reduce the size of branch offsets needed to reach them.
+ */ + private void compileFinalSequences() { + for (Iterator it = remainingSequences.iterator(); it.hasNext();) { + Sequence s = it.next(); + if (s.isFinal()) { + reverseOrder.add(compile(s)); + compiledSequences.add(s); + it.remove(); + } + } + // They are ordered by size (shortest first) because this will tend to reduce the number of + // 2-byte branch instructions needed to jump to them. + Collections.sort(reverseOrder, DECREASING_BY_SIZE); + } + + /** Compiles a sequence for which all dependent sequences have already been compiled. */ + private SequenceBytes compile(Sequence sequence) { + // Note: Even non branching sequences will have an out node here. + Map offsetMap = new HashMap<>(); + for (DfaNode out : sequence.getOutStates()) { + SequenceBytes targetSequence = sequenceMap.get(out); + int offsetToStartOfSequence = 0; + for (int n = reverseOrder.size() - 1; n >= 0 && reverseOrder.get(n) != targetSequence; n--) { + offsetToStartOfSequence += reverseOrder.get(n).size(); + } + if (offsetToStartOfSequence > 0 && targetSequence.isTerminator()) { + // If we would explicitly jump to a terminator sequence, we can just exit + // unconditionally at this point. + offsetToStartOfSequence = Operation.TERMINATION_OFFSET; + } + offsetMap.put(out, offsetToStartOfSequence); + } + SequenceBytes compiled = new SequenceBytes(sequence, offsetMap, stats); + sequenceMap.put(sequence.getInitialState(), compiled); + return compiled; + } + + /** Creates the final, single buffer of bytecode instructions for the matcher. 
*/ + private byte[] concatSequenceBytesInForwardOrder() { + try { + ByteArrayOutputStream outBuffer = new ByteArrayOutputStream(); + for (int n = reverseOrder.size() - 1; n >= 0; n--) { + outBuffer.write(reverseOrder.get(n).getBytes()); + } + return outBuffer.toByteArray(); + } catch (IOException e) { + throw new AssertionError("ByteArrayOutputStream cannot throw IOException"); + } + } + + /** Renders a sequence (along with a map of branch offsets) to its bytecode form. */ + private static byte[] renderSequence( + Sequence sequence, Map offsetMap, Statistics stats) { + // Because our operations come from a sequence, we can assert that only the last operation + // could possibly be branching. + List ops = sequence.createOps(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(baos); + // Write all but the last operation (there are no branches to worry about). + for (int n = 0; n < ops.size() - 1; n++) { + ops.get(n).writeTo(outBytes, null, stats); + } + Operation lastOp = Iterables.getLast(ops); + if (lastOp.isTerminating()) { + stats.record(Statistics.Type.TERMINATING); + } + if (lastOp.isBranching()) { + // A branching operation uses the offset map directly to fill in its jump table information. + lastOp.writeTo(outBytes, offsetMap, stats); + } else { + // A non-branching operation does not use offsets, but we may need to add an explicit branch + // instruction after it. + lastOp.writeTo(outBytes, null, stats); + if (!offsetMap.isEmpty()) { + // When adding a branch instruction, there should only be a single offset to use. + int offset = Iterables.getOnlyElement(offsetMap.values()); + if (offset >= 0) { + // The offset could still be zero, but this is handled correctly by writeBranch(). + Operation.writeBranch(outBytes, offset, stats); + } else { + // This is a terminal instruction and the matcher should exit. 
+ Preconditions.checkArgument(offset == Operation.TERMINATION_OFFSET); + Operation.writeTerminator(outBytes, stats); + } + } + } + return baos.toByteArray(); + } + + /** + * A single compiled sequence of operations. This is just a holder for a {@link Sequence} and the + * compiled bytes it produces. + */ + static class SequenceBytes { + private final Sequence sequence; + private final byte[] bytes; + + SequenceBytes(Sequence sequence, Map offsetMap, Statistics stats) { + this.sequence = sequence; + this.bytes = renderSequence(sequence, offsetMap, stats); + } + + Sequence getSequence() { + return sequence; + } + + boolean isTerminator() { + return sequence.isFinal() && sequence.size() == 1; + } + + int size() { + return bytes.length; + } + + byte[] getBytes() { + return bytes; + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompiler.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompiler.java new file mode 100644 index 0000000000..621226301f --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompiler.java @@ -0,0 +1,299 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.collect.ImmutableMap.toImmutableMap; +import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.lang.Integer.numberOfTrailingZeros; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Iterables; +import com.google.common.graph.MutableValueGraph; +import com.google.common.graph.ValueGraph; +import com.google.common.graph.ValueGraphBuilder; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Set; +import java.util.function.Function; + +/** + * Compiles non-capturing phone number regular expressions into sequences of bytes suitable for + * creating {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher + * DigitSequenceMatcher} instances. + */ +public final class MatcherCompiler { + /** + * Compiles the given {@code RangeTree} into a sequence of bytes suitable for creating a + * {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher + * DigitSequenceMatcher}. + */ + public static byte[] compile(RangeTree dfa) { + return compile(dfa, Statistics.NO_OP); + } + + /** + * As {@link #compile(RangeTree)} but additionally accepts a {@link Statistics} instance + * to record metrics about the compilation. 
+ */ + public static byte[] compile(RangeTree dfa, Statistics stats) { + return new MatcherCompiler(dfa).compile(stats); + } + + /** The DFA from which the matcher data is to be compiled. */ + private final ValueGraph dfa; + /** The unique initial node of the DFA. */ + private final DfaNode init; + /** + * A map from nodes which are at the beginning of a sequence to that sequence. Not all nodes + * will be present in the key set of this map. + */ + private final ImmutableMap seqStart; + + /** + * Builds a graph directly from the DFA in a RangeTree. + * + *

Rather than deal with the DFA tree directly (which is deliberately opaque as a data + * structure) we serialize it into a more malleable ValueGraph. This allows simpler graph + * traversal while maintaining a simple-as-possible node/edge structure. It's okay to reuse the + * RangeTree types {@code DfaNode} and {@code DfaEdge} here because they have the expected + * semantics (e.g. conforming to equals/hashcode etc...) but care must be taken not to keep the + * instances around for a long time, since this will keep larger parts of the original DFA alive + * in the garbage collector (but this is fine since only bytes are returned from this class). + */ + private static ValueGraph buildGraph(RangeTree dfa) { + Preconditions.checkArgument(!dfa.isEmpty()); + MutableValueGraph graph = + ValueGraphBuilder.directed().allowsSelfLoops(false).build(); + graph.addNode(dfa.getInitial()); + DfaVisitor visitor = new DfaVisitor() { + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + boolean isFirstVisit = graph.addNode(target); + graph.putEdgeValue(source, target, edge); + if (isFirstVisit) { + target.accept(this); + } + } + }; + dfa.accept(visitor); + return graph; + } + + /** + * Creates a {@code MatcherCompiler} from the given automaton by generating all the + * {@code Sequence}s of operations necessary to represent it. + */ + MatcherCompiler(RangeTree ranges) { + this.dfa = buildGraph(ranges); + this.init = ranges.getInitial(); + LinkedHashMap start = new LinkedHashMap<>(); + buildSequencesFrom(init, start); + this.seqStart = ImmutableMap.copyOf(start); + } + + /** + * Returns the output targets of the given node sorted according to the lowest "accepting" digit + * on the corresponding edge. This ordering is necessary for stability, but also correctness when + * building mapping operations. Apart from special cases (e.g. only one output) this is the only + * method which should be used to obtain output nodes.
+ */ + private ImmutableSet sortedOutputs(DfaNode source) { + Comparator ordering = Comparator.comparing( + target -> numberOfTrailingZeros(dfa.edgeValue(source, target).get().getDigitMask())); + return dfa.successors(source).stream().sorted(ordering).collect(toImmutableSet()); + } + + /** Returns the single output target of the given node (or throws an exception). */ + private DfaNode singleOutput(DfaNode source) { + return Iterables.getOnlyElement(dfa.successors(source)); + } + + /** + * Builds the output map from a given node in the DFA in the correct order. Note that because + * ImmutableSetMultimap.Builder orders keys based on the first time they are added, and we add + * keys (nodes) in the order of the input by which they can be reached, the keys of the returned + * map are ordered by the lowest digit in their set of values (inputs). This is necessary for + * correct behaviour in the "Mapping" operation. + */ + private ImmutableMap getOutMap(DfaNode source) { + Function getMask = + target -> dfa.edgeValue(source, target).get().getDigitMask(); + return sortedOutputs(source).stream().collect(toImmutableMap(Function.identity(), getMask)); + } + + /** + * Recursively builds sequences by traversing the DFA and grouping successive sub-sequences of + * nodes which neither branch, nor are branched to. Each such sub-sequence is represented by a + * {@code Sequence} instance (a list of non-branching operations, optionally terminated with a + * branching operation). 
+ */ + private void buildSequencesFrom(DfaNode start, LinkedHashMap map) { + if (map.containsKey(start)) { + return; + } + DfaNode current = start; + ImmutableList.Builder nodes = ImmutableList.builder(); + while (true) { + nodes.add(current); + if (dfa.outDegree(current) != 1) { + break; + } + DfaNode next = singleOutput(current); + if (dfa.inDegree(next) > 1) { + break; + } + current = next; + } + Sequence seq = new Sequence(nodes.build()); + map.put(start, seq); + // Recurse from the outputs at the end of the sequence according to their edge values. + // IMPORTANT: We must not use "current.successors()" here since we need the order of insertion + // to be well defined and ValueGraph does not make good enough promises about node ordering. + for (DfaNode out : sortedOutputs(current)) { + buildSequencesFrom(out, map); + } + } + + /** Creates and compiles a {@code MatcherBytes} instance to render the output bytes. */ + byte[] compile(Statistics stats) { + return createMatcherBytes(stats).compile(); + } + + /** Creates a mutable {@code MatcherBytes} instance which will render the output bytes. */ + MatcherBytes createMatcherBytes(Statistics stats) { + return new MatcherBytes(seqStart.values(), stats); + } + + /** + * A contiguous sub-sequence of nodes in the DFA which neither branch, nor are branched to. + *

+ * The important property of a {@code Sequence} is that branching may only occur at the end of a + * {@code Sequence} and branches may only jump to the start of another {@code Sequence}. This + * makes it easier to separate the compilation of operations (inside sequences) from the + * management of branches and offsets (between sequences). + */ + class Sequence { + private final ImmutableList nodes; + + Sequence(ImmutableList nodes) { + checkArgument(!nodes.isEmpty()); + this.nodes = nodes; + } + + private Operation getOp(DfaNode node) { + return Operation.from(node.canTerminate(), getOutMap(node)); + } + + /** + * Returns the operations representing this sequence, merging successive operations where + * possible. The final list of operations is guaranteed to have at most one branching operation + * which (if present) will always be the last element in the list. + */ + List createOps() { + List ops = new ArrayList<>(); + Operation current = getOp(nodes.get(0)); + for (int n = 1; n < nodes.size(); n++) { + Operation next = getOp(nodes.get(n)); + Operation merged = current.mergeWith(next); + if (merged != null) { + current = merged; + } else { + ops.add(current); + current = next; + } + } + ops.add(current); + return ops; + } + + DfaNode getInitialState() { + return Iterables.get(nodes, 0); + } + + DfaNode getFinalState() { + return Iterables.getLast(nodes); + } + + Set getOutStates() { + return sortedOutputs(getFinalState()); + } + + /** + * Not the same as "terminating" for an operation. A sequence is "final" if no other sequences + * follow it. Normally there is only one final sequence in a normalized DFA, even if that + * sequence contains only a single terminating node. However not all terminating nodes are + * in final sequences. + */ + boolean isFinal() { + return getOutStates().isEmpty(); + } + + /** Returns the number of nodes that this sequence represents. 
*/ + int size() { + return nodes.size(); + } + + ImmutableSet unorderedOutSequences() { + return getOutStates().stream().map(seqStart::get).collect(toImmutableSet()); + } + + @Override + public String toString() { + return toString(new StringBuilder(), 0).toString(); + } + + private StringBuilder toString(StringBuilder buf, int indent) { + List ops = createOps(); + appendIndent(buf, indent).append( + String.format("{%s} %s", nodes.get(0), Joiner.on(" >> ").join(ops))); + ImmutableList outs = Iterables.getLast(ops).getOuts(); + if (!outs.isEmpty()) { + buf.append(" {\n"); + for (DfaNode out : outs) { + seqStart.get(out).toString(buf, indent + 1); + } + appendIndent(buf, indent).append("}\n"); + } else { + buf.append('\n'); + } + return buf; + } + } + + @Override + public String toString() { + return seqStart.get(init).toString(); + } + + private static StringBuilder appendIndent(StringBuilder out, int indent) { + for (int n = 0; n < indent; n++) { + out.append(" "); + } + return out; + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Operation.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Operation.java new file mode 100644 index 0000000000..b324e675df --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Operation.java @@ -0,0 +1,600 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.collect.ImmutableSetMultimap.flatteningToImmutableSetMultimap; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static java.lang.Integer.numberOfTrailingZeros; +import static java.util.stream.Collectors.joining; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.ImmutableSetMultimap; +import com.google.common.collect.Iterables; +import com.google.common.io.ByteArrayDataOutput; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.Statistics.Type; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +/** + * A specific instance of a number matching operation derived from a DFA. Operations are created by + * analyzing a sequence in a DFA and knowing how to write the corresponding instruction(s) as bytes + * (to be processed by DigitSequenceMatcher or similar). + */ +abstract class Operation { + /** Represents the digits which can be accepted during matching operations. */ + private enum Digit { + // Order of enums must match the digit value itself (this is checked for in the constructor). + ZERO(0), ONE(1), TWO(2), THREE(3), FOUR(4), FIVE(5), SIX(6), SEVEN(7), EIGHT(8), NINE(9); + + private static final Digit[] VALUES = values(); + + // Iteration order is order of enum declaration (and thus also the value order). 
+ public static final ImmutableSet ALL = ImmutableSet.copyOf(VALUES); + + Digit(int value) { + // No need to store the digit value if we know it matches our ordinal value. + Preconditions.checkArgument(value == ordinal()); + } + + /** Returns the digit corresponding to the integral value in the range {@code 0...9}. */ + public static Digit of(int n) { + return VALUES[n]; + } + + /** + * Returns the set of digits corresponding to a bit-mask in which bits 0 to 9 represent the + * corresponding digits. + */ + public static ImmutableSet fromMask(int mask) { + Preconditions.checkArgument(mask >= 1 && mask <= ALL_DIGITS_MASK); + if (mask == ALL_DIGITS_MASK) { + return ALL; + } + ImmutableSet.Builder digits = ImmutableSet.builder(); + for (int n = 0; n <= 9; n++) { + if ((mask & (1 << n)) != 0) { + digits.add(VALUES[n]); + } + } + return digits.build(); + } + + /** Returns the integer value of this digit instance. */ + public int value() { + return ordinal(); + } + } + + /** + * An invalid jump offset indicating that instead of jumping to a new instruction, the state + * machine can just terminate (used to avoid jumping directly to the termination instruction). + */ + static final int TERMINATION_OFFSET = -1; + + /** The number of bytes required by a "long" branch instruction. */ + private static final int LONG_BRANCH_SIZE = 2; + + private final boolean isTerminating; + private final boolean isBranching; + + private Operation(boolean isTerminating, boolean isBranching) { + this.isTerminating = isTerminating; + this.isBranching = isBranching; + } + + /** Returns whether this operation can terminate the state machine when it has been reached. */ + boolean isTerminating() { + return isTerminating; + } + + /** + * Returns whether this operation is branching. A branching operation has more than one output + * node it can reach. + */ + boolean isBranching() { + return isBranching; + } + + /** + * Returns the output nodes of this operation. 
For branching operations the order of multiple + * output nodes is defined by the operation itself (most operations are not branching and have + * only one output state anyway). + */ + abstract ImmutableList getOuts(); + + /** Returns the op-code for this operation, used when writing out instruction bytes. */ + abstract OpCode getOpCode(); + + /** Writes this operation out as a series of instruction bytes. */ + abstract void writeImpl( + ByteArrayDataOutput out, Map offsetMap, Statistics stats); + + void writeTo(ByteArrayDataOutput out, Map offsetMap, Statistics stats) { + if (isTerminating()) { + stats.record(Type.TERMINATING); + } + writeImpl(out, offsetMap, stats); + } + + /** + * Merges two adjacent operations (a poor man's compiler optimization). Useful for collapsing + * sequences of "ANY" operations. If this instruction cannot be merged with the given "next" + * instruction then it should return {@code null}, which is the default behavior. + * + * @param next the operation following this operation which we will try and merge with. + */ + Operation mergeWith(Operation next) { + return null; + } + + /** Writes a branch instruction into the output byte sequence. */ + static void writeBranch(ByteArrayDataOutput out, int jump, Statistics stats) { + Preconditions.checkArgument(jump >= 0 && jump < 0x1000, "invalid jump: " + jump); + if (jump == 0) { + stats.record(Type.CONTINUATION); + } else if (jump < 16) { + stats.record(Type.SHORT_BRANCH); + out.writeByte((OpCode.BRANCH.ordinal() << 5) | jump); + } else { + stats.record(jump < 0x100 ? Type.MEDIUM_BRANCH : Type.LONG_BRANCH); + out.writeShort((OpCode.BRANCH.ordinal() << 13) | (1 << 12) | jump); + } + } + + /** Writes a termination byte into the output byte sequence. */ + static void writeTerminator(ByteArrayDataOutput out, Statistics stats) { + stats.record(Type.FINAL); + out.writeByte(0); + } + + /** + * Creates a new operation to represent the output state transition given by {@code outMasks}.
+ * Note that where multiple nodes exist in {@code outMasks}, their ordering must be consistent + * with the {@code Mapping} operation (whereby nodes are ordered by the lowest bit set in the + * corresponding mask). + */ + static Operation from(boolean isTerminating, ImmutableMap outMasks) { + if (outMasks.isEmpty()) { + // No out nodes; then it's a "Terminal" operation. + Preconditions.checkState(isTerminating); + return new Operation.Terminal(); + } + ImmutableList outStates = outMasks.keySet().asList(); + if (outStates.size() == 1) { + DfaNode outState = Iterables.getOnlyElement(outStates); + int digitMask = outMasks.get(outState); + if (Integer.bitCount(digitMask) == 1) { + // One output state reached by a single input; then it's a "Single" operation. + return new Operation.Single(isTerminating, numberOfTrailingZeros(digitMask), outStates); + } + if (digitMask == ALL_DIGITS_MASK) { + // One output state reached by any input; then it's an "Any" operation. + return new Operation.Any(isTerminating, 1, outStates); + } + // One output state reached by some other general input; then it's a "Range" operation. + return new Operation.Range(isTerminating, digitMask, outStates); + } + if (outStates.size() == 2) { + // Test if the 2 disjoint masks cover all inputs. If so, use a shorter branch operation. + List masks = outMasks.values().asList(); + if ((masks.get(0) | masks.get(1)) == ALL_DIGITS_MASK) { + // One of two output nodes reached by any input; then it's a branching "Range" operation. + return new Operation.Range(isTerminating, masks.get(0), outStates); + } + } + // Any other combination of nodes or inputs; then it's a "Mapping" operation. This code relies + // on the ordering of entries in the output map to correspond to edge order. + return new Operation.Mapping(isTerminating, outMasks); + } + + /** Represents a state with no legal outputs, which must be a terminal state in the matcher.
*/ + private static final class Terminal extends Operation { + Terminal() { + super(true, true); + } + + @Override + OpCode getOpCode() { + return OpCode.BRANCH; + } + + @Override + ImmutableList getOuts() { + return ImmutableList.of(); + } + + @Override + void writeImpl(ByteArrayDataOutput out, Map unused, Statistics stats) { + writeTerminator(out, stats); + } + + @Override + public String toString() { + return "TERMINAL"; + } + } + + /** + * Represents a state which can be transitioned from to a single output state via a single input + * (eg, "0" or "9"). + */ + private static final class Single extends Operation { + private final Digit digit; + private final ImmutableList outs; + + Single(boolean isTerminating, int digit, ImmutableList outs) { + super(isTerminating, false); + Preconditions.checkArgument(outs.size() == 1); + this.digit = Digit.of(digit); + this.outs = outs; + } + + @Override + OpCode getOpCode() { + return OpCode.SINGLE; + } + + @Override ImmutableList getOuts() { + return outs; + } + + @Override + void writeImpl(ByteArrayDataOutput out, Map unused, Statistics stats) { + // <--------- 1 byte ---------> + // [ OPCODE | TRM | VALUE ] + out.writeByte((getOpCode().ordinal() << 5) + | (isTerminating() ? (1 << 4) : 0) + | digit.value()); + } + + @Override + public String toString() { + return format(digit.value()); + } + } + + /** + * Represents a state which can be transitioned from to a single output state via any input + * (ie, "\d"). Successive "Any" operations can be merged to represent a repeated sequence + * (eg, "\d{5}").
+ */ + private static final class Any extends Operation { + private final int count; + private final ImmutableList outs; + + Any(boolean isTerminating, int count, ImmutableList outs) { + super(isTerminating, false); + Preconditions.checkArgument(outs.size() == 1); + Preconditions.checkArgument(count > 0); + this.count = count; + this.outs = outs; + } + + @Override + OpCode getOpCode() { + return OpCode.ANY; + } + + @Override ImmutableList getOuts() { + return outs; + } + + @Override + void writeImpl(ByteArrayDataOutput out, Map unused, Statistics stats) { + int remainingCount = count; + // <--------- 1 byte ---------> + // [ OPCODE | TRM | COUNT-1 ] + int anyN = (getOpCode().ordinal() << 5) | (isTerminating() ? (1 << 4) : 0); + while (remainingCount > 16) { + out.writeByte(anyN | 15); + remainingCount -= 16; + } + out.writeByte(anyN | remainingCount - 1); + } + + @Override + public Operation mergeWith(Operation next) { + if (next.getOpCode() == OpCode.ANY && isTerminating() == next.isTerminating()) { + return new Any(isTerminating(), this.count + ((Any) next).count, ((Any) next).outs); + } + return null; + } + + @Override + public String toString() { + return format(count); + } + } + + /** + * Represents a state which can be transitioned from via an arbitrary set of inputs to either + * one or two output nodes (eg, "[23-69]" or "[0-4]X|[5-9]Y"). In the case where there are two + * output nodes, any input must reach one of the two possible nodes (ie, there is no invalid + * input). 
+ */ + private static final class Range extends Operation { + private final ImmutableSet digits; + private final ImmutableList outs; + + Range(boolean isTerminating, int digitMask, ImmutableList outs) { + super(isTerminating, outs.size() == 2); + Preconditions.checkArgument(outs.size() <= 2); + this.digits = Digit.fromMask(digitMask); + this.outs = outs; + } + + @Override + OpCode getOpCode() { + return OpCode.RANGE; + } + + /** + * For branching Range operations (with 2 output nodes), the order is that the state matched + * by {@code digits} is the first state and the state reached by any other input is second. + */ + @Override ImmutableList getOuts() { + return outs; + } + + @Override + void writeImpl(ByteArrayDataOutput out, Map offsetMap, Statistics stats) { + // <-------------- 2 bytes --------------> <-------- 2 bytes ---------> + // [ OPCODE | TRM | 0 | BIT SET ] + // [ OPCODE | TRM | 1 | BIT SET | JUMP_IN | JUMP_OUT ] + out.writeShort((getOpCode().ordinal() << 13) + | (isTerminating() ? (1 << 12) : 0) + | (isBranching() ? (1 << 11) : 0) + | asBitMask(digits)); + if (isBranching()) { + writeJumpTable(out, ImmutableList.of( + offsetMap.get(outs.get(0)), offsetMap.get(outs.get(1))), stats); + } + } + + @Override + public String toString() { + return format(asRangeString(digits)); + } + } + + /** + * Represents a state in the matcher which can be transitioned from via an arbitrary set of + * inputs, to an arbitrary set of nodes. This is the most general form of operation and (apart + * from branches) provides the only truly necessary instruction in the matcher; everything else + * is just some specialization of this operation. 
+ */ + private static final class Mapping extends Operation { + private final ImmutableSetMultimap nodeMap; + + Mapping(boolean isTerminating, ImmutableMap outMasks) { + super(isTerminating, true); + this.nodeMap = outMasks.entrySet().stream() + .collect(flatteningToImmutableSetMultimap( + Entry::getKey, e -> Digit.fromMask(e.getValue()).stream())); + } + + @Override + OpCode getOpCode() { + return isTerminating() ? OpCode.TMAP : OpCode.MAP; + } + + /** + * For Mapping operations, output node order is defined by the lowest digit by which that + * node can be reached. For example, if a map operation can reach three nodes {@code A}, + * {@code B} and {@code C} via inputs in the ranges {@code [1-38]}, {@code [4-6]} and + * {@code [09]} respectively, then they will be ordered {@code (C, A, B)}. + */ + @Override ImmutableList getOuts() { + return nodeMap.keySet().asList(); + } + + @Override + void writeImpl(ByteArrayDataOutput out, Map offsetMap, Statistics stats) { + // <------------ 4 bytes ------------> <-- 1 byte per offset ---> + // [ OPCODE | CODED MAP | JUMP_1 | ... | JUMP_N ] + out.writeInt((getOpCode().ordinal() << 29) | asCodedMap(nodeMap)); + ImmutableList offsets = + getOuts().stream().map(offsetMap::get).collect(toImmutableList()); + writeJumpTable(out, offsets, stats); + } + + @Override + public String toString() { + return format(nodeMap.asMap().values().stream() + .map(Operation::asRangeString).collect(joining(", "))); + } + } + + String format(Object extra) { + return String.format("%s%s : %s", getOpCode(), isTerminating() ? "*" : "", extra); + } + + /** + * Returns an integer with the lowest 10 bits set in accordance with the digits in the given set. + */ + private static int asBitMask(ImmutableSet digits) { + int bitMask = 0; + for (Digit digit : digits) { + bitMask |= (1 << digit.value()); + } + return bitMask; + } + + /** + * Returns an integer with the lowest 29 bits set to encode an arbitrary mapping from input digit + * to an output index.
The 29 bits are partitioned such that lower inputs require fewer bits to
+   * encode (output indices are assigned as they are encountered, starting at the first input).
+   * Each digit can then be quickly mapped to either its 1-indexed output node, or 0 if the input
+   * was invalid.
+   */
+  private static int asCodedMap(ImmutableSetMultimap nodeMap) {
+    int codedMap = 0;
+    // Output indices follow the key order of the multimap.
+    List outs = nodeMap.keySet().asList();
+    for (int n = 0; n < outs.size(); n++) {
+      for (Digit digit : nodeMap.get(outs.get(n))) {
+        // Coded indices are 1-to-10 (0 is the "invalid" node).
+        // The bit position of each digit's field comes from OpCode.getMapShift().
+        codedMap |= ((n + 1) << OpCode.getMapShift(digit.value()));
+      }
+    }
+    return codedMap;
+  }
+
+  /**
+   * Writes a sequence of offsets representing an unsigned byte-based jump table after either a
+   * Mapping or Range instruction. This accounts correctly for the need to introduce a new
+   * "trampoline" branch instruction after the jump table (when the desired offset is too large
+   * to fit in a single unsigned byte).
+   * <p>

+   * Offsets are either:
+   * <ul>
+   * <li>The number of bytes to jump from the end of the current {@code Sequence} bytes to the
+   * start of the destination {@code Sequence} bytes.
+   * <li>{@code -1} to indicate that a terminal node has been reached.
+   * </ul>
+   * <p>
+   * Note that the offset written into the jump table itself must be relative to the beginning of
+   * the jump table and so must be adjusted by the number of bytes in the jump table and any other
+   * branch instructions that follow it. This is probably the most awkward logic in the entire
+   * compiler.
+   */
+  // NOTE(review): "List offsets" (and "List extraBranchOffsets" below) are read as int values, so
+  // their type arguments appear to have been stripped from this copy of the patch; confirm
+  // against the committed file.
+  static void writeJumpTable(ByteArrayDataOutput out, List offsets,
+      Statistics stats) {
+    int jumpTableSize = offsets.size();
+    boolean needsExtraBranches = false;
+    for (int n = 0; n < jumpTableSize && !needsExtraBranches; n++) {
+      // Check whether the adjusted offset (ie, the one we would write) will fit in a byte.
+      // It's no issue to have offsets of -1 as it can never trigger "needsExtraBranches".
+      needsExtraBranches = (offsets.get(n) + jumpTableSize >= 0x100);
+    }
+    if (needsExtraBranches) {
+      // We only get here if at least one offset (after adjustment by the original jump table size)
+      // would not fit into a byte. Now we must calculate exactly how many extra branches we are
+      // going to need. For this we must assume the worst case adjustment of "3 x jumpTableSize"
+      // which is 1 byte for the jump table offset and 2 bytes for the extra branch for every entry.
+      // This is pessimistic because there will now be cases where we write a trampoline jump for
+      // an offset that could have fitted had we not assumed that we might need the extra space for
+      // the branch. However these cases are rare enough that we choose to ignore them.
+      int maxOffsetAdjust = ((1 + LONG_BRANCH_SIZE) * jumpTableSize);
+      int extraBranchCount = 0;
+      for (int n = 0; n < jumpTableSize; n++) {
+        if (offsets.get(n) + maxOffsetAdjust >= 0x100) {
+          extraBranchCount += 1;
+        }
+      }
+      // Now we know a reasonable upper bound for how many extra branches are needed, use this to
+      // adjust the actual offsets and write them. When a "trampoline" branch instruction is needed
+      // we split the offset so the jump table jumps to the branch instruction and that jumps the
+      // rest. Branch instructions are positioned, in order, immediately after the jump table.
+      List extraBranchOffsets = new ArrayList<>();
+      int totalOffsetAdjust = jumpTableSize + (LONG_BRANCH_SIZE * extraBranchCount);
+      for (int n = 0; n < jumpTableSize; n++) {
+        int offset = offsets.get(n);
+        if (offset >= 0) {
+          int worstCaseOffset = offset + maxOffsetAdjust;
+          // Get the actual total offset we want to jump by.
+          offset += totalOffsetAdjust;
+          // Use the worst case offset here so we repeat exactly the same decision as the loop
+          // above (otherwise we might add fewer branches which would screw up our offsets).
+          if (worstCaseOffset >= 0x100) {
+            // Split the original offset, recording the jump to the trampoline branch as well as
+            // the branch offset itself. Note that the offset adjustment changes as more trampoline
+            // branches are encountered (but the overall offset jumped remains the same).
+            int extraBranchIndex = extraBranchOffsets.size();
+            // This offset will always be small (max jump table is 10 entries, so offset to the
+            // last possible branch will be at most 28 bytes).
+            int branchInstructionOffset = jumpTableSize + (LONG_BRANCH_SIZE * extraBranchIndex);
+            // Subtract one additional branch instruction here because when we trampoline jump, we
+            // jump to the start of the branch instruction, but jump away from the end of it.
+            extraBranchOffsets.add((offset - branchInstructionOffset) - LONG_BRANCH_SIZE);
+            offset = branchInstructionOffset;
+          }
+          // Write the total offset (offset must be < 0x100 here as worstCaseOffset was < 0x100).
+          Preconditions.checkState(offset < 0x100, "jump too long: %s", offset);
+          out.writeByte(offset);
+        } else {
+          // If the destination of this jump would just be a termination instruction, just write
+          // the termination byte here directly (no point jumping to the termination byte).
+          Preconditions.checkArgument(offset == TERMINATION_OFFSET, "bad offset: %s", offset);
+          writeTerminator(out, stats);
+        }
+      }
+      // Write out the trampoline jumps in the order they were found.
+      for (int offset : extraBranchOffsets) {
+        stats.record(Type.DOUBLE_JUMP);
+        Operation.writeBranch(out, offset, stats);
+      }
+    } else {
+      // In the simple case, there are no extra branches, so we just write the offsets we have.
+      // This has the same effect as running the code above with (extraBranchCount == 0) but can be
+      // reached more optimistically because we don't need to account for the worst case offset
+      // adjustment when deciding if it's safe to just use the offsets we were given. It's a form
+      // of hysteresis between the no-branch and extra-branch cases.
+      for (int n = 0; n < jumpTableSize; n++) {
+        int offset = offsets.get(n);
+        if (offset >= 0) {
+          offset += jumpTableSize;
+          Preconditions.checkState(offset < 0x100, "jump too long: " + offset);
+          out.writeByte(offset);
+        } else {
+          // NOTE(review): unlike the extra-branch path above, this path treats every negative
+          // offset as a terminator without checking it against TERMINATION_OFFSET.
+          writeTerminator(out, stats);
+        }
+      }
+    }
+  }
+
+  // Helper function for asRanges() to print a single range (eg, "[014-7]").
+  // NOTE(review): the loop below iterates "digits" as Digit values, so the Collection's type
+  // argument appears to have been stripped from this copy of the patch. The run detection also
+  // assumes the digits are iterated in ascending order -- confirm at the call sites.
+  private static String asRangeString(Collection digits) {
+    StringBuilder out = new StringBuilder();
+    out.append("[");
+    // lhs and rhs are the first and last digit of the run of consecutive digits currently being
+    // collected; both stay null until the first digit has been seen.
+    Digit lhs = null;
+    Digit rhs = null;
+    for (Digit digit : digits) {
+      if (lhs != null) {
+        if (digit.value() == rhs.value() + 1) {
+          // The digit extends the current run, so nothing is emitted yet.
+          rhs = digit;
+          continue;
+        }
+        // The run has ended. Its first digit was already written when the run started, so only
+        // the end of the run remains: a run of two is written as "ab", a longer one as "a-c".
+        if (rhs != lhs) {
+          if (rhs.value() > lhs.value() + 1) {
+            out.append("-");
+          }
+          out.append(rhs.value());
+        }
+      }
+      // Start a new run at this digit and write its first digit immediately.
+      lhs = digit;
+      rhs = digit;
+      out.append(lhs.value());
+    }
+    // Close the final run (this is skipped for an empty collection or a single-digit run).
+    if (rhs != lhs) {
+      if (rhs.value() > lhs.value() + 1) {
+        out.append("-");
+      }
+      out.append(rhs.value());
+    }
+    out.append("]");
+    return out.toString();
+  }
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Statistics.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Statistics.java
new file mode 100644
index 0000000000..e175425b35
--- /dev/null
+++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/Statistics.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (C) 2017 The Libphonenumber Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler;
+
+
+/**
+ * A simple class for capturing statistics produced during regular expression compilation. This can
+ * be used to quantify how proposed changes to the byte-code definition will affect the size of any
+ * compiled matcher bytes. 
+ */
+public interface Statistics {
+
+  /** An instance whose {@link #record} method does nothing. */
+  public static final Statistics NO_OP = new Statistics() {
+    @Override public void record(Type type) { }
+  };
+
+  /** The type of things we are counting. */
+  public enum Type {
+    SHORT_BRANCH,
+    MEDIUM_BRANCH,
+    LONG_BRANCH,
+    DOUBLE_JUMP,
+    CONTINUATION,
+    TERMINATING,
+    FINAL;
+  }
+
+  /** Records an operation of the specified type during bytecode compilation. */
+  void record(Type type);
+}
diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java index 5c3e312c40..96ab4dcf9c 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/ExamplesTableSchema.java @@ -96,7 +96,7 @@ public static CsvTable toCsv( for (Cell c : table.cellSet()) { out.put(ExampleNumberKey.of(c.getRowKey(), c.getColumnKey()), NUMBER, c.getValue()); } - return CsvTable.from(SCHEMA, out.build()); + return CsvTable.from(SCHEMA, out.buildOrThrow()); } /** @@ -110,7 +110,7 @@ public static CsvTable toCsv( for (ExampleNumberKey k : csv.getKeys()) { out.put(k.getRegion(), k.getType(), csv.getOrDefault(k, NUMBER)); } - return out.build(); + return out.buildOrThrow(); } private static Stream write(ExampleNumberKey key) { diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java index 396f735e4a..795414188f 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FileBasedCsvLoader.java @@ -25,7 +25,7 @@ /** * A CSV provider which reads files rooted in a given directory. 
The file layout should match that - * in the CSV metadata directory ({@code googledata/third_party/i18n/phonenumbers/metadata}). + * in the CSV metadata directory ({@code third_party/libphonenumber_metadata/metadata}). */ public final class FileBasedCsvLoader implements CsvDataProvider { /** Returns a CSV loader which reads files from the given base directory. */ diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java index a9cbca6642..e977096ce6 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/FormatsTableSchema.java @@ -81,7 +81,7 @@ public static ImmutableMap toFormatSpecs(CsvTable fo formats.getOrDefault(id, NATIONAL_PREFIX_OPTIONAL), toComment(formats.getOrDefault(id, COMMENT)))); } - return specs.build(); + return specs.buildOrThrow(); } private static Optional toOptional(String s) { diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java index 8fb662e376..82282eab69 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/model/RangesTableSchema.java @@ -23,13 +23,13 @@ import static java.util.function.Function.identity; import static java.util.stream.Collectors.joining; -import com.google.common.base.Splitter; import com.google.common.collect.ContiguousSet; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableRangeSet; import com.google.common.collect.ImmutableSet; import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Range; +import 
com.google.i18n.phonenumbers.metadata.LengthsParser; import com.google.i18n.phonenumbers.metadata.RangeSpecification; import com.google.i18n.phonenumbers.metadata.i18n.PhoneRegion; import com.google.i18n.phonenumbers.metadata.i18n.SimpleLanguageTag; @@ -49,18 +49,18 @@ import com.google.i18n.phonenumbers.metadata.table.Schema; import java.time.ZoneId; import java.util.List; -import java.util.NavigableSet; import java.util.Optional; import java.util.TreeSet; import java.util.stream.Stream; /** * The schema of the standard "Ranges" table with rows keyed by {@link RangeKey} and columns: + * *

    - *
  1. {@link #TYPE}: The semantic type of numbers in a range (note that this is not - * the same a XmlNumberType or ValidNumberType). All ranges should be assigned a type. - *
  2. {@link #TARIFF}: The expected cost of numbers in a range (combining TYPE and TARIFF - * can yield the internal ValidNumberType). All ranges should be assigned a tariff. + *
  3. {@link #TYPE}: The semantic type of numbers in a range (note that this is not the same a + * XmlNumberType or ValidNumberType). All ranges should be assigned a type. + *
  4. {@link #TARIFF}: The expected cost of numbers in a range (combining TYPE and TARIFF can + * yield the internal ValidNumberType). All ranges should be assigned a tariff. *
  5. {@link #AREA_CODE_LENGTH}: The length of an optional prefix which may be removed from * numbers in a range for local dialling. Local only lengths are derived using this column. *
  6. {@link #NATIONAL_ONLY}: True if numbers in a range cannot be dialled from outside its @@ -72,8 +72,8 @@ * applied). *
  7. {@link #TIMEZONE}: The timezone names for a range (or empty to imply the default * timezones). Multiple timezones can be specific if separated by {@code '&'}. - *
  8. {@link #REGIONS}: A group of boolean columns in the form "Region:XX", where ranges are - * set {@code true} that range is valid within the region {@code XX}. + *
  9. {@link #REGIONS}: A group of boolean columns in the form "Region:XX", where ranges are set + * {@code true} that range is valid within the region {@code XX}. *
  10. {@link #GEOCODES}: A group of String columns in the form "Geocode:XXX" containing the * geocode string for a range, where {@code XXX} is the language code of the string. *
  11. {@link #PROVENANCE}: Indicates the most important reason for a range to be valid. @@ -81,6 +81,7 @@ *
* *

Rows keys are serialized via the marshaller and produce leading columns: + * *

    *
  1. {@code Prefix}: The prefix (RangeSpecification) for the ranges in a row (e.g. "12[3-6]"). *
  2. {@code Length}: A set of lengths for the ranges in a row (e.g. "9", "8,9" or "5,7-9"). @@ -88,16 +89,16 @@ */ public final class RangesTableSchema { /** - * External number type enum. This is technically much better than ValidNumberType since it - * splits type and cost properly. Unfortunately the internal logic of the phonenumber library - * doesn't really cope with this, which is why we convert to {@code XmlRangesSchema} before - * creating legacy data structures. + * External number type enum. This is technically much better than ValidNumberType since it splits + * type and cost properly. Unfortunately the internal logic of the phonenumber library doesn't + * really cope with this, which is why we convert to {@code XmlRangesSchema} before creating + * legacy data structures. * - *

    This enum can be modified as new types are requested from data providers, providing the - * type mapping to ValidNumberType is updated appropriately. Note that until it's clear that - * mapping types such as {@link #M2M} to {@link ValidNumberType#UNKNOWN} will work okay, we - * should be very careful about using the additional types. Additional types need to be removed - * before the generated table can be turned into a {@link NumberingScheme}. + *

    This enum can be modified as new types are requested from data providers, providing the type + * mapping to ValidNumberType is updated appropriately. Note that until it's clear that mapping + * types such as {@link #M2M} to {@link ValidNumberType#UNKNOWN} will work okay, we should be very + * careful about using the additional types. Additional types need to be removed before the + * generated table can be turned into a {@link NumberingScheme}. */ public enum ExtType { /** Default value not permitted in real data. */ @@ -125,14 +126,14 @@ public enum ExtType { private static final ImmutableMap TYPE_MAP = Stream.of( - ExtType.FIXED_LINE, - ExtType.MOBILE, - ExtType.FIXED_LINE_OR_MOBILE, - ExtType.PAGER, - ExtType.PERSONAL_NUMBER, - ExtType.UAN, - ExtType.VOICEMAIL, - ExtType.VOIP) + ExtType.FIXED_LINE, + ExtType.MOBILE, + ExtType.FIXED_LINE_OR_MOBILE, + ExtType.PAGER, + ExtType.PERSONAL_NUMBER, + ExtType.UAN, + ExtType.VOICEMAIL, + ExtType.VOIP) .collect(toImmutableMap(identity(), v -> ValidNumberType.valueOf(v.name()))); public Optional toValidNumberType() { @@ -185,9 +186,9 @@ public Timezones(String s) { Column.of(ExtTariff.class, "Tariff", ExtTariff.STANDARD_RATE); /** - * The "Area Code Length" column in the range table, denoting the length of a prefix which can - * be removed from all numbers in a range to obtain locally diallable numbers. If an - * "area code" is not optional for dialling, then no value should be set here. + * The "Area Code Length" column in the range table, denoting the length of a prefix which can be + * removed from all numbers in a range to obtain locally diallable numbers. If an "area code" is + * not optional for dialling, then no value should be set here. */ public static final Column AREA_CODE_LENGTH = Column.ofUnsignedInteger("Area Code Length"); @@ -226,12 +227,13 @@ public Timezones(String s) { public static final Column COMMENT = Column.ofString("Comment"); /** Marshaller for constructing CsvTable from RangeTable. 
*/ - private static final CsvKeyMarshaller MARSHALLER = new CsvKeyMarshaller<>( - RangesTableSchema::write, - RangesTableSchema::read, - Optional.of(RangeKey.ORDERING), - "Prefix", - "Length"); + private static final CsvKeyMarshaller MARSHALLER = + new CsvKeyMarshaller<>( + RangesTableSchema::write, + RangesTableSchema::read, + Optional.of(RangeKey.ORDERING), + "Prefix", + "Length"); /** The non-key columns of a range table. */ public static final Schema TABLE_COLUMNS = @@ -251,10 +253,10 @@ public Timezones(String s) { .build(); /** - * The columns for the serialized CSV table. Note that the "REGIONS" column group is replaced - * by the CSV regions multi-value. This allows region codes to be serialize in a single column - * (which is far nicer when looking at data in a spreadsheet). In the range table, this is - * normalized into the boolean column group (because that's far nicer to work with). + * The columns for the serialized CSV table. Note that the "REGIONS" column group is replaced by + * the CSV regions multi-value. This allows region codes to be serialize in a single column (which + * is far nicer when looking at data in a spreadsheet). In the range table, this is normalized + * into the boolean column group (because that's far nicer to work with). */ private static final Schema CSV_COLUMNS = Schema.builder() @@ -289,17 +291,21 @@ public static CsvTable toCsv(RangeTable table) { for (Change c : table.toChanges()) { for (RangeKey k : RangeKey.decompose(c.getRanges())) { regions.clear(); - c.getAssignments().forEach(a -> { - // We special case the regions column, converting a group of boolean columns into a - // multi-value of region codes. If the column is in the group, it must hold Booleans. 
- if (regionColumns.contains(a.column())) { - if (a.value().map(((Column) a.column())::cast).orElse(Boolean.FALSE)) { - regions.add(REGIONS.getKey(a.column())); - } - } else { - csv.put(k, a); - } - }); + c.getAssignments() + .forEach( + a -> { + // We special case the regions column, converting a group of boolean columns into + // a + // multi-value of region codes. If the column is in the group, it must hold + // Booleans. + if (regionColumns.contains(a.column())) { + if (a.value().map(((Column) a.column())::cast).orElse(Boolean.FALSE)) { + regions.add(REGIONS.getKey(a.column())); + } + } else { + csv.put(k, a); + } + }); // We can do this out-of-sequence because the table will order its columns. if (!regions.isEmpty()) { csv.put(k, CSV_REGIONS, Regions.of(regions)); @@ -311,22 +317,28 @@ public static CsvTable toCsv(RangeTable table) { /** * Converts a {@link RangeKey} based {@link CsvTable} to a {@link RangeTable}, preserving the - * original table columns. The {@link CsvSchema} of the returned table is not guaranteed to be - * the {@link #SCHEMA} instance if the given table had different columns. + * original table columns. The {@link CsvSchema} of the returned table is not guaranteed to be the + * {@link #SCHEMA} instance if the given table had different columns. */ public static RangeTable toRangeTable(CsvTable csv) { RangeTable.Builder out = RangeTable.builder(TABLE_COLUMNS); for (RangeKey k : csv.getKeys()) { Change.Builder change = Change.builder(k.asRangeTree()); - csv.getRow(k).forEach((c, v) -> { - // We special case the regions column, converting a comma separated list of region codes - // into a series of boolean column assignments. 
- if (c.equals(CSV_REGIONS)) { - CSV_REGIONS.cast(v).getValues().forEach(r -> change.assign(REGIONS.getColumn(r), true)); - } else { - change.assign(c, v); - } - }); + csv.getRow(k) + .forEach( + (c, v) -> { + // We special case the regions column, converting a comma separated list of region + // codes + // into a series of boolean column assignments. + if (c.equals(CSV_REGIONS)) { + CSV_REGIONS + .cast(v) + .getValues() + .forEach(r -> change.assign(REGIONS.getColumn(r), true)); + } else { + change.assign(c, v); + } + }); out.apply(change.build(), OverwriteMode.NEVER); } return out.build(); @@ -339,7 +351,8 @@ public static Stream write(RangeKey key) { // Shared by ShortcodeTableSchema public static RangeKey read(List parts) { - return RangeKey.create(RangeSpecification.parse(parts.get(0)), parseLengths(parts.get(1))); + return RangeKey.create( + RangeSpecification.parse(parts.get(0)), LengthsParser.parseLengths(parts.get(1))); } private static String formatLength(ImmutableSortedSet lengthSet) { @@ -364,33 +377,5 @@ private static String formatRange(Range r) { } } - private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults(); - private static final Splitter RANGE_SPLITTER = Splitter.on('-').trimResults().limit(2); - - private static NavigableSet parseLengths(String s) { - NavigableSet lengths = new TreeSet<>(); - for (String lengthOrRange : COMMA_SPLITTER.split(s)) { - if (lengthOrRange.contains("-")) { - List lohi = RANGE_SPLITTER.splitToList(lengthOrRange); - int lo = parseInt(lohi.get(0)); - int hi = parseInt(lohi.get(1)); - checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi); - checkArgument(lengths.isEmpty() || lo > lengths.last(), "Overlapping ranges: %s", s); - lengths.addAll(ContiguousSet.closed(lo, hi)); - } else { - int length = parseInt(lengthOrRange); - checkArgument(lengths.isEmpty() || length > lengths.last(), "Overlapping ranges: %s", s); - lengths.add(length); - } - } - return lengths; - } - - private static int parseInt(String 
s) { - return Integer.parseUnsignedInt(s, 10); - } - private RangesTableSchema() {} } - - diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/AnyPath.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/AnyPath.java new file mode 100644 index 0000000000..8571f5b6f9 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/AnyPath.java @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import com.google.auto.value.AutoValue; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import java.util.Optional; + +/** + * Represents an NFA graph which accepts sequences of inputs of any digit (also known as "any-digit + * sequences"), possibly of variable length. For example, an {@code AnyPath} instance might accept + * a single input of any digit (i.e. equivalent to the regular expression {@code "\d"}), or it might + * accept sequences of any digits of length 4 or 6 (i.e. equivalent to the regular expression + * {@code "\d{4}(\d{2})?"}). + * + *

    As {@code AnyPath} instances are all restricted to only accepting any-digits sequences, the + * only interesting thing about them is the set of sequence lengths they accept. + */ +@AutoValue +abstract class AnyPath implements Comparable { + /** + * The special empty path which matches zero length input. This is useful as an identity value + * when constructing other paths but should never be a path in the graph. + */ + public static final AnyPath EMPTY = new AutoValue_AnyPath(0x1); + + /** The path matching exactly one input of any digit. */ + public static final AnyPath SINGLE = of(0x2); + + /** The path matching one or zero inputs of any digit. */ + public static final AnyPath OPTIONAL = of(0x3); + + @VisibleForTesting + static AnyPath of(int mask) { + Preconditions.checkArgument(mask > 1, "invalid path mask: %s", mask); + return new AutoValue_AnyPath(mask); + } + + /** + * Returns a bit-mask representing the lengths of any-digit sequences accepted by this path. + * If bit-N is set, then this path accepts an N-length sequence of any digits. + */ + abstract int mask(); + + /** Returns whether this path accepts an any-digit sequence of length {@code n}.*/ + public boolean acceptsLength(int n) { + Preconditions.checkArgument(n >= 0 && n < 32, "invalid path length: %s", n); + return (mask() & (1 << n)) != 0; + } + + /** Returns the maximum length any-sequence that this path will accept. */ + public int maxLength() { + return (31 - Integer.numberOfLeadingZeros(mask())); + } + + /** + * Returns whether this path is empty (i.e. accepts only zero length sequences). This is only + * useful when constructing paths and empty paths should never appear in an NFA graph. + */ + public boolean isEmpty() { + return mask() == 0x1; + } + + /** + * Extends this path by one input, potentially setting all input as optional. For example (using + * 'x' to represent a single "any digit" input): + *

      + *
    • {@code "xx".extend(false) == "xxx"} + *
    • {@code "xx".extend(true) == "(xxx)?"} + *
    • {@code "xx(x)?".extend(false) == "xxx(x)?"} + *
    • {@code "xx(x)?".extend(true) == "(xxx(x)?)?"} + *
    + */ + public AnyPath extend(boolean allOptional) { + return of((mask() << 1) | (allOptional ? 0x1 : 0x0)); + } + + /** + * Joins the given path to this one, results in a new path which is equivalent to the + * concatenation of the regular expressions they represent. For example (using + * 'x' to represent a single "any digit" input): + *
      + *
    • {@code "xx".join("xx") == "xxxx"} + *
    • {@code "xx".join("x?") == "xx(x)?"} + *
    + */ + public AnyPath join(AnyPath other) { + int newMask = 0; + // Include the length itself (which is always accepted). + for (int n = 0; n <= other.maxLength(); n++) { + if (other.acceptsLength(n)) { + newMask |= mask() << n; + } + } + return of(newMask); + } + + /** + * Returns a new path which is equal to this path, except that it also accepts zero length + * sequences. + */ + public AnyPath makeOptional() { + return of(mask() | 0x1); + } + + /** + * Attempts to "factor" this path by the given path to produce a path such that + * {@code p.factor(q).join(q)} is equivalent to {@code p}. This is useful when trying to + * determine longest common paths. Factorizing may not succeed in cases where no common path + * exists (e.g. {@code "xx(xx)?".factor("x?")} fails because there is no way to join anything + * to the path {@code "x?"} to make it accept exactly 2 or 4 length any-digit sequences). + */ + public Optional factor(AnyPath other) { + int factor = mask() / other.mask(); + if (factor > 1 && (other.mask() * factor) == mask()) { + return Optional.of(of(factor)); + } else { + return Optional.empty(); + } + } + + @Override + public int compareTo(AnyPath other) { + return Integer.compare(mask(), other.mask()); + } + + @Override + public final String toString() { + // A non-obvious algorithm for getting a reasonable toString() using x's. + // Best understood via examples: + // + // 0001 is invalid as we cannot represent an optional zero-length sequence. + // + // Hi-bit-1 ==> 1 x + // 0010 -> x, 0011 -> (x)? + // + // Hi-bit-2 ==> 2 x's + // 0100 -> xx, 0101 -> (xx)?, 0110 -> x(x)?, 0111 -> (x(x)?)? + // + // Hi-bit-3 ==> 3 x's + // 1000 -> xxx, 1001 -> (xxx)?, 1010 -> x(xx)?, 1011 -> (x(xx)?)? + // 1100 -> xx(x)?, 1101 -> (xx(x)?)?, 1110 -> x(x(x)?)?, 1111 -> (x(x(x)?)?)? + // + // Rules: + // * For hi-bit M, there are M x's in the string. + // * For N < M; if bit-N is set, then a group starts after the Nth-x. 
+ if (mask() == 0x1) { + return ""; + } + StringBuilder out = new StringBuilder(); + for (int n = 0; n < maxLength(); n++) { + out.append('x'); + } + // Loop high-to-low to prevent earlier insertions messing with the index. + for (int n = maxLength() - 1; n >= 0; n--) { + if (acceptsLength(n)) { + out.insert(n, '('); + } + } + // The number of opened groups was the number of set bits - 1. + for (int n = Integer.bitCount(mask()) - 1; n > 0; n--) { + out.append(")?"); + } + return out.toString(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Edge.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Edge.java new file mode 100644 index 0000000000..34e984018a --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Edge.java @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSortedSet; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** + * Value type for edges in NFA graphs of phone number regular expressions. Outside this package, + * this type is mainly used for examining NFA graphs which represent a regular expression, + * generated via {@link RangeTreeConverter#toNfaGraph}.. + * + *

    Note that the ordering of edges is carefully designed to attempt to replicate as much of the + * existing intuition about ordering in regular expressions as possible. This should result in any + * generated expressions being as close to existing hand edited expressions as possible. + */ +public abstract class Edge implements Comparable { + /** API for visiting composite edges; see also {@link #accept(Visitor)}. */ + public interface Visitor { + /** Visits a leaf node simple edge. */ + void visit(SimpleEdge edge); + /** + * Visits a composited sequence of edges. Note that sequences only ever contain disjunctions or + * simple edges, but never other sequences. For edges "a", "b", "c", this represents the + * concatenated edge "abc". + */ + void visitSequence(List edges); + /** + * Visits a disjunction of parallel edges. Note that disjunctions only ever contain sequences + * or simple edges, but never other disjunctions. For edges "a", "b", "c", this represents the + * disjunctive group "(a|b|c)". + */ + void visitGroup(Set edges, boolean isOptional); + } + + // The singleton epsilon edge. + private static final SimpleEdge EPSILON = new SimpleEdge(); + // The singleton edge matching any digit (i.e. 'x' or '\d'). + private static final SimpleEdge ANY = new SimpleEdge(ALL_DIGITS_MASK, false); + // The singleton edge optionally matching any digit (i.e. 'x?' or '\d?'). + private static final SimpleEdge OPTIONAL_ANY = ANY.optional(); + + /** Returns an edge which accepts digits 0 to 9 according tothe bits set in the given mask. */ + public static SimpleEdge fromMask(int digitMask) { + return digitMask == ALL_DIGITS_MASK ? ANY : new SimpleEdge(digitMask, false); + } + + /** + * Returns the epsilon edge which accepts zero length input and transitions immediately. This + * edge should only ever appear parallel to other edges, and not as the only transition between + * two nodes. 
+ */ + public static SimpleEdge epsilon() { + return EPSILON; + } + + /** Returns the edge which accepts any digit {@code [0-9]}. */ + public static SimpleEdge any() { + return ANY; + } + + /** Returns the edge which optionally accepts any digit {@code [0-9]}. */ + public static SimpleEdge optionalAny() { + return OPTIONAL_ANY; + } + + /** + * Returns the ordered concatenation of the given edges. If either edge is a concatenation, it + * is first expanded, so that the resulting edge contains only simple edges or disjunctions. + */ + public static Edge concatenation(Edge lhs, Edge rhs) { + checkArgument(!lhs.equals(EPSILON) && !rhs.equals(EPSILON), "cannot concatenate epsilon edges"); + // Don't make concatenations of concatenations; flatten them out so you only have singletons + // or disjunctions. This is equivalent to writing "xyz" instead of "x(yz)". + List edges = Stream.of(lhs, rhs) + .flatMap( + e -> (e instanceof Concatenation) ? ((Concatenation) e).edges.stream() : Stream.of(e)) + .collect(Collectors.toList()); + return new Concatenation(edges); + } + + /** + * Returns the disjunction of the given edges. If any edge is already a disjunction, it + * is first expanded, so that the resulting edge contains only simple edges or concatenations. + * An epsilon edge in the input is not kept as a member; it makes the resulting group optional. + */ + public static Edge disjunction(Collection edges) { + // Don't make disjunctions of disjunctions; flatten them out so you only have singletons, + // concatenations or epsilon. This is equivalent to writing "(x|y|z)" instead of "(x|(y|z))". + List allEdges = edges.stream() + .flatMap( + e -> (e instanceof Disjunction) ? ((Disjunction) e).edges.stream() : Stream.of(e)) + .sorted() + .distinct() + .collect(Collectors.toList()); + // There should only ever be one epsilon when we make a disjunction (disjunctions are made when + // subgraphs collapse and each subgraph should only have one epsilon to make it optional). + // Epsilons sort to-the-left of everything, so if there is an epsilon it must be the first edge.
+ boolean isOptional = allEdges.get(0) == EPSILON; + if (isOptional) { + allEdges = allEdges.subList(1, allEdges.size()); + } + Preconditions.checkState(!allEdges.contains(EPSILON)); + return new Disjunction(allEdges, isOptional); + } + + /** An edge optionally matching a single input token, or the epsilon transition. */ + public static final class SimpleEdge extends Edge { + private final int digitMask; + private final boolean isOptional; + + // Constructor for singleton epsilon edge. + private SimpleEdge() { + this.digitMask = 0; + // An optional epsilon makes no real sense. + this.isOptional = false; + } + + private SimpleEdge(int digitMask, boolean isOptional) { + checkArgument(digitMask > 0 && digitMask < (1 << 10), "invalid bit mask %s", digitMask); + this.digitMask = digitMask; + this.isOptional = isOptional; + } + + /** Returns the mask of digits accepted by this edge. */ + public int getDigitMask() { + return digitMask; + } + + /** Returns whether this edge is optional. */ + public boolean isOptional() { + return isOptional; + } + + /** Returns an optional version of this, non-optional edge. */ + public SimpleEdge optional() { + Preconditions.checkState(digitMask != 0, "cannot make epsilon optional"); + Preconditions.checkState(!isOptional, "edge already optional"); + return new SimpleEdge(digitMask, true); + } + + @Override + public void accept(Visitor visitor) { + visitor.visit(this); + } + + // NOTE(review): equality and hashCode consider only digitMask, whereas compare() also orders + // on isOptional, so any() and optionalAny() are equal here yet never compare as 0. Confirm + // that this inconsistency between equals() and compareTo() is intended. + @Override + public boolean equals(Object obj) { + return (obj instanceof SimpleEdge) && digitMask == ((SimpleEdge) obj).digitMask; + } + + @Override + public int hashCode() { + return digitMask; + } + + @Override + public int compareTo(Edge rhs) { + if (rhs instanceof SimpleEdge) { + return compare((SimpleEdge) rhs); + } else { + // Composite types know how to compare themselves to SimpleEdges, so delegate to them but + // remember to invert the result since we are reversing the comparison order.
+ return -rhs.compareTo(this); + } + } + + private int compare(SimpleEdge rhs) { + if (isOptional != rhs.isOptional) { + // Optional edges sort to-the-right of non-optional things. + return isOptional ? 1 : -1; + } + if (digitMask == rhs.digitMask) { + return 0; + } + if (digitMask == 0 || rhs.digitMask == 0) { + // Epsilon sorts to-the-left of everything. + return digitMask == 0 ? -1 : 1; + } + // Unlike many other places where range specifications are used, we cannot guarantee the + // ranges are disjoint here, so we sort on the reversed bitmask to favour the lowest set bit. + // This sorts 'x' ([0-9]) to the left of every other digit mask (epsilon never reaches this + // point; it was handled above and sorts to the left of everything). + // I.e. "x" < "0", "0" < "1", "[0-3]" < "[0-2]". + // + // Remember to logical-shift back down to avoid negative values. + int reverseLhsMask = (Integer.reverse(digitMask) >>> 22); + int reverseRhsMask = (Integer.reverse(rhs.digitMask) >>> 22); + // Compare in the opposite order, so the largest reversed value is ordered "to the left". + return Integer.compare(reverseRhsMask, reverseLhsMask); + } + } + + // A sequence of edges (disjunctions or simple edges). + private static final class Concatenation extends Edge { + private final ImmutableList edges; + + private Concatenation(Collection edges) { + this.edges = ImmutableList.copyOf(edges); + } + + @Override + public void accept(Visitor visitor) { + visitor.visitSequence(edges); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof Concatenation) && edges.equals(((Concatenation) obj).edges); + } + + @Override + public int hashCode() { + return edges.hashCode(); + } + + @Override + public int compareTo(Edge rhs) { + if (rhs instanceof Concatenation) { + return compareEdges(edges, ((Concatenation) rhs).edges); + } else { + // Compare our first edge to the non-concatenation.
If this compares as equal, order the + // concatenation between simple edges and disjunctions to break the tie and avoid implying + // that a concatenation and a non-concatenation are equal. + int comparison = -rhs.compareTo(edges.get(0)); + return comparison != 0 ? comparison : (rhs instanceof SimpleEdge ? 1 : -1); + } + } + } + + // A disjunctive group of edges (sequences or simple edges). + private static final class Disjunction extends Edge { + private final ImmutableSortedSet edges; + private final boolean isOptional; + + private Disjunction(Collection edges, boolean isOptional) { + checkArgument(!edges.isEmpty()); + this.edges = ImmutableSortedSet.copyOf(edges); + this.isOptional = isOptional; + } + + @Override + public void accept(Visitor visitor) { + visitor.visitGroup(edges, isOptional); + } + + @Override + public boolean equals(Object obj) { + return (obj instanceof Disjunction) && edges.equals(((Disjunction) obj).edges); + } + + @Override + public int hashCode() { + // Negate bits here to be different from Concatenation. + return ~edges.hashCode(); + } + + @Override + public int compareTo(Edge rhs) { + if (rhs instanceof Disjunction) { + return compareEdges(edges.asList(), ((Disjunction) rhs).edges.asList()); + } else { + // Compare our first edge to the non-disjunction. If this compares as equal, order the + // disjunction to the right of the other edge to break the tie and avoid implying that + // a disjunction and a non-disjunction are equal. + int comparison = -rhs.compareTo(edges.asList().get(0)); + return comparison == 0 ? 1 : comparison; + } + } + } + + /** + * Accepts a visitor on this edge, visiting any sub-edges from which it is composed. This is a + * double-dispatch visitor to avoid anyone processing edges needing to know about specific types. + * Only the immediate edge is visited and the visitor is then responsible for visiting child + * edges. 
+ */ + public abstract void accept(Visitor visitor); + + // Compare lists according to elements, and tie break on length if different. This is effectively + // a lexicographical ordering. + private static int compareEdges(ImmutableList lhs, ImmutableList rhs) { + int minSize = Math.min(lhs.size(), rhs.size()); + for (int n = 0; n < minSize; n++) { + int compared = lhs.get(n).compareTo(rhs.get(n)); + if (compared != 0) { + return compared; + } + } + return Integer.compare(lhs.size(), rhs.size()); + } + + @Override + public String toString() { + StringBuilder out = new StringBuilder(); + accept(new Visitor() { + @Override + public void visit(SimpleEdge e) { + if (e.equals(Edge.epsilon())) { + // Epsilon cannot be optional. + out.append("e"); + } else { + int m = e.getDigitMask(); + out.append(m == ALL_DIGITS_MASK ? "x" : RangeSpecification.toString(m)); + if (e.isOptional()) { + out.append('?'); + } + } + } + + @Override + public void visitSequence(List edges) { + edges.forEach(e -> e.accept(this)); + } + + @Override + public void visitGroup(Set edges, boolean isOptional) { + out.append("("); + edges.forEach(e -> { + e.accept(this); + out.append("|"); + }); + out.setLength(out.length() - 1); + out.append(isOptional ? ")?" : ")"); + } + }); + return out.toString(); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriter.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriter.java new file mode 100644 index 0000000000..1543a81a19 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriter.java @@ -0,0 +1,343 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkState; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; + +import com.google.common.collect.Iterables; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import com.google.i18n.phonenumbers.metadata.regex.Edge.Visitor; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import javax.annotation.Nullable; + +/** Writes an NFA graph edge instance as a regular expression. */ +final class EdgeWriter implements Visitor { + // Regex constant strings pulled out for some degree of readability. + private static final String DOT_MATCH = "."; + private static final String DIGIT_MATCH = "\\d"; + private static final String OPTIONAL_MARKER = "?"; + private static final String GROUP_START = "(?:"; + private static final String GROUP_DISJUNCTION = "|"; + private static final String GROUP_END = ")"; + private static final String OPTIONAL_GROUP_END = GROUP_END + OPTIONAL_MARKER; + + /** + * Returns a regular expression corresponding to the structure of the given edge. This method + * does not apply any specific optimizations to the edge it is given and any optimizations which + * affect the output must have already been applied to the graph from which the input edge was + * derived. 
+ * + * @param edge A collapsed edge typically derived from serializing an NFA graph. + * @param useDotMatch true if {@code '.'} should be used to "match any digit" (instead of + * {@code '\\d'}) which results in shorter output. + */ + public static String toRegex(Edge edge, boolean useDotMatch) { + EdgeWriter writer = new EdgeWriter(useDotMatch); + edge.accept(writer); + return writer.out.toString(); + } + + // The token to match any input digit (e.g. "\\d" or "."). + private final String anyToken; + // Accumulated regular expression appended to during visitation. + private final StringBuilder out = new StringBuilder(); + // Flag to determine when the top-level edge visited is a group, because if it is we can often + // omit the explicit grouping tokens and save some space. + private boolean isTopLevelGroup = true; + + private EdgeWriter(boolean useDotMatch) { + this.anyToken = useDotMatch ? DOT_MATCH : DIGIT_MATCH; + } + + @Override + public void visit(SimpleEdge e) { + checkArgument(!e.equals(Edge.epsilon()), "unexpected bare epsilon"); + isTopLevelGroup = false; + // It's easier to just attempt to extract an "any" edge as that code already has to work for + // simple edges when they are inside other composite edges. Optionality is encoded into the + // resulting AnyPath and handled by appendRegex(), so we don't need to handle it again here. + Optional any = AnyPathVisitor.extractAnyPath(e); + if (any.isPresent()) { + appendRegex(out, any.get().mask()); + } else { + // Not an "any" edge so append the usual range representation (e.g. "6" or "[014-9]"). + out.append(RangeSpecification.toString(e.getDigitMask())); + if (e.isOptional()) { + out.append(OPTIONAL_MARKER); + } + } + } + + @Override + public void visitSequence(List edges) { + checkArgument(!edges.isEmpty(), "sequences must have at least one edge"); + isTopLevelGroup = false; + // At this level a sequence might be a mix of normal and "any" edges (e.g. "123xxxx"). 
To + // cope with this, track and accumulate the un-written "any" edge, and emit it just before + // any other output (or at the end). + AnyPath any = AnyPath.EMPTY; + for (Edge e : edges) { + Optional next = AnyPathVisitor.extractAnyPath(e); + if (next.isPresent()) { + any = any.join(next.get()); + continue; + } + // Here we have a "normal" edge, but we still might need to emit a collected "any" edge. + if (!any.isEmpty()) { + appendRegex(out, any.mask()); + any = AnyPath.EMPTY; + } + // This recursion only happens when this was not an "any" edge (though it may still be a + // composite that contains other "any" edges). + e.accept(this); + } + // If the last thing we saw in this sequence was an "any" edge, don't forget to emit it. + if (!any.isEmpty()) { + appendRegex(out, any.mask()); + } + } + + @Override + public void visitGroup(Set edges, boolean isOptional) { + checkArgument(!edges.isEmpty(), "groups must have at least one edge"); + // The very top-level group is almost always non-optional and can be omitted for length + // (ie. "(?:a|b|c)" can just be "a|b|c"). + boolean canSkipParens = isTopLevelGroup && !isOptional; + // Unset this before recursing. + isTopLevelGroup = false; + + // We have exactly one case where an "any" edge needs to be handled for groups, and that's + // when there's an optional any group that's not part of an enclosing sequence (e.g. "(xx)?"). + if (edges.size() == 1 && isOptional) { + Optional any = AnyPathVisitor.extractAnyPath(Iterables.getOnlyElement(edges)); + if (any.isPresent()) { + // Remember to account for the optionality of the outer group. + appendRegex(out, any.get().makeOptional().mask()); + return; + } + } + + if (!canSkipParens) { + out.append(GROUP_START); + } + for (Edge e : edges) { + e.accept(this); + out.append(GROUP_DISJUNCTION); + } + // Easier to just remove the disjunction we know was added last than track state in the loop. 
+ out.setLength(out.length() - GROUP_DISJUNCTION.length()); + if (!canSkipParens) { + out.append(isOptional ? OPTIONAL_GROUP_END : GROUP_END); + } + } + + /** + * Recursive visitor to extract "any" sequences from edges (simple or composite). A sequence of + * edges is an "any path" if all edges accept any digit. Composite edges already enforce the + * requirement that epsilon edges don't exist directly (they are represented via optionality). + */ + private static final class AnyPathVisitor implements Visitor { + /** + * Returns the longest "any" sequence represented by the given edge, or an empty + * {@code Optional} if the edge does not represent an any sequence. The returned + * {@code Optional} itself is never null, so this method is deliberately not annotated as + * nullable. If present, the result is non-empty. + */ + public static Optional extractAnyPath(Edge e) { + AnyPathVisitor visitor = new AnyPathVisitor(); + e.accept(visitor); + return Optional.ofNullable(visitor.path); + } + + // Accumulate value during visitation and set to null to abort. + @Nullable + private AnyPath path = AnyPath.EMPTY; + + @Override + public void visit(SimpleEdge edge) { + checkState(path != null, "path should never be null at start of recursion"); + if (edge.getDigitMask() == ALL_DIGITS_MASK) { + path = path.join(edge.isOptional() ? AnyPath.OPTIONAL : AnyPath.SINGLE); + } else { + path = null; + } + } + + @Override + public void visitSequence(List edges) { + checkState(path != null, "path should never be null at start of recursion"); + // Looking for a complete sequence of "any edges" (partial sequences in a concatenation are + // taken care of by the caller). + for (Edge e : edges) { + Optional next = AnyPathVisitor.extractAnyPath(e); + if (next.isPresent()) { + path = path.join(next.get()); + } else { + path = null; + break; + } + } + } + + @Override + public void visitGroup(Set edges, boolean isOptional) { + checkState(path != null, "path should never be null at start of recursion"); + // Looking for a group like (xxx(xx)?)? which contains one edge only.
We just recurse into + // that edge and then make the result optional (a disjunction with only one edge must be + // optional or else it should have been a concatenation). + if (edges.size() > 1) { + path = null; + return; + } + checkState(isOptional, "single edge disjunctions should be optional"); + Edge e = Iterables.getOnlyElement(edges); + e.accept(this); + if (path != null) { + path = path.makeOptional(); + } + } + } + + // The code below here is really a bit squiffy and relies on a whole bunch of bit fiddling to + // do what it does. The good news is that it's easy to unit-test the heck out of, so that's + // what I've done. Don't look too hard at what's going on unless you're a bit of a masochist. + + /** + * Appends the regular expression corresponding to the given AnyPath mask value. This is a + * bit-mask where the Nth bit corresponds to accepting an any digit sequence of length N. + * + *

    For example: + *

      + *
    • {@code 00000010} accepts only length 1 (e.g. "\d") + *
    • {@code 00000011} accepts lengths 0 or 1 (e.g. "\d?") + *
    • {@code 00001000} accepts only length 3 (e.g. "\d{3}") + *
    • {@code 00011100} accepts lengths 2-4 (e.g. "\d{2,4}") + *
    • {@code 11101101} accepts lengths 0,2,3,5,6,7 (e.g. "(?:\d\d(?:\d(?:\d{2,4})?)?)?") + *
    + */ + private void appendRegex(StringBuilder out, int mask) { + checkArgument(mask > 1, "unexpected mask value %s", mask); + // Deal with optionality separately. + boolean allOptional = (mask & 0x1) != 0; + mask &= ~0x1; + // We are looking for bit patterns like '1111000' for contiguous ranges (e.g. {3,7}). + // Find the lo/hi size of the next contiguous range (inclusive). + int lo = Integer.numberOfTrailingZeros(mask); + int hi = Integer.numberOfTrailingZeros(~(mask >>> lo)) + (lo - 1); + + // If all the bits are accounted for (nothing above the "hi" bit) then this was the last + // contiguous range and we don't need to recurse (so no more groups need to be opened). + if (mask < (1 << (hi + 1))) { + // Writes a contiguous range as a single token with optionality (e.g. "\d", "(?:\d{2,4})?"). + appendAnyRange(out, lo, hi, allOptional); + return; + } + // This is about the entire group, not the subgroup we are about to recurse into. + if (allOptional) { + out.append(GROUP_START); + } + // IMPORTANT: If we are recursing, we must not attempt to emit the entire group here, only the + // shortest matching length. + // + // Mask "11101100" does NOT represent "\d{2,3}(?:\d{2,4})?" as that can match 4-digits too. + // Instead it should generate "\d\d(?:\d(?:\d{2,4})?)?", where the 3 digit match is part of an + // optional group. + appendRequiredAnyRange(out, lo); + // Recurse using the mask that's had the match we just emitted "factored out". This is always + // optional because bit-0 is what was the lowest set bit in our mask. + appendRegex(out, mask >>> lo); + if (allOptional) { + out.append(OPTIONAL_GROUP_END); + } + } + + /** + * Appends regular expression tokens that accept any digits for a single length. + * + *

    For example: + *

      + *
    1. {@code n=1}: {@code "\d"} + *
    2. {@code n=2}: {@code "\d\d"} (this could be extended if using '.') + *
    3. {@code otherwise}: {@code "\d{n}"} + *
    + */ + private void appendRequiredAnyRange(StringBuilder out, int n) { + checkArgument(n >= 1, "bad any length %s", n); + out.append(anyToken); + if (n == 2) { + // Only safe to do this if the group is not optional ("\d\d?" != "(?:\d{2})?"). + out.append(anyToken); + } else if (n > 2) { + out.append('{').append(n).append('}'); + } + } + + /** + * Appends regular expression tokens that accept any digits in a contiguous range of lengths. + * + *

    For example: + *

      + *
    1. {@code lo=1, hi=1, optional=false}: {@code "\d"} + *
    2. {@code lo=1, hi=1, optional=true}: {@code "\d?"} + *
    3. {@code lo=2, hi=2, optional=true}: {@code "(?:\d{2})?"} + *
    4. {@code lo=3, hi=6, optional=false}: {@code "\d{3,6}"} + *
    5. {@code lo=3, hi=6, optional=true}: {@code "(?:\d{3,6})?"} + *
    6. {@code lo=1, hi=4, optional=true}: {@code "\d{0,4}"} (not {@code (?:\d{1,4})?}) + *
    7. {@code lo=2, hi=2, optional=false}: {@code "\d\d"} (special case for size) + *
    8. {@code lo=1, hi=2, optional=false}: {@code "\d\d?"} (special case for size) + *
    + */ + private void appendAnyRange(StringBuilder out, int lo, int hi, boolean optional) { + checkArgument(lo >= 1 && hi >= lo, "bad range arguments %s, %s", lo, hi); + if (lo == hi) { + if (!optional) { + // Required single length. + appendRequiredAnyRange(out, lo); + } else { + // Optional single length. + if (lo > 1) { + out.append(GROUP_START).append(anyToken); + out.append('{').append(lo).append('}'); + out.append(OPTIONAL_GROUP_END); + } else { + out.append(anyToken).append(OPTIONAL_MARKER); + } + } + } else if (lo == 1 && hi == 2 && !optional) { + // Special case for "\d\d?" as it's shorter than "\d{1,2}" (and even shorter with '.'). + // Even though we append the "optional marker" (i.e. '?') here it's got nothing to do + // with the entire group being optional. That would be "(?:\d{1,2})?" which is "\d{0,2}". + out.append(anyToken).append(anyToken).append(OPTIONAL_MARKER); + } else if (lo == 1 && optional) { + // Special case to write "\d{0,N}" instead of "(?:\d{1,N})?" + out.append(anyToken).append("{0,").append(hi).append('}'); + } else { + if (optional) { + out.append(GROUP_START); + } + // General case. + out.append(anyToken).append('{').append(lo).append(',').append(hi).append('}'); + if (optional) { + out.append(OPTIONAL_GROUP_END); + } + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattener.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattener.java new file mode 100644 index 0000000000..1f5a4de5c2 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattener.java @@ -0,0 +1,195 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import com.google.auto.value.AutoValue; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.collect.Iterables; +import com.google.common.graph.ValueGraph; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.PriorityQueue; +import java.util.function.Function; + +/** + * Flattens an NFA graph of simple edges into a composite edge which represents all the same + * transitions in a strict tree structure (i.e. nestable sub-groups). This can entail some + * duplication of edges, but this should be kept to a minimum and favours duplicating trailing + * paths to avoid introducing additional non-determinism. + */ +final class NfaFlattener { + /** + * Flattens the given NFA graph into a single composite edge composed of concatenation and + * disjunction. The resulting edge can be visited using the {@code Edge.Visitor} class. + */ + public static Edge flatten(ValueGraph graph) { + return new NfaFlattener(graph).flatten(); + } + + /* + * A simple pair of edge value and target node which represents the current state along any path + * in the NFA graph. Path followers may be joined (if they point at the same node) but can only + * be split by recursion into the new subgraph. 
+ */ + @AutoValue + abstract static class PathFollower { + private static PathFollower of(Node node, Edge edge) { + return new AutoValue_NfaFlattener_PathFollower(node, edge); + } + + /** The target node that this follower points to. */ + abstract Node node(); + /** A composite edge representing everything up to the target node in the current sub-graph. */ + abstract Edge edge(); + } + + // The graph being flattened. + private final ValueGraph graph; + // An ordering for the work queue which ensures that followers with the same node are adjacent. + private final Comparator queueOrder; + + private NfaFlattener(ValueGraph graph) { + this.graph = graph; + this.queueOrder = Comparator + .comparing(PathFollower::node, nodeOrdering(graph)) + .thenComparing(PathFollower::edge); + } + + // Collapses the whole graph, from the initial node to the terminal node, into one composite edge. + private Edge flatten() { + // Sub-graph visitation only works for graphs which branch from and collapse to a single node. + // An NFA graph could be multiple sequential edges or a sequence of edges and sub-graphs. + // Handle that in this outer loop rather than complicate the visitor (already quite complex). + PathFollower out = visitSubgraph(Node.INITIAL); + // Node is a value type, so compare by value rather than by identity; this matches the + // equals() check used when joining followers and does not depend on TERMINAL being interned. + while (!out.node().equals(Node.TERMINAL)) { + PathFollower subgraph = visitSubgraph(out.node()); + out = PathFollower.of(subgraph.node(), Edge.concatenation(out.edge(), subgraph.edge())); + } + return out.edge(); + } + + /** + * Visits the sub-graph rooted at the given node, following all out-edges until they eventually + * re-join. Because the given graph has only one terminal node and no cycles, all sub-graphs must + * eventually rejoin at some point. If during visitation of a sub-graph, a node with multiple + * out-edges is reached, then the sub-graph it starts is recursively visited. Note that as "inner" + * sub-graphs must terminate at or before their parent graph, nesting is assured. + * + *

    The key to the implementation of this algorithm is that visitation occurs in breadth-first + * order defined according to the reachability of the nodes in the graph. This ensures that when + * an edge follower which reaches a node at which other edges join together is processed (i.e. + * when it gets to the head of the queue) all the other followers that can also reach that node + * must also be present in a contiguous sequence at the front of the queue. + */ + private PathFollower visitSubgraph(Node node) { + Preconditions.checkArgument(graph.outDegree(node) > 0, "cannot recurse from the terminal node"); + if (graph.outDegree(node) == 1) { + // Visit the trivial "subgraph" that's really just a single edge. Note that this code could + // loop and concatenate all sequential single edges, but it also works fine to rely on the + // recursion of the caller (the advantage of doing it this, simpler, way means that this code + // doesn't have to know about termination due to reaching the terminal node). + Node target = Iterables.getOnlyElement(graph.successors(node)); + return PathFollower.of(target, graph.edgeValue(node, target).get()); + } + // A work-queue of the path followers, ordered primarily by the node they target. This results + // in the followers at any "point of collapse" being adjacent in the queue. + PriorityQueue followerQueue = new PriorityQueue<>(queueOrder); + for (Node t : graph.successors(node)) { + followerQueue.add(PathFollower.of(t, graph.edgeValue(node, t).get())); + } + while (true) { + // Get the set of followers that share the same target node at the head of the queue. The + // ordering in the queue ensures that followers for the same target are always adjacent. + PathFollower follower = followerQueue.remove(); + Node target = follower.node(); + List joiningEdges = collectJoiningEdges(followerQueue, target); + if (joiningEdges != null) { + // Replace any joined followers with their disjunction (they all have the same target). 
+ joiningEdges.add(follower.edge()); + follower = PathFollower.of(target, Edge.disjunction(joiningEdges)); + } + if (followerQueue.isEmpty()) { + // If we just processed the last "joining" paths then this sub-graph has been collapsed + // into a single edge and we just return the current follower. Note that we can join edges + // without ending recursion (when 3 followers become 2) but we can only end recursion after + // joining at least 2 edges at the terminal sub-graph node. + return follower; + } + // Recurse into the next sub-graph (possibly just a single edge) which is just concatenated + // onto the current follower. + PathFollower subgraph = visitSubgraph(target); + followerQueue.add( + PathFollower.of(subgraph.node(), Edge.concatenation(follower.edge(), subgraph.edge()))); + } + } + + // Collects the edges of any followers at the front of the queue which share the same target node + // as the given follower. If the node is not a target of any other followers then return null. + private static List collectJoiningEdges(PriorityQueue queue, Node target) { + // It's really common for edges not to join, so avoid making the list unless necessary. + if (!nextFollowerJoinsTarget(queue, target)) { + return null; + } + List joiningEdges = new ArrayList<>(); + do { + joiningEdges.add(queue.remove().edge()); + } while (nextFollowerJoinsTarget(queue, target)); + return joiningEdges; + } + + // Checks if the head of the queue is a follower with the same target node. + private static boolean nextFollowerJoinsTarget(PriorityQueue queue, Node target) { + return !queue.isEmpty() && queue.peek().node().equals(target); + } + + /** + * Returns a total ordering of nodes in this graph based on the maximum path length from the + * initial node. If path lengths are equal for two nodes, then the node ID is used to tie break. + * + *

    The property of this ordering that is critical to the node flattening algorithm is that if + * {@code a < b}, then no path exists in the graph where {@code b} precedes {@code a}. This + * ensures that path followers are processed consistently with the "node reachability" and if + * several path followers target the same node, then they are adjacent in the follower queue. + * + *

    Using the node ID as a tie-break is safe, because while node IDs are assigned arbitrarily, + * they only apply between nodes in the same path length "bucket", so it cannot violate the total + * ordering requirement, since any order within a "bucket" is equally good. + */ + // Note: If there are graph cycles this will not terminate, but that implies bad bugs elsewhere. + @VisibleForTesting + static Comparator nodeOrdering(ValueGraph graph) { + Map map = new HashMap<>(); + recursivelySetMaxPathLength(Node.INITIAL, 0, graph, map); + // We have to cast the "get" method since it accepts "Object", not "Node" on a map. + return Comparator.comparing((Function) map::get).thenComparing(Node::id); + } + + private static void recursivelySetMaxPathLength( + Node node, int length, ValueGraph graph, Map map) { + // Only continue if at least some paths can be lengthened from here onwards. + if (length > map.getOrDefault(node, -1)) { + map.put(node, length); + for (Node target : graph.successors(node)) { + recursivelySetMaxPathLength(target, length + 1, graph, map); + } + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Node.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Node.java new file mode 100644 index 0000000000..d54d2bc1a1 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/Node.java @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import com.google.auto.value.AutoValue; + +/** + * Value type for nodes in NFA graphs of phone number regular expressions. This is basically a + * trivial wrapper for an {@code int}, but it makes a lot of other pieces of code type safe. + * Outside this package, this type is mainly used for examining NFA graphs which represent a + * regular expression, generated via {@link RangeTreeConverter#toNfaGraph}. + */ +@AutoValue +public abstract class Node implements Comparable { + /** The unique initial node in an NFA graph with in-order zero. */ + public static final Node INITIAL = new AutoValue_Node(0); + /** The unique terminal node in an NFA graph with out-order zero. */ + public static final Node TERMINAL = new AutoValue_Node(1); + + /** Returns a new node whose ID is one greater than this node. */ + public Node createNext() { + return (id() == 0) ? TERMINAL : new AutoValue_Node(id() + 1); + } + + /** Returns the numeric ID of this node, which must be unique within an NFA graph. */ + abstract int id(); + + @Override + public int compareTo(Node o) { + return Integer.compare(id(), o.id()); + } + + @Override + public final String toString() { + return Integer.toString(id()); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverter.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverter.java new file mode 100644 index 0000000000..07816dece0 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverter.java @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.Preconditions.checkState; + +import com.google.common.graph.ElementOrder; +import com.google.common.graph.MutableValueGraph; +import com.google.common.graph.ValueGraph; +import com.google.common.graph.ValueGraphBuilder; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.HashMap; +import java.util.Map; + +/** + * Converts DFA {@link RangeTree}s to NFA {@link ValueGraph}s. The resulting graph has almost + * exactly the same node and edge structure as the original DFA, with the following exceptions: + *

      + *
    1. Nodes which could optionally terminate now have 'epsilon' edges connecting them to the + * terminal node. + *
    2. If an optionally terminating node connects directly to the terminal node, then a special + * "optional edge" is used (this is because the {@link ValueGraph} structure allows only one + * value for each edge, so you can't have an epsilon edge that goes between the same source and + * target as other edge). + *
    + */ +public final class RangeTreeConverter { + /** + * Returns the directed NFA graph representation of a {@link RangeTree}. The returned graph is + * not a DFA and may contain epsilon transitions. Nodes are assigned in visitation order, except + * for the initial and terminal nodes which are always present in the graph. + */ + public static ValueGraph toNfaGraph(RangeTree ranges) { + NfaVisitor visitor = new NfaVisitor(ranges.getInitial()); + ranges.accept(visitor); + return visitor.graph; + } + + private static class NfaVisitor implements DfaVisitor { + private final MutableValueGraph graph = ValueGraphBuilder + .directed() + .allowsSelfLoops(false) + // Stable ordering should help keep any generated structures (regex, graph files) stable. + .nodeOrder(ElementOrder.natural()) + .build(); + // Map of nodes added to the new graph (keyed by the corresponding DFA node). + private final Map nodeMap = new HashMap<>(); + // The last node we added. + private Node lastAdded; + + private NfaVisitor(DfaNode initial) { + // Add initial and terminal nodes first (there's always exactly one of each). + graph.addNode(Node.INITIAL); + graph.addNode(Node.TERMINAL); + // During visitation we check only target nodes to add epsilon edges, but we may also need + // to add an epsilon from the very top if the DFA can match the empty input. + if (initial.canTerminate()) { + graph.putEdgeValue(Node.INITIAL, Node.TERMINAL, Edge.epsilon()); + } + nodeMap.put(initial, Node.INITIAL); + nodeMap.put(RangeTree.getTerminal(), Node.TERMINAL); + lastAdded = Node.TERMINAL; + } + + @Override + public void visit(DfaNode dfaSource, DfaEdge dfaEdge, DfaNode dfaTarget) { + SimpleEdge simpleEdge = Edge.fromMask(dfaEdge.getDigitMask()); + Node source = nodeMap.get(dfaSource); + Node target = getTarget(dfaTarget); + boolean wasNewNode = graph.addNode(target); + // The only chance of an existing edge is if an epsilon was already added immediately before + // visiting this edge. 
This can only occur if (target == TERMINAL) however. + SimpleEdge epsilon = graph.putEdgeValue(source, target, simpleEdge); + if (epsilon != null) { + checkState(target.equals(Node.TERMINAL) && epsilon.equals(Edge.epsilon()), + "unexpected edge during visitation: %s -- %s --> %s", source, epsilon, target); + // Re-add the edge, but this time make it optional (because that's what epsilon means). + graph.putEdgeValue(source, target, simpleEdge.optional()); + } + // Only recurse if the target node was newly added to the graph in this visitation. + if (wasNewNode) { + // The TERMINAL node is always in the map so (target != TERMINAL) here. This means we + // never risk adding a loop in the graph. The epsilon may end up being swapped out for + // an optional edge when we visit the dfaTarget, but that's fine. + if (dfaTarget.canTerminate()) { + graph.putEdgeValue(target, Node.TERMINAL, Edge.epsilon()); + } + dfaTarget.accept(this); + } + } + + // Gets or creates a new target node, adding it to the node map (but not to the graph itself). + private Node getTarget(DfaNode gnode) { + Node target = nodeMap.get(gnode); + if (target != null) { + return target; + } + lastAdded = lastAdded.createNext(); + nodeMap.put(gnode, lastAdded); + return lastAdded; + } + } + + private RangeTreeConverter() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatter.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatter.java new file mode 100644 index 0000000000..25f1f20c94 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatter.java @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import com.google.common.base.CharMatcher; +import com.google.common.base.Preconditions; + +/** + * Simple indenting formatter for regular expressions and other similar nested syntax. Obviously + * the results are not the same from a match perspective as the new string contains whitespace. + */ +public final class RegexFormatter { + /** Option for how to handle formatting of groups. */ + public enum FormatOption { + PRESERVE_CAPTURING_GROUPS, + FORCE_NON_CAPTURING_GROUPS, + FORCE_CAPTURING_GROUPS, + } + + // We only care about 3 specific tokens, so this code can be used to print strings which look + // similar (nested, disjunctive groups) such as the toString() of the Edge class. + private static final CharMatcher tokens = CharMatcher.anyOf("()|"); + + /** + * Formats a regular expression (or similar nested group syntax) using the following rules: + *
      + *
    1. Newline after opening '(?:' and increase indent. + *
    2. Newline after '|' + *
    3. Decrease indent and add newline before closing ')' + *
    + */ + public static String format(String regex, FormatOption formatOption) { + return new RegexFormatter(regex, formatOption).format(); + } + + private final StringBuilder out = new StringBuilder(); + private final String regex; + private final FormatOption formatOption; + + private RegexFormatter(String regex, FormatOption formatOption) { + this.regex = CharMatcher.whitespace().removeFrom(regex); + this.formatOption = Preconditions.checkNotNull(formatOption); + } + + private String format() { + recurse(0, 0); + return out.toString(); + } + + // Assume at line start. + private int recurse(int pos, int level) { + while (pos < regex.length()) { + indent(level); + // Optionally printing closing group from previous recursion. + if (regex.charAt(pos) == ')') { + out.append(')'); + pos++; + } + int nextToken = tokens.indexIn(regex, pos); + if (nextToken == -1) { + out.append(regex.substring(pos, regex.length())); + return regex.length(); + } + out.append(regex.substring(pos, nextToken)); + pos = nextToken; + switch (regex.charAt(pos)) { + case '(': + out.append("("); + pos++; + if (regex.indexOf("?:", pos) == pos) { + if (formatOption != FormatOption.FORCE_CAPTURING_GROUPS) { + out.append("?:"); + } + pos += 2; + } else if (formatOption == FormatOption.FORCE_NON_CAPTURING_GROUPS) { + out.append("?:"); + } + out.append('\n'); + pos = recurse(pos, level + 1); + break; + + case '|': + out.append("|\n"); + pos++; + break; + + case ')': + // Just exit recursion and let the parent write the ')', so don't update our position. 
+ out.append("\n"); + return pos; + + default: + throw new AssertionError(); + } + } + return pos; + } + + private void indent(int level) { + while (level-- > 0) { + out.append(" "); + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexGenerator.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexGenerator.java new file mode 100644 index 0000000000..5bd93f8974 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/RegexGenerator.java @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.ALLOW_EDGE_SPLITTING; +import static com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy.REQUIRE_EQUAL_EDGES; +import static java.util.stream.Collectors.joining; + +import com.google.common.base.Preconditions; +import com.google.common.graph.ValueGraph; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer; +import com.google.i18n.phonenumbers.metadata.RangeTreeFactorizer.MergeStrategy; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.Optional; + +/** Produces partially optimized regular expressions from {@code RangeTree}s. */ +public final class RegexGenerator { + private static final RegexGenerator BASIC = new RegexGenerator(false, false, false, false); + + // NOTE: Tail optimization should remain disabled since it seems to undo some of the benefits of + // subgroup optimization. At some point the code can probably just be removed. + private static final RegexGenerator DEFAULT_XML = + BASIC.withDfaFactorization().withSubgroupOptimization(); + + /** + * Returns a basic regular expression generator with no optional optimizations enabled. This will + * produce regular expressions with a simpler structure than other generators but output will + * almost always be longer. + */ + public static RegexGenerator basic() { + return BASIC; + } + + /** + * Returns the default regex generator for XML data. This should be used by any tool wishing to + * obtain the same regular expressions as the legacy XML data. It is deliberately not specified + * as to which optimizations are enabled for this regular expression generator. 
+ */ + public static RegexGenerator defaultXmlGenerator() { + return DEFAULT_XML; + } + + /** + * Returns a new regular expression generator which uses the {@code '.'} token for matching any + * digit (rather than {@code '\d'}). This results in shorter output, but possibly at the cost of + * performance on certain platforms (and a degree of readability). + */ + public RegexGenerator withDotMatch() { + Preconditions.checkState(!this.useDotMatch, "Dot-matching already enabled"); + return new RegexGenerator(true, this.factorizeDfa, this.optimizeSubgroups, this.optimizeTail); + } + + /** + * Returns a new regular expression generator which applies a length-based factorization of the + * DFA graph in an attempt to reduce the number of problematic terminating states. This results + * in regular expressions with additional non-determinism, but which can greatly reduce size. + */ + public RegexGenerator withDfaFactorization() { + Preconditions.checkState(!this.factorizeDfa, "Length based factorizing already enabled"); + return new RegexGenerator(this.useDotMatch, true, this.optimizeSubgroups, this.optimizeTail); + } + + /** + * Returns a new regular expression generator which applies experimental factorization of the + * DFA graph in an attempt to identify and handle subgroups which would cause repetition. This + * results in regular expressions with additional non-determinism, but which can greatly reduce + * size. + */ + public RegexGenerator withSubgroupOptimization() { + Preconditions.checkState(!this.optimizeSubgroups, "Subgroup optimization already enabled"); + return new RegexGenerator(this.useDotMatch, this.factorizeDfa, true, this.optimizeTail); + } + + /** + * Returns a new regular expression generator which applies tail optimization to the intermediate + * NFA graph to factor out common trailing paths. This results in a small size improvement to + * many cases and does not adversely affect readability. 
+ */ + public RegexGenerator withTailOptimization() { + Preconditions.checkState(!this.optimizeTail, "Tail optimization already enabled"); + return new RegexGenerator(this.useDotMatch, this.factorizeDfa, this.optimizeSubgroups, true); + } + + private final boolean useDotMatch; + private final boolean factorizeDfa; + private final boolean optimizeSubgroups; + private final boolean optimizeTail; + + private RegexGenerator( + boolean useDotMatch, boolean factorizeDfa, boolean optimizeSubgroups, boolean optimizeTail) { + this.useDotMatch = useDotMatch; + this.factorizeDfa = factorizeDfa; + this.optimizeSubgroups = optimizeSubgroups; + this.optimizeTail = optimizeTail; + } + + /** + * Generates a regular expression from a range tree, applying the configured options for this + * generator. + */ + public String toRegex(RangeTree ranges) { + // The regex of the empty range is "a regex that matches nothing". This is meaningless. + checkArgument(!ranges.isEmpty(), + "cannot generate regular expression from empty ranges"); + // We cannot generate any regular expressions if there are no explicit state transitions in the + // graph (i.e. we can generate "(?:)?" but only if "" is non-empty). We just get + // "the regex that always immediately terminates after no input". This is also meaningless. 
+ checkArgument(!ranges.getInitial().equals(RangeTree.getTerminal()), + "range tree must not contain only the empty digit sequence: %s", ranges); + + String regex = generateFactorizedRegex(ranges); + if (optimizeSubgroups) { + regex = recursivelyOptimizeSubgroups(ranges, regex); + } + return regex; + } + + private String recursivelyOptimizeSubgroups(RangeTree ranges, String regex) { + Optional subgraphRanges = SubgroupOptimizer.extractRepeatingSubgraph(ranges); + if (subgraphRanges.isPresent()) { + RangeTree leftoverRanges = ranges.subtract(subgraphRanges.get()); + String leftoverRegex = generateFactorizedRegex(leftoverRanges); + leftoverRegex = recursivelyOptimizeSubgroups(leftoverRanges, leftoverRegex); + String optimizedRegex = leftoverRegex + "|" + generateFactorizedRegex(subgraphRanges.get()); + if (optimizedRegex.length() < regex.length()) { + regex = optimizedRegex; + } + } + return regex; + } + + private String generateFactorizedRegex(RangeTree ranges) { + String regex = regexOf(ranges); + if (factorizeDfa) { + regex = generateFactorizedRegex(ranges, regex, REQUIRE_EQUAL_EDGES); + regex = generateFactorizedRegex(ranges, regex, ALLOW_EDGE_SPLITTING); + } + return regex; + } + + private String generateFactorizedRegex(RangeTree dfa, String bestRegex, MergeStrategy strategy) { + String factoredRegex = RangeTreeFactorizer.factor(dfa, strategy).stream() + .map(this::regexOf) + .collect(joining("|")); + return factoredRegex.length() < bestRegex.length() ? 
factoredRegex : bestRegex; + } + + private String regexOf(RangeTree ranges) { + ValueGraph nfa = RangeTreeConverter.toNfaGraph(ranges); + if (optimizeTail) { + nfa = TrailingPathOptimizer.optimize(nfa); + } + return EdgeWriter.toRegex(NfaFlattener.flatten(nfa), useDotMatch); + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/SubgroupOptimizer.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/SubgroupOptimizer.java new file mode 100644 index 0000000000..8c612379ff --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/SubgroupOptimizer.java @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.base.Preconditions.checkNotNull; +import static com.google.common.collect.ImmutableList.toImmutableList; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.LinkedHashMultiset; +import com.google.common.collect.Multiset; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.stream.IntStream; +import javax.annotation.Nullable; + +/** + * An optimization for RangeTree DFAs which attempts to isolate and extract subgraphs which would + * otherwise cause a lot of repetition in the generated regular expression. + */ +public final class SubgroupOptimizer { + /** + * Returns the subgraph which is likely to cause the most repetition in the regular expression + * of the given DFA. Subtracting the result out of the original range tree and generating two + * distinct regular expressions is likely to be shorter than the regular expression of the + * original range. + */ + public static Optional extractRepeatingSubgraph(RangeTree ranges) { + return LinkNodeVisitor + .findBridgingNode(ranges) + .flatMap(n -> SubgraphExtractionVisitor.extractSubgraph(ranges, n)); + } + + /** + * A visitor which applies two types of weights to every interior node in a DFA. + *
      + *
    • A count of incoming edges to that node. + *
    • A count of all edges in the subgraph rooted at that node. + *
    + * These are then multiplied together using the cost function: + *
    cost(n) = subgraph-weight(n) * (in-order(n) - 1)
    + * get get a proxy for the cost of additional duplicates likely to be created by this node. + */ + static class LinkNodeVisitor implements DfaVisitor { + // Reasonable approximation for the cost of an edge in a subgraph is the length of the + // corresponding range specification (it doesn't work so well for repeated edges like + // 'xxxxxxxx' --> "\d{8}", but it's good to help break ties in the cost function). + private static final ImmutableList EDGE_WEIGHTS = + IntStream.rangeClosed(1, 0x3FF) + .mapToObj(m -> RangeSpecification.toString(m).length()) + .collect(toImmutableList()); + + // Important to use "linked" multisets here (at least for the one we iterate over) since + // otherwise we end up with non-deterministic regular expression generation. + private final Multiset inOrder = LinkedHashMultiset.create(); + private final Multiset subgraphWeight = LinkedHashMultiset.create(); + + /** + * Returns the interior node whose subgraph is likely to cause the most repetition in the + * regular expression of the given DFA. + */ + static Optional findBridgingNode(RangeTree ranges) { + checkArgument(!ranges.isEmpty(), "cannot visit empty ranges"); + LinkNodeVisitor v = new LinkNodeVisitor(); + ranges.accept(v); + return Optional.ofNullable(v.getHighestCostNode()); + } + + private static int getEdgeWeight(DfaEdge edge) { + // Subtract 1 since the array is 1-based (a zero edge mask is not legal). + return EDGE_WEIGHTS.get(edge.getDigitMask() - 1); + } + + @VisibleForTesting + int getSubgraphWeight(DfaNode n) { + return subgraphWeight.count(n); + } + + @VisibleForTesting + int getInOrder(DfaNode n) { + return inOrder.count(n); + } + + // This returns null if no edge has a cost greater than zero. 
Since the cost function uses + // (in-order(n) - 1) this is trivially true for any graph where all interior nodes have only + // a single in-edge (the terminal node can have more than one in-edge, but it has a weight of + // zero and the initial node is never considered a candidate). + @VisibleForTesting + @Nullable + DfaNode getHighestCostNode() { + DfaNode node = null; + int maxWeight = 0; + for (DfaNode n : inOrder.elementSet()) { + int weight = getSubgraphWeight(n) * (getInOrder(n) - 1); + if (weight > maxWeight) { + maxWeight = weight; + node = n; + } + } + return node; + } + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + // The weight is zero only if we haven't visited this node before (or it's the terminal). + int targetWeight = subgraphWeight.count(target); + if (targetWeight == 0 && !target.equals(RangeTree.getTerminal())) { + target.accept(this); + targetWeight = subgraphWeight.count(target); + } + // Add an extra one for the edge we are processing now and increment our target's in-order. + subgraphWeight.add(source, targetWeight + getEdgeWeight(edge)); + inOrder.add(target); + } + } + + /** + * A visitor to extract the subgraph of a DFA which passes through a specified interior + * "bridging" node. + */ + private static class SubgraphExtractionVisitor implements DfaVisitor { + private final DfaNode bridgingNode; + private final List paths = new ArrayList<>(); + private RangeSpecification path = RangeSpecification.empty(); + private boolean sawBridgingNode = false; + private boolean splitHappens = false; + + /** Returns the subgraph which passes through the specified node. */ + static Optional extractSubgraph(RangeTree ranges, DfaNode node) { + SubgraphExtractionVisitor v = new SubgraphExtractionVisitor(node); + ranges.accept(v); + // Only return proper subgraphs. + return v.splitHappens ? 
Optional.of(RangeTree.from(v.paths)) : Optional.empty(); + } + + private SubgraphExtractionVisitor(DfaNode bridgingNode) { + this.bridgingNode = checkNotNull(bridgingNode); + } + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + RangeSpecification oldPath = path; + path = path.extendByMask(edge.getDigitMask()); + // Potentially emit paths for any terminating node (not just the end of the graph). We have + // to extract the entire sub-graph _after_ the bridging node, including terminating nodes. + if (target.canTerminate()) { + // Emit path if we are "below" the bridging node. + if (sawBridgingNode) { + paths.add(path); + } else { + // Records that there were other paths not in the subgroup (since we only want to return + // a new DFA that's a proper subgraph of the original graph). + splitHappens = true; + } + } + if (target.equals(bridgingNode)) { + // Recurse with the flag set to emit paths once we hit the terminal node (note that the + // bridging node cannot be the terminal node). + sawBridgingNode = true; + target.accept(this); + sawBridgingNode = false; + } else { + // Recurse normally regardless of the flag. + target.accept(this); + } + path = oldPath; + } + } +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizer.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizer.java new file mode 100644 index 0000000000..48c0e96c62 --- /dev/null +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizer.java @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static java.util.Comparator.naturalOrder; +import static java.util.stream.Collectors.toList; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; +import com.google.common.graph.Graphs; +import com.google.common.graph.MutableValueGraph; +import com.google.common.graph.ValueGraph; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Optional; + +/** + * Optimizer for NFA graphs which attempts to restructure the trailing paths to maximize sharing + * and hopefully minimize the amount of duplication in the resulting regular expression. + */ +public final class TrailingPathOptimizer { + /** + * Optimizes an NFA graph to make trailing "any digit" sequences common where possible. In many + * cases this will result in no change to the structure of the NFA (common trailing paths are + * not a feature of every NFA), but in some cases a substantial reduction in duplication can + * occur. + * + *

    This is equivalent to recognizing that {@code "12\d{2}\d{2}?|34\d{2}|56\d{3}"} can be + * written as {@code "(?:12\d{2}?|34|56\d)\d{2}"}. + */ + public static ValueGraph optimize(ValueGraph graph) { + MutableValueGraph out = Graphs.copyOf(graph); + + // Build a map of trailing "any digit" sequences (key is the node it starts from). + Map anyPaths = new HashMap<>(); + recursivelyDetachTrailingPaths(Node.TERMINAL, AnyPath.EMPTY, out, anyPaths); + + // If the terminal node has no "any digit" sequences leading to it, there's nothing we can do + // (well not in this simplistic algorithm anyway). This should almost never happen for phone + // number matching graphs as it implies a match expression that can terminate at a precise + // digit, rather than any digit. The only time this might occur is for short-codes, but due to + // their size it's likely to be fine if we don't try to aggressively optimize them. + if (anyPaths.size() == 1 && anyPaths.containsKey(Node.TERMINAL)) { + return graph; + } + // This is just a way to find a node from which we can start generating new nodes. + Node lastAddedNode = out.nodes().stream().max(naturalOrder()).get(); + + // Process paths from short to long (since some paths are sub-paths of longer ones). + List shortestPathsFirst = anyPaths.entrySet().stream() + .sorted(Comparator.comparing(Entry::getValue)) + .map(Entry::getKey) + .collect(toList()); + Node pathEnd = Node.TERMINAL; + while (true) { + // Start with the next path that might be a factor of all the remaining paths. + Node shortestPathNode = shortestPathsFirst.get(0); + AnyPath shortestPath = anyPaths.get(shortestPathNode); + int pathsToFactor = shortestPathsFirst.size() - 1; + if (pathsToFactor == 0) { + // If all paths are factored, we're done. + break; + } + // Factor all the remaining paths by the shortest path (where a missing result means it + // cannot be factored). 
+ ImmutableList factored = shortestPathsFirst.stream() + .skip(1) + .map(n -> anyPaths.get(n).factor(shortestPath)) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(toImmutableList()); + // If not all the remaining paths have the shortest path as a common factor, we're done (in + // this simplistic algorithm we don't consider cases where an AnyPath is the factor of some, + // but not all, other paths; we could but it's far less likely to reduce regex size). + if (factored.size() < pathsToFactor) { + break; + } + // Shortest path is a factor of all remaining paths, so add a new path to the graph for it. + lastAddedNode = addPath(shortestPathNode, pathEnd, shortestPath, lastAddedNode, out); + // We're done with this path, but might still be able to find more factors of remaining paths. + anyPaths.remove(shortestPathNode); + shortestPathsFirst.remove(0); // index, not value. + // The newly factored edges now replace the original factors in the map. + for (int n = 0; n < factored.size(); n++) { + Preconditions.checkState(anyPaths.containsKey(shortestPathsFirst.get(n))); + anyPaths.put(shortestPathsFirst.get(n), factored.get(n)); + } + // We now connect any new factored edges to the node we just added (not the terminal node). + pathEnd = shortestPathNode; + } + // If we exit, we must still reconnect any remaining, unfactored, paths to the graph. + for (Map.Entry e : anyPaths.entrySet()) { + lastAddedNode = addPath(e.getKey(), pathEnd, e.getValue(), lastAddedNode, out); + } + return out; + } + + /** + * Recursively build up a map of trailing "any digit" sequences (AnyPath), starting from some + * current node (initially the terminal node) and working backwards. The key in the map is the + * node at which the AnyPath value starts from. Edges and nodes are removed from the graph, + * leaving "ragged" paths which will need to be reconnected later (the keys in the map are the + * set of nodes that need to be reconnected). 
+ * + * @return whether the given node is the start of an AnyPath (i.e. if it immediately follows any + * edges which are not "any digit" sequences). + */ + private static boolean recursivelyDetachTrailingPaths( + Node node, AnyPath path, MutableValueGraph g, Map anyPaths) { + if (beginsAnAnyPath(node, g)) { + anyPaths.put(node, path); + return true; + } + // All incoming edges accept all digits, so we can recurse (but don't traverse epsilons). + List sources = g.predecessors(node).stream() + .filter(s -> !g.edgeValue(s, node).get().equals(Edge.epsilon())) + .collect(toList()); + for (Node source : sources) { + AnyPath newPath = path.extend(canTerminate(source, g)); + // Recurse to remove trailing paths higher in the tree and keep this source node only if + // recursion stopped here. + boolean keepSourceNode = recursivelyDetachTrailingPaths(source, newPath, g, anyPaths); + g.removeEdge(source, node); + // This removes the epsilon if it exists (and does nothing otherwise). This is safe since we + // know the other out-edge of this node accepts all digits, so the only remaining type of + // edge that could exist is an epsilon. After removing both we expect not to find any others. + g.removeEdge(source, Node.TERMINAL); + Preconditions.checkState(g.outDegree(source) == 0, "unexpected out edges in trailing graph"); + // If we were able to recurse past this node, it can be removed. + if (!keepSourceNode) { + g.removeNode(source); + } + } + return false; + } + + /** + * Returns whether the given node has incoming edges that do not just accept "any digit". This is + * the point at which recursion must stop since AnyPath can only represent "any digit" sequences. + */ + private static boolean beginsAnAnyPath(Node target, ValueGraph g) { + // Obviously we cannot recurse past the initial node. 
+ if (target == Node.INITIAL) { + return true; + } + return g.predecessors(target).stream() + .map(s -> g.edgeValue(s, target).get()) + .filter(e -> !e.equals(Edge.epsilon())) + .anyMatch(e -> e.getDigitMask() != ALL_DIGITS_MASK); + } + + /** + * Returns whether this node can terminate. This logic relies on the input graph not having had + * its epsilon edges moved (i.e. if an epsilon edge exists it must point to the terminal node). + * This also looks for special "optional" edges which exist when a non-epsilon edge already + * exists from this node to the terminal node. + */ + private static boolean canTerminate(Node node, ValueGraph g) { + return g.successors(node).stream() + .map(t -> g.edgeValue(node, t).get()) + .anyMatch(e -> e.isOptional() || e.equals(Edge.epsilon())); + } + + /** Adds the given "AnyPath" into the graph, generating new nodes and edges as necessary. */ + private static Node addPath( + Node node, Node end, AnyPath path, Node lastAdded, MutableValueGraph out) { + // Path length is always at least 1 for an AnyPath. + int pathLength = path.maxLength(); + for (int n = 0; n < pathLength - 1; n++) { + if (path.acceptsLength(n)) { + out.putEdgeValue(node, end, Edge.epsilon()); + } + lastAdded = lastAdded.createNext(); + out.addNode(lastAdded); + out.putEdgeValue(node, lastAdded, Edge.any()); + node = lastAdded; + } + // For the last edge we cannot add a parallel epsilon path if we need to skip to the end, + // so add the special "optional any" edge instead. + out.putEdgeValue( + node, end, path.acceptsLength(pathLength - 1) ? 
Edge.optionalAny() : Edge.any()); + return lastAdded; + } + + private TrailingPathOptimizer() {} +} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java index bb8329c97c..8434e13177 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvParser.java @@ -73,19 +73,23 @@ public void accept(Stream row) { } else { ImmutableMap.Builder map = ImmutableMap.builder(); // Not a pure lambda due to the need to index columns. - row.forEach(new Consumer() { - private int i = 0; - - @Override - public void accept(String v) { - checkArgument(i < header.size(), - "too many columns (expected %s): %s", header.size(), map); - if (!v.isEmpty()) { - map.put(header.get(i++), v); - } - } - }); - handler.accept(map.build()); + row.forEach( + new Consumer() { + private int i = 0; + + @Override + public void accept(String v) { + checkArgument( + i < header.size(), + "too many columns (expected %s): %s", + header.size(), + map); + if (!v.isEmpty()) { + map.put(header.get(i++), v); + } + } + }); + handler.accept(map.buildOrThrow()); } } }; diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java index 7a96596c93..98a287647e 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/CsvTable.java @@ -582,7 +582,7 @@ public static String unescapeSingleLineCsvText(String s) { .put('r', '\r') .put('t', '\t') .put('\\', '\\') - .build(); + .buildOrThrow(); // Visible for AutoValue only. 
CsvTable() {} diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java index c7db3f78d9..fdfe7fe065 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/RangeTable.java @@ -628,7 +628,7 @@ public > ImmutableMap getPrefixMap( RangeTree include = getRanges(column, value); map.put(value, PrefixTree.minimal(include, allRanges.subtract(include), minPrefixLength)); } - return map.build(); + return map.buildOrThrow(); } // Constants for the simplification routine below. diff --git a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java index eaf4e72367..9abb3af883 100644 --- a/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java +++ b/metadata/src/main/java/com/google/i18n/phonenumbers/metadata/table/Schema.java @@ -55,7 +55,7 @@ public Builder add(ColumnGroup group) { } public Schema build() { - return new AutoValue_Schema(names.build(), columns.build(), groups.build()); + return new AutoValue_Schema(names.build(), columns.buildOrThrow(), groups.buildOrThrow()); } } diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/LengthsParserTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/LengthsParserTest.java new file mode 100644 index 0000000000..1b2b1e18de --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/LengthsParserTest.java @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2022 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata; + +import static com.google.common.truth.Truth.assertThat; +import static org.junit.Assert.assertThrows; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public final class LengthsParserTest { + + @Test + public void shouldThrowIfStringContainsForbiddenCharacters() { + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("a-6,7")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("8, B, C")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("8, ,10")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("4, +7-9, +11")); + } + + @Test + public void shouldThrowIfNumbersAreOutOfOrder() { + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("9-7")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("8,12-11")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("5,4,7-8")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("6-8, 7-9")); + } + + @Test + public void shouldThrowIfFormatIsWrong() { + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("4-6-8")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("7-")); + assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("3, -7")); + 
assertThrows(IllegalArgumentException.class, () -> LengthsParser.parseLengths("1 2-3 4, 5 6")); + } + + @Test + public void testParseSingletons() { + assertThat(LengthsParser.parseLengths("8")).containsExactly(8); + assertThat(LengthsParser.parseLengths("14")).containsExactly(14); + } + + @Test + public void testParseCommaSeparatedNumbers() { + assertThat(LengthsParser.parseLengths("6,8,9")).containsExactly(6, 8, 9); + assertThat(LengthsParser.parseLengths("13, 14")).containsExactly(13, 14); + } + + @Test + public void testParseRanges() { + assertThat(LengthsParser.parseLengths("6-8")).containsExactly(6, 7, 8); + assertThat(LengthsParser.parseLengths("13 - 14")).containsExactly(13, 14); + } + + @Test + public void testParseComplex() { + assertThat(LengthsParser.parseLengths("4,7,9-12")).containsExactly(4, 7, 9, 10, 11, 12); + assertThat(LengthsParser.parseLengths("4-6, 8, 10-12")).containsExactly(4, 5, 6, 8, 10, 11, 12); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java index 6869f415d7..5bf1e4aaf6 100644 --- a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/RangeSpecificationTest.java @@ -20,8 +20,8 @@ import static com.google.i18n.phonenumbers.metadata.DigitSequence.domain; import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; import static com.google.i18n.phonenumbers.metadata.RangeSpecification.parse; -import static java.util.Arrays.asList; import static com.google.i18n.phonenumbers.metadata.testing.AssertUtil.assertThrows; +import static java.util.Arrays.asList; import com.google.common.collect.ImmutableRangeSet; import com.google.common.collect.Range; diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcherTest.java 
b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcherTest.java new file mode 100644 index 0000000000..f089c8f5ab --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcherTest.java @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.INVALID; +import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.MATCHED; +import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.TOO_LONG; +import static com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result.TOO_SHORT; + +import com.google.common.base.CharMatcher; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.DigitSequence; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler; +import 
com.google.i18n.phonenumbers.metadata.regex.RegexGenerator; +import java.util.Arrays; +import java.util.regex.Pattern; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class DigitSequenceMatcherTest { + + @Test public void testStringDigits() { + DigitSequence digits = DigitSequenceMatcher.digitsFromString("1234"); + + Assert.assertTrue(digits.hasNext()); + Assert.assertEquals(1, digits.next()); + Assert.assertTrue(digits.hasNext()); + Assert.assertEquals(2, digits.next()); + Assert.assertTrue(digits.hasNext()); + Assert.assertEquals(3, digits.next()); + Assert.assertTrue(digits.hasNext()); + Assert.assertEquals(4, digits.next()); + Assert.assertFalse(digits.hasNext()); + } + + @Test public void testSingleDigitMatching() { + assertNotMatches(ranges("0"), INVALID, "1", "9"); + assertNotMatches(ranges("0"), TOO_LONG, "00"); + + assertMatches(ranges("x"), "0", "5", "9"); + assertNotMatches(ranges("x"), TOO_SHORT, ""); + assertNotMatches(ranges("x"), TOO_LONG, "00"); + + assertMatches(ranges("[2-6]"), "2", "3", "4", "5", "6"); + assertNotMatches(ranges("[2-6]"), INVALID, "0", "1", "7", "8", "9"); + assertNotMatches(ranges("[2-6]"), TOO_LONG, "26"); + } + + @Test public void testOptional() { + RangeTree dfa = ranges("12", "123"); + assertMatches(ranges("12", "123"), "12", "123"); + assertNotMatches(dfa, TOO_SHORT, "1"); + assertNotMatches(dfa, INVALID, "13"); + assertNotMatches(dfa, TOO_LONG, "1233"); + } + + @Test public void testRepetition() { + assertMatches(ranges("12xx", "12xxx", "12xxxx"), "1234", "12345", "123456"); + } + + @Test public void testOr() { + RangeTree dfa = ranges("01", "23"); + assertMatches(dfa, "01", "23"); + assertNotMatches(dfa, INVALID, "03", "12"); + assertNotMatches(dfa, TOO_SHORT, "0", "2"); + assertNotMatches(dfa, TOO_LONG, "011", "233"); + + assertMatches(ranges("01", "23", "45", "6789"), "01", "23", "45", "6789"); + } + + @Test public 
void testRealRegexShort() { + RangeTree dfa = ranges( + "11[2-7]xxxxxxx", + "2[02][2-7]xxxxxxx", + "33[2-7]xxxxxxx", + "4[04][2-7]xxxxxxx", + "79[2-7]xxxxxxx", + "80[2-467]xxxxxxx"); + + assertMatches(dfa, "112 1234567", "797 1234567", "807 1234567"); + assertNotMatches(dfa, TOO_SHORT, "112 123", "797 12345", "807 123456"); + assertNotMatches(dfa, TOO_LONG, "112 12345678", "797 123456789"); + assertNotMatches(dfa, INVALID, "122 1234567", "799 1234567", "805 1234567"); + } + + @Test public void testRealRegexLong() { + RangeTree dfa = ranges( + "12[0-249][2-7]xxxxxx", + "13[0-25][2-7]xxxxxx", + "14[145][2-7]xxxxxx", + "1[59][14][2-7]xxxxxx", + "16[014][2-7]xxxxxx", + "17[1257][2-7]xxxxxx", + "18[01346][2-7]xxxxxx", + "21[257][2-7]xxxxxx", + "23[013][2-7]xxxxxx", + "24[01][2-7]xxxxxx", + "25[0137][2-7]xxxxxx", + "26[0158][2-7]xxxxxx", + "278[2-7]xxxxxx", + "28[1568][2-7]xxxxxx", + "29[14][2-7]xxxxxx", + "326[2-7]xxxxxx", + "34[1-3][2-7]xxxxxx", + "35[34][2-7]xxxxxx", + "36[01489][2-7]xxxxxx", + "37[02-46][2-7]xxxxxx", + "38[159][2-7]xxxxxx", + "41[36][2-7]xxxxxx", + "42[1-47][2-7]xxxxxx", + "43[15][2-7]xxxxxx", + "45[12][2-7]xxxxxx", + "46[126-9][2-7]xxxxxx", + "47[0-24-9][2-7]xxxxxx", + "48[013-57][2-7]xxxxxx", + "49[014-7][2-7]xxxxxx", + "5[136][25][2-7]xxxxxx", + "522[2-7]xxxxxx", + "54[28][2-7]xxxxxx", + "55[12][2-7]xxxxxx", + "5[78]1[2-7]xxxxxx", + "59[15][2-7]xxxxxx", + "612[2-7]xxxxxx", + "6[2-4]1[2-7]xxxxxx", + "65[17][2-7]xxxxxx", + "66[13][2-7]xxxxxx", + "67[14][2-7]xxxxxx", + "680[2-7]xxxxxx", + "712[2-7]xxxxxx", + "72[14][2-7]xxxxxx", + "73[134][2-7]xxxxxx", + "74[47][2-7]xxxxxx", + "75[15][2-7]xxxxxx", + "7[67]1[2-7]xxxxxx", + "788[2-7]xxxxxx", + "816[2-7]xxxxxx", + "82[014][2-7]xxxxxx", + "83[126][2-7]xxxxxx", + "86[136][2-7]xxxxxx", + "87[078][2-7]xxxxxx", + "88[34][2-7]xxxxxx", + "891[2-7]xxxxxx"); + + assertMatches(dfa, "364 2 123456", "674 4 123456", "883 7 123456"); + assertNotMatches(dfa, TOO_SHORT, "364 2 123", "674 4 1234", "883 7 12345"); + 
assertNotMatches(dfa, TOO_LONG, "364 2 1234567", "674 4 12345678"); + assertNotMatches(dfa, INVALID, + "365 2 123456", "364 8 123456", "670 4 123456", "670 5 123456", "892 2 123456"); + } + + private static RangeTree ranges(String... lines) { + return RangeTree.from(Arrays.stream(lines).map(RangeSpecification::parse)); + } + + private static void assertMatches(RangeTree dfa, String... numbers) { + checkRegex(dfa, true, numbers); + byte[] matcherData = MatcherCompiler.compile(dfa); + + DigitSequenceMatcher matcher = DigitSequenceMatcher.create(matcherData); + assertMatcher(matcher, MATCHED, numbers); + } + + private static void assertNotMatches(RangeTree dfa, Result error, String... numbers) { + checkArgument(error != MATCHED); + checkRegex(dfa, false, numbers); + byte[] matcherData = MatcherCompiler.compile(dfa); + DigitSequenceMatcher matcher = DigitSequenceMatcher.create(matcherData); + assertMatcher(matcher, error, numbers); + } + + private static void checkRegex(RangeTree dfa, boolean expectMatch, String... numbers) { + Pattern pattern = Pattern.compile(RegexGenerator.basic().toRegex(dfa)); + for (String number : numbers) { + checkArgument(expectMatch == pattern.matcher(noSpace(number)).matches(), + "regex %s could not match input %s", dfa.asRangeSpecifications(), number); + } + } + + private static void assertMatcher( + DigitSequenceMatcher matcher, Result expected, String... 
numbers) { + for (final String number : numbers) { + Assert.assertEquals(expected, + matcher.match(DigitSequenceMatcher.digitsFromString(noSpace(number)))); + } + } + + private static String noSpace(String input) { + return CharMatcher.whitespace().removeFrom(input); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/CompilerRegressionTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/CompilerRegressionTest.java new file mode 100644 index 0000000000..e8a96776db --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/CompilerRegressionTest.java @@ -0,0 +1,317 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertWithMessage; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static java.lang.Integer.bitCount; +import static java.lang.Integer.lowestOneBit; +import static java.lang.Integer.numberOfTrailingZeros; + +import com.google.common.collect.Multimap; +import com.google.common.collect.MultimapBuilder; +import com.google.common.collect.SetMultimap; +import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto; +import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto.TestCase; +import com.google.i18n.phonenumbers.internal.finitestatematcher.compiler.RegressionTestProto.Tests; +import com.google.i18n.phonenumbers.metadata.DigitSequence; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaEdge; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaVisitor; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.DigitSequenceMatcher.Result; +import com.google.protobuf.ByteString; +import com.google.protobuf.TextFormat; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.PrintWriter; +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class CompilerRegressionTest { + // Tests that the compiler 
produces the expected output, byte-for-byte. + @Test + public void testCompiledBytesEqualExpectedMatcherBytes() throws IOException { + StringWriter buffer = new StringWriter(); + PrintWriter errors = new PrintWriter(buffer); + try (InputStream data = + CompilerRegressionTest.class.getResourceAsStream("regression_test_data.textpb")) { + Tests.Builder tests = RegressionTestProto.Tests.newBuilder(); + TextFormat.merge(new InputStreamReader(data, StandardCharsets.UTF_8), tests); + for (TestCase tc : tests.getTestCaseList()) { + byte[] actual = MatcherCompiler.compile(ranges(tc.getRangeList())); + byte[] expected = combine(tc.getExpectedList()); + int diffIndex = indexOfDiff(actual, expected); + if (!tc.getShouldFail()) { + if (diffIndex != -1) { + errors.format("FAILED [%s]: First difference at index %d\n", tc.getName(), diffIndex); + errors.format("Actual : %s\n", formatPbSnippet(actual, diffIndex, 20)); + errors.format("Expected: %s\n", formatPbSnippet(expected, diffIndex, 20)); + writeGoldenPbOutput(actual, errors); + } + } else { + if (diffIndex == -1) { + errors.format("FAILED [%s]: Expected difference, but got none\n", tc.getName()); + } + } + } + } + String errorMessage = buffer.toString(); + if (!errorMessage.isEmpty()) { + assertWithMessage(errorMessage).fail(); + } + } + + // Test that the matcher behaves correctly with respect to the input ranges using the expected + // byte sequences. If this test fails, then the matcher implementation is doing something wrong, + // or the expected bytes were generated incorrectly (either by hand or from the compiler). + // + // IMPORTANT: This test tests that the expected bytes (rather than the compiled bytes) match the + // numbers in the ranges. This avoids the risk of any bugs in both the matcher and compiler + // somehow cancelling each other out. However this also means that this test depends on the + // equality test above for validity (i.e. 
this test can pass even if the matcher compiler is + // broken, so it should not be run in isolation when debugging). + @Test + public void testExpectedMatcherBytesMatchRanges() throws IOException { + try (InputStream data = + CompilerRegressionTest.class.getResourceAsStream("regression_test_data.textpb")) { + RegressionTestProto.Tests.Builder tests = RegressionTestProto.Tests.newBuilder(); + TextFormat.merge(new InputStreamReader(data, StandardCharsets.UTF_8), tests); + for (TestCase tc : tests.getTestCaseList()) { + RangeTree ranges = ranges(tc.getRangeList()); + // If we compiled the ranges here, we could risk a situation where the compiled bytes were + // broken but the compiler had a corresponding bug that cancelled it out. This test only + // tests the matcher behaviour, whereas the test above only tests the compiler behaviour. + DigitSequenceMatcher matcher = DigitSequenceMatcher.create(combine(tc.getExpectedList())); + Multimap numbers = buildTestNumbers(ranges); + if (!tc.getShouldFail()) { + testExpectedMatch(tc.getName(), matcher, numbers); + } else { + testExpectedFailure(tc.getName(), matcher, numbers); + } + } + } + } + + private static void testExpectedMatch(String testName, DigitSequenceMatcher matcher, + Multimap numbers) { + for (Result expectedResult : Result.values()) { + for (DigitSequence s : numbers.get(expectedResult)) { + Result result = matcher.match(new Sequence(s)); + assertWithMessage("FAILED [%s]: Sequence %s", testName, s) + .that(result).isEqualTo(expectedResult); + } + } + } + + private static void testExpectedFailure(String testName, DigitSequenceMatcher matcher, + Multimap numbers) { + for (Result expectedResult : Result.values()) { + for (DigitSequence s : numbers.get(expectedResult)) { + Result result = matcher.match(new Sequence(s)); + if (result != expectedResult) { + return; + } + } + } + assertWithMessage("FAILED [%s]: Expected at least one failure", testName).fail(); + } + + // Magic number: DigitSequences cannot be longer 
than 18 digits at the moment, so a check is + // needed to prevent us trying to make a longer-than-allowed sequences in tests. This only + // happens in the case of a terminal node, since non-terminal paths must be < 17 digits long. + // If the allowed digits increases, this value can be modified or left as-is. + private static final int MAX_SEQUENCE_LENGTH = 18; + + // Trivial adapter from the metadata DigitSequence to the matcher's lightweight sequence. + private static final class Sequence implements DigitSequenceMatcher.DigitSequence { + private final DigitSequence seq; + private int index = 0; + + Sequence(DigitSequence seq) { + this.seq = seq; + } + + @Override + public boolean hasNext() { + return index < seq.length(); + } + + @Override + public int next() { + return seq.getDigit(index++); + } + } + + // Returns a RangeTree for the list of RangeSpecification strings. + RangeTree ranges(List specs) { + return RangeTree.from(specs.stream().map(RangeSpecification::parse).collect(toImmutableList())); + } + + // Builds a map of numbers for the given RangeTree to test every branching point in the DFA. + // All paths combinations are generated exactly once to give coverage. This does use pseudo + // random numbers to pick random digits from masks, but it should not be flaky. If it _ever_ + // fails then it implies a serious problem with the matcher compiler or matcher implementation. + private static Multimap buildTestNumbers(RangeTree ranges) { + SetMultimap numbers = + MultimapBuilder.enumKeys(Result.class).treeSetValues().build(); + Set visited = new HashSet<>(); + ranges.accept(new Visitor(RangeSpecification.empty(), numbers, visited)); + return numbers; + } + + /** + * Visitor to generate a targeted set of test numbers from a range tree DFA, which should + * exercise every instruction in the corresponding matcher data. These numbers should ensure + * that every "branch" (including early terminations) is taken at least once. 
Where digits + * should be equivalent (i.e. both x & y have the same effect) they are chosen randomly, since + * otherwise you would need to generate billions of numbers to cover every possible combination. + */ + private static final class Visitor implements DfaVisitor { + private final RangeSpecification sourcePath; + private final SetMultimap numbers; + private final Set visited; + private int outEdgesMask = 0; + + Visitor(RangeSpecification sourcePath, + SetMultimap numbers, + Set visited) { + this.sourcePath = sourcePath; + this.numbers = numbers; + this.visited = visited; + } + + @Override + public void visit(DfaNode source, DfaEdge edge, DfaNode target) { + // Record the current outgoing edge mask. + int mask = edge.getDigitMask(); + outEdgesMask |= mask; + // Get the current path and add a test number for it. + RangeSpecification path = sourcePath.extendByMask(mask); + numbers.put(target.canTerminate() ? Result.MATCHED : Result.TOO_SHORT, sequenceIn(path)); + // Avoid recursing into nodes we've already visited. This avoids generating many (hundreds) + // of test numbers for nodes which are reachable in many ways (via many path prefixes). This + // is an optional check and could be removed, but for testing larger ranges it seems to make + // a difference in test time. DFA node/instruction coverage should be unaffected by this. + if (visited.contains(target)) { + return; + } + visited.add(target); + // Recurse into the next level with a new visitor starting from our path (it's okay to visit + // the terminal node here since it does nothing and leaves the out edges mask zero). + Visitor childVisitor = new Visitor(path, numbers, visited); + target.accept(childVisitor); + // After recursion, find out which of our target's out-edges cannot be reached. + int unreachableMask = ~childVisitor.outEdgesMask & ALL_DIGITS_MASK; + if (unreachableMask != 0 && path.length() < MAX_SEQUENCE_LENGTH) { + // Create a path which cannot be reached directly from our target node. 
If this is the + // terminal node then we create a path that's too long, otherwise it's just invalid. + Result expected = target.equals(RangeTree.getTerminal()) ? Result.TOO_LONG : Result.INVALID; + numbers.put(expected, sequenceIn(path.extendByMask(unreachableMask))); + } + } + } + + // Returns a pseudo randomly chosen sequence from the given path. + private static final DigitSequence sequenceIn(RangeSpecification path) { + DigitSequence seq = DigitSequence.empty(); + for (int n = 0; n < path.length(); n++) { + int mask = path.getBitmask(n); + // A random number M in [0..BitCount), not the bit itself. + // E.g. mask = 0011010011 ==> (0 <= maskBit < 5) (allowed digits are {0,1,4,6,7}) + int maskBit = (int) (bitCount(mask) * Math.random()); + // Mask out the M lower bits which come before the randomly selected one. + // E.g. maskBit = 3 ==> mask = 0011000000 (3 lower bits cleared) + while (maskBit > 0) { + mask &= ~lowestOneBit(mask); + maskBit--; + } + // Extend the sequence by the digit value of the randomly selected bit. + // E.g. mask = 0011000000 ==> digit = 6 (randomly chosen from the allowed digits). + seq = seq.extendBy(numberOfTrailingZeros(mask)); + } + return seq; + } + + // Combines multiple ByteStrings into a single byte[] (we allow splitting in the regression test + // file for readability. + private static byte[] combine(List bytes) { + int size = bytes.stream().mapToInt(ByteString::size).sum(); + byte[] out = new byte[size]; + int offset = 0; + for (ByteString b : bytes) { + b.copyTo(out, offset); + offset += b.size(); + } + return out; + } + + // Return the index of the first difference, or -1 is the byte arrays are the same. + private static int indexOfDiff(byte[] a, byte[] b) { + int length = Math.min(a.length, b.length); + for (int n = 0; n < length; n++) { + if (a[n] != b[n]) { + return n; + } + } + return (a.length == length && b.length == length) ? 
-1 : length; + } + + // Formats a subset of the bytes as a human readable snippet using C-style hex escaping (which + // is compatible with the regression test data). + private static String formatPbSnippet(byte[] bytes, int start, int length) { + StringBuilder out = new StringBuilder(); + if (start > 0) { + out.append("..."); + } + appendBytes(out, bytes, start, length); + if (start + length < bytes.length) { + out.append("..."); + } + return out.toString(); + } + + // Writes bytes such that they can be cut & pasted into a regression test file as new golden data. + private static void writeGoldenPbOutput(byte[] bytes, PrintWriter errors) { + errors.println("Golden Data:"); + StringBuilder out = new StringBuilder(); + for (int start = 0; start < bytes.length; start += 20) { + errors.format(" expected: \"%s\"\n", appendBytes(out, bytes, start, 20)); + out.setLength(0); + } + } + + // Appends a set of bytes in C-style hex format (e.g. \xHH). + private static StringBuilder appendBytes(StringBuilder out, byte[] bytes, int start, int length) { + int end = Math.min(start + length, bytes.length); + for (int n = start; n < end; n++) { + out.append(String.format("\\x%02x", bytes[n] & 0xFF)); + } + return out; + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompilerTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompilerTest.java new file mode 100644 index 0000000000..4a0fdc7a25 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/MatcherCompilerTest.java @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.common.primitives.Bytes.asList; +import static com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler.compile; + +import com.google.common.truth.Truth; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode; +import java.util.Arrays; +import java.util.List; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class MatcherCompilerTest { + + private static final Byte TERMINATOR = (byte) 0; + + @Test public void testSingleOperation() { + byte digit0 = single(0); + byte digit5 = single(5); + byte digit9 = single(9); + assertCompile(ranges("0"), digit0, TERMINATOR); + assertCompile(ranges("5"), digit5, TERMINATOR); + assertCompile(ranges("9"), digit9, TERMINATOR); + assertCompile(ranges("0559"), digit0, digit5, digit5, digit9, TERMINATOR); + + byte digit5Terminating = (byte) (digit5 | (1 << 4)); + assertCompile(ranges("05", "0559"), + digit0, digit5, digit5Terminating, digit9, TERMINATOR); + } + + @Test public void testAnyOperation() { + byte anyDigit = any(1); + byte anyDigit16Times = any(16); + assertCompile(ranges("x"), anyDigit, TERMINATOR); + assertCompile(ranges("xxxx_xxxx_xxxx_xxxx"), anyDigit16Times, 
TERMINATOR); + assertCompile(ranges("xxxx_xxxx_xxxx_xxxx_x"), + anyDigit16Times, anyDigit, TERMINATOR); + + byte anyDigitTerminating = (byte) (anyDigit | (1 << 4)); + assertCompile(ranges("x", "xx"), anyDigit, anyDigitTerminating, TERMINATOR); + assertCompile(ranges("xxxx_xxxx_xxxx_xxxx", "xxxx_xxxx_xxxx_xxxx_x"), + anyDigit16Times, anyDigitTerminating, TERMINATOR); + } + + @Test public void testRangeOperation() { + int range09 = range(0, 9); + int range123 = range(1, 2, 3); + int range789 = range(7, 8, 9); + + assertCompile(ranges("[09]"), hi(range09), lo(range09), TERMINATOR); + assertCompile(ranges("[123][789]"), + hi(range123), lo(range123), hi(range789), lo(range789), TERMINATOR); + } + + @Test public void testMapOperation() { + // Force all 10 possible branches to be taken. + byte[] data = compile(ranges("00", "11", "22", "33", "44", "55", "66", "77", "88", "99")); + // Check only the first 4 bytes for exact values. + Assert.assertEquals( + asList((byte) 0x95, (byte) 0x31, (byte) 0xF5, (byte) 0x9D), + asList(data).subList(0, 4)); + // Each branch should jump to a 2 byte sequence between 10 and 28 bytes away (inclusive). + List jumpTable = asList(data).subList(4, 14); + List remainder = asList(data).subList(14, data.length); + // TODO: Now that ordering should be consistent, tighten up this test to ensure + // consistency and remove the shorter consistency test below. + for (byte jump : new byte[] {0xA, 0xC, 0xE, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C}) { + Assert.assertTrue(jumpTable.contains(jump)); + int index = jumpTable.indexOf(jump); + // Subtract the length of the jump table to get relative offset in remaining code. + jump = (byte) (jump - 10); + // Each jump should end in 2 single-byte instructions (match corresponding digit, terminate). 
+ Assert.assertEquals(single(index), remainder.get(jump)); + Assert.assertEquals(TERMINATOR, remainder.get(jump + 1)); + } + } + + @Test public void testConsistentSorting() { + // Ensure that the MatcherCompiler output is consistent, otherwise it can result in a + // non-deterministic build, because the generated file changes with each execution. + byte[] expected = new byte[] {-128, 0, 0, 29, 3, 5, 7, 32, 0, 33, 0, 34, 0}; + assertCompile(ranges("00", "11", "22"), expected); + } + + /** Returns the 1-byte instruction representing matching a single digit once. */ + private static Byte single(int value) { + checkArgument(value >= 0 && value < 10); + return (byte) ((OpCode.SINGLE.ordinal() << 5) | value); + } + + /** Returns the 1-byte instruction representing matching any digit a specified number of times. */ + private static Byte any(int count) { + checkArgument(count > 0 && count <= 16); + return (byte) ((OpCode.ANY.ordinal() << 5) | (count - 1)); + } + + /** Returns the 2-byte instruction representing matching a range of digits. */ + private static int range(int... digits) { + int mask = 0; + for (int d : digits) { + checkArgument(0 <= d && d <= 9); + mask |= 1 << d; + } + return (OpCode.RANGE.ordinal() << 13) | mask; + } + + private static Byte hi(int shortInstruction) { + return (byte) (shortInstruction >> 8); + } + + private static Byte lo(int shortInstruction) { + return (byte) (shortInstruction & 0xFF); + } + + private void assertCompile(RangeTree dfa, byte... expected) { + Truth.assertThat(compile(dfa)).isEqualTo(expected); + } + + private static RangeTree ranges(String... 
lines) { + return RangeTree.from(Arrays.stream(lines).map(RangeSpecification::parse)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/OperationTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/OperationTest.java new file mode 100644 index 0000000000..bd8f1eeb62 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/OperationTest.java @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler; + +import static com.google.common.primitives.Bytes.asList; + +import com.google.common.collect.ImmutableList; +import com.google.common.io.ByteArrayDataOutput; +import com.google.common.io.ByteStreams; +import junit.framework.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class OperationTest { + + @Test public void testWriteJumpTableNoExtraBranches() { + ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(); + Operation.writeJumpTable(outBytes, ImmutableList.of(0x10, 0x80, 0xFC), Statistics.NO_OP); + // The jump table size is added to the offsets. 
+ Assert.assertEquals( + asList(new byte[] {(byte) 0x13, (byte) 0x83, (byte) 0xFF}), + asList(outBytes.toByteArray())); + } + + // An easy way to reason about what the offsets for the branches should be is to consider + // that the last branch must always have the original offset (it jumps from the very end of + // the jump table, which is exactly what the original offset specified). The branch before it + // is the same except that it must jump over the final branch (ie, +2 bytes) and so on. + // Direct offsets are relative to the start of the jump table however and must be adjusted. + @Test public void testWriteJumpTableExtraBranches() { + ByteArrayDataOutput outBytes = ByteStreams.newDataOutput(); + // Two extra branches needed (0x200 and 0xF7). Worst case adjustment is 9 bytes. + // Total adjustment is 7 bytes (jump table size + 2 * branch) + Operation.writeJumpTable(outBytes, ImmutableList.of(0xF7, 0xF6, 0x200), Statistics.NO_OP); + Assert.assertEquals(asList(new byte[] { + // Jump table: (offset-to-branch, direct-adjusted-offset, offset-to-branch) + (byte) 0x03, (byte) 0xFD, (byte) 0x05, + // Extra branch: offset = 0xF7 + 2 (jumps over last branch) + (byte) 0x10, (byte) 0xF9, + // Extra branch: offset = 0x200 (last branch always has original offset) + (byte) 0x12, (byte) 0x00}), + asList(outBytes.toByteArray())); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/regression_test_data.textpb b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/regression_test_data.textpb new file mode 100644 index 0000000000..d97225ccdf --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/compiler/regression_test_data.textpb @@ -0,0 +1,295 @@ +# Copyright (C) 2017 The Libphonenumber Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# ---- Manually crafted "unit" tests ---- + +test_case { + name: "Simple Range" + range: "1234xxx" + # 4 single byte, single value instructions: 0x20 + value + # 1 single byte, "ANY" instruction: 0x40 + (count-1) + expected: "\x21\x22\x23\x24\x42\x00" +} +test_case { + # NOTE: When the ANY instruction is marked as terminating, it applies when the instruction is + # reached, not after it's executed (i.e. \x50... is "(\d...)?", and not "\d(...)?"). + # Match 3 x ANY (0x42), then "terminate or ANY" (0x50), then 2 x ANY + name: "Variable Any Match #1" + range: "1xxx" + range: "1xxxxxx" + expected: "\x21\x42\x50\x41\x00" +} +test_case { + name: "Variable Any Match #2" + range: "1xxx" + range: "1xxxx" + range: "1xxxxx" + range: "1xxxxxx" + # A repeated terminating ANY match applies on every repeat, not just the first time. + # Match 3 x ANY (0x42 = \d{3}), then 3 x "terminate or ANY" (0x52 = \d{0,3}). + expected: "\x21\x42\x52\x00" +} +test_case { + name: "Overflow Any Match" + range: "xxxxxxxxxxxxxxxxxx" + # 18 'any' digits can't fit in one instruction, so write 2 separate opcodes to match 16 (0x4F) + # and then 2 (0x41). This will almost never occur since DigitSequence is limited to 18 digits. + expected: "\x4F\x41\x00" +} +test_case { + name: "Range Matching" + range: "[0-4]12" + # First 2 bytes are a "branch" operation (opcode = 0x60 plus mask), but there are no offsets + # after it (since one "branch" is just to continue matching, while the other is failure). 
+ expected: "\x60\x1F\x21\x22\x00" +} +test_case { + name: "Range Matching" + # Requires a 2-way branch in the DFA where both paths cover all input digits [0-9]. + range: "[0-4]12" + range: "[5-9]34" + # First 2 bytes are a 2-way branch operation (opcode = 0x68 plus mask), then 2 jump offsets + # from the end of the branch instruction. + expected: "\x68\x1F\x02\x05\x21\x22\x00\x23\x24\x00" +} + +# ---- Deliberate failure cases ---- + +test_case { + name: "Modified Single Match Bytecode" + should_fail: true + range: "123xxxx" + range: "123xxxxx" + range: "123xxxxxx" + # Expected bytes have been tweaked to accept 4 (\x24), rather than 3 (\x23). + expected: "\x21\x22\x24\x43\x51\x00" +} +test_case { + name: "Modified Range Bytecode" + should_fail: true + range: "1[2-5]xxxx" + # Expected bytes have been tweaked to accept [7-9] (\x63\x80), rather than [2-5] (\x60\x3C) + expected: "\x21\x63\x80\x43\x00" +} +test_case { + name: "Modified Any Match Bytecode" + should_fail: true + range: "1xxxx" + # Expected bytes have been tweaked to accept xxx (\x42), rather than xxxx (\x43) + expected: "\x21\x42\x00" +} + +# ---- Auto-generated "stress tests" ---- + +test_case { + name: "GB Mobile" + range: "7[1-3]xxxxxxxx" + range: "74[0-46-9]xxxxxxx" + range: "745[0-689]xxxxxx" + range: "7457[0-57-9]xxxxx" + range: "750[0-8]xxxxxx" + range: "75[13-9]xxxxxxx" + range: "752[0-35-9]xxxxxx" + range: "7700[01]xxxxx" + range: "770[1-9]xxxxxx" + range: "77[1-7]xxxxxxx" + range: "778[02-9]xxxxxx" + range: "779[0-689]xxxxxx" + range: "78[014-9]xxxxxxx" + range: "78[23][0-8]xxxxxx" + range: "79[024-9]xxxxxxx" + range: "791[02-9]xxxxxx" + range: "7911[028]xxxxx" + range: "793[0-689]xxxxxx" + # Not much insight here - other than it starts by matching a '7' and terminates in one place + # after matching "any digit" 5 times (which is the shortest trailing match in the ranges). 
+ expected: "\x27\x8c\xa8\x1a\x2a\x06\x09\x0d\x14\x1c\x20\x40\x10\x1e\x6b\xdf\x1c\x1f\x84\x44" + expected: "\x92\x5d\x1d\x16\x21\x88\x64\x92\x55\x1d\x0f\x21\x24\x6b\xf3\x09\x10\x82\x22\x49" + expected: "\x6d\x03\x1b\x18\x40\x10\x19\x6b\x7f\x17\x19\x61\xff\x10\x11\x63\xef\x0e\x68\x01" + expected: "\x11\x0c\x63\xfd\x07\x63\x7f\x04\x6b\xfd\x02\x0a\x40\x08\x63\xbf\x05\x60\x03\x02" + expected: "\x61\x05\x44\x00" +} +test_case { + name: "India Fixed Line" + range: "11[2-7]xxxxxxx" + range: "12[0-249][2-7]xxxxxx" + range: "12[35-8]x[2-7]xxxxx" + range: "13[0-25][2-7]xxxxxx" + range: "13[346-9]x[2-7]xxxxx" + range: "14[145][2-7]xxxxxx" + range: "14[236-9]x[2-7]xxxxx" + range: "1[59][0235-9]x[2-7]xxxxx" + range: "1[59][14][2-7]xxxxxx" + range: "16[014][2-7]xxxxxx" + range: "16[235-9]x[2-7]xxxxx" + range: "17[1257][2-7]xxxxxx" + range: "17[34689]x[2-7]xxxxx" + range: "18[01346][2-7]xxxxxx" + range: "18[257-9]x[2-7]xxxxx" + range: "2[02][2-7]xxxxxxx" + range: "21[134689]x[2-7]xxxxx" + range: "21[257][2-7]xxxxxx" + range: "23[013][2-7]xxxxxx" + range: "23[24-8]x[2-7]xxxxx" + range: "24[01][2-7]xxxxxx" + range: "24[2-8]x[2-7]xxxxx" + range: "25[0137][2-7]xxxxxx" + range: "25[25689]x[2-7]xxxxx" + range: "26[0158][2-7]xxxxxx" + range: "26[2-4679]x[2-7]xxxxx" + range: "27[13-79]x[2-7]xxxxx" + range: "278[2-7]xxxxxx" + range: "28[1568][2-7]xxxxxx" + range: "28[2-479]x[2-7]xxxxx" + range: "29[14][2-7]xxxxxx" + range: "29[235-9]x[2-7]xxxxx" + range: "301x[2-7]xxxxx" + range: "31[79]x[2-7]xxxxx" + range: "32[1-5]x[2-7]xxxxx" + range: "326[2-7]xxxxxx" + range: "33[2-7]xxxxxxx" + range: "34[13][2-7]xxxxxx" + range: "342[0189][2-7]xxxxx" + range: "342[2-7]xxxxxx" + range: "34[5-8]x[2-7]xxxxx" + range: "35[125689]x[2-7]xxxxx" + range: "35[34][2-7]xxxxxx" + range: "36[01489][2-7]xxxxxx" + range: "36[235-7]x[2-7]xxxxx" + range: "37[02-46][2-7]xxxxxx" + range: "37[157-9]x[2-7]xxxxx" + range: "38[159][2-7]xxxxxx" + range: "38[2-467]x[2-7]xxxxx" + range: "4[04][2-7]xxxxxxx" + range: 
"41[14578]x[2-7]xxxxx" + range: "41[36][2-7]xxxxxx" + range: "42[1-47][2-7]xxxxxx" + range: "42[5689]x[2-7]xxxxx" + range: "43[15][2-7]xxxxxx" + range: "43[2-467]x[2-7]xxxxx" + range: "45[12][2-7]xxxxxx" + range: "45[4-7]x[2-7]xxxxx" + range: "46[0-26-9][2-7]xxxxxx" + range: "46[35]x[2-7]xxxxx" + range: "47[0-24-9][2-7]xxxxxx" + range: "473x[2-7]xxxxx" + range: "48[013-57][2-7]xxxxxx" + range: "48[2689]x[2-7]xxxxx" + range: "49[014-7][2-7]xxxxxx" + range: "49[2389]x[2-7]xxxxx" + range: "51[025][2-7]xxxxxx" + range: "51[146-9]x[2-7]xxxxx" + range: "52[14-8]x[2-7]xxxxx" + range: "522[2-7]xxxxxx" + range: "53[1346]x[2-7]xxxxx" + range: "53[25][2-7]xxxxxx" + range: "54[14-69]x[2-7]xxxxx" + range: "54[28][2-7]xxxxxx" + range: "55[12][2-7]xxxxxx" + range: "55[46]x[2-7]xxxxx" + range: "56[146-9]x[2-7]xxxxx" + range: "56[25][2-7]xxxxxx" + range: "571[2-7]xxxxxx" + range: "57[2-4]x[2-7]xxxxx" + range: "581[2-7]xxxxxx" + range: "58[2-8]x[2-7]xxxxx" + range: "59[15][2-7]xxxxxx" + range: "59[246]x[2-7]xxxxx" + range: "61[1358]x[2-7]xxxxx" + range: "612[2-7]xxxxxx" + range: "621[2-7]xxxxxx" + range: "62[2457]x[2-7]xxxxx" + range: "631[2-7]xxxxxx" + range: "63[2-4]x[2-7]xxxxx" + range: "641[2-7]xxxxxx" + range: "64[235-7]x[2-7]xxxxx" + range: "65[17][2-7]xxxxxx" + range: "65[2-689]x[2-7]xxxxx" + range: "66[13][2-7]xxxxxx" + range: "66[24578]x[2-7]xxxxx" + range: "671[2-7]xxxxxx" + range: "67[235689]x[2-7]xxxxx" + range: "674[0189][2-7]xxxxx" + range: "674[2-7]xxxxxx" + range: "680[2-7]xxxxxx" + range: "68[1-6]x[2-7]xxxxx" + range: "71[013-9]x[2-7]xxxxx" + range: "712[2-7]xxxxxx" + range: "72[0235-9]x[2-7]xxxxx" + range: "72[14][2-7]xxxxxx" + range: "73[134][2-7]xxxxxx" + range: "73[2679]x[2-7]xxxxx" + range: "74[1-35689]x[2-7]xxxxx" + range: "74[47][2-7]xxxxxx" + range: "75[15][2-7]xxxxxx" + range: "75[2-46-9]x[2-7]xxxxx" + range: "7[67][02-9]x[2-7]xxxxx" + range: "7[67]1[2-7]xxxxxx" + range: "78[013-7]x[2-7]xxxxx" + range: "782[0-6][2-7]xxxxx" + range: "788[0189][2-7]xxxxx" + 
range: "788[2-7]xxxxxx" + range: "79[0189]x[2-7]xxxxx" + range: "79[2-7]xxxxxxx" + range: "80[2-467]xxxxxxx" + range: "81[1357-9]x[2-7]xxxxx" + range: "816[2-7]xxxxxx" + range: "82[014][2-7]xxxxxx" + range: "82[235-8]x[2-7]xxxxx" + range: "83[03-57-9]x[2-7]xxxxx" + range: "83[126][2-7]xxxxxx" + range: "84[0-24-9]x[2-7]xxxxx" + range: "85xx[2-7]xxxxx" + range: "86[136][2-7]xxxxxx" + range: "86[2457-9]x[2-7]xxxxx" + range: "87[078][2-7]xxxxxx" + range: "87[1-6]x[2-7]xxxxx" + range: "88[1256]x[2-7]xxxxx" + range: "88[34][2-7]xxxxxx" + range: "891[2-7]xxxxxx" + range: "89[2-4]x[2-7]xxxxx" + expected: "\x81\x0f\xac\x72\x08\x1e\x3b\x58\xad\xcc\x75\x8d\x8b\x0f\xac\x72\xdc\xec\xf4\x08" + expected: "\x0a\x0c\x0e\x10\x10\xf2\x10\xfa\x11\x00\x11\x06\x11\x0e\x93\x0f\xac\x6d\xc6\x09" + expected: "\x0b\x0d\x0f\x11\x13\x15\x17\x11\x07\x11\x0f\x11\x17\x11\x1f\x11\x27\x11\x2d\x11" + expected: "\x35\x11\x3d\x81\x31\xf5\x9d\x09\x0b\x0d\xa9\x0f\x11\x13\x15\x17\x12\x27\x12\x28" + expected: "\x11\x34\x11\x38\x11\x3d\x11\x41\x11\x43\x11\x45\x93\x0f\xa9\x9d\x8c\x09\x0b\x0d" + expected: "\x0f\x11\x13\x15\x17\x11\x3c\x11\x40\x11\x44\x11\x48\x11\x4c\x11\x50\x11\x52\x11" + expected: "\x54\x90\xed\xac\x72\x08\x99\x0a\x0c\x0e\x10\x12\x73\x11\xab\x11\xad\x11\xb1\x11" + expected: "\xb5\x11\xb9\x11\xdd\x95\x31\xf5\x9d\x63\x0a\x0c\x0e\x10\x12\x14\x16\x18\x1a\x11" + expected: "\xab\x11\xaf\x11\xb3\x11\xd4\x11\xd5\x11\xb1\x11\xb5\x11\xb9\x11\x44\x93\x0f\xac" + expected: "\x72\x09\x0b\x0d\x0f\x11\x13\x15\x17\x19\x11\x11\x11\x15\x11\x19\x11\x1d\x11\x21" + expected: "\x11\x25\x11\x29\x11\x2d\x11\x31\x81\x0f\xac\x72\x08\x0a\x0c\x0e\x10\x12\x14\x16" + expected: "\x11\x29\x11\x2d\x11\x13\x11\x2f\x11\x33\x11\x37\x11\x3b\x11\x40\x60\xfc\x11\x90" + expected: "\x6b\x03\x02\x04\x11\x93\x11\x88\x60\xdc\x11\x84\x6a\x17\x02\x04\x11\x80\x11\x85" + expected: "\x68\x27\x02\x04\x11\x78\x11\x7d\x84\x44\x89\x52\x02\x04\x11\x6e\x11\x73\x6b\xed" + expected: 
"\x02\x04\x11\x6d\x11\x64\x68\x13\x02\x04\x11\x5e\x11\x63\x84\x42\x8a\x4a\x02\x04" + expected: "\x11\x54\x11\x59\x68\x5b\x02\x04\x11\x4c\x11\x51\x82\x24\x51\x32\x02\x04\x11\x49" + expected: "\x11\x40\x80\x44\x92\x33\x02\x04\x11\x38\x11\x3d\x80\x44\x92\x53\x02\x04\x11\x2e" + expected: "\x11\x33\x84\x42\x90\x33\x02\x04\x11\x24\x11\x29\x69\x23\x02\x04\x11\x1c\x11\x21" + expected: "\x82\x42\x49\x22\x02\x04\x11\x19\x11\x10\x84\x24\x4a\x52\x02\x04\x11\x08\x11\x0d" + expected: "\x84\x44\x91\x52\x02\x04\x10\xfe\x11\x03\x80\x00\x89\x2a\xff\xf8\x80\x66\xd8\x32" + expected: "\xf2\xf5\xf9\x82\x20\x4a\x4a\xf2\xeb\x6b\x13\xe7\xee\x68\x5d\xe3\xea\x82\x04\x8a" + expected: "\x52\xdd\xe4\x80\x22\x89\x42\xde\xd7\x84\x42\x91\x2a\xd1\xd8\x80\x04\x8a\x52\xcb" + expected: "\xd2\x80\x04\x92\x0a\xc5\xcc\x82\x22\x50\x4b\xbf\xc6\x6b\xf7\xbb\xc2\x68\xbb\xb7" + expected: "\xbe\x68\xf3\xb3\xba\x84\x44\x8a\x0d\xad\xb4\x80\x22\x49\x12\xae\xa7\x80\x00\x51" + expected: "\x32\xa8\xa1\x82\x40\x49\x12\xa2\x9b\x80\x00\x82\x0a\x95\x9c\x82\x22\x51\x12\x96" + expected: "\x8f\x80\x00\x02\x52\x89\x90\x80\x44\x92\x52\x83\x8a\x80\x00\x8a\x12\x7d\x84\x80" + expected: "\x20\x08\x32\x7e\x77\x80\x04\x12\x12\x71\x78\x80\x04\x90\x52\x6b\x72\x84\x42\x92" + expected: "\x52\x65\x6c\x80\x44\x12\x32\x5f\x66\x84\x40\x93\x52\x59\x60\x5c\x80\x00\x92\x55" + expected: "\x52\x59\x6b\xfb\x55\x4e\x84\x04\x81\x32\x48\x4f\x82\x24\x4a\x2a\x49\x42\x84\x44" + expected: "\x8a\x52\x3c\x43\x6b\xfd\x3f\x38\x82\x22\x88\x22\x39\x32\x80\x44\x91\x53\x2c\x33" + expected: "\x6b\xb9\x2f\x28\x84\x44\x52\x32\x22\x29\x80\x22\x92\x55\x1c\x23\x80\x00\x4a\x4a" + expected: "\x1d\x16\x80\x62\x49\x33\x17\x19\x13\x21\x10\x11\x62\x80\x0e\x63\xf7\x0b\x40\x09" + expected: "\x40\x0c\x60\xfc\x09\x6b\x03\x09\x07\x40\x05\x60\x7f\x02\x40\x02\x60\xfc\x44\x00" +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/AnyPathTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/AnyPathTest.java new file mode 100644 
index 0000000000..705d430cc7 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/AnyPathTest.java @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.EMPTY; +import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.OPTIONAL; +import static com.google.i18n.phonenumbers.metadata.regex.AnyPath.SINGLE; + +import com.google.common.collect.ImmutableSortedSet; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class AnyPathTest { + @Test + public void testConstants() { + assertPath(EMPTY, 0); + assertPath(SINGLE, 1); + assertPath(OPTIONAL, 0, 1); + } + + @Test + public void testExtend() { + assertThat(EMPTY.extend(false)).isEqualTo(SINGLE); + assertThat(EMPTY.extend(true)).isEqualTo(OPTIONAL); + // Non-optional extension is the same as joining with SINGLE. + assertPath(SINGLE.extend(false), 2); + // This is not the same as joining SINGLE.join(OPTIONAL). + assertPath(SINGLE.extend(true), 0, 2); + + // 100 extends to 1000 or 1001 (if optional). 
+ assertPath(AnyPath.of(0x4).extend(false), 3); + assertPath(AnyPath.of(0x4).extend(true), 0, 3); + } + + @Test + public void testJoin() { + assertThat(EMPTY.join(SINGLE)).isEqualTo(SINGLE); + assertThat(EMPTY.join(OPTIONAL)).isEqualTo(OPTIONAL); + assertPath(SINGLE.join(SINGLE), 2); + assertPath(SINGLE.join(OPTIONAL), 1, 2); + assertPath(OPTIONAL.join(OPTIONAL), 0, 1, 2); + + // "(x(x)?)?" == 110 and matches 0 to 2. + // "(x(x)?)?".join("(x(x)?)?") == "(x(x(x(x)?)?)?)?" == 11111 and matches 0 to 4. + assertThat(AnyPath.of(0x7).join(AnyPath.of(0x7))).isEqualTo(AnyPath.of(0x1F)); + + // "xx(x)?" == 1100 and matches 2 or 3. + // "(xx)?" == 0101 and matches 0 or 2. + // "xx(x)?".join("(xx)?") == "xx(xx)?" == 111100 and matches 2 to 5. + assertThat(AnyPath.of(0xC).join(AnyPath.of(0x5))).isEqualTo(AnyPath.of(0x3C)); + } + + @Test + public void testMakeOptional() { + assertThat(OPTIONAL.makeOptional()).isEqualTo(OPTIONAL); + assertThat(SINGLE.makeOptional()).isEqualTo(OPTIONAL); + assertPath(AnyPath.of(0x4).makeOptional(), 0, 2); + } + + @Test + public void testToString() { + assertThat(SINGLE.toString()).isEqualTo("x"); + assertThat(OPTIONAL.toString()).isEqualTo("(x)?"); + assertThat(AnyPath.of(0x8).toString()).isEqualTo("xxx"); // 1000 = 3 digits + assertThat(AnyPath.of(0xA).toString()).isEqualTo("x(xx)?"); // 1010 = 1 or 3 digits + assertThat(AnyPath.of(0xF).toString()).isEqualTo("(x(x(x)?)?)?"); // 1111 = 0 to 3 digits + } + + // Ordering is important as we need to find the shortest path at certain times. + @Test + public void testOrdering() { + assertThat(SINGLE).isGreaterThan(EMPTY); + assertThat(OPTIONAL).isGreaterThan(SINGLE); + + assertThat(AnyPath.of(0x8)).isGreaterThan(AnyPath.of(0x4)); + // Same length, but the 2nd highest length match is taken into account as a tie break. + // This strategy turns out to match numeric comparison perfectly since set-bits are lengths. 
+ assertThat(AnyPath.of(0xA)).isGreaterThan(AnyPath.of(0x9)); + } + + private static void assertPath(AnyPath p, Integer... n) { + ImmutableSortedSet lengths = ImmutableSortedSet.copyOf(n); + int maxLength = lengths.last(); + assertThat(p.maxLength()).isEqualTo(maxLength); + for (int i = 0; i <= maxLength; i++) { + assertThat(p.acceptsLength(i)).isEqualTo(lengths.contains(i)); + } + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeTest.java new file mode 100644 index 0000000000..dc0230783d --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeTest.java @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.RangeSpecification.ALL_DIGITS_MASK; +import static org.junit.Assert.fail; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableSet; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import com.google.i18n.phonenumbers.metadata.regex.Edge.Visitor; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class EdgeTest { + @Test + public void testSimple() { + assertThat(Edge.fromMask(0x6).getDigitMask()).isEqualTo(0x6); + assertThat(Edge.fromMask(0x6).isOptional()).isFalse(); + + assertThat(Edge.fromMask(0x3).toString()).isEqualTo("[01]"); // 0000000011 + assertThat(Edge.fromMask(0x300).toString()).isEqualTo("[89]"); // 1100000000 + assertThat(Edge.fromMask(0x1FE).toString()).isEqualTo("[1-8]"); // 0111111110 + assertThat(Edge.fromMask(ALL_DIGITS_MASK).toString()).isEqualTo("x"); // any digit + } + + @Test + public void testAny() { + assertThat(Edge.fromMask(ALL_DIGITS_MASK)).isEqualTo(Edge.any()); + assertThat(Edge.any().optional()).isEqualTo(Edge.optionalAny()); + + assertThat(Edge.any().toString()).isEqualTo("x"); + // Unlike AnyPath, simple edges are not sequences, so don't need parens for optional. + assertThat(Edge.optionalAny().toString()).isEqualTo("x?"); + } + + @Test + public void testEpsilon() { + // Epsilon isn't optional, it represents a path that non-optionally accepts no input. 
+ assertThat(Edge.epsilon().isOptional()).isFalse(); + assertThat(Edge.epsilon().toString()).isEqualTo("e"); + } + + @Test + public void testConcatenation() { + Edge concatenated = Edge.concatenation(Edge.fromMask(0x3), Edge.any()); + assertThat(concatenated.toString()).isEqualTo("[01]x"); + TestingVisitor v = new TestingVisitor() { + @Override + public void visitSequence(List edges) { + assertThat(edges).containsExactly(Edge.fromMask(0x3), Edge.any()).inOrder(); + wasTested = true; + } + }; + concatenated.accept(v); + assertThat(v.wasTested).isTrue(); + } + + @Test + public void testGroup() { + Edge group = Edge.disjunction(ImmutableSet.of(Edge.fromMask(0x3), Edge.any())); + TestingVisitor v = new TestingVisitor() { + @Override + public void visitGroup(Set edges, boolean isOptional) { + assertThat(edges).containsExactly(Edge.any(), Edge.fromMask(0x3)).inOrder(); + assertThat(isOptional).isFalse(); + wasTested = true; + } + }; + group.accept(v); + assertThat(group.toString()).isEqualTo("(x|[01])"); + assertThat(v.wasTested).isTrue(); + } + + @Test + public void testOptionalGroup() { + Edge group = Edge.disjunction(ImmutableSet.of(Edge.fromMask(0x3), Edge.epsilon(), Edge.any())); + TestingVisitor v = new TestingVisitor() { + @Override + public void visitGroup(Set edges, boolean isOptional) { + // Reordered and epsilon removed. + assertThat(edges).containsExactly(Edge.any(), Edge.fromMask(0x3)).inOrder(); + assertThat(isOptional).isTrue(); + wasTested = true; + } + }; + group.accept(v); + assertThat(group.toString()).isEqualTo("(x|[01])?"); + assertThat(v.wasTested).isTrue(); + } + + @Test + public void testOrdering() { + // Testing ordering is important because when generating regular expressions, the edge order + // defines a lot about the visual order of the final regular expression. This order should be + // as close to "what a person would consider reasonable" as possible. In fact some of the cases + // tested here will never occur in real situations (e.g. 
sequences compared with groups) + // because of the way composite edges are created. However it seems sensible to test the + // behaviour nevertheless. + + // Simple Edges + + assertSameOrder(e("0"), e("0")); + // "0" < "1" - lowest bit set wins + assertOrdered(e("0"), e("1")); + // "[01]" < "1" - lowest bit set wins + assertOrdered(e("[01]"), e("1")); + // "x" < "9" - lowest bit set wins + assertOrdered(X, e("9")); + + // Sequences + + // ("0x" < "1") and ("0" < "1x") - first edge in sequence is compared to single edge. + assertOrdered(seq(e("0"), X), e("1")); + assertOrdered(e("0"), seq(e("1"), X)); + // "[01]" < "[01]x" - single edges are "smaller" than sequences of edges if all else is equal. + assertOrdered(e("[01]"), seq(e("[01]"), X)); + + // "[01]x" == "[01]x" + assertSameOrder(seq(e("[01]"), X), seq(e("[01]"), X)); + // "x1" < "x2" - comparing 2 sequences compares all edges. + assertOrdered(seq(X, e("1")), seq(X, e("2"))); + + // "[01]x" < "[01]xx" - shortest sequence wins in tie break (similar to how "[01]" < "[01]x") + assertOrdered(seq(e("[01]"), X), seq(e("[01]"), X, X)); + + // Disjunctions + + // "(1|2)" == "(2|1)" - edges are sorted when creating disjunctions + assertSameOrder(or(e("1"), e("2")), or(e("2"), e("1"))); + // "(1|2|3)" < "(1|2|4)" - comparing 2 disjunctions compares all edges. + assertOrdered(or(e("1"), e("2"), e("3")), or(e("1"), e("2"), e("4"))); + // "(1|2)" < "(1|2|3)" - shortest sequence wins in tie break + assertOrdered(or(e("1"), e("2")), or(e("1"), e("2"), e("3"))); + + // Miscellaneous + + // "1" < "(1|2)" - if first edge matches, single edges sort before groups. + assertOrdered(e("1"), or(e("1"), e("2"))); + + // "(1|x)" < "1x" - because "(1|x)" is actually "(x|1)" and "x" < "1". 
+ assertOrdered(or(e("1"), X), seq(e("1"), X)); + } + + private static void assertSameOrder(Edge lhs, Edge rhs) { + assertThat(lhs).isEquivalentAccordingToCompareTo(rhs); + assertThat(lhs).isEqualTo(rhs); + } + + private static void assertOrdered(Edge lhs, Edge rhs) { + assertThat(lhs).isNotEqualTo(rhs); + assertThat(lhs).isLessThan(rhs); + assertThat(rhs).isGreaterThan(lhs); + } + + // A bit like a mock, but not really "mocking" existing behaviour. + private static class TestingVisitor implements Visitor { + // Set this in overridden method(s). + protected boolean wasTested = false; + + @Override + public void visit(SimpleEdge edge) { + fail("unexpected call"); + } + + @Override + public void visitSequence(List edges) { + fail("unexpected call"); + } + + @Override + public void visitGroup(Set edges, boolean isOptional) { + fail("unexpected call"); + } + } + + // The 'any digit' edge. + private static final Edge X = e("x"); + + // Creates a simple edge from a range specification string for testing. + private static SimpleEdge e(String s) { + RangeSpecification spec = RangeSpecification.parse(s); + Preconditions.checkArgument(spec.length() == 1, "only specify single digit ranges"); + return SimpleEdge.fromMask(spec.getBitmask(0)); + } + + // Creates sequence of edges (wrapping for convenience). + private static Edge seq(Edge first, Edge second, Edge... rest) { + // This already rejects epsilon edges. + Edge edge = Edge.concatenation(first, second); + for (Edge e : rest) { + edge = Edge.concatenation(edge, e); + } + return edge; + } + + // Creates a non-optional disjunction of edges. + private static Edge or(Edge... 
edges) { + List e = Arrays.asList(edges); + Preconditions.checkArgument(!e.contains(Edge.epsilon()), "use 'opt()' for optional groups"); + return Edge.disjunction(e); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriterTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriterTest.java new file mode 100644 index 0000000000..e5cacb01ea --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/EdgeWriterTest.java @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; + +import com.google.common.base.Preconditions; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class EdgeWriterTest { + + // Note that this code is tested very thoroughly by any "round-tripping" of regular expressions + // in the metadata (i.e. generating regular expressions from DFAs and then re-parsing them to + // ensure that the same DFA is produced). 
This is part of any acceptance test for generating + // regular expressions and serves as a far more comprehensive stress test on the code. These + // tests are thus limited to simpler cases and highlighting interesting behaviour. + + // The 'any digit' edge. + private static final Edge X = e("x"); + + @Test + public void testSimple() { + assertThat(regex(e("0"))).isEqualTo("0"); + assertThat(regex(e("[0-7]"))).isEqualTo("[0-7]"); + assertThat(regex(e("[0-9]"))).isEqualTo("\\d"); + assertThat(regex(X)).isEqualTo("\\d"); + } + + @Test + public void testSequences() { + assertThat(regex(seq(e("0"), e("1"), e("2")))).isEqualTo("012"); + } + + @Test + public void testGroups() { + // Non-optional groups spanning the top level don't need parentheses. + assertThat(regex(or(e("0"), e("1"), e("2")))).isEqualTo("0|1|2"); + // Optional groups always need parentheses. + assertThat(regex(opt(e("0"), e("1"), e("2")))).isEqualTo("(?:0|1|2)?"); + // Once a group has prefix or suffix, parentheses are needed. + assertThat(regex( + seq( + or(e("0"), e("1")), + e("2")))) + .isEqualTo("(?:0|1)2"); + } + + @Test + public void testNesting() { + // Basic nesting is handled by a very straightforward edge visitor, so one non-trivial test + // will cover all the basic cases ("any digit" sequences are a different matter however). + assertThat(regex( + seq( + e("0"), + or( + e("1"), + seq( + e("2"), + opt(e("3"), e("4")))), + e("5"), e("6")))) + .isEqualTo("0(?:1|2(?:3|4)?)56"); + } + + @Test + public void testAnyDigitSequences() { + // This is the complex part of efficient regular expression generation. + assertThat(regex(seq(e("0"), e("1"), X))).isEqualTo("01\\d"); + // "\d\d" is shorter than "\d{2}" + assertThat(regex(seq(X, X))).isEqualTo("\\d\\d"); + assertThat(regex(seq(X, X, X))).isEqualTo("\\d{3}"); + // Top level optional groups are supported. + assertThat(regex(opt(seq(X, X)))).isEqualTo("(?:\\d{2})?"); + // Optional parts go at the end. 
+ assertThat(regex( + seq( + opt(seq(X, X)), + X, X))) + .isEqualTo("\\d\\d(?:\\d{2})?"); + // "(x(x(x)?)?)?" + Edge anyGrp = opt(seq( + X, + opt(seq( + X, + opt(X))))); + // The two cases of a group on its own or as part of a sequence are handled separately, so + // must be tested separately. + assertThat(regex(anyGrp)).isEqualTo("\\d{0,3}"); + assertThat(regex(seq(e("1"), e("2"), anyGrp))).isEqualTo("12\\d{0,3}"); + // "xx(x(x(x)?)?)?" + assertThat(regex(seq(X, X, anyGrp))).isEqualTo("\\d{2,5}"); + // Combining "any digit" groups produces minimal representation + assertThat(regex(seq(anyGrp, anyGrp))).isEqualTo("\\d{0,6}"); + } + + // Helper to call standard version of regex generator (not using 'dot' for matching). + private String regex(Edge e) { + return EdgeWriter.toRegex(e, false /* use dot match */); + } + + // Creates a simple edge from a range specification string for testing. + private static SimpleEdge e(String s) { + RangeSpecification spec = RangeSpecification.parse(s); + Preconditions.checkArgument(spec.length() == 1, "only specify single digit ranges"); + return SimpleEdge.fromMask(spec.getBitmask(0)); + } + + // Creates sequence of edges (wrapping for convenience). + private static Edge seq(Edge first, Edge second, Edge... rest) { + // This already rejects epsilon edges. + Edge edge = Edge.concatenation(first, second); + for (Edge e : rest) { + edge = Edge.concatenation(edge, e); + } + return edge; + } + + // Creates a non-optional disjunction of edges. + private static Edge or(Edge... edges) { + List e = Arrays.asList(edges); + Preconditions.checkArgument(!e.contains(Edge.epsilon()), "use 'opt()' for optional groups"); + return Edge.disjunction(e); + } + + // Creates an optional disjunction of edges. + private static Edge opt(Edge... 
edges) { + List e = new ArrayList<>(); + e.addAll(Arrays.asList(edges)); + Preconditions.checkArgument(!e.contains(Edge.epsilon()), "don't pass epsilon directly"); + e.add(Edge.epsilon()); + return Edge.disjunction(e); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaBuilder.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaBuilder.java new file mode 100644 index 0000000000..654a334c0a --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaBuilder.java @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.Preconditions.checkArgument; +import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; +import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; + +import com.google.common.graph.MutableValueGraph; +import com.google.common.graph.ValueGraph; +import com.google.common.graph.ValueGraphBuilder; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; + +/** Simple fluent API for constructing graphs for testing. */ +final class NfaBuilder { + private final MutableValueGraph graph = + ValueGraphBuilder.directed().allowsSelfLoops(false).build(); + // The last node added to the graph. 
+ private Node lastNode; + + /** Creates a new mutable NFA graph. */ + public NfaBuilder() { + graph.addNode(INITIAL); + graph.addNode(TERMINAL); + lastNode = TERMINAL; + } + + /** + * Returns an unmodifiable view of the underlying graph (not a snapshot). If the builder is + * modified after this method is called, it will affect what was returned. + */ + public ValueGraph graph() { + return graph; + } + + /** Adds a new path from the given source node, returning the newly created target node. */ + public Node addPath(Node source, String path) { + RangeSpecification spec = RangeSpecification.parse(path); + for (int n = 0; n < spec.length(); n++) { + lastNode = lastNode.createNext(); + addEdge(source, lastNode, SimpleEdge.fromMask(spec.getBitmask(n))); + source = lastNode; + } + return lastNode; + } + + /** Adds a new path between the given source and target (all intermediate nodes are new). */ + public void addPath(Node source, Node target, String path) { + RangeSpecification spec = RangeSpecification.parse(path); + for (int n = 0; n < spec.length() - 1; n++) { + lastNode = lastNode.createNext(); + addEdge(source, lastNode, SimpleEdge.fromMask(spec.getBitmask(n))); + source = lastNode; + } + addEdge(source, target, SimpleEdge.fromMask(spec.getBitmask(spec.length() - 1))); + } + + /** + * Adds a new path between the given source and target nodes, along with an epsilon edge from the + * source to the target. + */ + public void addOptionalPath(Node source, Node target, String path) { + addPath(source, target, path); + addEpsilon(source, target); + } + + private void addEpsilon(Node s, Node t) { + checkArgument(graph.nodes().contains(s), "missing source node"); + checkArgument(graph.nodes().contains(t), "missing target node"); + SimpleEdge e = graph.putEdgeValue(s, t, Edge.epsilon()); + if (e != null) { + // Edge already exists; if not an epsilon, make it optional. 
+ checkArgument(!e.equals(Edge.epsilon()) && !e.isOptional(), "epsilon already added"); + graph.putEdgeValue(s, t, e.optional()); + } + } + + private void addEdge(Node s, Node t, SimpleEdge e) { + graph.addNode(s); + graph.addNode(t); + checkArgument(graph.putEdgeValue(s, t, e) == null, "edge already exists"); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattenerTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattenerTest.java new file mode 100644 index 0000000000..adfeb15ffd --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NfaFlattenerTest.java @@ -0,0 +1,229 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; +import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; + +import com.google.common.base.Preconditions; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.TreeSet; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class NfaFlattenerTest { + // The 'any digit' edge. + private static final Edge X = e("x"); + + @Test + public void testSimple() { + NfaBuilder nfa = new NfaBuilder(); + nfa.addPath(INITIAL, TERMINAL, "12"); + Edge flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo(seq(e("1"), e("2"))); + assertThat(flat.toString()).isEqualTo("12"); + + nfa.addPath(INITIAL, TERMINAL, "34"); + flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo( + or( + seq(e("1"), e("2")), + seq(e("3"), e("4")))); + assertThat(flat.toString()).isEqualTo("(12|34)"); + } + + @Test + public void testSubgroup() { + NfaBuilder nfa = new NfaBuilder(); + Node split = nfa.addPath(INITIAL, "12"); + Node join = nfa.addPath(split, "34"); + nfa.addPath(split, join, "56"); + nfa.addPath(join, TERMINAL, "78"); + + Edge flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo( + seq(e("1"), e("2"), + or( + seq(e("3"), e("4")), + seq(e("5"), e("6")) + ), + e("7"), e("8"))); + assertThat(flat.toString()).isEqualTo("12(34|56)78"); + } + + @Test + public void testSubgroupWithEarlyJoining() { + NfaBuilder nfa = new NfaBuilder(); + // Create a graph with 4 initial paths branching out which collapses to 3, 2 and then 1. 
+ Node groupStart = nfa.addPath(INITIAL, "0"); + // Add 2 edges to the first join point (if we add only one edge then it clashes with the + // joining edge, which goes directly from groupStart to firstJoin). + Node firstJoin = nfa.addPath(nfa.addPath(groupStart, "1"), "2"); + nfa.addPath(groupStart, firstJoin, "3"); + Node secondJoin = nfa.addPath(firstJoin, "4"); + nfa.addPath(groupStart, secondJoin, "5"); + Node groupEnd = nfa.addPath(secondJoin, "6"); + nfa.addPath(groupStart, groupEnd, "7"); + nfa.addPath(groupEnd, TERMINAL, "8"); + + Edge flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo( + seq(e("0"), + or( + seq( + or( + seq( + or( + seq(e("1"), e("2")), + e("3")), + e("4")), + e("5")), + e("6")), + e("7")), + e("8"))); + assertThat(flat.toString()).isEqualTo("0(((12|3)4|5)6|7)8"); + } + + @Test + public void testPathDuplication() { + NfaBuilder nfa = new NfaBuilder(); + Node groupStart = nfa.addPath(INITIAL, "0"); + Node lhsMid = nfa.addPath(groupStart, "1"); + Node groupEnd = nfa.addPath(lhsMid, "2"); + Node rhsMid = nfa.addPath(groupStart, "3"); + nfa.addPath(rhsMid, groupEnd, "4"); + nfa.addPath(groupEnd, TERMINAL, "5"); + + // So far this is a normal nestable graph: + // ,--1-->()--2--v + // (I)--0-->() ()--5-->(T) + // `--3-->()--4--^ + Edge flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo( + seq(e("0"), + or( + seq(e("1"), e("2")), + seq(e("3"), e("4"))), + e("5"))); + assertThat(flat.toString()).isEqualTo("0(12|34)5"); + + // This new path "crosses" the group, creating a non-nestable structure which can only be + // resolved by duplicating some path (in this case it's the 2nd part of the right-hand-side). 
+ nfa.addPath(lhsMid, rhsMid, "x"); + + flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo( + seq(e("0"), + or( + seq(e("1"), + or( + e("2"), + seq(X, e("4")))), + seq(e("3"), e("4"))), + e("5"))); + // Note the duplication of the '4' to make the graph nestable. 
+ assertThat(flat.toString()).isEqualTo("0(1(x4|2)|34)5"); + + } + + @Test + public void testNodeOrdering_bug_65250963() { + // ,--->(C)----------. + // | v + // (I)-->(D)-->(B)-->(T) + // | ^ + // `--->(A)----' + NfaBuilder nfa = new NfaBuilder(); + // IMPORTANT: Order of insertion determines the node IDs (A=1, B=2...). The edge index just + // happens to match node ID for readability, but doesn't affect the test directly. + Node a = nfa.addPath(INITIAL, "1"); + Node b = nfa.addPath(a, "2"); + Node c = nfa.addPath(INITIAL, "3"); + Node d = nfa.addPath(INITIAL, "4"); + // Now join up remaining paths. + nfa.addPath(d, b, "5"); + nfa.addPath(b, TERMINAL, "6"); + nfa.addPath(c, TERMINAL, "7"); + Comparator ordering = NfaFlattener.nodeOrdering(nfa.graph()); + + // In the old ordering code, because (B) and (D) are not reachable to/from (C) we would have + // had the ordering (D < B), (B < C), (C < D) giving a cycle. In the new code, the longest path + // length to reach (C) is less than (B), so we get (C < B) and we no longer have a cycle. + // The node ordering is now: (INITIAL, A, C, D, B, TERMINAL) + TreeSet nodes = new TreeSet<>(ordering); + nodes.add(INITIAL); + nodes.add(TERMINAL); + nodes.add(a); + nodes.add(b); + nodes.add(c); + nodes.add(d); + assertThat(nodes).containsExactly(INITIAL, a, c, d, b, TERMINAL).inOrder(); + } + + @Test + public void testOptionalTopLevelGroup_bug_69101586() { + // ,--->(e)----. + // | v + // (I)-->(A)-->(T) + NfaBuilder nfa = new NfaBuilder(); + nfa.addOptionalPath(INITIAL, TERMINAL, "xx"); + Edge flat = NfaFlattener.flatten(nfa.graph()); + assertThat(flat).isEqualTo(opt(seq(X, X))); + assertThat(flat.toString()).isEqualTo("(xx)?"); + } + + // Creates a simple edge from a range specification string for testing. 
+ private static SimpleEdge e(String s) { + RangeSpecification spec = RangeSpecification.parse(s); + Preconditions.checkArgument(spec.length() == 1, "only specify single digit ranges"); + return SimpleEdge.fromMask(spec.getBitmask(0)); + } + + // Creates sequence of edges (wrapping for convenience). + private static Edge seq(Edge first, Edge second, Edge... rest) { + // This already rejects epsilon edges. + Edge edge = Edge.concatenation(first, second); + for (Edge e : rest) { + edge = Edge.concatenation(edge, e); + } + return edge; + } + + // Creates an optional disjunction of edges. + private static Edge opt(Edge... edges) { + List e = new ArrayList<>(); + e.addAll(Arrays.asList(edges)); + Preconditions.checkArgument(!e.contains(Edge.epsilon()), "don't pass epsilon directly"); + e.add(Edge.epsilon()); + return Edge.disjunction(e); + } + + // Creates a non-optional disjunction of edges. + private static Edge or(Edge... edges) { + List e = Arrays.asList(edges); + Preconditions.checkArgument(!e.contains(Edge.epsilon()), "use 'opt()' for optional groups"); + return Edge.disjunction(e); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NodeTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NodeTest.java new file mode 100644 index 0000000000..00a4d8295d --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/NodeTest.java @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; +import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class NodeTest { + @Test + public void testConstants() { + assertThat(INITIAL.id()).isEqualTo(0); + assertThat(TERMINAL.id()).isEqualTo(1); + assertThat(TERMINAL).isNotEqualTo(INITIAL); + } + + @Test + public void testNext() { + assertThat(INITIAL.createNext()).isSameInstanceAs(TERMINAL); + assertThat(TERMINAL.createNext()).isNotEqualTo(TERMINAL); + assertThat(TERMINAL.createNext().id()).isEqualTo(2); + Node node = INITIAL; + for (int id = 0; id < 10; id++) { + assertThat(node.id()).isEqualTo(id); + node = node.createNext(); + } + } + + @Test + public void testToString() { + Node node = INITIAL; + for (int id = 0; id < 10; id++) { + assertThat(node.toString()).isEqualTo(Integer.toString(id)); + node = node.createNext(); + } + } + + // Consistent ordering helps ensure regular expressions derived from graphs are deterministic. 
+ @Test + public void testOrdering() { + assertThat(TERMINAL).isGreaterThan(INITIAL); + Node node = INITIAL; + for (int id = 0; id < 10; id++) { + Node next = node.createNext(); + assertThat(next).isGreaterThan(node); + node = next; + } + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverterTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverterTest.java new file mode 100644 index 0000000000..073a3576b3 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RangeTreeConverterTest.java @@ -0,0 +1,154 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.collect.ImmutableList.toImmutableList; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.Edge.any; +import static com.google.i18n.phonenumbers.metadata.regex.Edge.epsilon; +import static com.google.i18n.phonenumbers.metadata.regex.Edge.optionalAny; +import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; +import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; + +import com.google.common.collect.Iterables; +import com.google.common.graph.ValueGraph; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import java.util.List; +import java.util.stream.Stream; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RangeTreeConverterTest { + // Simple 4 node DFA. + // (I) --1--> ( ) --2--> ( ) --x--> (T) + @Test + public void testSimple() { + RangeTree dfa = RangeTree.from(specs("12x")); + ValueGraph nfa = RangeTreeConverter.toNfaGraph(dfa); + assertThat(nfa.nodes()).hasSize(4); + Node node = assertOutEdge(nfa, INITIAL, edge(1)); + node = assertOutEdge(nfa, node, edge(2)); + node = assertOutEdge(nfa, node, any()); + assertThat(node).isEqualTo(TERMINAL); + } + + // Simple 4 node DFA with optional termination immediately before the terminal node. 
+ // (I) --1--> ( ) --2--> (T) --x--> (T) + @Test + public void testWithOptionalEdge() { + RangeTree dfa = RangeTree.from(specs("12x", "12")); + + ValueGraph nfa = RangeTreeConverter.toNfaGraph(dfa); + assertThat(nfa.nodes()).hasSize(4); + Node node = assertOutEdge(nfa, INITIAL, edge(1)); + node = assertOutEdge(nfa, node, edge(2)); + node = assertOutEdge(nfa, node, optionalAny()); + assertThat(node).isEqualTo(TERMINAL); + } + + // Simple 4 node DFA with optional termination. + // (I) --1--> (T) --2--> ( ) --x--> (T) + @Test + public void testWithEpsilon() { + RangeTree dfa = RangeTree.from(specs("12x", "1")); + + ValueGraph nfa = RangeTreeConverter.toNfaGraph(dfa); + assertThat(nfa.nodes()).hasSize(4); + Node node = assertOutEdge(nfa, INITIAL, edge(1)); + assertOutEdges(nfa, node, edge(2), epsilon()); + // One of the out nodes should be the terminal. + assertThat(follow(nfa, node, epsilon())).isEqualTo(Node.TERMINAL); + node = follow(nfa, node, edge(2)); + // The other is the normal edge that leads to the terminal. + node = follow(nfa, node, any()); + assertThat(node).isEqualTo(TERMINAL); + } + + // Simple 5 node DFA with 2 paths. + // (I) --1--> ( ) --2--> ( ) --x--> (T) + // `---3--> ( ) --4----^ + @Test + public void testMultiplePathsWithCommonTail() { + RangeTree dfa = RangeTree.from(specs("12x", "34x")); + + ValueGraph nfa = RangeTreeConverter.toNfaGraph(dfa); + assertThat(nfa.nodes()).hasSize(5); + + assertOutEdges(nfa, INITIAL, edge(1), edge(3)); + Node lhs = follow(nfa, INITIAL, edge(1)); + lhs = assertOutEdge(nfa, lhs, edge(2)); + Node rhs = follow(nfa, INITIAL, edge(3)); + rhs = assertOutEdge(nfa, rhs, edge(4)); + assertThat(lhs).isEqualTo(rhs); + Node node = assertOutEdge(nfa, lhs, any()); + assertThat(node).isEqualTo(TERMINAL); + } + + @Test + public void testOptionalTopLevelGroup_bug_69101586() { + // Requires making a top level optional group, which is (deliberately) not easy with the + // DFA tooling since it's pretty rare. 
This is a DFA which can terminate immediately and will + // match the empty input (as well as its normal input). + RangeTree dfa = RangeTree.from(specs("xx")).union(RangeTree.from(RangeSpecification.empty())); + + ValueGraph nfa = RangeTreeConverter.toNfaGraph(dfa); + assertThat(nfa.nodes()).hasSize(3); + assertThat(follow(nfa, INITIAL, epsilon())).isEqualTo(Node.TERMINAL); + Node node = follow(nfa, INITIAL, any()); + node = assertOutEdge(nfa, node, any()); + assertThat(node).isEqualTo(TERMINAL); + } + + // Returns the simple edge matching exactly this one digit value. + SimpleEdge edge(int n) { + return SimpleEdge.fromMask(1 << n); + } + + List specs(String... s) { + return Stream.of(s).map(RangeSpecification::parse).collect(toImmutableList()); + } + + // Asserts that a node has only one out edge and returns that edge's target. + Node assertOutEdge(ValueGraph nfa, Node node, SimpleEdge edge) { + assertThat(nfa.successors(node)).hasSize(1); + Node target = Iterables.getOnlyElement(nfa.successors(node)); + assertThat(nfa.edgeValue(node, target).get()).isEqualTo(edge); + return target; + } + + // Asserts that a node has all the given edges. + void assertOutEdges(ValueGraph nfa, Node node, SimpleEdge... edges) { + assertThat(nfa.successors(node)).hasSize(edges.length); + List out = nfa.successors(node).stream() + .map(t -> nfa.edgeValue(node, t).get()) + .collect(toImmutableList()); + assertThat(out).containsExactlyElementsIn(edges); + } + + // Follows the given edge from a node (which must be in the graph), returning the target node + // (or null if the edge does not exist in the graph). 
+ Node follow(ValueGraph nfa, Node node, SimpleEdge edge) { + return nfa.successors(node).stream() + .filter(t -> nfa.edgeValue(node, t).get().equals(edge)) + .findFirst() + .orElse(null); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatterTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatterTest.java new file mode 100644 index 0000000000..1879c80e86 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexFormatterTest.java @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.FORCE_CAPTURING_GROUPS; +import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.FORCE_NON_CAPTURING_GROUPS; +import static com.google.i18n.phonenumbers.metadata.regex.RegexFormatter.FormatOption.PRESERVE_CAPTURING_GROUPS; + +import com.google.common.base.Joiner; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RegexFormatterTest { + + // Luckily the formatter cares only about 3 special characters, '(', '|' and ')', so we only need + // to test a few very straightforward cases to cover everything. + + @Test + public void testSimple() { + assertThat(RegexFormatter.format("abcd", PRESERVE_CAPTURING_GROUPS)) + .isEqualTo("abcd"); + } + + @Test + public void testNested() { + assertThat(RegexFormatter.format("ab(cd|ef)gh", PRESERVE_CAPTURING_GROUPS)).isEqualTo(lines( + "ab(", + " cd|", + " ef", + ")gh")); + + assertThat(RegexFormatter.format("ab(?:cd|ef)gh", PRESERVE_CAPTURING_GROUPS)).isEqualTo(lines( + "ab(?:", + " cd|", + " ef", + ")gh")); + } + + @Test + public void testDoubleNested() { + assertThat(RegexFormatter.format("ab(cd(ef|gh)|ij)", PRESERVE_CAPTURING_GROUPS)) + .isEqualTo(lines( + "ab(", + " cd(", + " ef|", + " gh", + " )|", + " ij", + ")")); + + assertThat(RegexFormatter.format("ab(cd(?:ef|gh)|ij)", PRESERVE_CAPTURING_GROUPS)) + .isEqualTo(lines( + "ab(", + " cd(?:", + " ef|", + " gh", + " )|", + " ij", + ")")); + } + + @Test + public void testForceNonCapturingGroups() { + assertThat(RegexFormatter.format("ab(?:cd(ef|gh)|ij)", FORCE_NON_CAPTURING_GROUPS)) + .isEqualTo(lines( + "ab(?:", + " cd(?:", + " ef|", + " gh", + " )|", + " ij", + ")")); + } + + @Test + public void testForceCapturingGroups() { + 
assertThat(RegexFormatter.format("ab(?:cd(ef|gh)|ij)", FORCE_CAPTURING_GROUPS)).isEqualTo(lines( + "ab(", + " cd(", + " ef|", + " gh", + " )|", + " ij", + ")")); + } + + private static String lines(String... s) { + return Joiner.on('\n').join(s); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexGeneratorTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexGeneratorTest.java new file mode 100644 index 0000000000..258719d4cb --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/RegexGeneratorTest.java @@ -0,0 +1,197 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.base.CharMatcher.whitespace; +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.RegexGenerator.basic; +import static java.util.stream.Collectors.joining; + +import com.google.common.collect.ImmutableList; +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import java.util.Arrays; +import java.util.List; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class RegexGeneratorTest { + @Test + public void testSimple() { + assertRegex(basic(), ranges("123xxx"), "123\\d{3}"); + // This could be improved to "..." rather than ".{3}" saving 1 char, probably not worth it. + assertRegex(basic().withDotMatch(), ranges("123xxx"), "123.{3}"); + } + + @Test + public void testVariableLength() { + assertRegex(basic(), ranges("123xxx", "123xxxx", "123xxxxx", "123xxxxxx"), "123\\d{3,6}"); + } + + @Test + public void testTailOptimization() { + RangeTree dfa = ranges("123xxx", "123xxxx", "145xxx"); + assertRegex(basic(), dfa, "1(?:23\\d{3,4}|45\\d{3})"); + assertRegex(basic().withTailOptimization(), dfa, "1(?:23\\d?|45)\\d{3}"); + } + + @Test + public void testDfaFactorization() { + // Essentially create a "thin" wedge of specific non-determinism with the shorter (5-digit) + // numbers which prevents the larger ranges from being contiguous in the DFA. 
+ RangeTree dfa = ranges("1234x", "1256x", "[0-4]xxxxxx", "[0-4]xxxxxxx"); + assertRegex(basic(), dfa, + "[02-4]\\d{6,7}|", + "1(?:[013-9]\\d{5,6}|", + "2(?:[0-246-9]\\d{4,5}|", + "3(?:[0-35-9]\\d{3,4}|4\\d(?:\\d{2,3})?)|", + "5(?:[0-57-9]\\d{3,4}|6\\d(?:\\d{2,3})?)))"); + assertRegex(basic().withDfaFactorization(), dfa, "[0-4]\\d{6,7}|12(?:34|56)\\d"); + } + + @Test + public void testSubgroupOptimization() { + // The subgraph of "everything except 95, 96 and 100" (this appears in China leading digits). + RangeTree postgraph = ranges("[02-8]", "1[1-9]", "10[1-9]", "9[0-47-9]"); + RangeTree pregraph = ranges("123", "234", "345", "456", "567"); + + // Cross product of pre and post paths. + RangeTree subgraph = RangeTree.from( + pregraph.asRangeSpecifications().stream() + .flatMap(a -> postgraph.asRangeSpecifications().stream().map(a::extendBy))); + + // Union in other paths to trigger repetition in the "basic" case. + RangeTree rest = ranges("128xx", "238xx", "348xx", "458xx", "568xx"); + RangeTree dfa = rest.union(subgraph); + + assertRegex(basic(), dfa, + "12(?:3(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", + "23(?:4(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", + "34(?:5(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", + "45(?:6(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)|", + "56(?:7(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])|8\\d\\d)"); + + assertRegex(basic().withSubgroupOptimization(), dfa, + "(?:12|23|34|45|56)8\\d\\d|", + "(?:123|234|345|456|567)(?:[02-8]|1(?:0[1-9]|[1-9])|9[0-47-9])"); + } + + @Test + public void testRegression_bug_65250963() { + RangeTree dfa = ranges( + "1387", + "1697", + "1524", + "1539", + "1768", + "1946"); + assertRegex(basic(), dfa, + "1(?:", + " (?:", + " 38|", + " 69", + " )7|", + " 5(?:", + " 24|", + " 39", + " )|", + " 768|", + " 946", + ")"); + } + + @Test + public void testRegression_bug_68929642() { + assertMatches( + "1\\d{6}(?:\\d{2})?", + ImmutableList.of("1234567", "123456789"), + ImmutableList.of("12345678"), + 
"1xxx_xxx", "1xx_xxx_xxx"); + + assertMatches( + "1\\d{6}[0-7]?", + ImmutableList.of("1234567", "12345670"), + ImmutableList.of("123456", "123456700"), + "1xxx_xxx", "1x_xxx_xx[0-7]"); + + assertMatches( + "\\d\\d?", + ImmutableList.of("1", "12"), + ImmutableList.of("", "123"), + "x", "xx"); + + assertMatches( + "\\d{1,3}", + ImmutableList.of("1", "12", "123"), + ImmutableList.of("", "1234"), + "x", "xx", "xxx"); + + assertMatches( + "\\d(?:\\d{3}(?:\\d{2})?)?", + ImmutableList.of("1", "1234", "123456"), + ImmutableList.of("", "12", "123", "12345", "1234567"), + "x", "xxxx", "xxx_xxx"); + + assertMatches( + "(?:\\d\\d(?:\\d(?:\\d{2,4})?)?)?", + ImmutableList.of("", "12", "123", "12345", "123456", "1234567"), + ImmutableList.of("1", "1234", "12345678"), + "", "xx", "xxx", "xx_xxx", "xxx_xxx", "xxxx_xxx"); + + assertMatches( + "(?:\\d{2})?", + ImmutableList.of("", "12"), + ImmutableList.of("1", "123"), + "", "xx"); + + assertMatches( + "\\d?", + ImmutableList.of("", "1"), + ImmutableList.of("12"), + "", "x"); + } + + // This does not check that the generated regex is the same as the input, but it does test some + // positive/negative matching cases against both and verifies that the DFA for both are equal. + private static void assertMatches( + String pattern, List matchNumbers, List noMatchNumbers, String... specs) { + String regex = basic().toRegex(ranges(specs)); + assertThat(regex).isEqualTo(pattern); + + // Test the given positive/negative match numbers and expect the same behaviour from both. + for (String number : matchNumbers) { + assertThat(number).matches(pattern); + assertThat(number).matches(regex); + } + for (String number : noMatchNumbers) { + assertThat(number).doesNotMatch(pattern); + assertThat(number).doesNotMatch(regex); + } + } + + private static void assertRegex(RegexGenerator generator, RangeTree dfa, String... 
lines) { + String regex = generator.toRegex(dfa); + String expected = Arrays.stream(lines).map(whitespace()::removeFrom).collect(joining()); + assertThat(regex).isEqualTo(expected); + } + + private static RangeTree ranges(String... specs) { + return RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/SubgraphOptimizerTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/SubgraphOptimizerTest.java new file mode 100644 index 0000000000..46e9f94cd0 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/SubgraphOptimizerTest.java @@ -0,0 +1,80 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.common.truth.Truth8.assertThat; + +import com.google.i18n.phonenumbers.metadata.RangeSpecification; +import com.google.i18n.phonenumbers.metadata.RangeTree; +import com.google.i18n.phonenumbers.metadata.RangeTree.DfaNode; +import com.google.i18n.phonenumbers.metadata.regex.SubgroupOptimizer.LinkNodeVisitor; +import java.util.Arrays; +import java.util.Optional; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class SubgraphOptimizerTest { + // The subgraph of "everything except 95, 96 and 100" (this appears in China leading digits). + // Note that unlike China, there's also an early terminating '9' in the subgraph to ensure that + // the entire subgraph is extracted (including terminating node). + private static final RangeTree POSTGRAPH = ranges("[02-9]", "1[1-9]", "10[1-9]", "9[0-47-9]"); + + // Some prefixes which come before the subgraph. + private static final RangeTree PREGRAPH = ranges("123", "234", "345", "456", "567"); + + // Cross product of pre and post paths. + private static final RangeTree SUBGRAPH = RangeTree.from( + PREGRAPH.asRangeSpecifications().stream() + .flatMap(a -> POSTGRAPH.asRangeSpecifications().stream().map(a::extendBy))); + + // Additional paths which share edges in the subgraph and will cause repetition in regular + // expressions. Also add a couple of early terminating paths "on the way to" the subgroup. + // Note however that a terminating path that reaches the root of the subgraph (e.g. "123") will + // cause a split in the DFA at the root node (one terminating, one not terminating). 
+ private static final RangeTree TEST_RANGES = + SUBGRAPH.union(ranges("128xx", "238xx", "348xx", "458xx", "568xx", "12", "34")); + + @Test + public void testSubgraphWeightAndInOrder() { + LinkNodeVisitor v = new LinkNodeVisitor(); + TEST_RANGES.accept(v); + DfaNode n = v.getHighestCostNode(); + assertThat(n).isNotNull(); + // 5 paths in PREGRAPH which reach the root of POSTGRAPH. + assertThat(v.getInOrder(n)).isEqualTo(5); + // 7 edges in POSTGRAPH with a total weight of 27: + // "[02-8]" = 6, "1", "0", "9" = 3, 2 x "[1-9]" = 10, "[0-47-9]" = 8 + assertThat(v.getSubgraphWeight(n)).isEqualTo(27); + } + + @Test + public void testSubgraphExtraction() { + Optional extracted = SubgroupOptimizer.extractRepeatingSubgraph(TEST_RANGES); + assertThat(extracted).hasValue(SUBGRAPH); + // The "bridge" node is the same, so we extract the whole graph (so we return nothing). + assertThat(SubgroupOptimizer.extractRepeatingSubgraph(SUBGRAPH)).isEmpty(); + // There's no repetition in this graph, so return nothing. + assertThat(SubgroupOptimizer.extractRepeatingSubgraph(ranges("123", "234", "345"))).isEmpty(); + } + + private static RangeTree ranges(String... specs) { + return RangeTree.from(Arrays.stream(specs).map(RangeSpecification::parse)); + } +} diff --git a/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizerTest.java b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizerTest.java new file mode 100644 index 0000000000..0884d87f35 --- /dev/null +++ b/metadata/src/test/java/com/google/i18n/phonenumbers/metadata/regex/TrailingPathOptimizerTest.java @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.i18n.phonenumbers.metadata.regex; + +import static com.google.common.truth.Truth.assertThat; +import static com.google.i18n.phonenumbers.metadata.regex.Node.INITIAL; +import static com.google.i18n.phonenumbers.metadata.regex.Node.TERMINAL; + +import com.google.common.graph.ValueGraph; +import com.google.i18n.phonenumbers.metadata.regex.Edge.SimpleEdge; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.JUnit4; + +@RunWith(JUnit4.class) +public class TrailingPathOptimizerTest { + @Test + public void testSimple() { + NfaBuilder nfa = new NfaBuilder(); + nfa.addPath(INITIAL, TERMINAL, "12xx"); + nfa.addPath(INITIAL, TERMINAL, "34xxxx"); + ValueGraph actual = TrailingPathOptimizer.optimize(nfa.graph()); + + // Expect the common trailing "xx" to be factored out at some new join point. + NfaBuilder expected = new NfaBuilder(); + Node join = expected.addPath(INITIAL, "12"); + expected.addPath(INITIAL, join, "34xx"); + expected.addPath(join, TERMINAL, "xx"); + + assertEquivalent(actual, expected); + } + + @Test + public void testTrailingOptionalGroup() { + NfaBuilder nfa = new NfaBuilder(); + nfa.addPath(INITIAL, TERMINAL, "12xx"); + // Add path "34xx(xx)?" + Node optStart = nfa.addPath(INITIAL, "34xx"); + nfa.addOptionalPath(optStart, TERMINAL, "xx"); + + ValueGraph actual = TrailingPathOptimizer.optimize(nfa.graph()); + + // Expect the common trailing "xx" to be factored out at some new join point. 
+ NfaBuilder expected = new NfaBuilder(); + Node join = expected.addPath(INITIAL, "12"); + // Add "34(xx)?" up to the joining node. + optStart = expected.addPath(INITIAL, "34"); + expected.addOptionalPath(optStart, join, "xx"); + // Add the trailing "xx". + expected.addPath(join, TERMINAL, "xx"); + + assertEquivalent(actual, expected); + } + + @Test + public void testDoubleRecursion() { + NfaBuilder nfa = new NfaBuilder(); + nfa.addPath(INITIAL, TERMINAL, "12xx"); + nfa.addPath(INITIAL, TERMINAL, "34xxxx"); + // Add path "56xxxx(xx)?" + Node optStart = nfa.addPath(INITIAL, "56xxxx"); + nfa.addOptionalPath(optStart, TERMINAL, "xx"); + ValueGraph actual = TrailingPathOptimizer.optimize(nfa.graph()); + + // Factoring should be applied twice to pull out 2 lots of "xx". + // How I wish we had a way to embed proper graphs in JavaDoc! + // + // ,-----------12-----------v + // (I)------34----->(1)--xx-->(2)--xx-->(T) + // `-56-->()--xx--^ + // `--e---^ + // + NfaBuilder expected = new NfaBuilder(); + Node secondJoin = expected.addPath(INITIAL, "12"); + expected.addPath(secondJoin, TERMINAL, "xx"); + Node firstJoin = expected.addPath(INITIAL, "34"); + expected.addPath(firstJoin, secondJoin, "xx"); + optStart = expected.addPath(INITIAL, "56"); + expected.addOptionalPath(optStart, firstJoin, "xx"); + + assertEquivalent(actual, expected); + } + + @Test + public void testNoChangeIfNoCommonFactor() { + NfaBuilder nfa = new NfaBuilder(); + nfa.addPath(INITIAL, TERMINAL, "12xxxxxx"); + // Add path "34xxx(xx)?" which, while it shares 'xxx' with '12xxxxxx', will not be factored + // because splitting out 'xxx' would make the resulting regular expression longer + // (e.g. "(?:34\d{2}?|12\d{3})\d{3}" is longer than "34\d{2}?\d{3}|12\d{6}"). 
+ // + // Note that there are some cases in which this isn't true (shorter sequences like 'x' might be + // splittable without cost, but they are unlikely to ever make the expression shorter, + // especially if they result in adding new parentheses for grouping). + Node optStart = nfa.addPath(INITIAL, "34xxx"); + nfa.addOptionalPath(optStart, TERMINAL, "xx"); + + ValueGraph actual = TrailingPathOptimizer.optimize(nfa.graph()); + assertEquivalent(actual, nfa); + } + + private static void assertEquivalent(ValueGraph actual, NfaBuilder expected) { + // This is a somewhat cheeky way to test graph isomorphism and relies on the fact that graph + // flattening is deterministic according to how edges sort and doesn't care about node values. + // It also, obviously, relies on the flattening code to be vaguely well tested. + assertThat(NfaFlattener.flatten(actual)).isEqualTo(NfaFlattener.flatten(expected.graph())); + } +} diff --git a/metadata/src/test/proto/regression_test.proto b/metadata/src/test/proto/regression_test.proto new file mode 100644 index 0000000000..c2a460b830 --- /dev/null +++ b/metadata/src/test/proto/regression_test.proto @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2017 The Libphonenumber Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = "proto3"; + +package i18n.phonenumbers.internal.finitestatematcher.compiler; + +option java_package = "com.google.i18n.phonenumbers.internal.finitestatematcher.compiler"; +option java_outer_classname = "RegressionTestProto"; + +// A set of regression tests. +message Tests { + repeated TestCase test_case = 1; +} + +// A single regression test entry. +message TestCase { + // A name for the test, ideally unique. + string name = 1; + // If set true, expect that the test will fail 100% of the time. This is + // useful to test that test numbers have enough coverage to force a failure + // and is typically achieved by modifying an input range after generating a + // passing test (or carefully modifying the output bytecodes). Note that not + // all changes will make a test fail 100% of the time, so care must be taken + // to avoid creating a flaky test (e.g. don't change a "[0-3]" to "[0-5]", as + // this only fails if the test number contains a 4 or 5 at the corresponding + // index, change it to "[4-6]" so there's no overlap and at least one test + // number that's valid for that range will not be accepted by the matcher). + bool should_fail = 2; + // The input ranges (in the form of range specifications) which form the DFA + // to be tested (e.g. "1[2-5]678xxxxx" etc...). + repeated string range = 3; + // The expected output bytes, encoded in test files using C-style hex notation + // (i.e. \xHH). This can be split over multiple lines for readability. + repeated bytes expected = 4; +}