-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Project import generated by Copybara. (#2890)
PiperOrigin-RevId: 509849832
- Loading branch information
Showing
45 changed files
with
6,559 additions
and
111 deletions.
There are no files selected for viewing
71 changes: 71 additions & 0 deletions
71
metadata/src/main/java/com/google/i18n/phonenumbers/metadata/LengthsParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
/* | ||
* Copyright (C) 2022 The Libphonenumber Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.google.i18n.phonenumbers.metadata; | ||
|
||
import static com.google.common.base.CharMatcher.whitespace; | ||
import static com.google.common.base.Preconditions.checkArgument; | ||
import static java.lang.Integer.parseUnsignedInt; | ||
|
||
import com.google.common.base.CharMatcher; | ||
import com.google.common.base.Splitter; | ||
import com.google.common.collect.ContiguousSet; | ||
import com.google.common.collect.ImmutableSortedSet; | ||
import java.util.List; | ||
import java.util.NavigableSet; | ||
import java.util.TreeSet; | ||
|
||
/** Parses strings of form "4,7-9,11" which are used as length specifiers across LPN metadata */ | ||
public final class LengthsParser { | ||
|
||
private static final Splitter COMMA_SPLITTER = Splitter.on(',').trimResults(whitespace()); | ||
private static final Splitter RANGE_SPLITTER = | ||
Splitter.on('-').trimResults(whitespace()).limit(2); | ||
private static final CharMatcher ALLOWED_CHARACTERS = | ||
CharMatcher.inRange('0', '9').or(CharMatcher.anyOf("-,")).or(whitespace()); | ||
|
||
/** Returns the set of integers specified by this string. */ | ||
public static ImmutableSortedSet<Integer> parseLengths(String s) { | ||
checkArgument( | ||
ALLOWED_CHARACTERS.matchesAllOf(s), | ||
"Length specifier contains forbidden characters: %s", | ||
s); | ||
NavigableSet<Integer> lengths = new TreeSet<>(); | ||
for (String lengthOrRange : COMMA_SPLITTER.split(s)) { | ||
if (lengthOrRange.contains("-")) { | ||
List<String> lohi = RANGE_SPLITTER.splitToList(lengthOrRange); | ||
int lo = parseUnsignedInt(lohi.get(0)); | ||
int hi = parseUnsignedInt(lohi.get(1)); | ||
checkArgument(lo < hi, "Invalid range: %s-%s", lo, hi); | ||
checkArgument( | ||
lengths.isEmpty() || lo > lengths.last(), | ||
"Numbers in length specifier are out of order: %s", | ||
s); | ||
lengths.addAll(ContiguousSet.closed(lo, hi)); | ||
} else { | ||
int length = parseUnsignedInt(lengthOrRange); | ||
checkArgument( | ||
lengths.isEmpty() || length > lengths.last(), | ||
"Numbers in length specifier are out of order: %s", | ||
s); | ||
lengths.add(length); | ||
} | ||
} | ||
return ImmutableSortedSet.copyOf(lengths); | ||
} | ||
|
||
private LengthsParser() {} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
317 changes: 317 additions & 0 deletions
317
...n/java/com/google/i18n/phonenumbers/metadata/finitestatematcher/DigitSequenceMatcher.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,317 @@ | ||
/* | ||
* Copyright (C) 2017 The Libphonenumber Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.google.i18n.phonenumbers.metadata.finitestatematcher; | ||
|
||
import com.google.i18n.phonenumbers.metadata.finitestatematcher.OpCode.State; | ||
|
||
/** | ||
* Matches phone number regular expressions based on compact compiled data generated by | ||
* {@link com.google.i18n.phonenumbers.metadata.finitestatematcher.compiler.MatcherCompiler | ||
* MatcherCompiler}. Typically the phone number regular expression will be compiled at build time | ||
* and the resulting matcher data will be packaged into the binary which needs it, or downloaded at | ||
* run time. | ||
* <p> | ||
* This class is designed to be lightweight and fast, and should be simple to implement in many | ||
* different languages (C++, Python, JS, etc.). | ||
* | ||
* TODO: Consider UnisgnedBytes.toInt(x) to avoid lots of (x & 0xFF). | ||
*/ | ||
public abstract class DigitSequenceMatcher { | ||
|
||
/** Possible result types returned by a match operation. */ | ||
public enum Result { | ||
/** The match operation was a success and the input was matched. */ | ||
MATCHED, | ||
/** The match operation failed because unexpected input was encountered. */ | ||
INVALID, | ||
/** | ||
* The match operation failed because the input terminated too soon (ie, the input was a | ||
* valid prefix for the matcher). | ||
*/ | ||
TOO_SHORT, | ||
/** | ||
* The match operation failed due to the existence of additional input after matching had | ||
* completed (ie, the the input would have matched if it were shorter). | ||
*/ | ||
TOO_LONG; | ||
} | ||
|
||
/** An iterator of {@code int}, used to supply the matcher with a sequence of input digits. */ | ||
public interface DigitSequence { | ||
/** Returns true if there are more digits available. */ | ||
boolean hasNext(); | ||
|
||
/** | ||
* Return the next digit value (from 0 to 9 inclusive, not a char value). The matcher does not | ||
* test for invalid digits, so returning values outside this range will have undefined results, | ||
* including false positive results. | ||
*/ | ||
int next(); | ||
} | ||
|
||
/** Internal abstraction to allow matching over either byte arrays or strings. */ | ||
interface DataView { | ||
/** Return the unsigned byte value at the given offset from the current position. */ | ||
int peekByte(int offset); | ||
|
||
/** Return the unsigned byte value at the current position and move ahead 1 byte. */ | ||
int readByte(); | ||
|
||
/** Return the unsigned short value at the current position and move ahead 2 bytes. */ | ||
int readShort(); | ||
|
||
/** Return the unsigned int value at the current position and move ahead 4 bytes. */ | ||
int readInt(); | ||
|
||
/** Adjust the current position by the given (non-negative) offset. */ | ||
State branch(int offset); | ||
|
||
/** | ||
* Adjust the current position by the unsigned byte offset value read from the current | ||
* position plus the given index. This is used to implement maps and branching ranges. | ||
*/ | ||
State jumpTable(int index); | ||
} | ||
|
||
/** | ||
* Creates a new matcher which reads instructions directly from the given byte array. Typically | ||
* it is expected that this method will consume byte arrays packaged into a binary at build time | ||
* (the MatcherCompiler is not suitable for direct parsing of regular expressions at run time). | ||
* <p> | ||
* See {@code MatcherCompiler.compile(...)}. | ||
*/ | ||
public static DigitSequenceMatcher create(byte[] data) { | ||
if (data.length == 0) { | ||
throw new IllegalArgumentException("matcher data cannot be empty"); | ||
} | ||
return new ByteArrayMatcher(data); | ||
} | ||
|
||
/** | ||
* Creates a new matcher which reads instructions from the given string. Typically it is expected | ||
* that this method will be used when matcher data is packaged as literal Java string constants | ||
* in (auto-generated) source files. | ||
* <p> | ||
* See {@code MatcherCompiler.compileToUnquotedJavaSourceString(...)}. | ||
*/ | ||
public static DigitSequenceMatcher create(String data) { | ||
if (data.isEmpty()) { | ||
throw new IllegalArgumentException("matcher data cannot be empty"); | ||
} | ||
return new StringMatcher(data); | ||
} | ||
|
||
abstract DataView newDataView(); | ||
|
||
abstract int size(); | ||
|
||
/** Matches the input against this matcher, returning a result code. */ | ||
public Result match(DigitSequence in) { | ||
State state = runMatcher(in); | ||
switch (state) { | ||
case TERMINAL: | ||
return !in.hasNext() ? Result.MATCHED : Result.TOO_LONG; | ||
case TRUNCATED: | ||
return Result.TOO_SHORT; | ||
case INVALID: | ||
return Result.INVALID; | ||
default: | ||
throw new AssertionError("unexpected state: " + state); | ||
} | ||
} | ||
|
||
private State runMatcher(DigitSequence in) { | ||
DataView data = newDataView(); | ||
State state; | ||
do { | ||
state = OpCode.decode(data.peekByte(0)).execute(data, in); | ||
} while (state == State.CONTINUE); | ||
return state; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
int size = size(); | ||
StringBuilder out = new StringBuilder(size + " :: [ "); | ||
DataView data = newDataView(); | ||
while (size-- > 0) { | ||
out.append(Integer.toHexString(data.readByte())).append(", "); | ||
} | ||
out.setLength(out.length() - 2); | ||
out.append(" ]"); | ||
return out.toString(); | ||
} | ||
|
||
/** A matcher for reading instructions from a byte array. */ | ||
private static final class ByteArrayMatcher extends DigitSequenceMatcher { | ||
|
||
private class ByteArrayData implements DataView { | ||
int position = 0; | ||
|
||
@Override public int peekByte(int offset) { | ||
return bytes[position + offset] & 0xFF; | ||
} | ||
|
||
@Override public int readByte() { | ||
return bytes[position++] & 0xFF; | ||
} | ||
|
||
@Override public int readShort() { | ||
return (readByte() << 8) | readByte(); | ||
} | ||
|
||
@Override public int readInt() { | ||
return (readShort() << 16) | readShort(); | ||
} | ||
|
||
@Override public State branch(int offset) { | ||
position += offset; | ||
return offset != 0 ? State.CONTINUE : State.TERMINAL; | ||
} | ||
|
||
@Override public State jumpTable(int index) { | ||
return branch(peekByte(index)); | ||
} | ||
} | ||
|
||
private final byte[] bytes; | ||
|
||
private ByteArrayMatcher(byte[] data) { | ||
this.bytes = data; | ||
} | ||
|
||
@Override | ||
DataView newDataView() { | ||
return new ByteArrayData(); | ||
} | ||
|
||
@Override | ||
int size() { | ||
return bytes.length; | ||
} | ||
} | ||
|
||
/** A matcher for reading instructions from a String. */ | ||
private static final class StringMatcher extends DigitSequenceMatcher { | ||
|
||
/* | ||
* Note: Using unsigned shift "x >>> 1" is more likely to be free as part of a data load | ||
* instruction than "x / 2". | ||
*/ | ||
|
||
private class StringData implements DataView { | ||
int position = 0; | ||
|
||
@Override public int peekByte(int offset) { | ||
offset += position; | ||
int data = bytes.charAt(offset >>> 1); | ||
// char := hi [ even-byte | odd-byte ] lo | ||
return (offset & 1) != 0 ? data & 0xFF : data >>> 8; | ||
} | ||
|
||
@Override public int readByte() { | ||
int data = bytes.charAt(position >>> 1); | ||
// char := hi [ even-byte | odd-byte ] lo | ||
data = (position & 1) != 0 ? data & 0xFF : data >>> 8; | ||
position += 1; | ||
return data; | ||
} | ||
|
||
@Override public int readShort() { | ||
int data = bytes.charAt(position >>> 1); | ||
// Adding 2 early does not affect odd/even (but does reference next char). | ||
position += 2; | ||
if ((position & 1) != 0) { | ||
data = ((data & 0xFF) << 8) | (bytes.charAt(position >>> 1) >>> 8); | ||
} | ||
return data; | ||
} | ||
|
||
@Override public int readInt() { | ||
return (readShort() << 16) | readShort(); | ||
} | ||
|
||
@Override public State branch(int offset) { | ||
position += offset; | ||
return offset != 0 ? State.CONTINUE : State.TERMINAL; | ||
} | ||
|
||
@Override public State jumpTable(int index) { | ||
return branch(peekByte(index)); | ||
} | ||
} | ||
|
||
private final String bytes; | ||
|
||
private StringMatcher(String bytes) { | ||
this.bytes = bytes; | ||
} | ||
|
||
@Override | ||
DataView newDataView() { | ||
return new StringData(); | ||
} | ||
|
||
@Override | ||
int size() { | ||
int size = 2 * bytes.length(); | ||
if ((bytes.charAt(bytes.length() - 1) & 0xFF) == 0xFF) { | ||
size -= 1; | ||
} | ||
return size; | ||
} | ||
} | ||
|
||
/** An iterator of {@code int} that yields a sequence of input digits from a string. */ | ||
private static final class StringDigits implements DigitSequence { | ||
private final CharSequence number; | ||
private int n = 0; | ||
|
||
private StringDigits(CharSequence number) { | ||
this.number = number; | ||
} | ||
|
||
@Override public int next() { | ||
if (n < 0 || n >= number.length()) { | ||
throw new IndexOutOfBoundsException( | ||
"index '" + n + "' out of bounds for input: " + number); | ||
} | ||
char c = number.charAt(n); | ||
if (c < '0' || c > '9') { | ||
throw new IllegalArgumentException( | ||
"non-digit character '" + c + "' [" + ((int) c) + "] at index " + n + " in: " + number); | ||
} | ||
n++; | ||
return c - '0'; | ||
} | ||
|
||
@Override public boolean hasNext() { | ||
return n < number.length(); | ||
} | ||
} | ||
|
||
/** | ||
* Returns an instance of DigitSequence based on the input string. The input string may only | ||
* contain digits. | ||
*/ | ||
public static DigitSequence digitsFromString(CharSequence number) { | ||
return new StringDigits(number); | ||
} | ||
|
||
/** A matcher has no internal state and is just a factory for data specific implementations. */ | ||
private DigitSequenceMatcher() { } | ||
} |
Oops, something went wrong.