From dc6f9eb3df3e58c3a6ceb00092584c2acc5e767e Mon Sep 17 00:00:00 2001
From: rui-mo
Date: Fri, 4 Aug 2023 13:18:58 +0000
Subject: [PATCH] add regular chars

Allow callers to pass a list of characters that the Tokenizer treats as
ordinary path-segment characters instead of separators, so that a column
name such as "city.id" is tokenized as a single nested field when '.' is
listed. The list is empty by default, which keeps the existing behavior.

The loop condition in Tokenizer::matchPathSegment parenthesizes the '||'
group so that hasNextCharacter() guards both alternatives.

---
 velox/type/Subfield.cpp           |  6 ++++--
 velox/type/Subfield.h             |  6 +++++-
 velox/type/Tokenizer.cpp          | 22 ++++++++++++++++++++--
 velox/type/Tokenizer.h            | 11 ++++++++++-
 velox/type/tests/SubfieldTest.cpp | 13 +++++++++----
 5 files changed, 48 insertions(+), 10 deletions(-)

diff --git a/velox/type/Subfield.cpp b/velox/type/Subfield.cpp
index 2df88b5a3a73..8419577e1a1a 100644
--- a/velox/type/Subfield.cpp
+++ b/velox/type/Subfield.cpp
@@ -18,8 +18,10 @@
 
 namespace facebook::velox::common {
 
-Subfield::Subfield(const std::string& path) {
-  Tokenizer tokenizer(path);
+Subfield::Subfield(
+    const std::string& path,
+    const std::vector<char>& regularChars) {
+  Tokenizer tokenizer(path, regularChars);
   VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);
 
   auto firstElement = tokenizer.next();
diff --git a/velox/type/Subfield.h b/velox/type/Subfield.h
index 407c38f060e3..3404242289ba 100644
--- a/velox/type/Subfield.h
+++ b/velox/type/Subfield.h
@@ -191,7 +191,11 @@ class Subfield {
   };
 
  public:
-  explicit Subfield(const std::string& path);
+  // Builds a Subfield from 'path' using a Tokenizer. 'regularChars' is
+  // forwarded to that Tokenizer unchanged.
+  explicit Subfield(
+      const std::string& path,
+      const std::vector<char>& regularChars = {});
 
   explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);
 
diff --git a/velox/type/Tokenizer.cpp b/velox/type/Tokenizer.cpp
index 53c4d9d595b2..e39d362df8b1 100644
--- a/velox/type/Tokenizer.cpp
+++ b/velox/type/Tokenizer.cpp
@@ -17,7 +17,10 @@
 
 namespace facebook::velox::common {
 
-Tokenizer::Tokenizer(const std::string& path) : path_(path) {
+Tokenizer::Tokenizer(
+    const std::string& path,
+    const std::vector<char>& regularChars)
+    : path_(path), regularChars_(regularChars) {
   state = State::kNotReady;
   index_ = 0;
 }
@@ -87,6 +90,11 @@ void Tokenizer::match(char expected) {
 }
 
 bool Tokenizer::tryMatch(char expected) {
+  // A character configured as regular is never matched here, so it is left
+  // in the input for matchPathSegment() to consume.
+  if (treatAsRegularCharacter(expected)) {
+    return false;
+  }
   if (!hasNextCharacter() || peekCharacter() != expected) {
     return false;
   }
@@ -105,7 +113,11 @@ char Tokenizer::peekCharacter() {
 std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
   // seek until we see a special character or whitespace
   int start = index_;
-  while (hasNextCharacter() && isUnquotedPathCharacter(peekCharacter())) {
+  // The bounds check guards both alternatives: '&&' binds tighter than
+  // '||', so the '||' group needs its own parentheses.
+  while (hasNextCharacter() &&
+         (treatAsRegularCharacter(peekCharacter()) ||
+          isUnquotedPathCharacter(peekCharacter()))) {
     nextCharacter();
   }
   int end = index_;
@@ -143,6 +155,12 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
   return std::make_unique<Subfield::LongSubscript>(index);
 }
 
+// Returns true if 'c' is one of the characters in 'regularChars_'.
+bool Tokenizer::treatAsRegularCharacter(char c) {
+  return std::find(regularChars_.begin(), regularChars_.end(), c) !=
+      regularChars_.end();
+}
+
 bool Tokenizer::isUnquotedPathCharacter(char c) {
   return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' ||
       c == '#' || isUnquotedSubscriptCharacter(c);
diff --git a/velox/type/Tokenizer.h b/velox/type/Tokenizer.h
index 56380f0aead2..ef4db8b37755 100644
--- a/velox/type/Tokenizer.h
+++ b/velox/type/Tokenizer.h
@@ -35,7 +35,11 @@ class Tokenizer {
     kFailed,
   };
 
-  explicit Tokenizer(const std::string& path);
+  // 'regularChars' lists characters that are tokenized as ordinary
+  // path-segment characters instead of separators. Empty by default.
+  explicit Tokenizer(
+      const std::string& path,
+      const std::vector<char>& regularChars = {});
 
   bool hasNext();
 
@@ -51,6 +55,8 @@ class Tokenizer {
   const char UNICODE_CARET = '^';
 
   const std::string path_;
+  // Characters tokenized as regular path-segment characters.
+  std::vector<char> regularChars_;
   int index_;
   State state;
   bool firstSegment = true;
@@ -74,6 +80,9 @@ class Tokenizer {
 
   bool tryToComputeNext();
 
+  // Returns true if 'c' is listed in 'regularChars_'.
+  bool treatAsRegularCharacter(char c);
+
   void invalidSubfieldPath();
 
   bool isUnquotedPathCharacter(char c);
diff --git a/velox/type/tests/SubfieldTest.cpp b/velox/type/tests/SubfieldTest.cpp
index 91252ae134dd..7e9916b57291 100644
--- a/velox/type/tests/SubfieldTest.cpp
+++ b/velox/type/tests/SubfieldTest.cpp
@@ -20,9 +20,10 @@
 using namespace facebook::velox::common;
 
 std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
-    const std::string& path) {
+    const std::string& path,
+    const std::vector<char>& regularChars = {}) {
   std::vector<std::unique_ptr<Subfield::PathElement>> elements;
-  Tokenizer tokenizer(path);
+  Tokenizer tokenizer(path, regularChars);
   while (tokenizer.hasNext()) {
     elements.push_back(tokenizer.next());
   }
@@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) {
   assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
 }
 
-void testColumnName(const std::string& name) {
-  auto elements = tokenize(name);
+void testColumnName(
+    const std::string& name,
+    const std::vector<char>& regularChars = {}) {
+  auto elements = tokenize(name, regularChars);
   EXPECT_EQ(elements.size(), 1);
   EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
 }
@@ -59,6 +62,8 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
   testColumnName("a/b/c:12");
   testColumnName("@basis");
   testColumnName("@basis|city_id");
+  std::vector<char> regularChars = {'.'};
+  testColumnName("city.id", regularChars);
 }
 
 std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {