Skip to content

Commit

Permalink
add regular chars
Browse files Browse the repository at this point in the history
  • Loading branch information
rui-mo committed Aug 7, 2023
1 parent 5f28e65 commit dc6f9eb
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 10 deletions.
6 changes: 4 additions & 2 deletions velox/type/Subfield.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

namespace facebook::velox::common {

Subfield::Subfield(const std::string& path) {
Tokenizer tokenizer(path);
Subfield::Subfield(
const std::string& path,
const std::vector<char>& regularChars) {
Tokenizer tokenizer(path, regularChars);
VELOX_CHECK(tokenizer.hasNext(), "Column name is missing: {}", path);

auto firstElement = tokenizer.next();
Expand Down
4 changes: 3 additions & 1 deletion velox/type/Subfield.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,9 @@ class Subfield {
};

public:
explicit Subfield(const std::string& path);
/// Builds a Subfield by tokenizing 'path'. 'regularChars' is forwarded
/// unchanged to the Tokenizer that parses the path; it defaults to an empty
/// list. Fails a VELOX_CHECK ("Column name is missing") when the tokenizer
/// yields no element for 'path'.
explicit Subfield(
const std::string& path,
const std::vector<char>& regularChars = {});

explicit Subfield(std::vector<std::unique_ptr<PathElement>>&& path);

Expand Down
16 changes: 14 additions & 2 deletions velox/type/Tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,10 @@

namespace facebook::velox::common {

Tokenizer::Tokenizer(const std::string& path) : path_(path) {
// Creates a tokenizer positioned at the start of 'path'. 'regularChars' is
// copied and later consulted to decide which characters are consumed as part
// of a path segment instead of being treated as separators. All state is set
// up in the member-initializer list, in member declaration order.
Tokenizer::Tokenizer(
    const std::string& path,
    const std::vector<char>& regularChars)
    : path_(path),
      regularChars_(regularChars),
      index_(0),
      state(State::kNotReady) {}
Expand Down Expand Up @@ -87,6 +90,9 @@ void Tokenizer::match(char expected) {
}

bool Tokenizer::tryMatch(char expected) {
if (treatAsRegularCharacter(expected)) {
return false;
}
if (!hasNextCharacter() || peekCharacter() != expected) {
return false;
}
Expand All @@ -105,7 +111,8 @@ char Tokenizer::peekCharacter() {
std::unique_ptr<Subfield::PathElement> Tokenizer::matchPathSegment() {
// seek until we see a special character or whitespace
int start = index_;
while (hasNextCharacter() && isUnquotedPathCharacter(peekCharacter())) {
// Consume characters while input remains AND the next character is either a
// caller-registered regular character or a built-in unquoted path character.
// The explicit parentheses matter: '&&' binds tighter than '||', so without
// them the second predicate would peek even after the end of the path.
while (hasNextCharacter() &&
       (treatAsRegularCharacter(peekCharacter()) ||
        isUnquotedPathCharacter(peekCharacter()))) {
  nextCharacter();
}
int end = index_;
Expand Down Expand Up @@ -143,6 +150,11 @@ std::unique_ptr<Subfield::PathElement> Tokenizer::matchUnquotedSubscript() {
return std::make_unique<Subfield::LongSubscript>(index);
}

bool Tokenizer::treatAsRegularCharacter(char c) {
return std::find(regularChars_.begin(), regularChars_.end(), c) !=
regularChars_.end();
}

bool Tokenizer::isUnquotedPathCharacter(char c) {
return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' ||
c == '#' || isUnquotedSubscriptCharacter(c);
Expand Down
7 changes: 6 additions & 1 deletion velox/type/Tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ class Tokenizer {
kFailed,
};

explicit Tokenizer(const std::string& path);
/// Creates a tokenizer over 'path'. 'regularChars' lists characters that
/// are consumed as part of an unquoted path segment and are never matched
/// as separators by tryMatch(). Defaults to an empty list.
explicit Tokenizer(
const std::string& path,
const std::vector<char>& regularChars = {});

bool hasNext();

Expand All @@ -51,6 +53,7 @@ class Tokenizer {
const char UNICODE_CARET = '^';

const std::string path_;
std::vector<char> regularChars_;
int index_;
State state;
bool firstSegment = true;
Expand All @@ -74,6 +77,8 @@ class Tokenizer {

bool tryToComputeNext();

/// Returns true if 'c' is contained in regularChars_.
bool treatAsRegularCharacter(char c);

void invalidSubfieldPath();

bool isUnquotedPathCharacter(char c);
Expand Down
13 changes: 9 additions & 4 deletions velox/type/tests/SubfieldTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@
using namespace facebook::velox::common;

std::vector<std::unique_ptr<Subfield::PathElement>> tokenize(
const std::string& path) {
const std::string& path,
const std::vector<char>& regularChars = {}) {
std::vector<std::unique_ptr<Subfield::PathElement>> elements;
Tokenizer tokenizer(path);
Tokenizer tokenizer(path, regularChars);
while (tokenizer.hasNext()) {
elements.push_back(tokenizer.next());
}
Expand All @@ -47,8 +48,10 @@ TEST(SubfieldTest, invalidPaths) {
assertInvalidSubfield("a[2].[3].", "Invalid subfield path: a[2].^[3].");
}

void testColumnName(const std::string& name) {
auto elements = tokenize(name);
void testColumnName(
const std::string& name,
const std::vector<char>& regularChars = {}) {
auto elements = tokenize(name, regularChars);
EXPECT_EQ(elements.size(), 1);
EXPECT_EQ(*elements[0].get(), Subfield::NestedField(name));
}
Expand All @@ -59,6 +62,8 @@ TEST(SubfieldTest, columnNamesWithSpecialCharacters) {
testColumnName("a/b/c:12");
testColumnName("@basis");
testColumnName("@basis|city_id");
std::vector<char> regularChars = {'.'};
testColumnName("city.id", regularChars);
}

std::vector<std::unique_ptr<Subfield::PathElement>> createElements() {
Expand Down

0 comments on commit dc6f9eb

Please sign in to comment.