Skip to content

Commit

Permalink
ARROW-7288: [C++][Parquet] Don't use regular expression to parse appl…
Browse files Browse the repository at this point in the history
…ication version

std::regex provided by MinGW may take a long time with a Japanese locale on
Windows.

We could use std::regex, boost::regex or RE2 as the regular expression
engine for this, but RE2's syntax isn't compatible with the others. If
we supported all of them, we would need to maintain multiple regular
expressions, which increases maintenance cost. If we don't use regular
expressions, we don't need to think about regular expression engines, but
we do need to maintain a hand-written parser.
  • Loading branch information
kou committed Jan 30, 2021
1 parent f58f29d commit 0d959eb
Show file tree
Hide file tree
Showing 5 changed files with 392 additions and 65 deletions.
10 changes: 1 addition & 9 deletions cpp/cmake_modules/ThirdpartyToolchain.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -854,13 +854,6 @@ else()
set(THRIFT_REQUIRES_BOOST FALSE)
endif()

# Parquet requires boost only with gcc 4.8 (because of missing std::regex).
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
set(PARQUET_REQUIRES_BOOST TRUE)
else()
set(PARQUET_REQUIRES_BOOST FALSE)
endif()

# Compilers that don't support int128_t have a compile-time
# (header-only) dependency on Boost for int128_t.
if(ARROW_USE_UBSAN)
Expand All @@ -878,8 +871,7 @@ endif()
if(ARROW_BUILD_INTEGRATION
OR ARROW_BUILD_TESTS
OR (ARROW_FLIGHT AND ARROW_BUILD_BENCHMARKS)
OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS)
OR (ARROW_PARQUET AND PARQUET_REQUIRES_BOOST))
OR (ARROW_S3 AND ARROW_BUILD_BENCHMARKS))
set(ARROW_BOOST_REQUIRED TRUE)
set(ARROW_BOOST_REQUIRE_LIBRARY TRUE)
elseif(ARROW_GANDIVA
Expand Down
12 changes: 2 additions & 10 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,6 @@ else()
set(ARROW_LIBRARIES_FOR_STATIC_TESTS arrow_testing_shared arrow_shared)
endif()

set(PARQUET_BOOST_LINK_LIBS)

if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS "4.9")
add_definitions(-DPARQUET_USE_BOOST_REGEX)
list(APPEND PARQUET_BOOST_LINK_LIBS ${BOOST_REGEX_LIBRARY})
endif()

set(PARQUET_MIN_TEST_LIBS GTest::gtest_main GTest::gtest)

if(APPLE)
Expand Down Expand Up @@ -236,12 +229,11 @@ if(NOT PARQUET_MINIMAL_DEPENDENCY)

# These are libraries that we will link privately with parquet_shared (as they
# do not need to be linked transitively by other linkers)
set(PARQUET_SHARED_PRIVATE_LINK_LIBS ${PARQUET_BOOST_LINK_LIBS} thrift::thrift)
set(PARQUET_SHARED_PRIVATE_LINK_LIBS thrift::thrift)

# Link publicly with parquet_static (because internal users need to
# transitively link all dependencies)
set(PARQUET_STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS} ${PARQUET_BOOST_LINK_LIBS}
thrift::thrift)
set(PARQUET_STATIC_LINK_LIBS ${PARQUET_STATIC_LINK_LIBS} thrift::thrift)

# Although we don't link parquet_objlib against anything, we need it to depend
# on these libs as we may generate their headers via ExternalProject_Add
Expand Down
307 changes: 271 additions & 36 deletions cpp/src/parquet/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -949,43 +949,278 @@ ApplicationVersion::ApplicationVersion(std::string application, int major, int m
int patch)
: application_(std::move(application)), version{major, minor, patch, "", "", ""} {}

ApplicationVersion::ApplicationVersion(const std::string& created_by) {
// Use singletons to compile only once (ARROW-9863)
static regex app_regex{ApplicationVersion::APPLICATION_FORMAT};
static regex ver_regex{ApplicationVersion::VERSION_FORMAT};
smatch app_matches;
smatch ver_matches;

std::string created_by_lower = created_by;
std::transform(created_by_lower.begin(), created_by_lower.end(),
created_by_lower.begin(), ::tolower);

bool app_success = regex_match(created_by_lower, app_matches, app_regex);
bool ver_success = false;
std::string version_str;

if (app_success && app_matches.size() >= 4) {
// first match is the entire string. sub-matches start from second.
application_ = app_matches[1];
version_str = app_matches[3];
build_ = app_matches[4];
ver_success = regex_match(version_str, ver_matches, ver_regex);
} else {
application_ = "unknown";
}

if (ver_success && ver_matches.size() >= 7) {
version.major = atoi(ver_matches[1].str().c_str());
version.minor = atoi(ver_matches[2].str().c_str());
version.patch = atoi(ver_matches[3].str().c_str());
version.unknown = ver_matches[4].str();
version.pre_release = ver_matches[5].str();
version.build_info = ver_matches[6].str();
} else {
version.major = 0;
version.minor = 0;
version.patch = 0;
namespace {
// Parse the application version format and set parsed values to
// ApplicationVersion.
//
// The application version format:
// "${APPLICATION_NAME}"
// "${APPLICATION_NAME} version ${VERSION}"
// "${APPLICATION_NAME} version ${VERSION} (build ${BUILD_NAME})"
//
// Eg:
// parquet-cpp
// parquet-cpp version 1.5.0ab-xyz5.5.0+cd
// parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
//
// The VERSION format:
// "${MAJOR}.${MINOR}.${PATCH}"
// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}"
// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}"
// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}-${PRE_RELEASE}+${BUILD_INFO}"
// "${MAJOR}.${MINOR}.${PATCH}${UNKNOWN}+${BUILD_INFO}"
// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}"
// "${MAJOR}.${MINOR}.${PATCH}-${PRE_RELEASE}+${BUILD_INFO}"
// "${MAJOR}.${MINOR}.${PATCH}+${BUILD_INFO}"
//
// Eg:
// 1.5.0
// 1.5.0ab
// 1.5.0ab-cdh5.5.0
// 1.5.0ab-cdh5.5.0+cd
// 1.5.0ab+cd
// 1.5.0-cdh5.5.0
// 1.5.0-cdh5.5.0+cd
// 1.5.0+cd
class ApplicationVersionParser {
 public:
  // Binds the parser to the string to parse and to the object that receives
  // the parsed fields. Both are stored as references, so they must stay alive
  // for as long as the parser is used.
  ApplicationVersionParser(const std::string& created_by,
                           ApplicationVersion& application_version)
      : created_by_(created_by),
        application_version_(application_version),
        digit_("0123456789") {}

  // Parses `created_by_` and writes the result into `application_version_`.
  //
  // The target is first reset to application "unknown" and version 0.0.0
  // with empty suffixes. Components are then written one by one as they are
  // parsed. Parsing stops at the first missing or malformed component;
  // components that were already written keep their values.
  void Parse() {
    application_version_.application_ = "unknown";
    application_version_.version = {0, 0, 0, "", "", ""};

    if (!ParseApplicationName()) {
      return;
    }
    if (!ParseVersion()) {
      return;
    }
    // Last step: nothing depends on whether a BUILD_NAME was found.
    ParseBuildName();
  }

 private:
  // Advances `start` past any ' ' characters, never moving beyond `end`.
  void RemovePrecedingSpaces(const std::string& string, size_t& start,
                             const size_t& end) {
    while (start < end && string[start] == ' ') {
      ++start;
    }
  }

  // Moves `end` backwards over trailing ' ' characters, never reaching
  // `start`. The `(end - 1) < string.size()` test also rejects `end == 0`,
  // where the unsigned `end - 1` wraps around.
  void RemoveTrailingSpaces(const std::string& string, const size_t& start, size_t& end) {
    while (start < (end - 1) && (end - 1) < string.size() && string[end - 1] == ' ') {
      --end;
    }
  }

  // Extracts APPLICATION_NAME: everything before the first " version " marker
  // (or the whole string when there is no marker), with surrounding spaces
  // trimmed. Records where VERSION starts, or npos when there is no marker.
  bool ParseApplicationName() {
    const std::string version_mark(" version ");
    const auto version_mark_position = created_by_.find(version_mark);
    size_t application_name_end;
    // No VERSION and BUILD_NAME.
    if (version_mark_position == std::string::npos) {
      version_start_ = std::string::npos;
      application_name_end = created_by_.size();
    } else {
      version_start_ = version_mark_position + version_mark.size();
      application_name_end = version_mark_position;
    }

    size_t application_name_start = 0;
    RemovePrecedingSpaces(created_by_, application_name_start, application_name_end);
    RemoveTrailingSpaces(created_by_, application_name_start, application_name_end);
    application_version_.application_ = created_by_.substr(
        application_name_start, application_name_end - application_name_start);

    return true;
  }

  // Locates VERSION (the text between " version " and the next " (" or the
  // end of the input) and parses its components in order.
  // Returns false when VERSION is absent or malformed.
  bool ParseVersion() {
    // No VERSION. BUILD_NAME is only defined after a VERSION, and
    // `version_end_` (the offset ParseBuildName() searches from) has not been
    // computed, so parsing must stop here.
    if (version_start_ == std::string::npos) {
      return false;
    }

    RemovePrecedingSpaces(created_by_, version_start_, created_by_.size());
    version_end_ = created_by_.find(" (", version_start_);
    // No BUILD_NAME.
    if (version_end_ == std::string::npos) {
      version_end_ = created_by_.size();
    }
    RemoveTrailingSpaces(created_by_, version_start_, version_end_);
    // No VERSION.
    if (version_start_ == version_end_) {
      return false;
    }
    version_string_ = created_by_.substr(version_start_, version_end_ - version_start_);

    // && short-circuits, so the first failing component stops the chain.
    return ParseVersionMajor() && ParseVersionMinor() && ParseVersionPatch() &&
           ParseVersionUnknown() && ParseVersionPreRelease() && ParseVersionBuildInfo();
  }

  // Parses the run of digits that starts at `version_parsing_position_`.
  //
  // With `dot_required`, the digits must be followed by '.', and the cursor is
  // left just after that '.'. Without it, the digits may run to the end of
  // VERSION, and the cursor is left just after the last digit.
  // Returns false (leaving `*value` and the cursor untouched) when there are
  // no digits or a required '.' is missing.
  bool ParseVersionNumber(bool dot_required, int* value) {
    const size_t number_start = version_parsing_position_;
    size_t number_end = version_string_.find_first_not_of(digit_, number_start);
    if (number_end == std::string::npos) {
      // Digits (if any) run to the end of VERSION.
      if (dot_required) {
        return false;
      }
      number_end = version_string_.size();
    } else if (dot_required && version_string_[number_end] != '.') {
      return false;
    }
    // No digits at all.
    if (number_end == number_start) {
      return false;
    }
    *value = atoi(version_string_.substr(number_start, number_end - number_start).c_str());
    version_parsing_position_ = dot_required ? number_end + 1 : number_end;
    return true;
  }

  // Parses "${MAJOR}." from the beginning of VERSION.
  bool ParseVersionMajor() {
    version_parsing_position_ = 0;
    int number = 0;
    if (!ParseVersionNumber(/*dot_required=*/true, &number)) {
      return false;
    }
    application_version_.version.major = number;
    return true;
  }

  // Parses "${MINOR}." right after MAJOR.
  bool ParseVersionMinor() {
    int number = 0;
    if (!ParseVersionNumber(/*dot_required=*/true, &number)) {
      return false;
    }
    application_version_.version.minor = number;
    return true;
  }

  // Parses "${PATCH}" right after MINOR. At least one digit is required, as
  // for MAJOR and MINOR; whatever follows is left for the next components.
  bool ParseVersionPatch() {
    int number = 0;
    if (!ParseVersionNumber(/*dot_required=*/false, &number)) {
      return false;
    }
    application_version_.version.patch = number;
    return true;
  }

  // Parses the optional UNKNOWN part: everything after PATCH up to the first
  // '-' or '+' (or the end of VERSION).
  bool ParseVersionUnknown() {
    // No UNKNOWN.
    if (version_parsing_position_ == version_string_.size()) {
      return true;
    }
    const auto version_unknown_start = version_parsing_position_;
    auto version_unknown_end = version_string_.find_first_of("-+", version_unknown_start);
    // No PRE_RELEASE and BUILD_INFO
    if (version_unknown_end == std::string::npos) {
      version_unknown_end = version_string_.size();
    }
    application_version_.version.unknown = version_string_.substr(
        version_unknown_start, version_unknown_end - version_unknown_start);
    version_parsing_position_ = version_unknown_end;
    return true;
  }

  // Parses the optional "-${PRE_RELEASE}" part, which ends at the first '+'
  // (or the end of VERSION).
  bool ParseVersionPreRelease() {
    // No PRE_RELEASE.
    if (version_parsing_position_ == version_string_.size() ||
        version_string_[version_parsing_position_] != '-') {
      return true;
    }

    const auto version_pre_release_start = version_parsing_position_ + 1;  // +1 is for '-'.
    auto version_pre_release_end =
        version_string_.find_first_of("+", version_pre_release_start);
    // No BUILD_INFO
    if (version_pre_release_end == std::string::npos) {
      version_pre_release_end = version_string_.size();
    }
    application_version_.version.pre_release = version_string_.substr(
        version_pre_release_start, version_pre_release_end - version_pre_release_start);
    version_parsing_position_ = version_pre_release_end;
    return true;
  }

  // Parses the optional "+${BUILD_INFO}" part, which runs to the end of
  // VERSION.
  bool ParseVersionBuildInfo() {
    // No BUILD_INFO.
    if (version_parsing_position_ == version_string_.size() ||
        version_string_[version_parsing_position_] != '+') {
      return true;
    }

    const auto version_build_info_start = version_parsing_position_ + 1;  // +1 is for '+'.
    application_version_.version.build_info =
        version_string_.substr(version_build_info_start);
    return true;
  }

  // Parses the optional " (build ${BUILD_NAME})" suffix, searching from the
  // end of VERSION. Returns false when the suffix or its closing ")" is
  // missing; `build_` is left untouched in that case.
  bool ParseBuildName() {
    const std::string build_mark(" (build ");
    const auto build_mark_position = created_by_.find(build_mark, version_end_);
    // No BUILD_NAME.
    if (build_mark_position == std::string::npos) {
      return false;
    }
    auto build_name_start = build_mark_position + build_mark.size();
    RemovePrecedingSpaces(created_by_, build_name_start, created_by_.size());
    auto build_name_end = created_by_.find_first_of(")", build_name_start);
    // No end ")".
    if (build_name_end == std::string::npos) {
      return false;
    }
    RemoveTrailingSpaces(created_by_, build_name_start, build_name_end);
    application_version_.build_ =
        created_by_.substr(build_name_start, build_name_end - build_name_start);

    return true;
  }

  const std::string& created_by_;
  ApplicationVersion& application_version_;

  // For parsing.
  const std::string digit_;
  // Cursor inside `version_string_`.
  size_t version_parsing_position_ = 0;
  // Offsets of VERSION inside `created_by_`; `version_start_` is npos when the
  // input has no " version " marker.
  size_t version_start_ = std::string::npos;
  size_t version_end_ = 0;
  std::string version_string_;
};
} // namespace

// Constructs an ApplicationVersion from a "created_by" string. The actual
// parsing, including the assignment of the parsed fields on this object, is
// delegated to a short-lived ApplicationVersionParser.
ApplicationVersion::ApplicationVersion(const std::string& created_by) {
  ApplicationVersionParser(created_by, *this).Parse();
}

bool ApplicationVersion::VersionLt(const ApplicationVersion& other_version) const {
Expand Down
10 changes: 0 additions & 10 deletions cpp/src/parquet/metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,16 +58,6 @@ class PARQUET_EXPORT ApplicationVersion {
static const ApplicationVersion& PARQUET_816_FIXED_VERSION();
static const ApplicationVersion& PARQUET_CPP_FIXED_STATS_VERSION();
static const ApplicationVersion& PARQUET_MR_FIXED_STATS_VERSION();
// Regular expression for the version format
// major . minor . patch unknown - prerelease.x + build info
// Eg: 1.5.0ab-cdh5.5.0+cd
static constexpr char const* VERSION_FORMAT =
"^(\\d+)\\.(\\d+)\\.(\\d+)([^-+]*)?(?:-([^+]*))?(?:\\+(.*))?$";
// Regular expression for the application format
// application_name version VERSION_FORMAT (build build_name)
// Eg: parquet-cpp version 1.5.0ab-xyz5.5.0+cd (build abcd)
static constexpr char const* APPLICATION_FORMAT =
"(.*?)\\s*(?:(version\\s*(?:([^(]*?)\\s*(?:\\(\\s*build\\s*([^)]*?)\\s*\\))?)?)?)";

// Application that wrote the file. e.g. "IMPALA"
std::string application_;
Expand Down
Loading

0 comments on commit 0d959eb

Please sign in to comment.