Skip to content

Commit

Permalink
Remove hard-coded special treatment for $path and $bucket (#11221)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #11221

These special columns are engine-specific and should be handled during engine split generation by setting the corresponding values in split `infoColumns`.  For example in Prestissimo this is done in https://github.com/prestodb/presto/blob/48f0a0c1d380b1155dfd7c99b134a350627c7260/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp#L1112-L1120.

Reviewed By: xiaoxmeng

Differential Revision: D64180231

fbshipit-source-id: 5a4ec72c54587307dad1a47f7724ebb3a765b767
  • Loading branch information
Yuhta authored and facebook-github-bot committed Oct 10, 2024
1 parent 2fd894d commit 0758d04
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 36 deletions.
2 changes: 1 addition & 1 deletion velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ inline bool isSynthesizedColumn(
const std::string& name,
const std::unordered_map<std::string, std::shared_ptr<HiveColumnHandle>>&
infoColumns) {
return name == kPath || name == kBucket || infoColumns.count(name) != 0;
return infoColumns.count(name) != 0;
}

inline bool isRowIndexColumn(
Expand Down
3 changes: 0 additions & 3 deletions velox/connectors/hive/HiveConnectorUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ struct HiveConnectorSplit;
using SubfieldFilters =
std::unordered_map<common::Subfield, std::unique_ptr<common::Filter>>;

constexpr const char* kPath = "$path";
constexpr const char* kBucket = "$bucket";

const std::string& getColumnName(const common::Subfield& subfield);

void checkColumnNameLowerCase(const std::shared_ptr<const Type>& type);
Expand Down
19 changes: 0 additions & 19 deletions velox/connectors/hive/SplitReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,25 +337,6 @@ std::vector<TypePtr> SplitReader::adaptColumns(
if (auto it = hiveSplit_->partitionKeys.find(fieldName);
it != hiveSplit_->partitionKeys.end()) {
setPartitionValue(childSpec, fieldName, it->second);
} else if (fieldName == kPath) {
auto constantVec = std::make_shared<ConstantVector<StringView>>(
connectorQueryCtx_->memoryPool(),
1,
false,
VARCHAR(),
StringView(hiveSplit_->filePath));
childSpec->setConstantValue(constantVec);
} else if (fieldName == kBucket) {
if (hiveSplit_->tableBucketNumber.has_value()) {
int32_t bucket = hiveSplit_->tableBucketNumber.value();
auto constantVec = std::make_shared<ConstantVector<int32_t>>(
connectorQueryCtx_->memoryPool(),
1,
false,
INTEGER(),
std::move(bucket));
childSpec->setConstantValue(constantVec);
}
} else if (auto iter = hiveSplit_->infoColumns.find(fieldName);
iter != hiveSplit_->infoColumns.end()) {
auto infoColumnType =
Expand Down
28 changes: 20 additions & 8 deletions velox/exec/fuzzer/FuzzerUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,20 +119,32 @@ Split makeSplit(
const std::unordered_map<std::string, std::optional<std::string>>&
partitionKeys,
std::optional<int32_t> tableBucketNumber) {
return Split{std::make_shared<connector::hive::HiveConnectorSplit>(
return Split{makeConnectorSplit(filePath, partitionKeys, tableBucketNumber)};
}

std::shared_ptr<connector::ConnectorSplit> makeConnectorSplit(
const std::string& filePath,
const std::unordered_map<std::string, std::optional<std::string>>&
partitionKeys,
std::optional<int32_t> tableBucketNumber) {
std::unordered_map<std::string, std::string> infoColumns = {
{"$path", filePath}};
if (tableBucketNumber.has_value()) {
infoColumns["$bucket"] = std::to_string(*tableBucketNumber);
}
return std::make_shared<connector::hive::HiveConnectorSplit>(
kHiveConnectorId,
filePath,
dwio::common::FileFormat::DWRF,
0,
std::numeric_limits<uint64_t>::max(),
partitionKeys,
tableBucketNumber)};
}

std::shared_ptr<connector::ConnectorSplit> makeConnectorSplit(
const std::string& filePath) {
return std::make_shared<connector::hive::HiveConnectorSplit>(
kHiveConnectorId, filePath, dwio::common::FileFormat::DWRF);
tableBucketNumber,
/*customSplitInfo=*/std::unordered_map<std::string, std::string>{},
/*extraFileInfo=*/nullptr,
/*serdeParameters=*/std::unordered_map<std::string, std::string>{},
/*splitWeight=*/0,
infoColumns);
}

std::vector<std::string> makeNames(const std::string& prefix, size_t n) {
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/FuzzerUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,10 @@ Split makeSplit(

/// Create a connector split from an exsiting file.
std::shared_ptr<connector::ConnectorSplit> makeConnectorSplit(
const std::string& filePath);
const std::string& filePath,
const std::unordered_map<std::string, std::optional<std::string>>&
partitionKeys = {},
std::optional<int32_t> tableBucketNumber = std::nullopt);

/// Create column names with the pattern '${prefix}${i}'.
std::vector<std::string> makeNames(const std::string& prefix, size_t n);
Expand Down
2 changes: 1 addition & 1 deletion velox/exec/tests/TableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2723,7 +2723,7 @@ TEST_F(TableScanTest, path) {
auto assignments = allRegularColumns(rowType);
assignments[kPath] = synthesizedColumn(kPath, VARCHAR());

auto pathValue = fmt::format("file:{}", filePath->getPath());
auto& pathValue = filePath->getPath();
auto typeWithPath = ROW({kPath, "a"}, {VARCHAR(), BIGINT()});
auto op = PlanBuilder()
.startTableScan()
Expand Down
9 changes: 6 additions & 3 deletions velox/exec/tests/utils/HiveConnectorTestBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -225,8 +225,10 @@ class HiveConnectorTestBase : public OperatorTestBase {

class HiveConnectorSplitBuilder {
public:
HiveConnectorSplitBuilder(std::string filePath)
: filePath_{std::move(filePath)} {}
explicit HiveConnectorSplitBuilder(std::string filePath)
: filePath_{std::move(filePath)} {
infoColumns_["$path"] = filePath_;
}

HiveConnectorSplitBuilder& start(uint64_t start) {
start_ = start;
Expand Down Expand Up @@ -264,6 +266,7 @@ class HiveConnectorSplitBuilder {

HiveConnectorSplitBuilder& tableBucketNumber(int32_t bucket) {
tableBucketNumber_ = bucket;
infoColumns_["$bucket"] = std::to_string(bucket);
return *this;
}

Expand Down Expand Up @@ -296,7 +299,7 @@ class HiveConnectorSplitBuilder {
static const std::unordered_map<std::string, std::string> serdeParameters;
return std::make_shared<connector::hive::HiveConnectorSplit>(
connectorId_,
filePath_.find("/") == 0 ? "file:" + filePath_ : filePath_,
filePath_,
fileFormat_,
start_,
length_,
Expand Down

0 comments on commit 0758d04

Please sign in to comment.