From 61b14316cdd9e976a437fbc3832ad673b6696854 Mon Sep 17 00:00:00 2001 From: rui-mo Date: Fri, 9 Feb 2024 19:34:40 +0800 Subject: [PATCH] Add test for partition types --- velox/connectors/hive/HiveConnectorSplit.h | 4 +- velox/exec/tests/TableScanTest.cpp | 104 +++++++++++++++++---- 2 files changed, 88 insertions(+), 20 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorSplit.h b/velox/connectors/hive/HiveConnectorSplit.h index 15be998884b0..60ed26cdcccb 100644 --- a/velox/connectors/hive/HiveConnectorSplit.h +++ b/velox/connectors/hive/HiveConnectorSplit.h @@ -31,7 +31,9 @@ struct HiveConnectorSplit : public connector::ConnectorSplit { /// Mapping from partition keys to values. Values are specified as strings /// formatted the same way as CAST(x as VARCHAR). Null values are specified as /// std::nullopt. Date values must be formatted using ISO 8601 as YYYY-MM-DD. - /// All scalar types and date type are supported. + /// Decimal values must be formatted using unscaled values, e.g. '123456' for + /// '1234.56' of decimal(6, 2) type. All scalar types and date type are + /// supported. const std::unordered_map> partitionKeys; std::optional tableBucketNumber; diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index 76485ebb37fa..9506c4036855 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -39,6 +39,17 @@ using namespace facebook::velox::common::test; using namespace facebook::velox::exec::test; namespace { +std::string makeCastSql(const variant& v, const TypePtr& type) { + std::ostringstream out; + const std::string value = type->isDate() + ? 
folly::parseJson(v.toJson(type)).asString() + : v.toJson(type); + out << "CAST('" << value << "' AS "; + toTypeSql(type, out); + out << ")"; + return out.str(); +} + void verifyCacheStats( const FileHandleCacheStats& cacheStats, size_t curSize, @@ -148,10 +159,25 @@ class TableScanTest : public virtual HiveConnectorTestBase { void testPartitionedTableImpl( const std::string& filePath, const TypePtr& partitionType, - const std::optional& partitionValue) { - auto split = HiveConnectorSplitBuilder(filePath) - .partitionKey("pkey", partitionValue) - .build(); + const variant& partitionValue) { + // Create the partition value of a split. + std::optional value = std::nullopt; + if (!partitionValue.isNull()) { + auto type = partitionType; + if (partitionType->isDecimal()) { + const auto [precision, scale] = + getDecimalPrecisionScale(*partitionType); + // The partition value of decimal should be formatted with unscaled + // value. + type = DECIMAL(precision, 0); + } + value = std::optional( + partitionType->isDate() + ? folly::parseJson(partitionValue.toJson(type)).asString() + : partitionValue.toJson(type)); + } + auto split = + HiveConnectorSplitBuilder(filePath).partitionKey("pkey", value).build(); auto outputType = ROW({"pkey", "c0", "c1"}, {partitionType, BIGINT(), DOUBLE()}); ColumnHandleMap assignments = { @@ -166,8 +192,10 @@ class TableScanTest : public virtual HiveConnectorTestBase { .endTableScan() .planNode(); - std::string partitionValueStr = - partitionValue.has_value() ? "'" + *partitionValue + "'" : "null"; + std::string partitionValueStr = partitionValue.isNull() + ? 
"null" + : makeCastSql(partitionValue, partitionType); + assertQuery( op, split, fmt::format("SELECT {}, * FROM tmp", partitionValueStr)); @@ -210,9 +238,10 @@ class TableScanTest : public virtual HiveConnectorTestBase { void testPartitionedTable( const std::string& filePath, const TypePtr& partitionType, - const std::optional& partitionValue) { + const variant& partitionValue) { testPartitionedTableImpl(filePath, partitionType, partitionValue); - testPartitionedTableImpl(filePath, partitionType, std::nullopt); + testPartitionedTableImpl( + filePath, partitionType, variant::null(partitionType->kind())); } RowTypePtr rowType_{ @@ -1442,7 +1471,7 @@ TEST_F(TableScanTest, partitionedTableVarcharKey) { writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, VARCHAR(), "2020-11-01"); + testPartitionedTable(filePath->path, VARCHAR(), variant("2020-11-01")); } TEST_F(TableScanTest, partitionedTableBigIntKey) { @@ -1451,7 +1480,10 @@ TEST_F(TableScanTest, partitionedTableBigIntKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, BIGINT(), "123456789123456789"); + testPartitionedTable( + filePath->path, + BIGINT(), + variant::create(123456789123456789)); } TEST_F(TableScanTest, partitionedTableIntegerKey) { @@ -1460,7 +1492,8 @@ TEST_F(TableScanTest, partitionedTableIntegerKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, INTEGER(), "123456789"); + testPartitionedTable( + filePath->path, INTEGER(), variant::create(123456789)); } TEST_F(TableScanTest, partitionedTableSmallIntKey) { @@ -1469,7 +1502,8 @@ TEST_F(TableScanTest, partitionedTableSmallIntKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, SMALLINT(), "1"); + 
testPartitionedTable( + filePath->path, SMALLINT(), variant::create(1)); } TEST_F(TableScanTest, partitionedTableTinyIntKey) { @@ -1478,7 +1512,8 @@ TEST_F(TableScanTest, partitionedTableTinyIntKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, TINYINT(), "1"); + testPartitionedTable( + filePath->path, TINYINT(), variant::create(1)); } TEST_F(TableScanTest, partitionedTableBooleanKey) { @@ -1487,7 +1522,8 @@ TEST_F(TableScanTest, partitionedTableBooleanKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, BOOLEAN(), "0"); + testPartitionedTable( + filePath->path, BOOLEAN(), variant::create(false)); } TEST_F(TableScanTest, partitionedTableRealKey) { @@ -1496,7 +1532,8 @@ TEST_F(TableScanTest, partitionedTableRealKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, REAL(), "3.5"); + testPartitionedTable( + filePath->path, REAL(), variant::create(3.5)); } TEST_F(TableScanTest, partitionedTableDoubleKey) { @@ -1505,7 +1542,35 @@ TEST_F(TableScanTest, partitionedTableDoubleKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, DOUBLE(), "3.5"); + testPartitionedTable( + filePath->path, DOUBLE(), variant::create(3.5)); +} + +TEST_F(TableScanTest, partitionedTableDecimalKey) { + auto rowType = ROW({"c0", "c1"}, {BIGINT(), DOUBLE()}); + auto vectors = makeVectors(10, 1'000, rowType); + auto filePath = TempFilePath::create(); + writeToFile(filePath->path, vectors); + createDuckDbTable(vectors); + + testPartitionedTable( + filePath->path, + DECIMAL(12, 3), + variant::create(123456789123)); + testPartitionedTable( + filePath->path, + DECIMAL(12, 3), + variant::create(-123456789123)); + 
testPartitionedTable( + filePath->path, + DECIMAL(36, 18), + variant::create( + HugeInt::parse("123456789123456789123456789123456789"))); + testPartitionedTable( + filePath->path, + DECIMAL(36, 18), + variant::create( + HugeInt::parse("-123456789123456789123456789123456789"))); } TEST_F(TableScanTest, partitionedTableDateKey) { @@ -1514,7 +1579,8 @@ TEST_F(TableScanTest, partitionedTableDateKey) { auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); createDuckDbTable(vectors); - testPartitionedTable(filePath->path, DATE(), "2023-10-27"); + testPartitionedTable( + filePath->path, DATE(), variant::create(19657)); } std::vector toStringViews(const std::vector& values) { @@ -3709,7 +3775,7 @@ TEST_F(TableScanTest, readMissingFieldsWithMoreColumns) { } } -TEST_F(TableScanTest, varbinaryPartitionKey) { +TEST_F(TableScanTest, partitionedTableVarbinaryKey) { auto vectors = makeVectors(1, 1'000); auto filePath = TempFilePath::create(); writeToFile(filePath->path, vectors); @@ -3734,7 +3800,7 @@ TEST_F(TableScanTest, varbinaryPartitionKey) { assertQuery(op, split, "SELECT c0, '2021-12-02' FROM tmp"); } -TEST_F(TableScanTest, timestampPartitionKey) { +TEST_F(TableScanTest, partitionedTableTimestampKey) { const char* inputs[] = {"2023-10-14 07:00:00.0", "2024-01-06 04:00:00.0"}; auto expected = makeRowVector( {"t"},