diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6d701079b482c..6ed2768d13918 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,6 +378,16 @@ function(ADD_ARROW_TEST_DEPENDENCIES REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARGN}) endfunction() +# A wrapper for target_link_libraries() that is compatible with NO_TESTS. +function(ARROW_TEST_LINK_LIBRARIES REL_TEST_NAME) + if(NO_TESTS) + return() + endif() + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + target_link_libraries(${TEST_NAME} ${ARGN}) +endfunction() + enable_testing() ############################################################ @@ -528,6 +538,7 @@ set(ARROW_SRCS src/arrow/ipc/metadata-internal.cc src/arrow/types/construct.cc + src/arrow/types/decimal.cc src/arrow/types/json.cc src/arrow/types/list.cc src/arrow/types/primitive.cc diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index 7b449affab025..0d5cf263ec3e2 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -19,17 +19,23 @@ # arrow_parquet : Arrow <-> Parquet adapter set(PARQUET_SRCS + schema.cc ) set(PARQUET_LIBS + arrow + ${PARQUET_SHARED_LIB} ) -add_library(arrow_parquet STATIC +add_library(arrow_parquet SHARED ${PARQUET_SRCS} ) target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +ADD_ARROW_TEST(parquet-schema-test) +ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) + # Headers: top level install(FILES DESTINATION include/arrow/parquet) diff --git a/cpp/src/arrow/parquet/parquet-schema-test.cc b/cpp/src/arrow/parquet/parquet-schema-test.cc new file mode 100644 index 0000000000000..9c3093d9ff7c9 --- /dev/null +++ b/cpp/src/arrow/parquet/parquet-schema-test.cc @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/util/status.h" + +#include "arrow/parquet/schema.h" + +namespace arrow { + +namespace parquet { + +using parquet_cpp::Repetition; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +const auto BOOL = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); +const auto INT64 = std::make_shared(); +const auto FLOAT = std::make_shared(); +const auto DOUBLE = std::make_shared(); +const auto UTF8 = std::make_shared(); +const auto BINARY = std::make_shared( + std::make_shared("", UINT8)); + +class TestConvertParquetSchema : public ::testing::Test { + public: + virtual void SetUp() {} + + void CheckFlatSchema(const std::shared_ptr& expected_schema) { + ASSERT_EQ(expected_schema->num_fields(), result_schema_->num_fields()); + for (int i = 0; i < expected_schema->num_fields(); ++i) { + auto lhs = result_schema_->field(i); + auto rhs = expected_schema->field(i); + EXPECT_TRUE(lhs->Equals(rhs)) + << i << " " << lhs->ToString() << " != " << rhs->ToString(); + } + } + + Status ConvertSchema(const std::vector& nodes) { + NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, nodes); + descr_.Init(schema); + return FromParquetSchema(&descr_, &result_schema_); + } + + protected: + parquet_cpp::SchemaDescriptor descr_; + std::shared_ptr result_schema_; +}; + +TEST_F(TestConvertParquetSchema, ParquetFlatPrimitives) { + std::vector parquet_fields; + std::vector> arrow_fields; + + parquet_fields.push_back( + PrimitiveNode::Make("boolean", Repetition::REQUIRED, parquet_cpp::Type::BOOLEAN)); + arrow_fields.push_back(std::make_shared("boolean", BOOL, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int32", Repetition::REQUIRED, parquet_cpp::Type::INT32)); + arrow_fields.push_back(std::make_shared("int32", INT32, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("int64", Repetition::REQUIRED, parquet_cpp::Type::INT64)); + arrow_fields.push_back(std::make_shared("int64", INT64, false)); + + parquet_fields.push_back( + PrimitiveNode::Make("float", Repetition::OPTIONAL, parquet_cpp::Type::FLOAT)); + arrow_fields.push_back(std::make_shared("float", FLOAT)); + + parquet_fields.push_back( + PrimitiveNode::Make("double", Repetition::OPTIONAL, parquet_cpp::Type::DOUBLE)); + arrow_fields.push_back(std::make_shared("double", DOUBLE)); + + parquet_fields.push_back( + PrimitiveNode::Make("binary", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY)); + arrow_fields.push_back(std::make_shared("binary", BINARY)); + + parquet_fields.push_back( + PrimitiveNode::Make("string", Repetition::OPTIONAL, + parquet_cpp::Type::BYTE_ARRAY, + parquet_cpp::LogicalType::UTF8)); + arrow_fields.push_back(std::make_shared("string", UTF8)); + + parquet_fields.push_back( + PrimitiveNode::Make("flba-binary", Repetition::OPTIONAL, + parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY, + parquet_cpp::LogicalType::NONE, 12)); + arrow_fields.push_back(std::make_shared("flba-binary", BINARY)); + + auto arrow_schema = std::make_shared(arrow_fields); + ASSERT_OK(ConvertSchema(parquet_fields)); + + CheckFlatSchema(arrow_schema); +} + +TEST_F(TestConvertParquetSchema, UnsupportedThings) { + std::vector unsupported_nodes; + + unsupported_nodes.push_back( + PrimitiveNode::Make("int96", Repetition::REQUIRED, parquet_cpp::Type::INT96)); + + unsupported_nodes.push_back( + GroupNode::Make("repeated-group", Repetition::REPEATED, {})); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int32", Repetition::OPTIONAL, + parquet_cpp::Type::INT32, parquet_cpp::LogicalType::DATE)); + + unsupported_nodes.push_back( + PrimitiveNode::Make("int64", Repetition::OPTIONAL, + parquet_cpp::Type::INT64, parquet_cpp::LogicalType::TIMESTAMP_MILLIS)); + + for (const NodePtr& node : unsupported_nodes) { + ASSERT_RAISES(NotImplemented, ConvertSchema({node})); + } +} + +TEST(TestNodeConversion, DateAndTime) { +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.cc b/cpp/src/arrow/parquet/schema.cc new file mode 100644 index 0000000000000..6b1de572617b8 --- /dev/null +++ b/cpp/src/arrow/parquet/schema.cc @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/parquet/schema.h" + +#include + +#include "parquet/api/schema.h" + +#include "arrow/util/status.h" +#include "arrow/types/decimal.h" + +using parquet_cpp::schema::Node; +using parquet_cpp::schema::NodePtr; +using parquet_cpp::schema::GroupNode; +using parquet_cpp::schema::PrimitiveNode; + +using parquet_cpp::LogicalType; + +namespace arrow { + +namespace parquet { + +const auto BOOL = std::make_shared(); +const auto UINT8 = std::make_shared(); +const auto INT32 = std::make_shared(); +const auto INT64 = std::make_shared(); +const auto FLOAT = std::make_shared(); +const auto DOUBLE = std::make_shared(); +const auto UTF8 = std::make_shared(); +const auto BINARY = std::make_shared( + std::make_shared("", UINT8)); + +TypePtr MakeDecimalType(const PrimitiveNode* node) { + int precision = node->decimal_metadata().precision; + int scale = node->decimal_metadata().scale; + return std::make_shared(precision, scale); +} + +static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::UTF8: + *out = UTF8; + break; + default: + // BINARY + *out = BINARY; + break; + } + return Status::OK(); +} + +static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = BINARY; + break; + case LogicalType::DECIMAL: + *out = MakeDecimalType(node); + break; + default: + return Status::NotImplemented("unhandled type"); + break; + } + + return Status::OK(); +} + +static Status FromInt32(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT32; + break; + default: + return Status::NotImplemented("Unhandled logical type for int32"); + break; + } + return Status::OK(); +} + +static Status FromInt64(const PrimitiveNode* node, TypePtr* out) { + switch (node->logical_type()) { + case LogicalType::NONE: + *out = INT64; + break; + default: + return Status::NotImplemented("Unhandled logical type for int64"); + break; + } + return Status::OK(); +} + +// TODO: Logical Type Handling +Status NodeToField(const NodePtr& node, std::shared_ptr* out) { + std::shared_ptr type; + + if (node->is_repeated()) { + return Status::NotImplemented("No support yet for repeated node types"); + } + + if (node->is_group()) { + const GroupNode* group = static_cast(node.get()); + std::vector> fields(group->field_count()); + for (int i = 0; i < group->field_count(); i++) { + RETURN_NOT_OK(NodeToField(group->field(i), &fields[i])); + } + type = std::make_shared(fields); + } else { + // Primitive (leaf) node + const PrimitiveNode* primitive = static_cast(node.get()); + + switch (primitive->physical_type()) { + case parquet_cpp::Type::BOOLEAN: + type = BOOL; + break; + case parquet_cpp::Type::INT32: + RETURN_NOT_OK(FromInt32(primitive, &type)); + break; + case parquet_cpp::Type::INT64: + RETURN_NOT_OK(FromInt64(primitive, &type)); + break; + case parquet_cpp::Type::INT96: + // TODO: Do we have that type in Arrow? + // type = TypePtr(new Int96Type()); + return Status::NotImplemented("int96"); + case parquet_cpp::Type::FLOAT: + type = FLOAT; + break; + case parquet_cpp::Type::DOUBLE: + type = DOUBLE; + break; + case parquet_cpp::Type::BYTE_ARRAY: + // TODO: Do we have that type in Arrow? + RETURN_NOT_OK(FromByteArray(primitive, &type)); + break; + case parquet_cpp::Type::FIXED_LEN_BYTE_ARRAY: + RETURN_NOT_OK(FromFLBA(primitive, &type)); + break; + } + } + + *out = std::make_shared(node->name(), type, !node->is_required()); + return Status::OK(); +} + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr* out) { + // TODO(wesm): Consider adding an arrow::Schema name attribute, which comes + // from the root Parquet node + const GroupNode* schema_node = static_cast( + parquet_schema->schema().get()); + + std::vector> fields(schema_node->field_count()); + for (int i = 0; i < schema_node->field_count(); i++) { + RETURN_NOT_OK(NodeToField(schema_node->field(i), &fields[i])); + } + + *out = std::make_shared(fields); + return Status::OK(); +} + +} // namespace parquet + +} // namespace arrow diff --git a/cpp/src/arrow/parquet/schema.h b/cpp/src/arrow/parquet/schema.h new file mode 100644 index 0000000000000..61de193a33877 --- /dev/null +++ b/cpp/src/arrow/parquet/schema.h @@ -0,0 +1,44 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PARQUET_SCHEMA_H +#define ARROW_PARQUET_SCHEMA_H + +#include + +#include "parquet/api/schema.h" + +#include "arrow/schema.h" +#include "arrow/type.h" + +namespace arrow { + +class Status; + +namespace parquet { + +Status NodeToField(const parquet_cpp::schema::NodePtr& node, + std::shared_ptr* out); + +Status FromParquetSchema(const parquet_cpp::SchemaDescriptor* parquet_schema, + std::shared_ptr* out); + +} // namespace parquet + +} // namespace arrow + +#endif diff --git a/cpp/src/arrow/types/decimal.cc b/cpp/src/arrow/types/decimal.cc new file mode 100644 index 0000000000000..f120c1a9dfde6 --- /dev/null +++ b/cpp/src/arrow/types/decimal.cc @@ -0,0 +1,32 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/types/decimal.h" + +#include +#include + +namespace arrow { + +std::string DecimalType::ToString() const { + std::stringstream s; + s << "decimal(" << precision << ", " << scale << ")"; + return s.str(); +} + +} // namespace arrow + diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 464c3ff8da92b..26243b42b0e7d 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -18,13 +18,24 @@ #ifndef ARROW_TYPES_DECIMAL_H #define ARROW_TYPES_DECIMAL_H +#include + #include "arrow/type.h" namespace arrow { struct DecimalType : public DataType { + explicit DecimalType(int precision_, int scale_) + : DataType(Type::DECIMAL), precision(precision_), + scale(scale_) { } int precision; int scale; + + static char const *name() { + return "decimal"; + } + + std::string ToString() const override; }; } // namespace arrow diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index b5931232dbdcb..4e273edcb8f1f 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -109,6 +109,7 @@ class Status { bool IsKeyError() const { return code() == StatusCode::KeyError; } bool IsInvalid() const { return code() == StatusCode::Invalid; } bool IsIOError() const { return code() == StatusCode::IOError; } + bool IsNotImplemented() const { return code() == StatusCode::NotImplemented; } // Return a string representation of this status suitable for printing. // Returns the string "OK" for success. diff --git a/format/Layout.md b/format/Layout.md index 2d46ece606ea7..1b532c6b3817c 100644 --- a/format/Layout.md +++ b/format/Layout.md @@ -58,7 +58,7 @@ Base requirements * Memory layout and random access patterns for each relative type * Null value representation -## Non-goals (for this document +## Non-goals (for this document) * To enumerate or specify logical types that can be implemented as primitive (fixed-width) value types. For example: signed and unsigned integers,