From 5809dafb0f25b4bbf65387b0206d0f94b391c0b9 Mon Sep 17 00:00:00 2001
From: Kevin Gurney
Date: Mon, 20 May 2024 13:53:21 -0400
Subject: [PATCH] GH-41656: [MATLAB] Add C Data Interface format import/export functionality for `arrow.array.Array` (#41737)

### Rationale for this change

Now that #41653 and #41654 have been addressed, we should add MATLAB APIs for importing/exporting `arrow.array.Array` objects using the C Data Interface format. This pull request adds two new APIs for importing and exporting `arrow.array.Array` objects using the C Data Interface format.

#### Example

```matlab
>> expected = arrow.array([1, 2, 3])

expected =

  Float64Array with 3 elements and 0 null values:

    1 | 2 | 3

>> cArray = arrow.c.Array()

cArray =

  Array with properties:

    Address: 140341875084944

>> cSchema = arrow.c.Schema()

cSchema =

  Schema with properties:

    Address: 140341880022320

% Export the Array to C Data Interface Format
>> expected.export(cArray.Address, cSchema.Address)

% Import the Array from C Data Interface Format
>> actual = arrow.array.Array.import(cArray, cSchema)

actual =

  Float64Array with 3 elements and 0 null values:

    1 | 2 | 3

% The Array is the same after round-tripping to C Data Interface format
>> isequal(actual, expected)

ans =

  logical

   1
```

### What changes are included in this PR?

1. Added new `arrow.array.Array.export(cArrowArrayAddress, cArrowSchemaAddress)` method for exporting `Array` objects to C Data Interface format.
2. Added new static `arrow.array.Array.import(cArray, cSchema)` method for importing `Array`s from C Data Interface format.
3. Added new internal `arrow.c.internal.ArrayImporter` class for importing `Array` objects from C Data Interface format.

### Are these changes tested?

Yes.

1. Added new test file `matlab/test/arrow/c/tRoundTrip.m` with basic round-trip tests for importing/exporting `Array` objects using the C Data Interface format.

### Are there any user-facing changes?

Yes.

1. There are now two new user-facing APIs added to the `arrow.array.Array` class. These are `arrow.array.Array.export(cArrowArrayAddress, cArrowSchemaAddress)` and `arrow.array.Array.import(cArray, cSchema)`. These APIs can be used to import/export `Array` objects using the C Data Interface format.

### Future Directions

1. Add integration tests for sharing data between MATLAB/mlarrow and Python/pyarrow running in the same process using the [MATLAB interface to Python](https://www.mathworks.com/help/matlab/call-python-libraries.html).
2. Add support for exporting/importing `arrow.tabular.RecordBatch` objects using the C Data Interface format.
3. Add support for the Arrow [C stream interface format](https://arrow.apache.org/docs/format/CStreamInterface.html).

### Notes

1. Thanks @ sgilmore10 for your help with this pull request!
* GitHub Issue: #41656

Lead-authored-by: Kevin Gurney
Co-authored-by: Kevin Gurney
Co-authored-by: Sutou Kouhei
Signed-off-by: Kevin Gurney
---
 .../src/cpp/arrow/matlab/array/proxy/array.cc |  18 ++
 .../src/cpp/arrow/matlab/array/proxy/array.h  |   2 +
 .../arrow/matlab/c/proxy/array_importer.cc    |  69 +++++++
 .../cpp/arrow/matlab/c/proxy/array_importer.h |  37 ++++
 matlab/src/cpp/arrow/matlab/error/error.h     |   2 +
 matlab/src/cpp/arrow/matlab/proxy/factory.cc  |   2 +
 matlab/src/matlab/+arrow/+array/Array.m       |  24 +++
 .../+arrow/+c/+internal/ArrayImporter.m       |  50 +++++
 matlab/test/arrow/c/tRoundTrip.m              | 182 ++++++++++++++++++
 .../cmake/BuildMatlabArrowInterface.cmake     |   1 +
 10 files changed, 387 insertions(+)
 create mode 100644 matlab/src/cpp/arrow/matlab/c/proxy/array_importer.cc
 create mode 100644 matlab/src/cpp/arrow/matlab/c/proxy/array_importer.h
 create mode 100644 matlab/src/matlab/+arrow/+c/+internal/ArrayImporter.m
 create mode 100644 matlab/test/arrow/c/tRoundTrip.m

diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
index b8f85b08632a3..1eb6de74fec65 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "arrow/c/bridge.h"
 #include "arrow/util/utf8.h"
 
 #include "arrow/matlab/array/proxy/array.h"
@@ -40,6 +41,7 @@ Array::Array(std::shared_ptr<arrow::Array> array) : array{std::move(array)} {
   REGISTER_METHOD(Array, getType);
   REGISTER_METHOD(Array, isEqual);
   REGISTER_METHOD(Array, slice);
+  REGISTER_METHOD(Array, exportToC);
 }
 
 std::shared_ptr<arrow::Array> Array::unwrap() { return array; }
@@ -178,4 +180,20 @@ void Array::slice(libmexclass::proxy::method::Context& context) {
   output[0]["TypeID"] = factory.createScalar(type_id);
   context.outputs[0] = output;
 }
+
+void Array::exportToC(libmexclass::proxy::method::Context& context) {
+  namespace mda = ::matlab::data;
+  mda::StructArray opts = context.inputs[0];
+  const mda::TypedArray<uint64_t> array_address_mda = opts[0]["ArrowArrayAddress"];
+  const mda::TypedArray<uint64_t> schema_address_mda = opts[0]["ArrowSchemaAddress"];
+  // MATLAB passes the target ArrowArray/ArrowSchema structs as uint64 addresses.
+  auto arrow_array = reinterpret_cast<struct ArrowArray*>(uint64_t(array_address_mda[0]));
+  auto arrow_schema =
+      reinterpret_cast<struct ArrowSchema*>(uint64_t(schema_address_mda[0]));
+  // A non-OK export status is reported to MATLAB with the C_EXPORT_FAILED error ID.
+  MATLAB_ERROR_IF_NOT_OK_WITH_CONTEXT(
+      arrow::ExportArray(*array, arrow_array, arrow_schema), context,
+      error::C_EXPORT_FAILED);
+}
+
 }  // namespace arrow::matlab::array::proxy
diff --git a/matlab/src/cpp/arrow/matlab/array/proxy/array.h b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
index 61ba06a503bc4..c249693ac2797 100644
--- a/matlab/src/cpp/arrow/matlab/array/proxy/array.h
+++ b/matlab/src/cpp/arrow/matlab/array/proxy/array.h
@@ -45,6 +45,8 @@ class Array : public libmexclass::proxy::Proxy {
 
   void slice(libmexclass::proxy::method::Context& context);
 
+  void exportToC(libmexclass::proxy::method::Context& context);
+
   std::shared_ptr<arrow::Array> array;
 };
 
diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array_importer.cc b/matlab/src/cpp/arrow/matlab/c/proxy/array_importer.cc
new file mode 100644
index 0000000000000..b6f68332d1757
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/c/proxy/array_importer.cc
@@ -0,0 +1,69 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/array.h"
+#include "arrow/c/bridge.h"
+
+#include "arrow/matlab/array/proxy/wrap.h"
+#include "arrow/matlab/c/proxy/array_importer.h"
+#include "arrow/matlab/error/error.h"
+
+#include "libmexclass/proxy/ProxyManager.h"
+
+namespace arrow::matlab::c::proxy {
+
+ArrayImporter::ArrayImporter() { REGISTER_METHOD(ArrayImporter, import); }
+
+libmexclass::proxy::MakeResult ArrayImporter::make(
+    const libmexclass::proxy::FunctionArguments& constructor_arguments) {
+  return std::make_shared<ArrayImporter>();
+}
+
+void ArrayImporter::import(libmexclass::proxy::method::Context& context) {
+  namespace mda = ::matlab::data;
+  using namespace libmexclass::proxy;
+  // inputs[0] is a struct carrying the ArrowArray/ArrowSchema addresses as uint64.
+  mda::StructArray args = context.inputs[0];
+  const mda::TypedArray<uint64_t> arrow_array_address_mda = args[0]["ArrowArrayAddress"];
+  const mda::TypedArray<uint64_t> arrow_schema_address_mda =
+      args[0]["ArrowSchemaAddress"];
+
+  const auto arrow_array_address = uint64_t(arrow_array_address_mda[0]);
+  const auto arrow_schema_address = uint64_t(arrow_schema_address_mda[0]);
+  // Reinterpret the integer addresses as C Data Interface struct pointers.
+  auto arrow_array = reinterpret_cast<struct ArrowArray*>(arrow_array_address);
+  auto arrow_schema = reinterpret_cast<struct ArrowSchema*>(arrow_schema_address);
+  // A failed import is reported to MATLAB with the C_IMPORT_FAILED error ID.
+  MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto array,
+                                      arrow::ImportArray(arrow_array, arrow_schema),
+                                      context, error::C_IMPORT_FAILED);
+  // Wrap the imported array in an array proxy; failure maps to UNKNOWN_PROXY_FOR_ARRAY_TYPE.
+  MATLAB_ASSIGN_OR_ERROR_WITH_CONTEXT(auto array_proxy,
+                                      arrow::matlab::array::proxy::wrap(array), context,
+                                      error::UNKNOWN_PROXY_FOR_ARRAY_TYPE);
+  // Register the proxy and return its ID together with the Arrow type ID.
+  mda::ArrayFactory factory;
+  const auto array_proxy_id = ProxyManager::manageProxy(array_proxy);
+  const auto array_proxy_id_mda = factory.createScalar(array_proxy_id);
+  const auto array_type_id_mda =
+      factory.createScalar(static_cast<int32_t>(array->type_id()));
+
+  context.outputs[0] = array_proxy_id_mda;
+  context.outputs[1] = array_type_id_mda;
+}
+
+}  // namespace arrow::matlab::c::proxy
diff --git a/matlab/src/cpp/arrow/matlab/c/proxy/array_importer.h b/matlab/src/cpp/arrow/matlab/c/proxy/array_importer.h
new file mode 100644
index 0000000000000..6459393058737
--- /dev/null
+++ b/matlab/src/cpp/arrow/matlab/c/proxy/array_importer.h
@@ -0,0 +1,37 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "libmexclass/proxy/Proxy.h"
+
+namespace arrow::matlab::c::proxy {
+// Proxy whose `import` method brings C Data Interface arrays into MATLAB.
+class ArrayImporter : public libmexclass::proxy::Proxy {
+ public:
+  ArrayImporter();
+
+  ~ArrayImporter() = default;
+
+  static libmexclass::proxy::MakeResult make(
+      const libmexclass::proxy::FunctionArguments& constructor_arguments);
+
+ protected:
+  void import(libmexclass::proxy::method::Context& context);
+};
+
+}  // namespace arrow::matlab::c::proxy
diff --git a/matlab/src/cpp/arrow/matlab/error/error.h b/matlab/src/cpp/arrow/matlab/error/error.h
index db8b715141ee8..58c43d8843e4b 100644
--- a/matlab/src/cpp/arrow/matlab/error/error.h
+++ b/matlab/src/cpp/arrow/matlab/error/error.h
@@ -240,5 +240,7 @@ static const char* ARRAY_SLICE_NON_POSITIVE_OFFSET =
 static const char* ARRAY_SLICE_NEGATIVE_LENGTH = "arrow:array:slice:NegativeLength";
 static const char* ARRAY_SLICE_FAILED_TO_CREATE_ARRAY_PROXY =
     "arrow:array:slice:FailedToCreateArrayProxy";
+static const char* C_EXPORT_FAILED = "arrow:c:export:ExportFailed";
+static const char* C_IMPORT_FAILED = "arrow:c:import:ImportFailed";
 
 }  // namespace arrow::matlab::error
diff --git a/matlab/src/cpp/arrow/matlab/proxy/factory.cc b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
index d7a8fa9ac2e74..9b95fcf128090 100644
--- a/matlab/src/cpp/arrow/matlab/proxy/factory.cc
+++ b/matlab/src/cpp/arrow/matlab/proxy/factory.cc
@@ -26,6 +26,7 @@
 #include "arrow/matlab/array/proxy/timestamp_array.h"
 #include "arrow/matlab/buffer/proxy/buffer.h"
 #include "arrow/matlab/c/proxy/array.h"
+#include "arrow/matlab/c/proxy/array_importer.h"
 #include "arrow/matlab/c/proxy/schema.h"
 #include "arrow/matlab/error/error.h"
 #include "arrow/matlab/io/csv/proxy/table_reader.h"
@@ -102,6 +103,7 @@ libmexclass::proxy::MakeResult Factory::make_proxy(
   REGISTER_PROXY(arrow.io.csv.proxy.TableWriter , arrow::matlab::io::csv::proxy::TableWriter);
   REGISTER_PROXY(arrow.io.csv.proxy.TableReader , arrow::matlab::io::csv::proxy::TableReader);
   REGISTER_PROXY(arrow.c.proxy.Array , arrow::matlab::c::proxy::Array);
+  REGISTER_PROXY(arrow.c.proxy.ArrayImporter , arrow::matlab::c::proxy::ArrayImporter);
   REGISTER_PROXY(arrow.c.proxy.Schema , arrow::matlab::c::proxy::Schema);
   // clang-format on
 
diff --git a/matlab/src/matlab/+arrow/+array/Array.m b/matlab/src/matlab/+arrow/+array/Array.m
index 4402055932b60..01bacdf5755dc 100644
--- a/matlab/src/matlab/+arrow/+array/Array.m
+++ b/matlab/src/matlab/+arrow/+array/Array.m
@@ -97,6 +97,19 @@ function displayScalarObject(obj)
             % Invoke isEqual proxy object method
             tf = obj.Proxy.isEqual(proxyIDs);
         end
+        % Export this Array to the C Data Interface structs at the given addresses.
+        function export(obj, cArrowArrayAddress, cArrowSchemaAddress)
+            arguments
+                obj(1, 1) arrow.array.Array
+                cArrowArrayAddress(1, 1) uint64
+                cArrowSchemaAddress(1, 1) uint64
+            end
+            args = struct(...
+                ArrowArrayAddress=cArrowArrayAddress,...
+                ArrowSchemaAddress=cArrowSchemaAddress...
+            );
+            obj.Proxy.exportToC(args);
+        end
     end
 
     methods (Hidden)
@@ -108,4 +121,15 @@ function displayScalarObject(obj)
             array = traits.ArrayConstructor(proxy);
         end
     end
+    % import(cArray, cSchema) creates an Array from C Data Interface structs.
+    methods (Static)
+        function array = import(cArray, cSchema)
+            arguments
+                cArray(1, 1) arrow.c.Array
+                cSchema(1, 1) arrow.c.Schema
+            end
+            importer = arrow.c.internal.ArrayImporter();
+            array = importer.import(cArray, cSchema);
+        end
+    end
 end
diff --git a/matlab/src/matlab/+arrow/+c/+internal/ArrayImporter.m b/matlab/src/matlab/+arrow/+c/+internal/ArrayImporter.m
new file mode 100644
index 0000000000000..3f2f7445b3d6d
--- /dev/null
+++ b/matlab/src/matlab/+arrow/+c/+internal/ArrayImporter.m
@@ -0,0 +1,50 @@
+%ARRAYIMPORTER Imports Arrow Array using the C Data Interface Format.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements.  See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License.  You may obtain a copy of the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+% implied.  See the License for the specific language governing
+% permissions and limitations under the License.
+classdef ArrayImporter < matlab.mixin.Scalar
+
+    properties (Hidden, SetAccess=private, GetAccess=public)
+        Proxy
+    end
+
+    methods
+        % Create the importer together with its arrow.c.proxy.ArrayImporter proxy.
+        function obj = ArrayImporter()
+            proxyName = "arrow.c.proxy.ArrayImporter";
+            proxy = arrow.internal.proxy.create(proxyName, struct());
+            obj.Proxy = proxy;
+        end
+        % Import from cArray/cSchema and build the Array via the type traits.
+        function array = import(obj, cArray, cSchema)
+            arguments
+                obj(1, 1) arrow.c.internal.ArrayImporter
+                cArray(1, 1) arrow.c.Array
+                cSchema(1, 1) arrow.c.Schema
+            end
+            args = struct(...
+                ArrowArrayAddress=cArray.Address,...
+                ArrowSchemaAddress=cSchema.Address...
+            );
+            [proxyID, typeID] = obj.Proxy.import(args);
+            traits = arrow.type.traits.traits(arrow.type.ID(typeID));
+            proxy = libmexclass.proxy.Proxy(Name=traits.ArrayProxyClassName, ID=proxyID);
+            array = traits.ArrayConstructor(proxy);
+        end
+
+    end
+
+end
+
diff --git a/matlab/test/arrow/c/tRoundTrip.m b/matlab/test/arrow/c/tRoundTrip.m
new file mode 100644
index 0000000000000..a72dbe2679a2d
--- /dev/null
+++ b/matlab/test/arrow/c/tRoundTrip.m
@@ -0,0 +1,182 @@
+%TROUNDTRIP Tests for roundtripping using the C Data Interface format.
+
+% Licensed to the Apache Software Foundation (ASF) under one or more
+% contributor license agreements.  See the NOTICE file distributed with
+% this work for additional information regarding copyright ownership.
+% The ASF licenses this file to you under the Apache License, Version
+% 2.0 (the "License"); you may not use this file except in compliance
+% with the License.  You may obtain a copy of the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS,
+% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+% implied.  See the License for the specific language governing
+% permissions and limitations under the License.
+classdef tRoundTrip < matlab.unittest.TestCase
+
+    methods (Test)
+
+        function EmptyArray(testCase)
+            expected = arrow.array(double.empty(0, 1));
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+        end
+
+        function ArrayWithNulls(testCase)
+            % Scalar null
+            expected = arrow.array(double(NaN));
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+
+            % Vector with nulls
+            expected = arrow.array([1, NaN, 3, NaN, 5]);
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+
+            % Vector all nulls
+            expected = arrow.array([NaN, NaN, NaN]);
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+        end
+
+        function Float64Array(testCase)
+            % Scalar
+            expected = arrow.array(double(1));
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+
+            % Vector
+            expected = arrow.array([1, 2, 3]);
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+        end
+
+        function StringArray(testCase)
+            % Scalar
+            expected = arrow.array("A");
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+
+            % Vector
+            expected = arrow.array(["A", "B", "C"]);
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+        end
+
+        function TimestampArray(testCase)
+            % Scalar
+            expected = arrow.array(datetime(2024, 1, 1));
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+
+            % Vector
+            expected = arrow.array([...
+                datetime(2024, 1, 1),...
+                datetime(2024, 1, 2),...
+                datetime(2024, 1, 3)...
+            ]);
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+
+            expected.export(cArray.Address, cSchema.Address);
+            actual = arrow.array.Array.import(cArray, cSchema);
+
+            testCase.verifyEqual(actual, expected);
+        end
+
+        function ExportErrorWrongInputTypes(testCase)
+            A = arrow.array([1, 2, 3]);
+            fcn = @() A.export("cArray.Address", "cSchema.Address");
+            testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert");
+        end
+
+        function ExportTooFewInputs(testCase)
+            A = arrow.array([1, 2, 3]);
+            fcn = @() A.export();
+            testCase.verifyError(fcn, "MATLAB:minrhs");
+        end
+
+        function ExportTooManyInputs(testCase)
+            A = arrow.array([1, 2, 3]);
+            fcn = @() A.export("A", "B", "C");
+            testCase.verifyError(fcn, "MATLAB:TooManyInputs");
+        end
+
+        function ImportErrorWrongInputTypes(testCase)
+            cArray = "arrow.c.Array";
+            cSchema = "arrow.c.Schema";
+            fcn = @() arrow.array.Array.import(cArray, cSchema);
+            testCase.verifyError(fcn, "MATLAB:validation:UnableToConvert");
+        end
+
+        function ImportTooFewInputs(testCase)
+            fcn = @() arrow.array.Array.import();
+            testCase.verifyError(fcn, "MATLAB:minrhs");
+        end
+
+        function ImportTooManyInputs(testCase)
+            % The static import method accepts two inputs; pass three.
+            fcn = @() arrow.array.Array.import("A", "B", "C");
+            testCase.verifyError(fcn, "MATLAB:TooManyInputs");
+        end
+
+        function ImportErrorImportFailed(testCase)
+            cArray = arrow.c.Array();
+            cSchema = arrow.c.Schema();
+            % An arrow:c:import:ImportFailed error should be thrown
+            % if the supplied arrow.c.Array and arrow.c.Schema were
+            % never populated previously from an exported Array.
+            fcn = @() arrow.array.Array.import(cArray, cSchema);
+            testCase.verifyError(fcn, "arrow:c:import:ImportFailed");
+        end
+
+    end
+
+end
diff --git a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
index 8f37bef77b859..92e9f59145acc 100644
--- a/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
+++ b/matlab/tools/cmake/BuildMatlabArrowInterface.cmake
@@ -77,6 +77,7 @@ set(MATLAB_ARROW_LIBMEXCLASS_CLIENT_PROXY_SOURCES "${CMAKE_SOURCE_DIR}/src/cpp/a
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/index/validate.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/buffer/proxy/buffer.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/array.cc"
+                                                  "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/array_importer.cc"
                                                   "${CMAKE_SOURCE_DIR}/src/cpp/arrow/matlab/c/proxy/schema.cc")