From 952cb73d2cee6305fb5af1ee5f0635dd929523a3 Mon Sep 17 00:00:00 2001
From: Tim Paine <3105306+timkpaine@users.noreply.github.com>
Date: Mon, 31 Jul 2023 03:48:39 +0000
Subject: [PATCH] wip linux

---
 .github/workflows/build.yml | 11 +-
 CMakeLists.txt | 79 +-
 cmake/modules/FindPyarrow.cmake | 45 +
 pyproject.toml | 1 +
 setup.py | 2 +-
 src/apn-python/caster.hpp | 6 -
 src/apn-python/common.cpp | 18 +-
 src/apn-python/common.hpp | 6 +-
 src/apn-python/cpython.cpp | 11 +-
 src/apn-python/cpython.hpp | 2 +-
 src/apn-python/pybind11.cpp | 1 +
 src/apn-python/pybind11.hpp | 8 +-
 .../arrow/python/CMakeLists.txt | 18 +
 .../arrow/python/arrow_to_pandas.cc | 2575 +++++++++++++++++
 .../arrow/python/arrow_to_pandas.h | 22 +
 .../arrow/python/arrow_to_python_internal.h | 49 +
 .../arrow/python/benchmark.cc | 38 +
 .../arrow/python/common.cc | 203 ++
 .../apache-arrow-12.0.1/arrow/python/csv.cc | 62 +
 .../apache-arrow-12.0.1/arrow/python/csv.h | 2 +-
 .../arrow/python/datetime.cc | 663 +++++
 .../arrow/python/datetime.h | 4 +-
 .../arrow/python/decimal.cc | 246 ++
 .../arrow/python/deserialize.cc | 495 ++++
 .../arrow/python/deserialize.h | 2 +-
 .../arrow/python/extension_type.cc | 217 ++
 .../arrow/python/extension_type.h | 2 +-
 .../arrow/python/filesystem.cc | 206 ++
 .../arrow/python/filesystem.h | 2 +-
 .../arrow/python/flight.cc | 388 +++
 .../apache-arrow-12.0.1/arrow/python/gdb.cc | 530 ++++
 .../arrow/python/helpers.cc | 470 +++
 .../arrow/python/helpers.h | 2 +-
 .../arrow/python/inference.cc | 748 +++++
 .../arrow/python/inference.h | 2 +-
 .../apache-arrow-12.0.1/arrow/python/init.cc | 24 +
 .../apache-arrow-12.0.1/arrow/python/io.cc | 384 +++
 .../apache-arrow-12.0.1/arrow/python/ipc.cc | 67 +
 .../apache-arrow-12.0.1/arrow/python/ipc.h | 4 +-
 .../arrow/python/numpy_convert.cc | 562 ++++
 .../arrow/python/numpy_convert.h | 2 +-
 .../arrow/python/numpy_internal.h | 182 ++
 .../arrow/python/numpy_to_arrow.cc | 870 ++++++
 .../arrow/python/parquet_encryption.cc | 98 +
 .../arrow/python/platform.h | 9 +-
 .../arrow/python/pyarrow.cc | 94 +
 .../arrow/python/python_test.cc | 888 ++++++
 .../arrow/python/python_to_arrow.cc | 1240 ++++++++
 .../arrow/python/python_to_arrow.h | 2 +-
 .../arrow/python/serialize.cc | 798 +++++
 .../arrow/python/serialize.h | 2 +-
 .../apache-arrow-12.0.1/arrow/python/udf.cc | 736 +++++
 .../apache-arrow-12.0.1/arrow/python/udf.h | 23 +-
 vcpkg.json | 2 +-
 54 files changed, 13036 insertions(+), 87 deletions(-)
 create mode 100644 cmake/modules/FindPyarrow.cmake
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/common.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/init.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/io.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc
 create mode 100644 src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9c53e7e..efb0744 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - tkp/linux
     tags:
       - v*
     paths-ignore:
@@ -102,7 +103,15 @@ jobs:
         uses: actions-ext/cpp/setup@v1

       - name: Install dependencies
-        run: make develop
+        run: make develop-cpp
+        if: ${{ matrix.os != 'ubuntu-22.04' }}
+
+      - name: Install dependencies
+        run: sudo apt-get install libarrow-dev
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+
+      - name: Install dependencies
+        run: make develop-py

       - name: Build project
         run: make build

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 962605f..ba353df 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,7 +56,7 @@ include_directories ("${CMAKE_SOURCE_DIR}/src")
 find_package (Color)
 if (NOT DEFINED CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE "Debugma" CACHE STRING "Release/Debug build")
+  set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Release/Debug build")
 endif()
 if (NOT DEFINED PYTHON_VERSION)
   set(PYTHON_VERSION "3.9" CACHE STRING "Python version to build against")
 endif()
@@ -104,24 +104,6 @@ if(WIN32)
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 endif()

-###############################################################################################################
-# Version #
-###########
-# Set version from cmake and extract latest hash if available
-set(ARROW_PYTHON_NOCOPY_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
-set(ARROW_PYTHON_NOCOPY_VERSION_MINOR ${PROJECT_VERSION_MINOR})
-set(ARROW_PYTHON_NOCOPY_VERSION_PATCH ${PROJECT_VERSION_PATCH})
-if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
-  # Get latest commit
-  execute_process(COMMAND git rev-parse HEAD
-                  WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-                  OUTPUT_VARIABLE ARROW_PYTHON_NOCOPY_VERSION_COMMIT_SHA)
-  # strip newline
-  string(REGEX REPLACE "\n$" "" ARROW_PYTHON_NOCOPY_VERSION_COMMIT_SHA "${ARROW_PYTHON_NOCOPY_VERSION_COMMIT_SHA}")
-else()
-  set(ARROW_PYTHON_NOCOPY_VERSION_COMMIT_SHA "release")
-endif()
-
 ###############################################################################################################
 # RPath #
 #########
@@ -148,6 +130,9 @@ endif()
 ###############################################################################################################
 # Flags #
 #########
+set(CMAKE_POSITION_INDEPENDENT_CODE On)
+add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0)
+
 # Compiler version flags
 if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++2a")
@@ -244,7 +229,7 @@ set(
 )

 add_library(arrow-python-nocopy SHARED ${PROJECT_SRCS})
-target_link_libraries(arrow-python-nocopy Arrow::arrow_static)
+target_link_libraries(arrow-python-nocopy PRIVATE Arrow::arrow_shared)
 set_target_properties(arrow-python-nocopy PROPERTIES PUBLIC_HEADER "${PROJECT_HDRS}")

 # export symbols
@@ -261,14 +246,46 @@ install(TARGETS arrow-python-nocopy EXPORT ArrowPythonNocopy LIBRARY DESTINATION
 if(BUILD_PYTHON)
   # Find Numpy
   find_package(NumPy REQUIRED)
+  include_directories(${NUMPY_INCLUDE_DIR})
+
+  # Find PyArrow. We will link against it for the build only
+  find_package(Pyarrow REQUIRED)
+  include_directories(${PYARROW_INCLUDE_DIR})
+
   # Find PyBind11
-  find_package(pybind11 REQUIRED)
-  include_directories(${pybind11_INCLUDE_DIR})
-  include_directories("${PROJECT_SOURCE_DIR}/src/vendored/apache-arrow-12.0.1")
-
+  find_package(pybind11 REQUIRED)
+  include_directories(${pybind11_INCLUDE_DIR})
+  set(VENDORED_PYARROW_ROOT "${PROJECT_SOURCE_DIR}/src/vendored/apache-arrow-12.0.1")
+  include_directories(${VENDORED_PYARROW_ROOT})
+  set(VENDORED_PYARROW_SRCS
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/arrow_to_pandas.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/benchmark.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/common.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/csv.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/datetime.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/decimal.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/deserialize.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/extension_type.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/filesystem.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/flight.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/gdb.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/helpers.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/inference.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/init.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/io.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/ipc.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/numpy_convert.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/numpy_to_arrow.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/parquet_encryption.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/pyarrow.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/python_test.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/python_to_arrow.cc
+    ${VENDORED_PYARROW_ROOT}/arrow/python/serialize.cc
+    # ${VENDORED_PYARROW_ROOT}/arrow/python/udf.cc
+)
+
   # common functionality
-  add_library(common SHARED "${PROJECT_SOURCE_DIR}/src/apn-python/common.cpp")
+  add_library(common SHARED "${PROJECT_SOURCE_DIR}/src/apn-python/common.cpp" ${VENDORED_PYARROW_SRCS})
   set_target_properties(common PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/common.hpp")

   # pybind11 extension
@@ -280,12 +297,12 @@ if(BUILD_PYTHON)
   set_target_properties(cpythonextension PROPERTIES PUBLIC_HEADER "${PROJECT_SOURCE_DIR}/src/apn-python/cpython.hpp")

   # Link to standalone/common library
-  target_link_libraries(common PUBLIC arrow-python-nocopy)
-  target_link_libraries(pybind11extension PUBLIC common)
-  target_link_libraries(cpythonextension PUBLIC common)
-  set_property(TARGET common PROPERTY INSTALL_RPATH "${module_origin_path}/lib")
-  set_property(TARGET pybind11extension PROPERTY INSTALL_RPATH "${module_origin_path}/lib")
-  set_property(TARGET cpythonextension PROPERTY INSTALL_RPATH "${module_origin_path}/lib")
+  target_link_libraries(common PRIVATE arrow-python-nocopy)
+  target_link_libraries(pybind11extension PRIVATE common)
+  target_link_libraries(cpythonextension PRIVATE common)
+  set_property(TARGET common PROPERTY INSTALL_RPATH "${module_origin_path}:${module_origin_path}/lib")
+  set_property(TARGET pybind11extension PROPERTY INSTALL_RPATH "${module_origin_path}:${module_origin_path}/lib")
+  set_property(TARGET cpythonextension PROPERTY INSTALL_RPATH "${module_origin_path}:${module_origin_path}/lib")
   set_property(TARGET cpythonextension PROPERTY PREFIX "")

   # install in python module
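The `-D_GLIBCXX_USE_CXX11_ABI=0` definition added above, and the `std::string` -> `char*` API changes later in this patch, both deal with libstdc++ ABI mismatches against a prebuilt libarrow. A minimal sketch of the constraint (assuming, as the flag suggests, that the system libarrow was built with the pre-C++11 string ABI):

    // Sketch only: every translation unit and libarrow itself must agree on
    // _GLIBCXX_USE_CXX11_ABI. Defining it before any standard include mirrors
    // the add_definitions() call above.
    #define _GLIBCXX_USE_CXX11_ABI 0
    #include <arrow/api.h>
    #include <iostream>

    int main() {
        // ToString() hands a std::string across the shared-library boundary.
        // If the two sides disagree on the ABI flag, basic_string mangles
        // differently (std::__cxx11::basic_string vs std::basic_string) and
        // the link fails with undefined references.
        std::shared_ptr<arrow::DataType> type = arrow::int64();
        std::cout << type->ToString() << std::endl;
        return 0;
    }

Returning `char*` instead of `std::string` from the exported helpers (see the common.cpp/common.hpp changes below) sidesteps the same issue, since C strings have a fixed ABI.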
diff --git a/cmake/modules/FindPyarrow.cmake b/cmake/modules/FindPyarrow.cmake
new file mode 100644
index 0000000..ca3fac4
--- /dev/null
+++ b/cmake/modules/FindPyarrow.cmake
@@ -0,0 +1,45 @@
+# Find the Pyarrow package
+# PYARROW_INCLUDE_DIR
+# PYARROW_LIB_DIR
+# PYARROW_LIBRARY
+# PYARROW_FOUND
+# will be set by this script
+
+cmake_minimum_required(VERSION 3.7.2)
+
+find_package( PythonInterp ${PYTHON_VERSION} EXACT REQUIRED )
+
+# Find out the include path
+execute_process(
+  COMMAND "${PYTHON_EXECUTABLE}" -c
+          "from __future__ import print_function;import pyarrow;print(pyarrow.get_include(), end='')"
+  OUTPUT_VARIABLE __pyarrow_path)
+
+# Find out the library path
+execute_process(
+  COMMAND "${PYTHON_EXECUTABLE}" -c
+          "from __future__ import print_function;import pyarrow;print(pyarrow.get_library_dirs()[0], end='')"
+  OUTPUT_VARIABLE __pyarrow_lib_path)
+
+# And the version
+execute_process(
+  COMMAND "${PYTHON_EXECUTABLE}" -c
+          "from __future__ import print_function;import pyarrow;print(pyarrow.__version__, end='')"
+  OUTPUT_VARIABLE __pyarrow_version)
+
+find_path(PYARROW_INCLUDE_DIR arrow/python/pyarrow.h
+  HINTS "${__pyarrow_path}" "${PYTHON_INCLUDE_PATH}" NO_DEFAULT_PATH)
+
+find_path(PYARROW_LIB_DIR libarrow_python.so
+  HINTS "${__pyarrow_lib_path}" "${PYTHON_LIBRARY_PATH}" NO_DEFAULT_PATH)
+
+find_file(PYARROW_LIBRARY libarrow_python.so
+  HINTS "${__pyarrow_lib_path}" "${PYTHON_LIBRARY_PATH}" NO_DEFAULT_PATH)
+
+if(PYARROW_INCLUDE_DIR AND PYARROW_LIB_DIR AND PYARROW_LIBRARY)
+  set(PYARROW_FOUND 1 CACHE INTERNAL "Python pyarrow found")
+endif()
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Pyarrow REQUIRED_VARS PYARROW_INCLUDE_DIR PYARROW_LIB_DIR PYARROW_LIBRARY
+                                  VERSION_VAR __pyarrow_version)

diff --git a/pyproject.toml b/pyproject.toml
index a849837..b0625b4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -91,6 +91,7 @@ test-requires = [
 [tool.cibuildwheel.linux]
 archs = "x86_64 aarch64"
 skip = "*musllinux* *i686"
+before-all = "make develop && sudo apt-get install libarrow-dev"

 [tool.cibuildwheel.macos]
 archs = "x86_64 arm64"

diff --git a/setup.py b/setup.py
index 4af45e2..315f767 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     )
 )

-if os.path.exists(vcpkg_config_file):
+if os.path.exists(vcpkg_config_file) and os.name != "posix":
     cmake_args.append(
         f"-DCMAKE_TOOLCHAIN_FILE={vcpkg_config_file}"
     )
const_name("pyarrow::Schema")); /* Python->C++ */ bool load(handle src, bool) { - arrow::py::import_pyarrow(); PyObject* source = src.ptr(); if(!arrow::py::is_schema(source)) return false; @@ -45,7 +42,6 @@ namespace detail { /* C++ -> Python) */ static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { - arrow::py::import_pyarrow(); return arrow::py::wrap_schema(src); } }; @@ -56,7 +52,6 @@ namespace detail { PYBIND11_TYPE_CASTER(std::shared_ptr, const_name("pyarrow::Table")); /* Python->C++ */ bool load(handle src, bool) { - arrow::py::import_pyarrow(); PyObject* source = src.ptr(); if(!arrow::py::is_table(source)) return false; @@ -69,7 +64,6 @@ namespace detail { /* C++ -> Python) */ static handle cast(std::shared_ptr src, return_value_policy /* policy */, handle /* parent */) { - arrow::py::import_pyarrow(); return arrow::py::wrap_table(src); } }; diff --git a/src/apn-python/common.cpp b/src/apn-python/common.cpp index b135e2d..f4e6888 100644 --- a/src/apn-python/common.cpp +++ b/src/apn-python/common.cpp @@ -1,20 +1,19 @@ +#include #include #include #include -std::string array_info_py(std::shared_ptr array) { +char* array_info_py(std::shared_ptr array) { // ABI unstable! // return array_info(array); - char buffer[100]; + char* buffer = new char[100]; struct ArrowArray c_array; (void)ExportArray(*array, &c_array); array_info_cabi(&c_array, buffer, 100); - return std::string(buffer); + return buffer; } std::shared_ptr create_array_py() { - arrow::py::import_pyarrow(); - // ABI unstable! // std::shared_ptr arrow_array = create_array_cabi(); struct ArrowArray c_array; @@ -23,19 +22,17 @@ std::shared_ptr create_array_py() { return arrow_array; } -std::string schema_info_py(std::shared_ptr schema) { +char* schema_info_py(std::shared_ptr schema) { // ABI unstable! // return schema_info(schema); - char buffer[100]; + char* buffer = new char[100]; struct ArrowSchema c_schema; (void)arrow::ExportSchema(*schema, &c_schema); schema_info_cabi(&c_schema, buffer, 100); - return std::string(buffer); + return buffer; } std::shared_ptr create_schema_py() { - arrow::py::import_pyarrow(); - // ABI unstable! 
diff --git a/src/apn-python/common.cpp b/src/apn-python/common.cpp
index b135e2d..f4e6888 100644
--- a/src/apn-python/common.cpp
+++ b/src/apn-python/common.cpp
@@ -1,20 +1,19 @@
+#include
 #include
 #include
 #include

-std::string array_info_py(std::shared_ptr<arrow::Array> array) {
+char* array_info_py(std::shared_ptr<arrow::Array> array) {
     // ABI unstable!
     // return array_info(array);
-    char buffer[100];
+    char* buffer = new char[100];
     struct ArrowArray c_array;
     (void)ExportArray(*array, &c_array);
     array_info_cabi(&c_array, buffer, 100);
-    return std::string(buffer);
+    return buffer;
 }

 std::shared_ptr<arrow::Array> create_array_py() {
-    arrow::py::import_pyarrow();
-
     // ABI unstable!
     // std::shared_ptr<arrow::Array> arrow_array = create_array_cabi();
     struct ArrowArray c_array;
@@ -23,19 +22,17 @@ std::shared_ptr<arrow::Array> create_array_py() {
     return arrow_array;
 }

-std::string schema_info_py(std::shared_ptr<arrow::Schema> schema) {
+char* schema_info_py(std::shared_ptr<arrow::Schema> schema) {
     // ABI unstable!
     // return schema_info(schema);
-    char buffer[100];
+    char* buffer = new char[100];
     struct ArrowSchema c_schema;
     (void)arrow::ExportSchema(*schema, &c_schema);
     schema_info_cabi(&c_schema, buffer, 100);
-    return std::string(buffer);
+    return buffer;
 }

 std::shared_ptr<arrow::Schema> create_schema_py() {
-    arrow::py::import_pyarrow();
-
     // ABI unstable!
     // std::shared_ptr<arrow::Schema> arrow_schema = create_schema();
@@ -50,7 +47,6 @@ std::shared_ptr<arrow::Schema> create_schema_py() {
 // }

 // std::shared_ptr<arrow::Table> create_table_py() {
-//     arrow::py::import_pyarrow();
 //     std::shared_ptr<arrow::Table> arrow_table = create_table();
 //     return arrow_table;
 //     // PyObject* obj = arrow::py::wrap_table(arrow_table);

diff --git a/src/apn-python/common.hpp b/src/apn-python/common.hpp
index d9fac7c..0564b08 100644
--- a/src/apn-python/common.hpp
+++ b/src/apn-python/common.hpp
@@ -4,11 +4,11 @@
 #include
 #include

-LIB_EXPORT std::string array_info_py(std::shared_ptr<arrow::Array> array);
+LIB_EXPORT char* array_info_py(std::shared_ptr<arrow::Array> array);
 LIB_EXPORT std::shared_ptr<arrow::Array> create_array_py();

-LIB_EXPORT std::string schema_info_py(std::shared_ptr<arrow::Schema> schema);
+LIB_EXPORT char* schema_info_py(std::shared_ptr<arrow::Schema> schema);
 LIB_EXPORT std::shared_ptr<arrow::Schema> create_schema_py();

-LIB_EXPORT std::string table_info_py(std::shared_ptr<arrow::Table> table);
+LIB_EXPORT char* table_info_py(std::shared_ptr<arrow::Table> table);
 LIB_EXPORT std::shared_ptr<arrow::Table> create_table_py();

diff --git a/src/apn-python/cpython.cpp b/src/apn-python/cpython.cpp
index e7142fc..9fd6284 100644
--- a/src/apn-python/cpython.cpp
+++ b/src/apn-python/cpython.cpp
@@ -1,5 +1,4 @@
 #include
-#include

 static PyObject* _raise_error(PyObject* module) {
     PyErr_SetString(PyExc_TypeError, "Bad value provided");
@@ -8,7 +7,6 @@ static PyObject* _raise_error(PyObject* module) {

 PyObject* array_info_py_raw(PyObject* self, PyObject* args) {
     PyObject* source;
-    arrow::py::import_pyarrow();

     if(!PyArg_ParseTuple(args, "O", &source))
         return _raise_error(self);
@@ -21,8 +19,8 @@
     if(!result.ok())
         return _raise_error(self);

-    std::string ret_str = array_info_py(std::static_pointer_cast<arrow::Array>(result.ValueOrDie()));
-    return PyUnicode_FromStringAndSize(ret_str.c_str(), ret_str.length());
+    char* ret_str = array_info_py(std::static_pointer_cast<arrow::Array>(result.ValueOrDie()));
+    return PyUnicode_FromStringAndSize(ret_str, strlen(ret_str));
 }

 PyObject* create_array_py_raw(PyObject* self, PyObject* args) {
@@ -31,7 +29,6 @@

 PyObject* schema_info_py_raw(PyObject* self, PyObject* args) {
     PyObject* source;
-    arrow::py::import_pyarrow();

     // parse arguments
     if(!PyArg_ParseTuple(args, "O", &source))
@@ -45,8 +42,8 @@
     if(!result.ok())
         return _raise_error(self);

-    std::string ret_str = schema_info_py(std::static_pointer_cast<arrow::Schema>(result.ValueOrDie()));
-    return PyUnicode_FromStringAndSize(ret_str.c_str(), ret_str.length());
+    char* ret_str = schema_info_py(std::static_pointer_cast<arrow::Schema>(result.ValueOrDie()));
+    return PyUnicode_FromStringAndSize(ret_str, strlen(ret_str));
 }

 PyObject* create_schema_py_raw(PyObject* self, PyObject* Py_UNUSED(args)) {

diff --git a/src/apn-python/cpython.hpp b/src/apn-python/cpython.hpp
index a65bb8b..17ffbcb 100644
--- a/src/apn-python/cpython.hpp
+++ b/src/apn-python/cpython.hpp
@@ -5,7 +5,6 @@
 LIB_EXPORT PyObject* array_info_py_raw(PyObject*, PyObject*);
 LIB_EXPORT PyObject* create_array_py_raw(PyObject*, PyObject*);
-
 LIB_EXPORT PyObject* schema_info_py_raw(PyObject*, PyObject*);
 LIB_EXPORT PyObject* create_schema_py_raw(PyObject*, PyObject*);
@@ -24,6 +23,7 @@ static PyModuleDef cpythonextension_module = {
     PyModuleDef_HEAD_INIT, "cpythonextension", "cpython", -1, cpythonextension_methods};

 PyMODINIT_FUNC PyInit_cpythonextension(void) {
+    arrow::py::import_pyarrow();
     Py_Initialize();
     return PyModule_Create(&cpythonextension_module);
 }
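Moving `arrow::py::import_pyarrow()` into module init, instead of calling it on every entry point as before, is the usual pattern. One nuance: it returns 0 on success and -1 with a Python exception set on failure, so a hedged sketch of an init that propagates the error could look like this (module and method names illustrative only):

    #include <Python.h>
    #include <arrow/python/pyarrow.h>

    static PyMethodDef example_methods[] = {{nullptr, nullptr, 0, nullptr}};

    static PyModuleDef example_module = {
        PyModuleDef_HEAD_INIT, "example", "docs", -1, example_methods};

    PyMODINIT_FUNC PyInit_example(void) {
        // import_pyarrow() loads pyarrow.lib and its C++ API table once;
        // it returns 0 on success, -1 (with a Python error set) on failure.
        if (arrow::py::import_pyarrow() != 0) {
            return nullptr;  // propagate the ImportError to the interpreter
        }
        return PyModule_Create(&example_module);
    }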
diff --git a/src/apn-python/pybind11.cpp b/src/apn-python/pybind11.cpp
index 8e212b1..003774c 100644
--- a/src/apn-python/pybind11.cpp
+++ b/src/apn-python/pybind11.cpp
@@ -1 +1,2 @@
 #include
+

diff --git a/src/apn-python/pybind11.hpp b/src/apn-python/pybind11.hpp
index f072224..277b16b 100644
--- a/src/apn-python/pybind11.hpp
+++ b/src/apn-python/pybind11.hpp
@@ -1,13 +1,19 @@
 #pragma once
+#include
 #include
 #include
+#include
 #include
 #include

 namespace py = pybind11;

+// LIB_EXPORT PyObject* table_info_py_raw(PyObject*, PyObject*);
 PYBIND11_MODULE(pybind11extension, m) {
-    arrow::py::import_pyarrow();
+    py::module_::import("pyarrow");
+    // dlopen("arrow_python.so", RTLD_LAZY);
+    // dlopen("libarrow_python.so", RTLD_LAZY);
+    import_pyarrow__lib();
     m.doc() = "pybind11";
     m.def("array_info", &array_info_py);
     m.def("create_array", &create_array_py);

diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt b/src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt
new file mode 100644
index 0000000..ff355e4
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+arrow_install_all_headers("arrow/python")

diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc
new file mode 100644
index 0000000..91c7b8a
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.cc
@@ -0,0 +1,2575 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+ +// Functions for pandas conversion via NumPy + +#include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/numpy_interop.h" // IWYU pragma: expand + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/datum.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" +#include "arrow/util/int_util.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/parallel.h" +#include "arrow/visit_type_inline.h" + +#include "arrow/compute/api.h" + +#include "arrow/python/arrow_to_python_internal.h" +#include "arrow/python/common.h" +#include "arrow/python/datetime.h" +#include "arrow/python/decimal.h" +#include "arrow/python/helpers.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_internal.h" +#include "arrow/python/pyarrow.h" +#include "arrow/python/python_to_arrow.h" +#include "arrow/python/type_traits.h" + +namespace arrow { + +class MemoryPool; + +using internal::checked_cast; +using internal::CheckIndexBounds; +using internal::OptionalParallelFor; + +namespace py { +namespace { + +// Fix options for conversion of an inner (child) array. +PandasOptions MakeInnerOptions(PandasOptions options) { + // Make sure conversion of inner dictionary arrays always returns an array, + // not a dict {'indices': array, 'dictionary': array, 'ordered': bool} + options.decode_dictionaries = true; + options.categorical_columns.clear(); + options.strings_to_categorical = false; + + // In ARROW-7723, we found as a result of ARROW-3789 that second + // through microsecond resolution tz-aware timestamps were being promoted to + // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy + // array in this function. PyArray_GETITEM returns datetime.datetime for + // units second through microsecond but PyLong for nanosecond (because + // datetime.datetime does not support nanoseconds). + // We force the object conversion to preserve the value of the timezone. + // Nanoseconds are returned as integers. 
+ options.coerce_temporal_nanoseconds = false; + + return options; +} + +// ---------------------------------------------------------------------- +// PyCapsule code for setting ndarray base to reference C++ object + +struct ArrayCapsule { + std::shared_ptr array; +}; + +struct BufferCapsule { + std::shared_ptr buffer; +}; + +void ArrayCapsule_Destructor(PyObject* capsule) { + delete reinterpret_cast(PyCapsule_GetPointer(capsule, "arrow::Array")); +} + +void BufferCapsule_Destructor(PyObject* capsule) { + delete reinterpret_cast(PyCapsule_GetPointer(capsule, "arrow::Buffer")); +} + +// ---------------------------------------------------------------------- +// pandas 0.x DataFrame conversion internals + +using internal::arrow_traits; +using internal::npy_traits; + +template +struct WrapBytes {}; + +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyUnicode_FromStringAndSize(data, length); + } +}; + +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyUnicode_FromStringAndSize(data, length); + } +}; + +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyBytes_FromStringAndSize(data, length); + } +}; + +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyBytes_FromStringAndSize(data, length); + } +}; + +template <> +struct WrapBytes { + static inline PyObject* Wrap(const char* data, int64_t length) { + return PyBytes_FromStringAndSize(data, length); + } +}; + +static inline bool ListTypeSupported(const DataType& type) { + switch (type.id()) { + case Type::BOOL: + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::UINT32: + case Type::INT32: + case Type::INT64: + case Type::UINT64: + case Type::HALF_FLOAT: + case Type::FLOAT: + case Type::DOUBLE: + case Type::DECIMAL128: + case Type::DECIMAL256: + case Type::BINARY: + case Type::LARGE_BINARY: + case Type::STRING: + case Type::LARGE_STRING: + case Type::DATE32: + case Type::DATE64: + case Type::STRUCT: + case Type::MAP: + case Type::TIME32: + case Type::TIME64: + case Type::TIMESTAMP: + case Type::DURATION: + case Type::DICTIONARY: + case Type::INTERVAL_MONTH_DAY_NANO: + case Type::NA: // empty list + // The above types are all supported. 
+ return true; + case Type::FIXED_SIZE_LIST: + case Type::LIST: + case Type::LARGE_LIST: { + const auto& list_type = checked_cast(type); + return ListTypeSupported(*list_type.value_type()); + } + case Type::EXTENSION: { + const auto& ext = checked_cast(*type.GetSharedPtr()); + return ListTypeSupported(*(ext.storage_type())); + } + default: + break; + } + return false; +} + +Status CapsulizeArray(const std::shared_ptr& arr, PyObject** out) { + auto capsule = new ArrayCapsule{{arr}}; + *out = PyCapsule_New(reinterpret_cast(capsule), "arrow::Array", + &ArrayCapsule_Destructor); + if (*out == nullptr) { + delete capsule; + RETURN_IF_PYERROR(); + } + return Status::OK(); +} + +Status CapsulizeBuffer(const std::shared_ptr& buffer, PyObject** out) { + auto capsule = new BufferCapsule{{buffer}}; + *out = PyCapsule_New(reinterpret_cast(capsule), "arrow::Buffer", + &BufferCapsule_Destructor); + if (*out == nullptr) { + delete capsule; + RETURN_IF_PYERROR(); + } + return Status::OK(); +} + +Status SetNdarrayBase(PyArrayObject* arr, PyObject* base) { + if (PyArray_SetBaseObject(arr, base) == -1) { + // Error occurred, trust that SetBaseObject sets the error state + Py_XDECREF(base); + RETURN_IF_PYERROR(); + } + return Status::OK(); +} + +Status SetBufferBase(PyArrayObject* arr, const std::shared_ptr& buffer) { + PyObject* base; + RETURN_NOT_OK(CapsulizeBuffer(buffer, &base)); + return SetNdarrayBase(arr, base); +} + +inline void set_numpy_metadata(int type, const DataType* datatype, PyArray_Descr* out) { + auto metadata = reinterpret_cast(out->c_metadata); + if (type == NPY_DATETIME) { + if (datatype->id() == Type::TIMESTAMP) { + const auto& timestamp_type = checked_cast(*datatype); + metadata->meta.base = internal::NumPyFrequency(timestamp_type.unit()); + } else { + DCHECK(false) << "NPY_DATETIME views only supported for Arrow TIMESTAMP types"; + } + } else if (type == NPY_TIMEDELTA) { + DCHECK_EQ(datatype->id(), Type::DURATION); + const auto& duration_type = checked_cast(*datatype); + metadata->meta.base = internal::NumPyFrequency(duration_type.unit()); + } +} + +Status PyArray_NewFromPool(int nd, npy_intp* dims, PyArray_Descr* descr, MemoryPool* pool, + PyObject** out) { + // ARROW-6570: Allocate memory from MemoryPool for a couple reasons + // + // * Track allocations + // * Get better performance through custom allocators + int64_t total_size = descr->elsize; + for (int i = 0; i < nd; ++i) { + total_size *= dims[i]; + } + + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(total_size, pool)); + *out = PyArray_NewFromDescr(&PyArray_Type, descr, nd, dims, + /*strides=*/nullptr, + /*data=*/buffer->mutable_data(), + /*flags=*/NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEABLE, + /*obj=*/nullptr); + if (*out == nullptr) { + RETURN_IF_PYERROR(); + // Trust that error set if NULL returned + } + return SetBufferBase(reinterpret_cast(*out), std::move(buffer)); +} + +template +inline const T* GetPrimitiveValues(const Array& arr) { + if (arr.length() == 0) { + return nullptr; + } + const int elsize = arr.type()->byte_width(); + const auto& prim_arr = checked_cast(arr); + return reinterpret_cast(prim_arr.values()->data() + arr.offset() * elsize); +} + +Status MakeNumPyView(std::shared_ptr arr, PyObject* py_ref, int npy_type, int ndim, + npy_intp* dims, PyObject** out) { + PyAcquireGIL lock; + + PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type); + set_numpy_metadata(npy_type, arr->type().get(), descr); + PyObject* result = PyArray_NewFromDescr( + &PyArray_Type, descr, ndim, dims, /*strides=*/nullptr, + 
const_cast(GetPrimitiveValues(*arr)), /*flags=*/0, nullptr); + PyArrayObject* np_arr = reinterpret_cast(result); + if (np_arr == nullptr) { + // Error occurred, trust that error set + return Status::OK(); + } + + PyObject* base; + if (py_ref == nullptr) { + // Capsule will be owned by the ndarray, no incref necessary. See + // ARROW-1973 + RETURN_NOT_OK(CapsulizeArray(arr, &base)); + } else { + Py_INCREF(py_ref); + base = py_ref; + } + RETURN_NOT_OK(SetNdarrayBase(np_arr, base)); + + // Do not allow Arrow data to be mutated + PyArray_CLEARFLAGS(np_arr, NPY_ARRAY_WRITEABLE); + *out = result; + return Status::OK(); +} + +class PandasWriter { + public: + enum type { + OBJECT, + UINT8, + INT8, + UINT16, + INT16, + UINT32, + INT32, + UINT64, + INT64, + HALF_FLOAT, + FLOAT, + DOUBLE, + BOOL, + DATETIME_DAY, + DATETIME_SECOND, + DATETIME_MILLI, + DATETIME_MICRO, + DATETIME_NANO, + DATETIME_SECOND_TZ, + DATETIME_MILLI_TZ, + DATETIME_MICRO_TZ, + DATETIME_NANO_TZ, + TIMEDELTA_SECOND, + TIMEDELTA_MILLI, + TIMEDELTA_MICRO, + TIMEDELTA_NANO, + CATEGORICAL, + EXTENSION + }; + + PandasWriter(const PandasOptions& options, int64_t num_rows, int num_columns) + : options_(options), num_rows_(num_rows), num_columns_(num_columns) { + PyAcquireGIL lock; + internal::InitPandasStaticData(); + } + virtual ~PandasWriter() {} + + void SetBlockData(PyObject* arr) { + block_arr_.reset(arr); + block_data_ = + reinterpret_cast(PyArray_DATA(reinterpret_cast(arr))); + } + + /// \brief Either copy or wrap single array to create pandas-compatible array + /// for Series or DataFrame. num_columns_ can only be 1. Will try to zero + /// copy if possible (or error if not possible and zero_copy_only=True) + virtual Status TransferSingle(std::shared_ptr data, PyObject* py_ref) = 0; + + /// \brief Copy ChunkedArray into a multi-column block + virtual Status CopyInto(std::shared_ptr data, int64_t rel_placement) = 0; + + Status EnsurePlacementAllocated() { + std::lock_guard guard(allocation_lock_); + if (placement_data_ != nullptr) { + return Status::OK(); + } + PyAcquireGIL lock; + npy_intp placement_dims[1] = {num_columns_}; + PyObject* placement_arr = PyArray_SimpleNew(1, placement_dims, NPY_INT64); + RETURN_IF_PYERROR(); + placement_arr_.reset(placement_arr); + placement_data_ = reinterpret_cast( + PyArray_DATA(reinterpret_cast(placement_arr))); + return Status::OK(); + } + + Status EnsureAllocated() { + std::lock_guard guard(allocation_lock_); + if (block_data_ != nullptr) { + return Status::OK(); + } + RETURN_NOT_OK(Allocate()); + return Status::OK(); + } + + virtual bool CanZeroCopy(const ChunkedArray& data) const { return false; } + + virtual Status Write(std::shared_ptr data, int64_t abs_placement, + int64_t rel_placement) { + RETURN_NOT_OK(EnsurePlacementAllocated()); + if (num_columns_ == 1 && options_.allow_zero_copy_blocks) { + RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr)); + } else { + RETURN_NOT_OK( + CheckNoZeroCopy("Cannot do zero copy conversion into " + "multi-column DataFrame block")); + RETURN_NOT_OK(EnsureAllocated()); + RETURN_NOT_OK(CopyInto(data, rel_placement)); + } + placement_data_[rel_placement] = abs_placement; + return Status::OK(); + } + + virtual Status GetDataFrameResult(PyObject** out) { + PyObject* result = PyDict_New(); + RETURN_IF_PYERROR(); + + PyObject* block; + RETURN_NOT_OK(GetResultBlock(&block)); + + PyDict_SetItemString(result, "block", block); + PyDict_SetItemString(result, "placement", placement_arr_.obj()); + + RETURN_NOT_OK(AddResultMetadata(result)); + *out = result; + 
return Status::OK(); + } + + // Caller steals the reference to this object + virtual Status GetSeriesResult(PyObject** out) { + RETURN_NOT_OK(MakeBlock1D()); + // Caller owns the object now + *out = block_arr_.detach(); + return Status::OK(); + } + + protected: + virtual Status AddResultMetadata(PyObject* result) { return Status::OK(); } + + Status MakeBlock1D() { + // For Series or for certain DataFrame block types, we need to shape to a + // 1D array when there is only one column + PyAcquireGIL lock; + + DCHECK_EQ(1, num_columns_); + + npy_intp new_dims[1] = {static_cast(num_rows_)}; + PyArray_Dims dims; + dims.ptr = new_dims; + dims.len = 1; + + PyObject* reshaped = PyArray_Newshape( + reinterpret_cast(block_arr_.obj()), &dims, NPY_ANYORDER); + RETURN_IF_PYERROR(); + + // ARROW-8801: Here a PyArrayObject is created that is not being managed by + // any OwnedRef object. This object is then put in the resulting object + // with PyDict_SetItemString, which increments the reference count, so a + // memory leak ensues. There are several ways to fix the memory leak but a + // simple one is to put the reshaped 1D block array in this OwnedRefNoGIL + // so it will be correctly decref'd when this class is destructed. + block_arr_.reset(reshaped); + return Status::OK(); + } + + virtual Status GetResultBlock(PyObject** out) { + *out = block_arr_.obj(); + return Status::OK(); + } + + Status CheckNoZeroCopy(const std::string& message) { + if (options_.zero_copy_only) { + return Status::Invalid(message); + } + return Status::OK(); + } + + Status CheckNotZeroCopyOnly(const ChunkedArray& data) { + if (options_.zero_copy_only) { + return Status::Invalid("Needed to copy ", data.num_chunks(), " chunks with ", + data.null_count(), " nulls, but zero_copy_only was True"); + } + return Status::OK(); + } + + virtual Status Allocate() { + return Status::NotImplemented("Override Allocate in subclasses"); + } + + Status AllocateNDArray(int npy_type, int ndim = 2) { + PyAcquireGIL lock; + + PyObject* block_arr = nullptr; + npy_intp block_dims[2] = {0, 0}; + + if (ndim == 2) { + block_dims[0] = num_columns_; + block_dims[1] = num_rows_; + } else { + block_dims[0] = num_rows_; + } + PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type); + if (PyDataType_REFCHK(descr)) { + // ARROW-6876: if the array has refcounted items, let Numpy + // own the array memory so as to decref elements on array destruction + block_arr = PyArray_SimpleNewFromDescr(ndim, block_dims, descr); + RETURN_IF_PYERROR(); + } else { + RETURN_NOT_OK( + PyArray_NewFromPool(ndim, block_dims, descr, options_.pool, &block_arr)); + } + + SetBlockData(block_arr); + return Status::OK(); + } + + void SetDatetimeUnit(NPY_DATETIMEUNIT unit) { + PyAcquireGIL lock; + auto date_dtype = reinterpret_cast( + PyArray_DESCR(reinterpret_cast(block_arr_.obj()))->c_metadata); + date_dtype->meta.base = unit; + } + + PandasOptions options_; + + std::mutex allocation_lock_; + + int64_t num_rows_; + int num_columns_; + + OwnedRefNoGIL block_arr_; + uint8_t* block_data_ = nullptr; + + // ndarray + OwnedRefNoGIL placement_arr_; + int64_t* placement_data_ = nullptr; + + private: + ARROW_DISALLOW_COPY_AND_ASSIGN(PandasWriter); +}; + +template +inline void ConvertIntegerWithNulls(const PandasOptions& options, + const ChunkedArray& data, OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const InType* in_values = GetPrimitiveValues(arr); + // Upcast to double, set NaN as appropriate + + for (int i = 0; i < 
arr.length(); ++i) { + *out_values++ = + arr.IsNull(i) ? static_cast(NAN) : static_cast(in_values[i]); + } + } +} + +template +inline void ConvertIntegerNoNullsSameType(const PandasOptions& options, + const ChunkedArray& data, T* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + if (arr.length() > 0) { + const T* in_values = GetPrimitiveValues(arr); + memcpy(out_values, in_values, sizeof(T) * arr.length()); + out_values += arr.length(); + } + } +} + +template +inline void ConvertIntegerNoNullsCast(const PandasOptions& options, + const ChunkedArray& data, OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const InType* in_values = GetPrimitiveValues(arr); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values = in_values[i]; + } + } +} + +template +struct MemoizationTraits { + using Scalar = typename T::c_type; +}; + +template +struct MemoizationTraits> { + // For binary, we memoize string_view as a scalar value to avoid having to + // unnecessarily copy the memory into the memo table data structure + using Scalar = std::string_view; +}; + +// Generic Array -> PyObject** converter that handles object deduplication, if +// requested +template +inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArray& data, + WrapFunction&& wrap_func, PyObject** out_values) { + using ArrayType = typename TypeTraits::ArrayType; + using Scalar = typename MemoizationTraits::Scalar; + + ::arrow::internal::ScalarMemoTable memo_table(options.pool); + std::vector unique_values; + int32_t memo_size = 0; + + auto WrapMemoized = [&](const Scalar& value, PyObject** out_values) { + int32_t memo_index; + RETURN_NOT_OK(memo_table.GetOrInsert(value, &memo_index)); + if (memo_index == memo_size) { + // New entry + RETURN_NOT_OK(wrap_func(value, out_values)); + unique_values.push_back(*out_values); + ++memo_size; + } else { + // Duplicate entry + Py_INCREF(unique_values[memo_index]); + *out_values = unique_values[memo_index]; + } + return Status::OK(); + }; + + auto WrapUnmemoized = [&](const Scalar& value, PyObject** out_values) { + return wrap_func(value, out_values); + }; + + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = arrow::internal::checked_cast(*data.chunk(c)); + if (options.deduplicate_objects) { + RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapMemoized, out_values)); + } else { + RETURN_NOT_OK(internal::WriteArrayObjects(arr, WrapUnmemoized, out_values)); + } + out_values += arr.length(); + } + return Status::OK(); +} + +Status ConvertStruct(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { + if (data.num_chunks() == 0) { + return Status::OK(); + } + // ChunkedArray has at least one chunk + auto arr = checked_cast(data.chunk(0).get()); + // Use it to cache the struct type and number of fields for all chunks + int32_t num_fields = arr->num_fields(); + auto array_type = arr->type(); + std::vector fields_data(num_fields * data.num_chunks()); + OwnedRef dict_item; + + // See notes in MakeInnerOptions. + options = MakeInnerOptions(std::move(options)); + // Don't blindly convert because timestamps in lists are handled differently. 
+ options.timestamp_as_object = true; + + for (int c = 0; c < data.num_chunks(); c++) { + auto fields_data_offset = c * num_fields; + auto arr = checked_cast(data.chunk(c).get()); + // Convert the struct arrays first + for (int32_t i = 0; i < num_fields; i++) { + auto field = arr->field(static_cast(i)); + // In case the field is an extension array, use .storage() to convert to Pandas + if (field->type()->id() == Type::EXTENSION) { + const ExtensionArray& arr_ext = checked_cast(*field); + field = arr_ext.storage(); + } + RETURN_NOT_OK(ConvertArrayToPandas(options, field, nullptr, + fields_data[i + fields_data_offset].ref())); + DCHECK(PyArray_Check(fields_data[i + fields_data_offset].obj())); + } + + // Construct a dictionary for each row + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr->length(); ++i) { + if (has_nulls && arr->IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + // Build the new dict object for the row + dict_item.reset(PyDict_New()); + RETURN_IF_PYERROR(); + for (int32_t field_idx = 0; field_idx < num_fields; ++field_idx) { + OwnedRef field_value; + auto name = array_type->field(static_cast(field_idx))->name(); + if (!arr->field(static_cast(field_idx))->IsNull(i)) { + // Value exists in child array, obtain it + auto array = reinterpret_cast( + fields_data[field_idx + fields_data_offset].obj()); + auto ptr = reinterpret_cast(PyArray_GETPTR1(array, i)); + field_value.reset(PyArray_GETITEM(array, ptr)); + RETURN_IF_PYERROR(); + } else { + // Translate the Null to a None + Py_INCREF(Py_None); + field_value.reset(Py_None); + } + // PyDict_SetItemString increments reference count + auto setitem_result = + PyDict_SetItemString(dict_item.obj(), name.c_str(), field_value.obj()); + RETURN_IF_PYERROR(); + DCHECK_EQ(setitem_result, 0); + } + *out_values = dict_item.obj(); + // Grant ownership to the resulting array + Py_INCREF(*out_values); + } + ++out_values; + } + } + return Status::OK(); +} + +Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr& dense_type, + ArrayVector* arrays) { + compute::ExecContext ctx(pool); + compute::CastOptions options; + for (size_t i = 0; i < arrays->size(); ++i) { + ARROW_ASSIGN_OR_RAISE((*arrays)[i], + compute::Cast(*(*arrays)[i], dense_type, options, &ctx)); + } + return Status::OK(); +} + +Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr& dense_type, + std::shared_ptr* array) { + auto chunks = (*array)->chunks(); + RETURN_NOT_OK(DecodeDictionaries(pool, dense_type, &chunks)); + *array = std::make_shared(std::move(chunks), dense_type); + return Status::OK(); +} + +template +Status ConvertListsLike(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { + // Get column of underlying value arrays + ArrayVector value_arrays; + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + // values() does not account for offsets, so we need to slice into it. + // We can't use Flatten(), because it removes the values behind a null list + // value, and that makes the offsets into original list values and our + // flattened_values array different. 
+ std::shared_ptr flattened_values = arr.values()->Slice( + arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0)); + if (arr.value_type()->id() == Type::EXTENSION) { + const auto& arr_ext = checked_cast(*flattened_values); + value_arrays.emplace_back(arr_ext.storage()); + } else { + value_arrays.emplace_back(flattened_values); + } + } + + using ListArrayType = typename ListArrayT::TypeClass; + const auto& list_type = checked_cast(*data.type()); + auto value_type = list_type.value_type(); + if (value_type->id() == Type::EXTENSION) { + value_type = checked_cast(*value_type).storage_type(); + } + + auto flat_column = std::make_shared(value_arrays, value_type); + + options = MakeInnerOptions(std::move(options)); + + OwnedRefNoGIL owned_numpy_array; + RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr, + owned_numpy_array.ref())); + PyObject* numpy_array = owned_numpy_array.obj(); + DCHECK(PyArray_Check(numpy_array)); + + int64_t chunk_offset = 0; + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + const bool has_nulls = data.null_count() > 0; + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + // Need to subtract value_offset(0) since the original chunk might be a slice + // into another array. + OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset - + arr.value_offset(0))); + OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset - + arr.value_offset(0))); + OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr)); + + if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) { + // Fall out of loop, will return from RETURN_IF_PYERROR + break; + } + *out_values = PyObject_GetItem(numpy_array, slice.obj()); + + if (*out_values == nullptr) { + // Fall out of loop, will return from RETURN_IF_PYERROR + break; + } + } + ++out_values; + } + RETURN_IF_PYERROR(); + + chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0); + } + + return Status::OK(); +} + +template +Status ConvertMapHelper(F1 resetRow, F2 addPairToRow, F3 stealRow, + const ChunkedArray& data, PyArrayObject* py_keys, + PyArrayObject* py_items, + // needed for null checks in items + const std::vector> item_arrays, + PyObject** out_values) { + OwnedRef key_value; + OwnedRef item_value; + + int64_t chunk_offset = 0; + for (int c = 0; c < data.num_chunks(); ++c) { + const auto& arr = checked_cast(*data.chunk(c)); + const bool has_nulls = data.null_count() > 0; + + // Make a list of key/item pairs for each row in array + for (int64_t i = 0; i < arr.length(); ++i) { + if (has_nulls && arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values = Py_None; + } else { + int64_t entry_offset = arr.value_offset(i); + int64_t num_pairs = arr.value_offset(i + 1) - entry_offset; + + // Build the new list object for the row of Python pairs + RETURN_NOT_OK(resetRow(num_pairs)); + + // Add each key/item pair in the row + for (int64_t j = 0; j < num_pairs; ++j) { + // Get key value, key is non-nullable for a valid row + auto ptr_key = reinterpret_cast( + PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j)); + key_value.reset(PyArray_GETITEM(py_keys, ptr_key)); + RETURN_IF_PYERROR(); + + if (item_arrays[c]->IsNull(entry_offset + j)) { + // Translate the Null to a None + Py_INCREF(Py_None); + item_value.reset(Py_None); + } else { + // Get valid value from item array + auto ptr_item = reinterpret_cast( + PyArray_GETPTR1(py_items, 
chunk_offset + entry_offset + j)); + item_value.reset(PyArray_GETITEM(py_items, ptr_item)); + RETURN_IF_PYERROR(); + } + + // Add the key/item pair to the row + RETURN_NOT_OK(addPairToRow(j, key_value, item_value)); + } + + // Pass ownership to the resulting array + *out_values = stealRow(); + } + ++out_values; + } + RETURN_IF_PYERROR(); + + chunk_offset += arr.values()->length(); + } + + return Status::OK(); +} + +// A more helpful error message around TypeErrors that may stem from unhashable keys +Status CheckMapAsPydictsTypeError() { + if (ARROW_PREDICT_TRUE(!PyErr_Occurred())) { + return Status::OK(); + } + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + // Modify the error string directly, so it is re-raised + // with our additional info. + // + // There are not many interesting things happening when this + // is hit. This is intended to only be called directly after + // PyDict_SetItem, where a finite set of errors could occur. + PyObject *type, *value, *traceback; + PyErr_Fetch(&type, &value, &traceback); + std::string message; + RETURN_NOT_OK(internal::PyObject_StdStringStr(value, &message)); + message += + ". If keys are not hashable, then you must use the option " + "[maps_as_pydicts=None (default)]"; + + // resets the error + PyErr_SetString(PyExc_TypeError, message.c_str()); + } + return ConvertPyError(); +} + +Status CheckForDuplicateKeys(bool error_on_duplicate_keys, Py_ssize_t total_dict_len, + Py_ssize_t total_raw_len) { + if (total_dict_len < total_raw_len) { + const char* message = + "[maps_as_pydicts] " + "After conversion of Arrow maps to pydicts, " + "detected data loss due to duplicate keys. " + "Original input length is [%lld], total converted pydict length is [%lld]."; + std::array buf; + std::snprintf(buf.data(), buf.size(), message, total_raw_len, total_dict_len); + + if (error_on_duplicate_keys) { + return Status::UnknownError(buf.data()); + } else { + ARROW_LOG(WARNING) << buf.data(); + } + } + return Status::OK(); +} + +Status ConvertMap(PandasOptions options, const ChunkedArray& data, + PyObject** out_values) { + // Get columns of underlying key/item arrays + std::vector> key_arrays; + std::vector> item_arrays; + for (int c = 0; c < data.num_chunks(); ++c) { + const auto& map_arr = checked_cast(*data.chunk(c)); + key_arrays.emplace_back(map_arr.keys()); + item_arrays.emplace_back(map_arr.items()); + } + + const auto& map_type = checked_cast(*data.type()); + auto key_type = map_type.key_type(); + auto item_type = map_type.item_type(); + + // ARROW-6899: Convert dictionary-encoded children to dense instead of + // failing below. A more efficient conversion than this could be done later + if (key_type->id() == Type::DICTIONARY) { + auto dense_type = checked_cast(*key_type).value_type(); + RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays)); + key_type = dense_type; + } + if (item_type->id() == Type::DICTIONARY) { + auto dense_type = checked_cast(*item_type).value_type(); + RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays)); + item_type = dense_type; + } + + // See notes in MakeInnerOptions. + options = MakeInnerOptions(std::move(options)); + // Don't blindly convert because timestamps in lists are handled differently. 
+ options.timestamp_as_object = true; + + auto flat_keys = std::make_shared(key_arrays, key_type); + auto flat_items = std::make_shared(item_arrays, item_type); + OwnedRefNoGIL owned_numpy_keys; + RETURN_NOT_OK( + ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref())); + OwnedRefNoGIL owned_numpy_items; + RETURN_NOT_OK( + ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref())); + PyArrayObject* py_keys = reinterpret_cast(owned_numpy_keys.obj()); + PyArrayObject* py_items = reinterpret_cast(owned_numpy_items.obj()); + + if (options.maps_as_pydicts == MapConversionType::DEFAULT) { + // The default behavior to express an Arrow MAP as a list of [(key, value), ...] pairs + OwnedRef list_item; + return ConvertMapHelper( + [&list_item](int64_t num_pairs) { + list_item.reset(PyList_New(num_pairs)); + return CheckPyError(); + }, + [&list_item](int64_t idx, OwnedRef& key_value, OwnedRef& item_value) { + PyList_SET_ITEM(list_item.obj(), idx, + PyTuple_Pack(2, key_value.obj(), item_value.obj())); + return CheckPyError(); + }, + [&list_item] { return list_item.detach(); }, data, py_keys, py_items, item_arrays, + out_values); + } else { + // Use a native pydict + OwnedRef dict_item; + Py_ssize_t total_dict_len{0}; + Py_ssize_t total_raw_len{0}; + + bool error_on_duplicate_keys; + if (options.maps_as_pydicts == MapConversionType::LOSSY) { + error_on_duplicate_keys = false; + } else if (options.maps_as_pydicts == MapConversionType::STRICT_) { + error_on_duplicate_keys = true; + } else { + auto val = std::underlying_type_t(options.maps_as_pydicts); + return Status::UnknownError("Received unknown option for maps_as_pydicts: " + + std::to_string(val)); + } + + auto status = ConvertMapHelper( + [&dict_item, &total_raw_len](int64_t num_pairs) { + total_raw_len += num_pairs; + dict_item.reset(PyDict_New()); + return CheckPyError(); + }, + [&dict_item]([[maybe_unused]] int64_t idx, OwnedRef& key_value, + OwnedRef& item_value) { + auto setitem_result = + PyDict_SetItem(dict_item.obj(), key_value.obj(), item_value.obj()); + ARROW_RETURN_NOT_OK(CheckMapAsPydictsTypeError()); + // returns -1 if there are internal errors around hashing/resizing + return setitem_result == 0 ? Status::OK() + : Status::UnknownError( + "[maps_as_pydicts] " + "Unexpected failure inserting Arrow (key, " + "value) pair into Python dict"); + }, + [&dict_item, &total_dict_len] { + total_dict_len += PyDict_Size(dict_item.obj()); + return dict_item.detach(); + }, + data, py_keys, py_items, item_arrays, out_values); + + ARROW_RETURN_NOT_OK(status); + // If there were no errors generating the pydicts, + // then check if we detected any data loss from duplicate keys. + return CheckForDuplicateKeys(error_on_duplicate_keys, total_dict_len, total_raw_len); + } +} + +template +inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value, + OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const InType* in_values = GetPrimitiveValues(arr); + + if (arr.null_count() > 0) { + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? 
na_value : in_values[i]; + } + } else { + memcpy(out_values, in_values, sizeof(InType) * arr.length()); + out_values += arr.length(); + } + } +} + +template +inline void ConvertNumericNullableCast(const ChunkedArray& data, InType na_value, + OutType* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const InType* in_values = GetPrimitiveValues(arr); + + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? static_cast(na_value) + : static_cast(in_values[i]); + } + } +} + +template +class TypedPandasWriter : public PandasWriter { + public: + using T = typename npy_traits::value_type; + + using PandasWriter::PandasWriter; + + Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { + if (CanZeroCopy(*data)) { + PyObject* wrapped; + npy_intp dims[2] = {static_cast(num_columns_), + static_cast(num_rows_)}; + RETURN_NOT_OK( + MakeNumPyView(data->chunk(0), py_ref, NPY_TYPE, /*ndim=*/2, dims, &wrapped)); + SetBlockData(wrapped); + return Status::OK(); + } else { + RETURN_NOT_OK(CheckNotZeroCopyOnly(*data)); + RETURN_NOT_OK(EnsureAllocated()); + return CopyInto(data, /*rel_placement=*/0); + } + } + + Status CheckTypeExact(const DataType& type, Type::type expected) { + if (type.id() != expected) { + // TODO(wesm): stringify NumPy / pandas type + return Status::NotImplemented("Cannot write Arrow data of type ", type.ToString()); + } + return Status::OK(); + } + + T* GetBlockColumnStart(int64_t rel_placement) { + return reinterpret_cast(block_data_) + rel_placement * num_rows_; + } + + protected: + Status Allocate() override { return AllocateNDArray(NPY_TYPE); } +}; + +struct ObjectWriterVisitor { + const PandasOptions& options; + const ChunkedArray& data; + PyObject** out_values; + + Status Visit(const NullType& type) { + for (int c = 0; c < data.num_chunks(); c++) { + std::shared_ptr arr = data.chunk(c); + + for (int64_t i = 0; i < arr->length(); ++i) { + // All values are null + Py_INCREF(Py_None); + *out_values = Py_None; + ++out_values; + } + } + return Status::OK(); + } + + Status Visit(const BooleanType& type) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else if (arr.Value(i)) { + // True + Py_INCREF(Py_True); + *out_values++ = Py_True; + } else { + // False + Py_INCREF(Py_False); + *out_values++ = Py_False; + } + } + } + return Status::OK(); + } + + template + enable_if_integer Visit(const Type& type) { + using T = typename Type::c_type; + auto WrapValue = [](T value, PyObject** out) { + *out = std::is_signed::value ? 
PyLong_FromLongLong(value) + : PyLong_FromUnsignedLongLong(value); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); + } + + template + enable_if_t::value || is_fixed_size_binary_type::value, + Status> + Visit(const Type& type) { + auto WrapValue = [](const std::string_view& view, PyObject** out) { + *out = WrapBytes::Wrap(view.data(), view.length()); + if (*out == nullptr) { + PyErr_Clear(); + return Status::UnknownError("Wrapping ", view, " failed"); + } + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); + } + + template + enable_if_date Visit(const Type& type) { + auto WrapValue = [](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(internal::PyDate_from_int(value, Type::UNIT, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); + } + + template + enable_if_time Visit(const Type& type) { + const TimeUnit::type unit = type.unit(); + auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(internal::PyTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, WrapValue, out_values); + } + + template + enable_if_timestamp Visit(const Type& type) { + const TimeUnit::type unit = type.unit(); + OwnedRef tzinfo; + + auto ConvertTimezoneNaive = [&](typename Type::c_type value, PyObject** out) { + RETURN_NOT_OK(internal::PyDateTime_from_int(value, unit, out)); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + auto ConvertTimezoneAware = [&](typename Type::c_type value, PyObject** out) { + PyObject* naive_datetime; + RETURN_NOT_OK(ConvertTimezoneNaive(value, &naive_datetime)); + + // convert the timezone naive datetime object to timezone aware + // two step conversion of the datetime mimics Python's code: + // dt.replace(tzinfo=datetime.timezone.utc).astimezone(tzinfo) + // first step: replacing timezone with timezone.utc (replace method) + OwnedRef args(PyTuple_New(0)); + OwnedRef keywords(PyDict_New()); + PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC); + OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace")); + OwnedRef datetime_utc( + PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj())); + // second step: adjust the datetime to tzinfo timezone (astimezone method) + *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj()); + + // the timezone naive object is no longer required + Py_DECREF(naive_datetime); + RETURN_IF_PYERROR(); + + return Status::OK(); + }; + + if (!type.timezone().empty() && !options.ignore_timezone) { + // convert timezone aware + PyObject* tzobj; + ARROW_ASSIGN_OR_RAISE(tzobj, internal::StringToTzinfo(type.timezone())); + tzinfo.reset(tzobj); + RETURN_IF_PYERROR(); + RETURN_NOT_OK( + ConvertAsPyObjects(options, data, ConvertTimezoneAware, out_values)); + } else { + // convert timezone naive + RETURN_NOT_OK( + ConvertAsPyObjects(options, data, ConvertTimezoneNaive, out_values)); + } + + return Status::OK(); + } + + template + enable_if_t::value, Status> Visit( + const Type& type) { + OwnedRef args(PyTuple_New(0)); + OwnedRef kwargs(PyDict_New()); + RETURN_IF_PYERROR(); + auto to_date_offset = [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, + PyObject** out) { + DCHECK(internal::BorrowPandasDataOffsetType() != nullptr); + // DateOffset objects do not add nanoseconds component 
to pd.Timestamp. + // as of Pandas 1.3.3 + // (https://github.com/pandas-dev/pandas/issues/43892). + // So convert microseconds and remainder to preserve data + // but give users more expected results. + int64_t microseconds = interval.nanoseconds / 1000; + int64_t nanoseconds; + if (interval.nanoseconds >= 0) { + nanoseconds = interval.nanoseconds % 1000; + } else { + nanoseconds = -((-interval.nanoseconds) % 1000); + } + + PyDict_SetItemString(kwargs.obj(), "months", PyLong_FromLong(interval.months)); + PyDict_SetItemString(kwargs.obj(), "days", PyLong_FromLong(interval.days)); + PyDict_SetItemString(kwargs.obj(), "microseconds", + PyLong_FromLongLong(microseconds)); + PyDict_SetItemString(kwargs.obj(), "nanoseconds", PyLong_FromLongLong(nanoseconds)); + *out = + PyObject_Call(internal::BorrowPandasDataOffsetType(), args.obj(), kwargs.obj()); + RETURN_IF_PYERROR(); + return Status::OK(); + }; + return ConvertAsPyObjects(options, data, to_date_offset, + out_values); + } + + Status Visit(const Decimal128Type& type) { + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); + RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal)); + PyObject* decimal_constructor = Decimal.obj(); + + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else { + *out_values++ = + internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); + RETURN_IF_PYERROR(); + } + } + } + + return Status::OK(); + } + + Status Visit(const Decimal256Type& type) { + OwnedRef decimal; + OwnedRef Decimal; + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal)); + RETURN_NOT_OK(internal::ImportFromModule(decimal.obj(), "Decimal", &Decimal)); + PyObject* decimal_constructor = Decimal.obj(); + + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { + Py_INCREF(Py_None); + *out_values++ = Py_None; + } else { + *out_values++ = + internal::DecimalFromString(decimal_constructor, arr.FormatValue(i)); + RETURN_IF_PYERROR(); + } + } + } + + return Status::OK(); + } + + template + enable_if_t::value || is_var_length_list_type::value, + Status> + Visit(const T& type) { + using ArrayType = typename TypeTraits::ArrayType; + if (!ListTypeSupported(*type.value_type())) { + return Status::NotImplemented( + "Not implemented type for conversion from List to Pandas: ", + type.value_type()->ToString()); + } + return ConvertListsLike(options, data, out_values); + } + + Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); } + + Status Visit(const StructType& type) { + return ConvertStruct(options, data, out_values); + } + + template + enable_if_t::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value || + (std::is_base_of::value && + !std::is_same::value) || + std::is_base_of::value, + Status> + Visit(const Type& type) { + return Status::NotImplemented("No implemented conversion to object dtype: ", + type.ToString()); + } +}; + +class ObjectWriter : public TypedPandasWriter { + public: + using TypedPandasWriter::TypedPandasWriter; + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + PyAcquireGIL lock; + ObjectWriterVisitor visitor{this->options_, *data, + 
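+    // The final member aims out_values at this column's slice of the 2-D
+    // object block: block_data_ + rel_placement * num_rows_ (see
+    // GetBlockColumnStart above).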
this->GetBlockColumnStart(rel_placement)}; + return VisitTypeInline(*data->type(), &visitor); + } +}; + +static inline bool IsNonNullContiguous(const ChunkedArray& data) { + return data.num_chunks() == 1 && data.null_count() == 0; +} + +template +class IntWriter : public TypedPandasWriter { + public: + using ArrowType = typename npy_traits::TypeClass; + using TypedPandasWriter::TypedPandasWriter; + + bool CanZeroCopy(const ChunkedArray& data) const override { + return IsNonNullContiguous(data); + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + RETURN_NOT_OK(this->CheckTypeExact(*data->type(), ArrowType::type_id)); + ConvertIntegerNoNullsSameType( + this->options_, *data, this->GetBlockColumnStart(rel_placement)); + return Status::OK(); + } +}; + +template +class FloatWriter : public TypedPandasWriter { + public: + using ArrowType = typename npy_traits::TypeClass; + using TypedPandasWriter::TypedPandasWriter; + using T = typename ArrowType::c_type; + + bool CanZeroCopy(const ChunkedArray& data) const override { + return IsNonNullContiguous(data) && data.type()->id() == ArrowType::type_id; + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + Type::type in_type = data->type()->id(); + auto out_values = this->GetBlockColumnStart(rel_placement); + +#define INTEGER_CASE(IN_TYPE) \ + ConvertIntegerWithNulls(this->options_, *data, out_values); \ + break; + + switch (in_type) { + case Type::UINT8: + INTEGER_CASE(uint8_t); + case Type::INT8: + INTEGER_CASE(int8_t); + case Type::UINT16: + INTEGER_CASE(uint16_t); + case Type::INT16: + INTEGER_CASE(int16_t); + case Type::UINT32: + INTEGER_CASE(uint32_t); + case Type::INT32: + INTEGER_CASE(int32_t); + case Type::UINT64: + INTEGER_CASE(uint64_t); + case Type::INT64: + INTEGER_CASE(int64_t); + case Type::HALF_FLOAT: + ConvertNumericNullableCast(*data, npy_traits::na_sentinel, out_values); + case Type::FLOAT: + ConvertNumericNullableCast(*data, npy_traits::na_sentinel, out_values); + break; + case Type::DOUBLE: + ConvertNumericNullableCast(*data, npy_traits::na_sentinel, out_values); + break; + default: + return Status::NotImplemented("Cannot write Arrow data of type ", + data->type()->ToString(), + " to a Pandas floating point block"); + } + +#undef INTEGER_CASE + + return Status::OK(); + } +}; + +using UInt8Writer = IntWriter; +using Int8Writer = IntWriter; +using UInt16Writer = IntWriter; +using Int16Writer = IntWriter; +using UInt32Writer = IntWriter; +using Int32Writer = IntWriter; +using UInt64Writer = IntWriter; +using Int64Writer = IntWriter; +using Float16Writer = FloatWriter; +using Float32Writer = FloatWriter; +using Float64Writer = FloatWriter; + +class BoolWriter : public TypedPandasWriter { + public: + using TypedPandasWriter::TypedPandasWriter; + + Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { + RETURN_NOT_OK( + CheckNoZeroCopy("Zero copy conversions not possible with " + "boolean types")); + RETURN_NOT_OK(EnsureAllocated()); + return CopyInto(data, /*rel_placement=*/0); + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + RETURN_NOT_OK(this->CheckTypeExact(*data->type(), Type::BOOL)); + auto out_values = this->GetBlockColumnStart(rel_placement); + for (int c = 0; c < data->num_chunks(); c++) { + const auto& arr = checked_cast(*data->chunk(c)); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = static_cast(arr.Value(i)); + } + } + return Status::OK(); + } +}; + +// 
---------------------------------------------------------------------- +// Date / timestamp types + +template +inline void ConvertDatetime(const ChunkedArray& data, int64_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const T* in_values = GetPrimitiveValues(arr); + + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? kPandasTimestampNull + : (static_cast(in_values[i]) * SHIFT); + } + } +} + +template +void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const T* in_values = GetPrimitiveValues(arr); + for (int64_t i = 0; i < arr.length(); ++i) { + *out_values++ = arr.IsNull(i) ? kPandasTimestampNull + : static_cast(in_values[i]) / SHIFT; + } + } +} + +class DatetimeDayWriter : public TypedPandasWriter { + public: + using TypedPandasWriter::TypedPandasWriter; + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + int64_t* out_values = this->GetBlockColumnStart(rel_placement); + const auto& type = checked_cast(*data->type()); + switch (type.unit()) { + case DateUnit::DAY: + ConvertDatesShift(*data, out_values); + break; + case DateUnit::MILLI: + ConvertDatesShift(*data, out_values); + break; + } + return Status::OK(); + } + + protected: + Status Allocate() override { + RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME)); + SetDatetimeUnit(NPY_FR_D); + return Status::OK(); + } +}; + +template +class DatetimeWriter : public TypedPandasWriter { + public: + using TypedPandasWriter::TypedPandasWriter; + + bool CanZeroCopy(const ChunkedArray& data) const override { + if (data.type()->id() == Type::TIMESTAMP) { + const auto& type = checked_cast(*data.type()); + return IsNonNullContiguous(data) && type.unit() == UNIT; + } else { + return false; + } + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + const auto& ts_type = checked_cast(*data->type()); + DCHECK_EQ(UNIT, ts_type.unit()) << "Should only call instances of this writer " + << "with arrays of the correct unit"; + ConvertNumericNullable(*data, kPandasTimestampNull, + this->GetBlockColumnStart(rel_placement)); + return Status::OK(); + } + + protected: + Status Allocate() override { + RETURN_NOT_OK(this->AllocateNDArray(NPY_DATETIME)); + SetDatetimeUnit(internal::NumPyFrequency(UNIT)); + return Status::OK(); + } +}; + +using DatetimeSecondWriter = DatetimeWriter; + +class DatetimeMilliWriter : public DatetimeWriter { + public: + using DatetimeWriter::DatetimeWriter; + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + Type::type type = data->type()->id(); + int64_t* out_values = this->GetBlockColumnStart(rel_placement); + if (type == Type::DATE32) { + // Convert from days since epoch to datetime64[ms] + ConvertDatetime(*data, out_values); + } else if (type == Type::DATE64) { + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } else { + const auto& ts_type = checked_cast(*data->type()); + DCHECK_EQ(TimeUnit::MILLI, ts_type.unit()) + << "Should only call instances of this writer " + << "with arrays of the correct unit"; + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } + return Status::OK(); + } +}; + +using DatetimeMicroWriter = DatetimeWriter; + +class DatetimeNanoWriter : public DatetimeWriter { + public: + using DatetimeWriter::DatetimeWriter; + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + Type::type type = 
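+    // This writer normalizes every input to datetime64[ns]: DATE32/DATE64
+    // values are shifted by a compile-time multiplier, while second/milli/
+    // microsecond timestamps go through compute::Cast (CastOptions::Safe by
+    // default, Unsafe when options_.safe_cast is false).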
data->type()->id(); + int64_t* out_values = this->GetBlockColumnStart(rel_placement); + compute::ExecContext ctx(options_.pool); + compute::CastOptions options; + if (options_.safe_cast) { + options = compute::CastOptions::Safe(); + } else { + options = compute::CastOptions::Unsafe(); + } + Datum out; + auto target_type = timestamp(TimeUnit::NANO); + + if (type == Type::DATE32) { + // Convert from days since epoch to datetime64[ns] + ConvertDatetime(*data, out_values); + } else if (type == Type::DATE64) { + // Date64Type is millisecond timestamp stored as int64_t + // TODO(wesm): Do we want to make sure to zero out the milliseconds? + ConvertDatetime(*data, out_values); + } else if (type == Type::TIMESTAMP) { + const auto& ts_type = checked_cast(*data->type()); + + if (ts_type.unit() == TimeUnit::NANO) { + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } else if (ts_type.unit() == TimeUnit::MICRO || ts_type.unit() == TimeUnit::MILLI || + ts_type.unit() == TimeUnit::SECOND) { + ARROW_ASSIGN_OR_RAISE(out, compute::Cast(data, target_type, options, &ctx)); + ConvertNumericNullable(*out.chunked_array(), kPandasTimestampNull, + out_values); + } else { + return Status::NotImplemented("Unsupported time unit"); + } + } else { + return Status::NotImplemented("Cannot write Arrow data of type ", + data->type()->ToString(), + " to a Pandas datetime block."); + } + return Status::OK(); + } +}; + +template +class DatetimeTZWriter : public BASE { + public: + DatetimeTZWriter(const PandasOptions& options, const std::string& timezone, + int64_t num_rows) + : BASE(options, num_rows, 1), timezone_(timezone) {} + + protected: + Status GetResultBlock(PyObject** out) override { + RETURN_NOT_OK(this->MakeBlock1D()); + *out = this->block_arr_.obj(); + return Status::OK(); + } + + Status AddResultMetadata(PyObject* result) override { + PyObject* py_tz = PyUnicode_FromStringAndSize( + timezone_.c_str(), static_cast(timezone_.size())); + RETURN_IF_PYERROR(); + PyDict_SetItemString(result, "timezone", py_tz); + Py_DECREF(py_tz); + return Status::OK(); + } + + private: + std::string timezone_; +}; + +using DatetimeSecondTZWriter = DatetimeTZWriter; +using DatetimeMilliTZWriter = DatetimeTZWriter; +using DatetimeMicroTZWriter = DatetimeTZWriter; +using DatetimeNanoTZWriter = DatetimeTZWriter; + +template +class TimedeltaWriter : public TypedPandasWriter { + public: + using TypedPandasWriter::TypedPandasWriter; + + Status AllocateTimedelta(int ndim) { + RETURN_NOT_OK(this->AllocateNDArray(NPY_TIMEDELTA, ndim)); + SetDatetimeUnit(internal::NumPyFrequency(UNIT)); + return Status::OK(); + } + + bool CanZeroCopy(const ChunkedArray& data) const override { + const auto& type = checked_cast(*data.type()); + return IsNonNullContiguous(data) && type.unit() == UNIT; + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + const auto& type = checked_cast(*data->type()); + DCHECK_EQ(UNIT, type.unit()) << "Should only call instances of this writer " + << "with arrays of the correct unit"; + ConvertNumericNullable(*data, kPandasTimestampNull, + this->GetBlockColumnStart(rel_placement)); + return Status::OK(); + } + + protected: + Status Allocate() override { return AllocateTimedelta(2); } +}; + +using TimedeltaSecondWriter = TimedeltaWriter; +using TimedeltaMilliWriter = TimedeltaWriter; +using TimedeltaMicroWriter = TimedeltaWriter; + +class TimedeltaNanoWriter : public TimedeltaWriter { + public: + using TimedeltaWriter::TimedeltaWriter; + + Status CopyInto(std::shared_ptr data, int64_t 
rel_placement) override { + Type::type type = data->type()->id(); + int64_t* out_values = this->GetBlockColumnStart(rel_placement); + if (type == Type::DURATION) { + const auto& ts_type = checked_cast(*data->type()); + if (ts_type.unit() == TimeUnit::NANO) { + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } else if (ts_type.unit() == TimeUnit::MICRO) { + ConvertDatetime(*data, out_values); + } else if (ts_type.unit() == TimeUnit::MILLI) { + ConvertDatetime(*data, out_values); + } else if (ts_type.unit() == TimeUnit::SECOND) { + ConvertDatetime(*data, out_values); + } else { + return Status::NotImplemented("Unsupported time unit"); + } + } else { + return Status::NotImplemented("Cannot write Arrow data of type ", + data->type()->ToString(), + " to a Pandas timedelta block."); + } + return Status::OK(); + } +}; + +Status MakeZeroLengthArray(const std::shared_ptr& type, + std::shared_ptr* out) { + std::unique_ptr builder; + RETURN_NOT_OK(MakeBuilder(default_memory_pool(), type, &builder)); + RETURN_NOT_OK(builder->Resize(0)); + return builder->Finish(out); +} + +bool NeedDictionaryUnification(const ChunkedArray& data) { + if (data.num_chunks() < 2) { + return false; + } + const auto& arr_first = checked_cast(*data.chunk(0)); + for (int c = 1; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + if (!(arr_first.dictionary()->Equals(arr.dictionary()))) { + return true; + } + } + return false; +} + +template +class CategoricalWriter + : public TypedPandasWriter::npy_type> { + public: + using TRAITS = arrow_traits; + using ArrayType = typename TypeTraits::ArrayType; + using T = typename TRAITS::T; + + explicit CategoricalWriter(const PandasOptions& options, int64_t num_rows) + : TypedPandasWriter(options, num_rows, 1), + ordered_(false), + needs_copy_(false) {} + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + return Status::NotImplemented("categorical type"); + } + + Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { + const auto& dict_type = checked_cast(*data->type()); + std::shared_ptr dict; + if (data->num_chunks() == 0) { + // no dictionary values => create empty array + RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1)); + RETURN_NOT_OK(MakeZeroLengthArray(dict_type.value_type(), &dict)); + } else { + DCHECK_EQ(IndexType::type_id, dict_type.index_type()->id()); + RETURN_NOT_OK(WriteIndices(*data, &dict)); + } + + PyObject* pydict; + RETURN_NOT_OK(ConvertArrayToPandas(this->options_, dict, nullptr, &pydict)); + dictionary_.reset(pydict); + ordered_ = dict_type.ordered(); + return Status::OK(); + } + + Status Write(std::shared_ptr data, int64_t abs_placement, + int64_t rel_placement) override { + RETURN_NOT_OK(this->EnsurePlacementAllocated()); + RETURN_NOT_OK(TransferSingle(data, /*py_ref=*/nullptr)); + this->placement_data_[rel_placement] = abs_placement; + return Status::OK(); + } + + Status GetSeriesResult(PyObject** out) override { + PyAcquireGIL lock; + + PyObject* result = PyDict_New(); + RETURN_IF_PYERROR(); + + // Expected single array dictionary layout + PyDict_SetItemString(result, "indices", this->block_arr_.obj()); + RETURN_IF_PYERROR(); + RETURN_NOT_OK(AddResultMetadata(result)); + + *out = result; + return Status::OK(); + } + + protected: + Status AddResultMetadata(PyObject* result) override { + PyDict_SetItemString(result, "dictionary", dictionary_.obj()); + PyObject* py_ordered = ordered_ ? 
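+    // Py_True and Py_False are shared singletons; they are INCREF'd below
+    // before being stored under the "ordered" key of the result dict.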
Py_True : Py_False; + Py_INCREF(py_ordered); + PyDict_SetItemString(result, "ordered", py_ordered); + return Status::OK(); + } + + Status WriteIndicesUniform(const ChunkedArray& data) { + RETURN_NOT_OK(this->AllocateNDArray(TRAITS::npy_type, 1)); + T* out_values = reinterpret_cast(this->block_data_); + + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + const auto& indices = checked_cast(*arr.indices()); + auto values = reinterpret_cast(indices.raw_values()); + + RETURN_NOT_OK(CheckIndexBounds(*indices.data(), arr.dictionary()->length())); + // Null is -1 in CategoricalBlock + for (int i = 0; i < arr.length(); ++i) { + if (indices.IsValid(i)) { + *out_values++ = values[i]; + } else { + *out_values++ = -1; + } + } + } + return Status::OK(); + } + + Status WriteIndicesVarying(const ChunkedArray& data, std::shared_ptr* out_dict) { + // Yield int32 indices to allow for dictionary outgrowing the current index + // type + RETURN_NOT_OK(this->AllocateNDArray(NPY_INT32, 1)); + auto out_values = reinterpret_cast(this->block_data_); + + const auto& dict_type = checked_cast(*data.type()); + + ARROW_ASSIGN_OR_RAISE(auto unifier, DictionaryUnifier::Make(dict_type.value_type(), + this->options_.pool)); + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = checked_cast(*data.chunk(c)); + const auto& indices = checked_cast(*arr.indices()); + auto values = reinterpret_cast(indices.raw_values()); + + std::shared_ptr transpose_buffer; + RETURN_NOT_OK(unifier->Unify(*arr.dictionary(), &transpose_buffer)); + + auto transpose = reinterpret_cast(transpose_buffer->data()); + int64_t dict_length = arr.dictionary()->length(); + + RETURN_NOT_OK(CheckIndexBounds(*indices.data(), dict_length)); + + // Null is -1 in CategoricalBlock + for (int i = 0; i < arr.length(); ++i) { + if (indices.IsValid(i)) { + *out_values++ = transpose[values[i]]; + } else { + *out_values++ = -1; + } + } + } + + std::shared_ptr unused_type; + return unifier->GetResult(&unused_type, out_dict); + } + + Status WriteIndices(const ChunkedArray& data, std::shared_ptr* out_dict) { + DCHECK_GT(data.num_chunks(), 0); + + // Sniff the first chunk + const auto& arr_first = checked_cast(*data.chunk(0)); + const auto indices_first = std::static_pointer_cast(arr_first.indices()); + + if (data.num_chunks() == 1 && indices_first->null_count() == 0) { + RETURN_NOT_OK( + CheckIndexBounds(*indices_first->data(), arr_first.dictionary()->length())); + + PyObject* wrapped; + npy_intp dims[1] = {static_cast(this->num_rows_)}; + RETURN_NOT_OK(MakeNumPyView(indices_first, /*py_ref=*/nullptr, TRAITS::npy_type, + /*ndim=*/1, dims, &wrapped)); + this->SetBlockData(wrapped); + *out_dict = arr_first.dictionary(); + } else { + RETURN_NOT_OK(this->CheckNotZeroCopyOnly(data)); + if (NeedDictionaryUnification(data)) { + RETURN_NOT_OK(WriteIndicesVarying(data, out_dict)); + } else { + RETURN_NOT_OK(WriteIndicesUniform(data)); + *out_dict = arr_first.dictionary(); + } + } + return Status::OK(); + } + + OwnedRefNoGIL dictionary_; + bool ordered_; + bool needs_copy_; +}; + +class ExtensionWriter : public PandasWriter { + public: + using PandasWriter::PandasWriter; + + Status Allocate() override { + // no-op + return Status::OK(); + } + + Status TransferSingle(std::shared_ptr data, PyObject* py_ref) override { + PyAcquireGIL lock; + PyObject* py_array; + py_array = wrap_chunked_array(data); + py_array_.reset(py_array); + + return Status::OK(); + } + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) 
override { + return TransferSingle(data, nullptr); + } + + Status GetDataFrameResult(PyObject** out) override { + PyAcquireGIL lock; + PyObject* result = PyDict_New(); + RETURN_IF_PYERROR(); + + PyDict_SetItemString(result, "py_array", py_array_.obj()); + PyDict_SetItemString(result, "placement", placement_arr_.obj()); + *out = result; + return Status::OK(); + } + + Status GetSeriesResult(PyObject** out) override { + *out = py_array_.detach(); + return Status::OK(); + } + + protected: + OwnedRefNoGIL py_array_; +}; + +Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, + const DataType& type, int64_t num_rows, int num_columns, + std::shared_ptr* writer) { +#define BLOCK_CASE(NAME, TYPE) \ + case PandasWriter::NAME: \ + *writer = std::make_shared(options, num_rows, num_columns); \ + break; + +#define CATEGORICAL_CASE(TYPE) \ + case TYPE::type_id: \ + *writer = std::make_shared>(options, num_rows); \ + break; + +#define TZ_CASE(NAME, TYPE) \ + case PandasWriter::NAME: { \ + const auto& ts_type = checked_cast(type); \ + *writer = std::make_shared(options, ts_type.timezone(), num_rows); \ + } break; + + switch (writer_type) { + case PandasWriter::CATEGORICAL: { + const auto& index_type = *checked_cast(type).index_type(); + switch (index_type.id()) { + CATEGORICAL_CASE(Int8Type); + CATEGORICAL_CASE(Int16Type); + CATEGORICAL_CASE(Int32Type); + CATEGORICAL_CASE(Int64Type); + case Type::UINT8: + case Type::UINT16: + case Type::UINT32: + case Type::UINT64: + return Status::TypeError( + "Converting unsigned dictionary indices to pandas", + " not yet supported, index type: ", index_type.ToString()); + default: + // Unreachable + DCHECK(false); + break; + } + } break; + case PandasWriter::EXTENSION: + *writer = std::make_shared(options, num_rows, num_columns); + break; + BLOCK_CASE(OBJECT, ObjectWriter); + BLOCK_CASE(UINT8, UInt8Writer); + BLOCK_CASE(INT8, Int8Writer); + BLOCK_CASE(UINT16, UInt16Writer); + BLOCK_CASE(INT16, Int16Writer); + BLOCK_CASE(UINT32, UInt32Writer); + BLOCK_CASE(INT32, Int32Writer); + BLOCK_CASE(UINT64, UInt64Writer); + BLOCK_CASE(INT64, Int64Writer); + BLOCK_CASE(HALF_FLOAT, Float16Writer); + BLOCK_CASE(FLOAT, Float32Writer); + BLOCK_CASE(DOUBLE, Float64Writer); + BLOCK_CASE(BOOL, BoolWriter); + BLOCK_CASE(DATETIME_DAY, DatetimeDayWriter); + BLOCK_CASE(DATETIME_SECOND, DatetimeSecondWriter); + BLOCK_CASE(DATETIME_MILLI, DatetimeMilliWriter); + BLOCK_CASE(DATETIME_MICRO, DatetimeMicroWriter); + BLOCK_CASE(DATETIME_NANO, DatetimeNanoWriter); + BLOCK_CASE(TIMEDELTA_SECOND, TimedeltaSecondWriter); + BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter); + BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter); + BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter); + TZ_CASE(DATETIME_SECOND_TZ, DatetimeSecondTZWriter); + TZ_CASE(DATETIME_MILLI_TZ, DatetimeMilliTZWriter); + TZ_CASE(DATETIME_MICRO_TZ, DatetimeMicroTZWriter); + TZ_CASE(DATETIME_NANO_TZ, DatetimeNanoTZWriter); + default: + return Status::NotImplemented("Unsupported block type"); + } + +#undef BLOCK_CASE +#undef CATEGORICAL_CASE + + return Status::OK(); +} + +static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& options, + PandasWriter::type* output_type) { +#define INTEGER_CASE(NAME) \ + *output_type = \ + data.null_count() > 0 \ + ? options.integer_object_nulls ? PandasWriter::OBJECT : PandasWriter::DOUBLE \ + : PandasWriter::NAME; \ + break; + + switch (data.type()->id()) { + case Type::BOOL: + *output_type = data.null_count() > 0 ? 
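+      // A NumPy bool block cannot represent missing values, so any null
+      // forces the object path (None/True/False) instead.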
PandasWriter::OBJECT : PandasWriter::BOOL; + break; + case Type::UINT8: + INTEGER_CASE(UINT8); + case Type::INT8: + INTEGER_CASE(INT8); + case Type::UINT16: + INTEGER_CASE(UINT16); + case Type::INT16: + INTEGER_CASE(INT16); + case Type::UINT32: + INTEGER_CASE(UINT32); + case Type::INT32: + INTEGER_CASE(INT32); + case Type::UINT64: + INTEGER_CASE(UINT64); + case Type::INT64: + INTEGER_CASE(INT64); + case Type::HALF_FLOAT: + *output_type = PandasWriter::HALF_FLOAT; + break; + case Type::FLOAT: + *output_type = PandasWriter::FLOAT; + break; + case Type::DOUBLE: + *output_type = PandasWriter::DOUBLE; + break; + case Type::STRING: // fall through + case Type::LARGE_STRING: // fall through + case Type::BINARY: // fall through + case Type::LARGE_BINARY: + case Type::NA: // fall through + case Type::FIXED_SIZE_BINARY: // fall through + case Type::STRUCT: // fall through + case Type::TIME32: // fall through + case Type::TIME64: // fall through + case Type::DECIMAL128: // fall through + case Type::DECIMAL256: // fall through + case Type::INTERVAL_MONTH_DAY_NANO: // fall through + *output_type = PandasWriter::OBJECT; + break; + case Type::DATE32: + if (options.date_as_object) { + *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::DATETIME_NANO; + } else if (options.to_numpy) { + // Numpy supports Day, but Pandas does not + *output_type = PandasWriter::DATETIME_DAY; + } else { + *output_type = PandasWriter::DATETIME_MILLI; + } + break; + case Type::DATE64: + if (options.date_as_object) { + *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::DATETIME_NANO; + } else { + *output_type = PandasWriter::DATETIME_MILLI; + } + break; + case Type::TIMESTAMP: { + const auto& ts_type = checked_cast(*data.type()); + if (options.timestamp_as_object && ts_type.unit() != TimeUnit::NANO) { + // Nanoseconds are never out of bounds for pandas, so in that case + // we don't convert to object + *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + if (!ts_type.timezone().empty()) { + *output_type = PandasWriter::DATETIME_NANO_TZ; + } else { + *output_type = PandasWriter::DATETIME_NANO; + } + } else { + if (!ts_type.timezone().empty()) { + switch (ts_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::DATETIME_SECOND_TZ; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::DATETIME_MILLI_TZ; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::DATETIME_MICRO_TZ; + break; + case TimeUnit::NANO: + *output_type = PandasWriter::DATETIME_NANO_TZ; + break; + } + } else { + switch (ts_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::DATETIME_SECOND; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::DATETIME_MILLI; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::DATETIME_MICRO; + break; + case TimeUnit::NANO: + *output_type = PandasWriter::DATETIME_NANO; + break; + } + } + } + } break; + case Type::DURATION: { + const auto& dur_type = checked_cast(*data.type()); + if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::TIMEDELTA_NANO; + } else { + switch (dur_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::TIMEDELTA_SECOND; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::TIMEDELTA_MILLI; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::TIMEDELTA_MICRO; + break; + case TimeUnit::NANO: + *output_type 
= PandasWriter::TIMEDELTA_NANO; + break; + } + } + } break; + case Type::FIXED_SIZE_LIST: + case Type::LIST: + case Type::LARGE_LIST: + case Type::MAP: { + auto list_type = std::static_pointer_cast(data.type()); + if (!ListTypeSupported(*list_type->value_type())) { + return Status::NotImplemented("Not implemented type for Arrow list to pandas: ", + list_type->value_type()->ToString()); + } + *output_type = PandasWriter::OBJECT; + } break; + case Type::DICTIONARY: + *output_type = PandasWriter::CATEGORICAL; + break; + case Type::EXTENSION: + *output_type = PandasWriter::EXTENSION; + break; + default: + return Status::NotImplemented( + "No known equivalent Pandas block for Arrow data of type ", + data.type()->ToString(), " is known."); + } + return Status::OK(); +} + +// Construct the exact pandas "BlockManager" memory layout +// +// * For each column determine the correct output pandas type +// * Allocate 2D blocks (ncols x nrows) for each distinct data type in output +// * Allocate block placement arrays +// * Write Arrow columns out into each slice of memory; populate block +// * placement arrays as we go +class PandasBlockCreator { + public: + using WriterMap = std::unordered_map>; + + explicit PandasBlockCreator(const PandasOptions& options, FieldVector fields, + ChunkedArrayVector arrays) + : options_(options), fields_(std::move(fields)), arrays_(std::move(arrays)) { + num_columns_ = static_cast(arrays_.size()); + if (num_columns_ > 0) { + num_rows_ = arrays_[0]->length(); + } + column_block_placement_.resize(num_columns_); + } + virtual ~PandasBlockCreator() = default; + + virtual Status Convert(PyObject** out) = 0; + + Status AppendBlocks(const WriterMap& blocks, PyObject* list) { + for (const auto& it : blocks) { + PyObject* item; + RETURN_NOT_OK(it.second->GetDataFrameResult(&item)); + if (PyList_Append(list, item) < 0) { + RETURN_IF_PYERROR(); + } + + // ARROW-1017; PyList_Append increments object refcount + Py_DECREF(item); + } + return Status::OK(); + } + + protected: + PandasOptions options_; + + FieldVector fields_; + ChunkedArrayVector arrays_; + int num_columns_; + int64_t num_rows_; + + // column num -> relative placement within internal block + std::vector column_block_placement_; +}; + +// Helper function for extension chunked arrays +// Constructing a storage chunked array of an extension chunked array +std::shared_ptr GetStorageChunkedArray(std::shared_ptr arr) { + auto value_type = checked_cast(*arr->type()).storage_type(); + ArrayVector storage_arrays; + for (int c = 0; c < arr->num_chunks(); c++) { + const auto& arr_ext = checked_cast(*arr->chunk(c)); + storage_arrays.emplace_back(arr_ext.storage()); + } + return std::make_shared(std::move(storage_arrays), value_type); +}; + +class ConsolidatedBlockCreator : public PandasBlockCreator { + public: + using PandasBlockCreator::PandasBlockCreator; + + Status Convert(PyObject** out) override { + column_types_.resize(num_columns_); + RETURN_NOT_OK(CreateBlocks()); + RETURN_NOT_OK(WriteTableToBlocks()); + PyAcquireGIL lock; + + PyObject* result = PyList_New(0); + RETURN_IF_PYERROR(); + + RETURN_NOT_OK(AppendBlocks(blocks_, result)); + RETURN_NOT_OK(AppendBlocks(singleton_blocks_, result)); + + *out = result; + return Status::OK(); + } + + Status GetBlockType(int column_index, PandasWriter::type* out) { + if (options_.extension_columns.count(fields_[column_index]->name())) { + *out = PandasWriter::EXTENSION; + return Status::OK(); + } else { + // In case of an extension array default to the storage type + if 
(arrays_[column_index]->type()->id() == Type::EXTENSION) { + arrays_[column_index] = GetStorageChunkedArray(arrays_[column_index]); + } + return GetPandasWriterType(*arrays_[column_index], options_, out); + } + } + + Status CreateBlocks() { + for (int i = 0; i < num_columns_; ++i) { + const DataType& type = *arrays_[i]->type(); + PandasWriter::type output_type; + RETURN_NOT_OK(GetBlockType(i, &output_type)); + + int block_placement = 0; + std::shared_ptr writer; + if (output_type == PandasWriter::CATEGORICAL || + output_type == PandasWriter::DATETIME_SECOND_TZ || + output_type == PandasWriter::DATETIME_MILLI_TZ || + output_type == PandasWriter::DATETIME_MICRO_TZ || + output_type == PandasWriter::DATETIME_NANO_TZ || + output_type == PandasWriter::EXTENSION) { + RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_, + /*num_columns=*/1, &writer)); + singleton_blocks_[i] = writer; + } else { + auto it = block_sizes_.find(output_type); + if (it != block_sizes_.end()) { + block_placement = it->second; + // Increment count + ++it->second; + } else { + // Add key to map + block_sizes_[output_type] = 1; + } + } + column_types_[i] = output_type; + column_block_placement_[i] = block_placement; + } + + // Create normal non-categorical blocks + for (const auto& it : this->block_sizes_) { + PandasWriter::type output_type = static_cast(it.first); + std::shared_ptr block; + RETURN_NOT_OK(MakeWriter(this->options_, output_type, /*unused*/ *null(), num_rows_, + it.second, &block)); + this->blocks_[output_type] = block; + } + return Status::OK(); + } + + Status GetWriter(int i, std::shared_ptr* block) { + PandasWriter::type output_type = this->column_types_[i]; + switch (output_type) { + case PandasWriter::CATEGORICAL: + case PandasWriter::DATETIME_SECOND_TZ: + case PandasWriter::DATETIME_MILLI_TZ: + case PandasWriter::DATETIME_MICRO_TZ: + case PandasWriter::DATETIME_NANO_TZ: + case PandasWriter::EXTENSION: { + auto it = this->singleton_blocks_.find(i); + if (it == this->singleton_blocks_.end()) { + return Status::KeyError("No block allocated"); + } + *block = it->second; + } break; + default: + auto it = this->blocks_.find(output_type); + if (it == this->blocks_.end()) { + return Status::KeyError("No block allocated"); + } + *block = it->second; + break; + } + return Status::OK(); + } + + Status WriteTableToBlocks() { + auto WriteColumn = [this](int i) { + std::shared_ptr block; + RETURN_NOT_OK(this->GetWriter(i, &block)); + // ARROW-3789 Use std::move on the array to permit self-destructing + return block->Write(std::move(arrays_[i]), i, this->column_block_placement_[i]); + }; + + return OptionalParallelFor(options_.use_threads, num_columns_, WriteColumn); + } + + private: + // column num -> block type id + std::vector column_types_; + + // block type -> type count + std::unordered_map block_sizes_; + std::unordered_map block_types_; + + // block type -> block + WriterMap blocks_; + + WriterMap singleton_blocks_; +}; + +/// \brief Create blocks for pandas.DataFrame block manager using one block per +/// column strategy. This permits some zero-copy optimizations as well as the +/// ability for the table to "self-destruct" if selected by the user. 
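+/// As an illustration (assuming the usual pyarrow-level entry point), this
+/// strategy is what Table.to_pandas(split_blocks=True) selects, typically
+/// combined with self_destruct=True:
+///
+///   df = table.to_pandas(split_blocks=True, self_destruct=True)
+///   del table  # buffers are released column-by-column during conversion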
+class SplitBlockCreator : public PandasBlockCreator { + public: + using PandasBlockCreator::PandasBlockCreator; + + Status GetWriter(int i, std::shared_ptr* writer) { + PandasWriter::type output_type = PandasWriter::OBJECT; + const DataType& type = *arrays_[i]->type(); + if (options_.extension_columns.count(fields_[i]->name())) { + output_type = PandasWriter::EXTENSION; + } else { + // Null count needed to determine output type + RETURN_NOT_OK(GetPandasWriterType(*arrays_[i], options_, &output_type)); + } + return MakeWriter(this->options_, output_type, type, num_rows_, 1, writer); + } + + Status Convert(PyObject** out) override { + PyAcquireGIL lock; + + PyObject* result = PyList_New(0); + RETURN_IF_PYERROR(); + + for (int i = 0; i < num_columns_; ++i) { + std::shared_ptr writer; + RETURN_NOT_OK(GetWriter(i, &writer)); + // ARROW-3789 Use std::move on the array to permit self-destructing + RETURN_NOT_OK(writer->Write(std::move(arrays_[i]), i, /*rel_placement=*/0)); + + PyObject* item; + RETURN_NOT_OK(writer->GetDataFrameResult(&item)); + if (PyList_Append(result, item) < 0) { + RETURN_IF_PYERROR(); + } + // PyList_Append increments object refcount + Py_DECREF(item); + } + + *out = result; + return Status::OK(); + } + + private: + std::vector> writers_; +}; + +Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arrays, + FieldVector* fields) { + std::vector columns_to_encode; + + // For Categorical conversions + auto EncodeColumn = [&](int j) { + int i = columns_to_encode[j]; + if (options.zero_copy_only) { + return Status::Invalid("Need to dictionary encode a column, but ", + "only zero-copy conversions allowed"); + } + compute::ExecContext ctx(options.pool); + ARROW_ASSIGN_OR_RAISE( + Datum out, DictionaryEncode((*arrays)[i], + compute::DictionaryEncodeOptions::Defaults(), &ctx)); + (*arrays)[i] = out.chunked_array(); + (*fields)[i] = (*fields)[i]->WithType((*arrays)[i]->type()); + return Status::OK(); + }; + + if (!options.categorical_columns.empty()) { + for (int i = 0; i < static_cast(arrays->size()); i++) { + if ((*arrays)[i]->type()->id() != Type::DICTIONARY && + options.categorical_columns.count((*fields)[i]->name())) { + columns_to_encode.push_back(i); + } + } + } + if (options.strings_to_categorical) { + for (int i = 0; i < static_cast(arrays->size()); i++) { + if (is_base_binary_like((*arrays)[i]->type()->id())) { + columns_to_encode.push_back(i); + } + } + } + return OptionalParallelFor(options.use_threads, + static_cast(columns_to_encode.size()), EncodeColumn); +} + +} // namespace + +Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr arr, + PyObject* py_ref, PyObject** out) { + return ConvertChunkedArrayToPandas( + options, std::make_shared(std::move(arr)), py_ref, out); +} + +Status ConvertChunkedArrayToPandas(const PandasOptions& options, + std::shared_ptr arr, PyObject* py_ref, + PyObject** out) { + if (options.decode_dictionaries && arr->type()->id() == Type::DICTIONARY) { + const auto& dense_type = + checked_cast(*arr->type()).value_type(); + RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &arr)); + DCHECK_NE(arr->type()->id(), Type::DICTIONARY); + + // The original Python DictionaryArray won't own the memory anymore + // as we actually built a new array when we decoded the DictionaryArray + // thus let the final resulting numpy array own the memory through a Capsule + py_ref = nullptr; + } + + if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) { + if (options.zero_copy_only) { + 
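+      // Dictionary-encoding the strings allocates new index and dictionary
+      // arrays, which a zero_copy_only request forbids.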
+      return Status::Invalid("Need to dictionary encode a column, but ",
+                             "only zero-copy conversions allowed");
+    }
+    compute::ExecContext ctx(options.pool);
+    ARROW_ASSIGN_OR_RAISE(
+        Datum out,
+        DictionaryEncode(arr, compute::DictionaryEncodeOptions::Defaults(), &ctx));
+    arr = out.chunked_array();
+  }
+
+  PandasOptions modified_options = options;
+  modified_options.strings_to_categorical = false;
+
+  // ARROW-7596: We permit the hybrid Series/DataFrame code path to do zero copy
+  // optimizations that we do not allow in the default case when converting
+  // Table->DataFrame
+  modified_options.allow_zero_copy_blocks = true;
+
+  // In case of an extension array default to the storage type
+  if (arr->type()->id() == Type::EXTENSION) {
+    arr = GetStorageChunkedArray(arr);
+  }
+
+  PandasWriter::type output_type;
+  RETURN_NOT_OK(GetPandasWriterType(*arr, modified_options, &output_type));
+  if (options.decode_dictionaries) {
+    DCHECK_NE(output_type, PandasWriter::CATEGORICAL);
+  }
+
+  std::shared_ptr<PandasWriter> writer;
+  RETURN_NOT_OK(MakeWriter(modified_options, output_type, *arr->type(), arr->length(),
+                           /*num_columns=*/1, &writer));
+  RETURN_NOT_OK(writer->TransferSingle(std::move(arr), py_ref));
+  return writer->GetSeriesResult(out);
+}
+
+Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
+                            PyObject** out) {
+  ChunkedArrayVector arrays = table->columns();
+  FieldVector fields = table->fields();
+
+  // ARROW-3789: allow "self-destructing" by releasing references to columns as
+  // we convert them to pandas
+  table = nullptr;
+
+  RETURN_NOT_OK(ConvertCategoricals(options, &arrays, &fields));
+
+  PandasOptions modified_options = options;
+  modified_options.strings_to_categorical = false;
+  modified_options.categorical_columns.clear();
+
+  if (options.split_blocks) {
+    modified_options.allow_zero_copy_blocks = true;
+    SplitBlockCreator helper(modified_options, std::move(fields), std::move(arrays));
+    return helper.Convert(out);
+  } else {
+    ConsolidatedBlockCreator helper(modified_options, std::move(fields),
+                                    std::move(arrays));
+    return helper.Convert(out);
+  }
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h
index 6570364..82e0a60 100644
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_pandas.h
@@ -41,6 +41,13 @@ class Table;
 
 namespace py {
 
+enum class MapConversionType {
+  DEFAULT,  // convert Arrow maps to assoc lists (lists of key-value tuples) in Pandas
+  LOSSY,    // report warnings when lossiness is encountered due to duplicate keys
+  STRICT_,  // raise a Python exception when lossiness is encountered due to duplicate
+            // keys
+};
+
 struct PandasOptions {
   /// arrow::MemoryPool to use for memory allocations
   MemoryPool* pool = default_memory_pool();
@@ -90,6 +97,17 @@ struct PandasOptions {
   /// conversions
   bool self_destruct = false;
 
+  /// \brief The default behavior (DEFAULT) is to convert Arrow Map arrays to
+  /// Python association lists (list-of-tuples) in the same order as the Arrow
+  /// Map, as in [(key1, value1), (key2, value2), ...].
+  /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts.
+  /// This can change the ordering of (key, value) pairs, and will deduplicate
+  /// multiple keys, resulting in a possible loss of data.
+  /// If 'lossy', this key deduplication results in a warning printed
+  /// when detected. If 'strict', this instead results in an exception
+  /// being raised when detected.
+  MapConversionType maps_as_pydicts = MapConversionType::DEFAULT;
+
   // Used internally for nested arrays.
   bool decode_dictionaries = false;
 
@@ -99,6 +117,10 @@ struct PandasOptions {
   // Columns that should be passed through to be converted to
   // ExtensionArray/Block
   std::unordered_set<std::string> extension_columns;
+
+  // Used internally to distinguish between to_numpy() and to_pandas() when
+  // the expected output differs
+  bool to_numpy = false;
 };
 
 ARROW_PYTHON_EXPORT
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h
new file mode 100644
index 0000000..514cda3
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/arrow_to_python_internal.h
@@ -0,0 +1,49 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "arrow/array.h"
+#include "arrow/python/platform.h"
+
+namespace arrow {
+namespace py {
+namespace internal {
+// TODO(ARROW-12976): See if we can refactor Pandas ObjectWriter logic
+// to the .cc file and move this there as well if we can.
+
+// Converts array to a sequence of Python objects.
+template <typename ArrayType, typename WriteValue, typename Assigner>
+inline Status WriteArrayObjects(const ArrayType& arr, WriteValue&& write_func,
+                                Assigner out_values) {
+  // TODO(ARROW-12976): Use visitor here?
+  const bool has_nulls = arr.null_count() > 0;
+  for (int64_t i = 0; i < arr.length(); ++i) {
+    if (has_nulls && arr.IsNull(i)) {
+      Py_INCREF(Py_None);
+      *out_values = Py_None;
+    } else {
+      RETURN_NOT_OK(write_func(arr.GetView(i), out_values));
+    }
+    ++out_values;
+  }
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc
new file mode 100644
index 0000000..6dcc959
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/benchmark.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/benchmark.h" +#include "arrow/python/helpers.h" + +namespace arrow { +namespace py { +namespace benchmark { + +void Benchmark_PandasObjectIsNull(PyObject* list) { + if (!PyList_CheckExact(list)) { + PyErr_SetString(PyExc_TypeError, "expected a list"); + return; + } + Py_ssize_t i, n = PyList_GET_SIZE(list); + for (i = 0; i < n; i++) { + internal::PandasObjectIsNull(PyList_GET_ITEM(list, i)); + } +} + +} // namespace benchmark +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/common.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/common.cc new file mode 100644 index 0000000..6fe2ed4 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/common.cc @@ -0,0 +1,203 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/common.h" + +#include +#include +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" + +#include "arrow/python/helpers.h" + +namespace arrow { + +using internal::checked_cast; + +namespace py { + +static std::mutex memory_pool_mutex; +static MemoryPool* default_python_pool = nullptr; + +void set_default_memory_pool(MemoryPool* pool) { + std::lock_guard guard(memory_pool_mutex); + default_python_pool = pool; +} + +MemoryPool* get_memory_pool() { + std::lock_guard guard(memory_pool_mutex); + if (default_python_pool) { + return default_python_pool; + } else { + return default_memory_pool(); + } +} + +// ---------------------------------------------------------------------- +// PythonErrorDetail + +namespace { + +const char kErrorDetailTypeId[] = "arrow::py::PythonErrorDetail"; + +// Try to match the Python exception type with an appropriate Status code +StatusCode MapPyError(PyObject* exc_type) { + StatusCode code; + + if (PyErr_GivenExceptionMatches(exc_type, PyExc_MemoryError)) { + code = StatusCode::OutOfMemory; + } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_IndexError)) { + code = StatusCode::IndexError; + } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_KeyError)) { + code = StatusCode::KeyError; + } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_TypeError)) { + code = StatusCode::TypeError; + } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_ValueError) || + PyErr_GivenExceptionMatches(exc_type, PyExc_OverflowError)) { + code = StatusCode::Invalid; + } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_EnvironmentError)) { + code = StatusCode::IOError; + } else if (PyErr_GivenExceptionMatches(exc_type, PyExc_NotImplementedError)) { + code = StatusCode::NotImplemented; + } else { + code = 
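+    // Unrecognized (including user-defined) exception types fall back to
+    // UnknownError; ConvertPyError below re-maps through this table only
+    // when it was invoked with StatusCode::UnknownError.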
StatusCode::UnknownError; + } + return code; +} + +// PythonErrorDetail indicates a Python exception was raised. +class PythonErrorDetail : public StatusDetail { + public: + const char* type_id() const override { return kErrorDetailTypeId; } + + std::string ToString() const override { + // This is simple enough not to need the GIL + const auto ty = reinterpret_cast(exc_type_.obj()); + // XXX Should we also print traceback? + return std::string("Python exception: ") + ty->tp_name; + } + + void RestorePyError() const { + Py_INCREF(exc_type_.obj()); + Py_INCREF(exc_value_.obj()); + Py_INCREF(exc_traceback_.obj()); + PyErr_Restore(exc_type_.obj(), exc_value_.obj(), exc_traceback_.obj()); + } + + PyObject* exc_type() const { return exc_type_.obj(); } + + PyObject* exc_value() const { return exc_value_.obj(); } + + static std::shared_ptr FromPyError() { + PyObject* exc_type = nullptr; + PyObject* exc_value = nullptr; + PyObject* exc_traceback = nullptr; + + PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); + PyErr_NormalizeException(&exc_type, &exc_value, &exc_traceback); + ARROW_CHECK(exc_type) + << "PythonErrorDetail::FromPyError called without a Python error set"; + DCHECK(PyType_Check(exc_type)); + DCHECK(exc_value); // Ensured by PyErr_NormalizeException, double-check + if (exc_traceback == nullptr) { + // Needed by PyErr_Restore() + Py_INCREF(Py_None); + exc_traceback = Py_None; + } + + std::shared_ptr detail(new PythonErrorDetail); + detail->exc_type_.reset(exc_type); + detail->exc_value_.reset(exc_value); + detail->exc_traceback_.reset(exc_traceback); + return detail; + } + + protected: + PythonErrorDetail() = default; + + OwnedRefNoGIL exc_type_, exc_value_, exc_traceback_; +}; + +} // namespace + +// ---------------------------------------------------------------------- +// Python exception <-> Status + +Status ConvertPyError(StatusCode code) { + auto detail = PythonErrorDetail::FromPyError(); + if (code == StatusCode::UnknownError) { + code = MapPyError(detail->exc_type()); + } + + std::string message; + RETURN_NOT_OK(internal::PyObject_StdStringStr(detail->exc_value(), &message)); + return Status(code, message, detail); +} + +bool IsPyError(const Status& status) { + if (status.ok()) { + return false; + } + auto detail = status.detail(); + bool result = detail != nullptr && detail->type_id() == kErrorDetailTypeId; + return result; +} + +void RestorePyError(const Status& status) { + ARROW_CHECK(IsPyError(status)); + const auto& detail = checked_cast(*status.detail()); + detail.RestorePyError(); +} + +// ---------------------------------------------------------------------- +// PyBuffer + +PyBuffer::PyBuffer() : Buffer(nullptr, 0) {} + +Status PyBuffer::Init(PyObject* obj) { + if (!PyObject_GetBuffer(obj, &py_buf_, PyBUF_ANY_CONTIGUOUS)) { + data_ = reinterpret_cast(py_buf_.buf); + ARROW_CHECK_NE(data_, nullptr) << "Null pointer in Py_buffer"; + size_ = py_buf_.len; + capacity_ = py_buf_.len; + is_mutable_ = !py_buf_.readonly; + return Status::OK(); + } else { + return ConvertPyError(StatusCode::Invalid); + } +} + +Result> PyBuffer::FromPyObject(PyObject* obj) { + PyBuffer* buf = new PyBuffer(); + std::shared_ptr res(buf); + RETURN_NOT_OK(buf->Init(obj)); + return res; +} + +PyBuffer::~PyBuffer() { + if (data_ != nullptr) { + PyAcquireGIL lock; + PyBuffer_Release(&py_buf_); + } +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc new file mode 100644 index 0000000..1df3a94 
--- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/csv.cc @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "csv.h" + +#include + +#include "arrow/python/common.h" + +namespace arrow { + +using csv::InvalidRow; +using csv::InvalidRowHandler; +using csv::InvalidRowResult; + +namespace py { +namespace csv { + +InvalidRowHandler MakeInvalidRowHandler(PyInvalidRowCallback cb, PyObject* py_handler) { + if (cb == nullptr) { + return InvalidRowHandler{}; + } + + struct Handler { + PyInvalidRowCallback cb; + std::shared_ptr handler_ref; + + InvalidRowResult operator()(const InvalidRow& invalid_row) { + InvalidRowResult result; + auto st = SafeCallIntoPython([&]() -> Status { + result = cb(handler_ref->obj(), invalid_row); + if (PyErr_Occurred()) { + PyErr_WriteUnraisable(handler_ref->obj()); + } + return Status::OK(); + }); + ARROW_UNUSED(st); + return result; + } + }; + + Py_INCREF(py_handler); + return Handler{cb, std::make_shared(py_handler)}; +} + +} // namespace csv +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h b/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h index 2295c49..34302e9 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/csv.h @@ -23,8 +23,8 @@ #include #include "arrow/csv/options.h" -#include "arrow/util/macros.h" #include "arrow/python/common.h" +#include "arrow/util/macros.h" namespace arrow { namespace py { diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc new file mode 100644 index 0000000..0e817dd --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.cc @@ -0,0 +1,663 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
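+
+// This file provides the datetime glue between Arrow and CPython: importing
+// the CPython datetime C API, recognizing fixed "[+-]HH:MM" timezone
+// offsets, exposing the MonthDayNano struct sequence, and epoch-based
+// calendar arithmetic adapted from NumPy (noted below).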
+#include "datetime.h" + +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/python/arrow_to_python_internal.h" +#include "arrow/python/common.h" +#include "arrow/python/helpers.h" +#include "arrow/python/platform.h" +#include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/util/logging.h" +#include "arrow/util/regex.h" +#include "arrow/util/value_parsing.h" + +namespace arrow { + +using internal::RegexMatch; + +namespace py { +namespace internal { + +namespace { + +bool MatchFixedOffset(const std::string& tz, std::string_view* sign, + std::string_view* hour, std::string_view* minute) { + static const std::regex regex("^([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$"); + if (tz.size() < 5) { + return false; + } + return RegexMatch(regex, tz, {sign, hour, minute}); +} + +constexpr char* NonConst(const char* st) { + // Hack for python versions < 3.7 where members of PyStruct members + // where non-const (C++ doesn't like assigning string literals to these types) + return const_cast(st); +} + +static PyTypeObject MonthDayNanoTupleType = {}; + +static PyStructSequence_Field MonthDayNanoField[] = { + {NonConst("months"), NonConst("The number of months in the interval")}, + {NonConst("days"), NonConst("The number days in the interval")}, + {NonConst("nanoseconds"), NonConst("The number of nanoseconds in the interval")}, + {nullptr, nullptr}}; + +static PyStructSequence_Desc MonthDayNanoTupleDesc = { + NonConst("MonthDayNano"), + NonConst("A calendar interval consisting of months, days and nanoseconds."), + MonthDayNanoField, + /*n_in_sequence=*/3}; + +} // namespace + +#ifndef PYPY_VERSION +PyDateTime_CAPI* datetime_api = nullptr; + +void InitDatetime() { + PyAcquireGIL lock; + datetime_api = + reinterpret_cast(PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0)); + if (datetime_api == nullptr) { + Py_FatalError("Could not import datetime C API"); + } +} +#endif + +// The following code is adapted from +// https://github.com/numpy/numpy/blob/main/numpy/core/src/multiarray/datetime.c + +// Days per month, regular year and leap year +static int64_t _days_per_month_table[2][12] = { + {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}, + {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}}; + +static bool is_leapyear(int64_t year) { + return (year & 0x3) == 0 && // year % 4 == 0 + ((year % 100) != 0 || (year % 400) == 0); +} + +// Calculates the days offset from the 1970 epoch. +static int64_t get_days_from_date(int64_t date_year, int64_t date_month, + int64_t date_day) { + int64_t i, month; + int64_t year, days = 0; + int64_t* month_lengths; + + year = date_year - 1970; + days = year * 365; + + // Adjust for leap years + if (days >= 0) { + // 1968 is the closest leap year before 1970. + // Exclude the current year, so add 1. + year += 1; + // Add one day for each 4 years + days += year / 4; + // 1900 is the closest previous year divisible by 100 + year += 68; + // Subtract one day for each 100 years + days -= year / 100; + // 1600 is the closest previous year divisible by 400 + year += 300; + // Add one day for each 400 years + days += year / 400; + } else { + // 1972 is the closest later year after 1970. + // Include the current year, so subtract 2. 
+ year -= 2; + // Subtract one day for each 4 years + days += year / 4; + // 2000 is the closest later year divisible by 100 + year -= 28; + // Add one day for each 100 years + days -= year / 100; + // 2000 is also the closest later year divisible by 400 + // Subtract one day for each 400 years + days += year / 400; + } + + month_lengths = _days_per_month_table[is_leapyear(date_year)]; + month = date_month - 1; + + // Add the months + for (i = 0; i < month; ++i) { + days += month_lengths[i]; + } + + // Add the days + days += date_day - 1; + + return days; +} + +// Modifies '*days_' to be the day offset within the year, +// and returns the year. +static int64_t days_to_yearsdays(int64_t* days_) { + const int64_t days_per_400years = (400 * 365 + 100 - 4 + 1); + // Adjust so it's relative to the year 2000 (divisible by 400) + int64_t days = (*days_) - (365 * 30 + 7); + int64_t year; + + // Break down the 400 year cycle to get the year and day within the year + if (days >= 0) { + year = 400 * (days / days_per_400years); + days = days % days_per_400years; + } else { + year = 400 * ((days - (days_per_400years - 1)) / days_per_400years); + days = days % days_per_400years; + if (days < 0) { + days += days_per_400years; + } + } + + // Work out the year/day within the 400 year cycle + if (days >= 366) { + year += 100 * ((days - 1) / (100 * 365 + 25 - 1)); + days = (days - 1) % (100 * 365 + 25 - 1); + if (days >= 365) { + year += 4 * ((days + 1) / (4 * 365 + 1)); + days = (days + 1) % (4 * 365 + 1); + if (days >= 366) { + year += (days - 1) / 365; + days = (days - 1) % 365; + } + } + } + + *days_ = days; + return year + 2000; +} + +// Extracts the month and year and day number from a number of days +static void get_date_from_days(int64_t days, int64_t* date_year, int64_t* date_month, + int64_t* date_day) { + int64_t *month_lengths, i; + + *date_year = days_to_yearsdays(&days); + month_lengths = _days_per_month_table[is_leapyear(*date_year)]; + + for (i = 0; i < 12; ++i) { + if (days < month_lengths[i]) { + *date_month = i + 1; + *date_day = days + 1; + return; + } else { + days -= month_lengths[i]; + } + } + + // Should never get here + return; +} + +// Splitting time quantities, for example splitting total seconds into +// minutes and remaining seconds. After we run +// int64_t remaining = split_time(total, quotient, &next) +// we have +// total = next * quotient + remaining. Handles negative values by propagating +// them: If total is negative, next will be negative and remaining will +// always be non-negative. 
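+// Editorial note: a worked example of the invariant above, assuming C++
+// truncating division. split_time(-65, 60, &next) computes r = -65 % 60 = -5;
+// since r < 0 it sets next = -65 / 60 - 1 = -2 and returns -5 + 60 = 55.
+// Check: next * quotient + remaining = -2 * 60 + 55 = -65, and the remainder
+// is non-negative as promised.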
+static inline int64_t split_time(int64_t total, int64_t quotient, int64_t* next) {
+  int64_t r = total % quotient;
+  if (r < 0) {
+    *next = total / quotient - 1;
+    return r + quotient;
+  } else {
+    *next = total / quotient;
+    return r;
+  }
+}
+
+static inline Status PyTime_convert_int(int64_t val, const TimeUnit::type unit,
+                                        int64_t* hour, int64_t* minute, int64_t* second,
+                                        int64_t* microsecond) {
+  switch (unit) {
+    case TimeUnit::NANO:
+      if (val % 1000 != 0) {
+        return Status::Invalid("Value ", val, " has non-zero nanoseconds");
+      }
+      val /= 1000;
+      // fall through
+    case TimeUnit::MICRO:
+      *microsecond = split_time(val, 1000000LL, &val);
+      *second = split_time(val, 60, &val);
+      *minute = split_time(val, 60, hour);
+      break;
+    case TimeUnit::MILLI:
+      *microsecond = split_time(val, 1000, &val) * 1000;
+      // fall through
+    case TimeUnit::SECOND:
+      *second = split_time(val, 60, &val);
+      *minute = split_time(val, 60, hour);
+      break;
+    default:
+      break;
+  }
+  return Status::OK();
+}
+
+static inline Status PyDate_convert_int(int64_t val, const DateUnit unit, int64_t* year,
+                                        int64_t* month, int64_t* day) {
+  switch (unit) {
+    case DateUnit::MILLI:
+      val /= 86400000LL;  // fall through
+    case DateUnit::DAY:
+      get_date_from_days(val, year, month, day);
+    default:
+      break;
+  }
+  return Status::OK();
+}
+
+PyObject* NewMonthDayNanoTupleType() {
+  if (MonthDayNanoTupleType.tp_name == nullptr) {
+    if (PyStructSequence_InitType2(&MonthDayNanoTupleType, &MonthDayNanoTupleDesc) != 0) {
+      Py_FatalError("Could not initialize MonthDayNanoTuple");
+    }
+  }
+  Py_INCREF(&MonthDayNanoTupleType);
+  return (PyObject*)&MonthDayNanoTupleType;
+}
+
+Status PyTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
+  int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
+  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
+  *out = PyTime_FromTime(static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+                         static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
+  return Status::OK();
+}
+
+Status PyDate_from_int(int64_t val, const DateUnit unit, PyObject** out) {
+  int64_t year = 0, month = 0, day = 0;
+  RETURN_NOT_OK(PyDate_convert_int(val, unit, &year, &month, &day));
+  *out = PyDate_FromDate(static_cast<int32_t>(year), static_cast<int32_t>(month),
+                         static_cast<int32_t>(day));
+  return Status::OK();
+}
+
+Status PyDateTime_from_int(int64_t val, const TimeUnit::type unit, PyObject** out) {
+  int64_t hour = 0, minute = 0, second = 0, microsecond = 0;
+  RETURN_NOT_OK(PyTime_convert_int(val, unit, &hour, &minute, &second, &microsecond));
+  int64_t total_days = 0;
+  hour = split_time(hour, 24, &total_days);
+  int64_t year = 0, month = 0, day = 0;
+  get_date_from_days(total_days, &year, &month, &day);
+  *out = PyDateTime_FromDateAndTime(
+      static_cast<int32_t>(year), static_cast<int32_t>(month), static_cast<int32_t>(day),
+      static_cast<int32_t>(hour), static_cast<int32_t>(minute),
+      static_cast<int32_t>(second), static_cast<int32_t>(microsecond));
+  return Status::OK();
+}
+
+int64_t PyDate_to_days(PyDateTime_Date* pydate) {
+  return get_days_from_date(PyDateTime_GET_YEAR(pydate), PyDateTime_GET_MONTH(pydate),
+                            PyDateTime_GET_DAY(pydate));
+}
+
+Result<int64_t> PyDateTime_utcoffset_s(PyObject* obj) {
+  // calculate offset from UTC timezone in seconds
+  // supports only PyDateTime_DateTime and PyDateTime_Time objects
+  OwnedRef pyoffset(PyObject_CallMethod(obj, "utcoffset", NULL));
+  RETURN_IF_PYERROR();
+  if (pyoffset.obj() != nullptr && pyoffset.obj() != Py_None) {
+    auto delta = reinterpret_cast<PyDateTime_Delta*>(pyoffset.obj());
+    return internal::PyDelta_to_s(delta);
+  } else {
+    return 0;
+  }
+}
+
+Result<std::string>
PyTZInfo_utcoffset_hhmm(PyObject* pytzinfo) { + // attempt to convert timezone offset objects to "+/-{hh}:{mm}" format + OwnedRef pydelta_object(PyObject_CallMethod(pytzinfo, "utcoffset", "O", Py_None)); + RETURN_IF_PYERROR(); + + if (!PyDelta_Check(pydelta_object.obj())) { + return Status::Invalid( + "Object returned by tzinfo.utcoffset(None) is not an instance of " + "datetime.timedelta"); + } + auto pydelta = reinterpret_cast(pydelta_object.obj()); + + // retrieve the offset as seconds + auto total_seconds = internal::PyDelta_to_s(pydelta); + + // determine whether the offset is positive or negative + auto sign = (total_seconds < 0) ? "-" : "+"; + total_seconds = abs(total_seconds); + + // calculate offset components + int64_t hours, minutes, seconds; + seconds = split_time(total_seconds, 60, &minutes); + minutes = split_time(minutes, 60, &hours); + if (seconds > 0) { + // check there are no remaining seconds + return Status::Invalid("Offset must represent whole number of minutes"); + } + + // construct the timezone string + std::stringstream stream; + stream << sign << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0') + << std::setw(2) << minutes; + return stream.str(); +} + +// Converted from python. See https://github.com/apache/arrow/pull/7604 +// for details. +Result StringToTzinfo(const std::string& tz) { + std::string_view sign_str, hour_str, minute_str; + OwnedRef pytz; + OwnedRef zoneinfo; + OwnedRef datetime; + + if (internal::ImportModule("pytz", &pytz).ok()) { + if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { + int sign = -1; + if (sign_str == "+") { + sign = 1; + } + OwnedRef fixed_offset; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "FixedOffset", &fixed_offset)); + uint32_t minutes, hours; + if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || + !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), + &minutes)) { + return Status::Invalid("Invalid timezone: ", tz); + } + OwnedRef total_minutes(PyLong_FromLong( + sign * ((static_cast(hours) * 60) + static_cast(minutes)))); + RETURN_IF_PYERROR(); + auto tzinfo = + PyObject_CallFunctionObjArgs(fixed_offset.obj(), total_minutes.obj(), NULL); + RETURN_IF_PYERROR(); + return tzinfo; + } + + OwnedRef timezone; + RETURN_NOT_OK(internal::ImportFromModule(pytz.obj(), "timezone", &timezone)); + OwnedRef py_tz_string( + PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); + auto tzinfo = PyObject_CallFunctionObjArgs(timezone.obj(), py_tz_string.obj(), NULL); + RETURN_IF_PYERROR(); + return tzinfo; + } + + // catch fixed offset if pytz is not present + if (MatchFixedOffset(tz, &sign_str, &hour_str, &minute_str)) { + RETURN_NOT_OK(internal::ImportModule("datetime", &datetime)); + int sign = -1; + if (sign_str == "+") { + sign = 1; + } + + // import timezone and timedelta module to create a tzinfo object + OwnedRef class_timezone; + OwnedRef class_timedelta; + RETURN_NOT_OK( + internal::ImportFromModule(datetime.obj(), "timezone", &class_timezone)); + RETURN_NOT_OK( + internal::ImportFromModule(datetime.obj(), "timedelta", &class_timedelta)); + + // check input + uint32_t minutes, hours; + if (!::arrow::internal::ParseUnsigned(hour_str.data(), hour_str.size(), &hours) || + !::arrow::internal::ParseUnsigned(minute_str.data(), minute_str.size(), + &minutes)) { + return Status::Invalid("Invalid timezone: ", tz); + } + + // save offset as a signed integer + OwnedRef total_minutes(PyLong_FromLong( + sign * ((static_cast(hours) * 60) + 
static_cast(minutes)))); + // create zero integers for empty arguments in datetime.timedelta + OwnedRef zero(PyLong_FromLong(static_cast(0))); + + // call datetime.timedelta to get correct offset object for datetime.timezone + auto offset = + PyObject_CallFunctionObjArgs(class_timedelta.obj(), zero.obj(), zero.obj(), + zero.obj(), zero.obj(), total_minutes.obj(), NULL); + RETURN_IF_PYERROR(); + // call datetime.timezone + auto tzinfo = PyObject_CallFunctionObjArgs(class_timezone.obj(), offset, NULL); + RETURN_IF_PYERROR(); + return tzinfo; + } + + // fallback on zoneinfo if tz is string and pytz is not present + if (internal::ImportModule("zoneinfo", &zoneinfo).ok()) { + OwnedRef class_zoneinfo; + RETURN_NOT_OK( + internal::ImportFromModule(zoneinfo.obj(), "ZoneInfo", &class_zoneinfo)); + OwnedRef py_tz_string( + PyUnicode_FromStringAndSize(tz.c_str(), static_cast(tz.size()))); + auto tzinfo = + PyObject_CallFunctionObjArgs(class_zoneinfo.obj(), py_tz_string.obj(), NULL); + RETURN_IF_PYERROR(); + return tzinfo; + } + + return Status::Invalid( + "Pytz package or Python>=3.8 for zoneinfo module must be installed."); +} + +Result TzinfoToString(PyObject* tzinfo) { + OwnedRef module_pytz; // import pytz + OwnedRef module_datetime; // import datetime + OwnedRef module_zoneinfo; // import zoneinfo + OwnedRef module_dateutil; // import dateutil + OwnedRef class_timezone; // from datetime import timezone + OwnedRef class_fixedoffset; // from pytz import _FixedOffset + OwnedRef class_basetzinfo; // from pytz import BaseTzInfo + OwnedRef class_zoneinfo; // from zoneinfo import ZoneInfo + OwnedRef class_tzfile; // from zoneinfo import tzfile + + // import necessary modules + RETURN_NOT_OK(internal::ImportModule("datetime", &module_datetime)); + // import necessary classes + RETURN_NOT_OK( + internal::ImportFromModule(module_datetime.obj(), "timezone", &class_timezone)); + + // check that it's a valid tzinfo object + if (!PyTZInfo_Check(tzinfo)) { + return Status::TypeError("Not an instance of datetime.tzinfo"); + } + + // if tzinfo is an instance of datetime.timezone return the + // HH:MM offset string representation + if (PyObject_IsInstance(tzinfo, class_timezone.obj())) { + // still recognize datetime.timezone.utc as UTC (instead of +00:00) + OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); + RETURN_IF_PYERROR(); + if (PyUnicode_Check(tzname_object.obj())) { + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result)); + if (result == "UTC") { + return result; + } + } + return PyTZInfo_utcoffset_hhmm(tzinfo); + } + + // Try to import pytz if it is available + if (internal::ImportModule("pytz", &module_pytz).ok()) { + RETURN_NOT_OK(internal::ImportFromModule(module_pytz.obj(), "_FixedOffset", + &class_fixedoffset)); + RETURN_NOT_OK( + internal::ImportFromModule(module_pytz.obj(), "BaseTzInfo", &class_basetzinfo)); + } + + // if tzinfo is an instance of pytz._FixedOffset return the + // HH:MM offset string representation + if (module_pytz.obj() != nullptr && + PyObject_IsInstance(tzinfo, class_fixedoffset.obj())) { + OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); + RETURN_IF_PYERROR(); + return PyTZInfo_utcoffset_hhmm(tzinfo); + } + + // if pytz is installed and tzinfo is and instance of pytz.BaseTzInfo + if (module_pytz.obj() != nullptr && + PyObject_IsInstance(tzinfo, class_basetzinfo.obj())) { + OwnedRef zone(PyObject_GetAttrString(tzinfo, "zone")); + RETURN_IF_PYERROR(); + std::string result; + 
RETURN_NOT_OK(internal::PyUnicode_AsStdString(zone.obj(), &result)); + return result; + } + + // Try to import zoneinfo if it is available + if (internal::ImportModule("zoneinfo", &module_zoneinfo).ok()) { + RETURN_NOT_OK( + internal::ImportFromModule(module_zoneinfo.obj(), "ZoneInfo", &class_zoneinfo)); + } + + // if zoneinfo is installed and tzinfo is an instance of zoneinfo.ZoneInfo + if (module_zoneinfo.obj() != nullptr && + PyObject_IsInstance(tzinfo, class_zoneinfo.obj())) { + OwnedRef key(PyObject_GetAttrString(tzinfo, "key")); + RETURN_IF_PYERROR(); + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(key.obj(), &result)); + return result; + } + + // Try to import dateutil if it is available + if (internal::ImportModule("dateutil.tz", &module_dateutil).ok()) { + RETURN_NOT_OK( + internal::ImportFromModule(module_dateutil.obj(), "tzfile", &class_tzfile)); + } + + // if dateutil is installed and tzinfo is an instance of dateutil.tz.tzfile + if (module_dateutil.obj() != nullptr && + PyObject_IsInstance(tzinfo, class_tzfile.obj())) { + OwnedRef _filename(PyObject_GetAttrString(tzinfo, "_filename")); + RETURN_IF_PYERROR(); + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(_filename.obj(), &result)); + // _filename returns a full path in general ('/usr/share/zoneinfo/Europe/Paris') + // or POSIX name on Windows ('Europe/Paris') - we need a substring in first case + std::size_t pos = result.find("zoneinfo/"); + if (pos != std::string::npos) { + return result.substr(pos + 9); + } + return result; + } + + // attempt to call tzinfo.tzname(None) + OwnedRef tzname_object(PyObject_CallMethod(tzinfo, "tzname", "O", Py_None)); + RETURN_IF_PYERROR(); + if (PyUnicode_Check(tzname_object.obj())) { + std::string result; + RETURN_NOT_OK(internal::PyUnicode_AsStdString(tzname_object.obj(), &result)); + return result; + } + + // fall back to HH:MM offset string representation based on tzinfo.utcoffset(None) + return PyTZInfo_utcoffset_hhmm(tzinfo); +} + +PyObject* MonthDayNanoIntervalToNamedTuple( + const MonthDayNanoIntervalType::MonthDayNanos& interval) { + OwnedRef tuple(PyStructSequence_New(&MonthDayNanoTupleType)); + if (ARROW_PREDICT_FALSE(tuple.obj() == nullptr)) { + return nullptr; + } + PyStructSequence_SetItem(tuple.obj(), /*pos=*/0, PyLong_FromLong(interval.months)); + PyStructSequence_SetItem(tuple.obj(), /*pos=*/1, PyLong_FromLong(interval.days)); + PyStructSequence_SetItem(tuple.obj(), /*pos=*/2, + PyLong_FromLongLong(interval.nanoseconds)); + return tuple.detach(); +} + +namespace { + +// Wrapper around a Python list object that mimics dereference and assignment +// operations. 
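+//
+// Editorial note: the struct below is effectively a minimal output iterator
+// over a preallocated list: *it yields the assigner itself, assignment
+// forwards to PyList_SetItem (which steals the reference it is given), and
+// ++it advances the target index. A hedged usage sketch, assuming `list` is
+// a PyList already sized to hold two items:
+//
+//   PyListAssigner out(list);
+//   *out = PyLong_FromLong(1);  // fills slot 0, reference stolen
+//   ++out;
+//   *out = PyLong_FromLong(2);  // fills slot 1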
+struct PyListAssigner { + public: + explicit PyListAssigner(PyObject* list) : list_(list) { DCHECK(PyList_Check(list_)); } + + PyListAssigner& operator*() { return *this; } + + void operator=(PyObject* obj) { + if (ARROW_PREDICT_FALSE(PyList_SetItem(list_, current_index_, obj) == -1)) { + Py_FatalError("list did not have the correct preallocated size."); + } + } + + PyListAssigner& operator++() { + current_index_++; + return *this; + } + + PyListAssigner& operator+=(int64_t offset) { + current_index_ += offset; + return *this; + } + + private: + PyObject* list_; + int64_t current_index_ = 0; +}; + +} // namespace + +Result MonthDayNanoIntervalArrayToPyList( + const MonthDayNanoIntervalArray& array) { + OwnedRef out_list(PyList_New(array.length())); + RETURN_IF_PYERROR(); + PyListAssigner out_objects(out_list.obj()); + auto& interval_array = + arrow::internal::checked_cast(array); + RETURN_NOT_OK(internal::WriteArrayObjects( + interval_array, + [&](const MonthDayNanoIntervalType::MonthDayNanos& interval, PyListAssigner& out) { + PyObject* tuple = internal::MonthDayNanoIntervalToNamedTuple(interval); + if (ARROW_PREDICT_FALSE(tuple == nullptr)) { + RETURN_IF_PYERROR(); + } + + *out = tuple; + return Status::OK(); + }, + out_objects)); + return out_list.detach(); +} + +Result MonthDayNanoIntervalScalarToPyObject( + const MonthDayNanoIntervalScalar& scalar) { + if (scalar.is_valid) { + return internal::MonthDayNanoIntervalToNamedTuple(scalar.value); + } else { + Py_INCREF(Py_None); + return Py_None; + } +} + +} // namespace internal +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h b/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h index a5cca55..327a61f 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/datetime.h @@ -20,14 +20,14 @@ #include #include +#include "arrow/python/platform.h" +#include "arrow/python/visibility.h" #include "arrow/result.h" #include "arrow/status.h" #include "arrow/type.h" #include "arrow/type_fwd.h" #include "arrow/util/int_util_overflow.h" #include "arrow/util/logging.h" -#include "arrow/python/platform.h" -#include "arrow/python/visibility.h" // By default, PyDateTimeAPI is a *static* variable. This forces // PyDateTime_IMPORT to be called in every C/C++ module using the diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc new file mode 100644 index 0000000..0c00fcf --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/decimal.cc @@ -0,0 +1,246 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
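+// Editorial note (not part of the vendored sources): the inference performed
+// by InferDecimalPrecisionAndScale() below follows decimal.Decimal.as_tuple().
+// Worked example for the leading-zeros branch: Decimal("0.01234").as_tuple()
+// gives digits = (1, 2, 3, 4) and exponent = -5, so precision =
+// max(4, 5) = 5 and scale = 5, i.e. a decimal128(5, 5) type.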
+
+#include <algorithm>
+#include <limits>
+
+#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
+#include "arrow/python/helpers.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/decimal.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+namespace py {
+namespace internal {
+
+Status ImportDecimalType(OwnedRef* decimal_type) {
+  OwnedRef decimal_module;
+  RETURN_NOT_OK(ImportModule("decimal", &decimal_module));
+  RETURN_NOT_OK(ImportFromModule(decimal_module.obj(), "Decimal", decimal_type));
+  return Status::OK();
+}
+
+Status PythonDecimalToString(PyObject* python_decimal, std::string* out) {
+  // Call Python's str(decimal_object)
+  return PyObject_StdStringStr(python_decimal, out);
+}
+
+// \brief Infer the precision and scale of a Python decimal.Decimal instance
+// \param python_decimal[in] An instance of decimal.Decimal
+// \param precision[out] The value of the inferred precision
+// \param scale[out] The value of the inferred scale
+// \return The status of the operation
+static Status InferDecimalPrecisionAndScale(PyObject* python_decimal, int32_t* precision,
+                                            int32_t* scale) {
+  DCHECK_NE(python_decimal, NULLPTR);
+  DCHECK_NE(precision, NULLPTR);
+  DCHECK_NE(scale, NULLPTR);
+
+  // TODO(phillipc): Make sure we perform PyDecimal_Check(python_decimal) as a DCHECK
+  OwnedRef as_tuple(PyObject_CallMethod(python_decimal, const_cast<char*>("as_tuple"),
+                                        const_cast<char*>("")));
+  RETURN_IF_PYERROR();
+  DCHECK(PyTuple_Check(as_tuple.obj()));
+
+  OwnedRef digits(PyObject_GetAttrString(as_tuple.obj(), "digits"));
+  RETURN_IF_PYERROR();
+  DCHECK(PyTuple_Check(digits.obj()));
+
+  const auto num_digits = static_cast<int32_t>(PyTuple_Size(digits.obj()));
+  RETURN_IF_PYERROR();
+
+  OwnedRef py_exponent(PyObject_GetAttrString(as_tuple.obj(), "exponent"));
+  RETURN_IF_PYERROR();
+  DCHECK(IsPyInteger(py_exponent.obj()));
+
+  const auto exponent = static_cast<int32_t>(PyLong_AsLong(py_exponent.obj()));
+  RETURN_IF_PYERROR();
+
+  if (exponent < 0) {
+    // If abs(exponent) > num_digits, we have a number with leading zeros
+    // such as 0.01234. Ensure we have enough precision for leading zeros
+    // (which are not included in num_digits).
+    *precision = std::max(num_digits, -exponent);
+    *scale = -exponent;
+  } else {
+    // Trailing zeros are not included in num_digits, need to add to precision.
+    // Note we don't generate negative scales as they are poorly supported
+    // in non-Arrow systems.
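+    // Editorial worked example: Decimal("1.23E+4") is 12300; as_tuple() gives
+    // digits = (1, 2, 3) and exponent = 2, so precision = 3 + 2 = 5 and
+    // scale = 0.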
+ *precision = num_digits + exponent; + *scale = 0; + } + return Status::OK(); +} + +PyObject* DecimalFromString(PyObject* decimal_constructor, + const std::string& decimal_string) { + DCHECK_NE(decimal_constructor, nullptr); + + auto string_size = decimal_string.size(); + DCHECK_GT(string_size, 0); + + auto string_bytes = decimal_string.c_str(); + DCHECK_NE(string_bytes, nullptr); + + return PyObject_CallFunction(decimal_constructor, const_cast("s#"), string_bytes, + static_cast(string_size)); +} + +namespace { + +template +Status DecimalFromStdString(const std::string& decimal_string, + const DecimalType& arrow_type, ArrowDecimal* out) { + int32_t inferred_precision; + int32_t inferred_scale; + + RETURN_NOT_OK(ArrowDecimal::FromString(decimal_string, out, &inferred_precision, + &inferred_scale)); + + const int32_t precision = arrow_type.precision(); + const int32_t scale = arrow_type.scale(); + + if (scale != inferred_scale) { + DCHECK_NE(out, NULLPTR); + ARROW_ASSIGN_OR_RAISE(*out, out->Rescale(inferred_scale, scale)); + } + + auto inferred_scale_delta = inferred_scale - scale; + if (ARROW_PREDICT_FALSE((inferred_precision - inferred_scale_delta) > precision)) { + return Status::Invalid( + "Decimal type with precision ", inferred_precision, + " does not fit into precision inferred from first array element: ", precision); + } + + return Status::OK(); +} + +template +Status InternalDecimalFromPythonDecimal(PyObject* python_decimal, + const DecimalType& arrow_type, + ArrowDecimal* out) { + DCHECK_NE(python_decimal, NULLPTR); + DCHECK_NE(out, NULLPTR); + + std::string string; + RETURN_NOT_OK(PythonDecimalToString(python_decimal, &string)); + return DecimalFromStdString(string, arrow_type, out); +} + +template +Status InternalDecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, + ArrowDecimal* out) { + DCHECK_NE(obj, NULLPTR); + DCHECK_NE(out, NULLPTR); + + if (IsPyInteger(obj)) { + // TODO: add a fast path for small-ish ints + std::string string; + RETURN_NOT_OK(PyObject_StdStringStr(obj, &string)); + return DecimalFromStdString(string, arrow_type, out); + } else if (PyDecimal_Check(obj)) { + return InternalDecimalFromPythonDecimal(obj, arrow_type, out); + } else { + return Status::TypeError("int or Decimal object expected, got ", + Py_TYPE(obj)->tp_name); + } +} + +} // namespace + +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal128* out) { + return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); +} + +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, + Decimal128* out) { + return InternalDecimalFromPyObject(obj, arrow_type, out); +} + +Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, + Decimal256* out) { + return InternalDecimalFromPythonDecimal(python_decimal, arrow_type, out); +} + +Status DecimalFromPyObject(PyObject* obj, const DecimalType& arrow_type, + Decimal256* out) { + return InternalDecimalFromPyObject(obj, arrow_type, out); +} + +bool PyDecimal_Check(PyObject* obj) { + static OwnedRef decimal_type; + if (!decimal_type.obj()) { + ARROW_CHECK_OK(ImportDecimalType(&decimal_type)); + DCHECK(PyType_Check(decimal_type.obj())); + } + // PyObject_IsInstance() is slower as it has to check for virtual subclasses + const int result = + PyType_IsSubtype(Py_TYPE(obj), reinterpret_cast(decimal_type.obj())); + ARROW_CHECK_NE(result, -1) << " error during PyType_IsSubtype check"; + return result == 1; +} + +bool PyDecimal_ISNAN(PyObject* obj) { + 
DCHECK(PyDecimal_Check(obj)) << "obj is not an instance of decimal.Decimal"; + OwnedRef is_nan( + PyObject_CallMethod(obj, const_cast("is_nan"), const_cast(""))); + return PyObject_IsTrue(is_nan.obj()) == 1; +} + +DecimalMetadata::DecimalMetadata() + : DecimalMetadata(std::numeric_limits::min(), + std::numeric_limits::min()) {} + +DecimalMetadata::DecimalMetadata(int32_t precision, int32_t scale) + : precision_(precision), scale_(scale) {} + +Status DecimalMetadata::Update(int32_t suggested_precision, int32_t suggested_scale) { + const int32_t current_scale = scale_; + scale_ = std::max(current_scale, suggested_scale); + + const int32_t current_precision = precision_; + + if (current_precision == std::numeric_limits::min()) { + precision_ = suggested_precision; + } else { + auto num_digits = std::max(current_precision - current_scale, + suggested_precision - suggested_scale); + precision_ = std::max(num_digits + scale_, current_precision); + } + + return Status::OK(); +} + +Status DecimalMetadata::Update(PyObject* object) { + bool is_decimal = PyDecimal_Check(object); + + if (ARROW_PREDICT_FALSE(!is_decimal || PyDecimal_ISNAN(object))) { + return Status::OK(); + } + + int32_t precision = 0; + int32_t scale = 0; + RETURN_NOT_OK(InferDecimalPrecisionAndScale(object, &precision, &scale)); + return Update(precision, scale); +} + +} // namespace internal +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc new file mode 100644 index 0000000..961a168 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.cc @@ -0,0 +1,495 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
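+// Editorial note (not part of the vendored sources) on DecimalMetadata::Update()
+// in decimal.cc above: starting from the sentinel state, Update(3, 2) yields
+// (precision = 3, scale = 2); a later Update(5, 1) widens to
+// scale = max(2, 1) = 2 and precision = max(max(3 - 2, 5 - 1) + 2, 3) = 6,
+// so (6, 2) can hold both observed shapes exactly.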
+ +#include "arrow/python/deserialize.h" + +#include "arrow/python/numpy_interop.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include "arrow/array.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/options.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/util.h" +#include "arrow/ipc/writer.h" +#include "arrow/table.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" +#include "arrow/util/value_parsing.h" + +#include "arrow/python/common.h" +#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/pyarrow.h" +#include "arrow/python/serialize.h" + +namespace arrow { + +using internal::checked_cast; +using internal::ParseValue; + +namespace py { + +Status CallDeserializeCallback(PyObject* context, PyObject* value, + PyObject** deserialized_object); + +Status DeserializeTuple(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out); + +Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out); + +Status DeserializeSet(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out); + +Status DeserializeDict(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out) { + const auto& data = checked_cast(array); + OwnedRef keys, vals; + OwnedRef result(PyDict_New()); + RETURN_IF_PYERROR(); + + DCHECK_EQ(2, data.num_fields()); + + RETURN_NOT_OK(DeserializeList(context, *data.field(0), start_idx, stop_idx, base, blobs, + keys.ref())); + RETURN_NOT_OK(DeserializeList(context, *data.field(1), start_idx, stop_idx, base, blobs, + vals.ref())); + for (int64_t i = start_idx; i < stop_idx; ++i) { + // PyDict_SetItem behaves differently from PyList_SetItem and PyTuple_SetItem. + // The latter two steal references whereas PyDict_SetItem does not. So we need + // to make sure the reference count is decremented by letting the OwnedRef + // go out of scope at the end. 
+ int ret = PyDict_SetItem(result.obj(), PyList_GET_ITEM(keys.obj(), i - start_idx), + PyList_GET_ITEM(vals.obj(), i - start_idx)); + if (ret != 0) { + return ConvertPyError(); + } + } + static PyObject* py_type = PyUnicode_FromString("_pytype_"); + if (PyDict_Contains(result.obj(), py_type)) { + RETURN_NOT_OK(CallDeserializeCallback(context, result.obj(), out)); + } else { + *out = result.detach(); + } + return Status::OK(); +} + +Status DeserializeArray(int32_t index, PyObject* base, const SerializedPyObject& blobs, + PyObject** out) { + RETURN_NOT_OK(py::TensorToNdarray(blobs.ndarrays[index], base, out)); + // Mark the array as immutable + OwnedRef flags(PyObject_GetAttrString(*out, "flags")); + if (flags.obj() == NULL) { + return ConvertPyError(); + } + if (PyObject_SetAttrString(flags.obj(), "writeable", Py_False) < 0) { + return ConvertPyError(); + } + return Status::OK(); +} + +Status GetValue(PyObject* context, const Array& arr, int64_t index, int8_t type, + PyObject* base, const SerializedPyObject& blobs, PyObject** result) { + switch (type) { + case PythonType::NONE: + Py_INCREF(Py_None); + *result = Py_None; + return Status::OK(); + case PythonType::BOOL: + *result = PyBool_FromLong(checked_cast(arr).Value(index)); + return Status::OK(); + case PythonType::PY2INT: + case PythonType::INT: { + *result = PyLong_FromSsize_t(checked_cast(arr).Value(index)); + return Status::OK(); + } + case PythonType::BYTES: { + auto view = checked_cast(arr).GetView(index); + *result = PyBytes_FromStringAndSize(view.data(), view.length()); + return CheckPyError(); + } + case PythonType::STRING: { + auto view = checked_cast(arr).GetView(index); + *result = PyUnicode_FromStringAndSize(view.data(), view.length()); + return CheckPyError(); + } + case PythonType::HALF_FLOAT: { + *result = PyHalf_FromHalf(checked_cast(arr).Value(index)); + RETURN_IF_PYERROR(); + return Status::OK(); + } + case PythonType::FLOAT: + *result = PyFloat_FromDouble(checked_cast(arr).Value(index)); + return Status::OK(); + case PythonType::DOUBLE: + *result = PyFloat_FromDouble(checked_cast(arr).Value(index)); + return Status::OK(); + case PythonType::DATE64: { + RETURN_NOT_OK(internal::PyDateTime_from_int( + checked_cast(arr).Value(index), TimeUnit::MICRO, result)); + RETURN_IF_PYERROR(); + return Status::OK(); + } + case PythonType::LIST: { + const auto& l = checked_cast(arr); + return DeserializeList(context, *l.values(), l.value_offset(index), + l.value_offset(index + 1), base, blobs, result); + } + case PythonType::DICT: { + const auto& l = checked_cast(arr); + return DeserializeDict(context, *l.values(), l.value_offset(index), + l.value_offset(index + 1), base, blobs, result); + } + case PythonType::TUPLE: { + const auto& l = checked_cast(arr); + return DeserializeTuple(context, *l.values(), l.value_offset(index), + l.value_offset(index + 1), base, blobs, result); + } + case PythonType::SET: { + const auto& l = checked_cast(arr); + return DeserializeSet(context, *l.values(), l.value_offset(index), + l.value_offset(index + 1), base, blobs, result); + } + case PythonType::TENSOR: { + int32_t ref = checked_cast(arr).Value(index); + *result = wrap_tensor(blobs.tensors[ref]); + return Status::OK(); + } + case PythonType::SPARSECOOTENSOR: { + int32_t ref = checked_cast(arr).Value(index); + const std::shared_ptr& sparse_coo_tensor = + arrow::internal::checked_pointer_cast( + blobs.sparse_tensors[ref]); + *result = wrap_sparse_coo_tensor(sparse_coo_tensor); + return Status::OK(); + } + case PythonType::SPARSECSRMATRIX: { + int32_t 
ref = checked_cast(arr).Value(index); + const std::shared_ptr& sparse_csr_matrix = + arrow::internal::checked_pointer_cast( + blobs.sparse_tensors[ref]); + *result = wrap_sparse_csr_matrix(sparse_csr_matrix); + return Status::OK(); + } + case PythonType::SPARSECSCMATRIX: { + int32_t ref = checked_cast(arr).Value(index); + const std::shared_ptr& sparse_csc_matrix = + arrow::internal::checked_pointer_cast( + blobs.sparse_tensors[ref]); + *result = wrap_sparse_csc_matrix(sparse_csc_matrix); + return Status::OK(); + } + case PythonType::SPARSECSFTENSOR: { + int32_t ref = checked_cast(arr).Value(index); + const std::shared_ptr& sparse_csf_tensor = + arrow::internal::checked_pointer_cast( + blobs.sparse_tensors[ref]); + *result = wrap_sparse_csf_tensor(sparse_csf_tensor); + return Status::OK(); + } + case PythonType::NDARRAY: { + int32_t ref = checked_cast(arr).Value(index); + return DeserializeArray(ref, base, blobs, result); + } + case PythonType::BUFFER: { + int32_t ref = checked_cast(arr).Value(index); + *result = wrap_buffer(blobs.buffers[ref]); + return Status::OK(); + } + default: { + ARROW_CHECK(false) << "union tag " << type << "' not recognized"; + } + } + return Status::OK(); +} + +Status GetPythonTypes(const UnionArray& data, std::vector* result) { + ARROW_CHECK(result != nullptr); + auto type = data.type(); + for (int i = 0; i < type->num_fields(); ++i) { + int8_t tag = 0; + const std::string& data = type->field(i)->name(); + if (!ParseValue(data.c_str(), data.size(), &tag)) { + return Status::SerializationError("Cannot convert string: \"", + type->field(i)->name(), "\" to int8_t"); + } + result->push_back(tag); + } + return Status::OK(); +} + +template +Status DeserializeSequence(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, + const SerializedPyObject& blobs, + CreateSequenceFn&& create_sequence, SetItemFn&& set_item, + PyObject** out) { + const auto& data = checked_cast(array); + OwnedRef result(create_sequence(stop_idx - start_idx)); + RETURN_IF_PYERROR(); + const int8_t* type_codes = data.raw_type_codes(); + const int32_t* value_offsets = data.raw_value_offsets(); + std::vector python_types; + RETURN_NOT_OK(GetPythonTypes(data, &python_types)); + for (int64_t i = start_idx; i < stop_idx; ++i) { + const int64_t offset = value_offsets[i]; + const uint8_t type = type_codes[i]; + PyObject* value; + RETURN_NOT_OK(GetValue(context, *data.field(type), offset, python_types[type], base, + blobs, &value)); + RETURN_NOT_OK(set_item(result.obj(), i - start_idx, value)); + } + *out = result.detach(); + return Status::OK(); +} + +Status DeserializeList(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out) { + return DeserializeSequence( + context, array, start_idx, stop_idx, base, blobs, + [](int64_t size) { return PyList_New(size); }, + [](PyObject* seq, int64_t index, PyObject* item) { + PyList_SET_ITEM(seq, index, item); + return Status::OK(); + }, + out); +} + +Status DeserializeTuple(PyObject* context, const Array& array, int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out) { + return DeserializeSequence( + context, array, start_idx, stop_idx, base, blobs, + [](int64_t size) { return PyTuple_New(size); }, + [](PyObject* seq, int64_t index, PyObject* item) { + PyTuple_SET_ITEM(seq, index, item); + return Status::OK(); + }, + out); +} + +Status DeserializeSet(PyObject* context, const Array& array, 
int64_t start_idx, + int64_t stop_idx, PyObject* base, const SerializedPyObject& blobs, + PyObject** out) { + return DeserializeSequence( + context, array, start_idx, stop_idx, base, blobs, + [](int64_t size) { return PySet_New(nullptr); }, + [](PyObject* seq, int64_t index, PyObject* item) { + int err = PySet_Add(seq, item); + Py_DECREF(item); + if (err < 0) { + RETURN_IF_PYERROR(); + } + return Status::OK(); + }, + out); +} + +Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out) { + int32_t num_tensors; + int32_t num_sparse_tensors; + int32_t num_ndarrays; + int32_t num_buffers; + + // Read number of tensors + RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast(&num_tensors))); + RETURN_NOT_OK( + src->Read(sizeof(int32_t), reinterpret_cast(&num_sparse_tensors))); + RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast(&num_ndarrays))); + RETURN_NOT_OK(src->Read(sizeof(int32_t), reinterpret_cast(&num_buffers))); + + // Align stream to 8-byte offset + RETURN_NOT_OK(ipc::AlignStream(src, ipc::kArrowIpcAlignment)); + std::shared_ptr reader; + ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchStreamReader::Open(src)); + RETURN_NOT_OK(reader->ReadNext(&out->batch)); + + /// Skip EOS marker + RETURN_NOT_OK(src->Advance(4)); + + /// Align stream so tensor bodies are 64-byte aligned + RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); + + for (int i = 0; i < num_tensors; ++i) { + std::shared_ptr tensor; + ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(src)); + RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); + out->tensors.push_back(tensor); + } + + for (int i = 0; i < num_sparse_tensors; ++i) { + std::shared_ptr sparse_tensor; + ARROW_ASSIGN_OR_RAISE(sparse_tensor, ipc::ReadSparseTensor(src)); + RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); + out->sparse_tensors.push_back(sparse_tensor); + } + + for (int i = 0; i < num_ndarrays; ++i) { + std::shared_ptr ndarray; + ARROW_ASSIGN_OR_RAISE(ndarray, ipc::ReadTensor(src)); + RETURN_NOT_OK(ipc::AlignStream(src, ipc::kTensorAlignment)); + out->ndarrays.push_back(ndarray); + } + + ARROW_ASSIGN_OR_RAISE(int64_t offset, src->Tell()); + for (int i = 0; i < num_buffers; ++i) { + int64_t size; + RETURN_NOT_OK(src->ReadAt(offset, sizeof(int64_t), &size)); + offset += sizeof(int64_t); + ARROW_ASSIGN_OR_RAISE(auto buffer, src->ReadAt(offset, size)); + out->buffers.push_back(buffer); + offset += size; + } + + return Status::OK(); +} + +Status DeserializeObject(PyObject* context, const SerializedPyObject& obj, PyObject* base, + PyObject** out) { + PyAcquireGIL lock; + return DeserializeList(context, *obj.batch->column(0), 0, obj.batch->num_rows(), base, + obj, out); +} + +Status GetSerializedFromComponents(int num_tensors, + const SparseTensorCounts& num_sparse_tensors, + int num_ndarrays, int num_buffers, PyObject* data, + SerializedPyObject* out) { + PyAcquireGIL gil; + const Py_ssize_t data_length = PyList_Size(data); + RETURN_IF_PYERROR(); + + const Py_ssize_t expected_data_length = 1 + num_tensors * 2 + + num_sparse_tensors.num_total_buffers() + + num_ndarrays * 2 + num_buffers; + if (data_length != expected_data_length) { + return Status::Invalid("Invalid number of buffers in data"); + } + + auto GetBuffer = [&data](Py_ssize_t index, std::shared_ptr* out) { + ARROW_CHECK_LE(index, PyList_Size(data)); + PyObject* py_buf = PyList_GET_ITEM(data, index); + return unwrap_buffer(py_buf).Value(out); + }; + + Py_ssize_t buffer_index = 0; + + // Read the union batch describing object structure + { + 
std::shared_ptr data_buffer; + RETURN_NOT_OK(GetBuffer(buffer_index++, &data_buffer)); + gil.release(); + io::BufferReader buf_reader(data_buffer); + std::shared_ptr reader; + ARROW_ASSIGN_OR_RAISE(reader, ipc::RecordBatchStreamReader::Open(&buf_reader)); + RETURN_NOT_OK(reader->ReadNext(&out->batch)); + gil.acquire(); + } + + // Zero-copy reconstruct tensors + for (int i = 0; i < num_tensors; ++i) { + std::shared_ptr metadata; + std::shared_ptr body; + std::shared_ptr tensor; + RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata)); + RETURN_NOT_OK(GetBuffer(buffer_index++, &body)); + + ipc::Message message(metadata, body); + + ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(message)); + out->tensors.emplace_back(std::move(tensor)); + } + + // Zero-copy reconstruct sparse tensors + for (int i = 0, n = num_sparse_tensors.num_total_tensors(); i < n; ++i) { + ipc::IpcPayload payload; + RETURN_NOT_OK(GetBuffer(buffer_index++, &payload.metadata)); + + ARROW_ASSIGN_OR_RAISE( + size_t num_bodies, + ipc::internal::ReadSparseTensorBodyBufferCount(*payload.metadata)); + + payload.body_buffers.reserve(num_bodies); + for (size_t i = 0; i < num_bodies; ++i) { + std::shared_ptr body; + RETURN_NOT_OK(GetBuffer(buffer_index++, &body)); + payload.body_buffers.emplace_back(body); + } + + std::shared_ptr sparse_tensor; + ARROW_ASSIGN_OR_RAISE(sparse_tensor, ipc::internal::ReadSparseTensorPayload(payload)); + out->sparse_tensors.emplace_back(std::move(sparse_tensor)); + } + + // Zero-copy reconstruct tensors for numpy ndarrays + for (int i = 0; i < num_ndarrays; ++i) { + std::shared_ptr metadata; + std::shared_ptr body; + std::shared_ptr tensor; + RETURN_NOT_OK(GetBuffer(buffer_index++, &metadata)); + RETURN_NOT_OK(GetBuffer(buffer_index++, &body)); + + ipc::Message message(metadata, body); + + ARROW_ASSIGN_OR_RAISE(tensor, ipc::ReadTensor(message)); + out->ndarrays.emplace_back(std::move(tensor)); + } + + // Unwrap and append buffers + for (int i = 0; i < num_buffers; ++i) { + std::shared_ptr buffer; + RETURN_NOT_OK(GetBuffer(buffer_index++, &buffer)); + out->buffers.emplace_back(std::move(buffer)); + } + + return Status::OK(); +} + +Status DeserializeNdarray(const SerializedPyObject& object, + std::shared_ptr* out) { + if (object.ndarrays.size() != 1) { + return Status::Invalid("Object is not an Ndarray"); + } + *out = object.ndarrays[0]; + return Status::OK(); +} + +Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out) { + io::BufferReader reader(src); + SerializedPyObject object; + RETURN_NOT_OK(ReadSerializedObject(&reader, &object)); + return DeserializeNdarray(object, out); +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h b/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h index ed82942..41b6a13 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/deserialize.h @@ -21,9 +21,9 @@ #include #include -#include "arrow/status.h" #include "arrow/python/serialize.h" #include "arrow/python/visibility.h" +#include "arrow/status.h" namespace arrow { diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc new file mode 100644 index 0000000..3ccc171 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.cc @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/python/extension_type.h" +#include "arrow/python/helpers.h" +#include "arrow/python/pyarrow.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::checked_cast; + +namespace py { + +namespace { + +// Serialize a Python ExtensionType instance +Status SerializeExtInstance(PyObject* type_instance, std::string* out) { + OwnedRef res( + cpp_PyObject_CallMethod(type_instance, "__arrow_ext_serialize__", nullptr)); + if (!res) { + return ConvertPyError(); + } + if (!PyBytes_Check(res.obj())) { + return Status::TypeError( + "__arrow_ext_serialize__ should return bytes object, " + "got ", + internal::PyObject_StdStringRepr(res.obj())); + } + *out = internal::PyBytes_AsStdString(res.obj()); + return Status::OK(); +} + +// Deserialize a Python ExtensionType instance +PyObject* DeserializeExtInstance(PyObject* type_class, + std::shared_ptr storage_type, + const std::string& serialized_data) { + OwnedRef storage_ref(wrap_data_type(storage_type)); + if (!storage_ref) { + return nullptr; + } + OwnedRef data_ref(PyBytes_FromStringAndSize( + serialized_data.data(), static_cast(serialized_data.size()))); + if (!data_ref) { + return nullptr; + } + + return cpp_PyObject_CallMethod(type_class, "__arrow_ext_deserialize__", "OO", + storage_ref.obj(), data_ref.obj()); +} + +} // namespace + +static const char* kExtensionName = "arrow.py_extension_type"; + +std::string PyExtensionType::ToString() const { + PyAcquireGIL lock; + + std::stringstream ss; + OwnedRef instance(GetInstance()); + ss << "extension<" << this->extension_name() << "<" << Py_TYPE(instance.obj())->tp_name + << ">>"; + return ss.str(); +} + +PyExtensionType::PyExtensionType(std::shared_ptr storage_type, PyObject* typ, + PyObject* inst) + : ExtensionType(storage_type), + extension_name_(kExtensionName), + type_class_(typ), + type_instance_(inst) {} + +PyExtensionType::PyExtensionType(std::shared_ptr storage_type, + std::string extension_name, PyObject* typ, + PyObject* inst) + : ExtensionType(storage_type), + extension_name_(std::move(extension_name)), + type_class_(typ), + type_instance_(inst) {} + +bool PyExtensionType::ExtensionEquals(const ExtensionType& other) const { + PyAcquireGIL lock; + + if (other.extension_name() != extension_name()) { + return false; + } + const auto& other_ext = checked_cast(other); + int res = -1; + if (!type_instance_) { + if (other_ext.type_instance_) { + return false; + } + // Compare Python types + res = PyObject_RichCompareBool(type_class_.obj(), other_ext.type_class_.obj(), Py_EQ); + } else { + if (!other_ext.type_instance_) { + return false; + } + // Compare Python instances + OwnedRef left(GetInstance()); + OwnedRef right(other_ext.GetInstance()); + if (!left || !right) { + goto error; + } + res = 
PyObject_RichCompareBool(left.obj(), right.obj(), Py_EQ); + } + if (res == -1) { + goto error; + } + return res == 1; + +error: + // Cannot propagate error + PyErr_WriteUnraisable(nullptr); + return false; +} + +std::shared_ptr PyExtensionType::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + return std::make_shared(data); +} + +std::string PyExtensionType::Serialize() const { + DCHECK(type_instance_); + return serialized_; +} + +Result> PyExtensionType::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + PyAcquireGIL lock; + + if (import_pyarrow()) { + return ConvertPyError(); + } + OwnedRef res(DeserializeExtInstance(type_class_.obj(), storage_type, serialized_data)); + if (!res) { + return ConvertPyError(); + } + return unwrap_data_type(res.obj()); +} + +PyObject* PyExtensionType::GetInstance() const { + if (!type_instance_) { + PyErr_SetString(PyExc_TypeError, "Not an instance"); + return nullptr; + } + DCHECK(PyWeakref_CheckRef(type_instance_.obj())); + PyObject* inst = PyWeakref_GET_OBJECT(type_instance_.obj()); + if (inst != Py_None) { + // Cached instance still alive + Py_INCREF(inst); + return inst; + } else { + // Must reconstruct from serialized form + // XXX cache again? + return DeserializeExtInstance(type_class_.obj(), storage_type_, serialized_); + } +} + +Status PyExtensionType::SetInstance(PyObject* inst) const { + // Check we have the right type + PyObject* typ = reinterpret_cast(Py_TYPE(inst)); + if (typ != type_class_.obj()) { + return Status::TypeError("Unexpected Python ExtensionType class ", + internal::PyObject_StdStringRepr(typ), " expected ", + internal::PyObject_StdStringRepr(type_class_.obj())); + } + + PyObject* wr = PyWeakref_NewRef(inst, nullptr); + if (wr == NULL) { + return ConvertPyError(); + } + type_instance_.reset(wr); + return SerializeExtInstance(inst, &serialized_); +} + +Status PyExtensionType::FromClass(const std::shared_ptr storage_type, + const std::string extension_name, PyObject* typ, + std::shared_ptr* out) { + Py_INCREF(typ); + out->reset(new PyExtensionType(storage_type, std::move(extension_name), typ)); + return Status::OK(); +} + +Status RegisterPyExtensionType(const std::shared_ptr& type) { + DCHECK_EQ(type->id(), Type::EXTENSION); + auto ext_type = std::dynamic_pointer_cast(type); + return RegisterExtensionType(ext_type); +} + +Status UnregisterPyExtensionType(const std::string& type_name) { + return UnregisterExtensionType(type_name); +} + +std::string PyExtensionName() { return kExtensionName; } + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h b/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h index 7fc86b9..e433d9a 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/extension_type.h @@ -21,9 +21,9 @@ #include #include "arrow/extension_type.h" -#include "arrow/util/macros.h" #include "arrow/python/common.h" #include "arrow/python/visibility.h" +#include "arrow/util/macros.h" namespace arrow { namespace py { diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc new file mode 100644 index 0000000..5e9b500 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.cc @@ -0,0 +1,206 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/filesystem.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using fs::FileInfo; +using fs::FileSelector; + +namespace py { +namespace fs { + +PyFileSystem::PyFileSystem(PyObject* handler, PyFileSystemVtable vtable) + : handler_(handler), vtable_(std::move(vtable)) { + Py_INCREF(handler); +} + +PyFileSystem::~PyFileSystem() {} + +std::shared_ptr PyFileSystem::Make(PyObject* handler, + PyFileSystemVtable vtable) { + return std::make_shared(handler, std::move(vtable)); +} + +std::string PyFileSystem::type_name() const { + std::string result; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.get_type_name(handler_.obj(), &result); + if (PyErr_Occurred()) { + PyErr_WriteUnraisable(handler_.obj()); + } + return Status::OK(); + }); + ARROW_UNUSED(st); + return result; +} + +bool PyFileSystem::Equals(const FileSystem& other) const { + bool result; + auto st = SafeCallIntoPython([&]() -> Status { + result = vtable_.equals(handler_.obj(), other); + if (PyErr_Occurred()) { + PyErr_WriteUnraisable(handler_.obj()); + } + return Status::OK(); + }); + ARROW_UNUSED(st); + return result; +} + +Result PyFileSystem::GetFileInfo(const std::string& path) { + FileInfo info; + + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.get_file_info(handler_.obj(), path, &info); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return info; +} + +Result> PyFileSystem::GetFileInfo( + const std::vector& paths) { + std::vector infos; + + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.get_file_info_vector(handler_.obj(), paths, &infos); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return infos; +} + +Result> PyFileSystem::GetFileInfo(const FileSelector& select) { + std::vector infos; + + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.get_file_info_selector(handler_.obj(), select, &infos); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return infos; +} + +Status PyFileSystem::CreateDir(const std::string& path, bool recursive) { + return SafeCallIntoPython([&]() -> Status { + vtable_.create_dir(handler_.obj(), path, recursive); + return CheckPyError(); + }); +} + +Status PyFileSystem::DeleteDir(const std::string& path) { + return SafeCallIntoPython([&]() -> Status { + vtable_.delete_dir(handler_.obj(), path); + return CheckPyError(); + }); +} + +Status PyFileSystem::DeleteDirContents(const std::string& path, bool missing_dir_ok) { + return SafeCallIntoPython([&]() -> Status { + vtable_.delete_dir_contents(handler_.obj(), path, missing_dir_ok); + return CheckPyError(); + }); +} + +Status PyFileSystem::DeleteRootDirContents() { + return SafeCallIntoPython([&]() -> Status { + vtable_.delete_root_dir_contents(handler_.obj()); + return CheckPyError(); + }); +} + +Status PyFileSystem::DeleteFile(const std::string& path) { + return 
SafeCallIntoPython([&]() -> Status { + vtable_.delete_file(handler_.obj(), path); + return CheckPyError(); + }); +} + +Status PyFileSystem::Move(const std::string& src, const std::string& dest) { + return SafeCallIntoPython([&]() -> Status { + vtable_.move(handler_.obj(), src, dest); + return CheckPyError(); + }); +} + +Status PyFileSystem::CopyFile(const std::string& src, const std::string& dest) { + return SafeCallIntoPython([&]() -> Status { + vtable_.copy_file(handler_.obj(), src, dest); + return CheckPyError(); + }); +} + +Result> PyFileSystem::OpenInputStream( + const std::string& path) { + std::shared_ptr stream; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.open_input_stream(handler_.obj(), path, &stream); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return stream; +} + +Result> PyFileSystem::OpenInputFile( + const std::string& path) { + std::shared_ptr stream; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.open_input_file(handler_.obj(), path, &stream); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return stream; +} + +Result> PyFileSystem::OpenOutputStream( + const std::string& path, const std::shared_ptr& metadata) { + std::shared_ptr stream; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.open_output_stream(handler_.obj(), path, metadata, &stream); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return stream; +} + +Result> PyFileSystem::OpenAppendStream( + const std::string& path, const std::shared_ptr& metadata) { + std::shared_ptr stream; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.open_append_stream(handler_.obj(), path, metadata, &stream); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return stream; +} + +Result PyFileSystem::NormalizePath(std::string path) { + std::string normalized; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.normalize_path(handler_.obj(), path, &normalized); + return CheckPyError(); + }); + RETURN_NOT_OK(st); + return normalized; +} + +} // namespace fs +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h b/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h index 2e5b223..003fd5c 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/filesystem.h @@ -22,9 +22,9 @@ #include #include "arrow/filesystem/filesystem.h" -#include "arrow/util/macros.h" #include "arrow/python/common.h" #include "arrow/python/visibility.h" +#include "arrow/util/macros.h" namespace arrow { namespace py { diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc new file mode 100644 index 0000000..bf7af27 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/flight.cc @@ -0,0 +1,388 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/python/flight.h" +#include "arrow/util/io_util.h" +#include "arrow/util/logging.h" + +using arrow::flight::FlightPayload; + +namespace arrow { +namespace py { +namespace flight { + +const char* kPyServerMiddlewareName = "arrow.py_server_middleware"; + +PyServerAuthHandler::PyServerAuthHandler(PyObject* handler, + const PyServerAuthHandlerVtable& vtable) + : vtable_(vtable) { + Py_INCREF(handler); + handler_.reset(handler); +} + +Status PyServerAuthHandler::Authenticate(arrow::flight::ServerAuthSender* outgoing, + arrow::flight::ServerAuthReader* incoming) { + return SafeCallIntoPython([=] { + const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyServerAuthHandler::IsValid(const std::string& token, + std::string* peer_identity) { + return SafeCallIntoPython([=] { + const Status status = vtable_.is_valid(handler_.obj(), token, peer_identity); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +PyClientAuthHandler::PyClientAuthHandler(PyObject* handler, + const PyClientAuthHandlerVtable& vtable) + : vtable_(vtable) { + Py_INCREF(handler); + handler_.reset(handler); +} + +Status PyClientAuthHandler::Authenticate(arrow::flight::ClientAuthSender* outgoing, + arrow::flight::ClientAuthReader* incoming) { + return SafeCallIntoPython([=] { + const Status status = vtable_.authenticate(handler_.obj(), outgoing, incoming); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyClientAuthHandler::GetToken(std::string* token) { + return SafeCallIntoPython([=] { + const Status status = vtable_.get_token(handler_.obj(), token); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +PyFlightServer::PyFlightServer(PyObject* server, const PyFlightServerVtable& vtable) + : vtable_(vtable) { + Py_INCREF(server); + server_.reset(server); +} + +Status PyFlightServer::ListFlights( + const arrow::flight::ServerCallContext& context, + const arrow::flight::Criteria* criteria, + std::unique_ptr* listings) { + return SafeCallIntoPython([&] { + const Status status = + vtable_.list_flights(server_.obj(), context, criteria, listings); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::GetFlightInfo(const arrow::flight::ServerCallContext& context, + const arrow::flight::FlightDescriptor& request, + std::unique_ptr* info) { + return SafeCallIntoPython([&] { + const Status status = vtable_.get_flight_info(server_.obj(), context, request, info); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::GetSchema(const arrow::flight::ServerCallContext& context, + const arrow::flight::FlightDescriptor& request, + std::unique_ptr* result) { + return SafeCallIntoPython([&] { + const Status status = vtable_.get_schema(server_.obj(), context, request, result); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::DoGet(const arrow::flight::ServerCallContext& context, + const arrow::flight::Ticket& request, + std::unique_ptr* stream) { + return 
SafeCallIntoPython([&] { + const Status status = vtable_.do_get(server_.obj(), context, request, stream); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::DoPut( + const arrow::flight::ServerCallContext& context, + std::unique_ptr reader, + std::unique_ptr writer) { + return SafeCallIntoPython([&] { + const Status status = + vtable_.do_put(server_.obj(), context, std::move(reader), std::move(writer)); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::DoExchange( + const arrow::flight::ServerCallContext& context, + std::unique_ptr reader, + std::unique_ptr writer) { + return SafeCallIntoPython([&] { + const Status status = + vtable_.do_exchange(server_.obj(), context, std::move(reader), std::move(writer)); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::DoAction(const arrow::flight::ServerCallContext& context, + const arrow::flight::Action& action, + std::unique_ptr* result) { + return SafeCallIntoPython([&] { + const Status status = vtable_.do_action(server_.obj(), context, action, result); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::ListActions(const arrow::flight::ServerCallContext& context, + std::vector* actions) { + return SafeCallIntoPython([&] { + const Status status = vtable_.list_actions(server_.obj(), context, actions); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +Status PyFlightServer::ServeWithSignals() { + // Respect the current Python settings, i.e. only interrupt the server if there is + // an active signal handler for SIGINT and SIGTERM. + std::vector signals; + for (const int signum : {SIGINT, SIGTERM}) { + ARROW_ASSIGN_OR_RAISE(auto handler, ::arrow::internal::GetSignalHandler(signum)); + auto cb = handler.callback(); + if (cb != SIG_DFL && cb != SIG_IGN) { + signals.push_back(signum); + } + } + RETURN_NOT_OK(SetShutdownOnSignals(signals)); + + // Serve until we got told to shutdown or a signal interrupted us + RETURN_NOT_OK(Serve()); + int signum = GotSignal(); + if (signum != 0) { + // Issue the signal again with Python's signal handlers restored + PyAcquireGIL lock; + raise(signum); + // XXX Ideally we would loop and serve again if no exception was raised. + // Unfortunately, gRPC will return immediately if Serve() is called again. 
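+    // [Editor's note] raise() above re-delivers the signal while the GIL is
+    // held: CPython's C-level signal handler only records that the signal
+    // arrived, and it is PyErr_CheckSignals() below that actually runs the
+    // registered Python-level handler (typically raising KeyboardInterrupt).
+    // Whatever exception that sets is deliberately discarded via
+    // ARROW_UNUSED, since this method can only report failure through its
+    // Status return value.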
+ ARROW_UNUSED(PyErr_CheckSignals()); + } + + return Status::OK(); +} + +PyFlightResultStream::PyFlightResultStream(PyObject* generator, + PyFlightResultStreamCallback callback) + : callback_(callback) { + Py_INCREF(generator); + generator_.reset(generator); +} + +arrow::Result> PyFlightResultStream::Next() { + return SafeCallIntoPython( + [=]() -> arrow::Result> { + std::unique_ptr result; + const Status status = callback_(generator_.obj(), &result); + RETURN_NOT_OK(CheckPyError()); + RETURN_NOT_OK(status); + return result; + }); +} + +PyFlightDataStream::PyFlightDataStream( + PyObject* data_source, std::unique_ptr stream) + : stream_(std::move(stream)) { + Py_INCREF(data_source); + data_source_.reset(data_source); +} + +std::shared_ptr PyFlightDataStream::schema() { return stream_->schema(); } + +arrow::Result PyFlightDataStream::GetSchemaPayload() { + return stream_->GetSchemaPayload(); +} + +arrow::Result PyFlightDataStream::Next() { return stream_->Next(); } + +PyGeneratorFlightDataStream::PyGeneratorFlightDataStream( + PyObject* generator, std::shared_ptr schema, + PyGeneratorFlightDataStreamCallback callback, const ipc::IpcWriteOptions& options) + : schema_(schema), mapper_(*schema_), options_(options), callback_(callback) { + Py_INCREF(generator); + generator_.reset(generator); +} + +std::shared_ptr PyGeneratorFlightDataStream::schema() { return schema_; } + +arrow::Result PyGeneratorFlightDataStream::GetSchemaPayload() { + FlightPayload payload; + RETURN_NOT_OK(ipc::GetSchemaPayload(*schema_, options_, mapper_, &payload.ipc_message)); + return payload; +} + +arrow::Result PyGeneratorFlightDataStream::Next() { + return SafeCallIntoPython([=]() -> arrow::Result { + FlightPayload payload; + const Status status = callback_(generator_.obj(), &payload); + RETURN_NOT_OK(CheckPyError()); + RETURN_NOT_OK(status); + return payload; + }); +} + +// Flight Server Middleware + +PyServerMiddlewareFactory::PyServerMiddlewareFactory(PyObject* factory, + StartCallCallback start_call) + : start_call_(start_call) { + Py_INCREF(factory); + factory_.reset(factory); +} + +Status PyServerMiddlewareFactory::StartCall( + const arrow::flight::CallInfo& info, + const arrow::flight::CallHeaders& incoming_headers, + std::shared_ptr* middleware) { + return SafeCallIntoPython([&] { + const Status status = start_call_(factory_.obj(), info, incoming_headers, middleware); + RETURN_NOT_OK(CheckPyError()); + return status; + }); +} + +PyServerMiddleware::PyServerMiddleware(PyObject* middleware, Vtable vtable) + : vtable_(vtable) { + Py_INCREF(middleware); + middleware_.reset(middleware); +} + +void PyServerMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) { + const Status& status = SafeCallIntoPython([&] { + const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers); + RETURN_NOT_OK(CheckPyError()); + return status; + }); + + ARROW_WARN_NOT_OK(status, "Python server middleware failed in SendingHeaders"); +} + +void PyServerMiddleware::CallCompleted(const Status& call_status) { + const Status& status = SafeCallIntoPython([&] { + const Status status = vtable_.call_completed(middleware_.obj(), call_status); + RETURN_NOT_OK(CheckPyError()); + return status; + }); + + ARROW_WARN_NOT_OK(status, "Python server middleware failed in CallCompleted"); +} + +std::string PyServerMiddleware::name() const { return kPyServerMiddlewareName; } + +PyObject* PyServerMiddleware::py_object() const { return middleware_.obj(); } + +// Flight Client Middleware + 
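[Editor's aside] The client-side wrappers below repeat the shape used by every Python-facing class in this file: hold the Python object behind an owned reference, and dispatch each C++ virtual call through a struct of callbacks (the "vtable") that the Cython layer fills in. A minimal, Arrow-free sketch of that shape, with every name hypothetical:

#include <functional>
#include <iostream>
#include <string>
#include <utility>

struct Handler { std::string name; };  // stands in for the wrapped PyObject*

struct Vtable {
  // One slot per virtual method; the real code stores Cython callbacks here.
  std::function<void(Handler*, const std::string&)> sending_headers;
};

class Middleware {
 public:
  Middleware(Handler* handler, Vtable vtable)
      : handler_(handler), vtable_(std::move(vtable)) {}
  void SendingHeaders(const std::string& header) {
    // Forward the C++ virtual call into the "Python" side via the vtable.
    vtable_.sending_headers(handler_, header);
  }

 private:
  Handler* handler_;
  Vtable vtable_;
};

int main() {
  Handler handler{"py-handler"};
  Vtable vtable;
  vtable.sending_headers = [](Handler* h, const std::string& header) {
    std::cout << h->name << " forwards header: " << header << "\n";
  };
  Middleware middleware(&handler, vtable);
  middleware.SendingHeaders("x-trace-id: 1");
  return 0;
}

The indirection keeps this translation unit free of any compile-time dependency on the Cython-generated code; only plain function pointers cross the boundary.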
+PyClientMiddlewareFactory::PyClientMiddlewareFactory(PyObject* factory,
+                                                     StartCallCallback start_call)
+    : start_call_(start_call) {
+  Py_INCREF(factory);
+  factory_.reset(factory);
+}
+
+void PyClientMiddlewareFactory::StartCall(
+    const arrow::flight::CallInfo& info,
+    std::unique_ptr<arrow::flight::ClientMiddleware>* middleware) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = start_call_(factory_.obj(), info, middleware);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in StartCall");
+}
+
+PyClientMiddleware::PyClientMiddleware(PyObject* middleware, Vtable vtable)
+    : vtable_(vtable) {
+  Py_INCREF(middleware);
+  middleware_.reset(middleware);
+}
+
+void PyClientMiddleware::SendingHeaders(arrow::flight::AddCallHeaders* outgoing_headers) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.sending_headers(middleware_.obj(), outgoing_headers);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in SendingHeaders");
+}
+
+void PyClientMiddleware::ReceivedHeaders(
+    const arrow::flight::CallHeaders& incoming_headers) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.received_headers(middleware_.obj(), incoming_headers);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in ReceivedHeaders");
+}
+
+void PyClientMiddleware::CallCompleted(const Status& call_status) {
+  const Status& status = SafeCallIntoPython([&] {
+    const Status status = vtable_.call_completed(middleware_.obj(), call_status);
+    RETURN_NOT_OK(CheckPyError());
+    return status;
+  });
+
+  ARROW_WARN_NOT_OK(status, "Python client middleware failed in CallCompleted");
+}
+
+Status CreateFlightInfo(const std::shared_ptr<arrow::Schema>& schema,
+                        const arrow::flight::FlightDescriptor& descriptor,
+                        const std::vector<arrow::flight::FlightEndpoint>& endpoints,
+                        int64_t total_records, int64_t total_bytes,
+                        std::unique_ptr<arrow::flight::FlightInfo>* out) {
+  ARROW_ASSIGN_OR_RAISE(auto result,
+                        arrow::flight::FlightInfo::Make(*schema, descriptor, endpoints,
+                                                        total_records, total_bytes));
+  *out = std::unique_ptr<arrow::flight::FlightInfo>(
+      new arrow::flight::FlightInfo(std::move(result)));
+  return Status::OK();
+}
+
+Status CreateSchemaResult(const std::shared_ptr<arrow::Schema>& schema,
+                          std::unique_ptr<arrow::flight::SchemaResult>* out) {
+  return arrow::flight::SchemaResult::Make(*schema).Value(out);
+}
+
+}  // namespace flight
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc
new file mode 100644
index 0000000..6941769
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/gdb.cc
@@ -0,0 +1,530 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
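[Editor's aside] Every Flight and middleware override in the file above performs the same dual error check: the callback returns a Status, but it may also have left a Python exception pending, and the exception takes precedence because RETURN_NOT_OK(CheckPyError()) runs before the returned Status is consulted. A self-contained sketch of that precedence rule, using hypothetical stand-ins for arrow::Status and the CPython error indicator:

#include <iostream>
#include <optional>
#include <string>

// Hypothetical stand-ins; none of these names exist in Arrow or CPython.
struct Status {
  std::string msg;  // empty means OK
  bool ok() const { return msg.empty(); }
};

std::optional<std::string> g_py_exception;  // the "pending exception" slot

// Like CheckPyError(): convert and clear any pending "Python" exception.
Status CheckPendingException() {
  if (g_py_exception) {
    Status st{*g_py_exception};
    g_py_exception.reset();
    return st;
  }
  return Status{};
}

// The pattern from the wrappers above: a pending exception takes precedence
// over the Status the callback itself returned.
Status CallCallback(Status (*cb)()) {
  const Status status = cb();
  const Status pending = CheckPendingException();
  if (!pending.ok()) return pending;  // RETURN_NOT_OK(CheckPyError())
  return status;
}

int main() {
  // No exception pending: the callback's own Status is reported.
  std::cout << CallCallback([] { return Status{"callback failed"}; }).msg << "\n";
  // Exception pending: it wins over the OK Status the callback returned.
  g_py_exception = "ValueError raised on the Python side";
  std::cout << CallCallback([] { return Status{}; }).msg << "\n";
  return 0;
}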
+ +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/chunked_array.h" +#include "arrow/datum.h" +#include "arrow/extension_type.h" +#include "arrow/ipc/json_simple.h" +#include "arrow/python/gdb.h" +#include "arrow/record_batch.h" +#include "arrow/scalar.h" +#include "arrow/table.h" +#include "arrow/type.h" +#include "arrow/util/debug.h" +#include "arrow/util/decimal.h" +#include "arrow/util/key_value_metadata.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" + +namespace arrow { + +using ipc::internal::json::ArrayFromJSON; +using ipc::internal::json::ChunkedArrayFromJSON; +using ipc::internal::json::ScalarFromJSON; + +namespace gdb { + +// Add a nested `arrow` namespace to exercise type lookup from GDB (ARROW-15652) +namespace arrow { +void DummyFunction() {} +} // namespace arrow + +namespace { + +class CustomStatusDetail : public StatusDetail { + public: + const char* type_id() const override { return "custom-detail-id"; } + std::string ToString() const override { return "This is a detail"; } +}; + +class UuidType : public ExtensionType { + public: + UuidType() : ExtensionType(fixed_size_binary(16)) {} + + std::string extension_name() const override { return "uuid"; } + + bool ExtensionEquals(const ExtensionType& other) const override { + return (other.extension_name() == this->extension_name()); + } + + std::shared_ptr MakeArray(std::shared_ptr data) const override { + return std::make_shared(data); + } + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized) const override { + return Status::NotImplemented(""); + } + + std::string Serialize() const override { return "uuid-serialized"; } +}; + +std::shared_ptr SliceArrayFromJSON(const std::shared_ptr& ty, + std::string_view json, int64_t offset = 0, + int64_t length = -1) { + auto array = *ArrayFromJSON(ty, json); + if (length != -1) { + return array->Slice(offset, length); + } else { + return array->Slice(offset); + } +} + +} // namespace + +void TestSession() { + // We define local variables for all types for which we want to test + // pretty-printing. + // Then, at the end of this function, we trap to the debugger, so that + // test instrumentation can print values from this frame by interacting + // with the debugger. 
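+  // [Editor's note] Keeping every test value in this one live stack frame,
+  // rather than in globals, is what lets the GDB-based tests inspect them
+  // with plain `print <variable>` commands; the diagnostic pragmas just
+  // below exist to silence the -Wunused-variable warnings this causes.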
+ // The test instrumentation is in pyarrow/tests/test_gdb.py + +#ifdef __clang__ + _Pragma("clang diagnostic push"); + _Pragma("clang diagnostic ignored \"-Wunused-variable\""); +#elif defined(__GNUC__) + _Pragma("GCC diagnostic push"); + _Pragma("GCC diagnostic ignored \"-Wunused-variable\""); +#endif + + arrow::DummyFunction(); + + // Status & Result + auto ok_status = Status::OK(); + auto error_status = Status::IOError("This is an error"); + auto error_detail_status = + error_status.WithDetail(std::make_shared()); + auto ok_result = Result(42); + auto error_result = Result(error_status); + auto error_detail_result = Result(error_detail_status); + + // String views + std::string_view string_view_abc{"abc"}; + std::string special_chars = std::string("foo\"bar") + '\x00' + "\r\n\t\x1f"; + std::string_view string_view_special_chars(special_chars); + + // Buffers + Buffer buffer_null{nullptr, 0}; + Buffer buffer_abc{string_view_abc}; + Buffer buffer_special_chars{string_view_special_chars}; + char mutable_array[3] = {'a', 'b', 'c'}; + MutableBuffer buffer_mutable{reinterpret_cast(mutable_array), 3}; + auto heap_buffer = std::make_shared(string_view_abc); + auto heap_buffer_mutable = *AllocateBuffer(buffer_abc.size()); + memcpy(heap_buffer_mutable->mutable_data(), buffer_abc.data(), buffer_abc.size()); + + // KeyValueMetadata + auto empty_metadata = key_value_metadata({}, {}); + auto metadata = key_value_metadata( + {"key_text", "key_binary"}, {"some value", std::string("z") + '\x00' + "\x1f\xff"}); + + // Decimals + Decimal128 decimal128_zero{}; + Decimal128 decimal128_pos{"98765432109876543210987654321098765432"}; + Decimal128 decimal128_neg{"-98765432109876543210987654321098765432"}; + BasicDecimal128 basic_decimal128_zero{}; + BasicDecimal128 basic_decimal128_pos{decimal128_pos.native_endian_array()}; + BasicDecimal128 basic_decimal128_neg{decimal128_neg.native_endian_array()}; + Decimal256 decimal256_zero{}; + Decimal256 decimal256_pos{ + "9876543210987654321098765432109876543210987654321098765432109876543210987654"}; + Decimal256 decimal256_neg{ + "-9876543210987654321098765432109876543210987654321098765432109876543210987654"}; + BasicDecimal256 basic_decimal256_zero{}; + BasicDecimal256 basic_decimal256_pos{decimal256_pos.native_endian_array()}; + BasicDecimal256 basic_decimal256_neg{decimal256_neg.native_endian_array()}; + + // Data types + NullType null_type; + auto heap_null_type = null(); + BooleanType bool_type; + auto heap_bool_type = boolean(); + + Date32Type date32_type; + Date64Type date64_type; + Time32Type time_type_s(TimeUnit::SECOND); + Time32Type time_type_ms(TimeUnit::MILLI); + Time64Type time_type_us(TimeUnit::MICRO); + Time64Type time_type_ns(TimeUnit::NANO); + auto heap_time_type_ns = time64(TimeUnit::NANO); + + TimestampType timestamp_type_s(TimeUnit::SECOND); + TimestampType timestamp_type_ms_timezone(TimeUnit::MILLI, "Europe/Paris"); + TimestampType timestamp_type_us(TimeUnit::MICRO); + TimestampType timestamp_type_ns_timezone(TimeUnit::NANO, "Europe/Paris"); + auto heap_timestamp_type_ns_timezone = timestamp(TimeUnit::NANO, "Europe/Paris"); + + DayTimeIntervalType day_time_interval_type; + MonthIntervalType month_interval_type; + MonthDayNanoIntervalType month_day_nano_interval_type; + + DurationType duration_type_s(TimeUnit::SECOND); + DurationType duration_type_ns(TimeUnit::NANO); + + BinaryType binary_type; + StringType string_type; + LargeBinaryType large_binary_type; + LargeStringType large_string_type; + FixedSizeBinaryType fixed_size_binary_type(10); + 
auto heap_fixed_size_binary_type = fixed_size_binary(10); + + Decimal128Type decimal128_type(16, 5); + Decimal256Type decimal256_type(42, 12); + auto heap_decimal128_type = decimal128(16, 5); + + ListType list_type(uint8()); + LargeListType large_list_type(large_utf8()); + auto heap_list_type = list(uint8()); + auto heap_large_list_type = large_list(large_utf8()); + + FixedSizeListType fixed_size_list_type(float64(), 3); + auto heap_fixed_size_list_type = fixed_size_list(float64(), 3); + + DictionaryType dict_type_unordered(int16(), utf8()); + DictionaryType dict_type_ordered(int16(), utf8(), /*ordered=*/true); + auto heap_dict_type = dictionary(int16(), utf8()); + + MapType map_type_unsorted(utf8(), binary()); + MapType map_type_sorted(utf8(), binary(), /*keys_sorted=*/true); + auto heap_map_type = map(utf8(), binary()); + + StructType struct_type_empty({}); + StructType struct_type( + {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); + auto heap_struct_type = + struct_({field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); + + std::vector union_type_codes({7, 42}); + FieldVector union_fields( + {field("ints", int8()), field("strs", utf8(), /*nullable=*/false)}); + SparseUnionType sparse_union_type(union_fields, union_type_codes); + DenseUnionType dense_union_type(union_fields, union_type_codes); + + UuidType uuid_type{}; + std::shared_ptr heap_uuid_type = std::make_shared(); + + // Schema + auto schema_empty = schema({}); + auto schema_non_empty = schema({field("ints", int8()), field("strs", utf8())}); + auto schema_with_metadata = schema_non_empty->WithMetadata( + key_value_metadata({"key1", "key2"}, {"value1", "value2"})); + + // Fields + Field int_field("ints", int64()); + Field float_field("floats", float32(), /*nullable=*/false); + auto heap_int_field = field("ints", int64()); + + // Scalars + NullScalar null_scalar; + auto heap_null_scalar = MakeNullScalar(null()); + + BooleanScalar bool_scalar_null{}; + BooleanScalar bool_scalar{true}; + auto heap_bool_scalar = *MakeScalar(boolean(), true); + + Int8Scalar int8_scalar_null{}; + UInt8Scalar uint8_scalar_null{}; + Int64Scalar int64_scalar_null{}; + UInt64Scalar uint64_scalar_null{}; + Int8Scalar int8_scalar{-42}; + UInt8Scalar uint8_scalar{234}; + Int64Scalar int64_scalar{-9223372036854775807LL - 1}; + UInt64Scalar uint64_scalar{18446744073709551615ULL}; + HalfFloatScalar half_float_scalar{48640}; // -1.5 + FloatScalar float_scalar{1.25f}; + DoubleScalar double_scalar{2.5}; + + Time32Scalar time_scalar_s{100, TimeUnit::SECOND}; + Time32Scalar time_scalar_ms{1000, TimeUnit::MILLI}; + Time64Scalar time_scalar_us{10000, TimeUnit::MICRO}; + Time64Scalar time_scalar_ns{100000, TimeUnit::NANO}; + Time64Scalar time_scalar_null{time64(TimeUnit::NANO)}; + + DurationScalar duration_scalar_s{-100, TimeUnit::SECOND}; + DurationScalar duration_scalar_ms{-1000, TimeUnit::MILLI}; + DurationScalar duration_scalar_us{-10000, TimeUnit::MICRO}; + DurationScalar duration_scalar_ns{-100000, TimeUnit::NANO}; + DurationScalar duration_scalar_null{duration(TimeUnit::NANO)}; + + TimestampScalar timestamp_scalar_s{12345, timestamp(TimeUnit::SECOND)}; + TimestampScalar timestamp_scalar_ms{-123456, timestamp(TimeUnit::MILLI)}; + TimestampScalar timestamp_scalar_us{1234567, timestamp(TimeUnit::MICRO)}; + TimestampScalar timestamp_scalar_ns{-12345678, timestamp(TimeUnit::NANO)}; + TimestampScalar timestamp_scalar_null{timestamp(TimeUnit::NANO)}; + + TimestampScalar timestamp_scalar_s_tz{12345, + timestamp(TimeUnit::SECOND, 
"Europe/Paris")}; + TimestampScalar timestamp_scalar_ms_tz{-123456, + timestamp(TimeUnit::MILLI, "Europe/Paris")}; + TimestampScalar timestamp_scalar_us_tz{1234567, + timestamp(TimeUnit::MICRO, "Europe/Paris")}; + TimestampScalar timestamp_scalar_ns_tz{-12345678, + timestamp(TimeUnit::NANO, "Europe/Paris")}; + TimestampScalar timestamp_scalar_null_tz{timestamp(TimeUnit::NANO, "Europe/Paris")}; + + MonthIntervalScalar month_interval_scalar{23}; + MonthIntervalScalar month_interval_scalar_null{}; + DayTimeIntervalScalar day_time_interval_scalar{{23, -456}}; + DayTimeIntervalScalar day_time_interval_scalar_null{}; + MonthDayNanoIntervalScalar month_day_nano_interval_scalar{{1, 23, -456}}; + MonthDayNanoIntervalScalar month_day_nano_interval_scalar_null{}; + + Date32Scalar date32_scalar{23}; + Date32Scalar date32_scalar_null{}; + Date64Scalar date64_scalar{45 * 86400000LL}; + Date64Scalar date64_scalar_null{}; + + Decimal128Scalar decimal128_scalar_pos_scale_pos{Decimal128("1234567"), + decimal128(10, 4)}; + Decimal128Scalar decimal128_scalar_pos_scale_neg{Decimal128("-1234567"), + decimal128(10, 4)}; + Decimal128Scalar decimal128_scalar_neg_scale_pos{Decimal128("1234567"), + decimal128(10, -4)}; + Decimal128Scalar decimal128_scalar_neg_scale_neg{Decimal128("-1234567"), + decimal128(10, -4)}; + Decimal128Scalar decimal128_scalar_null{decimal128(10, 4)}; + auto heap_decimal128_scalar = *MakeScalar(decimal128(10, 4), Decimal128("1234567")); + + Decimal256Scalar decimal256_scalar_pos_scale_pos{ + Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, 4)}; + Decimal256Scalar decimal256_scalar_pos_scale_neg{ + Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, 4)}; + Decimal256Scalar decimal256_scalar_neg_scale_pos{ + Decimal256("1234567890123456789012345678901234567890123456"), decimal256(50, -4)}; + Decimal256Scalar decimal256_scalar_neg_scale_neg{ + Decimal256("-1234567890123456789012345678901234567890123456"), decimal256(50, -4)}; + Decimal256Scalar decimal256_scalar_null{decimal256(50, 4)}; + auto heap_decimal256_scalar = *MakeScalar( + decimal256(50, 4), Decimal256("1234567890123456789012345678901234567890123456")); + + BinaryScalar binary_scalar_null{}; + BinaryScalar binary_scalar_unallocated{std::shared_ptr{nullptr}}; + BinaryScalar binary_scalar_empty{Buffer::FromString("")}; + BinaryScalar binary_scalar_abc{Buffer::FromString("abc")}; + BinaryScalar binary_scalar_bytes{ + Buffer::FromString(std::string() + '\x00' + "\x1f\xff")}; + + StringScalar string_scalar_null{}; + StringScalar string_scalar_unallocated{std::shared_ptr{nullptr}}; + StringScalar string_scalar_empty{Buffer::FromString("")}; + StringScalar string_scalar_hehe{Buffer::FromString("héhé")}; + StringScalar string_scalar_invalid_chars{ + Buffer::FromString(std::string("abc") + '\x00' + "def\xffghi")}; + + LargeBinaryScalar large_binary_scalar_abc{Buffer::FromString("abc")}; + LargeStringScalar large_string_scalar_hehe{Buffer::FromString("héhé")}; + + FixedSizeBinaryScalar fixed_size_binary_scalar{Buffer::FromString("abc"), + fixed_size_binary(3)}; + FixedSizeBinaryScalar fixed_size_binary_scalar_null{ + Buffer::FromString(" "), fixed_size_binary(3), /*is_valid=*/false}; + + std::shared_ptr dict_array; + dict_array = *ArrayFromJSON(utf8(), R"(["foo", "bar", "quux"])"); + DictionaryScalar dict_scalar{{std::make_shared(42), dict_array}, + dictionary(int8(), utf8())}; + DictionaryScalar dict_scalar_null{dictionary(int8(), utf8())}; + + std::shared_ptr list_value_array = 
*ArrayFromJSON(int32(), R"([4, 5, 6])"); + std::shared_ptr list_zero_length = *ArrayFromJSON(int32(), R"([])"); + ListScalar list_scalar{list_value_array}; + ListScalar list_scalar_null{list_zero_length, list(int32()), /*is_valid=*/false}; + LargeListScalar large_list_scalar{list_value_array}; + LargeListScalar large_list_scalar_null{list_zero_length, large_list(int32()), + /*is_valid=*/false}; + FixedSizeListScalar fixed_size_list_scalar{list_value_array}; + FixedSizeListScalar fixed_size_list_scalar_null{ + list_value_array, fixed_size_list(int32(), 3), /*is_valid=*/false}; + + auto struct_scalar_type = struct_({field("ints", int32()), field("strs", utf8())}); + StructScalar struct_scalar{ + ScalarVector{MakeScalar(int32_t(42)), MakeScalar("some text")}, struct_scalar_type}; + StructScalar struct_scalar_null{struct_scalar.value, struct_scalar_type, + /*is_valid=*/false}; + + auto sparse_union_scalar_type = + sparse_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42}); + auto dense_union_scalar_type = + dense_union(FieldVector{field("ints", int32()), field("strs", utf8())}, {7, 42}); + std::vector> union_values = {MakeScalar(int32_t(43)), + MakeNullScalar(utf8())}; + SparseUnionScalar sparse_union_scalar{union_values, 7, sparse_union_scalar_type}; + DenseUnionScalar dense_union_scalar{union_values[0], 7, dense_union_scalar_type}; + + union_values[0] = MakeNullScalar(int32()); + SparseUnionScalar sparse_union_scalar_null{union_values, 7, sparse_union_scalar_type}; + DenseUnionScalar dense_union_scalar_null{union_values[0], 7, dense_union_scalar_type}; + + auto extension_scalar_type = std::make_shared(); + ExtensionScalar extension_scalar{ + std::make_shared(Buffer::FromString("0123456789abcdef"), + extension_scalar_type->storage_type()), + extension_scalar_type}; + ExtensionScalar extension_scalar_null{extension_scalar.value, extension_scalar_type, + /*is_valid=*/false}; + + std::shared_ptr heap_map_scalar; + ARROW_CHECK_OK( + ScalarFromJSON(map(utf8(), int32()), R"([["a", 5], ["b", 6]])", &heap_map_scalar)); + auto heap_map_scalar_null = MakeNullScalar(heap_map_scalar->type); + + // Array and ArrayData + auto heap_null_array = SliceArrayFromJSON(null(), "[null, null]"); + + auto heap_int32_array = SliceArrayFromJSON(int32(), "[-5, 6, null, 42]"); + ArrayData int32_array_data{*heap_int32_array->data()}; + Int32Array int32_array{heap_int32_array->data()->Copy()}; + + auto heap_int32_array_no_nulls = SliceArrayFromJSON(int32(), "[-5, 6, 3, 42]"); + + const char* json_int32_array = "[-1, 2, -3, 4, null, -5, 6, -7, 8, null, -9, -10]"; + auto heap_int32_array_sliced_1_9 = SliceArrayFromJSON(int32(), json_int32_array, 1, 9); + auto heap_int32_array_sliced_2_6 = SliceArrayFromJSON(int32(), json_int32_array, 2, 6); + auto heap_int32_array_sliced_8_4 = SliceArrayFromJSON(int32(), json_int32_array, 8, 4); + auto heap_int32_array_sliced_empty = + SliceArrayFromJSON(int32(), json_int32_array, 6, 0); + + const char* json_bool_array = + "[false, false, true, true, null, null, false, false, true, true, " + "null, null, false, false, true, true, null, null]"; + auto heap_bool_array = SliceArrayFromJSON(boolean(), json_bool_array); + auto heap_bool_array_sliced_1_9 = SliceArrayFromJSON(boolean(), json_bool_array, 1, 9); + auto heap_bool_array_sliced_2_6 = SliceArrayFromJSON(boolean(), json_bool_array, 2, 6); + auto heap_bool_array_sliced_empty = + SliceArrayFromJSON(boolean(), json_bool_array, 6, 0); + + auto heap_list_array = SliceArrayFromJSON(list(int64()), "[[1, 2], null, []]"); + 
ListArray list_array{heap_list_array->data()}; + + const char* json_double_array = "[-1.5, null]"; + auto heap_double_array = SliceArrayFromJSON(float64(), json_double_array); + + const char* json_float16_array = "[0, 48640]"; + auto heap_float16_array = + *SliceArrayFromJSON(uint16(), json_float16_array)->View(float16()); + + auto heap_date32_array = + SliceArrayFromJSON(date32(), "[0, null, 18336, -9004, -719162, -719163]"); + auto heap_date64_array = SliceArrayFromJSON( + date64(), "[1584230400000, -777945600000, -62135596800000, -62135683200000, 123]"); + + const char* json_time_array = "[null, -123, 456]"; + auto heap_time32_array_s = + SliceArrayFromJSON(time32(TimeUnit::SECOND), json_time_array); + auto heap_time32_array_ms = + SliceArrayFromJSON(time32(TimeUnit::MILLI), json_time_array); + auto heap_time64_array_us = + SliceArrayFromJSON(time64(TimeUnit::MICRO), json_time_array); + auto heap_time64_array_ns = SliceArrayFromJSON(time64(TimeUnit::NANO), json_time_array); + + auto heap_month_interval_array = + SliceArrayFromJSON(month_interval(), "[123, -456, null]"); + auto heap_day_time_interval_array = + SliceArrayFromJSON(day_time_interval(), "[[1, -600], null]"); + auto heap_month_day_nano_interval_array = + SliceArrayFromJSON(month_day_nano_interval(), "[[1, -600, 5000], null]"); + + const char* json_duration_array = "[null, -1234567890123456789]"; + auto heap_duration_array_s = + SliceArrayFromJSON(duration(TimeUnit::SECOND), json_duration_array); + auto heap_duration_array_ns = + SliceArrayFromJSON(duration(TimeUnit::NANO), json_duration_array); + + auto heap_timestamp_array_s = SliceArrayFromJSON( + timestamp(TimeUnit::SECOND), + R"([null, "1970-01-01 00:00:00", "1900-02-28 12:34:56", "3989-07-14 00:00:00"])"); + auto heap_timestamp_array_ms = SliceArrayFromJSON( + timestamp(TimeUnit::MILLI), + R"([null, "1900-02-28 12:34:56.123", "3989-07-14 00:00:00.789"])"); + auto heap_timestamp_array_us = SliceArrayFromJSON( + timestamp(TimeUnit::MICRO), + R"([null, "1900-02-28 12:34:56.654321", "3989-07-14 00:00:00.456789"])"); + auto heap_timestamp_array_ns = SliceArrayFromJSON( + timestamp(TimeUnit::NANO), R"([null, "1900-02-28 12:34:56.987654321"])"); + + auto heap_decimal128_array = SliceArrayFromJSON( + decimal128(30, 6), + R"([null, "-1234567890123456789.012345", "1234567890123456789.012345"])"); + auto heap_decimal256_array = SliceArrayFromJSON( + decimal256(50, 6), R"([null, "-123456789012345678901234567890123456789.012345"])"); + auto heap_decimal128_array_sliced = heap_decimal128_array->Slice(1, 1); + + auto heap_fixed_size_binary_array = + SliceArrayFromJSON(fixed_size_binary(3), "[null, \"abc\", \"\\u0000\\u001f\xff\"]"); + auto heap_fixed_size_binary_array_zero_width = + SliceArrayFromJSON(fixed_size_binary(0), R"([null, ""])"); + auto heap_fixed_size_binary_array_sliced = heap_fixed_size_binary_array->Slice(1, 1); + + const char* json_binary_array = "[null, \"abcd\", \"\\u0000\\u001f\xff\"]"; + auto heap_binary_array = SliceArrayFromJSON(binary(), json_binary_array); + auto heap_large_binary_array = SliceArrayFromJSON(large_binary(), json_binary_array); + const char* json_string_array = "[null, \"héhé\", \"invalid \xff char\"]"; + auto heap_string_array = SliceArrayFromJSON(utf8(), json_string_array); + auto heap_large_string_array = SliceArrayFromJSON(large_utf8(), json_string_array); + auto heap_binary_array_sliced = heap_binary_array->Slice(1, 1); + + // ChunkedArray + ArrayVector array_chunks(2); + array_chunks[0] = *ArrayFromJSON(int32(), "[1, 2]"); + array_chunks[1] 
= *ArrayFromJSON(int32(), "[3, null, 4]"); + ChunkedArray chunked_array{array_chunks}; + + // RecordBatch + auto batch_schema = schema({field("ints", int32()), field("strs", utf8())}); + ArrayVector batch_columns{2}; + batch_columns[0] = *ArrayFromJSON(int32(), "[1, 2, 3]"); + batch_columns[1] = *ArrayFromJSON(utf8(), R"(["abc", null, "def"])"); + auto batch = RecordBatch::Make(batch_schema, /*num_rows=*/3, batch_columns); + auto batch_with_metadata = batch->ReplaceSchemaMetadata( + key_value_metadata({"key1", "key2", "key3"}, {"value1", "value2", "value3"})); + + // Table + ChunkedArrayVector table_columns{2}; + ARROW_CHECK_OK( + ChunkedArrayFromJSON(int32(), {"[1, 2, 3]", "[4, 5]"}, &table_columns[0])); + ARROW_CHECK_OK(ChunkedArrayFromJSON( + utf8(), {R"(["abc", null])", R"(["def"])", R"(["ghi", "jkl"])"}, + &table_columns[1])); + auto table = Table::Make(batch_schema, table_columns); + + // Datum + Datum empty_datum{}; + Datum scalar_datum{MakeNullScalar(boolean())}; + Datum array_datum{heap_int32_array}; + Datum chunked_array_datum{chunked_array}; + Datum batch_datum{batch}; + Datum table_datum{table}; + +#ifdef __clang__ + _Pragma("clang diagnostic pop"); +#elif defined(__GNUC__) + _Pragma("GCC diagnostic pop"); +#endif + + // Hook into debugger + ::arrow::internal::DebugTrap(); +} + +} // namespace gdb +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc new file mode 100644 index 0000000..c266abc --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.cc @@ -0,0 +1,470 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
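[Editor's aside] The GetPrimitiveType() function that opens this file maps the Type::type enum onto factory functions through a one-line case macro. The same trick in miniature, with hypothetical names:

#include <iostream>
#include <string>

enum class Kind { INT32, FLOAT64, STRING };

std::string int32_name() { return "int32"; }
std::string float64_name() { return "float64"; }
std::string string_name() { return "utf8"; }

// One macro expansion per enum value keeps the switch table declarative.
#define NAME_CASE(KIND, FACTORY) \
  case Kind::KIND:               \
    return FACTORY()

std::string KindName(Kind kind) {
  switch (kind) {
    NAME_CASE(INT32, int32_name);
    NAME_CASE(FLOAT64, float64_name);
    NAME_CASE(STRING, string_name);
  }
  return "unknown";  // unreachable for valid enum values
}

int main() {
  std::cout << KindName(Kind::FLOAT64) << "\n";  // prints "float64"
  return 0;
}

Adding a type stays a single line, and because the switch enumerates every case without a default, the compiler can still warn when an enumerator is missing.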
+
+// helpers.h includes a NumPy header, so we include this first
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/python/helpers.h"
+
+#include <cmath>
+#include <limits>
+#include <sstream>
+#include <type_traits>
+
+#include "arrow/python/common.h"
+#include "arrow/python/decimal.h"
+#include "arrow/type_fwd.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+
+using internal::checked_cast;
+
+namespace py {
+
+#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \
+  case Type::NAME:                        \
+    return FACTORY()
+
+std::shared_ptr<DataType> GetPrimitiveType(Type::type type) {
+  switch (type) {
+    case Type::NA:
+      return null();
+      GET_PRIMITIVE_TYPE(UINT8, uint8);
+      GET_PRIMITIVE_TYPE(INT8, int8);
+      GET_PRIMITIVE_TYPE(UINT16, uint16);
+      GET_PRIMITIVE_TYPE(INT16, int16);
+      GET_PRIMITIVE_TYPE(UINT32, uint32);
+      GET_PRIMITIVE_TYPE(INT32, int32);
+      GET_PRIMITIVE_TYPE(UINT64, uint64);
+      GET_PRIMITIVE_TYPE(INT64, int64);
+      GET_PRIMITIVE_TYPE(DATE32, date32);
+      GET_PRIMITIVE_TYPE(DATE64, date64);
+      GET_PRIMITIVE_TYPE(BOOL, boolean);
+      GET_PRIMITIVE_TYPE(HALF_FLOAT, float16);
+      GET_PRIMITIVE_TYPE(FLOAT, float32);
+      GET_PRIMITIVE_TYPE(DOUBLE, float64);
+      GET_PRIMITIVE_TYPE(BINARY, binary);
+      GET_PRIMITIVE_TYPE(STRING, utf8);
+      GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary);
+      GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8);
+      GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval);
+    default:
+      return nullptr;
+  }
+}
+
+PyObject* PyHalf_FromHalf(npy_half value) {
+  PyObject* result = PyArrayScalar_New(Half);
+  if (result != NULL) {
+    PyArrayScalar_ASSIGN(result, Half, value);
+  }
+  return result;
+}
+
+Status PyFloat_AsHalf(PyObject* obj, npy_half* out) {
+  if (PyArray_IsScalar(obj, Half)) {
+    *out = PyArrayScalar_VAL(obj, Half);
+    return Status::OK();
+  } else {
+    // XXX: cannot use npy_double_to_half() without linking with Numpy
+    return Status::TypeError("Expected np.float16 instance");
+  }
+}
+
+namespace internal {
+
+std::string PyBytes_AsStdString(PyObject* obj) {
+  DCHECK(PyBytes_Check(obj));
+  return std::string(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj));
+}
+
+Status PyUnicode_AsStdString(PyObject* obj, std::string* out) {
+  DCHECK(PyUnicode_Check(obj));
+  Py_ssize_t size;
+  // The utf-8 representation is cached on the unicode object
+  const char* data = PyUnicode_AsUTF8AndSize(obj, &size);
+  RETURN_IF_PYERROR();
+  *out = std::string(data, size);
+  return Status::OK();
+}
+
+std::string PyObject_StdStringRepr(PyObject* obj) {
+  OwnedRef unicode_ref(PyObject_Repr(obj));
+  OwnedRef bytes_ref;
+
+  if (unicode_ref) {
+    bytes_ref.reset(
+        PyUnicode_AsEncodedString(unicode_ref.obj(), "utf8", "backslashreplace"));
+  }
+  if (!bytes_ref) {
+    PyErr_Clear();
+    std::stringstream ss;
+    ss << "<object of type '" << Py_TYPE(obj)->tp_name << "' repr() failed>";
+    return ss.str();
+  }
+  return PyBytes_AsStdString(bytes_ref.obj());
+}
+
+Status PyObject_StdStringStr(PyObject* obj, std::string* out) {
+  OwnedRef string_ref(PyObject_Str(obj));
+  RETURN_IF_PYERROR();
+  return PyUnicode_AsStdString(string_ref.obj(), out);
+}
+
+Result<bool> IsModuleImported(const std::string& module_name) {
+  // PyImport_GetModuleDict returns with a borrowed reference
+  OwnedRef key(PyUnicode_FromString(module_name.c_str()));
+  auto is_imported = PyDict_Contains(PyImport_GetModuleDict(), key.obj());
+  RETURN_IF_PYERROR();
+  return is_imported;
+}
+
+Status ImportModule(const std::string& module_name, OwnedRef* ref) {
+  PyObject* module = PyImport_ImportModule(module_name.c_str());
+  RETURN_IF_PYERROR();
+  ref->reset(module);
+  return Status::OK();
+}
+
+Status 
ImportFromModule(PyObject* module, const std::string& name, OwnedRef* ref) {
+  PyObject* attr = PyObject_GetAttrString(module, name.c_str());
+  RETURN_IF_PYERROR();
+  ref->reset(attr);
+  return Status::OK();
+}
+
+namespace {
+
+Status IntegerOverflowStatus(PyObject* obj, const std::string& overflow_message) {
+  if (overflow_message.empty()) {
+    std::string obj_as_stdstring;
+    RETURN_NOT_OK(PyObject_StdStringStr(obj, &obj_as_stdstring));
+    return Status::Invalid("Value ", obj_as_stdstring,
+                           " too large to fit in C integer type");
+  } else {
+    return Status::Invalid(overflow_message);
+  }
+}
+
+Result<OwnedRef> PyObjectToPyInt(PyObject* obj) {
+  // Try to call __index__ or __int__ on `obj`
+  // (starting from Python 3.10, the latter isn't done anymore by PyLong_AsLong*).
+  OwnedRef ref(PyNumber_Index(obj));
+  if (ref) {
+    return std::move(ref);
+  }
+  PyErr_Clear();
+  const auto nb = Py_TYPE(obj)->tp_as_number;
+  if (nb && nb->nb_int) {
+    ref.reset(nb->nb_int(obj));
+    if (!ref) {
+      RETURN_IF_PYERROR();
+    }
+    DCHECK(ref);
+    return std::move(ref);
+  }
+  return Status::TypeError(
+      "object of type ",
+      PyObject_StdStringRepr(reinterpret_cast<PyObject*>(Py_TYPE(obj))),
+      " cannot be converted to int");
+}
+
+// Extract C signed int from Python object
+template <typename Int, enable_if_t<std::is_signed<Int>::value, Int> = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(long long),  // NOLINT
+                "integer type larger than long long");
+
+  OwnedRef ref;
+  if (!PyLong_Check(obj)) {
+    ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
+    obj = ref.obj();
+  }
+
+  if (sizeof(Int) > sizeof(long)) {  // NOLINT
+    const auto value = PyLong_AsLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsLong(obj);
+    if (ARROW_PREDICT_FALSE(value == -1)) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value < std::numeric_limits<Int>::min() ||
+                            value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  }
+  return Status::OK();
+}
+
+// Extract C unsigned int from Python object
+template <typename Int, enable_if_t<std::is_unsigned<Int>::value, Int> = 0>
+Status CIntFromPythonImpl(PyObject* obj, Int* out, const std::string& overflow_message) {
+  static_assert(sizeof(Int) <= sizeof(unsigned long long),  // NOLINT
+                "integer type larger than unsigned long long");
+
+  OwnedRef ref;
+  if (!PyLong_Check(obj)) {
+    ARROW_ASSIGN_OR_RAISE(ref, PyObjectToPyInt(obj));
+    obj = ref.obj();
+  }
+
+  if (sizeof(Int) > sizeof(unsigned long)) {  // NOLINT
+    const auto value = PyLong_AsUnsignedLongLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  } else {
+    const auto value = PyLong_AsUnsignedLong(obj);
+    if (ARROW_PREDICT_FALSE(value == static_cast<decltype(value)>(-1))) {
+      RETURN_IF_PYERROR();
+    }
+    if (ARROW_PREDICT_FALSE(value > std::numeric_limits<Int>::max())) {
+      return IntegerOverflowStatus(obj, overflow_message);
+    }
+    *out = static_cast<Int>(value);
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+template <typename Int>
+Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message) {
+  if (PyBool_Check(obj)) {
+    return Status::TypeError("Expected integer, 
got bool"); + } + return CIntFromPythonImpl(obj, out, overflow_message); +} + +template Status CIntFromPython(PyObject*, int8_t*, const std::string&); +template Status CIntFromPython(PyObject*, int16_t*, const std::string&); +template Status CIntFromPython(PyObject*, int32_t*, const std::string&); +template Status CIntFromPython(PyObject*, int64_t*, const std::string&); +template Status CIntFromPython(PyObject*, uint8_t*, const std::string&); +template Status CIntFromPython(PyObject*, uint16_t*, const std::string&); +template Status CIntFromPython(PyObject*, uint32_t*, const std::string&); +template Status CIntFromPython(PyObject*, uint64_t*, const std::string&); + +inline bool MayHaveNaN(PyObject* obj) { + // Some core types can be very quickly type-checked and do not allow NaN values + const int64_t non_nan_tpflags = Py_TPFLAGS_LONG_SUBCLASS | Py_TPFLAGS_LIST_SUBCLASS | + Py_TPFLAGS_TUPLE_SUBCLASS | Py_TPFLAGS_BYTES_SUBCLASS | + Py_TPFLAGS_UNICODE_SUBCLASS | Py_TPFLAGS_DICT_SUBCLASS | + Py_TPFLAGS_BASE_EXC_SUBCLASS | Py_TPFLAGS_TYPE_SUBCLASS; + return !PyType_HasFeature(Py_TYPE(obj), non_nan_tpflags); +} + +bool PyFloat_IsNaN(PyObject* obj) { + return PyFloat_Check(obj) && std::isnan(PyFloat_AsDouble(obj)); +} + +namespace { + +static bool pandas_static_initialized = false; + +// Once initialized, these variables hold borrowed references to Pandas static data. +// We should not use OwnedRef here because Python destructors would be +// called on a finalized interpreter. +static PyObject* pandas_NA = nullptr; +static PyObject* pandas_NaT = nullptr; +static PyObject* pandas_Timedelta = nullptr; +static PyObject* pandas_Timestamp = nullptr; +static PyTypeObject* pandas_NaTType = nullptr; +static PyObject* pandas_DateOffset = nullptr; + +} // namespace + +void InitPandasStaticData() { + // NOTE: This is called with the GIL held. We needn't (and shouldn't, + // to avoid deadlocks) use an additional C++ lock (ARROW-10519). + if (pandas_static_initialized) { + return; + } + + OwnedRef pandas; + + // Import pandas + Status s = ImportModule("pandas", &pandas); + if (!s.ok()) { + return; + } + + // Since ImportModule can release the GIL, another thread could have + // already initialized the static data. 
+  if (pandas_static_initialized) {
+    return;
+  }
+  OwnedRef ref;
+
+  // set NaT sentinel and its type
+  if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
+    pandas_NaT = ref.obj();
+    // Py_TYPE returns a borrowed reference, but we trust that pandas.NaT
+    // will outlive our use of this PyTypeObject*
+    pandas_NaTType = Py_TYPE(ref.obj());
+  }
+
+  // retain a reference to Timedelta
+  if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
+    pandas_Timedelta = ref.obj();
+  }
+
+  // retain a reference to Timestamp
+  if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
+    pandas_Timestamp = ref.obj();
+  }
+
+  // if pandas.NA exists, retain a reference to it
+  if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
+    pandas_NA = ref.obj();
+  }
+
+  // Import DateOffset type
+  if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
+    pandas_DateOffset = ref.obj();
+  }
+
+  pandas_static_initialized = true;
+}
+
+bool PandasObjectIsNull(PyObject* obj) {
+  if (!MayHaveNaN(obj)) {
+    return false;
+  }
+  if (obj == Py_None) {
+    return true;
+  }
+  if (PyFloat_IsNaN(obj) || (pandas_NA && obj == pandas_NA) ||
+      (pandas_NaTType && PyObject_TypeCheck(obj, pandas_NaTType)) ||
+      (internal::PyDecimal_Check(obj) && internal::PyDecimal_ISNAN(obj))) {
+    return true;
+  }
+  return false;
+}
+
+bool IsPandasTimedelta(PyObject* obj) {
+  return pandas_Timedelta && PyObject_IsInstance(obj, pandas_Timedelta);
+}
+
+bool IsPandasTimestamp(PyObject* obj) {
+  return pandas_Timestamp && PyObject_IsInstance(obj, pandas_Timestamp);
+}
+
+PyObject* BorrowPandasDataOffsetType() { return pandas_DateOffset; }
+
+Status InvalidValue(PyObject* obj, const std::string& why) {
+  auto obj_as_str = PyObject_StdStringRepr(obj);
+  return Status::Invalid("Could not convert ", std::move(obj_as_str), " with type ",
+                         Py_TYPE(obj)->tp_name, ": ", why);
+}
+
+Status InvalidType(PyObject* obj, const std::string& why) {
+  auto obj_as_str = PyObject_StdStringRepr(obj);
+  return Status::TypeError("Could not convert ", std::move(obj_as_str), " with type ",
+                           Py_TYPE(obj)->tp_name, ": ", why);
+}
+
+Status UnboxIntegerAsInt64(PyObject* obj, int64_t* out) {
+  if (PyLong_Check(obj)) {
+    int overflow = 0;
+    *out = PyLong_AsLongLongAndOverflow(obj, &overflow);
+    if (overflow) {
+      return Status::Invalid("PyLong is too large to fit int64");
+    }
+  } else if (PyArray_IsScalar(obj, Byte)) {
+    *out = reinterpret_cast<PyByteScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UByte)) {
+    *out = reinterpret_cast<PyUByteScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Short)) {
+    *out = reinterpret_cast<PyShortScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UShort)) {
+    *out = reinterpret_cast<PyUShortScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Int)) {
+    *out = reinterpret_cast<PyIntScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UInt)) {
+    *out = reinterpret_cast<PyUIntScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Long)) {
+    *out = reinterpret_cast<PyLongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, ULong)) {
+    *out = reinterpret_cast<PyULongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, LongLong)) {
+    *out = reinterpret_cast<PyLongLongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, Int64)) {
+    *out = reinterpret_cast<PyInt64ScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, ULongLong)) {
+    *out = reinterpret_cast<PyULongLongScalarObject*>(obj)->obval;
+  } else if (PyArray_IsScalar(obj, UInt64)) {
+    *out = reinterpret_cast<PyUInt64ScalarObject*>(obj)->obval;
+  } else {
+    return Status::Invalid("Integer scalar type not recognized");
+  }
+  return Status::OK();
+}
+
+Status IntegerScalarToDoubleSafe(PyObject* obj, double* out) {
+  int64_t value = 0;
+  
RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value)); + + constexpr int64_t kDoubleMax = 1LL << 53; + constexpr int64_t kDoubleMin = -(1LL << 53); + + if (value < kDoubleMin || value > kDoubleMax) { + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 double precision value"); + } + *out = static_cast(value); + return Status::OK(); +} + +Status IntegerScalarToFloat32Safe(PyObject* obj, float* out) { + int64_t value = 0; + RETURN_NOT_OK(UnboxIntegerAsInt64(obj, &value)); + + constexpr int64_t kFloatMax = 1LL << 24; + constexpr int64_t kFloatMin = -(1LL << 24); + + if (value < kFloatMin || value > kFloatMax) { + return Status::Invalid("Integer value ", value, " is outside of the range exactly", + " representable by a IEEE 754 single precision value"); + } + *out = static_cast(value); + return Status::OK(); +} + +void DebugPrint(PyObject* obj) { + std::string repr = PyObject_StdStringRepr(obj); + PySys_WriteStderr("%s\n", repr.c_str()); +} + +} // namespace internal +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h b/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h index 84455d2..a8e5f80 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/helpers.h @@ -28,9 +28,9 @@ #include +#include "arrow/python/visibility.h" #include "arrow/type.h" #include "arrow/util/macros.h" -#include "arrow/python/visibility.h" namespace arrow { diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc new file mode 100644 index 0000000..3407b32 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/inference.cc @@ -0,0 +1,748 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/inference.h" +#include "arrow/python/numpy_interop.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" + +#include "arrow/python/datetime.h" +#include "arrow/python/decimal.h" +#include "arrow/python/helpers.h" +#include "arrow/python/iterators.h" +#include "arrow/python/numpy_convert.h" + +namespace arrow { +namespace py { +namespace { +// Assigns a tuple to interval_types_tuple containing the nametuple for +// MonthDayNanoIntervalType and if present dateutil's relativedelta and +// pandas DateOffset. +Status ImportPresentIntervalTypes(OwnedRefNoGIL* interval_types_tuple) { + OwnedRef relative_delta_module; + // These are Optional imports so swallow errors. + OwnedRef relative_delta_type; + // Try to import pandas to get types. 
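+  // [Editor's note] The tuple assembled here always contains the
+  // MonthDayNano named-tuple type and, when the imports succeed, pandas'
+  // DateOffset and dateutil's relativedelta; TypeInferrer later passes the
+  // whole tuple to a single PyObject_IsInstance() check, which is why the
+  // two optional imports above deliberately swallow their errors.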
+ internal::InitPandasStaticData(); + if (internal::ImportModule("dateutil.relativedelta", &relative_delta_module).ok()) { + RETURN_NOT_OK(internal::ImportFromModule(relative_delta_module.obj(), "relativedelta", + &relative_delta_type)); + } + + PyObject* date_offset_type = internal::BorrowPandasDataOffsetType(); + interval_types_tuple->reset( + PyTuple_New(1 + (date_offset_type != nullptr ? 1 : 0) + + (relative_delta_type.obj() != nullptr ? 1 : 0))); + RETURN_IF_PYERROR(); + int index = 0; + PyTuple_SetItem(interval_types_tuple->obj(), index++, + internal::NewMonthDayNanoTupleType()); + RETURN_IF_PYERROR(); + if (date_offset_type != nullptr) { + Py_XINCREF(date_offset_type); + PyTuple_SetItem(interval_types_tuple->obj(), index++, date_offset_type); + RETURN_IF_PYERROR(); + } + if (relative_delta_type.obj() != nullptr) { + PyTuple_SetItem(interval_types_tuple->obj(), index++, relative_delta_type.detach()); + RETURN_IF_PYERROR(); + } + return Status::OK(); +} + +} // namespace + +#define _NUMPY_UNIFY_NOOP(DTYPE) \ + case NPY_##DTYPE: \ + return OK; + +#define _NUMPY_UNIFY_PROMOTE(DTYPE) \ + case NPY_##DTYPE: \ + current_type_num_ = dtype; \ + current_dtype_ = descr; \ + return OK; + +#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE) \ + case NPY_##DTYPE: \ + current_type_num_ = NPY_##NEW_TYPE; \ + current_dtype_ = PyArray_DescrFromType(current_type_num_); \ + return OK; + +// Form a consensus NumPy dtype to use for Arrow conversion for a +// collection of dtype objects observed one at a time +class NumPyDtypeUnifier { + public: + enum Action { OK, INVALID }; + + NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {} + + Status InvalidMix(int new_dtype) { + return Status::Invalid("Cannot mix NumPy dtypes ", + GetNumPyTypeName(current_type_num_), " and ", + GetNumPyTypeName(new_dtype)); + } + + int Observe_BOOL(PyArray_Descr* descr, int dtype) { return INVALID; } + + int Observe_INT8(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_PROMOTE(INT16); + _NUMPY_UNIFY_PROMOTE(INT32); + _NUMPY_UNIFY_PROMOTE(INT64); + _NUMPY_UNIFY_PROMOTE(FLOAT32); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_INT16(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(INT8); + _NUMPY_UNIFY_PROMOTE(INT32); + _NUMPY_UNIFY_PROMOTE(INT64); + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_PROMOTE(FLOAT32); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_INT32(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(INT8); + _NUMPY_UNIFY_NOOP(INT16); + _NUMPY_UNIFY_PROMOTE(INT32); + _NUMPY_UNIFY_PROMOTE(INT64); + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_NOOP(UINT16); + _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_INT64(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(INT8); + _NUMPY_UNIFY_NOOP(INT16); + _NUMPY_UNIFY_NOOP(INT32); + _NUMPY_UNIFY_NOOP(INT64); + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_NOOP(UINT16); + _NUMPY_UNIFY_NOOP(UINT32); + _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_UINT8(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_PROMOTE(UINT16); + _NUMPY_UNIFY_PROMOTE(UINT32); + _NUMPY_UNIFY_PROMOTE(UINT64); + _NUMPY_UNIFY_PROMOTE(FLOAT32); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_UINT16(PyArray_Descr* descr, int 
dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_PROMOTE(UINT32); + _NUMPY_UNIFY_PROMOTE(UINT64); + _NUMPY_UNIFY_PROMOTE(FLOAT32); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_UINT32(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_NOOP(UINT16); + _NUMPY_UNIFY_PROMOTE(UINT64); + _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_UINT64(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_NOOP(UINT16); + _NUMPY_UNIFY_NOOP(UINT32); + _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_FLOAT16(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_PROMOTE(FLOAT32); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_FLOAT32(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(INT8); + _NUMPY_UNIFY_NOOP(INT16); + _NUMPY_UNIFY_NOOP(INT32); + _NUMPY_UNIFY_NOOP(INT64); + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_NOOP(UINT16); + _NUMPY_UNIFY_NOOP(UINT32); + _NUMPY_UNIFY_NOOP(UINT64); + _NUMPY_UNIFY_PROMOTE(FLOAT64); + default: + return INVALID; + } + } + + int Observe_FLOAT64(PyArray_Descr* descr, int dtype) { + switch (dtype) { + _NUMPY_UNIFY_NOOP(INT8); + _NUMPY_UNIFY_NOOP(INT16); + _NUMPY_UNIFY_NOOP(INT32); + _NUMPY_UNIFY_NOOP(INT64); + _NUMPY_UNIFY_NOOP(UINT8); + _NUMPY_UNIFY_NOOP(UINT16); + _NUMPY_UNIFY_NOOP(UINT32); + _NUMPY_UNIFY_NOOP(UINT64); + default: + return INVALID; + } + } + + int Observe_DATETIME(PyArray_Descr* dtype_obj) { + // TODO: check that units are all the same + return OK; + } + + Status Observe(PyArray_Descr* descr) { + int dtype = fix_numpy_type_num(descr->type_num); + + if (current_type_num_ == -1) { + current_dtype_ = descr; + current_type_num_ = dtype; + return Status::OK(); + } else if (current_type_num_ == dtype) { + return Status::OK(); + } + +#define OBSERVE_CASE(DTYPE) \ + case NPY_##DTYPE: \ + action = Observe_##DTYPE(descr, dtype); \ + break; + + int action = OK; + switch (current_type_num_) { + OBSERVE_CASE(BOOL); + OBSERVE_CASE(INT8); + OBSERVE_CASE(INT16); + OBSERVE_CASE(INT32); + OBSERVE_CASE(INT64); + OBSERVE_CASE(UINT8); + OBSERVE_CASE(UINT16); + OBSERVE_CASE(UINT32); + OBSERVE_CASE(UINT64); + OBSERVE_CASE(FLOAT16); + OBSERVE_CASE(FLOAT32); + OBSERVE_CASE(FLOAT64); + case NPY_DATETIME: + action = Observe_DATETIME(descr); + break; + default: + return Status::NotImplemented("Unsupported numpy type ", GetNumPyTypeName(dtype)); + } + + if (action == INVALID) { + return InvalidMix(dtype); + } + return Status::OK(); + } + + bool dtype_was_observed() const { return current_type_num_ != -1; } + + PyArray_Descr* current_dtype() const { return current_dtype_; } + + int current_type_num() const { return current_type_num_; } + + private: + int current_type_num_; + PyArray_Descr* current_dtype_; +}; + +class TypeInferrer { + // A type inference visitor for Python values + public: + // \param validate_interval the number of elements to observe before checking + // whether the data is mixed type or has other problems. 
This helps avoid + // excess computation for each element while also making sure we "bail out" + // early with long sequences that may have problems up front + // \param make_unions permit mixed-type data by creating union types (not yet + // implemented) + explicit TypeInferrer(bool pandas_null_sentinels = false, + int64_t validate_interval = 100, bool make_unions = false) + : pandas_null_sentinels_(pandas_null_sentinels), + validate_interval_(validate_interval), + make_unions_(make_unions), + total_count_(0), + none_count_(0), + bool_count_(0), + int_count_(0), + date_count_(0), + time_count_(0), + timestamp_micro_count_(0), + duration_count_(0), + float_count_(0), + binary_count_(0), + unicode_count_(0), + decimal_count_(0), + list_count_(0), + struct_count_(0), + arrow_scalar_count_(0), + numpy_dtype_count_(0), + interval_count_(0), + max_decimal_metadata_(std::numeric_limits::min(), + std::numeric_limits::min()), + decimal_type_() { + ARROW_CHECK_OK(internal::ImportDecimalType(&decimal_type_)); + ARROW_CHECK_OK(ImportPresentIntervalTypes(&interval_types_)); + } + + /// \param[in] obj a Python object in the sequence + /// \param[out] keep_going if sufficient information has been gathered to + /// attempt to begin converting the sequence, *keep_going will be set to true + /// to signal to the calling visitor loop to terminate + Status Visit(PyObject* obj, bool* keep_going) { + ++total_count_; + + if (obj == Py_None || (pandas_null_sentinels_ && internal::PandasObjectIsNull(obj))) { + ++none_count_; + } else if (PyBool_Check(obj)) { + ++bool_count_; + *keep_going = make_unions_; + } else if (PyFloat_Check(obj)) { + ++float_count_; + *keep_going = make_unions_; + } else if (internal::IsPyInteger(obj)) { + ++int_count_; + } else if (PyDateTime_Check(obj)) { + // infer timezone from the first encountered datetime object + if (!timestamp_micro_count_) { + OwnedRef tzinfo(PyObject_GetAttrString(obj, "tzinfo")); + if (tzinfo.obj() != nullptr && tzinfo.obj() != Py_None) { + ARROW_ASSIGN_OR_RAISE(timezone_, internal::TzinfoToString(tzinfo.obj())); + } + } + ++timestamp_micro_count_; + *keep_going = make_unions_; + } else if (PyDelta_Check(obj)) { + ++duration_count_; + *keep_going = make_unions_; + } else if (PyDate_Check(obj)) { + ++date_count_; + *keep_going = make_unions_; + } else if (PyTime_Check(obj)) { + ++time_count_; + *keep_going = make_unions_; + } else if (internal::IsPyBinary(obj)) { + ++binary_count_; + *keep_going = make_unions_; + } else if (PyUnicode_Check(obj)) { + ++unicode_count_; + *keep_going = make_unions_; + } else if (arrow::py::is_scalar(obj)) { + RETURN_NOT_OK(VisitArrowScalar(obj, keep_going)); + } else if (PyArray_CheckAnyScalarExact(obj)) { + RETURN_NOT_OK(VisitDType(PyArray_DescrFromScalar(obj), keep_going)); + } else if (PySet_Check(obj) || (Py_TYPE(obj) == &PyDictValues_Type)) { + RETURN_NOT_OK(VisitSet(obj, keep_going)); + } else if (PyArray_Check(obj)) { + RETURN_NOT_OK(VisitNdarray(obj, keep_going)); + } else if (PyDict_Check(obj)) { + RETURN_NOT_OK(VisitDict(obj)); + } else if (PyList_Check(obj) || + (PyTuple_Check(obj) && + !PyObject_IsInstance(obj, PyTuple_GetItem(interval_types_.obj(), 0)))) { + RETURN_NOT_OK(VisitList(obj, keep_going)); + } else if (PyObject_IsInstance(obj, decimal_type_.obj())) { + RETURN_NOT_OK(max_decimal_metadata_.Update(obj)); + ++decimal_count_; + } else if (PyObject_IsInstance(obj, interval_types_.obj())) { + ++interval_count_; + } else { + return internal::InvalidValue(obj, + "did not recognize Python value type when inferring " + 
"an Arrow data type"); + } + + if (total_count_ % validate_interval_ == 0) { + RETURN_NOT_OK(Validate()); + } + + return Status::OK(); + } + + // Infer value type from a sequence of values + Status VisitSequence(PyObject* obj, PyObject* mask = nullptr) { + if (mask == nullptr || mask == Py_None) { + return internal::VisitSequence( + obj, /*offset=*/0, + [this](PyObject* value, bool* keep_going) { return Visit(value, keep_going); }); + } else { + return internal::VisitSequenceMasked( + obj, mask, /*offset=*/0, + [this](PyObject* value, uint8_t masked, bool* keep_going) { + if (!masked) { + return Visit(value, keep_going); + } else { + return Status::OK(); + } + }); + } + } + + // Infer value type from a sequence of values + Status VisitIterable(PyObject* obj) { + return internal::VisitIterable(obj, [this](PyObject* value, bool* keep_going) { + return Visit(value, keep_going); + }); + } + + Status GetType(std::shared_ptr* out) { + // TODO(wesm): handling forming unions + if (make_unions_) { + return Status::NotImplemented("Creating union types not yet supported"); + } + + RETURN_NOT_OK(Validate()); + + if (arrow_scalar_count_ > 0 && arrow_scalar_count_ + none_count_ != total_count_) { + return Status::Invalid( + "pyarrow scalars cannot be mixed " + "with other Python scalar values currently"); + } + + if (numpy_dtype_count_ > 0) { + // All NumPy scalars and Nones/nulls + if (numpy_dtype_count_ + none_count_ == total_count_) { + std::shared_ptr type; + RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type)); + *out = type; + return Status::OK(); + } + + // The "bad path": data contains a mix of NumPy scalars and + // other kinds of scalars. Note this can happen innocuously + // because numpy.nan is not a NumPy scalar (it's a built-in + // PyFloat) + + // TODO(ARROW-5564): Merge together type unification so this + // hack is not necessary + switch (numpy_unifier_.current_type_num()) { + case NPY_BOOL: + bool_count_ += numpy_dtype_count_; + break; + case NPY_INT8: + case NPY_INT16: + case NPY_INT32: + case NPY_INT64: + case NPY_UINT8: + case NPY_UINT16: + case NPY_UINT32: + case NPY_UINT64: + int_count_ += numpy_dtype_count_; + break; + case NPY_FLOAT32: + case NPY_FLOAT64: + float_count_ += numpy_dtype_count_; + break; + case NPY_DATETIME: + return Status::Invalid( + "numpy.datetime64 scalars cannot be mixed " + "with other Python scalar values currently"); + } + } + + if (list_count_) { + std::shared_ptr value_type; + RETURN_NOT_OK(list_inferrer_->GetType(&value_type)); + *out = list(value_type); + } else if (struct_count_) { + RETURN_NOT_OK(GetStructType(out)); + } else if (decimal_count_) { + if (max_decimal_metadata_.precision() > Decimal128Type::kMaxPrecision) { + // the default constructor does not validate the precision and scale + ARROW_ASSIGN_OR_RAISE(*out, + Decimal256Type::Make(max_decimal_metadata_.precision(), + max_decimal_metadata_.scale())); + } else { + ARROW_ASSIGN_OR_RAISE(*out, + Decimal128Type::Make(max_decimal_metadata_.precision(), + max_decimal_metadata_.scale())); + } + } else if (float_count_) { + // Prioritize floats before integers + *out = float64(); + } else if (int_count_) { + *out = int64(); + } else if (date_count_) { + *out = date32(); + } else if (time_count_) { + *out = time64(TimeUnit::MICRO); + } else if (timestamp_micro_count_) { + *out = timestamp(TimeUnit::MICRO, timezone_); + } else if (duration_count_) { + *out = duration(TimeUnit::MICRO); + } else if (bool_count_) { + *out = boolean(); + } else if (binary_count_) { + *out = binary(); 
+ } else if (unicode_count_) { + *out = utf8(); + } else if (interval_count_) { + *out = month_day_nano_interval(); + } else if (arrow_scalar_count_) { + *out = scalar_type_; + } else { + *out = null(); + } + return Status::OK(); + } + + int64_t total_count() const { return total_count_; } + + protected: + Status Validate() const { + if (list_count_ > 0) { + if (list_count_ + none_count_ != total_count_) { + return Status::Invalid("cannot mix list and non-list, non-null values"); + } + RETURN_NOT_OK(list_inferrer_->Validate()); + } else if (struct_count_ > 0) { + if (struct_count_ + none_count_ != total_count_) { + return Status::Invalid("cannot mix struct and non-struct, non-null values"); + } + for (const auto& it : struct_inferrers_) { + RETURN_NOT_OK(it.second.Validate()); + } + } + return Status::OK(); + } + + Status VisitArrowScalar(PyObject* obj, bool* keep_going /* unused */) { + ARROW_ASSIGN_OR_RAISE(auto scalar, arrow::py::unwrap_scalar(obj)); + // Check that all the scalar types for the sequence are the same + if (arrow_scalar_count_ > 0 && *scalar->type != *scalar_type_) { + return internal::InvalidValue(obj, "cannot mix scalars with different types"); + } + scalar_type_ = scalar->type; + ++arrow_scalar_count_; + return Status::OK(); + } + + Status VisitDType(PyArray_Descr* dtype, bool* keep_going) { + // Continue visiting dtypes for now. + // TODO(wesm): devise approach for unions + ++numpy_dtype_count_; + *keep_going = true; + return numpy_unifier_.Observe(dtype); + } + + Status VisitList(PyObject* obj, bool* keep_going /* unused */) { + if (!list_inferrer_) { + list_inferrer_.reset( + new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); + } + ++list_count_; + return list_inferrer_->VisitSequence(obj); + } + + Status VisitSet(PyObject* obj, bool* keep_going /* unused */) { + if (!list_inferrer_) { + list_inferrer_.reset( + new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); + } + ++list_count_; + return list_inferrer_->VisitIterable(obj); + } + + Status VisitNdarray(PyObject* obj, bool* keep_going) { + PyArray_Descr* dtype = PyArray_DESCR(reinterpret_cast(obj)); + if (dtype->type_num == NPY_OBJECT) { + return VisitList(obj, keep_going); + } + // Not an object array: infer child Arrow type from dtype + if (!list_inferrer_) { + list_inferrer_.reset( + new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_)); + } + ++list_count_; + + // XXX(wesm): In ARROW-4324 I added accounting to check whether + // all of the non-null values have NumPy dtypes, but the + // total_count not not being properly incremented here + ++(*list_inferrer_).total_count_; + return list_inferrer_->VisitDType(dtype, keep_going); + } + + Status VisitDict(PyObject* obj) { + PyObject* key_obj; + PyObject* value_obj; + Py_ssize_t pos = 0; + + while (PyDict_Next(obj, &pos, &key_obj, &value_obj)) { + std::string key; + if (PyUnicode_Check(key_obj)) { + RETURN_NOT_OK(internal::PyUnicode_AsStdString(key_obj, &key)); + } else if (PyBytes_Check(key_obj)) { + key = internal::PyBytes_AsStdString(key_obj); + } else { + return Status::TypeError("Expected dict key of type str or bytes, got '", + Py_TYPE(key_obj)->tp_name, "'"); + } + // Get or create visitor for this key + auto it = struct_inferrers_.find(key); + if (it == struct_inferrers_.end()) { + it = struct_inferrers_ + .insert( + std::make_pair(key, TypeInferrer(pandas_null_sentinels_, + validate_interval_, make_unions_))) + .first; + } + TypeInferrer* visitor = &it->second; + + // We ignore 
termination signals from child visitors for now
+      //
+      // TODO(wesm): keep track of whether type inference has terminated for
+      // the child visitors to avoid doing unneeded work
+      bool keep_going = true;
+      RETURN_NOT_OK(visitor->Visit(value_obj, &keep_going));
+    }
+
+    // We do not terminate visiting dicts since we want the union of all
+    // observed keys
+    ++struct_count_;
+    return Status::OK();
+  }
+
+  Status GetStructType(std::shared_ptr<DataType>* out) {
+    std::vector<std::shared_ptr<Field>> fields;
+    for (auto&& it : struct_inferrers_) {
+      std::shared_ptr<DataType> field_type;
+      RETURN_NOT_OK(it.second.GetType(&field_type));
+      fields.emplace_back(field(it.first, field_type));
+    }
+    *out = struct_(fields);
+    return Status::OK();
+  }
+
+ private:
+  bool pandas_null_sentinels_;
+  int64_t validate_interval_;
+  bool make_unions_;
+  int64_t total_count_;
+  int64_t none_count_;
+  int64_t bool_count_;
+  int64_t int_count_;
+  int64_t date_count_;
+  int64_t time_count_;
+  int64_t timestamp_micro_count_;
+  std::string timezone_;
+  int64_t duration_count_;
+  int64_t float_count_;
+  int64_t binary_count_;
+  int64_t unicode_count_;
+  int64_t decimal_count_;
+  int64_t list_count_;
+  int64_t struct_count_;
+  int64_t arrow_scalar_count_;
+  int64_t numpy_dtype_count_;
+  int64_t interval_count_;
+  std::unique_ptr<TypeInferrer> list_inferrer_;
+  std::map<std::string, TypeInferrer> struct_inferrers_;
+  std::shared_ptr<DataType> scalar_type_;
+
+  // If we observe a strongly-typed value in e.g. a NumPy array, we can store
+  // it here to skip the type counting logic above
+  NumPyDtypeUnifier numpy_unifier_;
+
+  internal::DecimalMetadata max_decimal_metadata_;
+
+  OwnedRefNoGIL decimal_type_;
+  OwnedRefNoGIL interval_types_;
+};
+
+// Non-exhaustive type inference
+Result<std::shared_ptr<DataType>> InferArrowType(PyObject* obj, PyObject* mask,
+                                                 bool pandas_null_sentinels) {
+  if (pandas_null_sentinels) {
+    // ARROW-842: If pandas is not installed then null checks will be less
+    // comprehensive, but that is okay.
+    internal::InitPandasStaticData();
+  }
+
+  std::shared_ptr<DataType> out_type;
+  TypeInferrer inferrer(pandas_null_sentinels);
+  RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
+  RETURN_NOT_OK(inferrer.GetType(&out_type));
+  if (out_type == nullptr) {
+    return Status::TypeError("Unable to determine data type");
+  } else {
+    return std::move(out_type);
+  }
+}
+
+ARROW_PYTHON_EXPORT
+bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); }
+
+ARROW_PYTHON_EXPORT
+bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); }
+
+ARROW_PYTHON_EXPORT
+bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); }
+
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h b/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h
index 1d6516b..983384d 100644
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/inference.h
@@ -24,9 +24,9 @@
 
 #include
 
+#include "arrow/python/visibility.h"
 #include "arrow/type.h"
 #include "arrow/util/macros.h"
-#include "arrow/python/visibility.h"
 
 #include "common.h"
 
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/init.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/init.cc
new file mode 100644
index 0000000..dba293b
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/init.cc
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Trigger the array import (inversion of NO_IMPORT_ARRAY) +#define NUMPY_IMPORT_ARRAY + +#include "arrow/python/init.h" +#include "arrow/python/numpy_interop.h" + +int arrow_init_numpy() { return arrow::py::import_numpy(); } diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/io.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/io.cc new file mode 100644 index 0000000..43f8297 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/io.cc @@ -0,0 +1,384 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "io.h" + +#include +#include +#include +#include +#include + +#include "arrow/io/memory.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "arrow/util/logging.h" + +#include "arrow/python/common.h" +#include "arrow/python/pyarrow.h" + +namespace arrow { + +using arrow::io::TransformInputStream; + +namespace py { + +// ---------------------------------------------------------------------- +// Python file + +// A common interface to a Python file-like object. 
Must acquire GIL before
+// calling any methods
+class PythonFile {
+ public:
+  explicit PythonFile(PyObject* file) : file_(file), checked_read_buffer_(false) {
+    Py_INCREF(file);
+  }
+
+  Status CheckClosed() const {
+    if (!file_) {
+      return Status::Invalid("operation on closed Python file");
+    }
+    return Status::OK();
+  }
+
+  Status Close() {
+    if (file_) {
+      PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "close", "()");
+      Py_XDECREF(result);
+      file_.reset();
+      PY_RETURN_IF_ERROR(StatusCode::IOError);
+    }
+    return Status::OK();
+  }
+
+  Status Abort() {
+    file_.reset();
+    return Status::OK();
+  }
+
+  bool closed() const {
+    if (!file_) {
+      return true;
+    }
+    PyObject* result = PyObject_GetAttrString(file_.obj(), "closed");
+    if (result == NULL) {
+      // Can't propagate the error, so write it out and return an arbitrary value
+      PyErr_WriteUnraisable(NULL);
+      return true;
+    }
+    int ret = PyObject_IsTrue(result);
+    Py_XDECREF(result);
+    if (ret < 0) {
+      PyErr_WriteUnraisable(NULL);
+      return true;
+    }
+    return ret != 0;
+  }
+
+  Status Seek(int64_t position, int whence) {
+    RETURN_NOT_OK(CheckClosed());
+
+    // whence: 0 for relative to start of file, 2 for end of file
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "seek", "(ni)",
+                                               static_cast<Py_ssize_t>(position), whence);
+    Py_XDECREF(result);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return Status::OK();
+  }
+
+  Status Read(int64_t nbytes, PyObject** out) {
+    RETURN_NOT_OK(CheckClosed());
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read", "(n)",
+                                               static_cast<Py_ssize_t>(nbytes));
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    *out = result;
+    return Status::OK();
+  }
+
+  Status ReadBuffer(int64_t nbytes, PyObject** out) {
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "read_buffer", "(n)",
+                                               static_cast<Py_ssize_t>(nbytes));
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    *out = result;
+    return Status::OK();
+  }
+
+  Status Write(const void* data, int64_t nbytes) {
+    RETURN_NOT_OK(CheckClosed());
+
+    // Since the data isn't owned, we have to make a copy
+    PyObject* py_data =
+        PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), nbytes);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data);
+    Py_XDECREF(py_data);
+    Py_XDECREF(result);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return Status::OK();
+  }
+
+  Status Write(const std::shared_ptr<Buffer>& buffer) {
+    RETURN_NOT_OK(CheckClosed());
+
+    PyObject* py_data = wrap_buffer(buffer);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "write", "(O)", py_data);
+    Py_XDECREF(py_data);
+    Py_XDECREF(result);
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return Status::OK();
+  }
+
+  Result<int64_t> Tell() {
+    RETURN_NOT_OK(CheckClosed());
+
+    PyObject* result = cpp_PyObject_CallMethod(file_.obj(), "tell", "()");
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+
+    int64_t position = PyLong_AsLongLong(result);
+    Py_DECREF(result);
+
+    // PyLong_AsLongLong can raise OverflowError
+    PY_RETURN_IF_ERROR(StatusCode::IOError);
+    return position;
+  }
+
+  std::mutex& lock() { return lock_; }
+
+  bool HasReadBuffer() {
+    if (!checked_read_buffer_) {  // we don't want to check this each time
+      has_read_buffer_ = PyObject_HasAttrString(file_.obj(), "read_buffer") == 1;
+      checked_read_buffer_ = true;
+    }
+    return has_read_buffer_;
+  }
+
+ private:
+  std::mutex lock_;
+  OwnedRefNoGIL file_;
+  bool has_read_buffer_;
+  bool checked_read_buffer_;
+};
+
+// 
---------------------------------------------------------------------- +// Seekable input stream + +PyReadableFile::PyReadableFile(PyObject* file) { file_.reset(new PythonFile(file)); } + +// The destructor does not close the underlying Python file object, as +// there may be multiple references to it. Instead let the Python +// destructor do its job. +PyReadableFile::~PyReadableFile() {} + +Status PyReadableFile::Abort() { + return SafeCallIntoPython([this]() { return file_->Abort(); }); +} + +Status PyReadableFile::Close() { + return SafeCallIntoPython([this]() { return file_->Close(); }); +} + +bool PyReadableFile::closed() const { + bool res; + Status st = SafeCallIntoPython([this, &res]() { + res = file_->closed(); + return Status::OK(); + }); + return res; +} + +Status PyReadableFile::Seek(int64_t position) { + return SafeCallIntoPython([=] { return file_->Seek(position, 0); }); +} + +Result PyReadableFile::Tell() const { + return SafeCallIntoPython([=]() -> Result { return file_->Tell(); }); +} + +Result PyReadableFile::Read(int64_t nbytes, void* out) { + return SafeCallIntoPython([=]() -> Result { + OwnedRef bytes; + RETURN_NOT_OK(file_->Read(nbytes, bytes.ref())); + PyObject* bytes_obj = bytes.obj(); + DCHECK(bytes_obj != NULL); + + Py_buffer py_buf; + if (!PyObject_GetBuffer(bytes_obj, &py_buf, PyBUF_ANY_CONTIGUOUS)) { + const uint8_t* data = reinterpret_cast(py_buf.buf); + std::memcpy(out, data, py_buf.len); + int64_t len = py_buf.len; + PyBuffer_Release(&py_buf); + return len; + } else { + return Status::TypeError( + "Python file read() should have returned a bytes object or an object " + "supporting the buffer protocol, got '", + Py_TYPE(bytes_obj)->tp_name, "' (did you open the file in binary mode?)"); + } + }); +} + +Result> PyReadableFile::Read(int64_t nbytes) { + return SafeCallIntoPython([=]() -> Result> { + OwnedRef buffer_obj; + if (file_->HasReadBuffer()) { + RETURN_NOT_OK(file_->ReadBuffer(nbytes, buffer_obj.ref())); + } else { + RETURN_NOT_OK(file_->Read(nbytes, buffer_obj.ref())); + } + DCHECK(buffer_obj.obj() != NULL); + + return PyBuffer::FromPyObject(buffer_obj.obj()); + }); +} + +Result PyReadableFile::ReadAt(int64_t position, int64_t nbytes, void* out) { + std::lock_guard guard(file_->lock()); + return SafeCallIntoPython([=]() -> Result { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes, out); + }); +} + +Result> PyReadableFile::ReadAt(int64_t position, int64_t nbytes) { + std::lock_guard guard(file_->lock()); + return SafeCallIntoPython([=]() -> Result> { + RETURN_NOT_OK(Seek(position)); + return Read(nbytes); + }); +} + +Result PyReadableFile::GetSize() { + return SafeCallIntoPython([=]() -> Result { + ARROW_ASSIGN_OR_RAISE(int64_t current_position, file_->Tell()); + RETURN_NOT_OK(file_->Seek(0, 2)); + + ARROW_ASSIGN_OR_RAISE(int64_t file_size, file_->Tell()); + // Restore previous file position + RETURN_NOT_OK(file_->Seek(current_position, 0)); + + return file_size; + }); +} + +// ---------------------------------------------------------------------- +// Output stream + +PyOutputStream::PyOutputStream(PyObject* file) : position_(0) { + file_.reset(new PythonFile(file)); +} + +// The destructor does not close the underlying Python file object, as +// there may be multiple references to it. Instead let the Python +// destructor do its job. 
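+//
+// Usage sketch for the wrappers in this file (a minimal sketch, assuming
+// `py_file` is an open binary-mode Python file object and the interpreter is
+// initialized):
+//
+//   auto file = std::make_shared<PyReadableFile>(py_file);
+//   ARROW_ASSIGN_OR_RAISE(int64_t size, file->GetSize());
+//   ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Buffer> buf, file->Read(size));
+//
+// Read() goes through the object's read_buffer() method when one exists,
+// avoiding an extra copy of the returned bytes.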
+PyOutputStream::~PyOutputStream() {} + +Status PyOutputStream::Abort() { + return SafeCallIntoPython([=]() { return file_->Abort(); }); +} + +Status PyOutputStream::Close() { + return SafeCallIntoPython([=]() { return file_->Close(); }); +} + +bool PyOutputStream::closed() const { + bool res; + Status st = SafeCallIntoPython([this, &res]() { + res = file_->closed(); + return Status::OK(); + }); + return res; +} + +Result PyOutputStream::Tell() const { return position_; } + +Status PyOutputStream::Write(const void* data, int64_t nbytes) { + return SafeCallIntoPython([=]() { + position_ += nbytes; + return file_->Write(data, nbytes); + }); +} + +Status PyOutputStream::Write(const std::shared_ptr& buffer) { + return SafeCallIntoPython([=]() { + position_ += buffer->size(); + return file_->Write(buffer); + }); +} + +// ---------------------------------------------------------------------- +// Foreign buffer + +Status PyForeignBuffer::Make(const uint8_t* data, int64_t size, PyObject* base, + std::shared_ptr* out) { + PyForeignBuffer* buf = new PyForeignBuffer(data, size, base); + if (buf == NULL) { + return Status::OutOfMemory("could not allocate foreign buffer object"); + } else { + *out = std::shared_ptr(buf); + return Status::OK(); + } +} + +// ---------------------------------------------------------------------- +// TransformInputStream::TransformFunc wrapper + +struct TransformFunctionWrapper { + TransformFunctionWrapper(TransformCallback cb, PyObject* arg) + : cb_(std::move(cb)), arg_(std::make_shared(arg)) { + Py_INCREF(arg); + } + + Result> operator()(const std::shared_ptr& src) { + return SafeCallIntoPython([=]() -> Result> { + std::shared_ptr dest; + cb_(arg_->obj(), src, &dest); + RETURN_NOT_OK(CheckPyError()); + return dest; + }); + } + + protected: + // Need to wrap OwnedRefNoGIL because std::function needs the callable + // to be copy-constructible... + TransformCallback cb_; + std::shared_ptr arg_; +}; + +std::shared_ptr<::arrow::io::InputStream> MakeTransformInputStream( + std::shared_ptr<::arrow::io::InputStream> wrapped, TransformInputStreamVTable vtable, + PyObject* handler) { + TransformInputStream::TransformFunc transform( + TransformFunctionWrapper{std::move(vtable.transform), handler}); + return std::make_shared(std::move(wrapped), std::move(transform)); +} + +std::shared_ptr MakeStreamTransformFunc(TransformInputStreamVTable vtable, + PyObject* handler) { + TransformInputStream::TransformFunc transform( + TransformFunctionWrapper{std::move(vtable.transform), handler}); + StreamWrapFunc func = [transform](std::shared_ptr<::arrow::io::InputStream> wrapped) { + return std::make_shared(wrapped, transform); + }; + return std::make_shared(func); +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc new file mode 100644 index 0000000..9348182 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.cc @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "ipc.h"
+
+#include <memory>
+
+#include "arrow/python/pyarrow.h"
+
+namespace arrow {
+namespace py {
+
+PyRecordBatchReader::PyRecordBatchReader() {}
+
+Status PyRecordBatchReader::Init(std::shared_ptr<Schema> schema, PyObject* iterable) {
+  schema_ = std::move(schema);
+
+  iterator_.reset(PyObject_GetIter(iterable));
+  return CheckPyError();
+}
+
+std::shared_ptr<Schema> PyRecordBatchReader::schema() const { return schema_; }
+
+Status PyRecordBatchReader::ReadNext(std::shared_ptr<RecordBatch>* batch) {
+  PyAcquireGIL lock;
+
+  if (!iterator_) {
+    // End of stream
+    batch->reset();
+    return Status::OK();
+  }
+
+  OwnedRef py_batch(PyIter_Next(iterator_.obj()));
+  if (!py_batch) {
+    RETURN_IF_PYERROR();
+    // End of stream
+    batch->reset();
+    iterator_.reset();
+    return Status::OK();
+  }
+
+  return unwrap_batch(py_batch.obj()).Value(batch);
+}
+
+Result<std::shared_ptr<RecordBatchReader>> PyRecordBatchReader::Make(
+    std::shared_ptr<Schema> schema, PyObject* iterable) {
+  auto reader = std::shared_ptr<PyRecordBatchReader>(new PyRecordBatchReader());
+  RETURN_NOT_OK(reader->Init(std::move(schema), iterable));
+  return reader;
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h b/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h
index 57eabfe..92232ed 100644
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/ipc.h
@@ -19,11 +19,11 @@
 
 #include
 
+#include "arrow/python/common.h"
+#include "arrow/python/visibility.h"
 #include "arrow/record_batch.h"
 #include "arrow/result.h"
 #include "arrow/util/macros.h"
-#include "arrow/python/common.h"
-#include "arrow/python/visibility.h"
 
 namespace arrow {
 namespace py {
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc
new file mode 100644
index 0000000..4970680
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.cc
@@ -0,0 +1,562 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
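+
+// A quick orientation: this file maps NumPy dtype descriptors and ndarrays
+// onto Arrow types, tensors and sparse tensors. A minimal sketch of the dtype
+// mapping defined below (assuming the NumPy C API has been imported):
+//
+//   std::shared_ptr<arrow::DataType> type;
+//   PyArray_Descr* descr = PyArray_DescrFromType(NPY_FLOAT32);
+//   ARROW_CHECK_OK(arrow::py::NumPyDtypeToArrow(descr, &type));  // float32()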
+ +#include "arrow/python/numpy_interop.h" + +#include "arrow/python/numpy_convert.h" + +#include +#include +#include +#include + +#include "arrow/buffer.h" +#include "arrow/sparse_tensor.h" +#include "arrow/tensor.h" +#include "arrow/type.h" +#include "arrow/util/logging.h" + +#include "arrow/python/common.h" +#include "arrow/python/pyarrow.h" +#include "arrow/python/type_traits.h" + +namespace arrow { +namespace py { + +NumPyBuffer::NumPyBuffer(PyObject* ao) : Buffer(nullptr, 0) { + PyAcquireGIL lock; + arr_ = ao; + Py_INCREF(ao); + + if (PyArray_Check(ao)) { + PyArrayObject* ndarray = reinterpret_cast(ao); + auto ptr = reinterpret_cast(PyArray_DATA(ndarray)); + data_ = const_cast(ptr); + size_ = PyArray_SIZE(ndarray) * PyArray_DESCR(ndarray)->elsize; + capacity_ = size_; + is_mutable_ = !!(PyArray_FLAGS(ndarray) & NPY_ARRAY_WRITEABLE); + } +} + +NumPyBuffer::~NumPyBuffer() { + PyAcquireGIL lock; + Py_XDECREF(arr_); +} + +#define TO_ARROW_TYPE_CASE(NPY_NAME, FACTORY) \ + case NPY_##NPY_NAME: \ + *out = FACTORY(); \ + break; + +namespace { + +Status GetTensorType(PyObject* dtype, std::shared_ptr* out) { + if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { + return Status::TypeError("Did not pass numpy.dtype object"); + } + PyArray_Descr* descr = reinterpret_cast(dtype); + int type_num = fix_numpy_type_num(descr->type_num); + + switch (type_num) { + TO_ARROW_TYPE_CASE(BOOL, uint8); + TO_ARROW_TYPE_CASE(INT8, int8); + TO_ARROW_TYPE_CASE(INT16, int16); + TO_ARROW_TYPE_CASE(INT32, int32); + TO_ARROW_TYPE_CASE(INT64, int64); + TO_ARROW_TYPE_CASE(UINT8, uint8); + TO_ARROW_TYPE_CASE(UINT16, uint16); + TO_ARROW_TYPE_CASE(UINT32, uint32); + TO_ARROW_TYPE_CASE(UINT64, uint64); + TO_ARROW_TYPE_CASE(FLOAT16, float16); + TO_ARROW_TYPE_CASE(FLOAT32, float32); + TO_ARROW_TYPE_CASE(FLOAT64, float64); + default: { + return Status::NotImplemented("Unsupported numpy type ", descr->type_num); + } + } + return Status::OK(); +} + +Status GetNumPyType(const DataType& type, int* type_num) { +#define NUMPY_TYPE_CASE(ARROW_NAME, NPY_NAME) \ + case Type::ARROW_NAME: \ + *type_num = NPY_##NPY_NAME; \ + break; + + switch (type.id()) { + NUMPY_TYPE_CASE(UINT8, UINT8); + NUMPY_TYPE_CASE(INT8, INT8); + NUMPY_TYPE_CASE(UINT16, UINT16); + NUMPY_TYPE_CASE(INT16, INT16); + NUMPY_TYPE_CASE(UINT32, UINT32); + NUMPY_TYPE_CASE(INT32, INT32); + NUMPY_TYPE_CASE(UINT64, UINT64); + NUMPY_TYPE_CASE(INT64, INT64); + NUMPY_TYPE_CASE(HALF_FLOAT, FLOAT16); + NUMPY_TYPE_CASE(FLOAT, FLOAT32); + NUMPY_TYPE_CASE(DOUBLE, FLOAT64); + default: { + return Status::NotImplemented("Unsupported tensor type: ", type.ToString()); + } + } +#undef NUMPY_TYPE_CASE + + return Status::OK(); +} + +} // namespace + +Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out) { + if (!PyObject_TypeCheck(dtype, &PyArrayDescr_Type)) { + return Status::TypeError("Did not pass numpy.dtype object"); + } + PyArray_Descr* descr = reinterpret_cast(dtype); + return NumPyDtypeToArrow(descr, out); +} + +Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out) { + int type_num = fix_numpy_type_num(descr->type_num); + + switch (type_num) { + TO_ARROW_TYPE_CASE(BOOL, boolean); + TO_ARROW_TYPE_CASE(INT8, int8); + TO_ARROW_TYPE_CASE(INT16, int16); + TO_ARROW_TYPE_CASE(INT32, int32); + TO_ARROW_TYPE_CASE(INT64, int64); + TO_ARROW_TYPE_CASE(UINT8, uint8); + TO_ARROW_TYPE_CASE(UINT16, uint16); + TO_ARROW_TYPE_CASE(UINT32, uint32); + TO_ARROW_TYPE_CASE(UINT64, uint64); + TO_ARROW_TYPE_CASE(FLOAT16, float16); + TO_ARROW_TYPE_CASE(FLOAT32, float32); + 
TO_ARROW_TYPE_CASE(FLOAT64, float64);
+    TO_ARROW_TYPE_CASE(STRING, binary);
+    TO_ARROW_TYPE_CASE(UNICODE, utf8);
+    case NPY_DATETIME: {
+      auto date_dtype =
+          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+      switch (date_dtype->meta.base) {
+        case NPY_FR_s:
+          *out = timestamp(TimeUnit::SECOND);
+          break;
+        case NPY_FR_ms:
+          *out = timestamp(TimeUnit::MILLI);
+          break;
+        case NPY_FR_us:
+          *out = timestamp(TimeUnit::MICRO);
+          break;
+        case NPY_FR_ns:
+          *out = timestamp(TimeUnit::NANO);
+          break;
+        case NPY_FR_D:
+          *out = date32();
+          break;
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic datetime64 time unit");
+        default:
+          return Status::NotImplemented("Unsupported datetime64 time unit");
+      }
+    } break;
+    case NPY_TIMEDELTA: {
+      auto timedelta_dtype =
+          reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(descr->c_metadata);
+      switch (timedelta_dtype->meta.base) {
+        case NPY_FR_s:
+          *out = duration(TimeUnit::SECOND);
+          break;
+        case NPY_FR_ms:
+          *out = duration(TimeUnit::MILLI);
+          break;
+        case NPY_FR_us:
+          *out = duration(TimeUnit::MICRO);
+          break;
+        case NPY_FR_ns:
+          *out = duration(TimeUnit::NANO);
+          break;
+        case NPY_FR_GENERIC:
+          return Status::NotImplemented("Unbound or generic timedelta64 time unit");
+        default:
+          return Status::NotImplemented("Unsupported timedelta64 time unit");
+      }
+    } break;
+    default: {
+      return Status::NotImplemented("Unsupported numpy type ", descr->type_num);
+    }
+  }
+
+  return Status::OK();
+}
+
+#undef TO_ARROW_TYPE_CASE
+
+Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                       const std::vector<std::string>& dim_names,
+                       std::shared_ptr<Tensor>* out) {
+  if (!PyArray_Check(ao)) {
+    return Status::TypeError("Did not pass ndarray object");
+  }
+
+  PyArrayObject* ndarray = reinterpret_cast<PyArrayObject*>(ao);
+
+  // TODO(wesm): What do we want to do with non-contiguous memory and negative strides?
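+  // For a C-contiguous float64 array of shape (2, 3), for example, NumPy
+  // reports shape {2, 3} and strides {24, 8} (in bytes); both are carried
+  // over to the Tensor verbatim, while negative strides (e.g. from arr[::-1])
+  // are rejected below.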
+ + int ndim = PyArray_NDIM(ndarray); + + std::shared_ptr data = std::make_shared(ao); + std::vector shape(ndim); + std::vector strides(ndim); + + npy_intp* array_strides = PyArray_STRIDES(ndarray); + npy_intp* array_shape = PyArray_SHAPE(ndarray); + for (int i = 0; i < ndim; ++i) { + if (array_strides[i] < 0) { + return Status::Invalid("Negative ndarray strides not supported"); + } + shape[i] = array_shape[i]; + strides[i] = array_strides[i]; + } + + std::shared_ptr type; + RETURN_NOT_OK( + GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray)), &type)); + *out = std::make_shared(type, data, shape, strides, dim_names); + return Status::OK(); +} + +Status TensorToNdarray(const std::shared_ptr& tensor, PyObject* base, + PyObject** out) { + int type_num = 0; + RETURN_NOT_OK(GetNumPyType(*tensor->type(), &type_num)); + PyArray_Descr* dtype = PyArray_DescrNewFromType(type_num); + RETURN_IF_PYERROR(); + + const int ndim = tensor->ndim(); + std::vector npy_shape(ndim); + std::vector npy_strides(ndim); + + for (int i = 0; i < ndim; ++i) { + npy_shape[i] = tensor->shape()[i]; + npy_strides[i] = tensor->strides()[i]; + } + + const void* immutable_data = nullptr; + if (tensor->data()) { + immutable_data = tensor->data()->data(); + } + + // Remove const =( + void* mutable_data = const_cast(immutable_data); + + int array_flags = 0; + if (tensor->is_row_major()) { + array_flags |= NPY_ARRAY_C_CONTIGUOUS; + } + if (tensor->is_column_major()) { + array_flags |= NPY_ARRAY_F_CONTIGUOUS; + } + if (tensor->is_mutable()) { + array_flags |= NPY_ARRAY_WRITEABLE; + } + + PyObject* result = + PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, npy_shape.data(), + npy_strides.data(), mutable_data, array_flags, nullptr); + RETURN_IF_PYERROR(); + + if (base == Py_None || base == nullptr) { + base = py::wrap_tensor(tensor); + } else { + Py_XINCREF(base); + } + PyArray_SetBaseObject(reinterpret_cast(result), base); + *out = result; + return Status::OK(); +} + +// Wrap the dense data of a sparse tensor in a ndarray +static Status SparseTensorDataToNdarray(const SparseTensor& sparse_tensor, + std::vector data_shape, PyObject* base, + PyObject** out_data) { + int type_num_data = 0; + RETURN_NOT_OK(GetNumPyType(*sparse_tensor.type(), &type_num_data)); + PyArray_Descr* dtype_data = PyArray_DescrNewFromType(type_num_data); + RETURN_IF_PYERROR(); + + const void* immutable_data = sparse_tensor.data()->data(); + // Remove const =( + void* mutable_data = const_cast(immutable_data); + int array_flags = NPY_ARRAY_C_CONTIGUOUS | NPY_ARRAY_F_CONTIGUOUS; + if (sparse_tensor.is_mutable()) { + array_flags |= NPY_ARRAY_WRITEABLE; + } + + *out_data = PyArray_NewFromDescr(&PyArray_Type, dtype_data, + static_cast(data_shape.size()), data_shape.data(), + nullptr, mutable_data, array_flags, nullptr); + RETURN_IF_PYERROR(); + Py_XINCREF(base); + PyArray_SetBaseObject(reinterpret_cast(*out_data), base); + return Status::OK(); +} + +Status SparseCOOTensorToNdarray(const std::shared_ptr& sparse_tensor, + PyObject* base, PyObject** out_data, + PyObject** out_coords) { + const auto& sparse_index = arrow::internal::checked_cast( + *sparse_tensor->sparse_index()); + + // Wrap tensor data + OwnedRef result_data; + RETURN_NOT_OK(SparseTensorDataToNdarray( + *sparse_tensor, {static_cast(sparse_tensor->non_zero_length()), 1}, base, + result_data.ref())); + + // Wrap indices + PyObject* result_coords; + RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, &result_coords)); + + *out_data = result_data.detach(); + *out_coords = result_coords; + return 
Status::OK(); +} + +Status SparseCSXMatrixToNdarray(const std::shared_ptr& sparse_tensor, + PyObject* base, PyObject** out_data, + PyObject** out_indptr, PyObject** out_indices) { + // Wrap indices + OwnedRef result_indptr; + OwnedRef result_indices; + + switch (sparse_tensor->format_id()) { + case SparseTensorFormat::CSR: { + const auto& sparse_index = arrow::internal::checked_cast( + *sparse_tensor->sparse_index()); + RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref())); + RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref())); + break; + } + case SparseTensorFormat::CSC: { + const auto& sparse_index = arrow::internal::checked_cast( + *sparse_tensor->sparse_index()); + RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr(), base, result_indptr.ref())); + RETURN_NOT_OK(TensorToNdarray(sparse_index.indices(), base, result_indices.ref())); + break; + } + default: + return Status::NotImplemented("Invalid SparseTensor type."); + } + + // Wrap tensor data + OwnedRef result_data; + RETURN_NOT_OK(SparseTensorDataToNdarray( + *sparse_tensor, {static_cast(sparse_tensor->non_zero_length()), 1}, base, + result_data.ref())); + + *out_data = result_data.detach(); + *out_indptr = result_indptr.detach(); + *out_indices = result_indices.detach(); + return Status::OK(); +} + +Status SparseCSRMatrixToNdarray(const std::shared_ptr& sparse_tensor, + PyObject* base, PyObject** out_data, + PyObject** out_indptr, PyObject** out_indices) { + return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices); +} + +Status SparseCSCMatrixToNdarray(const std::shared_ptr& sparse_tensor, + PyObject* base, PyObject** out_data, + PyObject** out_indptr, PyObject** out_indices) { + return SparseCSXMatrixToNdarray(sparse_tensor, base, out_data, out_indptr, out_indices); +} + +Status SparseCSFTensorToNdarray(const std::shared_ptr& sparse_tensor, + PyObject* base, PyObject** out_data, + PyObject** out_indptr, PyObject** out_indices) { + const auto& sparse_index = arrow::internal::checked_cast( + *sparse_tensor->sparse_index()); + + // Wrap tensor data + OwnedRef result_data; + RETURN_NOT_OK(SparseTensorDataToNdarray( + *sparse_tensor, {static_cast(sparse_tensor->non_zero_length()), 1}, base, + result_data.ref())); + + // Wrap indices + int ndim = static_cast(sparse_index.indices().size()); + OwnedRef indptr(PyList_New(ndim - 1)); + OwnedRef indices(PyList_New(ndim)); + RETURN_IF_PYERROR(); + + for (int i = 0; i < ndim - 1; ++i) { + PyObject* item; + RETURN_NOT_OK(TensorToNdarray(sparse_index.indptr()[i], base, &item)); + if (PyList_SetItem(indptr.obj(), i, item) < 0) { + Py_XDECREF(item); + RETURN_IF_PYERROR(); + } + } + for (int i = 0; i < ndim; ++i) { + PyObject* item; + RETURN_NOT_OK(TensorToNdarray(sparse_index.indices()[i], base, &item)); + if (PyList_SetItem(indices.obj(), i, item) < 0) { + Py_XDECREF(item); + RETURN_IF_PYERROR(); + } + } + + *out_indptr = indptr.detach(); + *out_indices = indices.detach(); + *out_data = result_data.detach(); + return Status::OK(); +} + +Status NdarraysToSparseCOOTensor(MemoryPool* pool, PyObject* data_ao, PyObject* coords_ao, + const std::vector& shape, + const std::vector& dim_names, + std::shared_ptr* out) { + if (!PyArray_Check(data_ao) || !PyArray_Check(coords_ao)) { + return Status::TypeError("Did not pass ndarray object"); + } + + PyArrayObject* ndarray_data = reinterpret_cast(data_ao); + std::shared_ptr data = std::make_shared(data_ao); + std::shared_ptr type_data; + 
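+  // Note: the coordinates ndarray is expected to be int64 with shape
+  // (non_zero_count, ndim); e.g. a 4x4 matrix with nonzeros at (0, 0) and
+  // (3, 1) arrives as coords = [[0, 0], [3, 1]]. The int64 requirement is
+  // asserted rather than checked below, since the pyarrow caller ensures it.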
RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), + &type_data)); + + std::shared_ptr coords; + RETURN_NOT_OK(NdarrayToTensor(pool, coords_ao, {}, &coords)); + ARROW_CHECK_EQ(coords->type_id(), Type::INT64); // Should be ensured by caller + + ARROW_ASSIGN_OR_RAISE(std::shared_ptr sparse_index, + SparseCOOIndex::Make(coords)); + *out = std::make_shared>(sparse_index, type_data, data, + shape, dim_names); + return Status::OK(); +} + +template +Status NdarraysToSparseCSXMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, + PyObject* indices_ao, const std::vector& shape, + const std::vector& dim_names, + std::shared_ptr>* out) { + if (!PyArray_Check(data_ao) || !PyArray_Check(indptr_ao) || + !PyArray_Check(indices_ao)) { + return Status::TypeError("Did not pass ndarray object"); + } + + PyArrayObject* ndarray_data = reinterpret_cast(data_ao); + std::shared_ptr data = std::make_shared(data_ao); + std::shared_ptr type_data; + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), + &type_data)); + + std::shared_ptr indptr, indices; + RETURN_NOT_OK(NdarrayToTensor(pool, indptr_ao, {}, &indptr)); + RETURN_NOT_OK(NdarrayToTensor(pool, indices_ao, {}, &indices)); + ARROW_CHECK_EQ(indptr->type_id(), Type::INT64); // Should be ensured by caller + ARROW_CHECK_EQ(indices->type_id(), Type::INT64); // Should be ensured by caller + + auto sparse_index = std::make_shared( + std::static_pointer_cast>(indptr), + std::static_pointer_cast>(indices)); + *out = std::make_shared>(sparse_index, type_data, data, + shape, dim_names); + return Status::OK(); +} + +Status NdarraysToSparseCSFTensor(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, + PyObject* indices_ao, const std::vector& shape, + const std::vector& axis_order, + const std::vector& dim_names, + std::shared_ptr* out) { + if (!PyArray_Check(data_ao)) { + return Status::TypeError("Did not pass ndarray object for data"); + } + const int ndim = static_cast(shape.size()); + PyArrayObject* ndarray_data = reinterpret_cast(data_ao); + std::shared_ptr data = std::make_shared(data_ao); + std::shared_ptr type_data; + RETURN_NOT_OK(GetTensorType(reinterpret_cast(PyArray_DESCR(ndarray_data)), + &type_data)); + + std::vector> indptr(ndim - 1); + std::vector> indices(ndim); + + for (int i = 0; i < ndim - 1; ++i) { + PyObject* item = PySequence_Fast_GET_ITEM(indptr_ao, i); + if (!PyArray_Check(item)) { + return Status::TypeError("Did not pass ndarray object for indptr"); + } + RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indptr[i])); + ARROW_CHECK_EQ(indptr[i]->type_id(), Type::INT64); // Should be ensured by caller + } + + for (int i = 0; i < ndim; ++i) { + PyObject* item = PySequence_Fast_GET_ITEM(indices_ao, i); + if (!PyArray_Check(item)) { + return Status::TypeError("Did not pass ndarray object for indices"); + } + RETURN_NOT_OK(NdarrayToTensor(pool, item, {}, &indices[i])); + ARROW_CHECK_EQ(indices[i]->type_id(), Type::INT64); // Should be ensured by caller + } + + auto sparse_index = std::make_shared(indptr, indices, axis_order); + *out = std::make_shared>(sparse_index, type_data, data, + shape, dim_names); + return Status::OK(); +} + +Status NdarraysToSparseCSRMatrix(MemoryPool* pool, PyObject* data_ao, PyObject* indptr_ao, + PyObject* indices_ao, const std::vector& shape, + const std::vector& dim_names, + std::shared_ptr* out) { + return NdarraysToSparseCSXMatrix(pool, data_ao, indptr_ao, indices_ao, + shape, dim_names, out); +} + +Status NdarraysToSparseCSCMatrix(MemoryPool* pool, PyObject* 
data_ao, PyObject* indptr_ao, + PyObject* indices_ao, const std::vector& shape, + const std::vector& dim_names, + std::shared_ptr* out) { + return NdarraysToSparseCSXMatrix(pool, data_ao, indptr_ao, indices_ao, + shape, dim_names, out); +} + +Status TensorToSparseCOOTensor(const std::shared_ptr& tensor, + std::shared_ptr* out) { + return SparseCOOTensor::Make(*tensor).Value(out); +} + +Status TensorToSparseCSRMatrix(const std::shared_ptr& tensor, + std::shared_ptr* out) { + return SparseCSRMatrix::Make(*tensor).Value(out); +} + +Status TensorToSparseCSCMatrix(const std::shared_ptr& tensor, + std::shared_ptr* out) { + return SparseCSCMatrix::Make(*tensor).Value(out); +} + +Status TensorToSparseCSFTensor(const std::shared_ptr& tensor, + std::shared_ptr* out) { + return SparseCSFTensor::Make(*tensor).Value(out); +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h index 69a7dd3..1045107 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_convert.h @@ -27,8 +27,8 @@ #include #include "arrow/buffer.h" -#include "arrow/sparse_tensor.h" #include "arrow/python/visibility.h" +#include "arrow/sparse_tensor.h" namespace arrow { diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h new file mode 100644 index 0000000..b9b632f --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_internal.h @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
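+
+// Usage sketch for the indexer defined below (a minimal sketch; `arr` is
+// assumed to be a 1-D int64 ndarray, possibly non-contiguous, e.g. a[::2]):
+//
+//   Ndarray1DIndexer<int64_t> values(arr);
+//   int64_t sum = 0;
+//   for (int64_t i = 0; i < values.size(); ++i) sum += values[i];
+//
+// The byte-stride-aware operator[] is what lets the converters walk sliced
+// NumPy input without forcing a contiguous copy first.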
+
+// Internal utilities for dealing with NumPy
+
+#pragma once
+
+#include "arrow/python/numpy_interop.h"
+
+#include "arrow/status.h"
+
+#include "arrow/python/platform.h"
+
+#include <cstdint>
+#include <sstream>
+#include <string>
+
+namespace arrow {
+namespace py {
+
+/// Indexing convenience for interacting with strided 1-dim ndarray objects
+template <typename T>
+class Ndarray1DIndexer {
+ public:
+  typedef int64_t size_type;
+
+  Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {}
+
+  explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() {
+    arr_ = arr;
+    DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays";
+    data_ = reinterpret_cast<uint8_t*>(PyArray_DATA(arr));
+    stride_ = PyArray_STRIDES(arr)[0];
+  }
+
+  ~Ndarray1DIndexer() = default;
+
+  int64_t size() const { return PyArray_SIZE(arr_); }
+
+  const T* data() const { return reinterpret_cast<const T*>(data_); }
+
+  bool is_strided() const { return stride_ != sizeof(T); }
+
+  T& operator[](size_type index) {
+    return *reinterpret_cast<T*>(data_ + index * stride_);
+  }
+  const T& operator[](size_type index) const {
+    return *reinterpret_cast<const T*>(data_ + index * stride_);
+  }
+
+ private:
+  PyArrayObject* arr_;
+  uint8_t* data_;
+  int64_t stride_;
+};
+
+// Handling of Numpy Types by their static numbers
+// (the NPY_TYPES enum and related defines)
+
+static inline std::string GetNumPyTypeName(int npy_type) {
+#define TYPE_CASE(TYPE, NAME) \
+  case NPY_##TYPE:            \
+    return NAME;
+
+  switch (npy_type) {
+    TYPE_CASE(BOOL, "bool")
+    TYPE_CASE(INT8, "int8")
+    TYPE_CASE(INT16, "int16")
+    TYPE_CASE(INT32, "int32")
+    TYPE_CASE(INT64, "int64")
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(INT, "intc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_CASE(LONGLONG, "longlong")
+#endif
+    TYPE_CASE(UINT8, "uint8")
+    TYPE_CASE(UINT16, "uint16")
+    TYPE_CASE(UINT32, "uint32")
+    TYPE_CASE(UINT64, "uint64")
+#if !NPY_INT32_IS_INT
+    TYPE_CASE(UINT, "uintc")
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_CASE(ULONGLONG, "ulonglong")
+#endif
+    TYPE_CASE(FLOAT16, "float16")
+    TYPE_CASE(FLOAT32, "float32")
+    TYPE_CASE(FLOAT64, "float64")
+    TYPE_CASE(DATETIME, "datetime64")
+    TYPE_CASE(TIMEDELTA, "timedelta64")
+    TYPE_CASE(OBJECT, "object")
+    TYPE_CASE(VOID, "void")
+    default:
+      break;
+  }
+
+#undef TYPE_CASE
+  std::stringstream ss;
+  ss << "unrecognized type (" << npy_type << ") in GetNumPyTypeName";
+  return ss.str();
+}
+
+#define TYPE_VISIT_INLINE(TYPE) \
+  case NPY_##TYPE:              \
+    return visitor->template Visit<NPY_##TYPE>(arr);
+
+template <typename VISITOR>
+inline Status VisitNumpyArrayInline(PyArrayObject* arr, VISITOR* visitor) {
+  switch (PyArray_TYPE(arr)) {
+    TYPE_VISIT_INLINE(BOOL);
+    TYPE_VISIT_INLINE(INT8);
+    TYPE_VISIT_INLINE(UINT8);
+    TYPE_VISIT_INLINE(INT16);
+    TYPE_VISIT_INLINE(UINT16);
+    TYPE_VISIT_INLINE(INT32);
+    TYPE_VISIT_INLINE(UINT32);
+    TYPE_VISIT_INLINE(INT64);
+    TYPE_VISIT_INLINE(UINT64);
+#if !NPY_INT32_IS_INT
+    TYPE_VISIT_INLINE(INT);
+    TYPE_VISIT_INLINE(UINT);
+#endif
+#if !NPY_INT64_IS_LONG_LONG
+    TYPE_VISIT_INLINE(LONGLONG);
+    TYPE_VISIT_INLINE(ULONGLONG);
+#endif
+    TYPE_VISIT_INLINE(FLOAT16);
+    TYPE_VISIT_INLINE(FLOAT32);
+    TYPE_VISIT_INLINE(FLOAT64);
+    TYPE_VISIT_INLINE(DATETIME);
+    TYPE_VISIT_INLINE(TIMEDELTA);
+    TYPE_VISIT_INLINE(OBJECT);
+  }
+  return Status::NotImplemented("NumPy type not implemented: ",
+                                GetNumPyTypeName(PyArray_TYPE(arr)));
+}
+
+#undef TYPE_VISIT_INLINE
+
+namespace internal {
+
+inline bool PyFloatScalar_Check(PyObject* obj) {
+  return PyFloat_Check(obj) || PyArray_IsScalar(obj, Floating);
+}
+
+inline bool PyIntScalar_Check(PyObject* obj) {
+  return PyLong_Check(obj) ||
PyArray_IsScalar(obj, Integer); +} + +inline bool PyBoolScalar_Check(PyObject* obj) { + return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool); +} + +static inline PyArray_Descr* GetSafeNumPyDtype(int type) { + if (type == NPY_DATETIME || type == NPY_TIMEDELTA) { + // It is not safe to mutate the result of DescrFromType for datetime and + // timedelta descriptors + return PyArray_DescrNewFromType(type); + } else { + return PyArray_DescrFromType(type); + } +} + +} // namespace internal + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc new file mode 100644 index 0000000..2727ce3 --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/numpy_to_arrow.cc @@ -0,0 +1,870 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for pandas conversion via NumPy + +#include "arrow/python/numpy_to_arrow.h" +#include "arrow/python/numpy_interop.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/status.h" +#include "arrow/table.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_generate.h" +#include "arrow/util/bitmap_ops.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/endian.h" +#include "arrow/util/logging.h" +#include "arrow/util/macros.h" +#include "arrow/util/string.h" +#include "arrow/util/utf8.h" +#include "arrow/visit_type_inline.h" + +#include "arrow/compute/api_scalar.h" + +#include "arrow/python/common.h" +#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" +#include "arrow/python/iterators.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_internal.h" +#include "arrow/python/python_to_arrow.h" +#include "arrow/python/type_traits.h" + +namespace arrow { + +using internal::checked_cast; +using internal::CopyBitmap; +using internal::GenerateBitsUnrolled; + +namespace py { + +using internal::NumPyTypeSize; + +// ---------------------------------------------------------------------- +// Conversion utilities + +namespace { + +Status AllocateNullBitmap(MemoryPool* pool, int64_t length, + std::shared_ptr* out) { + int64_t null_bytes = bit_util::BytesForBits(length); + ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool)); + + // Padding zeroed by AllocateResizableBuffer + memset(null_bitmap->mutable_data(), 0, static_cast(null_bytes)); + *out = std::move(null_bitmap); + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Conversion from NumPy-in-Pandas to 
Arrow null bitmap + +template +inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { + typedef internal::npy_traits traits; + typedef typename traits::value_type T; + + int64_t null_count = 0; + + Ndarray1DIndexer values(arr); + for (int i = 0; i < values.size(); ++i) { + if (traits::isnull(values[i])) { + ++null_count; + } else { + bit_util::SetBit(bitmap, i); + } + } + + return null_count; +} + +class NumPyNullsConverter { + public: + /// Convert the given array's null values to a null bitmap. + /// The null bitmap is only allocated if null values are ever possible. + static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas, + std::shared_ptr* out_null_bitmap_, + int64_t* out_null_count) { + NumPyNullsConverter converter(pool, arr, from_pandas); + RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); + *out_null_bitmap_ = converter.null_bitmap_; + *out_null_count = converter.null_count_; + return Status::OK(); + } + + template + Status Visit(PyArrayObject* arr) { + typedef internal::npy_traits traits; + + const bool null_sentinels_possible = + // Always treat Numpy's NaT as null + TYPE == NPY_DATETIME || TYPE == NPY_TIMEDELTA || + // Observing pandas's null sentinels + (from_pandas_ && traits::supports_nulls); + + if (null_sentinels_possible) { + RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_)); + null_count_ = ValuesToBitmap(arr, null_bitmap_->mutable_data()); + } + return Status::OK(); + } + + protected: + NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas) + : pool_(pool), + arr_(arr), + from_pandas_(from_pandas), + null_bitmap_data_(nullptr), + null_count_(0) {} + + MemoryPool* pool_; + PyArrayObject* arr_; + bool from_pandas_; + std::shared_ptr null_bitmap_; + uint8_t* null_bitmap_data_; + int64_t null_count_; +}; + +// Returns null count +int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { + int64_t null_count = 0; + + if (!PyArray_Check(mask)) return -1; + + Ndarray1DIndexer mask_values(mask); + for (int i = 0; i < length; ++i) { + if (mask_values[i]) { + ++null_count; + bit_util::ClearBit(bitmap, i); + } else { + bit_util::SetBit(bitmap, i); + } + } + return null_count; +} + +} // namespace + +// ---------------------------------------------------------------------- +// Conversion from NumPy arrays (possibly originating from pandas) to Arrow +// format. 
Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for +// that + +class NumPyConverter { + public: + NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo, + const std::shared_ptr& type, bool from_pandas, + const compute::CastOptions& cast_options = compute::CastOptions()) + : pool_(pool), + type_(type), + arr_(reinterpret_cast(arr)), + dtype_(PyArray_DESCR(arr_)), + mask_(nullptr), + from_pandas_(from_pandas), + cast_options_(cast_options), + null_bitmap_data_(nullptr), + null_count_(0) { + if (mo != nullptr && mo != Py_None) { + mask_ = reinterpret_cast(mo); + } + length_ = static_cast(PyArray_SIZE(arr_)); + itemsize_ = static_cast(PyArray_DESCR(arr_)->elsize); + stride_ = static_cast(PyArray_STRIDES(arr_)[0]); + } + + bool is_strided() const { return itemsize_ != stride_; } + + Status Convert(); + + const ArrayVector& result() const { return out_arrays_; } + + template + enable_if_primitive_ctype Visit(const T& type) { + return VisitNative(); + } + + Status Visit(const HalfFloatType& type) { return VisitNative(); } + + Status Visit(const Date32Type& type) { return VisitNative(); } + Status Visit(const Date64Type& type) { return VisitNative(); } + Status Visit(const TimestampType& type) { return VisitNative(); } + Status Visit(const Time32Type& type) { return VisitNative(); } + Status Visit(const Time64Type& type) { return VisitNative(); } + Status Visit(const DurationType& type) { return VisitNative(); } + + Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } + + // NumPy ascii string arrays + Status Visit(const BinaryType& type); + + // NumPy unicode arrays + Status Visit(const StringType& type); + + Status Visit(const StructType& type); + + Status Visit(const FixedSizeBinaryType& type); + + // Default case + Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); } + + protected: + Status InitNullBitmap() { + RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_)); + null_bitmap_data_ = null_bitmap_->mutable_data(); + return Status::OK(); + } + + // Called before ConvertData to ensure Numpy input buffer is in expected + // Arrow layout + template + Status PrepareInputData(std::shared_ptr* data); + + // ---------------------------------------------------------------------- + // Traditional visitor conversion for non-object arrays + + template + Status ConvertData(std::shared_ptr* data); + + template + Status PushBuilderResult(T* builder) { + std::shared_ptr out; + RETURN_NOT_OK(builder->Finish(&out)); + out_arrays_.emplace_back(out); + return Status::OK(); + } + + Status PushArray(const std::shared_ptr& data) { + out_arrays_.emplace_back(MakeArray(data)); + return Status::OK(); + } + + template + Status VisitNative() { + if (mask_ != nullptr) { + RETURN_NOT_OK(InitNullBitmap()); + null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); + if (null_count_ == -1) return Status::Invalid("Invalid mask type"); + } else { + RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_, + &null_count_)); + } + + std::shared_ptr data; + RETURN_NOT_OK(ConvertData(&data)); + + auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); + return PushArray(arr_data); + } + + Status TypeNotImplemented(std::string type_name) { + return Status::NotImplemented("NumPyConverter doesn't implement <", type_name, + "> conversion. 
"); + } + + MemoryPool* pool_; + std::shared_ptr type_; + PyArrayObject* arr_; + PyArray_Descr* dtype_; + PyArrayObject* mask_; + int64_t length_; + int64_t stride_; + int itemsize_; + + bool from_pandas_; + compute::CastOptions cast_options_; + + // Used in visitor pattern + ArrayVector out_arrays_; + + std::shared_ptr null_bitmap_; + uint8_t* null_bitmap_data_; + int64_t null_count_; +}; + +Status NumPyConverter::Convert() { + if (PyArray_NDIM(arr_) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + if (dtype_->type_num == NPY_OBJECT) { + // If an object array, convert it like a normal Python sequence + PyConversionOptions py_options; + py_options.type = type_; + py_options.from_pandas = from_pandas_; + ARROW_ASSIGN_OR_RAISE( + auto chunked_array, + ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options, pool_)); + out_arrays_ = chunked_array->chunks(); + return Status::OK(); + } + + if (type_ == nullptr) { + return Status::Invalid("Must pass data type for non-object arrays"); + } + + // Visit the type to perform conversion + return VisitTypeInline(*type_, this); +} + +namespace { + +Status CastBuffer(const std::shared_ptr& in_type, + const std::shared_ptr& input, const int64_t length, + const std::shared_ptr& valid_bitmap, const int64_t null_count, + const std::shared_ptr& out_type, + const compute::CastOptions& cast_options, MemoryPool* pool, + std::shared_ptr* out) { + // Must cast + auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count); + compute::ExecContext context(pool); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr casted_array, + compute::Cast(*MakeArray(tmp_data), out_type, cast_options, &context)); + *out = casted_array->data()->buffers[1]; + return Status::OK(); +} + +template +Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool, + std::shared_ptr* out) { + ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool)); + + auto in_values = reinterpret_cast(input.data()); + auto out_values = reinterpret_cast(result->mutable_data()); + for (int64_t i = 0; i < length; ++i) { + *out_values++ = static_cast(*in_values++); + } + *out = std::move(result); + return Status::OK(); +} + +template +void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride, + T* output_data) { + // Passing input_data as non-const is a concession to PyObject* + for (int64_t i = 0; i < length; ++i) { + memcpy(output_data + i, input_data, sizeof(T)); + input_data += stride; + } +} + +template +void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) { + // Passing input_data as non-const is a concession to PyObject* + int64_t j = 0; + for (int64_t i = 0; i < length; ++i) { + output_data[i] = input_data[j]; + j += stride; + } +} + +class NumPyStridedConverter { + public: + static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool, + std::shared_ptr* out) { + NumPyStridedConverter converter(arr, length, pool); + RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); + *out = converter.buffer_; + return Status::OK(); + } + template + Status Visit(PyArrayObject* arr) { + using traits = internal::npy_traits; + using T = typename traits::value_type; + + ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_)); + + const int64_t stride = PyArray_STRIDES(arr)[0]; + // ARROW-16013: convert sizeof(T) to signed int64 first, otherwise dividing by it + // would do an unsigned division. 
This cannot be caught by tests without ubsan, since + // common signed overflow behavior and the fact that the sizeof(T) is currently always + // a power of two here cause CopyStridedNatural to still produce correct results + const int64_t element_size = sizeof(T); + if (stride % element_size == 0) { + const int64_t stride_elements = stride / element_size; + CopyStridedNatural(reinterpret_cast(PyArray_DATA(arr)), length_, + stride_elements, reinterpret_cast(buffer_->mutable_data())); + } else { + CopyStridedBytewise(reinterpret_cast(PyArray_DATA(arr)), length_, stride, + reinterpret_cast(buffer_->mutable_data())); + } + return Status::OK(); + } + + protected: + NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool) + : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {} + PyArrayObject* arr_; + int64_t length_; + MemoryPool* pool_; + std::shared_ptr buffer_; +}; + +} // namespace + +template +inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { + if (PyArray_ISBYTESWAPPED(arr_)) { + // TODO + return Status::NotImplemented("Byte-swapped arrays not supported"); + } + + if (dtype_->type_num == NPY_BOOL) { + int64_t nbytes = bit_util::BytesForBits(length_); + ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_)); + + Ndarray1DIndexer values(arr_); + int64_t i = 0; + const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; + GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); + + *data = std::move(buffer); + } else if (is_strided()) { + RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data)); + } else { + // Can zero-copy + *data = std::make_shared(reinterpret_cast(arr_)); + } + + return Status::OK(); +} + +template +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + RETURN_NOT_OK(PrepareInputData(data)); + + std::shared_ptr input_type; + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, + cast_options_, pool_, data)); + } + + return Status::OK(); +} + +template <> +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + std::shared_ptr input_type; + + RETURN_NOT_OK(PrepareInputData(data)); + + auto date_dtype = reinterpret_cast(dtype_->c_metadata); + if (dtype_->type_num == NPY_DATETIME) { + // If we have inbound datetime64[D] data, this needs to be downcasted + // separately here from int64_t to int32_t, because this data is not + // supported in compute::Cast + if (date_dtype->meta.base == NPY_FR_D) { + // TODO(wesm): How pedantic do we really want to be about checking for int32 + // overflow here? 
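+      // Illustrative sketch (not part of the vendored sources): datetime64[D]
+      // stores days since the epoch as int64, while Arrow's Date32Type stores
+      // them as int32, so e.g. the day value 19188 arrives as the int64 19188
+      // and must be narrowed element-wise. StaticCastBuffer below performs
+      // the equivalent of:
+      //
+      //   auto in  = reinterpret_cast<const int64_t*>(input.data());
+      //   auto out = reinterpret_cast<int32_t*>(result->mutable_data());
+      //   for (int64_t i = 0; i < length; ++i) {
+      //     *out++ = static_cast<int32_t>(*in++);  // no overflow check
+      //   }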
+ Status s = StaticCastBuffer(**data, length_, pool_, data); + RETURN_NOT_OK(s); + } else { + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + // The null bitmap was already computed in VisitNative() + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); + } + } + } else { + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); + } + } + + return Status::OK(); +} + +template <> +inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { + constexpr int64_t kMillisecondsInDay = 86400000; + std::shared_ptr input_type; + + RETURN_NOT_OK(PrepareInputData(data)); + + auto date_dtype = reinterpret_cast(dtype_->c_metadata); + if (dtype_->type_num == NPY_DATETIME) { + // If we have inbound datetime64[D] data, this needs to be downcasted + // separately here from int64_t to int32_t, because this data is not + // supported in compute::Cast + if (date_dtype->meta.base == NPY_FR_D) { + ARROW_ASSIGN_OR_RAISE(auto result, + AllocateBuffer(sizeof(int64_t) * length_, pool_)); + + auto in_values = reinterpret_cast((*data)->data()); + auto out_values = reinterpret_cast(result->mutable_data()); + for (int64_t i = 0; i < length_; ++i) { + *out_values++ = kMillisecondsInDay * (*in_values++); + } + *data = std::move(result); + } else { + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + // The null bitmap was already computed in VisitNative() + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); + } + } + } else { + RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); + if (!input_type->Equals(*type_)) { + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, + type_, cast_options_, pool_, data)); + } + } + + return Status::OK(); +} + +// Create 16MB chunks for binary data +constexpr int32_t kBinaryChunksize = 1 << 24; + +Status NumPyConverter::Visit(const BinaryType& type) { + ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_); + + auto data = reinterpret_cast(PyArray_DATA(arr_)); + + auto AppendNotNull = [&builder, this](const uint8_t* data) { + // This is annoying. 
NumPy allows strings to have nul-terminators, so + // we must check for them here + const size_t item_size = + strnlen(reinterpret_cast(data), static_cast(itemsize_)); + return builder.Append(data, static_cast(item_size)); + }; + + if (mask_ != nullptr) { + Ndarray1DIndexer mask_values(mask_); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(AppendNotNull(data)); + } + data += stride_; + } + } else { + for (int64_t i = 0; i < length_; ++i) { + RETURN_NOT_OK(AppendNotNull(data)); + data += stride_; + } + } + + ArrayVector result; + RETURN_NOT_OK(builder.Finish(&result)); + for (auto arr : result) { + RETURN_NOT_OK(PushArray(arr->data())); + } + return Status::OK(); +} + +Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { + auto byte_width = type.byte_width(); + + if (itemsize_ != byte_width) { + return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ", + byte_width, ")"); + } + + FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); + auto data = reinterpret_cast(PyArray_DATA(arr_)); + + if (mask_ != nullptr) { + Ndarray1DIndexer mask_values(mask_); + RETURN_NOT_OK(builder.Reserve(length_)); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(builder.Append(data)); + } + data += stride_; + } + } else { + for (int64_t i = 0; i < length_; ++i) { + RETURN_NOT_OK(builder.Append(data)); + data += stride_; + } + } + + std::shared_ptr result; + RETURN_NOT_OK(builder.Finish(&result)); + return PushArray(result->data()); +} + +namespace { + +// NumPy unicode is UCS4/UTF32 always +constexpr int kNumPyUnicodeSize = 4; + +Status AppendUTF32(const char* data, int itemsize, int byteorder, + ::arrow::internal::ChunkedStringBuilder* builder) { + // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode, + // so we need to detect that here to truncate if necessary. Yep. + int actual_length = 0; + for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) { + const char* code_point = data + actual_length * kNumPyUnicodeSize; + if ((*code_point == '\0') && (*(code_point + 1) == '\0') && + (*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) { + break; + } + } + + OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize, + nullptr, &byteorder)); + RETURN_IF_PYERROR(); + OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj())); + if (utf8_obj.obj() == NULL) { + PyErr_Clear(); + return Status::Invalid("failed converting UTF32 to UTF8"); + } + + const int32_t length = static_cast(PyBytes_GET_SIZE(utf8_obj.obj())); + return builder->Append( + reinterpret_cast(PyBytes_AS_STRING(utf8_obj.obj())), length); +} + +} // namespace + +Status NumPyConverter::Visit(const StringType& type) { + util::InitializeUTF8(); + + ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); + + auto data = reinterpret_cast(PyArray_DATA(arr_)); + + char numpy_byteorder = dtype_->byteorder; + + // For Python C API, -1 is little-endian, 1 is big-endian +#if ARROW_LITTLE_ENDIAN + // Yield little-endian from both '|' (native) and '<' + int byteorder = numpy_byteorder == '>' ? 1 : -1; +#else + // Yield big-endian from both '|' (native) and '>' + int byteorder = numpy_byteorder == '<' ? 
-1 : 1; +#endif + + PyAcquireGIL gil_lock; + + const bool is_binary_type = dtype_->type_num == NPY_STRING; + const bool is_unicode_type = dtype_->type_num == NPY_UNICODE; + + if (!is_binary_type && !is_unicode_type) { + const bool is_float_type = dtype_->kind == 'f'; + if (from_pandas_ && is_float_type) { + // in case of from_pandas=True, accept an all-NaN float array as input + RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_, + &null_count_)); + if (null_count_ == length_) { + auto arr = std::make_shared(length_); + compute::ExecContext context(pool_); + ARROW_ASSIGN_OR_RAISE( + std::shared_ptr out, + compute::Cast(*arr, arrow::utf8(), cast_options_, &context)); + out_arrays_.emplace_back(out); + return Status::OK(); + } + } + std::string dtype_string; + RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast(dtype_), + &dtype_string)); + return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string); + } + + auto AppendNonNullValue = [&](const uint8_t* data) { + if (is_binary_type) { + if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { + return builder.Append(data, itemsize_); + } else { + return Status::Invalid("Encountered non-UTF8 binary value: ", + HexEncode(data, itemsize_)); + } + } else { + // is_unicode_type case + return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, + &builder); + } + }; + + if (mask_ != nullptr) { + Ndarray1DIndexer mask_values(mask_); + for (int64_t i = 0; i < length_; ++i) { + if (mask_values[i]) { + RETURN_NOT_OK(builder.AppendNull()); + } else { + RETURN_NOT_OK(AppendNonNullValue(data)); + } + data += stride_; + } + } else { + for (int64_t i = 0; i < length_; ++i) { + RETURN_NOT_OK(AppendNonNullValue(data)); + data += stride_; + } + } + + ArrayVector result; + RETURN_NOT_OK(builder.Finish(&result)); + for (auto arr : result) { + RETURN_NOT_OK(PushArray(arr->data())); + } + return Status::OK(); +} + +Status NumPyConverter::Visit(const StructType& type) { + std::vector sub_converters; + std::vector sub_arrays; + + { + PyAcquireGIL gil_lock; + + // Create converters for each struct type field + if (dtype_->fields == NULL || !PyDict_Check(dtype_->fields)) { + return Status::TypeError("Expected struct array"); + } + + for (auto field : type.fields()) { + PyObject* tup = PyDict_GetItemString(dtype_->fields, field->name().c_str()); + if (tup == NULL) { + return Status::Invalid("Missing field '", field->name(), "' in struct array"); + } + PyArray_Descr* sub_dtype = + reinterpret_cast(PyTuple_GET_ITEM(tup, 0)); + DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type)); + int offset = static_cast(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1))); + RETURN_IF_PYERROR(); + Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */ + PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset); + RETURN_IF_PYERROR(); + sub_arrays.emplace_back(sub_array); + sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(), + from_pandas_); + } + } + + std::vector groups; + int64_t null_count = 0; + + // Compute null bitmap and store it as a Boolean Array to include it + // in the rechunking below + { + if (mask_ != nullptr) { + RETURN_NOT_OK(InitNullBitmap()); + null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); + if (null_count_ == -1) return Status::Invalid("Invalid mask type"); + } + groups.push_back({std::make_shared(length_, null_bitmap_)}); + } + + // Convert child data + for (auto& converter : sub_converters) { + RETURN_NOT_OK(converter.Convert()); + 
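+    // Each child converter may legitimately produce more than one chunk (for
+    // example, binary children that overflow kBinaryChunksize), so every
+    // field contributes a whole ArrayVector. Illustrative shape of the data
+    // (not from the vendored sources) for a struct with two fields:
+    //
+    //   groups[0] = {null-bitmap BooleanArray, rows [0, 1500)}
+    //   groups[1] = {field "a" chunks: rows [0, 1000), [1000, 1500)}
+    //   groups[2] = {field "b" chunks: rows [0, 1500)}
+    //
+    // RechunkArraysConsistently below splits all groups on the union of the
+    // chunk boundaries so that chunk i of every group covers the same rows.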
groups.push_back(converter.result()); + } + // Ensure the different array groups are chunked consistently + groups = ::arrow::internal::RechunkArraysConsistently(groups); + + // Make struct array chunks by combining groups + size_t ngroups = groups.size(); + size_t nchunks = groups[0].size(); + for (size_t chunk = 0; chunk < nchunks; chunk++) { + // First group has the null bitmaps as Boolean Arrays + const auto& null_data = groups[0][chunk]->data(); + DCHECK_EQ(null_data->type->id(), Type::BOOL); + DCHECK_EQ(null_data->buffers.size(), 2); + const auto& null_buffer = null_data->buffers[1]; + // Careful: the rechunked null bitmap may have a non-zero offset + // to its buffer, and it may not even start on a byte boundary + int64_t null_offset = null_data->offset; + std::shared_ptr fixed_null_buffer; + + if (!null_buffer) { + fixed_null_buffer = null_buffer; + } else if (null_offset % 8 == 0) { + fixed_null_buffer = + std::make_shared(null_buffer, + // byte offset + null_offset / 8, + // byte size + bit_util::BytesForBits(null_data->length)); + } else { + ARROW_ASSIGN_OR_RAISE( + fixed_null_buffer, + CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length)); + } + + // Create struct array chunk and populate it + auto arr_data = + ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0); + arr_data->buffers.push_back(fixed_null_buffer); + // Append child chunks + for (size_t i = 1; i < ngroups; i++) { + arr_data->child_data.push_back(groups[i][chunk]->data()); + } + RETURN_NOT_OK(PushArray(arr_data)); + } + + return Status::OK(); +} + +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, + const std::shared_ptr& type, + const compute::CastOptions& cast_options, + std::shared_ptr* out) { + if (!PyArray_Check(ao)) { + // This code path cannot be reached by Python unit tests currently so this + // is only a sanity check. + return Status::TypeError("Input object was not a NumPy array"); + } + if (PyArray_NDIM(reinterpret_cast(ao)) != 1) { + return Status::Invalid("only handle 1-dimensional arrays"); + } + + NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options); + RETURN_NOT_OK(converter.Convert()); + const auto& output_arrays = converter.result(); + DCHECK_GT(output_arrays.size(), 0); + *out = std::make_shared(output_arrays); + return Status::OK(); +} + +Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, + const std::shared_ptr& type, + std::shared_ptr* out) { + return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out); +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc new file mode 100644 index 0000000..a5f924b --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/parquet_encryption.cc @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/parquet_encryption.h" +#include "parquet/exception.h" + +namespace arrow { +namespace py { +namespace parquet { +namespace encryption { + +PyKmsClient::PyKmsClient(PyObject* handler, PyKmsClientVtable vtable) + : handler_(handler), vtable_(std::move(vtable)) { + Py_INCREF(handler); +} + +PyKmsClient::~PyKmsClient() {} + +std::string PyKmsClient::WrapKey(const std::string& key_bytes, + const std::string& master_key_identifier) { + std::string wrapped; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.wrap_key(handler_.obj(), key_bytes, master_key_identifier, &wrapped); + return CheckPyError(); + }); + if (!st.ok()) { + throw ::parquet::ParquetStatusException(st); + } + return wrapped; +} + +std::string PyKmsClient::UnwrapKey(const std::string& wrapped_key, + const std::string& master_key_identifier) { + std::string unwrapped; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.unwrap_key(handler_.obj(), wrapped_key, master_key_identifier, &unwrapped); + return CheckPyError(); + }); + if (!st.ok()) { + throw ::parquet::ParquetStatusException(st); + } + return unwrapped; +} + +PyKmsClientFactory::PyKmsClientFactory(PyObject* handler, PyKmsClientFactoryVtable vtable) + : handler_(handler), vtable_(std::move(vtable)) { + Py_INCREF(handler); +} + +PyKmsClientFactory::~PyKmsClientFactory() {} + +std::shared_ptr<::parquet::encryption::KmsClient> PyKmsClientFactory::CreateKmsClient( + const ::parquet::encryption::KmsConnectionConfig& kms_connection_config) { + std::shared_ptr<::parquet::encryption::KmsClient> kms_client; + auto st = SafeCallIntoPython([&]() -> Status { + vtable_.create_kms_client(handler_.obj(), kms_connection_config, &kms_client); + return CheckPyError(); + }); + if (!st.ok()) { + throw ::parquet::ParquetStatusException(st); + } + return kms_client; +} + +arrow::Result> +PyCryptoFactory::SafeGetFileEncryptionProperties( + const ::parquet::encryption::KmsConnectionConfig& kms_connection_config, + const ::parquet::encryption::EncryptionConfiguration& encryption_config) { + PARQUET_CATCH_AND_RETURN( + this->GetFileEncryptionProperties(kms_connection_config, encryption_config)); +} + +arrow::Result> +PyCryptoFactory::SafeGetFileDecryptionProperties( + const ::parquet::encryption::KmsConnectionConfig& kms_connection_config, + const ::parquet::encryption::DecryptionConfiguration& decryption_config) { + PARQUET_CATCH_AND_RETURN( + this->GetFileDecryptionProperties(kms_connection_config, decryption_config)); +} + +} // namespace encryption +} // namespace parquet +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h b/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h index 80f7e60..e71c7ac 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/platform.h @@ -24,7 +24,7 @@ // to mean Py_ssize_t (defining this to suppress deprecation warning) #define PY_SSIZE_T_CLEAN -#include // IWYU pragma: export +#include // IWYU pragma: export #include // Work around C2528 error @@ -32,5 +32,10 @@ #if _MSC_VER >= 
1900
 #undef timezone
 #endif
-#endif
+// https://bugs.python.org/issue36020
+// TODO(wjones127): Can remove once we drop support for CPython 3.9
+#ifdef snprintf
+#undef snprintf
+#endif
+#endif
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc
new file mode 100644
index 0000000..30d1f04
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/pyarrow.cc
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/pyarrow.h"
+
+#include <memory>
+#include <utility>
+
+#include "arrow/array.h"
+#include "arrow/table.h"
+#include "arrow/tensor.h"
+#include "arrow/type.h"
+
+#include "arrow/python/common.h"
+#include "arrow/python/datetime.h"
+namespace {
+#include "arrow/python/pyarrow_api.h"
+}
+
+namespace arrow {
+namespace py {
+
+static Status UnwrapError(PyObject* obj, const char* expected_type) {
+  return Status::TypeError("Could not unwrap ", expected_type,
+                           " from Python object of type '", Py_TYPE(obj)->tp_name, "'");
+}
+
+int import_pyarrow() {
+#ifdef PYPY_VERSION
+  PyDateTime_IMPORT;
+#else
+  internal::InitDatetime();
+#endif
+  return ::import_pyarrow__lib();
+}
+
+#define DEFINE_WRAP_FUNCTIONS(FUNC_SUFFIX, TYPE_NAME)                                   \
+  bool is_##FUNC_SUFFIX(PyObject* obj) { return ::pyarrow_is_##FUNC_SUFFIX(obj) != 0; } \
+                                                                                        \
+  PyObject* wrap_##FUNC_SUFFIX(const std::shared_ptr<TYPE_NAME>& src) {                 \
+    return ::pyarrow_wrap_##FUNC_SUFFIX(src);                                           \
+  }                                                                                     \
+  Result<std::shared_ptr<TYPE_NAME>> unwrap_##FUNC_SUFFIX(PyObject* obj) {              \
+    auto out = ::pyarrow_unwrap_##FUNC_SUFFIX(obj);                                     \
+    if (out) {                                                                          \
+      return std::move(out);                                                            \
+    } else {                                                                            \
+      return UnwrapError(obj, #TYPE_NAME);                                              \
+    }                                                                                   \
+  }
+
+DEFINE_WRAP_FUNCTIONS(buffer, Buffer)
+
+DEFINE_WRAP_FUNCTIONS(data_type, DataType)
+DEFINE_WRAP_FUNCTIONS(field, Field)
+DEFINE_WRAP_FUNCTIONS(schema, Schema)
+
+DEFINE_WRAP_FUNCTIONS(scalar, Scalar)
+
+DEFINE_WRAP_FUNCTIONS(array, Array)
+DEFINE_WRAP_FUNCTIONS(chunked_array, ChunkedArray)
+
+DEFINE_WRAP_FUNCTIONS(sparse_coo_tensor, SparseCOOTensor)
+DEFINE_WRAP_FUNCTIONS(sparse_csc_matrix, SparseCSCMatrix)
+DEFINE_WRAP_FUNCTIONS(sparse_csf_tensor, SparseCSFTensor)
+DEFINE_WRAP_FUNCTIONS(sparse_csr_matrix, SparseCSRMatrix)
+DEFINE_WRAP_FUNCTIONS(tensor, Tensor)
+
+DEFINE_WRAP_FUNCTIONS(batch, RecordBatch)
+DEFINE_WRAP_FUNCTIONS(table, Table)
+
+#undef DEFINE_WRAP_FUNCTIONS
+
+namespace internal {
+
+int check_status(const Status& status) { return ::pyarrow_internal_check_status(status); }
+
+}  // namespace internal
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc
new file mode 100644
index 0000000..01ab8a3
--- /dev/null
+++ 
b/src/vendored/apache-arrow-12.0.1/arrow/python/python_test.cc @@ -0,0 +1,888 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include "platform.h" + +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/table.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" + +#include "arrow/python/arrow_to_pandas.h" +#include "arrow/python/decimal.h" +#include "arrow/python/helpers.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/numpy_interop.h" +#include "arrow/python/python_test.h" +#include "arrow/python/python_to_arrow.h" + +#define ASSERT_EQ(x, y) \ + { \ + auto&& _left = (x); \ + auto&& _right = (y); \ + if (_left != _right) { \ + return Status::Invalid("Expected equality between `", #x, "` and `", #y, \ + "`, but ", arrow::py::testing::ToString(_left), \ + " != ", arrow::py::testing::ToString(_right)); \ + } \ + } + +#define ASSERT_NE(x, y) \ + { \ + auto&& _left = (x); \ + auto&& _right = (y); \ + if (_left == _right) { \ + return Status::Invalid("Expected inequality between `", #x, "` and `", #y, \ + "`, but ", arrow::py::testing::ToString(_left), \ + " == ", arrow::py::testing::ToString(_right)); \ + } \ + } + +#define ASSERT_FALSE(v) \ + { \ + auto&& _v = (v); \ + if (!!_v) { \ + return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \ + arrow::py::testing::ToString(_v)); \ + } \ + } + +#define ASSERT_TRUE(v) \ + { \ + auto&& _v = (v); \ + if (!_v) { \ + return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \ + arrow::py::testing::ToString(_v)); \ + } \ + } + +#define ASSERT_FALSE_MSG(v, msg) \ + { \ + auto&& _v = (v); \ + if (!!_v) { \ + return Status::Invalid("Expected `", #v, "` to evaluate to false, but got ", \ + arrow::py::testing::ToString(_v), ": ", msg); \ + } \ + } + +#define ASSERT_TRUE_MSG(v, msg) \ + { \ + auto&& _v = (v); \ + if (!_v) { \ + return Status::Invalid("Expected `", #v, "` to evaluate to true, but got ", \ + arrow::py::testing::ToString(_v), ": ", msg); \ + } \ + } + +#define ASSERT_OK(expr) \ + { \ + for (::arrow::Status _st = ::arrow::internal::GenericToStatus((expr)); !_st.ok();) \ + return Status::Invalid("`", #expr, "` failed with ", _st.ToString()); \ + } + +#define ASSERT_RAISES(code, expr) \ + { \ + for (::arrow::Status _st_expr = ::arrow::internal::GenericToStatus((expr)); \ + !_st_expr.Is##code();) \ + return Status::Invalid("Expected `", #expr, "` to fail with ", #code, \ + ", but got ", _st_expr.ToString()); \ + } + +namespace arrow { + +using internal::checked_cast; + +namespace py { +namespace testing { + +// ARROW-17938: Some standard libraries have ambiguous operator<<(nullptr_t), +// work around it using a custom printer function. 
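+// Illustrative sketch (not part of the vendored sources) of the ambiguity
+// being worked around: with an affected standard library,
+//
+//   std::stringstream ss;
+//   ss << nullptr;  // ambiguous operator<< overload, fails to compile
+//
+// whereas the assertion macros above can always write
+// arrow::py::testing::ToString(value), which falls through to operator<< for
+// streamable types and is specialized below to return "nullptr" for
+// nullptr_t.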
+ +template +std::string ToString(const T& t) { + std::stringstream ss; + ss << t; + return ss.str(); +} + +template <> +std::string ToString(const std::nullptr_t&) { + return "nullptr"; +} + +namespace { + +Status TestOwnedRefMoves() { + std::vector vec; + PyObject *u, *v; + u = PyList_New(0); + v = PyList_New(0); + + { + OwnedRef ref(u); + vec.push_back(std::move(ref)); + ASSERT_EQ(ref.obj(), nullptr); + } + vec.emplace_back(v); + ASSERT_EQ(Py_REFCNT(u), 1); + ASSERT_EQ(Py_REFCNT(v), 1); + return Status::OK(); +} + +Status TestOwnedRefNoGILMoves() { + PyAcquireGIL lock; + lock.release(); + + { + std::vector vec; + PyObject *u, *v; + { + lock.acquire(); + u = PyList_New(0); + v = PyList_New(0); + lock.release(); + } + { + OwnedRefNoGIL ref(u); + vec.push_back(std::move(ref)); + ASSERT_EQ(ref.obj(), nullptr); + } + vec.emplace_back(v); + ASSERT_EQ(Py_REFCNT(u), 1); + ASSERT_EQ(Py_REFCNT(v), 1); + return Status::OK(); + } +} + +std::string FormatPythonException(const std::string& exc_class_name) { + std::stringstream ss; + ss << "Python exception: "; + ss << exc_class_name; + return ss.str(); +} + +Status TestCheckPyErrorStatus() { + Status st; + std::string expected_detail = ""; + + auto check_error = [](Status& st, const char* expected_message = "some error", + std::string expected_detail = "") { + st = CheckPyError(); + ASSERT_EQ(st.message(), expected_message); + ASSERT_FALSE(PyErr_Occurred()); + if (expected_detail.size() > 0) { + auto detail = st.detail(); + ASSERT_NE(detail, nullptr); + ASSERT_EQ(detail->ToString(), expected_detail); + } + return Status::OK(); + }; + + for (PyObject* exc_type : {PyExc_Exception, PyExc_SyntaxError}) { + PyErr_SetString(exc_type, "some error"); + ASSERT_OK(check_error(st)); + ASSERT_TRUE(st.IsUnknownError()); + } + + PyErr_SetString(PyExc_TypeError, "some error"); + ASSERT_OK(check_error(st, "some error", FormatPythonException("TypeError"))); + ASSERT_TRUE(st.IsTypeError()); + + PyErr_SetString(PyExc_ValueError, "some error"); + ASSERT_OK(check_error(st)); + ASSERT_TRUE(st.IsInvalid()); + + PyErr_SetString(PyExc_KeyError, "some error"); + ASSERT_OK(check_error(st, "'some error'")); + ASSERT_TRUE(st.IsKeyError()); + + for (PyObject* exc_type : {PyExc_OSError, PyExc_IOError}) { + PyErr_SetString(exc_type, "some error"); + ASSERT_OK(check_error(st)); + ASSERT_TRUE(st.IsIOError()); + } + + PyErr_SetString(PyExc_NotImplementedError, "some error"); + ASSERT_OK(check_error(st, "some error", FormatPythonException("NotImplementedError"))); + ASSERT_TRUE(st.IsNotImplemented()); + + // No override if a specific status code is given + PyErr_SetString(PyExc_TypeError, "some error"); + st = CheckPyError(StatusCode::SerializationError); + ASSERT_TRUE(st.IsSerializationError()); + ASSERT_EQ(st.message(), "some error"); + ASSERT_FALSE(PyErr_Occurred()); + + return Status::OK(); +} + +Status TestCheckPyErrorStatusNoGIL() { + PyAcquireGIL lock; + { + Status st; + PyErr_SetString(PyExc_ZeroDivisionError, "zzzt"); + st = ConvertPyError(); + ASSERT_FALSE(PyErr_Occurred()); + lock.release(); + ASSERT_TRUE(st.IsUnknownError()); + ASSERT_EQ(st.message(), "zzzt"); + ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError")); + return Status::OK(); + } +} + +Status TestRestorePyErrorBasics() { + PyErr_SetString(PyExc_ZeroDivisionError, "zzzt"); + auto st = ConvertPyError(); + ASSERT_FALSE(PyErr_Occurred()); + ASSERT_TRUE(st.IsUnknownError()); + ASSERT_EQ(st.message(), "zzzt"); + ASSERT_EQ(st.detail()->ToString(), FormatPythonException("ZeroDivisionError")); + 
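+  // The exception has now been fully converted: the Python error indicator is
+  // clear and the original exception object travels inside the Status detail.
+  // RestorePyError is the inverse mapping; conceptually (illustrative, not
+  // the vendored implementation) the round trip is
+  //
+  //   Status st = ConvertPyError();   // Python exception -> arrow::Status
+  //   RestorePyError(st);             // arrow::Status -> Python exception
+  //
+  // so the PyErr_Fetch checks below should observe the very same exception
+  // type and message that were originally raised.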
+ RestorePyError(st); + ASSERT_TRUE(PyErr_Occurred()); + PyObject* exc_type; + PyObject* exc_value; + PyObject* exc_traceback; + PyErr_Fetch(&exc_type, &exc_value, &exc_traceback); + ASSERT_TRUE(PyErr_GivenExceptionMatches(exc_type, PyExc_ZeroDivisionError)); + std::string py_message; + ASSERT_OK(internal::PyObject_StdStringStr(exc_value, &py_message)); + ASSERT_EQ(py_message, "zzzt"); + + return Status::OK(); +} + +Status TestPyBufferInvalidInputObject() { + std::shared_ptr res; + PyObject* input = Py_None; + auto old_refcnt = Py_REFCNT(input); + { + Status st = PyBuffer::FromPyObject(input).status(); + ASSERT_TRUE_MSG(IsPyError(st), st.ToString()); + ASSERT_FALSE(PyErr_Occurred()); + } + ASSERT_EQ(old_refcnt, Py_REFCNT(input)); + return Status::OK(); +} + +// Because of how it is declared, the Numpy C API instance initialized +// within libarrow_python.dll may not be visible in this test under Windows +// ("unresolved external symbol arrow_ARRAY_API referenced"). +#ifndef _WIN32 +Status TestPyBufferNumpyArray() { + npy_intp dims[1] = {10}; + + OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT)); + PyObject* arr = arr_ref.obj(); + ASSERT_NE(arr, nullptr); + auto old_refcnt = Py_REFCNT(arr); + auto buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie(); + + ASSERT_TRUE(buf->is_cpu()); + ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast(arr))); + ASSERT_TRUE(buf->is_mutable()); + ASSERT_EQ(buf->mutable_data(), buf->data()); + ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); + buf.reset(); + ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); + + // Read-only + PyArray_CLEARFLAGS(reinterpret_cast(arr), NPY_ARRAY_WRITEABLE); + buf = std::move(PyBuffer::FromPyObject(arr)).ValueOrDie(); + ASSERT_TRUE(buf->is_cpu()); + ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast(arr))); + ASSERT_FALSE(buf->is_mutable()); + ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); + buf.reset(); + ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); + + return Status::OK(); +} + +Status TestNumPyBufferNumpyArray() { + npy_intp dims[1] = {10}; + + OwnedRef arr_ref(PyArray_SimpleNew(1, dims, NPY_FLOAT)); + PyObject* arr = arr_ref.obj(); + ASSERT_NE(arr, nullptr); + auto old_refcnt = Py_REFCNT(arr); + + auto buf = std::make_shared(arr); + ASSERT_TRUE(buf->is_cpu()); + ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast(arr))); + ASSERT_TRUE(buf->is_mutable()); + ASSERT_EQ(buf->mutable_data(), buf->data()); + ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); + buf.reset(); + ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); + + // Read-only + PyArray_CLEARFLAGS(reinterpret_cast(arr), NPY_ARRAY_WRITEABLE); + buf = std::make_shared(arr); + ASSERT_TRUE(buf->is_cpu()); + ASSERT_EQ(buf->data(), PyArray_DATA(reinterpret_cast(arr))); + ASSERT_FALSE(buf->is_mutable()); + ASSERT_EQ(old_refcnt + 1, Py_REFCNT(arr)); + buf.reset(); + ASSERT_EQ(old_refcnt, Py_REFCNT(arr)); + + return Status::OK(); +} +#endif + +Status TestPythonDecimalToString() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("-39402950693754869342983"); + PyObject* python_object = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + ASSERT_NE(python_object, nullptr); + + std::string string_result; + ASSERT_OK(internal::PythonDecimalToString(python_object, &string_result)); + + return Status::OK(); +} + +Status TestInferPrecisionAndScale() { + OwnedRef 
decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("-394029506937548693.42983"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + + const auto expected_precision = + static_cast(decimal_string.size() - 2); // 1 for -, 1 for . + const int32_t expected_scale = 5; + + ASSERT_EQ(expected_precision, metadata.precision()); + ASSERT_EQ(expected_scale, metadata.scale()); + + return Status::OK(); +} + +Status TestInferPrecisionAndNegativeScale() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("-3.94042983E+10"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + + const auto expected_precision = 11; + const int32_t expected_scale = 0; + + ASSERT_EQ(expected_precision, metadata.precision()); + ASSERT_EQ(expected_scale, metadata.scale()); + + return Status::OK(); +} + +Status TestInferAllLeadingZeros() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("0.001"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + ASSERT_EQ(3, metadata.precision()); + ASSERT_EQ(3, metadata.scale()); + + return Status::OK(); +} + +Status TestInferAllLeadingZerosExponentialNotationPositive() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("0.01E5"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + ASSERT_EQ(4, metadata.precision()); + ASSERT_EQ(0, metadata.scale()); + + return Status::OK(); +} + +Status TestInferAllLeadingZerosExponentialNotationNegative() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("0.01E3"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + ASSERT_EQ(2, metadata.precision()); + ASSERT_EQ(0, metadata.scale()); + + return Status::OK(); +} + +Status TestObjectBlockWriteFails() { + StringBuilder builder; + const char value[] = {'\xf1', '\0'}; + + for (int i = 0; i < 1000; ++i) { + ASSERT_OK(builder.Append(value, static_cast(strlen(value)))); + } + 
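+  // "\xf1" is a lone UTF-8 lead byte (it announces continuation bytes that
+  // never follow), so every value appended above is invalid UTF-8. When the
+  // table is converted to pandas below, decoding these bytes into Python str
+  // objects fails, which is the error path this test exercises. Equivalent
+  // check (illustrative, not part of the vendored sources):
+  //
+  //   DCHECK(!arrow::util::ValidateUTF8(
+  //       reinterpret_cast<const uint8_t*>("\xf1"), 1));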
+ std::shared_ptr arr; + ASSERT_OK(builder.Finish(&arr)); + + auto f1 = field("f1", utf8()); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", utf8()); + std::vector> fields = {f1, f2, f3}; + std::vector> cols = {arr, arr, arr}; + + auto schema = ::arrow::schema(fields); + auto table = Table::Make(schema, cols); + + Status st; + Py_BEGIN_ALLOW_THREADS; + PyObject* out; + PandasOptions options; + options.use_threads = true; + st = ConvertTableToPandas(options, table, &out); + Py_END_ALLOW_THREADS; + ASSERT_RAISES(UnknownError, st); + + return Status::OK(); +} + +Status TestMixedTypeFails() { + OwnedRef list_ref(PyList_New(3)); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + PyObject* str = PyUnicode_FromString("abc"); + ASSERT_NE(str, nullptr); + + PyObject* integer = PyLong_FromLong(1234L); + ASSERT_NE(integer, nullptr); + + PyObject* doub = PyFloat_FromDouble(123.0234); + ASSERT_NE(doub, nullptr); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, str), 0); + ASSERT_EQ(PyList_SetItem(list, 1, integer), 0); + ASSERT_EQ(PyList_SetItem(list, 2, doub), 0); + + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, {})); + + return Status::OK(); +} + +template +Status DecimalTestFromPythonDecimalRescale(std::shared_ptr type, + PyObject* python_decimal, + std::optional expected) { + DecimalValue value; + const auto& decimal_type = checked_cast(*type); + + if (expected.has_value()) { + ASSERT_OK(internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value)); + ASSERT_EQ(expected.value(), value); + + ASSERT_OK(internal::DecimalFromPyObject(python_decimal, decimal_type, &value)); + ASSERT_EQ(expected.value(), value); + } else { + ASSERT_RAISES(Invalid, internal::DecimalFromPythonDecimal(python_decimal, + decimal_type, &value)); + ASSERT_RAISES(Invalid, + internal::DecimalFromPyObject(python_decimal, decimal_type, &value)); + } + return Status::OK(); +} + +Status TestFromPythonDecimalRescaleNotTruncateable() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("1.001"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + // We fail when truncating values that would lose data if cast to a decimal type with + // lower scale + ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal128(10, 2), + python_decimal, {})); + ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal256(10, 2), + python_decimal, {})); + + return Status::OK(); +} + +Status TestFromPythonDecimalRescaleTruncateable() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("1.000"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + // We allow truncation of values that do not lose precision when dividing by 10 * the + // difference between the scales, e.g., 1.000 -> 1.00 + ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal128(10, 2), + python_decimal, 100)); + ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal256(10, 2), + python_decimal, 
100)); + + return Status::OK(); +} + +Status TestFromPythonNegativeDecimalRescale() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("-1.000"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal128(10, 9), + python_decimal, -1000000000)); + ASSERT_OK(DecimalTestFromPythonDecimalRescale(::arrow::decimal256(10, 9), + python_decimal, -1000000000)); + + return Status::OK(); +} + +Status TestDecimal128FromPythonInteger() { + Decimal128 value; + OwnedRef python_long(PyLong_FromLong(42)); + auto type = ::arrow::decimal128(10, 2); + const auto& decimal_type = checked_cast(*type); + ASSERT_OK(internal::DecimalFromPyObject(python_long.obj(), decimal_type, &value)); + ASSERT_EQ(4200, value); + return Status::OK(); +} + +Status TestDecimal256FromPythonInteger() { + Decimal256 value; + OwnedRef python_long(PyLong_FromLong(42)); + auto type = ::arrow::decimal256(10, 2); + const auto& decimal_type = checked_cast(*type); + ASSERT_OK(internal::DecimalFromPyObject(python_long.obj(), decimal_type, &value)); + ASSERT_EQ(4200, value); + return Status::OK(); +} + +Status TestDecimal128OverflowFails() { + Decimal128 value; + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("9999999999999999999999999999999999999.9"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + ASSERT_EQ(38, metadata.precision()); + ASSERT_EQ(1, metadata.scale()); + + auto type = ::arrow::decimal(38, 38); + const auto& decimal_type = checked_cast(*type); + ASSERT_RAISES(Invalid, + internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value)); + return Status::OK(); +} + +Status TestDecimal256OverflowFails() { + Decimal256 value; + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string( + "999999999999999999999999999999999999999999999999999999999999999999999999999.9"); + PyObject* python_decimal = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(python_decimal)); + ASSERT_EQ(76, metadata.precision()); + ASSERT_EQ(1, metadata.scale()); + + auto type = ::arrow::decimal(76, 76); + const auto& decimal_type = checked_cast(*type); + ASSERT_RAISES(Invalid, + internal::DecimalFromPythonDecimal(python_decimal, decimal_type, &value)); + return Status::OK(); +} + +Status TestNoneAndNaN() { + OwnedRef list_ref(PyList_New(4)); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + PyObject* constructor = 
decimal_constructor_.obj(); + PyObject* decimal_value = internal::DecimalFromString(constructor, "1.234"); + ASSERT_NE(decimal_value, nullptr); + + Py_INCREF(Py_None); + PyObject* missing_value1 = Py_None; + ASSERT_NE(missing_value1, nullptr); + + PyObject* missing_value2 = PyFloat_FromDouble(NPY_NAN); + ASSERT_NE(missing_value2, nullptr); + + PyObject* missing_value3 = internal::DecimalFromString(constructor, "nan"); + ASSERT_NE(missing_value3, nullptr); + + // This steals a reference to each object, so we don't need to decref them later, + // just the list + ASSERT_EQ(0, PyList_SetItem(list, 0, decimal_value)); + ASSERT_EQ(0, PyList_SetItem(list, 1, missing_value1)); + ASSERT_EQ(0, PyList_SetItem(list, 2, missing_value2)); + ASSERT_EQ(0, PyList_SetItem(list, 3, missing_value3)); + + PyConversionOptions options; + ASSERT_RAISES(TypeError, ConvertPySequence(list, nullptr, options)); + + options.from_pandas = true; + auto chunked = std::move(ConvertPySequence(list, nullptr, options)).ValueOrDie(); + ASSERT_EQ(chunked->num_chunks(), 1); + + auto arr = chunked->chunk(0); + ASSERT_TRUE(arr->IsValid(0)); + ASSERT_TRUE(arr->IsNull(1)); + ASSERT_TRUE(arr->IsNull(2)); + ASSERT_TRUE(arr->IsNull(3)); + + return Status::OK(); +} + +Status TestMixedPrecisionAndScale() { + std::vector strings{{"0.001", "1.01E5", "1.01E5"}}; + + OwnedRef list_ref(PyList_New(static_cast(strings.size()))); + PyObject* list = list_ref.obj(); + + ASSERT_NE(list, nullptr); + + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + // PyList_SetItem steals a reference to the item so we don't decref it later + PyObject* decimal_constructor = decimal_constructor_.obj(); + for (Py_ssize_t i = 0; i < static_cast(strings.size()); ++i) { + const int result = PyList_SetItem( + list, i, internal::DecimalFromString(decimal_constructor, strings.at(i))); + ASSERT_EQ(0, result); + } + + auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie(); + const auto& type = checked_cast(*arr->type()); + + int32_t expected_precision = 9; + int32_t expected_scale = 3; + ASSERT_EQ(expected_precision, type.precision()); + ASSERT_EQ(expected_scale, type.scale()); + + return Status::OK(); +} + +Status TestMixedPrecisionAndScaleSequenceConvert() { + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string_1("0.01"); + PyObject* value1 = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_1); + ASSERT_NE(value1, nullptr); + + std::string decimal_string_2("0.001"); + PyObject* value2 = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string_2); + ASSERT_NE(value2, nullptr); + + OwnedRef list_ref(PyList_New(2)); + PyObject* list = list_ref.obj(); + + // This steals a reference to each object, so we don't need to decref them later + // just the list + ASSERT_EQ(PyList_SetItem(list, 0, value1), 0); + ASSERT_EQ(PyList_SetItem(list, 1, value2), 0); + + auto arr = std::move(ConvertPySequence(list, nullptr, {})).ValueOrDie(); + const auto& type = checked_cast(*arr->type()); + ASSERT_EQ(3, type.precision()); + ASSERT_EQ(3, type.scale()); + + return Status::OK(); +} + +Status TestSimpleInference() { + OwnedRef decimal_constructor_; + 
OwnedRef decimal_module; + + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + + std::string decimal_string("0.01"); + PyObject* value = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + ASSERT_NE(value, nullptr); + internal::DecimalMetadata metadata; + ASSERT_OK(metadata.Update(value)); + ASSERT_EQ(2, metadata.precision()); + ASSERT_EQ(2, metadata.scale()); + + return Status::OK(); +} + +Status TestUpdateWithNaN() { + internal::DecimalMetadata metadata; + OwnedRef decimal_constructor_; + OwnedRef decimal_module; + RETURN_NOT_OK(internal::ImportModule("decimal", &decimal_module)); + RETURN_NOT_OK( + internal::ImportFromModule(decimal_module.obj(), "Decimal", &decimal_constructor_)); + std::string decimal_string("nan"); + PyObject* nan_value = + internal::DecimalFromString(decimal_constructor_.obj(), decimal_string); + + ASSERT_OK(metadata.Update(nan_value)); + ASSERT_EQ(std::numeric_limits::min(), metadata.precision()); + ASSERT_EQ(std::numeric_limits::min(), metadata.scale()); + + return Status::OK(); +} + +} // namespace + +std::vector GetCppTestCases() { + return { + {"test_owned_ref_moves", TestOwnedRefMoves}, + {"test_owned_ref_nogil_moves", TestOwnedRefNoGILMoves}, + {"test_check_pyerror_status", TestCheckPyErrorStatus}, + {"test_check_pyerror_status_nogil", TestCheckPyErrorStatusNoGIL}, + {"test_restore_pyerror_basics", TestRestorePyErrorBasics}, + {"test_pybuffer_invalid_input_object", TestPyBufferInvalidInputObject}, +#ifndef _WIN32 + {"test_pybuffer_numpy_array", TestPyBufferNumpyArray}, + {"test_numpybuffer_numpy_array", TestNumPyBufferNumpyArray}, +#endif + {"test_python_decimal_to_string", TestPythonDecimalToString}, + {"test_infer_precision_and_scale", TestInferPrecisionAndScale}, + {"test_infer_precision_and_negative_scale", TestInferPrecisionAndNegativeScale}, + {"test_infer_all_leading_zeros", TestInferAllLeadingZeros}, + {"test_infer_all_leading_zeros_exponential_notation_positive", + TestInferAllLeadingZerosExponentialNotationPositive}, + {"test_infer_all_leading_zeros_exponential_notation_negative", + TestInferAllLeadingZerosExponentialNotationNegative}, + {"test_object_block_write_fails", TestObjectBlockWriteFails}, + {"test_mixed_type_fails", TestMixedTypeFails}, + {"test_from_python_decimal_rescale_not_truncateable", + TestFromPythonDecimalRescaleNotTruncateable}, + {"test_from_python_decimal_rescale_truncateable", + TestFromPythonDecimalRescaleTruncateable}, + {"test_from_python_negative_decimal_rescale", TestFromPythonNegativeDecimalRescale}, + {"test_decimal128_from_python_integer", TestDecimal128FromPythonInteger}, + {"test_decimal256_from_python_integer", TestDecimal256FromPythonInteger}, + {"test_decimal128_overflow_fails", TestDecimal128OverflowFails}, + {"test_decimal256_overflow_fails", TestDecimal256OverflowFails}, + {"test_none_and_nan", TestNoneAndNaN}, + {"test_mixed_precision_and_scale", TestMixedPrecisionAndScale}, + {"test_mixed_precision_and_scale_sequence_convert", + TestMixedPrecisionAndScaleSequenceConvert}, + {"test_simple_inference", TestSimpleInference}, + {"test_update_with_nan", TestUpdateWithNaN}, + }; +} + +} // namespace testing +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc new file mode 100644 index 0000000..486bd84 --- /dev/null +++ 
b/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.cc @@ -0,0 +1,1240 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/python_to_arrow.h" +#include "arrow/python/numpy_interop.h" + +#include + +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_base.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_decimal.h" +#include "arrow/array/builder_dict.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/array/builder_time.h" +#include "arrow/chunked_array.h" +#include "arrow/result.h" +#include "arrow/scalar.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/converter.h" +#include "arrow/util/decimal.h" +#include "arrow/util/int_util_overflow.h" +#include "arrow/util/logging.h" + +#include "arrow/python/datetime.h" +#include "arrow/python/decimal.h" +#include "arrow/python/helpers.h" +#include "arrow/python/inference.h" +#include "arrow/python/iterators.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/type_traits.h" +#include "arrow/visit_type_inline.h" + +namespace arrow { + +using internal::checked_cast; +using internal::checked_pointer_cast; + +using internal::Converter; +using internal::DictionaryConverter; +using internal::ListConverter; +using internal::PrimitiveConverter; +using internal::StructConverter; + +using internal::MakeChunker; +using internal::MakeConverter; + +namespace py { + +namespace { +enum class MonthDayNanoField { kMonths, kWeeksAndDays, kDaysOnly, kNanoseconds }; + +template +struct MonthDayNanoTraits; + +struct MonthDayNanoAttrData { + const char* name; + const int64_t multiplier; +}; + +template <> +struct MonthDayNanoTraits { + using c_type = int32_t; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = { + {"years", 1}, {"months", /*months_in_year=*/12}, {nullptr, 0}}; + +template <> +struct MonthDayNanoTraits { + using c_type = int32_t; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = + {{"weeks", 1}, {"days", /*days_in_week=*/7}, {nullptr, 0}}; + +template <> +struct MonthDayNanoTraits { + using c_type = int32_t; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = { + {"days", 1}, {nullptr, 0}}; + +template <> +struct MonthDayNanoTraits { + using c_type = int64_t; + static const MonthDayNanoAttrData attrs[]; +}; + +const MonthDayNanoAttrData MonthDayNanoTraits::attrs[] = + {{"hours", 1}, + {"minutes", /*minutes_in_hours=*/60}, + {"seconds", 
/*seconds_in_minute=*/60}, + {"milliseconds", /*milliseconds_in_seconds*/ 1000}, + {"microseconds", /*microseconds_in_millseconds=*/1000}, + {"nanoseconds", /*nanoseconds_in_microseconds=*/1000}, + {nullptr, 0}}; + +template +struct PopulateMonthDayNano { + using Traits = MonthDayNanoTraits; + using field_c_type = typename Traits::c_type; + + static Status Field(PyObject* obj, field_c_type* out, bool* found_attrs) { + *out = 0; + for (const MonthDayNanoAttrData* attr = &Traits::attrs[0]; attr->multiplier != 0; + ++attr) { + if (attr->multiplier != 1 && + ::arrow::internal::MultiplyWithOverflow( + static_cast(attr->multiplier), *out, out)) { + return Status::Invalid("Overflow on: ", (attr - 1)->name, + " for: ", internal::PyObject_StdStringRepr(obj)); + } + + OwnedRef field_value(PyObject_GetAttrString(obj, attr->name)); + if (field_value.obj() == nullptr) { + // No attribute present, skip to the next one. + PyErr_Clear(); + continue; + } + RETURN_IF_PYERROR(); + *found_attrs = true; + field_c_type value; + RETURN_NOT_OK(internal::CIntFromPython(field_value.obj(), &value, attr->name)); + if (::arrow::internal::AddWithOverflow(*out, value, out)) { + return Status::Invalid("Overflow on: ", attr->name, + " for: ", internal::PyObject_StdStringRepr(obj)); + } + } + + return Status::OK(); + } +}; + +// Utility for converting single python objects to their intermediate C representations +// which can be fed to the typed builders +class PyValue { + public: + // Type aliases for shorter signature definitions + using I = PyObject*; + using O = PyConversionOptions; + + // Used for null checking before actually converting the values + static bool IsNull(const O& options, I obj) { + if (options.from_pandas) { + return internal::PandasObjectIsNull(obj); + } else { + return obj == Py_None; + } + } + + // Used for post-conversion numpy NaT sentinel checking + static bool IsNaT(const TimestampType*, int64_t value) { + return internal::npy_traits::isnull(value); + } + + // Used for post-conversion numpy NaT sentinel checking + static bool IsNaT(const DurationType*, int64_t value) { + return internal::npy_traits::isnull(value); + } + + static Result Convert(const NullType*, const O&, I obj) { + if (obj == Py_None) { + return nullptr; + } else { + return Status::Invalid("Invalid null value"); + } + } + + static Result Convert(const BooleanType*, const O&, I obj) { + if (obj == Py_True) { + return true; + } else if (obj == Py_False) { + return false; + } else if (PyArray_IsScalar(obj, Bool)) { + return reinterpret_cast(obj)->obval == NPY_TRUE; + } else { + return internal::InvalidValue(obj, "tried to convert to boolean"); + } + } + + template + static enable_if_integer> Convert(const T* type, const O&, + I obj) { + typename T::c_type value; + auto status = internal::CIntFromPython(obj, &value); + if (ARROW_PREDICT_TRUE(status.ok())) { + return value; + } else if (!internal::PyIntScalar_Check(obj)) { + std::stringstream ss; + ss << "tried to convert to " << type->ToString(); + return internal::InvalidValue(obj, ss.str()); + } else { + return status; + } + } + + static Result Convert(const HalfFloatType*, const O&, I obj) { + uint16_t value; + RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); + return value; + } + + static Result Convert(const FloatType*, const O&, I obj) { + float value; + if (internal::PyFloatScalar_Check(obj)) { + value = static_cast(PyFloat_AsDouble(obj)); + RETURN_IF_PYERROR(); + } else if (internal::PyIntScalar_Check(obj)) { + RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value)); + } 
else { + return internal::InvalidValue(obj, "tried to convert to float32"); + } + return value; + } + + static Result Convert(const DoubleType*, const O&, I obj) { + double value; + if (PyFloat_Check(obj)) { + value = PyFloat_AS_DOUBLE(obj); + } else if (internal::PyFloatScalar_Check(obj)) { + // Other kinds of float-y things + value = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + } else if (internal::PyIntScalar_Check(obj)) { + RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value)); + } else { + return internal::InvalidValue(obj, "tried to convert to double"); + } + return value; + } + + static Result Convert(const Decimal128Type* type, const O&, I obj) { + Decimal128 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + + static Result Convert(const Decimal256Type* type, const O&, I obj) { + Decimal256 value; + RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *type, &value)); + return value; + } + + static Result Convert(const Date32Type*, const O&, I obj) { + int32_t value; + if (PyDate_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = static_cast(internal::PyDate_to_days(pydate)); + } else { + RETURN_NOT_OK( + internal::CIntFromPython(obj, &value, "Integer too large for date32")); + } + return value; + } + + static Result Convert(const Date64Type*, const O&, I obj) { + int64_t value; + if (PyDateTime_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = internal::PyDateTime_to_ms(pydate); + // Truncate any intraday milliseconds + // TODO: introduce an option for this + value -= value % 86400000LL; + } else if (PyDate_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = internal::PyDate_to_ms(pydate); + } else { + RETURN_NOT_OK( + internal::CIntFromPython(obj, &value, "Integer too large for date64")); + } + return value; + } + + static Result Convert(const Time32Type* type, const O&, I obj) { + int32_t value; + if (PyTime_Check(obj)) { + switch (type->unit()) { + case TimeUnit::SECOND: + value = static_cast(internal::PyTime_to_s(obj)); + break; + case TimeUnit::MILLI: + value = static_cast(internal::PyTime_to_ms(obj)); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); + } + return value; + } + + static Result Convert(const Time64Type* type, const O&, I obj) { + int64_t value; + if (PyTime_Check(obj)) { + switch (type->unit()) { + case TimeUnit::MICRO: + value = internal::PyTime_to_us(obj); + break; + case TimeUnit::NANO: + value = internal::PyTime_to_ns(obj); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); + } + return value; + } + + static Result Convert(const TimestampType* type, const O& options, I obj) { + int64_t value, offset; + if (PyDateTime_Check(obj)) { + if (ARROW_PREDICT_FALSE(options.ignore_timezone)) { + offset = 0; + } else { + ARROW_ASSIGN_OR_RAISE(offset, internal::PyDateTime_utcoffset_s(obj)); + } + auto dt = reinterpret_cast(obj); + switch (type->unit()) { + case TimeUnit::SECOND: + value = internal::PyDateTime_to_s(dt) - offset; + break; + case TimeUnit::MILLI: + value = internal::PyDateTime_to_ms(dt) - offset * 1000LL; + break; + case TimeUnit::MICRO: + value = internal::PyDateTime_to_us(dt) - offset * 1000000LL; + break; + case TimeUnit::NANO: + if (internal::IsPandasTimestamp(obj)) { + // pd.Timestamp value attribute contains the 
offset from unix epoch
+            // so no adjustment for timezone is needed.
+            OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+            RETURN_IF_PYERROR();
+            RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+          } else {
+            // Conversion to nanoseconds can overflow -> check multiply of microseconds
+            value = internal::PyDateTime_to_us(dt);
+            if (arrow::internal::MultiplyWithOverflow(value, 1000LL, &value)) {
+              return internal::InvalidValue(obj,
+                                            "out of bounds for nanosecond resolution");
+            }
+
+            // Adjust with offset and check for overflow
+            if (arrow::internal::SubtractWithOverflow(value, offset * 1000000000LL,
+                                                      &value)) {
+              return internal::InvalidValue(obj,
+                                            "out of bounds for nanosecond resolution");
+            }
+          }
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else if (PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.datetime64 dtype
+      std::shared_ptr<DataType> numpy_type;
+      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.datetime64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyDatetimeScalarObject*>(obj)->obval;
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
+  }
+
+  static Result<MonthDayNanoIntervalType::MonthDayNanos> Convert(
+      const MonthDayNanoIntervalType* /*type*/, const O& /*options*/, I obj) {
+    MonthDayNanoIntervalType::MonthDayNanos output;
+    bool found_attrs = false;
+    RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kMonths>::Field(
+        obj, &output.months, &found_attrs));
+    // On relativedelta, weeks is a property calculated from days; on
+    // DateOffset it is a field of its own. timedelta doesn't have a weeks
+    // attribute.
+    PyObject* pandas_date_offset_type = internal::BorrowPandasDataOffsetType();
+    bool is_date_offset = pandas_date_offset_type == (PyObject*)Py_TYPE(obj);
+    if (!is_date_offset) {
+      RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kWeeksAndDays>::Field(
+          obj, &output.days, &found_attrs));
+    } else {
+      RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kDaysOnly>::Field(
+          obj, &output.days, &found_attrs));
+    }
+    RETURN_NOT_OK(PopulateMonthDayNano<MonthDayNanoField::kNanoseconds>::Field(
+        obj, &output.nanoseconds, &found_attrs));
+
+    // date_offset can have zero fields.
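+    // Illustrative inputs this path accepts (a sketch, not exhaustive):
+    // pandas.DateOffset(months=1), dateutil relativedelta(weeks=2, days=1),
+    // datetime.timedelta(days=3), or a plain (months, days, nanoseconds)
+    // tuple such as (1, 2, 3000000000) via the tuple fallback below.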
+    if (found_attrs || is_date_offset) {
+      return output;
+    }
+    if (PyTuple_Check(obj) && PyTuple_Size(obj) == 3) {
+      RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 0), &output.months,
+                                             "Months (tuple item #0) too large"));
+      RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 1), &output.days,
+                                             "Days (tuple item #1) too large"));
+      RETURN_NOT_OK(internal::CIntFromPython(PyTuple_GET_ITEM(obj, 2),
+                                             &output.nanoseconds,
+                                             "Nanoseconds (tuple item #2) too large"));
+      return output;
+    }
+    return Status::TypeError("No temporal attributes found on object.");
+  }
+
+  static Result<int64_t> Convert(const DurationType* type, const O&, I obj) {
+    int64_t value;
+    if (PyDelta_Check(obj)) {
+      auto dt = reinterpret_cast<PyDateTime_Delta*>(obj);
+      switch (type->unit()) {
+        case TimeUnit::SECOND:
+          value = internal::PyDelta_to_s(dt);
+          break;
+        case TimeUnit::MILLI:
+          value = internal::PyDelta_to_ms(dt);
+          break;
+        case TimeUnit::MICRO: {
+          ARROW_ASSIGN_OR_RAISE(value, internal::PyDelta_to_us(dt));
+          break;
+        }
+        case TimeUnit::NANO:
+          if (internal::IsPandasTimedelta(obj)) {
+            OwnedRef nanos(PyObject_GetAttrString(obj, "value"));
+            RETURN_IF_PYERROR();
+            RETURN_NOT_OK(internal::CIntFromPython(nanos.obj(), &value));
+          } else {
+            ARROW_ASSIGN_OR_RAISE(value, internal::PyDelta_to_ns(dt));
+          }
+          break;
+        default:
+          return Status::UnknownError("Invalid time unit");
+      }
+    } else if (PyArray_CheckAnyScalarExact(obj)) {
+      // validate that the numpy scalar has np.timedelta64 dtype
+      std::shared_ptr<DataType> numpy_type;
+      RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &numpy_type));
+      if (!numpy_type->Equals(*type)) {
+        return Status::NotImplemented("Expected np.timedelta64 but got: ",
+                                      numpy_type->ToString());
+      }
+      return reinterpret_cast<PyTimedeltaScalarObject*>(obj)->obval;
+    } else {
+      RETURN_NOT_OK(internal::CIntFromPython(obj, &value));
+    }
+    return value;
+  }
+
+  // The binary-like intermediate representation is PyBytesView because it keeps temporary
+  // python objects alive (non-contiguous memoryview) and stores whether the original
+  // object was unicode encoded or not, which is used for unicode -> bytes coercion if
+  // there is a non-unicode object observed.
+
+  static Status Convert(const BaseBinaryType*, const O&, I obj, PyBytesView& view) {
+    return view.ParseString(obj);
+  }
+
+  static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
+                        PyBytesView& view) {
+    ARROW_RETURN_NOT_OK(view.ParseString(obj));
+    if (view.size != type->byte_width()) {
+      std::stringstream ss;
+      ss << "expected to be length " << type->byte_width() << " was " << view.size;
+      return internal::InvalidValue(obj, ss.str());
+    } else {
+      return Status::OK();
+    }
+  }
+
+  template <typename T>
+  static enable_if_string<T, Status> Convert(const T*, const O& options, I obj,
+                                             PyBytesView& view) {
+    if (options.strict) {
+      // Strict conversion, force output to be unicode / utf8 and validate that
+      // any binary values are utf8
+      ARROW_RETURN_NOT_OK(view.ParseString(obj, true));
+      if (!view.is_utf8) {
+        return internal::InvalidValue(obj, "was not a utf8 string");
+      }
+      return Status::OK();
+    } else {
+      // Non-strict conversion; keep track of whether values are unicode or bytes
+      return view.ParseString(obj);
+    }
+  }
+
+  static Result Convert(const DataType* type, const O&, I obj) {
+    return Status::NotImplemented("PyValue::Convert is not implemented for type ", type);
+  }
+};
+
+// The base Converter class is a mixin with predefined behavior and constructors.
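+// A sketch of the flow these classes implement (all names appear below):
+// ConvertPySequence() infers or takes a type, builds a typed converter via
+// MakeConverter, then calls converter->Extend(seq, size), which reserves
+// capacity once and visits each element, dispatching to the type-specific
+// Append(item) overloads; converter->ToChunkedArray() produces the result.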
+class PyConverter : public Converter { + public: + // Iterate over the input values and defer the conversion to the Append method + Status Extend(PyObject* values, int64_t size, int64_t offset = 0) override { + DCHECK_GE(size, offset); + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->Reserve(size - offset)); + // Iterate over the items adding each one + return internal::VisitSequence( + values, offset, + [this](PyObject* item, bool* /* unused */) { return this->Append(item); }); + } + + // Convert and append a sequence of values masked with a numpy array + Status ExtendMasked(PyObject* values, PyObject* mask, int64_t size, + int64_t offset = 0) override { + DCHECK_GE(size, offset); + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->Reserve(size - offset)); + // Iterate over the items adding each one + return internal::VisitSequenceMasked( + values, mask, offset, [this](PyObject* item, bool is_masked, bool* /* unused */) { + if (is_masked) { + return this->AppendNull(); + } else { + // This will also apply the null-checking convention in the event + // that the value is not masked + return this->Append(item); // perhaps use AppendValue instead? + } + }); + } +}; + +template +class PyPrimitiveConverter; + +template +class PyListConverter; + +template +class PyDictionaryConverter; + +class PyStructConverter; + +template +struct PyConverterTrait; + +template +struct PyConverterTrait< + T, enable_if_t<(!is_nested_type::value && !is_interval_type::value && + !is_extension_type::value) || + std::is_same::value>> { + using type = PyPrimitiveConverter; +}; + +template +struct PyConverterTrait> { + using type = PyListConverter; +}; + +template <> +struct PyConverterTrait { + using type = PyStructConverter; +}; + +template <> +struct PyConverterTrait { + template + using dictionary_type = PyDictionaryConverter; +}; + +template +class PyPrimitiveConverter> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->primitive_builder_->AppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + if (scalar->is_valid) { + return Status::Invalid("Cannot append scalar of type ", scalar->type->ToString(), + " to builder for type null"); + } else { + return this->primitive_builder_->AppendNull(); + } + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + return this->primitive_builder_->Append(converted); + } + } +}; + +template +class PyPrimitiveConverter< + T, enable_if_t::value || is_number_type::value || + is_decimal_type::value || is_date_type::value || + is_time_type::value || + std::is_same::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + // Since the required space has been already allocated in the Extend functions we can + // rely on the Unsafe builder API which improves the performance. 
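+    // A minimal sketch of the same contract on a bare builder (illustrative
+    // only):
+    //   Int64Builder b;
+    //   RETURN_NOT_OK(b.Reserve(2));  // one up-front capacity check
+    //   b.UnsafeAppend(1);            // no per-value capacity check
+    //   b.UnsafeAppendNull();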
+ if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + this->primitive_builder_->UnsafeAppend(converted); + } + return Status::OK(); + } +}; + +template +class PyPrimitiveConverter< + T, enable_if_t::value || is_duration_type::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + } else { + ARROW_ASSIGN_OR_RAISE( + auto converted, PyValue::Convert(this->primitive_type_, this->options_, value)); + // Numpy NaT sentinels can be checked after the conversion + if (PyArray_CheckAnyScalarExact(value) && + PyValue::IsNaT(this->primitive_type_, converted)) { + this->primitive_builder_->UnsafeAppendNull(); + } else { + this->primitive_builder_->UnsafeAppend(converted); + } + } + return Status::OK(); + } +}; + +template +class PyPrimitiveConverter::value>> + : public PrimitiveConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes); + } + return Status::OK(); + } + + protected: + PyBytesView view_; +}; + +template +class PyPrimitiveConverter> + : public PrimitiveConverter { + public: + using OffsetType = typename T::offset_type; + + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + this->primitive_builder_->UnsafeAppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar)); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->primitive_type_, this->options_, value, view_)); + if (!view_.is_utf8) { + // observed binary value + observed_binary_ = true; + } + // Since we don't know the varying length input size in advance, we need to + // reserve space in the value builder one by one. ReserveData raises CapacityError + // if the value would not fit into the array. 
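+      // Note: only the offsets were pre-reserved by Extend; the variable-length
+      // data buffer still grows value by value, and the CapacityError that
+      // ReserveData raises here is what the chunker in ConvertPySequence
+      // (below) uses as the signal to close the current chunk and start a
+      // new one.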
+ ARROW_RETURN_NOT_OK(this->primitive_builder_->ReserveData(view_.size)); + this->primitive_builder_->UnsafeAppend(view_.bytes, + static_cast(view_.size)); + } + return Status::OK(); + } + + Result> ToArray() override { + ARROW_ASSIGN_OR_RAISE(auto array, (PrimitiveConverter::ToArray())); + if (observed_binary_) { + // if we saw any non-unicode, cast results to BinaryArray + auto binary_type = TypeTraits::type_singleton(); + return array->View(binary_type); + } else { + return array; + } + } + + protected: + PyBytesView view_; + bool observed_binary_ = false; +}; + +template +class PyDictionaryConverter> + : public DictionaryConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + return this->value_builder_->AppendScalar(*scalar, 1); + } else { + ARROW_ASSIGN_OR_RAISE(auto converted, + PyValue::Convert(this->value_type_, this->options_, value)); + return this->value_builder_->Append(converted); + } + } +}; + +template +class PyDictionaryConverter> + : public DictionaryConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->value_builder_->AppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + return this->value_builder_->AppendScalar(*scalar, 1); + } else { + ARROW_RETURN_NOT_OK( + PyValue::Convert(this->value_type_, this->options_, value, view_)); + return this->value_builder_->Append(view_.bytes, static_cast(view_.size)); + } + } + + protected: + PyBytesView view_; +}; + +template +class PyListConverter : public ListConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->list_builder_->AppendNull(); + } + + RETURN_NOT_OK(this->list_builder_->Append()); + if (PyArray_Check(value)) { + RETURN_NOT_OK(AppendNdarray(value)); + } else if (PySequence_Check(value)) { + RETURN_NOT_OK(AppendSequence(value)); + } else if (PySet_Check(value) || (Py_TYPE(value) == &PyDictValues_Type)) { + RETURN_NOT_OK(AppendIterable(value)); + } else if (PyDict_Check(value) && this->type()->id() == Type::MAP) { + // Branch to support Python Dict with `map` DataType. 
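+      // e.g. a Python dict {"a": 1, "b": 2} is flattened by PyDict_Items into
+      // the pair sequence [("a", 1), ("b", 2)] and appended as map entries.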
+ auto items = PyDict_Items(value); + OwnedRef item_ref(items); + RETURN_NOT_OK(AppendSequence(items)); + } else { + return internal::InvalidType( + value, "was not a sequence or recognized null for conversion to list type"); + } + + return ValidateBuilder(this->list_type_); + } + + protected: + Status ValidateBuilder(const MapType*) { + if (this->list_builder_->key_builder()->null_count() > 0) { + return Status::Invalid("Invalid Map: key field can not contain null values"); + } else { + return Status::OK(); + } + } + + Status ValidateBuilder(const BaseListType*) { return Status::OK(); } + + Status AppendSequence(PyObject* value) { + int64_t size = static_cast(PySequence_Size(value)); + RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); + return this->value_converter_->Extend(value, size); + } + + Status AppendIterable(PyObject* value) { + PyObject* iterator = PyObject_GetIter(value); + OwnedRef iter_ref(iterator); + while (PyObject* item = PyIter_Next(iterator)) { + OwnedRef item_ref(item); + RETURN_NOT_OK(this->value_converter_->Reserve(1)); + RETURN_NOT_OK(this->value_converter_->Append(item)); + } + return Status::OK(); + } + + Status AppendNdarray(PyObject* value) { + PyArrayObject* ndarray = reinterpret_cast(value); + if (PyArray_NDIM(ndarray) != 1) { + return Status::Invalid("Can only convert 1-dimensional array values"); + } + const int64_t size = PyArray_SIZE(ndarray); + RETURN_NOT_OK(this->list_builder_->ValidateOverflow(size)); + + const auto value_type = this->value_converter_->builder()->type(); + switch (value_type->id()) { +// If the value type does not match the expected NumPy dtype, then fall through +// to a slower PySequence-based path +#define LIST_FAST_CASE(TYPE_ID, TYPE, NUMPY_TYPE) \ + case Type::TYPE_ID: { \ + if (PyArray_DESCR(ndarray)->type_num != NUMPY_TYPE) { \ + return this->value_converter_->Extend(value, size); \ + } \ + return AppendNdarrayTyped(ndarray); \ + } + LIST_FAST_CASE(BOOL, BooleanType, NPY_BOOL) + LIST_FAST_CASE(UINT8, UInt8Type, NPY_UINT8) + LIST_FAST_CASE(INT8, Int8Type, NPY_INT8) + LIST_FAST_CASE(UINT16, UInt16Type, NPY_UINT16) + LIST_FAST_CASE(INT16, Int16Type, NPY_INT16) + LIST_FAST_CASE(UINT32, UInt32Type, NPY_UINT32) + LIST_FAST_CASE(INT32, Int32Type, NPY_INT32) + LIST_FAST_CASE(UINT64, UInt64Type, NPY_UINT64) + LIST_FAST_CASE(INT64, Int64Type, NPY_INT64) + LIST_FAST_CASE(HALF_FLOAT, HalfFloatType, NPY_FLOAT16) + LIST_FAST_CASE(FLOAT, FloatType, NPY_FLOAT) + LIST_FAST_CASE(DOUBLE, DoubleType, NPY_DOUBLE) + LIST_FAST_CASE(TIMESTAMP, TimestampType, NPY_DATETIME) + LIST_FAST_CASE(DURATION, DurationType, NPY_TIMEDELTA) +#undef LIST_FAST_CASE + default: { + return this->value_converter_->Extend(value, size); + } + } + } + + template + Status AppendNdarrayTyped(PyArrayObject* ndarray) { + // no need to go through the conversion + using NumpyTrait = internal::npy_traits; + using NumpyType = typename NumpyTrait::value_type; + using ValueBuilderType = typename TypeTraits::BuilderType; + + const bool null_sentinels_possible = + // Always treat Numpy's NaT as null + NUMPY_TYPE == NPY_DATETIME || NUMPY_TYPE == NPY_TIMEDELTA || + // Observing pandas's null sentinels + (this->options_.from_pandas && NumpyTrait::supports_nulls); + + auto value_builder = + checked_cast(this->value_converter_->builder().get()); + + Ndarray1DIndexer values(ndarray); + if (null_sentinels_possible) { + for (int64_t i = 0; i < values.size(); ++i) { + if (NumpyTrait::isnull(values[i])) { + RETURN_NOT_OK(value_builder->AppendNull()); + } else { + 
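+          // Not a null sentinel: append the raw value directly. (For
+          // NPY_DATETIME / NPY_TIMEDELTA the sentinel caught above is NaT,
+          // numpy's minimum-int64 marker.)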
RETURN_NOT_OK(value_builder->Append(values[i])); + } + } + } else if (!values.is_strided()) { + RETURN_NOT_OK(value_builder->AppendValues(values.data(), values.size())); + } else { + for (int64_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(value_builder->Append(values[i])); + } + } + return Status::OK(); + } +}; + +class PyStructConverter : public StructConverter { + public: + Status Append(PyObject* value) override { + if (PyValue::IsNull(this->options_, value)) { + return this->struct_builder_->AppendNull(); + } else if (arrow::py::is_scalar(value)) { + ARROW_ASSIGN_OR_RAISE(std::shared_ptr scalar, + arrow::py::unwrap_scalar(value)); + return this->struct_builder_->AppendScalar(*scalar); + } + switch (input_kind_) { + case InputKind::DICT: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendDict(value); + case InputKind::TUPLE: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendTuple(value); + case InputKind::ITEMS: + RETURN_NOT_OK(this->struct_builder_->Append()); + return AppendItems(value); + default: + RETURN_NOT_OK(InferInputKind(value)); + return Append(value); + } + } + + protected: + Status Init(MemoryPool* pool) override { + RETURN_NOT_OK((StructConverter::Init(pool))); + + // Store the field names as a PyObjects for dict matching + num_fields_ = this->struct_type_->num_fields(); + bytes_field_names_.reset(PyList_New(num_fields_)); + unicode_field_names_.reset(PyList_New(num_fields_)); + RETURN_IF_PYERROR(); + + for (int i = 0; i < num_fields_; i++) { + const auto& field_name = this->struct_type_->field(i)->name(); + PyObject* bytes = PyBytes_FromStringAndSize(field_name.c_str(), field_name.size()); + PyObject* unicode = + PyUnicode_FromStringAndSize(field_name.c_str(), field_name.size()); + RETURN_IF_PYERROR(); + PyList_SET_ITEM(bytes_field_names_.obj(), i, bytes); + PyList_SET_ITEM(unicode_field_names_.obj(), i, unicode); + } + return Status::OK(); + } + + Status InferInputKind(PyObject* value) { + // Infer input object's type, note that heterogeneous sequences are not allowed + if (PyDict_Check(value)) { + input_kind_ = InputKind::DICT; + } else if (PyTuple_Check(value)) { + input_kind_ = InputKind::TUPLE; + } else if (PySequence_Check(value)) { + input_kind_ = InputKind::ITEMS; + } else { + return internal::InvalidType(value, + "was not a dict, tuple, or recognized null value " + "for conversion to struct type"); + } + return Status::OK(); + } + + Status InferKeyKind(PyObject* items) { + for (int i = 0; i < PySequence_Length(items); i++) { + // retrieve the key from the passed key-value pairs + ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); + + // check key exists between the unicode field names + bool do_contain = PySequence_Contains(unicode_field_names_.obj(), pair.first); + RETURN_IF_PYERROR(); + if (do_contain) { + key_kind_ = KeyKind::UNICODE; + return Status::OK(); + } + + // check key exists between the bytes field names + do_contain = PySequence_Contains(bytes_field_names_.obj(), pair.first); + RETURN_IF_PYERROR(); + if (do_contain) { + key_kind_ = KeyKind::BYTES; + return Status::OK(); + } + } + return Status::OK(); + } + + Status AppendEmpty() { + for (int i = 0; i < num_fields_; i++) { + RETURN_NOT_OK(this->children_[i]->Append(Py_None)); + } + return Status::OK(); + } + + Status AppendTuple(PyObject* tuple) { + if (!PyTuple_Check(tuple)) { + return internal::InvalidType(tuple, "was expecting a tuple"); + } + if (PyTuple_GET_SIZE(tuple) != num_fields_) { + return Status::Invalid("Tuple size must be equal to number of 
struct fields"); + } + for (int i = 0; i < num_fields_; i++) { + PyObject* value = PyTuple_GET_ITEM(tuple, i); + RETURN_NOT_OK(this->children_[i]->Append(value)); + } + return Status::OK(); + } + + Status AppendDict(PyObject* dict) { + if (!PyDict_Check(dict)) { + return internal::InvalidType(dict, "was expecting a dict"); + } + switch (key_kind_) { + case KeyKind::UNICODE: + return AppendDict(dict, unicode_field_names_.obj()); + case KeyKind::BYTES: + return AppendDict(dict, bytes_field_names_.obj()); + default: + RETURN_NOT_OK(InferKeyKind(PyDict_Items(dict))); + if (key_kind_ == KeyKind::UNKNOWN) { + // was unable to infer the type which means that all keys are absent + return AppendEmpty(); + } else { + return AppendDict(dict); + } + } + } + + Status AppendItems(PyObject* items) { + if (!PySequence_Check(items)) { + return internal::InvalidType(items, "was expecting a sequence of key-value items"); + } + switch (key_kind_) { + case KeyKind::UNICODE: + return AppendItems(items, unicode_field_names_.obj()); + case KeyKind::BYTES: + return AppendItems(items, bytes_field_names_.obj()); + default: + RETURN_NOT_OK(InferKeyKind(items)); + if (key_kind_ == KeyKind::UNKNOWN) { + // was unable to infer the type which means that all keys are absent + return AppendEmpty(); + } else { + return AppendItems(items); + } + } + } + + Status AppendDict(PyObject* dict, PyObject* field_names) { + // NOTE we're ignoring any extraneous dict items + for (int i = 0; i < num_fields_; i++) { + PyObject* name = PyList_GET_ITEM(field_names, i); // borrowed + PyObject* value = PyDict_GetItem(dict, name); // borrowed + if (value == NULL) { + RETURN_IF_PYERROR(); + } + RETURN_NOT_OK(this->children_[i]->Append(value ? value : Py_None)); + } + return Status::OK(); + } + + Result> GetKeyValuePair(PyObject* seq, int index) { + PyObject* pair = PySequence_GetItem(seq, index); + RETURN_IF_PYERROR(); + if (!PyTuple_Check(pair) || PyTuple_Size(pair) != 2) { + return internal::InvalidType(pair, "was expecting tuple of (key, value) pair"); + } + PyObject* key = PyTuple_GetItem(pair, 0); + RETURN_IF_PYERROR(); + PyObject* value = PyTuple_GetItem(pair, 1); + RETURN_IF_PYERROR(); + return std::make_pair(key, value); + } + + Status AppendItems(PyObject* items, PyObject* field_names) { + auto length = static_cast(PySequence_Size(items)); + RETURN_IF_PYERROR(); + + // append the values for the defined fields + for (int i = 0; i < std::min(num_fields_, length); i++) { + // retrieve the key-value pair + ARROW_ASSIGN_OR_RAISE(auto pair, GetKeyValuePair(items, i)); + + // validate that the key and the field name are equal + PyObject* name = PyList_GET_ITEM(field_names, i); + bool are_equal = PyObject_RichCompareBool(pair.first, name, Py_EQ); + RETURN_IF_PYERROR(); + + // finally append to the respective child builder + if (are_equal) { + RETURN_NOT_OK(this->children_[i]->Append(pair.second)); + } else { + ARROW_ASSIGN_OR_RAISE(auto key_view, PyBytesView::FromString(pair.first)); + ARROW_ASSIGN_OR_RAISE(auto name_view, PyBytesView::FromString(name)); + return Status::Invalid("The expected field name is `", name_view.bytes, "` but `", + key_view.bytes, "` was given"); + } + } + // insert null values for missing fields + for (int i = length; i < num_fields_; i++) { + RETURN_NOT_OK(this->children_[i]->AppendNull()); + } + return Status::OK(); + } + + // Whether we're converting from a sequence of dicts or tuples or list of pairs + enum class InputKind { UNKNOWN, DICT, TUPLE, ITEMS } input_kind_ = InputKind::UNKNOWN; + // Whether the input 
dictionary keys' type is python bytes or unicode + enum class KeyKind { UNKNOWN, BYTES, UNICODE } key_kind_ = KeyKind::UNKNOWN; + // Store the field names as a PyObjects for dict matching + OwnedRef bytes_field_names_; + OwnedRef unicode_field_names_; + // Store the number of fields for later reuse + int num_fields_; +}; + +// Convert *obj* to a sequence if necessary +// Fill *size* to its length. If >= 0 on entry, *size* is an upper size +// bound that may lead to truncation. +Status ConvertToSequenceAndInferSize(PyObject* obj, PyObject** seq, int64_t* size) { + if (PySequence_Check(obj)) { + // obj is already a sequence + int64_t real_size = static_cast(PySequence_Size(obj)); + RETURN_IF_PYERROR(); + if (*size < 0) { + *size = real_size; + } else { + *size = std::min(real_size, *size); + } + Py_INCREF(obj); + *seq = obj; + } else if (*size < 0) { + // unknown size, exhaust iterator + *seq = PySequence_List(obj); + RETURN_IF_PYERROR(); + *size = static_cast(PyList_GET_SIZE(*seq)); + } else { + // size is known but iterator could be infinite + Py_ssize_t i, n = *size; + PyObject* iter = PyObject_GetIter(obj); + RETURN_IF_PYERROR(); + OwnedRef iter_ref(iter); + PyObject* lst = PyList_New(n); + RETURN_IF_PYERROR(); + for (i = 0; i < n; i++) { + PyObject* item = PyIter_Next(iter); + if (!item) { + // either an error occurred or the iterator ended + RETURN_IF_PYERROR(); + break; + } + PyList_SET_ITEM(lst, i, item); + } + // Shrink list if len(iterator) < size + if (i < n && PyList_SetSlice(lst, i, n, NULL)) { + Py_DECREF(lst); + RETURN_IF_PYERROR(); + } + *seq = lst; + *size = std::min(i, *size); + } + return Status::OK(); +} + +} // namespace + +Result> ConvertPySequence(PyObject* obj, PyObject* mask, + PyConversionOptions options, + MemoryPool* pool) { + PyAcquireGIL lock; + + PyObject* seq = nullptr; + OwnedRef tmp_seq_nanny; + + ARROW_ASSIGN_OR_RAISE(auto is_pandas_imported, internal::IsModuleImported("pandas")); + if (is_pandas_imported) { + // If pandas has been already imported initialize the static pandas objects to + // support converting from pd.Timedelta and pd.Timestamp objects + internal::InitPandasStaticData(); + } + + int64_t size = options.size; + RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size)); + tmp_seq_nanny.reset(seq); + + // In some cases, type inference may be "loose", like strings. If the user + // passed pa.string(), then we will error if we encounter any non-UTF8 + // value. If not, then we will allow the result to be a BinaryArray + if (options.type == nullptr) { + ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); + options.strict = false; + } else { + options.strict = true; + } + DCHECK_GE(size, 0); + + ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( + options.type, options, pool))); + if (converter->may_overflow()) { + // The converter hierarchy contains binary- or list-like builders which can overflow + // depending on the input values. Wrap the converter with a chunker which detects + // the overflow and automatically creates new chunks. 
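+    // Illustrative consequence (a sketch, assuming binary-like input): a
+    // sequence of strings whose combined size exceeds the int32 offset range
+    // of a single BinaryArray cannot fit in one chunk, so the chunker
+    // finishes the current array when CapacityError is raised and continues
+    // in a fresh one; the caller then sees a ChunkedArray with
+    // num_chunks() > 1 instead of an error.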
+ ARROW_ASSIGN_OR_RAISE(auto chunked_converter, MakeChunker(std::move(converter))); + if (mask != nullptr && mask != Py_None) { + RETURN_NOT_OK(chunked_converter->ExtendMasked(seq, mask, size)); + } else { + RETURN_NOT_OK(chunked_converter->Extend(seq, size)); + } + return chunked_converter->ToChunkedArray(); + } else { + // If the converter can't overflow spare the capacity error checking on the hot-path, + // this improves the performance roughly by ~10% for primitive types. + if (mask != nullptr && mask != Py_None) { + RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); + } else { + RETURN_NOT_OK(converter->Extend(seq, size)); + } + return converter->ToChunkedArray(); + } +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h b/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h index d737047..d167996 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/python_to_arrow.h @@ -25,9 +25,9 @@ #include #include +#include "arrow/python/visibility.h" #include "arrow/type.h" #include "arrow/util/macros.h" -#include "arrow/python/visibility.h" #include "arrow/python/common.h" diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc new file mode 100644 index 0000000..ad079cb --- /dev/null +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.cc @@ -0,0 +1,798 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/serialize.h" +#include "arrow/python/numpy_interop.h" + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "arrow/array.h" +#include "arrow/array/builder_binary.h" +#include "arrow/array/builder_nested.h" +#include "arrow/array/builder_primitive.h" +#include "arrow/array/builder_union.h" +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/util.h" +#include "arrow/ipc/writer.h" +#include "arrow/record_batch.h" +#include "arrow/result.h" +#include "arrow/tensor.h" +#include "arrow/util/logging.h" + +#include "arrow/python/common.h" +#include "arrow/python/datetime.h" +#include "arrow/python/helpers.h" +#include "arrow/python/iterators.h" +#include "arrow/python/numpy_convert.h" +#include "arrow/python/platform.h" +#include "arrow/python/pyarrow.h" + +constexpr int32_t kMaxRecursionDepth = 100; + +namespace arrow { + +using internal::checked_cast; + +namespace py { + +class SequenceBuilder; +class DictBuilder; + +Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, + int32_t recursion_depth, SerializedPyObject* blobs_out); + +// A Sequence is a heterogeneous collections of elements. 
It can contain +// scalar Python types, lists, tuples, dictionaries, tensors and sparse tensors. +class SequenceBuilder { + public: + explicit SequenceBuilder(MemoryPool* pool = default_memory_pool()) + : pool_(pool), + types_(::arrow::int8(), pool), + offsets_(::arrow::int32(), pool), + type_map_(PythonType::NUM_PYTHON_TYPES, -1) { + auto null_builder = std::make_shared(pool); + auto initial_ty = dense_union({field("0", null())}); + builder_.reset(new DenseUnionBuilder(pool, {null_builder}, initial_ty)); + } + + // Appending a none to the sequence + Status AppendNone() { return builder_->AppendNull(); } + + template + Status CreateAndUpdate(std::shared_ptr* child_builder, int8_t tag, + MakeBuilderFn make_builder) { + if (!*child_builder) { + child_builder->reset(make_builder()); + std::ostringstream convert; + convert.imbue(std::locale::classic()); + convert << static_cast(tag); + type_map_[tag] = builder_->AppendChild(*child_builder, convert.str()); + } + return builder_->Append(type_map_[tag]); + } + + template + Status AppendPrimitive(std::shared_ptr* child_builder, const T val, + int8_t tag) { + RETURN_NOT_OK( + CreateAndUpdate(child_builder, tag, [this]() { return new BuilderType(pool_); })); + return (*child_builder)->Append(val); + } + + // Appending a boolean to the sequence + Status AppendBool(const bool data) { + return AppendPrimitive(&bools_, data, PythonType::BOOL); + } + + // Appending an int64_t to the sequence + Status AppendInt64(const int64_t data) { + return AppendPrimitive(&ints_, data, PythonType::INT); + } + + // Append a list of bytes to the sequence + Status AppendBytes(const uint8_t* data, int32_t length) { + RETURN_NOT_OK(CreateAndUpdate(&bytes_, PythonType::BYTES, + [this]() { return new BinaryBuilder(pool_); })); + return bytes_->Append(data, length); + } + + // Appending a string to the sequence + Status AppendString(const char* data, int32_t length) { + RETURN_NOT_OK(CreateAndUpdate(&strings_, PythonType::STRING, + [this]() { return new StringBuilder(pool_); })); + return strings_->Append(data, length); + } + + // Appending a half_float to the sequence + Status AppendHalfFloat(const npy_half data) { + return AppendPrimitive(&half_floats_, data, PythonType::HALF_FLOAT); + } + + // Appending a float to the sequence + Status AppendFloat(const float data) { + return AppendPrimitive(&floats_, data, PythonType::FLOAT); + } + + // Appending a double to the sequence + Status AppendDouble(const double data) { + return AppendPrimitive(&doubles_, data, PythonType::DOUBLE); + } + + // Appending a Date64 timestamp to the sequence + Status AppendDate64(const int64_t timestamp) { + return AppendPrimitive(&date64s_, timestamp, PythonType::DATE64); + } + + // Appending a tensor to the sequence + // + // \param tensor_index Index of the tensor in the object. + Status AppendTensor(const int32_t tensor_index) { + RETURN_NOT_OK(CreateAndUpdate(&tensor_indices_, PythonType::TENSOR, + [this]() { return new Int32Builder(pool_); })); + return tensor_indices_->Append(tensor_index); + } + + // Appending a sparse coo tensor to the sequence + // + // \param sparse_coo_tensor_index Index of the sparse coo tensor in the object. 
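+  //
+  // As with AppendTensor above, only an int32 index is appended inline; the
+  // tensor payload itself is carried out-of-band in SerializedPyObject
+  // (Append pushes it onto blobs_out->sparse_tensors), which keeps the
+  // union array small.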
+ Status AppendSparseCOOTensor(const int32_t sparse_coo_tensor_index) { + RETURN_NOT_OK(CreateAndUpdate(&sparse_coo_tensor_indices_, + PythonType::SPARSECOOTENSOR, + [this]() { return new Int32Builder(pool_); })); + return sparse_coo_tensor_indices_->Append(sparse_coo_tensor_index); + } + + // Appending a sparse csr matrix to the sequence + // + // \param sparse_csr_matrix_index Index of the sparse csr matrix in the object. + Status AppendSparseCSRMatrix(const int32_t sparse_csr_matrix_index) { + RETURN_NOT_OK(CreateAndUpdate(&sparse_csr_matrix_indices_, + PythonType::SPARSECSRMATRIX, + [this]() { return new Int32Builder(pool_); })); + return sparse_csr_matrix_indices_->Append(sparse_csr_matrix_index); + } + + // Appending a sparse csc matrix to the sequence + // + // \param sparse_csc_matrix_index Index of the sparse csc matrix in the object. + Status AppendSparseCSCMatrix(const int32_t sparse_csc_matrix_index) { + RETURN_NOT_OK(CreateAndUpdate(&sparse_csc_matrix_indices_, + PythonType::SPARSECSCMATRIX, + [this]() { return new Int32Builder(pool_); })); + return sparse_csc_matrix_indices_->Append(sparse_csc_matrix_index); + } + + // Appending a sparse csf tensor to the sequence + // + // \param sparse_csf_tensor_index Index of the sparse csf tensor in the object. + Status AppendSparseCSFTensor(const int32_t sparse_csf_tensor_index) { + RETURN_NOT_OK(CreateAndUpdate(&sparse_csf_tensor_indices_, + PythonType::SPARSECSFTENSOR, + [this]() { return new Int32Builder(pool_); })); + return sparse_csf_tensor_indices_->Append(sparse_csf_tensor_index); + } + + // Appending a numpy ndarray to the sequence + // + // \param tensor_index Index of the tensor in the object. + Status AppendNdarray(const int32_t ndarray_index) { + RETURN_NOT_OK(CreateAndUpdate(&ndarray_indices_, PythonType::NDARRAY, + [this]() { return new Int32Builder(pool_); })); + return ndarray_indices_->Append(ndarray_index); + } + + // Appending a buffer to the sequence + // + // \param buffer_index Index of the buffer in the object. + Status AppendBuffer(const int32_t buffer_index) { + RETURN_NOT_OK(CreateAndUpdate(&buffer_indices_, PythonType::BUFFER, + [this]() { return new Int32Builder(pool_); })); + return buffer_indices_->Append(buffer_index); + } + + Status AppendSequence(PyObject* context, PyObject* sequence, int8_t tag, + std::shared_ptr& target_sequence, + std::unique_ptr& values, int32_t recursion_depth, + SerializedPyObject* blobs_out) { + if (recursion_depth >= kMaxRecursionDepth) { + return Status::NotImplemented( + "This object exceeds the maximum recursion depth. 
It may contain itself " + "recursively."); + } + RETURN_NOT_OK(CreateAndUpdate(&target_sequence, tag, [this, &values]() { + values.reset(new SequenceBuilder(pool_)); + return new ListBuilder(pool_, values->builder()); + })); + RETURN_NOT_OK(target_sequence->Append()); + return internal::VisitIterable( + sequence, [&](PyObject* obj, bool* keep_going /* unused */) { + return Append(context, obj, values.get(), recursion_depth, blobs_out); + }); + } + + Status AppendList(PyObject* context, PyObject* list, int32_t recursion_depth, + SerializedPyObject* blobs_out) { + return AppendSequence(context, list, PythonType::LIST, lists_, list_values_, + recursion_depth + 1, blobs_out); + } + + Status AppendTuple(PyObject* context, PyObject* tuple, int32_t recursion_depth, + SerializedPyObject* blobs_out) { + return AppendSequence(context, tuple, PythonType::TUPLE, tuples_, tuple_values_, + recursion_depth + 1, blobs_out); + } + + Status AppendSet(PyObject* context, PyObject* set, int32_t recursion_depth, + SerializedPyObject* blobs_out) { + return AppendSequence(context, set, PythonType::SET, sets_, set_values_, + recursion_depth + 1, blobs_out); + } + + Status AppendDict(PyObject* context, PyObject* dict, int32_t recursion_depth, + SerializedPyObject* blobs_out); + + // Finish building the sequence and return the result. + // Input arrays may be nullptr + Status Finish(std::shared_ptr* out) { return builder_->Finish(out); } + + std::shared_ptr builder() { return builder_; } + + private: + MemoryPool* pool_; + + Int8Builder types_; + Int32Builder offsets_; + + /// Mapping from PythonType to child index + std::vector type_map_; + + std::shared_ptr bools_; + std::shared_ptr ints_; + std::shared_ptr bytes_; + std::shared_ptr strings_; + std::shared_ptr half_floats_; + std::shared_ptr floats_; + std::shared_ptr doubles_; + std::shared_ptr date64s_; + + std::unique_ptr list_values_; + std::shared_ptr lists_; + std::unique_ptr dict_values_; + std::shared_ptr dicts_; + std::unique_ptr tuple_values_; + std::shared_ptr tuples_; + std::unique_ptr set_values_; + std::shared_ptr sets_; + + std::shared_ptr tensor_indices_; + std::shared_ptr sparse_coo_tensor_indices_; + std::shared_ptr sparse_csr_matrix_indices_; + std::shared_ptr sparse_csc_matrix_indices_; + std::shared_ptr sparse_csf_tensor_indices_; + std::shared_ptr ndarray_indices_; + std::shared_ptr buffer_indices_; + + std::shared_ptr builder_; +}; + +// Constructing dictionaries of key/value pairs. Sequences of +// keys and values are built separately using a pair of +// SequenceBuilders. The resulting Arrow representation +// can be obtained via the Finish method. +class DictBuilder { + public: + explicit DictBuilder(MemoryPool* pool = nullptr) : keys_(pool), vals_(pool) { + builder_.reset(new StructBuilder(struct_({field("keys", dense_union(FieldVector{})), + field("vals", dense_union(FieldVector{}))}), + pool, {keys_.builder(), vals_.builder()})); + } + + // Builder for the keys of the dictionary + SequenceBuilder& keys() { return keys_; } + // Builder for the values of the dictionary + SequenceBuilder& vals() { return vals_; } + + // Construct an Arrow StructArray representing the dictionary. + // Contains a field "keys" for the keys and "vals" for the values. 
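+  //
+  // Illustration (shape inferred from the builders above): a Python dict
+  // such as {"k": 1} becomes one entry of a
+  // list<struct<keys: dense_union<...>, vals: dense_union<...>>>, each
+  // union growing a child per Python type actually observed.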
+ Status Finish(std::shared_ptr* out) { return builder_->Finish(out); } + + std::shared_ptr builder() { return builder_; } + + private: + SequenceBuilder keys_; + SequenceBuilder vals_; + std::shared_ptr builder_; +}; + +Status SequenceBuilder::AppendDict(PyObject* context, PyObject* dict, + int32_t recursion_depth, + SerializedPyObject* blobs_out) { + if (recursion_depth >= kMaxRecursionDepth) { + return Status::NotImplemented( + "This object exceeds the maximum recursion depth. It may contain itself " + "recursively."); + } + RETURN_NOT_OK(CreateAndUpdate(&dicts_, PythonType::DICT, [this]() { + dict_values_.reset(new DictBuilder(pool_)); + return new ListBuilder(pool_, dict_values_->builder()); + })); + RETURN_NOT_OK(dicts_->Append()); + PyObject* key; + PyObject* value; + Py_ssize_t pos = 0; + while (PyDict_Next(dict, &pos, &key, &value)) { + RETURN_NOT_OK(dict_values_->builder()->Append()); + RETURN_NOT_OK( + Append(context, key, &dict_values_->keys(), recursion_depth + 1, blobs_out)); + RETURN_NOT_OK( + Append(context, value, &dict_values_->vals(), recursion_depth + 1, blobs_out)); + } + + // This block is used to decrement the reference counts of the results + // returned by the serialization callback, which is called in AppendArray, + // in DeserializeDict and in Append + static PyObject* py_type = PyUnicode_FromString("_pytype_"); + if (PyDict_Contains(dict, py_type)) { + // If the dictionary contains the key "_pytype_", then the user has to + // have registered a callback. + if (context == Py_None) { + return Status::Invalid("No serialization callback set"); + } + Py_XDECREF(dict); + } + return Status::OK(); +} + +Status CallCustomCallback(PyObject* context, PyObject* method_name, PyObject* elem, + PyObject** result) { + if (context == Py_None) { + *result = NULL; + return Status::SerializationError("error while calling callback on ", + internal::PyObject_StdStringRepr(elem), + ": handler not registered"); + } else { + *result = PyObject_CallMethodObjArgs(context, method_name, elem, NULL); + return CheckPyError(); + } +} + +Status CallSerializeCallback(PyObject* context, PyObject* value, + PyObject** serialized_object) { + OwnedRef method_name(PyUnicode_FromString("_serialize_callback")); + RETURN_NOT_OK(CallCustomCallback(context, method_name.obj(), value, serialized_object)); + if (!PyDict_Check(*serialized_object)) { + return Status::TypeError("serialization callback must return a valid dictionary"); + } + return Status::OK(); +} + +Status CallDeserializeCallback(PyObject* context, PyObject* value, + PyObject** deserialized_object) { + OwnedRef method_name(PyUnicode_FromString("_deserialize_callback")); + return CallCustomCallback(context, method_name.obj(), value, deserialized_object); +} + +Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* builder, + int32_t recursion_depth, SerializedPyObject* blobs_out); + +template +Status AppendIntegerScalar(PyObject* obj, SequenceBuilder* builder) { + int64_t value = reinterpret_cast(obj)->obval; + return builder->AppendInt64(value); +} + +// Append a potentially 64-bit wide unsigned Numpy scalar. +// Must check for overflow as we reinterpret it as signed int64. 
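+// For example, np.uint64(2**63) reinterprets to 9223372036854775808, one
+// past INT64_MAX (9223372036854775807), so it is rejected here, while
+// np.uint64(2**63 - 1) still round-trips through AppendInt64.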
+template +Status AppendLargeUnsignedScalar(PyObject* obj, SequenceBuilder* builder) { + constexpr uint64_t max_value = std::numeric_limits::max(); + + uint64_t value = reinterpret_cast(obj)->obval; + if (value > max_value) { + return Status::Invalid("cannot serialize Numpy uint64 scalar >= 2**63"); + } + return builder->AppendInt64(static_cast(value)); +} + +Status AppendScalar(PyObject* obj, SequenceBuilder* builder) { + if (PyArray_IsScalar(obj, Bool)) { + return builder->AppendBool(reinterpret_cast(obj)->obval != 0); + } else if (PyArray_IsScalar(obj, Half)) { + return builder->AppendHalfFloat(reinterpret_cast(obj)->obval); + } else if (PyArray_IsScalar(obj, Float)) { + return builder->AppendFloat(reinterpret_cast(obj)->obval); + } else if (PyArray_IsScalar(obj, Double)) { + return builder->AppendDouble(reinterpret_cast(obj)->obval); + } + if (PyArray_IsScalar(obj, Byte)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, Short)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, Int)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, Long)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, LongLong)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, Int64)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, UByte)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, UShort)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, UInt)) { + return AppendIntegerScalar(obj, builder); + } else if (PyArray_IsScalar(obj, ULong)) { + return AppendLargeUnsignedScalar(obj, builder); + } else if (PyArray_IsScalar(obj, ULongLong)) { + return AppendLargeUnsignedScalar(obj, builder); + } else if (PyArray_IsScalar(obj, UInt64)) { + return AppendLargeUnsignedScalar(obj, builder); + } + return Status::NotImplemented("Numpy scalar type not recognized"); +} + +Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder, + int32_t recursion_depth, SerializedPyObject* blobs_out) { + // The bool case must precede the int case (PyInt_Check passes for bools) + if (PyBool_Check(elem)) { + RETURN_NOT_OK(builder->AppendBool(elem == Py_True)); + } else if (PyArray_DescrFromScalar(elem)->type_num == NPY_HALF) { + npy_half halffloat = reinterpret_cast(elem)->obval; + RETURN_NOT_OK(builder->AppendHalfFloat(halffloat)); + } else if (PyFloat_Check(elem)) { + RETURN_NOT_OK(builder->AppendDouble(PyFloat_AS_DOUBLE(elem))); + } else if (PyLong_Check(elem)) { + int overflow = 0; + int64_t data = PyLong_AsLongLongAndOverflow(elem, &overflow); + if (!overflow) { + RETURN_NOT_OK(builder->AppendInt64(data)); + } else { + // Attempt to serialize the object using the custom callback. 
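+      // This branch handles integers outside int64 range, e.g. 2**100:
+      // PyLong_AsLongLongAndOverflow sets the overflow flag instead of
+      // raising, and the value is routed through the context's
+      // _serialize_callback.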
+      PyObject* serialized_object;
+      // The reference count of serialized_object will be decremented in SerializeDict
+      RETURN_NOT_OK(CallSerializeCallback(context, elem, &serialized_object));
+      RETURN_NOT_OK(
+          builder->AppendDict(context, serialized_object, recursion_depth, blobs_out));
+    }
+  } else if (PyBytes_Check(elem)) {
+    auto data = reinterpret_cast<uint8_t*>(PyBytes_AS_STRING(elem));
+    int32_t size = -1;
+    RETURN_NOT_OK(internal::CastSize(PyBytes_GET_SIZE(elem), &size));
+    RETURN_NOT_OK(builder->AppendBytes(data, size));
+  } else if (PyUnicode_Check(elem)) {
+    ARROW_ASSIGN_OR_RAISE(auto view, PyBytesView::FromUnicode(elem));
+    int32_t size = -1;
+    RETURN_NOT_OK(internal::CastSize(view.size, &size));
+    RETURN_NOT_OK(builder->AppendString(view.bytes, size));
+  } else if (PyList_CheckExact(elem)) {
+    RETURN_NOT_OK(builder->AppendList(context, elem, recursion_depth, blobs_out));
+  } else if (PyDict_CheckExact(elem)) {
+    RETURN_NOT_OK(builder->AppendDict(context, elem, recursion_depth, blobs_out));
+  } else if (PyTuple_CheckExact(elem)) {
+    RETURN_NOT_OK(builder->AppendTuple(context, elem, recursion_depth, blobs_out));
+  } else if (PySet_Check(elem)) {
+    RETURN_NOT_OK(builder->AppendSet(context, elem, recursion_depth, blobs_out));
+  } else if (PyArray_IsScalar(elem, Generic)) {
+    RETURN_NOT_OK(AppendScalar(elem, builder));
+  } else if (PyArray_CheckExact(elem)) {
+    RETURN_NOT_OK(AppendArray(context, reinterpret_cast<PyArrayObject*>(elem), builder,
+                              recursion_depth, blobs_out));
+  } else if (elem == Py_None) {
+    RETURN_NOT_OK(builder->AppendNone());
+  } else if (PyDateTime_Check(elem)) {
+    PyDateTime_DateTime* datetime = reinterpret_cast<PyDateTime_DateTime*>(elem);
+    RETURN_NOT_OK(builder->AppendDate64(internal::PyDateTime_to_us(datetime)));
+  } else if (is_buffer(elem)) {
+    RETURN_NOT_OK(builder->AppendBuffer(static_cast<int32_t>(blobs_out->buffers.size())));
+    ARROW_ASSIGN_OR_RAISE(auto buffer, unwrap_buffer(elem));
+    blobs_out->buffers.push_back(buffer);
+  } else if (is_tensor(elem)) {
+    RETURN_NOT_OK(builder->AppendTensor(static_cast<int32_t>(blobs_out->tensors.size())));
+    ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_tensor(elem));
+    blobs_out->tensors.push_back(tensor);
+  } else if (is_sparse_coo_tensor(elem)) {
+    RETURN_NOT_OK(builder->AppendSparseCOOTensor(
+        static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+    ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_sparse_coo_tensor(elem));
+    blobs_out->sparse_tensors.push_back(tensor);
+  } else if (is_sparse_csr_matrix(elem)) {
+    RETURN_NOT_OK(builder->AppendSparseCSRMatrix(
+        static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+    ARROW_ASSIGN_OR_RAISE(auto matrix, unwrap_sparse_csr_matrix(elem));
+    blobs_out->sparse_tensors.push_back(matrix);
+  } else if (is_sparse_csc_matrix(elem)) {
+    RETURN_NOT_OK(builder->AppendSparseCSCMatrix(
+        static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+    ARROW_ASSIGN_OR_RAISE(auto matrix, unwrap_sparse_csc_matrix(elem));
+    blobs_out->sparse_tensors.push_back(matrix);
+  } else if (is_sparse_csf_tensor(elem)) {
+    RETURN_NOT_OK(builder->AppendSparseCSFTensor(
+        static_cast<int32_t>(blobs_out->sparse_tensors.size())));
+    ARROW_ASSIGN_OR_RAISE(auto tensor, unwrap_sparse_csf_tensor(elem));
+    blobs_out->sparse_tensors.push_back(tensor);
+  } else {
+    // Attempt to serialize the object using the custom callback.
+    PyObject* serialized_object;
+    // The reference count of serialized_object will be decremented in SerializeDict
+    RETURN_NOT_OK(CallSerializeCallback(context, elem, &serialized_object));
+    RETURN_NOT_OK(
+        builder->AppendDict(context, serialized_object, recursion_depth, blobs_out));
+  }
+  return Status::OK();
+}
+
+Status AppendArray(PyObject* context, PyArrayObject* array, SequenceBuilder* builder,
+                   int32_t recursion_depth, SerializedPyObject* blobs_out) {
+  int dtype = PyArray_TYPE(array);
+  switch (dtype) {
+    case NPY_UINT8:
+    case NPY_INT8:
+    case NPY_UINT16:
+    case NPY_INT16:
+    case NPY_UINT32:
+    case NPY_INT32:
+    case NPY_UINT64:
+    case NPY_INT64:
+    case NPY_HALF:
+    case NPY_FLOAT:
+    case NPY_DOUBLE: {
+      RETURN_NOT_OK(
+          builder->AppendNdarray(static_cast<int32_t>(blobs_out->ndarrays.size())));
+      std::shared_ptr<Tensor> tensor;
+      RETURN_NOT_OK(NdarrayToTensor(default_memory_pool(),
+                                    reinterpret_cast<PyObject*>(array), {}, &tensor));
+      blobs_out->ndarrays.push_back(tensor);
+    } break;
+    default: {
+      PyObject* serialized_object;
+      // The reference count of serialized_object will be decremented in SerializeDict
+      RETURN_NOT_OK(CallSerializeCallback(context, reinterpret_cast<PyObject*>(array),
+                                          &serialized_object));
+      RETURN_NOT_OK(builder->AppendDict(context, serialized_object, recursion_depth + 1,
+                                        blobs_out));
+    }
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<RecordBatch> MakeBatch(std::shared_ptr<Array> data) {
+  auto field = std::make_shared<Field>("list", data->type());
+  auto schema = ::arrow::schema({field});
+  return RecordBatch::Make(schema, data->length(), {data});
+}
+
+Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out) {
+  PyAcquireGIL lock;
+  SequenceBuilder builder;
+  RETURN_NOT_OK(internal::VisitIterable(
+      sequence, [&](PyObject* obj, bool* keep_going /* unused */) {
+        return Append(context, obj, &builder, 0, out);
+      }));
+  std::shared_ptr<Array> array;
+  RETURN_NOT_OK(builder.Finish(&array));
+  out->batch = MakeBatch(array);
+  return Status::OK();
+}
+
+Status SerializeNdarray(std::shared_ptr<Tensor> tensor, SerializedPyObject* out) {
+  std::shared_ptr<Array> array;
+  SequenceBuilder builder;
+  RETURN_NOT_OK(builder.AppendNdarray(static_cast<int32_t>(out->ndarrays.size())));
+  out->ndarrays.push_back(tensor);
+  RETURN_NOT_OK(builder.Finish(&array));
+  out->batch = MakeBatch(array);
+  return Status::OK();
+}
+
+Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,
+                          const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
+                          io::OutputStream* dst) {
+  auto empty_tensor = std::make_shared<Tensor>(
+      dtype, std::make_shared<Buffer>(nullptr, tensor_num_bytes), shape);
+  SerializedPyObject serialized_tensor;
+  RETURN_NOT_OK(SerializeNdarray(empty_tensor, &serialized_tensor));
+  return serialized_tensor.WriteTo(dst);
+}
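+// Rough layout of the stream produced by WriteTo below (alignment padding
+// elided):
+//
+//   int32 num_tensors | int32 num_sparse_tensors | int32 num_ndarrays |
+//   int32 num_buffers | record batch (8-byte aligned) |
+//   tensors, sparse tensors and ndarrays (each 64-byte aligned) |
+//   buffers, each prefixed by its int64 size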
+SerializedPyObject::SerializedPyObject()
+    : ipc_options(ipc::IpcWriteOptions::Defaults()) {}
+
+Status SerializedPyObject::WriteTo(io::OutputStream* dst) {
+  int32_t num_tensors = static_cast<int32_t>(this->tensors.size());
+  int32_t num_sparse_tensors = static_cast<int32_t>(this->sparse_tensors.size());
+  int32_t num_ndarrays = static_cast<int32_t>(this->ndarrays.size());
+  int32_t num_buffers = static_cast<int32_t>(this->buffers.size());
+  RETURN_NOT_OK(
+      dst->Write(reinterpret_cast<const uint8_t*>(&num_tensors), sizeof(int32_t)));
+  RETURN_NOT_OK(
+      dst->Write(reinterpret_cast<const uint8_t*>(&num_sparse_tensors), sizeof(int32_t)));
+  RETURN_NOT_OK(
+      dst->Write(reinterpret_cast<const uint8_t*>(&num_ndarrays), sizeof(int32_t)));
+  RETURN_NOT_OK(
+      dst->Write(reinterpret_cast<const uint8_t*>(&num_buffers), sizeof(int32_t)));
+
+  // Align stream to 8-byte offset
+  RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kArrowIpcAlignment));
+  RETURN_NOT_OK(ipc::WriteRecordBatchStream({this->batch}, this->ipc_options, dst));
+
+  // Align stream to 64-byte offset so tensor bodies are 64-byte aligned
+  RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+
+  int32_t metadata_length;
+  int64_t body_length;
+  for (const auto& tensor : this->tensors) {
+    RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length));
+    RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+  }
+
+  for (const auto& sparse_tensor : this->sparse_tensors) {
+    RETURN_NOT_OK(
+        ipc::WriteSparseTensor(*sparse_tensor, dst, &metadata_length, &body_length));
+    RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+  }
+
+  for (const auto& tensor : this->ndarrays) {
+    RETURN_NOT_OK(ipc::WriteTensor(*tensor, dst, &metadata_length, &body_length));
+    RETURN_NOT_OK(ipc::AlignStream(dst, ipc::kTensorAlignment));
+  }
+
+  for (const auto& buffer : this->buffers) {
+    int64_t size = buffer->size();
+    RETURN_NOT_OK(dst->Write(reinterpret_cast<const uint8_t*>(&size), sizeof(int64_t)));
+    RETURN_NOT_OK(dst->Write(buffer->data(), size));
+  }
+
+  return Status::OK();
+}
+
+namespace {
+
+Status CountSparseTensors(
+    const std::vector<std::shared_ptr<SparseTensor>>& sparse_tensors, PyObject** out) {
+  OwnedRef num_sparse_tensors(PyDict_New());
+  size_t num_coo = 0;
+  size_t num_csr = 0;
+  size_t num_csc = 0;
+  size_t num_csf = 0;
+  size_t ndim_csf = 0;
+
+  for (const auto& sparse_tensor : sparse_tensors) {
+    switch (sparse_tensor->format_id()) {
+      case SparseTensorFormat::COO:
+        ++num_coo;
+        break;
+      case SparseTensorFormat::CSR:
+        ++num_csr;
+        break;
+      case SparseTensorFormat::CSC:
+        ++num_csc;
+        break;
+      case SparseTensorFormat::CSF:
+        ++num_csf;
+        ndim_csf += sparse_tensor->ndim();
+        break;
+    }
+  }
+
+  PyDict_SetItemString(num_sparse_tensors.obj(), "coo", PyLong_FromSize_t(num_coo));
+  PyDict_SetItemString(num_sparse_tensors.obj(), "csr", PyLong_FromSize_t(num_csr));
+  PyDict_SetItemString(num_sparse_tensors.obj(), "csc", PyLong_FromSize_t(num_csc));
+  PyDict_SetItemString(num_sparse_tensors.obj(), "csf", PyLong_FromSize_t(num_csf));
+  PyDict_SetItemString(num_sparse_tensors.obj(), "ndim_csf", PyLong_FromSize_t(ndim_csf));
+  RETURN_IF_PYERROR();
+
+  *out = num_sparse_tensors.detach();
+  return Status::OK();
+}
+
+}  // namespace
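+// GetComponents (below) exposes the same payload to Python piecewise rather
+// than as one stream: a dict holding the counts ("num_tensors",
+// "num_sparse_tensors", "num_ndarrays", "num_buffers") and "data", a list of
+// pyarrow-wrapped buffers (IPC metadata and bodies) built via PushBuffer.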
+Status SerializedPyObject::GetComponents(MemoryPool* memory_pool, PyObject** out) {
+  PyAcquireGIL py_gil;
+
+  OwnedRef result(PyDict_New());
+  PyObject* buffers = PyList_New(0);
+  PyObject* num_sparse_tensors = nullptr;
+
+  // TODO(wesm): Not sure how pedantic we need to be about checking the return
+  // values of these functions. There are other places where we do not check
+  // PyDict_SetItem/SetItemString return value, but these failures would be
+  // quite esoteric
+  PyDict_SetItemString(result.obj(), "num_tensors",
+                       PyLong_FromSize_t(this->tensors.size()));
+  RETURN_NOT_OK(CountSparseTensors(this->sparse_tensors, &num_sparse_tensors));
+  PyDict_SetItemString(result.obj(), "num_sparse_tensors", num_sparse_tensors);
+  PyDict_SetItemString(result.obj(), "ndim_csf", num_sparse_tensors);
+  PyDict_SetItemString(result.obj(), "num_ndarrays",
+                       PyLong_FromSize_t(this->ndarrays.size()));
+  PyDict_SetItemString(result.obj(), "num_buffers",
+                       PyLong_FromSize_t(this->buffers.size()));
+  PyDict_SetItemString(result.obj(), "data", buffers);
+  RETURN_IF_PYERROR();
+
+  Py_DECREF(buffers);
+
+  auto PushBuffer = [&buffers](const std::shared_ptr<Buffer>& buffer) {
+    PyObject* wrapped_buffer = wrap_buffer(buffer);
+    RETURN_IF_PYERROR();
+    if (PyList_Append(buffers, wrapped_buffer) < 0) {
+      Py_DECREF(wrapped_buffer);
+      RETURN_IF_PYERROR();
+    }
+    Py_DECREF(wrapped_buffer);
+    return Status::OK();
+  };
+
+  constexpr int64_t kInitialCapacity = 1024;
+
+  // Write the record batch describing the object structure
+  py_gil.release();
+  ARROW_ASSIGN_OR_RAISE(auto stream,
+                        io::BufferOutputStream::Create(kInitialCapacity, memory_pool));
+  RETURN_NOT_OK(
+      ipc::WriteRecordBatchStream({this->batch}, this->ipc_options, stream.get()));
+  ARROW_ASSIGN_OR_RAISE(auto buffer, stream->Finish());
+  py_gil.acquire();
+
+  RETURN_NOT_OK(PushBuffer(buffer));
+
+  // For each tensor, get a metadata buffer and a buffer for the body
+  for (const auto& tensor : this->tensors) {
+    ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ipc::Message> message,
+                          ipc::GetTensorMessage(*tensor, memory_pool));
+    RETURN_NOT_OK(PushBuffer(message->metadata()));
+    RETURN_NOT_OK(PushBuffer(message->body()));
+  }
+
+  // For each sparse tensor, get a metadata buffer and buffers containing index and data
+  for (const auto& sparse_tensor : this->sparse_tensors) {
+    ipc::IpcPayload payload;
+    RETURN_NOT_OK(ipc::GetSparseTensorPayload(*sparse_tensor, memory_pool, &payload));
+    RETURN_NOT_OK(PushBuffer(payload.metadata));
+    for (const auto& body : payload.body_buffers) {
+      RETURN_NOT_OK(PushBuffer(body));
+    }
+  }
+
+  // For each ndarray, get a metadata buffer and a buffer for the body
+  for (const auto& ndarray : this->ndarrays) {
+    ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ipc::Message> message,
+                          ipc::GetTensorMessage(*ndarray, memory_pool));
+    RETURN_NOT_OK(PushBuffer(message->metadata()));
+    RETURN_NOT_OK(PushBuffer(message->body()));
+  }
+
+  for (const auto& buf : this->buffers) {
+    RETURN_NOT_OK(PushBuffer(buf));
+  }
+
+  *out = result.detach();
+  return Status::OK();
+}
+
+}  // namespace py
+}  // namespace arrow
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h b/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h
index e9fd843..fd207d3 100644
--- a/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/serialize.h
@@ -21,9 +21,9 @@
 #include <memory>
 
 #include "arrow/ipc/options.h"
+#include "arrow/python/visibility.h"
 #include "arrow/sparse_tensor.h"
 #include "arrow/status.h"
-#include "arrow/python/visibility.h"
 
 // Forward declaring PyObject, see
 // https://mail.python.org/pipermail/python-dev/2003-August/037601.html
diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc b/src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc
new file mode 100644
index 0000000..435c89f
--- /dev/null
+++ b/src/vendored/apache-arrow-12.0.1/arrow/python/udf.cc
@@ -0,0 +1,736 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/python/udf.h"
+
+#include "arrow/array/builder_base.h"
+#include "arrow/buffer_builder.h"
+#include "arrow/compute/api_aggregate.h"
+#include "arrow/compute/api_vector.h"
+#include "arrow/compute/function.h"
+#include "arrow/compute/kernel.h"
+#include "arrow/compute/row/grouper.h"
+#include "arrow/python/common.h"
+#include "arrow/table.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/logging.h"
+
+namespace arrow {
+using compute::ExecSpan;
+using compute::Grouper;
+using compute::KernelContext;
+using compute::KernelState;
+using internal::checked_cast;
+
+namespace py {
+namespace {
+
+struct PythonUdfKernelState : public compute::KernelState {
+  explicit PythonUdfKernelState(std::shared_ptr<OwnedRefNoGIL> function)
+      : function(function) {
+    Py_INCREF(function->obj());
+  }
+
+  // function needs to be destroyed at process exit
+  // and Python may no longer be initialized.
+  ~PythonUdfKernelState() {
+    if (_Py_IsFinalizing()) {
+      function->detach();
+    }
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+};
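+// Note on the destructor above: when the interpreter is already finalizing,
+// detach() deliberately leaks the reference, since calling Py_DECREF at that
+// point could touch a dead interpreter. The init and aggregator structs
+// below repeat the same pattern.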
+
+struct PythonUdfKernelInit {
+  explicit PythonUdfKernelInit(std::shared_ptr<OwnedRefNoGIL> function)
+      : function(function) {
+    Py_INCREF(function->obj());
+  }
+
+  // function needs to be destroyed at process exit
+  // and Python may no longer be initialized.
+  ~PythonUdfKernelInit() {
+    if (_Py_IsFinalizing()) {
+      function->detach();
+    }
+  }
+
+  Result<std::unique_ptr<compute::KernelState>> operator()(
+      compute::KernelContext*, const compute::KernelInitArgs&) {
+    return std::make_unique<PythonUdfKernelState>(function);
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+};
+
+struct ScalarUdfAggregator : public compute::KernelState {
+  virtual Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) = 0;
+  virtual Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) = 0;
+  virtual Status Finalize(compute::KernelContext* ctx, Datum* out) = 0;
+};
+
+struct HashUdfAggregator : public compute::KernelState {
+  virtual Status Resize(KernelContext* ctx, int64_t size) = 0;
+  virtual Status Consume(KernelContext* ctx, const ExecSpan& batch) = 0;
+  virtual Status Merge(KernelContext* ct, KernelState&& other, const ArrayData&) = 0;
+  virtual Status Finalize(KernelContext* ctx, Datum* out) = 0;
+};
+
+arrow::Status AggregateUdfConsume(compute::KernelContext* ctx,
+                                  const compute::ExecSpan& batch) {
+  return checked_cast<ScalarUdfAggregator*>(ctx->state())->Consume(ctx, batch);
+}
+
+arrow::Status AggregateUdfMerge(compute::KernelContext* ctx, compute::KernelState&& src,
+                                compute::KernelState* dst) {
+  return checked_cast<ScalarUdfAggregator*>(dst)->MergeFrom(ctx, std::move(src));
+}
+
+arrow::Status AggregateUdfFinalize(compute::KernelContext* ctx, arrow::Datum* out) {
+  return checked_cast<ScalarUdfAggregator*>(ctx->state())->Finalize(ctx, out);
+}
+
+arrow::Status HashAggregateUdfResize(KernelContext* ctx, int64_t size) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())->Resize(ctx, size);
+}
+
+arrow::Status HashAggregateUdfConsume(KernelContext* ctx, const ExecSpan& batch) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())->Consume(ctx, batch);
+}
+
+arrow::Status HashAggregateUdfMerge(KernelContext* ctx, KernelState&& src,
+                                    const ArrayData& group_id_mapping) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())
+      ->Merge(ctx, std::move(src), group_id_mapping);
+}
+
+arrow::Status HashAggregateUdfFinalize(KernelContext* ctx, Datum* out) {
+  return checked_cast<HashUdfAggregator*>(ctx->state())->Finalize(ctx, out);
+}
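+// These thin adapters are what the aggregate kernels below actually store:
+// compute::ScalarAggregateKernel and compute::HashAggregateKernel accept
+// plain consume/merge/finalize (plus resize for the hash variant) callbacks,
+// so each adapter just downcasts ctx->state() to the UDF aggregator and
+// forwards the call.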
+
+struct PythonTableUdfKernelInit {
+  PythonTableUdfKernelInit(std::shared_ptr<OwnedRefNoGIL> function_maker,
+                           UdfWrapperCallback cb)
+      : function_maker(function_maker), cb(cb) {
+    Py_INCREF(function_maker->obj());
+  }
+
+  // function needs to be destroyed at process exit
+  // and Python may no longer be initialized.
+  ~PythonTableUdfKernelInit() {
+    if (_Py_IsFinalizing()) {
+      function_maker->detach();
+    }
+  }
+
+  Result<std::unique_ptr<compute::KernelState>> operator()(
+      compute::KernelContext* ctx, const compute::KernelInitArgs&) {
+    UdfContext udf_context{ctx->memory_pool(), /*batch_length=*/0};
+    std::unique_ptr<OwnedRefNoGIL> function;
+    RETURN_NOT_OK(SafeCallIntoPython([this, &udf_context, &function] {
+      OwnedRef empty_tuple(PyTuple_New(0));
+      function = std::make_unique<OwnedRefNoGIL>(
+          cb(function_maker->obj(), udf_context, empty_tuple.obj()));
+      RETURN_NOT_OK(CheckPyError());
+      return Status::OK();
+    }));
+    if (!PyCallable_Check(function->obj())) {
+      return Status::TypeError("Expected a callable Python object.");
+    }
+    return std::make_unique<PythonUdfKernelState>(std::move(function));
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function_maker;
+  UdfWrapperCallback cb;
+};
+
+struct PythonUdfScalarAggregatorImpl : public ScalarUdfAggregator {
+  PythonUdfScalarAggregatorImpl(std::shared_ptr<OwnedRefNoGIL> function,
+                                UdfWrapperCallback cb,
+                                std::vector<std::shared_ptr<DataType>> input_types,
+                                std::shared_ptr<DataType> output_type)
+      : function(function), cb(std::move(cb)), output_type(std::move(output_type)) {
+    Py_INCREF(function->obj());
+    std::vector<std::shared_ptr<Field>> fields;
+    for (size_t i = 0; i < input_types.size(); i++) {
+      fields.push_back(field("", input_types[i]));
+    }
+    input_schema = schema(std::move(fields));
+  };
+
+  ~PythonUdfScalarAggregatorImpl() override {
+    if (_Py_IsFinalizing()) {
+      function->detach();
+    }
+  }
+
+  Status Consume(compute::KernelContext* ctx, const compute::ExecSpan& batch) override {
+    ARROW_ASSIGN_OR_RAISE(
+        auto rb, batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool()));
+    values.push_back(std::move(rb));
+    return Status::OK();
+  }
+
+  Status MergeFrom(compute::KernelContext* ctx, compute::KernelState&& src) override {
+    auto& other_values = checked_cast<PythonUdfScalarAggregatorImpl&>(src).values;
+    values.insert(values.end(), std::make_move_iterator(other_values.begin()),
+                  std::make_move_iterator(other_values.end()));
+
+    other_values.erase(other_values.begin(), other_values.end());
+    return Status::OK();
+  }
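+  // In effect this aggregator is non-decomposable: Consume and MergeFrom only
+  // buffer record batches, and the Python callable runs exactly once, over
+  // the concatenated input, in Finalize below.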
+
+  Status Finalize(compute::KernelContext* ctx, Datum* out) override {
+    auto state =
+        arrow::internal::checked_cast<PythonUdfScalarAggregatorImpl*>(ctx->state());
+    const int num_args = input_schema->num_fields();
+
+    // Note: The way that batches are concatenated together
+    // would result in using double amount of the memory.
+    // This is OK for now because non decomposable aggregate
+    // UDF is supposed to be used with segmented aggregation
+    // where the size of the segment is more or less constant
+    // so doubling that is not a big deal. This can be also
+    // improved in the future to use more efficient way to
+    // concatenate.
+    ARROW_ASSIGN_OR_RAISE(auto table,
+                          arrow::Table::FromRecordBatches(input_schema, values));
+    ARROW_ASSIGN_OR_RAISE(table, table->CombineChunks(ctx->memory_pool()));
+    UdfContext udf_context{ctx->memory_pool(), table->num_rows()};
+
+    if (table->num_rows() == 0) {
+      return Status::Invalid("Finalized is called with empty inputs");
+    }
+
+    RETURN_NOT_OK(SafeCallIntoPython([&] {
+      std::unique_ptr<OwnedRef> result;
+      OwnedRef arg_tuple(PyTuple_New(num_args));
+      RETURN_NOT_OK(CheckPyError());
+
+      for (int arg_id = 0; arg_id < num_args; arg_id++) {
+        // Since we combined chunks there is only one chunk
+        std::shared_ptr<Array> c_data = table->column(arg_id)->chunk(0);
+        PyObject* data = wrap_array(c_data);
+        PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+      }
+      result =
+          std::make_unique<OwnedRef>(cb(function->obj(), udf_context, arg_tuple.obj()));
+      RETURN_NOT_OK(CheckPyError());
+      // unwrapping the output for expected output type
+      if (is_scalar(result->obj())) {
+        ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> val, unwrap_scalar(result->obj()));
+        if (*output_type != *val->type) {
+          return Status::TypeError("Expected output datatype ", output_type->ToString(),
+                                   ", but function returned datatype ",
+                                   val->type->ToString());
+        }
+        out->value = std::move(val);
+        return Status::OK();
+      }
+      return Status::TypeError("Unexpected output type: ",
+                               Py_TYPE(result->obj())->tp_name, " (expected Scalar)");
+    }));
+    return Status::OK();
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+  UdfWrapperCallback cb;
+  std::vector<std::shared_ptr<RecordBatch>> values;
+  std::shared_ptr<Schema> input_schema;
+  std::shared_ptr<DataType> output_type;
+};
+
+struct PythonUdfHashAggregatorImpl : public HashUdfAggregator {
+  PythonUdfHashAggregatorImpl(std::shared_ptr<OwnedRefNoGIL> function,
+                              UdfWrapperCallback cb,
+                              std::vector<std::shared_ptr<DataType>> input_types,
+                              std::shared_ptr<DataType> output_type)
+      : function(function), cb(std::move(cb)), output_type(std::move(output_type)) {
+    Py_INCREF(function->obj());
+    std::vector<std::shared_ptr<Field>> fields;
+    fields.reserve(input_types.size());
+    for (size_t i = 0; i < input_types.size(); i++) {
+      fields.push_back(field("", input_types[i]));
+    }
+    input_schema = schema(std::move(fields));
+  };
+
+  ~PythonUdfHashAggregatorImpl() override {
+    if (_Py_IsFinalizing()) {
+      function->detach();
+    }
+  }
+
+  // same as ApplyGroupings in partition.cc
+  // replicated the code here to avoid complicating the dependencies
+  static Result<RecordBatchVector> ApplyGroupings(
+      const ListArray& groupings, const std::shared_ptr<RecordBatch>& batch) {
+    ARROW_ASSIGN_OR_RAISE(Datum sorted,
+                          compute::Take(batch, groupings.data()->child_data[0]));
+
+    const auto& sorted_batch = *sorted.record_batch();
+
+    RecordBatchVector out(static_cast<size_t>(groupings.length()));
+    for (size_t i = 0; i < out.size(); ++i) {
+      out[i] = sorted_batch.Slice(groupings.value_offset(i), groupings.value_length(i));
+    }
+
+    return out;
+  }
+
+  Status Resize(KernelContext* ctx, int64_t new_num_groups) {
+    // We only need to change num_groups in resize
+    // similar to other hash aggregate kernels
+    num_groups = new_num_groups;
+    return Status::OK();
+  }
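+  // Note: Grouper::MakeGroupings (used in Finalize below) produces a
+  // ListArray whose i-th slot holds the row indices of group i;
+  // ApplyGroupings above then gathers those rows with compute::Take and
+  // slices out one sub-batch per group.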
+
+  Status Consume(KernelContext* ctx, const ExecSpan& batch) {
+    ARROW_ASSIGN_OR_RAISE(
+        std::shared_ptr<RecordBatch> rb,
+        batch.ToExecBatch().ToRecordBatch(input_schema, ctx->memory_pool()));
+
+    // This is similar to GroupedListImpl
+    // last array is the group id
+    const ArraySpan& groups_array_data = batch[batch.num_values() - 1].array;
+    DCHECK_EQ(groups_array_data.offset, 0);
+    int64_t batch_num_values = groups_array_data.length;
+    const auto* batch_groups = groups_array_data.GetValues<uint32_t>(1);
+    RETURN_NOT_OK(groups.Append(batch_groups, batch_num_values));
+    values.push_back(std::move(rb));
+    num_values += batch_num_values;
+    return Status::OK();
+  }
+  Status Merge(KernelContext* ctx, KernelState&& other_state,
+               const ArrayData& group_id_mapping) {
+    // This is similar to GroupedListImpl
+    auto& other = checked_cast<PythonUdfHashAggregatorImpl&>(other_state);
+    auto& other_values = other.values;
+    const uint32_t* other_raw_groups = other.groups.data();
+    values.insert(values.end(), std::make_move_iterator(other_values.begin()),
+                  std::make_move_iterator(other_values.end()));
+
+    auto g = group_id_mapping.GetValues<uint32_t>(1);
+    for (uint32_t other_g = 0; static_cast<int64_t>(other_g) < other.num_values;
+         ++other_g) {
+      // Different state can have different group_id mappings, so we
+      // need to translate the ids
+      RETURN_NOT_OK(groups.Append(g[other_raw_groups[other_g]]));
+    }
+
+    num_values += other.num_values;
+    return Status::OK();
+  }
+
+  Status Finalize(KernelContext* ctx, Datum* out) {
+    // Exclude the last column which is the group id
+    const int num_args = input_schema->num_fields() - 1;
+
+    ARROW_ASSIGN_OR_RAISE(auto groups_buffer, groups.Finish());
+    ARROW_ASSIGN_OR_RAISE(auto groupings,
+                          Grouper::MakeGroupings(UInt32Array(num_values, groups_buffer),
+                                                 static_cast<uint32_t>(num_groups)));
+
+    ARROW_ASSIGN_OR_RAISE(auto table,
+                          arrow::Table::FromRecordBatches(input_schema, values));
+    ARROW_ASSIGN_OR_RAISE(auto rb, table->CombineChunksToBatch(ctx->memory_pool()));
+    UdfContext udf_context{ctx->memory_pool(), table->num_rows()};
+
+    if (rb->num_rows() == 0) {
+      *out = Datum();
+      return Status::OK();
+    }
+
+    ARROW_ASSIGN_OR_RAISE(RecordBatchVector rbs, ApplyGroupings(*groupings, rb));
+
+    return SafeCallIntoPython([&] {
+      ARROW_ASSIGN_OR_RAISE(std::unique_ptr<ArrayBuilder> builder,
+                            MakeBuilder(output_type, ctx->memory_pool()));
+      for (auto& group_rb : rbs) {
+        std::unique_ptr<OwnedRef> result;
+        OwnedRef arg_tuple(PyTuple_New(num_args));
+        RETURN_NOT_OK(CheckPyError());
+
+        for (int arg_id = 0; arg_id < num_args; arg_id++) {
+          // Since we combined chunks there is only one chunk
+          std::shared_ptr<Array> c_data = group_rb->column(arg_id);
+          PyObject* data = wrap_array(c_data);
+          PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+        }
+
+        result =
+            std::make_unique<OwnedRef>(cb(function->obj(), udf_context, arg_tuple.obj()));
+        RETURN_NOT_OK(CheckPyError());
+
+        // unwrapping the output for expected output type
+        if (is_scalar(result->obj())) {
+          ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> val,
+                                unwrap_scalar(result->obj()));
+          if (*output_type != *val->type) {
+            return Status::TypeError("Expected output datatype ", output_type->ToString(),
+                                     ", but function returned datatype ",
+                                     val->type->ToString());
+          }
+          ARROW_RETURN_NOT_OK(builder->AppendScalar(std::move(*val)));
+        } else {
+          return Status::TypeError("Unexpected output type: ",
+                                   Py_TYPE(result->obj())->tp_name, " (expected Scalar)");
+        }
+      }
+      ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+      out->value = std::move(result->data());
+      return Status::OK();
+    });
+  }
+
+  std::shared_ptr<OwnedRefNoGIL> function;
+  UdfWrapperCallback cb;
+  // Accumulated input batches
+  std::vector<std::shared_ptr<RecordBatch>> values;
+  // Group ids - extracted from the last column from the batch
+  TypedBufferBuilder<uint32_t> groups;
+  int64_t num_groups = 0;
+  int64_t num_values = 0;
+  std::shared_ptr<Schema> input_schema;
+  std::shared_ptr<DataType> output_type;
+};
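+// Taken together, the hash-aggregate Finalize path is: stitch the buffered
+// batches into one record batch, regroup its rows via MakeGroupings and
+// ApplyGroupings, invoke the Python callable once per group, and append each
+// returned Scalar to an ArrayBuilder whose Finish() becomes the output array.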
+
+struct PythonUdf : public PythonUdfKernelState {
+  PythonUdf(std::shared_ptr<OwnedRefNoGIL> function, UdfWrapperCallback cb,
+            std::vector<TypeHolder> input_types, compute::OutputType output_type)
+      : PythonUdfKernelState(function),
+        cb(cb),
+        input_types(input_types),
+        output_type(output_type) {}
+
+  UdfWrapperCallback cb;
+  std::vector<TypeHolder> input_types;
+  compute::OutputType output_type;
+  TypeHolder resolved_type;
+
+  Result<TypeHolder> ResolveType(compute::KernelContext* ctx,
+                                 const std::vector<TypeHolder>& types) {
+    if (input_types == types) {
+      if (!resolved_type) {
+        ARROW_ASSIGN_OR_RAISE(resolved_type, output_type.Resolve(ctx, input_types));
+      }
+      return resolved_type;
+    }
+    return output_type.Resolve(ctx, types);
+  }
+
+  Status Exec(compute::KernelContext* ctx, const compute::ExecSpan& batch,
+              compute::ExecResult* out) {
+    auto state = arrow::internal::checked_cast<PythonUdfKernelState*>(ctx->state());
+    std::shared_ptr<OwnedRefNoGIL>& function = state->function;
+    const int num_args = batch.num_values();
+    UdfContext udf_context{ctx->memory_pool(), batch.length};
+
+    OwnedRef arg_tuple(PyTuple_New(num_args));
+    RETURN_NOT_OK(CheckPyError());
+    for (int arg_id = 0; arg_id < num_args; arg_id++) {
+      if (batch[arg_id].is_scalar()) {
+        std::shared_ptr<Scalar> c_data = batch[arg_id].scalar->GetSharedPtr();
+        PyObject* data = wrap_scalar(c_data);
+        PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+      } else {
+        std::shared_ptr<Array> c_data = batch[arg_id].array.ToArray();
+        PyObject* data = wrap_array(c_data);
+        PyTuple_SetItem(arg_tuple.obj(), arg_id, data);
+      }
+    }
+
+    OwnedRef result(cb(function->obj(), udf_context, arg_tuple.obj()));
+    RETURN_NOT_OK(CheckPyError());
+    // unwrapping the output for expected output type
+    if (is_array(result.obj())) {
+      ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Array> val, unwrap_array(result.obj()));
+      ARROW_ASSIGN_OR_RAISE(TypeHolder type, ResolveType(ctx, batch.GetTypes()));
+      if (type.type == NULLPTR) {
+        return Status::TypeError("expected output datatype is null");
+      }
+      if (*type.type != *val->type()) {
+        return Status::TypeError("Expected output datatype ", type.type->ToString(),
+                                 ", but function returned datatype ",
+                                 val->type()->ToString());
+      }
+      out->value = std::move(val->data());
+      return Status::OK();
+    } else {
+      return Status::TypeError("Unexpected output type: ", Py_TYPE(result.obj())->tp_name,
+                               " (expected Array)");
+    }
+    return Status::OK();
+  }
+};
+
+Status PythonUdfExec(compute::KernelContext* ctx, const compute::ExecSpan& batch,
+                     compute::ExecResult* out) {
+  auto udf = static_cast<PythonUdf*>(ctx->kernel()->data.get());
+  return SafeCallIntoPython([&]() -> Status { return udf->Exec(ctx, batch, out); });
+}
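+// PythonUdfExec is the function pointer stored in the scalar kernel: it
+// recovers the PythonUdf instance from kernel->data and re-enters the
+// interpreter via SafeCallIntoPython, which ensures the GIL is held for the
+// duration of the callback.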
+
+Status RegisterUdf(PyObject* user_function, compute::KernelInit kernel_init,
+                   UdfWrapperCallback wrapper, const UdfOptions& options,
+                   compute::FunctionRegistry* registry) {
+  if (!PyCallable_Check(user_function)) {
+    return Status::TypeError("Expected a callable Python object.");
+  }
+  auto scalar_func = std::make_shared<compute::ScalarFunction>(
+      options.func_name, options.arity, options.func_doc);
+  Py_INCREF(user_function);
+  std::vector<compute::InputType> input_types;
+  for (const auto& in_dtype : options.input_types) {
+    input_types.emplace_back(in_dtype);
+  }
+  compute::OutputType output_type(options.output_type);
+  auto udf_data = std::make_shared<PythonUdf>(
+      std::make_shared<OwnedRefNoGIL>(user_function), wrapper,
+      TypeHolder::FromTypes(options.input_types), options.output_type);
+  compute::ScalarKernel kernel(
+      compute::KernelSignature::Make(std::move(input_types), std::move(output_type),
+                                     options.arity.is_varargs),
+      PythonUdfExec, kernel_init);
+  kernel.data = std::move(udf_data);
+
+  kernel.mem_allocation = compute::MemAllocation::NO_PREALLOCATE;
+  kernel.null_handling = compute::NullHandling::COMPUTED_NO_PREALLOCATE;
+  RETURN_NOT_OK(scalar_func->AddKernel(std::move(kernel)));
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+  RETURN_NOT_OK(registry->AddFunction(std::move(scalar_func)));
+  return Status::OK();
+}
+
+}  // namespace
+
+Status RegisterScalarFunction(PyObject* function, UdfWrapperCallback cb,
+                              const UdfOptions& options,
+                              compute::FunctionRegistry* registry) {
+  return RegisterUdf(function,
+                     PythonUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function)}, cb,
+                     options, registry);
+}
+
+Status RegisterTabularFunction(PyObject* function, UdfWrapperCallback cb,
+                               const UdfOptions& options,
+                               compute::FunctionRegistry* registry) {
+  if (options.arity.num_args != 0 || options.arity.is_varargs) {
+    return Status::NotImplemented("tabular function of non-null arity");
+  }
+  if (options.output_type->id() != Type::type::STRUCT) {
+    return Status::Invalid("tabular function with non-struct output");
+  }
+  return RegisterUdf(
+      function, PythonTableUdfKernelInit{std::make_shared<OwnedRefNoGIL>(function), cb},
+      cb, options, registry);
+}
+
+Status RegisterScalarAggregateFunction(PyObject* function, UdfWrapperCallback cb,
+                                       const UdfOptions& options,
+                                       compute::FunctionRegistry* registry) {
+  if (!PyCallable_Check(function)) {
+    return Status::TypeError("Expected a callable Python object.");
+  }
+
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+
+  // Py_INCREF here so that once a function is registered
+  // its refcount gets increased by 1 and doesn't get gced
+  // if all existing refs are gone
+  Py_INCREF(function);
+
+  static auto default_scalar_aggregate_options =
+      compute::ScalarAggregateOptions::Defaults();
+  auto aggregate_func = std::make_shared<compute::ScalarAggregateFunction>(
+      options.func_name, options.arity, options.func_doc,
+      &default_scalar_aggregate_options);
+
+  std::vector<compute::InputType> input_types;
+  for (const auto& in_dtype : options.input_types) {
+    input_types.emplace_back(in_dtype);
+  }
+  compute::OutputType output_type(options.output_type);
+
+  compute::KernelInit init = [cb, function, options](compute::KernelContext* ctx,
+                                                     const compute::KernelInitArgs& args)
+      -> Result<std::unique_ptr<compute::KernelState>> {
+    return std::make_unique<PythonUdfScalarAggregatorImpl>(
+        std::make_shared<OwnedRefNoGIL>(function), cb, options.input_types,
+        options.output_type);
+  };
+
+  auto sig = compute::KernelSignature::Make(
+      std::move(input_types), std::move(output_type), options.arity.is_varargs);
+  compute::ScalarAggregateKernel kernel(std::move(sig), std::move(init),
+                                        AggregateUdfConsume, AggregateUdfMerge,
+                                        AggregateUdfFinalize, /*ordered=*/false);
+  RETURN_NOT_OK(aggregate_func->AddKernel(std::move(kernel)));
+  RETURN_NOT_OK(registry->AddFunction(std::move(aggregate_func)));
+  return Status::OK();
+}
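+// Note: the kernel init lambda above captures the raw PyObject* by value;
+// together with the Py_INCREF it keeps the Python callable alive as long as
+// the registry references the function, and every kernel instantiation takes
+// its own OwnedRefNoGIL reference.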
+
+/// \brief Create a new UdfOptions with adjustment for hash kernel
+/// \param options User provided udf options
+UdfOptions AdjustForHashAggregate(const UdfOptions& options) {
+  UdfOptions hash_options;
+  // Append hash_ before the function name to separate it from the scalar
+  // version
+  hash_options.func_name = "hash_" + options.func_name;
+  // Extend input types with group id. Group id is appended by the group
+  // aggregation node. Here we change both arity and input types
+  if (options.arity.is_varargs) {
+    hash_options.arity = options.arity;
+  } else {
+    hash_options.arity = compute::Arity(options.arity.num_args + 1, false);
+  }
+  // Changing the function doc shouldn't be necessary because group id
+  // is not user visible, however, this is currently needed to pass the
+  // function validation. The name group_id_array is consistent with
+  // hash kernels in hash_aggregate.cc
+  hash_options.func_doc = options.func_doc;
+  hash_options.func_doc.arg_names.emplace_back("group_id_array");
+  std::vector<std::shared_ptr<DataType>> input_dtypes = options.input_types;
+  input_dtypes.emplace_back(uint32());
+  hash_options.input_types = std::move(input_dtypes);
+  hash_options.output_type = options.output_type;
+  return hash_options;
+}
+
+Status RegisterHashAggregateFunction(PyObject* function, UdfWrapperCallback cb,
+                                     const UdfOptions& options,
+                                     compute::FunctionRegistry* registry) {
+  if (!PyCallable_Check(function)) {
+    return Status::TypeError("Expected a callable Python object.");
+  }
+
+  if (registry == NULLPTR) {
+    registry = compute::GetFunctionRegistry();
+  }
+
+  // Py_INCREF here so that once a function is registered
+  // its refcount gets increased by 1 and doesn't get gced
+  // if all existing refs are gone
+  Py_INCREF(function);
+  UdfOptions hash_options = AdjustForHashAggregate(options);
+
+  std::vector<compute::InputType> input_types;
+  for (const auto& in_dtype : hash_options.input_types) {
+    input_types.emplace_back(in_dtype);
+  }
+  compute::OutputType output_type(hash_options.output_type);
+
+  static auto default_hash_aggregate_options =
+      compute::ScalarAggregateOptions::Defaults();
+  auto hash_aggregate_func = std::make_shared<compute::HashAggregateFunction>(
+      hash_options.func_name, hash_options.arity, hash_options.func_doc,
+      &default_hash_aggregate_options);
+
+  compute::KernelInit init = [function, cb, hash_options](
+                                 compute::KernelContext* ctx,
+                                 const compute::KernelInitArgs& args)
+      -> Result<std::unique_ptr<compute::KernelState>> {
+    return std::make_unique<PythonUdfHashAggregatorImpl>(
+        std::make_shared<OwnedRefNoGIL>(function), cb, hash_options.input_types,
+        hash_options.output_type);
+  };
+
+  auto sig = compute::KernelSignature::Make(
+      std::move(input_types), std::move(output_type), hash_options.arity.is_varargs);
+
+  compute::HashAggregateKernel kernel(
+      std::move(sig), std::move(init), HashAggregateUdfResize, HashAggregateUdfConsume,
+      HashAggregateUdfMerge, HashAggregateUdfFinalize, /*ordered=*/false);
+  RETURN_NOT_OK(hash_aggregate_func->AddKernel(std::move(kernel)));
+  RETURN_NOT_OK(registry->AddFunction(std::move(hash_aggregate_func)));
+  return Status::OK();
+}
+
+Status RegisterAggregateFunction(PyObject* function, UdfWrapperCallback cb,
+                                 const UdfOptions& options,
+                                 compute::FunctionRegistry* registry) {
+  RETURN_NOT_OK(RegisterScalarAggregateFunction(function, cb, options, registry));
+  RETURN_NOT_OK(RegisterHashAggregateFunction(function, cb, options, registry));
+
+  return Status::OK();
+}
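+// RegisterAggregateFunction thus registers two compute functions per UDF:
+// "<name>" as a scalar aggregate and "hash_<name>" (with the extra
+// group_id_array argument) for grouped aggregation, mirroring the built-in
+// aggregate kernels.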
Status::Invalid("tabular kernel of non-fixed kind"); + } + auto datatype = out_type.type(); + if (datatype->id() != Type::type::STRUCT) { + return Status::Invalid("tabular kernel with non-struct output"); + } + auto struct_type = arrow::internal::checked_cast(datatype.get()); + auto schema = ::arrow::schema(struct_type->fields()); + std::vector in_types; + ARROW_ASSIGN_OR_RAISE(auto func_exec, + GetFunctionExecutor(func_name, in_types, NULLPTR, registry)); + auto next_func = [schema, func_exec = std::move( + func_exec)]() -> Result> { + std::vector args; + // passed_length of -1 or 0 with args.size() of 0 leads to an empty ExecSpanIterator + // in exec.cc and to never invoking the source function, so 1 is passed instead + // TODO: GH-33612: Support batch size in user-defined tabular functions + ARROW_ASSIGN_OR_RAISE(auto datum, func_exec->Execute(args, /*passed_length=*/1)); + if (!datum.is_array()) { + return Status::Invalid("UDF result of non-array kind"); + } + std::shared_ptr array = datum.make_array(); + if (array->length() == 0) { + return IterationTraits>::End(); + } + ARROW_ASSIGN_OR_RAISE(auto batch, RecordBatch::FromStructArray(std::move(array))); + if (!schema->Equals(batch->schema())) { + return Status::Invalid("UDF result with shape not conforming to schema"); + } + return std::move(batch); + }; + return RecordBatchReader::MakeFromIterator(MakeFunctionIterator(std::move(next_func)), + schema); +} + +} // namespace py +} // namespace arrow diff --git a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h b/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h index cde97d9..682cbb2 100644 --- a/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h +++ b/src/vendored/apache-arrow-12.0.1/arrow/python/udf.h @@ -43,29 +43,34 @@ struct ARROW_PYTHON_EXPORT UdfOptions { std::shared_ptr output_type; }; -/// \brief A context passed as the first argument of scalar UDF functions. -struct ARROW_PYTHON_EXPORT ScalarUdfContext { +/// \brief A context passed as the first argument of UDF functions. 
+struct ARROW_PYTHON_EXPORT UdfContext {
   MemoryPool* pool;
   int64_t batch_length;
 };
 
 using UdfWrapperCallback = std::function<PyObject*(
-    PyObject* user_function, const ScalarUdfContext& context, PyObject* inputs)>;
+    PyObject* user_function, const UdfContext& context, PyObject* inputs)>;
 
 /// \brief register a Scalar user-defined-function from Python
 Status ARROW_PYTHON_EXPORT RegisterScalarFunction(
-    PyObject* user_function, UdfWrapperCallback wrapper,
-    const UdfOptions& options, compute::FunctionRegistry* registry = NULLPTR);
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
 
 /// \brief register a Table user-defined-function from Python
 Status ARROW_PYTHON_EXPORT RegisterTabularFunction(
-    PyObject* user_function, UdfWrapperCallback wrapper,
-    const UdfOptions& options, compute::FunctionRegistry* registry = NULLPTR);
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
+    compute::FunctionRegistry* registry = NULLPTR);
 
-Result<std::shared_ptr<RecordBatchReader>> ARROW_PYTHON_EXPORT CallTabularFunction(
-    const std::string& func_name, const std::vector<Datum>& args,
+/// \brief register an Aggregate user-defined-function from Python
+Status ARROW_PYTHON_EXPORT RegisterAggregateFunction(
+    PyObject* user_function, UdfWrapperCallback wrapper, const UdfOptions& options,
     compute::FunctionRegistry* registry = NULLPTR);
 
+Result<std::shared_ptr<RecordBatchReader>> ARROW_PYTHON_EXPORT
+CallTabularFunction(const std::string& func_name, const std::vector<Datum>& args,
+                    compute::FunctionRegistry* registry = NULLPTR);
+
 }  // namespace py
 }  // namespace arrow
diff --git a/vcpkg.json b/vcpkg.json
index 40d4fdf..eec37b5 100644
--- a/vcpkg.json
+++ b/vcpkg.json
@@ -4,4 +4,4 @@
   "dependencies": [
     "arrow"
   ]
-}
\ No newline at end of file
+}