From a43e7f2f0e134fd1fcc4cc3a28edbe9ebe3926b2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 04:46:06 +0000 Subject: [PATCH 01/48] Remove libarrow dependency from libcudf and migrate to tests only --- cpp/CMakeLists.txt | 27 +-------------------------- cpp/cmake/thirdparty/get_arrow.cmake | 6 ++---- cpp/tests/CMakeLists.txt | 19 ++++++++++++++----- 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6b8bb26825b..aa5e1e0467d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,11 +54,6 @@ mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF) mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED) -option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) -option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) -option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF) -option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF) -option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" OFF) option( CUDF_USE_PER_THREAD_DEFAULT_STREAM "Build cuDF with per-thread default stream, including passing the per-thread default @@ -81,8 +76,6 @@ option(CUDA_ENABLE_LINEINFO option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. 
The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON) if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS) @@ -100,8 +93,6 @@ message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}") message(VERBOSE "CUDF: Build cuDF shared libraries: ${BUILD_SHARED_LIBS}") message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_CACHE}") -message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}") -message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}") message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}") message( VERBOSE @@ -192,8 +183,6 @@ include(cmake/thirdparty/get_nvcomp.cmake) include(cmake/thirdparty/get_cccl.cmake) # find rmm include(cmake/thirdparty/get_rmm.cmake) -# find arrow -include(cmake/thirdparty/get_arrow.cmake) # find flatbuffers include(cmake/thirdparty/get_flatbuffers.cmake) # find dlpack @@ -806,7 +795,7 @@ add_dependencies(cudf jitify_preprocess_run) # Specify the target module library dependencies target_link_libraries( cudf - PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $ + PUBLIC CCCL::CCCL rmm::rmm $ PRIVATE $ cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio $ nanoarrow ) @@ -1055,20 +1044,6 @@ following IMPORTED GLOBAL targets: ]=] ) -if(CUDF_ENABLE_ARROW_PARQUET) - string( - APPEND - install_code_string - [=[ - if(NOT Parquet_DIR) - set(Parquet_DIR "${Arrow_DIR}") - endif() - set(ArrowDataset_DIR "${Arrow_DIR}") - find_dependency(ArrowDataset) - ]=] - ) -endif() - string( APPEND install_code_string diff --git a/cpp/cmake/thirdparty/get_arrow.cmake 
b/cpp/cmake/thirdparty/get_arrow.cmake index 0afdc526981..d126d3c61c2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -435,7 +435,5 @@ if(NOT DEFINED CUDF_VERSION_Arrow) ) endif() -find_and_configure_arrow( - ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW} -) +# TODO: Remove arguments we deem unnecessary going forward +find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} OFF OFF OFF OFF OFF) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ac77a362e1c..008082fd721 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -24,8 +24,8 @@ rapids_test_init() # properties and linking to build the test function(ConfigureTest CMAKE_TEST_NAME) set(options) - set(one_value GPUS PERCENT STREAM_MODE EXTRA_LIB) - set(multi_value) + set(one_value GPUS PERCENT STREAM_MODE) + set(multi_value EXTRA_LIBS) cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN}) if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT) set(_CUDF_TEST_GPUS 1) @@ -57,7 +57,7 @@ function(ConfigureTest CMAKE_TEST_NAME) target_link_libraries( ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main - nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIB}" + nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" ) rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) rapids_test_add( @@ -78,6 +78,14 @@ function(ConfigureTest CMAKE_TEST_NAME) endif() endfunction() +# ################################################################################################## +# dependencies ################################################################################### +# ################################################################################################## + +# find arrow. 
TODO: Always use static for tests set(CUDF_USE_ARROW_STATIC ON) +set(CUDF_USE_ARROW_STATIC OFF) +include(../cmake/thirdparty/get_arrow.cmake) + # ################################################################################################## # test sources ################################################################################## # ################################################################################################## @@ -197,7 +205,7 @@ ConfigureTest( QUANTILES_TEST quantiles/percentile_approx_test.cpp quantiles/quantile_test.cpp quantiles/quantiles_test.cpp GPUS 1 - PERCENT 70 + PERCENT 70 EXTRA_LIBS ${ARROW_LIBRARIES} ) # ################################################################################################## @@ -276,8 +284,9 @@ ConfigureTest( interop/from_arrow_host_test.cpp interop/from_arrow_stream_test.cpp interop/dlpack_test.cpp - EXTRA_LIB + EXTRA_LIBS nanoarrow + ${ARROW_LIBRARIES} ) # ################################################################################################## From d1d23dd1005fd73dfaef9309f1c5dbb1fcec416e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 04:50:04 +0000 Subject: [PATCH 02/48] Remove libarrow dependency from all Cython builds --- ci/build_wheel_cudf.sh | 2 - ci/build_wheel_pylibcudf.sh | 2 - python/cudf/CMakeLists.txt | 16 -------- python/cudf/cudf/_lib/CMakeLists.txt | 3 -- python/cudf/cudf/_lib/io/CMakeLists.txt | 2 - .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 2 - python/pylibcudf/CMakeLists.txt | 17 -------- .../cmake/Modules/LinkPyarrowHeaders.cmake | 40 ------------------- python/pylibcudf/pylibcudf/io/CMakeLists.txt | 5 --- .../pylibcudf/libcudf/io/CMakeLists.txt | 3 -- 10 files changed, 92 deletions(-) delete mode 100644 python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index 7c0fb1efebe..9c8b96b063b 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -5,8 +5,6 @@ set 
-euo pipefail package_dir="python/cudf" -export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" - # Download the pylibcudf built in the previous step RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 /tmp/pylibcudf_dist diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index b25d118ff81..1ea37ced24b 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -5,8 +5,6 @@ set -euo pipefail package_dir="python/pylibcudf" -export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON" - ./ci/build_wheel.sh ${package_dir} python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/* diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index e11d62b3bd5..d979dc576af 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -27,8 +27,6 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) # Find Python early so that later commands can use it find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) @@ -40,19 +38,6 @@ if(FIND_CUDF_CPP) include(rapids-find) rapids_cpm_init() - if(USE_LIBARROW_FROM_PYARROW) - # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow - # libraries. These variables have no effect because we are always searching for arrow via - # pyarrow, but they must be set as they are required arguments to the function in - # get_arrow.cmake. 
- set(CUDF_USE_ARROW_STATIC OFF) - set(CUDF_ENABLE_ARROW_S3 OFF) - set(CUDF_ENABLE_ARROW_ORC OFF) - set(CUDF_ENABLE_ARROW_PYTHON OFF) - set(CUDF_ENABLE_ARROW_PARQUET OFF) - include(../../cpp/cmake/thirdparty/get_arrow.cmake) - endif() - find_package(cudf "${RAPIDS_VERSION}" REQUIRED) # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack @@ -92,7 +77,6 @@ endif() rapids_cython_init() -include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index d6182673308..b418616b8ef 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -64,9 +64,6 @@ rapids_cython_create_modules( target_link_libraries(strings_udf PUBLIC cudf_strings_udf) -set(targets_using_arrow_headers avro csv orc json parquet) -link_to_pyarrow_headers("${targets_using_arrow_headers}") - include(${rapids-cmake-dir}/export/find_package_root.cmake) include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(interop PUBLIC nanoarrow) diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt index 620229a1275..e7408cf2852 100644 --- a/python/cudf/cudf/_lib/io/CMakeLists.txt +++ b/python/cudf/cudf/_lib/io/CMakeLists.txt @@ -19,5 +19,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf ) - -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt index 1b205537d73..4490c41c7a9 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -20,5 +20,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ) 
-include(../../../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake) -link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}") diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index 424d8372280..0d5383bba2e 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -27,9 +27,6 @@ project( option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files" OFF ) -option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF) -mark_as_advanced(USE_LIBARROW_FROM_PYARROW) - # Find Python early so that later commands can use it find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) @@ -40,19 +37,6 @@ if(FIND_CUDF_CPP) include(rapids-find) rapids_cpm_init() - if(USE_LIBARROW_FROM_PYARROW) - # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow - # libraries. These variables have no effect because we are always searching for arrow via - # pyarrow, but they must be set as they are required arguments to the function in - # get_arrow.cmake. 
- set(CUDF_USE_ARROW_STATIC OFF) - set(CUDF_ENABLE_ARROW_S3 OFF) - set(CUDF_ENABLE_ARROW_ORC OFF) - set(CUDF_ENABLE_ARROW_PYTHON OFF) - set(CUDF_ENABLE_ARROW_PARQUET OFF) - include(../../cpp/cmake/thirdparty/get_arrow.cmake) - endif() - find_package(cudf "${RAPIDS_VERSION}" REQUIRED) # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack @@ -92,7 +76,6 @@ endif() rapids_cython_init() -include(cmake/Modules/LinkPyarrowHeaders.cmake) add_subdirectory(pylibcudf) if(DEFINED cython_lib_dir) diff --git a/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake deleted file mode 100644 index d432f9fe1f5..00000000000 --- a/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake +++ /dev/null @@ -1,40 +0,0 @@ -# ============================================================================= -# Copyright (c) 2023, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= -include_guard(GLOBAL) - -find_package(Python REQUIRED COMPONENTS Development NumPy) - -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" - OUTPUT_VARIABLE PYARROW_INCLUDE_DIR - ERROR_VARIABLE PYARROW_ERROR - RESULT_VARIABLE PYARROW_RESULT - OUTPUT_STRIP_TRAILING_WHITESPACE -) - -if(${PYARROW_RESULT}) - message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") -endif() - -# Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts of -# cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the -# requirement for arrow headers infects all of cudf. These requirements will go away once all -# scalar-related Cython code is removed from cudf. -function(link_to_pyarrow_headers targets) - foreach(target IN LISTS targets) - # PyArrow headers require numpy headers. - target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") - target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") - endforeach() -endfunction() diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index 55bea4fc262..bcc2151f5b6 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -20,8 +20,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf ) - -set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource - pylibcudf_io_json pylibcudf_io_parquet pylibcudf_io_types -) -link_to_pyarrow_headers("${targets_using_arrow_headers}") diff --git a/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt index 6831063ecb9..9f5f74506e9 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt @@ -21,6 +21,3 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_ ) - -set(targets_using_arrow_headers cpp_io_json cpp_io_types) -link_to_pyarrow_headers("${targets_using_arrow_headers}") From 424993fe3b29bb1546fce28aa66b9302c9759121 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 05:21:56 +0000 Subject: [PATCH 03/48] Lots of cleanup and simplification of get_arrow.cmake --- cpp/cmake/thirdparty/get_arrow.cmake | 225 ++------------------------- 1 file changed, 9 insertions(+), 216 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index d126d3c61c2..67d78191357 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -22,81 +22,8 @@ include_guard(GLOBAL) -# Generate a FindArrow module for the case where we need to search for arrow within a pip install -# pyarrow. -function(find_libarrow_in_python_wheel PYARROW_VERSION) - string(REPLACE "." 
";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") - list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) - list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) - - # Ensure that the major and minor versions are two digits long - string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) - string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) - if(${PYARROW_MAJOR_LENGTH} EQUAL 1) - set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") - endif() - if(${PYARROW_MINOR_LENGTH} EQUAL 1) - set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") - endif() - - set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") - - string( - APPEND - initial_code_block - [=[ -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" - OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - COMMAND_ERROR_IS_FATAL ANY -) -list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") -]=] - ) - string( - APPEND - final_code_block - [=[ -list(POP_BACK CMAKE_PREFIX_PATH) -]=] - ) - rapids_find_generate_module( - Arrow NO_CONFIG - VERSION "${PYARROW_VERSION}" - LIBRARY_NAMES "${PYARROW_LIB}" - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block - FINAL_CODE_BLOCK final_code_block - ) - - find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) - add_library(arrow_shared ALIAS Arrow::Arrow) - - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) -endfunction() - # This function finds arrow and sets any additional necessary environment variables. 
-function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON - ENABLE_PARQUET PYARROW_LIBARROW -) - - if(PYARROW_LIBARROW) - # Generate a FindArrow.cmake to find pyarrow's libarrow.so - find_libarrow_in_python_wheel(${VERSION}) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) - return() - endif() +function(find_and_configure_arrow VERSION BUILD_STATIC) if(BUILD_STATIC) if(TARGET arrow_static) @@ -124,10 +51,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() - if(NOT ARROW_ARMV8_ARCH) - set(ARROW_ARMV8_ARCH "armv8-a") - endif() - if(NOT ARROW_SIMD_LEVEL) set(ARROW_SIMD_LEVEL "NONE") endif() @@ -135,41 +58,19 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(BUILD_STATIC) set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) + set(ARROW_DEPENDENCY_USE_SHARED ON) # Turn off CPM using `find_package` so we always download and make sure we get proper static # library. set(CPM_DOWNLOAD_Arrow TRUE) - # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given - # that shared linking is advised for critical components like SSL. If a static build is - # requested, we honor ARROW's default of static linking, but users may consider setting - # ARROW_OPENSSL_USE_SHARED even in static builds. else() set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) - # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given - # that shared linking is advised for critical components like SSL - set(ARROW_OPENSSL_USE_SHARED ON) - endif() - - set(ARROW_PYTHON_OPTIONS "") - if(ENABLE_PYTHON) - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
- list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - - set(ARROW_PARQUET_OPTIONS "") - if(ENABLE_PARQUET) - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. - list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") - list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + set(ARROW_DEPENDENCY_USE_SHARED ON) endif() rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static - parquet_static arrow_acero_static arrow_dataset_static + GLOBAL_TARGETS arrow_shared arrow_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -181,13 +82,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ENABLE_S3}" - "ARROW_ORC ${ENABLE_ORC}" - # e.g. needed by blazingsql-io - ${ARROW_PARQUET_OPTIONS} - "ARROW_PARQUET ${ENABLE_PARQUET}" - "ARROW_FILESYSTEM ON" - ${ARROW_PYTHON_OPTIONS} + "ARROW_S3 OFF" + "ARROW_ORC OFF" + "ARROW_PARQUET OFF" + "ARROW_FILESYSTEM OFF" + "ARROW_PYTHON OFF" # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -222,17 +121,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # variables from find_package that we might need. This is especially problematic when # rapids_cpm_find builds from source. find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)`. 
This will call find_package(ArrowAcero) for - # us - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) - endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to @@ -240,11 +128,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. # @@ -286,18 +169,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() ]=] ) - if(ENABLE_PARQUET) - string( - APPEND - arrow_code_string - " - find_package(Boost) - if (NOT TARGET Boost::headers) - add_library(Boost::headers INTERFACE IMPORTED) - endif() - " - ) - endif() if(NOT TARGET xsimd) string( APPEND @@ -333,93 +204,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB FINAL_CODE_BLOCK arrow_code_string ) - if(ENABLE_PARQUET) - - set(arrow_acero_code_string - [=[ - if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) - add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) - endif() - if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) - add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowAcero - VERSION ${VERSION} - EXPORT_SET arrow_acero_targets - GLOBAL_TARGETS arrow_acero_shared arrow_acero_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_acero_code_string - ) - - set(arrow_dataset_code_string - [=[ - if (TARGET cudf::arrow_dataset_shared AND (NOT 
TARGET arrow_dataset_shared)) - add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) - endif() - if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) - add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowDataset - VERSION ${VERSION} - EXPORT_SET arrow_dataset_targets - GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_dataset_code_string - ) - - set(parquet_code_string - [=[ - if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) - add_library(parquet_shared ALIAS cudf::parquet_shared) - endif() - if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) - add_library(parquet_static ALIAS cudf::parquet_static) - endif() - ]=] - ) - - rapids_export( - BUILD Parquet - VERSION ${VERSION} - EXPORT_SET parquet_targets - GLOBAL_TARGETS parquet_shared parquet_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK parquet_code_string - ) - endif() endif() # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) - if(ENABLE_PARQUET) - rapids_export_package(BUILD Parquet cudf-exports) - rapids_export_package(BUILD ArrowDataset cudf-exports) - endif() - include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports ) - rapids_export_find_package_root( - BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - rapids_export_find_package_root( - BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE From a1bf94fbedfc95d001342f8c896990338f9b8e69 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 05:24:44 +0000 
Subject: [PATCH 04/48] Move module to tests --- cpp/tests/CMakeLists.txt | 2 +- cpp/{ => tests}/cmake/thirdparty/get_arrow.cmake | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/{ => tests}/cmake/thirdparty/get_arrow.cmake (100%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 008082fd721..4c87cf2f591 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -84,7 +84,7 @@ endfunction() # find arrow. TODO: Always use static for tests set(CUDF_USE_ARROW_STATIC ON) set(CUDF_USE_ARROW_STATIC OFF) -include(../cmake/thirdparty/get_arrow.cmake) +include(cmake/thirdparty/get_arrow.cmake) # ################################################################################################## # test sources ################################################################################## diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake similarity index 100% rename from cpp/cmake/thirdparty/get_arrow.cmake rename to cpp/tests/cmake/thirdparty/get_arrow.cmake From 36a86b505819aea3bbf5dbf4051aeff190b9d5f8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 05:24:57 +0000 Subject: [PATCH 05/48] Remove now unnecessary args --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 67d78191357..f508b15d8e4 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -228,5 +228,4 @@ if(NOT DEFINED CUDF_VERSION_Arrow) ) endif() -# TODO: Remove arguments we deem unnecessary going forward -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} OFF OFF OFF OFF OFF) +find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) From aff2a6685d70fe3d9cc3de8476f219242c167e03 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 
05:30:46 +0000 Subject: [PATCH 06/48] Remove one unnecessary branch --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 12 +++--------- python/pylibcudf/pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index f508b15d8e4..b85243855f0 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -115,14 +115,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) set(ARROW_LIBRARIES arrow_shared) endif() - # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. - if(Arrow_DIR) - # This extra find_package is necessary because rapids_cpm_find does not propagate all the - # variables from find_package that we might need. This is especially problematic when - # rapids_cpm_find builds from source. - find_package(Arrow REQUIRED QUIET) - # Arrow_ADDED: set if CPM downloaded Arrow from Github - elseif(Arrow_ADDED) + # Arrow_ADDED: set if CPM downloaded Arrow from Github + if(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to # target_include_directories. That defeats ccache. 
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" @@ -143,7 +137,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) "$" ) endforeach() - else() + elseif(NOT Arrow_DIR) set(ARROW_FOUND FALSE PARENT_SCOPE diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index b037508d03f..6e059be89dd 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=12.0,<13.0a0", "nvtx>=0.2.1", "packaging", "pyarrow>=16.1.0,<16.2.0a0", From 37d2793693607238ec501421d21264313e1010a5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 05:36:21 +0000 Subject: [PATCH 07/48] Remove some unnecessary specializations for some targets --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index b85243855f0..a96d9c4c520 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -155,26 +155,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) add_library(arrow_static ALIAS cudf::arrow_static) endif() - if (NOT TARGET arrow::flatbuffers) - add_library(arrow::flatbuffers INTERFACE IMPORTED) - endif() - if (NOT TARGET arrow::hadoop) - add_library(arrow::hadoop INTERFACE IMPORTED) - endif() ]=] ) - if(NOT TARGET xsimd) - string( - APPEND - arrow_code_string - " - if(NOT TARGET arrow::xsimd) - add_library(arrow::xsimd INTERFACE IMPORTED) - target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") - endif() - " - ) - endif() rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) get_target_property(interface_libs arrow_static 
INTERFACE_LINK_LIBRARIES) From 07547b029529d66a5fb741dae6ebc98aa605675e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 05:58:59 +0000 Subject: [PATCH 08/48] Switch to static arrow --- cpp/tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 4c87cf2f591..dade9dd654a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -82,8 +82,8 @@ endfunction() # dependencies ################################################################################### # ################################################################################################## -# find arrow. TODO: Always use static for tests set(CUDF_USE_ARROW_STATIC ON) -set(CUDF_USE_ARROW_STATIC OFF) +# find arrow. Always use static for tests +set(CUDF_USE_ARROW_STATIC ON) include(cmake/thirdparty/get_arrow.cmake) # ################################################################################################## From 14391eaf6794268bffc2110e03b11a2d6d432614 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:05:25 +0000 Subject: [PATCH 09/48] Clean out some unnecessary bits --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index a96d9c4c520..f76fc912fc4 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -122,21 +122,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. 
Don't forget to update - # this list if we add more! - # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() elseif(NOT Arrow_DIR) set(ARROW_FOUND FALSE @@ -157,7 +142,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() ]=] ) - rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with From 2d544a1028f5bb872a4d6baa9a584da4d8f8b25b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:15:41 +0000 Subject: [PATCH 10/48] Stop exporting since now it's a private dependency --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index f76fc912fc4..507de1880fa 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -155,24 +155,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() - rapids_export( - BUILD Arrow - VERSION ${VERSION} - EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_code_string - ) - endif() - # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) - - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE From d865616681d7ea869a620e2bc10264d884436fcf Mon Sep 17 00:00:00 2001 From: Vyas 
Ramasubramani Date: Tue, 20 Aug 2024 06:24:30 +0000 Subject: [PATCH 11/48] Some variable simplification --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 507de1880fa..9c7ae0f9f5d 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -58,14 +58,16 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) if(BUILD_STATIC) set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) - set(ARROW_DEPENDENCY_USE_SHARED ON) + set(ARROW_DEPENDENCY_USE_SHARED OFF) + set(ARROW_LIBRARIES arrow_static) # Turn off CPM using `find_package` so we always download and make sure we get proper static # library. set(CPM_DOWNLOAD_Arrow TRUE) else() - set(ARROW_BUILD_SHARED ON) set(ARROW_BUILD_STATIC OFF) + set(ARROW_BUILD_SHARED ON) set(ARROW_DEPENDENCY_USE_SHARED ON) + set(ARROW_LIBRARIES arrow_shared) endif() rapids_cpm_find( @@ -109,12 +111,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) PARENT_SCOPE ) - if(BUILD_STATIC) - set(ARROW_LIBRARIES arrow_static) - else() - set(ARROW_LIBRARIES arrow_shared) - endif() - # Arrow_ADDED: set if CPM downloaded Arrow from Github if(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to From 3fe333da3ef57b477b1fa4db29539df612ee932a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:25:52 +0000 Subject: [PATCH 12/48] Set Arrow_FOUND at the end --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 9c7ae0f9f5d..d73f7225d20 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -106,11 +106,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) 
"xsimd_SOURCE AUTO" ) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github if(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to @@ -152,10 +147,16 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() endif() endif() + set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) From 5040b5d6425bee589c848864103f068d908e4ef2 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:26:09 +0000 Subject: [PATCH 13/48] Remove now unused code string --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index d73f7225d20..f0d37f4922a 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -122,17 +122,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() if(Arrow_ADDED) - - set(arrow_code_string - [=[ - if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) - add_library(arrow_shared ALIAS cudf::arrow_shared) - endif() - if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) - add_library(arrow_static ALIAS cudf::arrow_static) - endif() - ]=] - ) if(TARGET arrow_static) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with From e84e26b308c4a551f2aff209ecd0f848e961dfb8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:30:52 +0000 Subject: [PATCH 14/48] Combine some conditions --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 26 ++++++++++------------ 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 
f0d37f4922a..031cbbd6085 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -37,6 +37,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) ) return() endif() + + set(ARROW_BUILD_STATIC ON) + set(ARROW_BUILD_SHARED OFF) + set(ARROW_DEPENDENCY_USE_SHARED OFF) + set(ARROW_LIBRARIES arrow_static) + # Turn off CPM using `find_package` so we always download and make sure we get proper static + # library. + set(CPM_DOWNLOAD_Arrow TRUE) else() if(TARGET arrow_shared) set(ARROW_FOUND @@ -49,27 +57,17 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) ) return() endif() - endif() - - if(NOT ARROW_SIMD_LEVEL) - set(ARROW_SIMD_LEVEL "NONE") - endif() - if(BUILD_STATIC) - set(ARROW_BUILD_STATIC ON) - set(ARROW_BUILD_SHARED OFF) - set(ARROW_DEPENDENCY_USE_SHARED OFF) - set(ARROW_LIBRARIES arrow_static) - # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library. - set(CPM_DOWNLOAD_Arrow TRUE) - else() set(ARROW_BUILD_STATIC OFF) set(ARROW_BUILD_SHARED ON) set(ARROW_DEPENDENCY_USE_SHARED ON) set(ARROW_LIBRARIES arrow_shared) endif() + if(NOT ARROW_SIMD_LEVEL) + set(ARROW_SIMD_LEVEL "NONE") + endif() + rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared arrow_static From 9222d152b1f411bde5f35e646f67af34294d66c5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:31:05 +0000 Subject: [PATCH 15/48] Reenable linting rules --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 8 -------- python/cudf/pyproject.toml | 4 ++-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 031cbbd6085..95ca4651e81 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -12,14 +12,6 @@ # the License. 
# ============================================================================= -# Finding arrow is far more complex than it should be, and as a result we violate multiple linting -# rules aiming to limit complexity. Since all our other CMake scripts conform to expectations -# without undue difficulty, disabling those rules for just this function is our best approach for -# now. The spacing between this comment, the cmake-lint directives, and the function docstring is -# necessary to prevent cmake-format from trying to combine the lines. - -# cmake-lint: disable=R0912,R0913,R0915 - include_guard(GLOBAL) # This function finds arrow and sets any additional necessary environment variables. diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 9db52164eca..ae92cf111b6 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,8 +20,8 @@ requires-python = ">=3.9" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0", - "cupy-cuda11x>=12.0.0", + "cuda-python>=12.0,<13.0a0", + "cupy-cuda12x>=12.0.0", "fsspec>=0.6.0", "numba>=0.57", "numpy>=1.23,<2.0a0", From c129ec7a910f75f289966e2a2a52ffb1e07417ce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:35:54 +0000 Subject: [PATCH 16/48] Remove libarrow from cpp build reqs --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ---- conda/environments/all_cuda-125_arch-x86_64.yaml | 4 ---- conda/recipes/libcudf/conda_build_config.yaml | 3 --- conda/recipes/libcudf/meta.yaml | 2 -- dependencies.yaml | 11 ----------- 5 files changed, 24 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 018162bd848..b607aaab394 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -37,15 +37,11 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==16.1.0.* -- libarrow-dataset==16.1.0.* 
-- libarrow==16.1.0.* - libcufile-dev=1.4.0.31 - libcufile=1.4.0.31 - libcurand-dev=10.3.0.86 - libcurand=10.3.0.86 - libkvikio==24.10.*,>=0.0.0a0 -- libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.10.*,>=0.0.0a0 - make diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index c60ffa7aaa5..89cd95779f6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -38,13 +38,9 @@ dependencies: - hypothesis - identify>=2.5.20 - ipython -- libarrow-acero==16.1.0.* -- libarrow-dataset==16.1.0.* -- libarrow==16.1.0.* - libcufile-dev - libcurand-dev - libkvikio==24.10.*,>=0.0.0a0 -- libparquet==16.1.0.* - librdkafka>=1.9.0,<1.10.0a0 - librmm==24.10.*,>=0.0.0a0 - make diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index ff7458caf82..4b1c4cca828 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -19,9 +19,6 @@ c_stdlib_version: cmake_version: - ">=3.26.4,!=3.30.0" -libarrow_version: - - "==16.1.0" - dlpack_version: - ">=0.8,<1.0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index aa1c94a4bca..1c2e9e8dd98 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -64,7 +64,6 @@ requirements: {% endif %} - cuda-version ={{ cuda_version }} - nvcomp {{ nvcomp_version }} - - libarrow {{ libarrow_version }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} - fmt {{ fmt_version }} @@ -92,7 +91,6 @@ outputs: - cmake {{ cmake_version }} host: - cuda-version ={{ cuda_version }} - - libarrow {{ libarrow_version }} run: - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} {% if cuda_major == "11" %} diff --git a/dependencies.yaml b/dependencies.yaml index 150d03be021..a3ac06ef6db 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -16,7 
+16,6 @@ files: - depends_on_rmm - develop - docs - - libarrow_build - notebooks - py_version - rapids_build_skbuild @@ -366,16 +365,6 @@ dependencies: # Sync with conda build constraint & wheel run constraint. # TODO: Change to `2.0.*` for NumPy 2 - numpy==1.23.* - libarrow_build: - common: - - output_types: conda - packages: - # Hard pin the Arrow patch version used during the build. This must - # be kept in sync with the version pinned in get_arrow.cmake. - - libarrow-acero==16.1.0.* - - libarrow-dataset==16.1.0.* - - libarrow==16.1.0.* - - libparquet==16.1.0.* libarrow_run: common: - output_types: conda From 282da762de3763483d2e62a01167f215fe63fa2b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:38:41 +0000 Subject: [PATCH 17/48] Remove pyarrow from Python build deps --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 3 +-- conda/recipes/pylibcudf/meta.yaml | 3 +-- dependencies.yaml | 4 +--- python/cudf/pyproject.toml | 5 ++--- python/cudf_kafka/pyproject.toml | 1 - python/pylibcudf/pyproject.toml | 3 +-- 8 files changed, 8 insertions(+), 15 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index b607aaab394..fb16be390bb 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandoc - pre-commit - ptxcompiler -- pyarrow==16.1.0.* +- pyarrow>=16.1.0,<16.2.0a0 - pydata-sphinx-theme!=0.14.2 - pytest-benchmark - pytest-cases>=3.8.2 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 89cd95779f6..88ba3168984 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -61,7 +61,7 @@ dependencies: - pandas>=2.0,<2.2.3dev0 - pandoc - pre-commit -- 
pyarrow==16.1.0.* +- pyarrow>=16.1.0,<16.2.0a0 - pydata-sphinx-theme!=0.14.2 - pynvjitlink>=0.0.0a0 - pytest-benchmark diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 7e86147732e..dbf64ec3227 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -66,7 +66,6 @@ requirements: - dlpack >=0.8,<1.0 # TODO: Change to `2.0` for NumPy 2 - numpy 1.23 - - pyarrow ==16.1.0.* - libcudf ={{ version }} - pylibcudf ={{ version }} - rmm ={{ minor_version }} @@ -86,7 +85,7 @@ requirements: - numba >=0.57 # TODO: Update `numpy` in `host` when dropping `<2.0a0` - numpy >=1.23,<2.0a0 - - {{ pin_compatible('pyarrow', max_pin='x.x') }} + - pyarrow ==16.1.0.* - libcudf ={{ version }} - pylibcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index f405fd10f5d..19e4646662f 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -66,7 +66,6 @@ requirements: - dlpack >=0.8,<1.0 # TODO: Change to `2.0` for NumPy 2 - numpy 1.23 - - pyarrow ==16.1.0.* - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -83,7 +82,7 @@ requirements: - pandas >=2.0,<2.2.3dev0 # TODO: Update `numpy` in `host` when dropping `<2.0a0` - numpy >=1.23,<2.0a0 - - {{ pin_compatible('pyarrow', max_pin='x.x') }} + - pyarrow ==16.1.0.* - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} diff --git a/dependencies.yaml b/dependencies.yaml index a3ac06ef6db..5214c158709 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -10,6 +10,7 @@ files: - build_all - build_cpp - build_python_common + - pyarrow_run - cuda - cuda_version - depends_on_cupy @@ -356,9 +357,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.3 - # Hard pin the patch version used during the build. 
This must be kept - # in sync with the version pinned in get_arrow.cmake. - - pyarrow==16.1.0.* - output_types: pyproject packages: # Hard pin the patch version used during the build. diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index ae92cf111b6..6270cab9461 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,8 +20,8 @@ requires-python = ">=3.9" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=12.0,<13.0a0", - "cupy-cuda12x>=12.0.0", + "cuda-python>=11.7.1,<12.0a0", + "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "numba>=0.57", "numpy>=1.23,<2.0a0", @@ -128,7 +128,6 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==16.1.0.*", "pylibcudf==24.10.*,>=0.0.0a0", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 63c5b07c5f3..3c695c39e54 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -107,5 +107,4 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==16.1.0.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 6e059be89dd..d7b19fcb790 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.9" dependencies = [ - "cuda-python>=12.0,<13.0a0", + "cuda-python>=11.7.1,<12.0a0", "nvtx>=0.2.1", "packaging", "pyarrow>=16.1.0,<16.2.0a0", @@ -103,7 +103,6 @@ requires = [ "cython>=3.0.3", "ninja", "numpy==1.23.*", - "pyarrow==16.1.0.*", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 614992c3ff5b63d1772d9fe4f9963f02ba8ea759 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 20 Aug 2024 06:41:42 +0000 Subject: [PATCH 18/48] Drop numpy as a build dependency --- conda/recipes/cudf/meta.yaml | 3 --- conda/recipes/pylibcudf/meta.yaml | 3 --- dependencies.yaml | 7 ------- python/cudf/pyproject.toml | 1 - python/cudf_kafka/pyproject.toml | 1 - python/pylibcudf/pyproject.toml | 1 - 6 files changed, 16 deletions(-) diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index dbf64ec3227..af1817a30dc 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -64,8 +64,6 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - # TODO: Change to `2.0` for NumPy 2 - - numpy 1.23 - libcudf ={{ version }} - pylibcudf ={{ version }} - rmm ={{ minor_version }} @@ -83,7 +81,6 @@ requirements: - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - numba >=0.57 - # TODO: Update `numpy` in `host` when dropping `<2.0a0` - numpy >=1.23,<2.0a0 - pyarrow ==16.1.0.* - libcudf ={{ version }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 19e4646662f..732e3c0b52c 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -64,8 +64,6 @@ requirements: - rapids-build-backend >=0.3.0,<0.4.0.dev0 - scikit-build-core >=0.10.0 - dlpack >=0.8,<1.0 - # TODO: Change to `2.0` for NumPy 2 - - numpy 1.23 - libcudf ={{ version }} - rmm ={{ minor_version }} {% if cuda_major == "11" %} @@ -80,7 +78,6 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.3dev0 - # TODO: Update `numpy` in `host` when dropping `<2.0a0` - numpy >=1.23,<2.0a0 - pyarrow ==16.1.0.* - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index 5214c158709..71fb63071e1 100644 --- a/dependencies.yaml 
+++ b/dependencies.yaml @@ -357,12 +357,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.3 - - output_types: pyproject - packages: - # Hard pin the patch version used during the build. - # Sync with conda build constraint & wheel run constraint. - # TODO: Change to `2.0.*` for NumPy 2 - - numpy==1.23.* libarrow_run: common: - output_types: conda @@ -555,7 +549,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0` - numpy>=1.23,<2.0a0 - pandas>=2.0,<2.2.3dev0 run_pylibcudf: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 6270cab9461..2a29d1d5bf5 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -127,7 +127,6 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", - "numpy==1.23.*", "pylibcudf==24.10.*,>=0.0.0a0", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 3c695c39e54..a1a517d6157 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -106,5 +106,4 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", - "numpy==1.23.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index d7b19fcb790..71996771cc8 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -102,7 +102,6 @@ requires = [ "cmake>=3.26.4,!=3.30.0", "cython>=3.0.3", "ninja", - "numpy==1.23.*", "rmm==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. 
To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 987fdf0cc2a960b66acdeeb85e303e7495bc8666 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 21 Aug 2024 17:41:05 +0000 Subject: [PATCH 19/48] Add get_arrow.cmake to JNI build --- java/src/main/native/CMakeLists.txt | 4 + .../native/cmake/thirdparty/get_arrow.cmake | 150 ++++++++++++++++++ 2 files changed, 154 insertions(+) create mode 100644 java/src/main/native/cmake/thirdparty/get_arrow.cmake diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 22059c5bc7f..461582cc33d 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -212,6 +212,10 @@ target_compile_definitions( ) target_link_options(cudfjni PRIVATE "-Wl,--no-undefined") +set(CUDF_USE_ARROW_STATIC ON) +include(cmake/thirdparty/get_arrow.cmake) +target_link_libraries(cudfjni PRIVATE ${ARROW_LIBRARIES}) + if(USE_GDS) add_library(cufilejni src/CuFileJni.cpp) set_target_properties( diff --git a/java/src/main/native/cmake/thirdparty/get_arrow.cmake b/java/src/main/native/cmake/thirdparty/get_arrow.cmake new file mode 100644 index 00000000000..95ca4651e81 --- /dev/null +++ b/java/src/main/native/cmake/thirdparty/get_arrow.cmake @@ -0,0 +1,150 @@ +# ============================================================================= +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. 
+# ============================================================================= + +include_guard(GLOBAL) + +# This function finds arrow and sets any additional necessary environment variables. +function(find_and_configure_arrow VERSION BUILD_STATIC) + + if(BUILD_STATIC) + if(TARGET arrow_static) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_static + PARENT_SCOPE + ) + return() + endif() + + set(ARROW_BUILD_STATIC ON) + set(ARROW_BUILD_SHARED OFF) + set(ARROW_DEPENDENCY_USE_SHARED OFF) + set(ARROW_LIBRARIES arrow_static) + # Turn off CPM using `find_package` so we always download and make sure we get proper static + # library. + set(CPM_DOWNLOAD_Arrow TRUE) + else() + if(TARGET arrow_shared) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() + + set(ARROW_BUILD_STATIC OFF) + set(ARROW_BUILD_SHARED ON) + set(ARROW_DEPENDENCY_USE_SHARED ON) + set(ARROW_LIBRARIES arrow_shared) + endif() + + if(NOT ARROW_SIMD_LEVEL) + set(ARROW_SIMD_LEVEL "NONE") + endif() + + rapids_cpm_find( + Arrow ${VERSION} + GLOBAL_TARGETS arrow_shared arrow_static + CPM_ARGS + GIT_REPOSITORY https://github.com/apache/arrow.git + GIT_TAG apache-arrow-${VERSION} + GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" + "ARROW_ACERO ON" + "ARROW_IPC ON" + "ARROW_DATASET ON" + "ARROW_WITH_BACKTRACE ON" + "ARROW_CXXFLAGS -w" + "ARROW_JEMALLOC OFF" + "ARROW_S3 OFF" + "ARROW_ORC OFF" + "ARROW_PARQUET OFF" + "ARROW_FILESYSTEM OFF" + "ARROW_PYTHON OFF" + # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off + "ARROW_USE_CCACHE OFF" + "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" + "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" + "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" + "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_POSITION_INDEPENDENT_CODE ON" + "ARROW_DEPENDENCY_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_BOOST_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_BROTLI_USE_SHARED 
${ARROW_BUILD_SHARED}" + "ARROW_GFLAGS_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_GRPC_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_PROTOBUF_USE_SHARED ${ARROW_BUILD_SHARED}" + "ARROW_ZSTD_USE_SHARED ${ARROW_BUILD_SHARED}" + "xsimd_SOURCE AUTO" + ) + + # Arrow_ADDED: set if CPM downloaded Arrow from Github + if(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + elseif(NOT Arrow_DIR) + set(ARROW_FOUND + FALSE + PARENT_SCOPE + ) + message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") + endif() + + if(Arrow_ADDED) + if(TARGET arrow_static) + get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) + # The `arrow_static` library is leaking a dependency on the object libraries it was built with + # we need to remove this from the interface, since keeping them around would cause duplicate + # symbols and CMake export errors + if(interface_libs MATCHES "arrow_array" AND interface_libs MATCHES "arrow_compute") + string(REPLACE "BUILD_INTERFACE:" "BUILD_LOCAL_INTERFACE:" interface_libs + "${interface_libs}" + ) + set_target_properties(arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES "${interface_libs}") + get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) + endif() + endif() + endif() + + set(ARROW_LIBRARIES + "${ARROW_LIBRARIES}" + PARENT_SCOPE + ) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + +endfunction() + +if(NOT DEFINED CUDF_VERSION_Arrow) + set(CUDF_VERSION_Arrow + # This version must be kept in sync with the libarrow version pinned for builds in + # dependencies.yaml. 
+ 16.1.0 + CACHE STRING "The version of Arrow to find (or build)" + ) +endif() + +find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) From c8dce44602ae49108bef2bae1b21a79735e96e4c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 00:11:51 +0000 Subject: [PATCH 20/48] Stop installing arrow into wheels --- python/cudf/CMakeLists.txt | 4 ++-- python/pylibcudf/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index d979dc576af..3344c01e475 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -70,8 +70,8 @@ if(NOT cudf_FOUND) # practice right this would only be a problem is if libcudf was not found but some of the # dependencies were, and we have no real use cases where that happens. install_aliased_imported_targets( - TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp - DESTINATION ${cython_lib_dir} + TARGETS cudf nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION + ${cython_lib_dir} ) endif() diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index 0d5383bba2e..5e7d6f21d29 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -69,8 +69,8 @@ if(NOT cudf_FOUND) # practice right this would only be a problem is if libcudf was not found but some of the # dependencies were, and we have no real use cases where that happens. 
install_aliased_imported_targets( - TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp - DESTINATION ${cython_lib_dir} + TARGETS cudf nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION + ${cython_lib_dir} ) endif() From 6eb7e99d9b92e631c1a1f4e7e077e4d3562e5613 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 04:57:20 +0000 Subject: [PATCH 21/48] Revert changes to get_arrow.cmake --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 347 ++++++++++++++++-- .../native/cmake/thirdparty/get_arrow.cmake | 347 ++++++++++++++++-- 2 files changed, 638 insertions(+), 56 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 95ca4651e81..0afdc526981 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -12,10 +12,91 @@ # the License. # ============================================================================= +# Finding arrow is far more complex than it should be, and as a result we violate multiple linting +# rules aiming to limit complexity. Since all our other CMake scripts conform to expectations +# without undue difficulty, disabling those rules for just this function is our best approach for +# now. The spacing between this comment, the cmake-lint directives, and the function docstring is +# necessary to prevent cmake-format from trying to combine the lines. + +# cmake-lint: disable=R0912,R0913,R0915 + include_guard(GLOBAL) +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# pyarrow. +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." 
";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") + list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) + list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) + + # Ensure that the major and minor versions are two digits long + string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) + string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) + if(${PYARROW_MAJOR_LENGTH} EQUAL 1) + set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") + endif() + if(${PYARROW_MINOR_LENGTH} EQUAL 1) + set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") + endif() + + set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") + + string( + APPEND + initial_code_block + [=[ +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) +list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") +]=] + ) + string( + APPEND + final_code_block + [=[ +list(POP_BACK CMAKE_PREFIX_PATH) +]=] + ) + rapids_find_generate_module( + Arrow NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES "${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block + FINAL_CODE_BLOCK final_code_block + ) + + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) + + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. 
-function(find_and_configure_arrow VERSION BUILD_STATIC) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON + ENABLE_PARQUET PYARROW_LIBARROW +) + + if(PYARROW_LIBARROW) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() if(BUILD_STATIC) if(TARGET arrow_static) @@ -29,14 +110,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) ) return() endif() - - set(ARROW_BUILD_STATIC ON) - set(ARROW_BUILD_SHARED OFF) - set(ARROW_DEPENDENCY_USE_SHARED OFF) - set(ARROW_LIBRARIES arrow_static) - # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library. - set(CPM_DOWNLOAD_Arrow TRUE) else() if(TARGET arrow_shared) set(ARROW_FOUND @@ -49,20 +122,54 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) ) return() endif() + endif() - set(ARROW_BUILD_STATIC OFF) - set(ARROW_BUILD_SHARED ON) - set(ARROW_DEPENDENCY_USE_SHARED ON) - set(ARROW_LIBRARIES arrow_shared) + if(NOT ARROW_ARMV8_ARCH) + set(ARROW_ARMV8_ARCH "armv8-a") endif() if(NOT ARROW_SIMD_LEVEL) set(ARROW_SIMD_LEVEL "NONE") endif() + if(BUILD_STATIC) + set(ARROW_BUILD_STATIC ON) + set(ARROW_BUILD_SHARED OFF) + # Turn off CPM using `find_package` so we always download and make sure we get proper static + # library. + set(CPM_DOWNLOAD_Arrow TRUE) + # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given + # that shared linking is advised for critical components like SSL. If a static build is + # requested, we honor ARROW's default of static linking, but users may consider setting + # ARROW_OPENSSL_USE_SHARED even in static builds. 
+ else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) + # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given + # that shared linking is advised for critical components like SSL + set(ARROW_OPENSSL_USE_SHARED ON) + endif() + + set(ARROW_PYTHON_OPTIONS "") + if(ENABLE_PYTHON) + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + + set(ARROW_PARQUET_OPTIONS "") + if(ENABLE_PARQUET) + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared arrow_static + GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static + parquet_static arrow_acero_static arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -74,11 +181,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 OFF" - "ARROW_ORC OFF" - "ARROW_PARQUET OFF" - "ARROW_FILESYSTEM OFF" - "ARROW_PYTHON OFF" + "ARROW_S3 ${ENABLE_S3}" + "ARROW_ORC ${ENABLE_ORC}" + # e.g. 
needed by blazingsql-io + ${ARROW_PARQUET_OPTIONS} + "ARROW_PARQUET ${ENABLE_PARQUET}" + "ARROW_FILESYSTEM ON" + ${ARROW_PYTHON_OPTIONS} # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -96,14 +205,62 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) "xsimd_SOURCE AUTO" ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github - if(Arrow_ADDED) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() + + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. + if(Arrow_DIR) + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") + endif() + # Set this to enable `find_package(ArrowDataset)`. This will call find_package(ArrowAcero) for + # us + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to # target_include_directories. That defeats ccache. 
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - elseif(NOT Arrow_DIR) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" + ) + endif() + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! + # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() + else() set(ARROW_FOUND FALSE PARENT_SCOPE @@ -112,6 +269,48 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() if(Arrow_ADDED) + + set(arrow_code_string + [=[ + if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() + if (NOT TARGET arrow::flatbuffers) + add_library(arrow::flatbuffers INTERFACE IMPORTED) + endif() + if (NOT TARGET arrow::hadoop) + add_library(arrow::hadoop INTERFACE IMPORTED) + endif() + ]=] + ) + if(ENABLE_PARQUET) + string( + APPEND + arrow_code_string + " + find_package(Boost) + if (NOT TARGET Boost::headers) + add_library(Boost::headers INTERFACE IMPORTED) + endif() + " + ) + endif() + if(NOT TARGET xsimd) + string( + APPEND + arrow_code_string + " + if(NOT TARGET arrow::xsimd) + add_library(arrow::xsimd INTERFACE IMPORTED) + target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") + endif() + " + ) + endif() + rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) get_target_property(interface_libs arrow_static 
INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with @@ -125,17 +324,106 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() + rapids_export( + BUILD Arrow + VERSION ${VERSION} + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_code_string + ) + + if(ENABLE_PARQUET) + + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) + + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + 
VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() + endif() + # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) endif() + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) @@ -147,4 +435,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) ) endif() -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) +find_and_configure_arrow( + ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} + ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW} +) diff --git a/java/src/main/native/cmake/thirdparty/get_arrow.cmake b/java/src/main/native/cmake/thirdparty/get_arrow.cmake index 95ca4651e81..0afdc526981 100644 --- a/java/src/main/native/cmake/thirdparty/get_arrow.cmake +++ b/java/src/main/native/cmake/thirdparty/get_arrow.cmake @@ -12,10 +12,91 @@ # the License. 
# ============================================================================= +# Finding arrow is far more complex than it should be, and as a result we violate multiple linting +# rules aiming to limit complexity. Since all our other CMake scripts conform to expectations +# without undue difficulty, disabling those rules for just this function is our best approach for +# now. The spacing between this comment, the cmake-lint directives, and the function docstring is +# necessary to prevent cmake-format from trying to combine the lines. + +# cmake-lint: disable=R0912,R0913,R0915 + include_guard(GLOBAL) +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# pyarrow. +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") + list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) + list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) + + # Ensure that the major and minor versions are two digits long + string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) + string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) + if(${PYARROW_MAJOR_LENGTH} EQUAL 1) + set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") + endif() + if(${PYARROW_MINOR_LENGTH} EQUAL 1) + set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") + endif() + + set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") + + string( + APPEND + initial_code_block + [=[ +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) +list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") +]=] + ) + string( + APPEND + final_code_block + [=[ +list(POP_BACK CMAKE_PREFIX_PATH) +]=] + ) + rapids_find_generate_module( + Arrow NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES 
"${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block + FINAL_CODE_BLOCK final_code_block + ) + + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) + + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC) +function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON + ENABLE_PARQUET PYARROW_LIBARROW +) + + if(PYARROW_LIBARROW) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() if(BUILD_STATIC) if(TARGET arrow_static) @@ -29,14 +110,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) ) return() endif() - - set(ARROW_BUILD_STATIC ON) - set(ARROW_BUILD_SHARED OFF) - set(ARROW_DEPENDENCY_USE_SHARED OFF) - set(ARROW_LIBRARIES arrow_static) - # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library. - set(CPM_DOWNLOAD_Arrow TRUE) else() if(TARGET arrow_shared) set(ARROW_FOUND @@ -49,20 +122,54 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) ) return() endif() + endif() - set(ARROW_BUILD_STATIC OFF) - set(ARROW_BUILD_SHARED ON) - set(ARROW_DEPENDENCY_USE_SHARED ON) - set(ARROW_LIBRARIES arrow_shared) + if(NOT ARROW_ARMV8_ARCH) + set(ARROW_ARMV8_ARCH "armv8-a") endif() if(NOT ARROW_SIMD_LEVEL) set(ARROW_SIMD_LEVEL "NONE") endif() + if(BUILD_STATIC) + set(ARROW_BUILD_STATIC ON) + set(ARROW_BUILD_SHARED OFF) + # Turn off CPM using `find_package` so we always download and make sure we get proper static + # library. 
+ set(CPM_DOWNLOAD_Arrow TRUE) + # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given + # that shared linking is advised for critical components like SSL. If a static build is + # requested, we honor ARROW's default of static linking, but users may consider setting + # ARROW_OPENSSL_USE_SHARED even in static builds. + else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) + # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given + # that shared linking is advised for critical components like SSL + set(ARROW_OPENSSL_USE_SHARED ON) + endif() + + set(ARROW_PYTHON_OPTIONS "") + if(ENABLE_PYTHON) + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + + set(ARROW_PARQUET_OPTIONS "") + if(ENABLE_PARQUET) + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. + list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared arrow_static + GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static + parquet_static arrow_acero_static arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -74,11 +181,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 OFF" - "ARROW_ORC OFF" - "ARROW_PARQUET OFF" - "ARROW_FILESYSTEM OFF" - "ARROW_PYTHON OFF" + "ARROW_S3 ${ENABLE_S3}" + "ARROW_ORC ${ENABLE_ORC}" + # e.g. 
needed by blazingsql-io + ${ARROW_PARQUET_OPTIONS} + "ARROW_PARQUET ${ENABLE_PARQUET}" + "ARROW_FILESYSTEM ON" + ${ARROW_PYTHON_OPTIONS} # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -96,14 +205,62 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) "xsimd_SOURCE AUTO" ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github - if(Arrow_ADDED) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() + + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. + if(Arrow_DIR) + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") + endif() + # Set this to enable `find_package(ArrowDataset)`. This will call find_package(ArrowAcero) for + # us + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to # target_include_directories. That defeats ccache. 
file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - elseif(NOT Arrow_DIR) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" + ) + endif() + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! + # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() + else() set(ARROW_FOUND FALSE PARENT_SCOPE @@ -112,6 +269,48 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() if(Arrow_ADDED) + + set(arrow_code_string + [=[ + if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() + if (NOT TARGET arrow::flatbuffers) + add_library(arrow::flatbuffers INTERFACE IMPORTED) + endif() + if (NOT TARGET arrow::hadoop) + add_library(arrow::hadoop INTERFACE IMPORTED) + endif() + ]=] + ) + if(ENABLE_PARQUET) + string( + APPEND + arrow_code_string + " + find_package(Boost) + if (NOT TARGET Boost::headers) + add_library(Boost::headers INTERFACE IMPORTED) + endif() + " + ) + endif() + if(NOT TARGET xsimd) + string( + APPEND + arrow_code_string + " + if(NOT TARGET arrow::xsimd) + add_library(arrow::xsimd INTERFACE IMPORTED) + target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") + endif() + " + ) + endif() + rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) get_target_property(interface_libs arrow_static 
INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with @@ -125,17 +324,106 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() + rapids_export( + BUILD Arrow + VERSION ${VERSION} + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_code_string + ) + + if(ENABLE_PARQUET) + + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) + + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + 
VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() + endif() + # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) endif() + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) @@ -147,4 +435,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow) ) endif() -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) +find_and_configure_arrow( + ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} + ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW} +) From 3ec9ee5d3f3678e6e8c33df685d32e7755fb6cb4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 05:00:11 +0000 Subject: [PATCH 22/48] Disable libarrow from pyarrow --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 76 +--------------------- 1 file changed, 2 insertions(+), 74 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 0afdc526981..6d075ea1ac1 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ 
b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -22,82 +22,10 @@ include_guard(GLOBAL) -# Generate a FindArrow module for the case where we need to search for arrow within a pip install -# pyarrow. -function(find_libarrow_in_python_wheel PYARROW_VERSION) - string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") - list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) - list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) - - # Ensure that the major and minor versions are two digits long - string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) - string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) - if(${PYARROW_MAJOR_LENGTH} EQUAL 1) - set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") - endif() - if(${PYARROW_MINOR_LENGTH} EQUAL 1) - set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") - endif() - - set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") - - string( - APPEND - initial_code_block - [=[ -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" - OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - COMMAND_ERROR_IS_FATAL ANY -) -list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") -]=] - ) - string( - APPEND - final_code_block - [=[ -list(POP_BACK CMAKE_PREFIX_PATH) -]=] - ) - rapids_find_generate_module( - Arrow NO_CONFIG - VERSION "${PYARROW_VERSION}" - LIBRARY_NAMES "${PYARROW_LIB}" - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block - FINAL_CODE_BLOCK final_code_block - ) - - find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) - add_library(arrow_shared ALIAS Arrow::Arrow) - - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) -endfunction() - # This function finds arrow and sets any additional necessary environment variables. 
function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON - ENABLE_PARQUET PYARROW_LIBARROW + ENABLE_PARQUET ) - - if(PYARROW_LIBARROW) - # Generate a FindArrow.cmake to find pyarrow's libarrow.so - find_libarrow_in_python_wheel(${VERSION}) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) - return() - endif() - if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -437,5 +365,5 @@ endif() find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW} + ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ) From 2e1c11e5843bf21d3147de001f2a2311687961e5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 05:04:30 +0000 Subject: [PATCH 23/48] Remove I/O and Python options --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 137 +-------------------- 1 file changed, 6 insertions(+), 131 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 6d075ea1ac1..2c20fa48a41 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -23,9 +23,7 @@ include_guard(GLOBAL) # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON - ENABLE_PARQUET -) +function(find_and_configure_arrow VERSION BUILD_STATIC) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -78,22 +76,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_OPENSSL_USE_SHARED ON) endif() - set(ARROW_PYTHON_OPTIONS "") - if(ENABLE_PYTHON) - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
- list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - - set(ARROW_PARQUET_OPTIONS "") - if(ENABLE_PARQUET) - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. - list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") - list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static @@ -109,13 +91,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ENABLE_S3}" - "ARROW_ORC ${ENABLE_ORC}" - # e.g. needed by blazingsql-io - ${ARROW_PARQUET_OPTIONS} - "ARROW_PARQUET ${ENABLE_PARQUET}" + "ARROW_S3 OFF" + "ARROW_ORC OFF" + "ARROW_PARQUET OFF" "ARROW_FILESYSTEM ON" - ${ARROW_PYTHON_OPTIONS} + "ARROW_PYTHON OFF" # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" @@ -150,17 +130,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # variables from find_package that we might need. This is especially problematic when # rapids_cpm_find builds from source. find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)`. 
This will call find_package(ArrowAcero) for - # us - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) - endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to @@ -168,11 +137,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. # @@ -214,18 +178,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() ]=] ) - if(ENABLE_PARQUET) - string( - APPEND - arrow_code_string - " - find_package(Boost) - if (NOT TARGET Boost::headers) - add_library(Boost::headers INTERFACE IMPORTED) - endif() - " - ) - endif() if(NOT TARGET xsimd) string( APPEND @@ -261,92 +213,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB FINAL_CODE_BLOCK arrow_code_string ) - if(ENABLE_PARQUET) - - set(arrow_acero_code_string - [=[ - if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) - add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) - endif() - if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) - add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowAcero - VERSION ${VERSION} - EXPORT_SET arrow_acero_targets - GLOBAL_TARGETS arrow_acero_shared arrow_acero_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_acero_code_string - ) - - set(arrow_dataset_code_string - [=[ - if (TARGET cudf::arrow_dataset_shared AND (NOT 
TARGET arrow_dataset_shared)) - add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) - endif() - if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) - add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowDataset - VERSION ${VERSION} - EXPORT_SET arrow_dataset_targets - GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_dataset_code_string - ) - - set(parquet_code_string - [=[ - if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) - add_library(parquet_shared ALIAS cudf::parquet_shared) - endif() - if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) - add_library(parquet_static ALIAS cudf::parquet_static) - endif() - ]=] - ) - - rapids_export( - BUILD Parquet - VERSION ${VERSION} - EXPORT_SET parquet_targets - GLOBAL_TARGETS parquet_shared parquet_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK parquet_code_string - ) - endif() endif() # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) - if(ENABLE_PARQUET) - rapids_export_package(BUILD Parquet cudf-exports) - rapids_export_package(BUILD ArrowDataset cudf-exports) - endif() - include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports ) - rapids_export_find_package_root( - BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - rapids_export_find_package_root( - BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" @@ -365,5 +240,5 @@ endif() find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} 
${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} + ${CUDF_ENABLE_ARROW_PYTHON} ) From c6c4ba67b8c8df0bd7ef131795ae32ce491f145c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 05:04:57 +0000 Subject: [PATCH 24/48] Remove no longer supported armv8 option --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 2c20fa48a41..0f00f3d9895 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -50,10 +50,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() endif() - if(NOT ARROW_ARMV8_ARCH) - set(ARROW_ARMV8_ARCH "armv8-a") - endif() - if(NOT ARROW_SIMD_LEVEL) set(ARROW_SIMD_LEVEL "NONE") endif() @@ -98,7 +94,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) "ARROW_PYTHON OFF" # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" - "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" From c8b8e422f567848a734c5d50c4685a903235f0fa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 05:08:17 +0000 Subject: [PATCH 25/48] Remove export logic since we no longer install --- cpp/tests/cmake/thirdparty/get_arrow.cmake | 47 ---------------------- 1 file changed, 47 deletions(-) diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index 0f00f3d9895..f5915ebab7c 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -156,36 +156,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) endif() if(Arrow_ADDED) - - set(arrow_code_string - [=[ - if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) - add_library(arrow_shared ALIAS
cudf::arrow_shared) - endif() - if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) - add_library(arrow_static ALIAS cudf::arrow_static) - endif() - if (NOT TARGET arrow::flatbuffers) - add_library(arrow::flatbuffers INTERFACE IMPORTED) - endif() - if (NOT TARGET arrow::hadoop) - add_library(arrow::hadoop INTERFACE IMPORTED) - endif() - ]=] - ) - if(NOT TARGET xsimd) - string( - APPEND - arrow_code_string - " - if(NOT TARGET arrow::xsimd) - add_library(arrow::xsimd INTERFACE IMPORTED) - target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") - endif() - " - ) - endif() - rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with @@ -199,24 +169,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() - rapids_export( - BUILD Arrow - VERSION ${VERSION} - EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_code_string - ) - endif() - # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) - - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" From 46642123b63c539e07ebc34e45a50b421a9db94a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 05:13:01 +0000 Subject: [PATCH 26/48] Port changes over to Java copy --- .../native/cmake/thirdparty/get_arrow.cmake | 261 +----------------- 1 file changed, 6 insertions(+), 255 deletions(-) diff --git 
a/java/src/main/native/cmake/thirdparty/get_arrow.cmake b/java/src/main/native/cmake/thirdparty/get_arrow.cmake index 0afdc526981..f5915ebab7c 100644 --- a/java/src/main/native/cmake/thirdparty/get_arrow.cmake +++ b/java/src/main/native/cmake/thirdparty/get_arrow.cmake @@ -22,82 +22,8 @@ include_guard(GLOBAL) -# Generate a FindArrow module for the case where we need to search for arrow within a pip install -# pyarrow. -function(find_libarrow_in_python_wheel PYARROW_VERSION) - string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}") - list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER) - list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER) - - # Ensure that the major and minor versions are two digits long - string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH) - string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH) - if(${PYARROW_MAJOR_LENGTH} EQUAL 1) - set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}") - endif() - if(${PYARROW_MINOR_LENGTH} EQUAL 1) - set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}") - endif() - - set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}") - - string( - APPEND - initial_code_block - [=[ -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" - OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE - COMMAND_ERROR_IS_FATAL ANY -) -list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") -]=] - ) - string( - APPEND - final_code_block - [=[ -list(POP_BACK CMAKE_PREFIX_PATH) -]=] - ) - rapids_find_generate_module( - Arrow NO_CONFIG - VERSION "${PYARROW_VERSION}" - LIBRARY_NAMES "${PYARROW_LIB}" - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block - FINAL_CODE_BLOCK final_code_block - ) - - find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) - add_library(arrow_shared ALIAS 
Arrow::Arrow) - - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) -endfunction() - # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON - ENABLE_PARQUET PYARROW_LIBARROW -) - - if(PYARROW_LIBARROW) - # Generate a FindArrow.cmake to find pyarrow's libarrow.so - find_libarrow_in_python_wheel(${VERSION}) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) - return() - endif() - +function(find_and_configure_arrow VERSION BUILD_STATIC) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -124,10 +50,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() endif() - if(NOT ARROW_ARMV8_ARCH) - set(ARROW_ARMV8_ARCH "armv8-a") - endif() - if(NOT ARROW_SIMD_LEVEL) set(ARROW_SIMD_LEVEL "NONE") endif() @@ -150,22 +72,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_OPENSSL_USE_SHARED ON) endif() - set(ARROW_PYTHON_OPTIONS "") - if(ENABLE_PYTHON) - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON") - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. - list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - - set(ARROW_PARQUET_OPTIONS "") - if(ENABLE_PARQUET) - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
- list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") - list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static @@ -181,16 +87,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" "ARROW_JEMALLOC OFF" - "ARROW_S3 ${ENABLE_S3}" - "ARROW_ORC ${ENABLE_ORC}" - # e.g. needed by blazingsql-io - ${ARROW_PARQUET_OPTIONS} - "ARROW_PARQUET ${ENABLE_PARQUET}" + "ARROW_S3 OFF" + "ARROW_ORC OFF" + "ARROW_PARQUET OFF" "ARROW_FILESYSTEM ON" - ${ARROW_PYTHON_OPTIONS} + "ARROW_PYTHON OFF" # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off "ARROW_USE_CCACHE OFF" - "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}" "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" @@ -222,17 +125,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # variables from find_package that we might need. This is especially problematic when # rapids_cpm_find builds from source. find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)`. 
This will call find_package(ArrowAcero) for - # us - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) - endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to @@ -240,11 +132,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. # @@ -269,48 +156,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB endif() if(Arrow_ADDED) - - set(arrow_code_string - [=[ - if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) - add_library(arrow_shared ALIAS cudf::arrow_shared) - endif() - if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) - add_library(arrow_static ALIAS cudf::arrow_static) - endif() - if (NOT TARGET arrow::flatbuffers) - add_library(arrow::flatbuffers INTERFACE IMPORTED) - endif() - if (NOT TARGET arrow::hadoop) - add_library(arrow::hadoop INTERFACE IMPORTED) - endif() - ]=] - ) - if(ENABLE_PARQUET) - string( - APPEND - arrow_code_string - " - find_package(Boost) - if (NOT TARGET Boost::headers) - add_library(Boost::headers INTERFACE IMPORTED) - endif() - " - ) - endif() - if(NOT TARGET xsimd) - string( - APPEND - arrow_code_string - " - if(NOT TARGET arrow::xsimd) - add_library(arrow::xsimd INTERFACE IMPORTED) - target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") - endif() - " - ) - endif() - rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) 
get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with @@ -324,102 +169,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() - rapids_export( - BUILD Arrow - VERSION ${VERSION} - EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_code_string - ) - - if(ENABLE_PARQUET) - - set(arrow_acero_code_string - [=[ - if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) - add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) - endif() - if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) - add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowAcero - VERSION ${VERSION} - EXPORT_SET arrow_acero_targets - GLOBAL_TARGETS arrow_acero_shared arrow_acero_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_acero_code_string - ) - - set(arrow_dataset_code_string - [=[ - if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) - add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) - endif() - if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) - add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowDataset - VERSION ${VERSION} - EXPORT_SET arrow_dataset_targets - GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_dataset_code_string - ) - - set(parquet_code_string - [=[ - if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) - add_library(parquet_shared ALIAS cudf::parquet_shared) - endif() - if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) - add_library(parquet_static ALIAS 
cudf::parquet_static) - endif() - ]=] - ) - - rapids_export( - BUILD Parquet - VERSION ${VERSION} - EXPORT_SET parquet_targets - GLOBAL_TARGETS parquet_shared parquet_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK parquet_code_string - ) - endif() - endif() - # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) - - if(ENABLE_PARQUET) - rapids_export_package(BUILD Parquet cudf-exports) - rapids_export_package(BUILD ArrowDataset cudf-exports) endif() - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports - ) - rapids_export_find_package_root( - BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - rapids_export_find_package_root( - BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE @@ -437,5 +188,5 @@ endif() find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW} + ${CUDF_ENABLE_ARROW_PYTHON} ) From b5721cfbf816d149151d50ff02ac2df9b3d7c334 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 05:31:06 +0000 Subject: [PATCH 27/48] Two more tests need Arrow --- cpp/tests/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index dade9dd654a..491feded52f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -297,7 +297,7 @@ ConfigureTest(ROW_SELECTION_TEST io/row_selection_test.cpp) ConfigureTest( CSV_TEST io/csv_test.cpp GPUS 1 - PERCENT 30 + PERCENT 30 EXTRA_LIBS ${ARROW_LIBRARIES} ) 
ConfigureTest( FILE_IO_TEST io/file_io_test.cpp @@ -325,7 +325,7 @@ ConfigureTest( ConfigureTest( JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu GPUS 1 - PERCENT 30 + PERCENT 30 EXTRA_LIBS ${ARROW_LIBRARIES} ) ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp) ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu) From ec9105b23832049e036804cf57342fa119c97bd5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 06:11:19 +0000 Subject: [PATCH 28/48] Add Arrow dep to example and default Arrow linkage to static everywhere --- cpp/examples/interop/CMakeLists.txt | 2 ++ cpp/tests/CMakeLists.txt | 3 +-- cpp/tests/cmake/thirdparty/get_arrow.cmake | 5 +++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index a1f99c1d2fd..93256eefe9c 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -14,7 +14,9 @@ project( ) include(../fetch_dependencies.cmake) +include(../../tests/cmake/thirdparty/get_arrow.cmake) add_executable(interop interop.cpp) target_link_libraries(interop PRIVATE cudf::cudf) target_compile_features(interop PRIVATE cxx_std_17) +target_link_libraries(interop PRIVATE ${ARROW_LIBRARIES}) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 491feded52f..3280ee1d937 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -82,8 +82,7 @@ endfunction() # dependencies ################################################################################### # ################################################################################################## -# find arrow. Always use static for tests -set(CUDF_USE_ARROW_STATIC ON) +# find arrow. 
include(cmake/thirdparty/get_arrow.cmake) # ################################################################################################## diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/tests/cmake/thirdparty/get_arrow.cmake index f5915ebab7c..063f7c496a5 100644 --- a/cpp/tests/cmake/thirdparty/get_arrow.cmake +++ b/cpp/tests/cmake/thirdparty/get_arrow.cmake @@ -186,6 +186,11 @@ if(NOT DEFINED CUDF_VERSION_Arrow) ) endif() +# Default to static arrow builds +if(NOT DEFINED CUDF_USE_ARROW_STATIC) + set(CUDF_USE_ARROW_STATIC ON) +endif() + find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} ${CUDF_ENABLE_ARROW_PYTHON} From e90fdf8e51c909943095bc9972300c3492b4f0af Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 14:42:50 +0000 Subject: [PATCH 29/48] Add build type specifier --- cpp/examples/interop/CMakeLists.txt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 93256eefe9c..7b4694125b8 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -14,6 +14,14 @@ project( ) include(../fetch_dependencies.cmake) + +# Try hardcoding build type for arrow +if(NOT DEFINED CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE + Release + CACHE STRING "Choose the type of build." 
+ ) +endif() include(../../tests/cmake/thirdparty/get_arrow.cmake) add_executable(interop interop.cpp) From 8af5b3318bb06190be7af8feb9617704ae39342e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 15:14:21 +0000 Subject: [PATCH 30/48] Temporarily disable interop example to validate everything else --- cpp/examples/build.sh | 2 +- cpp/examples/interop/CMakeLists.txt | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 2d6f6f316c7..085e66bcb00 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -61,4 +61,4 @@ build_example tpch build_example strings build_example nested_types build_example parquet_io -build_example interop +#build_example interop diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 7b4694125b8..93256eefe9c 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -14,14 +14,6 @@ project( ) include(../fetch_dependencies.cmake) - -# Try hardcoding build type for arrow -if(NOT DEFINED CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE - Release - CACHE STRING "Choose the type of build." 
- ) -endif() include(../../tests/cmake/thirdparty/get_arrow.cmake) add_executable(interop interop.cpp) From b9cf4241d060c1bc84b0d7494f4899bc9cb3af73 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 15:43:01 +0000 Subject: [PATCH 31/48] Set the build type correctly --- cpp/examples/build.sh | 2 +- cpp/examples/interop/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 085e66bcb00..2d6f6f316c7 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -61,4 +61,4 @@ build_example tpch build_example strings build_example nested_types build_example parquet_io -#build_example interop +build_example interop diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 93256eefe9c..2d4a643fb82 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -14,6 +14,9 @@ project( ) include(../fetch_dependencies.cmake) + +# Try hardcoding build type for arrow +set(CMAKE_BUILD_TYPE Release) include(../../tests/cmake/thirdparty/get_arrow.cmake) add_executable(interop interop.cpp) From a6e8528f61113450cfff2a1c153d36ac4f5f8845 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 16:56:18 +0000 Subject: [PATCH 32/48] Go back to a single get_arrow.cmake --- .../cmake/thirdparty/get_arrow.cmake | 0 cpp/examples/interop/CMakeLists.txt | 2 +- cpp/tests/CMakeLists.txt | 2 +- java/src/main/native/CMakeLists.txt | 2 +- .../native/cmake/thirdparty/get_arrow.cmake | 192 ------------------ 5 files changed, 3 insertions(+), 195 deletions(-) rename cpp/{tests => }/cmake/thirdparty/get_arrow.cmake (100%) delete mode 100644 java/src/main/native/cmake/thirdparty/get_arrow.cmake diff --git a/cpp/tests/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake similarity index 100% rename from cpp/tests/cmake/thirdparty/get_arrow.cmake rename to cpp/cmake/thirdparty/get_arrow.cmake diff 
--git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 2d4a643fb82..c4ab92073c2 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -17,7 +17,7 @@ include(../fetch_dependencies.cmake) # Try hardcoding build type for arrow set(CMAKE_BUILD_TYPE Release) -include(../../tests/cmake/thirdparty/get_arrow.cmake) +include(../../cmake/thirdparty/get_arrow.cmake) add_executable(interop interop.cpp) target_link_libraries(interop PRIVATE cudf::cudf) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 3280ee1d937..580193648f4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -83,7 +83,7 @@ endfunction() # ################################################################################################## # find arrow. -include(cmake/thirdparty/get_arrow.cmake) +include(../cmake/thirdparty/get_arrow.cmake) # ################################################################################################## # test sources ################################################################################## diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 461582cc33d..55332419bc8 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -213,7 +213,7 @@ target_compile_definitions( target_link_options(cudfjni PRIVATE "-Wl,--no-undefined") set(CUDF_USE_ARROW_STATIC ON) -include(cmake/thirdparty/get_arrow.cmake) +include(../../../../cpp/cmake/thirdparty/get_arrow.cmake) target_link_libraries(cudfjni PRIVATE ${ARROW_LIBRARIES}) if(USE_GDS) diff --git a/java/src/main/native/cmake/thirdparty/get_arrow.cmake b/java/src/main/native/cmake/thirdparty/get_arrow.cmake deleted file mode 100644 index f5915ebab7c..00000000000 --- a/java/src/main/native/cmake/thirdparty/get_arrow.cmake +++ /dev/null @@ -1,192 +0,0 @@ -# ============================================================================= -# Copyright 
(c) 2020-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. -# ============================================================================= - -# Finding arrow is far more complex than it should be, and as a result we violate multiple linting -# rules aiming to limit complexity. Since all our other CMake scripts conform to expectations -# without undue difficulty, disabling those rules for just this function is our best approach for -# now. The spacing between this comment, the cmake-lint directives, and the function docstring is -# necessary to prevent cmake-format from trying to combine the lines. - -# cmake-lint: disable=R0912,R0913,R0915 - -include_guard(GLOBAL) - -# This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC) - if(BUILD_STATIC) - if(TARGET arrow_static) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_static - PARENT_SCOPE - ) - return() - endif() - else() - if(TARGET arrow_shared) - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - set(ARROW_LIBRARIES - arrow_shared - PARENT_SCOPE - ) - return() - endif() - endif() - - if(NOT ARROW_SIMD_LEVEL) - set(ARROW_SIMD_LEVEL "NONE") - endif() - - if(BUILD_STATIC) - set(ARROW_BUILD_STATIC ON) - set(ARROW_BUILD_SHARED OFF) - # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library. 
- set(CPM_DOWNLOAD_Arrow TRUE) - # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given - # that shared linking is advised for critical components like SSL. If a static build is - # requested, we honor ARROW's default of static linking, but users may consider setting - # ARROW_OPENSSL_USE_SHARED even in static builds. - else() - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) - # By default ARROW will try to search for a static version of OpenSSL which is a bad idea given - # that shared linking is advised for critical components like SSL - set(ARROW_OPENSSL_USE_SHARED ON) - endif() - - rapids_cpm_find( - Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static - parquet_static arrow_acero_static arrow_dataset_static - CPM_ARGS - GIT_REPOSITORY https://github.com/apache/arrow.git - GIT_TAG apache-arrow-${VERSION} - GIT_SHALLOW TRUE SOURCE_SUBDIR cpp - OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" - "ARROW_ACERO ON" - "ARROW_IPC ON" - "ARROW_DATASET ON" - "ARROW_WITH_BACKTRACE ON" - "ARROW_CXXFLAGS -w" - "ARROW_JEMALLOC OFF" - "ARROW_S3 OFF" - "ARROW_ORC OFF" - "ARROW_PARQUET OFF" - "ARROW_FILESYSTEM ON" - "ARROW_PYTHON OFF" - # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off - "ARROW_USE_CCACHE OFF" - "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}" - "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}" - "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_POSITION_INDEPENDENT_CODE ON" - "ARROW_DEPENDENCY_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_BOOST_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_BROTLI_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_GFLAGS_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_GRPC_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_PROTOBUF_USE_SHARED ${ARROW_BUILD_SHARED}" - "ARROW_ZSTD_USE_SHARED ${ARROW_BUILD_SHARED}" - "xsimd_SOURCE AUTO" - ) - - set(ARROW_FOUND - TRUE - PARENT_SCOPE - ) - - if(BUILD_STATIC) - set(ARROW_LIBRARIES arrow_static) - else() - 
set(ARROW_LIBRARIES arrow_shared) - endif() - - # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. - if(Arrow_DIR) - # This extra find_package is necessary because rapids_cpm_find does not propagate all the - # variables from find_package that we might need. This is especially problematic when - # rapids_cpm_find builds from source. - find_package(Arrow REQUIRED QUIET) - # Arrow_ADDED: set if CPM downloaded Arrow from Github - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to - # target_include_directories. That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" - ) - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. Don't forget to update - # this list if we add more! 
- # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() - else() - set(ARROW_FOUND - FALSE - PARENT_SCOPE - ) - message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") - endif() - - if(Arrow_ADDED) - if(TARGET arrow_static) - get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) - # The `arrow_static` library is leaking a dependency on the object libraries it was built with - # we need to remove this from the interface, since keeping them around would cause duplicate - # symbols and CMake export errors - if(interface_libs MATCHES "arrow_array" AND interface_libs MATCHES "arrow_compute") - string(REPLACE "BUILD_INTERFACE:" "BUILD_LOCAL_INTERFACE:" interface_libs - "${interface_libs}" - ) - set_target_properties(arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES "${interface_libs}") - get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) - endif() - endif() - endif() - - set(ARROW_LIBRARIES - "${ARROW_LIBRARIES}" - PARENT_SCOPE - ) -endfunction() - -if(NOT DEFINED CUDF_VERSION_Arrow) - set(CUDF_VERSION_Arrow - # This version must be kept in sync with the libarrow version pinned for builds in - # dependencies.yaml. 
- 16.1.0 - CACHE STRING "The version of Arrow to find (or build)" - ) -endif() - -find_and_configure_arrow( - ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} -) From 923d1f26f0b1324e1c9aef56a0a0892679d36aeb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 16:56:36 +0000 Subject: [PATCH 33/48] Update comment for interop example --- cpp/examples/interop/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index c4ab92073c2..1eb06d30f88 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -15,7 +15,7 @@ project( include(../fetch_dependencies.cmake) -# Try hardcoding build type for arrow +# The Arrow CMake is currently broken if the build type is not set set(CMAKE_BUILD_TYPE Release) include(../../cmake/thirdparty/get_arrow.cmake) From 351c358e6e36a3f0c6982e3f739bf4df1b8384f5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 16:59:29 +0000 Subject: [PATCH 34/48] Stop passing nonexistent args --- cpp/cmake/thirdparty/get_arrow.cmake | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 063f7c496a5..ab07c9c3959 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -191,7 +191,4 @@ if(NOT DEFINED CUDF_USE_ARROW_STATIC) set(CUDF_USE_ARROW_STATIC ON) endif() -find_and_configure_arrow( - ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC} - ${CUDF_ENABLE_ARROW_PYTHON} -) +find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) From bb5b26753942e0d1fb2e3a6efa2d9e4f7abb48b1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 22 Aug 2024 17:13:22 +0000 Subject: [PATCH 35/48] Default to excluding Arrow from 
installation --- cpp/cmake/thirdparty/get_arrow.cmake | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index ab07c9c3959..461c884c012 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -23,7 +23,7 @@ include_guard(GLOBAL) # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC) +function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -80,6 +80,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC) GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp + EXCLUDE_FROM_ALL ${EXCLUDE_FROM_ALL} OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" "ARROW_ACERO ON" "ARROW_IPC ON" @@ -191,4 +192,11 @@ if(NOT DEFINED CUDF_USE_ARROW_STATIC) set(CUDF_USE_ARROW_STATIC ON) endif() -find_and_configure_arrow(${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC}) +# Default to excluding from installation since we generally privately and statically link Arrow. 
+if(NOT DEFINED CUDF_EXCLUDE_ARROW_FROM_ALL) + set(CUDF_EXCLUDE_ARROW_FROM_ALL ${CUDF_USE_ARROW_STATIC}) +endif() + +find_and_configure_arrow( + ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_EXCLUDE_ARROW_FROM_ALL} +) From d1d5518a2872cb99502c795e4d1368507a911ec5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 17:33:20 +0000 Subject: [PATCH 36/48] Fix numpy pinning --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/pylibcudf/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f6d24f3552a..dea539cfb63 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -52,7 +52,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 - nvcomp==3.0.6 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 56eb063ef5c..0cf0bd88700 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -51,7 +51,7 @@ dependencies: - ninja - notebook - numba>=0.57 -- numpy>=1.23,<2.0a0 +- numpy>=1.23,<3.0a0 - numpydoc - nvcomp==3.0.6 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index af1817a30dc..53f52a35651 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -81,7 +81,7 @@ requirements: - pandas >=2.0,<2.2.3dev0 - cupy >=12.0.0 - numba >=0.57 - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - pyarrow ==16.1.0.* - libcudf ={{ version }} - pylibcudf ={{ version }} 
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 732e3c0b52c..67b9b76bb8c 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -78,7 +78,7 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.3dev0 - - numpy >=1.23,<2.0a0 + - numpy >=1.23,<3.0a0 - pyarrow ==16.1.0.* - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 diff --git a/dependencies.yaml b/dependencies.yaml index 620c4621eed..4dad9ba7326 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -579,7 +579,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<2.0a0 + - &numpy numpy>=1.23,<3.0a0 - pandas>=2.0,<2.2.3dev0 run_pylibcudf: common: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 7e197d888bf..fd10d52267d 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "fsspec>=0.6.0", "libcudf==24.10.*,>=0.0.0a0", "numba>=0.57", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.3dev0", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 872ecd35c28..d5da7030a75 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cudf==24.10.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "pandas>=2.0,<2.2.3dev0", "rapids-dask-dependency==24.10.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 271bc8c04aa..a600dd061a3 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -41,7 +41,7 @@ classifiers = [ test = [ "fastavro>=0.22.9", "hypothesis", - "numpy>=1.23,<2.0a0", + "numpy>=1.23,<3.0a0", "pandas", "pytest-cov", "pytest-xdist", From 433ba080495e81216988264aacd0db9f4f797f65 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 20:27:25 +0000 Subject: [PATCH 37/48] Add back arrow parquet support --- cpp/cmake/thirdparty/get_arrow.cmake | 29 ++++++++++++++++++++++++++-- java/src/main/native/CMakeLists.txt | 1 + 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 461c884c012..b4e77f3296a 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -23,7 +23,7 @@ include_guard(GLOBAL) # This function finds arrow and sets any additional necessary environment variables. -function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL) +function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET) if(BUILD_STATIC) if(TARGET arrow_static) set(ARROW_FOUND @@ -72,6 +72,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL) set(ARROW_OPENSSL_USE_SHARED ON) endif() + if(ENABLE_PARQUET) + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
+ list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static @@ -90,7 +97,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL) "ARROW_JEMALLOC OFF" "ARROW_S3 OFF" "ARROW_ORC OFF" - "ARROW_PARQUET OFF" + ${ARROW_PARQUET_OPTIONS} + "ARROW_PARQUET ${ENABLE_PARQUET}" "ARROW_FILESYSTEM ON" "ARROW_PYTHON OFF" # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off @@ -126,6 +134,13 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL) # variables from find_package that we might need. This is especially problematic when # rapids_cpm_find builds from source. find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") + endif() + endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to @@ -133,6 +148,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL) file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" + ) + endif() # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. 
# @@ -197,6 +217,11 @@ if(NOT DEFINED CUDF_EXCLUDE_ARROW_FROM_ALL) set(CUDF_EXCLUDE_ARROW_FROM_ALL ${CUDF_USE_ARROW_STATIC}) endif() +if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET) + set(CUDF_ENABLE_ARROW_PARQUET OFF) +endif() + find_and_configure_arrow( ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_EXCLUDE_ARROW_FROM_ALL} + ${CUDF_ENABLE_ARROW_PARQUET} ) diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 29286194d37..93a2afc95b6 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -213,6 +213,7 @@ target_compile_definitions( target_link_options(cudfjni PRIVATE "-Wl,--no-undefined") include(../../../../cpp/cmake/thirdparty/get_arrow.cmake) +set(CUDF_ENABLE_ARROW_PARQUET ON) target_link_libraries(cudfjni PRIVATE ${ARROW_LIBRARIES}) if(USE_GDS) From 602c4d602b098e127facafe46b7539bb230613b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 20:45:19 +0000 Subject: [PATCH 38/48] Fix order --- java/src/main/native/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 93a2afc95b6..c18a90140b6 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -212,8 +212,8 @@ target_compile_definitions( ) target_link_options(cudfjni PRIVATE "-Wl,--no-undefined") -include(../../../../cpp/cmake/thirdparty/get_arrow.cmake) set(CUDF_ENABLE_ARROW_PARQUET ON) +include(../../../../cpp/cmake/thirdparty/get_arrow.cmake) target_link_libraries(cudfjni PRIVATE ${ARROW_LIBRARIES}) if(USE_GDS) From 04d6e4f54f0634adc9715f08b2d4c19e01e4aed4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 20:56:39 +0000 Subject: [PATCH 39/48] Add missing set --- cpp/cmake/thirdparty/get_arrow.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 
b4e77f3296a..ec1d41821e3 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -72,6 +72,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P set(ARROW_OPENSSL_USE_SHARED ON) endif() + set(ARROW_PARQUET_OPTIONS "") if(ENABLE_PARQUET) # Arrow's logic to build Boost from source is busted, so we have to get it from the system. list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") From 556c2b5d7324a09b71ba8f5e15ca2adf21250671 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 21:01:35 +0000 Subject: [PATCH 40/48] Remove extra parquet options altogether --- cpp/cmake/thirdparty/get_arrow.cmake | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index ec1d41821e3..cddcb9bc7e0 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -72,14 +72,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P set(ARROW_OPENSSL_USE_SHARED ON) endif() - set(ARROW_PARQUET_OPTIONS "") - if(ENABLE_PARQUET) - # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
- list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") - list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") - list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") - endif() - rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static @@ -98,7 +90,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P "ARROW_JEMALLOC OFF" "ARROW_S3 OFF" "ARROW_ORC OFF" - ${ARROW_PARQUET_OPTIONS} "ARROW_PARQUET ${ENABLE_PARQUET}" "ARROW_FILESYSTEM ON" "ARROW_PYTHON OFF" From 9bc6450e3644861ce71faaf3945c7e3209282957 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 21:17:53 +0000 Subject: [PATCH 41/48] Revert "Remove extra parquet options altogether" This reverts commit 556c2b5d7324a09b71ba8f5e15ca2adf21250671. --- cpp/cmake/thirdparty/get_arrow.cmake | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index cddcb9bc7e0..ec1d41821e3 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -72,6 +72,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P set(ARROW_OPENSSL_USE_SHARED ON) endif() + set(ARROW_PARQUET_OPTIONS "") + if(ENABLE_PARQUET) + # Arrow's logic to build Boost from source is busted, so we have to get it from the system. 
+ list(APPEND ARROW_PARQUET_OPTIONS "BOOST_SOURCE SYSTEM") + list(APPEND ARROW_PARQUET_OPTIONS "Thrift_SOURCE BUNDLED") + list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") + endif() + rapids_cpm_find( Arrow ${VERSION} GLOBAL_TARGETS arrow_shared parquet_shared arrow_acero_shared arrow_dataset_shared arrow_static @@ -90,6 +98,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P "ARROW_JEMALLOC OFF" "ARROW_S3 OFF" "ARROW_ORC OFF" + ${ARROW_PARQUET_OPTIONS} "ARROW_PARQUET ${ENABLE_PARQUET}" "ARROW_FILESYSTEM ON" "ARROW_PYTHON OFF" From afa8cb70c16409406eaaef96ad2a353f4a63fb2c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 21:47:19 +0000 Subject: [PATCH 42/48] Remove one more set of unnecessary libarrow deps --- dependencies.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 4dad9ba7326..d7fab5fb3db 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -39,7 +39,6 @@ files: output: none includes: - cuda_version - - libarrow_run - test_cpp test_python: output: none @@ -57,7 +56,6 @@ files: - build_all - cuda - cuda_version - - libarrow_run - test_java test_notebooks: output: none @@ -76,7 +74,6 @@ files: - cuda - cuda_version - docs - - libarrow_run - py_version py_build_cudf: output: pyproject @@ -387,15 +384,6 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cython>=3.0.3 - libarrow_run: - common: - - output_types: conda - packages: - # Allow runtime version to float up to patch version - - libarrow-acero>=16.1.0,<16.2.0a0 - - libarrow-dataset>=16.1.0,<16.2.0a0 - - libarrow>=16.1.0,<16.2.0a0 - - libparquet>=16.1.0,<16.2.0a0 pyarrow_run: common: - output_types: [conda, requirements, pyproject] From 213f0dba2a66032c70ab7e45ec6041c04c1d9975 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 22:40:31 +0000 Subject: [PATCH 43/48] Make sure install rules are in place for the JNI --- 
cpp/cmake/thirdparty/get_arrow.cmake | 4 ++-- cpp/examples/interop/CMakeLists.txt | 2 ++ cpp/tests/CMakeLists.txt | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index ec1d41821e3..2723c6c49d9 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -215,11 +215,11 @@ endif() # Default to excluding from installation since we generally privately and statically link Arrow. if(NOT DEFINED CUDF_EXCLUDE_ARROW_FROM_ALL) - set(CUDF_EXCLUDE_ARROW_FROM_ALL ${CUDF_USE_ARROW_STATIC}) + set(CUDF_EXCLUDE_ARROW_FROM_ALL OFF) endif() if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET) - set(CUDF_ENABLE_ARROW_PARQUET OFF) + set(CUDF_ENABLE_ARROW_PARQUET ON) endif() find_and_configure_arrow( diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt index 1eb06d30f88..2816f613d3d 100644 --- a/cpp/examples/interop/CMakeLists.txt +++ b/cpp/examples/interop/CMakeLists.txt @@ -17,6 +17,8 @@ include(../fetch_dependencies.cmake) # The Arrow CMake is currently broken if the build type is not set set(CMAKE_BUILD_TYPE Release) +# No need to install Arrow libs when only the final example executable is shipped. +set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) include(../../cmake/thirdparty/get_arrow.cmake) add_executable(interop interop.cpp) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 580193648f4..f86acbcc51b 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -82,7 +82,8 @@ endfunction() # dependencies ################################################################################### # ################################################################################################## -# find arrow. +# No need to install Arrow libs when only the final test executables are shipped. 
+set(CUDF_EXCLUDE_ARROW_FROM_ALL ON) include(../cmake/thirdparty/get_arrow.cmake) # ################################################################################################## From d3e14e0b5b8f81cc50a41575d547aa4ed163c8f1 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 22:48:21 +0000 Subject: [PATCH 44/48] Revert unconditional parquet inclusion --- cpp/cmake/thirdparty/get_arrow.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 2723c6c49d9..d6a62d78c77 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -219,7 +219,7 @@ if(NOT DEFINED CUDF_EXCLUDE_ARROW_FROM_ALL) endif() if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET) - set(CUDF_ENABLE_ARROW_PARQUET ON) + set(CUDF_ENABLE_ARROW_PARQUET OFF) endif() find_and_configure_arrow( From 15ecf667be6c95fc8f91df116218817bc6a98fab Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 22:48:40 +0000 Subject: [PATCH 45/48] Make sure boost is available for the java test build --- dependencies.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/dependencies.yaml b/dependencies.yaml index d7fab5fb3db..9201e50f2f3 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -698,6 +698,7 @@ dependencies: - *cmake_ver - maven - openjdk=8.* + - boost test_python_common: common: - output_types: [conda, requirements, pyproject] From 4e653e9539a615f87b8affa0a97a53c2004a5543 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 26 Aug 2024 23:05:06 +0000 Subject: [PATCH 46/48] Put back install rules --- cpp/cmake/thirdparty/get_arrow.cmake | 138 +++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index d6a62d78c77..257e8030486 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -141,6 +141,10 @@ 
function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P # Set this to enable `find_package(Parquet)` set(Parquet_DIR "${Arrow_DIR}") endif() + # Set this to enable `find_package(ArrowDataset)`. This will call find_package(ArrowAcero) for + # us + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) endif() # Arrow_ADDED: set if CPM downloaded Arrow from Github elseif(Arrow_ADDED) @@ -178,6 +182,47 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P endif() if(Arrow_ADDED) + set(arrow_code_string + [=[ + if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared)) + add_library(arrow_shared ALIAS cudf::arrow_shared) + endif() + if (TARGET cudf::arrow_static AND (NOT TARGET arrow_static)) + add_library(arrow_static ALIAS cudf::arrow_static) + endif() + if (NOT TARGET arrow::flatbuffers) + add_library(arrow::flatbuffers INTERFACE IMPORTED) + endif() + if (NOT TARGET arrow::hadoop) + add_library(arrow::hadoop INTERFACE IMPORTED) + endif() + ]=] + ) + if(ENABLE_PARQUET) + string( + APPEND + arrow_code_string + " + find_package(Boost) + if (NOT TARGET Boost::headers) + add_library(Boost::headers INTERFACE IMPORTED) + endif() + " + ) + endif() + if(NOT TARGET xsimd) + string( + APPEND + arrow_code_string + " + if(NOT TARGET arrow::xsimd) + add_library(arrow::xsimd INTERFACE IMPORTED) + target_include_directories(arrow::xsimd INTERFACE \"${Arrow_BINARY_DIR}/xsimd_ep/src/xsimd_ep-install/include\") + endif() + " + ) + endif() + rapids_cmake_install_lib_dir(lib_dir) if(TARGET arrow_static) get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) # The `arrow_static` library is leaking a dependency on the object libraries it was built with @@ -191,8 +236,101 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() + rapids_export( + BUILD Arrow + VERSION 
${VERSION} + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_code_string + ) + + if(ENABLE_PARQUET) + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) + + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() + endif() + + # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + 
rapids_export_package(INSTALL Arrow cudf-exports) + + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) endif() + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE From 894ac9377dbfba29fc3b912f6667167e6acb8cfe Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Aug 2024 00:46:12 +0000 Subject: [PATCH 47/48] Generate the install rules conditionally --- cpp/cmake/thirdparty/get_arrow.cmake | 168 ++++++++++++++------------- 1 file changed, 86 insertions(+), 82 deletions(-) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 257e8030486..f80daf904e6 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -236,100 +236,104 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() - rapids_export( - BUILD Arrow - VERSION ${VERSION} - EXPORT_SET arrow_targets - GLOBAL_TARGETS arrow_shared arrow_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_code_string - ) - - if(ENABLE_PARQUET) - set(arrow_acero_code_string - [=[ - if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) - add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) - endif() - if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) - add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) - endif() - ]=] - ) - + if(NOT 
EXCLUDE_FROM_ALL) rapids_export( - BUILD ArrowAcero + BUILD Arrow VERSION ${VERSION} - EXPORT_SET arrow_acero_targets - GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + EXPORT_SET arrow_targets + GLOBAL_TARGETS arrow_shared arrow_static NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_acero_code_string + FINAL_CODE_BLOCK arrow_code_string ) - set(arrow_dataset_code_string - [=[ - if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) - add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) - endif() - if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) - add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) - endif() - ]=] - ) + if(ENABLE_PARQUET) + set(arrow_acero_code_string + [=[ + if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared)) + add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared) + endif() + if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static)) + add_library(arrow_acero_static ALIAS cudf::arrow_acero_static) + endif() + ]=] + ) - rapids_export( - BUILD ArrowDataset - VERSION ${VERSION} - EXPORT_SET arrow_dataset_targets - GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_dataset_code_string - ) - set(parquet_code_string - [=[ - if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) - add_library(parquet_shared ALIAS cudf::parquet_shared) - endif() - if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) - add_library(parquet_static ALIAS cudf::parquet_static) - endif() - ]=] - ) + rapids_export( + BUILD ArrowAcero + VERSION ${VERSION} + EXPORT_SET arrow_acero_targets + GLOBAL_TARGETS arrow_acero_shared arrow_acero_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_acero_code_string + ) - rapids_export( - BUILD Parquet - VERSION ${VERSION} - EXPORT_SET parquet_targets - GLOBAL_TARGETS parquet_shared parquet_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK 
parquet_code_string - ) + set(arrow_dataset_code_string + [=[ + if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared)) + add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared) + endif() + if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static)) + add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static) + endif() + ]=] + ) + + rapids_export( + BUILD ArrowDataset + VERSION ${VERSION} + EXPORT_SET arrow_dataset_targets + GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK arrow_dataset_code_string + ) + set(parquet_code_string + [=[ + if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared)) + add_library(parquet_shared ALIAS cudf::parquet_shared) + endif() + if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static)) + add_library(parquet_static ALIAS cudf::parquet_static) + endif() + ]=] + ) + + rapids_export( + BUILD Parquet + VERSION ${VERSION} + EXPORT_SET parquet_targets + GLOBAL_TARGETS parquet_shared parquet_static + NAMESPACE cudf:: + FINAL_CODE_BLOCK parquet_code_string + ) + endif() endif() endif() - # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` - rapids_export_package(BUILD Arrow cudf-exports) - rapids_export_package(INSTALL Arrow cudf-exports) + if(NOT EXCLUDE_FROM_ALL) + # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) - if(ENABLE_PARQUET) - rapids_export_package(BUILD Parquet cudf-exports) - rapids_export_package(BUILD ArrowDataset cudf-exports) - endif() + if(ENABLE_PARQUET) + rapids_export_package(BUILD Parquet cudf-exports) + rapids_export_package(BUILD ArrowDataset cudf-exports) + endif() - include("${rapids-cmake-dir}/export/find_package_root.cmake") - rapids_export_find_package_root( - BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] 
EXPORT_SET cudf-exports - ) - rapids_export_find_package_root( - BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) - rapids_export_find_package_root( - BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] - EXPORT_SET cudf-exports - CONDITION ENABLE_PARQUET - ) + include("${rapids-cmake-dir}/export/find_package_root.cmake") + rapids_export_find_package_root( + BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports + ) + rapids_export_find_package_root( + BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + rapids_export_find_package_root( + BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] + EXPORT_SET cudf-exports + CONDITION ENABLE_PARQUET + ) + endif() set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" From aa2952f1f15a13c9a0d40c36d8e0f5d2ce8961ee Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 27 Aug 2024 00:48:40 +0000 Subject: [PATCH 48/48] Include rapids-export --- cpp/cmake/thirdparty/get_arrow.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index f80daf904e6..07cbf5150f4 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -236,6 +236,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_P get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES) endif() endif() + + include(rapids-export) if(NOT EXCLUDE_FROM_ALL) rapids_export( BUILD Arrow