From efbee5a98b4450f6394a2d0c86532371287603c4 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Wed, 30 Aug 2023 09:30:02 +0100 Subject: [PATCH] GH-29847: [C++] Build with Azure SDK for C++ (#36835) ### Rationale for this change We want to use the Azure SDK for C++ to read/write to Azure blob storage. Obviously this is pretty important for building an `AzureFileSystem`. ### What changes are included in this PR? Builds the the relevant parts of the azure SDK as a cmake external project. Adds a couple of simple tests that just assert that the Azure SDK is working and a couple of lines in `AzureFileSystem` to initialise the blob storage client to ensure the build is working correctly in all environments. I started with the build setup from https://github.com/apache/arrow/pull/12914 but I did make few changes. 1. Although its atypical for this project we chose to switch from cmake's `ExternalProject` to `FetchContent`. `FetchContent` is recomended by the Azure docs https://github.com/Azure/azure-sdk-for-cpp#cmake-project--fetch-content. It also solves a few problems including: automatically linking system curl and ssl instead of bootstrapping vcpkg and installing curl and ssl from there. 2. Only build one version of the Azure SDK for C++ because it contains all the components. Previously we were unnecessarily building 5 different versions of the whole thing on top of each other. This created race conditions for which version each component came from. 3. We are using `azure-core_1.10.2` which is a very recent version. There are a couple of important reasons for this 1. [an important managed identity fix](https://github.com/Azure/azure-sdk-for-cpp/issues/4723), 2. [fixed support for curl versions < 7.71.0](https://github.com/Azure/azure-sdk-for-cpp/issues/4792). There will be follow up PRs to enable Azure in the manylinux builds. We need to update `vcpkg` first so we can get a version of the Azure SDK which contains [an important managed identity fix](https://github.com/Azure/azure-sdk-for-cpp/issues/4723). ### Are these changes tested? Yes. There is a simple test that just runs the Azure client against azurite. Additionally just initialising the client in `AzureFileSystem` goes a long way towards ensuring the build is working. ### Are there any user-facing changes? No * Closes: #29847 Lead-authored-by: Thomas Newton Co-authored-by: Sutou Kouhei Co-authored-by: Sutou Kouhei Co-authored-by: Antoine Pitrou Co-authored-by: shefali singh Signed-off-by: Sutou Kouhei --- ci/docker/ubuntu-20.04-cpp.dockerfile | 2 + ci/docker/ubuntu-22.04-cpp.dockerfile | 2 + ci/scripts/cpp_build.sh | 1 + cpp/CMakeLists.txt | 5 ++ cpp/cmake_modules/FindAzure.cmake | 45 ++++++++++++++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 68 +++++++++++++++++++++ cpp/src/arrow/filesystem/azurefs.cc | 9 +++ cpp/src/arrow/filesystem/azurefs_test.cc | 45 ++++++++++++-- cpp/thirdparty/versions.txt | 3 + 9 files changed, 174 insertions(+), 6 deletions(-) create mode 100644 cpp/cmake_modules/FindAzure.cmake diff --git a/ci/docker/ubuntu-20.04-cpp.dockerfile b/ci/docker/ubuntu-20.04-cpp.dockerfile index 125f1f48d482e..08dda6cf50a17 100644 --- a/ci/docker/ubuntu-20.04-cpp.dockerfile +++ b/ci/docker/ubuntu-20.04-cpp.dockerfile @@ -99,6 +99,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxml2-dev \ libzstd-dev \ make \ ninja-build \ @@ -172,6 +173,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_WITH_ZSTD=ON \ ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ gRPC_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ diff --git a/ci/docker/ubuntu-22.04-cpp.dockerfile b/ci/docker/ubuntu-22.04-cpp.dockerfile index 0840b3fa5c68d..dedeedd979f72 100644 --- a/ci/docker/ubuntu-22.04-cpp.dockerfile +++ b/ci/docker/ubuntu-22.04-cpp.dockerfile @@ -98,6 +98,7 @@ RUN apt-get update -y -q && \ libssl-dev \ libthrift-dev \ libutf8proc-dev \ + libxml2-dev \ libzstd-dev \ make \ ninja-build \ @@ -196,6 +197,7 @@ ENV absl_SOURCE=BUNDLED \ ARROW_WITH_ZSTD=ON \ ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-${llvm}/bin/llvm-symbolizer \ AWSSDK_SOURCE=BUNDLED \ + Azure_SOURCE=BUNDLED \ google_cloud_cpp_storage_SOURCE=BUNDLED \ GTest_SOURCE=BUNDLED \ ORC_SOURCE=BUNDLED \ diff --git a/ci/scripts/cpp_build.sh b/ci/scripts/cpp_build.sh index 5a89fafc6015e..d73f4ad23085e 100755 --- a/ci/scripts/cpp_build.sh +++ b/ci/scripts/cpp_build.sh @@ -152,6 +152,7 @@ cmake \ -DARROW_WITH_ZLIB=${ARROW_WITH_ZLIB:-OFF} \ -DARROW_WITH_ZSTD=${ARROW_WITH_ZSTD:-OFF} \ -DAWSSDK_SOURCE=${AWSSDK_SOURCE:-} \ + -DAzure_SOURCE=${Azure_SOURCE:-} \ -Dbenchmark_SOURCE=${benchmark_SOURCE:-} \ -DBOOST_SOURCE=${BOOST_SOURCE:-} \ -DBrotli_SOURCE=${Brotli_SOURCE:-} \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index fcff62c447014..f8e7b1eb271b2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -818,6 +818,11 @@ if(ARROW_WITH_OPENTELEMETRY) list(APPEND ARROW_STATIC_INSTALL_INTERFACE_LIBS CURL::libcurl) endif() +if(ARROW_WITH_AZURE_SDK) + list(APPEND ARROW_SHARED_LINK_LIBS ${AZURE_SDK_LINK_LIBRARIES}) + list(APPEND ARROW_STATIC_LINK_LIBS ${AZURE_SDK_LINK_LIBRARIES}) +endif() + if(ARROW_WITH_UTF8PROC) list(APPEND ARROW_SHARED_LINK_LIBS utf8proc::utf8proc) list(APPEND ARROW_STATIC_LINK_LIBS utf8proc::utf8proc) diff --git a/cpp/cmake_modules/FindAzure.cmake b/cpp/cmake_modules/FindAzure.cmake new file mode 100644 index 0000000000000..fdf354b724e77 --- /dev/null +++ b/cpp/cmake_modules/FindAzure.cmake @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(Azure_FOUND) + return() +endif() + +set(find_package_args) +list(APPEND find_package_args CONFIG) +if(Azure_FIND_QUIETLY) + list(APPEND find_package_args QUIET) +endif() + +if(Azure_FIND_REQUIRED) + list(APPEND find_package_args REQUIRED) +endif() + +find_package(azure-core-cpp ${find_package_args}) +find_package(azure-identity-cpp ${find_package_args}) +find_package(azure-storage-blobs-cpp ${find_package_args}) +find_package(azure-storage-common-cpp ${find_package_args}) +find_package(azure-storage-files-datalake-cpp ${find_package_args}) + +find_package_handle_standard_args( + Azure + REQUIRED_VARS azure-core-cpp_FOUND + azure-identity-cpp_FOUND + azure-storage-blobs-cpp_FOUND + azure-storage-common-cpp_FOUND + azure-storage-files-datalake-cpp_FOUND + VERSION_VAR azure-core-cpp_VERSION) diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 3101c1dc73aa6..5c2e679e10c86 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -49,6 +49,7 @@ set(ARROW_RE2_LINKAGE set(ARROW_THIRDPARTY_DEPENDENCIES absl AWSSDK + Azure benchmark Boost Brotli @@ -162,6 +163,8 @@ macro(build_dependency DEPENDENCY_NAME) build_absl() elseif("${DEPENDENCY_NAME}" STREQUAL "AWSSDK") build_awssdk() + elseif("${DEPENDENCY_NAME}" STREQUAL "Azure") + build_azure_sdk() elseif("${DEPENDENCY_NAME}" STREQUAL "benchmark") build_benchmark() elseif("${DEPENDENCY_NAME}" STREQUAL "Boost") @@ -389,6 +392,10 @@ if(ARROW_GCS) set(ARROW_WITH_ZLIB ON) endif() +if(ARROW_AZURE) + set(ARROW_WITH_AZURE_SDK ON) +endif() + if(ARROW_JSON) set(ARROW_WITH_RAPIDJSON ON) endif() @@ -569,6 +576,14 @@ else() "${THIRDPARTY_MIRROR_URL}/aws-sdk-cpp-${ARROW_AWSSDK_BUILD_VERSION}.tar.gz") endif() +if(DEFINED ENV{ARROW_AZURE_SDK_URL}) + set(ARROW_AZURE_SDK_URL "$ENV{ARROW_AZURE_SDK_URL}") +else() + set_urls(ARROW_AZURE_SDK_URL + "https://github.com/Azure/azure-sdk-for-cpp/archive/${ARROW_AZURE_SDK_BUILD_VERSION}.tar.gz" + ) +endif() + if(DEFINED ENV{ARROW_BOOST_URL}) set(BOOST_SOURCE_URL "$ENV{ARROW_BOOST_URL}") else() @@ -981,6 +996,8 @@ else() set(MAKE_BUILD_ARGS "-j${NPROC}") endif() +include(FetchContent) + # ---------------------------------------------------------------------- # Find pthreads @@ -1388,6 +1405,7 @@ endif() set(ARROW_OPENSSL_REQUIRED_VERSION "1.0.2") set(ARROW_USE_OPENSSL OFF) if(PARQUET_REQUIRE_ENCRYPTION + OR ARROW_AZURE OR ARROW_FLIGHT OR ARROW_GANDIVA OR ARROW_GCS @@ -5095,6 +5113,56 @@ if(ARROW_S3) endif() endif() +# ---------------------------------------------------------------------- +# Azure SDK for C++ + +function(build_azure_sdk) + message(STATUS "Building Azure SDK for C++ from source") + fetchcontent_declare(azure_sdk + URL ${ARROW_AZURE_SDK_URL} + URL_HASH "SHA256=${ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM}") + set(BUILD_PERFORMANCE_TESTS FALSE) + set(BUILD_SAMPLES FALSE) + set(BUILD_TESTING FALSE) + set(BUILD_WINDOWS_UWP TRUE) + set(CMAKE_EXPORT_NO_PACKAGE_REGISTRY TRUE) + set(DISABLE_AZURE_CORE_OPENTELEMETRY TRUE) + set(ENV{AZURE_SDK_DISABLE_AUTO_VCPKG} TRUE) + set(WARNINGS_AS_ERRORS FALSE) + # TODO: Configure flags in a better way. FetchContent builds inherit + # global flags but we want to disable -Werror for Azure SDK for C++ builds. + if(MSVC) + string(REPLACE "/WX" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + string(REPLACE "/WX" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + else() + string(REPLACE "-Werror" "" CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG}") + string(REPLACE "-Werror" "" CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG}") + endif() + fetchcontent_makeavailable(azure_sdk) + set(AZURE_SDK_VENDORED + TRUE + PARENT_SCOPE) + list(APPEND + ARROW_BUNDLED_STATIC_LIBS + Azure::azure-core + Azure::azure-identity + Azure::azure-storage-blobs + Azure::azure-storage-common + Azure::azure-storage-files-datalake) + set(ARROW_BUNDLED_STATIC_LIBS + ${ARROW_BUNDLED_STATIC_LIBS} + PARENT_SCOPE) +endfunction() + +if(ARROW_WITH_AZURE_SDK) + resolve_dependency(Azure REQUIRED_VERSION 1.10.2) + set(AZURE_SDK_LINK_LIBRARIES + Azure::azure-storage-files-datalake + Azure::azure-storage-common + Azure::azure-storage-blobs + Azure::azure-identity + Azure::azure-core) +endif() # ---------------------------------------------------------------------- # ucx - communication framework for modern, high-bandwidth and low-latency networks diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 0158c0cec74e1..fcbae332d2397 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -17,6 +17,9 @@ #include "arrow/filesystem/azurefs.h" +#include +#include + #include "arrow/result.h" #include "arrow/util/checked_cast.h" @@ -47,6 +50,12 @@ class AzureFileSystem::Impl { : io_context_(io_context), options_(std::move(options)) {} Status Init() { + // TODO: GH-18014 Delete this once we have a proper implementation. This just + // initializes a pointless Azure blob service client with a fake endpoint to ensure + // the build will fail if the Azure SDK build is broken. + auto default_credential = std::make_shared(); + auto service_client = Azure::Storage::Blobs::BlobServiceClient( + "http://fake-blob-storage-endpoint", default_credential); if (options_.backend == AzureBackend::Azurite) { // gen1Client_->GetAccountInfo().Value.IsHierarchicalNamespaceEnabled // throws error in azurite diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index e940c5bd1bc32..9bf7cb8e753ec 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -45,6 +45,12 @@ #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" +#include +#include +#include +#include +#include + namespace arrow { using internal::TemporaryDir; namespace fs { @@ -105,15 +111,42 @@ AzuriteEnv* GetAzuriteEnv() { return ::arrow::internal::checked_cast(azurite_env); } -// Placeholder tests for file structure +// Placeholder tests // TODO: GH-18014 Remove once a proper test is added -TEST(AzureFileSystem, InitialiseAzurite) { +TEST(AzureFileSystem, UploadThenDownload) { + const std::string container_name = "sample-container"; + const std::string blob_name = "sample-blob.txt"; + const std::string blob_content = "Hello Azure!"; + const std::string& account_name = GetAzuriteEnv()->account_name(); const std::string& account_key = GetAzuriteEnv()->account_key(); - EXPECT_EQ(account_name, "devstoreaccount1"); - EXPECT_EQ(account_key, - "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/" - "K1SZFPTOtr/KBHBeksoGMGw=="); + + auto credential = std::make_shared( + account_name, account_key); + + auto service_client = Azure::Storage::Blobs::BlobServiceClient( + std::string("http://127.0.0.1:10000/") + account_name, credential); + auto container_client = service_client.GetBlobContainerClient(container_name); + container_client.CreateIfNotExists(); + auto blob_client = container_client.GetBlockBlobClient(blob_name); + + std::vector buffer(blob_content.begin(), blob_content.end()); + blob_client.UploadFrom(buffer.data(), buffer.size()); + + std::vector downloaded_content(blob_content.size()); + blob_client.DownloadTo(downloaded_content.data(), downloaded_content.size()); + + EXPECT_EQ(std::string(downloaded_content.begin(), downloaded_content.end()), + blob_content); +} + +TEST(AzureFileSystem, InitializeCredentials) { + auto default_credential = std::make_shared(); + auto managed_identity_credential = + std::make_shared(); + auto service_principal_credential = + std::make_shared("tenant_id", "client_id", + "client_secret"); } TEST(AzureFileSystem, OptionsCompare) { diff --git a/cpp/thirdparty/versions.txt b/cpp/thirdparty/versions.txt index 8edaa422b3dcf..52d302592b55c 100644 --- a/cpp/thirdparty/versions.txt +++ b/cpp/thirdparty/versions.txt @@ -53,6 +53,9 @@ ARROW_AWS_LC_BUILD_VERSION=v1.3.0 ARROW_AWS_LC_BUILD_SHA256_CHECKSUM=ae96a3567161552744fc0cae8b4d68ed88b1ec0f3d3c98700070115356da5a37 ARROW_AWSSDK_BUILD_VERSION=1.10.55 ARROW_AWSSDK_BUILD_SHA256_CHECKSUM=2d552fb1a84bef4a9b65e34aa7031851ed2aef5319e02cc6e4cb735c48aa30de +# Despite the confusing version name this is still the whole Azure SDK for C++ including core, keyvault, storage-common, etc. +ARROW_AZURE_SDK_BUILD_VERSION=azure-core_1.10.2 +ARROW_AZURE_SDK_BUILD_SHA256_CHECKSUM=36557dae87de4cdd257d9b441d9a7f043290eae6666fb1065e0fa486ae3e58a0 ARROW_BOOST_BUILD_VERSION=1.81.0 ARROW_BOOST_BUILD_SHA256_CHECKSUM=9e0ffae35528c35f90468997bc8d99500bf179cbae355415a89a600c38e13574 ARROW_BROTLI_BUILD_VERSION=v1.0.9