From e44984c7bbf6fc8676b0fc28d1dd6d1ab69eb52b Mon Sep 17 00:00:00 2001 From: Sandino Flores Date: Wed, 13 Mar 2024 20:38:01 +0000 Subject: [PATCH] WIP: Enable dependencies of Google BigQuery --- CMakeLists.txt | 3 + scripts/setup-adapters.sh | 45 +++++++++--- scripts/setup-ubuntu.sh | 6 ++ .../hive/storage_adapters/CMakeLists.txt | 1 + .../storage_adapters/bigquery/CMakeLists.txt | 48 ++++++++++++ .../storage_adapters/bigquery/quickstart.cc | 73 +++++++++++++++++++ 6 files changed, 166 insertions(+), 10 deletions(-) create mode 100644 velox/connectors/hive/storage_adapters/bigquery/CMakeLists.txt create mode 100644 velox/connectors/hive/storage_adapters/bigquery/quickstart.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 4885f49bcb3a..1090fab4c0f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -473,6 +473,9 @@ if(${VELOX_BUILD_MINIMAL_WITH_DWIO} set_source(Protobuf) resolve_dependency(Protobuf 3.21 EXACT) include_directories(${Protobuf_INCLUDE_DIRS}) + # find_package(absl REQUIRED) + # find_package(utf8_range REQUIRED) + # find_package(Protobuf REQUIRED) endif() # GCC needs to link a library to enable std::filesystem. 
diff --git a/scripts/setup-adapters.sh b/scripts/setup-adapters.sh index 3e3ca3afc762..67680392faa1 100755 --- a/scripts/setup-adapters.sh +++ b/scripts/setup-adapters.sh @@ -58,29 +58,52 @@ function install_gcs-sdk-cpp { # https://github.com/googleapis/google-cloud-cpp/blob/main/doc/packaging.md#required-libraries # abseil-cpp - github_checkout abseil/abseil-cpp 20240116.1 --depth 1 - sed -i 's/^#define ABSL_OPTION_USE_\(.*\) 2/#define ABSL_OPTION_USE_\1 0/' "absl/base/options.h" - cmake_install -DBUILD_SHARED_LIBS=OFF \ - -DABSL_BUILD_TESTING=OFF + github_checkout abseil/abseil-cpp 20240116.2 --depth 1 + cmake_install \ + -DCMAKE_BUILD_TYPE=Release \ + -DABSL_BUILD_TESTING=OFF \ + -DABSL_PROPAGATE_CXX_STD=ON \ + -DBUILD_SHARED_LIBS=OFF + + # gRPC + github_checkout grpc/grpc v1.48.0 --depth 1 + cmake_install \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DgRPC_INSTALL=ON \ + -DgRPC_BUILD_TESTS=OFF \ + -DgRPC_ABSL_PROVIDER=package \ + -DgRPC_CARES_PROVIDER=package \ + -DgRPC_PROTOBUF_PROVIDER=package \ + -DgRPC_RE2_PROVIDER=package \ + -DgRPC_SSL_PROVIDER=package \ + -DgRPC_ZLIB_PROVIDER=package # crc32 github_checkout google/crc32c 1.1.2 --depth 1 - cmake_install -DBUILD_SHARED_LIBS=OFF \ + cmake_install \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_BUILD_TYPE=Release \ -DCRC32C_BUILD_TESTS=OFF \ -DCRC32C_BUILD_BENCHMARKS=OFF \ -DCRC32C_USE_GLOG=OFF # nlohmann json github_checkout nlohmann/json v3.11.3 --depth 1 - cmake_install -DBUILD_SHARED_LIBS=OFF \ + cmake_install \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_SHARED_LIBS=OFF \ + -DBUILD_TESTING=OFF \ -DJSON_BuildTests=OFF # google-cloud-cpp - github_checkout googleapis/google-cloud-cpp v2.22.0 --depth 1 - cmake_install -DBUILD_SHARED_LIBS=OFF \ - -DCMAKE_INSTALL_MESSAGE=NEVER \ + github_checkout googleapis/google-cloud-cpp v2.23.0 --depth 1 + cmake_install \ + -DBUILD_SHARED_LIBS=OFF \ + -DCMAKE_BUILD_TYPE=Release \ + -DBUILD_TESTING=OFF \ 
-DGOOGLE_CLOUD_CPP_ENABLE_EXAMPLES=OFF \ - -DGOOGLE_CLOUD_CPP_ENABLE=storage + -DGOOGLE_CLOUD_CPP_ENABLE=storage,bigquery } function install_azure-storage-sdk-cpp { diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index c71b5044edf9..79e91a6f04df 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -124,6 +124,11 @@ function install_fbthrift { cmake_install -Denable_tests=OFF -DBUILD_TESTS=OFF -DBUILD_SHARED_LIBS=OFF } +function install_protobuf { + github_checkout protocolbuffers/protobuf v21.4 --depth 1 + cmake_install -DBUILD_SHARED_LIBS=OFF -Dprotobuf_BUILD_TESTS=OFF +} + function install_conda { MINICONDA_PATH=/opt/miniconda-for-velox if [ -e ${MINICONDA_PATH} ]; then @@ -161,6 +166,7 @@ function install_velox_deps { run_and_time install_wangle run_and_time install_mvfst run_and_time install_fbthrift + run_and_time install_protobuf run_and_time install_conda } diff --git a/velox/connectors/hive/storage_adapters/CMakeLists.txt b/velox/connectors/hive/storage_adapters/CMakeLists.txt index bd7c37f81640..6ce77d68a140 100644 --- a/velox/connectors/hive/storage_adapters/CMakeLists.txt +++ b/velox/connectors/hive/storage_adapters/CMakeLists.txt @@ -16,3 +16,4 @@ add_subdirectory(s3fs) add_subdirectory(hdfs) add_subdirectory(gcs) add_subdirectory(abfs) +add_subdirectory(bigquery) \ No newline at end of file diff --git a/velox/connectors/hive/storage_adapters/bigquery/CMakeLists.txt b/velox/connectors/hive/storage_adapters/bigquery/CMakeLists.txt new file mode 100644 index 000000000000..51aabd99b3cd --- /dev/null +++ b/velox/connectors/hive/storage_adapters/bigquery/CMakeLists.txt @@ -0,0 +1,48 @@ +find_package(google_cloud_cpp_bigquery REQUIRED) + + +# Once the bigquery_client package is found, define new targets. 
+add_executable(bq-quickstart quickstart.cc) +target_link_libraries(bq-quickstart + google-cloud-cpp::bigquery + + protobuf::libprotobuf + + absl::absl_check + absl::absl_log + absl::algorithm + absl::base + absl::bind_front + absl::bits + absl::btree + absl::cleanup + absl::cord + absl::core_headers + absl::debugging + absl::die_if_null + absl::dynamic_annotations + absl::flags + absl::flat_hash_map + absl::flat_hash_set + absl::function_ref + absl::hash + absl::if_constexpr + absl::layout + absl::log_initialize + absl::log_severity + absl::memory + absl::node_hash_map + absl::node_hash_set + absl::optional + absl::span + absl::status + absl::statusor + absl::strings + absl::synchronization + absl::time + absl::type_traits + absl::utility + absl::variant + + utf8_range::utf8_validity +) diff --git a/velox/connectors/hive/storage_adapters/bigquery/quickstart.cc b/velox/connectors/hive/storage_adapters/bigquery/quickstart.cc new file mode 100644 index 000000000000..13af14a56422 --- /dev/null +++ b/velox/connectors/hive/storage_adapters/bigquery/quickstart.cc @@ -0,0 +1,73 @@ +// Copyright 2021 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! 
[START bigquerystorage_quickstart] [all] +#include "google/cloud/bigquery/storage/v1/bigquery_read_client.h" +#include <iostream> +namespace { +void ProcessRowsInAvroFormat( + ::google::cloud::bigquery::storage::v1::AvroSchema const&, + ::google::cloud::bigquery::storage::v1::AvroRows const&) { + // Code to deserialize avro rows should be added here. +} +} // namespace + +int main(int argc, char* argv[]) try { + if (argc != 3) { + std::cerr << "Usage: " << argv[0] << " <project-id> <table-name>\n"; + return 1; + } + + // project_name should be in the format "projects/<project-id>" + std::string const project_name = "projects/" + std::string(argv[1]); + // table_name should be in the format: + // "projects/<project-id>/datasets/<dataset-id>/tables/<table-id>". The project values in project_name and table_name do not have to be + // identical. + std::string const table_name = argv[2]; + + // Create a namespace alias to make the code easier to read. + namespace bigquery_storage = ::google::cloud::bigquery_storage_v1; + constexpr int kMaxReadStreams = 1; + // Create the ReadSession. + auto client = bigquery_storage::BigQueryReadClient( + bigquery_storage::MakeBigQueryReadConnection()); + ::google::cloud::bigquery::storage::v1::ReadSession read_session; + read_session.set_data_format( + google::cloud::bigquery::storage::v1::DataFormat::AVRO); + read_session.set_table(table_name); + auto session = + client.CreateReadSession(project_name, read_session, kMaxReadStreams); + if (!session) throw std::move(session).status(); + + // Read rows from the ReadSession. 
+ constexpr int kRowOffset = 0; + auto read_rows = client.ReadRows(session->streams(0).name(), kRowOffset); + + std::int64_t num_rows = 0; + for (auto const& row : read_rows) { + if (row.ok()) { + num_rows += row->row_count(); + ProcessRowsInAvroFormat(session->avro_schema(), row->avro_rows()); + } + } + + std::cout << num_rows << " rows read from table: " << table_name << "\n"; + return 0; +} catch (google::cloud::Status const& status) { + std::cerr << "google::cloud::Status thrown: " << status << "\n"; + return 1; +} +//! [END bigquerystorage_quickstart] [all]