Skip to content

Commit

Permalink
Remove protobuf and use parsed ORC statistics from libcudf (#15564)
Browse files Browse the repository at this point in the history
This PR removes the cuDF Python dependencies on `protobuf` and `protoc-wheel`. Closes #15511.

The only use case for the `protobuf` dependency was reading ORC file/stripe statistics. However, we have code in libcudf that can do this without requiring `protobuf`.

In this PR, we expose the C++ code for parsing ORC statistics from libcudf to Cython and remove all references to `protobuf`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Jake Awe (https://github.com/AyodeAwe)
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #15564
  • Loading branch information
bdice authored Apr 19, 2024
1 parent 40d3dd7 commit d37636d
Show file tree
Hide file tree
Showing 16 changed files with 263 additions and 182 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,9 +161,6 @@ ENV/
# Dask
dask-worker-space/

# protobuf
**/*_pb2.py

# Sphinx docs & build artifacts
docs/cudf/source/api_docs/generated/*
docs/cudf/source/user_guide/api_docs/api/*
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ dependencies:
- pandoc
- pip
- pre-commit
- protobuf>=3.20,<5
- ptxcompiler
- pyarrow==14.0.2.*
- pydata-sphinx-theme!=0.14.2
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ dependencies:
- pandoc
- pip
- pre-commit
- protobuf>=3.20,<5
- pyarrow==14.0.2.*
- pydata-sphinx-theme!=0.14.2
- pynvjitlink
Expand Down
2 changes: 0 additions & 2 deletions conda/recipes/cudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,6 @@ requirements:
- cuda-version ={{ cuda_version }}
- sysroot_{{ target_platform }} {{ sysroot_version }}
host:
- protobuf ==4.24.*
- python
- cython >=3.0.3
- scikit-build-core >=0.7.0
Expand All @@ -78,7 +77,6 @@ requirements:
{% endif %}
- cuda-version ={{ cuda_version }}
run:
- protobuf >=3.20,<5.0a0
- python
- typing_extensions >=4.0.0
- pandas >=2.0,<2.2.3dev0
Expand Down
26 changes: 16 additions & 10 deletions cpp/include/cudf/io/orc_metadata.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,21 @@ struct timestamp_statistics : minmax_statistics<int64_t> {
std::optional<uint32_t> maximum_nanos; ///< nanoseconds part of the maximum
};

/**
 * @brief Variant type for ORC type-specific column statistics.
 *
 * The variant holds exactly one of the supported column statistics types listed
 * below. `no_statistics` is the first alternative, so a default-constructed
 * `statistics_type` holds `no_statistics`.
 *
 * This alias is the type of `column_statistics::type_specific_stats`. The order
 * of the alternatives determines the value returned by `std::variant::index()`.
 */
using statistics_type = std::variant<no_statistics,
integer_statistics,
double_statistics,
string_statistics,
bucket_statistics,
decimal_statistics,
date_statistics,
binary_statistics,
timestamp_statistics>;

//! Orc I/O interfaces
namespace orc {
// forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects,
Expand All @@ -171,16 +186,7 @@ struct column_statistics;
struct column_statistics {
std::optional<uint64_t> number_of_values; ///< number of statistics
std::optional<bool> has_null; ///< column has any nulls
std::variant<no_statistics,
integer_statistics,
double_statistics,
string_statistics,
bucket_statistics,
decimal_statistics,
date_statistics,
binary_statistics,
timestamp_statistics>
type_specific_stats; ///< type-specific statistics
statistics_type type_specific_stats; ///< type-specific statistics

/**
* @brief Construct a new column statistics object
Expand Down
5 changes: 0 additions & 5 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -286,13 +286,9 @@ dependencies:
- output_types: conda
packages:
- &rmm_conda rmm==24.6.*
- &protobuf protobuf>=3.20,<5
- pip
- pip:
- git+https://github.com/python-streamz/streamz.git@master
- output_types: [requirements, pyproject]
packages:
- protoc-wheel
- output_types: requirements
packages:
# pip recognizes the index as a global option for the requirements.txt file
Expand Down Expand Up @@ -525,7 +521,6 @@ dependencies:
- packaging
- rich
- typing_extensions>=4.0.0
- *protobuf
- output_types: conda
packages:
- *rmm_conda
Expand Down
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ follow_imports = "skip"
exclude = [
"cudf/_lib/",
"cudf/cudf/tests/",
"cudf/cudf/utils/metadata/orc_column_statistics_pb2.py",
"custreamz/custreamz/tests/",
"dask_cudf/dask_cudf/tests/",
]
Expand Down
3 changes: 0 additions & 3 deletions python/cudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,6 @@ include(cmake/Modules/LinkPyarrowHeaders.cmake)
add_subdirectory(cudf/_lib)
add_subdirectory(udf_cpp)

include(cmake/Modules/ProtobufHelpers.cmake)
codegen_protoc(cudf/utils/metadata/orc_column_statistics.proto)

if(DEFINED cython_lib_dir)
rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}")
endif()
50 changes: 0 additions & 50 deletions python/cudf/cmake/Modules/ProtobufHelpers.cmake

This file was deleted.

64 changes: 59 additions & 5 deletions python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd
Original file line number Diff line number Diff line change
@@ -1,19 +1,73 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t
from libcpp cimport bool
from libcpp.optional cimport optional
from libcpp.string cimport string
from libcpp.vector cimport vector

cimport cudf._lib.cpp.io.types as cudf_io_types
from cudf._lib.variant cimport monostate, variant


cdef extern from "cudf/io/orc_metadata.hpp" \
namespace "cudf::io" nogil:

cdef cppclass raw_orc_statistics:
ctypedef monostate no_statistics

# Minimum/maximum pair templated on the value type T.
# Both fields are std::optional, so either may be unset.
cdef cppclass minmax_statistics[T]:
optional[T] minimum
optional[T] maximum

# Sum templated on the sum type T. The field is std::optional and may be unset.
cdef cppclass sum_statistics[T]:
optional[T] sum

# Integer statistics: inherits int64_t minimum/maximum and an int64_t sum.
cdef cppclass integer_statistics(
minmax_statistics[int64_t], sum_statistics[int64_t]
):
pass

# Floating-point statistics: inherits double minimum/maximum and a double sum.
cdef cppclass double_statistics(
minmax_statistics[double], sum_statistics[double]
):
pass

# String statistics: inherits string minimum/maximum and an int64_t sum.
# NOTE(review): the sum is presumably the total length of the strings -- confirm
# against cudf/io/orc_metadata.hpp.
cdef cppclass string_statistics(
minmax_statistics[string], sum_statistics[int64_t]
):
pass

# Bucket statistics: a vector of int64_t counts.
# NOTE(review): what each entry of `count` represents is not visible here --
# confirm against cudf/io/orc_metadata.hpp.
cdef cppclass bucket_statistics:
vector[int64_t] count

# Decimal statistics: minimum, maximum and sum are all carried as strings.
cdef cppclass decimal_statistics(
minmax_statistics[string], sum_statistics[string]
):
pass

# Date statistics are an int32_t minimum/maximum pair (no sum).
# NOTE(review): the unit of the int32_t values is not visible here -- confirm.
ctypedef minmax_statistics[int32_t] date_statistics

# Binary statistics are an int64_t sum only (no minimum/maximum).
ctypedef sum_statistics[int64_t] binary_statistics

# Timestamp statistics: inherits int64_t minimum/maximum and adds optional UTC
# minimum/maximum plus optional nanosecond parts.
# NOTE(review): the time unit of the int64_t fields is not visible here --
# confirm against cudf/io/orc_metadata.hpp.
cdef cppclass timestamp_statistics(minmax_statistics[int64_t]):
optional[int64_t] minimum_utc
optional[int64_t] maximum_utc
# presumably the nanoseconds part of the minimum, mirroring maximum_nanos
optional[uint32_t] minimum_nanos
# nanoseconds part of the maximum
optional[uint32_t] maximum_nanos

# This is a std::variant of all the statistics types. It is declared here as a
# bare (non-templated) `variant`; the list of alternatives lives in the C++
# header (cudf/io/orc_metadata.hpp).
ctypedef variant statistics_type

# Statistics for one column: generic optional fields plus the type-specific
# variant.
cdef cppclass column_statistics:
optional[uint64_t] number_of_values
# whether the column has any nulls
optional[bool] has_null
# type-specific statistics (one alternative of the statistics_type variant)
statistics_type type_specific_stats

cdef cppclass parsed_orc_statistics:
vector[string] column_names
vector[string] file_stats
vector[vector[string]] stripes_stats
vector[column_statistics] file_stats
vector[vector[column_statistics]] stripes_stats

cdef raw_orc_statistics read_raw_orc_statistics(
cdef parsed_orc_statistics read_parsed_orc_statistics(
cudf_io_types.source_info src_info
) except +
Loading

0 comments on commit d37636d

Please sign in to comment.