-
Notifications
You must be signed in to change notification settings - Fork 902
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Remove protobuf and use parsed ORC statistics from libcudf (#15564)
This PR removes the cuDF Python dependencies on `protobuf` and `protoc-wheel`. Closes #15511.

The only use case for the `protobuf` dependency was reading ORC file/stripe statistics. However, we have code in libcudf that can do this without requiring `protobuf`. In this PR, we expose the C++ code for parsing ORC statistics from libcudf to Cython and remove all references to `protobuf`.

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- Kyle Edwards (https://github.com/KyleFromNVIDIA)
- Jake Awe (https://github.com/AyodeAwe)
- David Wendt (https://github.com/davidwendt)
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: #15564
- Loading branch information
Showing 16 changed files with 263 additions and 182 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,73 @@ | ||
# Copyright (c) 2020-2021, NVIDIA CORPORATION. | ||
# Copyright (c) 2020-2024, NVIDIA CORPORATION. | ||
|
||
from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t | ||
from libcpp cimport bool | ||
from libcpp.optional cimport optional | ||
from libcpp.string cimport string | ||
from libcpp.vector cimport vector | ||
|
||
cimport cudf._lib.cpp.io.types as cudf_io_types | ||
from cudf._lib.variant cimport monostate, variant | ||
|
||
|
||
# Cython declarations for the ORC statistics types and reader function taken from the C++ header "cudf/io/orc_metadata.hpp" (namespace cudf::io). | ||
# NOTE(review): this is a rendered diff with indentation and +/- markers stripped, so removed and added lines are interleaved — confirm every declaration against the actual file. | ||
cdef extern from "cudf/io/orc_metadata.hpp" \ | ||
namespace "cudf::io" nogil: | ||
|
||
# NOTE(review): with indentation lost it is unclear whether the declarations below nest under raw_orc_statistics; presumably this header line is the removed side of the diff — confirm. | ||
cdef cppclass raw_orc_statistics: | ||
ctypedef monostate no_statistics | ||
|
||
# Optional minimum/maximum pair, templated on the value type T. | ||
cdef cppclass minmax_statistics[T]: | ||
optional[T] minimum | ||
optional[T] maximum | ||
|
||
# Optional sum, templated on the value type T. | ||
cdef cppclass sum_statistics[T]: | ||
optional[T] sum | ||
|
||
# The concrete statistics classes below add no members of their own; they only combine the min/max and sum bases with specific value types. | ||
cdef cppclass integer_statistics( | ||
minmax_statistics[int64_t], sum_statistics[int64_t] | ||
): | ||
pass | ||
|
||
cdef cppclass double_statistics( | ||
minmax_statistics[double], sum_statistics[double] | ||
): | ||
pass | ||
|
||
# min/max are strings while the sum is an int64. | ||
cdef cppclass string_statistics( | ||
minmax_statistics[string], sum_statistics[int64_t] | ||
): | ||
pass | ||
|
||
# Holds a vector of int64 counts. | ||
cdef cppclass bucket_statistics: | ||
vector[int64_t] count | ||
|
||
# min/max and sum are all declared as strings here. | ||
cdef cppclass decimal_statistics( | ||
minmax_statistics[string], sum_statistics[string] | ||
): | ||
pass | ||
|
||
# date_statistics is an alias for an int32 min/max pair (no sum). | ||
ctypedef minmax_statistics[int32_t] date_statistics | ||
|
||
# binary_statistics is an alias for an int64 sum (no min/max). | ||
ctypedef sum_statistics[int64_t] binary_statistics | ||
|
||
# Extends the int64 min/max pair with the optional *_utc and *_nanos fields declared below. | ||
cdef cppclass timestamp_statistics(minmax_statistics[int64_t]): | ||
optional[int64_t] minimum_utc | ||
optional[int64_t] maximum_utc | ||
optional[uint32_t] minimum_nanos | ||
optional[uint32_t] maximum_nanos | ||
|
||
# This is a std::variant of all the statistics types | ||
ctypedef variant statistics_type | ||
|
||
# Record combining an optional value count, an optional has_null flag, and the type-specific statistics variant. | ||
cdef cppclass column_statistics: | ||
optional[uint64_t] number_of_values | ||
optional[bool] has_null | ||
statistics_type type_specific_stats | ||
|
||
cdef cppclass parsed_orc_statistics: | ||
vector[string] column_names | ||
# NOTE(review): file_stats and stripes_stats each appear twice below (string-based, then column_statistics-based); presumably the string-based pair is the removed side of the diff — confirm. | ||
vector[string] file_stats | ||
vector[vector[string]] stripes_stats | ||
vector[column_statistics] file_stats | ||
vector[vector[column_statistics]] stripes_stats | ||
|
||
# NOTE(review): two function header lines follow; presumably read_raw_orc_statistics is the removed line and read_parsed_orc_statistics its replacement — confirm. | ||
# The function takes a cudf_io_types.source_info; `except +` tells Cython to translate C++ exceptions into Python exceptions. | ||
cdef raw_orc_statistics read_raw_orc_statistics( | ||
cdef parsed_orc_statistics read_parsed_orc_statistics( | ||
cudf_io_types.source_info src_info | ||
) except + |
Oops, something went wrong.