Merge branch 'branch-0.18' into fea-cudfround
ChrisJar authored Jan 20, 2021
2 parents d699eab + 02e25b6 commit d672059
Showing 39 changed files with 1,378 additions and 475 deletions.
6 changes: 3 additions & 3 deletions build.sh
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.

# cuDF build script

@@ -37,7 +37,7 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk
--disable_nvtx - disable inserting NVTX profiling ranges
--show_depr_warn - show cmake deprecation warnings
--ptds - enable per-thread default stream
-h - print this text
-h | --h[elp] - print this text
default action (no args) is to build and install 'libcudf' then 'cudf'
then 'dask_cudf' targets
@@ -75,7 +75,7 @@ function buildAll {
((${NUMARGS} == 0 )) || !(echo " ${ARGS} " | grep -q " [^-]\+ ")
}

if hasArg -h; then
if hasArg -h || hasArg --h || hasArg --help; then
echo "${HELP}"
exit 0
fi
2 changes: 1 addition & 1 deletion ci/docs/build.sh
@@ -40,6 +40,7 @@ conda list --show-channel-urls
#libcudf Doxygen build
gpuci_logger "Build libcudf docs..."
cd $PROJECT_WORKSPACE/cpp/doxygen
wget "https://raw.githubusercontent.com/rapidsai/docs/gh-pages/api/librmm/${BRANCH_VERSION}/rmm.tag" || echo "Failed to download rmm Doxygen tag"
doxygen Doxyfile

#cudf Sphinx Build
@@ -60,4 +61,3 @@ done

mv $PROJECT_WORKSPACE/docs/cudf/build/html/* $DOCS_WORKSPACE/api/cudf/$BRANCH_VERSION
mv $PROJECT_WORKSPACE/cpp/doxygen/html/* $DOCS_WORKSPACE/api/libcudf/$BRANCH_VERSION

6 changes: 6 additions & 0 deletions ci/release/update-version.sh
@@ -49,6 +49,9 @@ function sed_runner() {
# cpp update
sed_runner 's/'"CUDA_DATAFRAME VERSION .* LANGUAGES"'/'"CUDA_DATAFRAME VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt

# doxyfile update
sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile

# RTD update
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py
sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py
@@ -57,3 +60,6 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/
for FILE in conda/environments/*.yml; do
sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
done

# Doxyfile update
sed_runner "s|\(TAGFILES.*librmm/\).*|\1${NEXT_SHORT_TAG}|" cpp/doxygen/Doxyfile
4 changes: 2 additions & 2 deletions cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 0.16
PROJECT_NUMBER = 0.18

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.

TAGFILES =
TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/0.18

# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
227 changes: 216 additions & 11 deletions cpp/include/cudf/io/orc_metadata.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,31 +25,236 @@

#include <vector>

//! cuDF interfaces
namespace cudf {
//! In-development features
namespace io {

/**
* @brief Reads file-level and stripe-level statistics of ORC dataset
* @brief Holds column names and buffers containing raw file-level and stripe-level statistics.
*
* The buffers can be parsed using a Protobuf parser. Alternatively, use `parsed_orc_statistics` to
* get the statistics parsed into a libcudf representation.
*
* The `column_names` and `file_stats` members contain one element per column. The `stripes_stats`
* member contains one element per stripe, where each element contains column statistics for each column.
*/
struct raw_orc_statistics {
std::vector<std::string> column_names;
std::vector<std::string> file_stats;
std::vector<std::vector<std::string>> stripes_stats;
};

/**
* @brief Reads file-level and stripe-level statistics of ORC dataset.
*
* @ingroup io_readers
*
* The following code snippet demonstrates how to read statistics of a dataset
* from a file:
* @code
* std::string filepath = "dataset.orc";
* auto result = cudf::read_orc_statistics(cudf::source_info(filepath));
* auto result = cudf::read_raw_orc_statistics(cudf::source_info("dataset.orc"));
* @endcode
*
* @param src_info Dataset source
*
* @return Decompressed ColumnStatistics blobs stored in a vector of vectors. The first element of
* result vector, which is itself a vector, contains the name of each column. The second element
* contains statistics of each column of the whole file. Remaining elements contain statistics of
* each column for each stripe.
* @return Column names and encoded ORC statistics
*/
raw_orc_statistics read_raw_orc_statistics(source_info const& src_info);

/**
* @brief Enumerator for types of column statistics that can be included in `column_statistics`.
*
* The statistics type depends on the column data type.
*/
enum class statistics_type {
NONE,
INT,
DOUBLE,
STRING,
BUCKET,
DECIMAL,
DATE,
BINARY,
TIMESTAMP,
};

/**
* @brief Base class for column statistics that include optional minimum and maximum.
*
* Includes accessors for the minimum and maximum values.
*/
template <typename T>
struct minmax_statistics {
std::unique_ptr<T> _minimum;
std::unique_ptr<T> _maximum;

auto has_minimum() const { return _minimum != nullptr; }
auto has_maximum() const { return _maximum != nullptr; }
auto minimum() const { return _minimum.get(); }
auto maximum() const { return _maximum.get(); }
};

/**
* @brief Base class for column statistics that include an optional sum.
*
* Includes accessors for the sum value.
*/
template <typename T>
struct sum_statistics {
std::unique_ptr<T> _sum;

auto has_sum() const { return _sum != nullptr; }
auto sum() const { return _sum.get(); }
};

/**
* @brief Statistics for integral columns.
*/
struct integer_statistics : minmax_statistics<int64_t>, sum_statistics<int64_t> {
static constexpr statistics_type type = statistics_type::INT;
};

/**
* @brief Statistics for floating point columns.
*/
struct double_statistics : minmax_statistics<double>, sum_statistics<double> {
static constexpr statistics_type type = statistics_type::DOUBLE;
};

/**
* @brief Statistics for string columns.
*
* The `minimum` and `maximum` are the first and last elements, respectively, in lexicographical
* order. The `sum` is the total length of elements in the column.
* Note: according to the ORC specification, the sum should be signed, but pyarrow uses an unsigned value.
*/
struct string_statistics : minmax_statistics<std::string>, sum_statistics<uint64_t> {
static constexpr statistics_type type = statistics_type::STRING;
};

/**
* @brief Statistics for boolean columns.
*
* The `count` array includes the count of `false` and `true` values.
*/
struct bucket_statistics {
static constexpr statistics_type type = statistics_type::BUCKET;
std::vector<uint64_t> _count;

auto count(size_t index) const { return &_count.at(index); }
};

/**
* @brief Statistics for decimal columns.
*/
struct decimal_statistics : minmax_statistics<std::string>, sum_statistics<std::string> {
static constexpr statistics_type type = statistics_type::DECIMAL;
};

/**
* @brief Statistics for date(time) columns.
*/
struct date_statistics : minmax_statistics<int32_t> {
static constexpr statistics_type type = statistics_type::DATE;
};

/**
* @brief Statistics for binary columns.
*
* The `sum` is the total number of bytes across all elements.
*/
struct binary_statistics : sum_statistics<int64_t> {
static constexpr statistics_type type = statistics_type::BINARY;
};

/**
* @brief Statistics for timestamp columns.
*
* The `minimum` and `maximum` are the min/max elements in the column, expressed as the number of
* milliseconds since the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values
* adjusted to UTC.
*/
struct timestamp_statistics : minmax_statistics<int64_t> {
static constexpr statistics_type type = statistics_type::TIMESTAMP;
std::unique_ptr<int64_t> _minimum_utc;
std::unique_ptr<int64_t> _maximum_utc;

auto has_minimum_utc() const { return _minimum_utc != nullptr; }
auto has_maximum_utc() const { return _maximum_utc != nullptr; }
auto minimum_utc() const { return _minimum_utc.get(); }
auto maximum_utc() const { return _maximum_utc.get(); }
};

namespace orc {
// forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects,
// returned from `read_parsed_orc_statistics`, are constructed from
// `cudf::io::orc::column_statistics` objects that `ProtobufReader` initializes.
struct column_statistics;
} // namespace orc

/**
* @brief Contains per-column ORC statistics.
*
* All columns can have the `number_of_values` statistics. Depending on the data type, a column can
* have additional statistics, accessible through `type_specific_stats` accessor.
*/
class column_statistics {
private:
std::unique_ptr<uint64_t> _number_of_values;
statistics_type _type = statistics_type::NONE;
void* _type_specific_stats = nullptr;

public:
column_statistics() = default;
column_statistics(cudf::io::orc::column_statistics&& other);

column_statistics& operator=(column_statistics&&) noexcept;
column_statistics(column_statistics&&) noexcept;

auto has_number_of_values() const { return _number_of_values != nullptr; }
auto number_of_values() const { return _number_of_values.get(); }

auto type() const { return _type; }

/**
* @brief Returns a non-owning pointer to the type-specific statistics of the given type.
*
* Returns null if the requested statistics type does not match the type of the currently held
* type-specific statistics.
*
* @tparam T the statistics type
*/
template <typename T>
T const* type_specific_stats() const
{
if (T::type != _type) return nullptr;
return static_cast<T*>(_type_specific_stats);
}

~column_statistics();
};

/**
* @brief Holds column names and parsed file-level and stripe-level statistics.
*
* The `column_names` and `file_stats` members contain one element per column. The `stripes_stats`
* member contains one element per stripe, where each element contains column statistics for each
* column.
*/
struct parsed_orc_statistics {
std::vector<std::string> column_names;
std::vector<column_statistics> file_stats;
std::vector<std::vector<column_statistics>> stripes_stats;
};

/**
* @brief Reads file-level and stripe-level statistics of ORC dataset.
*
* @ingroup io_readers
*
* @param src_info Dataset source
*
* @return Column names and decoded ORC statistics
*/
std::vector<std::vector<std::string>> read_orc_statistics(source_info const& src_info);
parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info);

} // namespace io
} // namespace cudf
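
The declarations above introduce the parsed ORC statistics API. The following is a minimal usage sketch based only on those declarations; the file name "dataset.orc" and the variable names are placeholders, and error handling is omitted.

  #include <cudf/io/orc_metadata.hpp>
  #include <cudf/io/types.hpp>

  #include <cstddef>
  #include <iostream>

  int main()
  {
    // Read and parse file-level and stripe-level statistics (path is a placeholder).
    auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{"dataset.orc"});

    // file_stats holds one column_statistics entry per column.
    for (std::size_t i = 0; i < stats.file_stats.size(); ++i) {
      auto const& col = stats.file_stats[i];
      std::cout << stats.column_names[i];
      if (col.has_number_of_values()) { std::cout << " count=" << *col.number_of_values(); }

      // Type-specific statistics are returned only when the requested type matches.
      if (auto const* istats = col.type_specific_stats<cudf::io::integer_statistics>()) {
        if (istats->has_minimum()) { std::cout << " min=" << *istats->minimum(); }
        if (istats->has_maximum()) { std::cout << " max=" << *istats->maximum(); }
      }
      std::cout << '\n';
    }
    return 0;
  }

Stripe-level statistics can be walked the same way through `stripes_stats`, which holds one vector of `column_statistics` per stripe.
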
2 changes: 1 addition & 1 deletion cpp/include/cudf/strings/detail/gather.cuh
@@ -94,7 +94,7 @@ std::unique_ptr<cudf::column> gather(
[] __device__(auto size) { return static_cast<size_t>(size); },
size_t{0},
thrust::plus<size_t>{});
CUDF_EXPECTS(total_bytes < std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of output strings is too large for a cudf column");

// create offsets from sizes
@@ -52,7 +52,7 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
};
size_t bytes = thrust::transform_reduce(
rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus<size_t>());
CUDF_EXPECTS(bytes < std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings is too large for cudf column");

// build offsets column from the strings sizes
4 changes: 2 additions & 2 deletions cpp/src/copying/concatenate.cu
@@ -227,7 +227,7 @@ std::unique_ptr<column> fused_concatenate(std::vector<column_view> const& views,
auto const& d_offsets = std::get<2>(device_views);
auto const output_size = std::get<3>(device_views);

CUDF_EXPECTS(output_size < std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(output_size < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range");

// Allocate output
@@ -364,7 +364,7 @@ void bounds_and_type_check(ColIter begin, ColIter end)
std::accumulate(begin, end, std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
CUDF_EXPECTS(total_row_count <= std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(total_row_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range");

// march each child
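
The `static_cast<std::size_t>` added around `std::numeric_limits<size_type>::max()` in the diffs above turns each signed-vs-unsigned bounds check into an unsigned-vs-unsigned comparison without changing the limit's value. A minimal standalone sketch of the pattern, using a local `size_type` alias as a stand-in for `cudf::size_type` (a signed 32-bit index):

  #include <cstddef>
  #include <cstdint>
  #include <limits>

  // Stand-in for cudf::size_type, the signed 32-bit row/offset index used by libcudf.
  using size_type = std::int32_t;

  // Checks that an accumulated std::size_t byte or row count still fits in size_type.
  // Casting the signed limit to std::size_t keeps the comparison unsigned-vs-unsigned,
  // avoiding -Wsign-compare warnings; the limit's value (2^31 - 1) is unchanged by the cast.
  bool fits_in_size_type(std::size_t total)
  {
    return total < static_cast<std::size_t>(std::numeric_limits<size_type>::max());
  }

One of the checks above uses `<=` rather than `<`; in every case the cast, not the comparison operator, is the change introduced by this commit.
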
