Merge branch 'branch-0.18' into fea-cudfround
ChrisJar authored Jan 20, 2021
2 parents d699eab + 02e25b6 commit d672059
Showing 39 changed files with 1,378 additions and 475 deletions.
6 changes: 3 additions & 3 deletions build.sh
@@ -1,6 +1,6 @@
#!/bin/bash

# Copyright (c) 2019, NVIDIA CORPORATION.
# Copyright (c) 2019-2021, NVIDIA CORPORATION.

# cuDF build script

@@ -37,7 +37,7 @@ HELP="$0 [clean] [libcudf] [cudf] [dask_cudf] [benchmarks] [tests] [libcudf_kafk
--disable_nvtx - disable inserting NVTX profiling ranges
--show_depr_warn - show cmake deprecation warnings
--ptds - enable per-thread default stream
-h - print this text
-h | --h[elp] - print this text
default action (no args) is to build and install 'libcudf' then 'cudf'
then 'dask_cudf' targets
@@ -75,7 +75,7 @@ function buildAll {
((${NUMARGS} == 0 )) || !(echo " ${ARGS} " | grep -q " [^-]\+ ")
}

if hasArg -h; then
if hasArg -h || hasArg --h || hasArg --help; then
echo "${HELP}"
exit 0
fi
2 changes: 1 addition & 1 deletion ci/docs/build.sh
@@ -40,6 +40,7 @@ conda list --show-channel-urls
#libcudf Doxygen build
gpuci_logger "Build libcudf docs..."
cd $PROJECT_WORKSPACE/cpp/doxygen
wget "https://raw.githubusercontent.com/rapidsai/docs/gh-pages/api/librmm/${BRANCH_VERSION}/rmm.tag" || echo "Failed to download rmm Doxygen tag"
doxygen Doxyfile

#cudf Sphinx Build
@@ -60,4 +61,3 @@ done

mv $PROJECT_WORKSPACE/docs/cudf/build/html/* $DOCS_WORKSPACE/api/cudf/$BRANCH_VERSION
mv $PROJECT_WORKSPACE/cpp/doxygen/html/* $DOCS_WORKSPACE/api/libcudf/$BRANCH_VERSION

6 changes: 6 additions & 0 deletions ci/release/update-version.sh
@@ -49,6 +49,9 @@ function sed_runner() {
# cpp update
sed_runner 's/'"CUDA_DATAFRAME VERSION .* LANGUAGES"'/'"CUDA_DATAFRAME VERSION ${NEXT_FULL_TAG} LANGUAGES"'/g' cpp/CMakeLists.txt

# doxyfile update
sed_runner 's/PROJECT_NUMBER = .*/PROJECT_NUMBER = '${NEXT_FULL_TAG}'/g' cpp/doxygen/Doxyfile

# RTD update
sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source/conf.py
sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py
@@ -57,3 +60,6 @@ sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/
for FILE in conda/environments/*.yml; do
sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE};
done

# Doxyfile update
sed_runner "s|\(TAGFILES.*librmm/\).*|\1${NEXT_SHORT_TAG}|" cpp/doxygen/Doxyfile
4 changes: 2 additions & 2 deletions cpp/doxygen/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 0.16
PROJECT_NUMBER = 0.18

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
@@ -2167,7 +2167,7 @@ SKIP_FUNCTION_MACROS = YES
# the path). If a tag file is not located in the directory in which doxygen is
# run, you must also specify the path to the tagfile here.

TAGFILES =
TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/0.18

# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
# tag file that is based on the input files it reads. See section "Linking to
227 changes: 216 additions & 11 deletions cpp/include/cudf/io/orc_metadata.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,31 +25,236 @@

#include <vector>

//! cuDF interfaces
namespace cudf {
//! In-development features
namespace io {

/**
* @brief Reads file-level and stripe-level statistics of ORC dataset
* @brief Holds column names and buffers containing raw file-level and stripe-level statistics.
*
* The buffers can be parsed using a Protobuf parser. Alternatively, use `parsed_orc_statistics` to
* get the statistics parsed into a libcudf representation.
*
* The `column_names` and `file_stats` members contain one element per column. The `stripes_stats`
* member contains one element per stripe, where each element contains column statistics for each column.
*/
struct raw_orc_statistics {
std::vector<std::string> column_names;
std::vector<std::string> file_stats;
std::vector<std::vector<std::string>> stripes_stats;
};

/**
* @brief Reads file-level and stripe-level statistics of ORC dataset.
*
* @ingroup io_readers
*
* The following code snippet demonstrates how to read statistics of a dataset
* from a file:
* @code
* std::string filepath = "dataset.orc";
* auto result = cudf::read_orc_statistics(cudf::source_info(filepath));
* auto result = cudf::read_raw_orc_statistics(cudf::source_info("dataset.orc"));
* @endcode
*
* @param src_info Dataset source
*
* @return Decompressed ColumnStatistics blobs stored in a vector of vectors. The first element of
* result vector, which is itself a vector, contains the name of each column. The second element
* contains statistics of each column of the whole file. Remaining elements contain statistics of
* each column for each stripe.
* @return Column names and encoded ORC statistics
*/
raw_orc_statistics read_raw_orc_statistics(source_info const& src_info);

/**
* @brief Enumerator for types of column statistics that can be included in `column_statistics`.
*
* The statistics type depends on the column data type.
*/
enum class statistics_type {
NONE,
INT,
DOUBLE,
STRING,
BUCKET,
DECIMAL,
DATE,
BINARY,
TIMESTAMP,
};

/**
* @brief Base class for column statistics that include optional minimum and maximum.
*
* Includes accessors for the minimum and maximum values.
*/
template <typename T>
struct minmax_statistics {
std::unique_ptr<T> _minimum;
std::unique_ptr<T> _maximum;

auto has_minimum() const { return _minimum != nullptr; }
auto has_maximum() const { return _maximum != nullptr; }
auto minimum() const { return _minimum.get(); }
auto maximum() const { return _maximum.get(); }
};

/**
* @brief Base class for column statistics that include an optional sum.
*
* Includes accessors for the sum value.
*/
template <typename T>
struct sum_statistics {
std::unique_ptr<T> _sum;

auto has_sum() const { return _sum != nullptr; }
auto sum() const { return _sum.get(); }
};

/**
* @brief Statistics for integral columns.
*/
struct integer_statistics : minmax_statistics<int64_t>, sum_statistics<int64_t> {
static constexpr statistics_type type = statistics_type::INT;
};

/**
* @brief Statistics for floating point columns.
*/
struct double_statistics : minmax_statistics<double>, sum_statistics<double> {
static constexpr statistics_type type = statistics_type::DOUBLE;
};

/**
* @brief Statistics for string columns.
*
* The `minimum` and `maximum` are the first and last elements, respectively, in lexicographical
* order. The `sum` is the total length of elements in the column.
* Note: according to the ORC specification, the sum should be signed, but pyarrow uses an unsigned value.
*/
struct string_statistics : minmax_statistics<std::string>, sum_statistics<uint64_t> {
static constexpr statistics_type type = statistics_type::STRING;
};

/**
* @brief Statistics for boolean columns.
*
* The `count` array includes the count of `false` and `true` values.
*/
struct bucket_statistics {
static constexpr statistics_type type = statistics_type::BUCKET;
std::vector<uint64_t> _count;

auto count(size_t index) const { return &_count.at(index); }
};

/**
* @brief Statistics for decimal columns.
*/
struct decimal_statistics : minmax_statistics<std::string>, sum_statistics<std::string> {
static constexpr statistics_type type = statistics_type::DECIMAL;
};

/**
* @brief Statistics for date(time) columns.
*/
struct date_statistics : minmax_statistics<int32_t> {
static constexpr statistics_type type = statistics_type::DATE;
};

/**
* @brief Statistics for binary columns.
*
* The `sum` is the total number of bytes across all elements.
*/
struct binary_statistics : sum_statistics<int64_t> {
static constexpr statistics_type type = statistics_type::BINARY;
};

/**
* @brief Statistics for timestamp columns.
*
* The `minimum` and `maximum` are the min/max elements in the column, expressed as the number of
* milliseconds since the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values
* adjusted to UTC.
*/
struct timestamp_statistics : minmax_statistics<int64_t> {
static constexpr statistics_type type = statistics_type::TIMESTAMP;
std::unique_ptr<int64_t> _minimum_utc;
std::unique_ptr<int64_t> _maximum_utc;

auto has_minimum_utc() const { return _minimum_utc != nullptr; }
auto has_maximum_utc() const { return _maximum_utc != nullptr; }
auto minimum_utc() const { return _minimum_utc.get(); }
auto maximum_utc() const { return _maximum_utc.get(); }
};

namespace orc {
// forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects,
// returned from `read_parsed_orc_statistics`, are constructed from
// `cudf::io::orc::column_statistics` objects that `ProtobufReader` initializes.
struct column_statistics;
} // namespace orc

/**
* @brief Contains per-column ORC statistics.
*
* All columns can have the `number_of_values` statistics. Depending on the data type, a column can
* have additional statistics, accessible through `type_specific_stats` accessor.
*/
class column_statistics {
private:
std::unique_ptr<uint64_t> _number_of_values;
statistics_type _type = statistics_type::NONE;
void* _type_specific_stats = nullptr;

public:
column_statistics() = default;
column_statistics(cudf::io::orc::column_statistics&& other);

column_statistics& operator=(column_statistics&&) noexcept;
column_statistics(column_statistics&&) noexcept;

auto has_number_of_values() const { return _number_of_values != nullptr; }
auto number_of_values() const { return _number_of_values.get(); }

auto type() const { return _type; }

/**
* @brief Returns a non-owning pointer to the type-specific statistics of the given type.
*
* Returns null if the requested statistics type does not match the type of the currently held
* type-specific statistics.
*
* @tparam T the statistics type
*/
template <typename T>
T const* type_specific_stats() const
{
if (T::type != _type) return nullptr;
return static_cast<T*>(_type_specific_stats);
}

~column_statistics();
};

/**
* @brief Holds column names and parsed file-level and stripe-level statistics.
*
* The `column_names` and `file_stats` members contain one element per column. The `stripes_stats`
* member contains one element per stripe, where each element contains column statistics for each
* column.
*/
struct parsed_orc_statistics {
std::vector<std::string> column_names;
std::vector<column_statistics> file_stats;
std::vector<std::vector<column_statistics>> stripes_stats;
};

/**
* @brief Reads file-level and stripe-level statistics of ORC dataset.
*
* @ingroup io_readers
*
* @param src_info Dataset source
*
* @return Column names and decoded ORC statistics
*/
std::vector<std::vector<std::string>> read_orc_statistics(source_info const& src_info);
parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info);

} // namespace io
} // namespace cudf
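
The declarations above introduce the parsed ORC statistics API. The following is a minimal usage sketch based only on those declarations; the file name "dataset.orc" and the variable names are placeholders, and error handling is omitted.

  #include <cudf/io/orc_metadata.hpp>
  #include <cudf/io/types.hpp>

  #include <cstddef>
  #include <iostream>

  int main()
  {
    // Read and parse file-level and stripe-level statistics (path is a placeholder).
    auto const stats = cudf::io::read_parsed_orc_statistics(cudf::io::source_info{"dataset.orc"});

    // file_stats holds one column_statistics entry per column.
    for (std::size_t i = 0; i < stats.file_stats.size(); ++i) {
      auto const& col = stats.file_stats[i];
      std::cout << stats.column_names[i];
      if (col.has_number_of_values()) { std::cout << " count=" << *col.number_of_values(); }

      // Type-specific statistics are returned only when the requested type matches.
      if (auto const* istats = col.type_specific_stats<cudf::io::integer_statistics>()) {
        if (istats->has_minimum()) { std::cout << " min=" << *istats->minimum(); }
        if (istats->has_maximum()) { std::cout << " max=" << *istats->maximum(); }
      }
      std::cout << '\n';
    }
    return 0;
  }

Stripe-level statistics can be walked the same way through `stripes_stats`, which holds one vector of `column_statistics` per stripe.
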
2 changes: 1 addition & 1 deletion cpp/include/cudf/strings/detail/gather.cuh
@@ -94,7 +94,7 @@ std::unique_ptr<cudf::column> gather(
[] __device__(auto size) { return static_cast<size_t>(size); },
size_t{0},
thrust::plus<size_t>{});
CUDF_EXPECTS(total_bytes < std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(total_bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of output strings is too large for a cudf column");

// create offsets from sizes
@@ -52,7 +52,7 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
};
size_t bytes = thrust::transform_reduce(
rmm::exec_policy(stream), begin, end, size_checker, 0, thrust::plus<size_t>());
CUDF_EXPECTS(bytes < std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(bytes < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"total size of strings is too large for cudf column");

// build offsets column from the strings sizes
4 changes: 2 additions & 2 deletions cpp/src/copying/concatenate.cu
@@ -227,7 +227,7 @@ std::unique_ptr<column> fused_concatenate(std::vector<column_view> const& views,
auto const& d_offsets = std::get<2>(device_views);
auto const output_size = std::get<3>(device_views);

CUDF_EXPECTS(output_size < std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(output_size < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range");

// Allocate output
@@ -364,7 +364,7 @@ void bounds_and_type_check(ColIter begin, ColIter end)
std::accumulate(begin, end, std::size_t{}, [](size_t a, auto const& b) {
return a + static_cast<size_t>(b.size());
});
CUDF_EXPECTS(total_row_count <= std::numeric_limits<size_type>::max(),
CUDF_EXPECTS(total_row_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
"Total number of concatenated rows exceeds size_type range");

// march each child
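
The `static_cast<std::size_t>` added around `std::numeric_limits<size_type>::max()` in the diffs above turns each signed-vs-unsigned bounds check into an unsigned-vs-unsigned comparison without changing the limit's value. A minimal standalone sketch of the pattern, using a local `size_type` alias as a stand-in for `cudf::size_type` (a signed 32-bit index):

  #include <cstddef>
  #include <cstdint>
  #include <limits>

  // Stand-in for cudf::size_type, the signed 32-bit row/offset index used by libcudf.
  using size_type = std::int32_t;

  // Checks that an accumulated std::size_t byte or row count still fits in size_type.
  // Casting the signed limit to std::size_t keeps the comparison unsigned-vs-unsigned,
  // avoiding -Wsign-compare warnings; the limit's value (2^31 - 1) is unchanged by the cast.
  bool fits_in_size_type(std::size_t total)
  {
    return total < static_cast<std::size_t>(std::numeric_limits<size_type>::max());
  }

One of the checks above uses `<=` rather than `<`; in every case the cast, not the comparison operator, is the change introduced by this commit.
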
