diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index aab0a9b2d49..5fd68bfb26c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -502,6 +502,7 @@ add_library(
src/reductions/product.cu
src/reductions/reductions.cpp
src/reductions/scan/rank_scan.cu
+ src/reductions/scan/ewm.cu
src/reductions/scan/scan.cpp
src/reductions/scan/scan_exclusive.cu
src/reductions/scan/scan_inclusive.cu
diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index 67705863d41..4bfef9767ca 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -85,8 +85,7 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
// extract the chars from the returned strings column.
auto input_column_contents = input_column->release();
- auto chars_column_contents = input_column_contents.children[1]->release();
- auto chars_buffer = chars_column_contents.data.release();
+ auto chars_buffer = input_column_contents.data.release();
// turn the chars in to a string scalar.
return cudf::string_scalar(std::move(*chars_buffer));
@@ -218,7 +217,7 @@ NVBENCH_BENCH_TYPES(bench_multibyte_split,
NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
.set_name("multibyte_split_source")
.set_min_samples(4)
- .add_int64_axis("strip_delimiters", {1})
+ .add_int64_axis("strip_delimiters", {0, 1})
.add_int64_axis("delim_size", {1})
.add_int64_axis("delim_percent", {1})
.add_int64_power_of_two_axis("size_approx", {15, 30})
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index d458c831f19..3c1023017be 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -103,6 +103,7 @@ class aggregation {
NUNIQUE, ///< count number of unique elements
NTH_ELEMENT, ///< get the nth element
ROW_NUMBER, ///< get row-number of current index (relative to rolling window)
+ EWMA, ///< get exponential weighted moving average at current index
RANK, ///< get rank of current index
COLLECT_LIST, ///< collect values into a list
COLLECT_SET, ///< collect values into a list without duplicate entries
@@ -250,6 +251,8 @@ class segmented_reduce_aggregation : public virtual aggregation {
enum class udf_type : bool { CUDA, PTX };
/// Type of correlation method.
enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };
+/// Selects how the first value of an EWM input is treated; INFINITE dispatches to the adjusted computation, FINITE to the non-adjusted one
+enum class ewm_history : int32_t { INFINITE, FINITE };
/// Factory to create a SUM aggregation
/// @return A SUM aggregation object
@@ -411,6 +414,42 @@ std::unique_ptr make_nth_element_aggregation(
template
std::unique_ptr make_row_number_aggregation();
+/**
+ * @brief Factory to create an EWMA aggregation
+ *
+ * `EWMA` returns a non-nullable column with the same type as the input,
+ * whose values are the exponentially weighted moving average of the input
+ * sequence. Let these values be known as the y_i.
+ *
+ * EWMA aggregations are parameterized by a center of mass (`com`) which
+ * affects the contribution of the previous values (y_0 ... y_{i-1}) in
+ * computing the y_i.
+ *
+ * EWMA aggregations are also parameterized by a history `cudf::ewm_history`.
+ * Special considerations have to be given to the mathematical treatment of
+ * the first value of the input sequence. There are two approaches to this,
+ * one which considers the first value of the sequence to be the exponential
+ * weighted moving average of some infinite history of data, and one which
+ * takes the first value to be the only datapoint known. These assumptions
+ * lead to two different formulas for the y_i. `ewm_history` selects which.
+ *
+ * EWMA aggregations have special null handling. Nulls have two effects. The
+ * first is to propagate forward the last valid value as far as it has been
+ * computed. This could be thought of as the nulls not affecting the average
+ * in any way. The second effect changes the way the y_i are computed. Since
+ * a moving average is conceptually designed to weight contributing values by
+ * their recency, nulls ought to count as valid periods even though they do
+ * not change the average. For example, if the input sequence is {1, NULL, 3}
+ * then when computing y_2 one should weigh y_0 as if it occurs two periods
+ * before y_2 rather than just one.
+ *
+ * @param center_of_mass the center of mass.
+ * @param history which assumption to make about the first value
+ * @return An EWMA aggregation object
+ */
+template
+std::unique_ptr make_ewma_aggregation(double const center_of_mass, ewm_history history);
+
/**
* @brief Factory to create a RANK aggregation
*
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index edee83783b8..843414817e3 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -76,6 +76,8 @@ class simple_aggregations_collector { // Declares the interface for the simple
class nth_element_aggregation const& agg);
virtual std::vector> visit(data_type col_type,
class row_number_aggregation const& agg);
+ virtual std::vector> visit(data_type col_type,
+ class ewma_aggregation const& agg);
virtual std::vector> visit(data_type col_type,
class rank_aggregation const& agg);
virtual std::vector> visit(
@@ -141,6 +143,7 @@ class aggregation_finalizer { // Declares the interface for the finalizer
virtual void visit(class correlation_aggregation const& agg);
virtual void visit(class tdigest_aggregation const& agg);
virtual void visit(class merge_tdigest_aggregation const& agg);
+ virtual void visit(class ewma_aggregation const& agg);
};
/**
@@ -667,6 +670,40 @@ class row_number_aggregation final : public rolling_aggregation {
void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
};
+/**
+ * @brief Derived class for specifying an EWMA scan aggregation (center of mass plus history mode)
+ */
+class ewma_aggregation final : public scan_aggregation {
+ public:
+ double const center_of_mass;  // center-of-mass parameter supplied at construction
+ cudf::ewm_history history;  // history mode supplied at construction
+
+ ewma_aggregation(double const center_of_mass, cudf::ewm_history history)
+ : aggregation{EWMA}, center_of_mass{center_of_mass}, history{history}
+ {
+ }
+
+ std::unique_ptr clone() const override
+ {
+ return std::make_unique(*this);  // copy-constructs from *this
+ }
+
+ std::vector> get_simple_aggregations(
+ data_type col_type, simple_aggregations_collector& collector) const override
+ {
+ return collector.visit(col_type, *this);
+ }
+
+ bool is_equal(aggregation const& _other) const override
+ {
+ if (!this->aggregation::is_equal(_other)) { return false; }  // base-class comparison first
+ auto const& other = dynamic_cast(_other);
+ return this->center_of_mass == other.center_of_mass and this->history == other.history;  // both parameters must match exactly
+ }
+
+ void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
/**
* @brief Derived class for specifying a rank aggregation
*/
@@ -1336,6 +1373,11 @@ struct target_type_impl {
using type = size_type;
};
+template
+struct target_type_impl {
+ using type = double;
+};
+
// Always use size_type accumulator for RANK
template
struct target_type_impl {
@@ -1536,6 +1578,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
return f.template operator()(std::forward(args)...);
case aggregation::MERGE_TDIGEST:
return f.template operator()(std::forward(args)...);
+ case aggregation::EWMA:
+ return f.template operator()(std::forward(args)...);
default: {
#ifndef __CUDA_ARCH__
CUDF_FAIL("Unsupported aggregation.");
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index adee9147740..5422304c5cb 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -154,6 +154,12 @@ std::vector> simple_aggregations_collector::visit(
return visit(col_type, static_cast(agg));
}
+std::vector> simple_aggregations_collector::visit(
+ data_type col_type, ewma_aggregation const& agg)
+{
+ return visit(col_type, static_cast(agg));  // delegates to another visit overload selected by the static_cast of agg
+}
+
std::vector> simple_aggregations_collector::visit(
data_type col_type, rank_aggregation const& agg)
{
@@ -333,6 +339,11 @@ void aggregation_finalizer::visit(row_number_aggregation const& agg)
visit(static_cast(agg));
}
+void aggregation_finalizer::visit(ewma_aggregation const& agg)
+{
+ visit(static_cast(agg));  // delegates to another visit overload selected by the static_cast of agg
+}
+
void aggregation_finalizer::visit(rank_aggregation const& agg)
{
visit(static_cast(agg));
@@ -665,6 +676,17 @@ std::unique_ptr make_row_number_aggregation()
template std::unique_ptr make_row_number_aggregation();
template std::unique_ptr make_row_number_aggregation();
+/// Factory to create an EWMA aggregation from a center of mass (`com`) and a history mode
+template
+std::unique_ptr make_ewma_aggregation(double const com, cudf::ewm_history history)
+{
+ return std::make_unique(com, history);  // arguments are forwarded unchanged to the constructed object
+}
+template std::unique_ptr make_ewma_aggregation(double const com,  // explicit instantiations follow
+ cudf::ewm_history history);
+template std::unique_ptr make_ewma_aggregation(
+ double const com, cudf::ewm_history history);
+
/// Factory to create a RANK aggregation
template
std::unique_ptr make_rank_aggregation(rank_method method,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index ca15b532d07..bed4dbc5a66 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -296,19 +296,6 @@ size_t column_size(column_view const& column, rmm::cuda_stream_view stream)
CUDF_FAIL("Unexpected compound type");
}
-// checks to see if the given column has a fixed size. This doesn't
-// check every row, so assumes string and list columns are not fixed, even
-// if each row is the same width.
-// TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes.
-bool is_col_fixed_width(column_view const& column)
-{
- if (column.type().id() == type_id::STRUCT) {
- return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width);
- }
-
- return is_fixed_width(column.type());
-}
-
/**
* @brief Extends SchemaElement to add members required in constructing parquet_column_view
*
@@ -946,6 +933,15 @@ struct parquet_column_view {
return schema_node.converted_type.value_or(UNKNOWN);
}
+ // Checks to see if the given column has a fixed-width data type. This doesn't
+ // check every value, so it assumes string and list columns are not fixed-width, even
+ // if each value has the same size.
+ [[nodiscard]] bool is_fixed_width() const
+ {
+ // not fixed width if the column has repetition levels or a BYTE_ARRAY physical type (the list and string cases)
+ return max_rep_level() == 0 and physical_type() != Type::BYTE_ARRAY;
+ }
+
std::vector const& get_path_in_schema() { return path_in_schema; }
// LIST related member functions
@@ -1764,7 +1760,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
// unbalanced in final page sizes, so using 4 which seems to be a good
// compromise at smoothing things out without getting fragment sizes too small.
auto frag_size_fn = [&](auto const& col, size_t col_size) {
- int const target_frags_per_page = is_col_fixed_width(col) ? 1 : 4;
+ int const target_frags_per_page = col.is_fixed_width() ? 1 : 4;
auto const avg_len =
target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows());
if (avg_len > 0) {
@@ -1775,8 +1771,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
}
};
- std::transform(single_streams_table.begin(),
- single_streams_table.end(),
+ std::transform(parquet_columns.begin(),
+ parquet_columns.end(),
column_sizes.begin(),
column_frag_size.begin(),
frag_size_fn);
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index 596ca3458c8..58faa0ebfe4 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -120,7 +120,11 @@ class istream_data_chunk_reader : public data_chunk_reader {
{
}
- void skip_bytes(std::size_t size) override { _datastream->ignore(size); };
+ void skip_bytes(std::size_t size) override
+ {
+ // Seek forward by `size` from the current position; 20% faster than _datastream->ignore(size) for large files
+ _datastream->seekg(_datastream->tellg() + static_cast(size));  // NOTE(review): unlike ignore(), this relies on tellg() succeeding — confirm behaviour on non-seekable streams
+ };
std::unique_ptr get_next_chunk(std::size_t read_size,
rmm::cuda_stream_view stream) override
@@ -265,7 +269,7 @@ class file_data_chunk_source : public data_chunk_source {
[[nodiscard]] std::unique_ptr create_reader() const override
{
return std::make_unique(
- std::make_unique(_filename, std::ifstream::in));
+ std::make_unique(_filename, std::ifstream::in | std::ifstream::binary));
}
private:
diff --git a/cpp/src/reductions/scan/ewm.cu b/cpp/src/reductions/scan/ewm.cu
new file mode 100644
index 00000000000..3fa2de450ad
--- /dev/null
+++ b/cpp/src/reductions/scan/ewm.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scan.cuh"
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+namespace cudf {
+namespace detail {
+
+template
+using pair_type = thrust::pair;
+
+/**
+ * @brief Binary operator for a prefix scan over (a, b) pairs: combining ci then cj
+ * composes the maps y -> a * y + b, which solves a first-order linear recurrence. See
+ * G. E. Blelloch. Prefix sums and their applications. Technical Report
+ * CMU-CS-90-190, Nov. 1990. S. 1.4
+ * for details
+ */
+template
+class recurrence_functor {
+ public:
+ __device__ pair_type operator()(pair_type ci, pair_type cj)
+ {
+ return {ci.first * cj.first, ci.second * cj.first + cj.second};  // {a_i * a_j, b_i * a_j + b_j}
+ }
+};
+
+template
+struct ewma_functor_base {
+ T beta;  // decay factor; the derived functors emit it as the first element of their pairs
+ const pair_type IDENTITY{1.0, 0.0};  // pair (1, 0): combining it with any pair under recurrence_functor returns that pair unchanged
+};
+
+template
+struct ewma_adjust_nulls_functor : public ewma_functor_base {
+ __device__ pair_type operator()(thrust::tuple const data)
+ {
+ // Not const to allow for updating the input value
+ auto [valid, exp, input] = data;
+ if (!valid) { return this->IDENTITY; }  // null entries contribute the identity pair
+ if constexpr (not is_numerator) { input = 1; }  // non-numerator pass: every valid entry contributes 1
+
+ // The value is non-null, but nulls preceding it
+ // must adjust the first element of the pair (one extra power of beta per preceding null)
+ T const beta = this->beta;
+ return {beta * ((exp != 0) ? pow(beta, exp) : 1), input};
+ }
+};
+
+template
+struct ewma_adjust_no_nulls_functor : public ewma_functor_base {
+ __device__ pair_type operator()(T const data)
+ {
+ T const beta = this->beta;
+ if constexpr (is_numerator) {
+ return {beta, data};  // numerator pass: the pair carries the input value
+ } else {
+ return {beta, 1.0};  // non-numerator pass: the pair carries 1, the argument value is ignored
+ }
+ }
+};
+
+template
+struct ewma_noadjust_nulls_functor : public ewma_functor_base {
+ /*
+ In the null case, a denominator actually has to be computed. The formula is
+ y_i = (1 - alpha) y_{i-1} + alpha x_i, but really there is a "denominator"
+ which is the sum of the weights: alpha + (1 - alpha) == 1. If a null is
+ encountered, that means that the "previous" value is downweighted by a
+ factor (for each missing value). For example with a single null:
+ data = {x_0, NULL, x_2},
+ y_2 = ((1 - alpha)**2 x_0 + alpha * x_2) / (alpha + (1 - alpha)**2)
+
+ As such, the pairs must be updated before summing like the adjusted case to
+ properly downweight the previous values. But now we also need to compute
+ the normalization factor and divide both elements of the pair by it.
+ */
+ __device__ pair_type operator()(thrust::tuple const data)
+ {
+ T const beta = this->beta;  // beta plays the role of (1 - alpha) in the comment above
+ auto const [input, index, valid, nullcnt] = data;
+ if (index == 0) {  // NOTE(review): the first element is returned without checking `valid` — confirm a null first row is handled by callers
+ return {beta, input};
+ } else {
+ if (!valid) { return this->IDENTITY; }
+ // preceding value is valid, return normal pair
+ if (nullcnt == 0) { return {beta, (1.0 - beta) * input}; }
+ // one or more preceding values are null, adjust by how many
+ T const factor = (1.0 - beta) + pow(beta, nullcnt + 1);
+ return {(beta * (pow(beta, nullcnt)) / factor), ((1.0 - beta) * input) / factor};
+ }
+ }
+};
+
+template
+struct ewma_noadjust_no_nulls_functor : public ewma_functor_base {
+ __device__ pair_type operator()(thrust::tuple const data)
+ {
+ T const beta = this->beta;
+ auto const [input, index] = data;
+ if (index == 0) {
+ return {beta, input};  // the first value is passed through unweighted
+ } else {
+ return {beta, (1.0 - beta) * input};  // later values are weighted by (1 - beta)
+ }
+ }
+};
+
+/**
+* @brief Return an array whose value at index i is the number of consecutive
+* null entries immediately preceding i. NOTE(review): the scan below writes from
+* the second element onward, so element 0 is not set by it — confirm callers ignore it.
+* Example: {1, NULL, 3, 4, NULL, NULL, 7} -> {0, 0, 1, 0, 0, 1, 2}
+*/
+rmm::device_uvector null_roll_up(column_view const& input,
+ rmm::cuda_stream_view stream)
+{
+ rmm::device_uvector output(input.size(), stream);
+
+ auto device_view = column_device_view::create(input);
+ auto invalid_it = thrust::make_transform_iterator(
+ cudf::detail::make_validity_iterator(*device_view),
+ cuda::proclaim_return_type([] __device__(int valid) -> int { return 1 - valid; }));  // 1 for null rows, 0 for valid rows
+
+ // valid mask {1, 0, 1, 0, 0, 1} leads to output array {0, 0, 1, 0, 1, 2}
+ thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+ invalid_it,  // keys: the invalid flags, so each run of equal flags is one segment
+ invalid_it + input.size() - 1,  // only the first size - 1 flags are scanned
+ invalid_it,  // values: the same flags, summed within each segment
+ std::next(output.begin()));  // results are shifted right by one position
+ return output;
+}
+
+template
+rmm::device_uvector compute_ewma_adjust(column_view const& input,
+ T const beta,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)  // NOTE(review): `mr` is not referenced in this body
+{
+ rmm::device_uvector output(input.size(), stream);
+ rmm::device_uvector> pairs(input.size(), stream);
+
+ if (input.has_nulls()) {
+ rmm::device_uvector nullcnt = null_roll_up(input, stream);
+ auto device_view = column_device_view::create(input);
+ auto valid_it = cudf::detail::make_validity_iterator(*device_view);
+ auto data =
+ thrust::make_zip_iterator(thrust::make_tuple(valid_it, nullcnt.begin(), input.begin()));  // (valid, preceding-null count, value) per row
+
+ thrust::transform_inclusive_scan(rmm::exec_policy(stream),  // first scan: its second elements become the numerators
+ data,
+ data + input.size(),
+ pairs.begin(),
+ ewma_adjust_nulls_functor{beta},
+ recurrence_functor{});
+ thrust::transform(rmm::exec_policy(stream),  // stash the numerators in `output` before `pairs` is overwritten
+ pairs.begin(),
+ pairs.end(),
+ output.begin(),
+ [] __device__(pair_type pair) -> T { return pair.second; });
+
+ thrust::transform_inclusive_scan(rmm::exec_policy(stream),  // second scan: its second elements are the denominators
+ data,
+ data + input.size(),
+ pairs.begin(),
+ ewma_adjust_nulls_functor{beta},
+ recurrence_functor{});
+
+ } else {
+ thrust::transform_inclusive_scan(rmm::exec_policy(stream),  // first scan: its second elements become the numerators
+ input.begin(),
+ input.end(),
+ pairs.begin(),
+ ewma_adjust_no_nulls_functor{beta},
+ recurrence_functor{});
+ thrust::transform(rmm::exec_policy(stream),  // stash the numerators in `output` before `pairs` is overwritten
+ pairs.begin(),
+ pairs.end(),
+ output.begin(),
+ [] __device__(pair_type pair) -> T { return pair.second; });
+ auto itr = thrust::make_counting_iterator(0);  // the second pass scans a counting sequence instead of the input values
+
+ thrust::transform_inclusive_scan(rmm::exec_policy(stream),  // second scan: its second elements are the denominators
+ itr,
+ itr + input.size(),
+ pairs.begin(),
+ ewma_adjust_no_nulls_functor{beta},
+ recurrence_functor{});
+ }
+
+ thrust::transform(  // final result: numerator / denominator, written in place over `output`
+ rmm::exec_policy(stream),
+ pairs.begin(),
+ pairs.end(),
+ output.begin(),
+ output.begin(),
+ [] __device__(pair_type pair, T numerator) -> T { return numerator / pair.second; });
+
+ return output;
+}
+
+template
+rmm::device_uvector compute_ewma_noadjust(column_view const& input,
+ T const beta,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)  // NOTE(review): `mr` is not referenced in this body
+{
+ rmm::device_uvector output(input.size(), stream);
+ rmm::device_uvector> pairs(input.size(), stream);
+ rmm::device_uvector nullcnt =
+ [&input, stream]() -> rmm::device_uvector {
+ if (input.has_nulls()) {
+ return null_roll_up(input, stream);
+ } else {
+ return rmm::device_uvector(input.size(), stream);  // not read in the no-nulls branch below
+ }
+ }();
+ // without nulls the denominators are all 1 and do not need to be computed
+ // pairs are all (beta, (1 - beta) * x_i) except for the first one, which is (beta, x_0)
+
+ if (!input.has_nulls()) {
+ auto data = thrust::make_zip_iterator(
+ thrust::make_tuple(input.begin(), thrust::make_counting_iterator(0)));  // (value, row index) per row
+ thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+ data,
+ data + input.size(),
+ pairs.begin(),
+ ewma_noadjust_no_nulls_functor{beta},
+ recurrence_functor{});
+
+ } else {
+ auto device_view = column_device_view::create(input);
+ auto valid_it = detail::make_validity_iterator(*device_view);
+
+ auto data = thrust::make_zip_iterator(thrust::make_tuple(
+ input.begin(), thrust::make_counting_iterator(0), valid_it, nullcnt.begin()));  // (value, row index, valid, preceding-null count)
+
+ thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+ data,
+ data + input.size(),
+ pairs.begin(),
+ ewma_noadjust_nulls_functor{beta},
+ recurrence_functor());
+ }
+
+ // the second element of each scanned pair is the result for that row
+ thrust::transform(rmm::exec_policy(stream),
+ pairs.begin(),
+ pairs.end(),
+ output.begin(),
+ [] __device__(pair_type pair) -> T { return pair.second; });
+ return output;
+}
+
+struct ewma_functor {
+ template ::value)>
+ std::unique_ptr operator()(scan_aggregation const& agg,
+ column_view const& input,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
+ {
+ CUDF_FAIL("Unsupported type for EWMA.");  // overload for types rejected by the enable-if condition above
+ }
+
+ template ::value)>
+ std::unique_ptr operator()(scan_aggregation const& agg,
+ column_view const& input,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
+ {
+ auto const ewma_agg = dynamic_cast(&agg);  // NOTE(review): dereferenced below without a null check — confirm agg is always an ewma_aggregation here
+ auto const history = ewma_agg->history;
+ auto const center_of_mass = ewma_agg->center_of_mass;
+
+ // center of mass is easier for the user, but the recurrences are
+ // better expressed in terms of the derived parameter `beta`
+ T const beta = center_of_mass / (center_of_mass + 1.0);
+
+ auto result = [&]() {
+ if (history == cudf::ewm_history::INFINITE) {
+ return compute_ewma_adjust(input, beta, stream, mr);  // INFINITE selects the adjusted computation
+ } else {
+ return compute_ewma_noadjust(input, beta, stream, mr);  // any other history value selects the non-adjusted computation
+ }
+ }();
+ return std::make_unique(cudf::data_type(cudf::type_to_id()),
+ input.size(),
+ result.release(),
+ rmm::device_buffer{},  // empty null mask
+ 0);  // null count of zero
+ }
+};
+
+std::unique_ptr exponentially_weighted_moving_average(column_view const& input,
+ scan_aggregation const& agg,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr)
+{
+ return type_dispatcher(input.type(), ewma_functor{}, agg, input, stream, mr);  // dispatch on the input column's type; unsupported types hit ewma_functor's CUDF_FAIL overload
+}
+
+} // namespace detail
+} // namespace cudf
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index aeb9e516cd4..6c237741ac3 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -36,6 +36,12 @@ std::pair mask_scan(column_view const& input_view
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr);
+// exponentially weighted moving average of the input
+std::unique_ptr exponentially_weighted_moving_average(column_view const& input,
+ scan_aggregation const& agg,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr);
+
template typename DispatchFn>
std::unique_ptr scan_agg_dispatch(column_view const& input,
scan_aggregation const& agg,
@@ -59,6 +65,7 @@ std::unique_ptr scan_agg_dispatch(column_view const& input,
if (is_fixed_point(input.type())) CUDF_FAIL("decimal32/64/128 cannot support product scan");
return type_dispatcher(
input.type(), DispatchFn(), input, output_mask, stream, mr);
+ case aggregation::EWMA: return exponentially_weighted_moving_average(input, agg, stream, mr);
default: CUDF_FAIL("Unsupported aggregation operator for scan");
}
}
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index ad2eaa6a471..7c02a8d1b99 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -182,7 +182,8 @@ std::unique_ptr scan_inclusive(column_view const& input,
auto output = scan_agg_dispatch(
input, agg, static_cast(mask.data()), stream, mr);
- output->set_null_mask(std::move(mask), null_count);
+ // Use the null mask produced by the op for EWM
+ if (agg.kind != aggregation::EWMA) { output->set_null_mask(std::move(mask), null_count); }
// If the input is a structs column, we also need to push down nulls from the parent output column
// into the children columns.
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eda470d2309..9f14455f42d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -205,6 +205,7 @@ ConfigureTest(
ConfigureTest(
REDUCTIONS_TEST
reductions/collect_ops_tests.cpp
+ reductions/ewm_tests.cpp
reductions/rank_tests.cpp
reductions/reduction_tests.cpp
reductions/scan_tests.cpp
diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index ef1d09e5652..6b350c137d0 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -65,6 +65,22 @@ TEST_F(TransformTest, ColumnReference)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
}
+TEST_F(TransformTest, BasicAdditionDoubleCast)
+{
+ auto c_0 = column_wrapper{3, 20, 1, 50};
+ std::vector<__int128_t> data1{10, 7, 20, 0};
+ auto c_1 = cudf::test::fixed_point_column_wrapper<__int128_t>(
+ data1.begin(), data1.end(), numeric::scale_type{0});  // fixed-point column built with scale 0
+ auto table = cudf::table_view{{c_0, c_1}};
+ auto col_ref_0 = cudf::ast::column_reference(0);
+ auto col_ref_1 = cudf::ast::column_reference(1);
+ auto cast = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_FLOAT64, col_ref_1);  // cast the fixed-point column before adding
+ auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast);
+ auto expected = column_wrapper{13, 27, 21, 50};  // element-wise c_0 + data1
+ auto result = cudf::compute_column(table, expression);
+ CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
+}
+
TEST_F(TransformTest, Literal)
{
auto c_0 = column_wrapper{3, 20, 1, 50};
diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp
new file mode 100644
index 00000000000..09cec688509
--- /dev/null
+++ b/cpp/tests/reductions/ewm_tests.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scan_tests.hpp"
+
+#include
+#include
+#include
+
+#include
+#include
+
+template
+struct TypedEwmScanTest : BaseScanTest {
+ inline void test_ungrouped_ewma_scan(cudf::column_view const& input,
+ cudf::column_view const& expect_vals,
+ cudf::scan_aggregation const& agg,
+ cudf::null_policy null_handling)
+ {
+ auto col_out = cudf::scan(input, agg, cudf::scan_type::INCLUSIVE, null_handling);  // always run as an inclusive scan
+ CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view());  // compare the scan result with the expected column
+ }
+};
+
+TYPED_TEST_SUITE(TypedEwmScanTest, cudf::test::FloatingPointTypes);
+
+TYPED_TEST(TypedEwmScanTest, Ewm)
+{
+ auto const v = make_vector({1.0, 2.0, 3.0, 4.0, 5.0});
+ auto col = this->make_column(v);
+
+ auto const expected_ewma_vals_adjust = cudf::test::fixed_width_column_wrapper{
+ {1.0, 1.75, 2.61538461538461497469, 3.54999999999999982236, 4.52066115702479365268}};  // e.g. 1.75 = (2 + 1/3) / (1 + 1/3), with beta = com / (com + 1) = 1/3
+
+ auto const expected_ewma_vals_noadjust =
+ cudf::test::fixed_width_column_wrapper{{1.0,
+ 1.66666666666666651864,  // (1/3) * 1 + (2/3) * 2
+ 2.55555555555555535818,
+ 3.51851851851851815667,
+ 4.50617283950617242283}};
+
+ this->test_ungrouped_ewma_scan(
+ *col,
+ expected_ewma_vals_adjust,
+ *cudf::make_ewma_aggregation(0.5, cudf::ewm_history::INFINITE),  // com = 0.5, adjusted computation
+ cudf::null_policy::INCLUDE);
+ this->test_ungrouped_ewma_scan(
+ *col,
+ expected_ewma_vals_noadjust,
+ *cudf::make_ewma_aggregation(0.5, cudf::ewm_history::FINITE),  // com = 0.5, non-adjusted computation
+ cudf::null_policy::INCLUDE);
+}
+
+TYPED_TEST(TypedEwmScanTest, EwmWithNulls)
+{
+ auto const v = make_vector({1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0});
+ auto const b = thrust::host_vector(std::vector{1, 0, 1, 0, 0, 1, 1});  // validity flags: rows 1, 3 and 4 are null
+ auto col = this->make_column(v, b);
+
+ auto const expected_ewma_vals_adjust =
+ cudf::test::fixed_width_column_wrapper{{1.0,
+ 1.0,  // null row repeats the last computed value
+ 2.79999999999999982236,  // (3 + (1/3)^2 * 1) / (1 + (1/3)^2): the null counts as a period
+ 2.79999999999999982236,
+ 2.79999999999999982236,
+ 5.87351778656126466416,
+ 6.70977596741344139986}};
+
+ auto const expected_ewma_vals_noadjust =
+ cudf::test::fixed_width_column_wrapper{{1.0,
+ 1.0,  // null row repeats the last computed value
+ 2.71428571428571441260,  // ((1/9) * 1 + (2/3) * 3) / ((2/3) + (1/9))
+ 2.71428571428571441260,
+ 2.71428571428571441260,
+ 5.82706766917293172980,
+ 6.60902255639097724327}};
+
+ this->test_ungrouped_ewma_scan(
+ *col,
+ expected_ewma_vals_adjust,
+ *cudf::make_ewma_aggregation(0.5, cudf::ewm_history::INFINITE),  // com = 0.5, adjusted computation
+ cudf::null_policy::INCLUDE);
+ this->test_ungrouped_ewma_scan(
+ *col,
+ expected_ewma_vals_noadjust,
+ *cudf::make_ewma_aggregation(0.5, cudf::ewm_history::FINITE),  // com = 0.5, non-adjusted computation
+ cudf::null_policy::INCLUDE);
+}
diff --git a/docs/cudf/source/user_guide/api_docs/dataframe.rst b/docs/cudf/source/user_guide/api_docs/dataframe.rst
index 70e4bd060ca..02fd9f7b396 100644
--- a/docs/cudf/source/user_guide/api_docs/dataframe.rst
+++ b/docs/cudf/source/user_guide/api_docs/dataframe.rst
@@ -137,6 +137,7 @@ Computations / descriptive stats
DataFrame.describe
DataFrame.diff
DataFrame.eval
+ DataFrame.ewm
DataFrame.kurt
DataFrame.kurtosis
DataFrame.max
diff --git a/docs/cudf/source/user_guide/api_docs/general_utilities.rst b/docs/cudf/source/user_guide/api_docs/general_utilities.rst
index d9c53c3fbbd..8d0edc0b100 100644
--- a/docs/cudf/source/user_guide/api_docs/general_utilities.rst
+++ b/docs/cudf/source/user_guide/api_docs/general_utilities.rst
@@ -8,6 +8,8 @@ Testing functions
:toctree: api/
cudf.testing.testing.assert_column_equal
+ cudf.testing.testing.assert_eq
cudf.testing.testing.assert_frame_equal
cudf.testing.testing.assert_index_equal
+ cudf.testing.testing.assert_neq
cudf.testing.testing.assert_series_equal
diff --git a/docs/cudf/source/user_guide/api_docs/series.rst b/docs/cudf/source/user_guide/api_docs/series.rst
index 5dc87a97337..48a7dc8ff87 100644
--- a/docs/cudf/source/user_guide/api_docs/series.rst
+++ b/docs/cudf/source/user_guide/api_docs/series.rst
@@ -138,6 +138,7 @@ Computations / descriptive stats
Series.describe
Series.diff
Series.digitize
+ Series.ewm
Series.factorize
Series.kurt
Series.max
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
index f8f674fecec..d90f3ea1aca 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import sys
from io import StringIO
@@ -13,7 +13,7 @@
compare_content,
run_test,
)
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pythonfuzz(data_handle=CSVReader)
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
index 2f5e6204f7c..69e9437be93 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import io
import sys
@@ -9,7 +9,7 @@
from cudf._fuzz_testing.json import JSONReader, JSONWriter
from cudf._fuzz_testing.main import pythonfuzz
from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pythonfuzz(data_handle=JSONReader)
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index d685174f3c2..e6dfe2eae62 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -8,7 +8,7 @@
import pyarrow as pa
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
from cudf.utils.dtypes import (
pandas_dtypes_to_np_dtypes,
pyarrow_dtypes_to_pandas_dtypes,
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 11f801ba772..1616c24eec2 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -58,6 +58,14 @@ class Aggregation:
if dropna else pylibcudf.types.NullPolicy.INCLUDE
))
+ @classmethod
+ def ewma(cls, com=1.0, adjust=True):
+ return cls(pylibcudf.aggregation.ewma(
+ com,
+ pylibcudf.aggregation.EWMHistory.INFINITE
+ if adjust else pylibcudf.aggregation.EWMHistory.FINITE
+ ))
+
@classmethod
def size(cls):
return cls(pylibcudf.aggregation.count(pylibcudf.types.NullPolicy.INCLUDE))
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 0b0bbdb2589..c706351a683 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector
cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.io.datasource cimport Datasource, NativeFileDatasource
+from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.types cimport data_type
from cudf._lib.types cimport dtype_to_data_type
diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt
index 2408fa1c12f..620229a1275 100644
--- a/python/cudf/cudf/_lib/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/io/CMakeLists.txt
@@ -1,5 +1,5 @@
# =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -12,7 +12,7 @@
# the License.
# =============================================================================
-set(cython_sources datasource.pyx utils.pyx)
+set(cython_sources utils.pyx)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index 3c14ec46122..1d7c56888d9 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector
from cudf._lib.column cimport Column
-from cudf._lib.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index d3e6053ef4b..9609e3131b4 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -23,12 +23,12 @@ except ImportError:
cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
-from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_column_struct_field_names,
)
+from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.orc cimport (
chunked_orc_writer_options,
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index f6f9cfa9a7c..7914ed7e9d9 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
from cudf._lib.column cimport Column
from cudf._lib.expressions cimport Expression
-from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sinks_info,
make_source_info,
update_struct_field_names,
)
+from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.expressions cimport expression
from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
chunked_parquet_reader as cpp_chunked_parquet_reader,
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
index 8526728656b..0981d0e855a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
@@ -6,6 +6,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport (
Kind as kind_t,
aggregation,
correlation_type,
+ ewm_history,
groupby_aggregation,
groupby_scan_aggregation,
rank_method,
@@ -80,6 +81,8 @@ cpdef Aggregation argmax()
cpdef Aggregation argmin()
+cpdef Aggregation ewma(float center_of_mass, ewm_history history)
+
cpdef Aggregation nunique(null_policy null_handling = *)
cpdef Aggregation nth_element(size_type n, null_policy null_handling = *)
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
index 7bb64e32a1b..eed2f6de585 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
@@ -8,6 +8,7 @@ from libcpp.utility cimport move
from cudf._lib.pylibcudf.libcudf.aggregation cimport (
aggregation,
correlation_type,
+ ewm_history,
groupby_aggregation,
groupby_scan_aggregation,
make_all_aggregation,
@@ -19,6 +20,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport (
make_correlation_aggregation,
make_count_aggregation,
make_covariance_aggregation,
+ make_ewma_aggregation,
make_max_aggregation,
make_mean_aggregation,
make_median_aggregation,
@@ -52,6 +54,8 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
from cudf._lib.pylibcudf.libcudf.aggregation import Kind # no-cython-lint
from cudf._lib.pylibcudf.libcudf.aggregation import \
correlation_type as CorrelationType # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.aggregation import \
+ ewm_history as EWMHistory # no-cython-lint
from cudf._lib.pylibcudf.libcudf.aggregation import \
rank_method as RankMethod # no-cython-lint
from cudf._lib.pylibcudf.libcudf.aggregation import \
@@ -202,6 +206,28 @@ cpdef Aggregation max():
return Aggregation.from_libcudf(move(make_max_aggregation[aggregation]()))
+cpdef Aggregation ewma(float center_of_mass, ewm_history history):
+ """Create an EWMA aggregation.
+
+ For details, see :cpp:func:`make_ewma_aggregation`.
+
+ Parameters
+ ----------
+ center_of_mass : float
+ The decay in terms of the center of mass
+ history : ewm_history
+ Whether or not to treat the history as infinite.
+
+ Returns
+ -------
+ Aggregation
+ The EWMA aggregation.
+ """
+ return Aggregation.from_libcudf(
+ move(make_ewma_aggregation[aggregation](center_of_mass, history))
+ )
+
+
cpdef Aggregation count(null_policy null_handling = null_policy.EXCLUDE):
"""Create a count aggregation.
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
index 0e85cfb0654..084b341ec48 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
# the License.
# =============================================================================
-set(cython_sources avro.pyx json.pyx types.pyx)
+set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
@@ -21,5 +21,7 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_json pylibcudf_io_types)
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json
+ pylibcudf_io_types
+)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
index 1bf355a1461..ef4c65b277e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
-from . cimport avro, json, types
-from .types cimport SinkInfo, SourceInfo, TableWithMetadata
+from . cimport avro, datasource, json, types
+from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
index 9fc05451b3b..fb4e4c7e4bb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
-from . import avro, json, types
+from . import avro, datasource, json, types
from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/io/datasource.pxd
rename to python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd
diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/io/datasource.pyx
rename to python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index f68628d244f..acfb02bf6be 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -7,7 +7,9 @@ from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector
+from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
column_name_info,
host_buffer,
@@ -95,9 +97,8 @@ cdef class SourceInfo:
Parameters
----------
- sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
- A homogeneous list of sources (this can be a string filename,
- an os.PathLike, bytes, or an io.BytesIO) to read from.
+ sources : List[Union[str, os.PathLike, bytes, io.BytesIO, Datasource]]
+ A homogeneous list of sources to read from.
Mixing different types of sources will raise a `ValueError`.
"""
@@ -107,6 +108,7 @@ cdef class SourceInfo:
raise ValueError("Need to pass at least one source")
cdef vector[string] c_files
+ cdef vector[datasource*] c_datasources
if isinstance(sources[0], (os.PathLike, str)):
c_files.reserve(len(sources))
@@ -123,6 +125,13 @@ cdef class SourceInfo:
self.c_obj = move(source_info(c_files))
return
+ elif isinstance(sources[0], Datasource):
+ for csrc in sources:
+ if not isinstance(csrc, Datasource):
+ raise ValueError("All sources must be of the same type!")
+ c_datasources.push_back((csrc).get_datasource())
+ self.c_obj = move(source_info(c_datasources))
+ return
# TODO: host_buffer is deprecated API, use host_span instead
cdef vector[host_buffer] c_host_buffers
@@ -145,6 +154,9 @@ cdef class SourceInfo:
c_buffer = bio.getbuffer() # check if empty?
c_host_buffers.push_back(host_buffer(&c_buffer[0],
c_buffer.shape[0]))
+ else:
+ raise ValueError("Sources must be a list of str/paths, "
+ "bytes, io.BytesIO, or a Datasource")
self.c_obj = source_info(c_host_buffers)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
index 8c14bc45723..fe04db52094 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
@@ -79,6 +79,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
KENDALL
SPEARMAN
+ cpdef enum class ewm_history(int32_t):
+ INFINITE
+ FINITE
+
cpdef enum class rank_method(int32_t):
FIRST
AVERAGE
@@ -143,6 +147,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
string user_defined_aggregator,
data_type output_type) except +
+ cdef unique_ptr[T] make_ewma_aggregation[T](
+ double com, ewm_history adjust
+ ) except +
+
cdef unique_ptr[T] make_correlation_aggregation[T](
correlation_type type, size_type min_periods) except +
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index f538180805b..231af30c06d 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1068,51 +1068,34 @@ def notnull(self) -> ColumnBase:
return result
- def fillna(
- self,
- fill_value: Any = None,
- method: str | None = None,
- ) -> Self:
- """
- Fill null values with *fill_value*
- """
- if fill_value is not None:
- fill_is_scalar = np.isscalar(fill_value)
-
- if fill_is_scalar:
- if fill_value == _DEFAULT_CATEGORICAL_VALUE:
- fill_value = self.codes.dtype.type(fill_value)
- else:
- try:
- fill_value = self._encode(fill_value)
- fill_value = self.codes.dtype.type(fill_value)
- except ValueError as err:
- err_msg = "fill value must be in categories"
- raise ValueError(err_msg) from err
+ def _validate_fillna_value(
+ self, fill_value: ScalarLike | ColumnLike
+ ) -> cudf.Scalar | ColumnBase:
+ """Align fill_value for .fillna based on column type."""
+ if cudf.api.types.is_scalar(fill_value):
+ if fill_value != _DEFAULT_CATEGORICAL_VALUE:
+ try:
+ fill_value = self._encode(fill_value)
+ except ValueError as err:
+ raise ValueError(
+ f"{fill_value=} must be in categories"
+ ) from err
+ return cudf.Scalar(fill_value, dtype=self.codes.dtype)
+ else:
+ fill_value = column.as_column(fill_value, nan_as_null=False)
+ if isinstance(fill_value.dtype, CategoricalDtype):
+ if self.dtype != fill_value.dtype:
+ raise TypeError(
+ "Cannot set a categorical with another without identical categories"
+ )
else:
- fill_value = column.as_column(fill_value, nan_as_null=False)
- if isinstance(fill_value, CategoricalColumn):
- if self.dtype != fill_value.dtype:
- raise TypeError(
- "Cannot set a Categorical with another, "
- "without identical categories"
- )
- # TODO: only required if fill_value has a subset of the
- # categories:
- fill_value = fill_value._set_categories(
- self.categories,
- is_unique=True,
- )
- fill_value = column.as_column(fill_value.codes).astype(
- self.codes.dtype
+ raise TypeError(
+ "Cannot set a categorical with non-categorical data"
)
-
- # Validation of `fill_value` will have to be performed
- # before returning self.
- if not self.nullable:
- return self
-
- return super().fillna(fill_value, method=method)
+ fill_value = fill_value._set_categories(
+ self.categories,
+ )
+ return fill_value.codes.astype(self.codes.dtype)
def indices_of(
self, value: ScalarLike
@@ -1372,11 +1355,13 @@ def _set_categories(
if not (is_unique or new_cats.is_unique):
new_cats = cudf.Series(new_cats)._column.unique()
+ if cur_cats.equals(new_cats, check_dtypes=True):
+ # TODO: Internal usages don't always need a copy; add a copy keyword
+ # as_ordered shallow copies
+ return self.copy().as_ordered(ordered=ordered)
+
cur_codes = self.codes
- max_cat_size = (
- len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats)
- )
- out_code_dtype = min_unsigned_type(max_cat_size)
+ out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats)))
cur_order = column.as_column(range(len(cur_codes)))
old_codes = column.as_column(
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 586689e2ee3..dfcdfbb9d91 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -666,15 +666,32 @@ def _check_scatter_key_length(
f"{num_keys}"
)
+ def _validate_fillna_value(
+ self, fill_value: ScalarLike | ColumnLike
+ ) -> cudf.Scalar | ColumnBase:
+ """Align fill_value for .fillna based on column type."""
+ if is_scalar(fill_value):
+ return cudf.Scalar(fill_value, dtype=self.dtype)
+ return as_column(fill_value)
+
def fillna(
self,
- fill_value: Any = None,
- method: str | None = None,
+ fill_value: ScalarLike | ColumnLike,
+ method: Literal["ffill", "bfill", None] = None,
) -> Self:
"""Fill null values with ``value``.
Returns a copy with null filled.
"""
+ if not self.has_nulls(include_nan=True):
+ return self.copy()
+ elif method is None:
+ if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar(
+ fill_value
+ ):
+ return self.copy()
+ else:
+ fill_value = self._validate_fillna_value(fill_value)
return libcudf.replace.replace_nulls(
input_col=self.nans_to_nulls(),
replacement=fill_value,
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index d88553361dd..121076b69ce 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -8,18 +8,17 @@
import locale
import re
from locale import nl_langinfo
-from typing import TYPE_CHECKING, Any, Literal, Sequence, cast
+from typing import TYPE_CHECKING, Literal, Sequence, cast
import numpy as np
import pandas as pd
import pyarrow as pa
-from typing_extensions import Self
import cudf
from cudf import _lib as libcudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
-from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
+from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals.timezones import (
check_ambiguous_and_nonexistent,
@@ -641,22 +640,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
else:
return result_col
- def fillna(
- self,
- fill_value: Any = None,
- method: str | None = None,
- ) -> Self:
- if fill_value is not None:
- if cudf.utils.utils._isnat(fill_value):
- return self.copy(deep=True)
- if is_scalar(fill_value):
- if not isinstance(fill_value, cudf.Scalar):
- fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
- else:
- fill_value = column.as_column(fill_value, nan_as_null=False)
-
- return super().fillna(fill_value, method)
-
def indices_of(
self, value: ScalarLike
) -> cudf.core.column.NumericalColumn:
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index e9d9b4933e5..d66908b5f94 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -4,12 +4,11 @@
import warnings
from decimal import Decimal
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Sequence, cast
import cupy as cp
import numpy as np
import pyarrow as pa
-from typing_extensions import Self
import cudf
from cudf import _lib as libcudf
@@ -31,7 +30,7 @@
from .numerical_base import NumericalBaseColumn
if TYPE_CHECKING:
- from cudf._typing import ColumnBinaryOperand, Dtype
+ from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
class DecimalBaseColumn(NumericalBaseColumn):
@@ -135,30 +134,20 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
return result
- def fillna(
- self,
- fill_value: Any = None,
- method: str | None = None,
- ) -> Self:
- """Fill null values with ``value``.
-
- Returns a copy with null filled.
- """
+ def _validate_fillna_value(
+ self, fill_value: ScalarLike | ColumnLike
+ ) -> cudf.Scalar | ColumnBase:
+ """Align fill_value for .fillna based on column type."""
if isinstance(fill_value, (int, Decimal)):
- fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
- elif (
- isinstance(fill_value, DecimalBaseColumn)
- or isinstance(fill_value, cudf.core.column.NumericalColumn)
- and is_integer_dtype(fill_value.dtype)
+ return cudf.Scalar(fill_value, dtype=self.dtype)
+ elif isinstance(fill_value, ColumnBase) and (
+ isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu"
):
- fill_value = fill_value.astype(self.dtype)
- else:
- raise TypeError(
- "Decimal columns only support using fillna with decimal and "
- "integer values"
- )
-
- return super().fillna(fill_value, method=method)
+ return fill_value.astype(self.dtype)
+ raise TypeError(
+ "Decimal columns only support using fillna with decimal and "
+ "integer values"
+ )
def normalize_binop_value(self, other):
if isinstance(other, ColumnBase):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 098cf43421b..76c64e1aea0 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -532,57 +532,26 @@ def find_and_replace(
replaced, df._data["old"], df._data["new"]
)
- def fillna(
- self,
- fill_value: Any = None,
- method: str | None = None,
- ) -> Self:
- """
- Fill null values with *fill_value*
- """
- col = self.nans_to_nulls()
-
- if col.null_count == 0:
- return col
-
- if method is not None:
- return super().fillna(fill_value, method)
-
- if fill_value is None:
- raise ValueError("Must specify either 'fill_value' or 'method'")
-
- if (
- isinstance(fill_value, cudf.Scalar)
- and fill_value.dtype == col.dtype
- ):
- return super().fillna(fill_value, method)
-
- if np.isscalar(fill_value):
- # cast safely to the same dtype as self
- fill_value_casted = col.dtype.type(fill_value)
- if not np.isnan(fill_value) and (fill_value_casted != fill_value):
+ def _validate_fillna_value(
+ self, fill_value: ScalarLike | ColumnLike
+ ) -> cudf.Scalar | ColumnBase:
+ """Align fill_value for .fillna based on column type."""
+ if is_scalar(fill_value):
+ cudf_obj = cudf.Scalar(fill_value)
+ if not as_column(cudf_obj).can_cast_safely(self.dtype):
raise TypeError(
f"Cannot safely cast non-equivalent "
- f"{type(fill_value).__name__} to {col.dtype.name}"
+ f"{type(fill_value).__name__} to {self.dtype.name}"
)
- fill_value = cudf.Scalar(fill_value_casted)
else:
- fill_value = column.as_column(fill_value, nan_as_null=False)
- if is_integer_dtype(col.dtype):
- # cast safely to the same dtype as self
- if fill_value.dtype != col.dtype:
- new_fill_value = fill_value.astype(col.dtype)
- if not (new_fill_value == fill_value).all():
- raise TypeError(
- f"Cannot safely cast non-equivalent "
- f"{fill_value.dtype.type.__name__} to "
- f"{col.dtype.type.__name__}"
- )
- fill_value = new_fill_value
- else:
- fill_value = fill_value.astype(col.dtype)
-
- return super().fillna(fill_value, method)
+ cudf_obj = as_column(fill_value, nan_as_null=False)
+ if not cudf_obj.can_cast_safely(self.dtype): # type: ignore[attr-defined]
+ raise TypeError(
+ f"Cannot safely cast non-equivalent "
+ f"{cudf_obj.dtype.type.__name__} to "
+ f"{self.dtype.type.__name__}"
+ )
+ return cudf_obj.astype(self.dtype)
def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
"""
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 2451a9cc0af..936cd1eccb0 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5,12 +5,11 @@
import re
import warnings
from functools import cached_property
-from typing import TYPE_CHECKING, Any, Sequence, cast, overload
+from typing import TYPE_CHECKING, Sequence, cast, overload
import numpy as np
import pandas as pd
import pyarrow as pa
-from typing_extensions import Self
import cudf
import cudf.api.types
@@ -5838,21 +5837,6 @@ def find_and_replace(
res = self
return libcudf.replace.replace(res, df._data["old"], df._data["new"])
- def fillna(
- self,
- fill_value: Any = None,
- method: str | None = None,
- ) -> Self:
- if fill_value is not None:
- if not is_scalar(fill_value):
- fill_value = column.as_column(fill_value, dtype=self.dtype)
- elif cudf._lib.scalar._is_null_host_scalar(fill_value):
- # Trying to fill with value? Return copy.
- return self.copy(deep=True)
- else:
- fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
- return super().fillna(fill_value, method=method)
-
def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
if (
isinstance(other, (column.ColumnBase, cudf.Scalar))
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 26b449f1863..8f41bcb6422 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -4,12 +4,11 @@
import datetime
import functools
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Sequence, cast
import numpy as np
import pandas as pd
import pyarrow as pa
-from typing_extensions import Self
import cudf
from cudf import _lib as libcudf
@@ -252,22 +251,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
def time_unit(self) -> str:
return np.datetime_data(self.dtype)[0]
- def fillna(
- self,
- fill_value: Any = None,
- method: str | None = None,
- ) -> Self:
- if fill_value is not None:
- if cudf.utils.utils._isnat(fill_value):
- return self.copy(deep=True)
- if is_scalar(fill_value):
- fill_value = cudf.Scalar(fill_value)
- dtype = self.dtype
- fill_value = fill_value.astype(dtype)
- else:
- fill_value = column.as_column(fill_value, nan_as_null=False)
- return super().fillna(fill_value, method)
-
def as_numerical_column(
self, dtype: Dtype
) -> "cudf.core.column.NumericalColumn":
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 76bb9d2a8ed..f0d8157011d 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2980,6 +2980,32 @@ def set_index(
df.index = idx
return df if not inplace else None
+ @_cudf_nvtx_annotate
+ def fillna(
+ self, value=None, method=None, axis=None, inplace=False, limit=None
+ ): # noqa: D102
+ if isinstance(value, (pd.Series, pd.DataFrame)):
+ value = cudf.from_pandas(value)
+ if isinstance(value, cudf.Series):
+ # Align value.index to self.columns
+ value = value.reindex(self._column_names)
+ elif isinstance(value, cudf.DataFrame):
+ if not self.index.equals(value.index):
+ # Align value.index to self.index
+ value = value.reindex(self.index)
+ value = dict(value.items())
+ elif isinstance(value, abc.Mapping):
+ # Align the index of each Series value to self.index
+ value = {
+ key: value.reindex(self.index)
+ if isinstance(value, cudf.Series)
+ else value
+ for key, value in value.items()
+ }
+ return super().fillna(
+ value=value, method=method, axis=axis, inplace=inplace, limit=limit
+ )
+
@_cudf_nvtx_annotate
def where(self, cond, other=None, inplace=False):
from cudf.core._internals.where import (
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 38bff3946d6..8ca71180c00 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2,7 +2,6 @@
from __future__ import annotations
-import copy
import operator
import pickle
import warnings
@@ -20,6 +19,7 @@
import cudf
from cudf import _lib as libcudf
from cudf.api.types import is_dtype_equal, is_scalar
+from cudf.core._compat import PANDAS_LT_300
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import (
ColumnBase,
@@ -38,7 +38,7 @@
if TYPE_CHECKING:
from types import ModuleType
- from cudf._typing import Dtype
+ from cudf._typing import Dtype, ScalarLike
# TODO: It looks like Frame is missing a declaration of `copy`, need to add
@@ -613,8 +613,8 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None:
@_cudf_nvtx_annotate
def fillna(
self,
- value=None,
- method: Literal["ffill", "bfill", "pad", "backfill"] | None = None,
+ value: None | ScalarLike | cudf.Series = None,
+ method: Literal["ffill", "bfill", "pad", "backfill", None] = None,
axis=None,
inplace: bool = False,
limit=None,
@@ -725,6 +725,16 @@ def fillna(
raise ValueError("Cannot specify both 'value' and 'method'.")
if method:
+ # Do not remove until pandas 3.0 support is added.
+ assert (
+ PANDAS_LT_300
+ ), "Need to drop after pandas-3.0 support is added."
+ warnings.warn(
+ f"{type(self).__name__}.fillna with 'method' is "
+ "deprecated and will raise in a future version. "
+ "Use obj.ffill() or obj.bfill() instead.",
+ FutureWarning,
+ )
if method not in {"ffill", "bfill", "pad", "backfill"}:
raise NotImplementedError(
f"Fill method {method} is not supported"
@@ -734,57 +744,24 @@ def fillna(
elif method == "backfill":
method = "bfill"
- # TODO: This logic should be handled in different subclasses since
- # different Frames support different types of values.
- if isinstance(value, cudf.Series):
- value = value.reindex(self._data.names)
- elif isinstance(value, cudf.DataFrame):
- if not self.index.equals(value.index): # type: ignore[attr-defined]
- value = value.reindex(self.index) # type: ignore[attr-defined]
- else:
- value = value
- elif not isinstance(value, abc.Mapping):
- value = {name: copy.deepcopy(value) for name in self._data.names}
- else:
- value = {
- key: value.reindex(self.index) # type: ignore[attr-defined]
- if isinstance(value, cudf.Series)
- else value
- for key, value in value.items()
- }
-
- filled_data = {}
- for col_name, col in self._data.items():
- if col_name in value and method is None:
- replace_val = value[col_name]
- else:
- replace_val = None
- should_fill = (
- (
- col_name in value
- and col.has_nulls(include_nan=True)
- and not libcudf.scalar._is_null_host_scalar(replace_val)
- )
- or method is not None
- or (
- isinstance(col, cudf.core.column.CategoricalColumn)
- and not libcudf.scalar._is_null_host_scalar(replace_val)
- )
+ if is_scalar(value):
+ value = {name: value for name in self._column_names}
+ elif not isinstance(value, (abc.Mapping, cudf.Series)):
+ raise TypeError(
+ f'"value" parameter must be a scalar, dict '
+ f"or Series, but you passed a "
+ f'"{type(value).__name__}"'
)
- if should_fill:
- filled_data[col_name] = col.fillna(replace_val, method)
- else:
- filled_data[col_name] = col.copy(deep=True)
+
+ filled_columns = [
+ col.fillna(value[name], method) if name in value else col.copy()
+ for name, col in self._data.items()
+ ]
return self._mimic_inplace(
- self._from_data(
- data=ColumnAccessor(
- data=filled_data,
- multiindex=self._data.multiindex,
- level_names=self._data.level_names,
- rangeindex=self._data.rangeindex,
- label_dtype=self._data.label_dtype,
- verify=False,
+ self._from_data_like_self(
+ self._data._from_columns_like_self(
+ filled_columns, verify=False
)
),
inplace=inplace,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index f1b74adefed..280a6e92eab 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -52,7 +52,7 @@
_post_process_output_col,
_return_arr_from_dtype,
)
-from cudf.core.window import Rolling
+from cudf.core.window import ExponentialMovingWindow, Rolling
from cudf.utils import docutils, ioutils
from cudf.utils._numba import _CUDFNumbaConfig
from cudf.utils.docutils import copy_docstring
@@ -1853,6 +1853,32 @@ def rolling(
win_type=win_type,
)
+ @copy_docstring(ExponentialMovingWindow)
+ def ewm(
+ self,
+ com: float | None = None,
+ span: float | None = None,
+ halflife: float | None = None,
+ alpha: float | None = None,
+ min_periods: int | None = 0,
+ adjust: bool = True,
+ ignore_na: bool = False,
+ axis: int = 0,
+ times: str | np.ndarray | None = None,
+ ):
+ return ExponentialMovingWindow(
+ self,
+ com=com,
+ span=span,
+ halflife=halflife,
+ alpha=alpha,
+ min_periods=min_periods,
+ adjust=adjust,
+ ignore_na=ignore_na,
+ axis=axis,
+ times=times,
+ )
+
@_cudf_nvtx_annotate
def nans_to_nulls(self):
"""
@@ -2701,11 +2727,24 @@ def sort_index(
if ignore_index:
out = out.reset_index(drop=True)
else:
- labels = sorted(self._data.names, reverse=not ascending)
- out = self[labels]
+ labels = sorted(self._column_names, reverse=not ascending)
+ result_columns = (self._data[label] for label in labels)
if ignore_index:
- out._data.rangeindex = True
- out._data.names = list(range(self._num_columns))
+ ca = ColumnAccessor(
+ dict(enumerate(result_columns)),
+ rangeindex=True,
+ verify=False,
+ )
+ else:
+ ca = ColumnAccessor(
+ dict(zip(labels, result_columns)),
+ rangeindex=self._data.rangeindex,
+ multiindex=self._data.multiindex,
+ level_names=self._data.level_names,
+ label_dtype=self._data.label_dtype,
+ verify=False,
+ )
+ out = self._from_data_like_self(ca)
return self._mimic_inplace(out, inplace=inplace)
@@ -3178,29 +3217,6 @@ def _split(self, splits, keep_index=True):
for i in range(len(splits) + 1)
]
- @_cudf_nvtx_annotate
- def fillna(
- self, value=None, method=None, axis=None, inplace=False, limit=None
- ): # noqa: D102
- if method is not None:
- # Do not remove until pandas 3.0 support is added.
- assert (
- PANDAS_LT_300
- ), "Need to drop after pandas-3.0 support is added."
- warnings.warn(
- f"{type(self).__name__}.fillna with 'method' is "
- "deprecated and will raise in a future version. "
- "Use obj.ffill() or obj.bfill() instead.",
- FutureWarning,
- )
- old_index = self.index
- ret = super().fillna(value, method, axis, inplace, limit)
- if inplace:
- self.index = old_index
- else:
- ret.index = old_index
- return ret
-
@_cudf_nvtx_annotate
def bfill(self, value=None, axis=None, inplace=None, limit=None):
"""
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index c0716d7709a..15ad0813601 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1797,20 +1797,12 @@ def fillna(
):
if isinstance(value, pd.Series):
value = Series.from_pandas(value)
-
- if not (is_scalar(value) or isinstance(value, (abc.Mapping, Series))):
- raise TypeError(
- f'"value" parameter must be a scalar, dict '
- f"or Series, but you passed a "
- f'"{type(value).__name__}"'
- )
-
- if isinstance(value, (abc.Mapping, Series)):
+ elif isinstance(value, abc.Mapping):
value = Series(value)
+ if isinstance(value, cudf.Series):
if not self.index.equals(value.index):
value = value.reindex(self.index)
- value = value._column
-
+ value = {self.name: value._column}
return super().fillna(
value=value, method=method, axis=axis, inplace=inplace, limit=limit
)
diff --git a/python/cudf/cudf/core/window/__init__.py b/python/cudf/cudf/core/window/__init__.py
index 8ea3eb0179b..23522588d33 100644
--- a/python/cudf/cudf/core/window/__init__.py
+++ b/python/cudf/cudf/core/window/__init__.py
@@ -1,3 +1,3 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION
-
+# Copyright (c) 2019-2024, NVIDIA CORPORATION
+from cudf.core.window.ewm import ExponentialMovingWindow
from cudf.core.window.rolling import Rolling
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
new file mode 100644
index 00000000000..21693e106bd
--- /dev/null
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import numpy as np
+
+from cudf._lib.reduce import scan
+from cudf.api.types import is_numeric_dtype
+from cudf.core.window.rolling import _RollingBase
+
+
+class ExponentialMovingWindow(_RollingBase):
+    r"""
+    Provide exponential weighted (EW) functions.
+
+    Available EW functions: ``mean()``
+
+    Exactly one parameter: ``com``, ``span``, ``halflife``, or ``alpha``
+    must be provided.
+
+    Parameters
+    ----------
+    com : float, optional
+        Specify decay in terms of center of mass,
+        :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`.
+    span : float, optional
+        Specify decay in terms of span,
+        :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`.
+    halflife : float, optional
+        Specify decay in terms of half-life,
+        :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for
+        :math:`halflife > 0`.
+    alpha : float, optional
+        Specify smoothing factor :math:`\alpha` directly,
+        :math:`0 < \alpha \leq 1`.
+    min_periods : int, default 0
+        Not Supported
+    adjust : bool, default True
+        Controls assumptions about the first value in the sequence. See
+        https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html
+        for details.
+    ignore_na : bool, default False
+        Not Supported
+    axis : {0, 1}, default 0
+        Not Supported
+    times : str, np.ndarray, Series, default None
+        Not Supported
+
+    Returns
+    -------
+    ``ExponentialMovingWindow`` object
+
+    Raises
+    ------
+    NotImplementedError
+        If ``min_periods``, ``ignore_na``, ``axis``, or ``times`` is given
+        a value other than its default.
+    ValueError
+        If the decay parameters are missing, mutually exclusive, or out of
+        their valid range.
+
+    Notes
+    -----
+    cuDF input data may contain both nulls and nan values. For the purposes
+    of this method, they are treated identically: nulls in cuDF affect the
+    result the same way that nan values would using the equivalent pandas
+    method.
+
+    .. pandas-compat::
+        **cudf.core.window.ExponentialMovingWindow**
+
+        The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times``
+        are not yet supported. Behavior is defined only for data that begins
+        with a valid (non-null) element.
+
+        Currently, only ``mean`` is a supported method.
+
+    Examples
+    --------
+    >>> df = cudf.DataFrame({'B': [0, 1, 2, cudf.NA, 4]})
+    >>> df
+          B
+    0     0
+    1     1
+    2     2
+    3  <NA>
+    4     4
+    >>> df.ewm(com=0.5).mean()
+              B
+    0  0.000000
+    1  0.750000
+    2  1.615385
+    3  1.615385
+    4  3.670213
+
+    >>> df.ewm(com=0.5, adjust=False).mean()
+              B
+    0  0.000000
+    1  0.666667
+    2  1.555556
+    3  1.555556
+    4  3.650794
+    """
+
+    def __init__(
+        self,
+        obj,
+        com: float | None = None,
+        span: float | None = None,
+        halflife: float | None = None,
+        alpha: float | None = None,
+        min_periods: int | None = 0,
+        adjust: bool = True,
+        ignore_na: bool = False,
+        axis: int = 0,
+        times: str | np.ndarray | None = None,
+    ):
+        # `times` is checked by identity on its own. Comparing an array-like
+        # with None as part of a tuple comparison evaluates the truth value
+        # of an elementwise result and raises an unrelated ValueError
+        # instead of the NotImplementedError intended here.
+        uses_unsupported_option = times is not None or (
+            min_periods,
+            ignore_na,
+            axis,
+        ) != (0, False, 0)
+        if uses_unsupported_option:
+            raise NotImplementedError(
+                "The parameters `min_periods`, `ignore_na`, "
+                "`axis`, and `times` are not yet supported."
+            )
+
+        self.obj = obj
+        self.adjust = adjust
+        # All decay specifications are normalized to a center of mass.
+        self.com = get_center_of_mass(com, span, halflife, alpha)
+
+    def mean(self):
+        """
+        Calculate the ewm (exponential weighted moment) mean.
+        """
+        return self._apply_agg("ewma")
+
+    def var(self, bias: bool = False):
+        """Not implemented; always raises ``NotImplementedError``."""
+        raise NotImplementedError("ewmvar not yet supported.")
+
+    def std(self, bias: bool = False):
+        """Not implemented; always raises ``NotImplementedError``."""
+        raise NotImplementedError("ewmstd not yet supported.")
+
+    def corr(self, other=None):
+        """Not implemented; always raises ``NotImplementedError``."""
+        raise NotImplementedError("ewmcorr not yet supported.")
+
+    def cov(self, other=None):
+        """Not implemented; always raises ``NotImplementedError``."""
+        raise NotImplementedError("ewmcov not yet supported.")
+
+    def _apply_agg_series(self, sr, agg_name):
+        """
+        Run the scan aggregation ``agg_name`` over a single numeric series.
+
+        Raises
+        ------
+        TypeError
+            If ``sr`` does not have a numeric dtype.
+        """
+        if not is_numeric_dtype(sr.dtype):
+            raise TypeError("No numeric types to aggregate")
+
+        # libcudf's ewm special-cases nulls only and gives no guarantees for
+        # nans. It treats nulls mathematically the way pandas treats nans in
+        # the same positions, so nans are converted to nulls before the
+        # column is handed over.
+        to_libcudf_column = sr._column.astype("float64").nans_to_nulls()
+
+        return self.obj._from_data_like_self(
+            self.obj._data._from_columns_like_self(
+                [
+                    scan(
+                        agg_name,
+                        to_libcudf_column,
+                        True,
+                        com=self.com,
+                        adjust=self.adjust,
+                    )
+                ]
+            )
+        )
+
+
+def get_center_of_mass(
+    comass: float | None,
+    span: float | None,
+    halflife: float | None,
+    alpha: float | None,
+) -> float:
+    """
+    Translate a single decay specification into a center of mass.
+
+    At most one of the four arguments may be non-``None``; the one that is
+    supplied is range-checked and converted.
+
+    Raises
+    ------
+    ValueError
+        If more than one argument is supplied, if none is supplied, or if
+        the supplied value is outside its valid range.
+    """
+    if count_not_none(comass, span, halflife, alpha) > 1:
+        raise ValueError(
+            "comass, span, halflife, and alpha are mutually exclusive"
+        )
+
+    # Each branch validates its own domain and returns straight away; the
+    # domain checks ensure 0 < alpha <= 1 for the implied smoothing factor.
+    if comass is not None:
+        if comass < 0:
+            raise ValueError("comass must satisfy: comass >= 0")
+        return float(comass)
+
+    if span is not None:
+        if span < 1:
+            raise ValueError("span must satisfy: span >= 1")
+        return float((span - 1) / 2)
+
+    if halflife is not None:
+        if halflife <= 0:
+            raise ValueError("halflife must satisfy: halflife > 0")
+        smoothing = 1 - np.exp(np.log(0.5) / halflife)
+        return float(1 / smoothing - 1)
+
+    if alpha is None:
+        raise ValueError("Must pass one of comass, span, halflife, or alpha")
+    if alpha <= 0 or alpha > 1:
+        raise ValueError("alpha must satisfy: 0 < alpha <= 1")
+    return float((1 - alpha) / alpha)
+
+
+def count_not_none(*args) -> int:
+    """
+    Count how many of the positional arguments are not ``None``.
+    """
+    return len([value for value in args if value is not None])
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 7d140a1ffa5..29391c68471 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -14,7 +14,27 @@
from cudf.utils.utils import GetAttrGetItemMixin
-class Rolling(GetAttrGetItemMixin, Reducible):
+class _RollingBase:
+    """
+    Aggregation dispatch shared by all kinds of rolling/window objects.
+
+    Relies on ``self.obj`` and ``self._apply_agg_series``, neither of which
+    is defined in this class.
+    """
+
+    def _apply_agg(self, agg_name):
+        """Apply ``agg_name`` to ``self.obj``, per column for non-Series."""
+        if not isinstance(self.obj, cudf.Series):
+            return self._apply_agg_dataframe(self.obj, agg_name)
+        return self._apply_agg_series(self.obj, agg_name)
+
+    def _apply_agg_dataframe(self, df, agg_name):
+        """Aggregate each column of ``df`` and reassemble a DataFrame."""
+        aggregated = cudf.DataFrame({})
+        for position, label in enumerate(df.columns):
+            # Inserting at the running position keeps the column order.
+            aggregated.insert(
+                position, label, self._apply_agg_series(df[label], agg_name)
+            )
+        aggregated.index = df.index
+        return aggregated
+
+
+class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible):
"""
Rolling window calculations.
diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 698dd946022..0ba432d6d0e 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -789,7 +789,7 @@ def Index__new__(cls, *args, **kwargs):
ExponentialMovingWindow = make_intermediate_proxy_type(
"ExponentialMovingWindow",
- _Unusable,
+ cudf.core.window.ewm.ExponentialMovingWindow,
pd.core.window.ewm.ExponentialMovingWindow,
)
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 1540c6850e7..dfb729cae6b 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -17,7 +17,7 @@
import numpy as np
from ..options import _env_get_bool
-from ..testing._utils import assert_eq
+from ..testing import assert_eq
from .annotation import nvtx
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
index 8ab1b816adb..6e0767b3b8c 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
@@ -2,9 +2,11 @@
import io
+import pyarrow as pa
import pytest
import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
@pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
@@ -12,8 +14,24 @@ def io_class(request):
return request.param
+def _skip_invalid_sinks(io_class, sink):
+    """
+    Skip the current test when ``sink`` cannot be used with SinkInfo.
+
+    Only applies when ``io_class`` is ``plc.io.SinkInfo``; ``bytes`` and
+    ``NativeFileDatasource`` inputs are skipped in that case.
+    """
+    if io_class is not plc.io.SinkInfo:
+        return
+    if isinstance(sink, (bytes, NativeFileDatasource)):
+        pytest.skip(f"{sink} is not a valid input for SinkInfo")
+
+
@pytest.mark.parametrize(
- "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
+ "source",
+ [
+ "a.txt",
+ b"hello world",
+ io.BytesIO(b"hello world"),
+ NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+ ],
)
def test_source_info_ctor(io_class, source, tmp_path):
if isinstance(source, str):
@@ -21,8 +39,7 @@ def test_source_info_ctor(io_class, source, tmp_path):
file.write_bytes("hello world".encode("utf-8"))
source = str(file)
- if io_class is plc.io.SinkInfo and isinstance(source, bytes):
- pytest.skip("bytes is not a valid input for SinkInfo")
+    _skip_invalid_sinks(io_class, source)
io_class([source])
@@ -33,6 +50,10 @@ def test_source_info_ctor(io_class, source, tmp_path):
["a.txt", "a.txt"],
[b"hello world", b"hello there"],
[io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
+ [
+ NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+ NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+ ],
],
)
def test_source_info_ctor_multiple(io_class, sources, tmp_path):
@@ -42,8 +63,8 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path):
file = tmp_path / source
file.write_bytes("hello world".encode("utf-8"))
sources[i] = str(file)
- elif io_class is plc.io.SinkInfo and isinstance(source, bytes):
- pytest.skip("bytes is not a valid input for SinkInfo")
+
+        _skip_invalid_sinks(io_class, source)
io_class(sources)
@@ -58,6 +79,11 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path):
io.BytesIO(b"hello there"),
b"hello world",
],
+ [
+ NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+ "awef.txt",
+ b"hello world",
+ ],
],
)
def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
@@ -69,7 +95,11 @@ def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
file = tmp_path / source
file.write_bytes("hello world".encode("utf-8"))
sources[i] = str(file)
- elif io_class is plc.io.SinkInfo and isinstance(source, bytes):
- pytest.skip("bytes is not a valid input for SinkInfo")
+        _skip_invalid_sinks(io_class, source)
with pytest.raises(ValueError):
io_class(sources)
+
+
+def test_source_info_invalid():
+    """SourceInfo must reject a source of an unsupported type."""
+    unsupported_source = 123
+    with pytest.raises(ValueError):
+        plc.io.SourceInfo([unsupported_source])
diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py
index 1843344bc81..4e92b43b9f9 100644
--- a/python/cudf/cudf/testing/__init__.py
+++ b/python/cudf/cudf/testing/__init__.py
@@ -1,7 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from cudf.testing.testing import (
+ assert_eq,
assert_frame_equal,
assert_index_equal,
+ assert_neq,
assert_series_equal,
)
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index e067d15af4c..a6a2d4eea00 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -2,12 +2,10 @@
import itertools
import string
-import warnings
from collections import abc
from contextlib import contextmanager
from decimal import Decimal
-import cupy
import numpy as np
import pandas as pd
import pytest
@@ -15,7 +13,6 @@
from numba.core.typing.templates import AbstractTemplate
from numba.cuda.cudadecl import registry as cuda_decl_registry
from numba.cuda.cudaimpl import lower as cuda_lower
-from pandas import testing as tm
import cudf
from cudf._lib.null_mask import bitmask_allocation_size_bytes
@@ -113,81 +110,6 @@ def count_zero(arr):
return np.count_nonzero(arr == 0)
-def assert_eq(left, right, **kwargs):
- """Assert that two cudf-like things are equivalent
-
- This equality test works for pandas/cudf dataframes/series/indexes/scalars
- in the same way, and so makes it easier to perform parametrized testing
- without switching between assert_frame_equal/assert_series_equal/...
- functions.
- """
- # dtypes that we support but Pandas doesn't will convert to
- # `object`. Check equality before that happens:
- if kwargs.get("check_dtype", True):
- if hasattr(left, "dtype") and hasattr(right, "dtype"):
- if isinstance(
- left.dtype, cudf.core.dtypes._BaseDtype
- ) and not isinstance(
- left.dtype, cudf.CategoricalDtype
- ): # leave categorical comparison to Pandas
- assert_eq(left.dtype, right.dtype)
-
- if hasattr(left, "to_pandas"):
- left = left.to_pandas()
- if hasattr(right, "to_pandas"):
- right = right.to_pandas()
- if isinstance(left, cupy.ndarray):
- left = cupy.asnumpy(left)
- if isinstance(right, cupy.ndarray):
- right = cupy.asnumpy(right)
-
- if isinstance(left, (pd.DataFrame, pd.Series, pd.Index)):
- # TODO: A warning is emitted from the function
- # pandas.testing.assert_[series, frame, index]_equal for some inputs:
- # "DeprecationWarning: elementwise comparison failed; this will raise
- # an error in the future."
- # or "FutureWarning: elementwise ..."
- # This warning comes from a call from pandas to numpy. It is ignored
- # here because it cannot be fixed within cudf.
- with warnings.catch_warnings():
- warnings.simplefilter(
- "ignore", (DeprecationWarning, FutureWarning)
- )
- if isinstance(left, pd.DataFrame):
- tm.assert_frame_equal(left, right, **kwargs)
- elif isinstance(left, pd.Series):
- tm.assert_series_equal(left, right, **kwargs)
- else:
- tm.assert_index_equal(left, right, **kwargs)
-
- elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
- if np.issubdtype(left.dtype, np.floating) and np.issubdtype(
- right.dtype, np.floating
- ):
- assert np.allclose(left, right, equal_nan=True)
- else:
- assert np.array_equal(left, right)
- else:
- # Use the overloaded __eq__ of the operands
- if left == right:
- return True
- elif any(np.issubdtype(type(x), np.floating) for x in (left, right)):
- np.testing.assert_almost_equal(left, right)
- else:
- np.testing.assert_equal(left, right)
- return True
-
-
-def assert_neq(left, right, **kwargs):
- __tracebackhide__ = True
- try:
- assert_eq(left, right, **kwargs)
- except AssertionError:
- pass
- else:
- raise AssertionError
-
-
def assert_exceptions_equal(
lfunc,
rfunc,
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index dffbbe92fc1..e56c8d867cb 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -2,9 +2,12 @@
from __future__ import annotations
+import warnings
+
import cupy as cp
import numpy as np
import pandas as pd
+from pandas import testing as tm
import cudf
from cudf._lib.unary import is_nan
@@ -708,3 +711,100 @@ def assert_frame_equal(
atol=atol,
obj=f'Column name="{col}"',
)
+
+
+def assert_eq(left, right, **kwargs):
+    """Assert that two cudf-like things are equivalent
+
+    Parameters
+    ----------
+    left
+        Object to compare
+    right
+        Object to compare
+    kwargs
+        Keyword arguments to control behaviour of comparisons. See
+        :func:`assert_frame_equal`, :func:`assert_series_equal`, and
+        :func:`assert_index_equal`.
+
+    Returns
+    -------
+    bool or None
+        ``True`` when the comparison goes through the final scalar/object
+        branch; ``None`` when the inputs are compared as pandas objects or
+        as numpy arrays.
+
+    Notes
+    -----
+    This equality test works for pandas/cudf dataframes/series/indexes/scalars
+    in the same way, and so makes it easier to perform parametrized testing
+    without switching between assert_frame_equal/assert_series_equal/...
+    functions.
+
+    Objects exposing ``to_pandas`` are converted to pandas, and cupy arrays
+    to numpy, before being compared.
+
+    Raises
+    ------
+    AssertionError
+        If the two objects do not compare equal.
+    """
+    # dtypes that we support but Pandas doesn't will convert to
+    # `object`. Check equality before that happens:
+    if kwargs.get("check_dtype", True):
+        if hasattr(left, "dtype") and hasattr(right, "dtype"):
+            if isinstance(
+                left.dtype, cudf.core.dtypes._BaseDtype
+            ) and not isinstance(
+                left.dtype, cudf.CategoricalDtype
+            ):  # leave categorical comparison to Pandas
+                assert_eq(left.dtype, right.dtype)
+
+    # Normalize both operands to host-side pandas/numpy objects. This must
+    # happen after the dtype check above.
+    if hasattr(left, "to_pandas"):
+        left = left.to_pandas()
+    if hasattr(right, "to_pandas"):
+        right = right.to_pandas()
+    if isinstance(left, cp.ndarray):
+        left = cp.asnumpy(left)
+    if isinstance(right, cp.ndarray):
+        right = cp.asnumpy(right)
+
+    # The branch is selected from the type of `left` alone.
+    if isinstance(left, (pd.DataFrame, pd.Series, pd.Index)):
+        # TODO: A warning is emitted from the function
+        # pandas.testing.assert_[series, frame, index]_equal for some inputs:
+        # "DeprecationWarning: elementwise comparison failed; this will raise
+        # an error in the future."
+        # or "FutureWarning: elementwise ..."
+        # This warning comes from a call from pandas to numpy. It is ignored
+        # here because it cannot be fixed within cudf.
+        with warnings.catch_warnings():
+            warnings.simplefilter(
+                "ignore", (DeprecationWarning, FutureWarning)
+            )
+            if isinstance(left, pd.DataFrame):
+                tm.assert_frame_equal(left, right, **kwargs)
+            elif isinstance(left, pd.Series):
+                tm.assert_series_equal(left, right, **kwargs)
+            else:
+                tm.assert_index_equal(left, right, **kwargs)
+
+    elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
+        # Floating arrays are compared approximately, with NaNs in matching
+        # positions counted as equal; everything else must match exactly.
+        if np.issubdtype(left.dtype, np.floating) and np.issubdtype(
+            right.dtype, np.floating
+        ):
+            assert np.allclose(left, right, equal_nan=True)
+        else:
+            assert np.array_equal(left, right)
+    else:
+        # Use the overloaded __eq__ of the operands
+        if left == right:
+            return True
+        elif any(np.issubdtype(type(x), np.floating) for x in (left, right)):
+            # At least one numpy floating scalar: compare approximately.
+            np.testing.assert_almost_equal(left, right)
+        else:
+            np.testing.assert_equal(left, right)
+        return True
+
+
+def assert_neq(left, right, **kwargs):
+    """Assert that two cudf-like things are not equal.
+
+    Provides the negation of the meaning of :func:`assert_eq`: passes when
+    :func:`assert_eq` raises ``AssertionError`` for the same arguments and
+    raises ``AssertionError`` when it does not.
+    """
+    __tracebackhide__ = True
+    try:
+        assert_eq(left, right, **kwargs)
+    except AssertionError:
+        # The objects differ, which is the expected outcome.
+        return
+    raise AssertionError
diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 30d8f1c8422..437bc4cba67 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
import itertools
import os
@@ -11,7 +11,7 @@
import rmm # noqa: F401
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
_CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent)
diff --git a/python/cudf/cudf/tests/dataframe/test_conversion.py b/python/cudf/cudf/tests/dataframe/test_conversion.py
index fa7e5ec1d4c..d1de7245634 100644
--- a/python/cudf/cudf/tests/dataframe/test_conversion.py
+++ b/python/cudf/cudf/tests/dataframe/test_conversion.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
import pandas as pd
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_convert_dtypes():
diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
index ad81609470c..57948afe1d8 100644
--- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py
+++ b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
@@ -8,7 +8,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/groupby/test_computation.py b/python/cudf/cudf/tests/groupby/test_computation.py
index 04c56ef7462..630fcdc4dce 100644
--- a/python/cudf/cudf/tests/groupby/test_computation.py
+++ b/python/cudf/cudf/tests/groupby/test_computation.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
import pandas as pd
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
index 04b483e08dc..ab2b16d263c 100644
--- a/python/cudf/cudf/tests/groupby/test_groupby_obj.py
+++ b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
@@ -2,7 +2,7 @@
from numpy.testing import assert_array_equal
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_groupby_14955():
diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_indexing.py
index 57e8bc1c2d8..43b6183fca5 100644
--- a/python/cudf/cudf/tests/groupby/test_indexing.py
+++ b/python/cudf/cudf/tests/groupby/test_indexing.py
@@ -1,6 +1,6 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_rank_return_type_compatible_mode():
diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py
index 78d7fbfd879..f7138036ddf 100644
--- a/python/cudf/cudf/tests/groupby/test_transform.py
+++ b/python/cudf/cudf/tests/groupby/test_transform.py
@@ -4,7 +4,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.fixture(params=[False, True], ids=["no-null-keys", "null-keys"])
diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
index ee4d0f7e816..4c0ce2ed191 100644
--- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
+++ b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
@@ -4,7 +4,7 @@
import pandas as pd
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_slice_datetimetz_index():
diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
index 77b32b8ce89..7cc629270b1 100644
--- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
+++ b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
@@ -4,7 +4,7 @@
import pandas as pd
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_tz_localize():
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index d59041e32d5..87b76ab7609 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -7,7 +7,7 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.index import IntervalIndex, interval_range
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_interval_constructor_default_closed():
diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py
index acba13bb5b0..e9406d080d4 100644
--- a/python/cudf/cudf/tests/input_output/test_text.py
+++ b/python/cudf/cudf/tests/input_output/test_text.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
from io import StringIO
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py
index 43ac35e41a6..e1dd359e1ba 100644
--- a/python/cudf/cudf/tests/series/test_conversion.py
+++ b/python/cudf/cudf/tests/series/test_conversion.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
import pandas as pd
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 302ef19852d..cea86a5499e 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -9,7 +9,7 @@
import cudf
from cudf import date_range
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def _get_all_zones():
diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py
index 8870eb421c7..a11022c1a17 100644
--- a/python/cudf/cudf/tests/test_apply_rows.py
+++ b/python/cudf/cudf/tests/test_apply_rows.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
import pytest
import cudf
from cudf.core.column import column
-from cudf.testing._utils import assert_eq, gen_rand_series
+from cudf.testing import assert_eq
+from cudf.testing._utils import gen_rand_series
def _kernel_multiply(a, b, out):
diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
index d720e6ce2ce..ce1dcce5887 100644
--- a/python/cudf/cudf/tests/test_applymap.py
+++ b/python/cudf/cudf/tests/test_applymap.py
@@ -4,7 +4,7 @@
from cudf import NA, DataFrame
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing import _utils as utils
+from cudf.testing import assert_eq
@pytest.mark.skipif(
@@ -46,7 +46,7 @@ def test_applymap_dataframe(data, func, na_action, request):
with pytest.warns(FutureWarning):
got = gdf.applymap(func, na_action=na_action)
- utils.assert_eq(expect, got, check_dtype=False)
+ assert_eq(expect, got, check_dtype=False)
def test_applymap_raise_cases():
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index e6b89e2c5fa..773141ee71a 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -5,7 +5,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
# To determine if NEP18 is available in the current version of NumPy we simply
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index b036c1f13f3..41b9188f036 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -15,11 +15,8 @@
PANDAS_LT_300,
PANDAS_VERSION,
)
-from cudf.testing._utils import (
- assert_eq,
- expect_warning_if,
- set_random_null_mask_inplace,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import expect_warning_if, set_random_null_mask_inplace
_UFUNCS = [
obj
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 238e8d990cc..2ec1d1d2f28 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -23,7 +23,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
from cudf.testing.dataset_generator import rand_dataframe
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index fa371914c3e..7d8c3b53115 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -15,7 +15,7 @@
from cudf import Index, Series
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq
from cudf.utils.dtypes import (
BOOL_TYPES,
DATETIME_TYPES,
@@ -194,7 +194,7 @@ def test_series_binop(binop, obj_class):
if obj_class == "Index":
result = Series(result)
- utils.assert_eq(result, expect)
+ assert_eq(result, expect)
@pytest.mark.parametrize("binop", _binops)
@@ -318,7 +318,7 @@ def test_series_compare_nulls(cmpop, dtypes):
expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask])
got = cmpop(lser, rser)
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.fixture
@@ -349,7 +349,7 @@ def test_str_series_compare_str(
Series.from_pandas(str_series_cmp_data), "a"
)
- utils.assert_eq(expect, got.to_pandas(nullable=True))
+ assert_eq(expect, got.to_pandas(nullable=True))
def test_str_series_compare_str_reflected(
@@ -360,7 +360,7 @@ def test_str_series_compare_str_reflected(
"a", Series.from_pandas(str_series_cmp_data)
)
- utils.assert_eq(expect, got.to_pandas(nullable=True))
+ assert_eq(expect, got.to_pandas(nullable=True))
def test_str_series_compare_num(
@@ -371,7 +371,7 @@ def test_str_series_compare_num(
Series.from_pandas(str_series_cmp_data), cmp_scalar
)
- utils.assert_eq(expect, got.to_pandas(nullable=True))
+ assert_eq(expect, got.to_pandas(nullable=True))
def test_str_series_compare_num_reflected(
@@ -382,7 +382,7 @@ def test_str_series_compare_num_reflected(
cmp_scalar, Series.from_pandas(str_series_cmp_data)
)
- utils.assert_eq(expect, got.to_pandas(nullable=True))
+ assert_eq(expect, got.to_pandas(nullable=True))
@pytest.mark.parametrize("obj_class", ["Series", "Index"])
@@ -612,12 +612,12 @@ def test_different_shapes_and_columns(binop):
# Empty frame on the right side
pd_frame = binop(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({}))
cd_frame = binop(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({}))
- utils.assert_eq(cd_frame, pd_frame)
+ assert_eq(cd_frame, pd_frame)
# Empty frame on the left side
pd_frame = pd.DataFrame({}) + pd.DataFrame({"x": [1, 2]})
cd_frame = cudf.DataFrame({}) + cudf.DataFrame({"x": [1, 2]})
- utils.assert_eq(cd_frame, pd_frame)
+ assert_eq(cd_frame, pd_frame)
# Note: the below rely on a discrepancy between cudf and pandas
# While pandas inserts columns in alphabetical order, cudf inserts in the
@@ -627,12 +627,12 @@ def test_different_shapes_and_columns(binop):
# More rows on the left side
pd_frame = pd.DataFrame({"x": [1, 2, 3]}) + pd.DataFrame({"y": [1, 2]})
cd_frame = cudf.DataFrame({"x": [1, 2, 3]}) + cudf.DataFrame({"y": [1, 2]})
- utils.assert_eq(cd_frame, pd_frame)
+ assert_eq(cd_frame, pd_frame)
# More rows on the right side
pd_frame = pd.DataFrame({"x": [1, 2]}) + pd.DataFrame({"y": [1, 2, 3]})
cd_frame = cudf.DataFrame({"x": [1, 2]}) + cudf.DataFrame({"y": [1, 2, 3]})
- utils.assert_eq(cd_frame, pd_frame)
+ assert_eq(cd_frame, pd_frame)
@pytest.mark.parametrize("binop", _binops)
@@ -650,7 +650,7 @@ def test_different_shapes_and_same_columns(binop):
)
# cast x as float64 so it matches pandas dtype
cd_frame["x"] = cd_frame["x"].astype(np.float64)
- utils.assert_eq(cd_frame, pd_frame)
+ assert_eq(cd_frame, pd_frame)
@pytest.mark.parametrize("binop", _binops)
@@ -680,7 +680,7 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop):
# cast x and y as float64 so it matches pandas dtype
cd_frame["x"] = cd_frame["x"].astype(np.float64)
cd_frame["y"] = cd_frame["y"].astype(np.float64)
- utils.assert_eq(cd_frame, pd_frame)
+ assert_eq(cd_frame, pd_frame)
pdf1 = pd.DataFrame({"x": [1, 1]}, index=["a", "a"])
pdf2 = pd.DataFrame({"x": [2]}, index=["a"])
@@ -688,7 +688,7 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop):
gdf2 = cudf.DataFrame.from_pandas(pdf2)
pd_frame = binop(pdf1, pdf2)
cd_frame = binop(gdf1, gdf2)
- utils.assert_eq(pd_frame, cd_frame)
+ assert_eq(pd_frame, cd_frame)
@pytest.mark.parametrize(
@@ -717,12 +717,12 @@ def test_df_different_index_shape(df2, binop):
def test_boolean_scalar_binop(op):
psr = pd.Series(np.random.choice([True, False], 10))
gsr = cudf.from_pandas(psr)
- utils.assert_eq(op(psr, True), op(gsr, True))
- utils.assert_eq(op(psr, False), op(gsr, False))
+ assert_eq(op(psr, True), op(gsr, True))
+ assert_eq(op(psr, False), op(gsr, False))
# cuDF scalar
- utils.assert_eq(op(psr, True), op(gsr, cudf.Scalar(True)))
- utils.assert_eq(op(psr, False), op(gsr, cudf.Scalar(False)))
+ assert_eq(op(psr, True), op(gsr, cudf.Scalar(True)))
+ assert_eq(op(psr, False), op(gsr, cudf.Scalar(False)))
@pytest.mark.parametrize("func", _operators_arithmetic)
@@ -747,7 +747,7 @@ def test_operator_func_between_series(dtype, func, has_nulls, fill_value):
pdf_series_b, fill_value=fill_value
)
- utils.assert_eq(pdf_result, gdf_result)
+ assert_eq(pdf_result, gdf_result)
@pytest.mark.parametrize("func", _operators_arithmetic)
@@ -773,7 +773,7 @@ def test_operator_func_series_and_scalar(
scalar, fill_value=fill_value
)
- utils.assert_eq(pdf_series_result, gdf_series_result)
+ assert_eq(pdf_series_result, gdf_series_result)
_permu_values = [0, 1, None, np.nan]
@@ -812,9 +812,9 @@ def test_operator_func_between_series_logical(
and np.isnan(fill_value)
):
with pytest.raises(AssertionError):
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
return
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize("dtype", ["float32", "float64"])
@@ -851,7 +851,7 @@ def test_operator_func_series_and_scalar_logical(
expect = pdf_series_result
got = gdf_series_result.to_pandas(nullable=True)
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize("func", _operators_arithmetic)
@@ -887,7 +887,7 @@ def gen_df():
got = getattr(gdf1, func)(gdf2, fill_value=fill_value)
expect = getattr(pdf1, func)(pdf2, fill_value=fill_value)[list(got._data)]
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize("func", _operators_comparison)
@@ -923,7 +923,7 @@ def gen_df():
got = getattr(gdf1, func)(gdf2)
expect = getattr(pdf1, func)(pdf2)[list(got._data)]
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize(
@@ -949,7 +949,7 @@ def gen_df():
def test_binop_bool_uint(func, rhs):
psr = pd.Series([True, False, False])
gsr = cudf.from_pandas(psr)
- utils.assert_eq(
+ assert_eq(
getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False
)
@@ -977,7 +977,7 @@ def test_floordiv_zero_float64(series_dtype, divisor_dtype, scalar_divisor):
else:
pd_div = pd.Series([0], dtype=divisor_dtype)
cudf_div = cudf.from_pandas(pd_div)
- utils.assert_eq(sr // pd_div, cr // cudf_div)
+ assert_eq(sr // pd_div, cr // cudf_div)
@pytest.mark.parametrize("scalar_divisor", [False, True])
@@ -1023,27 +1023,27 @@ def test_floordiv_zero_bool(scalar_divisor):
def test_rmod_zero_nan(dtype):
sr = pd.Series([1, 1, 0], dtype=dtype)
cr = cudf.from_pandas(sr)
- utils.assert_eq(1 % sr, 1 % cr)
+ assert_eq(1 % sr, 1 % cr)
expected_dtype = np.float64 if cr.dtype.kind != "f" else dtype
- utils.assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype))
+ assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype))
def test_series_misc_binop():
pds = pd.Series([1, 2, 4], name="abc xyz")
gds = cudf.Series([1, 2, 4], name="abc xyz")
- utils.assert_eq(pds + 1, gds + 1)
- utils.assert_eq(1 + pds, 1 + gds)
+ assert_eq(pds + 1, gds + 1)
+ assert_eq(1 + pds, 1 + gds)
- utils.assert_eq(pds + pds, gds + gds)
+ assert_eq(pds + pds, gds + gds)
pds1 = pd.Series([1, 2, 4], name="hello world")
gds1 = cudf.Series([1, 2, 4], name="hello world")
- utils.assert_eq(pds + pds1, gds + gds1)
- utils.assert_eq(pds1 + pds, gds1 + gds)
+ assert_eq(pds + pds1, gds + gds1)
+ assert_eq(pds1 + pds, gds1 + gds)
- utils.assert_eq(pds1 + pds + 5, gds1 + gds + 5)
+ assert_eq(pds1 + pds + 5, gds1 + gds + 5)
def test_int8_float16_binop():
@@ -1051,7 +1051,7 @@ def test_int8_float16_binop():
b = np.float16(2)
expect = cudf.Series([0.5])
got = a / b
- utils.assert_eq(expect, got, check_dtype=False)
+ assert_eq(expect, got, check_dtype=False)
@pytest.mark.parametrize("dtype", ["int64", "float64", "str"])
@@ -1061,7 +1061,7 @@ def test_vector_to_none_binops(dtype):
expect = Series([None] * 4).astype(dtype)
got = data + None
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
def dtype_scalar(val, dtype):
@@ -1747,12 +1747,12 @@ def test_datetime_dateoffset_binaryop(
expect = op(psr, poffset)
got = op(gsr, goffset)
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
expect = op(psr, -poffset)
got = op(gsr, -goffset)
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize(
@@ -1793,7 +1793,7 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
expect = op(psr, poffset)
got = op(gsr, goffset)
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12])
@@ -1840,7 +1840,7 @@ def test_datetime_dateoffset_binaryop_reflected(
# TODO: Remove check_dtype once we get some clarity on:
# https://github.com/pandas-dev/pandas/issues/57448
- utils.assert_eq(expect, got, check_dtype=False)
+ assert_eq(expect, got, check_dtype=False)
with pytest.raises(TypeError):
poffset - psr
@@ -1878,7 +1878,7 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype):
expected = data.to_pandas() == val
got = data == val
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
@pytest.mark.parametrize(
@@ -2302,7 +2302,7 @@ def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype):
got = op(a, b)
assert expect.dtype == got.dtype
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize(
@@ -2355,7 +2355,7 @@ def test_binops_reflect_decimal(
got = getattr(a, op)(b)
assert expect.dtype == got.dtype
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize("powers", [0, 1, 2, 3])
@@ -2371,7 +2371,7 @@ def test_binops_decimal_pow(powers):
)
ps = s.to_pandas()
- utils.assert_eq(s**powers, ps**powers, check_dtype=False)
+ assert_eq(s**powers, ps**powers, check_dtype=False)
def test_binops_raise_error():
@@ -2554,7 +2554,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
actual = op(lhs, rhs)
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
@pytest.mark.parametrize(
@@ -2804,7 +2804,7 @@ def decimal_series(input, dtype):
got = op(lhs, rhs)
assert expect.dtype == got.dtype
- utils.assert_eq(expect, got)
+ assert_eq(expect, got)
@pytest.mark.parametrize(
@@ -2979,7 +2979,7 @@ def test_binops_decimal_scalar_compare(args, reflected):
actual = op(lhs, rhs)
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
@pytest.mark.parametrize(
@@ -3042,7 +3042,7 @@ def test_equality_ops_index_mismatch(fn):
expected = getattr(pa, fn)(pb)
actual = getattr(a, fn)(b).to_pandas(nullable=True)
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
def generate_test_null_equals_columnops_data():
@@ -3132,7 +3132,7 @@ def test_empty_column(binop, data, scalar):
got = binop(gdf, scalar)
expected = binop(pdf, scalar)
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
@pytest.mark.parametrize(
@@ -3179,7 +3179,7 @@ def test_binops_dot(df, other):
expected = pdf @ host_other
got = df @ other
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_dot_preserve_index():
@@ -3187,7 +3187,7 @@ def test_binop_dot_preserve_index():
df = cudf.DataFrame(np.eye(2), columns=["A", "B"], index=["A", "B"])
result = ser @ df
expected = ser.to_pandas() @ df.to_pandas()
- utils.assert_eq(result, expected)
+ assert_eq(result, expected)
def test_binop_series_with_repeated_index():
@@ -3198,7 +3198,7 @@ def test_binop_series_with_repeated_index():
gsr2 = cudf.from_pandas(psr2)
expected = psr1 - psr2
got = gsr1 - gsr2
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_series_series():
@@ -3209,7 +3209,7 @@ def test_binop_integer_power_series_series():
ps_exponent = gs_exponent.to_pandas()
expected = ps_base**ps_exponent
got = gs_base**gs_exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_series_scalar():
@@ -3219,7 +3219,7 @@ def test_binop_integer_power_series_scalar():
ps_base = gs_base.to_pandas()
expected = ps_base**exponent.value
got = gs_base**exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_series_int():
@@ -3229,7 +3229,7 @@ def test_binop_integer_power_series_int():
ps_base = gs_base.to_pandas()
expected = ps_base**exponent
got = gs_base**exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_scalar_series():
@@ -3239,7 +3239,7 @@ def test_binop_integer_power_scalar_series():
ps_exponent = gs_exponent.to_pandas()
expected = base.value**ps_exponent
got = base**gs_exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_scalar_scalar():
@@ -3248,7 +3248,7 @@ def test_binop_integer_power_scalar_scalar():
exponent = cudf.Scalar(1)
expected = base.value**exponent.value
got = base**exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_scalar_int():
@@ -3257,7 +3257,7 @@ def test_binop_integer_power_scalar_int():
exponent = 1
expected = base.value**exponent
got = base**exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_int_series():
@@ -3267,7 +3267,7 @@ def test_binop_integer_power_int_series():
ps_exponent = gs_exponent.to_pandas()
expected = base**ps_exponent
got = base**gs_exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_binop_integer_power_int_scalar():
@@ -3276,7 +3276,7 @@ def test_binop_integer_power_int_scalar():
exponent = cudf.Scalar(1)
expected = base**exponent.value
got = base**exponent
- utils.assert_eq(expected, got)
+ assert_eq(expected, got)
def test_numpy_int_scalar_binop():
@@ -3291,7 +3291,7 @@ def test_binop_index_series(op):
actual = op(gi, gs)
expected = op(gi.to_pandas(), gs.to_pandas())
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
@pytest.mark.parametrize("name1", utils.SERIES_OR_INDEX_NAMES)
@@ -3307,7 +3307,7 @@ def test_binop_index_dt_td_series_with_names(name1, name2):
expected = gi.to_pandas() + gs.to_pandas()
actual = gi + gs
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
@pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]])
@@ -3319,9 +3319,9 @@ def test_binop_eq_ne_index_series(data1, data2):
actual = gi == gs
expected = gi.to_pandas() == gs.to_pandas()
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
actual = gi != gs
expected = gi.to_pandas() != gs.to_pandas()
- utils.assert_eq(expected, actual)
+ assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index c36595192e4..9b6029582ce 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -11,11 +11,8 @@
import pytest
import cudf
-from cudf.testing._utils import (
- NUMERIC_TYPES,
- assert_eq,
- assert_exceptions_equal,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal
@contextmanager
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index a8a297c155f..ea919c786b9 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -9,7 +9,8 @@
import cudf
from cudf._lib.transform import mask_to_bools
from cudf.core.column.column import as_column
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
from cudf.utils import dtypes as dtypeutils
dtypes = sorted(
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index f1f6097d6a9..f3343c37d1d 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -6,7 +6,7 @@
import cudf
from cudf.core.column_accessor import ColumnAccessor
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
simple_test_data = [
{},
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 4b43a33c8c8..c1c03de48d4 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -10,11 +10,8 @@
import cudf
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
-from cudf.testing._utils import (
- assert_eq,
- assert_exceptions_equal,
- expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
@contextmanager
diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py
index a65ab1780b6..fe86df99d35 100644
--- a/python/cudf/cudf/tests/test_contains.py
+++ b/python/cudf/cudf/tests/test_contains.py
@@ -9,12 +9,8 @@
import cudf
from cudf import Series
from cudf.core.index import Index, RangeIndex
-from cudf.testing._utils import (
- DATETIME_TYPES,
- NUMERIC_TYPES,
- TIMEDELTA_TYPES,
- assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
def cudf_date_series(start, stop, freq):
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 0bc9ffa8004..9b6f82ec705 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -8,7 +8,8 @@
import cudf
from cudf import Series
from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES
pytestmark = pytest.mark.spilling
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 5009a7f2628..09617306606 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -18,7 +18,8 @@
import cudf
from cudf import read_csv
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
def make_numeric_dataframe(nrows, dtype):
diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py
index 7fdf9754534..dc892caba3b 100644
--- a/python/cudf/cudf/tests/test_cuda_apply.py
+++ b/python/cudf/cudf/tests/test_cuda_apply.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
"""
Test method that apply GPU kernel to a frame.
@@ -9,7 +9,7 @@
from numba import cuda
from cudf import DataFrame
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129])
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 06d63561fc1..29f2f46e3c7 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -11,12 +11,8 @@
import cudf
from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import (
- DATETIME_TYPES,
- NUMERIC_TYPES,
- TIMEDELTA_TYPES,
- assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py
index 5ffe255d0f8..278e63f3e8b 100644
--- a/python/cudf/cudf/tests/test_custom_accessor.py
+++ b/python/cudf/cudf/tests/test_custom_accessor.py
@@ -4,7 +4,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@cudf.api.extensions.register_dataframe_accessor("point")
diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py
index 24c1eaa8f02..3f31da035aa 100644
--- a/python/cudf/cudf/tests/test_cut.py
+++ b/python/cudf/cudf/tests/test_cut.py
@@ -9,7 +9,7 @@
import pytest
from cudf.core.cut import cut
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 3661e13bd39..05ee8346afa 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -30,14 +30,12 @@
from cudf.core.buffer.spill_manager import get_global_manager
from cudf.core.column import column
from cudf.errors import MixedTypeError
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq, assert_neq
from cudf.testing._utils import (
ALL_TYPES,
DATETIME_TYPES,
NUMERIC_TYPES,
- assert_eq,
assert_exceptions_equal,
- assert_neq,
does_not_raise,
expect_warning_if,
gen_rand,
@@ -3660,6 +3658,12 @@ def test_dataframe_mulitindex_sort_index(
assert_eq(expected, got)
+def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names():
+    frame = cudf.DataFrame([[1, 2, 3]], columns=["b", "a", "c"])
+    accessor = frame.sort_index(axis=1, ignore_index=True)._data
+    assert accessor.names == tuple(accessor.keys())  # names mirror keys
+
+
@pytest.mark.parametrize("dtype", dtypes + ["category"])
def test_dataframe_0_row_dtype(dtype):
if dtype == "category":
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py
index fec52d82ab1..45bd31ef58e 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/test_dataframe_copy.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
from copy import copy, deepcopy
import cupy as cp
@@ -7,7 +7,8 @@
import pytest
from cudf.core.dataframe import DataFrame
-from cudf.testing._utils import ALL_TYPES, assert_eq, assert_neq
+from cudf.testing import assert_eq, assert_neq
+from cudf.testing._utils import ALL_TYPES
"""
DataFrame copy expectations
diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py
index 8e5e5ab66c4..7f4e249a6d7 100644
--- a/python/cudf/cudf/tests/test_datasets.py
+++ b/python/cudf/cudf/tests/test_datasets.py
@@ -3,7 +3,7 @@
import numpy as np
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_dataset_timeseries():
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index e3ecaafae5b..092e9790c63 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -15,10 +15,10 @@
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.index import DatetimeIndex
+from cudf.testing import assert_eq
from cudf.testing._utils import (
DATETIME_TYPES,
NUMERIC_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index 0745e5aba48..c41a938f6ea 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
import decimal
from decimal import Decimal
@@ -11,12 +11,12 @@
import cudf
from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
+from cudf.testing import assert_eq
from cudf.testing._utils import (
FLOAT_TYPES,
INTEGER_TYPES,
SIGNED_TYPES,
_decimal_series,
- assert_eq,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 8ce4da792a4..7f48e414180 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -20,7 +20,7 @@
from_dataframe,
protocol_dtype_to_cupy_dtype,
)
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.fixture(
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index 7ea3979b0f1..ebcc35784ee 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -9,7 +9,7 @@
from packaging import version
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
nelems = [0, 3, 10]
dtype = [np.uint16, np.int32, np.float64]
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index c3c8ed922f0..ed0cf0053ea 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -5,7 +5,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index 0efd8d9781c..edb534a3618 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -17,7 +17,7 @@
ListDtype,
StructDtype,
)
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
from cudf.utils.dtypes import np_to_pa_dtype
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index 161b245953b..0b4ed52ba96 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -9,7 +9,8 @@
import cudf
from cudf import concat
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
# most tests are similar to pandas drop_duplicates
diff --git a/python/cudf/cudf/tests/test_ewm.py b/python/cudf/cudf/tests/test_ewm.py
new file mode 100644
index 00000000000..6cb3c19d5a8
--- /dev/null
+++ b/python/cudf/cudf/tests/test_ewm.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0, 4.0, 5.0],
+        [5.0, cudf.NA, 3.0, cudf.NA, 8.5],
+        [5.0, cudf.NA, 3.0, cudf.NA, cudf.NA, 4.5],
+        [5.0, cudf.NA, 3.0, 4.0, cudf.NA, 5.0],
+    ],
+)
+@pytest.mark.parametrize(
+    "params",
+    [
+        {"com": 0.1},
+        {"com": 0.5},
+        {"span": 1.5},
+        {"span": 2.5},
+        {"halflife": 0.5},
+        {"halflife": 1.5},
+        {"alpha": 0.1},
+        {"alpha": 0.5},
+    ],
+)
+@pytest.mark.parametrize("adjust", [True, False])
+def test_ewma(data, params, adjust):
+    """
+    Check that ewm().mean() matches pandas numerically for various
+    keyword arguments that affect the raw coefficients of the formula.
+    ``params`` is copied rather than mutated, because pytest passes the
+    same parametrized dict object to every test invocation.
+    """
+    params = {**params, "adjust": adjust}
+
+    gsr = cudf.Series(data, dtype="float64")
+    psr = gsr.to_pandas()
+
+    expect = psr.ewm(**params).mean()
+    got = gsr.ewm(**params).mean()
+
+    assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index f8782681f62..47f9180dcb1 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -7,7 +7,7 @@
import cudf
from cudf import DataFrame, Index
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py
index 12a325fa4e8..7e5523bb8c7 100644
--- a/python/cudf/cudf/tests/test_feather.py
+++ b/python/cudf/cudf/tests/test_feather.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
import os
from string import ascii_letters
@@ -9,7 +9,8 @@
import pytest
import cudf
-from cudf.testing._utils import NUMERIC_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES
@pytest.fixture(params=[0, 1, 10, 100])
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index a677ace18ec..fc22d8bc0ea 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import io
import os
@@ -8,7 +8,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
gcsfs = pytest.importorskip("gcsfs")
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 674f694a224..826a0e52f57 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -28,11 +28,11 @@
from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops
from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES
from cudf.core.udf.utils import UDFError, precompiled
+from cudf.testing import assert_eq
from cudf.testing._utils import (
DATETIME_TYPES,
SIGNED_TYPES,
TIMEDELTA_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py
index d420c95cfb4..430ed973f19 100644
--- a/python/cudf/cudf/tests/test_hdf.py
+++ b/python/cudf/cudf/tests/test_hdf.py
@@ -8,7 +8,8 @@
import pytest
import cudf
-from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES
pytest.importorskip("tables")
diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py
index f8de16f8609..098b5192d4a 100644
--- a/python/cudf/cudf/tests/test_hdfs.py
+++ b/python/cudf/cudf/tests/test_hdfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import os
from io import BytesIO
@@ -10,7 +10,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
if not os.environ.get("RUN_HDFS_TESTS"):
pytestmark = pytest.mark.skip("Env not configured to run HDFS tests")
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index a59836df5ba..05dcd85df6a 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -18,6 +18,7 @@
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype
from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex
+from cudf.testing import assert_eq
from cudf.testing._utils import (
ALL_TYPES,
FLOAT_TYPES,
@@ -28,7 +29,6 @@
UNSIGNED_TYPES,
assert_column_memory_eq,
assert_column_memory_ne,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 009e48a8669..7005cbc6834 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -11,10 +11,9 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq
from cudf.testing._utils import (
INTEGER_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
index a0e90cc89a2..4a0dc331e1a 100644
--- a/python/cudf/cudf/tests/test_interpolate.py
+++ b/python/cudf/cudf/tests/test_interpolate.py
@@ -4,11 +4,8 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import (
- assert_eq,
- assert_exceptions_equal,
- expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 013f4439ad5..1b395c09ba8 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -6,7 +6,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py
index 8d71a6c05b8..9ea4ba007d2 100644
--- a/python/cudf/cudf/tests/test_join_order.py
+++ b/python/cudf/cudf/tests/test_join_order.py
@@ -8,7 +8,7 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"])
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index f36774daab2..b1ce69e58ef 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -9,11 +9,11 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype
+from cudf.testing import assert_eq
from cudf.testing._utils import (
INTEGER_TYPES,
NUMERIC_TYPES,
TIMEDELTA_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index ba6a8f94719..297040b6d95 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -14,11 +14,11 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.testing import assert_eq
from cudf.testing._utils import (
DATETIME_TYPES,
NUMERIC_TYPES,
TIMEDELTA_TYPES,
- assert_eq,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index f04cb8a91a4..f76143cb381 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -12,12 +12,8 @@
from cudf import NA
from cudf._lib.copying import get_element
from cudf.api.types import is_scalar
-from cudf.testing._utils import (
- DATETIME_TYPES,
- NUMERIC_TYPES,
- TIMEDELTA_TYPES,
- assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index 3c627a5fe89..0896d91570e 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -12,7 +12,7 @@
import cudf
from cudf import Index, MultiIndex, Series
from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)])
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 7b95e4f9a44..07c2e9c3fcf 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -21,12 +21,8 @@
import cudf
from cudf.api.extensions import no_default
from cudf.core.column import as_column
-from cudf.testing._utils import (
- assert_eq,
- assert_exceptions_equal,
- assert_neq,
- expect_warning_if,
-)
+from cudf.testing import assert_eq, assert_neq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
@contextmanager
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 03081208739..1b0589254f5 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -5,7 +5,8 @@
import pytest
import cudf
-from cudf.testing._utils import NUMERIC_TYPES, assert_eq, expect_warning_if
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if
from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes
diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py
index 46324a85bb4..fa664d52ecf 100644
--- a/python/cudf/cudf/tests/test_numpy_interop.py
+++ b/python/cudf/cudf/tests/test_numpy_interop.py
@@ -1,10 +1,10 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
import numpy as np
import pytest
from cudf import DataFrame, Series
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_to_records_noindex():
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index cd0055ad78b..154e1e19072 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -7,7 +7,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
pytestmark = pytest.mark.spilling
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index b83b8f08a8b..e0884a5819a 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -15,9 +15,8 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.io.orc import ORCWriter
-from cudf.testing import assert_frame_equal
+from cudf.testing import assert_eq, assert_frame_equal
from cudf.testing._utils import (
- assert_eq,
expect_warning_if,
gen_rand_series,
supported_numpy_dtypes,
diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py
index da506a8d5b2..ad78621c5fa 100644
--- a/python/cudf/cudf/tests/test_pack.py
+++ b/python/cudf/cudf/tests/test_pack.py
@@ -20,7 +20,7 @@
from cudf import DataFrame, Index, Series
from cudf._lib.copying import pack, unpack
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_sizeof_packed_dataframe():
diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py
index 78cf5b998e8..5782437e394 100644
--- a/python/cudf/cudf/tests/test_pandas_interop.py
+++ b/python/cudf/cudf/tests/test_pandas_interop.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
import numpy as np
import pandas as pd
import cudf
from cudf import DataFrame
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_to_pandas():
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index af79f361b43..e1e7952605b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -28,12 +28,8 @@
ParquetWriter,
merge_parquet_filemetadata,
)
-from cudf.testing import dataset_generator as dg
-from cudf.testing._utils import (
- TIMEDELTA_TYPES,
- assert_eq,
- set_random_null_mask_inplace,
-)
+from cudf.testing import assert_eq, dataset_generator as dg
+from cudf.testing._utils import TIMEDELTA_TYPES, set_random_null_mask_inplace
@contextmanager
diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py
index 13a07ef8adc..719e8a33285 100644
--- a/python/cudf/cudf/tests/test_pickling.py
+++ b/python/cudf/cudf/tests/test_pickling.py
@@ -8,7 +8,7 @@
from cudf import DataFrame, Index, RangeIndex, Series
from cudf.core.buffer import as_buffer
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
pytestmark = pytest.mark.spilling
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 8b126073a0f..7d8303df0c3 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import re
@@ -6,7 +6,8 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
def test_single_q():
diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py
index cf9e70d85c7..b12209fd3b9 100644
--- a/python/cudf/cudf/tests/test_query.py
+++ b/python/cudf/cudf/tests/test_query.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
import datetime
@@ -11,7 +11,7 @@
import cudf
from cudf import DataFrame
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
from cudf.utils import queryutils
_params_query_parser = []
diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py
index ae5171f28d4..9372681187d 100644
--- a/python/cudf/cudf/tests/test_query_mask.py
+++ b/python/cudf/cudf/tests/test_query_mask.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
import numpy as np
import pandas as pd
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
_data = [
{"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]},
diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py
index 1a5f25e320f..4c1d8ce92ae 100644
--- a/python/cudf/cudf/tests/test_rank.py
+++ b/python/cudf/cudf/tests/test_rank.py
@@ -7,7 +7,8 @@
import pytest
from cudf import DataFrame
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
@pytest.fixture
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index c6ffa1d2bc7..1247fa362ce 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -11,13 +11,8 @@
import cudf
from cudf import Series
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
-from cudf.testing import _utils as utils
-from cudf.testing._utils import (
- NUMERIC_TYPES,
- assert_eq,
- expect_warning_if,
- gen_rand,
-)
+from cudf.testing import _utils as utils, assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand
params_dtype = NUMERIC_TYPES
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 9466398964a..d4fe5ff3bb5 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -12,10 +12,10 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
+from cudf.testing import assert_eq
from cudf.testing._utils import (
INTEGER_TYPES,
NUMERIC_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py
index d7a3fea1273..95fa8e9a50a 100644
--- a/python/cudf/cudf/tests/test_resampling.py
+++ b/python/cudf/cudf/tests/test_resampling.py
@@ -5,7 +5,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def assert_resample_results_equal(lhs, rhs, **kwargs):
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index daa1e70808f..50db4302b75 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -10,12 +10,8 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import (
- ALL_TYPES,
- DATETIME_TYPES,
- NUMERIC_TYPES,
- assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES
pytest_xfail = pytest.mark.xfail
pytestmark = pytest.mark.spilling
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index 1d1d7ae8d29..135870f7359 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -7,7 +7,7 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
from cudf.testing.dataset_generator import rand_dataframe
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index cdce17eeb76..a44bf791767 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -12,7 +12,7 @@
from fsspec.core import get_fs_token_paths
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
moto = pytest.importorskip("moto", minversion="3.1.6")
boto3 = pytest.importorskip("boto3")
diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py
index 4cbc2197cfd..b76566b00e2 100644
--- a/python/cudf/cudf/tests/test_scan.py
+++ b/python/cudf/cudf/tests/test_scan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
from itertools import product
@@ -8,12 +8,8 @@
import cudf
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
-from cudf.testing._utils import (
- INTEGER_TYPES,
- NUMERIC_TYPES,
- assert_eq,
- gen_rand,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import INTEGER_TYPES, NUMERIC_TYPES, gen_rand
params_sizes = [0, 1, 2, 5]
diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py
index 3ba652ff6c0..65943518113 100644
--- a/python/cudf/cudf/tests/test_search.py
+++ b/python/cudf/cudf/tests/test_search.py
@@ -5,7 +5,8 @@
import pytest
import cudf
-from cudf.testing._utils import assert_eq, gen_rand, random_bitmask
+from cudf.testing import assert_eq
+from cudf.testing._utils import gen_rand, random_bitmask
@pytest.mark.parametrize("side", ["left", "right"])
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index f26d78e7783..0b892a51895 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -9,8 +9,7 @@
import pytest
import cudf
-from cudf.testing import _utils as utils
-from cudf.testing._utils import assert_eq
+from cudf.testing import _utils as utils, assert_eq
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 52956c230ba..467d0c46ae7 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -17,11 +17,11 @@
from cudf.api.extensions import no_default
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.errors import MixedTypeError
+from cudf.testing import assert_eq
from cudf.testing._utils import (
NUMERIC_TYPES,
SERIES_OR_INDEX_NAMES,
TIMEDELTA_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
gen_rand,
@@ -1054,6 +1054,18 @@ def test_fillna_with_nan(data, nan_as_null, fill_value):
assert_eq(expected, actual)
+def test_fillna_categorical_with_non_categorical_raises():
+ ser = cudf.Series([1, None], dtype="category")
+ with pytest.raises(TypeError):
+ ser.fillna(cudf.Series([1, 2]))  # fill Series is built without dtype="category"
+
+
+def test_fillna_categorical_with_different_categories_raises():
+    ser = cudf.Series([1, None], dtype="category")  # categorical, one null
+    with pytest.raises(TypeError):  # dtype goes to the fill Series, not fillna()
+        ser.fillna(cudf.Series([1, 2], dtype="category"))  # categories differ
+
+
def test_series_mask_mixed_dtypes_error():
s = cudf.Series(["a", "b", "c"])
with pytest.raises(
diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py
index 9da08e483c9..3d8b6a79d2a 100644
--- a/python/cudf/cudf/tests/test_seriesmap.py
+++ b/python/cudf/cudf/tests/test_seriesmap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
from itertools import product
from math import floor
@@ -9,7 +9,8 @@
import cudf
from cudf import Series
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
def test_series_map_basic():
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index ff2f7bd41f2..69122cdbafa 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -6,11 +6,8 @@
import cudf
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import (
- assert_eq,
- assert_exceptions_equal,
- expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
@pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})])
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 449f21721f4..a8ffce6e88b 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -10,10 +10,10 @@
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.core.column import NumericalColumn
+from cudf.testing import assert_eq
from cudf.testing._utils import (
DATETIME_TYPES,
NUMERIC_TYPES,
- assert_eq,
assert_exceptions_equal,
expect_warning_if,
)
diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py
index 59b8e6d2e70..7af83a99d60 100644
--- a/python/cudf/cudf/tests/test_spilling.py
+++ b/python/cudf/cudf/tests/test_spilling.py
@@ -39,7 +39,7 @@
SpillableBufferOwner,
SpillLock,
)
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
if get_global_manager() is not None:
pytest.skip(
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 27811d0fcde..d5f63fdab77 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -11,11 +11,8 @@
from cudf.api.extensions import no_default
from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
from cudf.datasets import randomdata
-from cudf.testing._utils import (
- assert_eq,
- assert_exceptions_equal,
- expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
params_dtypes = [np.int32, np.uint32, np.float32, np.float64]
methods = ["min", "max", "sum", "mean", "var", "std"]
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 801c530da43..f447759d010 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -17,10 +17,10 @@
from cudf import concat
from cudf.core.column.string import StringColumn
from cudf.core.index import Index
+from cudf.testing import assert_eq
from cudf.testing._utils import (
DATETIME_TYPES,
NUMERIC_TYPES,
- assert_eq,
assert_exceptions_equal,
)
from cudf.utils import dtypes as dtypeutils
diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py
index 5dbb86fe27d..4432d2afc8e 100644
--- a/python/cudf/cudf/tests/test_string_udfs.py
+++ b/python/cudf/cudf/tests/test_string_udfs.py
@@ -21,7 +21,8 @@
udf_string,
)
from cudf.core.udf.utils import _get_extensionty_size, _ptx_file
-from cudf.testing._utils import assert_eq, sv_to_udf_str
+from cudf.testing import assert_eq
+from cudf.testing._utils import sv_to_udf_str
from cudf.utils._numba import _CUDFNumbaConfig
_PTX_FILE = _ptx_file()
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index 60d9516f385..e91edc9eec6 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import numpy as np
import pandas as pd
@@ -7,7 +7,8 @@
import cudf
from cudf.core.dtypes import StructDtype
-from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES
@pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py
index 1994536f395..c3620db3880 100644
--- a/python/cudf/cudf/tests/test_testing.py
+++ b/python/cudf/cudf/tests/test_testing.py
@@ -17,9 +17,8 @@
OTHER_TYPES,
assert_column_memory_eq,
assert_column_memory_ne,
- assert_eq,
)
-from cudf.testing.testing import assert_column_equal
+from cudf.testing.testing import assert_column_equal, assert_eq
@pytest.fixture(
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 0c591965361..c4a2349f535 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -9,8 +9,8 @@
import pytest
import cudf
-from cudf.testing import _utils as utils
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import _utils as utils, assert_eq
+from cudf.testing._utils import assert_exceptions_equal
_TIMEDELTA_DATA = [
[1000000, 200000, 3000000],
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 4843decedba..087d10b8295 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -17,9 +17,9 @@
)
from cudf.core.udf.api import Masked
from cudf.core.udf.utils import precompiled
+from cudf.testing import assert_eq
from cudf.testing._utils import (
_decimal_series,
- assert_eq,
parametrize_numeric_dtypes_pairwise,
sv_to_udf_str,
)
diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py
index 15d9d03d4a7..dbbf4fba3a6 100644
--- a/python/cudf/cudf/tests/test_unaops.py
+++ b/python/cudf/cudf/tests/test_unaops.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
import itertools
import operator
@@ -10,7 +10,7 @@
import cudf
from cudf import Series
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq
_unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor]
@@ -128,4 +128,4 @@ def test_scalar_no_negative_bools():
def test_series_bool_neg():
sr = Series([True, False, True, None, False, None, True, True])
psr = sr.to_pandas(nullable=True)
- utils.assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True)
+ assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True)
diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py
index b21edc0477f..78b58344374 100644
--- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py
+++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py
@@ -7,7 +7,7 @@
import cudf
from cudf.core.subword_tokenizer import SubwordTokenizer
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 36f7f3de828..52179f55da3 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -9,7 +9,7 @@
import cudf
from cudf.core.byte_pair_encoding import BytePairEncoder
from cudf.core.tokenize_vocabulary import TokenizeVocabulary
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
def test_tokenize():
diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py
index 54f5ea08ee1..9d7071d55a5 100644
--- a/python/cudf/cudf/utils/_ptxcompiler.py
+++ b/python/cudf/cudf/utils/_ptxcompiler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
import math
import os
+import re
import subprocess
import sys
import warnings
NO_DRIVER = (math.inf, math.inf)
+START_TAG = "_VER_START"
+END_TAG = "_VER_END"
NUMBA_CHECK_VERSION_CMD = """\
from ctypes import c_int, byref
@@ -28,7 +31,7 @@
drv_major = dv.value // 1000
drv_minor = (dv.value - (drv_major * 1000)) // 10
run_major, run_minor = cuda.runtime.get_version()
-print(f'{drv_major} {drv_minor} {run_major} {run_minor}')
+print(f'_VER_START{drv_major} {drv_minor} {run_major} {run_minor}_VER_END')
"""
@@ -61,7 +64,11 @@ def get_versions():
warnings.warn(msg, UserWarning)
return NO_DRIVER
- versions = [int(s) for s in cp.stdout.strip().split()]
+ pattern = r"_VER_START(.*?)_VER_END"
+
+ ver_str = re.search(pattern, cp.stdout.decode()).group(1)
+
+ versions = [int(s) for s in ver_str.strip().split()]
driver_version = tuple(versions[:2])
runtime_version = tuple(versions[2:])
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 5be4d350c0b..eed5037cbea 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1175,7 +1175,7 @@ def test_intermediates_are_proxied():
def test_from_dataframe():
cudf = pytest.importorskip("cudf")
- from cudf.testing._utils import assert_eq
+ from cudf.testing import assert_eq
data = {"foo": [1, 2, 3], "bar": [4, 5, 6]}
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
index 84a3a32646d..2de0bf39785 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
@@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr
from libcpp.string cimport string
from libcpp.vector cimport vector
-from cudf._lib.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 73f3c1ce289..871134665af 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -898,6 +898,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
super().__init__(dtype)
self.options = options
self.children = (agg,)
+ raise NotImplementedError("Rolling window not implemented")
class GroupedRollingWindow(Expr):
@@ -909,6 +910,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> N
super().__init__(dtype)
self.options = options
self.children = (agg, *by)
+ raise NotImplementedError("Grouped rolling window not implemented")
class Cast(Expr):
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 3ccefac6b0a..b3dd6ae7cc3 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -427,8 +427,6 @@ def check_agg(agg: expr.Expr) -> int:
if isinstance(agg, (expr.BinOp, expr.Cast)):
return max(GroupBy.check_agg(child) for child in agg.children)
elif isinstance(agg, expr.Agg):
- if agg.name == "implode":
- raise NotImplementedError("implode in groupby")
return 1 + max(GroupBy.check_agg(child) for child in agg.children)
elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)):
return 0
@@ -440,7 +438,9 @@ def __post_init__(self) -> None:
if self.options.rolling is None and self.maintain_order:
raise NotImplementedError("Maintaining order in groupby")
if self.options.rolling:
- raise NotImplementedError("rolling window/groupby")
+ raise NotImplementedError(
+ "rolling window/groupby"
+ ) # pragma: no cover; rollingwindow constructor has already raised
if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
raise NotImplementedError("Nested aggregations in groupby")
self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 41bc3032bc5..5d289885f47 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -10,6 +10,7 @@
from typing import Any
import pyarrow as pa
+from typing_extensions import assert_never
from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
@@ -354,17 +355,20 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
@_translate_expr.register
def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
# TODO: raise in groupby?
- if node.partition_by is None:
+ if isinstance(node.options, pl_expr.RollingGroupOptions):
+ # pl.col("a").rolling(...)
return expr.RollingWindow(
dtype, node.options, translate_expr(visitor, n=node.function)
)
- else:
+ elif isinstance(node.options, pl_expr.WindowMapping):
+ # pl.col("a").over(...)
return expr.GroupedRollingWindow(
dtype,
node.options,
translate_expr(visitor, n=node.function),
*(translate_expr(visitor, n=n) for n in node.partition_by),
)
+ assert_never(node.options)
@_translate_expr.register
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 7b0049daf11..3d4a643e1fc 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -70,7 +70,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
elif dtype.time_unit == "ns":
return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
- assert dtype.time_unit is not None
+ assert dtype.time_unit is not None # pragma: no cover
assert_never(dtype.time_unit)
elif isinstance(dtype, pl.Duration):
if dtype.time_unit == "ms":
@@ -79,7 +79,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
elif dtype.time_unit == "ns":
return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
- assert dtype.time_unit is not None
+ assert dtype.time_unit is not None # pragma: no cover
assert_never(dtype.time_unit)
elif isinstance(dtype, pl.String):
return plc.DataType(plc.TypeId.STRING)
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
index 24fd449dd88..57f94c4ec4c 100644
--- a/python/cudf_polars/cudf_polars/utils/sorting.py
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -43,8 +43,8 @@ def sort_order(
for d in descending
]
null_precedence = []
- # TODO: use strict=True when we drop py39
- assert len(descending) == len(nulls_last)
+ if len(descending) != len(nulls_last) or len(descending) != num_keys:
+ raise ValueError("Mismatching length of arguments in sort_order")
for asc, null_last in zip(column_order, nulls_last):
if (asc == plc.types.Order.ASCENDING) ^ (not null_last):
null_precedence.append(plc.types.NullOrder.AFTER)
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index face04b9bd8..effa4861e0c 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -52,6 +52,13 @@ version = {file = "cudf_polars/VERSION"}
[tool.pytest.ini_options]
xfail_strict = true
+[tool.coverage.report]
+exclude_also = [
+ "if TYPE_CHECKING:",
+ "class .*\\bProtocol\\):",
+ "assert_never\\("
+]
+
[tool.ruff]
line-length = 88
indent-width = 4
diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py
new file mode 100644
index 00000000000..6ba2a1dce1e
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize(
+ "dtype",  # one case per temporal dtype / time unit listed below
+ [
+ pl.Date(),
+ pl.Datetime("ms"),
+ pl.Datetime("us"),
+ pl.Datetime("ns"),
+ pl.Duration("ms"),
+ pl.Duration("us"),
+ pl.Duration("ns"),
+ ],
+ ids=repr,
+)
+def test_datetime_dataframe_scan(dtype):
+ ldf = pl.DataFrame(
+ {
+ "a": pl.Series([1, 2, 3, 4, 5, 6, 7], dtype=dtype),  # ints stored as `dtype`
+ "b": pl.Series([3, 4, 5, 6, 7, 8, 9], dtype=pl.UInt16),
+ }
+ ).lazy()
+
+ query = ldf.select(pl.col("b"), pl.col("a"))  # plain selection, column order swapped
+ assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py
new file mode 100644
index 00000000000..d4920d35f14
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_rolling.py
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import translate_ir
+
+
+def test_rolling():
+ dates = [
+ "2020-01-01 13:45:48",
+ "2020-01-01 16:42:13",
+ "2020-01-01 16:45:09",
+ "2020-01-02 18:12:48",
+ "2020-01-03 19:45:32",
+ "2020-01-08 23:16:43",
+ ]
+ df = (
+ pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]})
+ .with_columns(pl.col("dt").str.strptime(pl.Datetime))  # parse strings to datetimes
+ .lazy()
+ )
+ q = df.with_columns(
+ sum_a=pl.sum("a").rolling(index_column="dt", period="2d"),
+ min_a=pl.min("a").rolling(index_column="dt", period="2d"),
+ max_a=pl.max("a").rolling(index_column="dt", period="2d"),
+ )
+ with pytest.raises(NotImplementedError):  # translating a rolling expr must be rejected
+ _ = translate_ir(q._ldf.visit())
+
+
+def test_grouped_rolling():
+ df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]})
+
+ q = df.select(pl.col("a").min().over("b"))  # window expression partitioned by "b"
+ with pytest.raises(NotImplementedError):  # translating an over() expr must be rejected
+ _ = translate_ir(q._ldf.visit())
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
new file mode 100644
index 00000000000..0195266f5c6
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import itertools
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("descending", [False, True])
+@pytest.mark.parametrize("nulls_last", [False, True])
+def test_sort_expression(descending, nulls_last):
+ ldf = pl.LazyFrame(
+ {
+ "a": [5, -1, 3, 4, None, 8, 6, 7, None],
+ }
+ )
+
+ query = ldf.select(pl.col("a").sort(descending=descending, nulls_last=nulls_last))
+ assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+ "descending", itertools.combinations_with_replacement([False, True], 3)
+)
+@pytest.mark.parametrize(
+ "nulls_last", itertools.combinations_with_replacement([False, True], 3)
+)
+@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"])
+def test_sort_by_expression(descending, nulls_last, maintain_order):
+ ldf = pl.LazyFrame(
+ {
+ "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+ "b": [1, 2, 2, 3, 9, 5, -1, 2, -2, 16],
+ "c": ["a", "A", "b", "b", "c", "d", "A", "Z", "ä", "̈Ä"],
+ }
+ )
+
+ query = ldf.select(
+ pl.col("a").sort_by(
+ pl.col("b"),
+ pl.col("c"),
+ pl.col("b") + pl.col("a"),
+ descending=descending,
+ nulls_last=nulls_last,
+ maintain_order=maintain_order,
+ )
+ )
+ assert_gpu_result_equal(query, check_row_order=maintain_order)
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index d06a7ecf105..e70f923b097 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -6,6 +6,7 @@
import polars as pl
+from cudf_polars import translate_ir
from cudf_polars.testing.asserts import assert_gpu_result_equal
@@ -43,6 +44,7 @@ def keys(request):
[pl.col("float") + pl.col("int")],
[pl.col("float").max() - pl.col("int").min()],
[pl.col("float").mean(), pl.col("int").std()],
+ [(pl.col("float") - pl.lit(2)).max()],
],
ids=lambda aggs: "-".join(map(str, aggs)),
)
@@ -72,7 +74,28 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
if not maintain_order:
sort_keys = list(q.schema.keys())[: len(keys)]
q = q.sort(*sort_keys)
- # from cudf_polars.dsl.translate import translate_ir
- # ir = translate_ir(q._ldf.visit())
- # from IPython import embed; embed()
+
assert_gpu_result_equal(q, check_exact=False)
+
+
+def test_groupby_len(df, keys):
+ q = df.group_by(*keys).agg(pl.len())
+
+ # TODO: polars returns UInt32, libcudf returns Int32
+ with pytest.raises(AssertionError):
+ assert_gpu_result_equal(q, check_row_order=False)
+ assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False)
+
+
+@pytest.mark.parametrize(
+ "expr",
+ [
+ pl.col("float").is_not_null(),
+ (pl.col("int").max() + pl.col("float").min()).max(),
+ ],
+)
+def test_groupby_unsupported(df, expr):
+ q = df.group_by("key1").agg(expr)
+
+ with pytest.raises(NotImplementedError):
+ _ = translate_ir(q._ldf.visit())
diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py
new file mode 100644
index 00000000000..535fdd846a0
--- /dev/null
+++ b/python/cudf_polars/tests/utils/test_dtypes.py
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.utils.dtypes import from_polars
+
+
+@pytest.mark.parametrize(
+ "pltype",
+ [
+ pl.Time(),
+ pl.Struct({"a": pl.Int8, "b": pl.Float32}),
+ pl.Datetime("ms", time_zone="US/Pacific"),
+ pl.Array(pl.Int8, 2),
+ pl.Binary(),
+ pl.Categorical(),
+ pl.Enum(["a", "b"]),
+ pl.Field("a", pl.Int8),
+ pl.Object(),
+ pl.Unknown(),
+ ],
+ ids=repr,
+)
+def test_unhandled_dtype_conversion_raises(pltype):
+ with pytest.raises(NotImplementedError):
+ _ = from_polars(pltype)
diff --git a/python/cudf_polars/tests/utils/test_sorting.py b/python/cudf_polars/tests/utils/test_sorting.py
new file mode 100644
index 00000000000..4e98a3a7ce7
--- /dev/null
+++ b/python/cudf_polars/tests/utils/test_sorting.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+from cudf_polars.utils.sorting import sort_order
+
+
+@pytest.mark.parametrize(
+ "descending,nulls_last,num_keys",
+ [
+ ([True], [False, True], 3),
+ ([True, True], [False, True, False], 3),
+ ([False, True], [True], 3),
+ ],
+)
+def test_sort_order_raises_mismatch(descending, nulls_last, num_keys):
+ with pytest.raises(ValueError):
+ _ = sort_order(descending, nulls_last=nulls_last, num_keys=num_keys)
diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py
index ad3b829544b..3a3c4e994d0 100644
--- a/python/custreamz/custreamz/tests/test_kafka.py
+++ b/python/custreamz/custreamz/tests/test_kafka.py
@@ -1,8 +1,8 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
import confluent_kafka as ck
import pytest
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
@pytest.mark.parametrize("commit_offset", [1, 45, 100, 22, 1000, 10])
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 58d28f0597e..6f04b5737da 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -9,7 +9,8 @@
from dask import dataframe as dd
from cudf import DataFrame, Series, date_range
-from cudf.testing._utils import assert_eq, does_not_raise
+from cudf.testing import assert_eq
+from cudf.testing._utils import does_not_raise
import dask_cudf
from dask_cudf.tests.utils import xfail_dask_expr
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 7f8a619ae22..174923c2c7e 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -795,7 +795,7 @@ def test_dataframe_set_index():
pddf = dd.from_pandas(pdf, npartitions=4)
pddf = pddf.set_index("str")
- from cudf.testing._utils import assert_eq
+ from cudf.testing import assert_eq
assert_eq(ddf.compute(), pddf.compute())
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index 07fdb25dff9..be10b0d4843 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -9,7 +9,7 @@
from distributed.utils_test import cleanup, loop, loop_in_thread # noqa: F401
import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
import dask_cudf