diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml index 837963c3286..8ca971dc28d 100644 --- a/.github/workflows/pr_issue_status_automation.yml +++ b/.github/workflows/pr_issue_status_automation.yml @@ -35,7 +35,7 @@ jobs: update-status: # This job sets the PR and its linked issues to "In Progress" status uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08 - if: github.event.pull_request.state == 'open' + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" @@ -51,7 +51,7 @@ jobs: update-sprint: # This job sets the PR and its linked issues to the current "Weekly Sprint" uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08 - if: github.event.pull_request.state == 'open' + if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }} needs: get-project-id with: PROJECT_ID: "PVT_kwDOAp2shc4AiNzl" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc08b832e69..f8c4f4b9143 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -60,7 +60,7 @@ repos: (?x)^( ^cpp/src/io/parquet/ipc/Schema_generated.h| ^cpp/src/io/parquet/ipc/Message_generated.h| - ^cpp/include/cudf_test/cxxopts.hpp| + ^cpp/include/cudf_test/cxxopts.hpp ) - repo: https://github.com/sirosen/texthooks rev: 0.6.6 diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh index 78945d37f22..1c3b99953fb 100755 --- a/ci/cudf_pandas_scripts/run_tests.sh +++ b/ci/cudf_pandas_scripts/run_tests.sh @@ -5,6 +5,10 @@ set -eoxu pipefail +RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"} +RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"} +mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}" + # Function to 
display script usage function display_usage { echo "Usage: $0 [--no-cudf]" @@ -36,4 +40,9 @@ else python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests] fi -python -m pytest -p cudf.pandas ./python/cudf/cudf_pandas_tests/ +python -m pytest -p cudf.pandas \ + --cov-config=./python/cudf/.coveragerc \ + --cov=cudf \ + --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \ + --cov-report=term \ + ./python/cudf/cudf_pandas_tests/ diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 49504e53424..8a48126e195 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -267,6 +267,11 @@ ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_mu # * orc reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp) +# ################################################################################################## +# * orc multithreaded benchmark +# -------------------------------------------------------------------------- +ConfigureNVBench(ORC_MULTITHREADED_NVBENCH io/orc/orc_reader_multithreaded.cpp) + # ################################################################################################## # * csv reader benchmark -------------------------------------------------------------------------- ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp) diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp new file mode 100644 index 00000000000..aa0ee39a179 --- /dev/null +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +size_t get_num_read_threads(nvbench::state const& state) { return state.get_int64("num_threads"); } + +size_t get_read_size(nvbench::state const& state) +{ + auto const num_reads = get_num_read_threads(state); + return state.get_int64("total_data_size") / num_reads; +} + +std::string get_label(std::string const& test_name, nvbench::state const& state) +{ + auto const num_cols = state.get_int64("num_cols"); + size_t const read_size_mb = get_read_size(state) / (1024 * 1024); + return {test_name + ", " + std::to_string(num_cols) + " columns, " + + std::to_string(get_num_read_threads(state)) + " threads " + " (" + + std::to_string(read_size_mb) + " MB each)"}; +} + +std::tuple, size_t, size_t> write_file_data( + nvbench::state& state, std::vector const& d_types) +{ + auto const cardinality = state.get_int64("cardinality"); + auto const run_length = state.get_int64("run_length"); + auto const num_cols = state.get_int64("num_cols"); + size_t const num_files = get_num_read_threads(state); + size_t const per_file_data_size = get_read_size(state); + + std::vector source_sink_vector; + + size_t total_file_size = 0; + + for (size_t i = 0; i < num_files; ++i) { + cuio_source_sink_pair source_sink{io_type::HOST_BUFFER}; + + auto const tbl = create_random_table( + cycle_dtypes(d_types, num_cols), + table_size_bytes{per_file_data_size}, + data_profile_builder().cardinality(cardinality).avg_run_length(run_length)); + auto const 
view = tbl->view(); + + cudf::io::orc_writer_options const write_opts = + cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view) + .compression(cudf::io::compression_type::SNAPPY); + + cudf::io::write_orc(write_opts); + total_file_size += source_sink.size(); + + source_sink_vector.push_back(std::move(source_sink)); + } + + return {std::move(source_sink_vector), total_file_size, num_files}; +} + +void BM_orc_multithreaded_read_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + auto const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + { + cudf::scoped_range range{("(read) " + label).c_str()}; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + } + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + 
state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_orc_multithreaded_read_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common( + state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::INT32}, label); +} + +void BM_orc_multithreaded_read_string(nvbench::state& state) +{ + auto label = get_label("string", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_list(nvbench::state& state) +{ + auto label = get_label("list", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_common(state, {cudf::type_id::LIST}, label); +} + +void BM_orc_multithreaded_read_chunked_common(nvbench::state& state, + std::vector const& d_types, + std::string const& label) +{ + size_t const data_size = state.get_int64("total_data_size"); + auto const num_threads = state.get_int64("num_threads"); + size_t const input_limit = state.get_int64("input_limit"); + size_t const output_limit = state.get_int64("output_limit"); + + auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads); + cudf::detail::thread_pool threads(num_threads); + auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types); + std::vector source_info_vector; + std::transform(source_sink_vector.begin(), + source_sink_vector.end(), + 
std::back_inserter(source_info_vector), + [](auto& source_sink) { return source_sink.make_source_info(); }); + + auto mem_stats_logger = cudf::memory_stats_logger(); + + { + cudf::scoped_range range{("(read) " + label).c_str()}; + std::vector chunks; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + auto read_func = [&](int index) { + auto const stream = streams[index % num_threads]; + cudf::io::orc_reader_options read_opts = + cudf::io::orc_reader_options::builder(source_info_vector[index]); + // divide chunk limits by number of threads so the number of chunks produced is + // the same for all cases. this seems better than the alternative, which is to + // keep the limits the same. if we do that, as the number of threads goes up, the + // number of chunks goes down - so are we actually benchmarking the same thing + // in that case? + auto reader = cudf::io::chunked_orc_reader( + output_limit / num_threads, input_limit / num_threads, read_opts, stream); + + // read all the chunks + do { + auto table = reader.read_chunk(); + } while (reader.has_next()); + }; + + threads.paused = true; + for (size_t i = 0; i < num_files; ++i) { + threads.submit(read_func, i); + } + timer.start(); + threads.paused = false; + threads.wait_for_tasks(); + cudf::detail::join_streams(streams, cudf::get_default_stream()); + timer.stop(); + }); + } + + auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(data_size) / time, "bytes_per_second"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); + state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size"); +} + +void BM_orc_multithreaded_read_chunked_mixed(nvbench::state& state) +{ + auto label = get_label("mixed", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common( + state, {cudf::type_id::INT32, 
cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_chunked_fixed_width(nvbench::state& state) +{ + auto label = get_label("fixed width", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label); +} + +void BM_orc_multithreaded_read_chunked_string(nvbench::state& state) +{ + auto label = get_label("string", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label); +} + +void BM_orc_multithreaded_read_chunked_list(nvbench::state& state) +{ + auto label = get_label("list", state); + cudf::scoped_range range{label.c_str()}; + BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label); +} +auto const thread_range = std::vector{1, 2, 4, 8}; +auto const total_data_size = std::vector{512 * 1024 * 1024, 1024 * 1024 * 1024}; + +// mixed data types: fixed width and strings +NVBENCH_BENCH(BM_orc_multithreaded_read_mixed) + .set_name("orc_multithreaded_read_decode_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_fixed_width) + .set_name("orc_multithreaded_read_decode_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_string) + .set_name("orc_multithreaded_read_decode_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + 
.add_int64_axis("run_length", {8}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_list) + .set_name("orc_multithreaded_read_decode_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}); + +// mixed data types: fixed width, strings +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_mixed) + .set_name("orc_multithreaded_read_decode_chunked_mixed") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_fixed_width) + .set_name("orc_multithreaded_read_decode_chunked_fixed_width") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_string) + .set_name("orc_multithreaded_read_decode_chunked_string") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + .add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); + +NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_list) + .set_name("orc_multithreaded_read_decode_chunked_list") + .set_min_samples(4) + .add_int64_axis("cardinality", {1000}) + 
.add_int64_axis("total_data_size", total_data_size) + .add_int64_axis("num_threads", thread_range) + .add_int64_axis("num_cols", {4}) + .add_int64_axis("run_length", {8}) + .add_int64_axis("input_limit", {640 * 1024 * 1024}) + .add_int64_axis("output_limit", {640 * 1024 * 1024}); diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json index 059f713e7a5..e61102dffac 100644 --- a/cpp/cmake/thirdparty/patches/cccl_override.json +++ b/cpp/cmake/thirdparty/patches/cccl_override.json @@ -3,60 +3,25 @@ "packages" : { "CCCL" : { "patches" : [ - { - "file" : "cccl/bug_fixes.diff", - "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates", - "fixed_in" : "2.3" - }, - { - "file" : "cccl/hide_kernels.diff", - "issue" : "Mark all cub and thrust kernels with hidden visibility [https://github.com/nvidia/cccl/pulls/443]", - "fixed_in" : "2.3" - }, { "file" : "cccl/revert_pr_211.diff", "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff", - "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.", - "fixed_in" : "" - }, - { - "file": "cccl/kernel_pointer_hiding.diff", - "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]", - "fixed_in": "2.4" - }, { "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff", - "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels 
[https://github.com/rapidsai/cudf/pull/11437]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", "fixed_in" : "" }, - { - "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", - "fixed_in" : "" - }, { "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", "fixed_in" : "" - }, - { - "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff", - "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", - "fixed_in" : "" } ] } diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff deleted file mode 100644 index 27ff16744f5..00000000000 --- a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -index 046eb83c0..8047c9701 100644 ---- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h -@@ -53,41 +53,15 @@ namespace cuda_cub - - namespace __copy - { --template --OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type) --{ -- typedef typename thrust::iterator_traits::value_type InputTy; -- const auto n = thrust::distance(first, last); -- 
if (n > 0) -- { -- cudaError status; -- status = trivial_copy_device_to_device( -- policy, -- reinterpret_cast(thrust::raw_pointer_cast(&*result)), -- reinterpret_cast(thrust::raw_pointer_cast(&*first)), -- n); -- cuda_cub::throw_on_error(status, "__copy:: D->D: failed"); -- } -- -- return result + n; --} - - template - OutputIt THRUST_RUNTIME_FUNCTION device_to_device( -- execution_policy& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type) -+ execution_policy& policy, InputIt first, InputIt last, OutputIt result) - { - typedef typename thrust::iterator_traits::value_type InputTy; - return cuda_cub::transform(policy, first, last, result, thrust::identity()); - } - --template --OutputIt THRUST_RUNTIME_FUNCTION --device_to_device(execution_policy& policy, InputIt first, InputIt last, OutputIt result) --{ -- return device_to_device( -- policy, first, last, result, typename is_indirectly_trivially_relocatable_to::type()); --} - } // namespace __copy - - } // namespace cuda_cub diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff index d3f1a26781f..6ae1e1c917b 100644 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -1,25 +1,25 @@ diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index d0e3f94ec..5c32a9c60 100644 +index 2a3cc4e33..8fb337b26 100644 --- a/thrust/thrust/system/cuda/detail/dispatch.h +++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -32,8 +32,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - +@@ -44,8 +44,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count, _fixed) = 
static_cast(count); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } + /** -@@ -52,9 +51,7 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } +@@ -66,9 +65,7 @@ + } \ + else \ + { \ +- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ +- status = call arguments; \ ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ + } /** * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff deleted file mode 100644 index 6ae1e1c917b..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff +++ /dev/null @@ -1,25 +0,0 @@ -diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h -index 2a3cc4e33..8fb337b26 100644 ---- a/thrust/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/thrust/system/cuda/detail/dispatch.h -@@ -44,8 +44,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ - } - - /** -@@ -66,9 +65,7 @@ - } \ - else \ - { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is 
unsupported in libcudf"); \ - } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff index a606e21b92d..fee46046194 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -1,23 +1,23 @@ diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 84b6ccffd..25a237f93 100644 +index 0606485bb..dbb99ff13 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -808,7 +808,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> +@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy + }; + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
7 : 5, // 6.9B 32b keys/s (Quadro P100) diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index 994adc095..d3e6719a7 100644 +index f39613adb..75bd16ff9 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -479,7 +479,7 @@ struct DeviceReducePolicy +@@ -488,7 +488,7 @@ struct DeviceReducePolicy }; - + /// SM60 - struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + struct Policy600 : ChainedPolicy<600, Policy600, Policy600> @@ -25,15 +25,15 @@ index 994adc095..d3e6719a7 100644 static constexpr int threads_per_block = 256; static constexpr int items_per_thread = 16; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 0ea5c41ad..1bcd8a111 100644 +index 419908c4e..6ab0840e1 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -303,7 +303,7 @@ struct DeviceScanPolicy +@@ -339,7 +339,7 @@ struct DeviceScanPolicy /// SM600 struct Policy600 : DefaultTuning - , ChainedPolicy<600, Policy600, Policy520> + , ChainedPolicy<600, Policy600, Policy600> {}; - + /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff deleted file mode 100644 index fee46046194..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -index 0606485bb..dbb99ff13 100644 ---- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy - }; - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, 
Policy600, Policy600> - { - enum - { -diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh -index f39613adb..75bd16ff9 100644 ---- a/cub/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh -@@ -488,7 +488,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - static constexpr int threads_per_block = 256; - static constexpr int items_per_thread = 16; -diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -index 419908c4e..6ab0840e1 100644 ---- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh -+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh -@@ -339,7 +339,7 @@ struct DeviceScanPolicy - /// SM600 - struct Policy600 - : DefaultTuning -- , ChainedPolicy<600, Policy600, Policy520> -+ , ChainedPolicy<600, Policy600, Policy600> - {}; - - /// SM800 diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff index c34b6433d10..cb0cc55f4d2 100644 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -1,39 +1,39 @@ diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index dc07ef6c2..a066c14da 100644 +index eb76ebb0b..c6c529a50 100644 --- a/cub/cub/block/block_merge_sort.cuh +++ b/cub/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, +@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( KeyT key1 = keys_shared[keys1_beg]; KeyT key2 = keys_shared[keys2_beg]; - + -#pragma unroll +#pragma unroll 1 for (int item = 0; item < ITEMS_PER_THREAD; ++item) { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: + 
bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1)); +@@ -376,7 +376,7 @@ public: // KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 + +-#pragma unroll ++#pragma unroll 1 for (int item = 1; item < ITEMS_PER_THREAD; ++item) { if (ITEMS_PER_THREAD * linear_tid + item < valid_items) diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 5d4867896..b42fb5f00 100644 +index 7d9e8622f..da5627306 100644 --- a/cub/cub/thread/thread_sort.cuh +++ b/cub/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], +@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 + constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; + +-#pragma unroll ++#pragma unroll 1 for (int i = 0; i < ITEMS_PER_THREAD; ++i) { -- #pragma unroll -+ #pragma unroll 1 +-#pragma unroll ++#pragma unroll 1 for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) { if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff deleted file mode 100644 index cb0cc55f4d2..00000000000 --- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff +++ /dev/null @@ -1,39 +0,0 @@ -diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh -index eb76ebb0b..c6c529a50 100644 ---- a/cub/cub/block/block_merge_sort.cuh -+++ b/cub/cub/block/block_merge_sort.cuh -@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge( - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || 
compare_op(key2, key1)); -@@ -376,7 +376,7 @@ public: - // - KeyT max_key = oob_default; - --#pragma unroll -+#pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh -index 7d9e8622f..da5627306 100644 ---- a/cub/cub/thread/thread_sort.cuh -+++ b/cub/cub/thread/thread_sort.cuh -@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE - { - constexpr bool KEYS_ONLY = ::cuda::std::is_same::value; - --#pragma unroll -+#pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { --#pragma unroll -+#pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index b618f33a6e5..c483d459833 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -819,7 +820,17 @@ struct operator_functor { template struct cast { static constexpr auto arity{1}; - template + template ()>* = nullptr> + __device__ inline auto operator()(From f) -> To + { + if constexpr (cuda::std::is_floating_point_v) { + return convert_fixed_to_floating(f); + } else { + return static_cast(f); + } + } + + template ()>* = nullptr> __device__ inline auto operator()(From f) -> decltype(static_cast(f)) { return static_cast(f); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 47aee982c32..2b3aa2f08f1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -292,9 +292,9 @@ std::shared_ptr dispatch_to_arrow::operator()( auto child_arrays = fetch_child_array(input_view, {{}, {}}, ar_mr, stream); if (child_arrays.empty()) { // Empty string will have only one value in offset of 4 bytes - auto 
tmp_offset_buffer = allocate_arrow_buffer(4, ar_mr); - auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); - tmp_offset_buffer->mutable_data()[0] = 0; + auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr); + auto tmp_data_buffer = allocate_arrow_buffer(0, ar_mr); + memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t)); return std::make_shared( 0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer)); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6d466748c17..ca15b532d07 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1763,10 +1763,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, // for multiple fragments per page to smooth things out. using 2 was too // unbalanced in final page sizes, so using 4 which seems to be a good // compromise at smoothing things out without getting fragment sizes too small. - auto frag_size_fn = [&](auto const& col, size_type col_size) { + auto frag_size_fn = [&](auto const& col, size_t col_size) { int const target_frags_per_page = is_col_fixed_width(col) ? 
1 : 4; auto const avg_len = - target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows()); + target_frags_per_page * util::div_rounding_up_safe(col_size, input.num_rows()); if (avg_len > 0) { auto const frag_size = util::div_rounding_up_safe(max_page_size_bytes, avg_len); return std::min(max_page_fragment_size, frag_size); diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index b7aadbe14fa..99ca89cc021 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -187,7 +187,11 @@ cudf::size_type distinct_count(column_view const& input, nan_policy nan_handling, rmm::cuda_stream_view stream) { - if (0 == input.size() or input.null_count() == input.size()) { return 0; } + if (0 == input.size()) { return 0; } + + if (input.null_count() == input.size()) { + return static_cast(null_handling == null_policy::INCLUDE); + } auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream); diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 5d2e3ac332a..e90b7969b4d 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -43,9 +43,11 @@ class fixed_pinned_pool_memory_resource { public: fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + : // rmm requires the pool size to be a multiple of 256 bytes + pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, + pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)} { - if (pool_size_ == 0) { return; } + CUDF_LOG_INFO("Pinned pool size = {}", pool_size_); // Allocate full size from the pinned pool to figure out the beginning and end address pool_begin_ = pool_->allocate_async(pool_size_, stream_); @@ -145,12 +147,8 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( return std::min(total / 200, 
size_t{100} * 1024 * 1024); }(); - // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT); - CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); - // make the pool with max size equal to the initial size - return fixed_pinned_pool_memory_resource{aligned_size}; + return fixed_pinned_pool_memory_resource{size}; }(); static rmm::host_device_async_resource_ref mr_ref{mr}; diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index af20a5c772f..6eaa1a07e08 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -50,7 +50,8 @@ std::unique_ptr get_cudf_table() {true, false, true, true, true}); columns.emplace_back(std::move(cudf::dictionary::encode(col4))); columns.emplace_back(cudf::test::fixed_width_column_wrapper( - {true, false, true, false, true}, {true, false, true, true, false}).release()); + {true, false, true, false, true}, {true, false, true, true, false}) + .release()); columns.emplace_back(cudf::test::strings_column_wrapper( { "", @@ -338,7 +339,7 @@ TEST_F(FromArrowTest, ChunkedArray) std::vector>{dict_array1, dict_array2}); auto boolean_array = get_arrow_array({true, false, true, false, true}, {true, false, true, true, false}); - auto boolean_chunked_array = std::make_shared(boolean_array); + auto boolean_chunked_array = std::make_shared(boolean_array); auto large_string_chunked_array = std::make_shared( std::vector>{large_string_array_1}); diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index e9c760e288e..108f12bc099 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -554,6 +554,12 @@ def on_missing_reference(app, env, node, contnode): nitpick_ignore = [ ("py:class", "SeriesOrIndex"), ("py:class", "Dtype"), + # The following are erroneously warned due to + # https://github.com/sphinx-doc/sphinx/issues/11225 + ("py:class", "pa.Array"), + ("py:class", 
"ScalarLike"), + ("py:class", "ParentType"), + ("py:class", "ColumnLike"), # TODO: Remove this when we figure out why typing_extensions doesn't seem # to map types correctly for intersphinx ("py:class", "typing_extensions.Self"), diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst new file mode 100644 index 00000000000..ebf5fab3052 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst @@ -0,0 +1,6 @@ +======= +copying +======= + +.. automodule:: cudf._lib.pylibcudf.datetime + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index 1e03fa80bb5..f98298ff052 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf. column_factories concatenate copying + datetime filling gpumemoryview groupby diff --git a/pyproject.toml b/pyproject.toml index d343b237ee7..2f59864894b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ quiet-level = 3 line-length = 79 [tool.ruff.lint] -select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"] +select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"] ignore = [ # whitespace before : "E203", diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index c667286fc16..bcab009c102 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,6 @@ from __future__ import annotations -from typing import Dict, 
Optional, Tuple - from typing_extensions import Self from cudf._typing import Dtype, DtypeObj, ScalarLike @@ -11,27 +9,27 @@ from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase class Column: - _data: Optional[Buffer] - _mask: Optional[Buffer] - _base_data: Optional[Buffer] - _base_mask: Optional[Buffer] + _data: Buffer | None + _mask: Buffer | None + _base_data: Buffer | None + _base_mask: Buffer | None _dtype: DtypeObj _size: int _offset: int _null_count: int - _children: Tuple[ColumnBase, ...] - _base_children: Tuple[ColumnBase, ...] - _distinct_count: Dict[bool, int] + _children: tuple[ColumnBase, ...] + _base_children: tuple[ColumnBase, ...] + _distinct_count: dict[bool, int] def __init__( self, - data: Optional[Buffer], + data: Buffer | None, size: int, dtype: Dtype, - mask: Optional[Buffer] = None, - offset: Optional[int] = None, - null_count: Optional[int] = None, - children: Tuple[ColumnBase, ...] = (), + mask: Buffer | None = None, + offset: int | None = None, + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), ) -> None: ... @property def base_size(self) -> int: ... @@ -40,9 +38,9 @@ class Column: @property def size(self) -> int: ... @property - def base_data(self) -> Optional[Buffer]: ... + def base_data(self) -> Buffer | None: ... @property - def data(self) -> Optional[Buffer]: ... + def data(self) -> Buffer | None: ... @property def data_ptr(self) -> int: ... def set_base_data(self, value: Buffer) -> None: ... @@ -50,25 +48,25 @@ class Column: def nullable(self) -> bool: ... def has_nulls(self, include_nan: bool = False) -> bool: ... @property - def base_mask(self) -> Optional[Buffer]: ... + def base_mask(self) -> Buffer | None: ... @property - def mask(self) -> Optional[Buffer]: ... + def mask(self) -> Buffer | None: ... @property def mask_ptr(self) -> int: ... - def set_base_mask(self, value: Optional[Buffer]) -> None: ... - def set_mask(self, value: Optional[Buffer]) -> Self: ... 
+ def set_base_mask(self, value: Buffer | None) -> None: ... + def set_mask(self, value: Buffer | None) -> Self: ... @property def null_count(self) -> int: ... @property def offset(self) -> int: ... @property - def base_children(self) -> Tuple[ColumnBase, ...]: ... + def base_children(self) -> tuple[ColumnBase, ...]: ... @property - def children(self) -> Tuple[ColumnBase, ...]: ... - def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ... + def children(self) -> tuple[ColumnBase, ...]: ... + def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ... def _mimic_inplace( self, other_col: ColumnBase, inplace=False - ) -> Optional[Self]: ... + ) -> Self | None: ... # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index ed396208f98..0a198f431a7 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -19,6 +19,7 @@ set(cython_sources column_factories.pyx concatenate.pyx copying.pyx + datetime.pyx filling.pyx gpumemoryview.pyx groupby.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index a628ecdb038..5131df9a5cd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -7,6 +7,7 @@ from . 
cimport ( column_factories, concatenate, copying, + datetime, filling, groupby, join, @@ -40,9 +41,10 @@ __all__ = [ "Table", "aggregation", "binaryop", + "column_factories", "concatenate", "copying", - "column_factories", + "datetime", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 46d0fe13cd1..43a9e2aca31 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -6,6 +6,7 @@ column_factories, concatenate, copying, + datetime, filling, groupby, interop, @@ -39,9 +40,10 @@ "TypeId", "aggregation", "binaryop", + "column_factories", "concatenate", "copying", - "column_factories", + "datetime", "filling", "gpumemoryview", "groupby", diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pxd b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd new file mode 100644 index 00000000000..2fce48cf1b4 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from .column cimport Column + + +cpdef Column extract_year( + Column col +) diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx new file mode 100644 index 00000000000..82351327de6 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.pylibcudf.libcudf.column.column cimport column +from cudf._lib.pylibcudf.libcudf.datetime cimport ( + extract_year as cpp_extract_year, +) + +from .column cimport Column + + +cpdef Column extract_year( + Column values +): + """ + Extract the year from a datetime column. + + Parameters + ---------- + values : Column + The column to extract the year from. + + Returns + ------- + Column + Column with the extracted years. 
+ """ + cdef unique_ptr[column] result + + with nogil: + result = move(cpp_extract_year(values.view())) + return Column.from_libcudf(move(result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt index ac56d42dda8..6c66d01ca57 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx +set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx ) diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py index 206173919e1..34c96cc8cb3 100644 --- a/python/cudf/cudf/_typing.py +++ b/python/cudf/cudf/_typing.py @@ -5,9 +5,10 @@ import numpy as np from pandas import Period, Timedelta, Timestamp -from pandas.api.extensions import ExtensionDtype if TYPE_CHECKING: + from pandas.api.extensions import ExtensionDtype + import cudf # Backwards compat: mypy >= 0.790 rejects Type[NotImplemented], but diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 42b1524bd76..d97e9c815b6 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -8,7 +8,7 @@ from collections import abc from functools import wraps from inspect import isclass -from typing import List, Union, cast +from typing import cast import cupy as cp import numpy as np @@ -219,7 +219,7 @@ def wrapped_func(obj): def _union_categoricals( - to_union: List[Union[cudf.Series, cudf.CategoricalIndex]], + to_union: list[cudf.Series | cudf.CategoricalIndex], sort_categories: bool = False, ignore_order: bool = False, ): diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 5d0f7c4ede4..e71e45e410e 100644 
--- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -4,9 +4,8 @@ import pickle import warnings -from collections.abc import Generator from functools import cached_property -from typing import Any, Literal, Set, Tuple +from typing import TYPE_CHECKING, Any, Literal import pandas as pd from typing_extensions import Self @@ -31,21 +30,25 @@ ) from cudf.core.abc import Serializable from cudf.core.column import ColumnBase, column -from cudf.core.column_accessor import ColumnAccessor from cudf.errors import MixedTypeError from cudf.utils import ioutils from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype from cudf.utils.utils import _is_same_name +if TYPE_CHECKING: + from collections.abc import Generator + + from cudf.core.column_accessor import ColumnAccessor + class BaseIndex(Serializable): """Base class for all cudf Index types.""" - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _data: ColumnAccessor @property - def _columns(self) -> Tuple[Any, ...]: + def _columns(self) -> tuple[Any, ...]: raise NotImplementedError @cached_property @@ -339,9 +342,9 @@ def deserialize(cls, header, frames): @property def names(self): """ - Returns a tuple containing the name of the Index. + Returns a FrozenList containing the name of the Index. """ - return (self.name,) + return pd.core.indexes.frozen.FrozenList([self.name]) @names.setter def names(self, values): diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py index 5cb9f0363e0..393a68dd844 100644 --- a/python/cudf/cudf/core/_internals/expressions.py +++ b/python/cudf/cudf/core/_internals/expressions.py @@ -1,8 +1,8 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import ast import functools -from typing import List, Tuple from cudf._lib.expressions import ( ASTOperator, @@ -98,9 +98,9 @@ class libcudfASTVisitor(ast.NodeVisitor): The column names used to map the names in an expression. """ - def __init__(self, col_names: Tuple[str]): - self.stack: List[Expression] = [] - self.nodes: List[Expression] = [] + def __init__(self, col_names: tuple[str]): + self.stack: list[Expression] = [] + self.nodes: list[Expression] = [] self.col_names = col_names @property @@ -218,7 +218,7 @@ def visit_Call(self, node): @functools.lru_cache(256) -def parse_expression(expr: str, col_names: Tuple[str]): +def parse_expression(expr: str, col_names: tuple[str]): visitor = libcudfASTVisitor(col_names) visitor.visit(ast.parse(expr)) return visitor diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py index f04cae719c2..269fcf3e37f 100644 --- a/python/cudf/cudf/core/_internals/timezones.py +++ b/python/cudf/cudf/core/_internals/timezones.py @@ -1,20 +1,23 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. +from __future__ import annotations import os import zoneinfo from functools import lru_cache -from typing import Literal, Tuple +from typing import TYPE_CHECKING, Literal import numpy as np from cudf._lib.timezone import make_timezone_transition_table from cudf.core.column.column import as_column -from cudf.core.column.datetime import DatetimeColumn -from cudf.core.column.timedelta import TimeDeltaColumn + +if TYPE_CHECKING: + from cudf.core.column.datetime import DatetimeColumn + from cudf.core.column.timedelta import TimeDeltaColumn @lru_cache(maxsize=20) -def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]: """ Return timezone data (transition times and UTC offsets) for the given IANA time zone. 
@@ -40,7 +43,7 @@ def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]: def _find_and_read_tzfile_tzpath( zone_name: str, -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: for search_path in zoneinfo.TZPATH: if os.path.isfile(os.path.join(search_path, zone_name)): return _read_tzfile_as_columns(search_path, zone_name) @@ -49,7 +52,7 @@ def _find_and_read_tzfile_tzpath( def _find_and_read_tzfile_tzdata( zone_name: str, -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: import importlib.resources package_base = "tzdata.zoneinfo" @@ -78,7 +81,7 @@ def _find_and_read_tzfile_tzdata( def _read_tzfile_as_columns( tzdir, zone_name: str -) -> Tuple[DatetimeColumn, TimeDeltaColumn]: +) -> tuple[DatetimeColumn, TimeDeltaColumn]: transition_times_and_offsets = make_timezone_transition_table( tzdir, zone_name ) @@ -92,7 +95,7 @@ def _read_tzfile_as_columns( def check_ambiguous_and_nonexistent( ambiguous: Literal["NaT"], nonexistent: Literal["NaT"] -) -> Tuple[Literal["NaT"], Literal["NaT"]]: +) -> tuple[Literal["NaT"], Literal["NaT"]]: if ambiguous != "NaT": raise NotImplementedError( "Only ambiguous='NaT' is currently supported" diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index ef6b10f66c1..44ce0ddef25 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -1,18 +1,17 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import warnings -from typing import Tuple, Union +from typing import TYPE_CHECKING import numpy as np import cudf -from cudf._typing import ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, is_bool_dtype, is_scalar, ) -from cudf.core.column import ColumnBase from cudf.core.dtypes import CategoricalDtype from cudf.utils.dtypes import ( _can_cast, @@ -21,6 +20,10 @@ is_mixed_with_object_dtype, ) +if TYPE_CHECKING: + from cudf._typing import ScalarLike + from cudf.core.column import ColumnBase + def _normalize_categorical(input_col, other): if isinstance(input_col, cudf.core.column.CategoricalColumn): @@ -41,9 +44,9 @@ def _normalize_categorical(input_col, other): def _check_and_cast_columns_with_other( source_col: ColumnBase, - other: Union[ScalarLike, ColumnBase], + other: ScalarLike | ColumnBase, inplace: bool, -) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]: +) -> tuple[ColumnBase, ScalarLike | ColumnBase]: # Returns type-casted `source_col` & `other` based on `inplace`. 
source_dtype = source_col.dtype if isinstance(source_dtype, CategoricalDtype): diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py index bf6f9f1a3c1..80dbbe4c048 100644 --- a/python/cudf/cudf/core/buffer/buffer.py +++ b/python/cudf/cudf/core/buffer/buffer.py @@ -6,7 +6,7 @@ import pickle import weakref from types import SimpleNamespace -from typing import Any, Dict, Literal, Mapping, Optional, Tuple +from typing import Any, Literal, Mapping import numpy from typing_extensions import Self @@ -42,7 +42,7 @@ def host_memory_allocation(nbytes: int) -> memoryview: def cuda_array_interface_wrapper( ptr: int, size: int, - owner: Optional[object] = None, + owner: object | None = None, readonly=False, typestr="|u1", version=0, @@ -278,7 +278,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: return self._ptr def memoryview( - self, *, offset: int = 0, size: Optional[int] = None + self, *, offset: int = 0, size: int | None = None ) -> memoryview: """Read-only access to the buffer through host memory.""" size = self._size if size is None else size @@ -319,7 +319,7 @@ def __init__( *, owner: BufferOwner, offset: int = 0, - size: Optional[int] = None, + size: int | None = None, ) -> None: size = owner.size if size is None else size if size < 0: @@ -414,7 +414,7 @@ def __cuda_array_interface__(self) -> Mapping: "version": 0, } - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: """Serialize the buffer into header and frames. The frames can be a mixture of memoryview, Buffer, and BufferOwner @@ -427,7 +427,7 @@ def serialize(self) -> Tuple[dict, list]: serializable metadata required to reconstruct the object. The second element is a list containing single frame. 
""" - header: Dict[str, Any] = {} + header: dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) header["owner-type-serialized"] = pickle.dumps(type(self._owner)) header["frame_count"] = 1 @@ -480,7 +480,7 @@ def __str__(self) -> str: ) -def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]: +def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]: """Retrieve the pointer and size from an array interface. Raises ValueError if array isn't C-contiguous. diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py index 15f00fc670d..0bd8d6054b3 100644 --- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py +++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Literal, Mapping, Optional +from typing import Literal, Mapping from typing_extensions import Self @@ -27,7 +27,7 @@ def __init__( self, owner: BufferOwner, offset: int = 0, - size: Optional[int] = None, + size: int | None = None, ) -> None: super().__init__(owner=owner, offset=offset, size=size) self.owner._slices.add(self) diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py index cd81149bdb8..762cd7f9e86 100644 --- a/python/cudf/cudf/core/buffer/spill_manager.py +++ b/python/cudf/cudf/core/buffer/spill_manager.py @@ -13,15 +13,17 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial -from typing import Dict, List, Optional, Tuple +from typing import TYPE_CHECKING import rmm.mr -from cudf.core.buffer.spillable_buffer import SpillableBufferOwner from cudf.options import get_option from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.string import format_bytes +if TYPE_CHECKING: + from cudf.core.buffer.spillable_buffer import SpillableBufferOwner + _spill_cudf_nvtx_annotate = partial( 
_cudf_nvtx_annotate, domain="cudf_python-spill" ) @@ -37,7 +39,7 @@ def get_traceback() -> str: def get_rmm_memory_resource_stack( mr: rmm.mr.DeviceMemoryResource, -) -> List[rmm.mr.DeviceMemoryResource]: +) -> list[rmm.mr.DeviceMemoryResource]: """Get the RMM resource stack Parameters @@ -97,14 +99,14 @@ class Expose: total_nbytes: int = 0 spilled_nbytes: int = 0 - spill_totals: Dict[Tuple[str, str], Tuple[int, float]] + spill_totals: dict[tuple[str, str], tuple[int, float]] def __init__(self, level) -> None: self.lock = threading.Lock() self.level = level self.spill_totals = defaultdict(lambda: (0, 0)) # Maps each traceback to a Expose - self.exposes: Dict[str, SpillStatistics.Expose] = {} + self.exposes: dict[str, SpillStatistics.Expose] = {} def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None: """Log a (un-)spilling event @@ -225,7 +227,7 @@ class SpillManager: def __init__( self, *, - device_memory_limit: Optional[int] = None, + device_memory_limit: int | None = None, statistic_level: int = 0, ) -> None: self._lock = threading.Lock() @@ -296,7 +298,7 @@ def add(self, buffer: SpillableBufferOwner) -> None: def buffers( self, order_by_access_time: bool = False - ) -> Tuple[SpillableBufferOwner, ...]: + ) -> tuple[SpillableBufferOwner, ...]: """Get all managed buffers Parameters @@ -345,7 +347,7 @@ def spill_device_memory(self, nbytes: int) -> int: buf.lock.release() return spilled - def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int: + def spill_to_device_limit(self, device_limit: int | None = None) -> int: """Try to spill device memory until device limit Notice, by default this is a no-op. 
@@ -400,10 +402,10 @@ def __repr__(self) -> str: # - Initialized to None (spilling disabled) # - Initialized to a SpillManager instance (spilling enabled) _global_manager_uninitialized: bool = True -_global_manager: Optional[SpillManager] = None +_global_manager: SpillManager | None = None -def set_global_manager(manager: Optional[SpillManager]) -> None: +def set_global_manager(manager: SpillManager | None) -> None: """Set the global manager, which if None disables spilling""" global _global_manager, _global_manager_uninitialized @@ -417,7 +419,7 @@ def set_global_manager(manager: Optional[SpillManager]) -> None: _global_manager_uninitialized = False -def get_global_manager() -> Optional[SpillManager]: +def get_global_manager() -> SpillManager | None: """Get the global manager or None if spilling is disabled""" global _global_manager_uninitialized if _global_manager_uninitialized: diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 49258fea9ab..eb57a371965 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -7,7 +7,7 @@ import time import weakref from threading import RLock -from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple +from typing import TYPE_CHECKING, Any, Literal import numpy from typing_extensions import Self @@ -88,10 +88,10 @@ class SpillableBufferOwner(BufferOwner): lock: RLock _spill_locks: weakref.WeakSet _last_accessed: float - _ptr_desc: Dict[str, Any] + _ptr_desc: dict[str, Any] _manager: SpillManager - def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None: + def _finalize_init(self, ptr_desc: dict[str, Any]) -> None: """Finish initialization of the spillable buffer This implements the common initialization that `from_device_memory` @@ -297,7 +297,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int: self._last_accessed = time.monotonic() return self._ptr - def 
memory_info(self) -> Tuple[int, int, str]: + def memory_info(self) -> tuple[int, int, str]: """Get pointer, size, and device type of this buffer. Warning, it is not safe to access the pointer value without @@ -341,7 +341,7 @@ def __cuda_array_interface__(self) -> dict: } def memoryview( - self, *, offset: int = 0, size: Optional[int] = None + self, *, offset: int = 0, size: int | None = None ) -> memoryview: size = self._size if size is None else size with self.lock: @@ -388,11 +388,11 @@ def spillable(self) -> bool: def spill_lock(self, spill_lock: SpillLock) -> None: self._owner.spill_lock(spill_lock=spill_lock) - def memory_info(self) -> Tuple[int, int, str]: + def memory_info(self) -> tuple[int, int, str]: (ptr, _, device_type) = self._owner.memory_info() return (ptr + self._offset, self.nbytes, device_type) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: """Serialize the Buffer Normally, we would use `[self]` as the frames. This would work but @@ -411,8 +411,8 @@ def serialize(self) -> Tuple[dict, list]: given to `.deserialize()`, otherwise we would have a `Buffer` pointing to memory already owned by an existing `SpillableBufferOwner`. 
""" - header: Dict[str, Any] = {} - frames: List[Buffer | memoryview] + header: dict[str, Any] = {} + frames: list[Buffer | memoryview] with self._owner.lock: header["type-serialized"] = pickle.dumps(self.__class__) header["owner-type-serialized"] = pickle.dumps(type(self._owner)) diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py index 3346d05ed4a..42a1501c914 100644 --- a/python/cudf/cudf/core/buffer/utils.py +++ b/python/cudf/cudf/core/buffer/utils.py @@ -4,7 +4,7 @@ import threading from contextlib import ContextDecorator -from typing import Any, Dict, Optional, Tuple, Type, Union +from typing import Any from cudf.core.buffer.buffer import ( Buffer, @@ -22,7 +22,7 @@ from cudf.options import get_option -def get_buffer_owner(data: Any) -> Optional[BufferOwner]: +def get_buffer_owner(data: Any) -> BufferOwner | None: """Get the owner of `data`, if one exists Search through the stack of data owners in order to find an @@ -47,10 +47,10 @@ def get_buffer_owner(data: Any) -> Optional[BufferOwner]: def as_buffer( - data: Union[int, Any], + data: int | Any, *, - size: Optional[int] = None, - owner: Optional[object] = None, + size: int | None = None, + owner: object | None = None, exposed: bool = False, ) -> Buffer: """Factory function to wrap `data` in a Buffer object. 
@@ -117,8 +117,8 @@ def as_buffer( ) # Find the buffer types to return based on the current config - owner_class: Type[BufferOwner] - buffer_class: Type[Buffer] + owner_class: type[BufferOwner] + buffer_class: type[Buffer] if get_global_manager() is not None: owner_class = SpillableBufferOwner buffer_class = SpillableBuffer @@ -161,7 +161,7 @@ def as_buffer( return buffer_class(owner=owner, offset=ptr - base_ptr, size=size) -_thread_spill_locks: Dict[int, Tuple[Optional[SpillLock], int]] = {} +_thread_spill_locks: dict[int, tuple[SpillLock | None, int]] = {} def _push_thread_spill_lock() -> None: @@ -193,7 +193,7 @@ class acquire_spill_lock(ContextDecorator): pushing and popping from `_thread_spill_locks` using its thread ID. """ - def __enter__(self) -> Optional[SpillLock]: + def __enter__(self) -> SpillLock | None: _push_thread_spill_lock() return get_spill_lock() @@ -201,7 +201,7 @@ def __exit__(self, *exc): _pop_thread_spill_lock() -def get_spill_lock() -> Union[SpillLock, None]: +def get_spill_lock() -> SpillLock | None: """Return a spill lock within the context of `acquire_spill_lock` or None Returns None, if spilling is disabled. 
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index de20b2ace1d..f538180805b 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -3,21 +3,17 @@ from __future__ import annotations import warnings -from collections import abc from functools import cached_property -from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from typing_extensions import Self import cudf from cudf import _lib as libcudf from cudf._lib.transform import bools_to_mask -from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike -from cudf.core.buffer import Buffer from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -29,7 +25,19 @@ ) if TYPE_CHECKING: - from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex + from collections import abc + + import numba.cuda + + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + Dtype, + ScalarLike, + SeriesOrIndex, + SeriesOrSingleColumnIndex, + ) + from cudf.core.buffer import Buffer from cudf.core.column import ( ColumnBase, DatetimeColumn, @@ -131,7 +139,7 @@ def ordered(self) -> bool: """ return self._column.ordered - def as_ordered(self) -> Optional[SeriesOrIndex]: + def as_ordered(self) -> SeriesOrIndex | None: """ Set the Categorical to be ordered. @@ -167,7 +175,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]: """ return self._return_or_inplace(self._column.as_ordered(ordered=True)) - def as_unordered(self) -> Optional[SeriesOrIndex]: + def as_unordered(self) -> SeriesOrIndex | None: """ Set the Categorical to be unordered. 
@@ -214,7 +222,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]: """ return self._return_or_inplace(self._column.as_ordered(ordered=False)) - def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: + def add_categories(self, new_categories: Any) -> SeriesOrIndex | None: """ Add new categories. @@ -286,7 +294,7 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]: def remove_categories( self, removals: Any, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Remove the specified categories. @@ -362,7 +370,7 @@ def set_categories( new_categories: Any, ordered: bool = False, rename: bool = False, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Set the categories to the specified new_categories. @@ -435,7 +443,7 @@ def reorder_categories( self, new_categories: Any, ordered: bool = False, - ) -> Optional[SeriesOrIndex]: + ) -> SeriesOrIndex | None: """ Reorder categories as specified in new_categories. @@ -513,8 +521,8 @@ class CategoricalColumn(column.ColumnBase): """ dtype: cudf.core.dtypes.CategoricalDtype - _codes: Optional[NumericalColumn] - _children: Tuple[NumericalColumn] + _codes: NumericalColumn | None + _children: tuple[NumericalColumn] _VALID_REDUCTIONS = { "max", "min", @@ -531,11 +539,11 @@ class CategoricalColumn(column.ColumnBase): def __init__( self, dtype: CategoricalDtype, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, - children: Tuple["column.ColumnBase", ...] = (), + null_count: int | None = None, + children: tuple["column.ColumnBase", ...] 
= (), ): if size is None: for child in children: @@ -582,23 +590,23 @@ def set_base_data(self, value): def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: lhs = self # We need to convert values to same type as self, # hence passing dtype=self.dtype rhs = cudf.core.column.as_column(values, dtype=self.dtype) return lhs, rhs - def set_base_mask(self, value: Optional[Buffer]): + def set_base_mask(self, value: Buffer | None): super().set_base_mask(value) self._codes = None - def set_base_children(self, value: Tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[ColumnBase, ...]): super().set_base_children(value) self._codes = None @property - def children(self) -> Tuple[NumericalColumn]: + def children(self) -> tuple[NumericalColumn]: if self._children is None: codes_column = self.base_children[0] start = self.offset * codes_column.dtype.itemsize @@ -685,9 +693,7 @@ def _fill( libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar) return result - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: int | None = None) -> Self: codes = self.codes.slice(start, stop, stride) return cast( Self, @@ -706,7 +712,7 @@ def slice( def _reduce( self, op: str, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, *args, **kwargs, @@ -868,7 +874,7 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase": def data_array_view( self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + ) -> numba.cuda.devicearray.DeviceNDArray: return self.codes.data_array_view(mode=mode) def unique(self) -> CategoricalColumn: @@ -1065,7 +1071,7 @@ def notnull(self) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """ Fill null values with *fill_value* @@ -1199,7 +1205,7 @@ def 
memory_usage(self) -> int: def _mimic_inplace( self, other_col: ColumnBase, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: out = super()._mimic_inplace(other_col, inplace=inplace) if inplace and isinstance(other_col, CategoricalColumn): self._codes = other_col._codes @@ -1460,7 +1466,7 @@ def _create_empty_categorical_column( def pandas_categorical_as_column( - categorical: ColumnLike, codes: Optional[ColumnLike] = None + categorical: ColumnLike, codes: ColumnLike | None = None ) -> CategoricalColumn: """Creates a CategoricalColumn from a pandas.Categorical diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 001e8996c19..c4e715aeb45 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,24 +2,12 @@ from __future__ import annotations -import builtins import pickle from collections import abc from functools import cached_property from itertools import chain from types import SimpleNamespace -from typing import ( - Any, - Dict, - List, - Literal, - MutableSequence, - Optional, - Sequence, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast import cupy import numpy as np @@ -49,7 +37,6 @@ ) from cudf._lib.transform import bools_to_mask from cudf._lib.types import size_type_dtype -from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -89,6 +76,11 @@ ) from cudf.utils.utils import _array_ufunc, mask_dtype +if TYPE_CHECKING: + import builtins + + from cudf._typing import ColumnLike, Dtype, ScalarLike + if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray else: @@ -390,7 +382,7 @@ def _fill( begin: int, end: int, inplace: bool = False, - ) -> Optional[Self]: + ) -> Self | None: if end <= begin or begin >= self.size: return self if inplace else self.copy() @@ -528,9 +520,7 @@ def 
element_indexing(self, index: int): raise IndexError("single positional indexer is out-of-bounds") return libcudf.copying.get_element(self, idx).value - def slice( - self, start: int, stop: int, stride: Optional[int] = None - ) -> Self: + def slice(self, start: int, stop: int, stride: int | None = None) -> Self: stride = 1 if stride is None else stride if start < 0: start = start + len(self) @@ -566,7 +556,7 @@ def __setitem__(self, key: Any, value: Any): else as_column(value, dtype=self.dtype) ) - out: Optional[ColumnBase] # If None, no need to perform mimic inplace. + out: ColumnBase | None # If None, no need to perform mimic inplace. if isinstance(key, slice): out = self._scatter_by_slice(key, value_normalized) else: @@ -589,8 +579,8 @@ def _wrap_binop_normalization(self, other): def _scatter_by_slice( self, key: builtins.slice, - value: Union[cudf.core.scalar.Scalar, ColumnBase], - ) -> Optional[Self]: + value: cudf.core.scalar.Scalar | ColumnBase, + ) -> Self | None: """If this function returns None, it's either a no-op (slice is empty), or the inplace replacement is already performed (fill-in-place). """ @@ -626,7 +616,7 @@ def _scatter_by_slice( def _scatter_by_column( self, key: cudf.core.column.NumericalColumn, - value: Union[cudf.core.scalar.Scalar, ColumnBase], + value: cudf.core.scalar.Scalar | ColumnBase, ) -> Self: if is_bool_dtype(key.dtype): # `key` is boolean mask @@ -663,7 +653,7 @@ def _scatter_by_column( ]._with_type_metadata(self.dtype) def _check_scatter_key_length( - self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase] + self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase ) -> None: """`num_keys` is the number of keys to scatter. Should equal to the number of rows in ``value`` if ``value`` is a column. @@ -678,7 +668,7 @@ def _check_scatter_key_length( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """Fill null values with ``value``. 
@@ -736,7 +726,7 @@ def indices_of( [as_column(range(0, len(self)), dtype=size_type_dtype)], mask )[0] - def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]: + def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: indices = self.indices_of(value) if n := len(indices): return ( @@ -852,7 +842,7 @@ def isin(self, values: Sequence) -> ColumnBase: def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: """ Helper function for `isin` which pre-process `values` based on `self`. """ @@ -864,7 +854,7 @@ def _process_values_for_isin( rhs = rhs.astype(lhs.dtype) return lhs, rhs - def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]: + def _isin_earlystop(self, rhs: ColumnBase) -> ColumnBase | None: """ Helper function for `isin` which determines possibility of early-stopping or not. @@ -1066,7 +1056,7 @@ def as_string_column( def as_decimal_column( self, dtype: Dtype - ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]: + ) -> "cudf.core.column.decimal.DecimalBaseColumn": raise NotImplementedError def apply_boolean_mask(self, mask) -> ColumnBase: @@ -1118,6 +1108,11 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]: def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return _array_ufunc(self, ufunc, method, inputs, kwargs) + def __invert__(self): + raise TypeError( + f"Operation `~` not supported on {self.dtype.type.__name__}" + ) + def searchsorted( self, value, @@ -1145,7 +1140,7 @@ def unique(self) -> ColumnBase: self.dtype ) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: # data model: # Serialization produces a nested metadata "header" and a flattened @@ -1158,7 +1153,7 @@ def serialize(self) -> Tuple[dict, list]: # cudf native or foreign some special-casing is required here for # serialization. 
- header: Dict[Any, Any] = {} + header: dict[Any, Any] = {} frames = [] header["type-serialized"] = pickle.dumps(type(self)) try: @@ -1191,7 +1186,7 @@ def serialize(self) -> Tuple[dict, list]: @classmethod def deserialize(cls, header: dict, frames: list) -> ColumnBase: - def unpack(header, frames) -> Tuple[Any, list]: + def unpack(header, frames) -> tuple[Any, list]: count = header["frame_count"] klass = pickle.loads(header["type-serialized"]) obj = klass.deserialize(header, frames[:count]) @@ -1238,13 +1233,13 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, ScalarLike]: + ) -> ColumnBase | ScalarLike: raise NotImplementedError def _reduce( self, op: str, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, *args, **kwargs, @@ -1265,8 +1260,8 @@ def _reduce( return preprocessed def _process_for_reduction( - self, skipna: Optional[bool] = None, min_count: int = 0 - ) -> Union[ColumnBase, ScalarLike]: + self, skipna: bool | None = None, min_count: int = 0 + ) -> ColumnBase | ScalarLike: if skipna is None: skipna = True @@ -1306,8 +1301,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase: def _label_encoding( self, cats: ColumnBase, - dtype: Optional[Dtype] = None, - na_sentinel: Optional[ScalarLike] = None, + dtype: Dtype | None = None, + na_sentinel: ScalarLike | None = None, ): """ Convert each value in `self` into an integer code, with `cats` @@ -1380,9 +1375,9 @@ def _return_sentinel_column(): def column_empty_like( column: ColumnBase, - dtype: Optional[Dtype] = None, + dtype: Dtype | None = None, masked: bool = False, - newsize: Optional[int] = None, + newsize: int | None = None, ) -> ColumnBase: """Allocate a new column like the given *column*""" if dtype is None: @@ -1437,7 +1432,7 @@ def column_empty( ) -> ColumnBase: """Allocate a new column like the given row_count and dtype.""" dtype = cudf.dtype(dtype) - children = () # type: 
Tuple[ColumnBase, ...] + children: tuple[ColumnBase, ...] = () if isinstance(dtype, StructDtype): data = None @@ -1487,14 +1482,14 @@ def column_empty( def build_column( - data: Union[Buffer, None], + data: Buffer | None, dtype: Dtype, *, - size: Optional[int] = None, - mask: Optional[Buffer] = None, + size: int | None = None, + mask: Buffer | None = None, offset: int = 0, - null_count: Optional[int] = None, - children: Tuple[ColumnBase, ...] = (), + null_count: int | None = None, + children: tuple[ColumnBase, ...] = (), ) -> ColumnBase: """ Build a Column of the appropriate type from the given parameters @@ -1656,10 +1651,10 @@ def build_column( def build_categorical_column( categories: ColumnBase, codes: ColumnBase, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ordered: bool = False, ) -> "cudf.core.column.CategoricalColumn": """ @@ -1706,7 +1701,7 @@ def check_invalid_array(shape: tuple, dtype): raise TypeError("Unsupported type float16") -def as_memoryview(arbitrary: Any) -> Optional[memoryview]: +def as_memoryview(arbitrary: Any) -> memoryview | None: try: return memoryview(arbitrary) except TypeError: @@ -1715,9 +1710,9 @@ def as_memoryview(arbitrary: Any) -> Optional[memoryview]: def as_column( arbitrary: Any, - nan_as_null: Optional[bool] = None, - dtype: Optional[Dtype] = None, - length: Optional[int] = None, + nan_as_null: bool | None = None, + dtype: Dtype | None = None, + length: int | None = None, ): """Create a Column from an arbitrary object @@ -2190,7 +2185,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer: raise NotImplementedError(f"Cannot infer mask from typestr {typestr}") -def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: +def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: """ Return the headers and frames 
resulting from serializing a list of Column @@ -2207,7 +2202,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: frames : list list of frames """ - headers: List[Dict[Any, Any]] = [] + headers: list[dict[Any, Any]] = [] frames = [] if len(columns) > 0: @@ -2219,7 +2214,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]: return headers, frames -def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]: +def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]: """ Construct a list of Columns from a list of headers and frames. diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 057169aa7e1..7fdebda7d76 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -8,7 +8,7 @@ import locale import re from locale import nl_langinfo -from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast +from typing import TYPE_CHECKING, Any, Literal, Sequence, cast import numpy as np import pandas as pd @@ -19,22 +19,22 @@ from cudf import _lib as libcudf from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted -from cudf._typing import ( - ColumnBinaryOperand, - DatetimeLikeScalar, - Dtype, - DtypeObj, - ScalarLike, -) from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype from cudf.core._compat import PANDAS_GE_220 -from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype from cudf.utils.utils import _all_bools_with_nulls if TYPE_CHECKING: + from cudf._typing import ( + ColumnBinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ScalarLike, + ) + from cudf.core.buffer import Buffer from cudf.core.column.numerical import NumericalColumn if 
PANDAS_GE_220: @@ -242,10 +242,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make non-optional + mask: Buffer | None = None, + size: int | None = None, # TODO: make non-optional offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) if dtype.kind != "M": @@ -499,7 +499,7 @@ def mean( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, @@ -511,7 +511,7 @@ def std( * _unit_to_nanoseconds_conversion[self.time_unit], ).as_unit(self.time_unit) - def median(self, skipna: Optional[bool] = None) -> pd.Timestamp: + def median(self, skipna: bool | None = None) -> pd.Timestamp: return pd.Timestamp( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, @@ -631,7 +631,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -703,7 +703,7 @@ def _with_type_metadata(self, dtype): def _find_ambiguous_and_nonexistent( self, zone_name: str - ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]: + ) -> tuple[NumericalColumn, NumericalColumn] | tuple[bool, bool]: """ Recognize ambiguous and nonexistent timestamps for the given timezone. 
@@ -822,10 +822,10 @@ def __init__( self, data: Buffer, dtype: pd.DatetimeTZDtype, - mask: Optional[Buffer] = None, - size: Optional[int] = None, + mask: Buffer | None = None, + size: int | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): super().__init__( data=data, diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3a0f6649e21..e9d9b4933e5 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -4,7 +4,7 @@ import warnings from decimal import Decimal -from typing import Any, Optional, Sequence, Union, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import cupy as cp import numpy as np @@ -16,7 +16,6 @@ from cudf._lib.strings.convert.convert_fixed_point import ( from_decimal as cpp_from_decimal, ) -from cudf._typing import ColumnBinaryOperand, Dtype from cudf.api.types import is_integer_dtype, is_scalar from cudf.core.buffer import as_buffer from cudf.core.column import ColumnBase @@ -31,6 +30,9 @@ from .numerical_base import NumericalBaseColumn +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, Dtype + class DecimalBaseColumn(NumericalBaseColumn): """Base column for decimal32, decimal64 or decimal128 columns""" @@ -47,7 +49,7 @@ def __cuda_array_interface__(self): def as_decimal_column( self, dtype: Dtype, - ) -> Union["DecimalBaseColumn"]: + ) -> "DecimalBaseColumn": if ( isinstance(dtype, cudf.core.dtypes.DecimalDtype) and dtype.scale < self.dtype.scale @@ -136,7 +138,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """Fill null values with ``value``. 
@@ -197,7 +199,7 @@ def normalize_binop_value(self, other): return NotImplemented def _decimal_quantile( - self, q: Union[float, Sequence[float]], interpolation: str, exact: bool + self, q: float | Sequence[float], interpolation: str, exact: bool ) -> ColumnBase: quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q # get sorted indices and exclude nulls diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 8f8ee46c796..c548db67344 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -3,7 +3,7 @@ from __future__ import annotations from functools import cached_property -from typing import List, Optional, Sequence, Tuple, Union +from typing import TYPE_CHECKING, Sequence import numpy as np import pandas as pd @@ -26,13 +26,15 @@ ) from cudf._lib.strings.convert.convert_lists import format_list_column from cudf._lib.types import size_type_dtype -from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.column import ColumnBase, as_column, column from cudf.core.column.methods import ColumnMethods, ParentType from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike + class ListColumn(ColumnBase): dtype: ListDtype @@ -165,7 +167,7 @@ def set_base_data(self, value): else: super().set_base_data(value) - def set_base_children(self, value: Tuple[ColumnBase, ...]): + def set_base_children(self, value: tuple[ColumnBase, ...]): super().set_base_children(value) _, values = value self._dtype = cudf.ListDtype(element_type=values.dtype) @@ -267,7 +269,7 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self: # as ``self``, but with the leaf column transformed # by applying ``func`` to it - cc: List[ListColumn] = [] + cc: list[ListColumn] = [] c: ColumnBase = self 
while isinstance(c, ListColumn): @@ -318,7 +320,7 @@ def __init__(self, parent: ParentType): def get( self, index: int, - default: Optional[Union[ScalarLike, ColumnLike]] = None, + default: ScalarLike | ColumnLike | None = None, ) -> ParentType: """ Extract element at the given index from each list in a Series of lists. @@ -422,7 +424,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: contains_scalar(self._column, cudf.Scalar(search_key)) ) - def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType: + def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ Returns integers representing the index of the search key for each row. diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index 7f7355c571a..7c6f4e05577 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional, Union, overload +from typing import Union, overload from typing_extensions import Literal @@ -52,7 +52,7 @@ def _return_or_inplace( inplace: bool = False, expand: bool = False, retain_index: bool = True, - ) -> Optional[ParentType]: ... + ) -> ParentType | None: ... 
def _return_or_inplace( self, new_col, inplace=False, expand=False, retain_index=True diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 6fb4f17b76d..098cf43421b 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,7 @@ from __future__ import annotations import functools -from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast +from typing import TYPE_CHECKING, Any, Callable, Sequence, cast import cupy as cp import numpy as np @@ -14,13 +14,6 @@ from cudf import _lib as libcudf from cudf._lib import pylibcudf from cudf._lib.types import size_type_dtype -from cudf._typing import ( - ColumnBinaryOperand, - ColumnLike, - Dtype, - DtypeObj, - ScalarLike, -) from cudf.api.types import ( is_bool_dtype, is_float_dtype, @@ -28,7 +21,6 @@ is_integer_dtype, is_scalar, ) -from cudf.core.buffer import Buffer from cudf.core.column import ( ColumnBase, as_column, @@ -48,6 +40,16 @@ from .numerical_base import NumericalBaseColumn +if TYPE_CHECKING: + from cudf._typing import ( + ColumnBinaryOperand, + ColumnLike, + Dtype, + DtypeObj, + ScalarLike, + ) + from cudf.core.buffer import Buffer + _unaryop_map = { "ASIN": "ARCSIN", "ACOS": "ARCCOS", @@ -74,10 +76,10 @@ def __init__( self, data: Buffer, dtype: DtypeObj, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make this non-optional + mask: Buffer | None = None, + size: int | None = None, # TODO: make this non-optional offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) @@ -168,7 +170,7 @@ def __setitem__(self, key: Any, value: Any): else: device_value = device_value.astype(self.dtype) - out: Optional[ColumnBase] # If None, no need to perform mimic inplace. + out: ColumnBase | None # If None, no need to perform mimic inplace. 
if isinstance(key, slice): out = self._scatter_by_slice(key, device_value) else: @@ -185,7 +187,7 @@ def __setitem__(self, key: Any, value: Any): if out: self._mimic_inplace(out, inplace=True) - def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: + def unary_operator(self, unaryop: str | Callable) -> ColumnBase: if callable(unaryop): return libcudf.transform.transform(self, unaryop) @@ -194,6 +196,14 @@ def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase: unaryop = pylibcudf.unary.UnaryOperator[unaryop] return libcudf.unary.unary_operation(self, unaryop) + def __invert__(self): + if self.dtype.kind in "ui": + return self.unary_operator("invert") + elif self.dtype.kind == "b": + return self.unary_operator("not") + else: + return super().__invert__() + def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: int_float_dtype_mapping = { np.int8: np.float32, @@ -283,7 +293,7 @@ def nans_to_nulls(self: Self) -> Self: def normalize_binop_value( self, other: ScalarLike - ) -> Union[ColumnBase, cudf.Scalar]: + ) -> ColumnBase | cudf.Scalar: if isinstance(other, ColumnBase): if not isinstance(other, NumericalColumn): return NotImplemented @@ -403,7 +413,7 @@ def nan_count(self) -> int: def _process_values_for_isin( self, values: Sequence - ) -> Tuple[ColumnBase, ColumnBase]: + ) -> tuple[ColumnBase, ColumnBase]: lhs = cast("cudf.core.column.ColumnBase", self) try: rhs = as_column(values, nan_as_null=False) @@ -437,12 +447,12 @@ def _process_values_for_isin( return lhs, rhs - def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) def _process_for_reduction( - self, skipna: Optional[bool] = None, min_count: int = 0 - ) -> Union[NumericalColumn, ScalarLike]: + self, skipna: bool | None = None, min_count: int = 0 + ) -> NumericalColumn | ScalarLike: skipna = True if skipna is None else skipna if 
self._can_return_nan(skipna=skipna): @@ -525,7 +535,7 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: """ Fill null values with *fill_value* @@ -711,7 +721,7 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype: def _normalize_find_and_replace_input( - input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list] + input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list ) -> ColumnBase: normalized_column = column.as_column( col_to_normalize, diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index d38ec9cf30f..95c78c5efcb 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -3,17 +3,19 @@ from __future__ import annotations -from typing import Optional, cast +from typing import TYPE_CHECKING, cast import numpy as np import cudf from cudf import _lib as libcudf -from cudf._typing import ScalarLike from cudf.core.column import ColumnBase from cudf.core.missing import NA from cudf.core.mixins import Scannable +if TYPE_CHECKING: + from cudf._typing import ScalarLike + class NumericalBaseColumn(ColumnBase, Scannable): """A column composed of numerical data. 
@@ -40,10 +42,10 @@ class NumericalBaseColumn(ColumnBase, Scannable): "cummax", } - def _can_return_nan(self, skipna: Optional[bool] = None) -> bool: + def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls() - def kurtosis(self, skipna: Optional[bool] = None) -> float: + def kurtosis(self, skipna: bool | None = None) -> float: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -68,7 +70,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float: kurt = term_one_section_one * term_one_section_two - 3 * term_two return kurt - def skew(self, skipna: Optional[bool] = None) -> ScalarLike: + def skew(self, skipna: bool | None = None) -> ScalarLike: skipna = True if skipna is None else skipna if len(self) == 0 or self._can_return_nan(skipna=skipna): @@ -140,7 +142,7 @@ def quantile( def mean( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ): @@ -150,7 +152,7 @@ def mean( def var( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ddof=1, @@ -161,7 +163,7 @@ def var( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype=np.float64, ddof=1, @@ -170,7 +172,7 @@ def std( "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof ) - def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn: + def median(self, skipna: bool | None = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna if self._can_return_nan(skipna=skipna): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index ad7dbe5e52e..2451a9cc0af 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5,22 +5,11 @@ import re import warnings from functools import cached_property -from typing import ( - TYPE_CHECKING, - Any, - 
Optional, - Sequence, - Tuple, - Union, - cast, - overload, -) - -import cupy +from typing import TYPE_CHECKING, Any, Sequence, cast, overload + import numpy as np import pandas as pd import pyarrow as pa -from numba import cuda from typing_extensions import Self import cudf @@ -30,7 +19,6 @@ from cudf._lib.column import Column from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype -from cudf.core.buffer import Buffer from cudf.core.column import column, datetime from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods @@ -46,6 +34,9 @@ def str_to_boolean(column: StringColumn): if TYPE_CHECKING: + import cupy + import numba.cuda + from cudf._typing import ( ColumnBinaryOperand, ColumnLike, @@ -53,6 +44,7 @@ def str_to_boolean(column: StringColumn): ScalarLike, SeriesOrIndex, ) + from cudf.core.buffer import Buffer _str_to_numeric_typecast_functions = { @@ -256,13 +248,13 @@ def byte_count(self) -> SeriesOrIndex: @overload def cat( - self, sep: Optional[str] = None, na_rep: Optional[str] = None + self, sep: str | None = None, na_rep: str | None = None ) -> str: ... @overload def cat( - self, others, sep: Optional[str] = None, na_rep: Optional[str] = None - ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ... + self, others, sep: str | None = None, na_rep: str | None = None + ) -> SeriesOrIndex | "cudf.core.column.string.StringColumn": ... 
def cat(self, others=None, sep=None, na_rep=None): """ @@ -640,7 +632,7 @@ def extract( def contains( self, - pat: Union[str, Sequence], + pat: str | Sequence, case: bool = True, flags: int = 0, na=np.nan, @@ -791,7 +783,7 @@ def contains( result_col = libstrings.contains_multiple(input_column, pat) return self._return_or_inplace(result_col) - def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: + def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: """ Test if a like pattern matches a string of a Series or Index. @@ -862,7 +854,7 @@ def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex: def repeat( self, - repeats: Union[int, Sequence], + repeats: int | Sequence, ) -> SeriesOrIndex: """ Duplicate each string in the Series or Index. @@ -919,8 +911,8 @@ def repeat( def replace( self, - pat: Union[str, Sequence], - repl: Union[str, Sequence], + pat: str | Sequence, + repl: str | Sequence, n: int = -1, case=None, flags: int = 0, @@ -1073,9 +1065,9 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex: def slice( self, - start: Optional[int] = None, - stop: Optional[int] = None, - step: Optional[int] = None, + start: int | None = None, + stop: int | None = None, + step: int | None = None, ) -> SeriesOrIndex: """ Slice substrings from each element in the Series or Index. @@ -2050,7 +2042,7 @@ def istitle(self) -> SeriesOrIndex: return self._return_or_inplace(libstrings.is_title(self._column)) def filter_alphanum( - self, repl: Optional[str] = None, keep: bool = True + self, repl: str | None = None, keep: bool = True ) -> SeriesOrIndex: """ Remove non-alphanumeric characters from strings in this column. 
@@ -2137,9 +2129,9 @@ def slice_from( def slice_replace( self, - start: Optional[int] = None, - stop: Optional[int] = None, - repl: Optional[str] = None, + start: int | None = None, + stop: int | None = None, + repl: str | None = None, ) -> SeriesOrIndex: """ Replace the specified section of each string with a new string. @@ -2227,9 +2219,7 @@ def slice_replace( ), ) - def insert( - self, start: int = 0, repl: Optional[str] = None - ) -> SeriesOrIndex: + def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex: """ Insert the specified string into each string in the specified position. @@ -2409,10 +2399,10 @@ def get_json_object( def split( self, - pat: Optional[str] = None, + pat: str | None = None, n: int = -1, expand: bool = False, - regex: Optional[bool] = None, + regex: bool | None = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -2577,10 +2567,10 @@ def split( def rsplit( self, - pat: Optional[str] = None, + pat: str | None = None, n: int = -1, expand: bool = False, - regex: Optional[bool] = None, + regex: bool | None = None, ) -> SeriesOrIndex: """ Split strings around given separator/delimiter. @@ -3232,7 +3222,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex: libstrings.rjust(self._column, width, fillchar) ) - def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def strip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3291,7 +3281,7 @@ def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: libstrings.strip(self._column, cudf.Scalar(to_strip, "str")) ) - def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. 
@@ -3338,7 +3328,7 @@ def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str")) ) - def rstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex: + def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex: r""" Remove leading and trailing characters. @@ -3843,7 +3833,7 @@ def endswith(self, pat: str) -> SeriesOrIndex: return self._return_or_inplace(result_col) - def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex: + def startswith(self, pat: str | Sequence) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. @@ -3995,7 +3985,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex: return self._return_or_inplace(result) def find( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings in the Series/Index @@ -4052,7 +4042,7 @@ def find( return self._return_or_inplace(result_col) def rfind( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return highest indexes in each strings in the Series/Index @@ -4113,7 +4103,7 @@ def rfind( return self._return_or_inplace(result_col) def index( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return lowest indexes in each strings where the substring @@ -4175,7 +4165,7 @@ def index( return result def rindex( - self, sub: str, start: int = 0, end: Optional[int] = None + self, sub: str, start: int = 0, end: int | None = None ) -> SeriesOrIndex: """ Return highest indexes in each strings where the substring @@ -4442,7 +4432,7 @@ def translate(self, table: dict) -> SeriesOrIndex: ) def filter_characters( - self, table: dict, keep: bool = True, repl: Optional[str] = None + self, table: dict, keep: bool = True, 
repl: str | None = None ) -> SeriesOrIndex: """ Remove characters from each string using the character ranges @@ -4923,7 +4913,7 @@ def ngrams_tokenize( ) def replace_tokens( - self, targets, replacements, delimiter: Optional[str] = None + self, targets, replacements, delimiter: str | None = None ) -> SeriesOrIndex: """ The targets tokens are searched for within each string in the series @@ -5008,8 +4998,8 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: Optional[str] = None, - delimiter: Optional[str] = None, + replacement: str | None = None, + delimiter: str | None = None, ) -> SeriesOrIndex: """ Remove tokens from within each string in the series that are @@ -5278,7 +5268,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: ) def minhash( - self, seeds: Optional[ColumnLike] = None, width: int = 4 + self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5321,7 +5311,7 @@ def minhash( ) def minhash64( - self, seeds: Optional[ColumnLike] = None, width: int = 4 + self, seeds: ColumnLike | None = None, width: int = 4 ) -> SeriesOrIndex: """ Compute the minhash of a strings column. @@ -5435,8 +5425,8 @@ class StringColumn(column.ColumnBase): respectively """ - _start_offset: Optional[int] - _end_offset: Optional[int] + _start_offset: int | None + _end_offset: int | None _VALID_BINARY_OPERATIONS = { "__eq__", @@ -5460,12 +5450,12 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Optional[Buffer] = None, - mask: Optional[Buffer] = None, - size: Optional[int] = None, # TODO: make non-optional + data: Buffer | None = None, + mask: Buffer | None = None, + size: int | None = None, # TODO: make non-optional offset: int = 0, - null_count: Optional[int] = None, - children: Tuple["column.ColumnBase", ...] = (), + null_count: int | None = None, + children: tuple["column.ColumnBase", ...] 
= (), ): dtype = cudf.api.types.dtype("object") @@ -5598,7 +5588,7 @@ def any(self, skipna: bool = True) -> bool: def data_array_view( self, *, mode="write" - ) -> cuda.devicearray.DeviceNDArray: + ) -> numba.cuda.devicearray.DeviceNDArray: raise ValueError("Cannot get an array view of a StringColumn") @property @@ -5633,8 +5623,8 @@ def to_arrow(self) -> pa.Array: def sum( self, - skipna: Optional[bool] = None, - dtype: Optional[Dtype] = None, + skipna: bool | None = None, + dtype: Dtype | None = None, min_count: int = 0, ): result_col = self._process_for_reduction( @@ -5851,7 +5841,7 @@ def find_and_replace( def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if not is_scalar(fill_value): @@ -5863,9 +5853,7 @@ def fillna( fill_value = cudf.Scalar(fill_value, dtype=self.dtype) return super().fillna(fill_value, method=method) - def normalize_binop_value( - self, other - ) -> Union[column.ColumnBase, cudf.Scalar]: + def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar: if ( isinstance(other, (column.ColumnBase, cudf.Scalar)) and other.dtype == "object" @@ -5929,8 +5917,8 @@ def _binaryop( # Explicit types are necessary because mypy infers ColumnBase # rather than StringColumn and sometimes forgets Scalar. 
- lhs: Union[cudf.Scalar, StringColumn] - rhs: Union[cudf.Scalar, StringColumn] + lhs: cudf.Scalar | StringColumn + rhs: cudf.Scalar | StringColumn lhs, rhs = (other, self) if reflect else (self, other) return cast( diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index 6dd35570b95..c2ce787eeae 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -2,17 +2,20 @@ from __future__ import annotations from functools import cached_property +from typing import TYPE_CHECKING import pandas as pd import pyarrow as pa import cudf -from cudf._typing import Dtype from cudf.core.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import StructDtype from cudf.core.missing import NA +if TYPE_CHECKING: + from cudf._typing import Dtype + class StructColumn(ColumnBase): """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index c6af052b56f..8eec84b64f7 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -4,7 +4,7 @@ import datetime import functools -from typing import Any, Optional, Sequence, cast +from typing import TYPE_CHECKING, Any, Sequence, cast import numpy as np import pandas as pd @@ -13,13 +13,15 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype from cudf.api.types import is_scalar, is_timedelta64_dtype from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype from cudf.utils.utils import _all_bools_with_nulls +if TYPE_CHECKING: + from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype + _unit_to_nanoseconds_conversion = { "ns": 1, "us": 1_000, @@ -75,10 +77,10 @@ def __init__( self, data: Buffer, dtype: Dtype, - size: Optional[int] = None, # TODO: make 
non-optional - mask: Optional[Buffer] = None, + size: int | None = None, # TODO: make non-optional + mask: Buffer | None = None, offset: int = 0, - null_count: Optional[int] = None, + null_count: int | None = None, ): dtype = cudf.dtype(dtype) if dtype.kind != "m": @@ -253,7 +255,7 @@ def time_unit(self) -> str: def fillna( self, fill_value: Any = None, - method: Optional[str] = None, + method: str | None = None, ) -> Self: if fill_value is not None: if cudf.utils.utils._isnat(fill_value): @@ -314,7 +316,7 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta: unit=self.time_unit, ).as_unit(self.time_unit) - def median(self, skipna: Optional[bool] = None) -> pd.Timedelta: + def median(self, skipna: bool | None = None) -> pd.Timedelta: return pd.Timedelta( self.as_numerical_column("int64").median(skipna=skipna), unit=self.time_unit, @@ -344,9 +346,9 @@ def quantile( def sum( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, - dtype: Optional[Dtype] = None, + dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only @@ -360,7 +362,7 @@ def sum( def std( self, - skipna: Optional[bool] = None, + skipna: bool | None = None, min_count: int = 0, dtype: Dtype = np.float64, ddof: int = 1, diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 9f3de061ee8..1bf9a393566 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -6,16 +6,7 @@ import sys from collections import abc from functools import cached_property, reduce -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Mapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Mapping import numpy as np import pandas as pd @@ -98,13 +89,13 @@ class ColumnAccessor(abc.MutableMapping): column length and type """ - _data: "Dict[Any, ColumnBase]" + 
_data: "dict[Any, ColumnBase]" multiindex: bool - _level_names: Tuple[Any, ...] + _level_names: tuple[Any, ...] def __init__( self, - data: Union[abc.MutableMapping, ColumnAccessor, None] = None, + data: abc.MutableMapping | ColumnAccessor | None = None, multiindex: bool = False, level_names=None, rangeindex: bool = False, @@ -210,7 +201,7 @@ def _from_columns_like_self( ) @property - def level_names(self) -> Tuple[Any, ...]: + def level_names(self) -> tuple[Any, ...]: if self._level_names is None or len(self._level_names) == 0: return tuple((None,) * max(1, self.nlevels)) else: @@ -237,11 +228,11 @@ def nrows(self) -> int: return len(next(iter(self.values()))) @cached_property - def names(self) -> Tuple[Any, ...]: + def names(self) -> tuple[Any, ...]: return tuple(self.keys()) @cached_property - def columns(self) -> Tuple[ColumnBase, ...]: + def columns(self) -> tuple[ColumnBase, ...]: return tuple(self.values()) @cached_property @@ -610,7 +601,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any: return key + (pad_value,) * (self.nlevels - len(key)) def rename_levels( - self, mapper: Union[Mapping[Any, Any], Callable], level: Optional[int] + self, mapper: Mapping[Any, Any] | Callable, level: int | None ) -> ColumnAccessor: """ Rename the specified levels of the given ColumnAccessor diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e1b6cc45dd3..065b13561ab 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -14,19 +14,7 @@ import warnings from collections import abc, defaultdict from collections.abc import Iterator -from typing import ( - Any, - Callable, - Dict, - List, - Literal, - MutableMapping, - Optional, - Set, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping, cast import cupy import numba @@ -41,7 +29,6 @@ import cudf import cudf.core.common from cudf import _lib as libcudf -from cudf._typing import ColumnLike, Dtype, 
NotImplementedType from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -99,6 +86,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +if TYPE_CHECKING: + from cudf._typing import ColumnLike, Dtype, NotImplementedType + _cupy_nan_methods_map = { "min": "nanmin", "max": "nanmax", @@ -681,7 +671,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): """ _PROTECTED_KEYS = frozenset(("_data", "_index")) - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _loc_indexer_type = _DataFrameLocIndexer _iloc_indexer_type = _DataFrameIlocIndexer _groupby = DataFrameGroupBy @@ -1120,7 +1110,7 @@ def _init_from_dict_like( def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, columns: Any = None, ) -> DataFrame: out = super()._from_data(data=data, index=index) @@ -1345,7 +1335,16 @@ def __getitem__(self, arg): 8 8 8 8 """ if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple): - return self._get_columns_by_label(arg, downcast=True) + out = self._get_columns_by_label(arg) + if is_scalar(arg): + nlevels = 1 + elif isinstance(arg, tuple): + nlevels = len(arg) + if self._data.multiindex is False or nlevels == self._data.nlevels: + out = self._constructor_sliced._from_data(out._data) + out.index = self.index + out.name = arg + return out elif isinstance(arg, slice): return self._slice(arg) @@ -1541,7 +1540,7 @@ def _get_numeric_data(self): return self[columns] @_cudf_nvtx_annotate - def assign(self, **kwargs: Union[Callable[[Self], Any], Any]): + def assign(self, **kwargs: Callable[[Self], Any] | Any): """ Assign columns to DataFrame from keyword arguments. 
@@ -1990,31 +1989,6 @@ def _repr_html_(self): def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() - @_cudf_nvtx_annotate - def _get_columns_by_label( - self, labels, *, downcast=False - ) -> Self | Series: - """ - Return columns of dataframe by `labels` - - If downcast is True, try and downcast from a DataFrame to a Series - """ - ca = self._data.select_by_label(labels) - if downcast: - if is_scalar(labels): - nlevels = 1 - elif isinstance(labels, tuple): - nlevels = len(labels) - if self._data.multiindex is False or nlevels == self._data.nlevels: - out = self._constructor_sliced._from_data( - ca, index=self.index, name=labels - ) - return out - out = self.__class__._from_data( - ca, index=self.index, columns=ca.to_pandas_index() - ) - return out - def _make_operands_and_index_for_binop( self, other: Any, @@ -2022,12 +1996,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + BaseIndex | None, bool, ]: lhs, rhs = self._data, other @@ -2132,8 +2104,8 @@ def from_dict( cls, data: dict, orient: str = "columns", - dtype: Optional[Dtype] = None, - columns: Optional[list] = None, + dtype: Dtype | None = None, + columns: list | None = None, ) -> DataFrame: """ Construct DataFrame from dict of array-like or dicts. 
@@ -4597,7 +4569,7 @@ def apply( def applymap( self, func: Callable[[Any], Any], - na_action: Union[str, None] = None, + na_action: str | None = None, **kwargs, ) -> DataFrame: """ @@ -4630,7 +4602,7 @@ def applymap( def map( self, func: Callable[[Any], Any], - na_action: Union[str, None] = None, + na_action: str | None = None, **kwargs, ) -> DataFrame: """ @@ -7475,7 +7447,7 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) - def nunique(self, axis=0, dropna=True): + def nunique(self, axis=0, dropna: bool = True) -> Series: """ Count number of distinct elements in specified axis. Return Series with number of distinct elements. Can ignore NaN values. @@ -7503,13 +7475,15 @@ def nunique(self, axis=0, dropna=True): """ if axis != 0: raise NotImplementedError("axis parameter is not supported yet.") - - return cudf.Series(super().nunique(dropna=dropna)) + counts = [col.distinct_count(dropna=dropna) for col in self._columns] + return self._constructor_sliced( + counts, index=self._data.to_pandas_index() + ) def _sample_axis_1( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, random_state: np.random.RandomState, ignore_index: bool, @@ -7534,11 +7508,11 @@ def _sample_axis_1( def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, + index_names: list[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> DataFrame: result = super()._from_columns_like_self( columns, @@ -8072,11 +8046,11 @@ def from_pandas(obj, nan_as_null=no_default): return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.CategoricalDtype): return cudf.CategoricalDtype.from_pandas(obj) + elif isinstance(obj, pd.IntervalDtype): + 
return cudf.IntervalDtype.from_pandas(obj) else: raise TypeError( - "from_pandas only accepts Pandas Dataframes, Series, " - "Index, RangeIndex and MultiIndex objects. " - "Got %s" % type(obj) + f"from_pandas unsupported for object of type {type(obj).__name__}" ) @@ -8139,7 +8113,7 @@ def _setitem_with_dataframe( input_df: DataFrame, replace_df: DataFrame, input_cols: Any = None, - mask: Optional[ColumnBase] = None, + mask: ColumnBase | None = None, ignore_index: bool = False, ): """ diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index 62ded8ac6f1..9cd573aceb9 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -1,17 +1,9 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. +from __future__ import annotations import enum from collections import abc -from typing import ( - Any, - Dict, - Iterable, - Mapping, - Optional, - Sequence, - Tuple, - cast, -) +from typing import Any, Iterable, Mapping, Sequence, Tuple, cast import cupy as cp import numpy as np @@ -109,7 +101,7 @@ def __dlpack__(self): except ValueError: raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") - def __dlpack_device__(self) -> Tuple[_Device, int]: + def __dlpack_device__(self) -> tuple[_Device, int]: """ _Device type and _Device ID for where the data in the buffer resides. 
""" @@ -265,7 +257,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype: return (kind, bitwidth, format_str, endianness) @property - def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: + def describe_categorical(self) -> tuple[bool, bool, dict[int, Any]]: """ If the dtype is categorical, there are two options: @@ -298,7 +290,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]: return ordered, is_dictionary, mapping @property - def describe_null(self) -> Tuple[int, Any]: + def describe_null(self) -> tuple[int, Any]: """ Return the missing value (or "null") representation the column dtype uses, as a tuple ``(kind, value)``. @@ -338,7 +330,7 @@ def null_count(self) -> int: return self._col.null_count @property - def metadata(self) -> Dict[str, Any]: + def metadata(self) -> dict[str, Any]: """ Store specific metadata of the column. """ @@ -351,7 +343,7 @@ def num_chunks(self) -> int: return 1 def get_chunks( - self, n_chunks: Optional[int] = None + self, n_chunks: int | None = None ) -> Iterable["_CuDFColumn"]: """ Return an iterable yielding the chunks. @@ -362,7 +354,7 @@ def get_chunks( def get_buffers( self, - ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]: + ) -> Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None]: """ Return a dictionary containing the underlying buffers. @@ -400,7 +392,7 @@ def get_buffers( def _get_validity_buffer( self, - ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + ) -> tuple[_CuDFBuffer, ProtoDtype] | None: """ Return the buffer containing the mask values indicating missing data and the buffer's associated dtype. 
@@ -433,7 +425,7 @@ def _get_validity_buffer( def _get_offsets_buffer( self, - ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]: + ) -> tuple[_CuDFBuffer, ProtoDtype] | None: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) @@ -461,7 +453,7 @@ def _get_offsets_buffer( def _get_data_buffer( self, - ) -> Tuple[_CuDFBuffer, ProtoDtype]: + ) -> tuple[_CuDFBuffer, ProtoDtype]: """ Return the buffer containing the data and the buffer's associated dtype. @@ -588,7 +580,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame": ) def get_chunks( - self, n_chunks: Optional[int] = None + self, n_chunks: int | None = None ) -> Iterable["_CuDFDataFrame"]: """ Return an iterator yielding the chunks. @@ -745,9 +737,9 @@ def from_dataframe( def _protocol_to_cudf_column_numeric( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert an int, uint, float or bool protocol column @@ -822,9 +814,9 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype: def _protocol_to_cudf_column_categorical( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert a categorical column to a Series instance @@ -857,9 +849,9 @@ def _protocol_to_cudf_column_categorical( def _protocol_to_cudf_column_string( col, allow_copy: bool -) -> Tuple[ +) -> tuple[ cudf.core.column.ColumnBase, - Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]], + Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None], ]: """ Convert a string ColumnObject to cudf Column object. 
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 4729233ee6e..034849d0e71 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import decimal import operator @@ -6,7 +7,7 @@ import textwrap import warnings from functools import cached_property -from typing import Any, Callable, Dict, List, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Callable import numpy as np import pandas as pd @@ -16,12 +17,14 @@ from pandas.core.arrays.arrow.extension_types import ArrowIntervalType import cudf -from cudf._typing import Dtype from cudf.core._compat import PANDAS_LT_300 from cudf.core.abc import Serializable -from cudf.core.buffer import Buffer from cudf.utils.docutils import doc_apply +if TYPE_CHECKING: + from cudf._typing import Dtype + from cudf.core.buffer import Buffer + def dtype(arbitrary): """ @@ -82,11 +85,11 @@ def dtype(arbitrary): def _decode_type( - cls: Type, + cls: type, header: dict, frames: list, - is_valid_class: Callable[[Type, Type], bool] = operator.is_, -) -> Tuple[dict, list, Type]: + is_valid_class: Callable[[type, type], bool] = operator.is_, +) -> tuple[dict, list, type]: """Decode metadata-encoded type and check validity Parameters @@ -479,8 +482,8 @@ def __repr__(self): def __hash__(self): return hash(self._typ) - def serialize(self) -> Tuple[dict, list]: - header: Dict[str, Dtype] = {} + def serialize(self) -> tuple[dict, list]: + header: dict[str, Dtype] = {} header["type-serialized"] = pickle.dumps(type(self)) frames = [] @@ -625,13 +628,13 @@ def __repr__(self): def __hash__(self): return hash(self._typ) - def serialize(self) -> Tuple[dict, list]: - header: Dict[str, Any] = {} + def serialize(self) -> tuple[dict, list]: + header: dict[str, Any] = {} header["type-serialized"] = pickle.dumps(type(self)) - frames: List[Buffer] = [] + frames: list[Buffer] = [] - fields: 
Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {} + fields: dict[str, bytes | tuple[Any, tuple[int, int]]] = {} for k, dtype in self.fields.items(): if isinstance(dtype, _BaseDtype): @@ -821,7 +824,7 @@ def _from_decimal(cls, decimal): precision = max(len(metadata.digits), -metadata.exponent) return cls(precision, -metadata.exponent) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: return ( { "type-serialized": pickle.dumps(type(self)), @@ -944,7 +947,7 @@ def __eq__(self, other): def __hash__(self): return hash((self.subtype, self.closed)) - def serialize(self) -> Tuple[dict, list]: + def serialize(self) -> tuple[dict, list]: header = { "type-serialized": pickle.dumps(type(self)), "fields": pickle.dumps((self.subtype, self.closed)), diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index af8886a44a6..c58a0161ee0 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,20 +6,9 @@ import itertools import operator import pickle -import types import warnings from collections import abc -from typing import ( - Any, - Callable, - Dict, - List, - Literal, - MutableMapping, - Optional, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping # TODO: The `numpy` import is needed for typing purposes during doc builds # only, need to figure out why the `np` alias is insufficient then remove. 
@@ -31,8 +20,7 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import Dtype -from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar +from cudf.api.types import is_dtype_equal, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column import ( ColumnBase, @@ -48,6 +36,11 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf +if TYPE_CHECKING: + from types import ModuleType + + from cudf._typing import Dtype + # TODO: It looks like Frame is missing a declaration of `copy`, need to add class Frame(BinaryOperand, Scannable): @@ -79,11 +72,11 @@ def _num_rows(self) -> int: return self._data.nrows @property - def _column_names(self) -> Tuple[Any, ...]: + def _column_names(self) -> tuple[Any, ...]: return self._data.names @property - def _columns(self) -> Tuple[ColumnBase, ...]: + def _columns(self) -> tuple[ColumnBase, ...]: return self._data.columns @property @@ -132,21 +125,28 @@ def deserialize(cls, header, frames): @classmethod @_cudf_nvtx_annotate def _from_data(cls, data: MutableMapping) -> Self: + """ + Construct cls from a ColumnAccessor-like mapping. + """ obj = cls.__new__(cls) Frame.__init__(obj, data) return obj @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping) -> Self: + """ + Return type(self) from a ColumnAccessor-like mapping but + with the external properties, e.g. .index, .name, of self. + """ return self._from_data(data) @_cudf_nvtx_annotate def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ): """Construct a Frame from a list of columns with metadata from self. 
@@ -161,7 +161,7 @@ def _from_columns_like_self( @_cudf_nvtx_annotate def _mimic_inplace( self, result: Self, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: if inplace: for col in self._data: if col in result._data: @@ -351,12 +351,13 @@ def equals(self, other) -> bool: ) @_cudf_nvtx_annotate - def _get_columns_by_label(self, labels, *, downcast=False) -> Self: + def _get_columns_by_label(self, labels) -> Self: """ - Returns columns of the Frame specified by `labels` + Returns columns of the Frame specified by `labels`. + Akin to cudf.DataFrame(...).loc[:, labels] """ - return self.__class__._from_data(self._data.select_by_label(labels)) + return self._from_data_like_self(self._data.select_by_label(labels)) @property @_cudf_nvtx_annotate @@ -410,17 +411,17 @@ def __arrow_array__(self, type=None): def _to_array( self, get_array: Callable, - module: types.ModuleType, + module: ModuleType, copy: bool, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, na_value=None, - ) -> Union[cupy.ndarray, numpy.ndarray]: + ) -> cupy.ndarray | numpy.ndarray: # Internal function to implement to_cupy and to_numpy, which are nearly # identical except for the attribute they access to generate values. 
def to_array( col: ColumnBase, dtype: np.dtype - ) -> Union[cupy.ndarray, numpy.ndarray]: + ) -> cupy.ndarray | numpy.ndarray: if na_value is not None: col = col.fillna(na_value) array = get_array(col) @@ -473,7 +474,7 @@ def to_array( @_cudf_nvtx_annotate def to_cupy( self, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, copy: bool = False, na_value=None, ) -> cupy.ndarray: @@ -507,7 +508,7 @@ def to_cupy( @_cudf_nvtx_annotate def to_numpy( self, - dtype: Union[Dtype, None] = None, + dtype: Dtype | None = None, copy: bool = True, na_value=None, ) -> numpy.ndarray: @@ -540,7 +541,7 @@ def to_numpy( ) @_cudf_nvtx_annotate - def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + def where(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is False. @@ -616,11 +617,11 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]: def fillna( self, value=None, - method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None, + method: Literal["ffill", "bfill", "pad", "backfill"] | None = None, axis=None, inplace: bool = False, limit=None, - ) -> Optional[Self]: + ) -> Self | None: """Fill null values with ``value`` or specified ``method``. Parameters @@ -1035,7 +1036,7 @@ def _copy_type_metadata( self, other: Self, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -1434,14 +1435,10 @@ def _get_sorted_inds( Get the indices required to sort self according to the columns specified in by. 
""" - - to_sort = [ - *( - self - if by is None - else self._get_columns_by_label(list(by), downcast=False) - )._columns - ] + if by is None: + to_sort = self._columns + else: + to_sort = self._get_columns_by_label(list(by))._columns if is_scalar(ascending): ascending_lst = [ascending] * len(to_sort) @@ -1449,57 +1446,12 @@ def _get_sorted_inds( ascending_lst = list(ascending) return libcudf.sort.order_by( - to_sort, + list(to_sort), ascending_lst, na_position, stable=True, ) - @_cudf_nvtx_annotate - def _is_sorted(self, ascending=None, null_position=None): - """ - Returns a boolean indicating whether the data of the Frame are sorted - based on the parameters given. Does not account for the index. - - Parameters - ---------- - self : Frame - Frame whose columns are to be checked for sort order - ascending : None or list-like of booleans - None or list-like of boolean values indicating expected sort order - of each column. If list-like, size of list-like must be - len(columns). If None, all columns expected sort order is set to - ascending. False (0) - ascending, True (1) - descending. - null_position : None or list-like of booleans - None or list-like of boolean values indicating desired order of - nulls compared to other elements. If list-like, size of list-like - must be len(columns). If None, null order is set to before. False - (0) - before, True (1) - after. - - Returns - ------- - returns : boolean - Returns True, if sorted as expected by ``ascending`` and - ``null_position``, False otherwise. 
- """ - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): - raise TypeError( - f"Expected a list-like or None for `ascending`, got " - f"{type(ascending)}" - ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): - raise TypeError( - f"Expected a list-like or None for `null_position`, got " - f"{type(null_position)}" - ) - return libcudf.sort.is_sorted( - [*self._columns], ascending=ascending, null_position=null_position - ) - @_cudf_nvtx_annotate def _split(self, splits): """Split a frame with split points in ``splits``. Returns a list of @@ -1532,7 +1484,7 @@ def _unaryop(self, op): @_cudf_nvtx_annotate def _colwise_binop( cls, - operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], + operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]], fn: str, ): """Implement binary ops between two frame-like objects. @@ -1920,7 +1872,7 @@ def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data_like_self( self._data._from_columns_like_self( - (_apply_inverse_column(col) for col in self._data.columns) + (~col for col in self._data.columns) ) ) @@ -1940,16 +1892,15 @@ def nunique(self, dropna: bool = True): dict Name and unique value counts of each column in frame. """ - return { - name: col.distinct_count(dropna=dropna) - for name, col in self._data.items() - } + raise NotImplementedError( + f"{type(self).__name__} does not implement nunique" + ) @staticmethod @_cudf_nvtx_annotate def _repeat( - columns: List[ColumnBase], repeats, axis=None - ) -> List[ColumnBase]: + columns: list[ColumnBase], repeats, axis=None + ) -> list[ColumnBase]: if axis is not None: raise NotImplementedError( "Only axis=`None` supported at this time." 
@@ -1970,15 +1921,3 @@ def __dask_tokenize__(self): str(dict(self._dtypes)), normalize_token(self.to_pandas()), ] - - -def _apply_inverse_column(col: ColumnBase) -> ColumnBase: - """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - if np.issubdtype(col.dtype, np.integer): - return col.unary_operator("invert") - elif is_bool_dtype(col.dtype): - return col.unary_operator("not") - else: - raise TypeError( - f"Operation `~` not supported on {col.dtype.type.__name__}" - ) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index aa96051ea51..d08268eea3a 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import copy import itertools @@ -7,7 +8,7 @@ import warnings from collections import abc from functools import cached_property -from typing import Any, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Iterable import cupy as cp import numpy as np @@ -20,7 +21,6 @@ from cudf._lib.reshape import interleave_columns from cudf._lib.sort import segmented_sort_by_key from cudf._lib.types import size_type_dtype -from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType from cudf.api.extensions import no_default from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype from cudf.core._compat import PANDAS_LT_300 @@ -34,6 +34,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import GetAttrGetItemMixin +if TYPE_CHECKING: + from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType + def _deprecate_collect(): warnings.warn( @@ -1033,11 +1036,11 @@ def ngroup(self, ascending=True): def sample( self, - n: Optional[int] = None, - frac: Optional[float] = None, + n: int | None = None, + frac: float | None = None, replace: bool = False, - weights: Union[abc.Sequence, 
"cudf.Series", None] = None, - random_state: Union[np.random.RandomState, int, None] = None, + weights: abc.Sequence | "cudf.Series" | None = None, + random_state: np.random.RandomState | int | None = None, ): """Return a random sample of items in each group. @@ -1222,7 +1225,7 @@ def _grouped(self, *, include_groups: bool = True): def _normalize_aggs( self, aggs: MultiColumnAggType - ) -> Tuple[Iterable[Any], Tuple[ColumnBase, ...], List[List[AggType]]]: + ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]: """ Normalize aggs to a list of list of aggregations, where `out[i]` is a list of aggregations for column `self.obj[i]`. We support three @@ -1237,7 +1240,7 @@ def _normalize_aggs( Each agg can be string or lambda functions. """ - aggs_per_column: Iterable[Union[AggType, Iterable[AggType]]] + aggs_per_column: Iterable[AggType | Iterable[AggType]] if isinstance(aggs, dict): column_names, aggs_per_column = aggs.keys(), aggs.values() columns = tuple(self.obj._data[col] for col in column_names) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 732e5cdb01a..13fa187842d 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -5,19 +5,9 @@ import operator import pickle import warnings -from collections.abc import Generator from functools import cache, cached_property from numbers import Number -from typing import ( - Any, - List, - Literal, - MutableMapping, - Optional, - Tuple, - Union, - cast, -) +from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast import cupy import numpy as np @@ -71,6 +61,9 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf, search_range +if TYPE_CHECKING: + from collections.abc import Generator + class IndexMeta(type): """Custom metaclass for Index that overrides instance/subclass tests.""" @@ -98,10 +91,10 @@ def __subclasscheck__(self, subclass): def _lexsorted_equal_range( - idx: 
Union[Index, cudf.MultiIndex], + idx: Index | cudf.MultiIndex, key_as_table: Frame, is_sorted: bool, -) -> Tuple[int, int, Optional[ColumnBase]]: +) -> tuple[int, int, ColumnBase | None]: """Get equal range for key in lexicographically sorted index. If index is not sorted when called, a sort will take place and `sort_inds` is returned. Otherwise `None` is returned in that position. @@ -895,7 +888,7 @@ def __array__(self, dtype=None): ) @_cudf_nvtx_annotate - def nunique(self) -> int: + def nunique(self, dropna: bool = True) -> int: return len(self) @_cudf_nvtx_annotate @@ -2855,7 +2848,7 @@ class IntervalIndex(Index): def __init__( self, data, - closed: Optional[Literal["left", "right", "neither", "both"]] = None, + closed: Literal["left", "right", "neither", "both"] | None = None, dtype=None, copy: bool = False, name=None, @@ -2914,9 +2907,7 @@ def closed(self): def from_breaks( cls, breaks, - closed: Optional[ - Literal["left", "right", "neither", "both"] - ] = "right", + closed: Literal["left", "right", "neither", "both"] | None = "right", name=None, copy: bool = False, dtype=None, @@ -3103,7 +3094,7 @@ def _getdefault_name(values, name): @_cudf_nvtx_annotate -def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: +def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. 
""" @@ -3144,7 +3135,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: @_cudf_nvtx_annotate -def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: +def _extended_gcd(a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: a*x + b*y = gcd(x, y) @@ -3194,7 +3185,7 @@ def _get_nearest_indexer( index: Index, positions: cudf.Series, target_col: cudf.core.column.ColumnBase, - tolerance: Union[int, float], + tolerance: int | float, ): """ Get the indexer for the nearest index labels; requires an index with diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index fdc78005996..06da62306e8 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -9,17 +9,12 @@ import warnings from collections import Counter, abc from typing import ( + TYPE_CHECKING, Any, Callable, - Dict, - List, Literal, MutableMapping, - Optional, - Tuple, - Type, TypeVar, - Union, cast, ) from uuid import uuid4 @@ -31,12 +26,6 @@ import cudf import cudf._lib as libcudf -from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - Dtype, - NotImplementedType, -) from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -70,6 +59,14 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import _warn_no_dask_cudf +if TYPE_CHECKING: + from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + Dtype, + NotImplementedType, + ) + doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. 
@@ -255,8 +252,8 @@ class IndexedFrame(Frame): """ # mypy can't handle bound type variables as class members - _loc_indexer_type: Type[_LocIndexerClass] # type: ignore - _iloc_indexer_type: Type[_IlocIndexerClass] # type: ignore + _loc_indexer_type: type[_LocIndexerClass] # type: ignore + _iloc_indexer_type: type[_IlocIndexerClass] # type: ignore _index: cudf.core.index.BaseIndex _groupby = GroupBy _resampler = _Resampler @@ -291,14 +288,14 @@ def _num_rows(self) -> int: return len(self.index) @property - def _index_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? + def _index_names(self) -> tuple[Any, ...]: # TODO: Tuple[str]? return self.index._data.names @classmethod def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, ): out = super()._from_data(data) out._index = RangeIndex(out._data.nrows) if index is None else index @@ -306,18 +303,18 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - out = self._from_data(data, self.index) - out._data._level_names = self._data._level_names + out = super()._from_data_like_self(data) + out.index = self.index return out @_cudf_nvtx_annotate def _from_columns_like_self( self, - columns: List[ColumnBase], - column_names: Optional[abc.Iterable[str]] = None, - index_names: Optional[List[str]] = None, + columns: list[ColumnBase], + column_names: abc.Iterable[str] | None = None, + index_names: list[str] | None = None, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """Construct a `Frame` from a list of columns with metadata from self. 
@@ -365,7 +362,7 @@ def __round__(self, digits=0): def _mimic_inplace( self, result: Self, inplace: bool = False - ) -> Optional[Self]: + ) -> Self | None: if inplace: self._index = result.index return super()._mimic_inplace(result, inplace) @@ -1785,7 +1782,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs): ) @_cudf_nvtx_annotate - def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]: + def mask(self, cond, other=None, inplace: bool = False) -> Self | None: """ Replace values where the condition is True. @@ -1921,7 +1918,7 @@ def _copy_type_metadata( other: Self, include_index: bool = True, *, - override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None, + override_dtypes: abc.Iterable[Dtype | None] | None = None, ) -> Self: """ Copy type metadata from each column of `other` to the corresponding @@ -4667,9 +4664,9 @@ def sample( def _sample_axis_0( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, - random_state: Union[np.random.RandomState, cp.random.RandomState], + random_state: np.random.RandomState | cp.random.RandomState, ignore_index: bool, ): try: @@ -4692,7 +4689,7 @@ def _sample_axis_0( def _sample_axis_1( self, n: int, - weights: Optional[ColumnLike], + weights: ColumnLike | None, replace: bool, random_state: np.random.RandomState, ignore_index: bool, @@ -4739,12 +4736,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[cudf.BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + cudf.BaseIndex | None, bool, ]: raise NotImplementedError( @@ -6325,8 +6320,8 @@ def _check_duplicate_level_names(specified, level_names): @_cudf_nvtx_annotate def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: 
Dict[Any, Any] -) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: + to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] +) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: """ Returns a per column mapping for the values to be replaced, new values to be replaced with and if all the values are empty. @@ -6351,9 +6346,9 @@ def _get_replacement_values_for_columns( A dict mapping of all columns and the corresponding values to be replaced with. """ - to_replace_columns: Dict[Any, Any] = {} - values_columns: Dict[Any, Any] = {} - all_na_columns: Dict[Any, Any] = {} + to_replace_columns: dict[Any, Any] = {} + values_columns: dict[Any, Any] = {} + all_na_columns: dict[Any, Any] = {} if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} @@ -6493,8 +6488,8 @@ def _is_series(obj): @_cudf_nvtx_annotate def _drop_rows_by_labels( obj: DataFrameOrSeries, - labels: Union[ColumnLike, abc.Iterable, str], - level: Union[int, str], + labels: ColumnLike | abc.Iterable | str, + level: int | str, errors: str, ) -> DataFrameOrSeries: """Remove rows specified by `labels`. diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py index 7242de9964f..73a1cd26367 100644 --- a/python/cudf/cudf/core/indexing_utils.py +++ b/python/cudf/cudf/core/indexing_utils.py @@ -1,9 +1,9 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. from __future__ import annotations from dataclasses import dataclass -from typing import Any, List, Tuple, Union +from typing import Any, List, Union from typing_extensions import TypeAlias @@ -59,7 +59,7 @@ class ScalarIndexer: def destructure_iloc_key( - key: Any, frame: Union[cudf.Series, cudf.DataFrame] + key: Any, frame: cudf.Series | cudf.DataFrame ) -> tuple[Any, ...]: """ Destructure a potentially tuple-typed key into row and column indexers. 
@@ -124,7 +124,7 @@ def destructure_iloc_key( def destructure_dataframe_iloc_indexer( key: Any, frame: cudf.DataFrame -) -> Tuple[Any, Tuple[bool, ColumnLabels]]: +) -> tuple[Any, tuple[bool, ColumnLabels]]: """Destructure an index key for DataFrame iloc getitem. Parameters diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 05cbb4429b9..dd0a4f666a1 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -4,7 +4,7 @@ import warnings from collections import abc -from typing import TYPE_CHECKING, Any, Tuple, cast +from typing import TYPE_CHECKING, Any, cast import numpy as np @@ -51,7 +51,7 @@ def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): def _match_join_keys( lcol: ColumnBase, rcol: ColumnBase, how: str -) -> Tuple[ColumnBase, ColumnBase]: +) -> tuple[ColumnBase, ColumnBase]: # Casts lcol and rcol to a common dtype for use as join keys. If no casting # is necessary, they are returned as is. 
@@ -133,7 +133,7 @@ def _match_join_keys( def _match_categorical_dtypes_both( lcol: CategoricalColumn, rcol: CategoricalColumn, how: str -) -> Tuple[ColumnBase, ColumnBase]: +) -> tuple[ColumnBase, ColumnBase]: ltype, rtype = lcol.dtype, rcol.dtype # when both are ordered and both have the same categories, diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index da999441ca3..ce81c1fc5b1 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -2,7 +2,7 @@ from __future__ import annotations import itertools -from typing import Any, ClassVar, List, Optional +from typing import Any, ClassVar import cudf from cudf import _lib as libcudf @@ -370,7 +370,7 @@ def _merge_results( else: multiindex_columns = False - index: Optional[cudf.BaseIndex] + index: cudf.BaseIndex | None if self._using_right_index: # right_index and left_on index = left_result.index @@ -398,7 +398,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # This is taken care of by using a stable sort here, and (in # pandas-compat mode) reordering the gather maps before # producing the input result. - by: List[Any] = [] + by: list[Any] = [] if self._using_left_index and self._using_right_index: by.extend(result.index._data.columns) if not self._using_left_index: diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi index 8587b2dea48..6be73e25332 100644 --- a/python/cudf/cudf/core/mixins/binops.pyi +++ b/python/cudf/cudf/core/mixins/binops.pyi @@ -1,12 +1,12 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Any, Set, Tuple, TypeVar +from typing import Any, TypeVar # Note: It may be possible to define a narrower bound here eventually. BinaryOperandType = TypeVar("BinaryOperandType", bound="Any") class BinaryOperand: - _SUPPORTED_BINARY_OPERATIONS: Set + _SUPPORTED_BINARY_OPERATIONS: set def _binaryop(self, other: BinaryOperandType, op: str): ... 
def __add__(self, other): ... @@ -36,4 +36,4 @@ class BinaryOperand: def __gt__(self, other): ... def __ge__(self, other): ... @staticmethod - def _check_reflected_op(op) -> Tuple[bool, str]: ... + def _check_reflected_op(op) -> tuple[bool, str]: ... diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi index dbaafdb5cd2..1c2126002ad 100644 --- a/python/cudf/cudf/core/mixins/reductions.pyi +++ b/python/cudf/cudf/core/mixins/reductions.pyi @@ -1,9 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Set - class Reducible: - _SUPPORTED_REDUCTIONS: Set + _SUPPORTED_REDUCTIONS: set def sum(self): ... def product(self): ... diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi index 37995241b1f..5190750c698 100644 --- a/python/cudf/cudf/core/mixins/scans.pyi +++ b/python/cudf/cudf/core/mixins/scans.pyi @@ -1,9 +1,7 @@ # Copyright (c) 2022, NVIDIA CORPORATION. -from typing import Set - class Scannable: - _SUPPORTED_SCANS: Set + _SUPPORTED_SCANS: set def cumsum(self): ... def cumprod(self): ... 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 11b4b9154a2..832cc003d2e 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -8,10 +8,9 @@ import pickle import warnings from collections import abc -from collections.abc import Generator from functools import cached_property from numbers import Integral -from typing import Any, List, MutableMapping, Tuple, Union +from typing import TYPE_CHECKING, Any, MutableMapping import cupy as cp import numpy as np @@ -20,7 +19,6 @@ import cudf import cudf._lib as libcudf from cudf._lib.types import size_type_dtype -from cudf._typing import DataFrameOrSeries from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype from cudf.core import column @@ -36,8 +34,13 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name +if TYPE_CHECKING: + from collections.abc import Generator -def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: + from cudf._typing import DataFrameOrSeries + + +def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: """Makes best effort to convert an array of indices into a python slice. If the conversion is not possible, return input. `indices` are expected to be valid. @@ -846,9 +849,10 @@ def _index_and_downcast(self, result, index, index_key): def _get_row_major( self, df: DataFrameOrSeries, - row_tuple: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + row_tuple: numbers.Number + | slice + | tuple[Any, ...] 
+ | list[tuple[Any, ...]], ) -> DataFrameOrSeries: if pd.api.types.is_bool_dtype( list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple @@ -871,9 +875,10 @@ def _get_row_major( @_cudf_nvtx_annotate def _validate_indexer( self, - indexer: Union[ - numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]] - ], + indexer: numbers.Number + | slice + | tuple[Any, ...] + | list[tuple[Any, ...]], ): if isinstance(indexer, numbers.Number): return @@ -1636,9 +1641,54 @@ def is_unique(self): def dtype(self): return np.dtype("O") + @_cudf_nvtx_annotate + def _is_sorted(self, ascending=None, null_position=None) -> bool: + """ + Returns a boolean indicating whether the data of the MultiIndex are sorted + based on the parameters given. Does not account for the index. + + Parameters + ---------- + self : MultiIndex + MultiIndex whose columns are to be checked for sort order + ascending : None or list-like of booleans + None or list-like of boolean values indicating expected sort order + of each column. If list-like, size of list-like must be + len(columns). If None, all columns expected sort order is set to + ascending. False (0) - ascending, True (1) - descending. + null_position : None or list-like of booleans + None or list-like of boolean values indicating desired order of + nulls compared to other elements. If list-like, size of list-like + must be len(columns). If None, null order is set to before. False + (0) - before, True (1) - after. + + Returns + ------- + returns : boolean + Returns True, if sorted as expected by ``ascending`` and + ``null_position``, False otherwise. 
+ """ + if ascending is not None and not cudf.api.types.is_list_like( + ascending + ): + raise TypeError( + f"Expected a list-like or None for `ascending`, got " + f"{type(ascending)}" + ) + if null_position is not None and not cudf.api.types.is_list_like( + null_position + ): + raise TypeError( + f"Expected a list-like or None for `null_position`, got " + f"{type(null_position)}" + ) + return libcudf.sort.is_sorted( + [*self._columns], ascending=ascending, null_position=null_position + ) + @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_increasing(self): + def is_monotonic_increasing(self) -> bool: """ Return if the index is monotonic increasing (only equal or increasing) values. @@ -1647,7 +1697,7 @@ def is_monotonic_increasing(self): @cached_property # type: ignore @_cudf_nvtx_annotate - def is_monotonic_decreasing(self): + def is_monotonic_decreasing(self) -> bool: """ Return if the index is monotonic decreasing (only equal or decreasing) values. @@ -1701,6 +1751,11 @@ def fillna(self, value): def unique(self): return self.drop_duplicates(keep="first") + @_cudf_nvtx_annotate + def nunique(self, dropna: bool = True) -> int: + mi = self.dropna(how="all") if dropna else self + return len(mi.unique()) + def _clean_nulls_from_index(self): """ Convert all na values(if any) in MultiIndex object diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 53239cb7ea0..903c4fe7df5 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,8 +1,9 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import itertools import warnings -from typing import Dict, Optional +from typing import TYPE_CHECKING import numpy as np import pandas as pd @@ -10,13 +11,15 @@ import cudf from cudf._lib.transform import one_hot_encode from cudf._lib.types import size_type_dtype -from cudf._typing import Dtype from cudf.api.extensions import no_default from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty_like from cudf.core.column.categorical import CategoricalColumn from cudf.utils.dtypes import min_unsigned_type +if TYPE_CHECKING: + from cudf._typing import Dtype + _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} @@ -1217,10 +1220,10 @@ def _get_unique(column, dummy_na): def _one_hot_encode_column( column: ColumnBase, categories: ColumnBase, - prefix: Optional[str], - prefix_sep: Optional[str], - dtype: Optional[Dtype], -) -> Dict[str, ColumnBase]: + prefix: str | None, + prefix_sep: str | None, + dtype: Dtype | None, +) -> dict[str, ColumnBase]: """Encode a single column with one hot encoding. The return dictionary contains pairs of (category, encodings). The keys may be prefixed with `prefix`, separated with category name with `prefix_sep`. 
The encoding diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a52b583d3b4..e532948fd11 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -9,16 +9,7 @@ import warnings from collections import abc from shutil import get_terminal_size -from typing import ( - Any, - Dict, - Literal, - MutableMapping, - Optional, - Set, - Tuple, - Union, -) +from typing import TYPE_CHECKING, Any, Literal, MutableMapping import cupy import numpy as np @@ -27,12 +18,6 @@ import cudf from cudf import _lib as libcudf -from cudf._typing import ( - ColumnLike, - DataFrameOrSeries, - NotImplementedType, - ScalarLike, -) from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -85,6 +70,14 @@ ) from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate +if TYPE_CHECKING: + from cudf._typing import ( + ColumnLike, + DataFrameOrSeries, + NotImplementedType, + ScalarLike, + ) + def _format_percentile_names(percentiles): return [f"{int(x * 100)}%" for x in percentiles] @@ -282,7 +275,7 @@ class _SeriesLocIndexer(_FrameIndexer): """ @_cudf_nvtx_annotate - def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: + def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -461,7 +454,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable): If ``False``, leaves ``np.nan`` values as is. 
""" - _accessors: Set[Any] = set() + _accessors: set[Any] = set() _loc_indexer_type = _SeriesLocIndexer _iloc_indexer_type = _SeriesIlocIndexer _groupby = SeriesGroupBy @@ -674,7 +667,7 @@ def __init__( def _from_data( cls, data: MutableMapping, - index: Optional[BaseIndex] = None, + index: BaseIndex | None = None, name: Any = no_default, ) -> Series: out = super()._from_data(data=data, index=index) @@ -682,6 +675,12 @@ def _from_data( out.name = name return out + @_cudf_nvtx_annotate + def _from_data_like_self(self, data: MutableMapping): + out = super()._from_data_like_self(data) + out.name = self.name + return out + @_cudf_nvtx_annotate def __contains__(self, item): return item in self.index @@ -856,20 +855,6 @@ def deserialize(cls, header, frames): return obj - def _get_columns_by_label(self, labels, *, downcast=False) -> Self: - """Return the column specified by `labels` - - For cudf.Series, either the column, or an empty series is returned. - Parameter `downcast` does not have effects. - """ - ca = self._data.select_by_label(labels) - - return ( - self.__class__._from_data(data=ca, index=self.index) - if len(ca) > 0 - else self.__class__(dtype=self.dtype, name=self.name) - ) - @_cudf_nvtx_annotate def drop( self, @@ -1316,7 +1301,7 @@ def map(self, arg, na_action=None) -> "Series": def _getitem_preprocessed( self, spec: indexing_utils.IndexingSpec, - ) -> Union[Self, ScalarLike]: + ) -> Self | ScalarLike: """Get subset of entries given structured data Parameters @@ -1478,12 +1463,10 @@ def _make_operands_and_index_for_binop( fill_value: Any = None, reflect: bool = False, can_reindex: bool = False, - ) -> Tuple[ - Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ], - Optional[BaseIndex], + ) -> tuple[ + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType, + BaseIndex | None, bool, ]: # Specialize binops to align indices. 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index acc74129a29..23a2c828a04 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -3,15 +3,11 @@ from __future__ import annotations -from typing import Any, Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any -import cupy -import numpy -import pyarrow as pa from typing_extensions import Self import cudf -from cudf._typing import NotImplementedType, ScalarLike from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, @@ -25,6 +21,13 @@ from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate from cudf.utils.utils import NotIterable +if TYPE_CHECKING: + import cupy + import numpy + import pyarrow as pa + + from cudf._typing import NotImplementedType, ScalarLike + class SingleColumnFrame(Frame, NotIterable): """A one-dimensional frame. @@ -271,10 +274,10 @@ def _make_operands_for_binop( other: Any, fill_value: Any = None, reflect: bool = False, - ) -> Union[ - Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], - NotImplementedType, - ]: + ) -> ( + dict[str | None, tuple[ColumnBase, Any, bool, Any]] + | NotImplementedType + ): """Generate the dictionary of operands used for a binary operation. Parameters @@ -335,11 +338,9 @@ def nunique(self, dropna: bool = True) -> int: int Number of unique values in the column. """ - if self._column.null_count == len(self): - return 0 return self._column.distinct_count(dropna=dropna) - def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]: + def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase: # A generic method for getting elements from a column that supports a # wide range of different inputs. 
This method should only used where # _absolutely_ necessary, since in almost all cases a more specific diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 24c49e3662a..9e59b134b73 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -3,7 +3,6 @@ from __future__ import annotations import warnings -from typing import Union import cupy as cp @@ -60,7 +59,7 @@ def __call__( max_num_rows: int, add_special_tokens: bool = True, padding: str = "max_length", - truncation: Union[bool, str] = False, + truncation: bool | str = False, stride: int = 0, return_tensors: str = "cp", return_token_type_ids: bool = False, diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index f002a838fa9..29130130732 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -1,9 +1,10 @@ # Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import math import re import warnings -from typing import Literal, Optional, Sequence, Union +from typing import Literal, Sequence import cupy as cp import numpy as np @@ -61,7 +62,7 @@ def to_datetime( dayfirst: bool = False, yearfirst: bool = False, utc: bool = False, - format: Optional[str] = None, + format: str | None = None, exact: bool = True, unit: str = "ns", infer_datetime_format: bool = True, @@ -313,7 +314,7 @@ def _process_col( unit: str, dayfirst: bool, infer_datetime_format: bool, - format: Optional[str], + format: str | None, utc: bool, ): if col.dtype.kind == "f": @@ -707,7 +708,7 @@ def _from_freqstr(cls, freqstr: str) -> Self: @classmethod def _from_pandas_ticks_or_weeks( cls, - tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week], + tick: pd.tseries.offsets.Tick | pd.tseries.offsets.Week, ) -> Self: return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n}) @@ -725,7 +726,7 @@ def _maybe_as_fast_pandas_offset(self): def _isin_datetimelike( - lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence + lhs: column.TimeDeltaColumn | column.DatetimeColumn, values: Sequence ) -> column.ColumnBase: """ Check whether values are contained in the @@ -784,7 +785,7 @@ def date_range( name=None, closed: Literal["left", "right", "both", "neither"] = "both", *, - unit: Optional[str] = None, + unit: str | None = None, ): """Return a fixed frequency DatetimeIndex. diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py index 72088493074..dffd7db2f71 100644 --- a/python/cudf/cudf/core/udf/groupby_typing.py +++ b/python/cudf/cudf/core/udf/groupby_typing.py @@ -1,5 +1,7 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -from typing import Any, Dict +# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from __future__ import annotations + +from typing import Any import numba from numba import cuda, types @@ -124,7 +126,7 @@ def __init__(self, dmm, fe_type): super().__init__(dmm, fe_type, members) -call_cuda_functions: Dict[Any, Any] = {} +call_cuda_functions: dict[Any, Any] = {} def _register_cuda_binary_reduction_caller(funcname, lty, rty, retty): diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index bc1f4f2557e..f1704e4ea78 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,8 +1,9 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations import functools import os -from typing import Any, Callable, Dict +from typing import Any, Callable import cachetools import cupy as cp @@ -57,7 +58,7 @@ MASK_BITSIZE = np.dtype("int32").itemsize * 8 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) -launch_arg_getters: Dict[Any, Any] = {} +launch_arg_getters: dict[Any, Any] = {} @functools.cache diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index dbdb2093b72..58b104b84e9 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -10,7 +10,7 @@ from collections import defaultdict from contextlib import ExitStack from functools import partial, reduce -from typing import Callable, Dict, List, Optional, Tuple +from typing import Callable from uuid import uuid4 import numpy as np @@ -679,7 +679,7 @@ def read_parquet( return df -def _normalize_filters(filters: list | None) -> List[List[tuple]] | None: +def _normalize_filters(filters: list | None) -> list[list[tuple]] | None: # Utility to normalize and validate the `filters` # argument to `read_parquet` if not filters: @@ -709,7 +709,7 @@ def _validate_predicate(item): def _apply_post_filters( - df: cudf.DataFrame, filters: List[List[tuple]] | None + df: cudf.DataFrame, filters: list[list[tuple]] | None ) -> cudf.DataFrame: """Apply DNF filters to an 
in-memory DataFrame @@ -738,7 +738,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series: ) return ~column.isna() if negate else column.isna() - handlers: Dict[str, Callable] = { + handlers: dict[str, Callable] = { "==": operator.eq, "!=": operator.ne, "<": operator.lt, @@ -1311,7 +1311,7 @@ def __init__( ) -> None: if isinstance(path, str) and path.startswith("s3://"): self.fs_meta = {"is_s3": True, "actual_path": path} - self.dir_: Optional[tempfile.TemporaryDirectory] = ( + self.dir_: tempfile.TemporaryDirectory | None = ( tempfile.TemporaryDirectory() ) self.path = self.dir_.name @@ -1328,12 +1328,12 @@ def __init__( self.partition_cols = partition_cols # Collection of `ParquetWriter`s, and the corresponding # partition_col values they're responsible for - self._chunked_writers: List[ - Tuple[libparquet.ParquetWriter, List[str], str] + self._chunked_writers: list[ + tuple[libparquet.ParquetWriter, list[str], str] ] = [] # Map of partition_col values to their ParquetWriter's index # in self._chunked_writers for reverse lookup - self.path_cw_map: Dict[str, int] = {} + self.path_cw_map: dict[str, int] = {} self.storage_options = storage_options self.filename = file_name_prefix self.max_file_size = max_file_size @@ -1345,7 +1345,7 @@ def __init__( ) self.max_file_size = _parse_bytes(max_file_size) - self._file_sizes: Dict[str, int] = {} + self._file_sizes: dict[str, int] = {} @_cudf_nvtx_annotate def write_table(self, df): diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py index efa8eabd8b8..fb5a963f008 100644 --- a/python/cudf/cudf/options.py +++ b/python/cudf/cudf/options.py @@ -1,11 +1,14 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import os import textwrap -from collections.abc import Container from contextlib import ContextDecorator from dataclasses import dataclass -from typing import Any, Callable, Dict, Optional +from typing import TYPE_CHECKING, Any, Callable + +if TYPE_CHECKING: + from collections.abc import Container @dataclass @@ -16,7 +19,7 @@ class Option: validator: Callable -_OPTIONS: Dict[str, Option] = {} +_OPTIONS: dict[str, Option] = {} def _env_get_int(name, default): @@ -123,7 +126,7 @@ def _build_option_description(name, opt): ) -def describe_option(name: Optional[str] = None): +def describe_option(name: str | None = None): """Prints the description of an option. If `name` is unspecified, prints the description of all available options. diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 5b3785531d3..ff445a63f74 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -2,6 +2,11 @@ # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import os +import warnings + +import rmm.mr + from .fast_slow_proxy import is_proxy_object from .magics import load_ipython_extension from .profiler import Profiler @@ -20,6 +25,42 @@ def install(): global LOADED LOADED = loader is not None + if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None: + # Check if a non-default memory resource is set + current_mr = rmm.mr.get_current_device_resource() + if not isinstance(current_mr, rmm.mr.CudaMemoryResource): + warnings.warn( + f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}", + UserWarning, + ) + free_memory, _ = rmm.mr.available_device_memory() + free_memory = int(round(float(free_memory) * 0.80 / 256) * 256) + + if rmm_mode == "cuda": + mr = rmm.mr.CudaMemoryResource() + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "pool": + rmm.mr.set_current_device_resource( + rmm.mr.PoolMemoryResource( + rmm.mr.get_current_device_resource(), + initial_pool_size=free_memory, + ) + ) + elif rmm_mode == "async": + mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory) + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "managed": + mr = rmm.mr.ManagedMemoryResource() + rmm.mr.set_current_device_resource(mr) + elif rmm_mode == "managed_pool": + mr = rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=free_memory, + ) + rmm.mr.set_current_device_resource(mr) + else: + raise ValueError(f"Unsupported rmm mode: {rmm_mode}") + def pytest_load_initial_conftests(early_config, parser, args): # We need to install ourselves before conftest.py import (which diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 128913e5746..1540c6850e7 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -12,17 +12,7 @@ import warnings from collections.abc import Iterator from enum 
import IntEnum -from typing import ( - Any, - Callable, - Dict, - Literal, - Mapping, - Optional, - Set, - Tuple, - Type, -) +from typing import Any, Callable, Literal, Mapping import numpy as np @@ -118,12 +108,12 @@ def make_final_proxy_type( *, fast_to_slow: Callable, slow_to_fast: Callable, - module: Optional[str] = None, + module: str | None = None, additional_attributes: Mapping[str, Any] | None = None, postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None, - bases: Tuple = (), - metaclasses: Tuple = (), -) -> Type[_FinalProxy]: + bases: tuple = (), + metaclasses: tuple = (), +) -> type[_FinalProxy]: """ Defines a fast-slow proxy type for a pair of "final" fast and slow types. Final types are types for which known operations exist for @@ -270,8 +260,8 @@ def make_intermediate_proxy_type( fast_type: type, slow_type: type, *, - module: Optional[str] = None, -) -> Type[_IntermediateProxy]: + module: str | None = None, +) -> type[_IntermediateProxy]: """ Defines a proxy type for a pair of "intermediate" fast and slow types. Intermediate types are the types of the results of @@ -613,13 +603,13 @@ class _IntermediateProxy(_FastSlowProxy): `make_intermediate_proxy_type` to create subtypes. """ - _method_chain: Tuple[Callable, Tuple, Dict] + _method_chain: tuple[Callable, tuple, dict] @classmethod def _fsproxy_wrap( cls, obj: Any, - method_chain: Tuple[Callable, Tuple, Dict], + method_chain: tuple[Callable, tuple, dict], ): """ Parameters @@ -955,7 +945,7 @@ def _fast_slow_function_call( def _transform_arg( arg: Any, attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: Set[int], + seen: set[int], ) -> Any: """ Transform "arg" into its corresponding slow (or fast) type. @@ -1052,7 +1042,7 @@ def _fast_arg(arg: Any) -> Any: """ Transform "arg" into its corresponding fast type. 
""" - seen: Set[int] = set() + seen: set[int] = set() return _transform_arg(arg, "_fsproxy_fast", seen) @@ -1060,7 +1050,7 @@ def _slow_arg(arg: Any) -> Any: """ Transform "arg" into its corresponding slow type. """ - seen: Set[int] = set() + seen: set[int] = set() return _transform_arg(arg, "_fsproxy_slow", seen) @@ -1137,7 +1127,7 @@ def _is_function_or_method(obj: Any) -> bool: def _replace_closurevars( f: types.FunctionType, attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"], - seen: Set[int], + seen: set[int], ) -> Callable[..., Any]: """ Return a copy of `f` with its closure variables replaced with @@ -1199,10 +1189,10 @@ def is_proxy_object(obj: Any) -> bool: return False -NUMPY_TYPES: Set[str] = set(np.sctypeDict.values()) +NUMPY_TYPES: set[str] = set(np.sctypeDict.values()) -_SPECIAL_METHODS: Set[str] = { +_SPECIAL_METHODS: set[str] = { "__abs__", "__add__", "__and__", diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 1d431c6d882..f82e300e83d 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -17,7 +17,7 @@ from abc import abstractmethod from importlib._bootstrap import _ImportLockContext as ImportLock from types import ModuleType -from typing import Any, ContextManager, Dict, NamedTuple, Tuple +from typing import Any, ContextManager, NamedTuple from typing_extensions import Self @@ -377,7 +377,7 @@ class ModuleAccelerator(ModuleAcceleratorBase): attempts to call the fast version first). 
""" - _denylist: Tuple[str] + _denylist: tuple[str] _use_fast_lib: bool _use_fast_lib_lock: threading.RLock _module_cache_prefix: str = "_slow_lib_" @@ -519,7 +519,7 @@ def disabled(self): def getattr_real_or_wrapped( name: str, *, - real: Dict[str, Any], + real: dict[str, Any], wrapped_objs, loader: ModuleAccelerator, ) -> Any: diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py index 0dbd333ce4f..0fb41fc0b26 100644 --- a/python/cudf/cudf/pandas/profiler.py +++ b/python/cudf/cudf/pandas/profiler.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import inspect import operator @@ -8,7 +9,6 @@ import sys import time from collections import defaultdict -from typing import Union from rich.console import Console from rich.syntax import Syntax @@ -119,12 +119,10 @@ def __exit__(self, *args, **kwargs): @staticmethod def get_namespaced_function_name( - func_obj: Union[ - _FunctionProxy, - _MethodProxy, - type[_FinalProxy], - type[_IntermediateProxy], - ], + func_obj: _FunctionProxy + | _MethodProxy + | type[_FinalProxy] + | type[_IntermediateProxy], ): if isinstance(func_obj, _MethodProxy): return func_obj._fsproxy_slow.__qualname__ diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 43a8ee6c2bc..98c61be0721 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -1,5 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. 
+from __future__ import annotations + import io import os from typing import Optional, Union @@ -12,7 +14,7 @@ def metadata_from_arrow_array( pa_array: pa.Array, -) -> Optional[plc.interop.ColumnMetadata]: +) -> plc.interop.ColumnMetadata | None: metadata = None if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype): metadata = plc.interop.ColumnMetadata( @@ -27,7 +29,7 @@ def metadata_from_arrow_array( def assert_column_eq( - lhs: Union[pa.Array, plc.Column], rhs: Union[pa.Array, plc.Column] + lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column ) -> None: """Verify that a pylibcudf array and PyArrow array are equal.""" # Nested types require children metadata to be passed to the conversion function. diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index bedcf39a314..de8cc180d32 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -156,3 +156,8 @@ def interp_opt(request): ) def sorted_opt(request): return request.param + + +@pytest.fixture(scope="session", params=[False, True]) +def has_nulls(request): + return request.param diff --git a/python/cudf/cudf/pylibcudf_tests/test_datetime.py b/python/cudf/cudf/pylibcudf_tests/test_datetime.py new file mode 100644 index 00000000000..75af0fa6ca1 --- /dev/null +++ b/python/cudf/cudf/pylibcudf_tests/test_datetime.py @@ -0,0 +1,30 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+ +import datetime + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import cudf._lib.pylibcudf as plc + + +@pytest.fixture +def column(has_nulls): + values = [ + datetime.date(1999, 1, 1), + datetime.date(2024, 10, 12), + datetime.date(1, 1, 1), + datetime.date(9999, 1, 1), + ] + if has_nulls: + values[2] = None + return plc.interop.from_arrow(pa.array(values, type=pa.date32())) + + +def test_extract_year(column): + got = plc.datetime.extract_year(column) + # libcudf produces an int16, arrow produces an int64 + expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16()) + + assert_column_eq(expect, got) diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py index a234860477f..991e6ed310d 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_round.py +++ b/python/cudf/cudf/pylibcudf_tests/test_round.py @@ -7,16 +7,11 @@ import cudf._lib.pylibcudf as plc -@pytest.fixture(params=[False, True]) -def nullable(request): - return request.param - - @pytest.fixture(params=["float32", "float64"]) -def column(request, nullable): +def column(request, has_nulls): values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5] typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param] - if nullable: + if has_nulls: values[2] = None return plc.interop.from_arrow(pa.array(values, type=typ)) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 0e38b10ed52..238e8d990cc 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import annotations + import datetime import io import pathlib -from typing import Optional import fastavro import numpy as np @@ -292,7 +293,7 @@ def test_can_detect_dtypes_from_avro_logical_type( assert_eq(expected, actual) -def get_days_from_epoch(date: Optional[datetime.date]) -> Optional[int]: +def get_days_from_epoch(date: datetime.date | None) -> int | None: if date is None: return None return (date - datetime.date(1970, 1, 1)).days diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 98e9f9881c7..649821b9b7c 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9966,6 +9966,20 @@ def test_dataframe_nunique(data): assert_eq(expected, actual) +@pytest.mark.parametrize( + "columns", + [ + pd.RangeIndex(2, name="foo"), + pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]), + pd.Index([3, 5], dtype=np.int8, name="foo"), + ], +) +def test_nunique_preserve_column_in_index(columns): + df = cudf.DataFrame([[1, 2]], columns=columns) + result = df.nunique().index.to_pandas() + assert_eq(result, columns, exact=True) + + @pytest.mark.parametrize( "data", [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py index a22b678ebe6..8ce4da792a4 100644 --- a/python/cudf/cudf/tests/test_df_protocol.py +++ b/python/cudf/cudf/tests/test_df_protocol.py @@ -1,6 +1,7 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+from __future__ import annotations -from typing import Any, Tuple +from typing import Any import cupy as cp import pandas as pd @@ -64,7 +65,7 @@ def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid): raise NotImplementedError() -def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol): +def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol): buf, dtype = buffer_and_dtype device_id = cp.asarray(cudfcol.data).device.id assert buf.__dlpack_device__() == (2, device_id) diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py index 7b923af1f75..013f4439ad5 100644 --- a/python/cudf/cudf/tests/test_interval.py +++ b/python/cudf/cudf/tests/test_interval.py @@ -181,3 +181,10 @@ def test_interval_with_datetime(tz, box): else: with pytest.raises(NotImplementedError): cudf.from_pandas(pobj) + + +def test_from_pandas_intervaldtype(): + dtype = pd.IntervalDtype("int64", closed="left") + result = cudf.from_pandas(dtype) + expected = cudf.IntervalDtype("int64", closed="left") + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index f143112a45f..7b95e4f9a44 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -2162,3 +2162,14 @@ def test_multi_index_contains_hashable(): lfunc_args_and_kwargs=((),), rfunc_args_and_kwargs=((),), ) + + +@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_nunique(array, dropna): + arrays = [array, [3, 4]] + gidx = cudf.MultiIndex.from_arrays(arrays) + pidx = pd.MultiIndex.from_arrays(arrays) + result = gidx.nunique(dropna=dropna) + expected = pidx.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 30189e1ac8a..52956c230ba 100644 --- 
a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2851,3 +2851,13 @@ def test_nans_to_nulls_noop_copies_column(value): ser1 = cudf.Series([value]) ser2 = ser1.nans_to_nulls() assert ser1._column is not ser2._column + + +@pytest.mark.parametrize("dropna", [False, True]) +def test_nunique_all_null(dropna): + data = [None, None] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + result = pd_ser.nunique(dropna=dropna) + expected = cudf_ser.nunique(dropna=dropna) + assert result == expected diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 913a958b4c2..59b8e6d2e70 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,5 @@ # Copyright (c) 2022-2024, NVIDIA CORPORATION. +from __future__ import annotations import contextlib import importlib @@ -7,7 +8,6 @@ import warnings import weakref from concurrent.futures import ThreadPoolExecutor -from typing import List, Tuple import cupy import numpy as np @@ -107,7 +107,7 @@ def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer: gen_df_data_nbytes = single_column_df()._data._data["a"].data.nbytes -def spilled_and_unspilled(manager: SpillManager) -> Tuple[int, int]: +def spilled_and_unspilled(manager: SpillManager) -> tuple[int, int]: """Get bytes spilled and unspilled known by the manager""" spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled) unspilled = sum( @@ -661,7 +661,7 @@ def test_statistics(manager: SpillManager): def test_statistics_expose(manager: SpillManager): assert len(manager.statistics.spill_totals) == 0 - buffers: List[SpillableBuffer] = [ + buffers: list[SpillableBuffer] = [ as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] @@ -687,7 +687,7 @@ def test_statistics_expose(manager: SpillManager): assert stat.spilled_nbytes == 0 # Create and spill 10 new buffers - buffers: List[SpillableBuffer] = [ + 
buffers: list[SpillableBuffer] = [ as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False) for _ in range(10) ] diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py index d57303ca122..cd7fe5ee023 100644 --- a/python/cudf/cudf/utils/applyutils.py +++ b/python/cudf/cudf/utils/applyutils.py @@ -1,7 +1,8 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import functools -from typing import Any, Dict +from typing import Any import cupy as cp from numba import cuda @@ -339,7 +340,7 @@ def chunk_wise_kernel(nrows, chunks, {args}): return kernel -_cache: Dict[Any, Any] = dict() +_cache: dict[Any, Any] = dict() @functools.wraps(_make_row_wise_kernel) diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py index 239438afd24..78aeac425f7 100644 --- a/python/cudf/cudf/utils/queryutils.py +++ b/python/cudf/cudf/utils/queryutils.py @@ -1,8 +1,9 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. +from __future__ import annotations import ast import datetime -from typing import Any, Dict +from typing import Any import numpy as np from numba import cuda @@ -114,7 +115,7 @@ def _check_error(tree): raise QuerySyntaxError("too many expressions") -_cache: Dict[Any, Any] = {} +_cache: dict[Any, Any] = {} def query_compile(expr): diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 95621cf9519..2e4dfc4bb14 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,11 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from __future__ import annotations import decimal import functools import os import traceback import warnings -from typing import FrozenSet, Set, Union import numpy as np import pandas as pd @@ -218,7 +218,7 @@ class GetAttrGetItemMixin: # `__setstate__`, but this class may be used in complex multiple # inheritance hierarchies that might also override serialization. The # solution here is a minimally invasive change that avoids such conflicts. - _PROTECTED_KEYS: Union[FrozenSet[str], Set[str]] = frozenset() + _PROTECTED_KEYS: frozenset[str] | set[str] = frozenset() def __getattr__(self, key): if key in self._PROTECTED_KEYS: diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 515a4714a5a..5be4d350c0b 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -9,6 +9,7 @@ import os import pathlib import pickle +import subprocess import tempfile import types from io import BytesIO, StringIO @@ -463,6 +464,9 @@ def test_options_mode(): assert xpd.options.mode.copy_on_write == pd.options.mode.copy_on_write +# Codecov and Profiler interfere with each other, +# hence we don't want to run code-cov on this test. 
+@pytest.mark.no_cover def test_profiler(): pytest.importorskip("cudf") @@ -1425,6 +1429,33 @@ def test_holidays_within_dates(holiday, start, expected): ) == [utc.localize(dt) for dt in expected] +@pytest.mark.parametrize( + "env_value", + ["", "cuda", "pool", "async", "managed", "managed_pool", "abc"], +) +def test_rmm_option_on_import(env_value): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + env["CUDF_PANDAS_RMM_MODE"] = env_value + + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + data_directory + "/data/profile_basic.py", + ], + capture_output=True, + text=True, + env=env, + ) + if env_value in {"cuda", "pool", "async", "managed", "managed_pool"}: + assert sp_completed.returncode == 0 + else: + assert sp_completed.returncode == 1 + + def test_cudf_pandas_debugging_different_results(monkeypatch): cudf_mean = cudf.Series.mean diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py index 39bf07c49de..a75a20a4681 100644 --- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py +++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES. # All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations import inspect from functools import partial diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py index d1f7a9ed2cf..ec8d00c3123 100644 --- a/python/cudf_polars/cudf_polars/containers/dataframe.py +++ b/python/cudf_polars/cudf_polars/containers/dataframe.py @@ -70,7 +70,7 @@ def num_columns(self) -> int: @cached_property def num_rows(self) -> int: """Number of rows.""" - return self.table.num_rows() + return 0 if len(self.columns) == 0 else self.table.num_rows() @classmethod def from_cudf(cls, df: cudf.DataFrame) -> Self: diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 377a905aed6..0605bba6642 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -644,13 +644,28 @@ def __init__( self.options = options self.name = name self.children = children + self._validate_input() + + def _validate_input(self): if self.name not in ( pl_expr.StringFunction.Lowercase, pl_expr.StringFunction.Uppercase, pl_expr.StringFunction.EndsWith, pl_expr.StringFunction.StartsWith, + pl_expr.StringFunction.Contains, ): raise NotImplementedError(f"String function {self.name}") + if self.name == pl_expr.StringFunction.Contains: + literal, strict = self.options + if not literal: + if not strict: + raise NotImplementedError( + f"{strict=} is not supported for regex contains" + ) + if not isinstance(self.children[1], Literal): + raise NotImplementedError( + "Regex contains only supports a scalar pattern" + ) def do_evaluate( self, @@ -660,6 +675,25 @@ def do_evaluate( mapping: Mapping[Expr, Column] | None = None, ) -> Column: """Evaluate this expression given a dataframe for context.""" + if self.name == pl_expr.StringFunction.Contains: + child, arg = self.children + column = child.evaluate(df, context=context, mapping=mapping) + + literal, _ = 
self.options + if literal: + pat = arg.evaluate(df, context=context, mapping=mapping) + pattern = ( + pat.obj_scalar + if pat.is_scalar and pat.obj.size() != column.obj.size() + else pat.obj + ) + return Column(plc.strings.find.contains(column.obj, pattern)) + assert isinstance(arg, Literal) + prog = plc.strings.regex_program.RegexProgram.create( + arg.value.as_py(), + flags=plc.strings.regex_flags.RegexFlags.DEFAULT, + ) + return Column(plc.strings.contains.contains_re(column.obj, prog)) columns = [ child.evaluate(df, context=context, mapping=mapping) for child in self.children @@ -690,8 +724,9 @@ def do_evaluate( else prefix.obj, ) ) - else: - raise NotImplementedError(f"StringFunction {self.name}") + raise NotImplementedError( + f"StringFunction {self.name}" + ) # pragma: no cover; handled by init raising class Sort(Expr): diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 46241ab8e71..7f0920e1b57 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -165,6 +165,10 @@ class PythonScan(IR): predicate: expr.NamedExpr | None """Filter to apply to the constructed dataframe before returning it.""" + def __post_init__(self): + """Validate preconditions.""" + raise NotImplementedError("PythonScan not implemented") + @dataclasses.dataclass(slots=True) class Scan(IR): @@ -282,13 +286,18 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: pdf = pl.DataFrame._from_pydf(self.df) if self.projection is not None: pdf = pdf.select(self.projection) - # TODO: goes away when libcudf supports large strings table = pdf.to_arrow() schema = table.schema for i, field in enumerate(schema): + # TODO: Nested types if field.type == pa.large_string(): - # TODO: Nested types + # TODO: goes away when libcudf supports large strings schema = schema.set(i, pa.field(field.name, pa.string())) + elif isinstance(field.type, pa.LargeListType): + # TODO: goes away when 
libcudf supports large lists + schema = schema.set( + i, pa.field(field.name, pa.list_(field.type.field(0))) + ) table = table.cast(schema) df = DataFrame.from_table( plc.interop.from_arrow(table), list(self.schema.keys()) @@ -846,9 +855,11 @@ class MapFunction(IR): _NAMES: ClassVar[frozenset[str]] = frozenset( [ - "drop_nulls", "rechunk", - "merge_sorted", + # libcudf merge is not stable wrt order of inputs, since + # it uses a priority queue to manage the tables it produces. + # See: https://github.com/rapidsai/cudf/issues/16010 + # "merge_sorted", "rename", "explode", ] @@ -865,46 +876,13 @@ def __post_init__(self) -> None: # polars requires that all to-explode columns have the # same sub-shapes raise NotImplementedError("Explode with more than one column") - elif self.name == "merge_sorted": - assert isinstance(self.df, Union) - (key_column,) = self.options - if key_column not in self.df.dfs[0].schema: - raise ValueError(f"Key column {key_column} not found") def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" - if self.name == "merge_sorted": - # merge_sorted operates on Union inputs - # but if we evaluate the Union then we can't unpick the - # pieces, so we dive inside and evaluate the pieces by hand - assert isinstance(self.df, Union) - first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs) - (key_column,) = self.options - if not all(first.column_names == r.column_names for r in rest): - raise ValueError("DataFrame shapes/column names don't match") - # Already validated that key_column is in column names - index = first.column_names.index(key_column) - return DataFrame.from_table( - plc.merge.merge_sorted( - [first.table, *(df.table for df in rest)], - [index], - [plc.types.Order.ASCENDING], - [plc.types.NullOrder.BEFORE], - ), - first.column_names, - ).sorted_like(first, subset={key_column}) - elif self.name == "rechunk": + if self.name == "rechunk": # No-op in our data model - return 
self.df.evaluate(cache=cache) - elif self.name == "drop_nulls": - df = self.df.evaluate(cache=cache) - (subset,) = self.options - subset = set(subset) - indices = [i for i, name in enumerate(df.column_names) if name in subset] - return DataFrame.from_table( - plc.stream_compaction.drop_nulls(df.table, indices, len(indices)), - df.column_names, - ).sorted_like(df) + # Don't think this appears in a plan tree from python + return self.df.evaluate(cache=cache) # pragma: no cover elif self.name == "rename": df = self.df.evaluate(cache=cache) # final tag is "swapping" which is useful for the @@ -920,7 +898,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) else: - raise AssertionError("Should never be reached") + raise AssertionError("Should never be reached") # pragma: no cover @dataclasses.dataclass(slots=True) diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py new file mode 100644 index 00000000000..9bbce6bc080 --- /dev/null +++ b/python/cudf_polars/tests/conftest.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + + +@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session") +def with_nulls(request): + return request.param diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 79018c80bf3..b044bbb2885 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -20,11 +20,6 @@ def dtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"]) -def with_nulls(request): - return request.param - - @pytest.fixture( params=[ False, diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py index 22865a7ce22..143dd7e9f0f 100644 --- a/python/cudf_polars/tests/expressions/test_distinct.py +++ b/python/cudf_polars/tests/expressions/test_distinct.py @@ -9,11 +9,6 @@ from cudf_polars.testing.asserts import assert_gpu_result_equal -@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"]) -def nullable(request): - return request.param - - @pytest.fixture( params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"] ) @@ -22,9 +17,9 @@ def op(request): @pytest.fixture -def df(nullable): +def df(with_nulls): values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1] - if nullable: + if with_nulls: values[1] = None values[4] = None return pl.LazyFrame({"a": values}) diff --git a/python/cudf_polars/tests/expressions/test_len.py b/python/cudf_polars/tests/expressions/test_len.py new file mode 100644 index 00000000000..03b30928184 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_len.py @@ -0,0 +1,26 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.mark.parametrize("dtype", [pl.UInt32, pl.Int32, None]) +@pytest.mark.parametrize("empty", [False, True]) +def test_len(dtype, empty): + if empty: + df = pl.LazyFrame({}) + else: + df = pl.LazyFrame({"a": [1, 2, 3]}) + + if dtype is None: + q = df.select(pl.len()) + else: + q = df.select(pl.len().cast(dtype)) + + # Workaround for https://github.com/pola-rs/polars/issues/16904 + assert_gpu_result_equal(q, collect_kwargs={"projection_pushdown": False}) diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py index 548aebf0875..7eefc59d927 100644 --- a/python/cudf_polars/tests/expressions/test_numeric_binops.py +++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py @@ -29,11 +29,6 @@ def rtype(request): return request.param -@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"]) -def with_nulls(request): - return request.param - - @pytest.fixture( params=[ pl.Expr.eq, diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py new file mode 100644 index 00000000000..3c498fe7286 --- /dev/null +++ b/python/cudf_polars/tests/expressions/test_stringfunction.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +from functools import partial + +import pytest + +import polars as pl + +from cudf_polars import execute_with_cudf, translate_ir +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +@pytest.fixture +def ldf(with_nulls): + a = [ + "AbC", + "de", + "FGHI", + "j", + "kLm", + "nOPq", + "", + "RsT", + "sada", + "uVw", + "h", + "Wıth ünιcοde", # noqa: RUF001 + ] + if with_nulls: + a[4] = None + a[-3] = None + return pl.LazyFrame({"a": a, "b": range(len(a))}) + + +def test_supported_stringfunction_expression(ldf): + query = ldf.select( + pl.col("a").str.starts_with("Z"), + pl.col("a").str.ends_with("h").alias("endswith_h"), + pl.col("a").str.to_lowercase().alias("lower"), + pl.col("a").str.to_uppercase().alias("upper"), + ) + assert_gpu_result_equal(query) + + +def test_unsupported_stringfunction(ldf): + q = ldf.select(pl.col("a").str.count_matches("e", literal=True)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_contains_re_non_strict_raises(ldf): + q = ldf.select(pl.col("a").str.contains(".", strict=False)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_contains_re_non_literal_raises(ldf): + q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False)) + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +@pytest.mark.parametrize( + "substr", + [ + "A", + "de", + ".*", + "^a", + "^A", + "[^a-z]", + "[a-z]{3,}", + "^[A-Z]{2,}", + "j|u", + ], +) +def test_contains_regex(ldf, substr): + query = ldf.select(pl.col("a").str.contains(substr)) + assert_gpu_result_equal(query) + + +@pytest.mark.parametrize( + "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"] +) +def test_contains_literal(ldf, literal): + query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True)) + assert_gpu_result_equal(query) + + +def 
test_contains_column(ldf): + query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True)) + assert_gpu_result_equal(query) + + +def test_contains_invalid(ldf): + query = ldf.select(pl.col("a").str.contains("[")) + + with pytest.raises(pl.exceptions.ComputeError): + query.collect() + with pytest.raises(pl.exceptions.ComputeError): + query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)) diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py new file mode 100644 index 00000000000..ec6b3f3fc0a --- /dev/null +++ b/python/cudf_polars/tests/test_mapfunction.py @@ -0,0 +1,43 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import translate_ir +from cudf_polars.testing.asserts import assert_gpu_result_equal + + +def test_merge_sorted_raises(): + df1 = pl.LazyFrame({"a": [1, 6, 9], "b": [1, -10, 4]}) + df2 = pl.LazyFrame({"a": [-1, 5, 11, 20], "b": [2, 7, -4, None]}) + df3 = pl.LazyFrame({"a": [-10, 20, 21], "b": [1, 2, 3]}) + + q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a") + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +def test_explode_multiple_raises(): + df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]}) + q = df.explode("a", "b") + + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + +@pytest.mark.parametrize("column", ["a", "b"]) +def test_explode_single(column): + df = pl.LazyFrame( + { + "a": [[1, 2], [3, 4], None], + "b": [[5, 6], [7, 8], [9, 10]], + "c": [None, 11, 12], + } + ) + q = df.explode(column) + + assert_gpu_result_equal(q) diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py new file mode 100644 index 00000000000..c03474e3dc8 --- /dev/null +++ 
b/python/cudf_polars/tests/test_python_scan.py @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import pytest + +import polars as pl + +from cudf_polars import translate_ir + + +def test_python_scan(): + def source(with_columns, predicate, nrows): + return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())}) + + q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False) + with pytest.raises(NotImplementedError): + _ = translate_ir(q._ldf.visit()) + + assert q.collect().equals(source(None, None, None)) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index ef47ea436c7..2e72461b43d 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -1,7 +1,7 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. +from __future__ import annotations from functools import wraps -from typing import Set import numpy as np import pandas as pd @@ -695,7 +695,7 @@ def _aggs_optimized(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): if isinstance(arg, dict): - _global_set: Set[str] = set() + _global_set: set[str] = set() for col in arg: if isinstance(arg[col], list): _global_set = _global_set.union(set(arg[col])) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba8b1e89721..810a804e428 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -316,7 +316,7 @@ def read_partition( if index and (index[0] in df.columns): df = df.set_index(index[0]) - elif index is False and df.index.names != (None,): + elif index is False and df.index.names != [None]: # If index=False, we shouldn't have a named index df.reset_index(inplace=True)