From 9a770f663018dcf46d721d0a599f89faa5564dcb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Apr 2023 19:39:41 -0700 Subject: [PATCH] Fix OOB memory access in CSV reader when reading without NA values (#13011) CSV reader uses a trie to read field with special values as nulls. The creation of the trie does not work correctly when there are not special values. This can happen when the NA filter is enabled, but the default NA values are removed, and user does not specify custom values. In this case, use of this trie leads to OOB memory access. This PR fixes the trie creation to create an empty trie when there are not special values to look for. Included a C++ test that crashes without the fix. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Mike Wilson (https://github.com/hyperbolic2346) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/13011 --- cpp/src/io/utilities/trie.cu | 2 ++ cpp/src/io/utilities/trie.cuh | 6 +++--- cpp/tests/io/csv_test.cpp | 16 ++++++++++++++++ 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/utilities/trie.cu b/cpp/src/io/utilities/trie.cu index e2ace7258f7..d902cfecf40 100644 --- a/cpp/src/io/utilities/trie.cu +++ b/cpp/src/io/utilities/trie.cu @@ -36,6 +36,8 @@ namespace detail { rmm::device_uvector create_serialized_trie(const std::vector& keys, rmm::cuda_stream_view stream) { + if (keys.empty()) { return rmm::device_uvector{0, stream}; } + static constexpr int alphabet_size = std::numeric_limits::max() + 1; struct TreeTrieNode { using TrieNodePtr = std::unique_ptr; diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index 85834ad2f0e..0f87de81653 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, NVIDIA CORPORATION. + * Copyright (c) 2018-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,8 +83,8 @@ trie create_serialized_trie(const std::vector& keys, rmm::cuda_stre __host__ __device__ inline bool serialized_trie_contains(device_span trie, device_span key) { - if (trie.data() == nullptr || trie.empty()) return false; - if (key.empty()) return trie.front().is_leaf; + if (trie.empty()) { return false; } + if (key.empty()) { return trie.front().is_leaf; } auto curr_node = trie.begin() + 1; for (auto curr_key = key.begin(); curr_key < key.end(); ++curr_key) { // Don't jump away from root node diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index a0daab767c0..193d1092cab 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1358,6 +1358,22 @@ TEST_F(CsvReaderTest, nullHandling) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, view.column(0)); } + + // Filter enabled, but no NA values + { + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) + .keep_default_na(false) + .dtypes({dtype()}) + .header(-1) + .skip_blank_lines(false); + const auto result = cudf::io::read_csv(in_opts); + const auto view = result.tbl->view(); + auto expect = + cudf::test::strings_column_wrapper({"NULL", "", "null", "n/a", "Null", "NA", "nan"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, view.column(0)); + } } TEST_F(CsvReaderTest, FailCases)