orc multithreaded benchmark (#16009)

Addresses: #15973 Adds multithreaded benchmarks for the ORC reader. Based off of the parquet equivalent in #15585 ``` # Benchmark Results ## orc_multithreaded_read_decode_mixed ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 338x | 44.348 ms | 1.18% | 44.343 ms | 1.18% | 12107185968 | 939.341 MiB | 39.557 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 80x | 77.634 ms | 0.65% | 77.629 ms | 0.65% | 13831742649 | 1.834 GiB | 79.072 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 341x | 43.921 ms | 1.20% | 43.916 ms | 1.20% | 12224889363 | 825.333 MiB | 39.568 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 80x | 75.418 ms | 0.70% | 75.414 ms | 0.70% | 14237999015 | 1.611 GiB | 79.113 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 80x | 42.682 ms | 1.18% | 42.678 ms | 1.18% | 12579566132 | 883.436 MiB | 39.587 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 9x | 74.056 ms | 0.48% | 74.052 ms | 0.48% | 14499873867 | 1.724 GiB | 79.136 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 25x | 42.198 ms | 0.50% | 42.194 ms | 0.49% | 12723960975 | 940.562 MiB | 39.600 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 8x | 73.933 ms | 0.49% | 73.929 ms | 0.49% | 14524042443 | 1.781 GiB | 79.175 MiB | ## orc_multithreaded_read_decode_fixed_width ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 13x | 40.149 ms | 0.04% | 40.144 ms | 0.04% | 13373482726 | 643.390 MiB | 59.821 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 211x | 71.216 ms | 0.67% | 71.211 ms | 0.67% | 15078297784 | 1.257 GiB | 119.650 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 378x | 39.662 ms | 1.31% | 39.658 ms | 1.31% | 13537590893 | 643.392 MiB | 59.833 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 209x | 71.693 ms | 0.71% | 71.688 ms | 0.71% | 14978085376 | 1.257 GiB | 119.642 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 377x | 39.731 ms | 1.30% | 39.726 ms | 1.30% | 13514305239 | 643.394 MiB | 59.856 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 8x | 70.766 ms | 0.08% | 70.761 ms | 0.08% | 15174115364 | 1.030 GiB | 119.665 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 379x | 39.486 ms | 1.27% | 39.482 ms | 1.27% | 13597888468 | 647.399 MiB | 59.928 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 207x | 72.686 ms | 2.04% | 72.681 ms | 2.04% | 14773317833 | 1.143 GiB | 119.711 MiB | ## orc_multithreaded_read_decode_string ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 80x | 22.933 ms | 2.13% | 22.928 ms | 2.13% | 23415352877 | 661.948 MiB | 10.879 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 160x | 34.167 ms | 1.41% | 34.162 ms | 1.41% | 31430436877 | 1.293 GiB | 21.757 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 560x | 22.533 ms | 2.18% | 22.528 ms | 2.18% | 23830839172 | 609.407 MiB | 10.941 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 80x | 34.311 ms | 1.54% | 34.307 ms | 1.54% | 31298288990 | 1.188 GiB | 21.758 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 23x | 22.179 ms | 0.11% | 22.175 ms | 0.11% | 24211151047 | 624.177 MiB | 10.947 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 15x | 33.793 ms | 0.08% | 33.789 ms | 0.08% | 31777989791 | 1.190 GiB | 21.881 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 679x | 22.006 ms | 1.74% | 22.002 ms | 1.74% | 24401381631 | 624.524 MiB | 10.951 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 160x | 33.320 ms | 1.57% | 33.316 ms | 1.57% | 32229227026 | 1.207 GiB | 21.894 MiB | ## orc_multithreaded_read_decode_list ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 96x | 74.437 ms | 0.68% | 74.433 ms | 0.68% | 7212831148 | 600.751 MiB | 60.245 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 7x | 80.994 ms | 0.49% | 80.990 ms | 0.49% | 13257745936 | 1.173 GiB | 120.549 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 80x | 79.234 ms | 4.57% | 79.229 ms | 4.57% | 6776190522 | 600.950 MiB | 60.250 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 166x | 90.437 ms | 17.19% | 90.432 ms | 17.19% | 11873413959 | 1.173 GiB | 120.489 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 80x | 78.613 ms | 2.98% | 78.608 ms | 2.98% | 6829702014 | 602.764 MiB | 60.323 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 127x | 118.629 ms | 22.67% | 118.624 ms | 22.67% | 9051644873 | 1.174 GiB | 120.499 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 112x | 133.950 ms | 4.45% | 133.945 ms | 4.45% | 4008135293 | 603.471 MiB | 60.353 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 90x | 167.850 ms | 15.93% | 167.844 ms | 15.93% | 6397248426 | 1.177 GiB | 120.646 MiB | ## orc_multithreaded_read_decode_chunked_mixed ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 333x | 45.009 ms | 1.10% | 45.005 ms | 1.10% | 11929261073 | 939.341 MiB | 39.557 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 96x | 81.524 ms | 0.61% | 81.519 ms | 0.61% | 13171640865 | 1.834 GiB | 79.072 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 339x | 44.183 ms | 0.96% | 44.179 ms | 0.96% | 12152252271 | 825.333 MiB | 39.568 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 7x | 79.051 ms | 0.02% | 79.046 ms | 0.02% | 13583676002 | 1.611 GiB | 79.113 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 12x | 43.276 ms | 0.09% | 43.272 ms | 0.09% | 12407024794 | 883.436 MiB | 39.587 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 19x | 78.019 ms | 0.49% | 78.014 ms | 0.49% | 13763433041 | 1.724 GiB | 79.136 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 42.803 ms | 1.22% | 42.799 ms | 1.22% | 12543864010 | 911.993 MiB | 39.600 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 193x | 77.856 ms | 0.59% | 77.852 ms | 0.59% | 13792063986 | 1.837 GiB | 79.175 MiB | ## orc_multithreaded_read_decode_chunked_fixed_width ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 112x | 40.497 ms | 1.23% | 40.493 ms | 1.23% | 13258480947 | 643.390 MiB | 59.821 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 7x | 75.440 ms | 0.09% | 75.435 ms | 0.09% | 14234033611 | 1.648 GiB | 119.651 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 39.793 ms | 1.36% | 39.789 ms | 1.36% | 13493067216 | 643.392 MiB | 59.833 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 69x | 74.499 ms | 0.50% | 74.494 ms | 0.50% | 14413864845 | 1.336 GiB | 119.642 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 381x | 39.273 ms | 1.11% | 39.269 ms | 1.11% | 13671742653 | 643.394 MiB | 59.856 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 204x | 73.755 ms | 0.60% | 73.751 ms | 0.60% | 14559012350 | 1.648 GiB | 119.665 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 39.490 ms | 1.31% | 39.486 ms | 1.31% | 13596333864 | 631.980 MiB | 59.928 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 203x | 73.907 ms | 1.34% | 73.903 ms | 1.34% | 14529071322 | 1.454 GiB | 119.711 MiB | ## orc_multithreaded_read_decode_chunked_string ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 23.022 ms | 1.96% | 23.017 ms | 1.96% | 23324556592 | 661.948 MiB | 10.879 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 37.687 ms | 1.37% | 37.682 ms | 1.37% | 28494755419 | 1.659 GiB | 21.757 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 22.703 ms | 2.30% | 22.699 ms | 2.30% | 23652118769 | 609.407 MiB | 10.941 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 80x | 37.581 ms | 1.42% | 37.577 ms | 1.42% | 28574723179 | 1.658 GiB | 21.758 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 544x | 22.296 ms | 1.56% | 22.293 ms | 1.56% | 24082840350 | 631.319 MiB | 10.947 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 14x | 36.990 ms | 0.14% | 36.985 ms | 0.14% | 29031484389 | 1.554 GiB | 21.881 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 676x | 22.114 ms | 1.22% | 22.110 ms | 1.22% | 24281965280 | 627.616 MiB | 10.951 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 37.409 ms | 1.40% | 37.405 ms | 1.40% | 28706077426 | 1.562 GiB | 21.894 MiB | ## orc_multithreaded_read_decode_chunked_list ### [0] NVIDIA RTX 5880 Ada Generation | cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time | Noise | GPU Time | Noise | bytes_per_second | peak_memory_usage | encoded_file_size | |-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------| | 1000 | 536870912 | 1 | 4 | 8 | 671088640 | 671088640 | 80x | 74.780 ms | 0.67% | 74.776 ms | 0.67% | 7179747067 | 600.751 MiB | 60.245 MiB | | 1000 | 1073741824 | 1 | 4 | 8 | 671088640 | 671088640 | 175x | 86.040 ms | 0.56% | 86.035 ms | 0.56% | 12480222210 | 1.576 GiB | 120.549 MiB | | 1000 | 536870912 | 2 | 4 | 8 | 671088640 | 671088640 | 186x | 80.668 ms | 4.14% | 80.664 ms | 4.14% | 6655685080 | 600.951 MiB | 60.250 MiB | | 1000 | 1073741824 | 2 | 4 | 8 | 671088640 | 671088640 | 143x | 105.217 ms | 21.56% | 105.212 ms | 21.56% | 10205531345 | 1.576 GiB | 120.489 MiB | | 1000 | 536870912 | 4 | 4 | 8 | 671088640 | 671088640 | 128x | 80.087 ms | 3.05% | 80.082 ms | 3.05% | 6704042147 | 602.764 MiB | 60.323 MiB | | 1000 | 1073741824 | 4 | 4 | 8 | 671088640 | 671088640 | 135x | 111.556 ms | 21.88% | 111.551 ms | 21.88% | 9625546746 | 1.489 GiB | 120.499 MiB | | 1000 | 536870912 | 8 | 4 | 8 | 671088640 | 671088640 | 112x | 134.677 ms | 4.14% | 134.672 ms | 4.14% | 3986513604 | 603.471 MiB | 60.353 MiB | | 1000 | 1073741824 | 8 | 4 | 8 | 671088640 | 671088640 | 80x | 178.735 ms | 14.17% | 178.730 ms | 14.17% | 6007630497 | 1.520 GiB | 120.646 MiB | ``` Authors: - Zach Puller (https://github.com/zpuller) - Vukasin Milovanovic (https://github.com/vuule) - MithunR (https://github.com/mythrocks) Approvers: - Yunsong Wang (https://github.com/PointKernel) - MithunR (https://github.com/mythrocks) URL: #16009
rapidsai · Jun 14, 2024 · 34227d3 · 34227d3
1 parent 829b3a9
commit 34227d3
Show file tree

Hide file tree

Showing 2 changed files with 340 additions and 0 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -267,6 +267,11 @@ ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_mu
 # * orc reader benchmark --------------------------------------------------------------------------
 ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp)
 
+# ##################################################################################################
+# * orc multithreaded benchmark
+# --------------------------------------------------------------------------
+ConfigureNVBench(ORC_MULTITHREADED_NVBENCH io/orc/orc_reader_multithreaded.cpp)
+
 # ##################################################################################################
 # * csv reader benchmark --------------------------------------------------------------------------
 ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp)

diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/io/orc.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+#include <cudf/utilities/thread_pool.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+size_t get_num_read_threads(nvbench::state const& state) { return state.get_int64("num_threads"); }
+
+size_t get_read_size(nvbench::state const& state)
+{
+  auto const num_reads = get_num_read_threads(state);
+  return state.get_int64("total_data_size") / num_reads;
+}
+
+std::string get_label(std::string const& test_name, nvbench::state const& state)
+{
+  auto const num_cols       = state.get_int64("num_cols");
+  size_t const read_size_mb = get_read_size(state) / (1024 * 1024);
+  return {test_name + ", " + std::to_string(num_cols) + " columns, " +
+          std::to_string(get_num_read_threads(state)) + " threads " + " (" +
+          std::to_string(read_size_mb) + " MB each)"};
+}
+
+std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
+  nvbench::state& state, std::vector<cudf::type_id> const& d_types)
+{
+  auto const cardinality = state.get_int64("cardinality");
+  auto const run_length  = state.get_int64("run_length");
+  auto const num_cols    = state.get_int64("num_cols");
+  size_t const num_files            = get_num_read_threads(state);
+  size_t const per_file_data_size   = get_read_size(state);
+
+  std::vector<cuio_source_sink_pair> source_sink_vector;
+
+  size_t total_file_size = 0;
+
+  for (size_t i = 0; i < num_files; ++i) {
+    cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};
+
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_types, num_cols),
+      table_size_bytes{per_file_data_size},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::orc_writer_options const write_opts =
+      cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::SNAPPY);
+
+    cudf::io::write_orc(write_opts);
+    total_file_size += source_sink.size();
+
+    source_sink_vector.push_back(std::move(source_sink));
+  }
+
+  return {std::move(source_sink_vector), total_file_size, num_files};
+}
+
+void BM_orc_multithreaded_read_common(nvbench::state& state,
+                                      std::vector<cudf::type_id> const& d_types,
+                                      std::string const& label)
+{
+  auto const data_size = state.get_int64("total_data_size");
+  auto const num_threads = state.get_int64("num_threads");
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+
+  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  std::vector<cudf::io::source_info> source_info_vector;
+  std::transform(source_sink_vector.begin(),
+                 source_sink_vector.end(),
+                 std::back_inserter(source_info_vector),
+                 [](auto& source_sink) { return source_sink.make_source_info(); });
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  {
+    cudf::scoped_range range{("(read) " + label).c_str()};
+    state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+              [&](nvbench::launch& launch, auto& timer) {
+                auto read_func = [&](int index) {
+                  auto const stream = streams[index % num_threads];
+                  cudf::io::orc_reader_options read_opts =
+                    cudf::io::orc_reader_options::builder(source_info_vector[index]);
+                  cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
+                };
+
+                threads.paused = true;
+                for (size_t i = 0; i < num_files; ++i) {
+                  threads.submit(read_func, i);
+                }
+                timer.start();
+                threads.paused = false;
+                threads.wait_for_tasks();
+                cudf::detail::join_streams(streams, cudf::get_default_stream());
+                timer.stop();
+              });
+  }
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+void BM_orc_multithreaded_read_mixed(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(
+    state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_fixed_width(nvbench::state& state)
+{
+  auto label = get_label("fixed width", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(state, {cudf::type_id::INT32}, label);
+}
+
+void BM_orc_multithreaded_read_string(nvbench::state& state)
+{
+  auto label = get_label("string", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(state, {cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_list(nvbench::state& state)
+{
+  auto label = get_label("list", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(state, {cudf::type_id::LIST}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_common(nvbench::state& state,
+                                              std::vector<cudf::type_id> const& d_types,
+                                              std::string const& label)
+{
+  size_t const data_size    = state.get_int64("total_data_size");
+  auto const num_threads    = state.get_int64("num_threads");
+  size_t const input_limit  = state.get_int64("input_limit");
+  size_t const output_limit = state.get_int64("output_limit");
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  std::vector<cudf::io::source_info> source_info_vector;
+  std::transform(source_sink_vector.begin(),
+                 source_sink_vector.end(),
+                 std::back_inserter(source_info_vector),
+                 [](auto& source_sink) { return source_sink.make_source_info(); });
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  {
+    cudf::scoped_range range{("(read) " + label).c_str()};
+    std::vector<cudf::io::table_with_metadata> chunks;
+    state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+              [&](nvbench::launch& launch, auto& timer) {
+                auto read_func = [&](int index) {
+                  auto const stream = streams[index % num_threads];
+                  cudf::io::orc_reader_options read_opts =
+                    cudf::io::orc_reader_options::builder(source_info_vector[index]);
+                  // divide chunk limits by number of threads so the number of chunks produced is the
+                  // same for all cases. this seems better than the alternative, which is to keep the
+                  // limits the same. if we do that, as the number of threads goes up, the number of
+                  // chunks goes down - so are actually benchmarking the same thing in that case?
+                  auto reader = cudf::io::chunked_orc_reader(
+                    output_limit / num_threads, input_limit / num_threads, read_opts, stream);
+
+                  // read all the chunks
+                  do {
+                    auto table = reader.read_chunk();
+                  } while (reader.has_next());
+                };
+
+                threads.paused = true;
+                for (size_t i = 0; i < num_files; ++i) {
+                  threads.submit(read_func, i);
+                }
+                timer.start();
+                threads.paused = false;
+                threads.wait_for_tasks();
+                cudf::detail::join_streams(streams, cudf::get_default_stream());
+                timer.stop();
+              });
+  }
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+void BM_orc_multithreaded_read_chunked_mixed(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(
+    state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_fixed_width(nvbench::state& state)
+{
+  auto label = get_label("fixed width", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_string(nvbench::state& state)
+{
+  auto label = get_label("string", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_list(nvbench::state& state)
+{
+  auto label = get_label("list", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label);
+}
+auto const thread_range  = std::vector<nvbench::int64_t>{1, 2, 4, 8};
+auto const total_data_size = std::vector<nvbench::int64_t>{512 * 1024 * 1024, 1024 * 1024 * 1024};
+
+// mixed data types: fixed width and strings
+NVBENCH_BENCH(BM_orc_multithreaded_read_mixed)
+  .set_name("orc_multithreaded_read_decode_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_fixed_width)
+  .set_name("orc_multithreaded_read_decode_fixed_width")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_string)
+  .set_name("orc_multithreaded_read_decode_string")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_list)
+  .set_name("orc_multithreaded_read_decode_list")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+// mixed data types: fixed width, strings
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_mixed)
+  .set_name("orc_multithreaded_read_decode_chunked_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_fixed_width)
+  .set_name("orc_multithreaded_read_decode_chunked_fixed_width")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_string)
+  .set_name("orc_multithreaded_read_decode_chunked_string")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_list)
+  .set_name("orc_multithreaded_read_decode_chunked_list")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});