From 5f9da83f7932250525232d246f9156ab22d244be Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 17 Jul 2022 17:57:27 -0700 Subject: [PATCH 01/31] Fix inconsistency when hashing two tables in `cudf::detail::contains` (#11284) When hashing elements in a column, the nullable information needs to be taken into account. This could produce different results for the hash values of the same elements if the nullable condition is changed. As such, whenever we need to compute hashing for more than one column/table in the same operation, we need to take into account the nullable of all columns/tables, not just the one that is being hashed. This PR fixes a bug when hashing two tables using different nullable values in `cudf::detail::contains`, which led to incorrect results when there is only one nullable table in the input. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11284 --- cpp/src/search/contains_table.cu | 9 +++---- cpp/tests/join/semi_anti_join_tests.cpp | 31 +++++++++++++++++++++++-- 2 files changed, 34 insertions(+), 6 deletions(-) diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 2aa6bf9f9a3..46280d4ff5f 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -62,11 +62,12 @@ rmm::device_uvector contains(table_view const& haystack, auto const haystack_has_nulls = has_nested_nulls(haystack); auto const needles_has_nulls = has_nested_nulls(needles); + auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; // Insert all row hash values and indices of the haystack table. { auto const hasher = cudf::experimental::row::hash::row_hasher(haystack, stream); - auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{haystack_has_nulls}); + auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); using make_pair_fn = make_pair_function; @@ -110,7 +111,7 @@ rmm::device_uvector contains(table_view const& haystack, // Check existence for each row of the needles table in the haystack table. { auto const hasher = cudf::experimental::row::hash::row_hasher(needles, stream); - auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{needles_has_nulls}); + auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); auto const comparator = cudf::experimental::row::equality::two_table_comparator(haystack, needles, stream); @@ -121,8 +122,8 @@ rmm::device_uvector contains(table_view const& haystack, size_type{0}, make_pair_fn{d_hasher, map.get_empty_key_sentinel()}); auto const check_contains = [&](auto const value_comp) { - auto const d_eqcomp = comparator.equal_to( - nullate::DYNAMIC{needles_has_nulls || haystack_has_nulls}, compare_nulls, value_comp); + auto const d_eqcomp = + comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); map.pair_contains(needles_it, needles_it + needles.num_rows(), contained.begin(), diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index ff4270058cd..97af1fd7006 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -52,7 +53,7 @@ TEST_F(JoinTest, TestSimple) cudf::data_type{cudf::type_to_id()}, result->size(), result->data()); column_wrapper expected{0, 1}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result_cv); -}; +} std::pair, std::unique_ptr> get_saj_tables( std::vector const& left_is_human_nulls, std::vector const& right_is_human_nulls) @@ -230,3 +231,29 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNullsNotEqual) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } + +TEST_F(JoinTest, AntiJoinWithStructsAndNullsOnOneSide) +{ + auto constexpr null{0}; + auto left_col0 = [] { + column_wrapper child1{{1, null}, cudf::test::iterators::null_at(1)}; + column_wrapper child2{11, 12}; + return cudf::test::structs_column_wrapper{{child1, child2}}; + }(); + auto right_col0 = [] { + column_wrapper child1{1, 2, 3, 4}; + column_wrapper child2{11, 12, 13, 14}; + return cudf::test::structs_column_wrapper{{child1, child2}}; + }(); + + auto left = cudf::table_view{{left_col0}}; + auto right = cudf::table_view{{right_col0}}; + + auto result = cudf::left_anti_join(left, right, {0}, {0}); + auto expected = [] { + column_wrapper child1{{null}, cudf::test::iterators::null_at(0)}; + column_wrapper child2{12}; + return cudf::test::structs_column_wrapper{{child1, child2}}; + }(); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0).view()); +} From f220e902b1a6ea1361f973a5f5db4709d0507812 Mon Sep 17 00:00:00 2001 From: Sheilah Kirui <71867292+skirui-source@users.noreply.github.com> Date: Mon, 18 Jul 2022 09:06:29 -0700 Subject: [PATCH 02/31] Implement Groupby pct_change (#11144) Subsequent to https://github.com/rapidsai/cudf/pull/9805, this PR adds support for Groupby.pct_change() Fixes https://github.com/rapidsai/cudf/issues/9606 Replaces https://github.com/rapidsai/cudf/pull/10444 Authors: - Sheilah Kirui (https://github.com/skirui-source) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/11144 --- python/cudf/cudf/core/groupby/groupby.py | 42 ++++++++++++++ python/cudf/cudf/tests/test_groupby.py | 73 ++++++++++++++++++++++++ 2 files changed, 115 insertions(+) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8347c2bd94e..c651cfdf3a1 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1416,6 +1416,48 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): result = self._mimic_pandas_order(result) return result._copy_type_metadata(values) + def pct_change( + self, periods=1, fill_method="ffill", axis=0, limit=None, freq=None + ): + """ + Calculates the percent change between sequential elements + in the group. + + Parameters + ---------- + periods : int, default 1 + Periods to shift for forming percent change. + fill_method : str, default 'ffill' + How to handle NAs before computing percent changes. + limit : int, optional + The number of consecutive NAs to fill before stopping. + Not yet implemented. + freq : str, optional + Increment to use from time series API. + Not yet implemented. 
+ + Returns + ------- + Series or DataFrame + Percentage changes within each group + """ + if not axis == 0: + raise NotImplementedError("Only axis=0 is supported.") + if limit is not None: + raise NotImplementedError("limit parameter not supported yet.") + if freq is not None: + raise NotImplementedError("freq parameter not supported yet.") + elif fill_method not in {"ffill", "pad", "bfill", "backfill"}: + raise ValueError( + "fill_method must be one of 'ffill', 'pad', " + "'bfill', or 'backfill'." + ) + + filled = self.fillna(method=fill_method, limit=limit) + fill_grp = filled.groupby(self.grouping) + shifted = fill_grp.shift(periods=periods, freq=freq) + return (filled / shifted) - 1 + def _mimic_pandas_order( self, result: DataFrameOrSeries ) -> DataFrameOrSeries: diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0750a36461b..bd5e9fe017b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2604,3 +2604,76 @@ def test_groupby_transform_maintain_index(by): assert_groupby_results_equal( pdf.groupby(by).transform("max"), gdf.groupby(by).transform("max") ) + + +@pytest.mark.parametrize( + "data, gkey", + [ + ( + { + "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"], + "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2], + "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1], + "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1], + }, + ["id"], + ), + ( + { + "id": [0, 0, 0, 0, 1, 1, 1], + "a": [1, 3, 4, 2.0, -3.0, 9.0, 10.0], + "b": [10.0, 23, -4.0, 2, -3.0, None, 19.0], + }, + ["id", "a"], + ), + ( + { + "id": ["a", "a", "b", "b", "c", "c"], + "val1": [None, None, None, None, None, None], + }, + ["id"], + ), + ], +) +@pytest.mark.parametrize("periods", [-5, -2, 0, 2, 5]) +@pytest.mark.parametrize("fill_method", ["ffill", "bfill", "pad", "backfill"]) +def test_groupby_pct_change(data, gkey, periods, fill_method): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) + expected = pdf.groupby(gkey).pct_change( + periods=periods, fill_method=fill_method + ) + + assert_eq(expected, actual) + + +@pytest.mark.xfail(reason="https://github.com/rapidsai/cudf/issues/11259") +@pytest.mark.parametrize("periods", [-5, 5]) +def test_groupby_pct_change_multiindex_dataframe(periods): + gdf = cudf.DataFrame( + { + "a": [1, 1, 2, 2], + "b": [1, 1, 2, 3], + "c": [2, 3, 4, 5], + "d": [6, 8, 9, 1], + } + ).set_index(["a", "b"]) + + actual = gdf.groupby(level=["a", "b"]).pct_change(periods) + expected = gdf.to_pandas().groupby(level=["a", "b"]).pct_change(periods) + + assert_eq(expected, actual) + + +def test_groupby_pct_change_empty_columns(): + gdf = cudf.DataFrame(columns=["id", "val1", "val2"]) + pdf = gdf.to_pandas() + + actual = gdf.groupby("id").pct_change() + expected = pdf.groupby("id").pct_change() + + assert_eq(expected, actual) From dd7e955a0d972b61e295cfdbb6a192f146c27300 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 18 Jul 2022 14:03:46 -0500 Subject: [PATCH 03/31] Pin `pytorch` to temporarily unblock from `libcupti` errors (#11289) A new version of `pytorch` has been released, `1.12.0`. This version's packages don't statically link to `libcupti`(More explanation on that [here](https://github.com/pytorch/vision/issues/5635)). 
Until that is patched, we are going to run into the `libcupti` not found error: https://github.com/pytorch/pytorch/issues/74473#issuecomment-1187705802 Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/11289 --- conda/environments/cudf_dev_cuda11.5.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index 35c140cc2e1..ee945e73279 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -77,6 +77,7 @@ dependencies: - botocore>=1.24.21 - aiobotocore>=2.2.0 - s3fs>=2022.3.0 + - pytorch<1.12.0 - pip: - git+https://github.com/python-streamz/streamz.git@master - pyorc From 2f331b4e817a40c8541807cdc151cdbe2a0fc3d0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Jul 2022 12:10:52 -0700 Subject: [PATCH 04/31] Add dev docs for documentation writing (#11217) This PR adds a page describing how to write documentation for cuDF. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Ashwin Srinath (https://github.com/shwina) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/11217 --- docs/cudf/README.md | 35 +--- .../source/developer_guide/documentation.md | 196 ++++++++++++++++++ docs/cudf/source/developer_guide/index.md | 1 + 3 files changed, 201 insertions(+), 31 deletions(-) create mode 100644 docs/cudf/source/developer_guide/documentation.md diff --git a/docs/cudf/README.md b/docs/cudf/README.md index 6cb2ff6977a..cea6e8f5f37 100644 --- a/docs/cudf/README.md +++ b/docs/cudf/README.md @@ -1,33 +1,6 @@ # Building Documentation -As a prerequisite, a RAPIDS compatible GPU is required to build the docs since the notebooks in the docs execute the code to generate the HTML output. - -## Steps to follow: - -In order to build the docs, we need the conda dev environment from cudf and build cudf from source. See build [instructions](https://github.com/rapidsai/cudf/blob/branch-0.13/CONTRIBUTING.md#setting-up-your-build-environment). - -1. Create a conda env and build cudf from source. The dependencies to build rapids from source are installed in that conda environment, and then rapids is built and installed into the same environment. - -2. Once cudf is built from source, navigate to `../docs/cudf/`. If you have your documentation written and want to turn it into HTML, run makefile: - - -```bash -#be in the same directory as your Makefile -make html -``` -This should run Sphinx in your shell, and outputs to `build/html/index.html`. - - -## View docs web page by opening HTML in browser: - -First navigate to `/build/html/` folder, i.e., `cd build/html` and then run the following command: - -```bash -python -m http.server -``` -Then, navigate a web browser to the IP address or hostname of the host machine at port 8000: - -``` -https://:8000 -``` -Now you can check if your docs edits formatted correctly, and read well. +This directory contains the documentation of cuDF Python. +For more information on how to write, build, and read the documentation, +see +[the developer documentation](https://github.com/rapidsai/cudf/blob/main/docs/cudf/source/developer_guide/documentation.md). 
diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md new file mode 100644 index 00000000000..894185767d5 --- /dev/null +++ b/docs/cudf/source/developer_guide/documentation.md @@ -0,0 +1,196 @@ +# Writing documentation + +cuDF documentation is split into multiple pieces. +All core functionality is documented using inline docstrings. +Additional pages like user or developer guides are written independently. +While docstrings are written using [reStructuredText](https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html) (reST), +the latter are written using [MyST](https://myst-parser.readthedocs.io/en/latest/) +The inline docstrings are organized using a small set of additional reST pages. +The results are all then compiled together using [Sphinx](https://www.sphinx-doc.org/en/master/). +This document discusses each of these components and how to contribute to them. + +## Docstrings + +cuDF docstrings use the [numpy](https://numpydoc.readthedocs.io/en/latest/format.html) style. +In lieu of a complete explanation, +we include here an example of the format and the commonly used sections: + +``` +class A: + """Brief description of A. + + Longer description of A. + + Parameters + ---------- + x : int + Description of x, the first constructor parameter. + """ + def __init__(self, x: int): + pass + + def foo(self, bar: str): + """Short description of foo. + + Longer description of foo. + + Parameters + ---------- + bar : str + Description of bar. + + Returns + ------- + float + Description of the return value of foo. + + Raises + ------ + ValueError + Explanation of when a ValueError is raised. + In this case, a ValueError is raised if bar is "fail". + + Examples + -------- + The examples section is _strongly_ encouraged. + Where appropriate, it may mimic the examples for the corresponding pandas API. + >>> a = A() + >>> a.foo('baz') + 0.0 + >>> a.foo('fail') + ... + ValueError: Failed! + """ + if bar == "fail": + raise ValueError("Failed!") + return 0.0 +``` + +`numpydoc` supports a number of other sections of docstrings. +Developers should familiarize themselves with them, since many are useful in different scenarios. +Our guidelines include one addition to the standard the `numpydoc` guide. +Class properties, which are not explicitly covered, should be documented in the getter function. +That choice makes `help` more useful as well as enabling docstring inheritance in subclasses. + +All of our docstrings are validated using [`pydocstyle`](http://www.pydocstyle.org/en/stable/). +This ensures that docstring style is consistent and conformant across the codebase. + +## Published documentation + +Documentation is compiled using Sphinx, which pulls docstrings from the code. +Rather than simply listing all APIs, however, we aim to mimic the pandas documentation. +To do so, we organize API docs into specific pages and sections. +These pages are stored in `docs/cudf/source/api_docs`. +For example, all `DataFrame` documentation is contained in `docs/cudf/source/api_docs/dataframe.rst`. +That page contains sections like "Computations / descriptive stats" to make APIs more easily discoverable. + +Within each section, documentation is created using [`autosummary`](https://www.sphinx-doc.org/en/master/usage/extensions/autosummary.html) +This plugin makes it easy to generate pages for each documented API. +To do so, each section of the docs looks like the following: + +``` +Section name +~~~~~~~~~~~~ +.. 
autosummary:: + API1 + API2 + ... +``` + +Each listed will automatically have its docstring rendered into a separate page. +This layout comes from the [Sphinx theme](https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html) that we use. + +````{note} +Under the hood, autosummary generates stub pages that look like this (using `cudf.concat` as an example): + +``` +cudf.concat +=========== + +.. currentmodule:: cudf + +.. autofunction:: concat +``` + +Commands like `autofunction` come from [`autodoc`](https://www.sphinx-doc.org/en/master/usage/extensions/autodoc.html). +This directive will import cudf and pull the docstring from `cudf.concat`. +This approach allows us to do the minimal amount of manual work in organizing our docs, +while still matching the pandas layout as closely as possible. +```` + +When adding a new API, developers simply have to add the API to the appropriate page. +Adding the name of the function to the appropriate autosummary list is sufficient for it to be documented. + +## Comparing to pandas + +cuDF aims to provide a pandas-like experience. +However, for various reasons cuDF APIs may exhibit differences from pandas. +Where such differences exist, they should be documented. +We facilitate such documentation with the `pandas-compat` directive. +The directive should be used inside docstrings like so: + +``` +"""Brief + +Docstring body + +.. pandas-compat:: + **$API_NAME** + + Explanation of differences +``` + +All such API compatibility notes are collected and displayed in the rendered documentation. + +## Writing documentation pages + +In addition to docstrings, our docs also contain a number of more dedicated user guides. +These pages are stored in `docs/cudf/source/user_guide`. +These pages are all written using MyST, a superset of Markdown. +MyST allows developers to write using familiar Markdown syntax, +while also providing the full power of reST where needed. +These pages do not conform to any specific style or set of use cases. +However, if you develop any sufficiently complex new features, +consider whether users would benefit from a more complete demonstration of them. + +## Building documentation + +### Requirements + +The following are required to build the documentation: +- A RAPIDS-compatible GPU. This is necessary because the documentation execute code. +- A working copy of cudf in the same build environment. + We recommend following the [build instructions](https://github.com/rapidsai/cudf/blob/main/CONTRIBUTING.md#setting-up-your-build-environment). +- Sphinx, numpydoc, and MyST-NB. + Assuming you follow the build instructions, these should automatically be installed into your environment. + +### Building and viewing docs + +Once you have a working copy of cudf, building the docs is straightforward: +1. Navigate to `/path/to/cudf/docs/cudf/`. +2. Execute `make html` + +This will run Sphinx in your shell and generate outputs at `build/html/index.html`. +To view the results. +1. Navigate to `build/html` +2. Execute `python -m http.server` + +Then, open a web browser and go to `https://localhost:8000`. +If something else is currently running on port 8000, +`python -m http.server` will automatically find the next available port. +Alternatively, you may specify a port with `python -m http.server $PORT`. + +You may build docs on a remote machine but want to view them locally. +Assuming the other machine's IP address is visible on your local network, +you can view the docs by replacing `localhost` with the IP address of the host machine. 
+Alternatively, you may also forward the port using e.g. +`ssh -N -f -L localhost:$LOCAL_PORT:localhost:$REMOTE_PORT $REMOTE_IP`. +That will make `$REMOTE_IP:$REMOTE_PORT` visible at `localhost:$LOCAL_PORT`. + +## Documenting cuDF internals + +Unlike public APIs, the documentation of internal code (functions, classes, etc) is not linted. +Documenting internals is strongly encouraged, but not enforced in any particular way. +Regarding style, either full numpy-style docstrings or regular `#` comments are acceptable. +The former can be useful for complex or widely used functionality, +while the latter is fine for small one-off functions. diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md index 61f30a45352..d80afcfdcd9 100644 --- a/docs/cudf/source/developer_guide/index.md +++ b/docs/cudf/source/developer_guide/index.md @@ -4,3 +4,4 @@ :maxdepth: 2 library_design +documentation From ae1b581237b0b27d60daf8c59f0b8163081440fa Mon Sep 17 00:00:00 2001 From: Tim <43156029+AtlantaPepsi@users.noreply.github.com> Date: Mon, 18 Jul 2022 16:32:22 -0400 Subject: [PATCH 05/31] Addition & integration of the integer power operator (#11025) Partial fix for #10178 (still need to investigate whether decimal types are also affected). This implements the `INT_POW` binary operator, which can be dispatched for integral types. This uses an [exponentiation-by-squaring](https://en.wikipedia.org/wiki/Exponentiation_by_squaring) algorithm to compute powers. Unlike `POW`, this does not cast the data to floating-point types which can suffer from precision loss when computing powers and casting back to an integral type. The cuDF Python layer has been updated to dispatch integral data to this operator, which fixes the problems seen for specific values of base and exponent (like `3**1 == 2`) noted in #10178. 
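As an illustration of the exponentiation-by-squaring idea described above, here is a minimal host-side C++ sketch. It is not the actual device functor (that is added in `cpp/src/binaryop/compiled/operation.cuh` later in this patch); the function name `int_pow` and the negative-exponent-returns-0 convention shown here simply mirror the behavior described in this commit message.

```cpp
#include <cassert>
#include <cstdint>

// Illustrative sketch only: integer power via exponentiation by squaring.
// Negative exponents yield 0 (no integral result) and y == 0 yields 1,
// matching the conventions described above for INT_POW.
int64_t int_pow(int64_t base, int64_t exp)
{
  if (exp < 0) return 0;
  int64_t result = 1;
  while (exp > 0) {
    if (exp & 1) { result *= base; }  // fold in one odd factor of the base
    exp >>= 1;                        // halve the exponent
    base *= base;                     // square the base each step
  }
  return result;
}

int main()
{
  assert(int_pow(3, 1) == 3);          // no float round-trip, so no 3**1 == 2
  assert(int_pow(-8, 7) == -2097152);  // sign handled by the odd-factor multiply
  assert(int_pow(5, 0) == 1);
  return 0;
}
```

This avoids the float cast entirely, so no precision is lost for large bases or exponents.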
Authors: - Tim (https://github.com/AtlantaPepsi) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) - Jason Lowe (https://github.com/jlowe) - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/11025 --- cpp/CMakeLists.txt | 15 ++-- cpp/include/cudf/binaryop.hpp | 36 +++++---- cpp/src/binaryop/compiled/IntPow.cu | 26 ++++++ cpp/src/binaryop/compiled/binary_ops.cu | 1 + cpp/src/binaryop/compiled/operation.cuh | 28 +++++++ cpp/src/binaryop/compiled/util.cpp | 1 + cpp/tests/binaryop/binop-compiled-test.cpp | 26 ++++++ cpp/tests/table/table_tests.cpp | 5 +- .../main/java/ai/rapids/cudf/BinaryOp.java | 45 +++++------ python/cudf/cudf/_lib/binaryop.pyx | 5 +- python/cudf/cudf/_lib/cpp/binaryop.pxd | 3 +- python/cudf/cudf/core/column/numerical.py | 9 +++ python/cudf/cudf/tests/test_binops.py | 79 +++++++++++++++++++ 13 files changed, 229 insertions(+), 50 deletions(-) create mode 100644 cpp/src/binaryop/compiled/IntPow.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 86bfdc1444b..0903609c1e2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -196,26 +196,26 @@ add_library( src/ast/expression_parser.cpp src/ast/expressions.cpp src/binaryop/binaryop.cpp - src/binaryop/compiled/Add.cu src/binaryop/compiled/ATan2.cu + src/binaryop/compiled/Add.cu src/binaryop/compiled/BitwiseAnd.cu src/binaryop/compiled/BitwiseOr.cu src/binaryop/compiled/BitwiseXor.cu - src/binaryop/compiled/Less.cu - src/binaryop/compiled/Greater.cu - src/binaryop/compiled/LessEqual.cu - src/binaryop/compiled/GreaterEqual.cu src/binaryop/compiled/Div.cu - src/binaryop/compiled/equality_ops.cu src/binaryop/compiled/FloorDiv.cu + src/binaryop/compiled/Greater.cu + src/binaryop/compiled/GreaterEqual.cu + src/binaryop/compiled/IntPow.cu + src/binaryop/compiled/Less.cu + src/binaryop/compiled/LessEqual.cu src/binaryop/compiled/LogBase.cu src/binaryop/compiled/LogicalAnd.cu src/binaryop/compiled/LogicalOr.cu src/binaryop/compiled/Mod.cu src/binaryop/compiled/Mul.cu src/binaryop/compiled/NullEquals.cu - src/binaryop/compiled/NullLogicalOr.cu src/binaryop/compiled/NullLogicalAnd.cu + src/binaryop/compiled/NullLogicalOr.cu src/binaryop/compiled/NullMax.cu src/binaryop/compiled/NullMin.cu src/binaryop/compiled/PMod.cu @@ -227,6 +227,7 @@ add_library( src/binaryop/compiled/Sub.cu src/binaryop/compiled/TrueDiv.cu src/binaryop/compiled/binary_ops.cu + src/binaryop/compiled/equality_ops.cu src/binaryop/compiled/util.cpp src/labeling/label_bins.cu src/bitmask/null_mask.cu diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp index ce335c05f71..c82fd1b52a1 100644 --- a/cpp/include/cudf/binaryop.hpp +++ b/cpp/include/cudf/binaryop.hpp @@ -36,23 +36,25 @@ namespace cudf { * @brief Types of binary operations that can be performed on data. 
*/ enum class binary_operator : int32_t { - ADD, ///< operator + - SUB, ///< operator - - MUL, ///< operator * - DIV, ///< operator / using common type of lhs and rhs - TRUE_DIV, ///< operator / after promoting type to floating point - FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then - ///< flooring the result - MOD, ///< operator % - PMOD, ///< positive modulo operator - ///< If remainder is negative, this returns (remainder + divisor) % divisor - ///< else, it returns (dividend % divisor) - PYMOD, ///< operator % but following Python's sign rules for negatives - POW, ///< lhs ^ rhs - LOG_BASE, ///< logarithm to the base - ATAN2, ///< 2-argument arctangent - SHIFT_LEFT, ///< operator << - SHIFT_RIGHT, ///< operator >> + ADD, ///< operator + + SUB, ///< operator - + MUL, ///< operator * + DIV, ///< operator / using common type of lhs and rhs + TRUE_DIV, ///< operator / after promoting type to floating point + FLOOR_DIV, ///< operator / after promoting to 64 bit floating point and then + ///< flooring the result + MOD, ///< operator % + PMOD, ///< positive modulo operator + ///< If remainder is negative, this returns (remainder + divisor) % divisor + ///< else, it returns (dividend % divisor) + PYMOD, ///< operator % but following Python's sign rules for negatives + POW, ///< lhs ^ rhs + INT_POW, ///< int ^ int, used to avoid floating point precision loss. Returns 0 for negative + ///< exponents. + LOG_BASE, ///< logarithm to the base + ATAN2, ///< 2-argument arctangent + SHIFT_LEFT, ///< operator << + SHIFT_RIGHT, ///< operator >> SHIFT_RIGHT_UNSIGNED, ///< operator >>> (from Java) ///< Logical right shift. Casts to an unsigned value before shifting. BITWISE_AND, ///< operator & diff --git a/cpp/src/binaryop/compiled/IntPow.cu b/cpp/src/binaryop/compiled/IntPow.cu new file mode 100644 index 00000000000..468feaa3fda --- /dev/null +++ b/cpp/src/binaryop/compiled/IntPow.cu @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "binary_ops.cuh" + +namespace cudf::binops::compiled { +template void apply_binary_op(mutable_column_view&, + column_view const&, + column_view const&, + bool is_lhs_scalar, + bool is_rhs_scalar, + rmm::cuda_stream_view); +} diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index f16d1b99219..d91b534dffb 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -325,6 +325,7 @@ case binary_operator::FLOOR_DIV: apply_binary_op(out, case binary_operator::MOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::PYMOD: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::POW: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; +case binary_operator::INT_POW: apply_binary_op(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break; case binary_operator::EQUAL: case binary_operator::NOT_EQUAL: if(out.type().id() != type_id::BOOL8) CUDF_FAIL("Output type of Comparison operator should be bool type"); diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index de9d46b6280..b4a396b3cbf 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -216,6 +216,34 @@ struct Pow { } }; +struct IntPow { + template < + typename TypeLhs, + typename TypeRhs, + std::enable_if_t<(std::is_integral_v and std::is_integral_v)>* = nullptr> + __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> TypeLhs + { + if (y < 0) { + // Integer exponentiation with negative exponent is not possible. + return 0; + } + if (y == 0) { return 1; } + if (x == 0) { return 0; } + TypeLhs extra = 1; + while (y > 1) { + if (y & 1) { + // The exponent is odd, so multiply by one factor of x. + extra *= x; + y -= 1; + } + // The exponent is even, so square x and divide the exponent y by 2. + y /= 2; + x *= x; + } + return x * extra; + } +}; + struct LogBase { template (out_type); case binary_operator::PYMOD: return call(out_type); case binary_operator::POW: return call(out_type); + case binary_operator::INT_POW: return call(out_type); case binary_operator::BITWISE_AND: return call(out_type); case binary_operator::BITWISE_OR: return call(out_type); case binary_operator::BITWISE_XOR: return call(out_type); diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp index 72fbf8c22d1..23ef6123d48 100644 --- a/cpp/tests/binaryop/binop-compiled-test.cpp +++ b/cpp/tests/binaryop/binop-compiled-test.cpp @@ -396,6 +396,32 @@ TYPED_TEST(BinaryOperationCompiledTest_FloatOps, PMod_Vector_Vector) this->template test(cudf::binary_operator::PMOD); } +using IntPow_types = cudf::test::Types, + cudf::test::Types>; +template +struct BinaryOperationCompiledTest_IntPow : public BinaryOperationCompiledTest { +}; +TYPED_TEST_SUITE(BinaryOperationCompiledTest_IntPow, IntPow_types); + +TYPED_TEST(BinaryOperationCompiledTest_IntPow, IntPow_SpecialCases) +{ + // This tests special values for which integer powers are required. Casting + // to double and casting the result back to int results in floating point + // losses, like 3**1 == 2. 
+ using TypeOut = typename TestFixture::TypeOut; + using TypeLhs = typename TestFixture::TypeLhs; + using TypeRhs = typename TestFixture::TypeRhs; + + auto lhs = fixed_width_column_wrapper({3, -3, 8, -8}); + auto rhs = fixed_width_column_wrapper({1, 1, 7, 7}); + auto expected = fixed_width_column_wrapper({3, -3, 2097152, -2097152}); + + auto result = cudf::binary_operation( + lhs, rhs, cudf::binary_operator::INT_POW, data_type(type_to_id())); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view()); +} + // Bit Operations // n t d // n n . n diff --git a/cpp/tests/table/table_tests.cpp b/cpp/tests/table/table_tests.cpp index 8f9cbbb63e3..f8f0fe4325e 100644 --- a/cpp/tests/table/table_tests.cpp +++ b/cpp/tests/table/table_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -100,7 +100,8 @@ TEST_F(TableTest, SelectingOutOfBounds) Table t(std::move(cols)); - EXPECT_THROW(t.select(std::vector{0, 1, 2}), std::out_of_range); + EXPECT_THROW(cudf::table_view selected_tview = t.select(std::vector{0, 1, 2}), + std::out_of_range); } TEST_F(TableTest, SelectingNoColumns) diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOp.java b/java/src/main/java/ai/rapids/cudf/BinaryOp.java index 15b8d32d6da..fe559184878 100644 --- a/java/src/main/java/ai/rapids/cudf/BinaryOp.java +++ b/java/src/main/java/ai/rapids/cudf/BinaryOp.java @@ -31,28 +31,29 @@ public enum BinaryOp { PMOD(7), // pmod PYMOD(8), // mod operator % follow by python's sign rules for negatives POW(9), - LOG_BASE(10), // logarithm to the base - ATAN2(11), // atan2 - SHIFT_LEFT(12), // bitwise shift left (<<) - SHIFT_RIGHT(13), // bitwise shift right (>>) - SHIFT_RIGHT_UNSIGNED(14), // bitwise shift right (>>>) - BITWISE_AND(15), - BITWISE_OR(16), - BITWISE_XOR(17), - LOGICAL_AND(18), - LOGICAL_OR(19), - EQUAL(20), - NOT_EQUAL(21), - LESS(22), - GREATER(23), - LESS_EQUAL(24), // <= - GREATER_EQUAL(25), // >= - NULL_EQUALS(26), // like EQUAL but NULL == NULL is TRUE and NULL == not NULL is FALSE - NULL_MAX(27), // MAX but NULL < not NULL - NULL_MIN(28), // MIN but NULL > not NULL - //NOT IMPLEMENTED YET GENERIC_BINARY(29); - NULL_LOGICAL_AND(30), - NULL_LOGICAL_OR(31); + INT_POW(10), // int ^ int, used to avoid floating point precision loss + LOG_BASE(11), // logarithm to the base + ATAN2(12), // atan2 + SHIFT_LEFT(13), // bitwise shift left (<<) + SHIFT_RIGHT(14), // bitwise shift right (>>) + SHIFT_RIGHT_UNSIGNED(15), // bitwise shift right (>>>) + BITWISE_AND(16), + BITWISE_OR(17), + BITWISE_XOR(18), + LOGICAL_AND(19), + LOGICAL_OR(20), + EQUAL(21), + NOT_EQUAL(22), + LESS(23), + GREATER(24), + LESS_EQUAL(25), // <= + GREATER_EQUAL(26), // >= + NULL_EQUALS(27), // like EQUAL but NULL == NULL is TRUE and NULL == not NULL is FALSE + NULL_MAX(28), // MAX but NULL < not NULL + NULL_MIN(29), // MIN but NULL > not NULL + //NOT IMPLEMENTED YET GENERIC_BINARY(30); + NULL_LOGICAL_AND(31), + NULL_LOGICAL_OR(32); static final EnumSet COMPARISON = EnumSet.of( diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx index b11d31ab368..8728437541d 100644 --- a/python/cudf/cudf/_lib/binaryop.pyx +++ b/python/cudf/cudf/_lib/binaryop.pyx @@ -56,6 +56,9 @@ class BinaryOperation(IntEnum): POW = ( binary_operator.POW ) + INT_POW = ( + binary_operator.INT_POW + ) EQ = ( binary_operator.EQUAL ) @@ -162,7 +165,7 @@ def 
binaryop(lhs, rhs, op, dtype): """ # TODO: Shouldn't have to keep special-casing. We need to define a separate # pipeline for libcudf binops that don't map to Python binops. - if op != "NULL_EQUALS": + if op not in {"INT_POW", "NULL_EQUALS"}: op = op[2:-2] op = BinaryOperation[op.upper()] diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/cpp/binaryop.pxd index c36ab124bf8..f73a9502cd1 100644 --- a/python/cudf/cudf/_lib/cpp/binaryop.pxd +++ b/python/cudf/cudf/_lib/cpp/binaryop.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.string cimport string @@ -20,6 +20,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: MOD "cudf::binary_operator::MOD" PYMOD "cudf::binary_operator::PYMOD" POW "cudf::binary_operator::POW" + INT_POW "cudf::binary_operator::INT_POW" EQUAL "cudf::binary_operator::EQUAL" NOT_EQUAL "cudf::binary_operator::NOT_EQUAL" LESS "cudf::binary_operator::LESS" diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index d30026e8bfa..0529c614393 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -31,6 +31,7 @@ from cudf.api.types import ( is_bool_dtype, is_float_dtype, + is_integer, is_integer_dtype, is_number, ) @@ -219,7 +220,15 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: if is_bool_dtype(self.dtype) or is_bool_dtype(other): out_dtype = "bool" + if ( + op == "__pow__" + and is_integer_dtype(self.dtype) + and (is_integer(other) or is_integer_dtype(other.dtype)) + ): + op = "INT_POW" + lhs, rhs = (other, self) if reflect else (self, other) + return libcudf.binaryop.binaryop(lhs, rhs, op, out_dtype) def nans_to_nulls(self: NumericalColumn) -> NumericalColumn: diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 2397dba7f76..e92e608cc67 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -2981,3 +2981,82 @@ def test_binop_series_with_repeated_index(): expected = psr1 - psr2 got = gsr1 - gsr2 utils.assert_eq(expected, got) + + +def test_binop_integer_power_series_series(): + # GH: #10178 + gs_base = cudf.Series([3, -3, 8, -8]) + gs_exponent = cudf.Series([1, 1, 7, 7]) + ps_base = gs_base.to_pandas() + ps_exponent = gs_exponent.to_pandas() + expected = ps_base**ps_exponent + got = gs_base**gs_exponent + utils.assert_eq(expected, got) + + +def test_binop_integer_power_series_scalar(): + # GH: #10178 + gs_base = cudf.Series([3, -3, 8, -8]) + exponent = cudf.Scalar(1) + ps_base = gs_base.to_pandas() + expected = ps_base**exponent.value + got = gs_base**exponent + utils.assert_eq(expected, got) + + +def test_binop_integer_power_series_int(): + # GH: #10178 + gs_base = cudf.Series([3, -3, 8, -8]) + exponent = 1 + ps_base = gs_base.to_pandas() + expected = ps_base**exponent + got = gs_base**exponent + utils.assert_eq(expected, got) + + +def test_binop_integer_power_scalar_series(): + # GH: #10178 + base = cudf.Scalar(3) + gs_exponent = cudf.Series([1, 1, 7, 7]) + ps_exponent = gs_exponent.to_pandas() + expected = base.value**ps_exponent + got = base**gs_exponent + utils.assert_eq(expected, got) + + +def test_binop_integer_power_scalar_scalar(): + # GH: #10178 + base = cudf.Scalar(3) + exponent = cudf.Scalar(1) + expected = base.value**exponent.value + got = base**exponent + utils.assert_eq(expected, got) + + +def 
test_binop_integer_power_scalar_int(): + # GH: #10178 + base = cudf.Scalar(3) + exponent = 1 + expected = base.value**exponent + got = base**exponent + utils.assert_eq(expected, got) + + +def test_binop_integer_power_int_series(): + # GH: #10178 + base = 3 + gs_exponent = cudf.Series([1, 1, 7, 7]) + ps_exponent = gs_exponent.to_pandas() + expected = base**ps_exponent + got = base**gs_exponent + utils.assert_eq(expected, got) + + +@pytest.mark.xfail(reason="Reverse binops fail for scalar. See GH: #11225.") +def test_binop_integer_power_int_scalar(): + # GH: #10178 + base = 3 + exponent = cudf.Scalar(1) + expected = base**exponent.value + got = base**exponent + utils.assert_eq(expected, got) From b2dd1bf8c4d085dcc4ff44956f6290fcb405b2eb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 18 Jul 2022 19:25:59 -0700 Subject: [PATCH 06/31] Use `cudf::lists::distinct` in Java binding (#11233) Java binding has `dropListDuplicates` and `dropListDuplicatesWithKeysValues` APIs to create lists column having distinct list elements. Previously they have been implemented by calling to `cudf::lists::drop_list_duplicates`, which relies on stably sorting the input lists column to extract unique list elements. This PR makes the following changes: * Modifying the underlying implementation of `dropListDuplicates` to use `cudf::lists::distinct`, which has `O(n)` time complexity, and * Adding a new JNI-dedicated function `cudf::jni::lists_distinct_by_key` so `dropListDuplicatesWithKeysValues` will call it instead of `cudf::lists::drop_list_duplicates`. This function with keys-values pair input is only needed by Spark's `create_map` function so its implementation will be kept inside the JNI layer instead of in libcudf. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/11233 --- java/src/main/native/src/ColumnViewJni.cpp | 67 ++------ java/src/main/native/src/ColumnViewJni.cu | 54 +++++++ java/src/main/native/src/ColumnViewJni.hpp | 17 ++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 151 ++++++++++++------ 4 files changed, 181 insertions(+), 108 deletions(-) diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 1ad80ebe009..934aa11cc97 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -463,9 +462,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicates(JNIEnv JNI_NULL_CHECK(env, column_view, "column is null", 0); try { cudf::jni::auto_set_device(env); - cudf::column_view const *cv = reinterpret_cast(column_view); - cudf::lists_column_view lcv(*cv); - return release_as_jlong(cudf::lists::drop_list_duplicates(lcv)); + auto const input_cv = reinterpret_cast(column_view); + return release_as_jlong(cudf::lists::distinct(cudf::lists_column_view{*input_cv})); } CATCH_STD(env, 0); } @@ -476,59 +474,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicatesWithKey try { cudf::jni::auto_set_device(env); auto const input_cv = reinterpret_cast(keys_vals_handle); - CUDF_EXPECTS(input_cv->offset() == 0, "Input column has non-zero offset."); - CUDF_EXPECTS(input_cv->type().id() == cudf::type_id::LIST, - "Input column is not a lists column."); + JNI_ARG_CHECK(env, input_cv->type().id() == cudf::type_id::LIST, + "Input column is not a lists column.", 0); - // Extract list offsets and a column of 
struct from the input lists column. auto const lists_keys_vals = cudf::lists_column_view(*input_cv); - auto const keys_vals = lists_keys_vals.get_sliced_child(cudf::default_stream_value); - CUDF_EXPECTS(keys_vals.type().id() == cudf::type_id::STRUCT, - "Input column has child that is not a structs column."); - CUDF_EXPECTS(keys_vals.num_children() == 2, - "Input column has child that does not have 2 children."); - - auto const lists_offsets = lists_keys_vals.offsets(); - auto const structs_keys_vals = cudf::structs_column_view(keys_vals); - - // Assemble a lists_column_view from the existing data (offsets + child). - // This will not copy any data, just create a view, for performance reason. - auto const make_lists_view = [&input_cv](auto const &offsets, auto const &child) { - return cudf::lists_column_view( - cudf::column_view(cudf::data_type{input_cv->type()}, input_cv->size(), nullptr, - input_cv->null_mask(), input_cv->null_count(), 0, {offsets, child})); - }; - - // Extract keys and values lists columns from the input lists of structs column. - auto const keys = make_lists_view(lists_offsets, structs_keys_vals.child(0)); - auto const vals = make_lists_view(lists_offsets, structs_keys_vals.child(1)); - - // Apache Spark desires to keep the last duplicate element. - auto [out_keys, out_vals] = - cudf::lists::drop_list_duplicates(keys, vals, cudf::duplicate_keep_option::KEEP_LAST); - - // Release the contents of the outputs. - auto out_keys_content = out_keys->release(); - auto out_vals_content = out_vals->release(); - - // Total number of elements in the child column. - // This should be the same for the out_vals column. - auto const out_child_size = - out_keys_content.children[cudf::lists_column_view::child_column_index]->size(); - - // Assemble a lists column of struct for the final output. 
- auto out_structs_members = std::vector>(); - out_structs_members.emplace_back( - std::move(out_keys_content.children[cudf::lists_column_view::child_column_index])); - out_structs_members.emplace_back( - std::move(out_vals_content.children[cudf::lists_column_view::child_column_index])); - auto &out_offsets = out_keys_content.children[cudf::lists_column_view::offsets_column_index]; - - auto out_structs = - cudf::make_structs_column(out_child_size, std::move(out_structs_members), 0, {}); - return release_as_jlong(cudf::make_lists_column(input_cv->size(), std::move(out_offsets), - std::move(out_structs), input_cv->null_count(), - cudf::copy_bitmask(*input_cv))); + auto const keys_vals = lists_keys_vals.child(); + JNI_ARG_CHECK(env, keys_vals.type().id() == cudf::type_id::STRUCT, + "Input column has child that is not a structs column.", 0); + JNI_ARG_CHECK(env, keys_vals.num_children() == 2, + "Input column has child that does not have 2 children.", 0); + + return release_as_jlong( + cudf::jni::lists_distinct_by_key(lists_keys_vals, cudf::default_stream_value)); } CATCH_STD(env, 0); } diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu index 6b4db39eb34..aa21b508040 100644 --- a/java/src/main/native/src/ColumnViewJni.cu +++ b/java/src/main/native/src/ColumnViewJni.cu @@ -16,8 +16,16 @@ #include #include +#include #include +#include +#include +#include #include +#include +#include +#include +#include #include #include @@ -72,4 +80,50 @@ std::unique_ptr generate_list_offsets(cudf::column_view const &lis return offsets_column; } + +std::unique_ptr lists_distinct_by_key(cudf::lists_column_view const &input, + rmm::cuda_stream_view stream) { + if (input.is_empty()) { + return empty_like(input.parent()); + } + + auto const child = input.get_sliced_child(stream); + + // Genereate labels for the input list elements. + auto labels = rmm::device_uvector(child.size(), stream); + cudf::detail::label_segments(input.offsets_begin(), input.offsets_end(), labels.begin(), + labels.end(), stream); + + // Use `cudf::duplicate_keep_option::KEEP_LAST` so this will produce the desired behavior when + // being called in `create_map` in spark-rapids. + // Other options comparing nulls and NaNs are set as all-equal. + auto out_columns = cudf::detail::stable_distinct( + table_view{{column_view{cudf::device_span{labels}}, + child.child(0), child.child(1)}}, // input table + std::vector{0, 1}, // key columns + cudf::duplicate_keep_option::KEEP_LAST, cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL, stream) + ->release(); + auto const out_labels = out_columns.front()->view(); + + // Assemble a structs column of . + auto out_structs_members = std::vector>(); + out_structs_members.emplace_back(std::move(out_columns[1])); + out_structs_members.emplace_back(std::move(out_columns[2])); + auto out_structs = + cudf::make_structs_column(out_labels.size(), std::move(out_structs_members), 0, {}); + + // Assemble a lists column of structs. 
+ auto out_offsets = make_numeric_column(data_type{type_to_id()}, input.size() + 1, + mask_state::UNALLOCATED, stream); + auto const offsets_begin = out_offsets->mutable_view().template begin(); + auto const labels_begin = out_labels.template begin(); + cudf::detail::labels_to_offsets(labels_begin, labels_begin + out_labels.size(), offsets_begin, + offsets_begin + out_offsets->size(), stream); + + return cudf::make_lists_column(input.size(), std::move(out_offsets), std::move(out_structs), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream), stream); +} + } // namespace cudf::jni diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp index f9ad01d82d7..1ad8923d5b3 100644 --- a/java/src/main/native/src/ColumnViewJni.hpp +++ b/java/src/main/native/src/ColumnViewJni.hpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -52,4 +53,20 @@ std::unique_ptr generate_list_offsets(cudf::column_view const &list_length, rmm::cuda_stream_view stream = cudf::default_stream_value); +/** + * @brief Generates lists column by copying elements that are distinct by key from each input list + * row to the corresponding output row. + * + * The input lists column must be given such that each list element is a struct of + * pair. With such input, a list containing distinct by key elements are defined such that the keys + * of all elements in the list are distinct (i.e., any two keys are always compared unequal). + * + * There will not be any validity check for the input. The caller is responsible to make sure that + * the input lists column has the right structure. + * + * @return A new list columns in which the elements in each list are distinct by key. + */ +std::unique_ptr lists_distinct_by_key(cudf::lists_column_view const &input, + rmm::cuda_stream_view stream); + } // namespace cudf::jni diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index 21ae0d427e2..d9d8044b0ad 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -4129,7 +4129,7 @@ void testExtractAllRecord() { null, null, Arrays.asList("a", "1", "b", "1", "a", "2")); - + ColumnVector resultIdx0 = v.extractAllRecord(pattern, 0); ColumnVector resultIdx1 = v.extractAllRecord(pattern, 1); ColumnVector resultIdx2 = v.extractAllRecord(pattern, 2); @@ -4466,31 +4466,52 @@ void testDropListDuplicatesWithKeysValues() { ); ColumnVector inputStructsKeysVals = ColumnVector.makeStruct(inputChildKeys, inputChildVals); ColumnVector inputOffsets = ColumnVector.fromInts(0, 2, 5, 10, 15, 15); - ColumnVector inputListsKeysVals = inputStructsKeysVals.makeListFromOffsets(5, - inputOffsets); - - ColumnVector expectedChildKeys = ColumnVector.fromBoxedInts( - 1, 2, - 3, 4, 5, - 0, 6, null, - 6, 7, null - ); - ColumnVector expectedChildVals = ColumnVector.fromBoxedInts( - 10, 20, - 30, 40, 50, - 100, 90, 60, - 120, 150, 140 - ); - ColumnVector expectedStructsKeysVals = ColumnVector.makeStruct(expectedChildKeys, - expectedChildVals); - ColumnVector expectedOffsets = ColumnVector.fromInts(0, 2, 5, 8, 11, 11); - ColumnVector expectedListsKeysVals = expectedStructsKeysVals.makeListFromOffsets(5, - expectedOffsets); - - ColumnVector output = inputListsKeysVals.dropListDuplicatesWithKeysValues(); - ColumnVector sortedOutput = output.listSortRows(false, false); + ColumnVector inputListsKeysVals = inputStructsKeysVals.makeListFromOffsets(5, 
inputOffsets) ) { - assertColumnsAreEqual(expectedListsKeysVals, sortedOutput); + // Test full input: + try(ColumnVector expectedChildKeys = ColumnVector.fromBoxedInts( + 1, 2, // list1 + 3, 4, 5, // list2 + 0, 6, null, // list3 + 6, 7, null // list4 + // list5 (empty) + ); + ColumnVector expectedChildVals = ColumnVector.fromBoxedInts( + 10, 20, // list1 + 30, 40, 50, // list2 + 100, 90, 60, // list3 + 120, 150, 140 // list4 + // list5 (empty) + ); + ColumnVector expectedStructsKeysVals = ColumnVector.makeStruct(expectedChildKeys, expectedChildVals); + ColumnVector expectedOffsets = ColumnVector.fromInts(0, 2, 5, 8, 11, 11); + ColumnVector expectedListsKeysVals = expectedStructsKeysVals.makeListFromOffsets(5, expectedOffsets); + + ColumnVector output = inputListsKeysVals.dropListDuplicatesWithKeysValues(); + ColumnVector sortedOutput = output.listSortRows(false, false) + ) { + assertColumnsAreEqual(expectedListsKeysVals, sortedOutput); + } + + // Test sliced input: + try(ColumnVector expectedChildKeys = ColumnVector.fromBoxedInts( + 3, 4, 5, // list1 + 0, 6, null // list2 + ); + ColumnVector expectedChildVals = ColumnVector.fromBoxedInts( + 30, 40, 50, // list1 + 100, 90, 60 // list2 + ); + ColumnVector expectedStructsKeysVals = ColumnVector.makeStruct(expectedChildKeys, expectedChildVals); + ColumnVector expectedOffsets = ColumnVector.fromInts(0, 3, 6); + ColumnVector expectedListsKeysVals = expectedStructsKeysVals.makeListFromOffsets(2, expectedOffsets); + + ColumnVector inputSliced = inputListsKeysVals.subVector(1, 3); + ColumnVector output = inputSliced.dropListDuplicatesWithKeysValues(); + ColumnVector sortedOutput = output.listSortRows(false, false) + ) { + assertColumnsAreEqual(expectedListsKeysVals, sortedOutput); + } } } @@ -4516,35 +4537,59 @@ void testDropListDuplicatesWithKeysValuesNullable() { ColumnVector inputOffsets = ColumnVector.fromInts(0, 2, 2, 5, 10, 15, 15); ColumnVector tmpInputListsKeysVals = inputStructsKeysVals.makeListFromOffsets(6,inputOffsets); ColumnVector templateBitmask = ColumnVector.fromBoxedInts(1, null, 1, 1, 1, null); - ColumnVector inputListsKeysVals = tmpInputListsKeysVals.mergeAndSetValidity(BinaryOp.BITWISE_AND, templateBitmask); - - ColumnVector expectedChildKeys = ColumnVector.fromBoxedInts( - 1, 2, // list1 - // list2 (null) - 3, 4, 5, // list3 - 0, 6, null, // list4 - 6, 7, null // list5 - // list6 (null) - ); - ColumnVector expectedChildVals = ColumnVector.fromBoxedInts( - 10, 20, // list1 - // list2 (null) - 30, 40, 50, // list3 - 100, 90, 60, // list4 - 120, 150, 140 // list5 - // list6 (null) - ); - ColumnVector expectedStructsKeysVals = ColumnVector.makeStruct(expectedChildKeys, - expectedChildVals); - ColumnVector expectedOffsets = ColumnVector.fromInts(0, 2, 2, 5, 8, 11, 11); - ColumnVector tmpExpectedListsKeysVals = expectedStructsKeysVals.makeListFromOffsets(6, - expectedOffsets); - ColumnVector expectedListsKeysVals = tmpExpectedListsKeysVals.mergeAndSetValidity(BinaryOp.BITWISE_AND, templateBitmask); - - ColumnVector output = inputListsKeysVals.dropListDuplicatesWithKeysValues(); - ColumnVector sortedOutput = output.listSortRows(false, false); + ColumnVector inputListsKeysVals = tmpInputListsKeysVals.mergeAndSetValidity(BinaryOp.BITWISE_AND, templateBitmask) ) { - assertColumnsAreEqual(expectedListsKeysVals, sortedOutput); + // Test full input: + try(ColumnVector expectedChildKeys = ColumnVector.fromBoxedInts( + 1, 2, // list1 + // list2 (null) + 3, 4, 5, // list3 + 0, 6, null, // list4 + 6, 7, null // list5 + // list6 (null) + ); + 
ColumnVector expectedChildVals = ColumnVector.fromBoxedInts( + 10, 20, // list1 + // list2 (null) + 30, 40, 50, // list3 + 100, 90, 60, // list4 + 120, 150, 140 // list5 + // list6 (null) + ); + ColumnVector expectedStructsKeysVals = ColumnVector.makeStruct(expectedChildKeys, expectedChildVals); + ColumnVector expectedOffsets = ColumnVector.fromInts(0, 2, 2, 5, 8, 11, 11); + ColumnVector tmpExpectedListsKeysVals = expectedStructsKeysVals.makeListFromOffsets(6, expectedOffsets); + ColumnVector expectedListsKeysVals = tmpExpectedListsKeysVals.mergeAndSetValidity(BinaryOp.BITWISE_AND, templateBitmask); + + ColumnVector output = inputListsKeysVals.dropListDuplicatesWithKeysValues(); + ColumnVector sortedOutput = output.listSortRows(false, false) + ) { + assertColumnsAreEqual(expectedListsKeysVals, sortedOutput); + } + + // Test sliced input: + try(ColumnVector expectedChildKeys = ColumnVector.fromBoxedInts( + // list1 (null) + 3, 4, 5, // list2 + 0, 6, null // list3 + ); + ColumnVector expectedChildVals = ColumnVector.fromBoxedInts( + // list1 (null) + 30, 40, 50, // list2 + 100, 90, 60 // list3 + ); + ColumnVector expectedStructsKeysVals = ColumnVector.makeStruct(expectedChildKeys, expectedChildVals); + ColumnVector expectedOffsets = ColumnVector.fromInts(0, 0, 3, 6); + ColumnVector tmpExpectedListsKeysVals = expectedStructsKeysVals.makeListFromOffsets(3, expectedOffsets); + ColumnVector slicedTemplateBitmask = ColumnVector.fromBoxedInts(null, 1, 1); + ColumnVector expectedListsKeysVals = tmpExpectedListsKeysVals.mergeAndSetValidity(BinaryOp.BITWISE_AND, slicedTemplateBitmask); + + ColumnVector inputSliced = inputListsKeysVals.subVector(1, 4); + ColumnVector output = inputSliced.dropListDuplicatesWithKeysValues(); + ColumnVector sortedOutput = output.listSortRows(false, false) + ) { + assertColumnsAreEqual(expectedListsKeysVals, sortedOutput); + } } } From 9f573016959754e3272b3b9b0f09583d0a5529a3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 19 Jul 2022 12:13:49 -0700 Subject: [PATCH 07/31] Remove legacy join APIs (#11274) Resolves #7762 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) - Mike Wilson (https://github.com/hyperbolic2346) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/11274 --- cpp/benchmarks/join/join.cu | 18 +- cpp/benchmarks/join/join_common.hpp | 11 +- cpp/benchmarks/join/left_join.cu | 44 +- cpp/include/cudf/join.hpp | 250 -------- cpp/src/join/join.cu | 174 ------ cpp/src/join/semi_join.cu | 122 ---- cpp/tests/join/join_tests.cpp | 162 ++++-- cpp/tests/join/semi_anti_join_tests.cpp | 78 ++- java/src/main/java/ai/rapids/cudf/Table.java | 170 ------ java/src/main/native/src/TableJni.cpp | 245 -------- .../test/java/ai/rapids/cudf/TableTest.java | 550 ------------------ 11 files changed, 214 insertions(+), 1610 deletions(-) diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index f21356aff02..b42cda7f24c 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -44,12 +44,10 @@ void nvbench_inner_join(nvbench::state& state, auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { - cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.inner_join(right_input.select(right_on), std::nullopt, 
stream); + cudf::hash_join hj_obj(left_input, compare_nulls, stream); + return hj_obj.inner_join(right_input, std::nullopt, stream); }; BM_join(state, join); @@ -66,12 +64,10 @@ void nvbench_left_join(nvbench::state& state, auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { - cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.left_join(right_input.select(right_on), std::nullopt, stream); + cudf::hash_join hj_obj(left_input, compare_nulls, stream); + return hj_obj.left_join(right_input, std::nullopt, stream); }; BM_join(state, join); @@ -88,12 +84,10 @@ void nvbench_full_join(nvbench::state& state, auto join = [](cudf::table_view const& left_input, cudf::table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) { - cudf::hash_join hj_obj(left_input.select(left_on), compare_nulls, stream); - return hj_obj.full_join(right_input.select(right_on), std::nullopt, stream); + cudf::hash_join hj_obj(left_input, compare_nulls, stream); + return hj_obj.full_join(right_input, std::nullopt, stream); }; BM_join(state, join); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 7d80b42529e..6762b9c1f34 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -143,17 +143,16 @@ static void BM_join(state_type& state, Join JoinFunc) for (auto _ : state) { cuda_event_timer raii(state, true, cudf::default_stream_value); - auto result = JoinFunc( - probe_table, build_table, columns_to_join, columns_to_join, cudf::null_equality::UNEQUAL); + auto result = JoinFunc(probe_table.select(columns_to_join), + build_table.select(columns_to_join), + cudf::null_equality::UNEQUAL); } } if constexpr (std::is_same_v and (not is_conditional)) { state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = JoinFunc(probe_table, - build_table, - columns_to_join, - columns_to_join, + auto result = JoinFunc(probe_table.select(columns_to_join), + build_table.select(columns_to_join), cudf::null_equality::UNEQUAL, stream_view); }); diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu index 58a1c2d7f29..5c1e5483ad4 100644 --- a/cpp/benchmarks/join/left_join.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -20,18 +20,16 @@ template class Join : public cudf::benchmark { }; -#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - std::vector const& left_on, \ - std::vector const& right_on, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_anti_join(left, right, left_on, right_on, compare_nulls); \ - }; \ - BM_join(st, join); \ +#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_anti_join(left, right, compare_nulls); \ + }; \ + BM_join(st, join); \ } 
LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, int32_t, false); @@ -39,18 +37,16 @@ LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, int64_t, false); LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, int32_t, true); LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, int64_t, true); -#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ - BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ - (::benchmark::State & st) \ - { \ - auto join = [](cudf::table_view const& left, \ - cudf::table_view const& right, \ - std::vector const& left_on, \ - std::vector const& right_on, \ - cudf::null_equality compare_nulls) { \ - return cudf::left_semi_join(left, right, left_on, right_on, compare_nulls); \ - }; \ - BM_join(st, join); \ +#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \ + BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type) \ + (::benchmark::State & st) \ + { \ + auto join = [](cudf::table_view const& left, \ + cudf::table_view const& right, \ + cudf::null_equality compare_nulls) { \ + return cudf::left_semi_join(left, right, compare_nulls); \ + }; \ + BM_join(st, join); \ } LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, int32_t, false); diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index d4d6e44509f..bc3bfef3a7d 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -87,51 +87,6 @@ inner_join(cudf::table_view const& left_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Performs an inner join on the specified columns of two - * tables (`left`, `right`) - * - * Inner Join returns rows from both tables as long as the values - * in the columns being joined on match. - * - * @code{.pseudo} - * Left: {{0, 1, 2}} - * Right: {{4, 9, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {1} - * Result: {{1, 2}, {4, 9}, {1, 2}} - * @endcode - * - * @throw cudf::logic_error if number of elements in `left_on` or `right_on` - * mismatch. - * @throw cudf::logic_error if number of columns in either `left` or `right` - * table is 0 or exceeds MAX_JOIN_SIZE - * @throw std::out_of_range if element of `left_on` or `right_on` exceed the - * number of columns in the left or right table. - * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` will be compared against the column - * from `right` indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` will be compared against the column - * from `left` indicated by `left_on[i]`. - * @param[in] compare_nulls controls whether null join-key values - * should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. 
- */ -std::unique_ptr inner_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns a pair of row index vectors corresponding to a * left join between the specified tables. @@ -172,59 +127,6 @@ left_join(cudf::table_view const& left_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Performs a left join (also known as left outer join) on the - * specified columns of two tables (`left`, `right`) - * - * Left join returns all the rows from the left table and those rows from the - * right table that match on the joined columns. - * For rows from the right table that do not have a match, the corresponding - * values in the left columns will be null. - * - * @code{.pseudo} - * Left: {{0, 1, 2}} - * Right: {{1, 2, 3}, {1, 2 ,5}} - * left_on: {0} - * right_on: {1} - * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} } - * - * Left: {{0, 1, 2}} - * Right {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {0} - * Result: { {0, 1, 2}, {NULL, 1, 2}, {NULL, 1, 2} } - * @endcode - * - * @throw cudf::logic_error if number of elements in `left_on` or `right_on` - * mismatch. - * @throw cudf::logic_error if number of columns in either `left` or `right` - * table is 0 or exceeds MAX_JOIN_SIZE - * @throw std::out_of_range if element of `left_on` or `right_on` exceed the - * number of columns in the left or right table. - * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` will be compared against the column - * from `right` indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` will be compared against the column - * from `left` indicated by `left_on[i]`. - * @param[in] compare_nulls controls whether null join-key values - * should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. - */ -std::unique_ptr left_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns a pair of row index vectors corresponding to a * full join between the specified tables. @@ -264,59 +166,6 @@ full_join(cudf::table_view const& left_keys, null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Performs a full join (also known as full outer join) on the - * specified columns of two tables (`left`, `right`) - * - * Full Join returns the rows that would be returned by a left join and those - * rows from the right table that do not have a match. - * For rows from the right table that do not have a match, the corresponding - * values in the left columns will be null. 
- * - * @code{.pseudo} - * Left: {{0, 1, 2}} - * Right: {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {1} - * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} } - * - * Left: {{0, 1, 2}} - * Right: {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {0} - * Result: { {0, 1, 2, NULL}, {NULL, 1, 2, 3}, {NULL, 1, 2, 5} } - * @endcode - * - * @throw cudf::logic_error if number of elements in `left_on` or `right_on` - * mismatch. - * @throw cudf::logic_error if number of columns in either `left` or `right` - * table is 0 or exceeds MAX_JOIN_SIZE - * @throw std::out_of_range if element of `left_on` or `right_on` exceed the - * number of columns in the left or right table. - * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` will be compared against the column - * from `right` indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` will be compared against the column - * from `left` indicated by `left_on[i]`. - * @param[in] compare_nulls controls whether null join-key values - * should match or not. - * @param mr Device memory resource used to allocate the returned table and columns' device memory - * - * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. - */ -std::unique_ptr full_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns a vector of row indices corresponding to a left semi join * between the specified tables. @@ -349,54 +198,6 @@ std::unique_ptr> left_semi_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Performs a left semi join on the specified columns of two - * tables (`left`, `right`) - * - * A left semi join only returns data from the left table, and only - * returns rows that exist in the right table. - * - * @code{.pseudo} - * TableA: {{0, 1, 2}} - * TableB: {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {1} - * Result: { {1, 2} } - * - * TableA {{0, 1, 2}, {1, 2, 5}} - * TableB {{1, 2, 3}} - * left_on: {0} - * right_on: {0} - * Result: { {1, 2}, {2, 5} } - * @endcode - * - * @throw cudf::logic_error if the number of columns in either `left_keys` or `right_keys` is 0 - * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` - * will be compared against the column from `right` - * indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` - * will be compared against the column from `left` - * indicated by `left_on[i]`. - * @param[in] compare_nulls Controls whether null join-key values should match or not - * @param[in] mr Device memory resource used to allocate the returned table's - * device memory - * - * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. 
- */ -std::unique_ptr left_semi_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Returns a vector of row indices corresponding to a left anti join * between the specified tables. @@ -428,57 +229,6 @@ std::unique_ptr> left_anti_join( null_equality compare_nulls = null_equality::EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Performs a left anti join on the specified columns of two - * tables (`left`, `right`) - * - * A left anti join only returns data from the left table, and only - * returns rows that do not exist in the right table. - * - * @code{.pseudo} - * TableA: {{0, 1, 2}} - * TableB: {{1, 2, 3}, {1, 2, 5}} - * left_on: {0} - * right_on: {1} - * Result: {{0}} - * - * TableA: {{0, 1, 2}, {1, 2, 5}} - * TableB: {{1, 2, 3}} - * left_on: {0} - * right_on: {0} - * Result: { {0}, {1} } - * @endcode - * - * @throw cudf::logic_error if number of elements in `left_on` or `right_on` - * mismatch. - * @throw cudf::logic_error if number of columns in either `left` or `right` - * table is 0 or exceeds MAX_JOIN_SIZE - * - * @param[in] left The left table - * @param[in] right The right table - * @param[in] left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` - * will be compared against the column from `right` - * indicated by `right_on[i]`. - * @param[in] right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` - * will be compared against the column from `left` - * indicated by `left_on[i]`. - * @param[in] compare_nulls Controls whether null join-key values should match or not - * @param[in] mr Device memory resource used to allocate the returned table's - * device memory - * - * @return Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. - */ -std::unique_ptr left_anti_join( - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls = null_equality::EQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - /** * @brief Performs a cross join on two tables (`left`, `right`) * diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index 829ff914dfd..bb8fc07c2d7 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -26,26 +26,6 @@ namespace cudf { namespace detail { -namespace { -std::pair, std::unique_ptr> get_empty_joined_table( - table_view const& probe, table_view const& build) -{ - std::unique_ptr
<table> empty_probe = empty_like(probe); - std::unique_ptr<table>
empty_build = empty_like(build); - return std::pair(std::move(empty_probe), std::move(empty_build)); -} - -std::unique_ptr<table> combine_table_pair(std::unique_ptr<table>&& left, - std::unique_ptr<table>&& right) -{ - auto joined_cols = left->release(); - auto right_cols = right->release(); - joined_cols.insert(joined_cols.end(), - std::make_move_iterator(right_cols.begin()), - std::make_move_iterator(right_cols.end())); - return std::make_unique<table>(std::move(joined_cols)); -} -} // namespace std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<rmm::device_uvector<size_type>>> @@ -79,42 +59,6 @@ inner_join(table_view const& left_input, } } -std::unique_ptr<table>
inner_join(table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Make sure any dictionary columns have matched key sets. - // This will return any new dictionary columns created as well as updated table_views. - auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, - stream, - rmm::mr::get_current_device_resource()); // temporary objects returned - - // now rebuild the table views with the updated ones - auto const left = scatter_columns(matched.second.front(), left_on, left_input); - auto const right = scatter_columns(matched.second.back(), right_on, right_input); - - auto const [left_join_indices, right_join_indices] = cudf::detail::inner_join( - left.select(left_on), right.select(right_on), compare_nulls, stream, mr); - std::unique_ptr
<table> left_result = detail::gather(left, - left_join_indices->begin(), - left_join_indices->end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - std::unique_ptr<table>
right_result = detail::gather(right, - right_join_indices->begin(), - right_join_indices->end(), - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - return combine_table_pair(std::move(left_result), std::move(right_result)); -} - std::pair>, std::unique_ptr>> left_join(table_view const& left_input, @@ -137,48 +81,6 @@ left_join(table_view const& left_input, return hj_obj.left_join(left, std::nullopt, stream, mr); } -std::unique_ptr
left_join(table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Make sure any dictionary columns have matched key sets. - // This will return any new dictionary columns created as well as updated table_views. - auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, // these should match - stream, - rmm::mr::get_current_device_resource()); // temporary objects returned - // now rebuild the table views with the updated ones - table_view const left = scatter_columns(matched.second.front(), left_on, left_input); - table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - - if ((left_on.empty() or right_on.empty()) or - cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::LEFT_JOIN)) { - auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right); - return cudf::detail::combine_table_pair(std::move(left_empty_table), - std::move(right_empty_table)); - } - - auto const [left_join_indices, right_join_indices] = cudf::detail::left_join( - left.select(left_on), right.select(right_on), compare_nulls, stream, mr); - std::unique_ptr
<table> left_result = detail::gather(left, - left_join_indices->begin(), - left_join_indices->end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); - std::unique_ptr<table>
right_result = detail::gather(right, - right_join_indices->begin(), - right_join_indices->end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); - return combine_table_pair(std::move(left_result), std::move(right_result)); -} - std::pair>, std::unique_ptr>> full_join(table_view const& left_input, @@ -201,47 +103,6 @@ full_join(table_view const& left_input, return hj_obj.full_join(left, std::nullopt, stream, mr); } -std::unique_ptr
full_join(table_view const& left_input, - table_view const& right_input, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Make sure any dictionary columns have matched key sets. - // This will return any new dictionary columns created as well as updated table_views. - auto matched = cudf::dictionary::detail::match_dictionaries( - {left_input.select(left_on), right_input.select(right_on)}, // these should match - stream, - rmm::mr::get_current_device_resource()); // temporary objects returned - // now rebuild the table views with the updated ones - table_view const left = scatter_columns(matched.second.front(), left_on, left_input); - table_view const right = scatter_columns(matched.second.back(), right_on, right_input); - - if ((left_on.empty() or right_on.empty()) or - cudf::detail::is_trivial_join(left, right, cudf::detail::join_kind::FULL_JOIN)) { - auto [left_empty_table, right_empty_table] = get_empty_joined_table(left, right); - return cudf::detail::combine_table_pair(std::move(left_empty_table), - std::move(right_empty_table)); - } - - auto const [left_join_indices, right_join_indices] = cudf::detail::full_join( - left.select(left_on), right.select(right_on), compare_nulls, stream, mr); - std::unique_ptr
<table> left_result = detail::gather(left, - left_join_indices->begin(), - left_join_indices->end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); - std::unique_ptr<table>
right_result = detail::gather(right, - right_join_indices->begin(), - right_join_indices->end(), - out_of_bounds_policy::NULLIFY, - stream, - mr); - return combine_table_pair(std::move(left_result), std::move(right_result)); -} } // namespace detail std::pair>, @@ -255,18 +116,6 @@ inner_join(table_view const& left, return detail::inner_join(left, right, compare_nulls, cudf::default_stream_value, mr); } -std::unique_ptr
<table> inner_join(table_view const& left, - table_view const& right, - std::vector<size_type> const& left_on, - std::vector<size_type> const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::inner_join( - left, right, left_on, right_on, compare_nulls, cudf::default_stream_value, mr); -} - std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<rmm::device_uvector<size_type>>> left_join(table_view const& left, @@ -278,18 +127,6 @@ left_join(table_view const& left, return detail::left_join(left, right, compare_nulls, cudf::default_stream_value, mr); } -std::unique_ptr<table>
left_join(table_view const& left, - table_view const& right, - std::vector<size_type> const& left_on, - std::vector<size_type> const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::left_join( - left, right, left_on, right_on, compare_nulls, cudf::default_stream_value, mr); -} - std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<rmm::device_uvector<size_type>>> full_join(table_view const& left, @@ -301,15 +138,4 @@ full_join(table_view const& left, return detail::full_join(left, right, compare_nulls, cudf::default_stream_value, mr); } -std::unique_ptr<table>
full_join(table_view const& left, - table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::full_join( - left, right, left_on, right_on, compare_nulls, cudf::default_stream_value, mr); -} } // namespace cudf diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 5cb58b92fe9..87bac002f53 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -85,112 +85,8 @@ std::unique_ptr> left_semi_anti_join( return gather_map; } -/** - * @brief Performs a left semi or anti join on the specified columns of two - * tables (left, right) - * - * The semi and anti joins only return data from the left table. A left semi join - * returns rows that exist in the right table, a left anti join returns rows - * that do not exist in the right table. - * - * The basic approach is to create a hash table containing the contents of the right - * table and then select only rows that exist (or don't exist) to be included in - * the return set. - * - * @throws cudf::logic_error if number of columns in either `left` or `right` table is 0 - * @throws cudf::logic_error if number of returned columns is 0 - * @throws cudf::logic_error if number of elements in `right_on` and `left_on` are not equal - * - * @param kind Indicates whether to do LEFT_SEMI_JOIN or LEFT_ANTI_JOIN - * @param left The left table - * @param right The right table - * @param left_on The column indices from `left` to join on. - * The column from `left` indicated by `left_on[i]` - * will be compared against the column from `right` - * indicated by `right_on[i]`. - * @param right_on The column indices from `right` to join on. - * The column from `right` indicated by `right_on[i]` - * will be compared against the column from `left` - * indicated by `left_on[i]`. - * @param compare_nulls Controls whether null join-key values should match or not. - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param mr Device memory resource to used to allocate the returned table - * - * @returns Result of joining `left` and `right` tables on the columns - * specified by `left_on` and `right_on`. - */ -std::unique_ptr left_semi_anti_join( - join_kind const kind, - cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) -{ - CUDF_EXPECTS(left_on.size() == right_on.size(), "Mismatch in number of columns to be joined on"); - - if ((left_on.empty() || right_on.empty()) || is_trivial_join(left, right, kind)) { - return empty_like(left); - } - - if ((join_kind::LEFT_ANTI_JOIN == kind) && (0 == right.num_rows())) { - // Everything matches, just copy the proper columns from the left table - return std::make_unique
(left, stream, mr); - } - - // Make sure any dictionary columns have matched key sets. - // This will return any new dictionary columns created as well as updated table_views. - auto matched = cudf::dictionary::detail::match_dictionaries( - {left.select(left_on), right.select(right_on)}, - stream, - rmm::mr::get_current_device_resource()); // temporary objects returned - - auto const left_selected = matched.second.front(); - auto const right_selected = matched.second.back(); - - auto gather_vector = - left_semi_anti_join(kind, left_selected, right_selected, compare_nulls, stream); - - // wrapping the device vector with a column view allows calling the non-iterator - // version of detail::gather, improving compile time by 10% and reducing the - // object file size by 2.2x without affecting performance - auto gather_map = column_view(data_type{type_id::INT32}, - static_cast(gather_vector->size()), - gather_vector->data(), - nullptr, - 0); - - auto const left_updated = scatter_columns(left_selected, left_on, left); - return cudf::detail::gather(left_updated, - gather_map, - out_of_bounds_policy::DONT_CHECK, - negative_index_policy::NOT_ALLOWED, - stream, - mr); -} - } // namespace detail -std::unique_ptr left_semi_join(cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join(detail::join_kind::LEFT_SEMI_JOIN, - left, - right, - left_on, - right_on, - compare_nulls, - cudf::default_stream_value, - mr); -} - std::unique_ptr> left_semi_join( cudf::table_view const& left, cudf::table_view const& right, @@ -202,24 +98,6 @@ std::unique_ptr> left_semi_join( detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::default_stream_value, mr); } -std::unique_ptr left_anti_join(cudf::table_view const& left, - cudf::table_view const& right, - std::vector const& left_on, - std::vector const& right_on, - null_equality compare_nulls, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::left_semi_anti_join(detail::join_kind::LEFT_ANTI_JOIN, - left, - right, - left_on, - right_on, - compare_nulls, - cudf::default_stream_value, - mr); -} - std::unique_ptr> left_anti_join( cudf::table_view const& left, cudf::table_view const& right, diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 9d9d1f4fd10..44e1d586389 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -47,6 +47,82 @@ using Table = cudf::table; constexpr cudf::size_type NoneValue = std::numeric_limits::min(); // TODO: how to test if this isn't public? +// This function is a wrapper around cudf's join APIs that takes the gather map +// from join APIs and materializes the table that would be created by gathering +// from the joined tables. Join APIs originally returned tables like this, but +// they were modified in https://github.com/rapidsai/cudf/pull/7454. This +// helper function allows us to avoid rewriting all our tests in terms of +// gather maps. 
+template >, + std::unique_ptr>> (*join_impl)( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + cudf::null_equality compare_nulls, + rmm::mr::device_memory_resource* mr), + cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK> +std::unique_ptr join_and_gather( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto left_selected = left_input.select(left_on); + auto right_selected = right_input.select(right_on); + auto const [left_join_indices, right_join_indices] = + join_impl(left_selected, right_selected, compare_nulls, mr); + + auto left_indices_span = cudf::device_span{*left_join_indices}; + auto right_indices_span = cudf::device_span{*right_join_indices}; + + auto left_indices_col = cudf::column_view{left_indices_span}; + auto right_indices_col = cudf::column_view{right_indices_span}; + + auto left_result = cudf::gather(left_input, left_indices_col, oob_policy); + auto right_result = cudf::gather(right_input, right_indices_col, oob_policy); + + auto joined_cols = left_result->release(); + auto right_cols = right_result->release(); + joined_cols.insert(joined_cols.end(), + std::make_move_iterator(right_cols.begin()), + std::make_move_iterator(right_cols.end())); + return std::make_unique(std::move(joined_cols)); +} + +std::unique_ptr inner_join( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + return join_and_gather( + left_input, right_input, left_on, right_on, compare_nulls); +} + +std::unique_ptr left_join( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + return join_and_gather( + left_input, right_input, left_on, right_on, compare_nulls); +} + +std::unique_ptr full_join( + cudf::table_view const& full_input, + cudf::table_view const& right_input, + std::vector const& full_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + return join_and_gather( + full_input, right_input, full_on, right_on, compare_nulls); +} + struct JoinTest : public cudf::test::BaseFixture { std::pair, std::unique_ptr> gather_maps_as_tables( cudf::column_view const& expected_left_map, @@ -88,7 +164,7 @@ TEST_F(JoinTest, EmptySentinelRepro) cudf::table_view left({left_first_col, left_second_col, left_third_col}); cudf::table_view right({right_first_col, right_second_col, right_third_col}); - auto result = cudf::inner_join(left, right, {0, 1, 2}, {0, 1, 2}); + auto result = inner_join(left, right, {0, 1, 2}, {0, 1, 2}); EXPECT_EQ(result->num_rows(), 1); } @@ -114,7 +190,7 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0}, {0}); + auto result = left_join(t0, t1, {0}, {0}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -160,7 +236,7 @@ TEST_F(JoinTest, FullJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result = 
full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -209,7 +285,7 @@ TEST_F(JoinTest, FullJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result = full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -261,7 +337,7 @@ TEST_F(JoinTest, FullJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result = full_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -310,7 +386,7 @@ TEST_F(JoinTest, FullJoinOnNulls) // Repeat test with compare_nulls_equal=false, // as per SQL standard. - result = cudf::full_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); + result = full_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -366,7 +442,7 @@ TEST_F(JoinTest, LeftJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result = left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -411,7 +487,7 @@ TEST_F(JoinTest, LeftJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result = left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -475,7 +551,7 @@ TEST_F(JoinTest, LeftJoinWithStructsAndNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {3}, {3}); + auto result = left_join(t0, t1, {3}, {3}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -551,7 +627,7 @@ TEST_F(JoinTest, LeftJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result = left_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -601,7 +677,7 @@ TEST_F(JoinTest, LeftJoinOnNulls) // Repeat test with compare_nulls_equal=false, // as per SQL standard. 
- result = cudf::left_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); + result = left_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -654,7 +730,7 @@ TEST_F(JoinTest, InnerJoinNoNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result = inner_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -699,7 +775,7 @@ TEST_F(JoinTest, InnerJoinWithNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result = inner_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -767,7 +843,7 @@ TEST_F(JoinTest, InnerJoinWithStructsAndNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1, 3}, {0, 1, 3}); + auto result = inner_join(t0, t1, {0, 1, 3}, {0, 1, 3}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -833,7 +909,7 @@ TEST_F(JoinTest, InnerJoinOnNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result = inner_join(t0, t1, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -862,7 +938,7 @@ TEST_F(JoinTest, InnerJoinOnNulls) // Repeat test with compare_nulls_equal=false, // as per SQL standard. 
- result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); + result = inner_join(t0, t1, {0, 1}, {0, 1}, cudf::null_equality::UNEQUAL); result_sort_order = cudf::sorted_order(result->view()); sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -909,7 +985,7 @@ TEST_F(JoinTest, EmptyLeftTableInnerJoin) Table empty0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(empty0, t1, {0, 1}, {0, 1}); + auto result = inner_join(empty0, t1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result); } @@ -930,7 +1006,7 @@ TEST_F(JoinTest, EmptyLeftTableLeftJoin) Table empty0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(empty0, t1, {0, 1}, {0, 1}); + auto result = left_join(empty0, t1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty0, *result); } @@ -951,7 +1027,7 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin) Table lhs(std::move(cols0)); Table rhs(std::move(cols1)); - auto result = cudf::full_join(lhs, rhs, {0, 1}, {0, 1}); + auto result = full_join(lhs, rhs, {0, 1}, {0, 1}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -992,7 +1068,7 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin) Table empty1(std::move(cols1)); { - auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}); + auto result = inner_join(t0, empty1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } @@ -1031,7 +1107,7 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin) Table empty1(std::move(cols1)); { - auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}); + auto result = left_join(t0, empty1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result); } @@ -1070,7 +1146,7 @@ TEST_F(JoinTest, EmptyRightTableFullJoin) Table empty1(std::move(cols1)); { - auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}); + auto result = full_join(t0, empty1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t0, *result); } @@ -1109,7 +1185,7 @@ TEST_F(JoinTest, BothEmptyInnerJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::inner_join(t0, empty1, {0, 1}, {0, 1}); + auto result = inner_join(t0, empty1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } @@ -1130,7 +1206,7 @@ TEST_F(JoinTest, BothEmptyLeftJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::left_join(t0, empty1, {0, 1}, {0, 1}); + auto result = left_join(t0, empty1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } @@ -1151,7 +1227,7 @@ TEST_F(JoinTest, BothEmptyFullJoin) Table t0(std::move(cols0)); Table empty1(std::move(cols1)); - auto result = cudf::full_join(t0, empty1, {0, 1}, {0, 1}); + auto result = full_join(t0, empty1, {0, 1}, {0, 1}); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(empty1, *result); } @@ -1174,7 +1250,7 @@ TEST_F(JoinTest, EqualValuesInnerJoin) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result = inner_join(t0, t1, {0, 1}, {0, 1}); column_wrapper col_gold_0{{0, 0, 0, 0}}; strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); @@ -1209,7 +1285,7 @@ TEST_F(JoinTest, EqualValuesLeftJoin) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result = left_join(t0, t1, {0, 1}, {0, 1}); column_wrapper col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}}; strcol_wrapper col_gold_1({"s0", "s0", "s0", 
"s0"}, {1, 1, 1, 1}); @@ -1243,7 +1319,7 @@ TEST_F(JoinTest, EqualValuesFullJoin) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result = full_join(t0, t1, {0, 1}, {0, 1}); column_wrapper col_gold_0{{0, 0, 0, 0}}; strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}); @@ -1272,7 +1348,7 @@ TEST_F(JoinTest, InnerJoinCornerCase) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::inner_join(t0, t1, {0}, {0}); + auto result = inner_join(t0, t1, {0}, {0}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -1453,7 +1529,7 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto result = cudf::left_join(t0, t1, {0}, {0}); + auto result = left_join(t0, t1, {0}, {0}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); auto decoded4 = cudf::dictionary::decode(result_view.column(4)); @@ -1466,7 +1542,7 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls) auto result_sort_order = cudf::sorted_order(cudf::table_view(result_decoded)); auto sorted_result = cudf::gather(cudf::table_view(result_decoded), *result_sort_order); - auto gold = cudf::left_join(g0, g1, {0}, {0}); + auto gold = left_join(g0, g1, {0}, {0}); auto gold_sort_order = cudf::sorted_order(gold->view()); auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); @@ -1488,7 +1564,7 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls) auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - auto result = cudf::left_join(t0, t1, {0, 1}, {0, 1}); + auto result = left_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded2 = cudf::dictionary::decode(result_view.column(2)); auto decoded5 = cudf::dictionary::decode(result_view.column(5)); @@ -1503,7 +1579,7 @@ TEST_F(JoinDictionaryTest, LeftJoinWithNulls) auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::left_join(g0, g1, {0, 1}, {0, 1}); + auto gold = left_join(g0, g1, {0, 1}, {0, 1}); auto gold_sort_order = cudf::sorted_order(gold->view()); auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); @@ -1525,7 +1601,7 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls) auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - auto result = cudf::inner_join(t0, t1, {0, 1}, {0, 1}); + auto result = inner_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); auto decoded4 = cudf::dictionary::decode(result_view.column(4)); @@ -1540,7 +1616,7 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls) auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + auto gold = inner_join(g0, g1, {0, 1}, {0, 1}); auto gold_sort_order = cudf::sorted_order(gold->view()); auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); @@ -1562,7 +1638,7 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls) auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()}); auto t1 = cudf::table_view({col1_0, col1_1, col1_2->view()}); - auto result = cudf::inner_join(t0, t1, {0, 
1}, {0, 1}); + auto result = inner_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded2 = cudf::dictionary::decode(result_view.column(2)); auto decoded5 = cudf::dictionary::decode(result_view.column(5)); @@ -1577,7 +1653,7 @@ TEST_F(JoinDictionaryTest, InnerJoinWithNulls) auto g0 = cudf::table_view({col0_0, col0_1, col0_2_w}); auto g1 = cudf::table_view({col1_0, col1_1, col1_2_w}); - auto gold = cudf::inner_join(g0, g1, {0, 1}, {0, 1}); + auto gold = inner_join(g0, g1, {0, 1}, {0, 1}); auto gold_sort_order = cudf::sorted_order(gold->view()); auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); @@ -1599,7 +1675,7 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls) auto t0 = cudf::table_view({col0_0, col0_1->view(), col0_2}); auto t1 = cudf::table_view({col1_0, col1_1->view(), col1_2}); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result = full_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded1 = cudf::dictionary::decode(result_view.column(1)); auto decoded4 = cudf::dictionary::decode(result_view.column(4)); @@ -1614,7 +1690,7 @@ TEST_F(JoinDictionaryTest, FullJoinNoNulls) auto g0 = cudf::table_view({col0_0, col0_1_w, col0_2}); auto g1 = cudf::table_view({col1_0, col1_1_w, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + auto gold = full_join(g0, g1, {0, 1}, {0, 1}); auto gold_sort_order = cudf::sorted_order(gold->view()); auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); @@ -1636,7 +1712,7 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) auto t0 = cudf::table_view({col0_0->view(), col0_1, col0_2}); auto t1 = cudf::table_view({col1_0->view(), col1_1, col1_2}); - auto result = cudf::full_join(t0, t1, {0, 1}, {0, 1}); + auto result = full_join(t0, t1, {0, 1}, {0, 1}); auto result_view = result->view(); auto decoded0 = cudf::dictionary::decode(result_view.column(0)); auto decoded3 = cudf::dictionary::decode(result_view.column(3)); @@ -1651,7 +1727,7 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) auto g0 = cudf::table_view({col0_0_w, col0_1, col0_2}); auto g1 = cudf::table_view({col1_0_w, col1_1, col1_2}); - auto gold = cudf::full_join(g0, g1, {0, 1}, {0, 1}); + auto gold = full_join(g0, g1, {0, 1}, {0, 1}); auto gold_sort_order = cudf::sorted_order(gold->view()); auto sorted_gold = cudf::gather(gold->view(), *gold_sort_order); @@ -1707,7 +1783,7 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls) Table t0(std::move(cols0)); Table t1(std::move(cols1)); - auto result = cudf::full_join(t0, t1, {0, 1, 3}, {0, 1, 3}); + auto result = full_join(t0, t1, {0, 1, 3}, {0, 1, 3}); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -1815,7 +1891,7 @@ TEST_F(JoinTest, Repro_StructsWithoutNullsPushedDown) return make_table(dim_structs.release()); }(); - auto const result = cudf::inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); + auto const result = inner_join(fact_table.view(), dimension_table.view(), {0}, {0}); EXPECT_EQ(result->num_rows(), 1); // The null STRUCT rows should match. // Note: Join result might not have nulls pushed down, since it's an output of gather(). 
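
For reference, the pattern that the new join_and_gather test helper above encapsulates can also be written as a standalone function. The following is a minimal sketch assuming only the gather-map join overloads and cudf::gather kept by this patch; the name materialize_inner_join is illustrative and not part of libcudf:

// Sketch: rebuild a table-returning inner join on top of the gather-map API.
#include <cudf/column/column_view.hpp>
#include <cudf/copying.hpp>
#include <cudf/join.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/span.hpp>

#include <iterator>
#include <memory>
#include <vector>

std::unique_ptr<cudf::table> materialize_inner_join(cudf::table_view const& left,
                                                    cudf::table_view const& right,
                                                    std::vector<cudf::size_type> const& left_on,
                                                    std::vector<cudf::size_type> const& right_on)
{
  // The surviving API joins only the key columns and returns two row-index maps.
  auto const [left_map, right_map] =
    cudf::inner_join(left.select(left_on), right.select(right_on), cudf::null_equality::EQUAL);

  // Wrap the device vectors in column_views so they can drive cudf::gather.
  auto const left_indices =
    cudf::column_view{cudf::device_span<cudf::size_type const>{*left_map}};
  auto const right_indices =
    cudf::column_view{cudf::device_span<cudf::size_type const>{*right_map}};

  auto left_rows  = cudf::gather(left, left_indices);
  auto right_rows = cudf::gather(right, right_indices);

  // Concatenate the gathered columns: all left columns followed by all right columns.
  auto cols       = left_rows->release();
  auto right_cols = right_rows->release();
  cols.insert(cols.end(),
              std::make_move_iterator(right_cols.begin()),
              std::make_move_iterator(right_cols.end()));
  return std::make_unique<cudf::table>(std::move(cols));
}
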
diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp index 97af1fd7006..1de70124b60 100644 --- a/cpp/tests/join/semi_anti_join_tests.cpp +++ b/cpp/tests/join/semi_anti_join_tests.cpp @@ -40,6 +40,58 @@ using Table = cudf::table; struct JoinTest : public cudf::test::BaseFixture { }; +namespace { +// This function is a wrapper around cudf's join APIs that takes the gather map +// from join APIs and materializes the table that would be created by gathering +// from the joined tables. Join APIs originally returned tables like this, but +// they were modified in https://github.com/rapidsai/cudf/pull/7454. This +// helper function allows us to avoid rewriting all our tests in terms of +// gather maps. +template > (*join_impl)( + cudf::table_view const& left_keys, + cudf::table_view const& right_keys, + cudf::null_equality compare_nulls, + rmm::mr::device_memory_resource* mr)> +std::unique_ptr join_and_gather( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +{ + auto left_selected = left_input.select(left_on); + auto right_selected = right_input.select(right_on); + auto const join_indices = join_impl(left_selected, right_selected, compare_nulls, mr); + + auto left_indices_span = cudf::device_span{*join_indices}; + auto left_indices_col = cudf::column_view{left_indices_span}; + return cudf::gather(left_input, left_indices_col); +} +} // namespace + +std::unique_ptr left_semi_join( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + return join_and_gather( + left_input, right_input, left_on, right_on, compare_nulls); +} + +std::unique_ptr left_anti_join( + cudf::table_view const& left_input, + cudf::table_view const& right_input, + std::vector const& left_on, + std::vector const& right_on, + cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) +{ + return join_and_gather( + left_input, right_input, left_on, right_on, compare_nulls); +} + TEST_F(JoinTest, TestSimple) { column_wrapper left_col0{0, 1, 2}; @@ -48,7 +100,7 @@ TEST_F(JoinTest, TestSimple) auto left = cudf::table_view{{left_col0}}; auto right = cudf::table_view{{right_col0}}; - auto result = cudf::left_semi_join(left, right); + auto result = left_semi_join(left, right); auto result_cv = cudf::column_view( cudf::data_type{cudf::type_to_id()}, result->size(), result->data()); column_wrapper expected{0, 1}; @@ -104,8 +156,8 @@ TEST_F(JoinTest, SemiJoinWithStructsAndNulls) { auto tables = get_saj_tables({1, 1, 0, 1, 0}, {1, 0, 0, 1, 1}); - auto result = cudf::left_semi_join( - *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL); + auto result = + left_semi_join(*tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -136,7 +188,7 @@ TEST_F(JoinTest, SemiJoinWithStructsAndNullsNotEqual) { auto tables = get_saj_tables({1, 1, 0, 1, 1}, {1, 1, 0, 1, 1}); - auto result = cudf::left_semi_join( + auto result = left_semi_join( *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::UNEQUAL); auto result_sort_order = 
cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -169,8 +221,8 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNulls) { auto tables = get_saj_tables({1, 1, 0, 1, 0}, {1, 0, 0, 1, 1}); - auto result = cudf::left_anti_join( - *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL); + auto result = + left_anti_join(*tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -202,7 +254,7 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNullsNotEqual) { auto tables = get_saj_tables({1, 1, 0, 1, 1}, {1, 1, 0, 1, 1}); - auto result = cudf::left_anti_join( + auto result = left_anti_join( *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::UNEQUAL); auto result_sort_order = cudf::sorted_order(result->view()); auto sorted_result = cudf::gather(result->view(), *result_sort_order); @@ -249,11 +301,9 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNullsOnOneSide) auto left = cudf::table_view{{left_col0}}; auto right = cudf::table_view{{right_col0}}; - auto result = cudf::left_anti_join(left, right, {0}, {0}); - auto expected = [] { - column_wrapper child1{{null}, cudf::test::iterators::null_at(0)}; - column_wrapper child2{12}; - return cudf::test::structs_column_wrapper{{child1, child2}}; - }(); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->get_column(0).view()); + auto result = cudf::left_anti_join(left, right); + auto result_span = cudf::device_span{*result}; + auto result_col = cudf::column_view{result_span}; + auto expected = column_wrapper{1}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result_col); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index db90c09a078..c8f842fcc63 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -549,9 +549,6 @@ private static native long[] orderBy(long inputTable, long[] sortKeys, boolean[] private static native long[] merge(long[] tableHandles, int[] sortKeyIndexes, boolean[] isDescending, boolean[] areNullsSmallest) throws CudfException; - private static native long[] leftJoin(long leftTable, int[] leftJoinCols, long rightTable, - int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; - private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; @@ -562,9 +559,6 @@ private static native long[] leftJoinGatherMaps(long leftKeys, long rightKeys, private static native long[] leftHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, long outputRowCount) throws CudfException; - private static native long[] innerJoin(long leftTable, int[] leftJoinCols, long rightTable, - int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; - private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; @@ -575,9 +569,6 @@ private static native long[] innerJoinGatherMaps(long leftKeys, long rightKeys, private static native long[] innerHashJoinGatherMapsWithCount(long table, long hashJoin, long outputRowCount) throws CudfException; - private static native long[] fullJoin(long leftTable, int[] leftJoinCols, long rightTable, - int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; - private static native long[] fullJoinGatherMaps(long 
leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; @@ -588,15 +579,9 @@ private static native long[] fullJoinGatherMaps(long leftKeys, long rightKeys, private static native long[] fullHashJoinGatherMapsWithCount(long leftTable, long rightHashJoin, long outputRowCount) throws CudfException; - private static native long[] leftSemiJoin(long leftTable, int[] leftJoinCols, long rightTable, - int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; - private static native long[] leftSemiJoinGatherMap(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; - private static native long[] leftAntiJoin(long leftTable, int[] leftJoinCols, long rightTable, - int[] rightJoinCols, boolean compareNullsEqual) throws CudfException; - private static native long[] leftAntiJoinGatherMap(long leftKeys, long rightKeys, boolean compareNullsEqual) throws CudfException; @@ -4119,161 +4104,6 @@ public static final class TableOperation { operation = new Operation(table, indices); } - /** - * Joins two tables on the join columns that are passed in. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).leftJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @param compareNullsEqual - Whether null join-key values should match or not. - * @return the joined table. The order of the columns returned will be join columns, - * left non-join columns, right non-join columns. - */ - public Table leftJoin(TableOperation rightJoinIndices, boolean compareNullsEqual) { - return new Table(Table.leftJoin(operation.table.nativeHandle, operation.indices, - rightJoinIndices.operation.table.nativeHandle, rightJoinIndices.operation.indices, - compareNullsEqual)); - } - - /** - * Joins two tables on the join columns that are passed in. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).leftJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @return the joined table. The order of the columns returned will be join columns, - * left non-join columns, right non-join columns. - */ - public Table leftJoin(TableOperation rightJoinIndices) { - return leftJoin(rightJoinIndices, true); - } - - /** - * Joins two tables on the join columns that are passed in. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).innerJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @param compareNullsEqual - Whether null join-key values should match or not. - * @return the joined table. The order of the columns returned will be join columns, - * left non-join columns, right non-join columns. - */ - public Table innerJoin(TableOperation rightJoinIndices, boolean compareNullsEqual) { - return new Table(Table.innerJoin(operation.table.nativeHandle, operation.indices, - rightJoinIndices.operation.table.nativeHandle, rightJoinIndices.operation.indices, - compareNullsEqual)); - } - - /** - * Joins two tables on the join columns that are passed in. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).innerJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @return the joined table. The order of the columns returned will be join columns, - * left non-join columns, right non-join columns. 
- */ - public Table innerJoin(TableOperation rightJoinIndices) { - return innerJoin(rightJoinIndices, true); - } - - /** - * Joins two tables on the join columns that are passed in. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).fullJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @param compareNullsEqual - Whether null join-key values should match or not. - * @return the joined table. The order of the columns returned will be join columns, - * left non-join columns, right non-join columns. - */ - public Table fullJoin(TableOperation rightJoinIndices, boolean compareNullsEqual) { - return new Table(Table.fullJoin(operation.table.nativeHandle, operation.indices, - rightJoinIndices.operation.table.nativeHandle, rightJoinIndices.operation.indices, - compareNullsEqual)); - } - - /** - * Joins two tables on the join columns that are passed in. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).fullJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @return the joined table. The order of the columns returned will be join columns, - * left non-join columns, right non-join columns. - */ - public Table fullJoin(TableOperation rightJoinIndices) { - return fullJoin(rightJoinIndices, true); - } - - /** - * Performs a semi-join between a left table and a right table, returning only the rows from - * the left table that match rows in the right table on the join keys. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).leftSemiJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @param compareNullsEqual - Whether null join-key values should match or not. - * @return the left semi-joined table. - */ - public Table leftSemiJoin(TableOperation rightJoinIndices, boolean compareNullsEqual) { - return new Table(Table.leftSemiJoin(operation.table.nativeHandle, operation.indices, - rightJoinIndices.operation.table.nativeHandle, rightJoinIndices.operation.indices, - compareNullsEqual)); - } - - /** - * Performs a semi-join between a left table and a right table, returning only the rows from - * the left table that match rows in the right table on the join keys. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).leftSemiJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @return the left semi-joined table. - */ - public Table leftSemiJoin(TableOperation rightJoinIndices) { - return leftSemiJoin(rightJoinIndices, true); - } - - /** - * Performs an anti-join between a left table and a right table, returning only the rows from - * the left table that do not match rows in the right table on the join keys. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).leftAntiJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @param compareNullsEqual - Whether null join-key values should match or not. - * @return the left anti-joined table. 
- */ - public Table leftAntiJoin(TableOperation rightJoinIndices, boolean compareNullsEqual) { - return new Table(Table.leftAntiJoin(operation.table.nativeHandle, operation.indices, - rightJoinIndices.operation.table.nativeHandle, rightJoinIndices.operation.indices, - compareNullsEqual)); - } - - /** - * Performs an anti-join between a left table and a right table, returning only the rows from - * the left table that do not match rows in the right table on the join keys. - * Usage: - * Table t1 ... - * Table t2 ... - * Table result = t1.onColumns(0,1).leftAntiJoin(t2.onColumns(2,3)); - * @param rightJoinIndices - Indices of the right table to join on - * @return the left anti-joined table. - */ - public Table leftAntiJoin(TableOperation rightJoinIndices) { - return leftAntiJoin(rightJoinIndices, true); - } - /** * Hash partition a table into the specified number of partitions. Uses the default MURMUR3 * hashing. diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 4bdd54640d6..471ddef81c2 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -985,51 +985,6 @@ get_mixed_size_info(JNIEnv *env, jlong j_output_row_count, jlong j_matches_view) matches->template data(), matches->size())); } -// Returns a table view containing only the columns at the specified indices -cudf::table_view const get_keys_table(cudf::table_view const *t, - native_jintArray const &key_indices) { - std::vector key_cols; - key_cols.reserve(key_indices.size()); - std::transform(key_indices.begin(), key_indices.end(), std::back_inserter(key_cols), - [t](int idx) { return t->column(idx); }); - return table_view(key_cols); -} - -// Returns a table view containing only the columns that are NOT at the specified indices -cudf::table_view const get_non_keys_table(cudf::table_view const *t, - native_jintArray const &key_indices) { - std::vector non_key_indices; - for (int i = 0; i < t->num_columns(); ++i) { - if (std::find(key_indices.begin(), key_indices.end(), i) == key_indices.end()) { - non_key_indices.push_back(i); - } - } - std::vector cols; - std::transform(non_key_indices.begin(), non_key_indices.end(), std::back_inserter(cols), - [&t](int idx) { return t->column(idx); }); - return table_view(cols); -} - -// Combine left and right join results into a column pointer array that can be returned to the JVM. -jlongArray combine_join_results(JNIEnv *env, std::vector> left_cols, - std::vector> right_cols) { - cudf::jni::native_jlongArray outcol_handles(env, left_cols.size() + right_cols.size()); - auto iter = - std::transform(left_cols.begin(), left_cols.end(), outcol_handles.begin(), - [](std::unique_ptr &col) { return release_as_jlong(col); }); - std::transform(right_cols.begin(), right_cols.end(), iter, - [](std::unique_ptr &col) { return release_as_jlong(col); }); - return outcol_handles.get_jArray(); -} - -// Combine left and right join results into a column pointer array that can be returned to the JVM. 
-jlongArray combine_join_results(JNIEnv *env, cudf::table &left_results, - cudf::table &right_results) { - std::vector> left_cols = left_results.release(); - std::vector> right_cols = right_results.release(); - return combine_join_results(env, std::move(left_cols), std::move(right_cols)); -} - cudf::column_view remove_validity_from_col(cudf::column_view column_view) { if (!cudf::is_compound(column_view.type())) { if (column_view.nullable() && column_view.null_count() == 0) { @@ -2025,206 +1980,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jc CATCH_STD(env, ) } -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoin( - JNIEnv *env, jclass, jlong j_left_table, jintArray j_left_key_indices, jlong j_right_table, - jintArray j_right_key_indices, jboolean compare_nulls_equal) { - JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL); - JNI_NULL_CHECK(env, j_left_key_indices, "left_col_join_indices is null", NULL); - JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL); - JNI_NULL_CHECK(env, j_right_key_indices, "right_col_join_indices is null", NULL); - - try { - cudf::jni::auto_set_device(env); - auto left_in_table = reinterpret_cast(j_left_table); - auto right_in_table = reinterpret_cast(j_right_table); - cudf::jni::native_jintArray left_key_indices(env, j_left_key_indices); - auto left_keys_table = cudf::jni::get_keys_table(left_in_table, left_key_indices); - left_key_indices.cancel(); - cudf::jni::native_jintArray right_key_indices(env, j_right_key_indices); - auto right_keys_table = cudf::jni::get_keys_table(right_in_table, right_key_indices); - auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - - // compute gather maps for the left and right tables that can produce the join result rows - auto join_maps = cudf::left_join(left_keys_table, right_keys_table, nulleq); - CUDF_EXPECTS(join_maps.first->size() <= std::numeric_limits::max(), - "join result exceeds maximum column length"); - auto num_join_rows = static_cast(join_maps.first->size()); - - // compute the join result rows for the left table columns - auto left_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows, - join_maps.first->data()); - auto left_out_table = - cudf::gather(*left_in_table, left_gather_col, cudf::out_of_bounds_policy::DONT_CHECK); - - // compute the join result rows for the right table columns - auto right_non_keys_table = cudf::jni::get_non_keys_table(right_in_table, right_key_indices); - right_key_indices.cancel(); - auto right_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows, - join_maps.second->data()); - auto right_out_table = - cudf::gather(right_non_keys_table, right_gather_col, cudf::out_of_bounds_policy::NULLIFY); - - return cudf::jni::combine_join_results(env, *left_out_table, *right_out_table); - } - CATCH_STD(env, NULL); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoin( - JNIEnv *env, jclass, jlong j_left_table, jintArray j_left_key_indices, jlong j_right_table, - jintArray j_right_key_indices, jboolean compare_nulls_equal) { - JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL); - JNI_NULL_CHECK(env, j_left_key_indices, "left_col_join_indices is null", NULL); - JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL); - JNI_NULL_CHECK(env, j_right_key_indices, "right_col_join_indices is null", NULL); - - try { - cudf::jni::auto_set_device(env); - auto left_in_table = 
reinterpret_cast(j_left_table); - auto right_in_table = reinterpret_cast(j_right_table); - cudf::jni::native_jintArray left_key_indices(env, j_left_key_indices); - auto left_keys_table = cudf::jni::get_keys_table(left_in_table, left_key_indices); - left_key_indices.cancel(); - cudf::jni::native_jintArray right_key_indices(env, j_right_key_indices); - auto right_keys_table = cudf::jni::get_keys_table(right_in_table, right_key_indices); - auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - - // compute gather maps for the left and right tables that can produce the join result rows - auto join_maps = cudf::inner_join(left_keys_table, right_keys_table, nulleq); - CUDF_EXPECTS(join_maps.first->size() <= std::numeric_limits::max(), - "join result exceeds maximum column length"); - auto num_join_rows = static_cast(join_maps.first->size()); - - // compute the join result rows for the left table columns - auto left_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows, - join_maps.first->data()); - auto left_out_table = - cudf::gather(*left_in_table, left_gather_col, cudf::out_of_bounds_policy::DONT_CHECK); - - // compute the join result rows for the right table columns - auto right_non_keys_table = cudf::jni::get_non_keys_table(right_in_table, right_key_indices); - right_key_indices.cancel(); - auto right_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows, - join_maps.second->data()); - auto right_out_table = cudf::gather(right_non_keys_table, right_gather_col, - cudf::out_of_bounds_policy::DONT_CHECK); - - return cudf::jni::combine_join_results(env, *left_out_table, *right_out_table); - } - CATCH_STD(env, NULL); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoin( - JNIEnv *env, jclass, jlong j_left_table, jintArray j_left_key_indices, jlong j_right_table, - jintArray j_right_key_indices, jboolean compare_nulls_equal) { - JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL); - JNI_NULL_CHECK(env, j_left_key_indices, "left_col_join_indices is null", NULL); - JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL); - JNI_NULL_CHECK(env, j_right_key_indices, "right_col_join_indices is null", NULL); - - try { - cudf::jni::auto_set_device(env); - auto left_in_table = reinterpret_cast(j_left_table); - auto right_in_table = reinterpret_cast(j_right_table); - cudf::jni::native_jintArray left_key_indices(env, j_left_key_indices); - auto left_keys_table = cudf::jni::get_keys_table(left_in_table, left_key_indices); - cudf::jni::native_jintArray right_key_indices(env, j_right_key_indices); - auto right_keys_table = cudf::jni::get_keys_table(right_in_table, right_key_indices); - auto nulleq = compare_nulls_equal ? 
cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL; - - // compute gather maps for the left and right tables that can produce the join result rows - auto join_maps = cudf::full_join(left_keys_table, right_keys_table, nulleq); - CUDF_EXPECTS(join_maps.first->size() <= std::numeric_limits::max(), - "join result exceeds maximum column length"); - auto num_join_rows = static_cast(join_maps.first->size()); - - // compute the join result rows for the left table columns - auto left_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows, - join_maps.first->data()); - auto left_out_table = - cudf::gather(*left_in_table, left_gather_col, cudf::out_of_bounds_policy::NULLIFY); - // Replace any nulls in the left key column results with the right key column results. - std::vector> result_cols = left_out_table->release(); - auto right_gather_col = cudf::column_view(cudf::data_type{cudf::type_id::INT32}, num_join_rows, - join_maps.second->data()); - for (int i = 0; i < left_key_indices.size(); ++i) { - std::unique_ptr &colptr = result_cols[left_key_indices[i]]; - auto right_key_col = right_in_table->column(right_key_indices[i]); - auto gathered = cudf::gather(cudf::table_view{{right_key_col}}, right_gather_col, - cudf::out_of_bounds_policy::NULLIFY); - auto replaced_col = cudf::replace_nulls(*colptr, gathered->get_column(0)); - colptr.reset(replaced_col.release()); - } - left_key_indices.cancel(); - - // compute the join result rows for the right table columns - auto right_non_keys_table = cudf::jni::get_non_keys_table(right_in_table, right_key_indices); - right_key_indices.cancel(); - auto right_out_table = - cudf::gather(right_non_keys_table, right_gather_col, cudf::out_of_bounds_policy::NULLIFY); - - return cudf::jni::combine_join_results(env, std::move(result_cols), right_out_table->release()); - } - CATCH_STD(env, NULL); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoin( - JNIEnv *env, jclass, jlong left_table, jintArray left_col_join_indices, jlong right_table, - jintArray right_col_join_indices, jboolean compare_nulls_equal) { - JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); - JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); - JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); - JNI_NULL_CHECK(env, right_col_join_indices, "right_col_join_indices is null", NULL); - - try { - cudf::jni::auto_set_device(env); - cudf::table_view *n_left_table = reinterpret_cast(left_table); - cudf::table_view *n_right_table = reinterpret_cast(right_table); - cudf::jni::native_jintArray left_join_cols_arr(env, left_col_join_indices); - std::vector left_join_cols( - left_join_cols_arr.data(), left_join_cols_arr.data() + left_join_cols_arr.size()); - cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices); - std::vector right_join_cols( - right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size()); - - std::unique_ptr result = - cudf::left_semi_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, - static_cast(compare_nulls_equal) ? 
cudf::null_equality::EQUAL : - cudf::null_equality::UNEQUAL); - - return convert_table_for_return(env, result); - } - CATCH_STD(env, NULL); -} - -JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoin( - JNIEnv *env, jclass, jlong left_table, jintArray left_col_join_indices, jlong right_table, - jintArray right_col_join_indices, jboolean compare_nulls_equal) { - JNI_NULL_CHECK(env, left_table, "left_table is null", NULL); - JNI_NULL_CHECK(env, left_col_join_indices, "left_col_join_indices is null", NULL); - JNI_NULL_CHECK(env, right_table, "right_table is null", NULL); - JNI_NULL_CHECK(env, right_col_join_indices, "right_col_join_indices is null", NULL); - - try { - cudf::jni::auto_set_device(env); - cudf::table_view *n_left_table = reinterpret_cast(left_table); - cudf::table_view *n_right_table = reinterpret_cast(right_table); - cudf::jni::native_jintArray left_join_cols_arr(env, left_col_join_indices); - std::vector left_join_cols( - left_join_cols_arr.data(), left_join_cols_arr.data() + left_join_cols_arr.size()); - cudf::jni::native_jintArray right_join_cols_arr(env, right_col_join_indices); - std::vector right_join_cols( - right_join_cols_arr.data(), right_join_cols_arr.data() + right_join_cols_arr.size()); - - std::unique_ptr result = - cudf::left_anti_join(*n_left_table, *n_right_table, left_join_cols, right_join_cols, - static_cast(compare_nulls_equal) ? cudf::null_equality::EQUAL : - cudf::null_equality::UNEQUAL); - - return convert_table_for_return(env, result); - } - CATCH_STD(env, NULL); -} - JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps( JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) { return cudf::jni::join_gather_maps( diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index fbaead1e429..7ef47d6a7cc 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -793,556 +793,6 @@ void testReadORCTimeUnit() { } } - @Test - void testLeftJoinWithNulls() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build(); - Table expected = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) // common - .column( 100, 101, 102, 103, 104, 105, 106, 107, 108, 109) // left - .column(null, null, 203, null, null, null, null, 201, 202, 204) // right - .build(); - Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testLeftJoinOnNullKeys() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, null, null, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - - Table rightTable = new Table.TestBuilder() - .column(null, null, 9, 8, 10, 32) - .column( 201, 202, 203, 204, 205, 206) - .build()) { - - try (Table expectedResults = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, null, null, null, null, 8) // common - .column( 100, 101, 102, 103, 104, 105, 106, 107, 107, 108, 108, 109) // left - .column(null, null, 203, null, null, null, null, 201, 202, 201, 202, 204) // right - .build(); - - Table joinedTable = 
leftTable.onColumns(0).leftJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expectedResults, orderedJoinedTable); - } - - try (Table expectedResults = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, null, null, 8) // common - .column( 100, 101, 102, 103, 104, 105, 106, 107, 108, 109) // left - .column(null, null, 203, null, null, null, null, null, null, 204) // right - .build(); - - Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expectedResults, orderedJoinedTable); - } - } - } - - @Test - void testLeftJoin() { - try (Table leftTable = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) - .column( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) - .build(); - Table rightTable = new Table.TestBuilder() - .column(306, 301, 360, 109, 335, 254, 317, 361, 251, 326) - .column( 20, 21, 22, 23, 24, 25, 26, 27, 28, 29) - .build(); - Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); - Table expected = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common - .column( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) // left - .column( 22, 29, 25, 20, 23, 27, 28, 24, 21, 26) // right - .build()) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testLeftJoinLeftEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder() - .column(emptyInts) - .column(emptyInts) - .build(); - Table rightTable = new Table.TestBuilder() - .column(306, 301, 360, 109, 335, 254, 317, 361, 251, 326) - .column( 20, 21, 22, 23, 24, 25, 26, 27, 28, 29) - .build(); - Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true); - Table expected = new Table.TestBuilder() - .column(emptyInts) // common - .column(emptyInts) // left - .column(emptyInts) // right - .build()) { - assertTablesAreEqual(expected, joinedTable); - } - } - - @Test - void testLeftJoinRightEmpty() { - final Integer[] emptyInts = new Integer[0]; - final Integer[] nullInts = new Integer[10]; - Arrays.fill(nullInts, null); - try (Table leftTable = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) - .column( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) - .build(); - Table rightTable = new Table.TestBuilder() - .column(emptyInts) - .column(emptyInts) - .build(); - Table joinedTable = leftTable.onColumns(0).leftJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); - Table expected = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common - .column( 10, 11, 12, 13, 14, 15, 16, 17, 18, 19) // left - .column(nullInts) // right - .build()) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testFullJoinWithNonCommonKeys() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(200, 201, 202, 203, 204, 205) - .build(); - Table expected = new Table.TestBuilder() - .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 32) // common - .column( 103, 104, 100, 101, 
106, 108, 107, 105, 109, 102, null, null) // left - .column(null, null, null, null, null, 201, 200, null, 203, 202, 204, 205) // right - .build(); - Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testFullJoinLeftEmpty() { - final Integer[] emptyInts = new Integer[0]; - final Integer[] nullInts = new Integer[6]; - try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(200, 201, 202, 203, 204, 205) - .build(); - Table expected = new Table.TestBuilder() - .column( 5, 6, 8, 9, 10, 32) // common - .column(nullInts) // left - .column( 201, 200, 203, 202, 204, 205) // right - .build(); - Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testFullJoinRightEmpty() { - final Integer[] emptyInts = new Integer[0]; - final Integer[] nullInts = new Integer[10]; - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table expected = new Table.TestBuilder() - .column( 0, 1, 2, 3, 4, 5, 6, 7, 8, 9) // common - .column( 103, 104, 100, 101, 106, 108, 107, 105, 109, 102) // left - .column(nullInts) // right - .build(); - Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testFullJoinOnNullKeys() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, null, 0, 1, 7, 4, null, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column(null, 5, null, 8, 10, 32) - .column( 200, 201, 202, 203, 204, 205) - .build()) { - - // First, test that null-key rows match, with compareNullsEqual=true. - try (Table expectedResults = new Table.TestBuilder() - .column(null, null, null, null, 0, 1, 2, 3, 4, 5, 7, 8, 10, 32) // common - .column( 102, 102, 107, 107, 103, 104, 100, 101, 106, 108, 105, 109, null, null) // left - .column( 200, 202, 200, 202, null, null, null, null, null, 201, null, 203, 204, 205) // right - .build(); - Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true), OrderByArg.asc(1, true))) { - assertTablesAreEqual(expectedResults, orderedJoinedTable); - } - - // Next, test that null-key rows do not match, with compareNullsEqual=false. 
- try (Table expectedResults = new Table.TestBuilder() - .column(null, null, null, null, 0, 1, 2, 3, 4, 5, 7, 8, 10, 32) // common - .column(null, null, 102, 107, 103, 104, 100, 101, 106, 108, 105, 109, null, null) // left - .column( 200, 202, null, null, null, null, null, null, null, 201, null, 203, 204, 205) // right - .build(); - Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy( - OrderByArg.asc(0, true), OrderByArg.asc(1, true), OrderByArg.asc(2, true))) { - assertTablesAreEqual(expectedResults, orderedJoinedTable); - } - } - } - - @Test - void testFullJoinWithOnlyCommonKeys() { - try (Table leftTable = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column(306, 301, 360, 109, 335, 254, 317, 361, 251, 326) - .column(200, 201, 202, 203, 204, 205, 206, 207, 208, 209) - .build(); - Table joinedTable = leftTable.onColumns(0).fullJoin(rightTable.onColumns(new int[]{0}), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); - Table expected = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) // left - .column(202, 209, 205, 200, 203, 207, 208, 204, 201, 206) // right - .build()) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testInnerJoinWithNonCommonKeys() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(200, 201, 202, 203, 204, 205) - .build(); - Table expected = new Table.TestBuilder() - .column( 9, 6, 5, 8) // common - .column(102, 107, 108, 109) // left - .column(202, 200, 201, 203) // right - .build(); - Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testInnerJoinLeftEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table expected = new Table.TestBuilder() - .column(emptyInts).column(emptyInts).column(emptyInts).build(); - Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), true)) { - assertTablesAreEqual(expected, joinedTable); - } - } - - @Test - void testInnerJoinRightEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table expected = new Table.TestBuilder() - .column(emptyInts).column(emptyInts).column(emptyInts).build(); - Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), true)) { - assertTablesAreEqual(expected, joinedTable); - } - } - - @Test - void testInnerJoinOnNullKeys() { - try (Table leftTable = new Table.TestBuilder() - 
.column( 2, 3, 9, 0, 1, 7, 4, 6, null, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, null, 9, 8, 10, 32) - .column(200, 201, 202, 203, 204, 205) - .build()) { - - // First, test that null-key rows match, with compareNullsEqual=true. - try (Table expected = new Table.TestBuilder() - .column( 9, 6, null, 8) // common - .column(102, 107, 108, 109) // left - .column(202, 200, 201, 203) // right - .build(); - Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - - // Next, test that null-key rows do not match, with compareNullsEqual=false. - try (Table expected = new Table.TestBuilder() - .column( 9, 6, 8) // common - .column(102, 107, 109) // left - .column(202, 200, 203) // right - .build(); - Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))){ - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - } - - @Test - void testInnerJoinWithOnlyCommonKeys() { - try (Table leftTable = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column(306, 301, 360, 109, 335, 254, 317, 361, 251, 326) - .column(200, 201, 202, 203, 204, 205, 206, 207, 208, 209) - .build(); - Table joinedTable = leftTable.onColumns(0).innerJoin(rightTable.onColumns(new int[]{0}), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true)); - Table expected = new Table.TestBuilder() - .column(360, 326, 254, 306, 109, 361, 251, 335, 301, 317) // common - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) // left - .column(202, 209, 205, 200, 203, 207, 208, 204, 201, 206) // right - .build()) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testLeftSemiJoin() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build(); - Table expected = new Table.TestBuilder() - .column( 9, 6, 5, 8) - .column(102, 107, 108, 109) - .build(); - Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testLeftSemiJoinLeftEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build(); - Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), true)) { - assertTablesAreEqual(leftTable, joinedTable); - } - } - - @Test - void testLeftSemiJoinRightEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new 
Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), true)) { - assertTablesAreEqual(rightTable, joinedTable); - } - } - - @Test - void testLeftSemiJoinWithNulls() { - try (Table leftTable = new Table.TestBuilder() - .column( 360, 326, null, 306, null, 254, 251, 361, 301, 317) - .column( 10, 11, null, 13, 14, null, 16, 17, 18, 19) - .column("20", "29", "22", "23", "24", "25", "26", "27", "28", "29") - .build(); - Table rightTable = new Table.TestBuilder() - .column( 306, 301, 360, 109, 335, 254, 317, 361, 251, 326) - .column("20", "21", "22", "23", "24", "25", "26", "27", "28", "29") - .build(); - Table joinedTable = leftTable.onColumns(0, 2).leftSemiJoin(rightTable.onColumns(0, 1), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(0, true)); - Table expected = new Table.TestBuilder() - .column(254, 326, 361) - .column(null, 11, 17) - .column("25", "29", "27") - .build()) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testLeftSemiJoinOnNullKeys() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, null, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, null, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build()) { - - // First, test that null-key rows match, with compareNullsEqual=true. - try (Table expected = new Table.TestBuilder() - .column( 9, 6, null, 8) - .column(102, 107, 108, 109) - .build(); - Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - - // Next, test that null-key rows do not match, with compareNullsEqual=false. 
- try (Table expected = new Table.TestBuilder() - .column( 9, 6, 8) - .column(102, 107, 109) - .build(); - Table joinedTable = leftTable.onColumns(0).leftSemiJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - } - - @Test - void testLeftAntiJoin() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build(); - Table expected = new Table.TestBuilder() - .column( 2, 3, 0, 1, 7, 4) - .column(100, 101, 103, 104, 105, 106) - .build(); - Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - - @Test - void testLeftAntiJoinLeftEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table rightTable = new Table.TestBuilder() - .column( 6, 5, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build(); - Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), true)) { - assertTablesAreEqual(leftTable, joinedTable); - } - } - - @Test - void testLeftAntiJoinRightEmpty() { - final Integer[] emptyInts = new Integer[0]; - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, 5, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder().column(emptyInts).column(emptyInts).build(); - Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), true)) { - assertTablesAreEqual(leftTable, joinedTable); - } - } - - @Test - void testLeftAntiJoinOnNullKeys() { - try (Table leftTable = new Table.TestBuilder() - .column( 2, 3, 9, 0, 1, 7, 4, 6, null, 8) - .column(100, 101, 102, 103, 104, 105, 106, 107, 108, 109) - .build(); - Table rightTable = new Table.TestBuilder() - .column( 6, null, 9, 8, 10, 32) - .column(201, 202, 203, 204, 205, 206) - .build()) { - - // First, test that null-key rows match, with compareNullsEqual=true. - try (Table expected = new Table.TestBuilder() - .column( 2, 3, 0, 1, 7, 4) - .column(100, 101, 103, 104, 105, 106) - .build(); - Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0)); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - - // Next, test that null-key rows do not match, with compareNullsEqual=false. 
- try (Table expected = new Table.TestBuilder() - .column( 2, 3, 0, 1, 7, 4, null) - .column(100, 101, 103, 104, 105, 106, 108) - .build(); - Table joinedTable = leftTable.onColumns(0).leftAntiJoin(rightTable.onColumns(0), false); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(1, true))) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - } - - @Test - void testLeftAntiJoinWithNulls() { - try (Table leftTable = new Table.TestBuilder() - .column( 360, 326, null, 306, null, 254, 251, 361, 301, 317) - .column( 10, 11, null, 13, 14, null, 16, 17, 18, 19) - .column("20", "21", "22", "23", "24", "25", "26", "27", "28", "29") - .build(); - Table rightTable = new Table.TestBuilder() - .column( 306, 301, 360, 109, 335, 254, 317, 361, 251, 326) - .column("20", "21", "22", "23", "24", "25", "26", "27", "28", "29") - .build(); - Table joinedTable = leftTable.onColumns(0, 2).leftAntiJoin(rightTable.onColumns(0, 1), true); - Table orderedJoinedTable = joinedTable.orderBy(OrderByArg.asc(2, true)); - Table expected = new Table.TestBuilder() - .column( 360, 326, null, 306, null, 251, 301, 317) - .column( 10, 11, null, 13, 14, 16, 18, 19) - .column("20", "21", "22", "23", "24", "26", "28", "29") - .build()) { - assertTablesAreEqual(expected, orderedJoinedTable); - } - } - @Test void testCrossJoin() { try (Table leftTable = new Table.TestBuilder() From e98feab966f6d1b9eba83323e851c955a1691865 Mon Sep 17 00:00:00 2001 From: etseidl Date: Tue, 19 Jul 2022 12:17:50 -0700 Subject: [PATCH 08/31] Fix decimal128 stats in parquet writer (#11179) Fixes an issue with how decimal128 statistics are written to parquet files. As it stands, the statistics for a decimal128 column will be truncated to 64 bits, in little endian byte order. This patch will write the full 128 bits in big endian order. Testing this will require code from #11178 Authors: - https://github.com/etseidl Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Devavret Makkar (https://github.com/devavret) URL: https://github.com/rapidsai/cudf/pull/11179 --- cpp/src/io/parquet/page_enc.cu | 37 +++++++++++---- cpp/src/io/statistics/statistics.cuh | 1 + .../statistics_type_identification.cuh | 16 ++++--- .../io/statistics/temp_storage_wrapper.cuh | 4 +- .../io/statistics/typed_statistics_chunk.cuh | 10 ++-- cpp/tests/io/parquet_test.cpp | 47 +++++++++++++++++++ 6 files changed, 95 insertions(+), 20 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 277dc4846de..baa1b164c35 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -1274,10 +1274,22 @@ class header_encoder { inline __device__ void set_ptr(uint8_t* ptr) { current_header_ptr = ptr; } }; +// byteswap 128 bit integer, placing result in dst in network byte order. +// dst must point to at least 16 bytes of memory. 
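// Illustrative aside: a minimal host-side sketch of the same big-endian byte reversal that
// byte_reverse128 performs on the device with thrust::copy over reverse iterators. It assumes
// a little-endian host and a compiler that provides __int128_t; to_big_endian_sketch is a
// hypothetical helper name used only for illustration, not code from this patch.
#include <algorithm>
#include <array>
#include <cstdint>
#include <cstring>

inline std::array<uint8_t, 16> to_big_endian_sketch(__int128_t v)
{
  std::array<uint8_t, 16> out{};
  std::memcpy(out.data(), &v, sizeof(v));  // bytes land in little-endian host order
  std::reverse(out.begin(), out.end());    // most significant byte first (network byte order)
  return out;
}
// For example, a positive value such as 0xa1b2c3d4e5f6 comes out as ten 0x00 bytes followed by
// a1 b2 c3 d4 e5 f6, the big-endian layout that parquet statistics consumers expect.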
+static __device__ void byte_reverse128(__int128_t v, void* dst) +{ + auto const v_char_ptr = reinterpret_cast(&v); + auto const d_char_ptr = static_cast(dst); + thrust::copy(thrust::seq, + thrust::make_reverse_iterator(v_char_ptr + sizeof(v)), + thrust::make_reverse_iterator(v_char_ptr), + d_char_ptr); +} + __device__ uint8_t* EncodeStatistics(uint8_t* start, const statistics_chunk* s, uint8_t dtype, - float* fp_scratch) + void* scratch) { uint8_t *end, dtype_len; switch (dtype) { @@ -1309,10 +1321,17 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start, } else { lmin = lmax = dtype_len; if (dtype == dtype_float32) { // Convert from double to float32 - fp_scratch[0] = s->min_value.fp_val; - fp_scratch[1] = s->max_value.fp_val; - vmin = &fp_scratch[0]; - vmax = &fp_scratch[1]; + auto const fp_scratch = static_cast(scratch); + fp_scratch[0] = s->min_value.fp_val; + fp_scratch[1] = s->max_value.fp_val; + vmin = &fp_scratch[0]; + vmax = &fp_scratch[1]; + } else if (dtype == dtype_decimal128) { + auto const d128_scratch = static_cast(scratch); + byte_reverse128(s->min_value.d128_val, d128_scratch); + byte_reverse128(s->max_value.d128_val, &d128_scratch[16]); + vmin = &d128_scratch[0]; + vmax = &d128_scratch[16]; } else { vmin = &s->min_value; vmax = &s->max_value; @@ -1336,7 +1355,7 @@ __global__ void __launch_bounds__(128) __shared__ __align__(8) parquet_column_device_view col_g; __shared__ __align__(8) EncColumnChunk ck_g; __shared__ __align__(8) EncPage page_g; - __shared__ __align__(8) float fp_scratch[2]; + __shared__ __align__(8) unsigned char scratch[32]; uint32_t t = threadIdx.x; @@ -1351,7 +1370,7 @@ __global__ void __launch_bounds__(128) if (chunk_stats && &pages[blockIdx.x] == ck_g.pages) { // Is this the first page in a chunk? hdr_start = (ck_g.is_compressed) ? 
ck_g.compressed_bfr : ck_g.uncompressed_bfr; hdr_end = - EncodeStatistics(hdr_start, &chunk_stats[page_g.chunk_id], col_g.stats_dtype, fp_scratch); + EncodeStatistics(hdr_start, &chunk_stats[page_g.chunk_id], col_g.stats_dtype, scratch); page_g.chunk->ck_stat_size = static_cast(hdr_end - hdr_start); } uncompressed_page_size = page_g.max_data_size; @@ -1392,8 +1411,8 @@ __global__ void __launch_bounds__(128) // Optionally encode page-level statistics if (not page_stats.empty()) { encoder.field_struct_begin(5); - encoder.set_ptr(EncodeStatistics( - encoder.get_ptr(), &page_stats[blockIdx.x], col_g.stats_dtype, fp_scratch)); + encoder.set_ptr( + EncodeStatistics(encoder.get_ptr(), &page_stats[blockIdx.x], col_g.stats_dtype, scratch)); encoder.field_struct_end(5); } encoder.field_struct_end(5); diff --git a/cpp/src/io/statistics/statistics.cuh b/cpp/src/io/statistics/statistics.cuh index bb3c3ee152c..6b199bdbd13 100644 --- a/cpp/src/io/statistics/statistics.cuh +++ b/cpp/src/io/statistics/statistics.cuh @@ -88,6 +88,7 @@ union statistics_val { double fp_val; //!< float columns int64_t i_val; //!< integer columns uint64_t u_val; //!< unsigned integer columns + __int128_t d128_val; //!< decimal128 columns }; struct statistics_chunk { diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 04684351e3b..8b8795778ef 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -123,13 +123,14 @@ class extrema_type { using non_arithmetic_extrema_type = typename std::conditional_t< cudf::is_fixed_point() or cudf::is_duration() or cudf::is_timestamp(), - int64_t, + typename std::conditional_t, __int128_t, int64_t>, typename std::conditional_t, string_view, void>>; // unsigned int/bool -> uint64_t // signed int -> int64_t // float/double -> double // decimal32/64 -> int64_t + // decimal128 -> __int128_t // duration_[T] -> int64_t // string_view -> string_view // timestamp_[T] -> int64_t @@ -177,17 +178,18 @@ class aggregation_type { using arithmetic_aggregation_type = typename std::conditional_t, integral_aggregation_type, double>; - using non_arithmetic_aggregation_type = - typename std::conditional_t() or cudf::is_duration() or - cudf::is_timestamp() // To be disabled with static_assert - or std::is_same_v, - int64_t, - void>; + using non_arithmetic_aggregation_type = typename std::conditional_t< + cudf::is_fixed_point() or cudf::is_duration() or + cudf::is_timestamp() // To be disabled with static_assert + or std::is_same_v, + typename std::conditional_t, __int128_t, int64_t>, + void>; // unsigned int/bool -> uint64_t // signed int -> int64_t // float/double -> double // decimal32/64 -> int64_t + // decimal128 -> __int128_t // duration_[T] -> int64_t // string_view -> int64_t // NOTE : timestamps do not have an aggregation type diff --git a/cpp/src/io/statistics/temp_storage_wrapper.cuh b/cpp/src/io/statistics/temp_storage_wrapper.cuh index 7a36c873ba6..1c9c1d6c6d5 100644 --- a/cpp/src/io/statistics/temp_storage_wrapper.cuh +++ b/cpp/src/io/statistics/temp_storage_wrapper.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -54,6 +54,7 @@ union block_reduce_storage { DECLARE_MEMBER(int16_t) DECLARE_MEMBER(int32_t) DECLARE_MEMBER(int64_t) + DECLARE_MEMBER(__int128_t) DECLARE_MEMBER(uint8_t) DECLARE_MEMBER(uint16_t) DECLARE_MEMBER(uint32_t) @@ -89,6 +90,7 @@ struct storage_wrapper { STORAGE_WRAPPER_GET(int16_t); STORAGE_WRAPPER_GET(int32_t); STORAGE_WRAPPER_GET(int64_t); + STORAGE_WRAPPER_GET(__int128_t); STORAGE_WRAPPER_GET(uint8_t); STORAGE_WRAPPER_GET(uint16_t); STORAGE_WRAPPER_GET(uint32_t); diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index b02f3c42563..5a6a027ed0f 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -63,6 +63,12 @@ class union_member { return val.i_val; } + template + __device__ static std::enable_if_t, type> get(U& val) + { + return val.d128_val; + } + template __device__ static std::enable_if_t, type> get(U& val) { @@ -126,9 +132,7 @@ struct typed_statistics_chunk { minimum_value = thrust::min(minimum_value, union_member::get(chunk.min_value)); maximum_value = thrust::max(maximum_value, union_member::get(chunk.max_value)); } - if (chunk.has_sum) { - aggregate += detail::aggregation_type::convert(union_member::get(chunk.sum)); - } + if (chunk.has_sum) { aggregate += union_member::get(chunk.sum); } non_nulls += chunk.non_nulls; null_count += chunk.null_count; } diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index 376bab7185b..66a4a463ba5 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -229,6 +229,19 @@ bool read_footer(std::unique_ptr& source, return cp.read(file_meta_data); } +// parse the statistics_blob on chunk and return as a Statistics struct. +// throws cudf::logic_error if the chunk statistics_blob is invalid. +cudf_io::parquet::Statistics parse_statistics(const cudf_io::parquet::ColumnChunk& chunk) +{ + auto& stats_blob = chunk.meta_data.statistics_blob; + CUDF_EXPECTS(stats_blob.size() > 0, "Invalid statistics length"); + + cudf_io::parquet::Statistics stats; + cudf_io::parquet::CompactProtocolReader cp(stats_blob.data(), stats_blob.size()); + CUDF_EXPECTS(cp.read(&stats), "Cannot parse column statistics"); + return stats; +} + // Base test fixture for tests struct ParquetWriterTest : public cudf::test::BaseFixture { }; @@ -3402,6 +3415,40 @@ TEST_F(ParquetWriterTest, CheckPageRows) EXPECT_EQ(ph.data_page_header.num_values, page_rows); } +TEST_F(ParquetWriterTest, Decimal128Stats) +{ + // check that decimal128 min and max statistics are written in network byte order + // this is negative, so should be the min + std::vector expected_min{ + 0xa1, 0xb2, 0xc3, 0xd4, 0xe5, 0xf6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + std::vector expected_max{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa1, 0xb2, 0xc3, 0xd4, 0xe5, 0xf6}; + + __int128_t val0 = 0xa1b2c3d4e5f6ULL; + __int128_t val1 = val0 << 80; + column_wrapper col0{{numeric::decimal128(val0, numeric::scale_type{0}), + numeric::decimal128(val1, numeric::scale_type{0})}}; + + std::vector> cols; + cols.push_back(col0.release()); + auto expected = std::make_unique
(std::move(cols)); + + auto filepath = temp_env->get_temp_filepath("Decimal128Stats.parquet"); + cudf_io::parquet_writer_options out_opts = + cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()); + cudf_io::write_parquet(out_opts); + + auto source = cudf_io::datasource::create(filepath); + cudf_io::parquet::FileMetaData fmd; + + CUDF_EXPECTS(read_footer(source, &fmd), "Cannot parse metadata"); + + auto const stats = parse_statistics(fmd.row_groups[0].columns[0]); + + EXPECT_EQ(expected_min, stats.min_value); + EXPECT_EQ(expected_max, stats.max_value); +} + TEST_F(ParquetReaderTest, EmptyColumnsParam) { srand(31337); From b3e524742ee3fc57d1fd057a8b01666114245c22 Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Wed, 20 Jul 2022 10:26:03 -0500 Subject: [PATCH 09/31] Fix invalid allocate_like() and empty_like() tests. (#11268) Fixes: https://github.com/rapidsai/cudf/issues/11247 These tests were using `cudf::test::expect_column_properties_equal()` which explicitly compares null counts. However the tests were using uninitialized memory for the null mask, so it experienced random failures depending on what was returned from the memory manager. The fix is to simply use a custom set of checks specific to these tests rather than the internal cudf functionality. Authors: - https://github.com/nvdbaranec Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) - Mike Wilson (https://github.com/hyperbolic2346) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11268 --- cpp/tests/copying/utility_tests.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/tests/copying/utility_tests.cpp b/cpp/tests/copying/utility_tests.cpp index 67d7beb5f03..60d76758ff7 100644 --- a/cpp/tests/copying/utility_tests.cpp +++ b/cpp/tests/copying/utility_tests.cpp @@ -168,11 +168,12 @@ std::unique_ptr create_table(cudf::size_type size, cudf::mask_state return std::make_unique(std::move(columns)); } -void expect_tables_prop_equal(cudf::table_view lhs, cudf::table_view rhs) +void expect_tables_prop_equal(cudf::table_view const& lhs, cudf::table_view const& rhs) { EXPECT_EQ(lhs.num_columns(), rhs.num_columns()); - for (cudf::size_type index = 0; index < lhs.num_columns(); index++) + for (cudf::size_type index = 0; index < lhs.num_columns(); index++) { CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(lhs.column(index), rhs.column(index)); + } } struct EmptyLikeTableTest : public cudf::test::BaseFixture { @@ -183,7 +184,7 @@ TEST_F(EmptyLikeTableTest, TableTest) cudf::mask_state state = cudf::mask_state::ALL_VALID; cudf::size_type size = 10; auto input = create_table(size, state); - auto expected = create_table(0, cudf::mask_state::UNINITIALIZED); + auto expected = create_table(0, cudf::mask_state::ALL_VALID); auto got = cudf::empty_like(input->view()); expect_tables_prop_equal(got->view(), expected->view()); @@ -192,7 +193,6 @@ TEST_F(EmptyLikeTableTest, TableTest) template struct AllocateLikeTest : public cudf::test::BaseFixture { }; -; TYPED_TEST_SUITE(AllocateLikeTest, numeric_types); @@ -203,7 +203,7 @@ TYPED_TEST(AllocateLikeTest, ColumnNumericTestSameSize) cudf::mask_state state = cudf::mask_state::ALL_VALID; auto input = make_numeric_column(cudf::data_type{cudf::type_to_id()}, size, state); auto expected = make_numeric_column( - cudf::data_type{cudf::type_to_id()}, size, cudf::mask_state::UNINITIALIZED); + 
cudf::data_type{cudf::type_to_id()}, size, cudf::mask_state::ALL_VALID); auto got = cudf::allocate_like(input->view()); CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(*expected, *got); } @@ -215,10 +215,9 @@ TYPED_TEST(AllocateLikeTest, ColumnNumericTestSpecifiedSize) cudf::size_type specified_size = 5; cudf::mask_state state = cudf::mask_state::ALL_VALID; auto input = make_numeric_column(cudf::data_type{cudf::type_to_id()}, size, state); - auto expected = make_numeric_column(cudf::data_type{cudf::type_to_id()}, - specified_size, - cudf::mask_state::UNINITIALIZED); - auto got = cudf::allocate_like(input->view(), specified_size); + auto expected = make_numeric_column( + cudf::data_type{cudf::type_to_id()}, specified_size, cudf::mask_state::ALL_VALID); + auto got = cudf::allocate_like(input->view(), specified_size); CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(*expected, *got); } From 9cc6900616b3093f7b0d5a8886ae07e1ad2e25fa Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 20 Jul 2022 14:53:15 -0400 Subject: [PATCH 10/31] Update parquet reader to take stream parameter (#11294) Closes #10903. This PR makes the parquet reader class take a stream parameter. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Nghia Truong (https://github.com/ttnghia) - Jim Brennan (https://github.com/jbrennan333) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11294 --- cpp/include/cudf/io/detail/parquet.hpp | 8 +- cpp/src/io/functions.cpp | 3 +- cpp/src/io/parquet/reader_impl.cu | 168 ++++++++++++------------- cpp/src/io/parquet/reader_impl.hpp | 38 ++---- 4 files changed, 96 insertions(+), 121 deletions(-) diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index a88dddb8dd0..7675dc70cb2 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -23,7 +23,7 @@ #include #include #include -#include + #include #include @@ -55,10 +55,12 @@ class reader { * * @param sources Input `datasource` objects to read the dataset from * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to use for device memory allocation */ explicit reader(std::vector>&& sources, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -70,12 +72,10 @@ class reader { * @brief Reads the dataset as per given options. * * @param options Settings for controlling reading behavior - * @param stream CUDA stream used for device memory operations and kernel launches. 
* * @return The set of columns along with table metadata */ - table_with_metadata read(parquet_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value); + table_with_metadata read(parquet_reader_options const& options); }; /** diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index dd6ecf3414d..6f702a489a0 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -415,7 +415,8 @@ table_with_metadata read_parquet(parquet_reader_options const& options, CUDF_FUNC_RANGE(); auto datasources = make_datasources(options.get_source()); - auto reader = std::make_unique(std::move(datasources), options, mr); + auto reader = std::make_unique( + std::move(datasources), options, cudf::default_stream_value, mr); return reader->read(options); } diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index e593843dfb5..4d78cf74196 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -250,6 +250,15 @@ std::tuple conversion_info(type_id column_type_id, return std::make_tuple(type_width, clock_rate, converted_type); } +inline void decompress_check(device_span stats, + rmm::cuda_stream_view stream) +{ + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + stats.begin(), + stats.end(), + [] __device__(auto const& stat) { return stat.status == 0; }), + "Error during decompression"); +} } // namespace std::string name_from_path(const std::vector& path_in_schema) @@ -973,8 +982,7 @@ std::future reader::impl::read_column_chunks( size_t begin_chunk, size_t end_chunk, const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map, - rmm::cuda_stream_view stream) + std::vector const& chunk_source_map) { // Transfer chunk data, coalescing adjacent chunks std::vector> read_tasks; @@ -999,15 +1007,15 @@ std::future reader::impl::read_column_chunks( if (io_size != 0) { auto& source = _sources[chunk_source_map[chunk]]; if (source->is_device_read_preferred(io_size)) { - auto buffer = rmm::device_buffer(io_size, stream); + auto buffer = rmm::device_buffer(io_size, _stream); auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), stream); + io_offset, io_size, static_cast(buffer.data()), _stream); read_tasks.emplace_back(std::move(fut_read_size)); page_data[chunk] = datasource::buffer::create(std::move(buffer)); } else { auto const buffer = source->host_read(io_offset, io_size); page_data[chunk] = - datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), _stream)); } auto d_compdata = page_data[chunk]->data(); do { @@ -1029,14 +1037,13 @@ std::future reader::impl::read_column_chunks( /** * @copydoc cudf::io::detail::parquet::count_page_headers */ -size_t reader::impl::count_page_headers(hostdevice_vector& chunks, - rmm::cuda_stream_view stream) +size_t reader::impl::count_page_headers(hostdevice_vector& chunks) { size_t total_pages = 0; - chunks.host_to_device(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); - chunks.device_to_host(stream, true); + chunks.host_to_device(_stream); + gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); + chunks.device_to_host(_stream, true); for (size_t c = 0; c < chunks.size(); c++) { total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; @@ -1049,8 +1056,7 @@ size_t reader::impl::count_page_headers(hostdevice_vector& * @copydoc 
cudf::io::detail::parquet::decode_page_headers */ void reader::impl::decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages, - rmm::cuda_stream_view stream) + hostdevice_vector& pages) { // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), // please update preprocess_nested_columns to reflect this. @@ -1060,27 +1066,16 @@ void reader::impl::decode_page_headers(hostdevice_vector& page_count += chunks[c].max_num_pages; } - chunks.host_to_device(stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream); - pages.device_to_host(stream, true); -} - -void decompress_check(device_span stats, rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - stats.begin(), - stats.end(), - [] __device__(auto const& stat) { return stat.status == 0; }), - "Error during decompression"); + chunks.host_to_device(_stream); + gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); + pages.device_to_host(_stream, true); } /** * @copydoc cudf::io::detail::parquet::decompress_page_data */ rmm::device_buffer reader::impl::decompress_page_data( - hostdevice_vector& chunks, - hostdevice_vector& pages, - rmm::cuda_stream_view stream) + hostdevice_vector& chunks, hostdevice_vector& pages) { auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { @@ -1136,20 +1131,20 @@ rmm::device_buffer reader::impl::decompress_page_data( num_comp_pages++; }); if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), _stream); } } // Dispatch batches of pages to decompress for each codec - rmm::device_buffer decomp_pages(total_decomp_size, stream); + rmm::device_buffer decomp_pages(total_decomp_size, _stream); std::vector> comp_in; comp_in.reserve(num_comp_pages); std::vector> comp_out; comp_out.reserve(num_comp_pages); - rmm::device_uvector comp_stats(num_comp_pages, stream); - thrust::fill(rmm::exec_policy(stream), + rmm::device_uvector comp_stats(num_comp_pages, _stream); + thrust::fill(rmm::exec_policy(_stream), comp_stats.begin(), comp_stats.end(), decompress_status{0, static_cast(-1000), 0}); @@ -1172,16 +1167,16 @@ rmm::device_buffer reader::impl::decompress_page_data( host_span const> comp_in_view{comp_in.data() + start_pos, codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, _stream); host_span const> comp_out_view(comp_out.data() + start_pos, codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); device_span d_comp_stats_view(comp_stats.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case parquet::GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, stream); + gpuinflate(d_comp_in, d_comp_out, d_comp_stats_view, gzip_header_included::YES, _stream); break; case parquet::SNAPPY: if (nvcomp_integration::is_stable_enabled()) { @@ -1191,9 +1186,9 @@ rmm::device_buffer reader::impl::decompress_page_data( d_comp_stats_view, codec.max_decompressed_size, codec.total_decomp_size, - stream); + _stream); } else { - 
gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, stream); + gpu_unsnap(d_comp_in, d_comp_out, d_comp_stats_view, _stream); } break; case parquet::ZSTD: @@ -1203,7 +1198,7 @@ rmm::device_buffer reader::impl::decompress_page_data( d_comp_stats_view, codec.max_decompressed_size, codec.total_decomp_size, - stream); + _stream); break; case parquet::BROTLI: gpu_debrotli(d_comp_in, @@ -1211,18 +1206,18 @@ rmm::device_buffer reader::impl::decompress_page_data( d_comp_stats_view, debrotli_scratch.data(), debrotli_scratch.size(), - stream); + _stream); break; default: CUDF_FAIL("Unexpected decompression dispatch"); break; } start_pos += codec.num_pages; } - decompress_check(comp_stats, stream); + decompress_check(comp_stats, _stream); // Update the page information in device memory with the updated value of // page_data; it now points to the uncompressed data buffer - pages.host_to_device(stream); + pages.host_to_device(_stream); return decomp_pages; } @@ -1232,8 +1227,7 @@ rmm::device_buffer reader::impl::decompress_page_data( */ void reader::impl::allocate_nesting_info(hostdevice_vector const& chunks, hostdevice_vector& pages, - hostdevice_vector& page_nesting_info, - rmm::cuda_stream_view stream) + hostdevice_vector& page_nesting_info) { // compute total # of page_nesting infos needed and allocate space. doing this in one // buffer to keep it to a single gpu allocation @@ -1246,10 +1240,10 @@ void reader::impl::allocate_nesting_info(hostdevice_vector return total + (per_page_nesting_info_size * chunk.num_data_pages); }); - page_nesting_info = hostdevice_vector{total_page_nesting_infos, stream}; + page_nesting_info = hostdevice_vector{total_page_nesting_infos, _stream}; // retrieve from the gpu so we can update - pages.device_to_host(stream, true); + pages.device_to_host(_stream, true); // update pointers in the PageInfos int target_page_index = 0; @@ -1272,7 +1266,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector } // copy back to the gpu - pages.host_to_device(stream); + pages.host_to_device(_stream); // fill in int nesting_info_index = 0; @@ -1342,7 +1336,7 @@ void reader::impl::allocate_nesting_info(hostdevice_vector } // copy nesting info to the device - page_nesting_info.host_to_device(stream); + page_nesting_info.host_to_device(_stream); } /** @@ -1352,8 +1346,7 @@ void reader::impl::preprocess_columns(hostdevice_vector& c hostdevice_vector& pages, size_t min_row, size_t total_rows, - bool has_lists, - rmm::cuda_stream_view stream) + bool has_lists) { // TODO : we should be selectively preprocessing only columns that have // lists in them instead of doing them all if even one contains lists. 
@@ -1365,7 +1358,7 @@ void reader::impl::preprocess_columns(hostdevice_vector& c [&](std::vector& cols) { for (size_t idx = 0; idx < cols.size(); idx++) { auto& col = cols[idx]; - col.create(total_rows, stream, _mr); + col.create(total_rows, _stream, _mr); create_columns(col.children); } }; @@ -1373,8 +1366,8 @@ void reader::impl::preprocess_columns(hostdevice_vector& c } else { // preprocess per-nesting level sizes by page gpu::PreprocessColumnData( - pages, chunks, _input_columns, _output_columns, total_rows, min_row, stream, _mr); - stream.synchronize(); + pages, chunks, _input_columns, _output_columns, total_rows, min_row, _stream, _mr); + _stream.synchronize(); } } @@ -1385,8 +1378,7 @@ void reader::impl::decode_page_data(hostdevice_vector& chu hostdevice_vector& pages, hostdevice_vector& page_nesting, size_t min_row, - size_t total_rows, - rmm::cuda_stream_view stream) + size_t total_rows) { auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; @@ -1403,7 +1395,7 @@ void reader::impl::decode_page_data(hostdevice_vector& chu // Build index for string dictionaries since they can't be indexed // directly due to variable-sized elements auto str_dict_index = cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, stream); + total_str_dict_indexes, _stream); // TODO (dm): hd_vec should have begin and end iterator members size_t sum_max_depths = @@ -1417,8 +1409,8 @@ void reader::impl::decode_page_data(hostdevice_vector& chu // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths, stream); - auto chunk_nested_data = hostdevice_vector(sum_max_depths, stream); + auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); + auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); auto chunk_offsets = std::vector(); // Update chunks with pointers to column data. @@ -1499,18 +1491,18 @@ void reader::impl::decode_page_data(hostdevice_vector& chu page_count += chunks[c].max_num_pages; } - chunks.host_to_device(stream); - chunk_nested_valids.host_to_device(stream); - chunk_nested_data.host_to_device(stream); + chunks.host_to_device(_stream); + chunk_nested_valids.host_to_device(_stream); + chunk_nested_data.host_to_device(_stream); if (total_str_dict_indexes > 0) { - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), stream); + gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); } - gpu::DecodePageData(pages, chunks, total_rows, min_row, stream); - pages.device_to_host(stream); - page_nesting.device_to_host(stream); - stream.synchronize(); + gpu::DecodePageData(pages, chunks, total_rows, min_row, _stream); + pages.device_to_host(_stream); + page_nesting.device_to_host(_stream); + _stream.synchronize(); // for list columns, add the final offset to every offset buffer. // TODO : make this happen in more efficiently. 
Maybe use thrust::for_each @@ -1540,7 +1532,7 @@ void reader::impl::decode_page_data(hostdevice_vector& chu &offset, sizeof(offset), cudaMemcpyHostToDevice, - stream.value()); + _stream.value()); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } } @@ -1568,13 +1560,14 @@ void reader::impl::decode_page_data(hostdevice_vector& chu } } - stream.synchronize(); + _stream.synchronize(); } reader::impl::impl(std::vector>&& sources, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _mr(mr), _sources(std::move(sources)) + : _stream(stream), _mr(mr), _sources(std::move(sources)) { // Open and parse the source dataset metadata _metadata = std::make_unique(_sources); @@ -1597,8 +1590,7 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, - std::vector> const& row_group_list, - rmm::cuda_stream_view stream) + std::vector> const& row_group_list) { // Select only row groups required const auto selected_row_groups = @@ -1614,7 +1606,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Descriptors for all the chunks that make up the selected columns const auto num_input_columns = _input_columns.size(); const auto num_chunks = selected_row_groups.size() * num_input_columns; - hostdevice_vector chunks(0, num_chunks, stream); + hostdevice_vector chunks(0, num_chunks, _stream); // Association between each column chunk and its source std::vector chunk_source_map(num_chunks); @@ -1696,13 +1688,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, } } // Read compressed chunk data to device memory - read_rowgroup_tasks.push_back(read_column_chunks(page_data, - chunks, - io_chunk_idx, - chunks.size(), - column_chunk_offsets, - chunk_source_map, - stream)); + read_rowgroup_tasks.push_back(read_column_chunks( + page_data, chunks, io_chunk_idx, chunks.size(), column_chunk_offsets, chunk_source_map)); remaining_rows -= row_group.num_rows; } @@ -1712,15 +1699,15 @@ table_with_metadata reader::impl::read(size_type skip_rows, assert(remaining_rows <= 0); // Process dataset chunk pages into output columns - const auto total_pages = count_page_headers(chunks, stream); + const auto total_pages = count_page_headers(chunks); if (total_pages > 0) { - hostdevice_vector pages(total_pages, total_pages, stream); + hostdevice_vector pages(total_pages, total_pages, _stream); rmm::device_buffer decomp_page_data; // decoding of column/page information - decode_page_headers(chunks, pages, stream); + decode_page_headers(chunks, pages); if (total_decompressed_size > 0) { - decomp_page_data = decompress_page_data(chunks, pages, stream); + decomp_page_data = decompress_page_data(chunks, pages); // Free compressed data for (size_t c = 0; c < chunks.size(); c++) { if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } @@ -1745,7 +1732,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // nesting information (sizes, etc) stored -per page- // note : even for flat schemas, we allocate 1 level of "nesting" info hostdevice_vector page_nesting_info; - allocate_nesting_info(chunks, pages, page_nesting_info, stream); + allocate_nesting_info(chunks, pages, page_nesting_info); // - compute column sizes and allocate output buffers. 
// important: @@ -1756,15 +1743,15 @@ table_with_metadata reader::impl::read(size_type skip_rows, // // - for nested schemas, output buffer offset values per-page, per nesting-level for the // purposes of decoding. - preprocess_columns(chunks, pages, skip_rows, num_rows, has_lists, stream); + preprocess_columns(chunks, pages, skip_rows, num_rows, has_lists); // decoding of column data itself - decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows, stream); + decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows); // create the final output cudf columns for (size_t i = 0; i < _output_columns.size(); ++i) { column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(make_column(_output_columns[i], &col_name, stream, _mr)); + out_columns.emplace_back(make_column(_output_columns[i], &col_name, _stream, _mr)); } } } @@ -1772,7 +1759,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Create empty columns as needed (this can happen if we've ended up with no actual data to read) for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) { column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, stream, _mr)); + out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, _stream, _mr)); } // Return column names (must match order of returned columns) @@ -1793,8 +1780,9 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Forward to implementation reader::reader(std::vector>&& sources, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sources), options, mr)) + : _impl(std::make_unique(std::move(sources), options, stream, mr)) { } @@ -1802,11 +1790,9 @@ reader::reader(std::vector>&& sources, reader::~reader() = default; // Forward to implementation -table_with_metadata reader::read(parquet_reader_options const& options, - rmm::cuda_stream_view stream) +table_with_metadata reader::read(parquet_reader_options const& options) { - return _impl->read( - options.get_skip_rows(), options.get_num_rows(), options.get_row_groups(), stream); + return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_row_groups()); } } // namespace parquet diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 01fca5a8b50..06380fbe325 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -58,10 +58,12 @@ class reader::impl { * * @param sources Dataset sources * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ explicit impl(std::vector>&& sources, parquet_reader_options const& options, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** @@ -70,14 +72,12 @@ class reader::impl { * @param skip_rows Number of rows to skip from the start * @param num_rows Number of rows to read * @param row_group_indices TODO - * @param stream CUDA stream used for device memory operations and kernel launches. 
* * @return The set of columns along with metadata */ table_with_metadata read(size_type skip_rows, size_type num_rows, - std::vector> const& row_group_indices, - rmm::cuda_stream_view stream); + std::vector> const& row_group_indices); private: /** @@ -88,7 +88,6 @@ class reader::impl { * @param begin_chunk Index of first column chunk to read * @param end_chunk Index after the last column chunk to read * @param column_chunk_offsets File offset for all chunks - * @param stream CUDA stream used for device memory operations and kernel launches. * */ std::future read_column_chunks(std::vector>& page_data, @@ -96,43 +95,36 @@ class reader::impl { size_t begin_chunk, size_t end_chunk, const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map, - rmm::cuda_stream_view stream); + std::vector const& chunk_source_map); /** * @brief Returns the number of total pages from the given column chunks * * @param chunks List of column chunk descriptors - * @param stream CUDA stream used for device memory operations and kernel launches. * * @return The total number of pages */ - size_t count_page_headers(hostdevice_vector& chunks, - rmm::cuda_stream_view stream); + size_t count_page_headers(hostdevice_vector& chunks); /** * @brief Returns the page information from the given column chunks. * * @param chunks List of column chunk descriptors * @param pages List of page information - * @param stream CUDA stream used for device memory operations and kernel launches. */ void decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages, - rmm::cuda_stream_view stream); + hostdevice_vector& pages); /** * @brief Decompresses the page data, at page granularity. * * @param chunks List of column chunk descriptors * @param pages List of page information - * @param stream CUDA stream used for device memory operations and kernel launches. * * @return Device buffer to decompressed page data */ rmm::device_buffer decompress_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - rmm::cuda_stream_view stream); + hostdevice_vector& pages); /** * @brief Allocate nesting information storage for all pages and set pointers @@ -147,12 +139,10 @@ class reader::impl { * @param chunks List of column chunk descriptors * @param pages List of page information * @param page_nesting_info The allocated nesting info structs. - * @param stream CUDA stream used for device memory operations and kernel launches. */ void allocate_nesting_info(hostdevice_vector const& chunks, hostdevice_vector& pages, - hostdevice_vector& page_nesting_info, - rmm::cuda_stream_view stream); + hostdevice_vector& page_nesting_info); /** * @brief Preprocess column information for nested schemas. @@ -170,14 +160,12 @@ class reader::impl { * @param[in] total_rows Maximum number of rows to read * @param[in] has_lists Whether or not this data contains lists and requires * a preprocess. - * @param[in] stream Cuda stream */ void preprocess_columns(hostdevice_vector& chunks, hostdevice_vector& pages, size_t min_row, size_t total_rows, - bool has_lists, - rmm::cuda_stream_view stream); + bool has_lists); /** * @brief Converts the page data and outputs to columns. @@ -187,17 +175,17 @@ class reader::impl { * @param page_nesting Page nesting array * @param min_row Minimum number of rows from start * @param total_rows Number of rows to output - * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ void decode_page_data(hostdevice_vector& chunks, hostdevice_vector& pages, hostdevice_vector& page_nesting, size_t min_row, - size_t total_rows, - rmm::cuda_stream_view stream); + size_t total_rows); private: + rmm::cuda_stream_view _stream; rmm::mr::device_memory_resource* _mr = nullptr; + std::vector> _sources; std::unique_ptr _metadata; From edc5062bdcc3e12755603b0ad07a4d271fe95261 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 20 Jul 2022 16:30:53 -0700 Subject: [PATCH 11/31] Add cupy version to setup.py install_requires (#11306) This resolves #8104. Note that that issue also requests an update to requirements.txt files, but those no longer exist. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - https://github.com/brandon-b-miller - H. Thomson Comer (https://github.com/thomcom) URL: https://github.com/rapidsai/cudf/pull/11306 --- python/cudf/setup.py | 4 +++- python/dask_cudf/setup.py | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/python/cudf/setup.py b/python/cudf/setup.py index 5526056c77b..2ca132e37cb 100644 --- a/python/cudf/setup.py +++ b/python/cudf/setup.py @@ -79,7 +79,9 @@ def get_cuda_version_from_header(cuda_include_dir, delimeter=""): cuda_include_dir = os.path.join(CUDA_HOME, "include") install_requires.append( - "cupy-cuda" + get_cuda_version_from_header(cuda_include_dir) + "cupy-cuda" + + get_cuda_version_from_header(cuda_include_dir) + + ">=9.5.0,<11.0.0a0" ) diff --git a/python/dask_cudf/setup.py b/python/dask_cudf/setup.py index 2c32e8df377..575683bc5fa 100644 --- a/python/dask_cudf/setup.py +++ b/python/dask_cudf/setup.py @@ -65,10 +65,11 @@ def get_cuda_version_from_header(cuda_include_dir, delimeter=""): raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}") cuda_include_dir = os.path.join(CUDA_HOME, "include") -cupy_package_name = "cupy-cuda" + get_cuda_version_from_header( - cuda_include_dir +install_requires.append( + "cupy-cuda" + + get_cuda_version_from_header(cuda_include_dir) + + ">=9.5.0,<11.0.0a0" ) -install_requires.append(cupy_package_name) setup( From f549ccbb095e5c15debe8825f712058ada151c92 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 21 Jul 2022 08:26:26 -0700 Subject: [PATCH 12/31] Remove unused import in README sample (#11318) Resolves #11316 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/11318 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4ef937e026f..175f5e7efa8 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ cuDF provides a pandas-like API that will be familiar to data engineers & data s For example, the following snippet downloads a CSV, then uses the GPU to parse it into rows and columns and run calculations: ```python -import cudf, io, requests +import cudf, requests from io import StringIO url = "https://github.com/plotly/datasets/raw/master/tips.csv" From 02029b6a73a14ffc95029a41a2d705362a046fb6 Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 21 Jul 2022 12:57:43 -0400 Subject: [PATCH 13/31] removing some unused code (#11305) In my endless wandering through parquet code, I found this unused code. Removing it. 
Authors: - Mike Wilson (https://github.com/hyperbolic2346) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/11305 --- cpp/src/io/parquet/page_enc.cu | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index baa1b164c35..3f22ead3bb1 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -57,7 +57,6 @@ constexpr bool enable_bool_rle = false; using ::cudf::detail::device_2dspan; -constexpr int init_hash_bits = 12; constexpr uint32_t rle_buffer_size = (1 << 9); struct frag_init_state_s { @@ -101,30 +100,6 @@ uint32_t __device__ physical_type_len(Type physical_type, type_id id) } } -/** - * @brief Return a 12-bit hash from a byte sequence - */ -inline __device__ uint32_t hash_string(const string_view& val) -{ - char const* ptr = val.data(); - uint32_t len = val.size_bytes(); - if (len != 0) { - return (ptr[0] + (ptr[len - 1] << 5) + (len << 10)) & ((1 << init_hash_bits) - 1); - } else { - return 0; - } -} - -inline __device__ uint32_t uint32_init_hash(uint32_t v) -{ - return (v + (v >> 11) + (v >> 22)) & ((1 << init_hash_bits) - 1); -} - -inline __device__ uint32_t uint64_init_hash(uint64_t v) -{ - return uint32_init_hash(static_cast(v + (v >> 32))); -} - // blockDim {512,1,1} template __global__ void __launch_bounds__(block_size) From d0b788a50dd78a0d655108bd1d0b14d21ce13623 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Thu, 21 Jul 2022 13:01:00 -0500 Subject: [PATCH 14/31] Add JNI support for the join_strings API (#11309) This just adds in a simple JNI binding for the join_strings cudf function. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Alessandro Bellina (https://github.com/abellina) - Raza Jafri (https://github.com/razajafri) - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/11309 --- .../main/java/ai/rapids/cudf/ColumnView.java | 23 +++++++++++++++++++ java/src/main/native/src/ColumnViewJni.cpp | 19 +++++++++++++++ .../java/ai/rapids/cudf/ColumnVectorTest.java | 11 +++++++++ 3 files changed, 53 insertions(+) diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java index 8f7973485a5..7ae4540bafe 100644 --- a/java/src/main/java/ai/rapids/cudf/ColumnView.java +++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java @@ -1826,6 +1826,27 @@ public final ColumnVector capitalize(Scalar delimiters) { throw new IllegalArgumentException("Both input column and delimiters scalar should be" + " string type. But got column: " + type + ", scalar: " + delimiters.getType()); } + + /** + * Concatenates all strings in the column into one new string delimited + * by an optional separator string. + * + * This returns a column with one string. Any null entries are ignored unless + * the narep parameter specifies a replacement string (not a null value). + * + * @param separator what to insert to separate each row. + * @param narep what to replace nulls with + * @return a ColumnVector with a single string in it. 
+ */ + public final ColumnVector joinStrings(Scalar separator, Scalar narep) { + if (DType.STRING.equals(type) && + DType.STRING.equals(separator.getType()) && + DType.STRING.equals(narep.getType())) { + return new ColumnVector(joinStrings(getNativeView(), separator.getScalarHandle(), + narep.getScalarHandle())); + } + throw new IllegalArgumentException("The column, separator, and narep all need to be STRINGs"); + } ///////////////////////////////////////////////////////////////////////////// // TYPE CAST ///////////////////////////////////////////////////////////////////////////// @@ -4202,6 +4223,8 @@ private static native long clamper(long nativeView, long loScalarHandle, long lo private static native long capitalize(long strsColHandle, long delimitersHandle); + private static native long joinStrings(long strsHandle, long sepHandle, long narepHandle); + private static native long makeStructView(long[] handles, long rowCount); private static native long isTimestamp(long nativeView, String format); diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp index 934aa11cc97..9cf1e74d84d 100644 --- a/java/src/main/native/src/ColumnViewJni.cpp +++ b/java/src/main/native/src/ColumnViewJni.cpp @@ -2025,6 +2025,25 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_capitalize(JNIEnv *env, j CATCH_STD(env, 0); } +JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_joinStrings(JNIEnv *env, jobject j_object, + jlong strs_handle, + jlong separator_handle, + jlong narep_handle) { + + JNI_NULL_CHECK(env, strs_handle, "native view handle is null", 0) + JNI_NULL_CHECK(env, separator_handle, "separator scalar handle is null", 0) + JNI_NULL_CHECK(env, narep_handle, "narep scalar handle is null", 0) + + try { + cudf::jni::auto_set_device(env); + cudf::column_view *view = reinterpret_cast(strs_handle); + cudf::string_scalar *sep = reinterpret_cast(separator_handle); + cudf::string_scalar *narep = reinterpret_cast(narep_handle); + return release_as_jlong(cudf::strings::join_strings(*view, *sep, *narep)); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeStructView(JNIEnv *env, jobject j_object, jlongArray handles, jlong row_count) { diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java index d9d8044b0ad..05abe4958e2 100644 --- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java +++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java @@ -288,6 +288,17 @@ void testRefCountLeak() throws InterruptedException { assertEquals(expectedLeakCount, MemoryCleaner.leakCount.get()); } + @Test + void testJoinStrings() { + try (ColumnVector in = ColumnVector.fromStrings("A", "B", "C", "D", null, "E"); + ColumnVector expected = ColumnVector.fromStrings("A-B-C-D-null-E"); + Scalar sep = Scalar.fromString("-"); + Scalar narep = Scalar.fromString("null"); + ColumnVector found = in.joinStrings(sep, narep)) { + assertColumnsAreEqual(expected, found); + } + } + @Test void testConcatTypeError() { try (ColumnVector v0 = ColumnVector.fromInts(1, 2, 3, 4); From 3ea063815ba31a661d67b84c63f1fd6216921538 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Thu, 21 Jul 2022 14:13:21 -0400 Subject: [PATCH 15/31] Remove Arrow CUDA IPC code (#10995) Closes #10994. This PR removes the Arrow CUDA-IPC related code we have, which has two benefits: 1. It deletes code (I have confirmed that no one uses this code today) 2. 
It removes our dependency on Arrow CUDA, which contributes towards removing our shared lib dependency on `libcuda.so` Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/jakirkham URL: https://github.com/rapidsai/cudf/pull/10995 --- conda/environments/cudf_dev_cuda11.5.yml | 3 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 7 +- cpp/CMakeLists.txt | 8 - cpp/cmake/thirdparty/get_arrow.cmake | 53 +--- cpp/include/cudf/ipc.hpp | 59 ---- cpp/src/comms/ipc/ipc.cpp | 22 -- python/cudf/cudf/_lib/CMakeLists.txt | 10 +- python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/cpp/gpuarrow.pxd | 19 -- python/cudf/cudf/_lib/gpuarrow.pyx | 79 ----- python/cudf/cudf/comm/gpuarrow.py | 147 --------- .../cudf/cudf/tests/test_gpu_arrow_parser.py | 300 ------------------ python/cudf/cudf/tests/test_sparse_df.py | 57 +--- 14 files changed, 12 insertions(+), 755 deletions(-) delete mode 100644 cpp/include/cudf/ipc.hpp delete mode 100644 cpp/src/comms/ipc/ipc.cpp delete mode 100644 python/cudf/cudf/_lib/cpp/gpuarrow.pxd delete mode 100644 python/cudf/cudf/_lib/gpuarrow.pyx delete mode 100644 python/cudf/cudf/comm/gpuarrow.py delete mode 100644 python/cudf/cudf/tests/test_gpu_arrow_parser.py diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml index ee945e73279..979cc40c3a4 100644 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ b/conda/environments/cudf_dev_cuda11.5.yml @@ -21,7 +21,7 @@ dependencies: - numba>=0.54 - numpy - pandas>=1.0,<1.5.0dev0 - - pyarrow=8.0.0=*cuda + - pyarrow=8.0.0 - fastavro>=0.22.9 - python-snappy>=0.6.0 - notebook>=0.5.0 @@ -53,7 +53,6 @@ dependencies: - streamz - arrow-cpp=8.0.0 - dlpack>=0.5,<0.6.0a0 - - arrow-cpp-proc * cuda - double-conversion - rapidjson - hypothesis diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index e000c4437eb..30a5ebfbbc3 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -41,7 +41,7 @@ requirements: - setuptools - numba >=0.54 - dlpack>=0.5,<0.6.0a0 - - pyarrow =8.0.0 *cuda + - pyarrow =8.0.0 - libcudf ={{ version }} - rmm ={{ minor_version }} - cudatoolkit ={{ cuda_version }} diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index d286f43e0eb..693d986deaf 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -35,8 +35,7 @@ requirements: host: - librmm {{ minor_version }}.* - cudatoolkit {{ cuda_version }}.* - - arrow-cpp {{ arrow_cpp_version }} *cuda - - arrow-cpp-proc * cuda + - arrow-cpp {{ arrow_cpp_version }} - dlpack {{ dlpack_version }} - librdkafka {{ librdkafka_version }} @@ -57,8 +56,7 @@ outputs: run: - cudatoolkit {{ cuda_spec }} - librmm {{ minor_version }}.* - - arrow-cpp {{ arrow_cpp_version }} *cuda - - arrow-cpp-proc * cuda + - arrow-cpp {{ arrow_cpp_version }} - dlpack {{ dlpack_version }} test: commands: @@ -159,7 +157,6 @@ outputs: - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp - test -f $PREFIX/include/cudf/io/text/multibyte_split.hpp - test -f $PREFIX/include/cudf/io/types.hpp - - test -f $PREFIX/include/cudf/ipc.hpp - test -f $PREFIX/include/cudf/join.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp - test -f $PREFIX/include/cudf/lists/combine.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 
0903609c1e2..20a25432038 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -237,7 +237,6 @@ add_library( src/column/column_factories.cpp src/column/column_factories.cu src/column/column_view.cpp - src/comms/ipc/ipc.cpp src/copying/concatenate.cu src/copying/contiguous_split.cu src/copying/copy.cpp @@ -807,13 +806,6 @@ endif() ]=] ) -set(install_code_string - [=[ -set(ArrowCUDA_DIR "${Arrow_DIR}") -find_dependency(ArrowCUDA) -]=] -) - if(CUDF_ENABLE_ARROW_PARQUET) string( APPEND diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 7cfb3568a11..116c5442dc3 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -26,9 +26,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ) if(BUILD_STATIC) - if(TARGET arrow_static AND TARGET arrow_cuda_static) + if(TARGET arrow_static) list(APPEND ARROW_LIBRARIES arrow_static) - list(APPEND ARROW_LIBRARIES arrow_cuda_static) set(ARROW_FOUND TRUE PARENT_SCOPE @@ -40,9 +39,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB return() endif() else() - if(TARGET arrow_shared AND TARGET arrow_cuda_shared) + if(TARGET arrow_shared) list(APPEND ARROW_LIBRARIES arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_cuda_shared) set(ARROW_FOUND TRUE PARENT_SCOPE @@ -91,21 +89,15 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB list(APPEND ARROW_PARQUET_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO") endif() - # Set this so Arrow correctly finds the CUDA toolkit when the build machine does not have the CUDA - # driver installed. This must be an env var. - set(ENV{CUDA_LIB_PATH} "${CUDAToolkit_LIBRARY_DIR}/stubs") - rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_cuda_shared arrow_dataset_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} GIT_SHALLOW TRUE SOURCE_SUBDIR cpp OPTIONS "CMAKE_VERBOSE_MAKEFILE ON" - "CUDA_USE_STATIC_CUDA_RUNTIME ${CUDA_STATIC_RUNTIME}" "ARROW_IPC ON" - "ARROW_CUDA ON" "ARROW_DATASET ON" "ARROW_WITH_BACKTRACE ON" "ARROW_CXXFLAGS -w" @@ -139,17 +131,12 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB if(Arrow_ADDED OR Arrow_DIR) if(BUILD_STATIC) list(APPEND ARROW_LIBRARIES arrow_static) - list(APPEND ARROW_LIBRARIES arrow_cuda_static) else() list(APPEND ARROW_LIBRARIES arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_cuda_shared) endif() if(Arrow_DIR) - # Set this to enable `find_package(ArrowCUDA)` - set(ArrowCUDA_DIR "${Arrow_DIR}") find_package(Arrow REQUIRED QUIET) - find_package(ArrowCUDA REQUIRED QUIET) if(ENABLE_PARQUET) if(NOT Parquet_DIR) # Set this to enable `find_package(Parquet)` @@ -165,9 +152,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" ) - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/gpu/cuda_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/gpu" - ) if(ENABLE_PARQUET) file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" @@ -177,8 +161,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB # This shouldn't be necessary! 
# # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow and FindArrowCUDA respectively, so for static source-builds, we have to - # do it after-the-fact. + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. # # This only works because we know exactly which components we're using. Don't forget to update # this list if we add more! @@ -220,26 +203,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB FINAL_CODE_BLOCK arrow_code_string ) - set(arrow_cuda_code_string - [=[ - if (TARGET cudf::arrow_cuda_shared AND (NOT TARGET arrow_cuda_shared)) - add_library(arrow_cuda_shared ALIAS cudf::arrow_cuda_shared) - endif() - if (TARGET cudf::arrow_cuda_static AND (NOT TARGET arrow_cuda_static)) - add_library(arrow_cuda_static ALIAS cudf::arrow_cuda_static) - endif() - ]=] - ) - - rapids_export( - BUILD ArrowCUDA - VERSION ${VERSION} - EXPORT_SET arrow_cuda_targets - GLOBAL_TARGETS arrow_cuda_shared arrow_cuda_static - NAMESPACE cudf:: - FINAL_CODE_BLOCK arrow_cuda_code_string - ) - if(ENABLE_PARQUET) set(arrow_dataset_code_string @@ -283,15 +246,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB ) endif() endif() - # We generate the arrow-config and arrowcuda-config files when we built arrow locally, so always - # do `find_dependency` + # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency` rapids_export_package(BUILD Arrow cudf-exports) rapids_export_package(INSTALL Arrow cudf-exports) - # We have to generate the find_dependency(ArrowCUDA) ourselves since we need to specify - # ArrowCUDA_DIR to be where Arrow was found, since Arrow packages ArrowCUDA.config in a - # non-standard location - rapids_export_package(BUILD ArrowCUDA cudf-exports) if(ENABLE_PARQUET) rapids_export_package(BUILD Parquet cudf-exports) rapids_export_package(BUILD ArrowDataset cudf-exports) @@ -299,7 +257,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root(BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) - rapids_export_find_package_root(BUILD ArrowCUDA [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) if(ENABLE_PARQUET) rapids_export_find_package_root(BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) diff --git a/cpp/include/cudf/ipc.hpp b/cpp/include/cudf/ipc.hpp deleted file mode 100644 index 7bce6e1bf5a..00000000000 --- a/cpp/include/cudf/ipc.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include -#include -#include - -/** - * @brief Reads Message objects from cuda buffer source - * - */ -class CudaMessageReader : arrow::ipc::MessageReader { - public: - /** - * @brief Construct a new Cuda Message Reader object from a cuda buffer stream - * - * @param stream The cuda buffer reader stream - * @param schema The schema of the stream - */ - CudaMessageReader(arrow::cuda::CudaBufferReader* stream, arrow::io::BufferReader* schema); - - /** - * @brief Open stream from source. - * - * @param stream The cuda buffer reader stream - * @param schema The schema of the stream - * @return arrow::ipc::MessageReader object - */ - static std::unique_ptr Open(arrow::cuda::CudaBufferReader* stream, - arrow::io::BufferReader* schema); - - /** - * @brief Read next Message from the stream. - * - * @return arrow::ipc::Message object - */ - arrow::Result> ReadNextMessage() override; - - private: - arrow::cuda::CudaBufferReader* stream_; - arrow::io::BufferReader* host_schema_reader_ = nullptr; - std::shared_ptr owned_stream_; -}; diff --git a/cpp/src/comms/ipc/ipc.cpp b/cpp/src/comms/ipc/ipc.cpp deleted file mode 100644 index 21944f1354f..00000000000 --- a/cpp/src/comms/ipc/ipc.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include -#include - -CudaMessageReader::CudaMessageReader(arrow::cuda::CudaBufferReader* stream, - arrow::io::BufferReader* schema) - : stream_(stream), host_schema_reader_(schema){}; - -arrow::Result> CudaMessageReader::ReadNextMessage() -{ - if (host_schema_reader_ != nullptr) { - auto message = arrow::ipc::ReadMessage(host_schema_reader_); - host_schema_reader_ = nullptr; - if (message.ok() && *message != nullptr) { return message; } - } - return std::move(arrow::ipc::ReadMessage(stream_, arrow::default_memory_pool())); -} - -std::unique_ptr CudaMessageReader::Open( - arrow::cuda::CudaBufferReader* stream, arrow::io::BufferReader* schema) -{ - return std::unique_ptr(new CudaMessageReader(stream, schema)); -} diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index d7d77d081f8..1f6b2069b49 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -23,7 +23,6 @@ set(cython_sources datetime.pyx expressions.pyx filling.pyx - gpuarrow.pyx groupby.pyx hash.pyx interop.pyx @@ -65,18 +64,11 @@ rapids_cython_create_modules( # fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in # CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24. find_package(Python REQUIRED COMPONENTS Development NumPy) -set(targets_using_numpy gpuarrow interop avro csv orc json parquet) +set(targets_using_numpy interop avro csv orc json parquet) foreach(target IN LISTS targets_using_numpy) target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") endforeach() -# PyArrow relies on the C++ Arrow library already being installed, so we can just find the C++ -# library directly and link to the same one. We rely on libcudf's exports to provide the -# arrow_shared_lib and arrow_cuda_shared_lib libraries. That just leaves us to find the ArrowPython -# library on our own. 
-find_library(arrow_python_shared_library arrow_python REQUIRED) -target_link_libraries(gpuarrow ${arrow_python_shared_library}) - foreach(cython_module IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) set_target_properties(${cython_module} PROPERTIES INSTALL_RPATH "\$ORIGIN;\$ORIGIN/cpp") endforeach() diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 542262b7908..bab28433c41 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -10,7 +10,6 @@ datetime, expressions, filling, - gpuarrow, groupby, hash, interop, diff --git a/python/cudf/cudf/_lib/cpp/gpuarrow.pxd b/python/cudf/cudf/_lib/cpp/gpuarrow.pxd deleted file mode 100644 index 6ebae78b5cd..00000000000 --- a/python/cudf/cudf/_lib/cpp/gpuarrow.pxd +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from pyarrow.includes.libarrow cimport ( - CBufferReader, - CMessage, - CMessageReader, - CStatus, -) -from pyarrow.includes.libarrow_cuda cimport CCudaBufferReader - - -cdef extern from "cudf/ipc.hpp" nogil: - - cdef cppclass CCudaMessageReader" CudaMessageReader"(CMessageReader): - @staticmethod - unique_ptr[CMessageReader] Open(CCudaBufferReader* stream, - CBufferReader* schema) - CStatus ReadNextMessage(unique_ptr[CMessage]* out) diff --git a/python/cudf/cudf/_lib/gpuarrow.pyx b/python/cudf/cudf/_lib/gpuarrow.pyx deleted file mode 100644 index 0768517485e..00000000000 --- a/python/cudf/cudf/_lib/gpuarrow.pyx +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move -from pyarrow._cuda cimport CudaBuffer -from pyarrow.includes.libarrow_cuda cimport CCudaBufferReader - -from cudf._lib.cpp.gpuarrow cimport CCudaMessageReader - -from numba.cuda.cudadrv.devicearray import DeviceNDArray - -from pyarrow.includes.common cimport GetResultValue -from pyarrow.includes.libarrow cimport ( - CBufferReader, - CIpcReadOptions, - CMessage, - CMessageReader, - CRecordBatchStreamReader, -) -from pyarrow.lib cimport Buffer, RecordBatchReader, Schema, pyarrow_wrap_schema - -import pyarrow as pa - - -cdef class CudaRecordBatchStreamReader(RecordBatchReader): - cdef: - CIpcReadOptions options - - cdef readonly: - Schema schema - - def __cinit__(self): - pass - - def _open(self, source, schema=None): - - cdef unique_ptr[CMessageReader] message_reader - cdef CCudaBufferReader* data_ = to_buffer_reader(source) - cdef CBufferReader* schema_ = schema_to_buffer_reader(schema) - - with nogil: - message_reader = CCudaMessageReader.Open(data_, schema_) - self.reader = GetResultValue(CRecordBatchStreamReader.Open2( - move(message_reader), self.options - )) - - self.schema = pyarrow_wrap_schema(self.reader.get().schema()) - - -cdef CBufferReader* schema_to_buffer_reader(schema): - cdef Buffer host_buf - if schema is None: - host_buf = pa.py_buffer(bytearray(0)) - elif isinstance(schema, pa.Schema): - host_buf = schema.serialize() - else: - host_buf = as_pa_buffer(schema) - return new CBufferReader(host_buf.buffer) - - -cdef CCudaBufferReader* to_buffer_reader(object obj): - cdef CudaBuffer cuda_buf - if pyarrow_is_cudabuffer(obj): - cuda_buf = obj - elif isinstance(obj, DeviceNDArray): - cuda_buf = CudaBuffer.from_numba(obj.gpu_data) - else: - raise ValueError('unrecognized device buffer') - return new CCudaBufferReader(cuda_buf.buffer) - - -cdef public api bint pyarrow_is_cudabuffer(object buffer): - return isinstance(buffer, CudaBuffer) - - -def 
as_pa_buffer(object o): - if isinstance(o, pa.Buffer): - return o - return pa.py_buffer(o) diff --git a/python/cudf/cudf/comm/gpuarrow.py b/python/cudf/cudf/comm/gpuarrow.py deleted file mode 100644 index 0c4d9d7f77e..00000000000 --- a/python/cudf/cudf/comm/gpuarrow.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. -from collections import OrderedDict, abc - -import numpy as np -import pandas as pd -import pyarrow as pa - -from cudf import Series -from cudf._lib.gpuarrow import ( - CudaRecordBatchStreamReader as _CudaRecordBatchStreamReader, -) -from cudf.core import column -from cudf.utils.utils import mask_bitsize, mask_dtype - - -class CudaRecordBatchStreamReader(_CudaRecordBatchStreamReader): - """ - Reader for the Arrow streaming binary format - - Parameters - ---------- - source : pyarrow.cuda.CudaBuffer or numba DeviceNDarray - Either numba DeviceNDArray, or a pyarrow.cuda.CudaBuffer - schema : bytes/buffer-like, pyarrow.Schema, pyarrow.NativeFile, - file-like Python object - Optional pyarrow.Schema or host-serialized bytes/buffer-like - pyarrow.Schema - """ - - def __init__(self, source, schema=None): - self._open(source, schema) - - -class GpuArrowReader(abc.Sequence): - def __init__(self, schema, dev_ary): - self._table = CudaRecordBatchStreamReader(dev_ary, schema).read_all() - - def __len__(self): - return self._table.num_columns - - def __getitem__(self, idx): - return GpuArrowNodeReader(self._table, idx) - - def schema(self): - """ - Return a pyarrow schema - """ - return self._table.schema - - def to_dict(self): - """ - Return a dictionary of Series object - """ - dc = OrderedDict() - for node in self: - dc[node.name] = node.make_series() - return dc - - -class GpuArrowNodeReader: - def __init__(self, table, index): - self._table = table - self._field = table.schema[index] - self._series = Series(column.as_column(table.column(index))) - self._series.name = self.name - - def __len__(self): - return len(self._series) - - @property - def schema(self): - return self._table.schema - - @property - def field_schema(self): - return self._field - - @property - def is_dictionary(self): - return pa.types.is_dictionary(self._field.type) - - @property - def null_count(self): - return self._series.null_count - - @property - def dtype(self): - return arrow_to_pandas_dtype(self._field.type) - - @property - def index_dtype(self): - return self._field.type.index_type.to_pandas_dtype() - - @property - def name(self): - return self._field.name - - @property - def data(self): - """ - Return the data as the expected dtype - and with the padding bytes truncated. - """ - if self.data_raw is not None: - return self.data_raw.view( - self.dtype if not self.is_dictionary else self.index_dtype - ) - - @property - def null(self): - """ - Return the null mask with the padding bytes truncated. 
- """ - if self.null_raw is not None: - bits = mask_bitsize - itemsize = mask_dtype.itemsize - end = ((len(self) + bits - 1) // bits) * itemsize - return self.null_raw[:end].view(mask_dtype) - - @property - def data_raw(self): - """Accessor for the data buffer as a device array""" - return self._series._column.data_array_view - - @property - def null_raw(self): - """Accessor for the null buffer as a device array""" - return self._series._column.mask_array_view - - def make_series(self): - """Make a Series object out of this node""" - return self._series.copy(deep=False) - - def _make_dictionary_series(self): - """Make a dictionary-encoded series from this node""" - assert self.is_dictionary - return self._series.copy(deep=False) - - -def arrow_to_pandas_dtype(pa_type): - if pa.types.is_dictionary(pa_type): - return pd.core.dtypes.dtypes.CategoricalDtype(ordered=pa_type.ordered) - if pa.types.is_date64(pa_type): - return np.dtype("datetime64[ms]") - if pa.types.is_timestamp(pa_type): - return np.dtype(f"M8[{pa_type.unit}]") - return np.dtype(pa_type.to_pandas_dtype()) diff --git a/python/cudf/cudf/tests/test_gpu_arrow_parser.py b/python/cudf/cudf/tests/test_gpu_arrow_parser.py deleted file mode 100644 index 3b3aa72901f..00000000000 --- a/python/cudf/cudf/tests/test_gpu_arrow_parser.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. - -import logging - -import numpy as np -import pyarrow as pa -import pytest -from numba import cuda - -import cudf -from cudf.comm.gpuarrow import GpuArrowReader -from cudf.testing._utils import INTEGER_TYPES - - -def make_gpu_parse_arrow_data_batch(): - np.random.seed(1234) - lat = np.random.uniform(low=27, high=42, size=23).astype(np.float32) - lon = np.random.uniform(low=-105, high=-76, size=23).astype(np.float32) - - dest_lat = pa.array(lat) - dest_lon = pa.array(lon) - - batch = pa.RecordBatch.from_arrays( - [dest_lat, dest_lon], ["dest_lat", "dest_lon"] - ) - return batch - - -def test_gpu_parse_arrow_data_cpu_schema(): - batch = make_gpu_parse_arrow_data_batch() - schema_data = batch.schema.serialize() - recbatch_data = batch.serialize() - - # To ensure compatibility for OmniSci we're going to create this numpy - # array to be read-only as that's how numpy arrays created from foreign - # memory buffers will be set - cpu_schema = np.frombuffer(schema_data, dtype=np.uint8) - cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8) - gpu_data = cuda.to_device(cpu_data) - del cpu_data - - # test reader - reader = GpuArrowReader(cpu_schema, gpu_data) - assert reader[0].name == "dest_lat" - assert reader[1].name == "dest_lon" - lat = reader[0].data.copy_to_host() - lon = reader[1].data.copy_to_host() - assert lat.size == 23 - assert lon.size == 23 - np.testing.assert_array_less(lat, 42) - np.testing.assert_array_less(27, lat) - np.testing.assert_array_less(lon, -76) - np.testing.assert_array_less(-105, lon) - - dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) - - -def test_gpu_parse_arrow_data_gpu_schema(): - batch = make_gpu_parse_arrow_data_batch() - schema_data = batch.schema.serialize() - recbatch_data = batch.serialize() - - # To ensure compatibility for OmniSci we're going to create this numpy - # array to be read-only as that's how numpy arrays created from foreign - # memory buffers will be set - cpu_schema = np.frombuffer(schema_data, dtype=np.uint8) - cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8) - # 
Concatenate the schema and recordbatch into a single GPU buffer - gpu_data = cuda.to_device(np.concatenate([cpu_schema, cpu_data])) - del cpu_data - del cpu_schema - - # test reader - reader = GpuArrowReader(None, gpu_data) - assert reader[0].name == "dest_lat" - assert reader[1].name == "dest_lon" - lat = reader[0].data.copy_to_host() - lon = reader[1].data.copy_to_host() - assert lat.size == 23 - assert lon.size == 23 - np.testing.assert_array_less(lat, 42) - np.testing.assert_array_less(27, lat) - np.testing.assert_array_less(lon, -76) - np.testing.assert_array_less(-105, lon) - - dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) - - -def test_gpu_parse_arrow_data_bad_cpu_schema_good_gpu_schema(): - batch = make_gpu_parse_arrow_data_batch() - schema_data = batch.schema.serialize() - recbatch_data = batch.serialize() - - # To ensure compatibility for OmniSci we're going to create this numpy - # array to be read-only as that's how numpy arrays created from foreign - # memory buffers will be set - cpu_schema = np.frombuffer(schema_data, dtype=np.uint8) - cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8) - # Concatenate the schema and recordbatch into a single GPU buffer - gpu_data = cuda.to_device(np.concatenate([cpu_schema, cpu_data])) - del cpu_data - del cpu_schema - - # test reader - reader = GpuArrowReader(b"", gpu_data) - assert reader[0].name == "dest_lat" - assert reader[1].name == "dest_lon" - lat = reader[0].data.copy_to_host() - lon = reader[1].data.copy_to_host() - assert lat.size == 23 - assert lon.size == 23 - np.testing.assert_array_less(lat, 42) - np.testing.assert_array_less(27, lat) - np.testing.assert_array_less(lon, -76) - np.testing.assert_array_less(-105, lon) - - dct = reader.to_dict() - np.testing.assert_array_equal(lat, dct["dest_lat"].to_numpy()) - np.testing.assert_array_equal(lon, dct["dest_lon"].to_numpy()) - - -expected_values = """ -0,orange,0.4713545411053003 -1,orange,0.003790919207527499 -2,orange,0.4396940888188392 -3,apple,0.5693619092183622 -4,pear,0.10894215574048405 -5,pear,0.09547296520000881 -6,orange,0.4123169425191555 -7,apple,0.4125838710498503 -8,orange,0.1904218750870219 -9,apple,0.9289366739893021 -10,orange,0.9330387015860205 -11,pear,0.46564799732291595 -12,apple,0.8573176464520044 -13,pear,0.21566885180419648 -14,orange,0.9199361970381871 -15,orange,0.9819955872277085 -16,apple,0.415964752238025 -17,grape,0.36941794781567516 -18,apple,0.9761832273396152 -19,grape,0.16672327312068824 -20,orange,0.13311815129622395 -21,orange,0.6230693626648358 -22,pear,0.7321171864853122 -23,grape,0.23106658283660853 -24,pear,0.0198404248930919 -25,orange,0.4032931749027482 -26,grape,0.665861129515741 -27,pear,0.10253071509254097 -28,orange,0.15243296681892238 -29,pear,0.3514868485827787 -""" - - -def get_expected_values(): - lines = filter(lambda x: x.strip(), expected_values.splitlines()) - rows = [ln.split(",") for ln in lines] - return [(int(idx), name, float(weight)) for idx, name, weight in rows] - - -def make_gpu_parse_arrow_cats_batch(): - indices, names, weights = zip(*get_expected_values()) - d_index = pa.array(indices).cast(pa.int32()) - unique_names = list(set(names)) - names_map = list(map(unique_names.index, names)) - d_names_map = pa.array(names_map).cast(pa.int32()) - d_names = pa.array(unique_names) - d_name = pa.DictionaryArray.from_arrays(d_names_map, d_names) - d_weight = pa.array(weights) - batch = pa.RecordBatch.from_arrays( - 
[d_index, d_name, d_weight], ["idx", "name", "weight"] - ) - return batch - - -def test_gpu_parse_arrow_cats(): - batch = make_gpu_parse_arrow_cats_batch() - - stream = pa.BufferOutputStream() - writer = pa.RecordBatchStreamWriter(stream, batch.schema) - writer.write_batch(batch) - writer.close() - - schema_bytes = batch.schema.serialize().to_pybytes() - recordbatches_bytes = stream.getvalue().to_pybytes()[len(schema_bytes) :] - - schema = np.ndarray( - shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes) - ) - rb_cpu_data = np.ndarray( - shape=len(recordbatches_bytes), - dtype=np.byte, - buffer=bytearray(recordbatches_bytes), - ) - rb_gpu_data = cuda.to_device(rb_cpu_data) - - gar = GpuArrowReader(schema, rb_gpu_data) - columns = gar.to_dict() - - sr_idx = columns["idx"] - sr_name = columns["name"] - sr_weight = columns["weight"] - - assert sr_idx.dtype == np.int32 - assert sr_name.dtype == "category" - assert sr_weight.dtype == np.double - assert set(sr_name.to_pandas()) == {"apple", "pear", "orange", "grape"} - - expected = get_expected_values() - for i in range(len(sr_idx)): - got_idx = sr_idx[i] - got_name = sr_name[i] - got_weight = sr_weight[i] - - # the serialized data is not of order - exp_idx, exp_name, exp_weight = expected[got_idx] - - assert got_idx == exp_idx - assert got_name == exp_name - np.testing.assert_almost_equal(got_weight, exp_weight) - - -@pytest.mark.parametrize("dtype", INTEGER_TYPES) -def test_gpu_parse_arrow_int(dtype): - - depdelay = np.array([0, 0, -3, -2, 11, 6, -7, -4, 4, -3], dtype=dtype) - arrdelay = np.array([5, -3, 1, -2, 22, 11, -12, -5, 4, -9], dtype=dtype) - d_depdelay = pa.array(depdelay) - d_arrdelay = pa.array(arrdelay) - batch = pa.RecordBatch.from_arrays( - [d_depdelay, d_arrdelay], ["depdelay", "arrdelay"] - ) - - schema_bytes = batch.schema.serialize().to_pybytes() - recordbatches_bytes = batch.serialize().to_pybytes() - - schema = np.ndarray( - shape=len(schema_bytes), dtype=np.byte, buffer=bytearray(schema_bytes) - ) - - rb_cpu_data = np.ndarray( - shape=len(recordbatches_bytes), - dtype=np.byte, - buffer=bytearray(recordbatches_bytes), - ) - - rb_gpu_data = cuda.to_device(rb_cpu_data) - gar = GpuArrowReader(schema, rb_gpu_data) - columns = gar.to_dict() - assert columns["depdelay"].dtype == dtype - assert set(columns) == {"depdelay", "arrdelay"} - assert list(columns["depdelay"].to_pandas()) == list( - depdelay.astype(dtype) - ) - - -@pytest.mark.parametrize( - "dtype", - ["datetime64[s]", "datetime64[ms]", "datetime64[us]", "datetime64[ns]"], -) -def test_gpu_parse_arrow_timestamps(dtype): - timestamp = ( - cudf.datasets.timeseries( - start="2000-01-01", end="2000-01-02", freq="3600s", dtypes={} - ) - .reset_index()["timestamp"] - .reset_index(drop=True) - ) - gdf = cudf.DataFrame({"timestamp": timestamp.astype(dtype)}) - pdf = gdf.to_arrow(preserve_index=False) - schema_data = pdf.schema.serialize() - recbatch_data = pdf.to_batches()[0].serialize() - - # To ensure compatibility for OmniSci we're going to create this numpy - # array to be read-only as that's how numpy arrays created from foreign - # memory buffers will be set - cpu_schema = np.frombuffer(schema_data, dtype=np.uint8) - cpu_data = np.frombuffer(recbatch_data, dtype=np.uint8) - gpu_data = cuda.to_device(cpu_data) - del cpu_data - - # test reader - reader = GpuArrowReader(cpu_schema, gpu_data) - assert reader[0].name == "timestamp" - timestamp_arr = reader[0].data.copy_to_host() - np.testing.assert_array_equal(timestamp_arr, gdf["timestamp"].to_numpy()) - dct 
= reader.to_dict() - np.testing.assert_array_equal(timestamp_arr, dct["timestamp"].to_numpy()) - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - logging.getLogger("cudf.gpuarrow").setLevel(logging.DEBUG) - - test_gpu_parse_arrow_data_cpu_schema() diff --git a/python/cudf/cudf/tests/test_sparse_df.py b/python/cudf/cudf/tests/test_sparse_df.py index f7cf597afd2..0dd47c219c0 100644 --- a/python/cudf/cudf/tests/test_sparse_df.py +++ b/python/cudf/cudf/tests/test_sparse_df.py @@ -1,54 +1,8 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. -import os.path +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import numpy as np -import pyarrow as pa -import pytest -from numba import cuda -from cudf import DataFrame, Series -from cudf.comm.gpuarrow import GpuArrowReader -from cudf.testing._utils import assert_eq - - -def read_data(): - import pandas as pd - - basedir = os.path.dirname(__file__) - datapath = os.path.join(basedir, "data", "ipums.pkl") - try: - df = pd.read_pickle(datapath) - except Exception as excpr: - if type(excpr).__name__ == "FileNotFoundError": - pytest.skip(".pkl file is not found") - else: - print(type(excpr).__name__) - - names = [] - arrays = [] - for k in df.columns: - arrays.append(pa.Array.from_pandas(df[k])) - names.append(k) - batch = pa.RecordBatch.from_arrays(arrays, names) - schema = batch.schema.serialize().to_pybytes() - schema = np.ndarray( - shape=len(schema), dtype=np.byte, buffer=bytearray(schema) - ) - data = batch.serialize().to_pybytes() - data = np.ndarray(shape=len(data), dtype=np.byte, buffer=bytearray(data)) - darr = cuda.to_device(data) - return df, schema, darr - - -def test_fillna(): - _, schema, darr = read_data() - gar = GpuArrowReader(schema, darr) - masked_col = gar[8] - sr = Series(data=masked_col.data) - dense = sr.nans_to_nulls().fillna(123) - np.testing.assert_equal(123, dense.to_numpy()) - assert len(dense) == len(sr) - assert dense.null_count == 0 +from cudf import Series def test_to_dense_array(): @@ -62,10 +16,3 @@ def test_to_dense_array(): dense = sr.dropna().to_numpy() assert dense.size < filled.size assert filled.size == len(sr) - - -def test_reading_arrow_sparse_data(): - pdf, schema, darr = read_data() - gar = GpuArrowReader(schema, darr) - gdf = DataFrame(gar.to_dict()) - assert_eq(pdf, gdf) From 406bce7efb808330e4d3b1c02c909582402f561e Mon Sep 17 00:00:00 2001 From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com> Date: Thu, 21 Jul 2022 15:11:08 -0500 Subject: [PATCH 16/31] Optimization to gpu::PreprocessColumnData in parquet reader. (#11252) When handling list columns in the parquet reader, we have to run a preprocess step that computes several things per-page before we can decode values. If the user has further specified artificial row bounds (`skip_rows`, `min_rows`) we have to do a second pass during the preprocess step. If the user has _not_ specified row bounds, there is no need to do this; however the code was naively always doing so. This PR simply detects when we're reading all rows (which is 99% of use cases) and skips the second pass. Also includes some cleanup of redundant stream synchronizations. Also worth mentioning, this `skip_rows`/`num_rows` feature is going to be deprecated in 22.08 so we will be able to follow up further in 22.10 to rip more of this code out. 
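
To make the new gating concrete, here is a minimal sketch of the decision being added (names other than `uses_custom_row_bounds` are illustrative, not actual cudf symbols; the real calls appear in the diff below):

```cpp
// Sketch only: mirrors the uses_custom_row_bounds gating this PR adds.
#include <climits>
#include <cstddef>

struct reader_options_sketch {
  int skip_rows = 0;   // rows to skip from the start of the file
  int num_rows  = -1;  // -1 means "read all remaining rows"
};

// The second ("trim") preprocessing pass is only needed when the caller
// actually restricted the row range.
bool uses_custom_row_bounds(reader_options_sketch const& opts)
{
  return opts.num_rows >= 0 || opts.skip_rows != 0;
}

void preprocess_sketch(reader_options_sketch const& opts)
{
  bool const custom = uses_custom_row_bounds(opts);

  // Pass 1: compute full per-page sizes. Without custom bounds this single
  // pass already covers every row, so its result is final.
  std::size_t const min_row = custom ? static_cast<std::size_t>(opts.skip_rows) : 0;
  std::size_t const n_rows  = custom ? static_cast<std::size_t>(opts.num_rows) : INT_MAX;
  (void)min_row;
  (void)n_rows;
  // compute_page_sizes(min_row, n_rows, /*trim_pass=*/!custom);  // placeholder

  // Pass 2 (trim): only when the caller supplied skip_rows/num_rows.
  if (custom) {
    // compute_page_sizes(min_row, n_rows, /*trim_pass=*/true);   // placeholder
  }
}
```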
Authors: - https://github.com/nvdbaranec Approvers: - Mike Wilson (https://github.com/hyperbolic2346) - Jim Brennan (https://github.com/jbrennan333) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/11252 --- cpp/src/io/parquet/page_data.cu | 90 +++++++++++++++--------------- cpp/src/io/parquet/parquet_gpu.hpp | 17 +++--- cpp/src/io/parquet/reader_impl.cu | 24 ++++++-- cpp/src/io/parquet/reader_impl.hpp | 18 ++++-- 4 files changed, 86 insertions(+), 63 deletions(-) diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 4243a103eee..e88182c68e3 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -866,15 +866,13 @@ static __device__ void gpuOutputGeneric(volatile page_state_s* s, * @param[in] p The global page to be copied from * @param[in] chunks The global list of chunks * @param[in] num_rows Maximum number of rows to read - * @param[in] min_row crop all rows below min_row - * @param[in] num_chunk Number of column chunks + * @param[in] min_row Crop all rows below min_row */ static __device__ bool setupLocalPageInfo(page_state_s* const s, - PageInfo* p, - ColumnChunkDesc const* chunks, + PageInfo const* p, + device_span chunks, size_t min_row, - size_t num_rows, - int32_t num_chunks) + size_t num_rows) { int t = threadIdx.x; int chunk_idx; @@ -1510,23 +1508,20 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * * This function will write out the size field for each level of nesting. * - * @param[in,out] pages List of pages - * @param[in] chunks List of column chunks - * @param[in] num_chunks Number of column chunks - * @param[in] min_row Row index to start reading at - * @param[in] num_rows Maximum number of rows to read - * @param[in] num_chunks Number of column chunks - * @param[in] trim_pass Whether or not this is the trim pass. We first have to compute + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows. + * @param trim_pass Whether or not this is the trim pass. We first have to compute * the full size information of every page before we come through in a second (trim) pass * to determine what subset of rows in this page we should be reading. */ -// blockDim {block_size,1,1} -__global__ void __launch_bounds__(block_size) gpuComputePageSizes(PageInfo* pages, - ColumnChunkDesc const* chunks, - size_t min_row, - size_t num_rows, - int32_t num_chunks, - bool trim_pass) +__global__ void __launch_bounds__(block_size) + gpuComputePageSizes(PageInfo* pages, + device_span chunks, + size_t min_row, + size_t num_rows, + bool trim_pass) { __shared__ __align__(16) page_state_s state_g; @@ -1535,8 +1530,7 @@ __global__ void __launch_bounds__(block_size) gpuComputePageSizes(PageInfo* page int t = threadIdx.x; PageInfo* pp = &pages[page_idx]; - if (!setupLocalPageInfo( - s, pp, chunks, trim_pass ? min_row : 0, trim_pass ? num_rows : INT_MAX, num_chunks)) { + if (!setupLocalPageInfo(s, pp, chunks, trim_pass ? min_row : 0, trim_pass ? 
num_rows : INT_MAX)) { return; } @@ -1551,6 +1545,7 @@ __global__ void __launch_bounds__(block_size) gpuComputePageSizes(PageInfo* page s->page.skipped_leaf_values = -1; s->input_row_count = 0; s->input_value_count = 0; + // if this isn't the trim pass, make sure we visit absolutely everything if (!trim_pass) { s->first_row = 0; @@ -1605,18 +1600,13 @@ __global__ void __launch_bounds__(block_size) gpuComputePageSizes(PageInfo* page * conversion will be performed to translate from the Parquet datatype to * desired output datatype (ex. 32-bit to 16-bit, string to hash). * - * @param[in] pages List of pages - * @param[in,out] chunks List of column chunks - * @param[in] min_row Row index to start reading at - * @param[in] num_rows Maximum number of rows to read - * @param[in] num_chunks Number of column chunks + * @param pages List of pages + * @param chunks List of column chunks + * @param min_row Row index to start reading at + * @param num_rows Maximum number of rows to read */ -// blockDim {block_size,1,1} -__global__ void __launch_bounds__(block_size) gpuDecodePageData(PageInfo* pages, - ColumnChunkDesc const* chunks, - size_t min_row, - size_t num_rows, - int32_t num_chunks) +__global__ void __launch_bounds__(block_size) gpuDecodePageData( + PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows) { __shared__ __align__(16) page_state_s state_g; @@ -1625,7 +1615,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData(PageInfo* pages, int t = threadIdx.x; int out_thread0; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, num_chunks)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows)) { return; } if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; @@ -1804,6 +1794,7 @@ void PreprocessColumnData(hostdevice_vector& pages, std::vector& output_columns, size_t num_rows, size_t min_row, + bool uses_custom_row_bounds, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { @@ -1812,10 +1803,16 @@ void PreprocessColumnData(hostdevice_vector& pages, // computes: // PageNestingInfo::size for each level of nesting, for each page. - // The output from this does not take row bounds (num_rows, min_row) into account + // This computes the size for the entire page, not taking row bounds into account. + // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" + // the starting and ending read values to account for these bounds. gpuComputePageSizes<<>>( - pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size(), false); - stream.synchronize(); + pages.device_ptr(), + chunks, + // if uses_custom_row_bounds is false, include all possible rows. + uses_custom_row_bounds ? min_row : 0, + uses_custom_row_bounds ? num_rows : INT_MAX, + !uses_custom_row_bounds); // computes: // PageInfo::chunk_row for all pages @@ -1831,13 +1828,13 @@ void PreprocessColumnData(hostdevice_vector& pages, // computes: // PageNestingInfo::size for each level of nesting, for each page, taking row bounds into account. - // PageInfo::skipped_values, which tells us where to start decoding in the input - gpuComputePageSizes<<>>( - pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size(), true); - - // retrieve pages back (PageInfo::num_rows has been set. if we don't bring it - // back, this value will get overwritten later on). - pages.device_to_host(stream, true); + // PageInfo::skipped_values, which tells us where to start decoding in the input . 
+ // It is only necessary to do this second pass if uses_custom_row_bounds is set (if the user has + // specified artifical bounds). + if (uses_custom_row_bounds) { + gpuComputePageSizes<<>>( + pages.device_ptr(), chunks, min_row, num_rows, true); + } // ordering of pages is by input column schema, repeated across row groups. so // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like @@ -1919,6 +1916,9 @@ void PreprocessColumnData(hostdevice_vector& pages, static_cast(l_idx)}); } } + + // retrieve pages back + pages.device_to_host(stream); } /** @@ -1934,7 +1934,7 @@ void __host__ DecodePageData(hostdevice_vector& pages, dim3 dim_grid(pages.size(), 1); // 1 threadblock per page gpuDecodePageData<<>>( - pages.device_ptr(), chunks.device_ptr(), min_row, num_rows, chunks.size()); + pages.device_ptr(), chunks, min_row, num_rows); } } // namespace gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 1de6be38b3d..f94ce574731 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -413,13 +413,15 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, * * Note : this function is where output device memory is allocated for nested columns. * - * @param[in,out] pages All pages to be decoded - * @param[in] chunks All chunks to be decoded - * @param[in,out] input_columns Input column information - * @param[in,out] output_columns Output column information - * @param[in] num_rows Maximum number of rows to read - * @param[in] min_rows crop all rows below min_row - * @param[in] stream Cuda stream + * @param pages All pages to be decoded + * @param chunks All chunks to be decoded + * @param input_columns Input column information + * @param output_columns Output column information + * @param num_rows Maximum number of rows to read + * @param min_rows crop all rows below min_row + * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific + * bounds + * @param stream Cuda stream */ void PreprocessColumnData(hostdevice_vector& pages, hostdevice_vector const& chunks, @@ -427,6 +429,7 @@ void PreprocessColumnData(hostdevice_vector& pages, std::vector& output_columns, size_t num_rows, size_t min_row, + bool uses_custom_row_bounds, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu index 4d78cf74196..60ea446dfa4 100644 --- a/cpp/src/io/parquet/reader_impl.cu +++ b/cpp/src/io/parquet/reader_impl.cu @@ -542,7 +542,7 @@ class aggregate_reader_metadata { /** * @brief Filters and reduces down to a selection of row groups * - * @param row_groups Lists of row group to reads, one per source + * @param row_groups Lists of row groups to read, one per source * @param row_start Starting row of the selection * @param row_count Total number of rows selected * @@ -1346,6 +1346,7 @@ void reader::impl::preprocess_columns(hostdevice_vector& c hostdevice_vector& pages, size_t min_row, size_t total_rows, + bool uses_custom_row_bounds, bool has_lists) { // TODO : we should be selectively preprocessing only columns that have @@ -1365,8 +1366,15 @@ void reader::impl::preprocess_columns(hostdevice_vector& c create_columns(_output_columns); } else { // preprocess per-nesting level sizes by page - gpu::PreprocessColumnData( - pages, chunks, _input_columns, _output_columns, total_rows, min_row, _stream, _mr); + gpu::PreprocessColumnData(pages, + chunks, + _input_columns, + _output_columns, 
+ total_rows, + min_row, + uses_custom_row_bounds, + _stream, + _mr); _stream.synchronize(); } } @@ -1590,6 +1598,7 @@ reader::impl::impl(std::vector>&& sources, table_with_metadata reader::impl::read(size_type skip_rows, size_type num_rows, + bool uses_custom_row_bounds, std::vector> const& row_group_list) { // Select only row groups required @@ -1743,7 +1752,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // // - for nested schemas, output buffer offset values per-page, per nesting-level for the // purposes of decoding. - preprocess_columns(chunks, pages, skip_rows, num_rows, has_lists); + preprocess_columns(chunks, pages, skip_rows, num_rows, uses_custom_row_bounds, has_lists); // decoding of column data itself decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows); @@ -1792,7 +1801,12 @@ reader::~reader() = default; // Forward to implementation table_with_metadata reader::read(parquet_reader_options const& options) { - return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_row_groups()); + // if the user has specified custom row bounds + bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; + return _impl->read(options.get_skip_rows(), + options.get_num_rows(), + uses_custom_row_bounds, + options.get_row_groups()); } } // namespace parquet diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index 06380fbe325..f4366cd1258 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -71,12 +71,15 @@ class reader::impl { * * @param skip_rows Number of rows to skip from the start * @param num_rows Number of rows to read - * @param row_group_indices TODO + * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific + * bounds + * @param row_group_indices Lists of row groups to read, one per source * * @return The set of columns along with metadata */ table_with_metadata read(size_type skip_rows, size_type num_rows, + bool uses_custom_row_bounds, std::vector> const& row_group_indices); private: @@ -154,17 +157,20 @@ class reader::impl { * * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders) * - * @param[in,out] chunks All chunks to be decoded - * @param[in,out] pages All pages to be decoded - * @param[in] min_rows crop all rows below min_row - * @param[in] total_rows Maximum number of rows to read - * @param[in] has_lists Whether or not this data contains lists and requires + * @param chunks All chunks to be decoded + * @param pages All pages to be decoded + * @param min_rows crop all rows below min_row + * @param total_rows Maximum number of rows to read + * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific + * bounds + * @param has_lists Whether or not this data contains lists and requires * a preprocess. 
*/ void preprocess_columns(hostdevice_vector& chunks, hostdevice_vector& pages, size_t min_row, size_t total_rows, + bool uses_custom_row_bounds, bool has_lists); /** From 6a07e753e0acb5c48dd20909696665f34ff312c7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 21 Jul 2022 16:39:35 -0500 Subject: [PATCH 17/31] Deprecate `skiprows` & `num_rows` in parquet reader (#11218) This PR: - [x] Deprecates `skiprows` & `num_rows` from cudf parquet reader (`cudf.read_parquet`) since these parameters are adding to a lot of overhead incase of nested types and also not supported in `pd.read_parquet` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Richard (Rick) Zamora (https://github.com/rjzamora) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11218 --- python/cudf/cudf/io/parquet.py | 12 ++++++++++++ python/cudf/cudf/tests/test_parquet.py | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 51c2ac8b828..5a181dc076c 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -371,6 +371,18 @@ def read_parquet( ): """{docstring}""" + if skiprows is not None: + warnings.warn( + "skiprows is deprecated and will be removed.", + FutureWarning, + ) + + if num_rows is not None: + warnings.warn( + "num_rows is deprecated and will be removed.", + FutureWarning, + ) + # Do not allow the user to set file-opening options # when `use_python_file_object=False` is specified if use_python_file_object is False: diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index dff871276a8..973f8c75553 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -618,6 +618,12 @@ def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size): assert_eq(ref_df, gdf) +@pytest.mark.filterwarnings( + "ignore:skiprows is deprecated and will be removed." +) +@pytest.mark.filterwarnings( + "ignore:num_rows is deprecated and will be removed." +) @pytest.mark.parametrize("row_group_size", [1, 4, 33]) def test_parquet_read_rows(tmpdir, pdf, row_group_size): if len(pdf) > 100: @@ -702,6 +708,12 @@ def test_parquet_reader_invalids(tmpdir): assert_eq(expect, got) +@pytest.mark.filterwarnings( + "ignore:skiprows is deprecated and will be removed." +) +@pytest.mark.filterwarnings( + "ignore:num_rows is deprecated and will be removed." +) def test_parquet_chunked_skiprows(tmpdir): processed = 0 batch = 10000 @@ -1120,6 +1132,9 @@ def test_parquet_reader_list_large_multi_rowgroup_nulls(tmpdir): assert_eq(expect, got) +@pytest.mark.filterwarnings( + "ignore:skiprows is deprecated and will be removed." +) @pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_skiprows(skip, tmpdir): num_rows = 10 @@ -1142,6 +1157,12 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): assert pa.Table.from_pandas(expect).equals(got.to_arrow()) +@pytest.mark.filterwarnings( + "ignore:skiprows is deprecated and will be removed." +) +@pytest.mark.filterwarnings( + "ignore:num_rows is deprecated and will be removed." 
+) @pytest.mark.parametrize("skip", [0, 1, 5, 10]) def test_parquet_reader_list_num_rows(skip, tmpdir): num_rows = 20 From fda920c262ce1e79d4e234b44b997ed9e6781541 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 21 Jul 2022 16:50:56 -0500 Subject: [PATCH 18/31] Fix issue related to numpy array and `category` dtype (#11282) Fixes: #11256 This PR fixes an issue with type casting when non-numpy dtypes are passed into the column constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/11282 --- python/cudf/cudf/core/column/column.py | 24 +++++++++++++++++++--- python/cudf/cudf/tests/test_categorical.py | 1 + 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 6a4c36b27e2..bd17cb4ede9 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -901,7 +901,13 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: sr = cudf.Series(self) # Re-label self w.r.t. the provided categories - if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)): + if ( + isinstance(dtype, cudf.CategoricalDtype) + and dtype._categories is not None + ) or ( + isinstance(dtype, pd.CategoricalDtype) + and dtype.categories is not None + ): labels = sr._label_encoding(cats=dtype.categories) if "ordered" in kwargs: warnings.warn( @@ -910,7 +916,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: ) return build_categorical_column( - categories=dtype.categories, + categories=as_column(dtype.categories), codes=labels._column, mask=self.mask, ordered=dtype.ordered, @@ -1863,8 +1869,17 @@ def as_column( if not arbitrary.flags["C_CONTIGUOUS"]: arbitrary = np.ascontiguousarray(arbitrary) + delayed_cast = False if dtype is not None: - arbitrary = arbitrary.astype(np.dtype(dtype)) + try: + dtype = np.dtype(dtype) + except TypeError: + # Some `dtype`'s can't be parsed by `np.dtype` + # for which we will have to cast after the column + # has been constructed. + delayed_cast = True + else: + arbitrary = arbitrary.astype(dtype) if arb_dtype.kind == "M": @@ -1938,6 +1953,9 @@ def as_column( else: data = as_column(cupy.asarray(arbitrary), nan_as_null=nan_as_null) + if delayed_cast: + data = data.astype(cudf.dtype(dtype)) + elif isinstance(arbitrary, pd.core.arrays.numpy_.PandasArray): if is_categorical_dtype(arbitrary.dtype): arb_dtype = arbitrary.dtype diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py index 61f09c39123..2827abdb7a1 100644 --- a/python/cudf/cudf/tests/test_categorical.py +++ b/python/cudf/cudf/tests/test_categorical.py @@ -640,6 +640,7 @@ def test_categorical_set_categories_categoricals(data, new_categories): pd.Series([1, 2, 3, -4], dtype="int64"), pd.Series([1, 2, 3, 4], dtype="uint64"), pd.Series([1, 2.3, 3, 4], dtype="float"), + np.asarray([0, 2, 1]), [None, 1, None, 2, None], [], ], From 7580153ed34cb7d088bfcd355c441a36d433c036 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Fri, 22 Jul 2022 04:14:18 +0530 Subject: [PATCH 19/31] Add 24 bit dictionary support to Parquet writer (#11216) Closes #10948 Adds support for dictionary encoding with 24 bit indices. 
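
For reference, a small standalone sketch of the bit-width selection described above, including the new 24-bit bucket (the helper names are invented for illustration; the real logic lives in `writer_impl.cu` in the diff below):

```cpp
// Illustrative sketch of choosing an RLE bit width for dictionary indices.
#include <algorithm>
#include <array>
#include <cstdint>

// Bits needed to represent max_index (0 maps to 0; the ceil below bumps it to 1).
int required_bits(std::uint32_t max_index)
{
  int bits = 0;
  while (max_index > 0) { ++bits; max_index >>= 1; }
  return bits;
}

// Returns 0 when dictionary encoding cannot be used (indices exceed 24 bits).
int dictionary_rle_bits(std::uint32_t num_dict_entries)
{
  std::uint32_t const max_index = num_dict_entries > 0 ? num_dict_entries - 1 : 0;
  int const nbits = required_bits(max_index);
  if (nbits > 24) { return 0; }  // too many distinct values: fall back to plain encoding

  // Only these widths are supported by the compute-optimized RLE path.
  constexpr std::array<int, 7> allowed{1, 2, 4, 8, 12, 16, 24};
  return *std::lower_bound(allowed.begin(), allowed.end(), nbits);
}
```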
Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/11216 --- cpp/src/io/parquet/page_enc.cu | 21 ++++++++++++++++----- cpp/src/io/parquet/parquet_gpu.hpp | 7 +++---- cpp/src/io/parquet/writer_impl.cu | 14 ++++++++------ 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 3f22ead3bb1..2dd004740af 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -80,7 +80,7 @@ struct page_enc_state_s { EncPage page; EncColumnChunk ck; parquet_column_device_view col; - uint16_t vals[rle_buffer_size]; + uint32_t vals[rle_buffer_size]; }; /** @@ -433,8 +433,9 @@ __global__ void __launch_bounds__(128) * @brief Mask table representing how many consecutive repeats are needed to code a repeat run *[nbits-1] */ -static __device__ __constant__ uint32_t kRleRunMask[16] = { - 0x00ffffff, 0x0fff, 0x00ff, 0x3f, 0x0f, 0x0f, 0x7, 0x7, 0x3, 0x3, 0x3, 0x3, 0x1, 0x1, 0x1, 0x1}; +static __device__ __constant__ uint32_t kRleRunMask[24] = { + 0x00ffffff, 0x0fff, 0x00ff, 0x3f, 0x0f, 0x0f, 0x7, 0x7, 0x3, 0x3, 0x3, 0x3, + 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1}; /** * @brief Variable-length encode an integer @@ -455,7 +456,7 @@ inline __device__ uint8_t* VlqEncode(uint8_t* p, uint32_t v) inline __device__ void PackLiterals( uint8_t* dst, uint32_t v, uint32_t count, uint32_t w, uint32_t t) { - if (w == 1 || w == 2 || w == 4 || w == 8 || w == 12 || w == 16) { + if (w == 1 || w == 2 || w == 4 || w == 8 || w == 12 || w == 16 || w == 24) { if (t <= (count | 0x1f)) { if (w == 1 || w == 2 || w == 4) { uint32_t mask = 0; @@ -491,11 +492,18 @@ inline __device__ void PackLiterals( dst[t * 2 + 1] = v >> 8; } return; + } else if (w == 24) { + if (t < count) { + dst[t * 3 + 0] = v; + dst[t * 3 + 1] = v >> 8; + dst[t * 3 + 2] = v >> 16; + } + return; } } else { return; } - } else { + } else if (w <= 16) { // Scratch space to temporarily write to. Needed because we will use atomics to write 32 bit // words but the destination mem may not be a multiple of 4 bytes. // TODO (dm): This assumes blockdim = 128 and max bits per value = 16. Reduce magic numbers. @@ -524,6 +532,8 @@ inline __device__ void PackLiterals( if (t < available_bytes) { dst[t] = scratch_bytes[t]; } if (t + 128 < available_bytes) { dst[t + 128] = scratch_bytes[t + 128]; } __syncthreads(); + } else { + CUDF_UNREACHABLE("Unsupported bit width"); } } @@ -569,6 +579,7 @@ static __device__ void RleEncode( uint8_t* dst = VlqEncode(s->rle_out, rle_run); *dst++ = run_val; if (nbits > 8) { *dst++ = run_val >> 8; } + if (nbits > 16) { *dst++ = run_val >> 16; } s->rle_out = dst; } rle_run = 0; diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index f94ce574731..05522b52b19 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -45,9 +45,8 @@ namespace parquet { using cudf::io::detail::string_index_pair; -// Total number of unsigned 16 bit values -constexpr size_type MAX_DICT_SIZE = - std::numeric_limits::max() - std::numeric_limits::min() + 1; +// Total number of unsigned 24 bit values +constexpr size_type MAX_DICT_SIZE = (1 << 24) - 1; /** * @brief Struct representing an input column in the file. 
@@ -355,7 +354,7 @@ struct EncColumnChunk { uniq_data_size; //!< Size of dictionary page (set of all unique values) if dict enc is used size_type plain_data_size; //!< Size of data in this chunk if plain encoding is used size_type* dict_data; //!< Dictionary data (unique row indices) - uint16_t* dict_index; //!< Index of value in dictionary page. column[dict_data[dict_index[row]]] + size_type* dict_index; //!< Index of value in dictionary page. column[dict_data[dict_index[row]]] uint8_t dict_rle_bits; //!< Bit size for encoding dictionary indices bool use_dictionary; //!< True if the chunk uses dictionary encoding }; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 70f4201c04b..caab87a9c7c 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -938,7 +938,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, auto h_chunks = chunks.host_view().flat_view(); std::vector> dict_data; - std::vector> dict_index; + std::vector> dict_index; if (h_chunks.size() == 0) { return std::pair(std::move(dict_data), std::move(dict_index)); } @@ -950,7 +950,9 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, chunk.use_dictionary = false; } else { chunk.use_dictionary = true; - auto& inserted_map = hash_maps_storage.emplace_back(chunk.num_values, stream); + // cuCollections suggests using a hash map of size N * (1/0.7) = num_values * 1.43 + // https://github.com/NVIDIA/cuCollections/blob/3a49fc71/include/cuco/static_map.cuh#L190-L193 + auto& inserted_map = hash_maps_storage.emplace_back(chunk.num_values * 1.43, stream); chunk.dict_map_slots = inserted_map.data(); chunk.dict_map_size = inserted_map.size(); } @@ -974,14 +976,14 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, auto max_dict_index = (ck.num_dict_entries > 0) ? ck.num_dict_entries - 1 : 0; auto nbits = CompactProtocolReader::NumRequiredBits(max_dict_index); - // We don't use dictionary if the indices are > 16 bits because that's the maximum bitpacking + // We don't use dictionary if the indices are > 24 bits because that's the maximum bitpacking // bitsize we efficiently support - if (nbits > 16) { return std::pair(false, 0); } + if (nbits > 24) { return std::pair(false, 0); } // Only these bit sizes are allowed for RLE encoding because it's compute optimized - constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16}; + constexpr auto allowed_bitsizes = std::array{1, 2, 4, 8, 12, 16, 24}; - // ceil to (1/2/4/8/12/16) + // ceil to (1/2/4/8/12/16/24) auto rle_bits = *std::lower_bound(allowed_bitsizes.begin(), allowed_bitsizes.end(), nbits); auto rle_byte_size = util::div_rounding_up_safe(ck.num_values * rle_bits, 8); From 719f4c8399f54340f51aeb37c79035dfbc3cc749 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 21 Jul 2022 16:10:47 -0700 Subject: [PATCH 20/31] Switch death test to use explicit trap. (#11326) This change should make the test fail reliably, whereas the current approach is flaky and leads to not infrequent test failures. 
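
As a rough, self-contained illustration (assuming an nvcc build; this program is not part of the patch) of why an explicit trap is deterministic where a stray out-of-bounds or misaligned write may or may not fault:

```cpp
#include <cstdio>
#include <cuda_runtime.h>

// The PTX `trap` instruction unconditionally aborts the kernel, so the
// following synchronize is guaranteed to report a sticky (fatal) error.
__global__ void always_fatal() { asm("trap;"); }

int main()
{
  always_fatal<<<1, 1>>>();
  cudaError_t const err = cudaDeviceSynchronize();
  // After a trap the CUDA context is corrupted; later calls keep failing too.
  std::printf("sync returned: %s\n", cudaGetErrorString(err));
  return 0;
}
```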
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Jake Hemstad (https://github.com/jrhemstad) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11326 --- cpp/tests/error/error_handling_test.cu | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu index 9d8e0a7fe64..a83f72bb9c8 100644 --- a/cpp/tests/error/error_handling_test.cu +++ b/cpp/tests/error/error_handling_test.cu @@ -83,16 +83,13 @@ TEST(StreamCheck, CatchFailedKernel) "invalid configuration argument"); } -__global__ void kernel(int* p) { *p = 42; } +__global__ void kernel() { asm("trap;"); } TEST(DeathTest, CudaFatalError) { testing::FLAGS_gtest_death_test_style = "threadsafe"; auto call_kernel = []() { - int* p; - cudaMalloc(&p, 2 * sizeof(int)); - int* misaligned = (int*)(reinterpret_cast(p) + 1); - kernel<<<1, 1>>>(misaligned); + kernel<<<1, 1>>>(); try { CUDF_CUDA_TRY(cudaDeviceSynchronize()); } catch (const cudf::fatal_cuda_error& fe) { From a541ffb46305dd6d9412fa3572e9349094af127d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 21 Jul 2022 19:06:43 -0700 Subject: [PATCH 21/31] Remove `lists::drop_list_duplicates` (#11236) This PR completely removes `cudf::lists::drop_list_duplicates`. It is replaced by the new API `cudf::list::distinct` which has a simpler implementation but better performance. The replacements for internal cudf usage have all been merged before thus there is no side effect or breaking for the existing APIs in this work. Closes https://github.com/rapidsai/cudf/issues/11114, https://github.com/rapidsai/cudf/issues/11093, https://github.com/rapidsai/cudf/issues/11053, https://github.com/rapidsai/cudf/issues/11034, and closes https://github.com/rapidsai/cudf/issues/9257. 
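
For downstream callers of the keys-only overload, a hedged migration sketch (the `cudf::lists::distinct` header and defaulted null/NaN parameters assumed here come from the replacement API named above and are not spelled out in this patch):

```cpp
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/lists/stream_compaction.hpp>  // assumed home of cudf::lists::distinct

#include <memory>

std::unique_ptr<cudf::column> dedupe_each_list(cudf::column_view const& lists_col)
{
  // Before: cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists_col});
  // After:  element order within each output list is still not guaranteed.
  return cudf::lists::distinct(cudf::lists_column_view{lists_col});
}
```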
Depends on: * https://github.com/rapidsai/cudf/pull/11228 * https://github.com/rapidsai/cudf/pull/11149 * https://github.com/rapidsai/cudf/pull/11234 * https://github.com/rapidsai/cudf/pull/11233 Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jordan Jacobelli (https://github.com/Ethyling) - Robert Maynard (https://github.com/robertmaynard) - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11236 --- conda/recipes/libcudf/meta.yaml | 2 - cpp/CMakeLists.txt | 1 - .../lists/detail/drop_list_duplicates.hpp | 59 -- .../cudf/lists/drop_list_duplicates.hpp | 119 --- cpp/src/lists/drop_list_duplicates.cu | 638 ------------ cpp/tests/CMakeLists.txt | 1 - .../lists/drop_list_duplicates_tests.cpp | 921 ------------------ 7 files changed, 1741 deletions(-) delete mode 100644 cpp/include/cudf/lists/detail/drop_list_duplicates.hpp delete mode 100644 cpp/include/cudf/lists/drop_list_duplicates.hpp delete mode 100644 cpp/src/lists/drop_list_duplicates.cu delete mode 100644 cpp/tests/lists/drop_list_duplicates_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 693d986deaf..475c7977848 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -166,13 +166,11 @@ outputs: - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp - test -f $PREFIX/include/cudf/lists/detail/contains.hpp - test -f $PREFIX/include/cudf/lists/detail/copying.hpp - - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/extract.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/scatter_helper.cuh - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp - test -f $PREFIX/include/cudf/lists/detail/stream_compaction.hpp - - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/extract.hpp - test -f $PREFIX/include/cudf/lists/filling.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 20a25432038..4819d1c2f5c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -378,7 +378,6 @@ add_library( src/lists/copying/segmented_gather.cu src/lists/copying/scatter_helper.cu src/lists/count_elements.cu - src/lists/drop_list_duplicates.cu src/lists/explode.cu src/lists/extract.cu src/lists/interleave_columns.cu diff --git a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp b/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp deleted file mode 100644 index 8cde8c1708c..00000000000 --- a/cpp/include/cudf/lists/detail/drop_list_duplicates.hpp +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (c) 2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -#pragma once - -#include - -#include - -namespace cudf { -namespace lists { -namespace detail { -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * lists_column_view const&, - * duplicate_keep_option, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - duplicate_keep_option keep_option, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - * @param stream CUDA stream used for device memory operations and kernel launches. - */ -std::unique_ptr drop_list_duplicates( - lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -} // namespace detail -} // namespace lists -} // namespace cudf diff --git a/cpp/include/cudf/lists/drop_list_duplicates.hpp b/cpp/include/cudf/lists/drop_list_duplicates.hpp deleted file mode 100644 index 123ec69a7aa..00000000000 --- a/cpp/include/cudf/lists/drop_list_duplicates.hpp +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include - -#include - -namespace cudf { -namespace lists { -/** - * @addtogroup lists_drop_duplicates - * @{ - * @file - */ - -/** - * @brief Copy the elements from the lists in `keys` and associated `values` columns according to - * the unique elements in `keys`. - * - * For each list in `keys` and associated `values`, according to the parameter `keep_option`, copy - * the unique elements from the list in `keys` and their corresponding elements in `values` to new - * lists. Order of the output elements within each list are not guaranteed to be preserved as in the - * input. - * - * Behavior is undefined if `count_elements(keys)[i] != count_elements(values)[i]` for all `i` in - * `[0, keys.size())`. - * - * @throw cudf::logic_error If the child column of the input keys column contains nested type other - * than STRUCT. - * @throw cudf::logic_error If `keys.size() != values.size()`. - * - * @param keys The input keys lists column to check for uniqueness and copy unique elements. - * @param values The values lists column in which the elements are mapped to elements in the key - * column. - * @param nulls_equal Flag to specify whether null key elements should be considered as equal. - * @param nans_equal Flag to specify whether NaN key elements should be considered as equal - * (only applicable for floating point keys elements). 
- * @param keep_option Flag to specify which elements will be copied from the input to the output. - * @param mr Device resource used to allocate memory. - * - * @code{.pseudo} - * keys = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } - * values = { {"a", "b", "c", "d"}, {"e"}, NULL, {}, {"N0", "N1", "N2", "f", "g", "h", "i", "j"} } - * - * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_FIRST) - * out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } - * out_values = { {"a", "c", "d"}, {"e"}, NULL, {}, {"f", "g", "N0"} } - * - * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_LAST) - * out_keys = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } - * out_values = { {"b", "c", "d"}, {"e"}, NULL, {}, {"j", "i", "N2"} } - * - * [out_keys, out_values] = drop_list_duplicates(keys, values, duplicate_keep_option::KEEP_NONE) - * out_keys = { {2, 3}, {4}, NULL, {}, {} } - * out_values = { {"c", "d"}, {"e"}, NULL, {}, {} } - * @endcode - * - * @return A pair of lists columns storing the results from extracting unique key elements and their - * corresponding values elements from the input. - */ -std::pair, std::unique_ptr> drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - duplicate_keep_option keep_option = duplicate_keep_option::KEEP_FIRST, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Create a new list column by copying elements from the input lists column ignoring - * duplicate list elements. - * - * Given a lists column, an output lists column is generated by copying elements from the input - * lists column in a way such that the duplicate elements in each list are ignored, producing only - * unique list elements. - * - * Order of the output elements are not guaranteed to be preserved as in the input. - * - * @throw cudf::logic_error If the child column of the input lists column contains nested type other - * than STRUCT. - * - * @param input The input lists column to check and copy unique elements. - * @param nulls_equal Flag to specify whether null key elements should be considered as equal. - * @param nans_equal Flag to specify whether NaN key elements should be considered as equal - * (only applicable for floating point keys column). - * @param mr Device resource used to allocate memory. - * - * @code{.pseudo} - * input = { {1, 1, 2, 3}, {4}, NULL, {}, {NULL, NULL, NULL, 5, 6, 6, 6, 5} } - * drop_list_duplicates(input) = { {1, 2, 3}, {4}, NULL, {}, {5, 6, NULL} } - * @endcode - * - * @return A lists column storing the results from extracting unique list elements from the input. - */ -std::unique_ptr drop_list_duplicates( - lists_column_view const& input, - null_equality nulls_equal = null_equality::EQUAL, - nan_equality nans_equal = nan_equality::UNEQUAL, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** @} */ // end of group -} // namespace lists -} // namespace cudf diff --git a/cpp/src/lists/drop_list_duplicates.cu b/cpp/src/lists/drop_list_duplicates.cu deleted file mode 100644 index d0700f439ce..00000000000 --- a/cpp/src/lists/drop_list_duplicates.cu +++ /dev/null @@ -1,638 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - -namespace cudf::lists { -namespace detail { - -namespace { -template -struct has_negative_nans_fn { - column_device_view const d_view; - - has_negative_nans_fn(column_device_view const& d_view) : d_view(d_view) {} - - __device__ Type operator()(size_type idx) const noexcept - { - if (d_view.is_null(idx)) { return false; } - - auto const val = d_view.element(idx); - return std::isnan(val) && std::signbit(val); // std::signbit(x) == true if x is negative - } -}; - -/** - * @brief A structure to be used along with type_dispatcher to check if a column has any - * negative NaN value. - * - * This functor is necessary because when calling to segmented sort on the list entries, the - * negative NaN and positive NaN values (if both exist) are separated to the two ends of the output - * lists. We want to move all NaN values close together in order to call unique_copy later on. - */ -struct has_negative_nans_dispatch { - template >* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) const noexcept - { - auto const d_entries_ptr = column_device_view::create(input, stream); - return thrust::count_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.size()), - has_negative_nans_fn{*d_entries_ptr}); - } - - template >* = nullptr> - bool operator()(column_view const& input, rmm::cuda_stream_view stream) const - { - // Recursively check negative NaN on the children columns. - return std::any_of(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_children()), - [structs_view = structs_column_view{input}, stream](auto const child_idx) { - auto const col = structs_view.get_sliced_child(child_idx); - return type_dispatcher( - col.type(), has_negative_nans_dispatch{}, col, stream); - }); - } - - template && - !std::is_same_v>* = nullptr> - bool operator()(column_view const&, rmm::cuda_stream_view) const - { - // Non-nested columns of non floating-point data do not contain NaN. - // Nested columns (not STRUCT) are not supported and should not reach this point. - return false; - } -}; - -/** - * @brief A structure to be used along with type_dispatcher to replace -NaN by NaN for a - * floating-point data column. - * - * Replacing -NaN by NaN is necessary before calling to segmented sort for lists because the sorting - * API may separate -NaN and NaN to the two ends of each result list while we want to group all NaN - * together. - */ -struct replace_negative_nans_dispatch { - template >* = nullptr> - std::unique_ptr operator()(column_view const& input, - rmm::cuda_stream_view stream) const noexcept - { - return cuda::std::is_floating_point_v - ? 
cudf::detail::normalize_nans_and_zeros(input, stream) - : std::make_unique(input, stream); - } - - template >* = nullptr> - std::unique_ptr operator()(column_view const& input, - rmm::cuda_stream_view stream) const noexcept - { - std::vector> output_struct_members; - std::transform(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(input.num_children()), - std::back_inserter(output_struct_members), - [structs_view = structs_column_view{input}, stream](auto const child_idx) { - auto const col = structs_view.get_sliced_child(child_idx); - return type_dispatcher( - col.type(), replace_negative_nans_dispatch{}, col, stream); - }); - - return cudf::make_structs_column(input.size(), - std::move(output_struct_members), - input.null_count(), - cudf::detail::copy_bitmask(input, stream), - stream); - } -}; - -/** - * @brief Perform an equality comparison between two entries in a lists column, specialized from - * `cudf::element_equality_comparator` to take into account both parameters `nulls_equal` and - * `nans_equal` when comparing floating-point numbers. - * - * For the two entries that are NOT in the same list, they will always be considered as different. - * - * If they are from the same list and their type is not floating point, this functor will return the - * same comparison result as `cudf::element_equality_comparator`. - * - * For floating-point types, entries holding NaN value can be considered as different or the same - * value depending on the `nans_equal` parameter. - */ -template -struct column_row_comparator_fn { - size_type const* const list_indices; - column_device_view const lhs; - column_device_view const rhs; - null_equality const nulls_equal; - bool const has_nulls; - bool const nans_equal; - - __host__ __device__ column_row_comparator_fn(size_type const* const list_indices, - column_device_view const& lhs, - column_device_view const& rhs, - null_equality const nulls_equal, - bool const has_nulls, - bool const nans_equal) - : list_indices(list_indices), - lhs(lhs), - rhs(rhs), - nulls_equal(nulls_equal), - has_nulls(has_nulls), - nans_equal(nans_equal) - { - } - - template >* = nullptr> - bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept - { - return lhs_val == rhs_val; - } - - template >* = nullptr> - bool __device__ compare(T const& lhs_val, T const& rhs_val) const noexcept - { - // If both element(i) and element(j) are NaNs and NaNs are considered as equal value then this - // comparison will return `true`. This is the desired behavior in Pandas. - if (nans_equal && std::isnan(lhs_val) && std::isnan(rhs_val)) { return true; } - - // If NaNs are considered as NOT equal, even both element(i) and element(j) are NaNs this - // comparison will still return `false`. This is the desired behavior in Apache Spark. - return lhs_val == rhs_val; - } - - bool __device__ operator()(size_type i, size_type j) const noexcept - { - // Two entries are not considered for equality if they belong to different lists. - if (list_indices[i] != list_indices[j]) { return false; } - - if (has_nulls) { - bool const lhs_is_null{lhs.nullable() && lhs.is_null_nocheck(i)}; - bool const rhs_is_null{rhs.nullable() && rhs.is_null_nocheck(j)}; - if (lhs_is_null && rhs_is_null) { - return nulls_equal == null_equality::EQUAL; - } else if (lhs_is_null != rhs_is_null) { - return false; - } - } - - return compare(lhs.element(i), lhs.element(j)); - } -}; - -/** - * @brief Struct used in type_dispatcher for comparing two entries in a lists column. 
- */ -struct column_row_comparator_dispatch { - size_type const* const list_indices; - column_device_view const lhs; - column_device_view const rhs; - null_equality const nulls_equal; - bool const has_nulls; - bool const nans_equal; - - __device__ column_row_comparator_dispatch(size_type const* const list_indices, - column_device_view const& lhs, - column_device_view const& rhs, - null_equality const nulls_equal, - bool const has_nulls, - bool const nans_equal) - : list_indices(list_indices), - lhs(lhs), - rhs(rhs), - nulls_equal(nulls_equal), - has_nulls(has_nulls), - nans_equal(nans_equal) - { - } - - template ()>* = nullptr> - bool __device__ operator()(size_type i, size_type j) const noexcept - { - return column_row_comparator_fn{ - list_indices, lhs, rhs, nulls_equal, has_nulls, nans_equal}(i, j); - } - - template ()>* = nullptr> - bool operator()(size_type, size_type) const - { - CUDF_FAIL( - "column_row_comparator_dispatch cannot operate on types that are not equally comparable."); - } -}; - -/** - * @brief Performs an equality comparison between rows of two tables using - * `column_row_comparator_fn` functor to compare rows of their corresponding columns. - */ -struct table_row_comparator_fn { - size_type const* const list_indices; - table_device_view const lhs; - table_device_view const rhs; - null_equality const nulls_equal; - bool const has_nulls; - bool const nans_equal; - - table_row_comparator_fn(size_type const* const list_indices, - table_device_view const& lhs, - table_device_view const& rhs, - null_equality const nulls_equal, - bool const has_nulls, - bool const nans_equal) - : list_indices(list_indices), - lhs(lhs), - rhs(rhs), - nulls_equal(nulls_equal), - has_nulls(has_nulls), - nans_equal(nans_equal) - { - } - - bool __device__ operator()(size_type i, size_type j) const - { - auto column_comp = [=](column_device_view const& lhs, column_device_view const& rhs) { - return type_dispatcher( - lhs.type(), - column_row_comparator_dispatch{list_indices, lhs, rhs, nulls_equal, has_nulls, nans_equal}, - i, - j); - }; - - return thrust::equal(thrust::seq, lhs.begin(), lhs.end(), rhs.begin(), column_comp); - } -}; - -/** - * @brief Struct used in type_dispatcher for copying indices of the list entries ignoring duplicate - * list entries. 
- */ -struct get_indices_of_unique_entries_dispatch { - template () && - !std::is_same_v>* = nullptr> - size_type* operator()(size_type const*, - column_view const&, - size_type, - size_type*, - null_equality, - nan_equality, - duplicate_keep_option, - rmm::cuda_stream_view) const - { - CUDF_FAIL( - "get_indices_of_unique_entries_dispatch cannot operate on types that are not equally " - "comparable or not STRUCT type."); - } - - template ()>* = nullptr> - size_type* operator()(size_type const* list_indices, - column_view const& all_lists_entries, - size_type num_entries, - size_type* output_begin, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream) const noexcept - { - auto const d_view = column_device_view::create(all_lists_entries, stream); - auto const comp = column_row_comparator_fn{list_indices, - *d_view, - *d_view, - nulls_equal, - all_lists_entries.has_nulls(), - nans_equal == nan_equality::ALL_EQUAL}; - return cudf::detail::unique_copy(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp, - keep_option, - stream); - } - - template >* = nullptr> - size_type* operator()(size_type const* list_indices, - column_view const& all_lists_entries, - size_type num_entries, - size_type* output_begin, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream) const noexcept - { - auto const flattened_entries = cudf::structs::detail::flatten_nested_columns( - table_view{{all_lists_entries}}, {order::ASCENDING}, {null_order::AFTER}, {}); - auto const dview_ptr = table_device_view::create(flattened_entries, stream); - // Search through children of all levels for nulls. - auto const nested_has_nulls = has_nulls(flattened_entries.flattened_columns()); - - auto const comp = table_row_comparator_fn{list_indices, - *dview_ptr, - *dview_ptr, - nulls_equal, - nested_has_nulls, - nans_equal == nan_equality::ALL_EQUAL}; - return cudf::detail::unique_copy(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_entries), - output_begin, - comp, - keep_option, - stream); - } -}; - -/** - * @brief Extract list entries and their corresponding (1-based) list indices ignoring duplicate - * entries. - */ -std::vector> get_unique_entries_and_list_indices( - column_view const& keys_entries, - std::optional const& values_entries, - device_span entries_list_indices, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - auto const num_entries = keys_entries.size(); - - // Allocate memory to store the indices of the unique key entries. - // These indices will be used as a gather map to collect keys and values. - auto unique_indices = rmm::device_uvector(num_entries, stream); - auto const output_begin = unique_indices.begin(); - auto const output_end = type_dispatcher(keys_entries.type(), - get_indices_of_unique_entries_dispatch{}, - entries_list_indices.begin(), - keys_entries, - num_entries, - output_begin, - nulls_equal, - nans_equal, - keep_option, - stream); - - auto const list_indices_view = column_view(data_type{type_to_id()}, - static_cast(entries_list_indices.size()), - entries_list_indices.data()); - auto const input_table = values_entries - ? 
table_view{{keys_entries, values_entries.value(), list_indices_view}} - : table_view{{keys_entries, list_indices_view}}; - - // Collect unique entries and entry list indices. - // The new null_count and bitmask of the unique entries will also be generated by the gather - // function. - return cudf::detail::gather(input_table, - device_span( - unique_indices.data(), thrust::distance(output_begin, output_end)), - cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream, - mr) - ->release(); -} - -/** - * @brief Common execution code called by all public `drop_list_duplicates` APIs. - */ -std::pair, std::unique_ptr> drop_list_duplicates_common( - lists_column_view const& keys, - std::optional const& values, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - if (auto const child_type = keys.child().type(); - cudf::is_nested(child_type) && child_type.id() != type_id::STRUCT) { - CUDF_FAIL( - "Keys of nested types other than STRUCT are not supported in `drop_list_duplicates`."); - } - - CUDF_EXPECTS(!values || keys.size() == values.value().size(), - "Keys and values columns must have the same size."); - - if (keys.is_empty()) { - return std::pair{cudf::empty_like(keys.parent()), - values ? cudf::empty_like(values.value().parent()) : nullptr}; - } - - // The child column containing list entries. - auto const keys_child = keys.get_sliced_child(stream); - - // Generate a mapping from list entries to their list indices for the keys column. - auto const entries_list_indices = [&] { - auto labels = rmm::device_uvector(keys_child.size(), stream); - cudf::detail::label_segments( - keys.offsets_begin(), keys.offsets_end(), labels.begin(), labels.end(), stream); - return labels; - }(); - - // Generate segmented sorted order for key entries. - // The keys column will be sorted (gathered) using this order. - auto const sorted_order = [&]() { - auto const list_indices_view = column_view(data_type{type_to_id()}, - static_cast(entries_list_indices.size()), - entries_list_indices.data()); - - // If nans_equal == ALL_EQUAL and the keys column contains floating-point data type, - // we need to replace `-NaN` by `NaN` before sorting. - auto const replace_negative_nan = - nans_equal == nan_equality::ALL_EQUAL && - type_dispatcher(keys_child.type(), has_negative_nans_dispatch{}, keys_child, stream); - - if (replace_negative_nan) { - auto const replaced_nan_keys_child = - type_dispatcher(keys_child.type(), replace_negative_nans_dispatch{}, keys_child, stream); - return cudf::detail::stable_sorted_order( - table_view{{list_indices_view, replaced_nan_keys_child->view()}}, - {order::ASCENDING, order::ASCENDING}, - {null_order::AFTER, null_order::AFTER}, - stream); - } else { - return cudf::detail::stable_sorted_order(table_view{{list_indices_view, keys_child}}, - {order::ASCENDING, order::ASCENDING}, - {null_order::AFTER, null_order::AFTER}, - stream); - } - }(); - - auto const sorting_table = values - ? table_view{{keys_child, values.value().get_sliced_child(stream)}} - : table_view{{keys_child}}; - auto const sorted_table = cudf::detail::gather(sorting_table, - sorted_order->view(), - out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED, - stream); - - // Extract the segmented sorted key entries. - auto const sorted_keys_entries = sorted_table->get_column(0).view(); - auto const sorted_values_entries = - values ? 
std::optional(sorted_table->get_column(1).view()) : std::nullopt; - - // Generate child columns containing unique entries (along with their list indices). - // null_count and bitmask of these columns will also be generated in this function. - auto unique_entries_and_list_indices = get_unique_entries_and_list_indices(sorted_keys_entries, - sorted_values_entries, - entries_list_indices, - nulls_equal, - nans_equal, - keep_option, - stream, - mr); - - // Generate offsets for the output lists column(s). - auto output_offsets = [&] { - auto out_offsets = make_numeric_column( - data_type{type_to_id()}, keys.size() + 1, mask_state::UNALLOCATED, stream, mr); - auto const offsets = out_offsets->mutable_view(); - auto const labels = - unique_entries_and_list_indices.back()->view(); // unique entries' list indices - cudf::detail::labels_to_offsets(labels.template begin(), - labels.template end(), - offsets.template begin(), - offsets.template end(), - stream); - return out_offsets; - }(); - - // If the values lists column is not given, its corresponding output will be nullptr. - auto out_values = - values ? make_lists_column(keys.size(), - std::make_unique(output_offsets->view(), stream, mr), - std::move(unique_entries_and_list_indices[1]), - values.value().null_count(), - cudf::detail::copy_bitmask(values.value().parent(), stream, mr), - stream, - mr) - : nullptr; - - auto out_keys = make_lists_column(keys.size(), - std::move(output_offsets), - std::move(unique_entries_and_list_indices[0]), - keys.null_count(), - cudf::detail::copy_bitmask(keys.parent(), stream, mr), - stream, - mr); - - return std::pair{std::move(out_keys), std::move(out_values)}; -} - -} // anonymous namespace - -std::pair, std::unique_ptr> drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - null_equality nulls_equal, - nan_equality nans_equal, - duplicate_keep_option keep_option, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return drop_list_duplicates_common(keys, - std::optional(values), - nulls_equal, - nans_equal, - keep_option, - stream, - mr); -} - -std::unique_ptr drop_list_duplicates(lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - return drop_list_duplicates_common(input, - std::nullopt, - nulls_equal, - nans_equal, - duplicate_keep_option::KEEP_FIRST, - stream, - mr) - .first; -} - -} // namespace detail - -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * lists_column_view const&, - * duplicate_keep_option, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - */ -std::pair, std::unique_ptr> drop_list_duplicates( - lists_column_view const& keys, - lists_column_view const& values, - duplicate_keep_option keep_option, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates( - keys, values, nulls_equal, nans_equal, keep_option, cudf::default_stream_value, mr); -} - -/** - * @copydoc cudf::lists::drop_list_duplicates(lists_column_view const&, - * null_equality, - * nan_equality, - * rmm::mr::device_memory_resource*) - */ -std::unique_ptr drop_list_duplicates(lists_column_view const& input, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - return detail::drop_list_duplicates( - input, nulls_equal, nans_equal, 
cudf::default_stream_value, mr); -} - -} // namespace cudf::lists diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a1e3cfed286..d00fa6633de 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -477,7 +477,6 @@ ConfigureTest( lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp lists/count_elements_tests.cpp - lists/drop_list_duplicates_tests.cpp lists/explode_tests.cpp lists/extract_tests.cpp lists/sequences_tests.cpp diff --git a/cpp/tests/lists/drop_list_duplicates_tests.cpp b/cpp/tests/lists/drop_list_duplicates_tests.cpp deleted file mode 100644 index 54d7ba0a95e..00000000000 --- a/cpp/tests/lists/drop_list_duplicates_tests.cpp +++ /dev/null @@ -1,921 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include -#include -#include - -#include - -#include -#include - -using namespace cudf::test::iterators; - -using float_type = float; -using IntListsCol = cudf::test::lists_column_wrapper; -using FloatListsCol = cudf::test::lists_column_wrapper; -using StrListsCol = cudf::test::lists_column_wrapper; -using StringsCol = cudf::test::strings_column_wrapper; -using StructsCol = cudf::test::structs_column_wrapper; -using IntsCol = cudf::test::fixed_width_column_wrapper; -using FloatsCol = cudf::test::fixed_width_column_wrapper; - -auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr neg_Inf = -std::numeric_limits::infinity(); -auto constexpr NaN = std::numeric_limits::quiet_NaN(); -auto constexpr Inf = std::numeric_limits::infinity(); -auto constexpr verbosity = cudf::test::debug_output_level::FIRST_ERROR; - -struct DropListDuplicatesTest : public cudf::test::BaseFixture { -}; - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithSignedZero) -{ - // -0.0 and 0.0 should be considered equal. - auto const keys = FloatListsCol{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; - auto const vals = - StrListsCol{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14"}; - auto const expected_keys = FloatListsCol{0, 1, 2, 3}; - - // Remove duplicates only from keys. - { - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_keys, verbosity); - } - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = StrListsCol{"1", "2", "3", "14"}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. 
- { - auto const expected_vals = StrListsCol{"13", "8", "9", "14"}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = FloatListsCol{3}; - auto const expected_vals = StrListsCol{"14"}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } -} - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInf) -{ - auto const keys = FloatListsCol{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; - auto const vals = IntListsCol{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; - auto const expected_keys = FloatListsCol{neg_Inf, 0, Inf}; - - // Remove duplicates only from keys. - { - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_keys, verbosity); - } - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = IntListsCol{3, 2, 1}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = IntListsCol{11, 10, 9}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = FloatListsCol{FloatListsCol{}}; - auto const expected_vals = IntListsCol{IntListsCol{}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - // exit(0); -} - -// The position of NaN is undefined after sorting, thus we need to offload the data to CPU to -// check for validity. -// We will not store NaN in the results_expected variable (an unordered_set) because we can't check -// for NaN existence in a set. Instead, we will count the number of NaNs in the input and compare -// with the number of NaNs in the output. -static void test_floating_point(std::vector const& h_input, - std::unordered_set const& results_expected, - cudf::nan_equality nans_equal) -{ - // If NaNs are considered as equal value, the final result should always contain at max ONE NaN - // entry per list. 
- std::size_t const num_NaNs = - nans_equal == cudf::nan_equality::ALL_EQUAL - ? std::size_t{1} - : std::count_if(h_input.begin(), h_input.end(), [](auto x) { return std::isnan(x); }); - - auto const results_col = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{FloatListsCol(h_input.begin(), h_input.end())}, - cudf::null_equality::EQUAL, - nans_equal); - auto const results_arr = - cudf::test::to_host(cudf::lists_column_view(results_col->view()).child()).first; - - EXPECT_EQ(results_arr.size(), results_expected.size() + num_NaNs); - - std::size_t NaN_count{0}; - std::unordered_set results; - for (auto const x : results_arr) { - if (std::isnan(x)) { - ++NaN_count; - } else { - results.insert(x); - } - } - EXPECT_TRUE(results_expected.size() == results.size() && NaN_count == num_NaNs); -} - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithNaNs) -{ - std::vector h_input{ - 0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; - std::unordered_set results_expected{-2, -1, 0, 1, 2}; - test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); -} - -TEST_F(DropListDuplicatesTest, FloatingPointTestsWithInfsAndNaNs) -{ - std::vector h_input{neg_Inf, 0, neg_NaN, 1, -1, -2, NaN, NaN, Inf, NaN, - neg_NaN, 2, -1, 0, neg_NaN, 1, 2, Inf, 0, 1, - neg_Inf, 2, neg_NaN, Inf, neg_NaN, neg_NaN, NaN, neg_Inf}; - std::unordered_set results_expected{-2, -1, 0, 1, 2, neg_Inf, Inf}; - test_floating_point(h_input, results_expected, cudf::nan_equality::UNEQUAL); - test_floating_point(h_input, results_expected, cudf::nan_equality::ALL_EQUAL); -} - -TEST_F(DropListDuplicatesTest, StringTestsNonNull) -{ - // Trivial cases - empty input. - { - auto const lists = StrListsCol{{}}; - auto const expected = StrListsCol{{}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // No duplicate entry. - { - auto const lists = StrListsCol{"this", "is", "a", "string"}; - auto const expected = StrListsCol{"a", "is", "string", "this"}; - auto const results = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // One list column. - { - auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; - auto const expected = StrListsCol{"a", "is", "string", "this"}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // One list column, input is a strings column with given non-default null_equality and - // nans_equality parameters. - { - auto const lists = StrListsCol{"this", "is", "is", "is", "a", "string", "string"}; - auto const expected = StrListsCol{"a", "is", "string", "this"}; - auto const results = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // Multiple lists column. 
- { - auto const lists = - StrListsCol{StrListsCol{"this", "is", "a", "no duplicate", "string"}, - StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}, - StrListsCol{"this", "is", "is", "is", "a", "two duplicates", "string"}, - StrListsCol{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; - auto const expected = StrListsCol{StrListsCol{"a", "is", "no duplicate", "string", "this"}, - StrListsCol{"a", "is", "one duplicate", "string", "this"}, - StrListsCol{"a", "is", "string", "this", "two duplicates"}, - StrListsCol{"a", "is", "string", "this", "three duplicates"}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TEST_F(DropListDuplicatesTest, StringTestsWithNulls) -{ - auto const null = std::string(""); - - // One list column with null entries. - { - auto const lists = StrListsCol{ - {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; - auto const expected = StrListsCol{{"a", "is", "string", "this", null}, null_at(4)}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - // Multiple lists column with null lists and null entries - { - auto const lists = StrListsCol{ - {StrListsCol{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, - nulls_at({1, 3, 5, 7})}, - StrListsCol{}, /* NULL */ - StrListsCol{"this", "is", "is", "a", "one duplicate", "string"}}, - null_at(1)}; - auto const expected = - StrListsCol{{StrListsCol{{"a", "is", "no duplicate", "string", "this", null}, null_at(5)}, - StrListsCol{}, /* NULL */ - StrListsCol{"a", "is", "one duplicate", "string", "this"}}, - null_at(1)}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -template -struct DropListDuplicatesTypedTest : public cudf::test::BaseFixture { -}; - -using TypesForTest = - cudf::test::Concat; -TYPED_TEST_SUITE(DropListDuplicatesTypedTest, TypesForTest); - -TYPED_TEST(DropListDuplicatesTypedTest, InvalidInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - - // Nested types (except struct) are not supported. - EXPECT_THROW( - cudf::lists::drop_list_duplicates(cudf::lists_column_view{ListsCol{ListsCol{{1, 2}, {3}}}}), - cudf::logic_error); -} - -TYPED_TEST(DropListDuplicatesTypedTest, TrivialInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - - // Empty input. - { - auto const lists = ListsCol{}; - auto const expected = ListsCol{}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - - // All input lists are empty. 
- { - auto const lists = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; - auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - - // Trivial cases. - { - auto const lists = ListsCol{0, 1, 2, 3, 4, 5}; - auto const expected = ListsCol{0, 1, 2, 3, 4, 5}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - - // Multiple empty lists. - { - auto const lists = ListsCol{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; - auto const expected = ListsCol{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - - auto const [results_keys, results_vals] = cudf::lists::drop_list_duplicates( - cudf::lists_column_view{lists}, cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, NonNullInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - - // Adjacent lists containing the same entries. - { - auto const keys = - ListsCol{{1, 1, 1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 2, 2, 2}, {2, 2, 2, 2, 3, 3, 3, 3}}; - auto const vals = - ListsCol{{1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 3, 4, 5, 6, 7, 8}, {1, 2, 3, 4, 5, 6, 7, 8}}; - auto const expected_keys = ListsCol{{1}, {1, 2}, {2, 3}}; - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = ListsCol{{1}, {1, 6}, {1, 5}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = ListsCol{{8}, {5, 8}, {4, 8}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. 
- { - auto const expected = ListsCol{ListsCol{}, ListsCol{}, ListsCol{}}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected, verbosity); - } - } - - // Sliced list column. - auto const lists_original = - ListsCol{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; - auto const lists1 = cudf::slice(lists_original, {0, 5})[0]; - auto const lists2 = cudf::slice(lists_original, {1, 5})[0]; - auto const lists3 = cudf::slice(lists_original, {1, 3})[0]; - auto const lists4 = cudf::slice(lists_original, {0, 3})[0]; - - { - auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists_original}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3, 4}, {5}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } - - { - auto const expected = ListsCol{{1, 2, 3}, {1, 2, 3, 4}, {5}}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists4}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, WithNullInputTests) -{ - using ListsCol = cudf::test::lists_column_wrapper; - auto constexpr null = TypeParam{0}; - - // null entries and lists. - { - auto const keys = ListsCol{{{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, - nulls_at({2, 3})}; - auto const vals = - ListsCol{{ListsCol{{1, 2, null, 4, 5}, null_at(2)}, {1}, {}, {} /*NULL*/, {1, 2, 3}, {1, 2}}, - null_at(3)}; - auto const expected_keys = - ListsCol{{{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = - ListsCol{{ListsCol{{null, 2, 1, 4}, null_at(0)}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, - null_at(3)}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. 
- { - auto const expected_vals = - ListsCol{{ListsCol{5, 2, 1, 4}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, null_at(3)}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = - ListsCol{{{2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; - auto const expected_vals = - ListsCol{{ListsCol{2, 1, 4}, {1}, {}, {} /*NULL*/, {2, 3, 1}, {1, 2}}, null_at(3)}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_vals->view(), expected_vals, verbosity); - } - } - - // null entries are equal. - { - auto const keys = - ListsCol{{null, 1, null, 3, null, 5, null, 7, null, 9}, nulls_at({0, 2, 4, 6, 8})}; - auto const vals = ListsCol{{null, 1, 2, 3, 4, null, 6, 7, 8, null}, nulls_at({0, 5, 9})}; - auto const expected_keys = ListsCol{{1, 3, 5, 7, 9, null}, null_at(5)}; - - // Remove duplicates with KEEP_FIRST. - { - auto const expected_vals = ListsCol{{1, 3, null, 7, null, null}, nulls_at({2, 4, 5})}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_FIRST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_LAST. - { - auto const expected_vals = ListsCol{{1, 3, null, 7, null, 8}, nulls_at({2, 4})}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_LAST); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - - // Remove duplicates with KEEP_NONE. - { - auto const expected_keys = ListsCol{1, 3, 5, 7, 9}; - auto const expected_vals = ListsCol{{1, 3, null, 7, null}, nulls_at({2, 4})}; - auto const [results_keys, results_vals] = - cudf::lists::drop_list_duplicates(cudf::lists_column_view{keys}, - cudf::lists_column_view{vals}, - cudf::duplicate_keep_option::KEEP_NONE); - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results_keys->view(), expected_keys, verbosity); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results_vals->view(), expected_vals, verbosity); - } - } - - // null entries are not equal. 
- { - auto const lists = ListsCol{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; - auto const expected = - ListsCol{std::initializer_list{1, 3, 5, 7, 9, null, null, null, null, null}, - nulls_at({5, 6, 7, 8, 9})}; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}, - cudf::null_equality::UNEQUAL); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsNoNull) -{ - using ColWrapper = cudf::test::fixed_width_column_wrapper; - - auto const get_structs = [] { - auto child1 = ColWrapper{ - 1, 1, 1, 1, 1, 1, 1, 1, // list1 - 1, 1, 1, 1, 2, 1, 2, 2, // list2 - 2, 2, 2, 2, 3, 2, 3, 3 // list3 - }; - auto child2 = StringsCol{ - // begin list1 - "Banana", - "Mango", - "Apple", - "Cherry", - "Kiwi", - "Banana", - "Cherry", - "Kiwi", // end list1 - // begin list2 - "Bear", - "Duck", - "Cat", - "Dog", - "Panda", - "Bear", - "Cat", - "Panda", // end list2 - // begin list3 - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "ÁBC", - "XYZ", - "ÁÁÁ", - "ÁBC", - "XYZ" // end list3 - }; - return StructsCol{{child1, child2}}; - }; - - auto const get_structs_expected = [] { - auto child1 = ColWrapper{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; - auto child2 = StringsCol{ - // begin list1 - "Apple", - "Banana", - "Cherry", - "Kiwi", - "Mango", // end list1 - // begin list2 - "Bear", - "Cat", - "Dog", - "Duck", - "Cat", - "Panda", // end list2 - // begin list3 - "ÁBC", - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "XYZ", - "ÁBC" // end list3 - }; - return StructsCol{{child1, child2}}; - }; - - // Test full columns. - { - auto const lists = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); - } - - // Test sliced columns. 
- { - auto const lists_original = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected_original = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 17}.release(), get_structs_expected().release(), 0, {}); - auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; - auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfStructsHaveNull) -{ - using ColWrapper = cudf::test::fixed_width_column_wrapper; - auto constexpr XXX = int32_t{0}; // nulls at the parent structs column level - auto constexpr null = int32_t{0}; // nulls at the children columns level - - auto const get_structs = [] { - auto child1 = ColWrapper{{ - 1, 1, null, XXX, XXX, 1, 1, 1, // list1 - 1, 1, 1, 1, 2, 1, null, 2, // list2 - null, null, 2, 2, 3, 2, 3, 3 // list3 - }, - nulls_at({2, 14, 16, 17})}; - auto child2 = StringsCol{{ - // begin list1 - "Banana", - "Mango", - "Apple", - "XXX", /*NULL*/ - "XXX", /*NULL*/ - "Banana", - "Cherry", - "Kiwi", // end list1 - // begin list2 - "Bear", - "Duck", - "Cat", - "Dog", - "Panda", - "Bear", - "" /*NULL*/, - "Panda", // end list2 - // begin list3 - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÁBC", - "XYZ" // end list3 - }, - nulls_at({14, 20})}; - return StructsCol{{child1, child2}, nulls_at({3, 4})}; - }; - - auto const get_structs_expected = [] { - auto child1 = - ColWrapper{{1, 1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, - nulls_at({4, 5, 11, 18, 19})}; - auto child2 = StringsCol{{ - // begin list1 - "Banana", - "Cherry", - "Kiwi", - "Mango", - "Apple", - "XXX" /*NULL*/, // end list1 - // begin list2 - "Bear", - "Cat", - "Dog", - "Duck", - "Panda", - "" /*NULL*/, // end list2 - // begin list3 - "ÁBC", - "ÁÁÁ", - "ÍÍÍÍÍ", - "XYZ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÉÉÉÉÉ" // end list3 - }, - nulls_at({5, 11, 17})}; - return StructsCol{{child1, child2}, null_at(5)}; - }; - - // Test full columns. - { - auto const lists = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected = cudf::make_lists_column( - 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); - } - - // Test sliced columns. - { - auto const lists_original = - cudf::make_lists_column(3, IntsCol{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); - auto const expected_original = cudf::make_lists_column( - 3, IntsCol{0, 6, 12, 20}.release(), get_structs_expected().release(), 0, {}); - auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; - auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TYPED_TEST(DropListDuplicatesTypedTest, InputListsOfNestedStructsHaveNull) -{ - using ColWrapper = cudf::test::fixed_width_column_wrapper; - auto constexpr null = int32_t{0}; // nulls at the children columns level - // XXX and YYY are int placeholders for nulls at parent structs column level. 
- // We bring up two placeholders of different values to create intra null structs with - // children of different values, so as to test whether null_equality::EQUAL works or not. - auto constexpr XXX = int32_t{5}; - auto constexpr YYY = int32_t{6}; - - auto const get_nested_structs = [] { - auto grandchild1 = ColWrapper{{ - 1, XXX, null, XXX, YYY, 1, 1, 1, // list1 - 1, 1, 1, 1, 2, 1, null, 2, // list2 - null, null, 2, 2, 3, 2, 3, 3 // list3 - }, - nulls_at({2, 14, 16, 17})}; - auto grandchild2 = StringsCol{{ - // begin list1 - "Banana", - "YYY", /*NULL*/ - "Apple", - "XXX", /*NULL*/ - "YYY", /*NULL*/ - "Banana", - "Cherry", - "Kiwi", // end list1 - // begin list2 - "Bear", - "Duck", - "Cat", - "Dog", - "Panda", - "Bear", - "" /*NULL*/, - "Panda", // end list2 - // begin list3 - "ÁÁÁ", - "ÉÉÉÉÉ", - "ÍÍÍÍÍ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÁBC", - "XYZ" // end list3 - }, - nulls_at({14, 20})}; - auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({1, 3, 4})}; - return StructsCol{{child1}}; - }; - - auto const get_nested_struct_expected = [] { - auto grandchild1 = - ColWrapper{{1, 1, 1, null, XXX, 1, 1, 1, 1, 2, null, 2, 2, 2, 3, 3, 3, null, null}, - nulls_at({3, 4, 10, 17, 18})}; - auto grandchild2 = StringsCol{{ - // begin list1 - "Banana", - "Cherry", - "Kiwi", - "Apple", - "XXX" /*NULL*/, // end list1 - // begin list2 - "Bear", - "Cat", - "Dog", - "Duck", - "Panda", - "" /*NULL*/, // end list2 - // begin list3 - "ÁBC", - "ÁÁÁ", - "ÍÍÍÍÍ", - "XYZ", - "ÁBC", - "" /*NULL*/, - "ÁÁÁ", - "ÉÉÉÉÉ" // end list3 - }, - nulls_at({4, 10, 16})}; - auto child1 = StructsCol{{grandchild1, grandchild2}, nulls_at({4})}; - return StructsCol{{child1}}; - }; - - // Test full columns. - { - auto const lists = cudf::make_lists_column( - 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); - auto const expected = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists->view()}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected->view(), verbosity); - } - - // Test sliced columns. - { - auto const lists_original = cudf::make_lists_column( - 3, IntsCol{0, 8, 16, 24}.release(), get_nested_structs().release(), 0, {}); - auto const expected_original = cudf::make_lists_column( - 3, IntsCol{0, 5, 11, 19}.release(), get_nested_struct_expected().release(), 0, {}); - auto const lists = cudf::slice(lists_original->view(), {1, 3})[0]; - auto const expected = cudf::slice(expected_original->view(), {1, 3})[0]; - auto const results = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected, verbosity); - } -} - -TEST_F(DropListDuplicatesTest, SlicedInputListsOfStructsWithNaNs) -{ - auto const h_child = std::vector{ - 0, -1, 1, 0, 2, 0, 1, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; - - auto const get_structs = [&] { - // Two children are just identical. - auto child1 = FloatsCol(h_child.begin(), h_child.end()); - auto child2 = FloatsCol(h_child.begin(), h_child.end()); - return StructsCol{{child1, child2}}; - }; - - // The first list does not have any NaN or -NaN, while the second list has both. - // `drop_list_duplicates` is expected to operate properly on this second list. 
- auto const lists_original =
-   cudf::make_lists_column(2, IntsCol{0, 10, 18}.release(), get_structs().release(), 0, {});
- auto const lists2 = cudf::slice(lists_original->view(), {1, 2})[0];  // test on the second list
-
- // Contain expected vals excluding NaN.
- auto const results_children_expected = std::unordered_set{0, 1, 2};
-
- // Test for cudf::nan_equality::UNEQUAL.
- {
-   auto const results_col = cudf::lists::drop_list_duplicates(cudf::lists_column_view{lists2});
-   auto const child       = cudf::lists_column_view(results_col->view()).child();
-   auto const results_arr = cudf::test::to_host(child.child(0)).first;
-
-   std::size_t const num_NaNs =
-     std::count_if(h_child.begin(), h_child.end(), [](auto x) { return std::isnan(x); });
-   EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs);
-
-   std::size_t NaN_count{0};
-   std::unordered_set results;
-   for (auto const x : results_arr) {
-     if (std::isnan(x)) {
-       ++NaN_count;
-     } else {
-       results.insert(x);
-     }
-   }
-   EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs);
- }
-
- // Test for cudf::nan_equality::ALL_EQUAL.
- {
-   auto const results_col = cudf::lists::drop_list_duplicates(
-     cudf::lists_column_view{lists2}, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
-   auto const child       = cudf::lists_column_view(results_col->view()).child();
-   auto const results_arr = cudf::test::to_host(child.child(0)).first;
-
-   std::size_t const num_NaNs = 1;
-   EXPECT_EQ(results_arr.size(), results_children_expected.size() + num_NaNs);
-
-   std::size_t NaN_count{0};
-   std::unordered_set results;
-   for (auto const x : results_arr) {
-     if (std::isnan(x)) {
-       ++NaN_count;
-     } else {
-       results.insert(x);
-     }
-   }
-   EXPECT_TRUE(results_children_expected.size() == results.size() && NaN_count == num_NaNs);
- }
-}

From ebcea0fa1edba776a1631aac4b7e50b8127a3e4f Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 22 Jul 2022 16:28:54 +0200
Subject: [PATCH 22/31] Adds the Finite-State Transducer algorithm (#11242)

This PR adds a parallel _Finite-State Transducer_ (FST) algorithm. The FST is a key component of the nested JSON parser.

# Background

**An example of a Finite-State Transducer (FST), i.e. the algorithm we mimic**: [Slides from the JSON parser presentation, Slides 11-17](https://docs.google.com/presentation/d/1NTQdUMM44NzzHxLNnvcGLQk6pI-fdoM3cXqNqushMbU/edit?usp=sharing)

## Our GPU-based implementation

**The GPU-based algorithm builds on the following work:** [ParPaRaw: Massively Parallel Parsing of Delimiter-Separated Raw Data](https://arxiv.org/pdf/1905.13415.pdf)

**The following sections are of relevance:**
- Section 3.1
- Section 4.5 (i.e., the Multi-fragment in-register array)

**How the algorithm works is illustrated in the following presentation:** [ParPaRaw @ VLDB'20](https://eliasstehle.com/media/parparaw_vldb_2020.pdf#page=21)

## Relevant Data Structures

**A word about the motivation and need for the _Multi-fragment in-register array_:**

The composition of two state-transition vectors is a key operation in the prefix scan. Basically, this is what it does for two state-transition vectors `lhs` and `rhs`, both comprising `N` items:

```
for (int32_t i = 0; i < N; ++i) {
  result[i] = rhs[lhs[i]];
}
return result;
```

The relevant part is the indexing into `rhs`: `rhs[lhs[i]]`, i.e., the index is `lhs[i]`, a runtime value that isn't known at compile time.
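To make the composition concrete, here is a small, host-only C++ sketch (not part of this patch; the `StateVector` alias, the `compose` helper, and the `main` driver are illustrative assumptions) composing the two 3-state transition vectors used as the running example in the `VectorCompositeOp` documentation further below:

```
// Composes two state-transition vectors: first follow lhs, then follow rhs.
// The operation is associative, which is what allows a prefix scan over
// per-thread/per-block transition vectors.
#include <array>
#include <cassert>

using StateVector = std::array<int, 3>;  // one target state per DFA state

StateVector compose(StateVector const& lhs, StateVector const& rhs)
{
  StateVector result{};
  for (std::size_t i = 0; i < result.size(); ++i) {
    result[i] = rhs[lhs[i]];  // state i -> lhs[i] -> rhs[lhs[i]]
  }
  return result;
}

int main()
{
  StateVector const l{2, 1, 0};  // 0->2, 1->1, 2->0
  StateVector const r{1, 2, 2};  // 0->1, 1->2, 2->2
  auto const c = compose(l, r);  // 0->2->2, 1->1->2, 2->0->1
  assert((c == StateVector{2, 2, 1}));
  return 0;
}
```

Because the operator is associative, scanning the per-block transition vectors yields, for every block, the state the DFA is in after consuming all preceding input, which is then used to seed that block's simulation.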
It's important to understand that in CUB's prefix scan both `rhs` and `lhs` are thread-local variables. As such, they either live in the fast register file or in (slow off-chip) local memory. The register file has a shortcoming: it cannot be indexed dynamically. And here, we are dynamically indexing into `rhs`. So `rhs` will need to be spilled to local memory (backed by device memory) to allow for dynamic indexing. This would usually make the algorithm very slow. That's why we have the _Multi-fragment in-register array_. For its implementation details I'd suggest reading [Section 4.5](https://arxiv.org/pdf/1905.13415.pdf).

In contrast, the following example is fine: `foo` will be mapped to registers, because the loop can be unrolled when `N` is known at compile time and is sufficiently small (at most tens of items).

```
// this is fine, if N is a compile-time constant
for (int32_t i = 1; i < N; ++i) {
  foo[i] = foo[i-1];
}
```

# Style & CUB Integration

The following files may be considered for integration into CUB at a later point, hence their deviation in style from cuDF.
- `in_reg_array.cuh`
- `agent_dfa.cuh`
- `device_dfa.cuh`
- `dispatch_dfa.cuh`

Authors:
- Elias Stehle (https://github.com/elstehle)

Approvers:
- Vukasin Milovanovic (https://github.com/vuule)
- Tobias Ribizel (https://github.com/upsj)
- Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/11242
---
 cpp/include/cudf_test/cudf_gtest.hpp | 9 +
 cpp/src/io/fst/agent_dfa.cuh | 672 +++++++++++++++++++++++++
 cpp/src/io/fst/device_dfa.cuh | 94 ++++
 cpp/src/io/fst/dispatch_dfa.cuh | 456 +++++++++++++++++
 cpp/src/io/fst/in_reg_array.cuh | 140 ++++++
 cpp/src/io/fst/lookup_tables.cuh | 571 +++++++++++++++++++++
 cpp/tests/CMakeLists.txt | 1 +
 cpp/tests/io/fst/fst_test.cu | 262 ++++++++++
 cpp/tests/io/fst/logical_stack_test.cu | 5 +-
 9 files changed, 2206 insertions(+), 4 deletions(-)
 create mode 100644 cpp/src/io/fst/agent_dfa.cuh
 create mode 100644 cpp/src/io/fst/device_dfa.cuh
 create mode 100644 cpp/src/io/fst/dispatch_dfa.cuh
 create mode 100644 cpp/src/io/fst/in_reg_array.cuh
 create mode 100644 cpp/src/io/fst/lookup_tables.cuh
 create mode 100644 cpp/tests/io/fst/fst_test.cu

diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp
index 6c62b0159ca..fb2680545d3 100644
--- a/cpp/include/cudf_test/cudf_gtest.hpp
+++ b/cpp/include/cudf_test/cudf_gtest.hpp
@@ -176,3 +176,12 @@ struct TypeList> {
   } catch (std::exception & e) { \
     FAIL() << "statement:" << #statement << std::endl << "reason: " << e.what() << std::endl; \
   }
+
+/**
+ * @brief test macro comparing for equality of \p lhs and \p rhs for the first \p size elements.
+ */
+#define CUDF_TEST_EXPECT_VECTOR_EQUAL(lhs, rhs, size) \
+  do { \
+    for (decltype(size) i = 0; i < size; i++) \
+      EXPECT_EQ(lhs[i], rhs[i]) << "Mismatch at index #" << i; \
+  } while (0)
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
new file mode 100644
index 00000000000..d847598d6dd
--- /dev/null
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "in_reg_array.cuh" + +#include + +#include +#include + +namespace cudf::io::fst::detail { + +/// Type used to enumerate (and index) into the states defined by a DFA +using StateIndexT = uint32_t; + +/** + * @brief Implements an associative composition operation for state transition vectors to be used + * with a prefix scan. + * + * Read the following table as follows: c = op(l,r), where op is the composition operator. + * For row 0: l maps 0 to 2. r maps 2 to 2. Hence, the result for 0 is 2. + * For row 1: l maps 1 to 1. r maps 1 to 2. Hence, the result for 1 is 2. + * For row 2: l maps 2 to 0. r maps 0 to 1. Hence, the result for 2 is 1. + * + * l r = c ( s->l->r) + * 0: [2] [1] [2] (i.e. 0->2->2) + * 1: [1] [2] [2] (i.e. 1->1->2) + * 2: [0] [2] [1] (i.e. 2->0->1) + * @tparam NUM_ITEMS The number of items stored within a vector + */ +template +struct VectorCompositeOp { + template + __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs) + { + VectorT res{}; + for (int32_t i = 0; i < NUM_ITEMS; ++i) { + res.Set(i, rhs.Get(lhs.Get(i))); + } + return res; + } +}; + +/** + * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the + * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed + * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols + * it intends to write out and writing out such symbols to the given output iterators. + * + * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up + * the symbols that are supposed to be emitted on a given state transition. + * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the + * transducer table are assignable. + * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written. 
+ */ +template +class DFASimulationCallbackWrapper { + public: + __host__ __device__ __forceinline__ DFASimulationCallbackWrapper( + TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it) + : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false) + { + } + + template + __host__ __device__ __forceinline__ void Init(OffsetT const& offset) + { + this->offset = offset; + if (!write) out_count = 0; + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, + StateIndexT const old_state, + StateIndexT const new_state, + SymbolIndexT const symbol_id) + { + uint32_t const count = transducer_table(old_state, symbol_id); + if (write) { + for (uint32_t out_char = 0; out_char < count; out_char++) { + out_it[out_count + out_char] = transducer_table(old_state, symbol_id, out_char); + out_idx_it[out_count + out_char] = offset + character_index; + } + } + out_count += count; + } + + __host__ __device__ __forceinline__ void TearDown() {} + + public: + TransducerTableT const transducer_table; + TransducedOutItT out_it; + TransducedIndexOutItT out_idx_it; + uint32_t out_count; + uint32_t offset; + bool write; +}; + +/** + * @brief Helper class that transitions the state of multiple DFA instances simultaneously whenever + * a symbol is read. + * + * @tparam NUM_INSTANCES The number of DFA instances to keep track of + * @tparam TransitionTableT The transition table type used for looking up the new state for a + * current_state and a read_symbol. + */ +template +class StateVectorTransitionOp { + public: + __host__ __device__ __forceinline__ StateVectorTransitionOp( + TransitionTableT const& transition_table, std::array& state_vector) + : transition_table(transition_table), state_vector(state_vector) + { + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const read_symbol_id) const + { + for (int32_t i = 0; i < NUM_INSTANCES; ++i) { + state_vector[i] = transition_table(state_vector[i], read_symbol_id); + } + } + + public: + std::array& state_vector; + TransitionTableT const& transition_table; +}; + +template +struct StateTransitionOp { + StateIndexT state; + TransitionTableT const& transition_table; + CallbackOpT& callback_op; + + __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table, + StateIndexT state, + CallbackOpT& callback_op) + : transition_table(transition_table), state(state), callback_op(callback_op) + { + } + + template + __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, + SymbolIndexT const& read_symbol_id) + { + // Remember what state we were in before we made the transition + StateIndexT previous_state = state; + + state = transition_table(state, read_symbol_id); + callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id); + } +}; + +template +struct AgentDFA { + using SymbolIndexT = uint32_t; + using AliasedLoadT = uint32_t; + using CharT = typename std::iterator_traits::value_type; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + static constexpr uint32_t BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS; + static constexpr uint32_t ITEMS_PER_THREAD = AgentDFAPolicy::ITEMS_PER_THREAD; + + // The number of symbols per thread + static constexpr uint32_t SYMBOLS_PER_THREAD = 
ITEMS_PER_THREAD; + static constexpr uint32_t SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD; + + static constexpr uint32_t MIN_UINTS_PER_BLOCK = + CUB_QUOTIENT_CEILING(SYMBOLS_PER_BLOCK, sizeof(AliasedLoadT)); + static constexpr uint32_t UINTS_PER_THREAD = + CUB_QUOTIENT_CEILING(MIN_UINTS_PER_BLOCK, BLOCK_THREADS); + static constexpr uint32_t UINTS_PER_BLOCK = UINTS_PER_THREAD * BLOCK_THREADS; + static constexpr uint32_t SYMBOLS_PER_UINT_BLOCK = UINTS_PER_BLOCK * sizeof(AliasedLoadT); + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + struct _TempStorage { + // For aliased loading of characters into shared memory + union { + CharT chars[SYMBOLS_PER_BLOCK]; + AliasedLoadT uints[UINTS_PER_BLOCK]; + }; + }; + + struct TempStorage : cub::Uninitialized<_TempStorage> { + }; + + //------------------------------------------------------------------------------ + // MEMBER VARIABLES + //------------------------------------------------------------------------------ + _TempStorage& temp_storage; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + __device__ __forceinline__ AgentDFA(TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + } + + template + __device__ __forceinline__ static void ThreadParse(SymbolMatcherT const& symbol_matcher, + CharT const* chars, + SymbolIndexT const& max_num_chars, + CallbackOpT callback_op, + cub::Int2Type /*IS_FULL_BLOCK*/) + { + // Iterate over symbols +#pragma unroll + for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { + if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { + auto matched_id = symbol_matcher(chars[i]); + callback_op.ReadSymbol(i, matched_id); + } + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitions( + SymbolMatcherT const& symbol_matcher, + CharT const* chars, + SymbolIndexT const& max_num_chars, + StateTransitionOpT& state_transition_op, + cub::Int2Type /*IS_FULL_BLOCK*/) + { + ThreadParse( + symbol_matcher, chars, max_num_chars, state_transition_op, cub::Int2Type()); + } + + //--------------------------------------------------------------------- + // LOADING FULL BLOCK OF CHARACTERS, NON-ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type<1> /*ALIGNMENT*/) + { + CharT thread_chars[SYMBOLS_PER_THREAD]; + + CharT const* d_block_symbols = d_chars + block_offset; + cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_chars); + +#pragma unroll + for (int32_t i = 0; i < SYMBOLS_PER_THREAD; ++i) { + temp_storage.chars[threadIdx.x + i * BLOCK_THREADS] = thread_chars[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING PARTIAL BLOCK OF CHARACTERS, NON-ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type<1> /*ALIGNMENT*/) + { + CharT thread_chars[SYMBOLS_PER_THREAD]; + + if (num_total_symbols <= block_offset) return; + + // Last unit to be loaded is 
IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) + OffsetT num_total_chars = num_total_symbols - block_offset; + + CharT const* d_block_symbols = d_chars + block_offset; + cub::LoadDirectStriped( + threadIdx.x, d_block_symbols, thread_chars, num_total_chars); + +#pragma unroll + for (int32_t i = 0; i < SYMBOLS_PER_THREAD; ++i) { + temp_storage.chars[threadIdx.x + i * BLOCK_THREADS] = thread_chars[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING FULL BLOCK OF CHARACTERS, ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type /*ALIGNMENT*/) + { + AliasedLoadT thread_units[UINTS_PER_THREAD]; + + AliasedLoadT const* d_block_symbols = + reinterpret_cast(d_chars + block_offset); + cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_units); + +#pragma unroll + for (int32_t i = 0; i < UINTS_PER_THREAD; ++i) { + temp_storage.uints[threadIdx.x + i * BLOCK_THREADS] = thread_units[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING PARTIAL BLOCK OF CHARACTERS, ALIASED + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + cub::Int2Type /*IS_FULL_BLOCK*/, + cub::Int2Type /*ALIGNMENT*/) + { + AliasedLoadT thread_units[UINTS_PER_THREAD]; + + if (num_total_symbols <= block_offset) return; + + // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) + OffsetT num_total_units = + CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT)); + + AliasedLoadT const* d_block_symbols = + reinterpret_cast(d_chars + block_offset); + cub::LoadDirectStriped( + threadIdx.x, d_block_symbols, thread_units, num_total_units); + +#pragma unroll + for (int32_t i = 0; i < UINTS_PER_THREAD; ++i) { + temp_storage.uints[threadIdx.x + i * BLOCK_THREADS] = thread_units[i]; + } + } + + //--------------------------------------------------------------------- + // LOADING BLOCK OF CHARACTERS: DISPATCHER + //--------------------------------------------------------------------- + __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols) + { + // Check if pointer is aligned to four bytes + if (((uintptr_t)(const void*)(d_chars + block_offset) % 4) == 0) { + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<4>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } else { + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitionVector( + SymbolMatcherT const& symbol_matcher, + TransitionTableT const& transition_table, + CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + std::array& state_vector) + { + using StateVectorTransitionOpT = StateVectorTransitionOp; + + // Start parsing and to transition states + 
StateVectorTransitionOpT transition_op(transition_table, state_vector); + + // Load characters into shared memory + LoadBlock(d_chars, block_offset, num_total_symbols); + + // If this is a full block (i.e., all threads can parse all their symbols) + OffsetT num_block_chars = num_total_symbols - block_offset; + bool is_full_block = (num_block_chars >= SYMBOLS_PER_BLOCK); + + // Ensure characters have been loaded + __syncthreads(); + + // Thread's symbols + CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + + // Parse thread's symbols and transition the state-vector + if (is_full_block) { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } else { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } + } + + template + __device__ __forceinline__ void GetThreadStateTransitions( + SymbolMatcherT const& symbol_matcher, + TransitionTableT const& transition_table, + CharT const* d_chars, + OffsetT const block_offset, + OffsetT const num_total_symbols, + StateIndexT& state, + CallbackOpT& callback_op, + cub::Int2Type /**/) + { + using StateTransitionOpT = StateTransitionOp; + + // Start parsing and to transition states + StateTransitionOpT transition_op(transition_table, state, callback_op); + + // Load characters into shared memory + if (!BYPASS_LOAD) LoadBlock(d_chars, block_offset, num_total_symbols); + + // If this is a full block (i.e., all threads can parse all their symbols) + OffsetT num_block_chars = num_total_symbols - block_offset; + bool is_full_block = (num_block_chars >= SYMBOLS_PER_BLOCK); + + // Ensure characters have been loaded + __syncthreads(); + + // Thread's symbols + CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD]; + + // Initialize callback + callback_op.Init(block_offset + threadIdx.x * SYMBOLS_PER_THREAD); + + // Parse thread's symbols and transition the state-vector + if (is_full_block) { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } else { + GetThreadStateTransitions( + symbol_matcher, t_chars, num_block_chars, transition_op, cub::Int2Type()); + } + + callback_op.TearDown(); + } +}; + +template +__launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) __global__ + void SimulateDFAKernel(DfaT dfa, + SymbolItT d_chars, + OffsetT const num_chars, + StateIndexT seed_state, + StateVectorT* __restrict__ d_thread_state_transition, + TileStateT tile_state, + OutOffsetScanTileState offset_tile_state, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it) +{ + using AgentDfaSimT = AgentDFA; + + static constexpr int32_t NUM_STATES = DfaT::MAX_NUM_STATES; + + constexpr uint32_t BLOCK_THREADS = AgentDFAPolicy::BLOCK_THREADS; + constexpr uint32_t SYMBOLS_PER_BLOCK = AgentDfaSimT::SYMBOLS_PER_BLOCK; + + // Shared memory required by the DFA simulation algorithm + __shared__ typename AgentDfaSimT::TempStorage dfa_storage; + + // Shared memory required by the symbol group lookup table + __shared__ typename DfaT::SymbolGroupStorageT symbol_matcher_storage; + + // Shared memory required by the transition table + __shared__ typename DfaT::TransitionTableStorageT transition_table_storage; + + // Shared memory required by the transducer table + __shared__ typename DfaT::TranslationTableStorageT transducer_table_storage; + + // Initialize symbol group lookup table + auto symbol_matcher = 
dfa.InitSymbolGroupLUT(symbol_matcher_storage); + + // Initialize transition table + auto transition_table = dfa.InitTransitionTable(transition_table_storage); + + // Initialize transition table + auto transducer_table = dfa.InitTranslationTable(transducer_table_storage); + + // Set up DFA + AgentDfaSimT agent_dfa(dfa_storage); + + // The state transition vector passed on to the second stage of the algorithm + StateVectorT out_state_vector; + + // Stage 1: Compute the state-transition vector + if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) { + // Keeping track of the state for each of the state machines + std::array state_vector; + + // Initialize the seed state transition vector with the identity vector + thrust::sequence(thrust::seq, std::begin(state_vector), std::end(state_vector)); + + // Compute the state transition vector + agent_dfa.GetThreadStateTransitionVector(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + state_vector); + + // Initialize the state transition vector passed on to the second stage +#pragma unroll + for (int32_t i = 0; i < NUM_STATES; ++i) { + out_state_vector.Set(i, state_vector[i]); + } + + // Write out state-transition vector + if (!IS_SINGLE_PASS) { + d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x] = out_state_vector; + } + } + + // Stage 2: Perform FSM simulation + if ((!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS) { + StateIndexT state = 0; + + //------------------------------------------------------------------------------ + // SINGLE-PASS: + // -> block-wide inclusive prefix scan on the state transition vector + // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that + // incorporates all preceding blocks/tiles results) + //------------------------------------------------------------------------------ + if (IS_SINGLE_PASS) { + uint32_t tile_idx = blockIdx.x; + using StateVectorCompositeOpT = VectorCompositeOp; + + using PrefixCallbackOpT_ = + cub::TilePrefixCallbackOp; + + using ItemsBlockScan = + cub::BlockScan; + + __shared__ typename ItemsBlockScan::TempStorage scan_temp_storage; + __shared__ typename PrefixCallbackOpT_::TempStorage prefix_callback_temp_storage; + + // STATE-TRANSITION IDENTITY VECTOR + StateVectorT state_identity_vector; + for (int32_t i = 0; i < NUM_STATES; ++i) { + state_identity_vector.Set(i, i); + } + StateVectorCompositeOpT state_vector_scan_op; + + // + if (tile_idx == 0) { + StateVectorT block_aggregate; + ItemsBlockScan(scan_temp_storage) + .ExclusiveScan(out_state_vector, + out_state_vector, + state_identity_vector, + state_vector_scan_op, + block_aggregate); + + if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { + tile_state.SetInclusive(0, block_aggregate); + } + } else { + auto prefix_op = PrefixCallbackOpT_( + tile_state, prefix_callback_temp_storage, state_vector_scan_op, tile_idx); + + ItemsBlockScan(scan_temp_storage) + .ExclusiveScan(out_state_vector, out_state_vector, state_vector_scan_op, prefix_op); + } + __syncthreads(); + state = out_state_vector.Get(seed_state); + } else { + state = d_thread_state_transition[blockIdx.x * BLOCK_THREADS + threadIdx.x].Get(seed_state); + } + + // Perform finite-state machine simulation, computing size of transduced output + DFASimulationCallbackWrapper + callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it); + + StateIndexT t_start_state = state; + agent_dfa.GetThreadStateTransitions(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, 
+ num_chars, + state, + callback_wrapper, + cub::Int2Type()); + + __syncthreads(); + + using OffsetPrefixScanCallbackOpT_ = + cub::TilePrefixCallbackOp; + + using OutOffsetBlockScan = + cub::BlockScan; + + __shared__ typename OutOffsetBlockScan::TempStorage scan_temp_storage; + __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage; + + uint32_t tile_idx = blockIdx.x; + if (tile_idx == 0) { + OffsetT block_aggregate = 0; + OutOffsetBlockScan(scan_temp_storage) + .ExclusiveScan(callback_wrapper.out_count, + callback_wrapper.out_count, + static_cast(0), + cub::Sum{}, + block_aggregate); + + if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) { + offset_tile_state.SetInclusive(0, block_aggregate); + } + + if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { + *d_num_transduced_out_it = block_aggregate; + } + } else { + auto prefix_op = OffsetPrefixScanCallbackOpT_( + offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx); + + OutOffsetBlockScan(scan_temp_storage) + .ExclusiveScan( + callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op); + + if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) { + *d_num_transduced_out_it = prefix_op.GetInclusivePrefix(); + } + } + + callback_wrapper.write = true; + agent_dfa.GetThreadStateTransitions(symbol_matcher, + transition_table, + d_chars, + blockIdx.x * SYMBOLS_PER_BLOCK, + num_chars, + t_start_state, + callback_wrapper, + cub::Int2Type()); + } +} + +} // namespace cudf::io::fst::detail diff --git a/cpp/src/io/fst/device_dfa.cuh b/cpp/src/io/fst/device_dfa.cuh new file mode 100644 index 00000000000..7eeff27eef1 --- /dev/null +++ b/cpp/src/io/fst/device_dfa.cuh @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "dispatch_dfa.cuh" + +#include + +#include + +namespace cudf::io::fst { + +/** + * @brief Uses a deterministic finite automaton to transduce a sequence of symbols from an input + * iterator to a sequence of transduced output symbols. + * + * @tparam DfaT The DFA specification + * @tparam SymbolItT Random-access input iterator type to symbols fed into the FST + * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be + * written + * @tparam TransducedIndexOutItT Random-access output iterator type to which the input symbols' + * indexes are written. + * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of + * output symbols is written + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) + * the output symbols + * @param[in] d_temp_storage Device-accessible allocation of temporary storage. When NULL, the + * required allocation size is written to \p temp_storage_bytes and no work is done. 
+ * @param[in,out] temp_storage_bytes Reference to size in bytes of \p d_temp_storage allocation + * @param[in] dfa The DFA specifying the number of distinct symbol groups, transition table, and + * translation table + * @param[in] d_chars_in Random-access input iterator to the beginning of the sequence of input + * symbols + * @param[in] num_chars The total number of input symbols to process + * @param[out] transduced_out_it Random-access output iterator to which the transduced output is + * written + * @param[out] transduced_out_idx_it Random-access output iterator to which, the index i is written + * iff the i-th input symbol caused some output to be written + * @param[out] d_num_transduced_out_it A single-item output iterator type to which the total number + * of output symbols is written + * @param[in] seed_state The DFA's starting state. For streaming DFAs this corresponds to the + * "end-state" of the previous invocation of the algorithm. + * @param[in] stream CUDA stream to launch kernels within. Default is the null-stream. + */ +template +cudaError_t DeviceTransduce(void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + uint32_t seed_state = 0, + cudaStream_t stream = 0) +{ + using DispatchDfaT = detail::DispatchFSM; + + return DispatchDfaT::Dispatch(d_temp_storage, + temp_storage_bytes, + dfa, + seed_state, + d_chars_in, + num_chars, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it, + stream); +} + +} // namespace cudf::io::fst diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh new file mode 100644 index 00000000000..cabbe863131 --- /dev/null +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -0,0 +1,456 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "agent_dfa.cuh" +#include "in_reg_array.cuh" + +#include + +#include + +namespace cudf::io::fst::detail { + +/** + * @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters. + * + * @tparam _BLOCK_THREADS Number of threads per block + * @tparam _ITEMS_PER_THREAD Number of symbols processed by each thread + */ +template +struct AgentDFAPolicy { + // The number of threads per block + static constexpr int32_t BLOCK_THREADS = _BLOCK_THREADS; + + // The number of symbols processed by each thread + static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD; +}; + +/** + * @brief The list of architecture-specific tuning policies. Yet TBD. 
+ */ +struct DeviceFSMPolicy { + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> { + enum { + BLOCK_THREADS = 128, + ITEMS_PER_THREAD = 32, + }; + + using AgentDFAPolicy = AgentDFAPolicy; + }; + + // Top-of-list of the tuning policy "chain" + using MaxPolicy = Policy900; +}; + +/** + * @brief Kernel for initializing single-pass prefix scan tile states + * + * @param items_state The tile state + * @param num_tiles The number of tiles to be initialized + * @return + */ +template +__global__ void initialization_pass_kernel(TileState items_state, uint32_t num_tiles) +{ + items_state.InitializeStatus(num_tiles); +} + +template +struct DispatchFSM : DeviceFSMPolicy { + //------------------------------------------------------------------------------ + // DEFAULT TYPES + //------------------------------------------------------------------------------ + using StateIndexT = uint32_t; + using BlockOffsetT = uint32_t; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + // DFA-specific configs + static constexpr int32_t MAX_NUM_STATES = DfaT::MAX_NUM_STATES; + static constexpr int32_t MAX_NUM_SYMBOLS = DfaT::MAX_NUM_SYMBOLS; + + // Whether to use a single-pass prefix scan that does all in on + static constexpr bool SINGLE_PASS_STV = false; + + // Whether this is a finite-state transform + static constexpr bool IS_FST = true; + + //------------------------------------------------------------------------------ + // TYPEDEFS + //------------------------------------------------------------------------------ + using StateVectorCompositeOpT = VectorCompositeOp; + + //------------------------------------------------------------------------------ + // MEMBER VARS + //------------------------------------------------------------------------------ + void* d_temp_storage; + size_t& temp_storage_bytes; + DfaT dfa; + StateIndexT seed_state; + SymbolItT d_chars_in; + OffsetT num_chars; + TransducedOutItT transduced_out_it; + TransducedIndexOutItT transduced_out_idx_it; + TransducedCountOutItT d_num_transduced_out_it; + cudaStream_t stream; + int const ptx_version; + + //------------------------------------------------------------------------------ + // CONSTRUCTOR + //------------------------------------------------------------------------------ + CUB_RUNTIME_FUNCTION __forceinline__ DispatchFSM(void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + StateIndexT seed_state, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + cudaStream_t stream, + int ptx_version) + : d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + dfa(dfa), + seed_state(seed_state), + d_chars_in(d_chars_in), + num_chars(num_chars), + transduced_out_it(transduced_out_it), + transduced_out_idx_it(transduced_out_idx_it), + d_num_transduced_out_it(d_num_transduced_out_it), + stream(stream), + ptx_version(ptx_version) + { + } + + //------------------------------------------------------------------------------ + // DISPATCH INTERFACE + //------------------------------------------------------------------------------ + 
CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Dispatch( + void* d_temp_storage, + size_t& temp_storage_bytes, + DfaT dfa, + StateIndexT seed_state, + SymbolItT d_chars_in, + OffsetT num_chars, + TransducedOutItT transduced_out_it, + TransducedIndexOutItT transduced_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + cudaStream_t stream) + { + using MaxPolicyT = DispatchFSM::MaxPolicy; + + cudaError_t error; + + // Get PTX version + int ptx_version; + error = cub::PtxVersion(ptx_version); + if (error != cudaSuccess) return error; + + // Create dispatch functor + DispatchFSM dispatch(d_temp_storage, + temp_storage_bytes, + dfa, + seed_state, + d_chars_in, + num_chars, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it, + stream, + ptx_version); + + error = MaxPolicyT::Invoke(ptx_version, dispatch); + return error; + } + + //------------------------------------------------------------------------------ + // DFA SIMULATION KERNEL INVOCATION + //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + InvokeDFASimulationKernel(DFASimulationKernelT dfa_kernel, + int32_t sm_count, + StateIndexT seed_state, + StateVectorT* d_thread_state_transition, + TileStateT tile_state, + FstScanTileStateT fst_tile_state) + + { + cudaError_t error = cudaSuccess; + cub::KernelConfig dfa_simulation_config; + + using PolicyT = typename ActivePolicyT::AgentDFAPolicy; + if (CubDebug(error = dfa_simulation_config.Init(dfa_kernel))) return error; + + // Kernel invocation + uint32_t grid_size = + CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD); + uint32_t block_threads = dfa_simulation_config.block_threads; + + dfa_kernel<<>>(dfa, + d_chars_in, + num_chars, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state, + transduced_out_it, + transduced_out_idx_it, + d_num_transduced_out_it); + + // Check for errors + if (CubDebug(error = cudaPeekAtLastError())) return error; + + return error; + } + + /** + * @brief Computes the state-transition vectors + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + ComputeStateTransitionVector(uint32_t sm_count, + TileStateT tile_state, + FstScanTileStateT fst_tile_state, + StateVectorT* d_thread_state_transition) + { + StateIndexT seed_state = 0; + + return InvokeDFASimulationKernel( + SimulateDFAKernel, + sm_count, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state); + } + + /** + * @brief Performs the actual DFA simulation. 
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t + SimulateDFA(uint32_t sm_count, + TileStateT tile_state, + FstScanTileStateT fst_tile_state, + StateIndexT seed_state, + StateVectorT* d_thread_state_transition) + { + return InvokeDFASimulationKernel( + SimulateDFAKernel, + sm_count, + seed_state, + d_thread_state_transition, + tile_state, + fst_tile_state); + } + + //------------------------------------------------------------------------------ + // POLICY INVOKATION + //------------------------------------------------------------------------------ + template + CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t Invoke() + { + cudaError_t error = cudaSuccess; + + // Get SM count + int device_ordinal = -1; + int sm_count = -1; + + // Get current device + error = cudaGetDevice(&device_ordinal); + if (error != cudaSuccess) return error; + + error = cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal); + if (error != cudaSuccess) return error; + + //------------------------------------------------------------------------------ + // DERIVED TYPEDEFS + //------------------------------------------------------------------------------ + // Type used to represent state-transition vectors + using StateVectorT = MultiFragmentInRegArray; + + // Scan tile state used for propagating composed state transition vectors + using ScanTileStateT = typename cub::ScanTileState; + + // Scan tile state used for propagating transduced output offsets + using FstScanTileStateT = typename cub::ScanTileState; + + // STATE-TRANSITION IDENTITY VECTOR + StateVectorT state_identity_vector; + for (int32_t i = 0; i < MAX_NUM_STATES; ++i) { + state_identity_vector.Set(i, i); + } + StateVectorCompositeOpT state_vector_scan_op; + + //------------------------------------------------------------------------------ + // DERIVED CONFIGS + //------------------------------------------------------------------------------ + enum { + BLOCK_THREADS = ActivePolicyT::BLOCK_THREADS, + SYMBOLS_PER_THREAD = ActivePolicyT::ITEMS_PER_THREAD, + NUM_SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD + }; + + BlockOffsetT num_blocks = CUB_QUOTIENT_CEILING(num_chars, NUM_SYMBOLS_PER_BLOCK); + size_t num_threads = num_blocks * BLOCK_THREADS; + + //------------------------------------------------------------------------------ + // TEMPORARY MEMORY REQUIREMENTS + //------------------------------------------------------------------------------ + enum { MEM_STATE_VECTORS = 0, MEM_SCAN, MEM_SINGLE_PASS_STV, MEM_FST_OFFSET, NUM_ALLOCATIONS }; + + size_t allocation_sizes[NUM_ALLOCATIONS] = {0}; + void* allocations[NUM_ALLOCATIONS] = {0}; + + size_t vector_scan_storage_bytes = 0; + + // [MEMORY REQUIREMENTS] STATE-TRANSITION SCAN + cub::DeviceScan::ExclusiveScan(nullptr, + vector_scan_storage_bytes, + static_cast(allocations[MEM_STATE_VECTORS]), + static_cast(allocations[MEM_STATE_VECTORS]), + state_vector_scan_op, + state_identity_vector, + num_threads, + stream); + + allocation_sizes[MEM_STATE_VECTORS] = num_threads * sizeof(StateVectorT); + allocation_sizes[MEM_SCAN] = vector_scan_storage_bytes; + + // Bytes needed for tile status descriptors (fusing state-transition vector + DFA simulation) + if constexpr (SINGLE_PASS_STV) { + error = ScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_SINGLE_PASS_STV]); + if (error != cudaSuccess) return error; + } + + // Bytes needed for tile status descriptors (DFA simulation pass for output size computation + + // output-generating pass) + if constexpr 
(IS_FST) { + error = FstScanTileStateT::AllocationSize(num_blocks, allocation_sizes[MEM_FST_OFFSET]); + if (error != cudaSuccess) return error; + } + + // Alias the temporary allocations from the single storage blob (or compute the necessary size + // of the blob) + error = + cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + if (error != cudaSuccess) return error; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) return cudaSuccess; + + // Alias memory for state-transition vectors + StateVectorT* d_thread_state_transition = + static_cast(allocations[MEM_STATE_VECTORS]); + + //------------------------------------------------------------------------------ + // INITIALIZE SCAN TILE STATES COMPUTING TRANSDUCED OUTPUT OFFSETS + //------------------------------------------------------------------------------ + FstScanTileStateT fst_offset_tile_state; + if constexpr (IS_FST) { + // Construct the tile status (aliases memory internally et al.) + error = fst_offset_tile_state.Init( + num_blocks, allocations[MEM_FST_OFFSET], allocation_sizes[MEM_FST_OFFSET]); + if (error != cudaSuccess) return error; + constexpr uint32_t FST_INIT_TPB = 256; + uint32_t num_fst_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, FST_INIT_TPB); + initialization_pass_kernel<<>>( + fst_offset_tile_state, num_blocks); + } + + //------------------------------------------------------------------------------ + // COMPUTE STATE-TRANSITION VECTORS + //------------------------------------------------------------------------------ + ScanTileStateT stv_tile_state; + if constexpr (SINGLE_PASS_STV) { + // Construct the tile status (aliases memory internally et al.) + error = stv_tile_state.Init( + num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]); + if (error != cudaSuccess) return error; + constexpr uint32_t STV_INIT_TPB = 256; + uint32_t num_stv_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, STV_INIT_TPB); + initialization_pass_kernel<<>>(stv_tile_state, + num_blocks); + } else { + // Compute state-transition vectors + // TODO tag dispatch or constexpr if depending on single-pass config to avoid superfluous + // template instantiations + ComputeStateTransitionVector( + sm_count, stv_tile_state, fst_offset_tile_state, d_thread_state_transition); + + // State-transition vector scan computing using the composition operator + cub::DeviceScan::ExclusiveScan(allocations[MEM_SCAN], + allocation_sizes[MEM_SCAN], + d_thread_state_transition, + d_thread_state_transition, + state_vector_scan_op, + state_identity_vector, + num_threads, + stream); + } + + //------------------------------------------------------------------------------ + // SIMULATE DFA + //------------------------------------------------------------------------------ + return SimulateDFA( + sm_count, stv_tile_state, fst_offset_tile_state, seed_state, d_thread_state_transition); + } +}; +} // namespace cudf::io::fst::detail diff --git a/cpp/src/io/fst/in_reg_array.cuh b/cpp/src/io/fst/in_reg_array.cuh new file mode 100644 index 00000000000..0819deb6d97 --- /dev/null +++ b/cpp/src/io/fst/in_reg_array.cuh @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include + +#include + +namespace cudf::io::fst::detail { + +/** + * @brief A bit-packed array of items that can be backed by registers yet allows to be dynamically + * addressed at runtime. The data struture is explained in greater detail in the paper ParPaRaw: Massively Parallel Parsing of + * Delimiter-Separated Raw Data. + * + * @tparam NUM_ITEMS The maximum number of items this data structure is supposed to store + * @tparam MAX_ITEM_VALUE The maximum value that one item can represent + * @tparam BackingFragmentT The data type that is holding the fragments + */ +template +class MultiFragmentInRegArray { + private: + /// Minimum number of bits required to represent all values from [0, MAX_ITEM_VALUE] + static constexpr uint32_t MIN_BITS_PER_ITEM = + (MAX_ITEM_VALUE == 0) ? 1 : cub::Log2<(MAX_ITEM_VALUE + 1)>::VALUE; + + /// Number of bits that each fragment can store + static constexpr uint32_t NUM_BITS_PER_FRAGMENT = sizeof(BackingFragmentT) * 8; + + /// The number of bits per fragment per item in the array + static constexpr uint32_t AVAIL_BITS_PER_FRAG_ITEM = NUM_BITS_PER_FRAGMENT / NUM_ITEMS; + + /// The number of bits per item per fragment to be a power of two to avoid costly integer + /// multiplication + static constexpr uint32_t BITS_PER_FRAG_ITEM = + 0x01U << (cub::Log2<(AVAIL_BITS_PER_FRAG_ITEM + 1)>::VALUE - 1); + + // The total number of fragments required to store all the items + static constexpr uint32_t FRAGMENTS_PER_ITEM = + cudf::util::div_rounding_up_safe(MIN_BITS_PER_ITEM, BITS_PER_FRAG_ITEM); + + //------------------------------------------------------------------------------ + // HELPER FUNCTIONS + //------------------------------------------------------------------------------ + /** + * @brief Returns the \p num_bits bits starting at \p bit_start + */ + CUDF_HOST_DEVICE [[nodiscard]] uint32_t bfe(const uint32_t& data, + uint32_t bit_start, + uint32_t num_bits) const + { +#if CUB_PTX_ARCH > 0 + return cub::BFE(data, bit_start, num_bits); +#else + const uint32_t MASK = (1 << num_bits) - 1; + return (data >> bit_start) & MASK; +#endif + } + + /** + * @brief Replaces the \p num_bits bits in \p data starting from \p bit_start with the lower \p + * num_bits from \p bits. 
+ */ + CUDF_HOST_DEVICE void bfi(uint32_t& data, + uint32_t bits, + uint32_t bit_start, + uint32_t num_bits) const + { +#if CUB_PTX_ARCH > 0 + cub::BFI(data, data, bits, bit_start, num_bits); +#else + uint32_t x = bits << bit_start; + uint32_t y = data; + uint32_t MASK_X = ((1 << num_bits) - 1) << bit_start; + uint32_t MASK_Y = ~MASK_X; + data = (y & MASK_Y) | (x & MASK_X); +#endif + } + + BackingFragmentT data[FRAGMENTS_PER_ITEM]; + + //------------------------------------------------------------------------------ + // ACCESSORS + //------------------------------------------------------------------------------ + public: + CUDF_HOST_DEVICE [[nodiscard]] uint32_t Get(int32_t index) const + { + uint32_t val = 0; + + for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { + val = val | bfe(data[i], index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM) + << (i * BITS_PER_FRAG_ITEM); + } + return val; + } + + CUDF_HOST_DEVICE void Set(uint32_t index, uint32_t value) + { + for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { + uint32_t frag_bits = bfe(value, i * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); + bfi(data[i], frag_bits, index * BITS_PER_FRAG_ITEM, BITS_PER_FRAG_ITEM); + } + } + + //------------------------------------------------------------------------------ + // CONSTRUCTORS + //------------------------------------------------------------------------------ + CUDF_HOST_DEVICE MultiFragmentInRegArray() + { + for (uint32_t i = 0; i < FRAGMENTS_PER_ITEM; ++i) { + data[i] = 0; + } + } + + CUDF_HOST_DEVICE MultiFragmentInRegArray(uint32_t const (&array)[NUM_ITEMS]) + { + for (uint32_t i = 0; i < NUM_ITEMS; ++i) { + Set(i, array[i]); + } + } +}; + +} // namespace cudf::io::fst::detail diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh new file mode 100644 index 00000000000..c5033868925 --- /dev/null +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -0,0 +1,571 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cudf::io::fst::detail { + +/** + * @brief Class template that can be plugged into the finite-state machine to look up the symbol + * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no + * look-ahead). The class uses shared memory for the lookups. 
+ * + * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id + */ +template +class SingleSymbolSmemLUT { + private: + // Type used for representing a symbol group id (i.e., what we return for a given symbol) + using SymbolGroupIdT = uint8_t; + + // Number of entries for every lookup (e.g., for 8-bit Symbol this is 256) + static constexpr uint32_t NUM_ENTRIES_PER_LUT = 0x01U << (sizeof(SymbolT) * 8U); + + struct _TempStorage { + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; + }; + + public: + struct KernelParameter { + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index + SymbolT num_valid_entries; + + // sym_to_sgid[symbol] -> symbol group index + SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; + }; + + using TempStorage = cub::Uninitialized<_TempStorage>; + + /** + * @brief Initializes the given \p sgid_init with the symbol group lookups defined by \p + * symbol_strings. + * + * @param[out] sgid_init A hostdevice_vector that will be populated + * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols + * (characters!) that correspond to the i-th symbol group index + * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table + * @return + */ + template + static void InitDeviceSymbolGroupIdLut(hostdevice_vector& sgid_init, + SymbolGroupItT const& symbol_strings, + rmm::cuda_stream_view stream) + { + // The symbol group index to be returned if none of the given symbols match + SymbolGroupIdT no_match_id = symbol_strings.size(); + + // The symbol with the largest value that is mapped to a symbol group id + SymbolGroupIdT max_base_match_val = 0; + + // Initialize all entries: by default we return the no-match-id + std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], + &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], + no_match_id); + + // Set up lookup table + uint32_t sg_id = 0; + // Iterate over the symbol groups + for (auto const& sg_symbols : symbol_strings) { + // Iterate over all symbols that belong to the current symbol group + for (auto const& sg_symbol : sg_symbols) { + max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); + sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; + } + sg_id++; + } + + // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id + sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; + + // Alias memory / return memory requiremenets + sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 1; + + sgid_init.host_to_device(stream); + } + + _TempStorage& temp_storage; + SymbolGroupIdT num_valid_entries; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. 
+ * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + constexpr CUDF_HOST_DEVICE SingleSymbolSmemLUT(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), num_valid_entries(kernel_param.num_valid_entries) + { + // GPU-side init +#if CUB_PTX_ARCH > 0 + for (int32_t i = threadIdx.x; i < kernel_param.num_valid_entries; i += blockDim.x) { + this->temp_storage.sym_to_sgid[i] = kernel_param.sym_to_sgid[i]; + } + __syncthreads(); + +#else + // CPU-side init + std::copy_n(kernel_param.sym_to_sgid, kernel_param.num_luts, this->temp_storage.sym_to_sgid); +#endif + } + + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const + { + // Look up the symbol group for given symbol + return temp_storage.sym_to_sgid[min(symbol, num_valid_entries - 1)]; + } +}; + +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a new target state. The + * class uses shared memory for the lookups. + * + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + */ +template +class TransitionTable { + private: + // Type used + using ItemT = char; + + struct _TempStorage { + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; + }; + + template ()})>> + static void InitDeviceTransitionTable(hostdevice_vector& transition_table_init, + std::vector> const& translation_table, + rmm::cuda_stream_view stream) + { + // translation_table[state][symbol] -> new state + for (std::size_t state = 0; state < translation_table.size(); ++state) { + for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { + CUDF_EXPECTS( + translation_table[state][symbol] <= std::numeric_limits::max(), + "Target state index value exceeds value representable by the transition table's type"); + transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + translation_table[state][symbol]; + } + } + + // Copy transition table to device + transition_table_init.host_to_device(stream); + } + + constexpr CUDF_HOST_DEVICE TransitionTable(const KernelParameter& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < MAX_NUM_STATES * MAX_NUM_SYMBOLS; i += blockDim.x) { + this->temp_storage.transitions[i] = kernel_param.transitions[i]; + } + __syncthreads(); +#else + std::copy_n( + kernel_param.transitions, MAX_NUM_STATES * MAX_NUM_SYMBOLS, this->temp_storage.transitions); +#endif + } + + /** + * @brief Returns a random-access iterator to lookup all the state transitions for one specific + * symbol from an arbitrary old_state, i.e., it[old_state] -> new_state. 
+ * + * @param state_id The DFA's current state index from which we'll transition + * @param match_id The symbol group id of the symbol that we just read in + * @return + */ + template + constexpr CUDF_HOST_DEVICE int32_t operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const + { + return temp_storage.transitions[match_id * MAX_NUM_STATES + state_id]; + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + + return private_storage; + } +}; + +template +class dfa_device_view { + private: + using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; + using transition_table_init_t = typename TransitionTableT::KernelParameter; + using translation_table_init_t = typename TranslationTableT::KernelParameter; + + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + + using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; + using TransitionTableStorageT = typename TransitionTableT::TempStorage; + using TranslationTableStorageT = typename TranslationTableT::TempStorage; + + __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) + { + return SymbolGroupIdLookupT(*d_sgid_lut_init, temp_storage); + } + + __device__ auto InitTransitionTable(TransitionTableStorageT& temp_storage) + { + return TransitionTableT(*d_transition_table_init, temp_storage); + } + + __device__ auto InitTranslationTable(TranslationTableStorageT& temp_storage) + { + return TranslationTableT(*d_translation_table_init, temp_storage); + } + + dfa_device_view(sgid_lut_init_t const* d_sgid_lut_init, + transition_table_init_t const* d_transition_table_init, + translation_table_init_t const* d_translation_table_init) + : d_sgid_lut_init(d_sgid_lut_init), + d_transition_table_init(d_transition_table_init), + d_translation_table_init(d_translation_table_init) + { + } + + private: + sgid_lut_init_t const* d_sgid_lut_init; + transition_table_init_t const* d_transition_table_init; + translation_table_init_t const* d_translation_table_init; +}; + +/** + * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols + * that the finite-state transducer is supposed to output for each transition. The class uses shared + * memory for the lookups. + * + * @tparam OutSymbolT The symbol type being output + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + */ +template +class TransducerLookupTable { + private: + struct _TempStorage { + OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT out_symbols[MAX_TABLE_SIZE]; + }; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; + OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; + }; + + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. 
+ * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + static void InitDeviceTranslationTable( + hostdevice_vector& translation_table_init, + std::vector>> const& translation_table, + rmm::cuda_stream_view stream) + { + std::vector out_symbols; + out_symbols.reserve(MAX_TABLE_SIZE); + std::vector out_symbol_offsets; + out_symbol_offsets.reserve(MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1); + out_symbol_offsets.push_back(0); + + // Iterate over the states in the transition table + for (auto const& state_trans : translation_table) { + uint32_t num_added = 0; + // Iterate over the symbols in the transition table + for (auto const& symbol_out : state_trans) { + // Insert the output symbols for this specific (state, symbol) transition + out_symbols.insert(std::end(out_symbols), std::begin(symbol_out), std::end(symbol_out)); + out_symbol_offsets.push_back(out_symbols.size()); + num_added++; + } + + // Copy the last offset for all symbols (to guarantee a proper lookup for omitted symbols of + // this state) + if (MAX_NUM_SYMBOLS > num_added) { + int32_t count = MAX_NUM_SYMBOLS - num_added; + auto begin_it = std::prev(std::end(out_symbol_offsets)); + std::fill_n(begin_it, count, out_symbol_offsets[0]); + } + } + + // Check whether runtime-provided table size exceeds the compile-time given max. table size + CUDF_EXPECTS(out_symbols.size() <= MAX_TABLE_SIZE, "Unsupported translation table"); + + // Prepare host-side data to be copied and passed to the device + std::copy(std::cbegin(out_symbol_offsets), + std::cend(out_symbol_offsets), + translation_table_init.host_ptr()->d_out_offsets); + std::copy(std::cbegin(out_symbols), + std::cend(out_symbols), + translation_table_init.host_ptr()->d_out_symbols); + + // Copy data to device + translation_table_init.host_to_device(stream); + } + + private: + _TempStorage& temp_storage; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + public: + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. 
+ * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + CUDF_HOST_DEVICE TransducerLookupTable(KernelParameter const& kernel_param, + TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()) + { + constexpr uint32_t num_offsets = MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1; +#if CUB_PTX_ARCH > 0 + for (int i = threadIdx.x; i < num_offsets; i += blockDim.x) { + this->temp_storage.out_offset[i] = kernel_param.d_out_offsets[i]; + } + // Make sure all threads in the block can read out_symbol_offsets[num_offsets - 1] from shared + // memory + __syncthreads(); + for (int i = threadIdx.x; i < this->temp_storage.out_offset[num_offsets - 1]; i += blockDim.x) { + this->temp_storage.out_symbols[i] = kernel_param.d_out_symbols[i]; + } + __syncthreads(); +#else + std::copy_n(kernel_param.d_out_offsets, num_offsets, this->temp_storage.out_symbol_offsets); + std::copy_n(kernel_param.d_out_symbols, + this->temp_storage.out_symbol_offsets, + this->temp_storage.out_symbols); +#endif + } + + template + constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset) const + { + auto offset = temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; + return temp_storage.out_symbols[offset]; + } + + template + constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, + SymbolIndexT const match_id) const + { + return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - + temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; + } +}; + +/** + * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the + * transition table and its number of states, the mapping of symbols to symbol groups, and the + * translation table that specifies which state transitions cause which output to be written). + * + * @tparam OutSymbolT The symbol type being output by the finite-state transducer + * @tparam NUM_SYMBOLS The number of symbol groups amongst which to differentiate including the + * wildcard symbol group (one dimension of the transition table) + * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the + * transition table) + */ +template +class Dfa { + public: + // The maximum number of states supported by this DFA instance + // This is a value queried by the DFA simulation algorithm + static constexpr int32_t MAX_NUM_STATES = NUM_STATES; + + private: + // Symbol-group id lookup table + using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; + using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; + + // Transition table + using TransitionTableT = detail::TransitionTable; + using TransitionTableInitT = typename TransitionTableT::KernelParameter; + + // Translation lookup table + using OutSymbolOffsetT = uint32_t; + using TranslationTableT = detail::TransducerLookupTable; + using TranslationTableInitT = typename TranslationTableT::KernelParameter; + + auto get_device_view() + { + return dfa_device_view{ + sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + } + + public: + /** + * @brief Constructs a new DFA. + * + * @param symbol_vec Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. 
The index of the symbol group containing a symbol being + * read will be used as symbol_gid of the transition and translation tables. + * @param tt_vec The transition table + * @param out_tt_vec The translation table + * @param stream The stream to which memory operations and kernels are getting dispatched to + */ + template + Dfa(SymbolGroupIdItT const& symbol_vec, + std::vector> const& tt_vec, + std::vector>> const& out_tt_vec, + cudaStream_t stream) + { + constexpr std::size_t single_item = 1; + + sgid_init = hostdevice_vector{single_item, stream}; + transition_table_init = hostdevice_vector{single_item, stream}; + translation_table_init = hostdevice_vector{single_item, stream}; + + // Initialize symbol group id lookup table + SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); + + // Initialize state transition table + TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); + + // Initialize finite-state transducer lookup table + TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); + } + + /** + * @brief Dispatches the finite-state transducer algorithm to the GPU. + * + * @tparam SymbolT The atomic symbol type from the input tape + * @tparam TransducedOutItT Random-access output iterator to which the transduced output will be + * written + * @tparam TransducedIndexOutItT Random-access output iterator type to which the input symbols' + * indexes are written. + * @tparam TransducedCountOutItT A single-item output iterator type to which the total number of + * output symbols is written + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) + * the output symbols + * @param d_chars Pointer to the input string of symbols + * @param num_chars The total number of input symbols to process + * @param d_out_it Random-access output iterator to which the transduced output is + * written + * @param d_out_idx_it Random-access output iterator to which, the index i is written + * iff the i-th input symbol caused some output to be written + * @param d_num_transduced_out_it A single-item output iterator type to which the total number + * of output symbols is written + * @param seed_state The DFA's starting state. For streaming DFAs this corresponds to the + * "end-state" of the previous invocation of the algorithm. + * @param stream CUDA stream to launch kernels within. Default is the null-stream. 
+ */ + template + void Transduce(SymbolT const* d_chars, + OffsetT num_chars, + TransducedOutItT d_out_it, + TransducedIndexOutItT d_out_idx_it, + TransducedCountOutItT d_num_transduced_out_it, + const uint32_t seed_state, + rmm::cuda_stream_view stream) + { + std::size_t temp_storage_bytes = 0; + rmm::device_buffer temp_storage{}; + DeviceTransduce(nullptr, + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + + if (temp_storage.size() < temp_storage_bytes) { + temp_storage.resize(temp_storage_bytes, stream); + } + + DeviceTransduce(temp_storage.data(), + temp_storage_bytes, + this->get_device_view(), + d_chars, + num_chars, + d_out_it, + d_out_idx_it, + d_num_transduced_out_it, + seed_state, + stream); + } + + private: + hostdevice_vector sgid_init{}; + hostdevice_vector transition_table_init{}; + hostdevice_vector translation_table_init{}; +}; + +} // namespace cudf::io::fst::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d00fa6633de..42192a972d3 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -225,6 +225,7 @@ ConfigureTest(JSON_TEST io/json_test.cpp) ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp) ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) +ConfigureTest(FST_TEST io/fst/fst_test.cu) if(CUDF_ENABLE_ARROW_S3) target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED") endif() diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu new file mode 100644 index 00000000000..e198c804222 --- /dev/null +++ b/cpp/tests/io/fst/fst_test.cu @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +namespace { + +//------------------------------------------------------------------------------ +// CPU-BASED IMPLEMENTATIONS FOR VERIFICATION +//------------------------------------------------------------------------------ +/** + * @brief CPU-based implementation of a finite-state transducer (FST). + * + * @tparam InputItT Forward input iterator type to symbols fed into the FST + * @tparam StateT Type representing states of the finite-state machine + * @tparam SymbolGroupLutT Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. 
+ * @tparam TransitionTableT Two-dimensional container type + * @tparam TransducerTableT Two-dimensional container type + * @tparam OutputItT Forward output iterator type + * @tparam IndexOutputItT Forward output iterator type + * @param[in] begin Forward iterator to the beginning of the symbol sequence + * @param[in] end Forward iterator to one past the last element of the symbol sequence + * @param[in] init_state The starting state of the finite-state machine + * @param[in] symbol_group_lut Sequence container of symbol groups. Each symbol group is a sequence + * container to symbols within that group. The index of the symbol group containing a symbol being + * read will be used as symbol_gid of the transition and translation tables. + * @param[in] transition_table The two-dimensional transition table, i.e., + * transition_table[state][symbol_gid] -> new_state + * @param[in] translation_table The two-dimensional transducer table, i.e., + * translation_table[state][symbol_gid] -> range_of_output_symbols + * @param[out] out_tape A forward output iterator to which the transduced input will be written + * @param[out] out_index_tape A forward output iterator to which indexes of the symbols that + * actually caused some output are written to + * @return A pair of iterators to one past the last element of (1) the transduced output symbol + * sequence and (2) the indexes of + */ +template +static std::pair fst_baseline(InputItT begin, + InputItT end, + StateT const& init_state, + SymbolGroupLutT symbol_group_lut, + TransitionTableT transition_table, + TransducerTableT translation_table, + OutputItT out_tape, + IndexOutputItT out_index_tape) +{ + // Initialize "FSM" with starting state + StateT state = init_state; + + // To track the symbol offset within the input that caused the FST to output + std::size_t in_offset = 0; + for (auto it = begin; it < end; it++) { + // The symbol currently being read + auto const& symbol = *it; + + // Iterate over symbol groups and search for the first symbol group containing the current + // symbol, if no match is found we use cend(symbol_group_lut) as the "catch-all" symbol group + auto symbol_group_it = + std::find_if(std::cbegin(symbol_group_lut), std::cend(symbol_group_lut), [symbol](auto& sg) { + return std::find(std::cbegin(sg), std::cend(sg), symbol) != std::cend(sg); + }); + auto symbol_group = std::distance(std::cbegin(symbol_group_lut), symbol_group_it); + + // Output the translated symbols to the output tape + out_tape = std::copy(std::cbegin(translation_table[state][symbol_group]), + std::cend(translation_table[state][symbol_group]), + out_tape); + + auto out_size = std::distance(std::cbegin(translation_table[state][symbol_group]), + std::cend(translation_table[state][symbol_group])); + + out_index_tape = std::fill_n(out_index_tape, out_size, in_offset); + + // Transition the state of the finite-state machine + state = transition_table[state][symbol_group]; + + // Continue with next symbol from input tape + in_offset++; + } + return {out_tape, out_index_tape}; +} + +//------------------------------------------------------------------------------ +// TEST FST SPECIFICATIONS +//------------------------------------------------------------------------------ +enum DFA_STATES : char { + // The state being active while being outside of a string. When encountering an opening bracket or + // curly brace, we push it onto the stack. When encountering a closing bracket or brace, we pop it + // from the stack. 
+ TT_OOS = 0U, + // The state being active while being within a string (e.g., field name or a string value). We do + // not push or pop from the stack while being in this state. + TT_STR, + // The state being active after encountering an escape symbol (e.g., '\') while being in the + // TT_STR state. + TT_ESC, + // Total number of states + TT_NUM_STATES +}; + +// Definition of the symbol groups +enum PDA_SG_ID { + OBC = 0U, ///< Opening brace SG: { + OBT, ///< Opening bracket SG: [ + CBC, ///< Closing brace SG: } + CBT, ///< Closing bracket SG: ] + QTE, ///< Quote character SG: " + ESC, ///< Escape character SG: '\' + OTR, ///< SG implicitly matching all other characters + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +// Transition table +const std::vector> pda_state_tt = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}, + /* TT_STR */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}, + /* TT_ESC */ {TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}; + +// Translation table (i.e., for each transition, what are the symbols that we output) +const std::vector>> pda_out_tt = { + /* IN_STATE { [ } ] " \ OTHER */ + /* TT_OOS */ {{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}, + /* TT_STR */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}, + /* TT_ESC */ {{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}; + +// The i-th string representing all the characters of a symbol group +const std::vector pda_sgs = {"{", "[", "}", "]", "\"", "\\"}; + +// The DFA's starting state +constexpr int32_t start_state = TT_OOS; + +} // namespace + +// Base test fixture for tests +struct FstTest : public cudf::test::BaseFixture { +}; + +TEST_F(FstTest, GroundTruth) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + + // Type sufficiently large to index symbols within the input and output (may be unsigned) + using SymbolOffsetT = uint32_t; + + // Helper class to set up transition table, symbol group lookup table, and translation table + using DfaFstT = cudf::io::fst::detail::Dfa; + + // Prepare cuda stream for data transfers & kernels + rmm::cuda_stream stream{}; + rmm::cuda_stream_view stream_view(stream); + + // Test input + std::string input = R"( {)" + R"("category": "reference",)" + R"("index:" [4,12,42],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} )" + R"({)" + R"("category": "reference",)" + R"("index:" [4,{},null,{"a":[]}],)" + R"("author": "Nigel Rees",)" + R"("title": "Sayings of the Century",)" + R"("price": 8.95)" + R"(} {} [] [ ])"; + + size_t string_size = input.size() * (1 << 10); + auto d_input_scalar = cudf::make_string_scalar(input); + auto& d_string_scalar = static_cast(*d_input_scalar); + const cudf::size_type repeat_times = string_size / input.size(); + auto d_input_string = cudf::strings::repeat_string(d_string_scalar, repeat_times); + auto& d_input = static_cast&>(*d_input_string); + input = d_input.to_string(stream); + + // Prepare input & output buffers + constexpr std::size_t single_item = 1; + hostdevice_vector output_gpu(input.size(), stream_view); + hostdevice_vector output_gpu_size(single_item, stream_view); + hostdevice_vector out_indexes_gpu(input.size(), stream_view); + + // Run algorithm + DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + + // Allocate device-side temporary storage & run algorithm + parser.Transduce(d_input.data(), + 
static_cast(d_input.size()), + output_gpu.device_ptr(), + out_indexes_gpu.device_ptr(), + output_gpu_size.device_ptr(), + start_state, + stream.value()); + + // Async copy results from device to host + output_gpu.device_to_host(stream.view()); + out_indexes_gpu.device_to_host(stream.view()); + output_gpu_size.device_to_host(stream.view()); + + // Prepare CPU-side results for verification + std::string output_cpu{}; + std::vector out_index_cpu{}; + output_cpu.reserve(input.size()); + out_index_cpu.reserve(input.size()); + + // Run CPU-side algorithm + fst_baseline(std::begin(input), + std::end(input), + start_state, + pda_sgs, + pda_state_tt, + pda_out_tt, + std::back_inserter(output_cpu), + std::back_inserter(out_index_cpu)); + + // Make sure results have been copied back to host + stream.synchronize(); + + // Verify results + ASSERT_EQ(output_gpu_size[0], output_cpu.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output_cpu, output_cpu.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(out_indexes_gpu, out_index_cpu, output_cpu.size()); +} + +CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/fst/logical_stack_test.cu b/cpp/tests/io/fst/logical_stack_test.cu index 3c2cdd7fb5c..dda737f005d 100644 --- a/cpp/tests/io/fst/logical_stack_test.cu +++ b/cpp/tests/io/fst/logical_stack_test.cu @@ -22,7 +22,6 @@ #include #include -#include #include #include @@ -245,9 +244,7 @@ TEST_F(LogicalStackTest, GroundTruth) // Verify results ASSERT_EQ(string_size, top_of_stack_cpu.size()); ASSERT_EQ(top_of_stack_gpu.size(), top_of_stack_cpu.size()); - for (size_t i = 0; i < string_size && i < top_of_stack_cpu.size(); i++) { - ASSERT_EQ(top_of_stack_gpu.host_ptr()[i], top_of_stack_cpu[i]) << "Mismatch at index #" << i; - } + CUDF_TEST_EXPECT_VECTOR_EQUAL(top_of_stack_gpu.host_ptr(), top_of_stack_cpu, string_size); } CUDF_TEST_PROGRAM_MAIN() From 3f7bb6bb5cfc03a2375633e125f2a04675c0118e Mon Sep 17 00:00:00 2001 From: Jim Brennan Date: Fri, 22 Jul 2022 11:57:00 -0500 Subject: [PATCH 23/31] Workaround for nvcomp zstd overwriting blocks for orc due to underestimate of sizes (#11288) This is a possible workaround for issue #11280. We have a goal to support NVCOMP ZSTD in 22.08, so a short-term fix is desired. There is a heuristic in `gpuParseCompressedStripeData` to estimate the size of the decompress buffer for very small compressed blocks. For ZSTD, it is possible to have a high enough compression ratio that this heuristic underestimates the needed decompress size. This pr adds a boolean parameter to allow us to disable the block size estimate for ZSTD. When the estimate is disabled, it falls back to the maximum block size, which is guaranteed to be big enough. 
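To make the heuristic concrete, here is a minimal standalone sketch of the compressed-block branch described above (an editorial illustration only, not the kernel code from this patch; the function name and parameters are invented for the example):

    #include <cstdint>

    // Sketch of the uncompressed-size estimate for a compressed block: small blocks
    // are assumed to expand by at most a factor of 2^log2maxcr, but when the caller
    // disallows the estimate (e.g. for ZSTD, where ratios can exceed that bound),
    // we fall back to the maximum block size, which is always large enough.
    uint32_t estimate_uncompressed_size(uint32_t block_len,
                                        uint32_t block_size,
                                        uint32_t log2maxcr,
                                        bool allow_block_size_estimate)
    {
      if (!allow_block_size_estimate) { return block_size; }
      return (block_len < (block_size >> log2maxcr)) ? (block_len << log2maxcr) : block_size;
    }
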
cc: @devavret, @vuule Authors: - Jim Brennan (https://github.com/jbrennan333) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11288 --- cpp/src/io/orc/orc_gpu.hpp | 2 ++ cpp/src/io/orc/reader_impl.cu | 6 ++++++ cpp/src/io/orc/stripe_init.cu | 18 ++++++++++++------ cpp/tests/io/orc_test.cpp | 36 +++++++++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index a475c3a29bf..9de7dfffc0c 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -211,12 +211,14 @@ constexpr uint32_t encode_block_size = 512; * @param[in] compression_block_size maximum size of compressed blocks (up to 16M) * @param[in] log2maxcr log2 of maximum compression ratio (used to infer max uncompressed size from * compressed size) + * @param[in] allow_block_size_estimate If true, estimate uncompressed size for small blocks * @param[in] stream CUDA stream used for device memory operations and kernel launches */ void ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, uint32_t log2maxcr, + bool allow_block_size_estimate, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 8274792c7fe..4da9c224ab6 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -309,10 +309,15 @@ rmm::device_buffer reader::impl::decompress_stripe_data( } compinfo.host_to_device(stream); + // Workaround for ZSTD. It is possible to have compression ratios > 2048:1, + // so the heuristic in gpuParseCompressedStripeData() to estimate the size for + // small blocks can be too low. Disable the estimation for ZSTD. + auto allow_block_size_estimate = (decompressor.compression() != compression_type::ZSTD); gpu::ParseCompressedStripeData(compinfo.device_ptr(), compinfo.size(), decompressor.GetBlockSize(), decompressor.GetLog2MaxCompressionRatio(), + allow_block_size_estimate, stream); compinfo.device_to_host(stream, true); @@ -360,6 +365,7 @@ rmm::device_buffer reader::impl::decompress_stripe_data( compinfo.size(), decompressor.GetBlockSize(), decompressor.GetLog2MaxCompressionRatio(), + allow_block_size_estimate, stream); // Dispatch batches of blocks to decompress diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 8edc884b432..edae60bfa6d 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -39,8 +39,12 @@ struct compressed_stream_s { }; // blockDim {128,1,1} -__global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( - CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t block_size, uint32_t log2maxcr) +__global__ void __launch_bounds__(128, 8) + gpuParseCompressedStripeData(CompressedStreamInfo* strm_info, + int32_t num_streams, + uint32_t block_size, + uint32_t log2maxcr, + bool allow_block_size_estimate) { __shared__ compressed_stream_s strm_g[4]; @@ -78,9 +82,10 @@ __global__ void __launch_bounds__(128, 8) gpuParseCompressedStripeData( // TBD: For some codecs like snappy, it wouldn't be too difficult to get the actual // uncompressed size and avoid waste due to block size alignment For now, rely on the max // compression ratio to limit waste for the most extreme cases (small single-block streams) - uncompressed_size = (is_uncompressed) ? block_len - : (block_len < (block_size >> log2maxcr)) ? 
block_len << log2maxcr - : block_size; + uncompressed_size = (is_uncompressed) ? block_len + : allow_block_size_estimate && (block_len < (block_size >> log2maxcr)) + ? block_len << log2maxcr + : block_size; if (is_uncompressed) { if (uncompressed_size <= 32) { // For short blocks, copy the uncompressed data to output @@ -531,12 +536,13 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info, int32_t num_streams, uint32_t compression_block_size, uint32_t log2maxcr, + bool allow_block_size_estimate, rmm::cuda_stream_view stream) { dim3 dim_block(128, 1); dim3 dim_grid((num_streams + 3) >> 2, 1); // 1 stream per warp, 4 warps per block gpuParseCompressedStripeData<<>>( - strm_info, num_streams, compression_block_size, log2maxcr); + strm_info, num_streams, compression_block_size, log2maxcr, allow_block_size_estimate); } void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info, diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 5823a859f7b..b3df2c8a8dd 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -35,6 +35,13 @@ #include +#define NVCOMP_ZSTD_HEADER +#if __has_include(NVCOMP_ZSTD_HEADER) +#define ZSTD_SUPPORTED 1 +#else +#define ZSTD_SUPPORTED 0 +#endif + namespace cudf_io = cudf::io; template @@ -1097,6 +1104,35 @@ TEST_F(OrcReaderTest, SingleInputs) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *table1); } +TEST_F(OrcReaderTest, zstdCompressionRegression) +{ + // Test with zstd compressed orc file with high compression ratio. +#if !ZSTD_SUPPORTED + GTEST_SKIP(); +#endif + constexpr uint8_t input_buffer[] = { + 0x4f, 0x52, 0x43, 0x5a, 0x00, 0x00, 0x28, 0xb5, 0x2f, 0xfd, 0xa4, 0x34, 0xc7, 0x03, 0x00, 0x74, + 0x00, 0x00, 0x18, 0x41, 0xff, 0xaa, 0x02, 0x00, 0xbb, 0xff, 0x45, 0xc8, 0x01, 0x25, 0x30, 0x04, + 0x65, 0x00, 0x00, 0x10, 0xaa, 0x1f, 0x02, 0x00, 0x01, 0x29, 0x0b, 0xc7, 0x39, 0xb8, 0x02, 0xcb, + 0xaf, 0x38, 0xc0, 0x07, 0x00, 0x00, 0x40, 0x01, 0xc0, 0x05, 0x00, 0x00, 0x46, 0x4d, 0x45, 0x00, + 0x00, 0x0a, 0x06, 0x08, 0x01, 0x10, 0x01, 0x18, 0x30, 0x0a, 0x06, 0x08, 0x02, 0x10, 0x01, 0x18, + 0x06, 0x0a, 0x06, 0x08, 0x03, 0x10, 0x01, 0x18, 0x05, 0x12, 0x02, 0x08, 0x00, 0x12, 0x04, 0x08, + 0x03, 0x10, 0x02, 0x59, 0x00, 0x00, 0x08, 0x03, 0x10, 0x63, 0x1a, 0x0c, 0x08, 0x03, 0x10, 0x00, + 0x18, 0x3b, 0x20, 0x25, 0x28, 0xa0, 0x9e, 0x75, 0x22, 0x10, 0x08, 0x0c, 0x12, 0x01, 0x01, 0x1a, + 0x09, 0x63, 0x64, 0x5f, 0x67, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x22, 0x02, 0x08, 0x07, 0x30, 0xa0, + 0x9e, 0x75, 0x08, 0x2f, 0x10, 0x05, 0x18, 0x80, 0x80, 0x10, 0x22, 0x02, 0x00, 0x0c, 0x28, 0x00, + 0x30, 0x09, 0x82, 0xf4, 0x03, 0x03, 0x4f, 0x52, 0x43, 0x17}; + + auto source = + cudf::io::source_info(reinterpret_cast(input_buffer), sizeof(input_buffer)); + cudf_io::orc_reader_options in_opts = + cudf_io::orc_reader_options::builder(source).use_index(false); + + cudf::io::table_with_metadata result; + CUDF_EXPECT_NO_THROW(result = cudf_io::read_orc(in_opts)); + EXPECT_EQ(1920800, result.tbl->num_rows()); +} + TEST_F(OrcReaderTest, MultipleInputs) { srand(31537); From e4c6f828dba814e6eb9aa4a45132eca18bccc507 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 22 Jul 2022 13:05:22 -0700 Subject: [PATCH 24/31] Add --output-on-failure to ctest args. (#11321) Resolves #3036 by making `make test` or `ninja test` default to showing output when tests fail. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11321 --- cpp/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 4819d1c2f5c..b59c40e2718 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -709,6 +709,8 @@ add_library(cudf::cudftestutil ALIAS cudftestutil) if(CUDF_BUILD_TESTS) # include CTest module -- automatically calls enable_testing() include(CTest) + # Always print verbose output when tests fail if run using `make test`. + list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") add_subdirectory(tests) endif() From 3099c68bf9d236c9a6d1d29b7ee5f9a4558d85cd Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 22 Jul 2022 16:12:54 -0500 Subject: [PATCH 25/31] Revise PR template. (#10774) I recently revamped our cuDF [CONTRIBUTING guide](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). I would like to consider replacing the current PR template (which has a fairly daunting amount of text that is immediately deleted by many contributors) with a short checklist of actionable items and a reference to the CONTRIBUTING guide for longer content. I kept this draft very minimal. Reviewers can see other examples here for inspiration: https://axolo.co/blog/p/part-3-github-pull-request-template. Happy to crowdsource others' thoughts here. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Jake Hemstad (https://github.com/jrhemstad) - Karthikeyan (https://github.com/karthikeyann) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/10774 --- .github/PULL_REQUEST_TEMPLATE.md | 65 +++++--------------------------- CONTRIBUTING.md | 2 +- 2 files changed, 10 insertions(+), 57 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 8dac8083f31..301037c8660 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,56 +1,9 @@ - +## Description + + + + +## Checklist +- [ ] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). +- [ ] New or existing tests cover these changes. +- [ ] The documentation is up to date with these changes. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c6ab9f95088..f9f7dc929ad 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -34,7 +34,7 @@ conduct. More information can be found at: describes your planned work. For example, `fix-documentation`. 5. Write code to address the issue or implement the feature. 6. Add unit tests and unit benchmarks. -7. [Create your pull request](https://github.com/rapidsai/cudf/compare). +7. [Create your pull request](https://github.com/rapidsai/cudf/compare). To run continuous integration (CI) tests without requesting review, open a draft pull request. 8. Verify that CI passes all [status checks](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/about-status-checks). Fix if needed. 9. Wait for other developers to review your code and update code as needed. 
From 204218a74de5e8748baefbcedeb79e5c80902e66 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 22 Jul 2022 15:52:43 -0700 Subject: [PATCH 26/31] Fix performance issue and add a new code path to `cudf::detail::contains` (#11330) The current implementation of `cudf::detail::contains` can process input with arbitrary nested types. However, it was reported to have severe performance issue when the input tables have many duplicate rows (https://github.com/rapidsai/cudf/issues/11299). In order to fix the issue, https://github.com/rapidsai/cudf/pull/11310 and https://github.com/rapidsai/cudf/pull/11325 was created. Unfortunately, https://github.com/rapidsai/cudf/pull/11310 is separating semi-anti-join from `cudf::detail::contains`, causing duplicate implementation. On the other hand, https://github.com/rapidsai/cudf/pull/11325 can address the issue https://github.com/rapidsai/cudf/issues/11299 but semi-anti-join using it still performs worse than the previous semi-anti-join implementation. The changes in this PR include the following: * Fix the performance issue reported in https://github.com/rapidsai/cudf/issues/11299 for the current `cudf::detail::contains` implementation that support nested types. * Add a separate code path into `cudf::detail::contains` such that: * Input without having lists column (at any nested level) will be processed by the code path that is the same as the old implementation of semi-anti-join. This is to make sure the performance of semi-anti-join will remain the same as before. * Input with nested lists column, or NaNs compared as unequal, will be processed by another code path that supports nested types and different NaNs behavior. This will make sure support for nested types will not be dropped. Closes https://github.com/rapidsai/cudf/issues/11299. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) - Mike Wilson (https://github.com/hyperbolic2346) - Alessandro Bellina (https://github.com/abellina) URL: https://github.com/rapidsai/cudf/pull/11330 --- cpp/src/search/contains_table.cu | 556 +++++++++++++++++++++++-------- 1 file changed, 408 insertions(+), 148 deletions(-) diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu index 46280d4ff5f..e0f0c465895 100644 --- a/cpp/src/search/contains_table.cu +++ b/cpp/src/search/contains_table.cu @@ -1,148 +1,408 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include - -namespace cudf::detail { - -namespace { - -using cudf::experimental::row::lhs_index_type; -using cudf::experimental::row::rhs_index_type; - -} // namespace - -rmm::device_uvector contains(table_view const& haystack, - table_view const& needles, - null_equality compare_nulls, - nan_equality compare_nans, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Use a hash map with key type is row hash values and map value type is `lhs_index_type` to store - // all row indices of the haystack table. - using static_multimap = - cuco::static_multimap>, - cuco::double_hashing>; - - auto map = static_multimap(compute_hash_table_size(haystack.num_rows()), - cuco::sentinel::empty_key{std::numeric_limits::max()}, - cuco::sentinel::empty_value{lhs_index_type{detail::JoinNoneValue}}, - stream.value(), - detail::hash_table_allocator_type{default_allocator{}, stream}); - - auto const haystack_has_nulls = has_nested_nulls(haystack); - auto const needles_has_nulls = has_nested_nulls(needles); - auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; - - // Insert all row hash values and indices of the haystack table. - { - auto const hasher = cudf::experimental::row::hash::row_hasher(haystack, stream); - auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); - - using make_pair_fn = make_pair_function; - - auto const haystack_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, make_pair_fn{d_hasher, map.get_empty_key_sentinel()}); - - // If the haystack table has nulls but they are compared unequal, don't insert them. - // Otherwise, it was known to cause performance issue: - // - https://github.com/rapidsai/cudf/pull/6943 - // - https://github.com/rapidsai/cudf/pull/8277 - if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { - // Collect all nullable columns at all levels from the haystack table. - auto const haystack_nullable_columns = get_nullable_columns(haystack); - CUDF_EXPECTS(haystack_nullable_columns.size() > 0, - "Haystack table has nulls thus it should have nullable columns."); - - // If there are more than one nullable column, we compute bitmask_and of their null masks. - // Otherwise, we have only one nullable column and can use its null mask directly. - auto const row_bitmask = - haystack_nullable_columns.size() > 1 - ? cudf::detail::bitmask_and(table_view{haystack_nullable_columns}, stream).first - : rmm::device_buffer{0, stream}; - auto const row_bitmask_ptr = haystack_nullable_columns.size() > 1 - ? static_cast(row_bitmask.data()) - : haystack_nullable_columns.front().null_mask(); - - // Insert only rows that do not have any null at any level. - map.insert_if(haystack_it, - haystack_it + haystack.num_rows(), - thrust::counting_iterator(0), // stencil - row_is_valid{row_bitmask_ptr}, - stream.value()); - } else { - map.insert(haystack_it, haystack_it + haystack.num_rows(), stream.value()); - } - } - - // The output vector. - auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); - - // Check existence for each row of the needles table in the haystack table. 
- { - auto const hasher = cudf::experimental::row::hash::row_hasher(needles, stream); - auto const d_hasher = hasher.device_hasher(nullate::DYNAMIC{has_any_nulls}); - - auto const comparator = - cudf::experimental::row::equality::two_table_comparator(haystack, needles, stream); - - using make_pair_fn = make_pair_function; - - auto const needles_it = cudf::detail::make_counting_transform_iterator( - size_type{0}, make_pair_fn{d_hasher, map.get_empty_key_sentinel()}); - - auto const check_contains = [&](auto const value_comp) { - auto const d_eqcomp = - comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); - map.pair_contains(needles_it, - needles_it + needles.num_rows(), - contained.begin(), - pair_equality{d_eqcomp}, - stream.value()); - }; - - if (compare_nans == nan_equality::ALL_EQUAL) { - using nan_equal_comparator = - cudf::experimental::row::equality::nan_equal_physical_equality_comparator; - check_contains(nan_equal_comparator{}); - } else { - using nan_unequal_comparator = - cudf::experimental::row::equality::physical_equality_comparator; - check_contains(nan_unequal_comparator{}); - } - } - - return contained; -} - -} // namespace cudf::detail +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include + +#include + +namespace cudf::detail { + +namespace { + +using cudf::experimental::row::lhs_index_type; +using cudf::experimental::row::rhs_index_type; + +using static_map = cuco::static_map>>; + +/** + * @brief Check if the given type `T` is a strong index type (i.e., `lhs_index_type` or + * `rhs_index_type`). + * + * @return A boolean value indicating if `T` is a strong index type + */ +template +constexpr auto is_strong_index_type() +{ + return std::is_same_v || std::is_same_v; +} + +/** + * @brief An adapter functor to support strong index types for row hasher that must be operating on + * `cudf::size_type`. + */ +template +struct strong_index_hasher_adapter { + strong_index_hasher_adapter(Hasher const& hasher) : _hasher{hasher} {} + + template ())> + __device__ constexpr auto operator()(T const idx) const noexcept + { + return _hasher(static_cast(idx)); + } + + private: + Hasher const _hasher; +}; + +/** + * @brief An adapter functor to support strong index type for table row comparator that must be + * operating on `cudf::size_type`. + */ +template +struct strong_index_comparator_adapter { + strong_index_comparator_adapter(Comparator const& comparator) : _comparator{comparator} {} + + template () && is_strong_index_type())> + __device__ constexpr auto operator()(T const lhs_index, U const rhs_index) const noexcept + { + auto const lhs = static_cast(lhs_index); + auto const rhs = static_cast(rhs_index); + + if constexpr (std::is_same_v || std::is_same_v) { + return _comparator(lhs, rhs); + } else { + // Here we have T == rhs_index_type. 
+ // This is when the indices are provided in wrong order for two table comparator, so we need + // to switch them back to the right order before calling the underlying comparator. + return _comparator(rhs, lhs); + } + } + + private: + Comparator const _comparator; +}; + +/** + * @brief Build a row bitmask for the input table. + * + * The output bitmask will have invalid bits corresponding to the the input rows having nulls (at + * any nested level) and vice versa. + * + * @param input The input table + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A pair of pointer to the output bitmask and the buffer containing the bitmask + */ +std::pair build_row_bitmask(table_view const& input, + rmm::cuda_stream_view stream) +{ + auto const nullable_columns = get_nullable_columns(input); + CUDF_EXPECTS(nullable_columns.size() > 0, + "The input table has nulls thus it should have nullable columns."); + + // If there are more than one nullable column, we compute `bitmask_and` of their null masks. + // Otherwise, we have only one nullable column and can use its null mask directly. + if (nullable_columns.size() > 1) { + auto row_bitmask = cudf::detail::bitmask_and(table_view{nullable_columns}, stream).first; + auto const row_bitmask_ptr = static_cast(row_bitmask.data()); + return std::pair(std::move(row_bitmask), row_bitmask_ptr); + } + + return std::pair(rmm::device_buffer{0, stream}, nullable_columns.front().null_mask()); +} + +/** + * @brief Invoke an `operator()` template with a row equality comparator based on the specified + * `compare_nans` parameter. + * + * @param compare_nans The flag to specify whether NaNs should be compared equal or not + * @param func The input functor to invoke + */ +template +void dispatch_nan_comparator(nan_equality compare_nans, Func&& func) +{ + if (compare_nans == nan_equality::ALL_EQUAL) { + using nan_equal_comparator = + cudf::experimental::row::equality::nan_equal_physical_equality_comparator; + func(nan_equal_comparator{}); + } else { + using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator; + func(nan_unequal_comparator{}); + } +} + +/** + * @brief Check if rows in the given `needles` table exist in the `haystack` table. + * + * This function is designed specifically to work with input tables having lists column(s) at + * arbitrarily nested levels. 
+ * + * @param haystack The table containing the search space + * @param needles A table of rows whose existence to check in the search space + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned vector + * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` + */ +rmm::device_uvector contains_with_lists_or_nans(table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto map = + static_map(compute_hash_table_size(haystack.num_rows()), + cuco::sentinel::empty_key{lhs_index_type{std::numeric_limits::max()}}, + cuco::sentinel::empty_value{detail::JoinNoneValue}, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto const haystack_has_nulls = has_nested_nulls(haystack); + auto const needles_has_nulls = has_nested_nulls(needles); + auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; + + // Insert row indices of the haystack table as map keys. + { + auto const haystack_it = cudf::detail::make_counting_transform_iterator( + size_type{0}, + [] __device__(auto const idx) { return cuco::make_pair(lhs_index_type{idx}, 0); }); + + auto const hasher = cudf::experimental::row::hash::row_hasher(haystack, stream); + auto const d_hasher = + strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})}; + + auto const comparator = cudf::experimental::row::equality::self_comparator(haystack, stream); + + // If the haystack table has nulls but they are compared unequal, don't insert them. + // Otherwise, it was known to cause performance issue: + // - https://github.com/rapidsai/cudf/pull/6943 + // - https://github.com/rapidsai/cudf/pull/8277 + if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + + // Insert only rows that do not have any null at any level. + auto const insert_map = [&](auto const value_comp) { + auto const d_eqcomp = strong_index_comparator_adapter{ + comparator.equal_to(nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; + map.insert_if(haystack_it, + haystack_it + haystack.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + d_hasher, + d_eqcomp, + stream.value()); + }; + + dispatch_nan_comparator(compare_nans, insert_map); + + } else { // haystack_doesn't_have_nulls || compare_nulls == null_equality::EQUAL + auto const insert_map = [&](auto const value_comp) { + auto const d_eqcomp = strong_index_comparator_adapter{ + comparator.equal_to(nullate::DYNAMIC{haystack_has_nulls}, compare_nulls, value_comp)}; + map.insert( + haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value()); + }; + + dispatch_nan_comparator(compare_nans, insert_map); + } + } + + // The output vector. + auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); + + // Check existence for each row of the needles table in the haystack table. 
+ { + auto const needles_it = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(auto const idx) { return rhs_index_type{idx}; }); + + auto const hasher = cudf::experimental::row::hash::row_hasher(needles, stream); + auto const d_hasher = + strong_index_hasher_adapter{hasher.device_hasher(nullate::DYNAMIC{has_any_nulls})}; + + auto const comparator = + cudf::experimental::row::equality::two_table_comparator(haystack, needles, stream); + + auto const check_contains = [&](auto const value_comp) { + auto const d_eqcomp = + comparator.equal_to(nullate::DYNAMIC{has_any_nulls}, compare_nulls, value_comp); + map.contains(needles_it, + needles_it + needles.num_rows(), + contained.begin(), + d_hasher, + d_eqcomp, + stream.value()); + }; + + dispatch_nan_comparator(compare_nans, check_contains); + } + + return contained; +} + +/** + * @brief Check if rows in the given `needles` table exist in the `haystack` table. + * + * This function is designed specifically to work with input tables having only columns of simple + * types, or structs columns of simple types. + * + * @param haystack The table containing the search space + * @param needles A table of rows whose existence to check in the search space + * @param compare_nulls Control whether nulls should be compared as equal or not + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned vector + * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack` + */ +rmm::device_uvector contains_without_lists_or_nans(table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto map = + static_map(compute_hash_table_size(haystack.num_rows()), + cuco::sentinel::empty_key{lhs_index_type{std::numeric_limits::max()}}, + cuco::sentinel::empty_value{detail::JoinNoneValue}, + detail::hash_table_allocator_type{default_allocator{}, stream}, + stream.value()); + + auto const haystack_has_nulls = has_nested_nulls(haystack); + auto const needles_has_nulls = has_nested_nulls(needles); + auto const has_any_nulls = haystack_has_nulls || needles_has_nulls; + + // Flatten the input tables. + auto const flatten_nullability = has_any_nulls + ? structs::detail::column_nullability::FORCE + : structs::detail::column_nullability::MATCH_INCOMING; + auto const haystack_flattened_tables = + structs::detail::flatten_nested_columns(haystack, {}, {}, flatten_nullability); + auto const needles_flattened_tables = + structs::detail::flatten_nested_columns(needles, {}, {}, flatten_nullability); + auto const haystack_flattened = haystack_flattened_tables.flattened_columns(); + auto const needles_flattened = needles_flattened_tables.flattened_columns(); + auto const haystack_tdv_ptr = table_device_view::create(haystack_flattened, stream); + auto const needles_tdv_ptr = table_device_view::create(needles_flattened, stream); + + // Insert row indices of the haystack table as map keys. 
+ { + auto const haystack_it = cudf::detail::make_counting_transform_iterator( + size_type{0}, + [] __device__(auto const idx) { return cuco::make_pair(lhs_index_type{idx}, 0); }); + + auto const d_hasher = strong_index_hasher_adapter{ + row_hash{cudf::nullate::DYNAMIC{has_any_nulls}, *haystack_tdv_ptr}}; + auto const d_eqcomp = + strong_index_comparator_adapter{row_equality{cudf::nullate::DYNAMIC{haystack_has_nulls}, + *haystack_tdv_ptr, + *haystack_tdv_ptr, + compare_nulls}}; + + // If the haystack table has nulls but they are compared unequal, don't insert them. + // Otherwise, it was known to cause performance issue: + // - https://github.com/rapidsai/cudf/pull/6943 + // - https://github.com/rapidsai/cudf/pull/8277 + if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) { + auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream); + auto const row_bitmask_ptr = bitmask_buffer_and_ptr.second; + + // Insert only rows that do not have any null at any level. + map.insert_if(haystack_it, + haystack_it + haystack.num_rows(), + thrust::counting_iterator(0), // stencil + row_is_valid{row_bitmask_ptr}, + d_hasher, + d_eqcomp, + stream.value()); + + } else { // haystack_doesn't_have_nulls || compare_nulls == null_equality::EQUAL + map.insert( + haystack_it, haystack_it + haystack.num_rows(), d_hasher, d_eqcomp, stream.value()); + } + } + + // The output vector. + auto contained = rmm::device_uvector(needles.num_rows(), stream, mr); + + // Check existence for each row of the needles table in the haystack table. + { + auto const needles_it = cudf::detail::make_counting_transform_iterator( + size_type{0}, [] __device__(auto const idx) { return rhs_index_type{idx}; }); + + auto const d_hasher = strong_index_hasher_adapter{ + row_hash{cudf::nullate::DYNAMIC{has_any_nulls}, *needles_tdv_ptr}}; + + auto const d_eqcomp = strong_index_comparator_adapter{row_equality{ + cudf::nullate::DYNAMIC{has_any_nulls}, *haystack_tdv_ptr, *needles_tdv_ptr, compare_nulls}}; + + map.contains(needles_it, + needles_it + needles.num_rows(), + contained.begin(), + d_hasher, + d_eqcomp, + stream.value()); + } + + return contained; +} + +} // namespace + +rmm::device_uvector contains(table_view const& haystack, + table_view const& needles, + null_equality compare_nulls, + nan_equality compare_nans, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Checking for only one table is enough, because both tables will be checked to have the same + // shape later during row comparisons. + auto const has_lists = std::any_of(haystack.begin(), haystack.end(), [](auto const& col) { + return cudf::structs::detail::is_or_has_nested_lists(col); + }); + + if (has_lists || compare_nans == nan_equality::UNEQUAL) { + // We must call a separate code path that uses the new experimental row hasher and row + // comparator if: + // - The input has lists column, or + // - Floating-point NaNs are compared as unequal. + // Inputs with these conditions are supported only by this code path. + return contains_with_lists_or_nans(haystack, needles, compare_nulls, compare_nans, stream, mr); + } + + // If the input tables don't have lists column and NaNs are compared equal, we rely on the classic + // code path that flattens the input tables for row comparisons. This way is known to have + // better performance. 
+ return contains_without_lists_or_nans(haystack, needles, compare_nulls, stream, mr); + + // Note: We have to keep separate code paths because unifying them will cause performance + // regression for the input having no nested lists. + // + // TODO: We should unify these code paths in the future when performance regression is no longer + // happening. +} + +} // namespace cudf::detail From 9e01aa307f79a2a64ccc5554cb587f58ced6f1c9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 25 Jul 2022 09:09:57 -0700 Subject: [PATCH 27/31] Add test of wildcard selection (#11300) Resolves #7961 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Sheilah Kirui (https://github.com/skirui-source) URL: https://github.com/rapidsai/cudf/pull/11300 --- python/cudf/cudf/tests/test_dataframe.py | 36 ++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2382e6d5780..1a7acd01ce9 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9470,3 +9470,39 @@ def test_value_counts( with pytest.raises(KeyError): gdf.value_counts(subset=["not_a_column_name"]) + + +@pytest.fixture +def wildcard_df(): + midx = cudf.MultiIndex.from_tuples( + [(c1, c2) for c1 in "abc" for c2 in "ab"] + ) + df = cudf.DataFrame({f"{i}": [i] for i in range(6)}) + df.columns = midx + return df + + +def test_multiindex_wildcard_selection_all(wildcard_df): + expect = wildcard_df.to_pandas().loc[:, (slice(None), "b")] + got = wildcard_df.loc[:, (slice(None), "b")] + assert_eq(expect, got) + + +@pytest.mark.xfail(reason="Not yet properly supported.") +def test_multiindex_wildcard_selection_partial(wildcard_df): + expect = wildcard_df.to_pandas().loc[:, (slice("a", "b"), "b")] + got = wildcard_df.loc[:, (slice("a", "b"), "b")] + assert_eq(expect, got) + + +@pytest.mark.xfail(reason="Not yet properly supported.") +def test_multiindex_wildcard_selection_three_level_all(): + midx = cudf.MultiIndex.from_tuples( + [(c1, c2, c3) for c1 in "abcd" for c2 in "abc" for c3 in "ab"] + ) + df = cudf.DataFrame({f"{i}": [i] for i in range(24)}) + df.columns = midx + + expect = df.to_pandas().loc[:, (slice("a", "c"), slice("a", "b"), "b")] + got = df.loc[:, (slice(None), "b")] + assert_eq(expect, got) From 97adac5cf92e932e4f4157f0c0c3690a5e91af09 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 25 Jul 2022 14:11:28 -0400 Subject: [PATCH 28/31] Fix unsigned-compare compile warning in IntPow binops (#11339) Fixes a compile warning was introduced in PR #11025 : [link to log containing the warning](https://gpuci.gpuopenanalytics.com/job/rapidsai/job/gpuci/job/cudf/job/prb/job/cudf-cpu-cuda-build/CUDA=11.5/10790/consoleFull) When a templated variable `y` is unsigned the compare `(y<0)` results in a compile warning: ``` /cudf/cpp/src/binaryop/compiled/operation.cuh(226): warning #186-D: pointless comparison of unsigned integer with zero detected during: instantiation of "auto cudf::binops::compiled::ops::IntPow::operator()(TypeLhs, TypeRhs)->TypeLhs [with TypeLhs=uint8_t, TypeRhs=uint8_t, =(void *)nullptr]" ``` Adding an `if constexpr` around the comparison removes the warning. 
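A minimal self-contained illustration of the pattern (simplified and assumed; the real functor takes separate `TypeLhs`/`TypeRhs` types and is not reproduced here):

```
#include <type_traits>

// The negative-exponent check only makes sense for signed types. Guarding it with
// `if constexpr` means the `y < 0` comparison is never instantiated for unsigned
// types, so the "pointless comparison of unsigned integer with zero" warning cannot fire.
template <typename T>
T int_pow_sketch(T x, T y)
{
  if constexpr (std::is_signed_v<T>) {
    if (y < 0) { return 0; }  // integer exponentiation with a negative exponent
  }
  if (y == 0) { return 1; }
  if (x == 0) { return 0; }
  T result{1};
  while (y > 0) {
    result *= x;
    --y;
  }
  return result;
}
```

The diff below applies the same guard to the existing `IntPow` operator without changing its behavior for signed inputs.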
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/11339 --- cpp/src/binaryop/compiled/operation.cuh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh index b4a396b3cbf..68b750ca01a 100644 --- a/cpp/src/binaryop/compiled/operation.cuh +++ b/cpp/src/binaryop/compiled/operation.cuh @@ -223,9 +223,11 @@ struct IntPow { std::enable_if_t<(std::is_integral_v and std::is_integral_v)>* = nullptr> __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> TypeLhs { - if (y < 0) { - // Integer exponentiation with negative exponent is not possible. - return 0; + if constexpr (std::is_signed_v) { + if (y < 0) { + // Integer exponentiation with negative exponent is not possible. + return 0; + } } if (y == 0) { return 1; } if (x == 0) { return 0; } From 39e4feb064186bc32be649226e14b6482049198b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 25 Jul 2022 15:57:52 -0500 Subject: [PATCH 29/31] Update missing data handling APIs in docs (#11345) This PR updates a few missing data operation APIs in docs. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11345 --- docs/cudf/source/api_docs/dataframe.rst | 4 ++++ docs/cudf/source/api_docs/series.rst | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/api_docs/dataframe.rst b/docs/cudf/source/api_docs/dataframe.rst index 7ee55bdaa81..bd868e85cc7 100644 --- a/docs/cudf/source/api_docs/dataframe.rst +++ b/docs/cudf/source/api_docs/dataframe.rst @@ -187,7 +187,10 @@ Missing data handling .. autosummary:: :toctree: api/ + DataFrame.backfill + DataFrame.bfill DataFrame.dropna + DataFrame.ffill DataFrame.fillna DataFrame.interpolate DataFrame.isna @@ -195,6 +198,7 @@ Missing data handling DataFrame.nans_to_nulls DataFrame.notna DataFrame.notnull + DataFrame.pad DataFrame.replace Reshaping, sorting, transposing diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst index 66e56ffb1bc..1e53c90b44d 100644 --- a/docs/cudf/source/api_docs/series.rst +++ b/docs/cudf/source/api_docs/series.rst @@ -185,13 +185,17 @@ Missing data handling .. autosummary:: :toctree: api/ + Series.backfill + Series.bfill Series.dropna + Series.ffill Series.fillna Series.isna Series.isnull Series.nans_to_nulls Series.notna Series.notnull + Series.pad Series.replace Reshaping, sorting @@ -200,11 +204,9 @@ Reshaping, sorting :toctree: api/ Series.argsort - Series.interleave_columns Series.sort_values Series.sort_index Series.explode - Series.scatter_by_map Series.searchsorted Series.repeat Series.transpose From a652ca918afb5ca38e9757d36df0cebc22ea296c Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 25 Jul 2022 16:57:07 -0500 Subject: [PATCH 30/31] Add lists filtering APIs to doxygen group. (#11336) This PR follows up on #11149 to add the lists filtering (stream compaction) APIs to a doxygen group. The previous doxygen group `lists_drop_duplicates` is empty after #11326. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/11336 --- cpp/include/cudf/lists/stream_compaction.hpp | 8 ++++++++ cpp/include/doxygen_groups.h | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index c7a326af701..5ddaa992184 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -23,6 +23,12 @@ namespace cudf::lists { +/** + * @addtogroup lists_filtering + * @{ + * @file + */ + /** * @brief Filters elements in each row of `input` LIST column using `boolean_mask` * LIST of booleans as a mask. @@ -81,4 +87,6 @@ std::unique_ptr distinct( nan_equality nans_equal = nan_equality::ALL_EQUAL, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** @} */ // end of group + } // namespace cudf::lists diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index 0abaebc3b0c..8854ea80635 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -150,7 +150,7 @@ * @defgroup lists_contains Searching * @defgroup lists_gather Gathering * @defgroup lists_elements Counting - * @defgroup lists_drop_duplicates Filtering + * @defgroup lists_filtering Filtering * @defgroup lists_sort Sorting * @} * @defgroup nvtext_apis NVText From 2d214ea28b5cfc1f046d924fb62fda27257a2c5f Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 25 Jul 2022 18:35:29 -0500 Subject: [PATCH 31/31] Fix `DatetimeIndex` & `TimedeltaIndex` constructors (#11342) Closes #11335 This PR fixes an issue with `DatetimeIndex` & `TimedeltaIndex` where the underlying columns would still be of numeric or string types rather than `DatetimeColumn` and `TimedeltaColumn` respectively. This is the actual root cause that leads to errors in some downstream API calls. 
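A rough illustration of the intended behavior (assumed and simplified; it mirrors the new constructor tests added below):

```
import pandas as pd
import cudf

# Integer input with an explicit datetime dtype should be stored in a
# datetime64[ns] column (matching pandas) rather than staying numeric.
data = [43534, 43543, 37897, 2000]

pdi = pd.DatetimeIndex(data, dtype="datetime64[ns]")
gdi = cudf.DatetimeIndex(data, dtype="datetime64[ns]")

assert str(gdi.dtype) == "datetime64[ns]"  # previously the column could remain numeric
assert (gdi.to_pandas() == pdi).all()
```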
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/11342 --- python/cudf/cudf/core/index.py | 32 ++++++++++++------------ python/cudf/cudf/tests/test_datetime.py | 28 +++++++++++++++++++++ python/cudf/cudf/tests/test_timedelta.py | 14 +++++++++++ 3 files changed, 58 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 66ab742d4a7..a0d2a161674 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1783,15 +1783,12 @@ def __init__( elif dtype not in valid_dtypes: raise TypeError("Invalid dtype") - if copy: - data = column.as_column(data).copy() kwargs = _setdefault_name(data, name=name) - if isinstance(data, np.ndarray) and data.dtype.kind == "M": - data = column.as_column(data) - elif isinstance(data, pd.DatetimeIndex): - data = column.as_column(data.values) - elif isinstance(data, (list, tuple)): - data = column.as_column(np.array(data, dtype=dtype)) + data = column.as_column(data, dtype=dtype) + + if copy: + data = data.copy() + super().__init__(data, **kwargs) @property # type: ignore @@ -2263,15 +2260,18 @@ def __init__( "dtype parameter is supported" ) - if copy: - data = column.as_column(data).copy() + valid_dtypes = tuple( + f"timedelta64[{res}]" for res in ("s", "ms", "us", "ns") + ) + if dtype not in valid_dtypes: + raise TypeError("Invalid dtype") + kwargs = _setdefault_name(data, name=name) - if isinstance(data, np.ndarray) and data.dtype.kind == "m": - data = column.as_column(data) - elif isinstance(data, pd.TimedeltaIndex): - data = column.as_column(data.values) - elif isinstance(data, (list, tuple)): - data = column.as_column(np.array(data, dtype=dtype)) + data = column.as_column(data, dtype=dtype) + + if copy: + data = data.copy() + super().__init__(data, **kwargs) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py index 07242ea49f5..04ff5b88214 100644 --- a/python/cudf/cudf/tests/test_datetime.py +++ b/python/cudf/cudf/tests/test_datetime.py @@ -2007,3 +2007,31 @@ def test_last(idx, offset): got = g.last(offset=offset) assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + "2020-01-31", + "2020-02-15", + "2020-02-29", + "2020-03-15", + "2020-03-31", + "2020-04-15", + "2020-04-30", + ], + [43534, 43543, 37897, 2000], + ], +) +@pytest.mark.parametrize("dtype", [None, "datetime64[ns]"]) +def test_datetime_constructor(data, dtype): + expected = pd.DatetimeIndex(data=data, dtype=dtype) + actual = cudf.DatetimeIndex(data=data, dtype=dtype) + + assert_eq(expected, actual) + + expected = pd.DatetimeIndex(data=pd.Series(data), dtype=dtype) + actual = cudf.DatetimeIndex(data=cudf.Series(data), dtype=dtype) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index cce2ac639ef..23270875a92 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1389,3 +1389,17 @@ def test_create_TimedeltaIndex(dtype, name): ) pdi = gdi.to_pandas() assert_eq(pdi, gdi) + + +@pytest.mark.parametrize("data", [[43534, 43543, 37897, 2000]]) +@pytest.mark.parametrize("dtype", ["timedelta64[ns]"]) +def test_timedelta_constructor(data, dtype): + expected = pd.TimedeltaIndex(data=data, dtype=dtype) + actual = cudf.TimedeltaIndex(data=data, dtype=dtype) + + assert_eq(expected, actual) + + expected = 
pd.TimedeltaIndex(data=pd.Series(data), dtype=dtype) + actual = cudf.TimedeltaIndex(data=cudf.Series(data), dtype=dtype) + + assert_eq(expected, actual)