Skip to content

Commit

Permalink
Adding support for equi-join on struct (#7720)
Browse files Browse the repository at this point in the history
Adds support for equi-joins on struct columns.

This PR builds on the [struct PR](#7422) and the [join API rewrite](#7454). It enables equi-joins on struct columns by flattening the structs for the hash calculation.

closes #7543

Authors:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Ashwin Srinath (https://github.com/shwina)
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Alessandro Bellina (https://github.com/abellina)
  - Devavret Makkar (https://github.com/devavret)
  - David Wendt (https://github.com/davidwendt)
  - Liangcai Li (https://github.com/firestarman)
  - Paul Taylor (https://github.com/trxcllnt)
  - Kumar Aatish (https://github.com/kaatish)
  - Jason Lowe (https://github.com/jlowe)
  - Dillon Cullinan (https://github.com/dillon-cullinan)
  - Raza Jafri (https://github.com/razajafri)
  - https://github.com/rwlee
  - Michael Wang (https://github.com/isVoid)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Keith Kraus (https://github.com/kkraus14)
  - Robert Maynard (https://github.com/robertmaynard)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - https://github.com/ChrisJar
  - AJ Schmidt (https://github.com/ajschmidt8)
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)
  - https://github.com/chenrui17
  - Conor Hoekstra (https://github.com/codereport)
  - Mike Wendt (https://github.com/mike-wendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Devavret Makkar (https://github.com/devavret)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #7720
  • Loading branch information
hyperbolic2346 authored Apr 1, 2021
1 parent 299f6cc commit f4ab813
Show file tree
Hide file tree
Showing 2 changed files with 314 additions and 8 deletions.
22 changes: 14 additions & 8 deletions cpp/src/join/hash_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
#include <thrust/uninitialized_fill.h>
#include <join/hash_join.cuh>
#include <structs/utilities.hpp>

#include <cudf/detail/concatenate.cuh>
#include <cudf/detail/gather.cuh>
Expand Down Expand Up @@ -299,13 +300,15 @@ hash_join::hash_join_impl::~hash_join_impl() = default;
hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build,
null_equality compare_nulls,
rmm::cuda_stream_view stream)
: _build(build), _hash_table(nullptr)
: _hash_table(nullptr)
{
CUDF_FUNC_RANGE();
CUDF_EXPECTS(0 != _build.num_columns(), "Hash join build table is empty");
CUDF_EXPECTS(_build.num_rows() < cudf::detail::MAX_JOIN_SIZE,
CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty");
CUDF_EXPECTS(build.num_rows() < cudf::detail::MAX_JOIN_SIZE,
"Build column size is too big for hash join");

_build = std::get<0>(structs::detail::flatten_nested_columns(build, {}, {}));

if (0 == build.num_rows()) { return; }

_hash_table = build_join_hash_table(_build, compare_nulls, stream);
Expand Down Expand Up @@ -355,22 +358,25 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe,
CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty");
CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE,
"Probe column size is too big for hash join");
CUDF_EXPECTS(_build.num_columns() == probe.num_columns(),

auto const _probe = std::get<0>(structs::detail::flatten_nested_columns(probe, {}, {}));

CUDF_EXPECTS(_build.num_columns() == _probe.num_columns(),
"Mismatch in number of columns to be joined on");

if (is_trivial_join(probe, _build, JoinKind)) {
if (is_trivial_join(_probe, _build, JoinKind)) {
return std::make_pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
}

CUDF_EXPECTS(std::equal(std::cbegin(_build),
std::cend(_build),
std::cbegin(probe),
std::cend(probe),
std::cbegin(_probe),
std::cend(_probe),
[](const auto &b, const auto &p) { return b.type() == p.type(); }),
"Mismatch in joining column data types");

return probe_join_indices<JoinKind>(probe, compare_nulls, stream, mr);
return probe_join_indices<JoinKind>(_probe, compare_nulls, stream, mr);
}

template <cudf::detail::join_kind JoinKind>
Expand Down
300 changes: 300 additions & 0 deletions cpp/tests/join/join_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,97 @@ TEST_F(JoinTest, LeftJoinWithNulls)
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
}

// Left join keyed solely on a struct column (index 3 on both sides) whose
// boolean child contains nulls.
TEST_F(JoinTest, LeftJoinWithStructsAndNulls)
{
  // ---- Left input: three plain columns plus a {name, age, is_human} struct ----
  auto left_ids = column_wrapper<int32_t>{{3, 1, 2, 0, 2}};
  strcol_wrapper left_tags({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
  auto left_vals = column_wrapper<int32_t>{{0, 1, 2, 4, 1}};

  strcol_wrapper left_names{
    "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
  column_wrapper<int32_t> left_ages{{48, 27, 351, 31, 25}};
  column_wrapper<bool> left_is_human{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
  cudf::test::structs_column_wrapper left_people{{left_names, left_ages, left_is_human}};

  // ---- Right input: same layout as the left table ----
  auto right_ids = column_wrapper<int32_t>{{2, 2, 0, 4, 3}};
  strcol_wrapper right_tags({"s1", "s0", "s1", "s2", "s1"});
  auto right_vals = column_wrapper<int32_t>{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};

  strcol_wrapper right_names{
    "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"};
  column_wrapper<int32_t> right_ages{{48, 35, 351, 22, 25}};
  column_wrapper<bool> right_is_human{{true, true, false, false, true}, {1, 1, 0, 1, 1}};
  cudf::test::structs_column_wrapper right_people{{right_names, right_ages, right_is_human}};

  CVector left_cols, right_cols;
  left_cols.push_back(left_ids.release());
  left_cols.push_back(left_tags.release());
  left_cols.push_back(left_vals.release());
  left_cols.push_back(left_people.release());
  right_cols.push_back(right_ids.release());
  right_cols.push_back(right_tags.release());
  right_cols.push_back(right_vals.release());
  right_cols.push_back(right_people.release());

  Table left(std::move(left_cols));
  Table right(std::move(right_cols));

  // Join on the struct column only, then sort the rows so the comparison below
  // does not depend on the row order the join produces.
  auto joined             = cudf::left_join(left, right, {3}, {3});
  auto const joined_order = cudf::sorted_order(joined->view());
  auto const sorted_join  = cudf::gather(joined->view(), *joined_order);

  // ---- Expected left half: all five left rows ----
  auto want_left_ids = column_wrapper<int32_t>{{3, 2, 1, 0, 2}, {1, 1, 1, 1, 1}};
  strcol_wrapper want_left_tags({"s1", "", "s1", "s4", "s0"}, {1, 0, 1, 1, 1});
  auto want_left_vals = column_wrapper<int32_t>{{0, 2, 1, 4, 1}, {1, 1, 1, 1, 1}};

  strcol_wrapper want_left_names{
    "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"};
  column_wrapper<int32_t> want_left_ages{{48, 351, 27, 31, 25}};
  column_wrapper<bool> want_left_is_human{{true, false, true, false, false}, {1, 0, 1, 1, 0}};
  cudf::test::structs_column_wrapper want_left_people{
    {want_left_names, want_left_ages, want_left_is_human}};

  // ---- Expected right half: valid for the first two rows, null for the rest ----
  auto want_right_ids = column_wrapper<int32_t>{{2, 0, -1, -1, -1}, {1, 1, 0, 0, 0}};
  strcol_wrapper want_right_tags{{"s1", "s1", "", "", ""}, {1, 1, 0, 0, 0}};
  auto want_right_vals = column_wrapper<int32_t>{{1, 1, -1, -1, -1}, {1, 1, 0, 0, 0}};

  strcol_wrapper want_right_names{{
                                    "Samuel Vimes",
                                    "Detritus",
                                    "",
                                    "",
                                    "",
                                  },
                                  {1, 1, 0, 0, 0}};
  column_wrapper<int32_t> want_right_ages{{48, 351, -1, -1, -1}, {1, 1, 0, 0, 0}};
  column_wrapper<bool> want_right_is_human{{true, false, false, false, false}, {1, 0, 0, 0, 0}};
  cudf::test::structs_column_wrapper want_right_people{
    {want_right_names, want_right_ages, want_right_is_human}, {1, 1, 0, 0, 0}};

  CVector want_cols;
  want_cols.push_back(want_left_ids.release());
  want_cols.push_back(want_left_tags.release());
  want_cols.push_back(want_left_vals.release());
  want_cols.push_back(want_left_people.release());
  want_cols.push_back(want_right_ids.release());
  want_cols.push_back(want_right_tags.release());
  want_cols.push_back(want_right_vals.release());
  want_cols.push_back(want_right_people.release());
  Table want(std::move(want_cols));

  auto const want_order  = cudf::sorted_order(want.view());
  auto const sorted_want = cudf::gather(want.view(), *want_order);
  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_want, *sorted_join);
}

TEST_F(JoinTest, LeftJoinOnNulls)
{
// clang-format off
Expand Down Expand Up @@ -629,6 +720,91 @@ TEST_F(JoinTest, InnerJoinWithNulls)
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result);
}

// Inner join keyed on an integer column, a string column and a struct column
// (indices 0, 1 and 3 on both sides); the struct's boolean child contains nulls.
TEST_F(JoinTest, InnerJoinWithStructsAndNulls)
{
  // ---- Left input ----
  auto left_ids = column_wrapper<int32_t>{{3, 1, 2, 0, 2}};
  strcol_wrapper left_tags({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
  auto left_vals = column_wrapper<int32_t>{{0, 1, 2, 4, 1}};

  std::initializer_list<std::string> left_name_list = {
    "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
  strcol_wrapper left_names{left_name_list.begin(), left_name_list.end()};
  column_wrapper<int32_t> left_ages{{48, 27, 351, 31, 25}};
  column_wrapper<bool> left_is_human{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
  cudf::test::structs_column_wrapper left_people{{left_names, left_ages, left_is_human}};

  // ---- Right input ----
  auto right_ids = column_wrapper<int32_t>{{2, 2, 0, 4, 3}};
  strcol_wrapper right_tags({"s1", "s0", "s1", "s2", "s1"});
  auto right_vals = column_wrapper<int32_t>{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};

  std::initializer_list<std::string> right_name_list = {"Carrot Ironfoundersson",
                                                        "Angua von Überwald",
                                                        "Detritus",
                                                        "Carrot Ironfoundersson",
                                                        "Samuel Vimes"};
  strcol_wrapper right_names{right_name_list.begin(), right_name_list.end()};
  column_wrapper<int32_t> right_ages{{351, 25, 27, 31, 48}};
  column_wrapper<bool> right_is_human{{true, false, false, false, true}, {1, 0, 0, 1, 1}};
  cudf::test::structs_column_wrapper right_people{{right_names, right_ages, right_is_human}};

  CVector left_cols, right_cols;
  left_cols.push_back(left_ids.release());
  left_cols.push_back(left_tags.release());
  left_cols.push_back(left_vals.release());
  left_cols.push_back(left_people.release());
  right_cols.push_back(right_ids.release());
  right_cols.push_back(right_tags.release());
  right_cols.push_back(right_vals.release());
  right_cols.push_back(right_people.release());

  Table left(std::move(left_cols));
  Table right(std::move(right_cols));

  // Join, then sort the rows so the comparison below does not depend on the
  // row order the join produces.
  auto joined             = cudf::inner_join(left, right, {0, 1, 3}, {0, 1, 3});
  auto const joined_order = cudf::sorted_order(joined->view());
  auto const sorted_join  = cudf::gather(joined->view(), *joined_order);

  // ---- Expected left half: two rows ----
  auto want_left_ids = column_wrapper<int32_t>{{3, 2}};
  strcol_wrapper want_left_tags({"s1", "s0"}, {1, 1});
  auto want_left_vals = column_wrapper<int32_t>{{0, 1}};

  strcol_wrapper want_left_names{"Samuel Vimes", "Angua von Überwald"};
  column_wrapper<int32_t> want_left_ages{{48, 25}};
  column_wrapper<bool> want_left_is_human{{true, false}, {1, 0}};
  cudf::test::structs_column_wrapper want_left_people{
    {want_left_names, want_left_ages, want_left_is_human}};

  // ---- Expected right half: the rows paired with the two left rows above ----
  auto want_right_ids = column_wrapper<int32_t>{{3, 2}};
  strcol_wrapper want_right_tags({"s1", "s0"}, {1, 1});
  auto want_right_vals = column_wrapper<int32_t>{{1, -1}, {1, 0}};

  strcol_wrapper want_right_names{"Samuel Vimes", "Angua von Überwald"};
  column_wrapper<int32_t> want_right_ages{{48, 25}};
  column_wrapper<bool> want_right_is_human{{true, false}, {1, 0}};
  cudf::test::structs_column_wrapper want_right_people{
    {want_right_names, want_right_ages, want_right_is_human}};

  CVector want_cols;
  want_cols.push_back(want_left_ids.release());
  want_cols.push_back(want_left_tags.release());
  want_cols.push_back(want_left_vals.release());
  want_cols.push_back(want_left_people.release());
  want_cols.push_back(want_right_ids.release());
  want_cols.push_back(want_right_tags.release());
  want_cols.push_back(want_right_vals.release());
  want_cols.push_back(want_right_people.release());
  Table want(std::move(want_cols));

  auto const want_order  = cudf::sorted_order(want.view());
  auto const sorted_want = cudf::gather(want.view(), *want_order);
  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_want, *sorted_join);
}

// Test to check join behaviour when join keys are null.
TEST_F(JoinTest, InnerJoinOnNulls)
{
Expand Down Expand Up @@ -1359,4 +1535,128 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls)
CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded));
}

// Full join keyed on an integer column, a string column and a struct column
// (indices 0, 1 and 3 on both sides). The expected table has ten rows: five
// whose right half is entirely null and five whose left half is entirely null.
TEST_F(JoinTest, FullJoinWithStructsAndNulls)
{
  // ---- Left input ----
  auto left_ids = column_wrapper<int32_t>{{3, 1, 2, 0, 3}};
  strcol_wrapper left_tags({"s0", "s1", "s2", "s4", "s1"});
  auto left_vals = column_wrapper<int32_t>{{0, 1, 2, 4, 1}};

  std::initializer_list<std::string> left_name_list = {"Samuel Vimes",
                                                       "Carrot Ironfoundersson",
                                                       "Angua von Überwald",
                                                       "Detritus",
                                                       "Carrot Ironfoundersson"};
  strcol_wrapper left_names{left_name_list.begin(), left_name_list.end()};
  column_wrapper<int32_t> left_ages{{48, 27, 25, 31, 351}};
  column_wrapper<bool> left_is_human{{true, true, false, false, false}, {1, 1, 0, 1, 1}};
  cudf::test::structs_column_wrapper left_people{{left_names, left_ages, left_is_human}};

  // ---- Right input ----
  auto right_ids = column_wrapper<int32_t>{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}};
  strcol_wrapper right_tags{{"s1", "s0", "s1", "s2", "s1"}};
  auto right_vals = column_wrapper<int32_t>{{1, 0, 1, 2, 1}};

  std::initializer_list<std::string> right_name_list = {"Carrot Ironfoundersson",
                                                        "Samuel Vimes",
                                                        "Carrot Ironfoundersson",
                                                        "Angua von Überwald",
                                                        "Carrot Ironfoundersson"};
  strcol_wrapper right_names{right_name_list.begin(), right_name_list.end()};
  column_wrapper<int32_t> right_ages{{27, 48, 27, 25, 27}};
  column_wrapper<bool> right_is_human{{true, true, true, false, true}, {1, 1, 1, 0, 1}};
  cudf::test::structs_column_wrapper right_people{{right_names, right_ages, right_is_human}};

  CVector left_cols, right_cols;
  left_cols.push_back(left_ids.release());
  left_cols.push_back(left_tags.release());
  left_cols.push_back(left_vals.release());
  left_cols.push_back(left_people.release());
  right_cols.push_back(right_ids.release());
  right_cols.push_back(right_tags.release());
  right_cols.push_back(right_vals.release());
  right_cols.push_back(right_people.release());

  Table left(std::move(left_cols));
  Table right(std::move(right_cols));

  // Join, then sort the rows so the comparison below does not depend on the
  // row order the join produces.
  auto joined             = cudf::full_join(left, right, {0, 1, 3}, {0, 1, 3});
  auto const joined_order = cudf::sorted_order(joined->view());
  auto const sorted_join  = cudf::gather(joined->view(), *joined_order);

  // ---- Expected left half: the five left rows, then five all-null rows ----
  auto want_left_ids = column_wrapper<int32_t>{{3, 1, 2, 0, 3, -1, -1, -1, -1, -1},
                                               {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
  strcol_wrapper want_left_tags({"s0", "s1", "s2", "s4", "s1", "", "", "", "", ""},
                                {1, 1, 1, 1, 1, 0, 0, 0, 0, 0});
  auto want_left_vals = column_wrapper<int32_t>{{0, 1, 2, 4, 1, -1, -1, -1, -1, -1},
                                                {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};

  strcol_wrapper want_left_names{{"Samuel Vimes",
                                  "Carrot Ironfoundersson",
                                  "Angua von Überwald",
                                  "Detritus",
                                  "Carrot Ironfoundersson",
                                  "",
                                  "",
                                  "",
                                  "",
                                  ""},
                                 {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
  column_wrapper<int32_t> want_left_ages{{48, 27, 25, 31, 351, -1, -1, -1, -1, -1},
                                         {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
  column_wrapper<bool> want_left_is_human{
    {true, true, false, false, false, false, false, false, false, false},
    {1, 1, 0, 1, 1, 0, 0, 0, 0, 0}};
  cudf::test::structs_column_wrapper want_left_people{
    {want_left_names, want_left_ages, want_left_is_human}, {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};

  // ---- Expected right half: five all-null rows, then the five right rows ----
  auto want_right_ids = column_wrapper<int32_t>{{-1, -1, -1, -1, -1, 3, 2, 2, 0, 4},
                                                {0, 0, 0, 0, 0, 1, 1, 1, 1, 0}};
  strcol_wrapper want_right_tags({"", "", "", "", "", "s1", "s1", "s0", "s1", "s2"},
                                 {0, 0, 0, 0, 0, 1, 1, 1, 1, 1});
  auto want_right_vals = column_wrapper<int32_t>{{-1, -1, -1, -1, -1, 1, 1, 0, 1, 2},
                                                 {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};

  strcol_wrapper want_right_names{{"",
                                   "",
                                   "",
                                   "",
                                   "",
                                   "Carrot Ironfoundersson",
                                   "Carrot Ironfoundersson",
                                   "Samuel Vimes",
                                   "Carrot Ironfoundersson",
                                   "Angua von Überwald"},
                                  {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};
  column_wrapper<int32_t> want_right_ages{{-1, -1, -1, -1, -1, 27, 27, 48, 27, 25},
                                          {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};
  column_wrapper<bool> want_right_is_human{
    {false, false, false, false, false, true, true, true, true, false},
    {0, 0, 0, 0, 0, 1, 1, 1, 1, 0}};
  cudf::test::structs_column_wrapper want_right_people{
    {want_right_names, want_right_ages, want_right_is_human}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};

  CVector want_cols;
  want_cols.push_back(want_left_ids.release());
  want_cols.push_back(want_left_tags.release());
  want_cols.push_back(want_left_vals.release());
  want_cols.push_back(want_left_people.release());
  want_cols.push_back(want_right_ids.release());
  want_cols.push_back(want_right_tags.release());
  want_cols.push_back(want_right_vals.release());
  want_cols.push_back(want_right_people.release());

  Table want(std::move(want_cols));

  auto const want_order  = cudf::sorted_order(want.view());
  auto const sorted_want = cudf::gather(want.view(), *want_order);
  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_want, *sorted_join);
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit f4ab813

Please sign in to comment.