From f4ab813308eba2ee6c278d48030d301d44d9f39b Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Thu, 1 Apr 2021 11:07:35 -0400 Subject: [PATCH] Adding support for equi-join on struct (#7720) Adds support for equi-joins on struct columns. This PR builds on the [struct PR](https://github.com/rapidsai/cudf/pull/7422) and the [rewrite for join API](https://github.com/rapidsai/cudf/pull/7454). It enables equi-joins on structs by flattening the nested columns of both the build and probe tables before the hash calculation. Closes #7543 Authors: - Mike Wilson (https://github.com/hyperbolic2346) - Ashwin Srinath (https://github.com/shwina) - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Alessandro Bellina (https://github.com/abellina) - Devavret Makkar (https://github.com/devavret) - David Wendt (https://github.com/davidwendt) - Liangcai Li (https://github.com/firestarman) - Paul Taylor (https://github.com/trxcllnt) - Kumar Aatish (https://github.com/kaatish) - Jason Lowe (https://github.com/jlowe) - Dillon Cullinan (https://github.com/dillon-cullinan) - Raza Jafri (https://github.com/razajafri) - https://github.com/rwlee - Michael Wang (https://github.com/isVoid) - Dante Gama Dessavre (https://github.com/dantegd) - Keith Kraus (https://github.com/kkraus14) - Robert Maynard (https://github.com/robertmaynard) - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/ChrisJar - AJ Schmidt (https://github.com/ajschmidt8) - https://github.com/nvdbaranec - Nghia Truong (https://github.com/ttnghia) - https://github.com/chenrui17 - Conor Hoekstra (https://github.com/codereport) - Mike Wendt (https://github.com/mike-wendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Devavret Makkar (https://github.com/devavret) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/7720 --- cpp/src/join/hash_join.cu | 22 ++- cpp/tests/join/join_tests.cpp | 300 ++++++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+), 8
deletions(-) diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu index 5a6ad8892de..15eb122ef27 100644 --- a/cpp/src/join/hash_join.cu +++ b/cpp/src/join/hash_join.cu @@ -15,6 +15,7 @@ */ #include #include +#include #include #include @@ -299,13 +300,15 @@ hash_join::hash_join_impl::~hash_join_impl() = default; hash_join::hash_join_impl::hash_join_impl(cudf::table_view const &build, null_equality compare_nulls, rmm::cuda_stream_view stream) - : _build(build), _hash_table(nullptr) + : _hash_table(nullptr) { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(0 != _build.num_columns(), "Hash join build table is empty"); - CUDF_EXPECTS(_build.num_rows() < cudf::detail::MAX_JOIN_SIZE, + CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty"); + CUDF_EXPECTS(build.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Build column size is too big for hash join"); + _build = std::get<0>(structs::detail::flatten_nested_columns(build, {}, {})); + if (0 == build.num_rows()) { return; } _hash_table = build_join_hash_table(_build, compare_nulls, stream); @@ -355,22 +358,25 @@ hash_join::hash_join_impl::compute_hash_join(cudf::table_view const &probe, CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty"); CUDF_EXPECTS(probe.num_rows() < cudf::detail::MAX_JOIN_SIZE, "Probe column size is too big for hash join"); - CUDF_EXPECTS(_build.num_columns() == probe.num_columns(), + + auto const _probe = std::get<0>(structs::detail::flatten_nested_columns(probe, {}, {})); + + CUDF_EXPECTS(_build.num_columns() == _probe.num_columns(), "Mismatch in number of columns to be joined on"); - if (is_trivial_join(probe, _build, JoinKind)) { + if (is_trivial_join(_probe, _build, JoinKind)) { return std::make_pair(std::make_unique>(0, stream, mr), std::make_unique>(0, stream, mr)); } CUDF_EXPECTS(std::equal(std::cbegin(_build), std::cend(_build), - std::cbegin(probe), - std::cend(probe), + std::cbegin(_probe), + std::cend(_probe), [](const auto &b, const auto &p) { return 
b.type() == p.type(); }), "Mismatch in joining column data types"); - return probe_join_indices(probe, compare_nulls, stream, mr); + return probe_join_indices(_probe, compare_nulls, stream, mr); } template diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 32192234c56..365653d701f 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -410,6 +410,97 @@ TEST_F(JoinTest, LeftJoinWithNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } +TEST_F(JoinTest, LeftJoinWithStructsAndNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + auto col0_names_col = strcol_wrapper{ + "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"}; + auto col0_ages_col = column_wrapper{{48, 27, 351, 31, 25}}; + + auto col0_is_human_col = column_wrapper{{true, true, false, false, false}, {1, 1, 0, 1, 0}}; + + auto col0_3 = + cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + auto col1_names_col = strcol_wrapper{ + "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"}; + auto col1_ages_col = column_wrapper{{48, 35, 351, 22, 25}}; + + auto col1_is_human_col = column_wrapper{{true, true, false, false, true}, {1, 1, 0, 1, 1}}; + + auto col1_3 = + cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols0.push_back(col0_3.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + 
cols1.push_back(col1_3.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::left_join(t0, t1, {3}, {3}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{3, 2, 1, 0, 2}, {1, 1, 1, 1, 1}}; + strcol_wrapper col_gold_1({"s1", "", "s1", "s4", "s0"}, {1, 0, 1, 1, 1}); + column_wrapper col_gold_2{{0, 2, 1, 4, 1}, {1, 1, 1, 1, 1}}; + auto col0_gold_names_col = strcol_wrapper{ + "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; + auto col0_gold_ages_col = column_wrapper{{48, 351, 27, 31, 25}}; + + auto col0_gold_is_human_col = + column_wrapper{{true, false, true, false, false}, {1, 0, 1, 1, 0}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {col0_gold_names_col, col0_gold_ages_col, col0_gold_is_human_col}}; + + column_wrapper col_gold_4{{2, 0, -1, -1, -1}, {1, 1, 0, 0, 0}}; + strcol_wrapper col_gold_5{{"s1", "s1", "", "", ""}, {1, 1, 0, 0, 0}}; + column_wrapper col_gold_6{{1, 1, -1, -1, -1}, {1, 1, 0, 0, 0}}; + auto col1_gold_names_col = strcol_wrapper{{ + "Samuel Vimes", + "Detritus", + "", + "", + "", + }, + {1, 1, 0, 0, 0}}; + auto col1_gold_ages_col = column_wrapper{{48, 351, -1, -1, -1}, {1, 1, 0, 0, 0}}; + + auto col1_gold_is_human_col = + column_wrapper{{true, false, false, false, false}, {1, 0, 0, 0, 0}}; + + auto col_gold_7 = cudf::test::structs_column_wrapper{ + {col1_gold_names_col, col1_gold_ages_col, col1_gold_is_human_col}, {1, 1, 0, 0, 0}}; + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + cols_gold.push_back(col_gold_6.release()); + cols_gold.push_back(col_gold_7.release()); + Table 
gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + TEST_F(JoinTest, LeftJoinOnNulls) { // clang-format off @@ -629,6 +720,91 @@ TEST_F(JoinTest, InnerJoinWithNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); } +TEST_F(JoinTest, InnerJoinWithStructsAndNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 2}}; + strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + std::initializer_list col0_names = { + "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"}; + auto col0_names_col = strcol_wrapper{col0_names.begin(), col0_names.end()}; + auto col0_ages_col = column_wrapper{{48, 27, 351, 31, 25}}; + + auto col0_is_human_col = column_wrapper{{true, true, false, false, false}, {1, 1, 0, 1, 0}}; + + auto col0_3 = + cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}}; + strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"}); + column_wrapper col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}}; + std::initializer_list col1_names = {"Carrot Ironfoundersson", + "Angua von Überwald", + "Detritus", + "Carrot Ironfoundersson", + "Samuel Vimes"}; + auto col1_names_col = strcol_wrapper{col1_names.begin(), col1_names.end()}; + auto col1_ages_col = column_wrapper{{351, 25, 27, 31, 48}}; + + auto col1_is_human_col = column_wrapper{{true, false, false, false, true}, {1, 0, 0, 1, 1}}; + + auto col1_3 = + cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols0.push_back(col0_3.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); 
+ cols1.push_back(col1_2.release()); + cols1.push_back(col1_3.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::inner_join(t0, t1, {0, 1, 3}, {0, 1, 3}); + auto result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{3, 2}}; + strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1}); + column_wrapper col_gold_2{{0, 1}}; + auto col_gold_3_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"}; + auto col_gold_3_ages_col = column_wrapper{{48, 25}}; + + auto col_gold_3_is_human_col = column_wrapper{{true, false}, {1, 0}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}}; + + column_wrapper col_gold_4{{3, 2}}; + strcol_wrapper col_gold_5({"s1", "s0"}, {1, 1}); + column_wrapper col_gold_6{{1, -1}, {1, 0}}; + auto col_gold_7_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"}; + auto col_gold_7_ages_col = column_wrapper{{48, 25}}; + + auto col_gold_7_is_human_col = column_wrapper{{true, false}, {1, 0}}; + + auto col_gold_7 = cudf::test::structs_column_wrapper{ + {col_gold_7_names_col, col_gold_7_ages_col, col_gold_7_is_human_col}}; + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + cols_gold.push_back(col_gold_6.release()); + cols_gold.push_back(col_gold_7.release()); + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + // // Test to check join behaviour when join keys are null. 
TEST_F(JoinTest, InnerJoinOnNulls) { @@ -1359,4 +1535,128 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls) CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*gold, cudf::table_view(result_decoded)); } +TEST_F(JoinTest, FullJoinWithStructsAndNulls) +{ + column_wrapper col0_0{{3, 1, 2, 0, 3}}; + strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"}); + column_wrapper col0_2{{0, 1, 2, 4, 1}}; + + std::initializer_list col0_names = {"Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald", + "Detritus", + "Carrot Ironfoundersson"}; + auto col0_names_col = strcol_wrapper{col0_names.begin(), col0_names.end()}; + auto col0_ages_col = column_wrapper{{48, 27, 25, 31, 351}}; + + auto col0_is_human_col = column_wrapper{{true, true, false, false, false}, {1, 1, 0, 1, 1}}; + + auto col0_3 = + cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}}; + + column_wrapper col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}}; + strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}}; + column_wrapper col1_2{{1, 0, 1, 2, 1}}; + + std::initializer_list col1_names = {"Carrot Ironfoundersson", + "Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald", + "Carrot Ironfoundersson"}; + auto col1_names_col = strcol_wrapper{col1_names.begin(), col1_names.end()}; + auto col1_ages_col = column_wrapper{{27, 48, 27, 25, 27}}; + + auto col1_is_human_col = column_wrapper{{true, true, true, false, true}, {1, 1, 1, 0, 1}}; + + auto col1_3 = + cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}}; + + CVector cols0, cols1; + cols0.push_back(col0_0.release()); + cols0.push_back(col0_1.release()); + cols0.push_back(col0_2.release()); + cols0.push_back(col0_3.release()); + cols1.push_back(col1_0.release()); + cols1.push_back(col1_1.release()); + cols1.push_back(col1_2.release()); + cols1.push_back(col1_3.release()); + + Table t0(std::move(cols0)); + Table t1(std::move(cols1)); + + auto result = cudf::full_join(t0, t1, {0, 1, 3}, {0, 1, 3}); + auto 
result_sort_order = cudf::sorted_order(result->view()); + auto sorted_result = cudf::gather(result->view(), *result_sort_order); + + column_wrapper col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1, -1}, + {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", "", ""}, + {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}); + column_wrapper col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1, -1}, + {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + auto gold_names0_col = strcol_wrapper{{"Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald", + "Detritus", + "Carrot Ironfoundersson", + "", + "", + "", + "", + ""}, + {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + auto gold_ages0_col = column_wrapper{{48, 27, 25, 31, 351, -1, -1, -1, -1, -1}, + {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + + auto gold_is_human0_col = + column_wrapper{{true, true, false, false, false, false, false, false, false, false}, + {1, 1, 0, 1, 1, 0, 0, 0, 0, 0}}; + + auto col_gold_3 = cudf::test::structs_column_wrapper{ + {gold_names0_col, gold_ages0_col, gold_is_human0_col}, {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}}; + + column_wrapper col_gold_4{{-1, -1, -1, -1, -1, 3, 2, 2, 0, 4}, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 0}}; + strcol_wrapper col_gold_5({"", "", "", "", "", "s1", "s1", "s0", "s1", "s2"}, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}); + column_wrapper col_gold_6{{-1, -1, -1, -1, -1, 1, 1, 0, 1, 2}, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}; + auto gold_names1_col = strcol_wrapper{{"", + "", + "", + "", + "", + "Carrot Ironfoundersson", + "Carrot Ironfoundersson", + "Samuel Vimes", + "Carrot Ironfoundersson", + "Angua von Überwald"}, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}; + auto gold_ages1_col = column_wrapper{{-1, -1, -1, -1, -1, 27, 27, 48, 27, 25}, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}; + + auto gold_is_human1_col = + column_wrapper{{false, false, false, false, false, true, true, true, true, false}, + {0, 0, 0, 0, 0, 1, 1, 1, 1, 0}}; + + auto col_gold_7 = cudf::test::structs_column_wrapper{ + {gold_names1_col, gold_ages1_col, 
gold_is_human1_col}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}}; + + CVector cols_gold; + cols_gold.push_back(col_gold_0.release()); + cols_gold.push_back(col_gold_1.release()); + cols_gold.push_back(col_gold_2.release()); + cols_gold.push_back(col_gold_3.release()); + cols_gold.push_back(col_gold_4.release()); + cols_gold.push_back(col_gold_5.release()); + cols_gold.push_back(col_gold_6.release()); + cols_gold.push_back(col_gold_7.release()); + + Table gold(std::move(cols_gold)); + + auto gold_sort_order = cudf::sorted_order(gold.view()); + auto sorted_gold = cudf::gather(gold.view(), *gold_sort_order); + CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*sorted_gold, *sorted_result); +} + CUDF_TEST_PROGRAM_MAIN()