Skip to content

Commit

Permalink
Implement maps_column_view abstraction over LIST<STRUCT<K,V>> (#1…
Browse files Browse the repository at this point in the history
…0380)

Fixes #9109.

This commit adds a `map` abstraction over a `column_view` of type `LIST<STRUCT<K,V>>`, where `K` and `V` are key and value types. A list column of structs with two members may thus be viewed as a `map` column. 

`maps_column_view` is to a `LIST<STRUCT<K,V>>` column what `lists_column_view` is to a `LIST` column.

The `maps_column_view` abstraction provides methods to fetch lists of keys and values (as `LIST<K>` and `LIST<V>` respectively). It also provides map lookup methods to find the values corresponding to a specified key, for each row in the "map" column.

E.g.
```c++
auto input_column = get_list_of_structs_col();
// input_column == [ {1:10, 2:20}, {1:100, 3:300}, {2:2000, 3:3000, 4:4000} ];

auto maps_view = cudf::jni::maps_column_view{input_column->view()};
auto keys = maps_view.keys();     // keys   == [ {1,2},   {1,3},      {2,3,4} ];
auto values = maps_view.values(); // values == [ {10,20}, {100, 300}, {2000, 3000, 4000} ];

auto lookup_1 = maps_view.get_values_for( *make_numeric_scalar(1) );
// lookup_1 == [ 10, 100, null ];  (one value per input row; null where key 1 is absent)
```

This abstraction should help replace the Java/JNI `map_lookup` and `map_contains` kernels, which only handle `MAP<STRING, STRING>`.

Authors:
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Nghia Truong (https://github.com/ttnghia)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: #10380
  • Loading branch information
mythrocks committed Mar 14, 2022
1 parent a6fe301 commit 228cc79
Show file tree
Hide file tree
Showing 11 changed files with 495 additions and 66 deletions.
2 changes: 2 additions & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,9 @@ test:
- test -f $PREFIX/include/cudf/labeling/label_bins.hpp
- test -f $PREFIX/include/cudf/lists/detail/combine.hpp
- test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/lists/detail/contains.hpp
- test -f $PREFIX/include/cudf/lists/detail/copying.hpp
- test -f $PREFIX/include/cudf/lists/detail/extract.hpp
- test -f $PREFIX/include/cudf/lists/lists_column_factories.hpp
- test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp
- test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp
Expand Down
78 changes: 78 additions & 0 deletions cpp/include/cudf/lists/detail/contains.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/lists/contains.hpp>
#include <cudf/lists/lists_column_view.hpp>

namespace cudf {
namespace lists {
namespace detail {

// Detail-level declarations of `index_of` and `contains` for list columns.
// Each overload mirrors the public `cudf::lists` API named in its @copydoc, with one
// extra parameter: an explicit `stream`. Note that `stream` has no default argument and
// must always be supplied; only `mr` defaults (to the current device resource).

/**
* @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
* cudf::scalar const&,
* duplicate_find_option,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @note Overload taking a single scalar `search_key`.
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
cudf::lists::duplicate_find_option find_option,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
* cudf::column_view const&,
* duplicate_find_option,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @note Overload taking a column of `search_keys`. The implementation requires
* `search_keys.size() == lists.size()`.
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
cudf::lists::duplicate_find_option find_option,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::contains(cudf::lists_column_view const&,
* cudf::scalar const&,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @note Overload taking a single scalar `search_key`.
*/
std::unique_ptr<column> contains(
cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::contains(cudf::lists_column_view const&,
* cudf::column_view const&,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @note Overload taking a column of `search_keys`.
*/
std::unique_ptr<column> contains(
cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
} // namespace detail
} // namespace lists
} // namespace cudf
49 changes: 49 additions & 0 deletions cpp/include/cudf/lists/detail/extract.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/lists/extract.hpp>
#include <cudf/lists/lists_column_view.hpp>

namespace cudf {
namespace lists {
namespace detail {

// Detail-level declarations of `extract_list_element`.
// Both overloads mirror the public `cudf::lists` API named in their @copydoc, with one
// extra parameter: an explicit `stream`. `stream` has no default argument and must always
// be supplied; only `mr` defaults (to the current device resource).

/**
* @copydoc cudf::lists::extract_list_element(lists_column_view, size_type,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @note Overload taking a single `index` value.
*/
std::unique_ptr<column> extract_list_element(
lists_column_view lists_column,
size_type const index,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::extract_list_element(lists_column_view, column_view const&,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @note Overload taking a column of `indices`.
*/
std::unique_ptr<column> extract_list_element(
lists_column_view lists_column,
column_view const& indices,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace detail
} // namespace lists
} // namespace cudf
61 changes: 30 additions & 31 deletions cpp/src/lists/contains.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/valid_if.cuh>
#include <cudf/lists/contains.hpp>
#include <cudf/lists/detail/contains.hpp>
#include <cudf/lists/list_device_view.cuh>
#include <cudf/lists/lists_column_device_view.cuh>
#include <cudf/lists/lists_column_view.hpp>
Expand Down Expand Up @@ -251,18 +252,17 @@ std::unique_ptr<column> to_contains(std::unique_ptr<column>&& key_positions,

namespace detail {
/**
* @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
* cudf::scalar const&,
* duplicate_find_option,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @copydoc cudf::lists::detail::index_of(cudf::lists_column_view const&,
* cudf::scalar const&,
* duplicate_find_option,
* rmm::cuda_stream_view,
* rmm::mr::device_memory_resource*)
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
duplicate_find_option find_option,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
std::unique_ptr<column> index_of(cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
duplicate_find_option find_option,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
return search_key.is_valid(stream)
? cudf::type_dispatcher(search_key.type(),
Expand All @@ -282,18 +282,17 @@ std::unique_ptr<column> index_of(
}

/**
* @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
* cudf::column_view const&,
* duplicate_find_option,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @copydoc cudf::lists::detail::index_of(cudf::lists_column_view const&,
* cudf::column_view const&,
* duplicate_find_option,
* rmm::cuda_stream_view,
* rmm::mr::device_memory_resource*)
*/
std::unique_ptr<column> index_of(
cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
duplicate_find_option find_option,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
std::unique_ptr<column> index_of(cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
duplicate_find_option find_option,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_EXPECTS(search_keys.size() == lists.size(),
"Number of search keys must match list column size.");
Expand All @@ -316,10 +315,10 @@ std::unique_ptr<column> index_of(
}

/**
* @copydoc cudf::lists::contains(cudf::lists_column_view const&,
* cudf::scalar const&,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @copydoc cudf::lists::detail::contains(cudf::lists_column_view const&,
* cudf::scalar const&,
* rmm::cuda_stream_view,
* rmm::mr::device_memory_resource*)
*/
std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
cudf::scalar const& search_key,
Expand All @@ -331,10 +330,10 @@ std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
}

/**
* @copydoc cudf::lists::contains(cudf::lists_column_view const&,
* cudf::column_view const&,
* rmm::mr::device_memory_resource*)
* @param stream CUDA stream used for device memory operations and kernel launches.
* @copydoc cudf::lists::detail::contains(cudf::lists_column_view const&,
* cudf::column_view const&,
* rmm::cuda_stream_view,
* rmm::mr::device_memory_resource*)
*/
std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
cudf::column_view const& search_keys,
Expand Down
35 changes: 28 additions & 7 deletions cpp/src/lists/extract.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@
#include <cudf/copying.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/sequence.hpp>
#include <cudf/lists/detail/extract.hpp>
#include <cudf/lists/detail/gather.cuh>
#include <cudf/lists/extract.hpp>
#include <cudf/scalar/scalar_factories.hpp>
Expand Down Expand Up @@ -107,10 +108,10 @@ std::unique_ptr<cudf::column> make_index_offsets(size_type num_lists, rmm::cuda_
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
template <typename index_t>
std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
index_t const& index,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
std::unique_ptr<column> extract_list_element_impl(lists_column_view lists_column,
index_t const& index,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
auto const num_lists = lists_column.size();
if (num_lists == 0) { return empty_like(lists_column.child()); }
Expand All @@ -135,6 +136,26 @@ std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
return std::move(extracted_lists->release().children[lists_column_view::child_column_index]);
}

/**
 * @brief Scalar-index overload of the detail-level `extract_list_element`.
 *
 * Forwards its arguments unchanged to `extract_list_element_impl`.
 *
 * @copydoc cudf::lists::extract_list_element(lists_column_view, size_type,
 *                                            rmm::mr::device_memory_resource*)
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                             size_type const index,
                                             rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
{
  // Spell out the index type the template would otherwise deduce from `index`.
  auto extracted = extract_list_element_impl<size_type>(lists_column, index, stream, mr);
  return extracted;
}

/**
 * @brief Index-column overload of the detail-level `extract_list_element`.
 *
 * Validates that `indices` has one entry per list row, then forwards to
 * `extract_list_element_impl`.
 *
 * @copydoc cudf::lists::extract_list_element(lists_column_view, column_view const&,
 *                                            rmm::mr::device_memory_resource*)
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @throws cudf::logic_error if `indices.size() != lists_column.size()`.
 */
std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                             column_view const& indices,
                                             rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
{
  // This overload is declared in a header, so it can be called without going through the
  // public wrapper. Enforce the same size precondition here so that such callers do not
  // bypass the check.
  CUDF_EXPECTS(indices.size() == lists_column.size(),
               "Index column must have as many elements as lists column.");
  return detail::extract_list_element_impl(lists_column, indices, stream, mr);
}

} // namespace detail

/**
Expand All @@ -146,7 +167,7 @@ std::unique_ptr<column> extract_list_element(lists_column_view const& lists_colu
size_type index,
rmm::mr::device_memory_resource* mr)
{
return detail::extract_list_element(lists_column, index, rmm::cuda_stream_default, mr);
return detail::extract_list_element_impl(lists_column, index, rmm::cuda_stream_default, mr);
}

/**
Expand All @@ -160,7 +181,7 @@ std::unique_ptr<column> extract_list_element(lists_column_view const& lists_colu
{
CUDF_EXPECTS(indices.size() == lists_column.size(),
"Index column must have as many elements as lists column.");
return detail::extract_list_element(lists_column, indices, rmm::cuda_stream_default, mr);
return detail::extract_list_element_impl(lists_column, indices, rmm::cuda_stream_default, mr);
}

} // namespace lists
Expand Down
27 changes: 16 additions & 11 deletions java/src/main/java/ai/rapids/cudf/ColumnView.java
Original file line number Diff line number Diff line change
Expand Up @@ -3244,17 +3244,23 @@ public final ColumnVector urlEncode() throws CudfException {
return new ColumnVector(urlEncode(getNativeView()));
}

/** For a column of type List<Struct<String, String>> and a passed in String key, return a string column
* for all the values in the struct that match the key, null otherwise.
* @param key the String scalar to lookup in the column
* @return a string column of values or nulls based on the lookup result
/**
 * Asserts that the given type can be used as a map lookup key.
 * Keys of type EMPTY, LIST and STRUCT are rejected.
 * The check is an `assert`, so it is only evaluated when assertions are enabled.
 * @param keyType the type of the scalar used as the lookup key
 */
private static void assertIsSupportedMapKeyType(DType keyType) {
  boolean isSupportedKeyType =
      !keyType.equals(DType.EMPTY) && !keyType.equals(DType.LIST) && !keyType.equals(DType.STRUCT);
  // Name every rejected type (the old message omitted EMPTY) and report the offending one.
  assert isSupportedKeyType :
      "Map lookup by EMPTY, STRUCT and LIST keys is not supported. Key type: " + keyType;
}

/**
* Given a column of type List<Struct<X, Y>> and a key of type X, return a column of type Y,
* where each row in the output column is the Y value corresponding to the X key.
* If the key is not found, the corresponding output value is null.
* @param key the scalar key to lookup in the column
* @return a column of values or nulls based on the lookup result
*/
public final ColumnVector getMapValue(Scalar key) {

assert type.equals(DType.LIST) : "column type must be a LIST";
assert key != null : "target string may not be null";
assert key.getType().equals(DType.STRING) : "target string must be a string scalar";

assert key != null : "Lookup key may not be null";
assertIsSupportedMapKeyType(key.getType());
return new ColumnVector(mapLookup(getNativeView(), key.getScalarHandle()));
}

Expand All @@ -3266,9 +3272,8 @@ public final ColumnVector getMapValue(Scalar key) {
*/
public final ColumnVector getMapKeyExistence(Scalar key) {
assert type.equals(DType.LIST) : "column type must be a LIST";
assert key != null : "target string may not be null";
assert key.getType().equals(DType.STRING) : "target must be a string scalar";

assert key != null : "Lookup key may not be null";
assertIsSupportedMapKeyType(key.getType());
return new ColumnVector(mapContains(getNativeView(), key.getScalarHandle()));
}

Expand Down
1 change: 1 addition & 0 deletions java/src/main/native/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ add_library(
src/TableJni.cpp
src/aggregation128_utils.cu
src/map_lookup.cu
src/maps_column_view.cu
src/row_conversion.cu
src/check_nvcomp_output_sizes.cu
)
Expand Down
Loading

0 comments on commit 228cc79

Please sign in to comment.