Adopt matx v0.3.0 (nv-morpheus#667)
* Remove usage of `tensorShape_t`, which was deprecated and later removed.
* Replace usage of the tensor constructor with the recommended `make_tensor` helper method (see the sketch below).
* Add more C++ unit tests.
* Mark `RMMTensor` as a public symbol so the C++ tests can use it.
* Add the `cuda-nvtx` package to the CI driver build, needed for matx v0.3.0.

Includes changes from PR nv-morpheus#688
Fixes nv-morpheus#317
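
For readers unfamiliar with the matx v0.3.0 API change, here is a minimal before/after sketch of the constructor-to-`make_tensor` migration. It assumes a non-owning 1-D view over existing device memory; the function and variable names are illustrative, not taken from this diff:

```cpp
#include <matx.h>

#include <array>

// Hypothetical example; `data` points to device memory holding `count` floats.
void scale_in_place(float* data, matx::index_t count, cudaStream_t stream)
{
    // matx 0.2.x (deprecated, then removed in 0.3.0):
    //   matx::tensorShape_t<1> shape({count});
    //   matx::tensor_t<float, 1> tensor(data, shape);

    // matx 0.3.0: describe the shape with a plain std::array and build the
    // tensor through the make_tensor helper (a non-owning view here).
    std::array<matx::index_t, 1> shape({count});
    auto tensor = matx::make_tensor<float>(data, shape);

    // Expressions work as before; run() launches the work on the given stream.
    (tensor = tensor * 2.0f).run(stream);
}
```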

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: nv-morpheus#667
dagardner-nv authored and jjacobelli committed Mar 7, 2023
1 parent de4c8af commit 1e21674
Showing 10 changed files with 481 additions and 233 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pull_request.yml
@@ -31,8 +31,8 @@ jobs:
uses: ./.github/workflows/ci_pipe.yml
with:
run_check: ${{ startsWith(github.ref_name, 'pull-request/') }}
-container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-driver-230213
-test_container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-230213
+container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-driver-230214
+test_container: nvcr.io/ea-nvidia-morpheus/morpheus:morpheus-ci-test-230214
secrets:
GHA_AWS_ACCESS_KEY_ID: ${{ secrets.GHA_AWS_ACCESS_KEY_ID }}
GHA_AWS_SECRET_ACCESS_KEY: ${{ secrets.GHA_AWS_SECRET_ACCESS_KEY }}
1 change: 1 addition & 0 deletions ci/runner/Dockerfile
@@ -58,6 +58,7 @@ ARG CUDA_PKG_VER
RUN apt update && \
DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC \
apt install --no-install-recommends -y \
+cuda-nvtx-${CUDA_PKG_VER} \
libcublas-dev-${CUDA_PKG_VER} \
libcufft-dev-${CUDA_PKG_VER} \
libcurand-dev-${CUDA_PKG_VER} \
3 changes: 3 additions & 0 deletions morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp
@@ -28,6 +28,7 @@
#include <vector>

namespace morpheus {
+#pragma GCC visibility push(default)
/****** Component public implementations *******************/
/****** RMMTensor****************************************/

@@ -164,5 +165,7 @@ class RMMTensor : public ITensor
std::vector<TensorIndex> m_shape;
std::vector<TensorIndex> m_stride;
};

+#pragma GCC visibility pop
/** @} */ // end of group
} // namespace morpheus
2 changes: 1 addition & 1 deletion morpheus/_lib/include/morpheus/utilities/matx_util.hpp
@@ -102,7 +102,7 @@ struct MatxUtil
static std::shared_ptr<rmm::device_buffer> reduce_max(const DevMemInfo& input,
const std::vector<int32_t>& seq_ids,
size_t seq_id_offset,
-const std::vector<int64_t>& output_shape);
+const std::vector<std::size_t>& output_shape);
};
/** @} */ // end of group
} // namespace morpheus
2 changes: 1 addition & 1 deletion morpheus/_lib/src/messages/multi_tensor.cpp
@@ -83,7 +83,7 @@ void MultiTensorMessage::get_slice_impl(std::shared_ptr<MultiMessage> new_messag
sliced_message->offset = start;
sliced_message->count = stop - start;

-// If we have more inference rows than message rows, we need to use the seq_ids to figure out the slicing. This
+// If we have more tensor rows than message rows, we need to use the seq_ids to figure out the slicing. This
// will be slow and should be avoided at all costs
if (this->count != this->mess_count && this->memory->has_tensor("seq_ids"))
{
374 changes: 221 additions & 153 deletions morpheus/_lib/src/stages/triton_inference.cpp

Large diffs are not rendered by default.

157 changes: 86 additions & 71 deletions morpheus/_lib/src/utilities/matx_util.cu
@@ -27,10 +27,14 @@
#include <matx.h>
#include <mrc/cuda/sync.hpp>

+#include <array>
#include <memory>

namespace morpheus {

+using tensorShape_1d = std::array<matx::index_t, 1>;
+using tensorShape_2d = std::array<matx::index_t, 2>;

// Component-private classes.
// ************ MatxUtil__MatxCast**************//
/**
@@ -57,10 +61,10 @@ namespace morpheus {
typename OutputT,
std::enable_if_t<cudf::is_numeric<InputT>() && cudf::is_numeric<OutputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
-matx::tensorShape_t<1> shape({static_cast<matx::index_t>(element_count)});
+tensorShape_1d shape({static_cast<matx::index_t>(element_count)});

-matx::tensor_t<InputT, 1> input_tensor(static_cast<InputT *>(input_data), shape);
-matx::tensor_t<OutputT, 1> output_tensor(static_cast<OutputT *>(output_data), shape);
+auto input_tensor = matx::make_tensor<InputT>(static_cast<InputT *>(input_data), shape);
+auto output_tensor = matx::make_tensor<OutputT>(static_cast<OutputT *>(output_data), shape);

(output_tensor = input_tensor).run(stream.value());
}
@@ -88,14 +92,14 @@ namespace morpheus {
*/
template<typename OutputT, std::enable_if_t<std::is_integral_v<OutputT>> * = nullptr>
void operator()(void *output_data) {
-matx::tensorShape_t<2> shape({static_cast<matx::index_t>(element_count), 3});
+auto matx_count = static_cast<matx::index_t>(element_count);
+tensorShape_2d shape({matx_count, 3});

-matx::tensor_t<OutputT, 2> output_tensor(static_cast<OutputT *>(output_data), shape);
+auto output_tensor = matx::make_tensor<OutputT>(static_cast<OutputT *>(output_data), shape);

auto col0 = output_tensor.template Slice<1>({0, 0}, {matx::matxEnd, matx::matxDropDim});
auto col2 = output_tensor.template Slice<1>({0, 2}, {matx::matxEnd, matx::matxDropDim});
-auto range_col =
-    matx::range_x<OutputT>(matx::tensorShape_t<1>({static_cast<matx::index_t>(element_count)}), 0, 1);
+auto range_col = matx::range<0, tensorShape_1d, OutputT>({matx_count}, 0, 1);

(col0 = range_col).run(stream.value());
(col2 = fea_len - 1).run(stream.value());
@@ -123,11 +127,11 @@ namespace morpheus {
*/
template<typename InputT, std::enable_if_t<cudf::is_floating_point<InputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
-matx::tensorShape_t<1> shape({static_cast<matx::index_t>(element_count)});
+tensorShape_1d shape({static_cast<matx::index_t>(element_count)});

-matx::tensor_t<InputT, 1> input_tensor(static_cast<InputT *>(input_data), shape);
+auto input_tensor = matx::make_tensor<InputT>(static_cast<InputT *>(input_data), shape);

-matx::tensor_t<InputT, 1> output_tensor(static_cast<InputT *>(output_data), shape);
+auto output_tensor = matx::make_tensor<InputT>(static_cast<InputT *>(output_data), shape);

(output_tensor = (InputT) 1 / ((InputT) 1 + matx::exp((InputT) -1 * input_tensor))).run(stream.value());
}
@@ -156,11 +160,11 @@ namespace morpheus {
*/
template<typename InputT, std::enable_if_t<cudf::is_numeric<InputT>()> * = nullptr>
void operator()(void *input_data, void *output_data) {
-matx::tensorShape_t<2> input_shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});
-matx::tensorShape_t<2> output_shape({static_cast<matx::index_t>(cols), static_cast<matx::index_t>(rows)});
+tensorShape_2d input_shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});
+tensorShape_2d output_shape({static_cast<matx::index_t>(cols), static_cast<matx::index_t>(rows)});

-matx::tensor_t<InputT, 2> input_tensor(static_cast<InputT *>(input_data), input_shape);
-matx::tensor_t<InputT, 2> output_tensor(static_cast<InputT *>(output_data), output_shape);
+auto input_tensor = matx::make_tensor<InputT>(static_cast<InputT *>(input_data), input_shape);
+auto output_tensor = matx::make_tensor<InputT>(static_cast<InputT *>(output_data), output_shape);

(output_tensor = input_tensor.Permute({1, 0})).run(stream.value());
}
@@ -205,24 +209,22 @@ namespace morpheus {
template<typename InputT>
void threshold_by_row(void *input_data, void *output_data, double threshold,
const std::vector<std::size_t>& stride) {
-matx::tensorShape_t<2> input_shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});

// Output is always 1 column
-matx::tensorShape_t<1> output_shape({static_cast<matx::index_t>(rows)});
+tensorShape_1d output_shape({static_cast<matx::index_t>(rows)});

+matx::DefaultDescriptor<2> desc{{static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)},
+                                {static_cast<matx::index_t>(stride[0]), static_cast<matx::index_t>(stride[1])}};

// Specify the stride here since the data comes in column major order.
-matx::tensor_t<InputT, 2> input_tensor(static_cast<InputT *>(input_data),
-                                       input_shape,
-                                       {static_cast<matx::index_t>(stride[0]),
-                                        static_cast<matx::index_t>(stride[1])});
+auto input_tensor =
+    matx::make_tensor<InputT, matx::DefaultDescriptor<2>>(static_cast<InputT*>(input_data), std::move(desc));

// Tmp array to hold max value
-matx::tensor_t<InputT, 1> max_tensor(output_shape);
+auto max_tensor = matx::make_tensor<InputT>(output_shape);

// row-wise reduction
matx::rmax(max_tensor, input_tensor, stream.value());

-matx::tensor_t<bool, 1> output_tensor(static_cast<bool *>(output_data), output_shape);
+auto output_tensor = matx::make_tensor<bool>(static_cast<bool *>(output_data), output_shape);

// Convert max value to bool
(output_tensor = max_tensor > (InputT) threshold).run(stream.value());
@@ -234,13 +236,16 @@
template<typename InputT>
void
threshold(void *input_data, void *output_data, double threshold, const std::vector<std::size_t>& stride) {
-matx::tensorShape_t<2> shape({static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)});
+matx::DefaultDescriptor<2> input_desc{{static_cast<matx::index_t>(rows), static_cast<matx::index_t>(cols)},
+                                      {static_cast<matx::index_t>(stride[0]), static_cast<matx::index_t>(stride[1])}};

-matx::index_t matx_stride[2] = {static_cast<matx::index_t>(stride[0]),
-                                static_cast<matx::index_t>(stride[1])};
+// Input & Output have the same shape & stride. The make_tensor API requires a move for the descriptor
+// so we need to take a copy of it here.
+matx::DefaultDescriptor<2> output_desc = input_desc;

-matx::tensor_t<InputT, 2> input_tensor(static_cast<InputT *>(input_data), shape, matx_stride);
-matx::tensor_t<bool, 2> output_tensor(static_cast<bool *>(output_data), shape, matx_stride);
+auto input_tensor = matx::make_tensor<InputT>(static_cast<InputT *>(input_data), std::move(input_desc));
+auto output_tensor = matx::make_tensor<bool>(static_cast<bool *>(output_data), std::move(output_desc));

// Convert max value to bool
(output_tensor = input_tensor > (InputT) threshold).run(stream.value());
@@ -249,40 +254,68 @@

struct MatxUtil__MatxReduceMax {
matx::index_t num_input_rows;
-matx::index_t num_output_rows;
matx::index_t num_cols;
std::vector<matx::index_t> input_stride;
+matx::index_t num_output_rows;
-void *input_data;
-void *output_data;
+const std::vector<int32_t> &seq_ids;
+size_t seq_id_offset;
rmm::cuda_stream_view stream;

template<typename InputT, std::enable_if_t<!cudf::is_floating_point<InputT>()> * = nullptr>
-void operator()(std::size_t start, std::size_t stop, int32_t output_idx) {
+void operator()(void *input_data, void *output_data) {
throw std::invalid_argument("Unsupported conversion");
}

template<typename InputT, std::enable_if_t<cudf::is_floating_point<InputT>()> * = nullptr>
-void operator()(std::size_t start, std::size_t stop, int32_t output_idx) {
-    auto input_count = stop - start;
-    matx::tensorShape_t<2> input_shape({static_cast<matx::index_t>(input_count), num_cols});
-    matx::tensorShape_t<1> output_shape({num_cols});
+void operator()(void *input_data, void *output_data)
+{
+    auto input_ptr = static_cast<InputT *>(input_data);
+    matx::DefaultDescriptor<2> input_desc{{num_input_rows, num_cols}, {input_stride[0], input_stride[1]}};
+    auto input_tensor = matx::make_tensor<InputT, matx::DefaultDescriptor<2>>(input_ptr, std::move(input_desc));

+auto output_ptr = static_cast<InputT *>(output_data);

matx::index_t output_stride[2] = {input_stride[0], input_stride[1]};
if (output_stride[0] == 1)
{
output_stride[1] = num_output_rows;
}

-auto input_ptr = static_cast<InputT *>(input_data) + (start * input_stride[0]);
-auto output_ptr = static_cast<InputT *>(output_data) + (output_idx * output_stride[0]);
+matx::DefaultDescriptor<2> output_desc{{num_output_rows, num_cols}, output_stride};
+auto output_tensor = matx::make_tensor<InputT, matx::DefaultDescriptor<2>>(output_ptr, std::move(output_desc));

+matx::index_t start = 0;
+auto output_offset = static_cast<matx::index_t>(seq_ids[seq_id_offset]);
+for (matx::index_t i=1; i < num_input_rows; ++i)
+{
+    auto idx = seq_ids[i+seq_id_offset];
+    if (idx != seq_ids[start+seq_id_offset])
+    {
+        DCHECK(seq_ids[start+seq_id_offset]-output_offset < num_output_rows);
+        reduce_rows(input_tensor, output_tensor, start, i, static_cast<matx::index_t>(seq_ids[start+seq_id_offset])-output_offset);
+        start = i;
+    }
+}
+
+DCHECK(seq_ids[start+seq_id_offset]-output_offset < num_output_rows)
+    << "\nstart=" << start << " seq_ids[start+seq_id_offset]-output_offset="
+    << seq_ids[start+seq_id_offset]-output_offset << " num_output_rows=" << num_output_rows;
+reduce_rows(input_tensor, output_tensor, start, num_input_rows, static_cast<matx::index_t>(seq_ids[start+seq_id_offset])-output_offset);
}

+template<typename InputT>
+void reduce_rows(matx::tensor_t<InputT, 2>& input_tensor,
+                 matx::tensor_t<InputT, 2>& output_tensor,
+                 matx::index_t start,
+                 matx::index_t stop,
+                 matx::index_t output_idx)
+{
+    auto input_slice = input_tensor.Slice({start, 0}, {stop, matx::matxEnd});
+    auto tmp_tensor = matx::make_tensor<InputT>({num_cols});

-matx::tensor_t<InputT, 2> input_tensor(input_ptr, input_shape, {input_stride[0], input_stride[1]});
-matx::tensor_t<InputT, 1> output_tensor(output_ptr, output_shape, {output_stride[1]});
+matx::rmax(tmp_tensor, input_slice.Permute({1, 0}), stream.value());

// We need to transpose the input such that rmax will reduce the rows
// Matx performs reductions over the innermost dimensions.
// see https://nvidia.github.io/MatX/api/reduce.html
-matx::rmax(output_tensor, input_tensor.Permute({1, 0}), stream.value());
+auto output_slice = output_tensor.template Slice<1>({output_idx, 0}, {matx::matxDropDim, matx::matxEnd});
+(output_slice = tmp_tensor).run(stream.value());
}
};

@@ -374,14 +407,16 @@ namespace morpheus {
MatxUtil::reduce_max(const DevMemInfo &input,
const std::vector<int32_t> &seq_ids,
size_t seq_id_offset,
-const std::vector<int64_t> &output_shape)
+const std::vector<std::size_t> &output_shape)
{
const auto& dtype = input.dtype();
auto cudf_type = cudf::data_type{dtype.cudf_type_id()};
auto num_input_rows = input.shape(0);
auto num_input_cols = input.shape(1);

-std::vector<matx::index_t> matx_stride{static_cast<matx::index_t>(input.stride(0)), static_cast<matx::index_t>(input.stride(1))};
+std::vector<matx::index_t> matx_stride{static_cast<matx::index_t>(input.stride(0)),
+                                       static_cast<matx::index_t>(input.stride(1))};

std::size_t output_element_count = output_shape[0] * output_shape[1];
std::size_t output_buff_size = dtype.item_size() * output_element_count;

Expand All @@ -391,34 +426,14 @@ namespace morpheus {
auto output = input.make_new_buffer(output_buff_size);

MatxUtil__MatxReduceMax matx_reduce_max{static_cast<matx::index_t>(num_input_rows),
-static_cast<matx::index_t>(output_shape[0]),
static_cast<matx::index_t>(num_input_cols),
matx_stride,
+output_shape[0],
-input.data(),
-output->data(),
+seq_ids,
+seq_id_offset,
output->stream()};

-std::size_t start = 0;
-auto output_offset = seq_ids[seq_id_offset];
-for (std::size_t i=0; i < num_input_rows; ++i)
-{
-    auto idx = seq_ids[i+seq_id_offset];
-    if (idx != seq_ids[start+seq_id_offset])
-    {
-        cudf::type_dispatcher(cudf_type,
-                              matx_reduce_max,
-                              start,
-                              i,
-                              seq_ids[start+seq_id_offset]-output_offset);
-        start = i;
-    }
-}
-
-cudf::type_dispatcher(cudf_type,
-                      matx_reduce_max,
-                      start,
-                      num_input_rows,
-                      seq_ids[start+seq_id_offset]-output_offset);
+cudf::type_dispatcher(cudf_type, matx_reduce_max, input.data(), output->data());

mrc::enqueue_stream_sync_event(output->stream()).get();
return output;
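
For context, the reworked `reduce_max` now performs the `seq_ids` segmentation inside the functor: consecutive rows that share a seq_id are reduced to a single output row via `matx::rmax` on a slice. A simplified host-side sketch of that segmentation logic (illustrative types and names; the column-wise max is done on the CPU here rather than by matx, and the `seq_id_offset` indirection is omitted):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the segmentation in MatxUtil__MatxReduceMax: consecutive rows
// sharing a seq_id collapse into one output row holding the column-wise max.
std::vector<std::vector<float>> reduce_max_by_seq_id(
    const std::vector<std::vector<float>>& rows,
    const std::vector<std::int32_t>& seq_ids)
{
    std::vector<std::vector<float>> out;
    std::size_t start = 0;
    for (std::size_t i = 1; i <= rows.size(); ++i)
    {
        // Close the current segment at the end of input or when the seq_id changes.
        if (i == rows.size() || seq_ids[i] != seq_ids[start])
        {
            std::vector<float> acc = rows[start];
            for (std::size_t r = start + 1; r < i; ++r)
            {
                for (std::size_t c = 0; c < acc.size(); ++c)
                {
                    acc[c] = std::max(acc[c], rows[r][c]);
                }
            }
            out.push_back(std::move(acc));
            start = i;
        }
    }
    return out;
}
```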
