From f767f8311da32b03f821a2ea9449b0967b14e494 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 27 Jun 2023 15:33:44 -0700 Subject: [PATCH 01/14] Repro for bug in issue 1004 --- tests/test_tensor_memory.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_tensor_memory.py b/tests/test_tensor_memory.py index ac105df457..e19abbee98 100644 --- a/tests/test_tensor_memory.py +++ b/tests/test_tensor_memory.py @@ -16,11 +16,13 @@ import os import string +import typing import cupy as cp import numpy as np import pytest +from morpheus.config import Config from morpheus.messages.memory.inference_memory import InferenceMemory from morpheus.messages.memory.inference_memory import InferenceMemoryAE from morpheus.messages.memory.inference_memory import InferenceMemoryFIL @@ -194,3 +196,21 @@ def test_set_tensors_length_error(config, tensor_cls): with pytest.raises(ValueError): m.set_tensors(tensors) + + +@pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) +@pytest.mark.parametrize( + "shape", + [ + (536870912, 1), # bytesize > 2**31 + (536870912, 4) # bytesize > 2**31 and element count > 2**31 + ]) +def test_tensorindex_bug(config: Config, tensor_cls: TensorMemory, shape: typing.Tuple[int, int]): + """ + Test for issue #1004. We use a 32bit signed integer for shape and strides, but we shouldn't for element counts and + byte sizes. + """ + tensors = {"a": cp.zeros(shape, dtype=np.float32)} + + mem = tensor_cls(count=shape[0], tensors=tensors) + assert mem.get_tensor('a').shape == shape
From 42693fedfd253c4596c5195937e71d5c3b519d2d Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 27 Jun 2023 15:42:02 -0700 Subject: [PATCH 02/14] update --- tests/test_tensor_memory.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_tensor_memory.py b/tests/test_tensor_memory.py index e19abbee98..0c64faf28d 100644 --- a/tests/test_tensor_memory.py +++ b/tests/test_tensor_memory.py @@ -213,4 +213,6 @@ def test_tensorindex_bug(config: Config, tensor_cls: TensorMemory, shape: typing tensors = {"a": cp.zeros(shape, dtype=np.float32)} mem = tensor_cls(count=shape[0], tensors=tensors) - assert mem.get_tensor('a').shape == shape + tensor_a = mem.get_tensor('a') + assert tensor_a.shape == shape + assert tensor_a.nbytes == shape[0] * shape[1] * 4
From 151f95402bda86a5b151024ddd68772569b9cc95 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 27 Jun 2023 15:50:08 -0700 Subject: [PATCH 03/14] Don't try and create an 8GB array in a unittest --- morpheus/_lib/src/utilities/cupy_util.cpp | 7 +++++-- tests/test_tensor_memory.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/morpheus/_lib/src/utilities/cupy_util.cpp b/morpheus/_lib/src/utilities/cupy_util.cpp index 93459b5052..9838b4f87d 100644 --- a/morpheus/_lib/src/utilities/cupy_util.cpp +++ b/morpheus/_lib/src/utilities/cupy_util.cpp @@ -80,8 +80,11 @@ pybind11::object CupyUtil::tensor_to_cupy(const TensorObject& tensor) auto ptr = (uintptr_t)tensor.data(); auto nbytes = tensor.bytes(); - auto owner = py_tensor; - int dev_id = -1; + + DCHECK(nbytes > 0); + + auto owner = py_tensor; + int dev_id = -1; pybind11::list shape_list; pybind11::list stride_list; diff --git a/tests/test_tensor_memory.py b/tests/test_tensor_memory.py index 0c64faf28d..f7fa39037e 100644 --- a/tests/test_tensor_memory.py +++ b/tests/test_tensor_memory.py @@ -203,7 +203,7 @@ def test_set_tensors_length_error(config, tensor_cls): "shape", [ (536870912, 1), # bytesize > 2**31 -
(536870912, 4) # bytesize > 2**31 and element count > 2**31 + (134217728, 4) # bytesize > 2**31 and element count > 2**31 ]) def test_tensorindex_bug(config: Config, tensor_cls: TensorMemory, shape: typing.Tuple[int, int]): """ From f1990b8e6f816bc1b1712b1cb67c76e493ee2407 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Tue, 27 Jun 2023 17:01:28 -0700 Subject: [PATCH 04/14] WIP: Untested, todo: narrowing conversions in matx_util.cu --- .../include/morpheus/objects/dev_mem_info.hpp | 23 ++++---- .../include/morpheus/objects/rmm_tensor.hpp | 11 ++-- .../_lib/include/morpheus/objects/tensor.hpp | 8 +-- .../morpheus/objects/tensor_object.hpp | 8 +-- .../morpheus/utilities/tensor_util.hpp | 8 +-- morpheus/_lib/src/objects/dev_mem_info.cpp | 16 +++--- morpheus/_lib/src/objects/rmm_tensor.cpp | 12 ++--- morpheus/_lib/src/objects/tensor.cpp | 6 +-- morpheus/_lib/src/stages/triton_inference.cpp | 2 +- morpheus/_lib/src/utilities/cupy_util.cpp | 2 +- morpheus/_lib/src/utilities/matx_util.cu | 53 +++++++------------ morpheus/_lib/src/utilities/tensor_util.cpp | 2 +- morpheus/_lib/tests/test_dev_mem_info.cpp | 4 +- 13 files changed, 74 insertions(+), 81 deletions(-) diff --git a/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp b/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp index de96a43c1e..f3ca02b1e8 100644 --- a/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp +++ b/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp @@ -24,7 +24,8 @@ #include // for device_buffer -#include // for shared_ptr, unique_ptr & make_unique +#include // for size_t +#include // for shared_ptr, unique_ptr & make_unique namespace morpheus { /****** Component public implementations *******************/ @@ -58,7 +59,7 @@ class MORPHEUS_EXPORT DevMemInfo std::shared_ptr md, ShapeType shape, ShapeType stride, - TensorIndex offset_bytes = 0); + std::size_t offset_bytes = 0); /** * @brief Construct a new DevMemInfo object from an existing `rmm::device_buffer`. 
@@ -73,29 +74,29 @@ class MORPHEUS_EXPORT DevMemInfo DType dtype, ShapeType shape, ShapeType stride, - TensorIndex offset_bytes = 0); + std::size_t offset_bytes = 0); DevMemInfo(DevMemInfo&& other) = default; /** * @brief Return the number of bytes stored in the underlying buffer * - * @return TensorIndex + * @return std::size_t */ - TensorIndex bytes() const; + std::size_t bytes() const; /** * @brief Return the element count stored in the underlying buffer * - * @return TensorIndex + * @return std::size_t */ - TensorIndex count() const; + std::size_t count() const; /** * @brief Return the number of bytes offset from the head of the buffer * - * @return TensorIndex + * @return std::size_t */ - TensorIndex offset_bytes() const; + std::size_t offset_bytes() const; /** * @brief Return the type of the data stored in the buffer @@ -161,7 +162,7 @@ class MORPHEUS_EXPORT DevMemInfo * @param bytes * @return std::unique_ptr */ - std::unique_ptr make_new_buffer(TensorIndex bytes) const; + std::unique_ptr make_new_buffer(std::size_t bytes) const; private: // Pointer to the head of our data @@ -175,7 +176,7 @@ class MORPHEUS_EXPORT DevMemInfo const ShapeType m_stride; // Offset from head of data in bytes - const TensorIndex m_offset_bytes; + const std::size_t m_offset_bytes; // Device resources used to allocate this memory std::shared_ptr m_md; diff --git a/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp b/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp index 5a3f48190f..6a6cfab44d 100644 --- a/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp +++ b/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp @@ -24,6 +24,7 @@ #include +#include // for size_t #include #include #include @@ -46,7 +47,7 @@ class RMMTensor : public ITensor { public: RMMTensor(std::shared_ptr device_buffer, - TensorIndex offset, + std::size_t offset, DType dtype, ShapeType shape, ShapeType stride = {}); @@ -102,12 +103,12 @@ class RMMTensor : public ITensor /** * TODO(Documentation) */ - TensorIndex bytes() const final; + std::size_t bytes() const final; /** * TODO(Documentation) */ - TensorIndex count() const final; + std::size_t count() const final; /** * TODO(Documentation) @@ -152,12 +153,12 @@ class RMMTensor : public ITensor /** * TODO(Documentation) */ - TensorIndex offset_bytes() const; + std::size_t offset_bytes() const; // Memory info std::shared_ptr m_mem_descriptor; std::shared_ptr m_md; - TensorIndex m_offset; + std::size_t m_offset; // // Type info // std::string m_typestr; diff --git a/morpheus/_lib/include/morpheus/objects/tensor.hpp b/morpheus/_lib/include/morpheus/objects/tensor.hpp index a9919e2283..fcf17ba1ab 100644 --- a/morpheus/_lib/include/morpheus/objects/tensor.hpp +++ b/morpheus/_lib/include/morpheus/objects/tensor.hpp @@ -52,7 +52,7 @@ class Tensor std::string init_typestr, ShapeType init_shape, ShapeType init_strides, - TensorIndex init_offset = 0); + std::size_t init_offset = 0); ShapeType shape; ShapeType strides; @@ -66,7 +66,7 @@ class Tensor /** * TODO(Documentation) */ - TensorIndex bytes_count() const; + std::size_t bytes_count() const; /** * TODO(Documentation) @@ -85,10 +85,10 @@ class Tensor DType dtype, ShapeType shape, ShapeType strides, - TensorIndex offset = 0); + std::size_t offset = 0); private: - TensorIndex m_offset; + std::size_t m_offset; std::shared_ptr m_device_buffer; }; diff --git a/morpheus/_lib/include/morpheus/objects/tensor_object.hpp b/morpheus/_lib/include/morpheus/objects/tensor_object.hpp index 684ef7ad08..4e6f73e850 100644 --- 
a/morpheus/_lib/include/morpheus/objects/tensor_object.hpp +++ b/morpheus/_lib/include/morpheus/objects/tensor_object.hpp @@ -108,7 +108,7 @@ struct ITensorStorage virtual void* data() const = 0; // virtual const void* data() const = 0; - virtual TensorIndex bytes() const = 0; + virtual std::size_t bytes() const = 0; virtual std::shared_ptr get_memory() const = 0; // virtual TensorStorageType storage_type() const = 0; @@ -136,7 +136,7 @@ struct ITensor : public ITensorStorage, public ITensorOperations virtual RankType rank() const = 0; - virtual TensorIndex count() const = 0; + virtual std::size_t count() const = 0; virtual DType dtype() const = 0; @@ -200,12 +200,12 @@ struct TensorObject final return m_tensor->dtype(); } - TensorIndex count() const + std::size_t count() const { return m_tensor->count(); } - TensorIndex bytes() const + std::size_t bytes() const { return m_tensor->bytes(); } diff --git a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp index 933b6a0158..3724537dfe 100644 --- a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp +++ b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp @@ -111,13 +111,15 @@ struct MORPHEUS_EXPORT TensorUtils * @brief Compute the number of elements in a tensor based on the shape * * @tparam IndexT + * @tparam RetTypeT * @param shape * @return IndexT */ - template - static inline IndexT get_elem_count(const std::vector& shape) + template + static inline RetTypeT get_elem_count(const std::vector& shape) { - return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>()); + RetTypeT init_val{1}; + return std::accumulate(shape.begin(), shape.end(), init_val, std::multiplies<>()); } }; diff --git a/morpheus/_lib/src/objects/dev_mem_info.cpp b/morpheus/_lib/src/objects/dev_mem_info.cpp index 48b690d097..93aed98e39 100644 --- a/morpheus/_lib/src/objects/dev_mem_info.cpp +++ b/morpheus/_lib/src/objects/dev_mem_info.cpp @@ -17,10 +17,12 @@ #include "morpheus/objects/dev_mem_info.hpp" +#include "morpheus/types.hpp" #include "morpheus/utilities/tensor_util.hpp" // for get_elem_count #include // for DCHECK +#include #include // for uint8_t #include #include @@ -34,7 +36,7 @@ DevMemInfo::DevMemInfo(void* data, std::shared_ptr md, ShapeType shape, ShapeType stride, - TensorIndex offset_bytes) : + std::size_t offset_bytes) : m_data(data), m_dtype(std::move(dtype)), m_md(std::move(md)), @@ -49,7 +51,7 @@ DevMemInfo::DevMemInfo(std::shared_ptr buffer, DType dtype, ShapeType shape, ShapeType stride, - TensorIndex offset_bytes) : + std::size_t offset_bytes) : m_data(buffer->data()), m_dtype(std::move(dtype)), m_shape(std::move(shape)), @@ -61,17 +63,17 @@ DevMemInfo::DevMemInfo(std::shared_ptr buffer, << "Inconsistent dimensions, values would extend past the end of the device_buffer"; } -TensorIndex DevMemInfo::bytes() const +std::size_t DevMemInfo::bytes() const { return count() * m_dtype.item_size(); } -TensorIndex DevMemInfo::count() const +std::size_t DevMemInfo::count() const { - return TensorUtils::get_elem_count(m_shape); + return TensorUtils::get_elem_count(m_shape); } -TensorIndex DevMemInfo::offset_bytes() const +std::size_t DevMemInfo::offset_bytes() const { return m_offset_bytes; } @@ -111,7 +113,7 @@ std::shared_ptr DevMemInfo::memory() const return m_md; } -std::unique_ptr DevMemInfo::make_new_buffer(TensorIndex bytes) const +std::unique_ptr DevMemInfo::make_new_buffer(std::size_t bytes) const { return std::make_unique(bytes, m_md->cuda_stream, 
m_md->memory_resource); } diff --git a/morpheus/_lib/src/objects/rmm_tensor.cpp b/morpheus/_lib/src/objects/rmm_tensor.cpp index fabfd44382..b8c7e34477 100644 --- a/morpheus/_lib/src/objects/rmm_tensor.cpp +++ b/morpheus/_lib/src/objects/rmm_tensor.cpp @@ -29,7 +29,7 @@ #include // for cuda_stream_per_thread #include -#include // for copy, transform +#include // for copy, transform #include // for multiplies, plus, minus #include // for back_insert_iterator, back_inserter #include @@ -42,7 +42,7 @@ namespace morpheus { /****** Component public implementations *******************/ /****** RMMTensor****************************************/ RMMTensor::RMMTensor(std::shared_ptr device_buffer, - TensorIndex offset, + std::size_t offset, DType dtype, ShapeType shape, ShapeType stride) : @@ -82,12 +82,12 @@ DType RMMTensor::dtype() const return m_dtype; } -TensorIndex RMMTensor::count() const +std::size_t RMMTensor::count() const { - return TensorUtils::get_elem_count(m_shape); + return TensorUtils::get_elem_count(m_shape); } -TensorIndex RMMTensor::bytes() const +std::size_t RMMTensor::bytes() const { return count() * m_dtype.item_size(); } @@ -175,7 +175,7 @@ std::shared_ptr RMMTensor::as_type(DType new_dtype) const return std::make_shared(new_data_buffer, 0, new_dtype, m_shape, m_stride); } -TensorIndex RMMTensor::offset_bytes() const +std::size_t RMMTensor::offset_bytes() const { return m_offset * m_dtype.item_size(); } diff --git a/morpheus/_lib/src/objects/tensor.cpp b/morpheus/_lib/src/objects/tensor.cpp index 4be4af62a3..1fa427e26d 100644 --- a/morpheus/_lib/src/objects/tensor.cpp +++ b/morpheus/_lib/src/objects/tensor.cpp @@ -37,7 +37,7 @@ Tensor::Tensor(std::shared_ptr buffer, std::string init_typestr, ShapeType init_shape, ShapeType init_strides, - TensorIndex init_offset) : + std::size_t init_offset) : m_device_buffer(std::move(buffer)), typestr(std::move(init_typestr)), shape(std::move(init_shape)), @@ -50,7 +50,7 @@ void* Tensor::data() const return static_cast(m_device_buffer->data()) + m_offset; } -TensorIndex Tensor::bytes_count() const +std::size_t Tensor::bytes_count() const { // temp just return without shape, size, offset, etc return m_device_buffer->size(); @@ -73,7 +73,7 @@ auto Tensor::get_stream() const } TensorObject Tensor::create( - std::shared_ptr buffer, DType dtype, ShapeType shape, ShapeType strides, TensorIndex offset) + std::shared_ptr buffer, DType dtype, ShapeType shape, ShapeType strides, std::size_t offset) { auto md = std::make_shared(buffer->stream(), buffer->memory_resource()); diff --git a/morpheus/_lib/src/stages/triton_inference.cpp b/morpheus/_lib/src/stages/triton_inference.cpp index 2767070d47..15ff365ed1 100644 --- a/morpheus/_lib/src/stages/triton_inference.cpp +++ b/morpheus/_lib/src/stages/triton_inference.cpp @@ -103,7 +103,7 @@ void build_output_tensors(TensorIndex count, // First dimension will always end up being the number of rows in the dataframe total_shape[0] = count; - auto elem_count = TensorUtils::get_elem_count(total_shape); + auto elem_count = TensorUtils::get_elem_count(total_shape); // Create the output memory auto output_buffer = std::make_shared(elem_count * model_output.datatype.item_size(), diff --git a/morpheus/_lib/src/utilities/cupy_util.cpp b/morpheus/_lib/src/utilities/cupy_util.cpp index 9838b4f87d..3d58059a17 100644 --- a/morpheus/_lib/src/utilities/cupy_util.cpp +++ b/morpheus/_lib/src/utilities/cupy_util.cpp @@ -135,7 +135,7 @@ TensorObject CupyUtil::cupy_to_tensor(pybind11::object cupy_array) auto dtype = 
DType::from_numpy(typestr); // Get the size from the shape and dtype - auto size = static_cast(TensorUtils::get_elem_count(shape)) * dtype.item_size(); + auto size = TensorUtils::get_elem_count(shape) * dtype.item_size(); // Finally, handle the stream auto stream_value = arr_interface["stream"].cast>(); diff --git a/morpheus/_lib/src/utilities/matx_util.cu b/morpheus/_lib/src/utilities/matx_util.cu index c8ded8ed2a..e9a2fdfa9c 100644 --- a/morpheus/_lib/src/utilities/matx_util.cu +++ b/morpheus/_lib/src/utilities/matx_util.cu @@ -24,7 +24,7 @@ #include #include -#include // for size_t +#include // for size_t namespace morpheus { @@ -135,9 +135,9 @@ struct MatxUtil__MatxOffsetSegIds { tensorShape_2d shape({element_count, 3}); - auto input_tensor = matx::make_tensor(static_cast(input_data), shape); + auto input_tensor = matx::make_tensor(static_cast(input_data), shape); - auto col0 = input_tensor.template Slice<1>({0, 0}, {matx::matxEnd, matx::matxDropDim}); + auto col0 = input_tensor.template Slice<1>({0, 0}, {matx::matxEnd, matx::matxDropDim}); // Simply add the offset to the column (col0 = col0 + offset).run(stream.value()); @@ -184,7 +184,6 @@ struct MatxUtil__MatxLogits */ struct MatxUtil__MatxTranspose { - TensorIndex element_count; rmm::cuda_stream_view stream; TensorIndex rows; TensorIndex cols; @@ -260,8 +259,7 @@ struct MatxUtil__MatxThreshold // Output is always 1 column tensorShape_1d output_shape({rows}); - matx::DefaultDescriptor<2> desc{{rows, cols}, - {stride[0], stride[1]}}; + matx::DefaultDescriptor<2> desc{{rows, cols}, {stride[0], stride[1]}}; auto input_tensor = matx::make_tensor>(static_cast(input_data), std::move(desc)); @@ -284,9 +282,7 @@ struct MatxUtil__MatxThreshold template void threshold(void* input_data, void* output_data, double threshold, const ShapeType& stride) { - matx::DefaultDescriptor<2> input_desc{ - {rows, cols}, - {stride[0], stride[1]}}; + matx::DefaultDescriptor<2> input_desc{{rows, cols}, {stride[0], stride[1]}}; // Input & Output have the same shape & stride. The make_tensor API requires a move for the descriptor // so we need to take a copy of it here. 
@@ -342,11 +338,7 @@ struct MatxUtil__MatxReduceMax if (idx != seq_ids[start + seq_id_offset]) { DCHECK(seq_ids[start + seq_id_offset] - output_offset < num_output_rows); - reduce_rows(input_tensor, - output_tensor, - start, - i, - seq_ids[start + seq_id_offset] - output_offset); + reduce_rows(input_tensor, output_tensor, start, i, seq_ids[start + seq_id_offset] - output_offset); start = i; } } @@ -355,11 +347,7 @@ struct MatxUtil__MatxReduceMax << "\nstart=" << start << " seq_ids[start+seq_id_offset]-output_offset=" << seq_ids[start + seq_id_offset] - output_offset << " num_output_rows=" << num_output_rows; - reduce_rows(input_tensor, - output_tensor, - start, - num_input_rows, - seq_ids[start + seq_id_offset] - output_offset); + reduce_rows(input_tensor, output_tensor, start, num_input_rows, seq_ids[start + seq_id_offset] - output_offset); } template @@ -399,13 +387,17 @@ std::shared_ptr MatxUtil::cast(const DevMemInfo& input, Type return output; } -std::shared_ptr MatxUtil::create_seq_ids(TensorIndex row_count, TensorIndex fea_len, TypeId output_type, std::shared_ptr md, TensorIndex start_idx) +std::shared_ptr MatxUtil::create_seq_ids(TensorIndex row_count, + TensorIndex fea_len, + TypeId output_type, + std::shared_ptr md, + TensorIndex start_idx) { auto output_dtype = DType(output_type); // Now create the output - auto output = - std::make_shared(output_dtype.item_size() * row_count * 3, md->cuda_stream, md->memory_resource); + auto output = std::make_shared( + output_dtype.item_size() * row_count * 3, md->cuda_stream, md->memory_resource); cudf::type_dispatcher(cudf::data_type{output_dtype.cudf_type_id()}, MatxUtil__MatxCreateSegIds{start_idx, row_count, fea_len, output->stream()}, @@ -414,8 +406,8 @@ std::shared_ptr MatxUtil::create_seq_ids(TensorIndex row_cou return output; } -void MatxUtil::offset_seq_ids(const DevMemInfo& input, TensorIndex offset){ - +void MatxUtil::offset_seq_ids(const DevMemInfo& input, TensorIndex offset) +{ cudf::type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()}, MatxUtil__MatxOffsetSegIds{offset, input.shape(0), rmm::cuda_stream_per_thread}, input.data()); @@ -442,7 +434,7 @@ std::shared_ptr MatxUtil::transpose(const DevMemInfo& input) auto output = input.make_new_buffer(input.bytes()); cudf::type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()}, - MatxUtil__MatxTranspose{input.count(), output->stream(), input.shape(0), input.shape(1)}, + MatxUtil__MatxTranspose{output->stream(), input.shape(0), input.shape(1)}, input.data(), output->data()); @@ -484,7 +476,7 @@ std::shared_ptr MatxUtil::reduce_max(const DevMemInfo& input auto num_input_rows = input.shape(0); auto num_input_cols = input.shape(1); - TensorIndex output_element_count = output_shape[0] * output_shape[1]; + std::size_t output_element_count = output_shape[0] * output_shape[1]; std::size_t output_buff_size = dtype.item_size() * output_element_count; DCHECK(output_element_count <= input.count()) << "Output buffer size should be less than or equal to the input"; @@ -492,13 +484,8 @@ std::shared_ptr MatxUtil::reduce_max(const DevMemInfo& input auto output = input.make_new_buffer(output_buff_size); - MatxUtil__MatxReduceMax matx_reduce_max{num_input_rows, - output_shape[0], - num_input_cols, - input.stride(), - seq_ids, - seq_id_offset, - output->stream()}; + MatxUtil__MatxReduceMax matx_reduce_max{ + num_input_rows, output_shape[0], num_input_cols, input.stride(), seq_ids, seq_id_offset, output->stream()}; cudf::type_dispatcher(cudf_type, matx_reduce_max, input.data(), 
output->data()); diff --git a/morpheus/_lib/src/utilities/tensor_util.cpp b/morpheus/_lib/src/utilities/tensor_util.cpp index 876dcd043e..3277e325ee 100644 --- a/morpheus/_lib/src/utilities/tensor_util.cpp +++ b/morpheus/_lib/src/utilities/tensor_util.cpp @@ -57,7 +57,7 @@ void TensorUtils::set_contiguous_stride(const ShapeType& shape, ShapeType& strid bool TensorUtils::has_contiguous_stride(const ShapeType& shape, const ShapeType& stride) { DCHECK_EQ(shape.size(), stride.size()); - auto count = get_elem_count(shape); + auto count = get_elem_count(shape); return (shape[0] * stride[0] == count); } diff --git a/morpheus/_lib/tests/test_dev_mem_info.cpp b/morpheus/_lib/tests/test_dev_mem_info.cpp index 1228038a03..69e3c02c3b 100644 --- a/morpheus/_lib/tests/test_dev_mem_info.cpp +++ b/morpheus/_lib/tests/test_dev_mem_info.cpp @@ -62,7 +62,7 @@ TEST_F(TestDevMemInfo, RmmBufferConstructor) auto buffer = std::make_shared(ByteSize, rmm::cuda_stream_legacy, mem_resource.get()); // Set the offset to the second row in the buffer - DevMemInfo dm{buffer, Dtype, {Rows - 1, Cols}, {1, Rows}, static_cast(Dtype.item_size())}; + DevMemInfo dm{buffer, Dtype, {Rows - 1, Cols}, {1, Rows}, Dtype.item_size()}; EXPECT_EQ(dm.bytes(), (Rows - 1) * Cols * Dtype.item_size()); EXPECT_EQ(dm.count(), (Rows - 1) * Cols); @@ -98,7 +98,7 @@ TEST_F(TestDevMemInfo, VoidPtrConstructor) auto md = std::make_shared(rmm::cuda_stream_legacy, mem_resource.get()); // Set the offset to the second row in the buffer - DevMemInfo dm{buffer->data(), Dtype, md, {Rows - 1, Cols}, {1, Rows}, static_cast(Dtype.item_size())}; + DevMemInfo dm{buffer->data(), Dtype, md, {Rows - 1, Cols}, {1, Rows}, Dtype.item_size()}; EXPECT_EQ(dm.bytes(), (Rows - 1) * Cols * Dtype.item_size()); EXPECT_EQ(dm.count(), (Rows - 1) * Cols); From b0fbdf3f4a078f041fb4d2a52ad4803e29c7956d Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 08:29:23 -0700 Subject: [PATCH 05/14] Perform a numeric cast when needed --- morpheus/_lib/src/utilities/matx_util.cu | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/morpheus/_lib/src/utilities/matx_util.cu b/morpheus/_lib/src/utilities/matx_util.cu index e9a2fdfa9c..019a10cb0b 100644 --- a/morpheus/_lib/src/utilities/matx_util.cu +++ b/morpheus/_lib/src/utilities/matx_util.cu @@ -18,6 +18,7 @@ #include "morpheus/types.hpp" #include "morpheus/utilities/matx_util.hpp" +#include // for numeric_cast #include #include #include @@ -31,6 +32,10 @@ namespace morpheus { using tensorShape_1d = std::array; using tensorShape_2d = std::array; +// Since we are building MatX in 32bit mode, we can only support up to 2^31 in any on dimension, for count type values +// that consider multiple dimensions we use std::size_t, while other operations such as MatxUtil__MatxCast which only +// opperate on a single dimension use TensorIndex. + // Component-private classes. 
// ************ MatxUtil__MatxCast**************// /** @@ -378,7 +383,7 @@ std::shared_ptr MatxUtil::cast(const DevMemInfo& input, Type cudf::double_type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()}, cudf::data_type{output_dtype.cudf_type_id()}, - MatxUtil__MatxCast{input.count(), output->stream()}, + MatxUtil__MatxCast{boost::numeric_cast(input.count()), output->stream()}, input.data(), output->data()); @@ -421,7 +426,7 @@ std::shared_ptr MatxUtil::logits(const DevMemInfo& input) auto output = input.make_new_buffer(input.bytes()); cudf::type_dispatcher(cudf::data_type{input.dtype().cudf_type_id()}, - MatxUtil__MatxLogits{input.count(), output->stream()}, + MatxUtil__MatxLogits{boost::numeric_cast(input.count()), output->stream()}, input.data(), output->data()); From 42e5c50e7eae3b3ba82fb98f4391e25cb0c73c92 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 08:29:57 -0700 Subject: [PATCH 06/14] Set default return type for get_elem_count to the one we actually use --- morpheus/_lib/include/morpheus/utilities/tensor_util.hpp | 5 +++-- morpheus/_lib/src/objects/dev_mem_info.cpp | 2 +- morpheus/_lib/src/objects/rmm_tensor.cpp | 2 +- morpheus/_lib/src/stages/triton_inference.cpp | 2 +- morpheus/_lib/src/utilities/cupy_util.cpp | 2 +- morpheus/_lib/src/utilities/tensor_util.cpp | 2 +- 6 files changed, 8 insertions(+), 7 deletions(-) diff --git a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp index 3724537dfe..956fb98ecc 100644 --- a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp +++ b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp @@ -20,7 +20,8 @@ #include "morpheus/export.h" #include "morpheus/types.hpp" // for ShapeType, TensorIndex -#include // IWYU pragma: keep +#include // IWYU pragma: keep +#include #include // for multiplies #include // for ostream #include // for accumulate @@ -115,7 +116,7 @@ struct MORPHEUS_EXPORT TensorUtils * @param shape * @return IndexT */ - template + template static inline RetTypeT get_elem_count(const std::vector& shape) { RetTypeT init_val{1}; diff --git a/morpheus/_lib/src/objects/dev_mem_info.cpp b/morpheus/_lib/src/objects/dev_mem_info.cpp index 93aed98e39..79f072eef2 100644 --- a/morpheus/_lib/src/objects/dev_mem_info.cpp +++ b/morpheus/_lib/src/objects/dev_mem_info.cpp @@ -70,7 +70,7 @@ std::size_t DevMemInfo::bytes() const std::size_t DevMemInfo::count() const { - return TensorUtils::get_elem_count(m_shape); + return TensorUtils::get_elem_count(m_shape); } std::size_t DevMemInfo::offset_bytes() const diff --git a/morpheus/_lib/src/objects/rmm_tensor.cpp b/morpheus/_lib/src/objects/rmm_tensor.cpp index b8c7e34477..196364d93a 100644 --- a/morpheus/_lib/src/objects/rmm_tensor.cpp +++ b/morpheus/_lib/src/objects/rmm_tensor.cpp @@ -84,7 +84,7 @@ DType RMMTensor::dtype() const std::size_t RMMTensor::count() const { - return TensorUtils::get_elem_count(m_shape); + return TensorUtils::get_elem_count(m_shape); } std::size_t RMMTensor::bytes() const diff --git a/morpheus/_lib/src/stages/triton_inference.cpp b/morpheus/_lib/src/stages/triton_inference.cpp index 15ff365ed1..2767070d47 100644 --- a/morpheus/_lib/src/stages/triton_inference.cpp +++ b/morpheus/_lib/src/stages/triton_inference.cpp @@ -103,7 +103,7 @@ void build_output_tensors(TensorIndex count, // First dimension will always end up being the number of rows in the dataframe total_shape[0] = count; - auto elem_count = TensorUtils::get_elem_count(total_shape); + auto elem_count = 
TensorUtils::get_elem_count(total_shape); // Create the output memory auto output_buffer = std::make_shared(elem_count * model_output.datatype.item_size(), diff --git a/morpheus/_lib/src/utilities/cupy_util.cpp b/morpheus/_lib/src/utilities/cupy_util.cpp index 3d58059a17..12ab0b7a2f 100644 --- a/morpheus/_lib/src/utilities/cupy_util.cpp +++ b/morpheus/_lib/src/utilities/cupy_util.cpp @@ -135,7 +135,7 @@ TensorObject CupyUtil::cupy_to_tensor(pybind11::object cupy_array) auto dtype = DType::from_numpy(typestr); // Get the size from the shape and dtype - auto size = TensorUtils::get_elem_count(shape) * dtype.item_size(); + auto size = TensorUtils::get_elem_count(shape) * dtype.item_size(); // Finally, handle the stream auto stream_value = arr_interface["stream"].cast>(); diff --git a/morpheus/_lib/src/utilities/tensor_util.cpp b/morpheus/_lib/src/utilities/tensor_util.cpp index 3277e325ee..876dcd043e 100644 --- a/morpheus/_lib/src/utilities/tensor_util.cpp +++ b/morpheus/_lib/src/utilities/tensor_util.cpp @@ -57,7 +57,7 @@ void TensorUtils::set_contiguous_stride(const ShapeType& shape, ShapeType& strid bool TensorUtils::has_contiguous_stride(const ShapeType& shape, const ShapeType& stride) { DCHECK_EQ(shape.size(), stride.size()); - auto count = get_elem_count(shape); + auto count = get_elem_count(shape); return (shape[0] * stride[0] == count); } From 8477244980e348232201fc27bc997f73bb054a21 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 09:21:23 -0700 Subject: [PATCH 07/14] Lint fixes --- .../_lib/include/morpheus/objects/tensor.hpp | 1 + morpheus/_lib/src/objects/rmm_tensor.cpp | 2 +- morpheus/_lib/src/utilities/cupy_util.cpp | 1 - tests/test_tensor_memory.py | 132 +++++++++--------- 4 files changed, 70 insertions(+), 66 deletions(-) diff --git a/morpheus/_lib/include/morpheus/objects/tensor.hpp b/morpheus/_lib/include/morpheus/objects/tensor.hpp index fcf17ba1ab..14c59583aa 100644 --- a/morpheus/_lib/include/morpheus/objects/tensor.hpp +++ b/morpheus/_lib/include/morpheus/objects/tensor.hpp @@ -23,6 +23,7 @@ #include +#include // for size_t #include // for uint8_t #include #include diff --git a/morpheus/_lib/src/objects/rmm_tensor.cpp b/morpheus/_lib/src/objects/rmm_tensor.cpp index 196364d93a..d4095d2822 100644 --- a/morpheus/_lib/src/objects/rmm_tensor.cpp +++ b/morpheus/_lib/src/objects/rmm_tensor.cpp @@ -29,7 +29,7 @@ #include // for cuda_stream_per_thread #include -#include // for copy, transform +#include // for copy, transform #include // for multiplies, plus, minus #include // for back_insert_iterator, back_inserter #include diff --git a/morpheus/_lib/src/utilities/cupy_util.cpp b/morpheus/_lib/src/utilities/cupy_util.cpp index 12ab0b7a2f..cfcd639ce8 100644 --- a/morpheus/_lib/src/utilities/cupy_util.cpp +++ b/morpheus/_lib/src/utilities/cupy_util.cpp @@ -34,7 +34,6 @@ #include // for device_buffer #include // for array -#include // for size_t #include // for uintptr_t #include // for make_shared #include diff --git a/tests/test_tensor_memory.py b/tests/test_tensor_memory.py index f7fa39037e..68dacadddb 100644 --- a/tests/test_tensor_memory.py +++ b/tests/test_tensor_memory.py @@ -31,36 +31,40 @@ from morpheus.messages.memory.response_memory import ResponseMemoryAE from morpheus.messages.memory.response_memory import ResponseMemoryProbs from morpheus.messages.memory.tensor_memory import TensorMemory +from morpheus.utils.type_aliases import DataFrameType from utils import TEST_DIRS INPUT_FILE = os.path.join(TEST_DIRS.tests_data_dir, 'filter_probs.csv') +# Many 
of our tests require the config fixture, but don't use the value. +# pylint: disable=unused-argument -def compare_tensors(t1, t2): - assert sorted(t1.keys()) == sorted(t2.keys()) - for (k, v1) in t1.items(): - assert (v1 == t2[k]).all() +def compare_tensors(tensors1: typing.Dict[str, cp.ndarray], tensors2: typing.Dict[str, cp.ndarray]): + assert sorted(tensors1.keys()) == sorted(tensors2.keys()) + for (k, val1) in tensors1.items(): + assert (val1 == tensors2[k]).all() -def check_tensor_memory(cls, count, tensors): + +def check_tensor_memory(cls: type, count: int, tensors: typing.Dict[str, cp.ndarray]): other_tensors = {'ones': cp.ones(count), 'zeros': cp.zeros(count)} - m = cls(count=count) - assert m.count == count - assert m.get_tensors() == {} + mem = cls(count=count) + assert mem.count == count + assert mem.get_tensors() == {} - m.set_tensors(tensors) - compare_tensors(m.get_tensors(), tensors) + mem.set_tensors(tensors) + compare_tensors(mem.get_tensors(), tensors) - m.set_tensors(other_tensors) - compare_tensors(m.get_tensors(), other_tensors) + mem.set_tensors(other_tensors) + compare_tensors(mem.get_tensors(), other_tensors) - m = cls(count=count, tensors=tensors) - assert m.count == count - compare_tensors(m.get_tensors(), tensors) + mem = cls(count=count, tensors=tensors) + assert mem.count == count + compare_tensors(mem.get_tensors(), tensors) - m.set_tensors(other_tensors) - compare_tensors(m.get_tensors(), other_tensors) + mem.set_tensors(other_tensors) + compare_tensors(mem.get_tensors(), other_tensors) with pytest.raises(TypeError): cls(count) @@ -69,7 +73,7 @@ def check_tensor_memory(cls, count, tensors): cls(count, tensors) -def test_tensor_memory(config): +def test_tensor_memory(config: Config): test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] @@ -84,94 +88,94 @@ def test_tensor_memory(config): @pytest.mark.use_python -def test_inference_memory_ae(config): +def test_inference_memory_ae(config: Config): test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] - input = cp.array(test_data[:, 0]) + input_tensor = cp.array(test_data[:, 0]) seq_ids = cp.array(test_data[:, 1]) - m = InferenceMemoryAE(count=count, input=input, seq_ids=seq_ids) + mem = InferenceMemoryAE(count=count, input=input_tensor, seq_ids=seq_ids) - assert m.count == count - compare_tensors(m.get_tensors(), {'input': input, 'seq_ids': seq_ids}) - assert (m.input == input).all() - assert (m.seq_ids == seq_ids).all() + assert mem.count == count + compare_tensors(mem.get_tensors(), {'input': input_tensor, 'seq_ids': seq_ids}) + assert (mem.input == input_tensor).all() + assert (mem.seq_ids == seq_ids).all() with pytest.raises(TypeError): - InferenceMemoryAE(count, input, seq_ids) + InferenceMemoryAE(count, input_tensor, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def test_inference_memory_fil(config): +def test_inference_memory_fil(config: Config): test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] input_0 = cp.array(test_data[:, 0]) seq_ids = cp.array(test_data[:, 1]) - m = InferenceMemoryFIL(count=count, input__0=input_0, seq_ids=seq_ids) + mem = InferenceMemoryFIL(count=count, input__0=input_0, seq_ids=seq_ids) - assert m.count == count - compare_tensors(m.get_tensors(), {'input__0': input_0, 'seq_ids': seq_ids}) - assert (m.input__0 == input_0).all() - assert (m.seq_ids == seq_ids).all() + assert mem.count == count + 
compare_tensors(mem.get_tensors(), {'input__0': input_0, 'seq_ids': seq_ids}) + assert (mem.input__0 == input_0).all() + assert (mem.seq_ids == seq_ids).all() with pytest.raises(TypeError): - InferenceMemoryFIL(count, input_0, seq_ids) + InferenceMemoryFIL(count, input_0, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def test_inference_memory_nlp(config): +def test_inference_memory_nlp(config: Config): test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] input_ids = cp.array(test_data[:, 0]) input_mask = cp.array(test_data[:, 1]) seq_ids = cp.array(test_data[:, 2]) - m = InferenceMemoryNLP(count=count, input_ids=input_ids, input_mask=input_mask, seq_ids=seq_ids) + mem = InferenceMemoryNLP(count=count, input_ids=input_ids, input_mask=input_mask, seq_ids=seq_ids) - assert m.count == count - compare_tensors(m.get_tensors(), {'input_ids': input_ids, 'input_mask': input_mask, 'seq_ids': seq_ids}) - assert (m.input_ids == input_ids).all() - assert (m.input_mask == input_mask).all() - assert (m.seq_ids == seq_ids).all() + assert mem.count == count + compare_tensors(mem.get_tensors(), {'input_ids': input_ids, 'input_mask': input_mask, 'seq_ids': seq_ids}) + assert (mem.input_ids == input_ids).all() + assert (mem.input_mask == input_mask).all() + assert (mem.seq_ids == seq_ids).all() with pytest.raises(TypeError): - InferenceMemoryNLP(count, input_ids, input_mask, seq_ids) + InferenceMemoryNLP(count, input_ids, input_mask, seq_ids) # pylint: disable=too-many-function-args,missing-kwoa -def check_response_memory_probs_and_ae(cls): +def check_response_memory_probs_and_ae(cls: type): test_data = cp.array(np.loadtxt(INPUT_FILE, delimiter=",", skiprows=1)) count = test_data.shape[0] - m = cls(count=count, probs=test_data) - assert m.count == count - compare_tensors(m.get_tensors(), {'probs': test_data}) - assert (m.get_output('probs') == test_data).all() + mem = cls(count=count, probs=test_data) + assert mem.count == count + compare_tensors(mem.get_tensors(), {'probs': test_data}) + assert (mem.get_output('probs') == test_data).all() with pytest.raises(TypeError): cls(count, test_data) - return m + return mem @pytest.mark.use_python -def test_response_memory_ae(config, filter_probs_df): - m = check_response_memory_probs_and_ae(ResponseMemoryAE) +def test_response_memory_ae(config: Config, filter_probs_df: DataFrameType): + mem = check_response_memory_probs_and_ae(ResponseMemoryAE) - assert m.user_id == "" - assert m.explain_df is None + assert mem.user_id == "" + assert mem.explain_df is None - m.user_id = "testy" - m.explain_df = filter_probs_df + mem.user_id = "testy" + mem.explain_df = filter_probs_df - assert m.user_id == "testy" - assert (m.explain_df.values == filter_probs_df.values).all() + assert mem.user_id == "testy" + assert (mem.explain_df.values == filter_probs_df.values).all() -def test_response_memory_probs(config): +def test_response_memory_probs(config: Config): check_response_memory_probs_and_ae(ResponseMemoryProbs) @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_constructor_length_error(config, tensor_cls): +def test_constructor_length_error(config: Config, tensor_cls: type): count = 10 tensors = {"a": cp.zeros(count), "b": cp.ones(count)} @@ -180,22 +184,22 @@ def test_constructor_length_error(config, tensor_cls): @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_set_tensor_length_error(config, tensor_cls): +def 
test_set_tensor_length_error(config: Config, tensor_cls: type): count = 10 - m = tensor_cls(count=count) + mem = tensor_cls(count=count) with pytest.raises(ValueError): - m.set_tensor('a', cp.zeros(count + 1)) + mem.set_tensor('a', cp.zeros(count + 1)) @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) -def test_set_tensors_length_error(config, tensor_cls): +def test_set_tensors_length_error(config: Config, tensor_cls: type): count = 10 tensors = {"a": cp.zeros(count), "b": cp.ones(count)} - m = tensor_cls(count=count + 1) + mem = tensor_cls(count=count + 1) with pytest.raises(ValueError): - m.set_tensors(tensors) + mem.set_tensors(tensors) @pytest.mark.parametrize("tensor_cls", [TensorMemory, InferenceMemory, ResponseMemory]) @@ -205,7 +209,7 @@ def test_set_tensors_length_error(config, tensor_cls): (536870912, 1), # bytesize > 2**31 (134217728, 4) # bytesize > 2**31 and element count > 2**31 ]) -def test_tensorindex_bug(config: Config, tensor_cls: TensorMemory, shape: typing.Tuple[int, int]): +def test_tensorindex_bug(config: Config, tensor_cls: type, shape: typing.Tuple[int, int]): """ Test for issue #1004. We use a 32bit signed integer for shape and strides, but we shouldn't for element counts and byte sizes. From ea69b80d7cfb7c3feb5dd4348a9dcb5f40ed68e4 Mon Sep 17 00:00:00 2001 From: David Gardner <96306125+dagardner-nv@users.noreply.github.com> Date: Wed, 28 Jun 2023 09:37:48 -0700 Subject: [PATCH 08/14] Update morpheus/_lib/src/utilities/cupy_util.cpp Co-authored-by: Michael Demoret <42954918+mdemoret-nv@users.noreply.github.com> --- morpheus/_lib/src/utilities/cupy_util.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/morpheus/_lib/src/utilities/cupy_util.cpp b/morpheus/_lib/src/utilities/cupy_util.cpp index cfcd639ce8..01cf0578aa 100644 --- a/morpheus/_lib/src/utilities/cupy_util.cpp +++ b/morpheus/_lib/src/utilities/cupy_util.cpp @@ -79,9 +79,6 @@ pybind11::object CupyUtil::tensor_to_cupy(const TensorObject& tensor) auto ptr = (uintptr_t)tensor.data(); auto nbytes = tensor.bytes(); - - DCHECK(nbytes > 0); - auto owner = py_tensor; int dev_id = -1; From 1e6573d49165f81c4c7e567fe7284790863e3195 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 09:42:24 -0700 Subject: [PATCH 09/14] Always set return type to size_t --- .../_lib/include/morpheus/utilities/tensor_util.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp index 956fb98ecc..5bced8391d 100644 --- a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp +++ b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp @@ -112,15 +112,13 @@ struct MORPHEUS_EXPORT TensorUtils * @brief Compute the number of elements in a tensor based on the shape * * @tparam IndexT - * @tparam RetTypeT * @param shape - * @return IndexT + * @return std::size_t */ - template - static inline RetTypeT get_elem_count(const std::vector& shape) + template + static inline std::size_t get_elem_count(const std::vector& shape) { - RetTypeT init_val{1}; - return std::accumulate(shape.begin(), shape.end(), init_val, std::multiplies<>()); + return std::accumulate(shape.begin(), shape.end(), std::size_t{1}, std::multiplies<>()); } }; From 7a01590caf4cce9e51893fcd434de4a527d10b9a Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 11:00:51 -0700 Subject: [PATCH 10/14] Bring back test_cuda.cu, none of the existing tests were still valid, add 
test to verify that we can create a tensor with a large shape --- morpheus/_lib/tests/CMakeLists.txt | 4 +- morpheus/_lib/tests/test_cuda.cu | 137 ++--------------------------- 2 files changed, 12 insertions(+), 129 deletions(-) diff --git a/morpheus/_lib/tests/CMakeLists.txt b/morpheus/_lib/tests/CMakeLists.txt index 85194fed4e..47dad14e90 100644 --- a/morpheus/_lib/tests/CMakeLists.txt +++ b/morpheus/_lib/tests/CMakeLists.txt @@ -19,7 +19,7 @@ find_package(pybind11 REQUIRED) # Keep all source files sorted add_executable(test_libmorpheus - # test_cuda.cu + test_cuda.cu io/test_data_loader.cpp io/test_data_loader_registry.cpp io/test_loaders.cpp @@ -53,6 +53,8 @@ add_test( set_target_properties(test_libmorpheus PROPERTIES INSTALL_RPATH "$ORIGIN/.." + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) install( diff --git a/morpheus/_lib/tests/test_cuda.cu b/morpheus/_lib/tests/test_cuda.cu index 47c9d43f54..4c9b3311e5 100644 --- a/morpheus/_lib/tests/test_cuda.cu +++ b/morpheus/_lib/tests/test_cuda.cu @@ -17,138 +17,19 @@ #include "test_morpheus.hpp" -#include "morpheus/objects/tensor_object.hpp" - -#include -#include #include -#include -#include // for MRC_CHECK_CUDA -#include // for enqueue_stream_sync_event -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -using namespace mrc::memory::literals; using namespace morpheus; -using RankType = int; - -class TestCuda : public ::testing::Test -{ - protected: - void SetUp() override - { - MRC_CHECK_CUDA(cudaStreamCreate(&stream)); - - auto pinned = std::make_shared(); - auto device = std::make_shared(0); - - m_host_allocator = mrc::memory::OldHostAllocator(pinned, nullptr).shared(); - m_device_allocator = mrc::memory::OldDeviceAllocator(device, nullptr).shared(); - } - - void TearDown() override - { - MRC_CHECK_CUDA(cudaStreamSynchronize(stream)); - MRC_CHECK_CUDA(cudaStreamDestroy(stream)); - } - - template - TensorObject make_host_tensor(const TensorIndex (&shape)[R]) - { - auto count = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<>()); - auto md = m_host_allocator->allocate_descriptor(count * sizeof(T)).make_shared(); - std::vector s(std::begin(shape), std::end(shape)); - - auto tensor = std::make_shared( - md, 0, DataType(TypeId::FLOAT32), std::vector{s}, std::vector{}); - - return TensorObject(md, tensor); - } - - template - TensorObject make_device_tensor(const TensorIndex (&shape)[R]) - { - auto count = std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<>()); - auto md = m_device_allocator->allocate_descriptor(count * sizeof(T)).make_shared(); - std::vector s(std::begin(shape), std::end(shape)); - - auto tensor = std::make_shared(md, 0, DataType(TypeId::FLOAT32), s); - - return TensorObject(md, std::move(tensor)); - } - - cudaStream_t stream; // NOLINT - - std::shared_ptr m_host_allocator; - std::shared_ptr m_device_allocator; -}; - -template -auto await_matx(matx::BaseOp& op, cudaStream_t stream) -{ - op.run(stream); - return mrc::enqueue_stream_sync_event(stream); -} - -void test_1d(const TensorObject& one_d) -{ - CHECK_EQ(one_d.rank(), 1); - CHECK_EQ(one_d.dtype_size(), 4); - CHECK_EQ(one_d.count(), 100); - CHECK_EQ(one_d.bytes(), 400); - CHECK_EQ(one_d.shape(0), 100); - CHECK_EQ(one_d.stride(0), 1); -} - -void test_2d(const TensorObject& two_d) -{ - CHECK_EQ(two_d.rank(), 2); - CHECK_EQ(two_d.dtype_size(), 4); - CHECK_EQ(two_d.count(), 100); - CHECK_EQ(two_d.bytes(), 400); - CHECK_EQ(two_d.shape(0), 10); - 
CHECK_EQ(two_d.shape(1), 10); - - // row major - CHECK_EQ(two_d.stride(0), 10); - CHECK_EQ(two_d.stride(1), 1); -} - -TEST_F(TestCuda, Tensor1D) -{ - auto one_d = make_host_tensor({100}); - test_1d(one_d); - - auto two_d = one_d.reshape({10, 10}); - test_2d(two_d); -} - -TEST_F(TestCuda, Tensor2D) -{ - auto two_d = make_host_tensor({10, 10}); - test_2d(two_d); - - auto one_d = two_d.reshape({100}); - test_1d(one_d); - - CHECK_EQ(one_d.data(), two_d.data()); -} +TEST_CLASS(Cuda); -TEST_F(TestCuda, Shape) +TEST_F(TestCuda, LargeShape) { - std::array array_2d = {3, 5}; - matx::tensorShape_t<2> shape_2d(array_2d.data()); + // Test for issue #1004 Tensor shape with large dimensions, each dimension is < 2^31, but the total number of + // elements is > 2^31 as is the number of bytes. + const std::int32_t rows = 134217728; + const std::int32_t cols = 4; + auto tensor = matx::make_tensor({rows, cols}); + EXPECT_EQ(tensor.Size(0), rows); + EXPECT_EQ(tensor.Size(1), cols); } From 4c982a47ce926271bb8e13490131b41d3b8a1ce5 Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 11:02:36 -0700 Subject: [PATCH 11/14] Formatting --- morpheus/_lib/src/utilities/cupy_util.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/morpheus/_lib/src/utilities/cupy_util.cpp b/morpheus/_lib/src/utilities/cupy_util.cpp index 01cf0578aa..e66a07235d 100644 --- a/morpheus/_lib/src/utilities/cupy_util.cpp +++ b/morpheus/_lib/src/utilities/cupy_util.cpp @@ -79,8 +79,8 @@ pybind11::object CupyUtil::tensor_to_cupy(const TensorObject& tensor) auto ptr = (uintptr_t)tensor.data(); auto nbytes = tensor.bytes(); - auto owner = py_tensor; - int dev_id = -1; + auto owner = py_tensor; + int dev_id = -1; pybind11::list shape_list; pybind11::list stride_list; From da9fba9f1f99880f39377e09b3035426c14a3fba Mon Sep 17 00:00:00 2001 From: David Gardner Date: Wed, 28 Jun 2023 11:17:09 -0700 Subject: [PATCH 12/14] Use TensorSize type alias --- .../include/morpheus/objects/dev_mem_info.hpp | 25 +++++++++---------- .../include/morpheus/objects/rmm_tensor.hpp | 17 ++++++------- .../_lib/include/morpheus/objects/tensor.hpp | 11 ++++---- .../morpheus/objects/tensor_object.hpp | 12 ++++----- morpheus/_lib/include/morpheus/types.hpp | 2 ++ .../morpheus/utilities/tensor_util.hpp | 9 +++---- morpheus/_lib/src/objects/dev_mem_info.cpp | 14 +++++------ morpheus/_lib/src/objects/rmm_tensor.cpp | 10 ++++---- morpheus/_lib/src/objects/tensor.cpp | 6 ++--- morpheus/_lib/src/utilities/matx_util.cu | 14 +++++------ 10 files changed, 57 insertions(+), 63 deletions(-) diff --git a/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp b/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp index f3ca02b1e8..31776945ab 100644 --- a/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp +++ b/morpheus/_lib/include/morpheus/objects/dev_mem_info.hpp @@ -20,12 +20,11 @@ #include "morpheus/export.h" #include "morpheus/objects/dtype.hpp" // for DType, TypeId #include "morpheus/objects/memory_descriptor.hpp" // for MemoryDescriptor -#include "morpheus/types.hpp" // for ShapeType, TensorIndex +#include "morpheus/types.hpp" // for ShapeType, TensorIndex, TensorSize #include // for device_buffer -#include // for size_t -#include // for shared_ptr, unique_ptr & make_unique +#include // for shared_ptr, unique_ptr & make_unique namespace morpheus { /****** Component public implementations *******************/ @@ -59,7 +58,7 @@ class MORPHEUS_EXPORT DevMemInfo std::shared_ptr md, ShapeType shape, ShapeType stride, - std::size_t 
offset_bytes = 0); + TensorSize offset_bytes = 0); /** * @brief Construct a new DevMemInfo object from an existing `rmm::device_buffer`. @@ -74,29 +73,29 @@ class MORPHEUS_EXPORT DevMemInfo DType dtype, ShapeType shape, ShapeType stride, - std::size_t offset_bytes = 0); + TensorSize offset_bytes = 0); DevMemInfo(DevMemInfo&& other) = default; /** * @brief Return the number of bytes stored in the underlying buffer * - * @return std::size_t + * @return TensorSize */ - std::size_t bytes() const; + TensorSize bytes() const; /** * @brief Return the element count stored in the underlying buffer * - * @return std::size_t + * @return TensorSize */ - std::size_t count() const; + TensorSize count() const; /** * @brief Return the number of bytes offset from the head of the buffer * - * @return std::size_t + * @return TensorSize */ - std::size_t offset_bytes() const; + TensorSize offset_bytes() const; /** * @brief Return the type of the data stored in the buffer @@ -162,7 +161,7 @@ class MORPHEUS_EXPORT DevMemInfo * @param bytes * @return std::unique_ptr */ - std::unique_ptr make_new_buffer(std::size_t bytes) const; + std::unique_ptr make_new_buffer(TensorSize bytes) const; private: // Pointer to the head of our data @@ -176,7 +175,7 @@ class MORPHEUS_EXPORT DevMemInfo const ShapeType m_stride; // Offset from head of data in bytes - const std::size_t m_offset_bytes; + const TensorSize m_offset_bytes; // Device resources used to allocate this memory std::shared_ptr m_md; diff --git a/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp b/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp index 6a6cfab44d..a6c33c7054 100644 --- a/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp +++ b/morpheus/_lib/include/morpheus/objects/rmm_tensor.hpp @@ -20,11 +20,10 @@ #include "morpheus/objects/dtype.hpp" // for DType #include "morpheus/objects/memory_descriptor.hpp" #include "morpheus/objects/tensor_object.hpp" -#include "morpheus/types.hpp" // for RankType, ShapeType, TensorIndex +#include "morpheus/types.hpp" // for RankType, ShapeType, TensorIndex, TensorSize #include -#include // for size_t #include #include #include @@ -47,7 +46,7 @@ class RMMTensor : public ITensor { public: RMMTensor(std::shared_ptr device_buffer, - std::size_t offset, + TensorSize offset, DType dtype, ShapeType shape, ShapeType stride = {}); @@ -103,12 +102,12 @@ class RMMTensor : public ITensor /** * TODO(Documentation) */ - std::size_t bytes() const final; + TensorSize bytes() const final; /** * TODO(Documentation) */ - std::size_t count() const final; + TensorSize count() const final; /** * TODO(Documentation) @@ -153,16 +152,14 @@ class RMMTensor : public ITensor /** * TODO(Documentation) */ - std::size_t offset_bytes() const; + TensorSize offset_bytes() const; // Memory info std::shared_ptr m_mem_descriptor; std::shared_ptr m_md; - std::size_t m_offset; + TensorSize m_offset; - // // Type info - // std::string m_typestr; - // std::size_t m_dtype_size; + // Type info DType m_dtype; // Shape info diff --git a/morpheus/_lib/include/morpheus/objects/tensor.hpp b/morpheus/_lib/include/morpheus/objects/tensor.hpp index 14c59583aa..0717793c89 100644 --- a/morpheus/_lib/include/morpheus/objects/tensor.hpp +++ b/morpheus/_lib/include/morpheus/objects/tensor.hpp @@ -19,11 +19,10 @@ #include "morpheus/objects/dtype.hpp" #include "morpheus/objects/tensor_object.hpp" -#include "morpheus/types.hpp" // for ShapeType, TensorIndex +#include "morpheus/types.hpp" // for ShapeType, TensorIndex, TensorSize #include -#include // for size_t 
 #include <cstdint> // for uint8_t
 #include
 #include

@@ -53,7 +52,7 @@ class Tensor
            std::string init_typestr,
            ShapeType init_shape,
            ShapeType init_strides,
-           std::size_t init_offset = 0);
+           TensorSize init_offset = 0);

     ShapeType shape;
     ShapeType strides;
@@ -67,7 +66,7 @@ class Tensor
     /**
      * TODO(Documentation)
      */
-    std::size_t bytes_count() const;
+    TensorSize bytes_count() const;

     /**
      * TODO(Documentation)
@@ -86,10 +85,10 @@ class Tensor
                                DType dtype,
                                ShapeType shape,
                                ShapeType strides,
-                               std::size_t offset = 0);
+                               TensorSize offset = 0);

   private:
-    std::size_t m_offset;
+    TensorSize m_offset;

     std::shared_ptr<rmm::device_buffer> m_device_buffer;
 };
diff --git a/morpheus/_lib/include/morpheus/objects/tensor_object.hpp b/morpheus/_lib/include/morpheus/objects/tensor_object.hpp
index 4e6f73e850..577986415d 100644
--- a/morpheus/_lib/include/morpheus/objects/tensor_object.hpp
+++ b/morpheus/_lib/include/morpheus/objects/tensor_object.hpp
@@ -19,7 +19,7 @@
 #include "morpheus/objects/dtype.hpp"
 #include "morpheus/objects/memory_descriptor.hpp"
-#include "morpheus/types.hpp" // for RankType, ShapeType, TensorIndex
+#include "morpheus/types.hpp" // for RankType, ShapeType, TensorIndex, TensorSize
 #include "morpheus/utilities/string_util.hpp"

 #include <cuda_runtime.h> // for cudaMemcpyDeviceToHost & cudaMemcpy
@@ -108,7 +108,7 @@ struct ITensorStorage
     virtual void* data() const = 0;
     // virtual const void* data() const = 0;

-    virtual std::size_t bytes() const = 0;
+    virtual TensorSize bytes() const = 0;

     virtual std::shared_ptr<MemoryDescriptor> get_memory() const = 0;
     // virtual TensorStorageType storage_type() const = 0;
@@ -136,7 +136,7 @@ struct ITensor : public ITensorStorage, public ITensorOperations
     virtual RankType rank() const = 0;

-    virtual std::size_t count() const = 0;
+    virtual TensorSize count() const = 0;

     virtual DType dtype() const = 0;
@@ -200,12 +200,12 @@ struct TensorObject final
         return m_tensor->dtype();
     }

-    std::size_t count() const
+    TensorSize count() const
     {
         return m_tensor->count();
     }

-    std::size_t bytes() const
+    TensorSize bytes() const
     {
         return m_tensor->bytes();
     }
@@ -215,7 +215,7 @@ struct TensorObject final
         return m_tensor->rank();
     }

-    std::size_t dtype_size() const
+    TensorSize dtype_size() const
     {
         return m_tensor->dtype().item_size();
     }
diff --git a/morpheus/_lib/include/morpheus/types.hpp b/morpheus/_lib/include/morpheus/types.hpp
index 0a2ec1910e..2f544fc573 100644
--- a/morpheus/_lib/include/morpheus/types.hpp
+++ b/morpheus/_lib/include/morpheus/types.hpp
@@ -19,6 +19,7 @@
 #include

+#include <cstddef> // for size_t
 #include
 #include
 #include <utility> // for pair
@@ -35,6 +36,7 @@ struct TensorObject;
  */
 // NOLINTBEGIN(readability-identifier-naming)
 using TensorIndex = cudf::size_type;
+using TensorSize = std::size_t;
 using RankType = int;

 using ShapeType = std::vector<TensorIndex>;
diff --git a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp
index 5bced8391d..e747749247 100644
--- a/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp
+++ b/morpheus/_lib/include/morpheus/utilities/tensor_util.hpp
@@ -20,8 +20,7 @@
 #include "morpheus/export.h"
 #include "morpheus/types.hpp" // for ShapeType, TensorIndex

-#include // IWYU pragma: keep
-#include
+#include // IWYU pragma: keep
 #include <functional> // for multiplies
 #include // for ostream
 #include <numeric> // for accumulate
@@ -113,12 +112,12 @@ struct MORPHEUS_EXPORT TensorUtils
      *
      * @tparam IndexT
      * @param shape
-     * @return std::size_t
+     * @return TensorSize
      */
     template <typename IndexT>
-    static inline std::size_t get_elem_count(const std::vector<IndexT>& shape)
+    static inline TensorSize get_elem_count(const std::vector<IndexT>& shape)
     {
-        return std::accumulate(shape.begin(), shape.end(), std::size_t{1}, std::multiplies<>());
+        return std::accumulate(shape.begin(), shape.end(), TensorSize{1}, std::multiplies<>());
     }
 };
diff --git a/morpheus/_lib/src/objects/dev_mem_info.cpp b/morpheus/_lib/src/objects/dev_mem_info.cpp
index 79f072eef2..b656e3d31d 100644
--- a/morpheus/_lib/src/objects/dev_mem_info.cpp
+++ b/morpheus/_lib/src/objects/dev_mem_info.cpp
@@ -17,12 +17,10 @@

 #include "morpheus/objects/dev_mem_info.hpp"

-#include "morpheus/types.hpp"
 #include "morpheus/utilities/tensor_util.hpp" // for get_elem_count

 #include <glog/logging.h> // for DCHECK
-#include
 #include <cstdint> // for uint8_t
 #include
 #include
@@ -36,7 +34,7 @@ DevMemInfo::DevMemInfo(void* data,
                        std::shared_ptr<MemoryDescriptor> md,
                        ShapeType shape,
                        ShapeType stride,
-                       std::size_t offset_bytes) :
+                       TensorSize offset_bytes) :
   m_data(data),
   m_dtype(std::move(dtype)),
   m_md(std::move(md)),
@@ -51,7 +49,7 @@ DevMemInfo::DevMemInfo(std::shared_ptr<rmm::device_buffer> buffer,
                        DType dtype,
                        ShapeType shape,
                        ShapeType stride,
-                       std::size_t offset_bytes) :
+                       TensorSize offset_bytes) :
   m_data(buffer->data()),
   m_dtype(std::move(dtype)),
   m_shape(std::move(shape)),
@@ -63,17 +61,17 @@ DevMemInfo::DevMemInfo(std::shared_ptr<rmm::device_buffer> buffer,
         << "Inconsistent dimensions, values would extend past the end of the device_buffer";
 }

-std::size_t DevMemInfo::bytes() const
+TensorSize DevMemInfo::bytes() const
 {
     return count() * m_dtype.item_size();
 }

-std::size_t DevMemInfo::count() const
+TensorSize DevMemInfo::count() const
 {
     return TensorUtils::get_elem_count(m_shape);
 }

-std::size_t DevMemInfo::offset_bytes() const
+TensorSize DevMemInfo::offset_bytes() const
 {
     return m_offset_bytes;
 }
@@ -113,7 +111,7 @@ std::shared_ptr<MemoryDescriptor> DevMemInfo::memory() const
     return m_md;
 }

-std::unique_ptr<rmm::device_buffer> DevMemInfo::make_new_buffer(std::size_t bytes) const
+std::unique_ptr<rmm::device_buffer> DevMemInfo::make_new_buffer(TensorSize bytes) const
 {
     return std::make_unique<rmm::device_buffer>(bytes, m_md->cuda_stream, m_md->memory_resource);
 }
diff --git a/morpheus/_lib/src/objects/rmm_tensor.cpp b/morpheus/_lib/src/objects/rmm_tensor.cpp
index d4095d2822..511b898167 100644
--- a/morpheus/_lib/src/objects/rmm_tensor.cpp
+++ b/morpheus/_lib/src/objects/rmm_tensor.cpp
@@ -29,7 +29,7 @@
 #include <rmm/cuda_stream_view.hpp> // for cuda_stream_per_thread
 #include

-#include <algorithm> // for copy, transform
+#include <algorithm> // for copy, transform
 #include <functional> // for multiplies, plus, minus
 #include <iterator> // for back_insert_iterator, back_inserter
 #include
@@ -42,7 +42,7 @@ namespace morpheus {
 /****** Component public implementations *******************/
 /****** RMMTensor****************************************/
 RMMTensor::RMMTensor(std::shared_ptr<rmm::device_buffer> device_buffer,
-                     std::size_t offset,
+                     TensorSize offset,
                      DType dtype,
                      ShapeType shape,
                      ShapeType stride) :
@@ -82,12 +82,12 @@ DType RMMTensor::dtype() const
     return m_dtype;
 }

-std::size_t RMMTensor::count() const
+TensorSize RMMTensor::count() const
 {
     return TensorUtils::get_elem_count(m_shape);
 }

-std::size_t RMMTensor::bytes() const
+TensorSize RMMTensor::bytes() const
 {
     return count() * m_dtype.item_size();
 }
@@ -175,7 +175,7 @@ std::shared_ptr<ITensor> RMMTensor::as_type(DType new_dtype) const
     return std::make_shared<RMMTensor>(new_data_buffer, 0, new_dtype, m_shape, m_stride);
 }

-std::size_t RMMTensor::offset_bytes() const
+TensorSize RMMTensor::offset_bytes() const
 {
     return m_offset * m_dtype.item_size();
 }
diff --git a/morpheus/_lib/src/objects/tensor.cpp b/morpheus/_lib/src/objects/tensor.cpp
index 1fa427e26d..55a0851152 100644
--- a/morpheus/_lib/src/objects/tensor.cpp
+++ b/morpheus/_lib/src/objects/tensor.cpp
@@ -37,7 +37,7 @@ Tensor::Tensor(std::shared_ptr<rmm::device_buffer> buffer,
                std::string init_typestr,
                ShapeType init_shape,
                ShapeType init_strides,
-               std::size_t init_offset) :
+               TensorSize init_offset) :
   m_device_buffer(std::move(buffer)),
   typestr(std::move(init_typestr)),
   shape(std::move(init_shape)),
@@ -50,7 +50,7 @@ void* Tensor::data() const
 {
     return static_cast<uint8_t*>(m_device_buffer->data()) + m_offset;
 }

-std::size_t Tensor::bytes_count() const
+TensorSize Tensor::bytes_count() const
 {
     // temp just return without shape, size, offset, etc
     return m_device_buffer->size();
@@ -73,7 +73,7 @@ auto Tensor::get_stream() const
 }

 TensorObject Tensor::create(
-    std::shared_ptr<rmm::device_buffer> buffer, DType dtype, ShapeType shape, ShapeType strides, std::size_t offset)
+    std::shared_ptr<rmm::device_buffer> buffer, DType dtype, ShapeType shape, ShapeType strides, TensorSize offset)
 {
     auto md = std::make_shared<MemoryDescriptor>(buffer->stream(), buffer->memory_resource());
diff --git a/morpheus/_lib/src/utilities/matx_util.cu b/morpheus/_lib/src/utilities/matx_util.cu
index 019a10cb0b..a26fe83bb9 100644
--- a/morpheus/_lib/src/utilities/matx_util.cu
+++ b/morpheus/_lib/src/utilities/matx_util.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */

-#include "morpheus/types.hpp"
+#include "morpheus/types.hpp" // For TensorIndex, TensorSize
 #include "morpheus/utilities/matx_util.hpp"

 #include // for numeric_cast
@@ -33,7 +33,7 @@

 using tensorShape_1d = std::array;
 using tensorShape_2d = std::array;

 // Since we are building MatX in 32bit mode, we can only support up to 2^31 in any one dimension, for count type values
-// that consider multiple dimensions we use std::size_t, while other operations such as MatxUtil__MatxCast which only
+// that consider multiple dimensions we use TensorSize, while other operations such as MatxUtil__MatxCast which only
 // operate on a single dimension use TensorIndex.

 // Component-private classes.
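The MatX comment above captures the core constraint behind these patches: individual dimensions stay within the 32-bit TensorIndex, while element counts and byte sizes that combine dimensions need 64 bits. The following standalone sketch is not part of the patch series; it uses assumed stand-in aliases and an illustrative shape (not the real Morpheus headers) to show how a tensor whose dimensions each fit in 32 bits can still exceed a 32-bit count:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>

// Stand-ins for the aliases in morpheus/types.hpp; assumed here for illustration only.
using TensorIndex = int32_t;
using TensorSize  = std::size_t;

int main()
{
    // Every dimension fits comfortably in a signed 32-bit TensorIndex...
    const std::vector<TensorIndex> shape = {600'000'000, 4};

    // ...but the element count and byte size derived from the shape do not,
    // which is why count()/bytes()-style values use the 64-bit TensorSize.
    const TensorSize count = std::accumulate(shape.begin(), shape.end(), TensorSize{1}, std::multiplies<>());
    const TensorSize bytes = count * sizeof(float);  // e.g. a float32 tensor

    std::cout << "elements: " << count << '\n';  // 2'400'000'000
    std::cout << "bytes:    " << bytes << '\n';  // 9'600'000'000
    std::cout << "fits in TensorIndex? "
              << (count <= static_cast<TensorSize>(std::numeric_limits<TensorIndex>::max()) ? "yes" : "no")
              << '\n';  // prints "no" -- a 32-bit count or byte size would overflow here
}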
@@ -448,9 +448,9 @@ std::shared_ptr<rmm::device_buffer> MatxUtil::transpose(const DevMemInfo& input)

 std::shared_ptr<rmm::device_buffer> MatxUtil::threshold(const DevMemInfo& input, double thresh_val, bool by_row)
 {
-    const auto rows = input.shape(0);
-    const auto cols = input.shape(1);
-    std::size_t output_size = sizeof(bool) * rows;
+    const auto rows = input.shape(0);
+    const auto cols = input.shape(1);
+    TensorSize output_size = sizeof(bool) * rows;

     if (!by_row)
     {
         output_size *= cols;
@@ -481,8 +481,8 @@ std::shared_ptr<rmm::device_buffer> MatxUtil::reduce_max(const DevMemInfo& input
     auto num_input_rows = input.shape(0);
     auto num_input_cols = input.shape(1);

-    std::size_t output_element_count = output_shape[0] * output_shape[1];
-    std::size_t output_buff_size = dtype.item_size() * output_element_count;
+    TensorSize output_element_count = output_shape[0] * output_shape[1];
+    TensorSize output_buff_size = dtype.item_size() * output_element_count;

     DCHECK(output_element_count <= input.count()) << "Output buffer size should be less than or equal to the input";
     DCHECK(num_input_cols == output_shape[1]) << "Number of input and output columns must match";

From 211fa30b01952d97eb27315673f7b64e7740ff47 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Wed, 28 Jun 2023 11:28:21 -0700
Subject: [PATCH 13/14] Remove unused using

---
 morpheus/_lib/tests/test_cuda.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/morpheus/_lib/tests/test_cuda.cu b/morpheus/_lib/tests/test_cuda.cu
index 4c9b3311e5..6d6980bb8c 100644
--- a/morpheus/_lib/tests/test_cuda.cu
+++ b/morpheus/_lib/tests/test_cuda.cu
@@ -19,8 +19,6 @@

 #include

-using namespace morpheus;
-
 TEST_CLASS(Cuda);

 TEST_F(TestCuda, LargeShape)

From 84ebcd796f577895c7c4312cfec876afac499dd0 Mon Sep 17 00:00:00 2001
From: David Gardner
Date: Wed, 28 Jun 2023 11:36:32 -0700
Subject: [PATCH 14/14] formatting

---
 morpheus/_lib/src/objects/rmm_tensor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/morpheus/_lib/src/objects/rmm_tensor.cpp b/morpheus/_lib/src/objects/rmm_tensor.cpp
index 511b898167..aaa9d81965 100644
--- a/morpheus/_lib/src/objects/rmm_tensor.cpp
+++ b/morpheus/_lib/src/objects/rmm_tensor.cpp
@@ -29,7 +29,7 @@
 #include <rmm/cuda_stream_view.hpp> // for cuda_stream_per_thread
 #include

-#include <algorithm> // for copy, transform
+#include <algorithm> // for copy, transform
 #include <functional> // for multiplies, plus, minus
 #include <iterator> // for back_insert_iterator, back_inserter
 #include
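A closing illustration of the class of overflow these patches address: when two 32-bit quantities are multiplied, the product is computed in 32 bits and overflows before it is ever assigned to a 64-bit variable, so one operand has to be widened first. The sketch below uses illustrative values rather than the actual Morpheus types:

#include <cstddef>
#include <cstdint>
#include <iostream>

int main()
{
    const int32_t rows      = 600'000'000;  // fits in 32 bits (illustrative value)
    const int32_t item_size = 4;            // e.g. float32

    // Wrong: the product is computed in 32 bits and overflows *before* it is
    // widened for the assignment, even though the destination is 64-bit.
    // const std::size_t bad = rows * item_size;  // signed overflow, undefined behaviour

    // Right: widen one operand first, then multiply in 64 bits.
    const std::size_t good = static_cast<std::size_t>(rows) * item_size;

    std::cout << good << '\n';  // 2'400'000'000
}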