From c6ec9ae1b3157600738bf16eeb9d0a692b8b7670 Mon Sep 17 00:00:00 2001 From: cliffburdick Date: Tue, 23 Jan 2024 11:35:15 -0800 Subject: [PATCH] Added vector_norm and matrix_norm Renamed norm() function to abs2 Fixed several bugs with pointer ownership on default tensors Fixed bug where PreRun was not executed in some cases --- docs_input/api/math/complex/norm.rst | 18 -- docs_input/notebooks/04_radar_pipeline.ipynb | 6 +- examples/fft_conv.cu | 2 +- examples/simple_radar_pipeline.h | 4 +- include/matx/core/half_complex.h | 4 +- include/matx/core/operator_utils.h | 23 +- include/matx/core/pybind.h | 49 +++-- include/matx/core/tensor_impl.h | 9 +- include/matx/core/tie.h | 5 + include/matx/operators/all.h | 32 +-- include/matx/operators/ambgfun.h | 19 +- include/matx/operators/any.h | 18 +- include/matx/operators/cgsolve.h | 17 +- include/matx/operators/channelize_poly.h | 17 +- include/matx/operators/chol.h | 10 +- include/matx/operators/conv.h | 36 ++-- include/matx/operators/corr.h | 17 +- include/matx/operators/cov.h | 17 +- include/matx/operators/cumsum.h | 17 +- include/matx/operators/det.h | 20 +- include/matx/operators/eig.h | 1 - include/matx/operators/einsum.h | 6 + include/matx/operators/fft.h | 48 +++-- include/matx/operators/filter.h | 17 +- include/matx/operators/hist.h | 17 +- include/matx/operators/inverse.h | 22 +- include/matx/operators/lu.h | 3 +- include/matx/operators/matmul.h | 19 +- include/matx/operators/matvec.h | 17 +- include/matx/operators/max.h | 24 ++- include/matx/operators/mean.h | 20 +- include/matx/operators/median.h | 20 +- include/matx/operators/min.h | 20 +- include/matx/operators/norm.h | 201 ++++++++++++++++++ include/matx/operators/operators.h | 1 + include/matx/operators/outer.h | 17 +- include/matx/operators/percentile.h | 20 +- include/matx/operators/prod.h | 22 +- include/matx/operators/pwelch.h | 21 +- include/matx/operators/reduce.h | 15 +- include/matx/operators/resample_poly.h | 21 +- include/matx/operators/scalar_ops.h | 2 +- include/matx/operators/softmax.h | 17 +- include/matx/operators/sort.h | 20 +- include/matx/operators/stdd.h | 24 ++- include/matx/operators/sum.h | 22 +- include/matx/operators/trace.h | 20 +- include/matx/operators/unary_operators.h | 8 - include/matx/operators/var.h | 20 +- include/matx/transforms/ambgfun.h | 4 +- include/matx/transforms/cub.h | 2 + include/matx/transforms/norm.h | 91 ++++++++ include/matx/transforms/qr.h | 4 +- include/matx/transforms/reduce.h | 3 +- include/matx/transforms/svd.h | 14 +- test/00_operators/OperatorTests.cu | 7 +- test/00_transform/Norm.cu | 147 +++++++++++++ test/CMakeLists.txt | 1 + test/test_vectors/generators/00_transforms.py | 34 +++ 59 files changed, 973 insertions(+), 359 deletions(-) delete mode 100644 docs_input/api/math/complex/norm.rst create mode 100644 include/matx/operators/norm.h create mode 100644 include/matx/transforms/norm.h create mode 100644 test/00_transform/Norm.cu diff --git a/docs_input/api/math/complex/norm.rst b/docs_input/api/math/complex/norm.rst deleted file mode 100644 index e34ebfb6..00000000 --- a/docs_input/api/math/complex/norm.rst +++ /dev/null @@ -1,18 +0,0 @@ -.. _norm_func: - -norm -==== - -Square of the magnitude of a complex number - -.. doxygenfunction:: matx::norm(Op t) - -Examples -~~~~~~~~ - -.. 
literalinclude:: ../../../../test/00_operators/OperatorTests.cu - :language: cpp - :start-after: example-begin norm-test-1 - :end-before: example-end norm-test-1 - :dedent: - diff --git a/docs_input/notebooks/04_radar_pipeline.ipynb b/docs_input/notebooks/04_radar_pipeline.ipynb index eeb84096..622f73e4 100644 --- a/docs_input/notebooks/04_radar_pipeline.ipynb +++ b/docs_input/notebooks/04_radar_pipeline.ipynb @@ -133,7 +133,7 @@ "In this case we're applying a Hamming window to our partial waveform view. `hamming` is a generator function that generates Hamming window values at each point defined in the tensor. Next, we compute the L2 norm of the partial waveform. The L2 norm is done in two steps currently: an I^2 + Q^2 reduction, followed by a square root on the output of the reduction:\n", "\n", "```c++\n", - " sum(norms, norm(waveformPart), stream);\n", + " sum(norms, abs2(waveformPart), stream);\n", " exec(norms, sqrt(norms), stream);\n", "```\n", "\n", @@ -245,10 +245,10 @@ "## CFAR Detection\n", "The last step in the pipeline is the constant false alarm rate (CFAR) detection. CFAR detection is broadly used to filter observible signals from noise by setting a threshold for observation. A filter mask was created in the constructor to represent the \"field of view\" that we are looking for a target in. By describing the field of view, we can differentiate what parts of the signal we believe are signal power and noise power. \n", "\n", - "CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `norm` operator:\n", + "CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `abs2` operator:\n", "\n", "```c++\n", - "exec(xdPow, norm(cfarIn), stream);\n", + "exec(xdPow, abs2(cfarIn), stream);\n", "```\n", "\n", "xdPow now contains the sum of the squares of each element. Using the computed power per cell, we apply the CFAR mask that was computed in the constructor. 
The mask is applied using a 2D convolution from the MatX `conv2d` function:\n", diff --git a/examples/fft_conv.cu b/examples/fft_conv.cu index 9afb0a0f..00198bd5 100644 --- a/examples/fft_conv.cu +++ b/examples/fft_conv.cu @@ -172,4 +172,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) CUDA_CHECK_LAST_ERROR(); MATX_EXIT_HANDLER(); -} +} \ No newline at end of file diff --git a/examples/simple_radar_pipeline.h b/examples/simple_radar_pipeline.h index de08ffd5..ecaaa08c 100644 --- a/examples/simple_radar_pipeline.h +++ b/examples/simple_radar_pipeline.h @@ -253,7 +253,7 @@ class RadarPipeline { (waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec); // compute L2 norm - (norms = sum(norm(waveformPart))).run(exec); + (norms = sum(abs2(waveformPart))).run(exec); (norms = sqrt(norms)).run(exec); (waveformPart = waveformPart / norms).run(exec); @@ -358,7 +358,7 @@ class RadarPipeline { */ void CFARDetections() { - (xPow = norm(tpcView)).run(exec); + (xPow = abs2(tpcView)).run(exec); // Estimate the background average power in each cell // background_averages = conv2(Xpow, mask, 'same') ./ norm; diff --git a/include/matx/core/half_complex.h b/include/matx/core/half_complex.h index a5f0383a..b074ddec 100644 --- a/include/matx/core/half_complex.h +++ b/include/matx/core/half_complex.h @@ -740,14 +740,14 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T atan2(const T &x, const T &y) } /** - * @brief Norm operator + * @brief Squared absolute value operator * * @tparam T Underlying type * @param x Value of input * @return Result of operation */ template -__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T norm(const matxHalfComplex &x) +__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T abs2(const matxHalfComplex &x) { if (isinf(x.real())) return static_cast(cuda::std::abs(static_cast(x.real()))); diff --git a/include/matx/core/operator_utils.h b/include/matx/core/operator_utils.h index 5dfbc4d0..da4afa1b 100644 --- a/include/matx/core/operator_utils.h +++ b/include/matx/core/operator_utils.h @@ -42,7 +42,7 @@ namespace matx { __MATX_HOST__ __MATX_INLINE__ auto ReduceOutput(Func &&func, OutputOp &&out, InputOp &&in, BeginIter &&bi, EndIter &&ei) { if constexpr (remove_cvref_t::Rank() <= 1 && is_tensor_view_v) { if (out.IsContiguous()) { - if constexpr(ConvertType) { + if constexpr(ConvertType) { return func( in, reinterpret_cast::scalar_type> *>(out.Data()), bi, @@ -64,7 +64,7 @@ namespace matx { template __MATX_HOST__ __MATX_INLINE__ auto ReduceInput(Func &&func, OutputOp &&out, InputOp &&in) { - typename detail::base_type_t in_base = in; + typename detail::base_type_t in_base = in; if constexpr (in_base.Rank() < 2 && is_tensor_view_v) { if (in_base.IsContiguous()) { if constexpr (ConvertType) { @@ -89,8 +89,6 @@ namespace matx { auto collapsed = matx::lcollapse::Rank()>(rcollapse::Rank() - remove_cvref_t::Rank()>(in_base)); const auto &iter = matx::RandomOperatorIterator{collapsed}; - - return ReduceOutput(std::forward(func), std::forward(out), iter, BeginOffset{iter}, EndOffset{iter}); } @@ -116,4 +114,21 @@ namespace matx { return shape; } + + namespace detail { + // Used inside of transforms to allocate temporary output + template + __MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::scalar_type **ptr) { + const auto ttl_size = std::accumulate(shape.begin(), shape.end(), static_cast(1), + std::multiplies()) * sizeof(*ptr); + if constexpr (is_cuda_executor_v) { + matxAlloc((void**)ptr, 
ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); + make_tensor(tensor, *ptr, shape); + } + else { + matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY); + make_tensor(tensor, *ptr, shape); + } + } + } }; diff --git a/include/matx/core/pybind.h b/include/matx/core/pybind.h index 0a42d6b2..2eb6b352 100644 --- a/include/matx/core/pybind.h +++ b/include/matx/core/pybind.h @@ -336,35 +336,40 @@ class MatXPybind { using ntype = matx_convert_complex_type; auto ften = pybind11::array_t(np_ten); - for (index_t s1 = 0; s1 < ten.Size(0); s1++) { - if constexpr (RANK > 1) { - for (index_t s2 = 0; s2 < ten.Size(1); s2++) { - if constexpr (RANK > 2) { - for (index_t s3 = 0; s3 < ten.Size(2); s3++) { - if constexpr (RANK > 3) { - for (index_t s4 = 0; s4 < ten.Size(3); s4++) { - if constexpr (RANK > 4) { - for (index_t s5 = 0; s5 < ten.Size(4); s5++) { - ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5)); + if constexpr (RANK == 0) { + ten() = ConvertComplex(ften.at()); + } + else { + for (index_t s1 = 0; s1 < ten.Size(0); s1++) { + if constexpr (RANK > 1) { + for (index_t s2 = 0; s2 < ten.Size(1); s2++) { + if constexpr (RANK > 2) { + for (index_t s3 = 0; s3 < ten.Size(2); s3++) { + if constexpr (RANK > 3) { + for (index_t s4 = 0; s4 < ten.Size(3); s4++) { + if constexpr (RANK > 4) { + for (index_t s5 = 0; s5 < ten.Size(4); s5++) { + ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5)); + } + } + else { + ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4)); } - } - else { - ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4)); } } - } - else { - ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3)); + else { + ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3)); + } } } - } - else { - ten(s1, s2) = ConvertComplex(ften.at(s1, s2)); + else { + ten(s1, s2) = ConvertComplex(ften.at(s1, s2)); + } } } - } - else { - ten(s1) = ConvertComplex(ften.at(s1)); + else { + ten(s1) = ConvertComplex(ften.at(s1)); + } } } } diff --git a/include/matx/core/tensor_impl.h b/include/matx/core/tensor_impl.h index e3bae4d1..5b6f3dea 100644 --- a/include/matx/core/tensor_impl.h +++ b/include/matx/core/tensor_impl.h @@ -78,6 +78,7 @@ class tensor_impl_t { using shape_type = typename Desc::shape_type; using stride_type = typename Desc::stride_type; using matxoplvalue = bool; + using self_type = tensor_impl_t; // Type specifier for signaling this is a matx operation using matxop = bool; @@ -231,6 +232,12 @@ class tensor_impl_t { { } + __MATX_HOST__ void Shallow(const self_type &rhs) noexcept + { + ldata_ = rhs.ldata_; + desc_ = rhs.desc_; + } + /** * Lazy assignment operator=. 
Used to create a "set" object for deferred * execution on a device @@ -811,7 +818,7 @@ class tensor_impl_t { * * @return data pointer */ - auto Data() const noexcept { + __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Data() const noexcept { return ldata_; } diff --git a/include/matx/core/tie.h b/include/matx/core/tie.h index c7ca3566..fdb2d0d9 100644 --- a/include/matx/core/tie.h +++ b/include/matx/core/tie.h @@ -94,6 +94,11 @@ struct mtie : public BaseOp>{ template __MATX_INLINE__ void Exec(Executor &&ex) { + // Run the PreRun on the inner type to avoid allocation but allow transforms using MatX operators + // to do any setup needed + if constexpr (sizeof...(Ts) == 2) { + cuda::std::get(ts_).InnerPreRun(NoShape{}, std::forward(ex)); + } cuda::std::get(ts_).Exec(ts_, std::forward(ex)); } diff --git a/include/matx/operators/all.h b/include/matx/operators/all.h index 0a59dc55..0407f53f 100644 --- a/include/matx/operators/all.h +++ b/include/matx/operators/all.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,29 +81,30 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } - template - __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept - { - if constexpr (is_matx_op()) { - a_.PostRun(std::forward(shape), std::forward(ex)); - } - } + template + __MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept + { + if constexpr (is_matx_op()) { + a_.PostRun(std::forward(shape), std::forward(ex)); + } + } constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const { diff --git a/include/matx/operators/ambgfun.h b/include/matx/operators/ambgfun.h index e9531857..551ab329 100644 --- a/include/matx/operators/ambgfun.h +++ b/include/matx/operators/ambgfun.h @@ -50,7 +50,8 @@ namespace matx AMBGFunCutType_t cut_; float cut_val_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, 2> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -111,19 +112,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { x_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { y_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - 
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/any.h b/include/matx/operators/any.h index 2c3d469d..a2600346 100644 --- a/include/matx/operators/any.h +++ b/include/matx/operators/any.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/cgsolve.h b/include/matx/operators/cgsolve.h index 1dea4f7c..fd7ba3cc 100644 --- a/include/matx/operators/cgsolve.h +++ b/include/matx/operators/cgsolve.h @@ -49,7 +49,8 @@ namespace matx double tol_; int max_iters_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable typename OpA::scalar_type *ptr; public: using matxop = bool; @@ -92,7 +93,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -100,11 +101,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/channelize_poly.h b/include/matx/operators/channelize_poly.h index 69255442..d9e81301 100644 --- a/include/matx/operators/channelize_poly.h +++ b/include/matx/operators/channelize_poly.h @@ -55,7 +55,8 @@ namespace detail { index_t num_channels_; index_t decimation_factor_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -95,7 +96,7 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor 
&&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -103,11 +104,15 @@ namespace detail { if constexpr (is_matx_op()) { f_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/chol.h b/include/matx/operators/chol.h index 99afee14..82efde6c 100644 --- a/include/matx/operators/chol.h +++ b/include/matx/operators/chol.h @@ -73,11 +73,17 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } + + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); if constexpr (is_cuda_executor_v) { make_tensor(tmp_out_, a_.Shape(), MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); diff --git a/include/matx/operators/conv.h b/include/matx/operators/conv.h index 800c69c3..e9833eb0 100644 --- a/include/matx/operators/conv.h +++ b/include/matx/operators/conv.h @@ -53,7 +53,8 @@ namespace matx matxConvCorrMethod_t method_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; static constexpr int MAX_MIN_DIMENSION_DIRECT = 1024; @@ -160,19 +161,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -237,7 +242,8 @@ namespace detail { matxConvCorrMode_t mode_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -330,7 +336,7 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -338,11 +344,15 @@ namespace detail { if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if 
constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/corr.h b/include/matx/operators/corr.h index 36a42038..224e65bd 100644 --- a/include/matx/operators/corr.h +++ b/include/matx/operators/corr.h @@ -53,7 +53,8 @@ namespace matx matxConvCorrMethod_t method_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -146,7 +147,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -154,11 +155,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/cov.h b/include/matx/operators/cov.h index e0ecb314..a2f00cef 100644 --- a/include/matx/operators/cov.h +++ b/include/matx/operators/cov.h @@ -46,7 +46,8 @@ namespace matx private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -88,15 +89,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/cumsum.h b/include/matx/operators/cumsum.h index 6a672be7..a872e763 100644 --- a/include/matx/operators/cumsum.h +++ b/include/matx/operators/cumsum.h @@ -48,7 +48,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -79,15 +80,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] 
ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/det.h b/include/matx/operators/det.h index 20f85ece..a7441174 100644 --- a/include/matx/operators/det.h +++ b/include/matx/operators/det.h @@ -44,7 +44,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -72,18 +73,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, a_.Shape(), MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, a_.Shape(), MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), a_.Shape(), &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/eig.h b/include/matx/operators/eig.h index 33b3d6d3..7cf9cf0f 100644 --- a/include/matx/operators/eig.h +++ b/include/matx/operators/eig.h @@ -49,7 +49,6 @@ namespace detail { OpA a_; cusolverEigMode_t jobz_; cublasFillMode_t uplo_; - matx::tensor_t tmp_out_; public: using matxop = bool; diff --git a/include/matx/operators/einsum.h b/include/matx/operators/einsum.h index f91406f3..982fcfdb 100644 --- a/include/matx/operators/einsum.h +++ b/include/matx/operators/einsum.h @@ -78,6 +78,12 @@ namespace detail { return matxNoRank; } + template + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept + { + // Maybe do something here later if we take operators as input + } + template __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { diff --git a/include/matx/operators/fft.h b/include/matx/operators/fft.h index 1c86540e..89cd0cbc 100644 --- a/include/matx/operators/fft.h +++ b/include/matx/operators/fft.h @@ -53,9 +53,12 @@ namespace matx FFTType type_; FFTNorm norm_; cuda::std::array out_dims_; - mutable matx::tensor_t, + using ttype = std::conditional_t, typename OpA::scalar_type, - typename scalar_to_complex::ctype>, OpA::Rank()> tmp_out_; + typename scalar_to_complex::ctype>; + // This should be tensor_impl_t, but need to work around issues with temp types returned in fft + mutable matx::tensor_t tmp_out_; + mutable ttype *ptr; public: using matxop = bool; @@ -161,18 +164,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, 
Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else if constexpr (is_host_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MALLOC_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -299,9 +303,12 @@ namespace matx FFTType type_; FFTNorm norm_; cuda::std::array out_dims_; - mutable matx::tensor_t, + using ttype = std::conditional_t, typename OpA::scalar_type, - typename scalar_to_complex::ctype>, OpA::Rank()> tmp_out_; + typename scalar_to_complex::ctype>; + // This should be tensor_impl_t, but need to work around issues with temp types returned in fft + mutable matx::tensor_t tmp_out_; + mutable ttype *ptr; public: using matxop = bool; @@ -368,18 +375,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else if constexpr (is_host_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MALLOC_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/filter.h b/include/matx/operators/filter.h index f20ca203..1c2f0041 100644 --- a/include/matx/operators/filter.h +++ b/include/matx/operators/filter.h @@ -50,7 +50,8 @@ namespace detail { cuda::std::array h_rec_; cuda::std::array h_nonrec_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -87,15 +88,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/hist.h b/include/matx/operators/hist.h index a0c5ca87..6060b61d 100644 --- a/include/matx/operators/hist.h +++ b/include/matx/operators/hist.h @@ -50,7 +50,8 @@ namespace detail { typename OpA::scalar_type lower_; typename OpA::scalar_type upper_; cuda::std::array out_dims_; - 
mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable int *ptr; public: using matxop = bool; @@ -83,15 +84,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/inverse.h b/include/matx/operators/inverse.h index da463621..21c3fb56 100644 --- a/include/matx/operators/inverse.h +++ b/include/matx/operators/inverse.h @@ -46,7 +46,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -79,18 +80,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } - - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, a_.Shape(), MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, a_.Shape(), MATX_HOST_MEMORY); - } + } + } + + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), a_.Shape(), &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/lu.h b/include/matx/operators/lu.h index aba5b0dd..16dd5146 100644 --- a/include/matx/operators/lu.h +++ b/include/matx/operators/lu.h @@ -44,7 +44,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; diff --git a/include/matx/operators/matmul.h b/include/matx/operators/matmul.h index 185e9eec..1ea6cce4 100644 --- a/include/matx/operators/matmul.h +++ b/include/matx/operators/matmul.h @@ -51,7 +51,8 @@ namespace matx PermDims perm_; static constexpr int out_rank = cuda::std::max(OpA::Rank(), OpB::Rank()); cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, out_rank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -116,19 +117,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { 
- make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/matvec.h b/include/matx/operators/matvec.h index 16bdcc4e..15fe1fa3 100644 --- a/include/matx/operators/matvec.h +++ b/include/matx/operators/matvec.h @@ -50,7 +50,8 @@ namespace matx float beta_; static constexpr int RANK = remove_cvref_t::Rank(); cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, RANK> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -93,7 +94,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -101,11 +102,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/max.h b/include/matx/operators/max.h index 0792861f..69aee65c 100644 --- a/include/matx/operators/max.h +++ b/include/matx/operators/max.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -131,7 +133,7 @@ namespace detail { template __MATX_INLINE__ auto max(const InType &in, const int (&dims)[D]) { - static_assert(D < InType::Rank(), "reduction dimensions must be <= Rank of input"); + static_assert(D <= InType::Rank(), "reduction dimensions must be <= Rank of input"); auto perm = detail::getPermuteDims(dims); auto permop = permute(in, perm); @@ -142,7 +144,7 @@ template [[deprecated("Use max() instead of rmax() for reductions")]] __MATX_INLINE__ auto rmax(const InType &in, const int (&dims)[D]) 
{ - static_assert(D < InType::Rank(), "reduction dimensions must be <= Rank of input"); + static_assert(D <= InType::Rank(), "reduction dimensions must be <= Rank of input"); auto perm = detail::getPermuteDims(dims); auto permop = permute(in, perm); diff --git a/include/matx/operators/mean.h b/include/matx/operators/mean.h index 37d1e73e..2cf22955 100644 --- a/include/matx/operators/mean.h +++ b/include/matx/operators/mean.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/median.h b/include/matx/operators/median.h index 866cb7fc..cce0a467 100644 --- a/include/matx/operators/median.h +++ b/include/matx/operators/median.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/min.h b/include/matx/operators/min.h index aad175ce..71049914 100644 --- a/include/matx/operators/min.h +++ b/include/matx/operators/min.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if 
constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/norm.h b/include/matx/operators/norm.h new file mode 100644 index 00000000..546584ba --- /dev/null +++ b/include/matx/operators/norm.h @@ -0,0 +1,201 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#pragma once + + +#include "matx/core/type_utils.h" +#include "matx/operators/base_operator.h" +#include "matx/transforms/norm.h" + +namespace matx +{ + namespace detail { + template + class NormOp : public BaseOp> + { + private: + using out_type = typename inner_op_type_t::scalar_type>::type; + OpA a_; + NormOrder order_; + static constexpr int ORank = std::is_same_v ? 
OpA::Rank() - 1 : OpA::Rank() - 2; + cuda::std::array out_dims_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; + + public: + using matxop = bool; + using scalar_type = out_type; + using matx_transform_op = bool; + using norm_xform_op = bool; + using matx_inner_op_impl = bool; // Indicates this operator uses matx operators for its implementation + + __MATX_INLINE__ std::string str() const { + if constexpr (std::is_same_v) { + return "vector_norm()"; + } + else { + return "matrix_norm"; + } + } + __MATX_INLINE__ NormOp(const OpA &op, NormOrder order) : a_(op), order_(order) { + if constexpr (std::is_same_v) { + MATX_ASSERT_STR(order == NormOrder::NONE || order == NormOrder::L1 || order == NormOrder::L2, matxInvalidParameter, + "Invalid norm order used for vector mode"); + } + + for (int r = 0; r < ORank; r++) { + out_dims_[r] = a_.Size(r); + } + } + + + template + __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... indices) const { + return tmp_out_(indices...); + }; + + template + void Exec(Out &&out, Executor &&ex) const { + norm_impl(cuda::std::get<0>(out), a_, order_, ex); + } + + static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() + { + return ORank; + } + + template + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { + if constexpr (is_matx_op()) { + a_.PreRun(std::forward(shape), std::forward(ex)); + } + } + + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); + + Exec(std::make_tuple(tmp_out_), std::forward(ex)); + } + + template + __MATX_INLINE__ void PostRun(ShapeType &&shape, Executor &&ex) const noexcept + { + if constexpr (is_matx_op()) { + a_.PostRun(std::forward(shape), std::forward(ex)); + } + } + + constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const + { + return out_dims_[dim]; + } + }; + } + + + /** + * @brief Compute a vector norm + * + * Computes various types of matrix and vector norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto vector_norm(const Op &op, + NormOrder order = NormOrder::NONE) { + return detail::NormOp(op, order); + } + + + /** + * @brief Compute a vector norm + * + * Computes various types of vector norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param dims Dimensions to perform norm over + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto vector_norm(const Op &op, const int (&dims)[D], + NormOrder order = NormOrder::NONE) { + auto perm = detail::getPermuteDims(dims); + auto permop = permute(op, perm); + return detail::NormOp(permop, order); + } + + /** + * @brief Compute a matrix norm + * + * Computes various types of matrix and matrix norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto matrix_norm(const Op &op, + NormOrder order = NormOrder::NONE) { + return detail::NormOp(op, order); + } + + + /** + * @brief Compute a matrix norm + * + 
* Computes various types of matrix norms based on the order + * + * @tparam Op Type of input values to evaluate + * @param op Input values to evaluate + * @param dims Dimensions to perform norm over + * @param order Order of norm + * @return norm operator + */ + template + __MATX_INLINE__ auto matrix_norm(const Op &op, const int (&dims)[D], + NormOrder order = NormOrder::NONE) { + auto perm = detail::getPermuteDims(dims); + auto permop = permute(op, perm); + return detail::NormOp(permop, order); + } +} // end namespace matx diff --git a/include/matx/operators/operators.h b/include/matx/operators/operators.h index 5a249d74..52482ed9 100644 --- a/include/matx/operators/operators.h +++ b/include/matx/operators/operators.h @@ -76,6 +76,7 @@ #include "matx/operators/lu.h" #include "matx/operators/matmul.h" #include "matx/operators/matvec.h" +#include "matx/operators/norm.h" #include "matx/operators/outer.h" #include "matx/operators/overlap.h" #include "matx/operators/percentile.h" diff --git a/include/matx/operators/outer.h b/include/matx/operators/outer.h index 9e412275..3ef2d9f4 100644 --- a/include/matx/operators/outer.h +++ b/include/matx/operators/outer.h @@ -50,7 +50,8 @@ namespace matx float beta_; static constexpr int RANK = cuda::std::max(remove_cvref_t::Rank(), remove_cvref_t::Rank()) + 1; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, RANK> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -102,7 +103,7 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); @@ -110,11 +111,15 @@ namespace matx if constexpr (is_matx_op()) { b_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/percentile.h b/include/matx/operators/percentile.h index 58edf5a3..c404850b 100644 --- a/include/matx/operators/percentile.h +++ b/include/matx/operators/percentile.h @@ -49,7 +49,8 @@ namespace detail { uint32_t q_; PercentileMethod method_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + 
InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/prod.h b/include/matx/operators/prod.h index a23fdff5..df08bc11 100644 --- a/include/matx/operators/prod.h +++ b/include/matx/operators/prod.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -77,21 +78,22 @@ namespace detail { static __MATX_INLINE__ constexpr __MATX_HOST__ __MATX_DEVICE__ int32_t Rank() { return ORank; - } + } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/pwelch.h b/include/matx/operators/pwelch.h index a4b4ce28..a255fc8b 100644 --- a/include/matx/operators/pwelch.h +++ b/include/matx/operators/pwelch.h @@ -51,7 +51,8 @@ namespace matx index_t noverlap_; index_t nfft_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, 1> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -95,19 +96,23 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { - x_.PreRun(Shape(x_), std::forward(ex)); - } + x_.PreRun(std::forward(shape), std::forward(ex)); + } if constexpr (is_matx_op()) { w_.PreRun(Shape(w_), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/reduce.h b/include/matx/operators/reduce.h index 53f73f42..b3c0b8dc 100644 --- a/include/matx/operators/reduce.h +++ b/include/matx/operators/reduce.h @@ -50,7 +50,8 @@ namespace matx ReductionOp reduction_op_; bool init_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -97,15 +98,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void 
InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/resample_poly.h b/include/matx/operators/resample_poly.h index ff101b81..7fe63ef6 100644 --- a/include/matx/operators/resample_poly.h +++ b/include/matx/operators/resample_poly.h @@ -53,7 +53,8 @@ namespace detail { index_t up_; index_t down_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t tmp_out_; + mutable out_t *ptr; public: using matxop = bool; @@ -92,20 +93,24 @@ namespace detail { return OpA::Rank(); } + template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept - { + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } if constexpr (is_matx_op()) { f_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/scalar_ops.h b/include/matx/operators/scalar_ops.h index 60ffa838..310623ce 100644 --- a/include/matx/operators/scalar_ops.h +++ b/include/matx/operators/scalar_ops.h @@ -202,7 +202,7 @@ MATX_UNARY_OP_GEN(log10, Log10); MATX_UNARY_OP_GEN(log2, Log2); MATX_UNARY_OP_GEN(log, Log); MATX_UNARY_OP_GEN(abs, Abs); -MATX_UNARY_OP_GEN(norm, Norm); + // Trigonometric functions template static __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto _internal_sin(T v1) diff --git a/include/matx/operators/softmax.h b/include/matx/operators/softmax.h index 179e65bc..f8617233 100644 --- a/include/matx/operators/softmax.h +++ b/include/matx/operators/softmax.h @@ -47,7 +47,8 @@ namespace matx OpA a_; PermDims perm_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -94,15 +95,19 @@ namespace matx } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); 
Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/sort.h b/include/matx/operators/sort.h index 0728e6f9..5817cab4 100644 --- a/include/matx/operators/sort.h +++ b/include/matx/operators/sort.h @@ -49,7 +49,8 @@ namespace detail { OpA a_; SortDirection_t dir_; cuda::std::array out_dims_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, OpA::Rank()> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/stdd.h b/include/matx/operators/stdd.h index 50161bb1..94c6faff 100644 --- a/include/matx/operators/stdd.h +++ b/include/matx/operators/stdd.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -61,7 +62,7 @@ namespace detail { __MATX_INLINE__ StddOp(OpA a) : a_(a) { for (int r = 0; r < ORank; r++) { out_dims_[r] = a_.Size(r); - } + } }; template @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } @@ -131,7 +133,7 @@ namespace detail { template __MATX_INLINE__ auto stdd(const InType &in, const int (&dims)[D]) { - static_assert(D < InType::Rank(), "reduction dimensions must be <= Rank of input"); + static_assert(D <= InType::Rank(), "reduction dimensions must be <= Rank of input"); auto perm = detail::getPermuteDims(dims); auto permop = permute(in, perm); diff --git a/include/matx/operators/sum.h b/include/matx/operators/sum.h index 387b3735..36e32ad2 100644 --- a/include/matx/operators/sum.h +++ b/include/matx/operators/sum.h @@ -49,7 +49,8 @@ namespace detail { private: OpA a_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable 
typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -58,7 +59,7 @@ namespace detail { using sum_xform_op = bool; __MATX_INLINE__ std::string str() const { return "sum(" + get_type_str(a_) + ")"; } - __MATX_INLINE__ SumOp(OpA a) : a_(a) { + __MATX_INLINE__ SumOp(const OpA &a) : a_(a) { for (int r = 0; r < ORank; r++) { out_dims_[r] = a_.Size(r); } @@ -80,18 +81,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/trace.h b/include/matx/operators/trace.h index 864f62c5..53da7669 100644 --- a/include/matx/operators/trace.h +++ b/include/matx/operators/trace.h @@ -47,7 +47,8 @@ namespace detail { { private: OpA a_; - mutable matx::tensor_t tmp_out_; + mutable detail::tensor_impl_t::scalar_type, 0> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -75,18 +76,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), {}, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/operators/unary_operators.h b/include/matx/operators/unary_operators.h index ce8c86cf..5f3c4d56 100644 --- a/include/matx/operators/unary_operators.h +++ b/include/matx/operators/unary_operators.h @@ -175,13 +175,6 @@ namespace matx */ Op conj(Op t) {} - /** - * Compute the squared magnitude of every element in the tensor - * @param t - * Tensor or operator input - */ - Op norm(Op t) {} - /** * Compute absolute value of every element in the tensor. 
For complex numbers * this returns the magnitude, or sqrt(x^2+y^2) @@ -387,7 +380,6 @@ namespace matx } } #endif - DEFINE_UNARY_OP(norm, detail::NormOp); DEFINE_UNARY_OP(abs, detail::AbsOp); DEFINE_UNARY_OP(abs2, detail::Abs2Op); DEFINE_UNARY_OP(sin, detail::SinOp); diff --git a/include/matx/operators/var.h b/include/matx/operators/var.h index 04e3abbd..e1771ddd 100644 --- a/include/matx/operators/var.h +++ b/include/matx/operators/var.h @@ -50,7 +50,8 @@ namespace detail { OpA a_; int ddof_; cuda::std::array out_dims_; - mutable matx::tensor_t::scalar_type, ORank> tmp_out_; + mutable detail::tensor_impl_t::scalar_type, ORank> tmp_out_; + mutable typename remove_cvref_t::scalar_type *ptr; public: using matxop = bool; @@ -81,18 +82,19 @@ namespace detail { } template - __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + __MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept { if constexpr (is_matx_op()) { a_.PreRun(std::forward(shape), std::forward(ex)); - } + } + } - if constexpr (is_cuda_executor_v) { - make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream()); - } - else { - make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY); - } + template + __MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept + { + InnerPreRun(std::forward(shape), std::forward(ex)); + + detail::AllocateTempTensor(tmp_out_, std::forward(ex), out_dims_, &ptr); Exec(cuda::std::make_tuple(tmp_out_), std::forward(ex)); } diff --git a/include/matx/transforms/ambgfun.h b/include/matx/transforms/ambgfun.h index f83007bb..6c45e040 100644 --- a/include/matx/transforms/ambgfun.h +++ b/include/matx/transforms/ambgfun.h @@ -177,7 +177,7 @@ void ambgfun_impl(AMFTensor &amf, XTensor &x, auto x_normdiv_v = make_tensor(x.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream); auto x_norm_v = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); - (x_norm_v = sum(norm(x))).run(stream); + (x_norm_v = sum(abs2(x))).run(stream); (x_norm_v = sqrt(x_norm_v)).run(stream); (x_normdiv_v = x / x_norm_v).run(stream); @@ -188,7 +188,7 @@ void ambgfun_impl(AMFTensor &amf, XTensor &x, y_normdiv_v.Shallow(make_tensor(y_normdiv_v.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream)); auto y_norm_v = make_tensor({}, MATX_ASYNC_DEVICE_MEMORY, stream); - (y_norm_v = sum(norm(ry))).run(stream); + (y_norm_v = sum(abs2(ry))).run(stream); (y_normdiv_v = ry / y_norm_v).run(stream); } diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h index 7f7e6fa8..bb8f5412 100644 --- a/include/matx/transforms/cub.h +++ b/include/matx/transforms/cub.h @@ -870,7 +870,9 @@ inline void ExecSort(OutputTensor &a_out, auto ft = [&](auto &&in, auto &&out, [[maybe_unused]] auto &&unused1, [[maybe_unused]] auto &&unused2) { return cub::DeviceReduce::Max(d_temp, temp_storage_bytes, in, out, static_cast(TotalSize(in_base)), stream); }; + auto rv = ReduceInput(ft, out_base, in_base); + MATX_ASSERT_STR_EXP(rv, cudaSuccess, matxCudaError, "Error in cub::DeviceReduce::Max"); } #endif diff --git a/include/matx/transforms/norm.h b/include/matx/transforms/norm.h new file mode 100644 index 00000000..291eaa97 --- /dev/null +++ b/include/matx/transforms/norm.h @@ -0,0 +1,91 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. 
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// 1. Redistributions of source code must retain the above copyright notice, this
+//    list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright notice,
+//    this list of conditions and the following disclaimer in the documentation
+//    and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the copyright holder nor the names of its
+//    contributors may be used to endorse or promote products derived from
+//    this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+/////////////////////////////////////////////////////////////////////////////////
+
+
+#pragma once
+
+#include
+#include
+#include
+
+#include "matx/core/error.h"
+#include "matx/core/nvtx.h"
+#include "matx/core/tensor.h"
+
+namespace matx {
+
+enum class NormOrder {
+  NONE,
+  L1,
+  L2,
+  FROB
+};
+
+namespace detail {
+  struct NormTypeVector{};
+  struct NormTypeMatrix{};
+};
+
+
+template <typename NormType, typename OutputOp, typename InputOp, typename Executor>
+__MATX_INLINE__ void norm_impl(OutputOp out, const InputOp &in,
+                               NormOrder order, Executor &&exec)
+{
+  MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
+
+  if constexpr (std::is_same_v<NormType, detail::NormTypeVector>) {
+    if (order == NormOrder::NONE || order == NormOrder::L2) {
+      (out = sqrt(sum(abs2(in), {InputOp::Rank() - 1}))).run(exec);
+    }
+    else if (order == NormOrder::L1) {
+      (out = sum(abs(in), {InputOp::Rank() - 1})).run(exec);
+    }
+    else {
+      MATX_ASSERT_STR(false, matxInvalidParameter, "Invalid order type for vector norm");
+    }
+  }
+  else {
+    if (order == NormOrder::NONE || order == NormOrder::FROB) {
+      (out = sqrt(sum(abs2(in), {InputOp::Rank() - 2, InputOp::Rank() - 1}))).run(exec);
+    }
+    else if (order == NormOrder::L1) {
+      (out = max(sum(abs(in), {InputOp::Rank() - 2}), {InputOp::Rank() - 2})).run(exec);
+    }
+    else {
+      MATX_ASSERT_STR(false, matxInvalidParameter, "Invalid order type for matrix norm");
+    }
+  }
+}
+
+
+
+} // end namespace matx
diff --git a/include/matx/transforms/qr.h b/include/matx/transforms/qr.h
index 891610a5..fe3a3de8 100644
--- a/include/matx/transforms/qr.h
+++ b/include/matx/transforms/qr.h
@@ -154,14 +154,14 @@ namespace detail {
       (xz = (index(x.Rank()-1) >= i) * x).run(stream);
       // compute L2 norm without sqrt.
- (N = sum(norm(xz))).run(stream); + (N = sum(abs2(xz))).run(stream); //(N = sqrt(N)).run(stream); // sqrt folded into next op (v = xz + (index(v.Rank()-1) == i) * sign(xz) * sqrt(nc)).run(stream); auto r = x; // alias column of R happens to be the same as x - (s = sum(norm(v))).run(stream); + (s = sum(abs2(v))).run(stream); //(s = sqrt(s)).run(stream); // sqrt folded into next op // IFELSE to avoid nans when dividing by zero diff --git a/include/matx/transforms/reduce.h b/include/matx/transforms/reduce.h index c244fe43..f7d7df7a 100644 --- a/include/matx/transforms/reduce.h +++ b/include/matx/transforms/reduce.h @@ -1897,7 +1897,8 @@ void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] H } else { for (index_t b = 0; b < lin.Size(0); b++) { - *(lout + b) = std::accumulate(lin + lbegin[b], lin + lend[b], static_cast(0)); + auto f = std::accumulate(lin + lbegin[b], lin + lend[b], static_cast(0)); + *(lout + b) = f; } } }; diff --git a/include/matx/transforms/svd.h b/include/matx/transforms/svd.h index 864f824c..69d03e27 100644 --- a/include/matx/transforms/svd.h +++ b/include/matx/transforms/svd.h @@ -185,10 +185,10 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat // normalize x at each iteration to avoid instability // first compute sum of squares, norm will work for complex and real #if 0 - sum(s, norm(x), stream); + sum(s, abs2(x), stream); #else //WAR cub not supporting strided output - (sums = sum(norm(x))).run(stream); + (sums = sum(abs2(x))).run(stream); (s = sums).run(stream); #endif @@ -242,10 +242,10 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat // compute singular value as L2 norm of v // first compute sum of squares, norm will work for complex and real #if 0 - sum(s, norm(v), stream); + sum(s, abs2(v), stream); #else //WAR cub not supporting strided output - (sums = sum(norm(v))).run(stream); + (sums = sum(abs2(v))).run(stream); (s = sums).run(stream);; #endif (s = sqrt(s)).run(stream); @@ -269,10 +269,10 @@ void svdpi_impl(UType &U, SType &S, VTType &VT, AType &A, X0Type &x0, int iterat // compute singular value as L2 norm of v // first compute sum of squares, norm will work for complex and real #if 0 - sum(s, norm(u), stream); + sum(s, abs2(u), stream); #else //WAR cub not supporting strided output - (sums = sum(norm(u))).run(stream); + (sums = sum(abs2(u))).run(stream); (s = sums).run(stream);; #endif (s = sqrt(s)).run(stream); @@ -445,7 +445,7 @@ inline void svdbpi_impl(UType &U, SType &S, VTType &VT, const AType &A, int max_ //compute L2(Q-Qold) // sqrt folded into next operation - (l2Norm = sum(norm(Q-Qold))).run(stream); + (l2Norm = sum(abs2(Q-Qold))).run(stream); // compute if all batches have converged if constexpr (RANK > 2) { diff --git a/test/00_operators/OperatorTests.cu b/test/00_operators/OperatorTests.cu index 3afe9c82..4786540a 100644 --- a/test/00_operators/OperatorTests.cu +++ b/test/00_operators/OperatorTests.cu @@ -2203,13 +2203,8 @@ TYPED_TEST(OperatorTestsComplexTypesAllExecs, OperatorFuncs) exec.sync(); EXPECT_TRUE(MatXUtils::MatXTypeCompare(tov0(), detail::_internal_conj(c))); - // abs and norm take a complex and output a floating point value + // abs takes a complex and output a floating point value auto tdd0 = make_tensor({}); - // example-begin norm-test-1 - (tdd0 = norm(tiv0)).run(exec); - // example-end norm-test-1 - exec.sync(); - EXPECT_TRUE(MatXUtils::MatXTypeCompare(tdd0(), detail::_internal_norm(c))); // example-begin abs-test-1 (tdd0 = 
abs(tiv0)).run(exec); diff --git a/test/00_transform/Norm.cu b/test/00_transform/Norm.cu new file mode 100644 index 00000000..f6a9306a --- /dev/null +++ b/test/00_transform/Norm.cu @@ -0,0 +1,147 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2021, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+///////////////////////////////////////////////////////////////////////////////// + +#include "assert.h" +#include "matx.h" +#include "test_types.h" +#include "utilities.h" +#include "gtest/gtest.h" + +using namespace matx; + +constexpr index_t a_len = 16; + +template +class NormTest : public ::testing::Test { + using GTestType = std::tuple_element_t<0, T>; + using GExecType = std::tuple_element_t<1, T>; +protected: + void SetUp() override + { + CheckTestTypeSupport(); + pb = std::make_unique(); + + // Half precision needs a bit more tolerance when compared to fp32 + if constexpr (is_complex_half_v || is_matx_half_v) { + thresh = 0.5f; + } + } + + void TearDown() { pb.reset(); } + GExecType exec{}; + std::unique_ptr pb; + tensor_t in_v{{a_len}}; + tensor_t in_m{{a_len, a_len}}; + tensor_t out_v{{}}; + tensor_t out_m{{}}; + float thresh = 0.01f; +}; + + +template +class NormTestFloatTypes + : public NormTest { +}; + + + +TYPED_TEST_SUITE(NormTestFloatTypes, MatXTypesFloatNonComplexAllExecs); + + +TYPED_TEST(NormTestFloatTypes, VectorL1) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len}); + this->pb->RunTVGenerator("vector_l1"); + this->pb->NumpyToTensorView(this->in_v, "in_v"); + this->pb->NumpyToTensorView(this->out_v, "out_v"); + // example-begin vector-norm-test-1 + (this->out_v = vector_norm(this->in_v, NormOrder::L1)).run(this->exec); + // example-end vector-norm-test-1 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_v, "out_v", this->thresh); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(NormTestFloatTypes, VectorL2) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len}); + this->pb->RunTVGenerator("vector_l2"); + this->pb->NumpyToTensorView(this->in_v, "in_v"); + this->pb->NumpyToTensorView(this->out_v, "out_v"); + // example-begin vector-norm-test-2 + (this->out_v = vector_norm(this->in_v, NormOrder::L2)).run(this->exec); + // example-end vector-norm-test-2 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_v, "out_v", this->thresh); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(NormTestFloatTypes, MatrixL1) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len, a_len}); + this->pb->RunTVGenerator("matrix_l1"); + this->pb->NumpyToTensorView(this->in_m, "in_m"); + this->pb->NumpyToTensorView(this->out_m, "out_m"); + // example-begin matrix-norm-test-1 + (this->out_m = matrix_norm(this->in_m, NormOrder::L1)).run(this->exec); + // example-end matrix-norm-test-1 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_m, "out_m", this->thresh); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(NormTestFloatTypes, MatrixL2) +{ + MATX_ENTER_HANDLER(); + using TestType = std::tuple_element_t<0, TypeParam>; + this->pb->template InitTVGenerator("00_transforms", "norm_operators", {a_len, a_len}); + this->pb->RunTVGenerator("matrix_frob"); + this->pb->NumpyToTensorView(this->in_m, "in_m"); + this->pb->NumpyToTensorView(this->out_m, "out_m"); + // example-begin matrix-norm-test-2 + (this->out_m = matrix_norm(this->in_m, NormOrder::FROB)).run(this->exec); + // example-end matrix-norm-test-2 + + MATX_TEST_ASSERT_COMPARE(this->pb, this->out_m, "out_m", this->thresh); + + MATX_EXIT_HANDLER(); +} + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2ca1b8d7..d1bbf9d0 100644 --- 
a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -17,6 +17,7 @@ set (test_sources
     00_transform/Copy.cu
     00_transform/Cov.cu
     00_transform/FFT.cu
+    00_transform/Norm.cu
    00_transform/ResamplePoly.cu
    00_transform/Solve.cu
    00_solver/Cholesky.cu
diff --git a/test/test_vectors/generators/00_transforms.py b/test/test_vectors/generators/00_transforms.py
index 2782a943..5bc7d236 100755
--- a/test/test_vectors/generators/00_transforms.py
+++ b/test/test_vectors/generators/00_transforms.py
@@ -392,3 +392,37 @@ def run(self) -> Dict[str, np.ndarray]:
             self.res['bc'][b] = np.outer(self.res['ba'][b], self.res['bb'][b])
         return self.res
+
+class norm_operators:
+    def __init__(self, dtype: str, size: List[int]):
+        self.size = size
+        self.dtype = dtype
+        np.random.seed(1234)
+
+    def vector_l2(self) -> Dict[str, np.ndarray]:
+        seq = matx_common.randn_ndarray((self.size[0],), self.dtype)
+        return {
+            'in_v': seq,
+            'out_v': np.linalg.norm(seq, 2)
+        }
+
+    def vector_l1(self) -> Dict[str, np.ndarray]:
+        seq = matx_common.randn_ndarray((self.size[0],), self.dtype)
+        return {
+            'in_v': seq,
+            'out_v': np.linalg.norm(seq, 1)
+        }
+
+    def matrix_frob(self) -> Dict[str, np.ndarray]:
+        seq = matx_common.randn_ndarray((self.size[0],self.size[1]), self.dtype)
+        return {
+            'in_m': seq,
+            'out_m': np.linalg.norm(seq, 'fro')
+        }
+
+    def matrix_l1(self) -> Dict[str, np.ndarray]:
+        seq = matx_common.randn_ndarray((self.size[0],self.size[1]), self.dtype)
+        return {
+            'in_m': seq,
+            'out_m': np.linalg.norm(seq, 1)
+        }
\ No newline at end of file
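
For reference, below is a minimal usage sketch of the API this patch introduces. It is not part of the patch itself: the tensor shapes, values, variable names, and the `main()` harness are illustrative assumptions; only `vector_norm`, `matrix_norm`, `NormOrder`, and the `norm` → `abs2` rename come from the change above.

```c++
// Illustrative sketch only; shapes and values are made up for the example.
#include "matx.h"

using namespace matx;

int main() {
  cudaExecutor exec{};

  auto v  = make_tensor<float>({16});      // input vector
  auto m  = make_tensor<float>({16, 16});  // input matrix
  auto nv = make_tensor<float>({});        // rank-0 output for the vector norm
  auto nm = make_tensor<float>({});        // rank-0 output for the matrix norm

  // ... fill v and m here (e.g. from a file or a previous operator) ...

  // Vector norms: L2 is the default (NormOrder::NONE), L1 is also supported
  (nv = vector_norm(v, NormOrder::L2)).run(exec);
  (nv = vector_norm(v, NormOrder::L1)).run(exec);

  // Matrix norms: Frobenius (default) and L1
  (nm = matrix_norm(m, NormOrder::FROB)).run(exec);
  (nm = matrix_norm(m, NormOrder::L1)).run(exec);

  // The old element-wise norm() is now abs2(): |x|^2 per element, so an
  // I^2 + Q^2 power reduction reads as sum(abs2(x))
  auto c   = make_tensor<cuda::std::complex<float>>({16});
  auto pwr = make_tensor<float>({});
  (pwr = sum(abs2(c))).run(exec);

  exec.sync();
  return 0;
}
```

As the operator added in include/matx/operators/norm.h above shows, `matrix_norm` also has an overload taking a `dims` array so the norm can be computed over a permuted set of dimensions.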