Added vector_norm and matrix_norm
Renamed norm() function to abs2
Fixed several bugs with pointer ownership on default tensors
Fixed bug where PreRun was not executed in some cases
cliffburdick committed Jun 14, 2024
1 parent e8f96bf commit c6ec9ae
Showing 59 changed files with 973 additions and 359 deletions.
18 changes: 0 additions & 18 deletions docs_input/api/math/complex/norm.rst

This file was deleted.

6 changes: 3 additions & 3 deletions docs_input/notebooks/04_radar_pipeline.ipynb
@@ -133,7 +133,7 @@
"In this case we're applying a Hamming window to our partial waveform view. `hamming` is a generator function that generates Hamming window values at each point defined in the tensor. Next, we compute the L2 norm of the partial waveform. The L2 norm is done in two steps currently: an I^2 + Q^2 reduction, followed by a square root on the output of the reduction:\n",
"\n",
"```c++\n",
" sum(norms, norm(waveformPart), stream);\n",
" sum(norms, abs2(waveformPart), stream);\n",
" exec(norms, sqrt(norms), stream);\n",
"```\n",
"\n",
@@ -245,10 +245,10 @@
"## CFAR Detection\n",
"The last step in the pipeline is the constant false alarm rate (CFAR) detection. CFAR detection is broadly used to filter observible signals from noise by setting a threshold for observation. A filter mask was created in the constructor to represent the \"field of view\" that we are looking for a target in. By describing the field of view, we can differentiate what parts of the signal we believe are signal power and noise power. \n",
"\n",
"CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `norm` operator:\n",
"CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `abs2` operator:\n",
"\n",
"```c++\n",
"exec(xdPow, norm(cfarIn), stream);\n",
"exec(xdPow, abs2(cfarIn), stream);\n",
"```\n",
"\n",
"xdPow now contains the sum of the squares of each element. Using the computed power per cell, we apply the CFAR mask that was computed in the constructor. The mask is applied using a 2D convolution from the MatX `conv2d` function:\n",
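The notebook hunks above keep the two-step L2 norm: an `abs2` reduction followed by a `sqrt`. Since this commit's title also adds `vector_norm` and `matrix_norm`, the same computation could presumably be collapsed into one call. A minimal sketch in the `(lhs = op).run(exec)` style used by `simple_radar_pipeline.h`; the `vector_norm` signature and the `NormOrder::L2` enumerator are assumptions, as they do not appear in the hunks shown here:

```c++
// Two-step form shown in the notebook above
(norms = sum(abs2(waveformPart))).run(exec);   // I^2 + Q^2 reduction
(norms = sqrt(norms)).run(exec);               // square root of the reduction

// Assumed one-call equivalent using the operator added by this commit
(norms = vector_norm(waveformPart, NormOrder::L2)).run(exec);
```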
2 changes: 1 addition & 1 deletion examples/fft_conv.cu
@@ -172,4 +172,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
}
4 changes: 2 additions & 2 deletions examples/simple_radar_pipeline.h
@@ -253,7 +253,7 @@ class RadarPipeline {
(waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec);

// compute L2 norm
(norms = sum(norm(waveformPart))).run(exec);
(norms = sum(abs2(waveformPart))).run(exec);
(norms = sqrt(norms)).run(exec);

(waveformPart = waveformPart / norms).run(exec);
@@ -358,7 +358,7 @@ class RadarPipeline {
*/
void CFARDetections()
{
(xPow = norm(tpcView)).run(exec);
(xPow = abs2(tpcView)).run(exec);

// Estimate the background average power in each cell
// background_averages = conv2(Xpow, mask, 'same') ./ norm;
4 changes: 2 additions & 2 deletions include/matx/core/half_complex.h
@@ -740,14 +740,14 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T atan2(const T &x, const T &y)
}

/**
* @brief Norm operator
* @brief Squared absolute value operator
*
* @tparam T Underlying type
* @param x Value of input
* @return Result of operation
*/
template <typename T>
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T norm(const matxHalfComplex<T> &x)
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T abs2(const matxHalfComplex<T> &x)
{
if (isinf(x.real()))
return static_cast<T>(cuda::std::abs(static_cast<float>(x.real())));
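As the updated doc comment above indicates, `abs2` is the squared absolute value: for a complex input it returns real² + imag², matching `cuda::std::norm` semantics rather than the magnitude. A minimal host-side sketch, assuming the `matx::` qualification and the float-to-half conversions in the constructor:

```c++
// Hedged sketch: abs2(3 + 4i) -> 9 + 16 = 25, not the magnitude 5.
matx::matxHalfComplex<matx::matxFp16> x{3.0f, 4.0f};
auto p = matx::abs2(x);   // squared magnitude, returned as matxFp16
```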
23 changes: 19 additions & 4 deletions include/matx/core/operator_utils.h
@@ -42,7 +42,7 @@ namespace matx {
__MATX_HOST__ __MATX_INLINE__ auto ReduceOutput(Func &&func, OutputOp &&out, InputOp &&in, BeginIter &&bi, EndIter &&ei) {
if constexpr (remove_cvref_t<decltype(out)>::Rank() <= 1 && is_tensor_view_v<OutputOp>) {
if (out.IsContiguous()) {
if constexpr(ConvertType) {
if constexpr(ConvertType) {
return func( in,
reinterpret_cast<detail::convert_matx_type_t<typename remove_cvref_t<OutputOp>::scalar_type> *>(out.Data()),
bi,
@@ -64,7 +64,7 @@

template <typename Func, typename OutputOp, typename InputOp, bool ConvertType = true>
__MATX_HOST__ __MATX_INLINE__ auto ReduceInput(Func &&func, OutputOp &&out, InputOp &&in) {
typename detail::base_type_t<InputOp> in_base = in;
typename detail::base_type_t<InputOp> in_base = in;
if constexpr (in_base.Rank() < 2 && is_tensor_view_v<InputOp>) {
if (in_base.IsContiguous()) {
if constexpr (ConvertType) {
@@ -89,8 +89,6 @@
auto collapsed = matx::lcollapse<remove_cvref_t<decltype(out)>::Rank()>(rcollapse<remove_cvref_t<decltype(in)>::Rank() -
remove_cvref_t<decltype(out)>::Rank()>(in_base));
const auto &iter = matx::RandomOperatorIterator<decltype(collapsed), ConvertType>{collapsed};


return ReduceOutput<ConvertType>(std::forward<Func>(func), std::forward<OutputOp>(out), iter, BeginOffset{iter}, EndOffset{iter});
}

@@ -116,4 +114,21 @@

return shape;
}

namespace detail {
// Used inside of transforms to allocate temporary output
template <typename TensorType, typename Executor, typename ShapeType>
__MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::scalar_type **ptr) {
const auto ttl_size = std::accumulate(shape.begin(), shape.end(), static_cast<index_t>(1),
std::multiplies<index_t>()) * sizeof(*ptr);
if constexpr (is_cuda_executor_v<Executor>) {
matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
make_tensor(tensor, *ptr, shape);
}
else {
matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY);
make_tensor(tensor, *ptr, shape);
}
}
}
};
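The new `detail::AllocateTempTensor` helper centralizes the temporary-output allocation that the operator headers below (all.h, ambgfun.h, any.h) previously did inline with `make_tensor`. A minimal standalone sketch of what a single call does for a CUDA executor; the shape container and executor construction are assumptions made for illustration:

```c++
// Hedged sketch: allocate a 64x64 float temporary in async device memory on
// the executor's stream and bind 'tmp' to it. The raw pointer stays with the
// caller, which is responsible for freeing the matxAlloc'd memory later.
matx::detail::tensor_impl_t<float, 2> tmp;
float *ptr = nullptr;
cuda::std::array<matx::index_t, 2> shape{64, 64};
matx::cudaExecutor exec{};   // default stream
matx::detail::AllocateTempTensor(tmp, exec, shape, &ptr);
```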
49 changes: 27 additions & 22 deletions include/matx/core/pybind.h
@@ -336,35 +336,40 @@ class MatXPybind {
using ntype = matx_convert_complex_type<T>;
auto ften = pybind11::array_t<ntype>(np_ten);

for (index_t s1 = 0; s1 < ten.Size(0); s1++) {
if constexpr (RANK > 1) {
for (index_t s2 = 0; s2 < ten.Size(1); s2++) {
if constexpr (RANK > 2) {
for (index_t s3 = 0; s3 < ten.Size(2); s3++) {
if constexpr (RANK > 3) {
for (index_t s4 = 0; s4 < ten.Size(3); s4++) {
if constexpr (RANK > 4) {
for (index_t s5 = 0; s5 < ten.Size(4); s5++) {
ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5));
if constexpr (RANK == 0) {
ten() = ConvertComplex(ften.at());
}
else {
for (index_t s1 = 0; s1 < ten.Size(0); s1++) {
if constexpr (RANK > 1) {
for (index_t s2 = 0; s2 < ten.Size(1); s2++) {
if constexpr (RANK > 2) {
for (index_t s3 = 0; s3 < ten.Size(2); s3++) {
if constexpr (RANK > 3) {
for (index_t s4 = 0; s4 < ten.Size(3); s4++) {
if constexpr (RANK > 4) {
for (index_t s5 = 0; s5 < ten.Size(4); s5++) {
ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5));
}
}
else {
ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4));
}
}
else {
ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4));
}
}
}
else {
ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3));
else {
ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3));
}
}
}
}
else {
ten(s1, s2) = ConvertComplex(ften.at(s1, s2));
else {
ten(s1, s2) = ConvertComplex(ften.at(s1, s2));
}
}
}
}
else {
ten(s1) = ConvertComplex(ften.at(s1));
else {
ten(s1) = ConvertComplex(ften.at(s1));
}
}
}
}
9 changes: 8 additions & 1 deletion include/matx/core/tensor_impl.h
@@ -78,6 +78,7 @@ class tensor_impl_t {
using shape_type = typename Desc::shape_type;
using stride_type = typename Desc::stride_type;
using matxoplvalue = bool;
using self_type = tensor_impl_t<T, RANK, Desc>;

// Type specifier for signaling this is a matx operation
using matxop = bool;
@@ -231,6 +232,12 @@
{
}

__MATX_HOST__ void Shallow(const self_type &rhs) noexcept
{
ldata_ = rhs.ldata_;
desc_ = rhs.desc_;
}

/**
* Lazy assignment operator=. Used to create a "set" object for deferred
* execution on a device
@@ -811,7 +818,7 @@
*
* @return data pointer
*/
auto Data() const noexcept {
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Data() const noexcept {
return ldata_;
}

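The added `Shallow` method copies only the local data pointer and descriptor, so one `tensor_impl_t` can alias another's storage with no ownership transfer or reference counting, in line with the commit's pointer-ownership fixes. A hedged sketch, assuming the `make_tensor` overload that binds a `tensor_impl_t` to existing memory (the same overload `AllocateTempTensor` uses above):

```c++
// Hedged sketch: 'alias' ends up sharing src's pointer and descriptor; no
// allocation or reference counting happens inside Shallow().
float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
cuda::std::array<matx::index_t, 1> shape{4};
matx::detail::tensor_impl_t<float, 1> src, alias;
matx::make_tensor(src, data, shape);  // assumed non-owning binding to existing memory
alias.Shallow(src);                   // shallow copy: ldata_ and desc_ only
```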
5 changes: 5 additions & 0 deletions include/matx/core/tie.h
@@ -94,6 +94,11 @@ struct mtie : public BaseOp<mtie<Ts...>>{

template <typename Executor>
__MATX_INLINE__ void Exec(Executor &&ex) {
// Run the PreRun on the inner type to avoid allocation but allow transforms using MatX operators
// to do any setup needed
if constexpr (sizeof...(Ts) == 2) {
cuda::std::get<sizeof...(Ts) - 1>(ts_).InnerPreRun(NoShape{}, std::forward<Executor>(ex));
}
cuda::std::get<sizeof...(Ts) - 1>(ts_).Exec(ts_, std::forward<Executor>(ex));
}

32 changes: 17 additions & 15 deletions include/matx/operators/all.h
@@ -49,7 +49,8 @@ namespace detail {
private:
OpA a_;
cuda::std::array<index_t, ORank> out_dims_;
mutable matx::tensor_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable typename remove_cvref_t<OpA>::scalar_type *ptr;

public:
using matxop = bool;
@@ -80,29 +81,30 @@
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_cuda_executor_v<Executor>) {
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
}
else {
make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY);
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PostRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PostRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
{
19 changes: 12 additions & 7 deletions include/matx/operators/ambgfun.h
@@ -50,7 +50,8 @@ namespace matx
AMBGFunCutType_t cut_;
float cut_val_;
cuda::std::array<index_t, 2> out_dims_;
mutable matx::tensor_t<typename OpX::scalar_type, 2> tmp_out_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::scalar_type, 2> tmp_out_;
mutable typename remove_cvref_t<OpX>::scalar_type *ptr;

public:
using matxop = bool;
@@ -111,19 +112,23 @@
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpX>()) {
x_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_matx_op<OpY>()) {
y_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}
}

if constexpr (is_cuda_executor_v<Executor>) {
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
}
18 changes: 10 additions & 8 deletions include/matx/operators/any.h
@@ -49,7 +49,8 @@ namespace detail {
private:
OpA a_;
cuda::std::array<index_t, ORank> out_dims_;
mutable matx::tensor_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable typename remove_cvref_t<OpA>::scalar_type *ptr;

public:
using matxop = bool;
@@ -80,18 +81,19 @@
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_cuda_executor_v<Executor>) {
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
}
else {
make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY);
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
}