Added vector_norm and matrix_norm
Renamed norm() function to abs2
Fixed several bugs with pointer ownership on default tensors
Fixed bug where PreRun was not executed in some cases
cliffburdick committed Jun 14, 2024
1 parent e8f96bf commit c6ec9ae
Showing 59 changed files with 973 additions and 359 deletions.
18 changes: 0 additions & 18 deletions docs_input/api/math/complex/norm.rst

This file was deleted.

6 changes: 3 additions & 3 deletions docs_input/notebooks/04_radar_pipeline.ipynb
@@ -133,7 +133,7 @@
"In this case we're applying a Hamming window to our partial waveform view. `hamming` is a generator function that generates Hamming window values at each point defined in the tensor. Next, we compute the L2 norm of the partial waveform. The L2 norm is done in two steps currently: an I^2 + Q^2 reduction, followed by a square root on the output of the reduction:\n",
"\n",
"```c++\n",
" sum(norms, norm(waveformPart), stream);\n",
" sum(norms, abs2(waveformPart), stream);\n",
" exec(norms, sqrt(norms), stream);\n",
"```\n",
"\n",
@@ -245,10 +245,10 @@
"## CFAR Detection\n",
"The last step in the pipeline is the constant false alarm rate (CFAR) detection. CFAR detection is broadly used to filter observible signals from noise by setting a threshold for observation. A filter mask was created in the constructor to represent the \"field of view\" that we are looking for a target in. By describing the field of view, we can differentiate what parts of the signal we believe are signal power and noise power. \n",
"\n",
"CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `norm` operator:\n",
"CFAR detection begins by taking the signal power of the last stage by summing the squares of all complex numbers (I^2 + Q^2). This is done by using the MatX `abs2` operator:\n",
"\n",
"```c++\n",
"exec(xdPow, norm(cfarIn), stream);\n",
"exec(xdPow, abs2(cfarIn), stream);\n",
"```\n",
"\n",
"xdPow now contains the sum of the squares of each element. Using the computed power per cell, we apply the CFAR mask that was computed in the constructor. The mask is applied using a 2D convolution from the MatX `conv2d` function:\n",
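The notebook hunks above keep the two-step L2 norm: an `abs2` reduction followed by a `sqrt`. Since this commit's title also adds `vector_norm` and `matrix_norm`, the same computation could presumably be collapsed into one call. A minimal sketch in the `(lhs = op).run(exec)` style used by `simple_radar_pipeline.h`; the `vector_norm` signature and the `NormOrder::L2` enumerator are assumptions, as they do not appear in the hunks shown here:

```c++
// Two-step form shown in the notebook above
(norms = sum(abs2(waveformPart))).run(exec);   // I^2 + Q^2 reduction
(norms = sqrt(norms)).run(exec);               // square root of the reduction

// Assumed one-call equivalent using the operator added by this commit
(norms = vector_norm(waveformPart, NormOrder::L2)).run(exec);
```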
2 changes: 1 addition & 1 deletion examples/fft_conv.cu
@@ -172,4 +172,4 @@ int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)

CUDA_CHECK_LAST_ERROR();
MATX_EXIT_HANDLER();
}
}
4 changes: 2 additions & 2 deletions examples/simple_radar_pipeline.h
@@ -253,7 +253,7 @@ class RadarPipeline {
(waveformPart = waveformPart * hamming<0>({waveformLength})).run(exec);

// compute L2 norm
(norms = sum(norm(waveformPart))).run(exec);
(norms = sum(abs2(waveformPart))).run(exec);
(norms = sqrt(norms)).run(exec);

(waveformPart = waveformPart / norms).run(exec);
@@ -358,7 +358,7 @@ class RadarPipeline {
*/
void CFARDetections()
{
(xPow = norm(tpcView)).run(exec);
(xPow = abs2(tpcView)).run(exec);

// Estimate the background average power in each cell
// background_averages = conv2(Xpow, mask, 'same') ./ norm;
4 changes: 2 additions & 2 deletions include/matx/core/half_complex.h
@@ -740,14 +740,14 @@ __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T atan2(const T &x, const T &y)
}

/**
* @brief Norm operator
* @brief Squared absolute value operator
*
* @tparam T Underlying type
* @param x Value of input
* @return Result of operation
*/
template <typename T>
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T norm(const matxHalfComplex<T> &x)
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ T abs2(const matxHalfComplex<T> &x)
{
if (isinf(x.real()))
return static_cast<T>(cuda::std::abs(static_cast<float>(x.real())));
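As the updated doc comment above indicates, `abs2` is the squared absolute value: for a complex input it returns real² + imag², matching `cuda::std::norm` semantics rather than the magnitude. A minimal host-side sketch, assuming the `matx::` qualification and the float-to-half conversions in the constructor:

```c++
// Hedged sketch: abs2(3 + 4i) -> 9 + 16 = 25, not the magnitude 5.
matx::matxHalfComplex<matx::matxFp16> x{3.0f, 4.0f};
auto p = matx::abs2(x);   // squared magnitude, returned as matxFp16
```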
23 changes: 19 additions & 4 deletions include/matx/core/operator_utils.h
@@ -42,7 +42,7 @@ namespace matx {
__MATX_HOST__ __MATX_INLINE__ auto ReduceOutput(Func &&func, OutputOp &&out, InputOp &&in, BeginIter &&bi, EndIter &&ei) {
if constexpr (remove_cvref_t<decltype(out)>::Rank() <= 1 && is_tensor_view_v<OutputOp>) {
if (out.IsContiguous()) {
if constexpr(ConvertType) {
if constexpr(ConvertType) {
return func( in,
reinterpret_cast<detail::convert_matx_type_t<typename remove_cvref_t<OutputOp>::scalar_type> *>(out.Data()),
bi,
@@ -64,7 +64,7 @@

template <typename Func, typename OutputOp, typename InputOp, bool ConvertType = true>
__MATX_HOST__ __MATX_INLINE__ auto ReduceInput(Func &&func, OutputOp &&out, InputOp &&in) {
typename detail::base_type_t<InputOp> in_base = in;
typename detail::base_type_t<InputOp> in_base = in;
if constexpr (in_base.Rank() < 2 && is_tensor_view_v<InputOp>) {
if (in_base.IsContiguous()) {
if constexpr (ConvertType) {
@@ -89,8 +89,6 @@
auto collapsed = matx::lcollapse<remove_cvref_t<decltype(out)>::Rank()>(rcollapse<remove_cvref_t<decltype(in)>::Rank() -
remove_cvref_t<decltype(out)>::Rank()>(in_base));
const auto &iter = matx::RandomOperatorIterator<decltype(collapsed), ConvertType>{collapsed};


return ReduceOutput<ConvertType>(std::forward<Func>(func), std::forward<OutputOp>(out), iter, BeginOffset{iter}, EndOffset{iter});
}

@@ -116,4 +114,21 @@

return shape;
}

namespace detail {
// Used inside of transforms to allocate temporary output
template <typename TensorType, typename Executor, typename ShapeType>
__MATX_HOST__ __MATX_INLINE__ void AllocateTempTensor(TensorType &tensor, Executor &&ex, ShapeType &&shape, typename TensorType::scalar_type **ptr) {
const auto ttl_size = std::accumulate(shape.begin(), shape.end(), static_cast<index_t>(1),
std::multiplies<index_t>()) * sizeof(*ptr);
if constexpr (is_cuda_executor_v<Executor>) {
matxAlloc((void**)ptr, ttl_size, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
make_tensor(tensor, *ptr, shape);
}
else {
matxAlloc((void**)ptr, ttl_size, MATX_HOST_MEMORY);
make_tensor(tensor, *ptr, shape);
}
}
}
};
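The new `detail::AllocateTempTensor` helper centralizes the temporary-output allocation that the operator headers below (all.h, ambgfun.h, any.h) previously did inline with `make_tensor`. A minimal standalone sketch of what a single call does for a CUDA executor; the shape container and executor construction are assumptions made for illustration:

```c++
// Hedged sketch: allocate a 64x64 float temporary in async device memory on
// the executor's stream and bind 'tmp' to it. The raw pointer stays with the
// caller, which is responsible for freeing the matxAlloc'd memory later.
matx::detail::tensor_impl_t<float, 2> tmp;
float *ptr = nullptr;
cuda::std::array<matx::index_t, 2> shape{64, 64};
matx::cudaExecutor exec{};   // default stream
matx::detail::AllocateTempTensor(tmp, exec, shape, &ptr);
```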
49 changes: 27 additions & 22 deletions include/matx/core/pybind.h
@@ -336,35 +336,40 @@ class MatXPybind {
using ntype = matx_convert_complex_type<T>;
auto ften = pybind11::array_t<ntype>(np_ten);

for (index_t s1 = 0; s1 < ten.Size(0); s1++) {
if constexpr (RANK > 1) {
for (index_t s2 = 0; s2 < ten.Size(1); s2++) {
if constexpr (RANK > 2) {
for (index_t s3 = 0; s3 < ten.Size(2); s3++) {
if constexpr (RANK > 3) {
for (index_t s4 = 0; s4 < ten.Size(3); s4++) {
if constexpr (RANK > 4) {
for (index_t s5 = 0; s5 < ten.Size(4); s5++) {
ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5));
if constexpr (RANK == 0) {
ten() = ConvertComplex(ften.at());
}
else {
for (index_t s1 = 0; s1 < ten.Size(0); s1++) {
if constexpr (RANK > 1) {
for (index_t s2 = 0; s2 < ten.Size(1); s2++) {
if constexpr (RANK > 2) {
for (index_t s3 = 0; s3 < ten.Size(2); s3++) {
if constexpr (RANK > 3) {
for (index_t s4 = 0; s4 < ten.Size(3); s4++) {
if constexpr (RANK > 4) {
for (index_t s5 = 0; s5 < ten.Size(4); s5++) {
ten(s1, s2, s3, s4, s5) = ConvertComplex(ften.at(s1, s2, s3, s4, s5));
}
}
else {
ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4));
}
}
else {
ten(s1, s2, s3, s4) = ConvertComplex(ften.at(s1, s2, s3, s4));
}
}
}
else {
ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3));
else {
ten(s1, s2, s3) = ConvertComplex(ften.at(s1, s2, s3));
}
}
}
}
else {
ten(s1, s2) = ConvertComplex(ften.at(s1, s2));
else {
ten(s1, s2) = ConvertComplex(ften.at(s1, s2));
}
}
}
}
else {
ten(s1) = ConvertComplex(ften.at(s1));
else {
ten(s1) = ConvertComplex(ften.at(s1));
}
}
}
}
9 changes: 8 additions & 1 deletion include/matx/core/tensor_impl.h
@@ -78,6 +78,7 @@ class tensor_impl_t {
using shape_type = typename Desc::shape_type;
using stride_type = typename Desc::stride_type;
using matxoplvalue = bool;
using self_type = tensor_impl_t<T, RANK, Desc>;

// Type specifier for signaling this is a matx operation
using matxop = bool;
@@ -231,6 +232,12 @@
{
}

__MATX_HOST__ void Shallow(const self_type &rhs) noexcept
{
ldata_ = rhs.ldata_;
desc_ = rhs.desc_;
}

/**
* Lazy assignment operator=. Used to create a "set" object for deferred
* execution on a device
@@ -811,7 +818,7 @@
*
* @return data pointer
*/
auto Data() const noexcept {
__MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto Data() const noexcept {
return ldata_;
}

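The added `Shallow` method copies only the local data pointer and descriptor, so one `tensor_impl_t` can alias another's storage with no ownership transfer or reference counting, in line with the commit's pointer-ownership fixes. A hedged sketch, assuming the `make_tensor` overload that binds a `tensor_impl_t` to existing memory (the same overload `AllocateTempTensor` uses above):

```c++
// Hedged sketch: 'alias' ends up sharing src's pointer and descriptor; no
// allocation or reference counting happens inside Shallow().
float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
cuda::std::array<matx::index_t, 1> shape{4};
matx::detail::tensor_impl_t<float, 1> src, alias;
matx::make_tensor(src, data, shape);  // assumed non-owning binding to existing memory
alias.Shallow(src);                   // shallow copy: ldata_ and desc_ only
```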
5 changes: 5 additions & 0 deletions include/matx/core/tie.h
@@ -94,6 +94,11 @@ struct mtie : public BaseOp<mtie<Ts...>>{

template <typename Executor>
__MATX_INLINE__ void Exec(Executor &&ex) {
// Run the PreRun on the inner type to avoid allocation but allow transforms using MatX operators
// to do any setup needed
if constexpr (sizeof...(Ts) == 2) {
cuda::std::get<sizeof...(Ts) - 1>(ts_).InnerPreRun(NoShape{}, std::forward<Executor>(ex));
}
cuda::std::get<sizeof...(Ts) - 1>(ts_).Exec(ts_, std::forward<Executor>(ex));
}

32 changes: 17 additions & 15 deletions include/matx/operators/all.h
@@ -49,7 +49,8 @@ namespace detail {
private:
OpA a_;
cuda::std::array<index_t, ORank> out_dims_;
mutable matx::tensor_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable typename remove_cvref_t<OpA>::scalar_type *ptr;

public:
using matxop = bool;
@@ -80,29 +81,30 @@
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_cuda_executor_v<Executor>) {
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
}
else {
make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY);
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PostRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PostRun([[maybe_unused]] ShapeType &&shape, [[maybe_unused]] Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PostRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

constexpr __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ index_t Size(int dim) const
{
19 changes: 12 additions & 7 deletions include/matx/operators/ambgfun.h
@@ -50,7 +50,8 @@ namespace matx
AMBGFunCutType_t cut_;
float cut_val_;
cuda::std::array<index_t, 2> out_dims_;
mutable matx::tensor_t<typename OpX::scalar_type, 2> tmp_out_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpX>::scalar_type, 2> tmp_out_;
mutable typename remove_cvref_t<OpX>::scalar_type *ptr;

public:
using matxop = bool;
@@ -111,19 +112,23 @@
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpX>()) {
x_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_matx_op<OpY>()) {
y_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}
}

if constexpr (is_cuda_executor_v<Executor>) {
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
}
18 changes: 10 additions & 8 deletions include/matx/operators/any.h
@@ -49,7 +49,8 @@ namespace detail {
private:
OpA a_;
cuda::std::array<index_t, ORank> out_dims_;
mutable matx::tensor_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable detail::tensor_impl_t<typename remove_cvref_t<OpA>::scalar_type, ORank> tmp_out_;
mutable typename remove_cvref_t<OpA>::scalar_type *ptr;

public:
using matxop = bool;
@@ -80,18 +81,19 @@
}

template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
__MATX_INLINE__ void InnerPreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
if constexpr (is_matx_op<OpA>()) {
a_.PreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));
}
}

if constexpr (is_cuda_executor_v<Executor>) {
make_tensor(tmp_out_, out_dims_, MATX_ASYNC_DEVICE_MEMORY, ex.getStream());
}
else {
make_tensor(tmp_out_, out_dims_, MATX_HOST_MEMORY);
}
template <typename ShapeType, typename Executor>
__MATX_INLINE__ void PreRun([[maybe_unused]] ShapeType &&shape, Executor &&ex) const noexcept
{
InnerPreRun(std::forward<ShapeType>(shape), std::forward<Executor>(ex));

detail::AllocateTempTensor(tmp_out_, std::forward<Executor>(ex), out_dims_, &ptr);

Exec(cuda::std::make_tuple(tmp_out_), std::forward<Executor>(ex));
}