Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA early exit on empty grid #1729

Merged
merged 9 commits into from
Aug 17, 2022
1 change: 1 addition & 0 deletions include/gridtools/common/cuda_util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
*/
#pragma once

#include <cassert>
#include <memory>
#include <sstream>
#include <stdexcept>
Expand Down
83 changes: 47 additions & 36 deletions include/gridtools/fn/backend/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,29 +139,37 @@ namespace gridtools::fn::backend {
}
};

// Returns true iff the iteration domain has no points, i.e. any of its
// extents is zero. Used to skip kernel launches entirely: CUDA rejects a
// launch with a zero-sized grid.
// Note: the pack parameter is named `dims` (not `sizes`) to avoid shadowing
// the function parameter.
template <class Sizes>
bool is_domain_empty(const Sizes &sizes) {
    return tuple_util::host::apply([](auto... dims) { return ((dims == 0) || ...); }, sizes);
}

// Launches the stencil-stage kernel over `sizes`. Does nothing for an empty
// domain: an early return (as suggested in review) keeps the happy path flat
// and also skips the pointless computation of origin/strides.
template <class BlockSizes, class Sizes, class StencilStage, class MakeIterator, class Composite>
void apply_stencil_stage(gpu<BlockSizes> const &g,
    Sizes const &sizes,
    StencilStage,
    MakeIterator make_iterator,
    Composite &&composite) {
    // A zero-sized grid is an invalid CUDA launch configuration; bail out early.
    if (is_domain_empty(sizes))
        return;

    auto ptr_holder = sid::get_origin(std::forward<Composite>(composite));
    auto strides = sid::get_strides(std::forward<Composite>(composite));

    auto [blocks, threads] = blocks_and_threads<BlockSizes>(sizes);
    // With a non-empty domain the block dimensions must all be positive.
    assert(threads.x > 0 && threads.y > 0 && threads.z > 0);
    cuda_util::launch(blocks,
        threads,
        0,
        g.stream,
        kernel<BlockSizes,
            Sizes,
            decltype(ptr_holder),
            decltype(strides),
            stencil_fun_f<StencilStage, MakeIterator>>,
        sizes,
        ptr_holder,
        strides,
        stencil_fun_f<StencilStage, MakeIterator>{std::move(make_iterator)});
}

template <class ColumnStage, class MakeIterator, class Seed>
Expand Down Expand Up @@ -190,25 +198,28 @@ namespace gridtools::fn::backend {
Composite &&composite,
Vertical,
Seed seed) {
auto ptr_holder = sid::get_origin(std::forward<Composite>(composite));
auto strides = sid::get_strides(std::forward<Composite>(composite));
auto h_sizes = hymap::canonicalize_and_remove_key<Vertical>(sizes);
int v_size = at_key<Vertical>(sizes);

auto [blocks, threads] = blocks_and_threads<BlockSizes>(h_sizes);
cuda_util::launch(blocks,
threads,
0,
g.stream,
kernel<BlockSizes,
decltype(h_sizes),
decltype(ptr_holder),
decltype(strides),
column_fun_f<ColumnStage, MakeIterator, Seed>>,
h_sizes,
ptr_holder,
strides,
column_fun_f<ColumnStage, MakeIterator, Seed>{std::move(make_iterator), std::move(seed), v_size});
if (!is_domain_empty(sizes)) {
auto ptr_holder = sid::get_origin(std::forward<Composite>(composite));
auto strides = sid::get_strides(std::forward<Composite>(composite));
auto h_sizes = hymap::canonicalize_and_remove_key<Vertical>(sizes);
int v_size = at_key<Vertical>(sizes);

auto [blocks, threads] = blocks_and_threads<BlockSizes>(h_sizes);
assert(threads.x > 0 && threads.y > 0 && threads.z > 0);
cuda_util::launch(blocks,
threads,
0,
g.stream,
kernel<BlockSizes,
decltype(h_sizes),
decltype(ptr_holder),
decltype(strides),
column_fun_f<ColumnStage, MakeIterator, Seed>>,
h_sizes,
ptr_holder,
strides,
column_fun_f<ColumnStage, MakeIterator, Seed>{std::move(make_iterator), std::move(seed), v_size});
}
}

template <class BlockSizes>
Expand Down
60 changes: 60 additions & 0 deletions tests/unit_tests/fn/test_fn_backend_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <gtest/gtest.h>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer having this test as a regression test, running for all available backends. You could even just put a new variant of the copy stencil test into tests/regression/fn/fn_copy.cpp I guess. There are already versions with and without domain offsets, so the one with zero domain size would actually fit well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

…or add a separate one as you also check the scan. I don’t care much, but enabling it for all backends makes sense for sure.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made a separate regression test (and removed the unit test).


#include <gridtools/fn/column_stage.hpp>
#include <gridtools/fn/unstructured.hpp>
#include <gridtools/sid/composite.hpp>
#include <gridtools/sid/synthetic.hpp>

Expand Down Expand Up @@ -207,5 +208,64 @@ namespace gridtools::fn::backend {
bool success = on_device::exec(global_tmp_check_fun(), ptr_holder, strides);
EXPECT_TRUE(success);
}

// Stencil returning a constant; used only to verify that execution on an
// empty domain is a clean no-op (no kernel launch attempted).
struct empty_stencil {
    GT_FUNCTION constexpr auto operator()() const {
        return [] { return 0.0f; };
    }
};

TEST(backend_gpu, empty_domain_stencil) {
    // Both extents are zero: executing the stencil must not launch any kernel.
    constexpr size_t num_vertices = 0;
    constexpr size_t num_levels = 0;

    using block_sizes_t = meta::list<meta::list<unstructured::dim::horizontal, int_t<32>>,
        meta::list<unstructured::dim::vertical, int_t<1>>>;

    auto out = cuda_util::cuda_malloc<int>(num_vertices * num_levels);
    auto make_sid = [](int *ptr) {
        return sid::synthetic()
            .set<property::origin>(sid::host_device::simple_ptr_holder(ptr))
            .set<property::strides>(
                hymap::keys<unstructured::dim::horizontal, unstructured::dim::vertical>::make_values(5_c, 1_c));
    };
    auto out_s = make_sid(out.get());

    auto domain = unstructured_domain(tuple{num_vertices, num_levels}, tuple{0, 0});
    auto backend = make_backend(backend::gpu<block_sizes_t>(), domain);
    backend.stencil_executor()().arg(out_s).assign(0_c, empty_stencil{}).execute();
}

// Forward scan that passes its accumulator through unchanged; exercises the
// vertical (column) executor on an empty domain.
struct empty_column : fwd {
    static GT_FUNCTION constexpr auto prologue() {
        return tuple(scan_pass([](auto accumulator) { return accumulator; }, host_device::identity()));
    }

    static GT_FUNCTION constexpr auto body() {
        return scan_pass([](auto accumulator) { return accumulator; }, host_device::identity());
    }
};

TEST(backend_gpu, empty_domain_column) {
    // Both extents are zero: the column (scan) executor must be a no-op.
    constexpr size_t num_vertices = 0;
    constexpr size_t num_levels = 0;

    using block_sizes_t = meta::list<meta::list<unstructured::dim::horizontal, int_t<32>>,
        meta::list<unstructured::dim::vertical, int_t<1>>>;

    auto out = cuda_util::cuda_malloc<int>(num_vertices * num_levels);
    auto make_sid = [](int *ptr) {
        return sid::synthetic()
            .set<property::origin>(sid::host_device::simple_ptr_holder(ptr))
            .set<property::strides>(
                hymap::keys<unstructured::dim::horizontal, unstructured::dim::vertical>::make_values(5_c, 1_c));
    };
    auto out_s = make_sid(out.get());

    auto domain = unstructured_domain(tuple{num_vertices, num_levels}, tuple{0, 0});
    auto backend = make_backend(backend::gpu<block_sizes_t>(), domain);
    backend.vertical_executor()().arg(out_s).assign(0_c, empty_column{}, 0.0f).execute();
}

} // namespace
} // namespace gridtools::fn::backend