Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Pack bodies in the System #44

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions src/all_pairs.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@ void all_pairs_force(System<T, N>& system) {
auto it = counting_iterator<uint64_t>(0);
std::for_each_n(par_unseq, it, system.size, [s = system.state()](auto i) {
auto ai = vec<T, N>::splat(0);
auto pi = s.x[i];
auto pi = s.p[i].x();
for (typename System<T, N>::index_t j = 0; j < s.sz; j++) {
auto pj = s.x[j];
ai += s.m[j] * (pj - pi) / dist3(pi, pj);
if (i == j) continue;
auto [mj, pj] = s.p[j];
ai += mj * (pj - pi) / dist3(pi, pj);
}
s.a[i] = s.c * ai;
});
Expand All @@ -38,9 +39,11 @@ void all_pairs_collapsed_force(System<T, N>& system) {
return;
}

auto pi = s.x[i];
auto pj = s.x[j];
auto a = s.c * s.m[j] * (pj - pi) / dist3(pi, pj);
// TODO: we should exploit the symmetry of the force pairs to do
// (N^2)/2 computations here by taking this "a" and doing "s.a[j] -= a / mj * mi".
auto [mi, pi] = s.p[i];
auto [mj, pj] = s.p[j];
auto a = s.c * mj * (pj - pi) / dist3(pi, pj);
atomic_ref<T>{s.a[i][0]}.fetch_add(a[0], memory_order_relaxed);
atomic_ref<T>{s.a[i][1]}.fetch_add(a[1], memory_order_relaxed);
});
Expand Down
4 changes: 4 additions & 0 deletions src/alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <cstdint>
#include <cstdlib>
#include <memory>
#include <vector>

template <typename T>
void alloc(T*& ptr, std::size_t n) {
Expand Down Expand Up @@ -50,3 +51,6 @@ template <typename T, typename U>
constexpr bool operator!=(allocator<T> const &, allocator<U> const &) noexcept {
return false;
}

/// Shorthand for a `std::vector` that allocates through the `allocator` template declared in this header.
template <class Elem>
using vector = std::vector<Elem, allocator<Elem>>;
84 changes: 42 additions & 42 deletions src/bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@ using clock_timer = std::chrono::steady_clock;

/// Computes the bounding box enclosing the positions of all bodies.
template <typename T, dim_t N>
aabb<T, N> bounding_box(std::span<vec<T, N>> xs) {
aabb<T, N> bounding_box(std::span<monopole<T, N>> ms) {
return std::transform_reduce(
par_unseq, xs.begin(), xs.end(), aabb<T, N>(from_points, vec<T, N>::splat(0.)),
[](auto a, auto b) { return merge(a, b); }, [](auto a) { return aabb<T, N>(from_points, a); });
par_unseq, ms.begin(), ms.end(), aabb<T, N>(from_points, vec<T, N>::splat(0.)),
[](auto a, auto b) { return merge(a, b); }, [](auto a) { return aabb<T, N>(from_points, a.x()); });
}

/// Sorts bodies along the Hilbert curve.
Expand All @@ -35,11 +35,11 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {

// Compute the Hilbert index for each body in the Cartesian grid
auto bids = system.body_indices();
static std::vector<uint64_t> hilbert_ids(system.size);
static vector<uint64_t> hilbert_ids(system.size);
std::for_each(par_unseq, bids.begin(), bids.end(),
[hids = hilbert_ids.data(), x = system.x.data(), mins = bbox.xmin, grid_cell_size](auto idx) {
[hids = hilbert_ids.data(), p = system.p.data(), mins = bbox.xmin, grid_cell_size](auto idx) {
// Bucket the body into a Cartesian grid cell:
vec<uint32_t, N> cell_idx = cast<uint32_t>((x[idx] - mins) / grid_cell_size);
vec<uint32_t, N> cell_idx = cast<uint32_t>((p[idx].x() - mins) / grid_cell_size);
// Compute the Hilbert index of the cell and assign it to the body:
hids[idx] = hilbert(cell_idx);
});
Expand All @@ -48,9 +48,9 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {
#if defined(__NVCOMPILER)
// Workaround for nvc++: we can use Thrust zip_iterator, which predates zip_view, but provides the same functionality,
// and works just fine:
auto b = thrust::make_zip_iterator(hilbert_ids.begin(), system.x.begin(), system.m.begin(), system.v.begin(),
auto b = thrust::make_zip_iterator(hilbert_ids.begin(), system.p.begin(), system.v.begin(),
system.a.begin(), system.ao.begin());
auto e = thrust::make_zip_iterator(hilbert_ids.end(), system.x.end(), system.m.end(), system.v.end(), system.a.end(),
auto e = thrust::make_zip_iterator(hilbert_ids.end(), system.p.end(), system.v.end(), system.a.end(),
system.ao.end());
std::sort(par_unseq, b, e, [](auto a, auto b) { return thrust::get<0>(a) < thrust::get<0>(b); });
#elif defined(__clang__) || (__cplusplus < 202302L)
Expand All @@ -59,7 +59,7 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {
// TODO: sort an array of keys and then apply a permutation in O(N) time and O(1) storage
// (instead of O(N) time and O(N) storage).

static std::vector<std::pair<uint64_t, std::size_t>> hilbert_index_map(system.size);
static vector<std::pair<uint64_t, std::size_t>> hilbert_index_map(system.size);
std::for_each_n(par_unseq, counting_iterator<std::size_t>(0), system.size,
[hmap = hilbert_index_map.data(), hids = hilbert_ids.data()](std::size_t idx) {
hmap[idx] = std::make_pair(hids[idx], idx);
Expand All @@ -70,27 +70,26 @@ void hilbert_sort(System<T, N>& system, aabb<T, N> bbox) {

// create temp copy of system so that we don't get race conditions when
// rearranging values in the next step
static std::vector<std::tuple<vec<T, N>, T, vec<T, N>, vec<T, N>, vec<T, N>>> tmp_system(system.size);
static vector<std::tuple<monopole<T, N>, vec<T, N>, vec<T, N>, vec<T, N>>> tmp_system(system.size);
std::for_each_n(par_unseq, counting_iterator<std::size_t>(0), system.size,
[tmp_sys = tmp_system.data(), x = system.x.data(), m = system.m.data(), v = system.v.data(),
[tmp_sys = tmp_system.data(), p = system.p.data(), v = system.v.data(),
a = system.a.data(), ao = system.ao.data()](std::size_t idx) {
tmp_sys[idx] = std::make_tuple(x[idx], m[idx], v[idx], a[idx], ao[idx]);
tmp_sys[idx] = std::make_tuple(p[idx], v[idx], a[idx], ao[idx]);
});

// copy back
std::for_each_n(par_unseq, counting_iterator<std::size_t>(0), system.size,
[tmp_sys = tmp_system.data(), hmap = hilbert_index_map.data(), x = system.x.data(),
m = system.m.data(), v = system.v.data(), a = system.a.data(), ao = system.ao.data()](auto idx) {
[tmp_sys = tmp_system.data(), hmap = hilbert_index_map.data(), p = system.p.data(),
v = system.v.data(), a = system.a.data(), ao = system.ao.data()](auto idx) {
std::size_t original_index = hmap[idx].second;
auto e = tmp_sys[original_index];
x[idx] = std::get<0>(e);
m[idx] = std::get<1>(e);
v[idx] = std::get<2>(e);
a[idx] = std::get<3>(e);
ao[idx] = std::get<4>(e);
p[idx] = std::get<0>(e);
v[idx] = std::get<1>(e);
a[idx] = std::get<2>(e);
ao[idx] = std::get<3>(e);
});
#else
auto r = std::views::zip(hilbert_ids, system.x, system.m, system.v, system.a, system.ao);
auto r = std::views::zip(hilbert_ids, system.p, system.v, system.a, system.ao);
std::sort(par_unseq, r.begin(), r.end(), [](auto a, auto b) { return std::get<0>(a) < std::get<0>(b); });
#endif
}
Expand Down Expand Up @@ -188,18 +187,21 @@ struct bvh {
}

if (br >= nbodies) {
m[i] = monopole(s.m[bl], s.x[bl]);
auto bb = aabb<T, N>(from_points, s.x[bl]);
auto [ml, xl] = s.p[bl];
m[i] = monopole(ml, xl);
auto bb = aabb<T, N>(from_points, xl);
b[i] = bb;
bw[i] = node_width(bb);
} else {
T mass = s.m[bl] + s.m[br];
auto [ml, xl] = s.p[bl];
auto [mr, xr] = s.p[br];
T mass = ml + mr;

vec<T, N> center_of_mass = s.m[bl] * s.x[bl] + s.m[br] * s.x[br];
vec<T, N> center_of_mass = ml * xl + mr * xr;
center_of_mass /= mass;
m[i] = monopole(mass, center_of_mass);

auto bb = aabb<T, N>(from_points, s.x[bl], s.x[br]);
auto bb = aabb<T, N>(from_points, xl, xr);
b[i] = bb;
bw[i] = node_width(bb);
}
Expand All @@ -216,24 +218,24 @@ struct bvh {
auto bl = (li * 2) + first + count;
auto br = bl + 1;

auto ml = m[bl];
auto mr = m[br];
auto [ml, xl] = m[bl];
auto [mr, xr] = m[br];

auto ibl = ml.mass() != 0.;
auto ibr = mr.mass() != 0.;
auto ibl = ml != 0.;
auto ibr = mr != 0.;

if (!ibl) {
m[i] = ml;
m[i] = m[bl];
return;
}

if (!ibr) {
m[i] = ml;
m[i] = m[bl];
b[i] = b[bl];
bw[i] = bw[bl];
} else {
T mass = ml.mass() + mr.mass();
auto x = (ml.mass() * ml.x() + mr.mass() * mr.x()) / mass;
T mass = ml + mr;
auto x = (ml * xl + mr * xr) / mass;
m[i] = monopole(mass, x);
auto bb = merge(b[bl], b[br]);
b[i] = bb;
Expand All @@ -253,7 +255,7 @@ struct bvh {
node_t nbodies = system.size;
auto ids = system.body_indices();
std::for_each(par_unseq, ids.begin(), ids.end(), [=, s = system.state(), *this](node_t i) {
auto xs = s.x[i];
auto [mi, xi] = s.p[i];

node_t tree_index = 0;
auto a = vec<T, N>::splat(0);
Expand Down Expand Up @@ -291,10 +293,8 @@ struct bvh {

for (int k = 0; k < 2; ++k) {
if (bidx < nbodies && bidx != i) {
vec<T, N> xj = s.x[bidx];
T mj = s.m[bidx];

a += mj * (xj - xs) / dist3(xs, xj);
auto [mj, xj] = s.p[bidx];
a += mj * (xj - xi) / dist3(xi, xj);
}
++bidx;
}
Expand All @@ -303,9 +303,9 @@ struct bvh {
force_ascend_right();
} else {
auto [mj, xj] = m[tree_index];
if (can_approximate(xs, xj, bw[tree_index], theta_squared)) {
if (can_approximate(xi, xj, bw[tree_index], theta_squared)) {
// below threshold
a += mj * (xj - xs) / dist3(xs, xj);
a += mj * (xj - xi) / dist3(xi, xj);
num_covered_particles += ncontained_leaves_at_level(level, nlevels);

ascend_right();
Expand Down Expand Up @@ -359,7 +359,7 @@ void run_bvh(System<T, N>& system, Arguments arguments) {
dt_force += time([&] {
// Bounding box
aabb<T, N> bbox;
dt_bbox += time([&] { bbox = bounding_box(std::span{system.x}); });
dt_bbox += time([&] { bbox = bounding_box(std::span{system.p}); });

// Sort bodies along Hilbert curve:
dt_sort += time([&] { hilbert_sort(system, bbox); });
Expand All @@ -381,7 +381,7 @@ void run_bvh(System<T, N>& system, Arguments arguments) {
} else {
auto kernels = [&] {
// Bounding box
aabb<T, N> bbox = bounding_box(std::span{system.x});
aabb<T, N> bbox = bounding_box(std::span{system.p});

// Sort bodies along Hilbert curve:
hilbert_sort(system, bbox);
Expand Down
20 changes: 9 additions & 11 deletions src/monopole.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,23 @@

template <typename T, dim_t N>
struct monopole {
vec<T, N + 1> data{};
alignas(N % 2 == 0? alignof(vec<T, N>) : alignof(T) * (N+1)) vec<T, N> x_;
T m_;

monopole() = default;
monopole(monopole const &) = default;
monopole(monopole &&) = default;
monopole &operator=(monopole const &) = default;
monopole &operator=(monopole &&) = default;

monopole(T mass, vec<T, N> x) {
for (dim_t i = 0; i < N; ++i) data[i] = x[i];
data[N] = mass;
}

T mass() { return data[N]; }
vec<T, N> x() {
vec<T, N> x;
for (dim_t i = 0; i < N; ++i) x[i] = data[i];
return x;
monopole(T m, vec<T, N> x) {
for (dim_t i = 0; i < N; ++i) x_[i] = x[i];
m_ = m;
}
T& mass() { return m_; }
T const& mass() const { return m_; }
vec<T, N>& x() { return x_; }
vec<T, N> const& x() const { return x_; }
};

namespace std {
Expand Down
Loading