Skip to content

Commit

Permalink
Optimize SWRT traversal
Browse files Browse the repository at this point in the history
 Bistro 640x360: 890ms -> 415ms (RX 6400 with --nohwrt)
  • Loading branch information
sergcpp committed Oct 18, 2024
1 parent 5dd3179 commit e3681a1
Show file tree
Hide file tree
Showing 168 changed files with 1,575 additions and 946 deletions.
3 changes: 3 additions & 0 deletions internal/Constants.inl
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ const uint SEP_AXIS_BITS = (3u << 30); // 0b11u
const uint PRIM_COUNT_BITS = ~SEP_AXIS_BITS;
const uint RIGHT_CHILD_BITS = ~SEP_AXIS_BITS;

// Masks for child references of bvh2 nodes: the upper three bits (29..31) are non-zero when the child is a leaf
// and carry its primitive count, the remaining lower 29 bits carry the index
const uint BVH2_PRIM_COUNT_BITS = (7u << 29); // 0b111u
const uint BVH2_PRIM_INDEX_BITS = ~BVH2_PRIM_COUNT_BITS;

const float PI = 3.141592653589793238463f;

//
Expand Down
113 changes: 105 additions & 8 deletions internal/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,7 @@ uint32_t Ray::PreprocessMesh(const vtx_attribute_t &positions, Span<const uint32
if (!s.use_fast_bvh_build) {
num_out_nodes = PreprocessPrims_SAH(primitives, positions, s, out_nodes, out_tri_indices);
} else {
assert(false);
num_out_nodes = PreprocessPrims_HLBVH(primitives, out_nodes, out_tri_indices);
}

Expand All @@ -315,14 +316,13 @@ uint32_t Ray::PreprocessMesh(const vtx_attribute_t &positions, Span<const uint32
const uint32_t j = out_tri_indices[i];

out_tris[i] = triangles[j];
out_tri_indices[i] = uint32_t(real_indices[j]);

for (int k = 0; k < 4; ++k) {
out_tris2[i / 8].n_plane[k][i % 8] = triangles[j].n_plane[k];
out_tris2[i / 8].u_plane[k][i % 8] = triangles[j].u_plane[k];
out_tris2[i / 8].v_plane[k][i % 8] = triangles[j].v_plane[k];
}

out_tri_indices[i] = uint32_t(real_indices[j]);
}

return num_out_nodes;
Expand Down Expand Up @@ -531,6 +531,9 @@ uint32_t Ray::PreprocessPrims_SAH(Span<const prim_t> prims, const vtx_attribute_
memcpy(&n.bbox_min[0], value_ptr(bbox_min), 3 * sizeof(float));
memcpy(&n.bbox_max[0], value_ptr(bbox_max), 3 * sizeof(float));
out_indices.insert(out_indices.end(), split_data.left_indices.begin(), split_data.left_indices.end());
while (out_indices.size() % s.primitive_alignment) {
out_indices.push_back(out_indices.back());
}
} else {
const auto index = uint32_t(num_nodes);

Expand Down Expand Up @@ -718,7 +721,7 @@ uint32_t Ray::PreprocessPrims_HLBVH(Span<const prim_t> prims, std::vector<bvh_no
return uint32_t(out_nodes.size() - top_nodes_start);
}

uint32_t Ray::FlattenBVH_r(const bvh_node_t *nodes, const uint32_t node_index, const uint32_t parent_index,
uint32_t Ray::FlattenBVH_r(Span<const bvh_node_t> nodes, const uint32_t node_index,
aligned_vector<wbvh_node_t> &out_nodes) {
const bvh_node_t &cur_node = nodes[node_index];

Expand Down Expand Up @@ -827,7 +830,7 @@ uint32_t Ray::FlattenBVH_r(const bvh_node_t *nodes, const uint32_t node_index, c

for (int i = 0; i < 8; i++) {
if (sorted_children[i] != 0xffffffff) {
new_children[i] = FlattenBVH_r(nodes, sorted_children[i], node_index, out_nodes);
new_children[i] = FlattenBVH_r(nodes, sorted_children[i], out_nodes);
} else {
new_children[i] = 0x7fffffff;
}
Expand Down Expand Up @@ -855,7 +858,7 @@ uint32_t Ray::FlattenBVH_r(const bvh_node_t *nodes, const uint32_t node_index, c
return new_node_index;
}

uint32_t Ray::FlattenLightBVH_r(const light_bvh_node_t *nodes, const uint32_t node_index, const uint32_t parent_index,
uint32_t Ray::FlattenLightBVH_r(Span<const light_bvh_node_t> nodes, const uint32_t node_index,
aligned_vector<light_wbvh_node_t> &out_nodes) {
const light_bvh_node_t &cur_node = nodes[node_index];

Expand Down Expand Up @@ -968,7 +971,7 @@ uint32_t Ray::FlattenLightBVH_r(const light_bvh_node_t *nodes, const uint32_t no

for (int i = 0; i < 8; i++) {
if (sorted_children[i] != 0xffffffff) {
new_children[i] = FlattenLightBVH_r(nodes, sorted_children[i], node_index, out_nodes);
new_children[i] = FlattenLightBVH_r(nodes, sorted_children[i], out_nodes);
} else {
new_children[i] = 0x7fffffff;
}
Expand Down Expand Up @@ -1005,7 +1008,7 @@ uint32_t Ray::FlattenLightBVH_r(const light_bvh_node_t *nodes, const uint32_t no
return new_node_index;
}

uint32_t Ray::FlattenLightBVH_r(const light_bvh_node_t *nodes, const uint32_t node_index, const uint32_t parent_index,
uint32_t Ray::FlattenLightBVH_r(Span<const light_bvh_node_t> nodes, const uint32_t node_index,
aligned_vector<light_cwbvh_node_t> &out_nodes) {
const light_bvh_node_t &cur_node = nodes[node_index];

Expand Down Expand Up @@ -1132,7 +1135,7 @@ uint32_t Ray::FlattenLightBVH_r(const light_bvh_node_t *nodes, const uint32_t no
uint32_t new_children[8];
for (int i = 0; i < 8; i++) {
if (sorted_children[i] != 0xffffffff) {
new_children[i] = FlattenLightBVH_r(nodes, sorted_children[i], node_index, out_nodes);
new_children[i] = FlattenLightBVH_r(nodes, sorted_children[i], out_nodes);
} else {
new_children[i] = 0x7fffffff;
}
Expand Down Expand Up @@ -1184,6 +1187,100 @@ uint32_t Ray::FlattenLightBVH_r(const light_bvh_node_t *nodes, const uint32_t no
return new_node_index;
}

// Converts a binary BVH stored as bvh_node_t array (where leaves are separate nodes) into bvh2_node_t layout,
// where every node stores bounding boxes of both of its children and leaf nodes are folded into child references
// of their parent. Resulting nodes are appended to the end of out_nodes.
//   nodes     - source nodes, child links are indices into this span
//   out_nodes - destination array, its existing content is left untouched
// Returns the size of out_nodes at the moment of the call, which is the index of the first appended node (if any)
uint32_t Ray::ConvertToBVH2(Span<const bvh_node_t> nodes, std::vector<bvh2_node_t> &out_nodes) {
    const uint32_t out_index = uint32_t(out_nodes.size());

    if (nodes.size() == 1) {
        // Tree consists of a single leaf, so it gets wrapped into an artificial root node.
        // Execution continues below, but nothing else is appended as there are no inner nodes.
        assert((nodes[0].prim_index & LEAF_NODE_BIT) != 0);
        bvh2_node_t root_node = {};

        // NOTE(review): this reference is built from (out_index + 1) with a fixed count field, whereas leaf
        // children below are built from prim_index/prim_count of the leaf itself - confirm against traversal code
        root_node.left_child = (out_index + 1);
        root_node.left_child |= 1u << 30;
        root_node.right_child = 0x7fffffff;

        const bvh_node_t &ch0 = nodes[0];

        root_node.ch_data0[0] = ch0.bbox_min[0];
        root_node.ch_data0[1] = ch0.bbox_max[0];
        root_node.ch_data0[2] = ch0.bbox_min[1];
        root_node.ch_data0[3] = ch0.bbox_max[1];
        root_node.ch_data2[0] = ch0.bbox_min[2];
        root_node.ch_data2[1] = ch0.bbox_max[2];

        out_nodes.push_back(root_node);
    }

    // Only inner nodes are emitted, so every source index is mapped to the number of inner nodes preceding it
    std::vector<uint32_t> compacted_indices;
    compacted_indices.resize(nodes.size());
    uint32_t compacted_count = 0;
    for (int i = 0; i < int(nodes.size()); ++i) {
        compacted_indices[i] = compacted_count;
        if ((nodes[i].prim_index & LEAF_NODE_BIT) == 0) {
            ++compacted_count;
        }
    }
    out_nodes.reserve(out_nodes.size() + compacted_count);

    // Index in out_nodes of the first inner node emitted by the loop below
    const uint32_t offset = uint32_t(out_nodes.size());

    // Builds child reference stored in bvh2 node. Leaf child is encoded as primitive index plus primitive count
    // in the upper three bits, inner child is encoded as index of its converted node in out_nodes.
    const auto encode_child = [&](const uint32_t child_index) -> uint32_t {
        const bvh_node_t &ch = nodes[child_index];
        if ((ch.prim_index & LEAF_NODE_BIT) != 0) {
            uint32_t ref = ch.prim_index & PRIM_INDEX_BITS;

            const uint32_t prim_count = (ch.prim_count & PRIM_COUNT_BITS);
            assert(prim_count <= 8);
            // Count is stored decremented and is never zero, as zero count bits denote an inner node
            ref |= std::max(prim_count - 1u, 1u) << 29;
            assert((ref & BVH2_PRIM_COUNT_BITS) != 0);
            return ref;
        }
        const uint32_t ref = compacted_indices[child_index] + offset;
        assert((ref & BVH2_PRIM_COUNT_BITS) == 0);
        // Checked against the final node count (capacity is not guaranteed to be exact after reserve)
        assert(ref < offset + compacted_count);
        return ref;
    };

    for (const bvh_node_t &n : nodes) {
        if ((n.prim_index & LEAF_NODE_BIT) != 0) {
            // Leaves do not produce nodes, they are referenced directly from their parents
            continue;
        }

        const uint32_t left_index = n.left_child;
        const uint32_t right_index = n.right_child & RIGHT_CHILD_BITS;

        const bvh_node_t &ch0 = nodes[left_index];
        const bvh_node_t &ch1 = nodes[right_index];

        bvh2_node_t new_node = {};
        new_node.left_child = encode_child(left_index);
        new_node.right_child = encode_child(right_index);

        new_node.ch_data0[0] = ch0.bbox_min[0];
        new_node.ch_data0[1] = ch0.bbox_max[0];
        new_node.ch_data0[2] = ch0.bbox_min[1];
        new_node.ch_data0[3] = ch0.bbox_max[1];

        new_node.ch_data1[0] = ch1.bbox_min[0];
        new_node.ch_data1[1] = ch1.bbox_max[0];
        new_node.ch_data1[2] = ch1.bbox_min[1];
        new_node.ch_data1[3] = ch1.bbox_max[1];

        new_node.ch_data2[0] = ch0.bbox_min[2];
        new_node.ch_data2[1] = ch0.bbox_max[2];
        new_node.ch_data2[2] = ch1.bbox_min[2];
        new_node.ch_data2[3] = ch1.bbox_max[2];

        out_nodes.push_back(new_node);
    }

    // Exactly one node must have been emitted per inner source node
    assert(out_nodes.size() == size_t(offset) + compacted_count);

    return out_index;
}

bool Ray::NaiivePluckerTest(const float p[9], const float o[3], const float d[3]) {
// plucker coordinates for edges
const float e0[6] = {p[6] - p[0],
Expand Down
33 changes: 20 additions & 13 deletions internal/Core.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,16 @@ struct light_bvh_node_t : public bvh_node_t {
};
static_assert(sizeof(light_bvh_node_t) == 56, "!");

// Node of a two-wide BVH which keeps bounding boxes of both children inline (64 bytes in total)
struct bvh2_node_t {
    // Bounds of the children, interleaved per axis
    float ch_data0[4]; // [ ch0.min.x, ch0.max.x, ch0.min.y, ch0.max.y ]
    float ch_data1[4]; // [ ch1.min.x, ch1.max.x, ch1.min.y, ch1.max.y ]
    float ch_data2[4]; // [ ch0.min.z, ch0.max.z, ch1.min.z, ch1.max.z ]
    // Child references, upper three bits are non-zero for leaf children and identify their primitive count
    uint32_t left_child;
    uint32_t right_child;
    // Padding up to 64 bytes
    uint32_t _unused0;
    uint32_t _unused1;
};
static_assert(sizeof(bvh2_node_t) == 64, "!");

struct alignas(32) wbvh_node_t {
float bbox_min[3][8];
float bbox_max[3][8];
Expand Down Expand Up @@ -235,6 +245,7 @@ struct bvh_settings_t {
bool allow_spatial_splits = false;
bool use_fast_bvh_build = false;
int min_primitives_in_leaf = 8;
int primitive_alignment = 1;
};

template <typename T, size_t Alignment = alignof(T)>
Expand Down Expand Up @@ -317,13 +328,14 @@ uint32_t PreprocessPrims_SAH(Span<const prim_t> prims, const vtx_attribute_t &po
uint32_t PreprocessPrims_HLBVH(Span<const prim_t> prims, std::vector<bvh_node_t> &out_nodes,
std::vector<uint32_t> &out_indices);

uint32_t FlattenBVH_r(const bvh_node_t *nodes, uint32_t node_index, uint32_t parent_index,
aligned_vector<wbvh_node_t> &out_nodes);
uint32_t FlattenLightBVH_r(const light_bvh_node_t *nodes, uint32_t node_index, uint32_t parent_index,
uint32_t FlattenBVH_r(Span<const bvh_node_t> nodes, uint32_t node_index, aligned_vector<wbvh_node_t> &out_nodes);
uint32_t FlattenLightBVH_r(Span<const light_bvh_node_t> nodes, uint32_t node_index,
aligned_vector<light_wbvh_node_t> &out_nodes);
uint32_t FlattenLightBVH_r(const light_bvh_node_t *nodes, uint32_t node_index, uint32_t parent_index,
uint32_t FlattenLightBVH_r(Span<const light_bvh_node_t> nodes, uint32_t node_index,
aligned_vector<light_cwbvh_node_t> &out_nodes);

uint32_t ConvertToBVH2(Span<const bvh_node_t> nodes, std::vector<bvh2_node_t> &out_nodes);

bool NaiivePluckerTest(const float p[9], const float o[3], const float d[3]);

const int FILTER_TABLE_SIZE = 1024;
Expand All @@ -344,7 +356,7 @@ void ConstructCamera(eCamType type, ePixelFilter filter, float filter_width, eVi
float lens_rotation, float lens_ratio, int lens_blades, float clip_start, float clip_end,
camera_t *cam);

// Applies 4x4 matrix matrix transform to bounding box
// Applies 4x4 matrix transform to bounding box
void TransformBoundingBox(const float bbox_min[3], const float bbox_max[3], const float *xform, float out_bbox_min[3],
float out_bbox_max[3]);

Expand Down Expand Up @@ -372,17 +384,13 @@ struct mesh_t {
static_assert(sizeof(mesh_t) == 64, "!");

struct mesh_instance_t {
float bbox_min[3];
uint32_t _unused;
float bbox_max[3];
uint32_t mesh_index;
uint32_t _unused2;
uint32_t mesh_block;
uint32_t node_index;
uint32_t lights_index;
uint32_t ray_visibility; // upper 24 bits identify lights_block
float xform[16], inv_xform[16];
};
static_assert(sizeof(mesh_instance_t) == 176, "!");
static_assert(sizeof(mesh_instance_t) == 144, "!");

struct environment_t {
float env_col[3];
Expand Down Expand Up @@ -505,11 +513,10 @@ enum eSpatialCacheMode { None, Update, Query };
struct scene_data_t {
const environment_t &env;
const mesh_instance_t *mesh_instances;
const uint32_t *mi_indices;
const mesh_t *meshes;
const uint32_t *vtx_indices;
const vertex_t *vertices;
const bvh_node_t *nodes;
const bvh2_node_t *nodes;
const wbvh_node_t *wnodes;
const tri_accel_t *tris;
const uint32_t *tri_indices;
Expand Down
Loading

0 comments on commit e3681a1

Please sign in to comment.