Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[opengl] [perf] Use TI_AUTO_PROF in OpenGL backend runtime #1570

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions misc/benchmark_mpm99.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import taichi as ti
import numpy as np
# ---- Backend ---------------------------------------------------------------
ti.init(arch=ti.opengl)  # Run on the OpenGL (GPU) backend

# ---- Discretisation --------------------------------------------------------
quality = 1  # Use a larger value for higher-res simulations
n_particles = 9000 * quality**2
n_grid = 128 * quality
dx = 1 / n_grid  # grid spacing
inv_dx = float(n_grid)  # reciprocal of the grid spacing
dt = 1e-4 / quality  # time step used by substep()

# ---- Per-particle constants ------------------------------------------------
p_rho = 1
p_vol = (dx * 0.5)**2
p_mass = p_vol * p_rho

# ---- Elasticity ------------------------------------------------------------
E = 0.1e4  # Young's modulus
nu = 0.2  # Poisson's ratio
# Lame parameters derived from E and nu
mu_0 = E / (2 * (1 + nu))
lambda_0 = E * nu / ((1 + nu) * (1 - 2 * nu))

# ---- Particle fields (one entry per particle) ------------------------------
x = ti.Vector(2, dt=ti.f32, shape=n_particles)  # position
v = ti.Vector(2, dt=ti.f32, shape=n_particles)  # velocity
C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # affine velocity field
F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # deformation gradient
material = ti.var(dt=ti.i32, shape=n_particles)  # material id
Jp = ti.var(dt=ti.f32, shape=n_particles)  # plastic deformation

# ---- Grid fields (one entry per grid node) ---------------------------------
grid_v = ti.Vector(2, dt=ti.f32,
                   shape=(n_grid, n_grid))  # grid node momentum/velocity
grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))  # grid node mass


@ti.kernel
def substep():
    # Advances the simulation by one step of size `dt` in four passes:
    #   1. clear the grid,
    #   2. update particle state and scatter it to the grid (P2G),
    #   3. turn grid momentum into velocity, apply gravity and boundaries,
    #   4. gather grid velocities back to the particles (G2P) and advect.

    # Pass 1: clear grid momentum and mass.
    for i, j in grid_m:
        grid_v[i, j] = [0, 0]
        grid_m[i, j] = 0

    # Pass 2: particle state update and scatter to grid (P2G).
    for p in x:
        cell_pos = x[p] * inv_dx
        base = (cell_pos - 0.5).cast(int)
        fx = cell_pos - base.cast(float)
        # Quadratic kernels [http://mpm.graphics Eqn. 123, with x=fx, fx-1,fx-2]
        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
        # Deformation gradient update
        F[p] = (ti.Matrix.identity(ti.f32, 2) + dt * C[p]) @ F[p]
        # Hardening coefficient: snow gets harder when compressed
        hardening = ti.exp(10 * (1.0 - Jp[p]))
        if material[p] == 1:  # jelly, make it softer
            hardening = 0.3
        mu = mu_0 * hardening
        lam = lambda_0 * hardening
        if material[p] == 0:  # liquid
            mu = 0.0
        U, sigma, V = ti.svd(F[p])
        J = 1.0
        for d in ti.static(range(2)):
            clamped = sigma[d, d]
            if material[p] == 2:  # Snow
                # Plasticity: clamp the singular value
                clamped = min(max(sigma[d, d], 1 - 2.5e-2), 1 + 4.5e-3)
            Jp[p] *= sigma[d, d] / clamped
            sigma[d, d] = clamped
            J *= clamped
        if material[p] == 0:
            # Reset deformation gradient to avoid numerical instability
            F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
        elif material[p] == 2:
            # Reconstruct elastic deformation gradient after plasticity
            F[p] = U @ sigma @ V.transpose()
        stress = (2 * mu * (F[p] - U @ V.transpose()) @ F[p].transpose() +
                  ti.Matrix.identity(ti.f32, 2) * lam * J * (J - 1))
        stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
        affine = stress + p_mass * C[p]
        # Loop over 3x3 grid node neighborhood
        for i, j in ti.static(ti.ndrange(3, 3)):
            node = ti.Vector([i, j])
            rel = (node.cast(float) - fx) * dx
            wt = w[i][0] * w[j][1]
            grid_v[base + node] += wt * (p_mass * v[p] + affine @ rel)
            grid_m[base + node] += wt * p_mass

    # Pass 3: grid update.
    for i, j in grid_m:
        if grid_m[i, j] > 0:  # No need for epsilon here
            # Momentum to velocity
            grid_v[i, j] = (1 / grid_m[i, j]) * grid_v[i, j]
            grid_v[i, j][1] -= dt * 50  # gravity
            # Boundary conditions: near each edge, zero the velocity
            # component that points out of the domain.
            if i < 3 and grid_v[i, j][0] < 0:
                grid_v[i, j][0] = 0
            if i > n_grid - 3 and grid_v[i, j][0] > 0:
                grid_v[i, j][0] = 0
            if j < 3 and grid_v[i, j][1] < 0:
                grid_v[i, j][1] = 0
            if j > n_grid - 3 and grid_v[i, j][1] > 0:
                grid_v[i, j][1] = 0

    # Pass 4: grid to particle (G2P).
    for p in x:
        cell_pos = x[p] * inv_dx
        base = (cell_pos - 0.5).cast(int)
        fx = cell_pos - base.cast(float)
        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
        v_acc = ti.Vector.zero(ti.f32, 2)
        C_acc = ti.Matrix.zero(ti.f32, 2, 2)
        # Loop over 3x3 grid node neighborhood
        for i, j in ti.static(ti.ndrange(3, 3)):
            rel = ti.Vector([i, j]).cast(float) - fx
            node_v = grid_v[base + ti.Vector([i, j])]
            wt = w[i][0] * w[j][1]
            v_acc += wt * node_v
            C_acc += 4 * inv_dx * wt * node_v.outer_product(rel)
        v[p] = v_acc
        C[p] = C_acc
        x[p] += dt * v[p]  # advection


# Number of particles per group. initialize() uses `i // group_size` both as
# the material id and as the multiplier for each group's spawn offset.
group_size = n_particles // 3


@ti.kernel
def initialize():
    # Resets every particle: a random position inside its group's box,
    # zero velocity, identity deformation gradient and Jp = 1.
    for i in range(n_particles):
        group = i // group_size  # 0: fluid 1: jelly 2: snow
        x[i] = [
            ti.random() * 0.2 + 0.3 + 0.10 * group,
            ti.random() * 0.2 + 0.05 + 0.32 * group
        ]
        material[i] = group
        v[i] = ti.Matrix([0, 0])
        F[i] = ti.Matrix([[1, 0], [0, 1]])
        Jp[i] = 1


# Benchmark driver: five rounds; each round prints its index, re-initializes
# the particles and runs 60 * 4 frames of substep() calls.
# NOTE(review): leading indentation was lost in this copy, so the nesting of
# the to_numpy() and print_profile_info() calls below cannot be read off this
# text -- confirm against the original file before running.
# NOTE(review): the loop variable `round` shadows the Python builtin.
for round in range(5):
print(round)
initialize()
for frame in range(60 * 4):
# NOTE(review): `2e-3 // dt` is float floor division. With dt = 1e-4
# (quality = 1) it evaluates to 19.0, not 20.0, because 1e-4 is not exactly
# representable in binary floating point -- confirm 19 substeps is intended.
for s in range(int(2e-3 // dt)):
substep()
# The returned arrays are discarded; only the field-to-numpy copy is exercised.
x.to_numpy()
material.to_numpy()
ti.print_profile_info()
63 changes: 41 additions & 22 deletions taichi/backends/opengl/opengl_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,14 @@ std::string get_opengl_error_string(GLenum err) {

void check_opengl_error(const std::string &msg = "OpenGL") {
auto err = glGetError();
if (err != GL_NO_ERROR) {
auto estr = get_opengl_error_string(err);
TI_ERROR("{}: {}", msg, estr);
}
if (err == GL_NO_ERROR)
return;
auto estr = get_opengl_error_string(err);
TI_ERROR("{}: {}", msg, estr);
}

int opengl_get_threads_per_group() {
TI_AUTO_PROF;
int ret = 1000;
glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &ret);
check_opengl_error("glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS)");
Expand Down Expand Up @@ -88,6 +89,7 @@ struct GLShader {
}

void compile(const std::string &source) const {
TI_AUTO_PROF;
const GLchar *source_cstr = source.c_str();
glShaderSource(id_, 1, &source_cstr, nullptr);

Expand Down Expand Up @@ -131,6 +133,7 @@ struct GLProgram {
}

void link() const {
TI_AUTO_PROF;
TI_TRACE("glLinkProgram IN");
glLinkProgram(id_);
TI_TRACE("glLinkProgram OUT");
Expand Down Expand Up @@ -196,6 +199,7 @@ struct GLSSBO {
void bind_data(void *data,
size_t size,
GLuint usage = GL_DYNAMIC_READ) const {
TI_AUTO_PROF;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
check_opengl_error("glBindBuffer");
glBufferData(GL_SHADER_STORAGE_BUFFER, size, data, usage);
Expand Down Expand Up @@ -228,6 +232,7 @@ struct GLSSBO {
}

void *map(GLbitfield access = GL_READ_ONLY) const {
TI_AUTO_PROF;
glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
check_opengl_error("glBindBuffer");
void *p = glMapBuffer(GL_SHADER_STORAGE_BUFFER, access);
Expand All @@ -251,26 +256,29 @@ struct GLBuffer : GLSSBO {

/// Constructs a buffer for slot `index`: stores `index`, `base` and `size`,
/// passes `base`/`size` to bind_data(), then calls bind_index((int)index).
GLBuffer(GLBufId index, void *base, size_t size)
: index(index), base(base), size(size) {
// Presumably a scoped profiler entry for this function -- TODO confirm.
TI_AUTO_PROF;
bind_data(base, size);
bind_index((int)index);
}

GLBuffer(GLBufId index)
: index(index), base(nullptr), size(0) {
GLBuffer(GLBufId index) : index(index), base(nullptr), size(0) {
bind_index((int)index);
}

/// Calls bind_data() again with the currently stored `base` and `size`.
/// The visible part of bind_data() binds this SSBO and calls glBufferData
/// with the given pointer and size.
void copy_forward() {
// Presumably a scoped profiler entry for this function -- TODO confirm.
TI_AUTO_PROF;
bind_data(base, size);
}

/// Replaces the stored `base`/`size` with `new_base`/`new_size`, then calls
/// bind_data() with the new values.
void rebind(void *new_base, size_t new_size) {
// Presumably a scoped profiler entry for this function -- TODO confirm.
TI_AUTO_PROF;
base = new_base;
size = new_size;
bind_data(base, size);
}

void copy_back() {
TI_AUTO_PROF;
if (!size)
return;
void *mapped = this->map();
Expand All @@ -288,6 +296,7 @@ struct GLBufferTable {
}

/// Constructs a new GLBuffer from (`index`, `base`, `size`) and assigns it to
/// `bufs[index]`; a buffer already held in that slot is released by the
/// unique_ptr assignment.
void add_buffer(GLBufId index, void *base, size_t size) {
// Presumably a scoped profiler entry for this function -- TODO confirm.
TI_AUTO_PROF;
bufs[index] = std::make_unique<GLBuffer>(index, base, size);
}

Expand Down Expand Up @@ -338,6 +347,7 @@ size_t ParallelSize_ConstRange::get_num_groups(GLSLLauncher *launcher) const {
}

size_t ParallelSize_DynamicRange::get_num_groups(GLSLLauncher *launcher) const {
TI_AUTO_PROF;
const size_t TPG = opengl_get_threads_per_group();

size_t n;
Expand Down Expand Up @@ -474,6 +484,7 @@ struct CompiledKernel {
}

void dispatch_compute(GLSLLauncher *launcher) const {
TI_AUTO_PROF;
int num_groups = ps->get_num_groups(launcher);

glsl->use();
Expand All @@ -485,11 +496,17 @@ struct CompiledKernel {
// `glDispatchCompute(X, Y, Z)` - the X*Y*Z == `Blocks` in CUDA
// `layout(local_size_x = X) in;` - the X == `Threads` in CUDA
//
glDispatchCompute(num_groups, 1, 1);
check_opengl_error(fmt::format("glDispatchCompute({})", num_groups));
{
TI_PROFILER("glDispatchCompute");
glDispatchCompute(num_groups, 1, 1);
check_opengl_error("glDispatchCompute");
}

glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
check_opengl_error("glMemoryBarrier");
{
TI_PROFILER("glMemoryBarrier");
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
check_opengl_error("glMemoryBarrier");
}
}
};

Expand Down Expand Up @@ -568,19 +585,20 @@ struct CompiledProgram::Impl {
}

void launch(Context &ctx, GLSLLauncher *launcher) const {
GLBufferTable &bufs = launcher->impl->user_bufs;
TI_AUTO_PROF;
GLBufferTable &user_bufs = launcher->impl->user_bufs;
std::vector<char> base_arr;
std::vector<void *> saved_ctx_ptrs;
// NOTE: these dirty codes are introduced by #694, TODO: RAII
/// DIRTY_BEGIN {{{
if (ext_arr_map.size()) {
bufs.add_buffer(GLBufId::Earg, ctx.extra_args,
arg_count * taichi_max_num_args * sizeof(int));
if (ext_arr_map.size()) { /// DIRTY_BEGIN {{{
TI_PROFILER("launch:ext_arr1");
user_bufs.add_buffer(GLBufId::Earg, ctx.extra_args,
arg_count * taichi_max_num_args * sizeof(int));
if (ext_arr_map.size() == 1) { // zero-copy for only one ext_arr
auto it = ext_arr_map.begin();
auto extptr = (void *)ctx.args[it->first];
ctx.args[it->first] = 0;
bufs.add_buffer(GLBufId::Extr, extptr, it->second);
user_bufs.add_buffer(GLBufId::Extr, extptr, it->second);
} else {
size_t accum_size = 0;
std::vector<void *> ptrarr;
Expand All @@ -597,12 +615,13 @@ struct CompiledProgram::Impl {
ctx.args[i] = accum_size;
accum_size += size;
} // concat all extptr into my baseptr
bufs.add_buffer(GLBufId::Extr, baseptr, accum_size);
user_bufs.add_buffer(GLBufId::Extr, baseptr, accum_size);
}
} /// DIRTY_END }}}
auto n_args = std::max(arg_count, ret_count);
if (n_args) {
user_bufs.add_buffer(GLBufId::Args, ctx.args, n_args * sizeof(uint64_t));
}
/// DIRTY_END }}}
bufs.add_buffer(GLBufId::Args, ctx.args,
std::max(arg_count, ret_count) * sizeof(uint64_t));
if (used.print) {
auto runtime_buf = launcher->impl->core_bufs.get(GLBufId::Runtime);
auto mapped = (GLSLRuntime *)runtime_buf->map();
Expand All @@ -612,10 +631,10 @@ struct CompiledProgram::Impl {
for (const auto &ker : kernels) {
ker->dispatch_compute(launcher);
}
for (auto &[idx, buf] : launcher->impl->user_bufs.bufs) {
for (auto &[idx, buf] : user_bufs.bufs) {
buf->copy_back();
}
launcher->impl->user_bufs.clear();
user_bufs.clear();
if (used.print) {
dump_message_buffer(launcher);
}
Expand Down