taichi-dev · archibate · Jul 23, 2020 · Jul 24, 2020 · Jul 24, 2020
diff --git a/misc/benchmark_mpm99.py b/misc/benchmark_mpm99.py
@@ -0,0 +1,124 @@
+import taichi as ti
+import numpy as np
+ti.init(arch=ti.opengl)  # Run on the OpenGL backend (the one benchmarked)
+quality = 1  # Use a larger value for higher-res simulations
+n_particles, n_grid = 9000 * quality**2, 128 * quality
+dx, inv_dx = 1 / n_grid, float(n_grid)  # grid spacing and its reciprocal
+dt = 1e-4 / quality  # time step advanced by one substep() call
+p_vol, p_rho = (dx * 0.5)**2, 1  # per-particle volume and density
+p_mass = p_vol * p_rho  # per-particle mass
+E, nu = 0.1e4, 0.2  # Young's modulus and Poisson's ratio
+mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / (
+    (1 + nu) * (1 - 2 * nu))  # Lame parameters
+x = ti.Vector(2, dt=ti.f32, shape=n_particles)  # position
+v = ti.Vector(2, dt=ti.f32, shape=n_particles)  # velocity
+C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # affine velocity field
+F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # deformation gradient
+material = ti.var(dt=ti.i32, shape=n_particles)  # material id
+Jp = ti.var(dt=ti.f32, shape=n_particles)  # plastic deformation
+grid_v = ti.Vector(2, dt=ti.f32,
+                   shape=(n_grid, n_grid))  # grid node momentum/velocity
+grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))  # grid node mass
+
+
+@ti.kernel
+def substep():  # One time step: clear grid, P2G, grid update, G2P
+    for i, j in grid_m:  # Clear grid state left from the previous step
+        grid_v[i, j] = [0, 0]
+        grid_m[i, j] = 0
+    for p in x:  # Particle state update and scatter to grid (P2G)
+        base = (x[p] * inv_dx - 0.5).cast(int)  # first node of 3x3 stencil
+        fx = x[p] * inv_dx - base.cast(float)  # offset from base, in cells
+        # Quadratic kernels [http://mpm.graphics Eqn. 123, with x=fx, fx-1, fx-2]
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+        F[p] = (ti.Matrix.identity(ti.f32, 2) +
+                dt * C[p]) @ F[p]  # deformation gradient update
+        h = ti.exp(
+            10 *
+            (1.0 -
+             Jp[p]))  # Hardening coefficient: snow gets harder when compressed
+        if material[p] == 1:  # jelly, make it softer
+            h = 0.3
+        mu, la = mu_0 * h, lambda_0 * h  # Lame parameters scaled by h
+        if material[p] == 0:  # liquid
+            mu = 0.0  # drops the 2*mu term of the stress below
+        U, sig, V = ti.svd(F[p])  # F = U @ sig @ V^T
+        J = 1.0  # accumulates the product of the (clamped) singular values
+        for d in ti.static(range(2)):
+            new_sig = sig[d, d]
+            if material[p] == 2:  # Snow
+                new_sig = min(max(sig[d, d], 1 - 2.5e-2),
+                              1 + 4.5e-3)  # Plasticity
+            Jp[p] *= sig[d, d] / new_sig  # accumulate ratio removed by clamping
+            sig[d, d] = new_sig
+            J *= new_sig
+        if material[
+                p] == 0:  # Reset deformation gradient to avoid numerical instability
+            F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
+        elif material[p] == 2:
+            F[p] = U @ sig @ V.transpose(
+            )  # Reconstruct elastic deformation gradient after plasticity
+        stress = 2 * mu * (F[p] - U @ V.transpose()) @ F[p].transpose(
+        ) + ti.Matrix.identity(ti.f32, 2) * la * J * (J - 1)
+        stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
+        affine = stress + p_mass * C[p]
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # Loop over 3x3 grid node neighborhood
+            offset = ti.Vector([i, j])
+            dpos = (offset.cast(float) - fx) * dx
+            weight = w[i][0] * w[j][1]  # w[i] along axis 0, w[j] along axis 1
+            grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+            grid_m[base + offset] += weight * p_mass
+    for i, j in grid_m:  # Grid update: normalize, gravity, boundaries
+        if grid_m[i, j] > 0:  # No need for epsilon here
+            grid_v[i,
+                   j] = (1 / grid_m[i, j]) * grid_v[i,
+                                                    j]  # Momentum to velocity
+            grid_v[i, j][1] -= dt * 50  # gravity
+            if i < 3 and grid_v[i, j][0] < 0:
+                grid_v[i, j][0] = 0  # Boundary conditions
+            if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
+            if j < 3 and grid_v[i, j][1] < 0: grid_v[i, j][1] = 0
+            if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
+    for p in x:  # grid to particle (G2P)
+        base = (x[p] * inv_dx - 0.5).cast(int)  # same stencil origin as in P2G
+        fx = x[p] * inv_dx - base.cast(float)
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
+        new_v = ti.Vector.zero(ti.f32, 2)
+        new_C = ti.Matrix.zero(ti.f32, 2, 2)
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # loop over 3x3 grid node neighborhood
+            dpos = ti.Vector([i, j]).cast(float) - fx  # in cells, not scaled by dx
+            g_v = grid_v[base + ti.Vector([i, j])]
+            weight = w[i][0] * w[j][1]
+            new_v += weight * g_v
+            new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
+        v[p], C[p] = new_v, new_C
+        x[p] += dt * v[p]  # advection
+
+
+group_size = n_particles // 3  # particles per material block
+
+
+@ti.kernel
+def initialize():  # Place three resting particle blocks, one per material
+    for i in range(n_particles):
+        group = i // group_size  # 0: fluid 1: jelly 2: snow
+        px = ti.random() * 0.2 + 0.3 + 0.10 * group
+        py = ti.random() * 0.2 + 0.05 + 0.32 * group
+        x[i] = [px, py]
+        material[i] = group
+        v[i] = ti.Matrix([0, 0])
+        F[i] = ti.Matrix([[1, 0], [0, 1]])
+        Jp[i] = 1
+
+
+for round_id in range(5):  # not `round`: that would shadow the builtin
+    print(round_id)
+    initialize()
+    for frame in range(60 * 4):
+        for s in range(round(2e-3 / dt)):  # 2e-3 // dt floors to 19.0, not 20
+            substep()
+        x.to_numpy()  # result unused; presumably here to profile readback
+        material.to_numpy()
+ti.print_profile_info()
diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/backends/opengl/opengl_api.cpp
@@ -43,13 +43,14 @@ std::string get_opengl_error_string(GLenum err) {
 
 void check_opengl_error(const std::string &msg = "OpenGL") {
   auto err = glGetError();
-  if (err != GL_NO_ERROR) {
-    auto estr = get_opengl_error_string(err);
-    TI_ERROR("{}: {}", msg, estr);
-  }
+  if (err == GL_NO_ERROR)  // nothing recorded: skip the string lookup
+    return;
+  auto estr = get_opengl_error_string(err);
+  TI_ERROR("{}: {}", msg, estr);  // reported as "<msg>: <error string>"
 }
 
 int opengl_get_threads_per_group() {
+  TI_AUTO_PROF;
   int ret = 1000;
   glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &ret);
   check_opengl_error("glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS)");
@@ -88,6 +89,7 @@ struct GLShader {
   }
 
   void compile(const std::string &source) const {
+    TI_AUTO_PROF;
     const GLchar *source_cstr = source.c_str();
     glShaderSource(id_, 1, &source_cstr, nullptr);
 
@@ -131,6 +133,7 @@ struct GLProgram {
   }
 
   void link() const {
+    TI_AUTO_PROF;
     TI_TRACE("glLinkProgram IN");
     glLinkProgram(id_);
     TI_TRACE("glLinkProgram OUT");
@@ -196,6 +199,7 @@ struct GLSSBO {
   void bind_data(void *data,
                  size_t size,
                  GLuint usage = GL_DYNAMIC_READ) const {
+    TI_AUTO_PROF;
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
     check_opengl_error("glBindBuffer");
     glBufferData(GL_SHADER_STORAGE_BUFFER, size, data, usage);
@@ -228,6 +232,7 @@ struct GLSSBO {
   }
 
   void *map(GLbitfield access = GL_READ_ONLY) const {
+    TI_AUTO_PROF;
     glBindBuffer(GL_SHADER_STORAGE_BUFFER, id_);
     check_opengl_error("glBindBuffer");
     void *p = glMapBuffer(GL_SHADER_STORAGE_BUFFER, access);
@@ -251,26 +256,29 @@ struct GLBuffer : GLSSBO {
 
   GLBuffer(GLBufId index, void *base, size_t size)
       : index(index), base(base), size(size) {
+    TI_AUTO_PROF;
     bind_data(base, size);
     bind_index((int)index);
   }
 
-  GLBuffer(GLBufId index)
-      : index(index), base(nullptr), size(0) {
+  GLBuffer(GLBufId index) : index(index), base(nullptr), size(0) {
     bind_index((int)index);
   }
 
   void copy_forward() {
+    TI_AUTO_PROF;
     bind_data(base, size);
   }
 
   void rebind(void *new_base, size_t new_size) {
+    TI_AUTO_PROF;
     base = new_base;
     size = new_size;
     bind_data(base, size);
   }
 
   void copy_back() {
+    TI_AUTO_PROF;
     if (!size)
       return;
     void *mapped = this->map();
@@ -288,6 +296,7 @@ struct GLBufferTable {
   }
 
   void add_buffer(GLBufId index, void *base, size_t size) {
+    TI_AUTO_PROF;
     bufs[index] = std::make_unique<GLBuffer>(index, base, size);
   }
 
@@ -338,6 +347,7 @@ size_t ParallelSize_ConstRange::get_num_groups(GLSLLauncher *launcher) const {
 }
 
 size_t ParallelSize_DynamicRange::get_num_groups(GLSLLauncher *launcher) const {
+  TI_AUTO_PROF;
   const size_t TPG = opengl_get_threads_per_group();
 
   size_t n;
@@ -474,6 +484,7 @@ struct CompiledKernel {
   }
 
   void dispatch_compute(GLSLLauncher *launcher) const {
+    TI_AUTO_PROF;
     int num_groups = ps->get_num_groups(launcher);
 
     glsl->use();
@@ -485,11 +496,17 @@ struct CompiledKernel {
     // `glDispatchCompute(X, Y, Z)`   - the X*Y*Z  == `Blocks`   in CUDA
     // `layout(local_size_x = X) in;` - the X      == `Threads`  in CUDA
     //
-    glDispatchCompute(num_groups, 1, 1);
-    check_opengl_error(fmt::format("glDispatchCompute({})", num_groups));
+    {
+      TI_PROFILER("glDispatchCompute");
+      glDispatchCompute(num_groups, 1, 1);
+      check_opengl_error("glDispatchCompute");
+    }
 
-    glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
-    check_opengl_error("glMemoryBarrier");
+    {
+      TI_PROFILER("glMemoryBarrier");
+      glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+      check_opengl_error("glMemoryBarrier");
+    }
   }
 };
 
@@ -568,19 +585,20 @@ struct CompiledProgram::Impl {
   }
 
   void launch(Context &ctx, GLSLLauncher *launcher) const {
-    GLBufferTable &bufs = launcher->impl->user_bufs;
+    TI_AUTO_PROF;
+    GLBufferTable &user_bufs = launcher->impl->user_bufs;
     std::vector<char> base_arr;
     std::vector<void *> saved_ctx_ptrs;
     // NOTE: these dirty codes are introduced by #694, TODO: RAII
-    /// DIRTY_BEGIN {{{
-    if (ext_arr_map.size()) {
-      bufs.add_buffer(GLBufId::Earg, ctx.extra_args,
-                      arg_count * taichi_max_num_args * sizeof(int));
+    if (ext_arr_map.size()) {  /// DIRTY_BEGIN {{{
+      TI_PROFILER("launch:ext_arr1");
+      user_bufs.add_buffer(GLBufId::Earg, ctx.extra_args,
+                           arg_count * taichi_max_num_args * sizeof(int));
       if (ext_arr_map.size() == 1) {  // zero-copy for only one ext_arr
         auto it = ext_arr_map.begin();
         auto extptr = (void *)ctx.args[it->first];
         ctx.args[it->first] = 0;
-        bufs.add_buffer(GLBufId::Extr, extptr, it->second);
+        user_bufs.add_buffer(GLBufId::Extr, extptr, it->second);
       } else {
         size_t accum_size = 0;
         std::vector<void *> ptrarr;
@@ -597,12 +615,13 @@ struct CompiledProgram::Impl {
           ctx.args[i] = accum_size;
           accum_size += size;
         }  // concat all extptr into my baseptr
-        bufs.add_buffer(GLBufId::Extr, baseptr, accum_size);
+        user_bufs.add_buffer(GLBufId::Extr, baseptr, accum_size);
       }
+    }  /// DIRTY_END }}}
+    auto n_args = std::max(arg_count, ret_count);
+    if (n_args) {
+      user_bufs.add_buffer(GLBufId::Args, ctx.args, n_args * sizeof(uint64_t));
     }
-    /// DIRTY_END }}}
-    bufs.add_buffer(GLBufId::Args, ctx.args,
-                    std::max(arg_count, ret_count) * sizeof(uint64_t));
     if (used.print) {
       auto runtime_buf = launcher->impl->core_bufs.get(GLBufId::Runtime);
       auto mapped = (GLSLRuntime *)runtime_buf->map();
@@ -612,10 +631,10 @@ struct CompiledProgram::Impl {
     for (const auto &ker : kernels) {
       ker->dispatch_compute(launcher);
     }
-    for (auto &[idx, buf] : launcher->impl->user_bufs.bufs) {
+    for (auto &[idx, buf] : user_bufs.bufs) {
       buf->copy_back();
     }
-    launcher->impl->user_bufs.clear();
+    user_bufs.clear();
     if (used.print) {
       dump_message_buffer(launcher);
     }