From a21254adadc7205d8609d22b42f6139f0ce864ef Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 14:05:31 +0800
Subject: [PATCH 01/15] base

---
 misc/load_time.py            |  7 ++++
 python/taichi/__init__.py    | 11 ++++++
 python/taichi/core/util.py   |  2 +-
 python/taichi/misc/timing.py | 77 ++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 misc/load_time.py
 create mode 100644 python/taichi/misc/timing.py

diff --git a/misc/load_time.py b/misc/load_time.py
new file mode 100644
index 0000000000000..51319dd3bae6e
--- /dev/null
+++ b/misc/load_time.py
@@ -0,0 +1,7 @@
+from time import time
+from colorama import Fore
+t = time()
+import taichi as ti
+print('\nTotal load time:', Fore.LIGHTRED_EX, (time() - t) * 1000, Fore.RESET, 'ms\n')
+
+ti.profiler.print()
diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index ad71b618da303..0531b2dab8dea 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -1,9 +1,16 @@
+from taichi.misc.timing import profiler
+
+profiler('main')
 from taichi.main import main
+profiler('core')
 from taichi.core import ti_core
 from taichi.core import start_memory_monitoring, is_release, package_root
+profiler('misc.util')
 from taichi.misc.util import vec, veci, set_gdb_trigger, print_profile_info, set_logging_level, info, warn, error, debug, trace, INFO, WARN, ERROR, DEBUG, TRACE
 from taichi.core.util import require_version
+profiler('tools')
 from taichi.tools import *
+profiler('misc.*')
 from taichi.misc import *
 from taichi.misc.gui import GUI
 from taichi.misc.np2ply import PLYWriter
@@ -14,9 +21,12 @@
 from taichi.misc.gui import rgb_to_hex
 from taichi.misc.error import *
 from taichi.misc.settings import *
+profiler('tools.{video,file}')
 from taichi.tools.video import VideoManager
 from taichi.tools.file import *
+profiler('lang')
 from taichi.lang import *
+profiler('torch_io')
 from .torch_io import from_torch, to_torch
 
 
@@ -29,3 +39,4 @@ def test():
 
 __version__ = (core.get_version_major(), core.get_version_minor(),
                core.get_version_patch())
+profiler()
diff --git a/python/taichi/core/util.py b/python/taichi/core/util.py
index 61f097a972a48..e78ad054d9333 100644
--- a/python/taichi/core/util.py
+++ b/python/taichi/core/util.py
@@ -312,7 +312,7 @@ def prepare_sandbox(src):
         sys.path.append(tmp_dir)
         try:
             import_ti_core(tmp_dir)
-        except Exception as e:
+        except ImportError as e:
             from colorama import Fore, Back, Style
             print_red_bold("Taichi core import failed: ", end='')
             print(e)
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
new file mode 100644
index 0000000000000..c2afcfb5973d1
--- /dev/null
+++ b/python/taichi/misc/timing.py
@@ -0,0 +1,77 @@
+import time
+from colorama import Fore, Back, Style
+
+
+def _m(sec):
+    ms = sec * 1000
+    if sec > 0.099:
+        return f'{Fore.LIGHTRED_EX}{sec:6.3f}{Fore.RESET}s'
+    else:
+        return f'{Fore.LIGHTMAGENTA_EX}{ms:5.2f}{Fore.RESET}ms'
+
+
+def _c(cnt):
+    return f'{Fore.MAGENTA}{cnt:3d}{Fore.RESET}x'
+
+
+def _n(name):
+    return f'{Fore.MAGENTA}{name}{Fore.RESET}'
+
+
+class PythonProfiler:
+    def __init__(self, timer=time.time):
+        self.timer = timer
+        self.records = {}
+        self.started = {}
+        self.last_started = None
+
+    def start(self, name):
+        t = self.timer()
+        self.started[name] = t
+        self.last_started = name
+
+    def stop(self, name=None):
+        name = name or self.last_started
+        t = self.timer()
+        diff = t - self.started[name]
+        if name not in self.records:
+            self.records[name] = []
+        self.records[name].append(diff)
+
+    def __call__(self, name=None):
+        if self.last_started:
+            self.stop()
+        if name:
+            self.start(name)
+
+    def print(self, name=None):
+        print('  min   |   avg   |   max   |  nr  |    name')
+        for name in self.records.keys() if name is None else [name]:
+            rec = self.records[name]
+            maximum = max(rec)
+            minimum = min(rec)
+            avg = sum(rec) / len(rec)
+            cnt = len(rec)
+            print(f'{_m(minimum)} | {_m(avg)} | {_m(maximum)} | {_c(cnt)} | {_n(name)}')
+
+    def timed(self, name):
+        if callable(name):
+            foo = name
+            name = foo.__name__
+            return decorator(name)
+
+        def decorator(foo):
+            def wrapped(*args, **kwargs):
+                self.start(name)
+                ret = foo(*args, **kwargs)
+                self.stop(name)
+                self.print(name)
+                return ret
+
+            return wrapped
+
+        return decorator
+
+
+
+profiler = PythonProfiler()

From 3babc4ea765d9e6fa9af72cf6eeb208bec4ea452 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 14:17:39 +0800
Subject: [PATCH 02/15] clean

---
 misc/load_time.py         |  9 ++++-----
 python/taichi/__init__.py | 11 +----------
 2 files changed, 5 insertions(+), 15 deletions(-)

diff --git a/misc/load_time.py b/misc/load_time.py
index 51319dd3bae6e..1b5956faf01c0 100644
--- a/misc/load_time.py
+++ b/misc/load_time.py
@@ -1,7 +1,6 @@
 from time import time
-from colorama import Fore
 t = time()
-import taichi as ti
-print('\nTotal load time:', Fore.LIGHTRED_EX, (time() - t) * 1000, Fore.RESET, 'ms\n')
-
-ti.profiler.print()
+import taichi
+t = time() - t
+from colorama import Fore
+print('\nTotal load time:', Fore.LIGHTRED_EX, t * 1000, Fore.RESET, 'ms\n')
diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 0531b2dab8dea..2173bc210384a 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -1,16 +1,9 @@
-from taichi.misc.timing import profiler
-
-profiler('main')
 from taichi.main import main
-profiler('core')
 from taichi.core import ti_core
 from taichi.core import start_memory_monitoring, is_release, package_root
-profiler('misc.util')
 from taichi.misc.util import vec, veci, set_gdb_trigger, print_profile_info, set_logging_level, info, warn, error, debug, trace, INFO, WARN, ERROR, DEBUG, TRACE
 from taichi.core.util import require_version
-profiler('tools')
 from taichi.tools import *
-profiler('misc.*')
 from taichi.misc import *
 from taichi.misc.gui import GUI
 from taichi.misc.np2ply import PLYWriter
@@ -21,12 +14,10 @@
 from taichi.misc.gui import rgb_to_hex
 from taichi.misc.error import *
 from taichi.misc.settings import *
-profiler('tools.{video,file}')
+from taichi.misc.timing import profiler
 from taichi.tools.video import VideoManager
 from taichi.tools.file import *
-profiler('lang')
 from taichi.lang import *
-profiler('torch_io')
 from .torch_io import from_torch, to_torch
 
 

From 9569fe302f8410f90a3501484d0aeda96405178c Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 14:41:32 +0800
Subject: [PATCH 03/15] misc/test_python_profiler.py

---
 misc/load_time.py            |  6 ---
 misc/test_python_profiler.py | 39 +++++++++++++++++++
 python/taichi/misc/timing.py | 72 ++++++++++++++++++++++++++++--------
 3 files changed, 96 insertions(+), 21 deletions(-)
 delete mode 100644 misc/load_time.py
 create mode 100644 misc/test_python_profiler.py

diff --git a/misc/load_time.py b/misc/load_time.py
deleted file mode 100644
index 1b5956faf01c0..0000000000000
--- a/misc/load_time.py
+++ /dev/null
@@ -1,6 +0,0 @@
-from time import time
-t = time()
-import taichi
-t = time() - t
-from colorama import Fore
-print('\nTotal load time:', Fore.LIGHTRED_EX, t * 1000, Fore.RESET, 'ms\n')
diff --git a/misc/test_python_profiler.py b/misc/test_python_profiler.py
new file mode 100644
index 0000000000000..6f50f8b6e741c
--- /dev/null
+++ b/misc/test_python_profiler.py
@@ -0,0 +1,39 @@
+import taichi as ti
+
+
+@ti.profiler.timed('foo', warmup=5)
+@ti.kernel
+def func0():
+    print(233)
+
+
+@ti.profiler.timed(warmup=5)
+@ti.kernel
+def func1():
+    print(233)
+
+
+@ti.profiler.timed('Func2')
+@ti.kernel
+def func2():
+    print(666)
+
+
+@ti.profiler.timed
+@ti.kernel
+def func3():
+    pass
+
+
+for k in range(5):
+    ti.profiler.start('step')
+    for i in range(10):
+        func0()
+        func1()
+        func2()
+        func3()
+
+    ti.profiler.stop('step')
+
+
+ti.profiler.print()
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
index c2afcfb5973d1..575fe1310d04e 100644
--- a/python/taichi/misc/timing.py
+++ b/python/taichi/misc/timing.py
@@ -1,4 +1,4 @@
-import time
+import time, functools
 from colorama import Fore, Back, Style
 
 
@@ -18,6 +18,31 @@ def _n(name):
     return f'{Fore.MAGENTA}{name}{Fore.RESET}'
 
 
+class RecordStatistics:
+    def __init__(self, rec):
+        self.rec = rec
+
+    @property
+    def min(self):
+        return min(self.rec)
+
+    @property
+    def max(self):
+        return max(self.rec)
+
+    @property
+    def total(self):
+        return sum(rec)
+
+    @property
+    def avg(self):
+        return sum(self.rec) / len(self.rec)
+
+    @property
+    def count(self):
+        return len(self.rec)
+
+
 class PythonProfiler:
     def __init__(self, timer=time.time):
         self.timer = timer
@@ -46,31 +71,48 @@ def __call__(self, name=None):
 
     def print(self, name=None):
         print('  min   |   avg   |   max   |  nr  |    name')
-        for name in self.records.keys() if name is None else [name]:
-            rec = self.records[name]
-            maximum = max(rec)
-            minimum = min(rec)
-            avg = sum(rec) / len(rec)
-            cnt = len(rec)
-            print(f'{_m(minimum)} | {_m(avg)} | {_m(maximum)} | {_c(cnt)} | {_n(name)}')
-
-    def timed(self, name):
+        if name is not None:
+            names = [name]
+        else:
+            def keyfunc(name):
+                rec = RecordStatistics(self.records[name])
+                return -rec.avg
+            names = sorted(self.records.keys(), key=keyfunc)
+        for name in names:
+            rec = RecordStatistics(self.records[name])
+            print(f'{_m(rec.min)} | {_m(rec.avg)} | {_m(rec.max)} | {_c(rec.count)} '
+                  f'| {_n(name)}')
+
+    def timed(self, name=None, warmup=0):
         if callable(name):
             foo = name
-            name = foo.__name__
-            return decorator(name)
+            name = None
+        else:
+            foo = None
+
+        def deco(foo):
+            if deco.name is None:
+                name = foo.__name__
+            else:
+                name = deco.name
 
-        def decorator(foo):
+            @functools.wraps(foo)
             def wrapped(*args, **kwargs):
+                if deco.warmup > 0:
+                    deco.warmup -= 1
+                    return foo(*args, **kwargs)
+
                 self.start(name)
                 ret = foo(*args, **kwargs)
                 self.stop(name)
-                self.print(name)
                 return ret
 
             return wrapped
 
-        return decorator
+        deco.name = name
+        deco.warmup = warmup
+
+        return deco(foo) if foo is not None else deco
 
 
 

From ef1b4bf7574f180ae24f755b9b460b07a496ebe0 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 15:04:21 +0800
Subject: [PATCH 04/15] mpm99_timed

---
 misc/mpm99_timed.py          | 106 +++++++++++++++++++++++++++++++++++
 python/taichi/misc/timing.py |  71 +++++++++++++----------
 2 files changed, 148 insertions(+), 29 deletions(-)
 create mode 100644 misc/mpm99_timed.py

diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
new file mode 100644
index 0000000000000..e34db46e613b3
--- /dev/null
+++ b/misc/mpm99_timed.py
@@ -0,0 +1,106 @@
+import taichi as ti
+import numpy as np
+ti.init(arch=ti.gpu) # Try to run on GPU
+quality = 1 # Use a larger value for higher-res simulations
+n_particles, n_grid = 9000 * quality ** 2, 128 * quality
+dx, inv_dx = 1 / n_grid, float(n_grid)
+dt = 1e-4 / quality
+p_vol, p_rho = (dx * 0.5)**2, 1
+p_mass = p_vol * p_rho
+E, nu = 0.1e4, 0.2 # Young's modulus and Poisson's ratio
+mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / ((1+nu) * (1 - 2 * nu)) # Lame parameters
+x = ti.Vector(2, dt=ti.f32, shape=n_particles) # position
+v = ti.Vector(2, dt=ti.f32, shape=n_particles) # velocity
+C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles) # affine velocity field
+F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles) # deformation gradient
+material = ti.var(dt=ti.i32, shape=n_particles) # material id
+Jp = ti.var(dt=ti.f32, shape=n_particles) # plastic deformation
+grid_v = ti.Vector(2, dt=ti.f32, shape=(n_grid, n_grid)) # grid node momentum/velocity
+grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid)) # grid node mass
+
+@ti.profiler.timed(warmup=5)
+@ti.kernel
+def substep():
+  for i, j in grid_m:
+    grid_v[i, j] = [0, 0]
+    grid_m[i, j] = 0
+  for p in x: # Particle state update and scatter to grid (P2G)
+    base = (x[p] * inv_dx - 0.5).cast(int)
+    fx = x[p] * inv_dx - base.cast(float)
+    # Quadratic kernels  [http://mpm.graphics   Eqn. 123, with x=fx, fx-1,fx-2]
+    w = [0.5 * (1.5 - fx) ** 2, 0.75 - (fx - 1) ** 2, 0.5 * (fx - 0.5) ** 2]
+    F[p] = (ti.Matrix.identity(ti.f32, 2) + dt * C[p]) @ F[p] # deformation gradient update
+    h = ti.exp(10 * (1.0 - Jp[p])) # Hardening coefficient: snow gets harder when compressed
+    if material[p] == 1: # jelly, make it softer
+      h = 0.3
+    mu, la = mu_0 * h, lambda_0 * h
+    if material[p] == 0: # liquid
+      mu = 0.0
+    U, sig, V = ti.svd(F[p])
+    J = 1.0
+    for d in ti.static(range(2)):
+      new_sig = sig[d, d]
+      if material[p] == 2:  # Snow
+        new_sig = min(max(sig[d, d], 1 - 2.5e-2), 1 + 4.5e-3)  # Plasticity
+      Jp[p] *= sig[d, d] / new_sig
+      sig[d, d] = new_sig
+      J *= new_sig
+    if material[p] == 0:  # Reset deformation gradient to avoid numerical instability
+      F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
+    elif material[p] == 2:
+      F[p] = U @ sig @ V.T() # Reconstruct elastic deformation gradient after plasticity
+    stress = 2 * mu * (F[p] - U @ V.T()) @ F[p].T() + ti.Matrix.identity(ti.f32, 2) * la * J * (J - 1)
+    stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
+    affine = stress + p_mass * C[p]
+    for i, j in ti.static(ti.ndrange(3, 3)): # Loop over 3x3 grid node neighborhood
+      offset = ti.Vector([i, j])
+      dpos = (offset.cast(float) - fx) * dx
+      weight = w[i][0] * w[j][1]
+      grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+      grid_m[base + offset] += weight * p_mass
+  for i, j in grid_m:
+    if grid_m[i, j] > 0: # No need for epsilon here
+      grid_v[i, j] = (1 / grid_m[i, j]) * grid_v[i, j] # Momentum to velocity
+      grid_v[i, j][1] -= dt * 50 # gravity
+      if i < 3 and grid_v[i, j][0] < 0:          grid_v[i, j][0] = 0 # Boundary conditions
+      if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
+      if j < 3 and grid_v[i, j][1] < 0:          grid_v[i, j][1] = 0
+      if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
+  for p in x: # grid to particle (G2P)
+    base = (x[p] * inv_dx - 0.5).cast(int)
+    fx = x[p] * inv_dx - base.cast(float)
+    w = [0.5 * (1.5 - fx) ** 2, 0.75 - (fx - 1.0) ** 2, 0.5 * (fx - 0.5) ** 2]
+    new_v = ti.Vector.zero(ti.f32, 2)
+    new_C = ti.Matrix.zero(ti.f32, 2, 2)
+    for i, j in ti.static(ti.ndrange(3, 3)): # loop over 3x3 grid node neighborhood
+      dpos = ti.Vector([i, j]).cast(float) - fx
+      g_v = grid_v[base + ti.Vector([i, j])]
+      weight = w[i][0] * w[j][1]
+      new_v += weight * g_v
+      new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
+    v[p], C[p] = new_v, new_C
+    x[p] += dt * v[p] # advection
+
+group_size = n_particles // 3
+@ti.kernel
+def initialize():
+  for i in range(n_particles):
+    x[i] = [ti.random() * 0.2 + 0.3 + 0.10 * (i // group_size), ti.random() * 0.2 + 0.05 + 0.32 * (i // group_size)]
+    material[i] = i // group_size # 0: fluid 1: jelly 2: snow
+    v[i] = ti.Matrix([0, 0])
+    F[i] = ti.Matrix([[1, 0], [0, 1]])
+    Jp[i] = 1
+initialize()
+gui = ti.GUI("Taichi MLS-MPM-99", res=512, background_color=0x112F41)
+for frame in range(100):
+  ti.profiler('total_step')
+  for s in range(int(2e-3 // dt)):
+    substep()
+  colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
+  ti.profiler('circles', warmup=5)
+  gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
+  ti.profiler('show', warmup=5)
+  gui.show()
+  ti.profiler()
+
+ti.profiler.print()
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
index 575fe1310d04e..b9ddf64a88d9f 100644
--- a/python/taichi/misc/timing.py
+++ b/python/taichi/misc/timing.py
@@ -3,15 +3,21 @@
 
 
 def _m(sec):
-    ms = sec * 1000
-    if sec > 0.099:
+    if sec > 9.9:
+        return f'{Fore.LIGHTRED_EX}{sec:6.1f}{Fore.RESET}s'
+    elif sec > 0.099:
         return f'{Fore.LIGHTRED_EX}{sec:6.3f}{Fore.RESET}s'
     else:
-        return f'{Fore.LIGHTMAGENTA_EX}{ms:5.2f}{Fore.RESET}ms'
+        ms = sec * 1000
+        if ms > 0.099:
+            return f'{Fore.LIGHTMAGENTA_EX}{ms:5.2f}{Fore.RESET}ms'
+        else:
+            ns = ms * 1000
+            return f'{Fore.BLUE}{ns:5.2f}{Fore.RESET}ns'
 
 
 def _c(cnt):
-    return f'{Fore.MAGENTA}{cnt:3d}{Fore.RESET}x'
+    return f'{Fore.MAGENTA}{cnt:4d}{Fore.RESET}x'
 
 
 def _n(name):
@@ -19,8 +25,13 @@ def _n(name):
 
 
 class RecordStatistics:
-    def __init__(self, rec):
-        self.rec = rec
+    def __init__(self, profiler, name):
+        self.record = profiler.records[name]
+        self.options = profiler.options[name]
+        self.rec = self.record
+
+        warmup = self.options.get('warmup', 0)
+        self.rec = self.rec[warmup:]
 
     @property
     def min(self):
@@ -32,7 +43,7 @@ def max(self):
 
     @property
     def total(self):
-        return sum(rec)
+        return sum(self.rec)
 
     @property
     def avg(self):
@@ -48,71 +59,73 @@ def __init__(self, timer=time.time):
         self.timer = timer
         self.records = {}
         self.started = {}
+        self.options = {}
         self.last_started = None
 
-    def start(self, name):
+    def start(self, name, **options):
+        self.last_started = name
+        self.options[name] = self.options.get(name, options)
         t = self.timer()
         self.started[name] = t
-        self.last_started = name
 
     def stop(self, name=None):
-        name = name or self.last_started
         t = self.timer()
+        if not name and self.last_started:
+            name = self.last_started
+            self.last_started = None
         diff = t - self.started[name]
+        del self.started[name]
+        self.last_started = None
         if name not in self.records:
             self.records[name] = []
         self.records[name].append(diff)
 
-    def __call__(self, name=None):
+    def __call__(self, name=None, **options):
         if self.last_started:
             self.stop()
-        if name:
-            self.start(name)
+        if name is not None:
+            self.start(name, **options)
 
     def print(self, name=None):
-        print('  min   |   avg   |   max   |  nr  |    name')
+        print('  min   |   avg   |   max   |  num  |  total  |    name')
         if name is not None:
             names = [name]
         else:
             def keyfunc(name):
-                rec = RecordStatistics(self.records[name])
+                rec = RecordStatistics(self, name)
                 return -rec.avg
             names = sorted(self.records.keys(), key=keyfunc)
         for name in names:
-            rec = RecordStatistics(self.records[name])
+            rec = RecordStatistics(self, name)
             print(f'{_m(rec.min)} | {_m(rec.avg)} | {_m(rec.max)} | {_c(rec.count)} '
-                  f'| {_n(name)}')
+                  f'| {_m(rec.total)} | {_n(name)}')
 
-    def timed(self, name=None, warmup=0):
+    def timed(self, name=None, **options):
         if callable(name):
             foo = name
             name = None
         else:
             foo = None
 
-        def deco(foo):
-            if deco.name is None:
+        def decorator(foo):
+            if decorator.name is None:
                 name = foo.__name__
             else:
-                name = deco.name
+                name = decorator.name
 
             @functools.wraps(foo)
             def wrapped(*args, **kwargs):
-                if deco.warmup > 0:
-                    deco.warmup -= 1
-                    return foo(*args, **kwargs)
-
-                self.start(name)
+                self.start(name, **decorator.options)
                 ret = foo(*args, **kwargs)
                 self.stop(name)
                 return ret
 
             return wrapped
 
-        deco.name = name
-        deco.warmup = warmup
+        decorator.name = name
+        decorator.options = options
 
-        return deco(foo) if foo is not None else deco
+        return decorator(foo) if foo is not None else decorator
 
 
 

From b3e6dd1751294dea5b70ae730a664d93b1999aad Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 15:07:01 +0800
Subject: [PATCH 05/15] total sort

---
 misc/mpm99_timed.py          | 5 ++---
 python/taichi/misc/timing.py | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
index e34db46e613b3..69ec15edc72bd 100644
--- a/misc/mpm99_timed.py
+++ b/misc/mpm99_timed.py
@@ -93,13 +93,12 @@ def initialize():
 initialize()
 gui = ti.GUI("Taichi MLS-MPM-99", res=512, background_color=0x112F41)
 for frame in range(100):
-  ti.profiler('total_step')
   for s in range(int(2e-3 // dt)):
     substep()
   colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-  ti.profiler('circles', warmup=5)
+  ti.profiler('gui.circles', warmup=5)
   gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-  ti.profiler('show', warmup=5)
+  ti.profiler('gui.show', warmup=5)
   gui.show()
   ti.profiler()
 
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
index b9ddf64a88d9f..a3b34f20d91ab 100644
--- a/python/taichi/misc/timing.py
+++ b/python/taichi/misc/timing.py
@@ -93,7 +93,7 @@ def print(self, name=None):
         else:
             def keyfunc(name):
                 rec = RecordStatistics(self, name)
-                return -rec.avg
+                return -rec.total
             names = sorted(self.records.keys(), key=keyfunc)
         for name in names:
             rec = RecordStatistics(self, name)

From b581ceebf72635fe9d2c25270140d05f4d384148 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 15:32:48 +0800
Subject: [PATCH 06/15] update doc

---
 a.py              |  19 +++++
 docs/profiler.rst | 181 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 183 insertions(+), 17 deletions(-)
 create mode 100644 a.py

diff --git a/a.py b/a.py
new file mode 100644
index 0000000000000..11cb38021ee2c
--- /dev/null
+++ b/a.py
@@ -0,0 +1,19 @@
+from time import sleep
+import taichi as ti
+ti.init()
+
+def do_something_A():
+    sleep(0.01)
+
+def do_something_B():
+    sleep(0.1)
+
+ti.profiler.start('A')
+do_something_A()
+ti.profiler.stop('A')
+
+ti.profiler.start('B')
+do_something_B()
+ti.profiler.stop('B')
+
+ti.profiler.print()
diff --git a/docs/profiler.rst b/docs/profiler.rst
index f0295de1500b1..8248a8f39c69c 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -2,45 +2,157 @@
 Profiler
 ========
 
-Taichi's profiler can help you analyze the run-time cost of your program. There are two profiling systems in Taichi: ``ScopedProfiler`` and ``KernelProfiler``.
+Taichi's profiler can help you analyze the run-time cost of your program.
 
-ScopedProfiler
+Currently there are three profiling systems in Taichi:
+
+- ``PythonProfiler``
+- ``KernelProfiler``
+- ``ScopedProfiler`` (for developers)
+
+
+
+PythonProfiler
 ##############
 
-1. ``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
+``PythonProfiler`` basically measures time spent between ``start()`` and ``stop()`` using
+the Python-builtin function ``time.time()``.
+
+There are 3 ways to use this profiler:
 
-2. This profiler is automatically on. To show its results, call ``ti.print_profile_info()``. For example:
+1. ``ti.profiler.start()`` and ``ti.profiler.stop()``, these are the most fundemental APIs:
 
 .. code-block:: python
 
+    from time import sleep
     import taichi as ti
+    ti.init()
 
-    ti.init(arch=ti.cpu)
-    var = ti.var(ti.f32, shape=1)
+    def do_something_A():
+        sleep(0.01)
 
+    def do_something_B():
+        sleep(0.1)
 
-    @ti.kernel
-    def compute():
-        var[0] = 1.0
-        print("Setting var[0] =", var[0])
+    ti.profiler.start('A')
+    do_something_A()
+    ti.profiler.stop('A')
 
+    ti.profiler.start('B')
+    do_something_B()
+    ti.profiler.stop('B')
 
-    compute()
-    ti.print_profile_info()
+    ti.profiler.print()
 
 
-``ti.print_profile_info()`` prints profiling results in a hierarchical format.
+2. ``ti.profiler()``, this one makes our code cleaner:
 
-.. Note::
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    ti.profiler('A')  # start('A')
+    do_something_A()
+
+    ti.profiler('B')  # stop('A') & start('B')
+    do_something_B()
+    ti.profiler()     # stop('B')
+
+    ti.profiler.print()
+
+
+.. note::
+
+    Running 1 and 2 should obtain something like:
+
+    .. code-block:: none
+
+          min   |   avg   |   max   |  num  |  total  |    name
+         0.100s |  0.100s |  0.100s |    1x |  0.100s | B
+        10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | A
+
+
+3. ``@ti.profiler.timed``, this one is very intuitive when profiling kernels:
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    @ti.profiler.timed
+    def do_something_A():
+        sleep(0.01)
+
+    @ti.profiler.timed
+    def do_something_B():
+        sleep(0.1)
+
+    do_something_A()
+    do_something_B()
+
+    ti.profiler.print()
+
+
+When combining ``@ti.profiler.timed`` with other decorators like ``@ti.kernel``,
+then ``@ti.profiler.timed`` should be put **above** it, e.g.:
+
+.. code-block:: python
+
+        @ti.profiler.timed
+        @ti.kernel
+        def substep():
+            ...
+
+
+.. note::
+
+    Running 3 should obtain something like:
+
+    .. code-block:: none
+
+          min   |   avg   |   max   |  num  |  total  |    name
+         0.100s |  0.100s |  0.100s |    1x |  0.100s | do_something_B
+        10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | do_something_A
+
+
+See `misc/mpm99_timed.py <https://github.com/taichi-dev/taichi/blob/master/misc/mpm99_timed.py>`_ for their usage example.
+
+
+.. warning::
+
+    ``ti.profiler``, i.e. ``PythonProfiler``, **only works in Python-scope**, e.g.::
+
+        @ti.func
+        def substep():
+            ti.profiler.start('hello')  # won't work as you expected...
+            ...
+            ti.profiler.stop('hello')
+
+        @ti.profiler.timed  # won't work as you expected...
+        @ti.func
+        def hello():
+            ...
+
+    To do profiling **inside Taichi-scope**, please see the ``KernelProfiler`` section below.
 
-    ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
 
 KernelProfiler
 ##############
 
-1. ``KernelProfiler`` records the costs of Taichi kernels on devices. To enable this profiler, set ``kernel_profiler=True`` in ``ti.init``.
+``KernelProfiler`` records the costs of Taichi kernels on devices.
 
-2. Call ``ti.kernel_profiler_print()`` to show the kernel profiling result. For example:
+To enable this profiler, please initialize Taichi using ``ti.init(kernel_profiler=True)``.
+
+Call ``ti.kernel_profiler_print()`` to show the kernel profiling result. For example:
 
 .. code-block:: python
     :emphasize-lines: 3, 13
@@ -67,3 +179,38 @@ The outputs would be:
     [ 22.73%] jit_evaluator_0_kernel_0_serial             min   0.001 ms   avg   0.001 ms   max   0.001 ms   total   0.000 s [      1x]
     [  0.00%] jit_evaluator_1_kernel_1_serial             min   0.000 ms   avg   0.000 ms   max   0.000 ms   total   0.000 s [      1x]
     [ 77.27%] compute_c4_0_kernel_2_serial                min   0.004 ms   avg   0.004 ms   max   0.004 ms   total   0.000 s [      1x]
+
+
+ScopedProfiler
+##############
+
+``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
+
+This profiler is automatically on.
+
+To show its results, call ``ti.print_profile_info()``. For example:
+
+.. code-block:: python
+
+    import taichi as ti
+
+    ti.init(arch=ti.cpu)
+    var = ti.var(ti.f32, shape=1)
+
+
+    @ti.kernel
+    def compute():
+        var[0] = 1.0
+        print("Setting var[0] =", var[0])
+
+
+    compute()
+    ti.print_profile_info()
+
+
+``ti.print_profile_info()`` prints profiling results in a hierarchical format.
+
+.. Note::
+
+    ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
+

From 6be7061f22edbfb22abb469f58aed01f40bbbdba Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 15:55:44 +0800
Subject: [PATCH 07/15] doc warmup=5

---
 a.py                |  19 --------
 docs/profiler.rst   | 105 +++++++++++++++++++++++++++++++++++++++-----
 misc/mpm99_timed.py |   2 +-
 3 files changed, 94 insertions(+), 32 deletions(-)
 delete mode 100644 a.py

diff --git a/a.py b/a.py
deleted file mode 100644
index 11cb38021ee2c..0000000000000
--- a/a.py
+++ /dev/null
@@ -1,19 +0,0 @@
-from time import sleep
-import taichi as ti
-ti.init()
-
-def do_something_A():
-    sleep(0.01)
-
-def do_something_B():
-    sleep(0.1)
-
-ti.profiler.start('A')
-do_something_A()
-ti.profiler.stop('A')
-
-ti.profiler.start('B')
-do_something_B()
-ti.profiler.stop('B')
-
-ti.profiler.print()
diff --git a/docs/profiler.rst b/docs/profiler.rst
index 8248a8f39c69c..45193fb0d0c48 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -2,7 +2,7 @@
 Profiler
 ========
 
-Taichi's profiler can help you analyze the run-time cost of your program.
+Taichi's profiler can help you analyze the run-time cost of your program, i.e. performance.
 
 Currently there are three profiling systems in Taichi:
 
@@ -13,11 +13,14 @@ Currently there are three profiling systems in Taichi:
 
 
 PythonProfiler
-##############
+--------------
 
 ``PythonProfiler`` basically measures time spent between ``start()`` and ``stop()`` using
 the Python-builtin function ``time.time()``.
 
+Profiling APIs
+**************
+
 There are 3 ways to use this profiler:
 
 1. ``ti.profiler.start()`` and ``ti.profiler.stop()``, these are the most fundemental APIs:
@@ -102,15 +105,17 @@ There are 3 ways to use this profiler:
     ti.profiler.print()
 
 
-When combining ``@ti.profiler.timed`` with other decorators like ``@ti.kernel``,
-then ``@ti.profiler.timed`` should be put **above** it, e.g.:
+.. warning::
 
-.. code-block:: python
+    When combining ``@ti.profiler.timed`` with other decorators like ``@ti.kernel``,
+    then ``@ti.profiler.timed`` should be put **above** it, e.g.:
 
-        @ti.profiler.timed
-        @ti.kernel
-        def substep():
-            ...
+    .. code-block:: python
+
+            @ti.profiler.timed
+            @ti.kernel
+            def substep():
+                ...
 
 
 .. note::
@@ -124,7 +129,83 @@ then ``@ti.profiler.timed`` should be put **above** it, e.g.:
         10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | do_something_A
 
 
-See `misc/mpm99_timed.py <https://github.com/taichi-dev/taichi/blob/master/misc/mpm99_timed.py>`_ for their usage example.
+Recording multiple entries
+**************************
+
+When a same **name** is used for multiple times, then they will be merged into one, e.g.:
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    ti.profiler.start('A')
+    do_something_A()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('A')
+    do_something_B()
+    ti.profiler.stop('A')
+
+    ti.profiler.print()
+
+will obtain:
+
+.. code-block:: none
+
+      min   |   avg   |   max   |  num  |  total  |    name
+    10.10ms | 55.12ms |  0.100s |    2x |  0.110s | A
+
+
+- ``min`` is the minimum time in records.
+- ``avg`` is the average time of records.
+- ``max`` is the maximum time in records.
+- ``num`` is the number of record entries.
+- ``total`` is the total costed time of records.
+
+
+Profiler options
+****************
+
+Due to Taichi's JIT mechinism, a kernel will be **compiled** on its first invocation.
+So the first record will be extremely long compared to the following records since it
+**involves both compile time and execution time**, e.g.:
+
+.. code-block:: none
+
+       min   |   avg   |   max   |  num  |  total  |    name
+      2.37ms |  3.79ms |  1.615s | 1900x |  7.204s | substep
+
+.. code-block:: none
+
+       min   |   avg   |   max   |  num  |  total  |    name
+      2.37ms |  2.95ms | 12.70ms | 1895x |  5.592s | substep
+
+
+As you see, this make our result inaccurate, especially the ``max`` column.
+
+To avoid this, you may specify a ``warmup`` option to ``ti.profiler``, e.g.:
+
+.. code-block:: python
+
+    @ti.profiler.timed(warmup=5)
+    @ti.kernel
+    def substep():
+        ...
+
+
+Set ``warmup=5`` for example, will **discard** the first 5 record entries.
+I.e. discard the kernel compile time and possible TLB and cache misses on start up.
+
+
+Check out `misc/mpm99_timed.py <https://github.com/taichi-dev/taichi/blob/master/misc/mpm99_timed.py>`_ for sum up usage example.
 
 
 .. warning::
@@ -146,7 +227,7 @@ See `misc/mpm99_timed.py <https://github.com/taichi-dev/taichi/blob/master/misc/
 
 
 KernelProfiler
-##############
+--------------
 
 ``KernelProfiler`` records the costs of Taichi kernels on devices.
 
@@ -182,7 +263,7 @@ The outputs would be:
 
 
 ScopedProfiler
-##############
+--------------
 
 ``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
 
diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
index 69ec15edc72bd..341fed792a24d 100644
--- a/misc/mpm99_timed.py
+++ b/misc/mpm99_timed.py
@@ -18,7 +18,7 @@
 grid_v = ti.Vector(2, dt=ti.f32, shape=(n_grid, n_grid)) # grid node momentum/velocity
 grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid)) # grid node mass
 
-@ti.profiler.timed(warmup=5)
+@ti.profiler.timed(warmup=10)
 @ti.kernel
 def substep():
   for i, j in grid_m:

From 2e8507ff688a894a7aca4b33668ad3f20f056d47 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Tue, 14 Jul 2020 22:01:54 +0800
Subject: [PATCH 08/15] [skip ci] rev

---
 python/taichi/__init__.py  | 1 -
 python/taichi/core/util.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 2173bc210384a..e64206090cff4 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -30,4 +30,3 @@ def test():
 
 __version__ = (core.get_version_major(), core.get_version_minor(),
                core.get_version_patch())
-profiler()
diff --git a/python/taichi/core/util.py b/python/taichi/core/util.py
index e78ad054d9333..61f097a972a48 100644
--- a/python/taichi/core/util.py
+++ b/python/taichi/core/util.py
@@ -312,7 +312,7 @@ def prepare_sandbox(src):
         sys.path.append(tmp_dir)
         try:
             import_ti_core(tmp_dir)
-        except ImportError as e:
+        except Exception as e:
             from colorama import Fore, Back, Style
             print_red_bold("Taichi core import failed: ", end='')
             print(e)

From 234a70c501bca758fad71d6ddb7accf24cebb3b1 Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Tue, 14 Jul 2020 10:02:21 -0400
Subject: [PATCH 09/15] [skip ci] enforce code format

---
 docs/profiler.rst            |   1 -
 misc/mpm99_timed.py          | 196 ++++++++++++++++++++---------------
 misc/test_python_profiler.py |   1 -
 python/taichi/misc/timing.py |   8 +-
 4 files changed, 115 insertions(+), 91 deletions(-)

diff --git a/docs/profiler.rst b/docs/profiler.rst
index 45193fb0d0c48..78743a89cd652 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -294,4 +294,3 @@ To show its results, call ``ti.print_profile_info()``. For example:
 .. Note::
 
     ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
-
diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
index 341fed792a24d..d92259521a750 100644
--- a/misc/mpm99_timed.py
+++ b/misc/mpm99_timed.py
@@ -1,105 +1,129 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu) # Try to run on GPU
-quality = 1 # Use a larger value for higher-res simulations
-n_particles, n_grid = 9000 * quality ** 2, 128 * quality
+ti.init(arch=ti.gpu)  # Try to run on GPU
+quality = 1  # Use a larger value for higher-res simulations
+n_particles, n_grid = 9000 * quality**2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
 dt = 1e-4 / quality
 p_vol, p_rho = (dx * 0.5)**2, 1
 p_mass = p_vol * p_rho
-E, nu = 0.1e4, 0.2 # Young's modulus and Poisson's ratio
-mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / ((1+nu) * (1 - 2 * nu)) # Lame parameters
-x = ti.Vector(2, dt=ti.f32, shape=n_particles) # position
-v = ti.Vector(2, dt=ti.f32, shape=n_particles) # velocity
-C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles) # affine velocity field
-F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles) # deformation gradient
-material = ti.var(dt=ti.i32, shape=n_particles) # material id
-Jp = ti.var(dt=ti.f32, shape=n_particles) # plastic deformation
-grid_v = ti.Vector(2, dt=ti.f32, shape=(n_grid, n_grid)) # grid node momentum/velocity
-grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid)) # grid node mass
+E, nu = 0.1e4, 0.2  # Young's modulus and Poisson's ratio
+mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / (
+    (1 + nu) * (1 - 2 * nu))  # Lame parameters
+x = ti.Vector(2, dt=ti.f32, shape=n_particles)  # position
+v = ti.Vector(2, dt=ti.f32, shape=n_particles)  # velocity
+C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # affine velocity field
+F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # deformation gradient
+material = ti.var(dt=ti.i32, shape=n_particles)  # material id
+Jp = ti.var(dt=ti.f32, shape=n_particles)  # plastic deformation
+grid_v = ti.Vector(2, dt=ti.f32,
+                   shape=(n_grid, n_grid))  # grid node momentum/velocity
+grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))  # grid node mass
+
 
 @ti.profiler.timed(warmup=10)
 @ti.kernel
 def substep():
-  for i, j in grid_m:
-    grid_v[i, j] = [0, 0]
-    grid_m[i, j] = 0
-  for p in x: # Particle state update and scatter to grid (P2G)
-    base = (x[p] * inv_dx - 0.5).cast(int)
-    fx = x[p] * inv_dx - base.cast(float)
-    # Quadratic kernels  [http://mpm.graphics   Eqn. 123, with x=fx, fx-1,fx-2]
-    w = [0.5 * (1.5 - fx) ** 2, 0.75 - (fx - 1) ** 2, 0.5 * (fx - 0.5) ** 2]
-    F[p] = (ti.Matrix.identity(ti.f32, 2) + dt * C[p]) @ F[p] # deformation gradient update
-    h = ti.exp(10 * (1.0 - Jp[p])) # Hardening coefficient: snow gets harder when compressed
-    if material[p] == 1: # jelly, make it softer
-      h = 0.3
-    mu, la = mu_0 * h, lambda_0 * h
-    if material[p] == 0: # liquid
-      mu = 0.0
-    U, sig, V = ti.svd(F[p])
-    J = 1.0
-    for d in ti.static(range(2)):
-      new_sig = sig[d, d]
-      if material[p] == 2:  # Snow
-        new_sig = min(max(sig[d, d], 1 - 2.5e-2), 1 + 4.5e-3)  # Plasticity
-      Jp[p] *= sig[d, d] / new_sig
-      sig[d, d] = new_sig
-      J *= new_sig
-    if material[p] == 0:  # Reset deformation gradient to avoid numerical instability
-      F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
-    elif material[p] == 2:
-      F[p] = U @ sig @ V.T() # Reconstruct elastic deformation gradient after plasticity
-    stress = 2 * mu * (F[p] - U @ V.T()) @ F[p].T() + ti.Matrix.identity(ti.f32, 2) * la * J * (J - 1)
-    stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
-    affine = stress + p_mass * C[p]
-    for i, j in ti.static(ti.ndrange(3, 3)): # Loop over 3x3 grid node neighborhood
-      offset = ti.Vector([i, j])
-      dpos = (offset.cast(float) - fx) * dx
-      weight = w[i][0] * w[j][1]
-      grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
-      grid_m[base + offset] += weight * p_mass
-  for i, j in grid_m:
-    if grid_m[i, j] > 0: # No need for epsilon here
-      grid_v[i, j] = (1 / grid_m[i, j]) * grid_v[i, j] # Momentum to velocity
-      grid_v[i, j][1] -= dt * 50 # gravity
-      if i < 3 and grid_v[i, j][0] < 0:          grid_v[i, j][0] = 0 # Boundary conditions
-      if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
-      if j < 3 and grid_v[i, j][1] < 0:          grid_v[i, j][1] = 0
-      if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
-  for p in x: # grid to particle (G2P)
-    base = (x[p] * inv_dx - 0.5).cast(int)
-    fx = x[p] * inv_dx - base.cast(float)
-    w = [0.5 * (1.5 - fx) ** 2, 0.75 - (fx - 1.0) ** 2, 0.5 * (fx - 0.5) ** 2]
-    new_v = ti.Vector.zero(ti.f32, 2)
-    new_C = ti.Matrix.zero(ti.f32, 2, 2)
-    for i, j in ti.static(ti.ndrange(3, 3)): # loop over 3x3 grid node neighborhood
-      dpos = ti.Vector([i, j]).cast(float) - fx
-      g_v = grid_v[base + ti.Vector([i, j])]
-      weight = w[i][0] * w[j][1]
-      new_v += weight * g_v
-      new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
-    v[p], C[p] = new_v, new_C
-    x[p] += dt * v[p] # advection
+    for i, j in grid_m:
+        grid_v[i, j] = [0, 0]
+        grid_m[i, j] = 0
+    for p in x:  # Particle state update and scatter to grid (P2G)
+        base = (x[p] * inv_dx - 0.5).cast(int)
+        fx = x[p] * inv_dx - base.cast(float)
+        # Quadratic kernels  [http://mpm.graphics   Eqn. 123, with x=fx, fx-1,fx-2]
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+        F[p] = (ti.Matrix.identity(ti.f32, 2) +
+                dt * C[p]) @ F[p]  # deformation gradient update
+        h = ti.exp(
+            10 *
+            (1.0 -
+             Jp[p]))  # Hardening coefficient: snow gets harder when compressed
+        if material[p] == 1:  # jelly, make it softer
+            h = 0.3
+        mu, la = mu_0 * h, lambda_0 * h
+        if material[p] == 0:  # liquid
+            mu = 0.0
+        U, sig, V = ti.svd(F[p])
+        J = 1.0
+        for d in ti.static(range(2)):
+            new_sig = sig[d, d]
+            if material[p] == 2:  # Snow
+                new_sig = min(max(sig[d, d], 1 - 2.5e-2),
+                              1 + 4.5e-3)  # Plasticity
+            Jp[p] *= sig[d, d] / new_sig
+            sig[d, d] = new_sig
+            J *= new_sig
+        if material[
+                p] == 0:  # Reset deformation gradient to avoid numerical instability
+            F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
+        elif material[p] == 2:
+            F[p] = U @ sig @ V.T(
+            )  # Reconstruct elastic deformation gradient after plasticity
+        stress = 2 * mu * (F[p] - U @ V.T()) @ F[p].T() + ti.Matrix.identity(
+            ti.f32, 2) * la * J * (J - 1)
+        stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
+        affine = stress + p_mass * C[p]
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # Loop over 3x3 grid node neighborhood
+            offset = ti.Vector([i, j])
+            dpos = (offset.cast(float) - fx) * dx
+            weight = w[i][0] * w[j][1]
+            grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+            grid_m[base + offset] += weight * p_mass
+    for i, j in grid_m:
+        if grid_m[i, j] > 0:  # No need for epsilon here
+            grid_v[i,
+                   j] = (1 / grid_m[i, j]) * grid_v[i,
+                                                    j]  # Momentum to velocity
+            grid_v[i, j][1] -= dt * 50  # gravity
+            if i < 3 and grid_v[i, j][0] < 0:
+                grid_v[i, j][0] = 0  # Boundary conditions
+            if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
+            if j < 3 and grid_v[i, j][1] < 0: grid_v[i, j][1] = 0
+            if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
+    for p in x:  # grid to particle (G2P)
+        base = (x[p] * inv_dx - 0.5).cast(int)
+        fx = x[p] * inv_dx - base.cast(float)
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
+        new_v = ti.Vector.zero(ti.f32, 2)
+        new_C = ti.Matrix.zero(ti.f32, 2, 2)
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # loop over 3x3 grid node neighborhood
+            dpos = ti.Vector([i, j]).cast(float) - fx
+            g_v = grid_v[base + ti.Vector([i, j])]
+            weight = w[i][0] * w[j][1]
+            new_v += weight * g_v
+            new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
+        v[p], C[p] = new_v, new_C
+        x[p] += dt * v[p]  # advection
+
 
 group_size = n_particles // 3
+
+
 @ti.kernel
 def initialize():
-  for i in range(n_particles):
-    x[i] = [ti.random() * 0.2 + 0.3 + 0.10 * (i // group_size), ti.random() * 0.2 + 0.05 + 0.32 * (i // group_size)]
-    material[i] = i // group_size # 0: fluid 1: jelly 2: snow
-    v[i] = ti.Matrix([0, 0])
-    F[i] = ti.Matrix([[1, 0], [0, 1]])
-    Jp[i] = 1
+    for i in range(n_particles):
+        x[i] = [
+            ti.random() * 0.2 + 0.3 + 0.10 * (i // group_size),
+            ti.random() * 0.2 + 0.05 + 0.32 * (i // group_size)
+        ]
+        material[i] = i // group_size  # 0: fluid 1: jelly 2: snow
+        v[i] = ti.Matrix([0, 0])
+        F[i] = ti.Matrix([[1, 0], [0, 1]])
+        Jp[i] = 1
+
+
 initialize()
 gui = ti.GUI("Taichi MLS-MPM-99", res=512, background_color=0x112F41)
 for frame in range(100):
-  for s in range(int(2e-3 // dt)):
-    substep()
-  colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-  ti.profiler('gui.circles', warmup=5)
-  gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-  ti.profiler('gui.show', warmup=5)
-  gui.show()
-  ti.profiler()
+    for s in range(int(2e-3 // dt)):
+        substep()
+    colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
+    ti.profiler('gui.circles', warmup=5)
+    gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
+    ti.profiler('gui.show', warmup=5)
+    gui.show()
+    ti.profiler()
 
 ti.profiler.print()
diff --git a/misc/test_python_profiler.py b/misc/test_python_profiler.py
index 6f50f8b6e741c..dba2b673ca329 100644
--- a/misc/test_python_profiler.py
+++ b/misc/test_python_profiler.py
@@ -35,5 +35,4 @@ def func3():
 
     ti.profiler.stop('step')
 
-
 ti.profiler.print()
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
index a3b34f20d91ab..b50905fd3c93d 100644
--- a/python/taichi/misc/timing.py
+++ b/python/taichi/misc/timing.py
@@ -91,14 +91,17 @@ def print(self, name=None):
         if name is not None:
             names = [name]
         else:
+
             def keyfunc(name):
                 rec = RecordStatistics(self, name)
                 return -rec.total
+
             names = sorted(self.records.keys(), key=keyfunc)
         for name in names:
             rec = RecordStatistics(self, name)
-            print(f'{_m(rec.min)} | {_m(rec.avg)} | {_m(rec.max)} | {_c(rec.count)} '
-                  f'| {_m(rec.total)} | {_n(name)}')
+            print(
+                f'{_m(rec.min)} | {_m(rec.avg)} | {_m(rec.max)} | {_c(rec.count)} '
+                f'| {_m(rec.total)} | {_n(name)}')
 
     def timed(self, name=None, **options):
         if callable(name):
@@ -128,5 +131,4 @@ def wrapped(*args, **kwargs):
         return decorator(foo) if foo is not None else decorator
 
 
-
 profiler = PythonProfiler()

From 5eedb7f453fad552204674c90e64daf2f4b41d68 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BD=AD=E4=BA=8E=E6=96=8C?= <1931127624@qq.com>
Date: Wed, 15 Jul 2020 12:35:29 +0800
Subject: [PATCH 10/15] [skip ci] Apply suggestions from code review

Co-authored-by: Xudong Feng <xudongfeng1996@gmail.com>
---
 docs/profiler.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/profiler.rst b/docs/profiler.rst
index 78743a89cd652..174bce02f11a6 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -23,7 +23,7 @@ Profiling APIs
 
 There are 3 ways to use this profiler:
 
-1. ``ti.profiler.start()`` and ``ti.profiler.stop()``, these are the most fundemental APIs:
+1. ``ti.profiler.start()`` and ``ti.profiler.stop()`` are the most fundemental APIs:
 
 .. code-block:: python
 
@@ -174,7 +174,7 @@ will obtain:
 Profiler options
 ****************
 
-Due to Taichi's JIT mechinism, a kernel will be **compiled** on its first invocation.
+Due to Taichi's JIT mechanism, a kernel will be **compiled** on its first invocation.
 So the first record will be extremely long compared to the following records since it
 **involves both compile time and execution time**, e.g.:
 
@@ -205,7 +205,7 @@ Set ``warmup=5`` for example, will **discard** the first 5 record entries.
 I.e. discard the kernel compile time and possible TLB and cache misses on start up.
 
 
-Check out `misc/mpm99_timed.py <https://github.com/taichi-dev/taichi/blob/master/misc/mpm99_timed.py>`_ for sum up usage example.
+Check out `misc/mpm99_timed.py <https://github.com/taichi-dev/taichi/blob/master/misc/mpm99_timed.py>`_ for a summary example.
 
 
 .. warning::

From 9a9057afc2a85c1f53ddb0c312981f037d15baee Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Thu, 16 Jul 2020 12:57:18 +0800
Subject: [PATCH 11/15] apply confuse from @Rullec

---
 docs/profiler.rst | 46 +++++++++++++++++++++++++---------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/docs/profiler.rst b/docs/profiler.rst
index 174bce02f11a6..dfcd96fc1f77a 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -24,6 +24,8 @@ Profiling APIs
 There are 3 ways to use this profiler:
 
 1. ``ti.profiler.start()`` and ``ti.profiler.stop()`` are the most fundemental APIs:
+   It will measure the time difference between ``start`` and ``stop``.
+   Then save the result according to the given name. e.g.:
 
 .. code-block:: python
 
@@ -48,7 +50,15 @@ There are 3 ways to use this profiler:
     ti.profiler.print()
 
 
-2. ``ti.profiler()``, this one makes our code cleaner:
+.. code-block:: none
+
+      min   |   avg   |   max   |  num  |  total  |    name
+     0.100s |  0.100s |  0.100s |    1x |  0.100s | B
+    10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | A
+
+
+2. ``ti.profiler()``, this one makes our code cleaner.
+   Basically, it will automatically invoke ``stop`` when a new record is started. e.g.:
 
 .. code-block:: python
 
@@ -72,18 +82,16 @@ There are 3 ways to use this profiler:
     ti.profiler.print()
 
 
-.. note::
-
-    Running 1 and 2 should obtain something like:
-
-    .. code-block:: none
+.. code-block:: none
 
-          min   |   avg   |   max   |  num  |  total  |    name
-         0.100s |  0.100s |  0.100s |    1x |  0.100s | B
-        10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | A
+      min   |   avg   |   max   |  num  |  total  |    name
+     0.100s |  0.100s |  0.100s |    1x |  0.100s | B
+    10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | A
 
 
-3. ``@ti.profiler.timed``, this one is very intuitive when profiling kernels:
+3. ``@ti.profiler.timed``, this one is very intuitive when profiling kernels.
+   It will measure the time spent in the function, i.e. ``start`` when entering the function,
+   ``stop`` when leaving the function, and the record name is the function name.
 
 .. code-block:: python
 
@@ -105,6 +113,13 @@ There are 3 ways to use this profiler:
     ti.profiler.print()
 
 
+.. code-block:: none
+
+      min   |   avg   |   max   |  num  |  total  |    name
+     0.100s |  0.100s |  0.100s |    1x |  0.100s | do_something_B
+    10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | do_something_A
+
+
 .. warning::
 
     When combining ``@ti.profiler.timed`` with other decorators like ``@ti.kernel``,
@@ -118,17 +133,6 @@ There are 3 ways to use this profiler:
                 ...
 
 
-.. note::
-
-    Running 3 should obtain something like:
-
-    .. code-block:: none
-
-          min   |   avg   |   max   |  num  |  total  |    name
-         0.100s |  0.100s |  0.100s |    1x |  0.100s | do_something_B
-        10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | do_something_A
-
-
 Recording multiple entries
 **************************
 

From 7d7b8efe84e8872c73fe10cbe3753c55cead75e6 Mon Sep 17 00:00:00 2001
From: archibate <1931127624@qq.com>
Date: Fri, 17 Jul 2020 22:41:14 +0800
Subject: [PATCH 12/15] with ti.profiler(name) for creating TimedScope

---
 docs/profiler.rst            | 22 +++++++++++++++-------
 misc/mpm99_timed.py          |  9 ++++-----
 python/taichi/misc/timing.py | 19 ++++++++++++++-----
 3 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/docs/profiler.rst b/docs/profiler.rst
index dfcd96fc1f77a..ce8c65dc0cfea 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -57,8 +57,10 @@ There are 3 ways to use this profiler:
     10.10ms | 10.10ms | 10.10ms |    1x | 10.10ms | A
 
 
-2. ``ti.profiler()``, this one makes our code cleaner.
-   Basically, it will automatically invoke ``stop`` when a new record is started. e.g.:
+2. ``with ti.profiler()``, this one makes our code cleaner and readable.
+   Basically, it will automatically invoke ``stop`` when the indented block exited.
+   For more details about the ``with`` syntax in Python,
+   see `this tutorial <https://www.pythonforbeginners.com/files/with-statement-in-python>`_.
 
 .. code-block:: python
 
@@ -72,12 +74,13 @@ There are 3 ways to use this profiler:
     def do_something_B():
         sleep(0.1)
 
-    ti.profiler('A')  # start('A')
-    do_something_A()
+    with ti.profiler('A'):
+        do_something_A()
+    # automatically invoke stop('A')
 
-    ti.profiler('B')  # stop('A') & start('B')
-    do_something_B()
-    ti.profiler()     # stop('B')
+    with ti.profiler('B'):
+        do_something_B()
+    # automatically invoke stop('B')
 
     ti.profiler.print()
 
@@ -158,6 +161,10 @@ When a same **name** is used for multiple times, then they will be merged into o
     do_something_B()
     ti.profiler.stop('A')
 
+    ti.profiler.start('B')
+    do_something_B()
+    ti.profiler.stop('B')
+
     ti.profiler.print()
 
 will obtain:
@@ -166,6 +173,7 @@ will obtain:
 
       min   |   avg   |   max   |  num  |  total  |    name
     10.10ms | 55.12ms |  0.100s |    2x |  0.110s | A
+     0.100s |  0.100s |  0.100s |    1x |  0.100s | B
 
 
 - ``min`` is the minimum time in records.
diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
index d92259521a750..88ef381907b3b 100644
--- a/misc/mpm99_timed.py
+++ b/misc/mpm99_timed.py
@@ -120,10 +120,9 @@ def initialize():
     for s in range(int(2e-3 // dt)):
         substep()
     colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-    ti.profiler('gui.circles', warmup=5)
-    gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-    ti.profiler('gui.show', warmup=5)
-    gui.show()
-    ti.profiler()
+    with ti.profiler('gui.circles', warmup=5):
+        gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
+    with ti.profiler('gui.show', warmup=5):
+        gui.show()
 
 ti.profiler.print()
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
index b50905fd3c93d..785ff517b70b7 100644
--- a/python/taichi/misc/timing.py
+++ b/python/taichi/misc/timing.py
@@ -80,11 +80,20 @@ def stop(self, name=None):
             self.records[name] = []
         self.records[name].append(diff)
 
-    def __call__(self, name=None, **options):
-        if self.last_started:
-            self.stop()
-        if name is not None:
-            self.start(name, **options)
+    class TimedScope:
+        def __init__(self, profiler, name, **options):
+            self.profiler = profiler
+            self.name = name
+            self.profiler.start(name, **options)
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *args):
+            self.profiler.stop(self.name)
+
+    def __call__(self, name, **options):
+        return self.TimedScope(self, name, **options)
 
     def print(self, name=None):
         print('  min   |   avg   |   max   |  num  |  total  |    name')

From baf3817b31424ea99a30719c7116c8148973f68d Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Fri, 17 Jul 2020 10:41:29 -0400
Subject: [PATCH 13/15] [skip ci] enforce code format

---
 misc/mpm99_timed.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
index 88ef381907b3b..b28c331d34f24 100644
--- a/misc/mpm99_timed.py
+++ b/misc/mpm99_timed.py
@@ -121,7 +121,9 @@ def initialize():
         substep()
     colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
     with ti.profiler('gui.circles', warmup=5):
-        gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
+        gui.circles(x.to_numpy(),
+                    radius=1.5,
+                    color=colors[material.to_numpy()])
     with ti.profiler('gui.show', warmup=5):
         gui.show()
 

From fde1ae8206b16e02edc9720c70d2abf87b3b75a4 Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Thu, 23 Jul 2020 09:51:27 -0400
Subject: [PATCH 14/15] [skip ci] enforce code format

---
 python/taichi/misc/timing.py   | 1 -
 tests/python/test_bitmasked.py | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
index a6da09b528a83..d189a3d5bf6dc 100644
--- a/python/taichi/misc/timing.py
+++ b/python/taichi/misc/timing.py
@@ -142,5 +142,4 @@ def wrapped(*args, **kwargs):
 
 profiler = PythonProfiler()
 
-
 __all__ = ['profiler']
diff --git a/tests/python/test_bitmasked.py b/tests/python/test_bitmasked.py
index 9de1397169fc2..627c8724fc205 100644
--- a/tests/python/test_bitmasked.py
+++ b/tests/python/test_bitmasked.py
@@ -223,7 +223,7 @@ def test_bitmasked_offset_child():
     bm = ti.root.bitmasked(ti.i, n)
     bm.dense(ti.i, 16).place(x, x2)
     bm2 = bm.bitmasked(ti.i, 4)
-    
+
     bm2.dense(ti.i, 4).place(y, y2, y3)
     bm2.bitmasked(ti.i, 4).place(z)
 

From c2bbec5277629bc20085090fd5a44e6ab7c9a6b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=BD=AD=E4=BA=8E=E6=96=8C?= <1931127624@qq.com>
Date: Thu, 30 Jul 2020 15:03:04 +0800
Subject: [PATCH 15/15] Apply suggestions from code review

Co-authored-by: Yuanming Hu <yuanming-hu@users.noreply.github.com>
---
 docs/profiler.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/profiler.rst b/docs/profiler.rst
index ce8c65dc0cfea..fc15122898f0e 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -2,7 +2,7 @@
 Profiler
 ========
 
-Taichi's profiler can help you analyze the run-time cost of your program, i.e. performance.
+Taichi's profilers can help analyze the performance of your program.
 
 Currently there are three profiling systems in Taichi: