diff --git a/docs/profiler.rst b/docs/profiler.rst
index f0295de1500b1..fc15122898f0e 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -2,45 +2,250 @@ Profiler
 ========
 
-Taichi's profiler can help you analyze the run-time cost of your program. There are two profiling systems in Taichi: ``ScopedProfiler`` and ``KernelProfiler``.
+Taichi's profilers can help you analyze the performance of your program.
+
+Currently there are three profiling systems in Taichi:
+
+- ``PythonProfiler``
+- ``KernelProfiler``
+- ``ScopedProfiler`` (for developers)
+
 
-ScopedProfiler
-##############
 
-1. ``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
+PythonProfiler
+--------------
 
-2. This profiler is automatically on. To show its results, call ``ti.print_profile_info()``. For example:
+``PythonProfiler`` measures the time spent between ``start()`` and ``stop()``,
+using the Python built-in function ``time.time()``.
+
+Profiling APIs
+**************
+
+There are three ways to use this profiler:
+
+1. ``ti.profiler.start()`` and ``ti.profiler.stop()`` are the most fundamental
+   APIs: they measure the time elapsed between ``start`` and ``stop`` and save
+   the result under the given name, e.g.:
 
 .. code-block:: python
 
+    from time import sleep
     import taichi as ti
+    ti.init()
 
-    ti.init(arch=ti.cpu)
-    var = ti.var(ti.f32, shape=1)
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    ti.profiler.start('A')
+    do_something_A()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('B')
+    do_something_B()
+    ti.profiler.stop('B')
+
+    ti.profiler.print()
+
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | B
+    10.10ms| 10.10ms| 10.10ms|    1x | 10.10ms | A
+
+
+2. ``with ti.profiler()`` makes the code cleaner and more readable.
+   It automatically invokes ``stop`` when the indented block exits.
+   For more details about the ``with`` syntax in Python,
+   see `this tutorial `_.
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    with ti.profiler('A'):
+        do_something_A()
+    # automatically invokes stop('A')
+
+    with ti.profiler('B'):
+        do_something_B()
+    # automatically invokes stop('B')
+
+    ti.profiler.print()
+
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | B
+    10.10ms| 10.10ms| 10.10ms|    1x | 10.10ms | A
+
+
+3. ``@ti.profiler.timed`` is convenient when profiling functions and kernels.
+   It measures the time spent inside the decorated function, i.e. ``start`` is
+   invoked when entering the function and ``stop`` when leaving it, and the
+   record name is the function name.
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    @ti.profiler.timed
+    def do_something_A():
+        sleep(0.01)
+
+    @ti.profiler.timed
+    def do_something_B():
+        sleep(0.1)
+
+    do_something_A()
+    do_something_B()
+
+    ti.profiler.print()
+
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | do_something_B
+    10.10ms| 10.10ms| 10.10ms|    1x | 10.10ms | do_something_A
+
+
+.. warning::
+
+    When combining ``@ti.profiler.timed`` with other decorators like ``@ti.kernel``,
+    ``@ti.profiler.timed`` should be put **above** them, e.g.:
+
+    .. code-block:: python
+
+        @ti.profiler.timed
+        @ti.kernel
+        def substep():
+            ...
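+
+    Decorators are applied bottom-up, so this order lets the timer wrap the
+    compiled kernel launcher. With the order reversed, ``@ti.kernel`` would
+    receive the profiler's Python wrapper instead of the original function,
+    which will likely not work as expected:
+
+    .. code-block:: python
+
+        # reversed order -- likely won't work as expected:
+        @ti.kernel
+        @ti.profiler.timed
+        def substep():
+            ...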
+
+
+Recording multiple entries
+**************************
+
+When the same **name** is used multiple times, the records are merged into one entry, e.g.:
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    ti.profiler.start('A')
+    do_something_A()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('A')
+    do_something_B()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('B')
+    do_something_B()
+    ti.profiler.stop('B')
+
+    ti.profiler.print()
+
+will print:
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    10.10ms| 55.12ms|  0.100s|    2x |  0.110s | A
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | B
+
+
+- ``min`` is the minimum time among the records.
+- ``avg`` is the average time of the records.
+- ``max`` is the maximum time among the records.
+- ``num`` is the number of record entries.
+- ``total`` is the total time cost of the records.
+
+
+Profiler options
+****************
+
+Due to Taichi's JIT mechanism, a kernel will be **compiled** on its first invocation.
+So the first record can be much longer than the following ones, since it
+**involves both compile time and execution time**, e.g.:
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    2.37ms | 3.79ms | 1.615s | 1900x | 7.204s  | substep
+
+whereas the same run with the first few records discarded would look like:
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    2.37ms | 2.95ms | 12.70ms| 1895x | 5.592s  | substep
+
+
+As you can see, including the compile time makes the result inaccurate, especially the ``max`` column.
+
+To avoid this, you may specify a ``warmup`` option to ``ti.profiler``, e.g.:
+
+.. code-block:: python
+
+    @ti.profiler.timed(warmup=5)
     @ti.kernel
-    def compute():
-        var[0] = 1.0
-        print("Setting var[0] =", var[0])
+    def substep():
+        ...
 
-    compute()
-    ti.print_profile_info()
+Setting ``warmup=5``, for example, will **discard** the first 5 record entries,
+i.e. discard the kernel compile time and possible TLB and cache misses on start-up.
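+
+The ``warmup`` option is not limited to the decorator form; it can also be
+passed to the ``with`` form, e.g.:
+
+.. code-block:: python
+
+    with ti.profiler('gui.show', warmup=5):
+        gui.show()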
 
-``ti.print_profile_info()`` prints profiling results in a hierarchical format.
+Check out `misc/mpm99_timed.py `_ for a complete example.
 
-.. Note::
 
-    ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
+.. warning::
+
+    ``ti.profiler``, i.e. ``PythonProfiler``, **only works in Python-scope**, e.g.::
+
+        @ti.func
+        def substep():
+            ti.profiler.start('hello')  # won't work as expected...
+            ...
+            ti.profiler.stop('hello')
+
+        @ti.profiler.timed  # won't work as expected...
+        @ti.func
+        def hello():
+            ...
+
+    To do profiling **inside Taichi-scope**, please see the ``KernelProfiler`` section below.
+
 
 KernelProfiler
-##############
+--------------
+
+``KernelProfiler`` records the costs of Taichi kernels on devices.
 
-1. ``KernelProfiler`` records the costs of Taichi kernels on devices. To enable this profiler, set ``kernel_profiler=True`` in ``ti.init``.
+To enable this profiler, please initialize Taichi using ``ti.init(kernel_profiler=True)``.
 
-2. Call ``ti.kernel_profiler_print()`` to show the kernel profiling result. For example:
+Call ``ti.kernel_profiler_print()`` to show the kernel profiling result. For example:
 
 .. code-block:: python
     :emphasize-lines: 3, 13
@@ -67,3 +272,37 @@ The outputs would be:
     [ 22.73%] jit_evaluator_0_kernel_0_serial   min   0.001 ms   avg   0.001 ms   max   0.001 ms   total   0.000 s [      1x]
     [  0.00%] jit_evaluator_1_kernel_1_serial   min   0.000 ms   avg   0.000 ms   max   0.000 ms   total   0.000 s [      1x]
     [ 77.27%] compute_c4_0_kernel_2_serial      min   0.004 ms   avg   0.004 ms   max   0.004 ms   total   0.000 s [      1x]
+
+
+ScopedProfiler
+--------------
+
+``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
+
+This profiler is automatically on.
+
+To show its results, call ``ti.print_profile_info()``. For example:
+
+.. code-block:: python
+
+    import taichi as ti
+
+    ti.init(arch=ti.cpu)
+    var = ti.var(ti.f32, shape=1)
+
+
+    @ti.kernel
+    def compute():
+        var[0] = 1.0
+        print("Setting var[0] =", var[0])
+
+
+    compute()
+    ti.print_profile_info()
+
+
+``ti.print_profile_info()`` prints profiling results in a hierarchical format.
+
+.. Note::
+
+    ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
new file mode 100644
index 0000000000000..b28c331d34f24
--- /dev/null
+++ b/misc/mpm99_timed.py
@@ -0,0 +1,130 @@
+import taichi as ti
+import numpy as np
+ti.init(arch=ti.gpu)  # Try to run on GPU
+quality = 1  # Use a larger value for higher-res simulations
+n_particles, n_grid = 9000 * quality**2, 128 * quality
+dx, inv_dx = 1 / n_grid, float(n_grid)
+dt = 1e-4 / quality
+p_vol, p_rho = (dx * 0.5)**2, 1
+p_mass = p_vol * p_rho
+E, nu = 0.1e4, 0.2  # Young's modulus and Poisson's ratio
+mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / (
+    (1 + nu) * (1 - 2 * nu))  # Lame parameters
+x = ti.Vector(2, dt=ti.f32, shape=n_particles)  # position
+v = ti.Vector(2, dt=ti.f32, shape=n_particles)  # velocity
+C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # affine velocity field
+F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # deformation gradient
+material = ti.var(dt=ti.i32, shape=n_particles)  # material id
+Jp = ti.var(dt=ti.f32, shape=n_particles)  # plastic deformation
+grid_v = ti.Vector(2, dt=ti.f32,
+                   shape=(n_grid, n_grid))  # grid node momentum/velocity
+grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))  # grid node mass
+
+
+@ti.profiler.timed(warmup=10)
+@ti.kernel
+def substep():
+    for i, j in grid_m:
+        grid_v[i, j] = [0, 0]
+        grid_m[i, j] = 0
+    for p in x:  # Particle state update and scatter to grid (P2G)
+        base = (x[p] * inv_dx - 0.5).cast(int)
+        fx = x[p] * inv_dx - base.cast(float)
+        # Quadratic kernels  [http://mpm.graphics   Eqn. 123, with x=fx, fx-1,fx-2]
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+        F[p] = (ti.Matrix.identity(ti.f32, 2) +
+                dt * C[p]) @ F[p]  # deformation gradient update
+        h = ti.exp(
+            10 *
+            (1.0 -
+             Jp[p]))  # Hardening coefficient: snow gets harder when compressed
+        if material[p] == 1:  # jelly, make it softer
+            h = 0.3
+        mu, la = mu_0 * h, lambda_0 * h
+        if material[p] == 0:  # liquid
+            mu = 0.0
+        U, sig, V = ti.svd(F[p])
+        J = 1.0
+        for d in ti.static(range(2)):
+            new_sig = sig[d, d]
+            if material[p] == 2:  # Snow
+                new_sig = min(max(sig[d, d], 1 - 2.5e-2),
+                              1 + 4.5e-3)  # Plasticity
+            Jp[p] *= sig[d, d] / new_sig
+            sig[d, d] = new_sig
+            J *= new_sig
+        if material[
+                p] == 0:  # Reset deformation gradient to avoid numerical instability
+            F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
+        elif material[p] == 2:
+            F[p] = U @ sig @ V.T(
+            )  # Reconstruct elastic deformation gradient after plasticity
+        stress = 2 * mu * (F[p] - U @ V.T()) @ F[p].T() + ti.Matrix.identity(
+            ti.f32, 2) * la * J * (J - 1)
+        stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
+        affine = stress + p_mass * C[p]
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # Loop over 3x3 grid node neighborhood
+            offset = ti.Vector([i, j])
+            dpos = (offset.cast(float) - fx) * dx
+            weight = w[i][0] * w[j][1]
+            grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+            grid_m[base + offset] += weight * p_mass
+    for i, j in grid_m:
+        if grid_m[i, j] > 0:  # No need for epsilon here
+            grid_v[i,
+                   j] = (1 / grid_m[i, j]) * grid_v[i,
+                                                    j]  # Momentum to velocity
+            grid_v[i, j][1] -= dt * 50  # gravity
+            if i < 3 and grid_v[i, j][0] < 0:
+                grid_v[i, j][0] = 0  # Boundary conditions
+            if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
+            if j < 3 and grid_v[i, j][1] < 0: grid_v[i, j][1] = 0
+            if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
+    for p in x:  # grid to particle (G2P)
+        base = (x[p] * inv_dx - 0.5).cast(int)
+        fx = x[p] * inv_dx - base.cast(float)
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
+        new_v = ti.Vector.zero(ti.f32, 2)
+        new_C = ti.Matrix.zero(ti.f32, 2, 2)
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # loop over 3x3 grid node neighborhood
+            dpos = ti.Vector([i, j]).cast(float) - fx
+            g_v = grid_v[base + ti.Vector([i, j])]
+            weight = w[i][0] * w[j][1]
+            new_v += weight * g_v
+            new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
+        v[p], C[p] = new_v, new_C
+        x[p] += dt * v[p]  # advection
+
+
+group_size = n_particles // 3
+
+
+@ti.kernel
+def initialize():
+    for i in range(n_particles):
+        x[i] = [
+            ti.random() * 0.2 + 0.3 + 0.10 * (i // group_size),
+            ti.random() * 0.2 + 0.05 + 0.32 * (i // group_size)
+        ]
+        material[i] = i // group_size  # 0: fluid 1: jelly 2: snow
+        v[i] = ti.Matrix([0, 0])
+        F[i] = ti.Matrix([[1, 0], [0, 1]])
+        Jp[i] = 1
+
+
+initialize()
+gui = ti.GUI("Taichi MLS-MPM-99", res=512, background_color=0x112F41)
+for frame in range(100):
+    for s in range(int(2e-3 // dt)):
+        substep()
+    colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
+    with ti.profiler('gui.circles', warmup=5):
+        gui.circles(x.to_numpy(),
+                    radius=1.5,
+                    color=colors[material.to_numpy()])
+    with ti.profiler('gui.show', warmup=5):
+        gui.show()
+
+ti.profiler.print()
diff --git a/misc/test_python_profiler.py b/misc/test_python_profiler.py
new file mode 100644
index 0000000000000..dba2b673ca329
--- /dev/null
+++ b/misc/test_python_profiler.py
@@ -0,0 +1,38 @@
+import taichi as ti
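+
+# The kernels below exercise the supported forms of @ti.profiler.timed:
+# an explicit record name plus options (func0), options only (func1),
+# an explicit name only (func2), and the bare decorator form (func3).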
+
+
+@ti.profiler.timed('foo', warmup=5)
+@ti.kernel
+def func0():
+    print(233)
+
+
+@ti.profiler.timed(warmup=5)
+@ti.kernel
+def func1():
+    print(233)
+
+
+@ti.profiler.timed('Func2')
+@ti.kernel
+def func2():
+    print(666)
+
+
+@ti.profiler.timed
+@ti.kernel
+def func3():
+    pass
+
+
+for k in range(5):
+    ti.profiler.start('step')
+    for i in range(10):
+        func0()
+        func1()
+        func2()
+        func3()
+
+    ti.profiler.stop('step')
+
+ti.profiler.print()
diff --git a/python/taichi/misc/__init__.py b/python/taichi/misc/__init__.py
index 83177d8c10d81..c37795960819c 100644
--- a/python/taichi/misc/__init__.py
+++ b/python/taichi/misc/__init__.py
@@ -3,6 +3,7 @@
 from .image import *
 from .test import *
 from .error import *
+from .timing import profiler
 from .task import Task
 
 __all__ = [s for s in dir() if not s.startswith('_')]
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
new file mode 100644
index 0000000000000..d189a3d5bf6dc
--- /dev/null
+++ b/python/taichi/misc/timing.py
@@ -0,0 +1,145 @@
+import time, functools
+from colorama import Fore, Back, Style
+
+
+def _m(sec):
+    if sec > 9.9:
+        return f'{Fore.LIGHTRED_EX}{sec:6.1f}{Fore.RESET}s'
+    elif sec > 0.099:
+        return f'{Fore.LIGHTRED_EX}{sec:6.3f}{Fore.RESET}s'
+    else:
+        ms = sec * 1000
+        if ms > 0.099:
+            return f'{Fore.LIGHTMAGENTA_EX}{ms:5.2f}{Fore.RESET}ms'
+        else:
+            us = ms * 1000  # milliseconds * 1000 gives microseconds
+            return f'{Fore.BLUE}{us:5.2f}{Fore.RESET}us'
+
+
+def _c(cnt):
+    return f'{Fore.MAGENTA}{cnt:4d}{Fore.RESET}x'
+
+
+def _n(name):
+    return f'{Fore.MAGENTA}{name}{Fore.RESET}'
+
+
+class RecordStatistics:
+    def __init__(self, profiler, name):
+        self.record = profiler.records[name]
+        self.options = profiler.options[name]
+        self.rec = self.record
+
+        warmup = self.options.get('warmup', 0)
+        self.rec = self.rec[warmup:]
+
+    @property
+    def min(self):
+        return min(self.rec)
+
+    @property
+    def max(self):
+        return max(self.rec)
+
+    @property
+    def total(self):
+        return sum(self.rec)
+
+    @property
+    def avg(self):
+        return sum(self.rec) / len(self.rec)
+
+    @property
+    def count(self):
+        return len(self.rec)
+
+
+class PythonProfiler:
+    def __init__(self, timer=time.time):
+        self.timer = timer
+        self.records = {}
+        self.started = {}
+        self.options = {}
+        self.last_started = None
+
+    def start(self, name, **options):
+        self.last_started = name
+        self.options[name] = self.options.get(name, options)
+        t = self.timer()
+        self.started[name] = t
+
+    def stop(self, name=None):
+        t = self.timer()
+        if not name and self.last_started:
+            name = self.last_started
+            self.last_started = None
+        diff = t - self.started[name]
+        del self.started[name]
+        self.last_started = None
+        if name not in self.records:
+            self.records[name] = []
+        self.records[name].append(diff)
+
+    class TimedScope:
+        def __init__(self, profiler, name, **options):
+            self.profiler = profiler
+            self.name = name
+            self.profiler.start(name, **options)
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *args):
+            self.profiler.stop(self.name)
+
+    def __call__(self, name, **options):
+        return self.TimedScope(self, name, **options)
+
+    def print(self, name=None):
+        print('   min   |   avg   |   max   |  num  |  total  |  name')
+        if name is not None:
+            names = [name]
+        else:
+
+            def keyfunc(name):
+                rec = RecordStatistics(self, name)
+                return -rec.total
+
+            names = sorted(self.records.keys(), key=keyfunc)
+        for name in names:
+            rec = RecordStatistics(self, name)
+            print(
+                f'{_m(rec.min)} | {_m(rec.avg)} | {_m(rec.max)} | {_c(rec.count)} '
+                f'| {_m(rec.total)} | {_n(name)}')
+
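+    # `timed` can be used either as a bare decorator (`@profiler.timed`, where
+    # `name` receives the decorated function itself) or as a parameterized
+    # decorator (`@profiler.timed('foo', warmup=5)`), hence the callable check
+    # below.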
+    def timed(self, name=None, **options):
+        if callable(name):
+            foo = name
+            name = None
+        else:
+            foo = None
+
+        def decorator(foo):
+            if decorator.name is None:
+                name = foo.__name__
+            else:
+                name = decorator.name
+
+            @functools.wraps(foo)
+            def wrapped(*args, **kwargs):
+                self.start(name, **decorator.options)
+                ret = foo(*args, **kwargs)
+                self.stop(name)
+                return ret
+
+            return wrapped
+
+        decorator.name = name
+        decorator.options = options
+
+        return decorator(foo) if foo is not None else decorator
+
+
+profiler = PythonProfiler()
+
+__all__ = ['profiler']
diff --git a/tests/python/test_bitmasked.py b/tests/python/test_bitmasked.py
index 9de1397169fc2..627c8724fc205 100644
--- a/tests/python/test_bitmasked.py
+++ b/tests/python/test_bitmasked.py
@@ -223,7 +223,7 @@ def test_bitmasked_offset_child():
     bm = ti.root.bitmasked(ti.i, n)
     bm.dense(ti.i, 16).place(x, x2)
     bm2 = bm.bitmasked(ti.i, 4)
-    
+
     bm2.dense(ti.i, 4).place(y, y2, y3)
     bm2.bitmasked(ti.i, 4).place(z)