diff --git a/docs/profiler.rst b/docs/profiler.rst
index f0295de1500b1..fc15122898f0e 100644
--- a/docs/profiler.rst
+++ b/docs/profiler.rst
@@ -2,45 +2,250 @@ Profiler
 ========
 
-Taichi's profiler can help you analyze the run-time cost of your program. There are two profiling systems in Taichi: ``ScopedProfiler`` and ``KernelProfiler``.
+Taichi's profilers can help you analyze the performance of your program.
+
+Currently there are three profiling systems in Taichi:
+
+- ``PythonProfiler``
+- ``KernelProfiler``
+- ``ScopedProfiler`` (for developers)
+
 
-ScopedProfiler
-##############
 
-1. ``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
+PythonProfiler
+--------------
 
-2. This profiler is automatically on. To show its results, call ``ti.print_profile_info()``. For example:
+``PythonProfiler`` measures the time spent between ``start()`` and ``stop()``,
+using the Python built-in function ``time.time()``.
+
+Profiling APIs
+**************
+
+There are three ways to use this profiler:
+
+1. ``ti.profiler.start()`` and ``ti.profiler.stop()`` are the most fundamental
+   APIs: they measure the time elapsed between ``start`` and ``stop`` and save
+   the result under the given name, e.g.:
 
 .. code-block:: python
 
+    from time import sleep
     import taichi as ti
+    ti.init()
 
-    ti.init(arch=ti.cpu)
-    var = ti.var(ti.f32, shape=1)
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    ti.profiler.start('A')
+    do_something_A()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('B')
+    do_something_B()
+    ti.profiler.stop('B')
+
+    ti.profiler.print()
+
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | B
+    10.10ms| 10.10ms| 10.10ms|    1x | 10.10ms | A
+
+
+2. ``with ti.profiler()`` makes the code cleaner and more readable.
+   It automatically invokes ``stop`` when the indented block exits.
+   For more details about the ``with`` syntax in Python,
+   see `this tutorial `_.
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    with ti.profiler('A'):
+        do_something_A()
+    # automatically invokes stop('A')
+
+    with ti.profiler('B'):
+        do_something_B()
+    # automatically invokes stop('B')
+
+    ti.profiler.print()
+
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | B
+    10.10ms| 10.10ms| 10.10ms|    1x | 10.10ms | A
+
+
+3. ``@ti.profiler.timed`` is convenient when profiling functions and kernels.
+   It measures the time spent inside the decorated function, i.e. ``start`` is
+   invoked when entering the function and ``stop`` when leaving it, and the
+   record name is the function name.
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    @ti.profiler.timed
+    def do_something_A():
+        sleep(0.01)
+
+    @ti.profiler.timed
+    def do_something_B():
+        sleep(0.1)
+
+    do_something_A()
+    do_something_B()
+
+    ti.profiler.print()
+
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | do_something_B
+    10.10ms| 10.10ms| 10.10ms|    1x | 10.10ms | do_something_A
+
+
+.. warning::
+
+    When combining ``@ti.profiler.timed`` with other decorators like ``@ti.kernel``,
+    ``@ti.profiler.timed`` should be put **above** them, e.g.:
+
+    .. code-block:: python
+
+        @ti.profiler.timed
+        @ti.kernel
+        def substep():
+            ...
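+
+    Decorators are applied bottom-up, so this order lets the timer wrap the
+    compiled kernel launcher. With the order reversed, ``@ti.kernel`` would
+    receive the profiler's Python wrapper instead of the original function,
+    which will likely not work as expected:
+
+    .. code-block:: python
+
+        # reversed order -- likely won't work as expected:
+        @ti.kernel
+        @ti.profiler.timed
+        def substep():
+            ...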
+
+
+Recording multiple entries
+**************************
+
+When the same **name** is used multiple times, the records are merged into one entry, e.g.:
+
+.. code-block:: python
+
+    from time import sleep
+    import taichi as ti
+    ti.init()
+
+    def do_something_A():
+        sleep(0.01)
+
+    def do_something_B():
+        sleep(0.1)
+
+    ti.profiler.start('A')
+    do_something_A()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('A')
+    do_something_B()
+    ti.profiler.stop('A')
+
+    ti.profiler.start('B')
+    do_something_B()
+    ti.profiler.stop('B')
+
+    ti.profiler.print()
+
+will print:
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    10.10ms| 55.12ms|  0.100s|    2x |  0.110s | A
+    0.100s | 0.100s | 0.100s |    1x |  0.100s | B
+
+
+- ``min`` is the minimum time among the records.
+- ``avg`` is the average time of the records.
+- ``max`` is the maximum time among the records.
+- ``num`` is the number of record entries.
+- ``total`` is the total time cost of the records.
+
+
+Profiler options
+****************
+
+Due to Taichi's JIT mechanism, a kernel will be **compiled** on its first invocation.
+So the first record can be much longer than the following ones, since it
+**involves both compile time and execution time**, e.g.:
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    2.37ms | 3.79ms | 1.615s | 1900x | 7.204s  | substep
+
+whereas the same run with the first few records discarded would look like:
+
+.. code-block:: none
+
+     min   |  avg   |  max   |  num  |  total  | name
+    2.37ms | 2.95ms | 12.70ms| 1895x | 5.592s  | substep
+
+
+As you can see, including the compile time makes the result inaccurate, especially the ``max`` column.
+
+To avoid this, you may specify a ``warmup`` option to ``ti.profiler``, e.g.:
+
+.. code-block:: python
+
+    @ti.profiler.timed(warmup=5)
     @ti.kernel
-    def compute():
-        var[0] = 1.0
-        print("Setting var[0] =", var[0])
+    def substep():
+        ...
 
-    compute()
-    ti.print_profile_info()
+Setting ``warmup=5``, for example, will **discard** the first 5 record entries,
+i.e. discard the kernel compile time and possible TLB and cache misses on start-up.
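+
+The ``warmup`` option is not limited to the decorator form; it can also be
+passed to the ``with`` form, e.g.:
+
+.. code-block:: python
+
+    with ti.profiler('gui.show', warmup=5):
+        gui.show()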
 
-``ti.print_profile_info()`` prints profiling results in a hierarchical format.
+Check out `misc/mpm99_timed.py `_ for a complete example.
 
-.. Note::
 
-    ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
+.. warning::
+
+    ``ti.profiler``, i.e. ``PythonProfiler``, **only works in Python-scope**, e.g.::
+
+        @ti.func
+        def substep():
+            ti.profiler.start('hello')  # won't work as expected...
+            ...
+            ti.profiler.stop('hello')
+
+        @ti.profiler.timed  # won't work as expected...
+        @ti.func
+        def hello():
+            ...
+
+    To do profiling **inside Taichi-scope**, please see the ``KernelProfiler`` section below.
+
 
 KernelProfiler
-##############
+--------------
+
+``KernelProfiler`` records the costs of Taichi kernels on devices.
 
-1. ``KernelProfiler`` records the costs of Taichi kernels on devices. To enable this profiler, set ``kernel_profiler=True`` in ``ti.init``.
+To enable this profiler, please initialize Taichi using ``ti.init(kernel_profiler=True)``.
 
-2. Call ``ti.kernel_profiler_print()`` to show the kernel profiling result. For example:
+Call ``ti.kernel_profiler_print()`` to show the kernel profiling result. For example:
 
 .. code-block:: python
     :emphasize-lines: 3, 13
@@ -67,3 +272,37 @@ The outputs would be:
     [ 22.73%] jit_evaluator_0_kernel_0_serial   min   0.001 ms   avg   0.001 ms   max   0.001 ms   total   0.000 s [      1x]
     [  0.00%] jit_evaluator_1_kernel_1_serial   min   0.000 ms   avg   0.000 ms   max   0.000 ms   total   0.000 s [      1x]
     [ 77.27%] compute_c4_0_kernel_2_serial      min   0.004 ms   avg   0.004 ms   max   0.004 ms   total   0.000 s [      1x]
+
+
+ScopedProfiler
+--------------
+
+``ScopedProfiler`` measures time spent on the **host tasks** hierarchically.
+
+This profiler is automatically on.
+
+To show its results, call ``ti.print_profile_info()``. For example:
+
+.. code-block:: python
+
+    import taichi as ti
+
+    ti.init(arch=ti.cpu)
+    var = ti.var(ti.f32, shape=1)
+
+
+    @ti.kernel
+    def compute():
+        var[0] = 1.0
+        print("Setting var[0] =", var[0])
+
+
+    compute()
+    ti.print_profile_info()
+
+
+``ti.print_profile_info()`` prints profiling results in a hierarchical format.
+
+.. Note::
+
+    ``ScopedProfiler`` is a C++ class in the core of Taichi. It is not exposed to Python users.
diff --git a/misc/mpm99_timed.py b/misc/mpm99_timed.py
new file mode 100644
index 0000000000000..b28c331d34f24
--- /dev/null
+++ b/misc/mpm99_timed.py
@@ -0,0 +1,130 @@
+import taichi as ti
+import numpy as np
+ti.init(arch=ti.gpu)  # Try to run on GPU
+quality = 1  # Use a larger value for higher-res simulations
+n_particles, n_grid = 9000 * quality**2, 128 * quality
+dx, inv_dx = 1 / n_grid, float(n_grid)
+dt = 1e-4 / quality
+p_vol, p_rho = (dx * 0.5)**2, 1
+p_mass = p_vol * p_rho
+E, nu = 0.1e4, 0.2  # Young's modulus and Poisson's ratio
+mu_0, lambda_0 = E / (2 * (1 + nu)), E * nu / (
+    (1 + nu) * (1 - 2 * nu))  # Lame parameters
+x = ti.Vector(2, dt=ti.f32, shape=n_particles)  # position
+v = ti.Vector(2, dt=ti.f32, shape=n_particles)  # velocity
+C = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # affine velocity field
+F = ti.Matrix(2, 2, dt=ti.f32, shape=n_particles)  # deformation gradient
+material = ti.var(dt=ti.i32, shape=n_particles)  # material id
+Jp = ti.var(dt=ti.f32, shape=n_particles)  # plastic deformation
+grid_v = ti.Vector(2, dt=ti.f32,
+                   shape=(n_grid, n_grid))  # grid node momentum/velocity
+grid_m = ti.var(dt=ti.f32, shape=(n_grid, n_grid))  # grid node mass
+
+
+@ti.profiler.timed(warmup=10)
+@ti.kernel
+def substep():
+    for i, j in grid_m:
+        grid_v[i, j] = [0, 0]
+        grid_m[i, j] = 0
+    for p in x:  # Particle state update and scatter to grid (P2G)
+        base = (x[p] * inv_dx - 0.5).cast(int)
+        fx = x[p] * inv_dx - base.cast(float)
+        # Quadratic kernels  [http://mpm.graphics   Eqn. 123, with x=fx, fx-1,fx-2]
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2]
+        F[p] = (ti.Matrix.identity(ti.f32, 2) +
+                dt * C[p]) @ F[p]  # deformation gradient update
+        h = ti.exp(
+            10 *
+            (1.0 -
+             Jp[p]))  # Hardening coefficient: snow gets harder when compressed
+        if material[p] == 1:  # jelly, make it softer
+            h = 0.3
+        mu, la = mu_0 * h, lambda_0 * h
+        if material[p] == 0:  # liquid
+            mu = 0.0
+        U, sig, V = ti.svd(F[p])
+        J = 1.0
+        for d in ti.static(range(2)):
+            new_sig = sig[d, d]
+            if material[p] == 2:  # Snow
+                new_sig = min(max(sig[d, d], 1 - 2.5e-2),
+                              1 + 4.5e-3)  # Plasticity
+            Jp[p] *= sig[d, d] / new_sig
+            sig[d, d] = new_sig
+            J *= new_sig
+        if material[
+                p] == 0:  # Reset deformation gradient to avoid numerical instability
+            F[p] = ti.Matrix.identity(ti.f32, 2) * ti.sqrt(J)
+        elif material[p] == 2:
+            F[p] = U @ sig @ V.T(
+            )  # Reconstruct elastic deformation gradient after plasticity
+        stress = 2 * mu * (F[p] - U @ V.T()) @ F[p].T() + ti.Matrix.identity(
+            ti.f32, 2) * la * J * (J - 1)
+        stress = (-dt * p_vol * 4 * inv_dx * inv_dx) * stress
+        affine = stress + p_mass * C[p]
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # Loop over 3x3 grid node neighborhood
+            offset = ti.Vector([i, j])
+            dpos = (offset.cast(float) - fx) * dx
+            weight = w[i][0] * w[j][1]
+            grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos)
+            grid_m[base + offset] += weight * p_mass
+    for i, j in grid_m:
+        if grid_m[i, j] > 0:  # No need for epsilon here
+            grid_v[i,
+                   j] = (1 / grid_m[i, j]) * grid_v[i,
+                                                    j]  # Momentum to velocity
+            grid_v[i, j][1] -= dt * 50  # gravity
+            if i < 3 and grid_v[i, j][0] < 0:
+                grid_v[i, j][0] = 0  # Boundary conditions
+            if i > n_grid - 3 and grid_v[i, j][0] > 0: grid_v[i, j][0] = 0
+            if j < 3 and grid_v[i, j][1] < 0: grid_v[i, j][1] = 0
+            if j > n_grid - 3 and grid_v[i, j][1] > 0: grid_v[i, j][1] = 0
+    for p in x:  # grid to particle (G2P)
+        base = (x[p] * inv_dx - 0.5).cast(int)
+        fx = x[p] * inv_dx - base.cast(float)
+        w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1.0)**2, 0.5 * (fx - 0.5)**2]
+        new_v = ti.Vector.zero(ti.f32, 2)
+        new_C = ti.Matrix.zero(ti.f32, 2, 2)
+        for i, j in ti.static(ti.ndrange(
+                3, 3)):  # loop over 3x3 grid node neighborhood
+            dpos = ti.Vector([i, j]).cast(float) - fx
+            g_v = grid_v[base + ti.Vector([i, j])]
+            weight = w[i][0] * w[j][1]
+            new_v += weight * g_v
+            new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
+        v[p], C[p] = new_v, new_C
+        x[p] += dt * v[p]  # advection
+
+
+group_size = n_particles // 3
+
+
+@ti.kernel
+def initialize():
+    for i in range(n_particles):
+        x[i] = [
+            ti.random() * 0.2 + 0.3 + 0.10 * (i // group_size),
+            ti.random() * 0.2 + 0.05 + 0.32 * (i // group_size)
+        ]
+        material[i] = i // group_size  # 0: fluid 1: jelly 2: snow
+        v[i] = ti.Matrix([0, 0])
+        F[i] = ti.Matrix([[1, 0], [0, 1]])
+        Jp[i] = 1
+
+
+initialize()
+gui = ti.GUI("Taichi MLS-MPM-99", res=512, background_color=0x112F41)
+for frame in range(100):
+    for s in range(int(2e-3 // dt)):
+        substep()
+    colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
+    with ti.profiler('gui.circles', warmup=5):
+        gui.circles(x.to_numpy(),
+                    radius=1.5,
+                    color=colors[material.to_numpy()])
+    with ti.profiler('gui.show', warmup=5):
+        gui.show()
+
+ti.profiler.print()
diff --git a/misc/test_python_profiler.py b/misc/test_python_profiler.py
new file mode 100644
index 0000000000000..dba2b673ca329
--- /dev/null
+++ b/misc/test_python_profiler.py
@@ -0,0 +1,38 @@
+import taichi as ti
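+
+# The kernels below exercise the supported forms of @ti.profiler.timed:
+# an explicit record name plus options (func0), options only (func1),
+# an explicit name only (func2), and the bare decorator form (func3).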
+
+
+@ti.profiler.timed('foo', warmup=5)
+@ti.kernel
+def func0():
+    print(233)
+
+
+@ti.profiler.timed(warmup=5)
+@ti.kernel
+def func1():
+    print(233)
+
+
+@ti.profiler.timed('Func2')
+@ti.kernel
+def func2():
+    print(666)
+
+
+@ti.profiler.timed
+@ti.kernel
+def func3():
+    pass
+
+
+for k in range(5):
+    ti.profiler.start('step')
+    for i in range(10):
+        func0()
+        func1()
+        func2()
+        func3()
+
+    ti.profiler.stop('step')
+
+ti.profiler.print()
diff --git a/python/taichi/misc/__init__.py b/python/taichi/misc/__init__.py
index 83177d8c10d81..c37795960819c 100644
--- a/python/taichi/misc/__init__.py
+++ b/python/taichi/misc/__init__.py
@@ -3,6 +3,7 @@
 from .image import *
 from .test import *
 from .error import *
+from .timing import profiler
 from .task import Task
 
 __all__ = [s for s in dir() if not s.startswith('_')]
diff --git a/python/taichi/misc/timing.py b/python/taichi/misc/timing.py
new file mode 100644
index 0000000000000..d189a3d5bf6dc
--- /dev/null
+++ b/python/taichi/misc/timing.py
@@ -0,0 +1,145 @@
+import time, functools
+from colorama import Fore, Back, Style
+
+
+def _m(sec):
+    if sec > 9.9:
+        return f'{Fore.LIGHTRED_EX}{sec:6.1f}{Fore.RESET}s'
+    elif sec > 0.099:
+        return f'{Fore.LIGHTRED_EX}{sec:6.3f}{Fore.RESET}s'
+    else:
+        ms = sec * 1000
+        if ms > 0.099:
+            return f'{Fore.LIGHTMAGENTA_EX}{ms:5.2f}{Fore.RESET}ms'
+        else:
+            us = ms * 1000  # milliseconds * 1000 gives microseconds
+            return f'{Fore.BLUE}{us:5.2f}{Fore.RESET}us'
+
+
+def _c(cnt):
+    return f'{Fore.MAGENTA}{cnt:4d}{Fore.RESET}x'
+
+
+def _n(name):
+    return f'{Fore.MAGENTA}{name}{Fore.RESET}'
+
+
+class RecordStatistics:
+    def __init__(self, profiler, name):
+        self.record = profiler.records[name]
+        self.options = profiler.options[name]
+        self.rec = self.record
+
+        warmup = self.options.get('warmup', 0)
+        self.rec = self.rec[warmup:]
+
+    @property
+    def min(self):
+        return min(self.rec)
+
+    @property
+    def max(self):
+        return max(self.rec)
+
+    @property
+    def total(self):
+        return sum(self.rec)
+
+    @property
+    def avg(self):
+        return sum(self.rec) / len(self.rec)
+
+    @property
+    def count(self):
+        return len(self.rec)
+
+
+class PythonProfiler:
+    def __init__(self, timer=time.time):
+        self.timer = timer
+        self.records = {}
+        self.started = {}
+        self.options = {}
+        self.last_started = None
+
+    def start(self, name, **options):
+        self.last_started = name
+        self.options[name] = self.options.get(name, options)
+        t = self.timer()
+        self.started[name] = t
+
+    def stop(self, name=None):
+        t = self.timer()
+        if not name and self.last_started:
+            name = self.last_started
+            self.last_started = None
+        diff = t - self.started[name]
+        del self.started[name]
+        self.last_started = None
+        if name not in self.records:
+            self.records[name] = []
+        self.records[name].append(diff)
+
+    class TimedScope:
+        def __init__(self, profiler, name, **options):
+            self.profiler = profiler
+            self.name = name
+            self.profiler.start(name, **options)
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, *args):
+            self.profiler.stop(self.name)
+
+    def __call__(self, name, **options):
+        return self.TimedScope(self, name, **options)
+
+    def print(self, name=None):
+        print('   min   |   avg   |   max   |  num  |  total  |  name')
+        if name is not None:
+            names = [name]
+        else:
+
+            def keyfunc(name):
+                rec = RecordStatistics(self, name)
+                return -rec.total
+
+            names = sorted(self.records.keys(), key=keyfunc)
+        for name in names:
+            rec = RecordStatistics(self, name)
+            print(
+                f'{_m(rec.min)} | {_m(rec.avg)} | {_m(rec.max)} | {_c(rec.count)} '
+                f'| {_m(rec.total)} | {_n(name)}')
+
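+    # `timed` can be used either as a bare decorator (`@profiler.timed`, where
+    # `name` receives the decorated function itself) or as a parameterized
+    # decorator (`@profiler.timed('foo', warmup=5)`), hence the callable check
+    # below.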
+    def timed(self, name=None, **options):
+        if callable(name):
+            foo = name
+            name = None
+        else:
+            foo = None
+
+        def decorator(foo):
+            if decorator.name is None:
+                name = foo.__name__
+            else:
+                name = decorator.name
+
+            @functools.wraps(foo)
+            def wrapped(*args, **kwargs):
+                self.start(name, **decorator.options)
+                ret = foo(*args, **kwargs)
+                self.stop(name)
+                return ret
+
+            return wrapped
+
+        decorator.name = name
+        decorator.options = options
+
+        return decorator(foo) if foo is not None else decorator
+
+
+profiler = PythonProfiler()
+
+__all__ = ['profiler']
diff --git a/tests/python/test_bitmasked.py b/tests/python/test_bitmasked.py
index 9de1397169fc2..627c8724fc205 100644
--- a/tests/python/test_bitmasked.py
+++ b/tests/python/test_bitmasked.py
@@ -223,7 +223,7 @@ def test_bitmasked_offset_child():
     bm = ti.root.bitmasked(ti.i, n)
     bm.dense(ti.i, 16).place(x, x2)
     bm2 = bm.bitmasked(ti.i, 4)
-    
+
     bm2.dense(ti.i, 4).place(y, y2, y3)
     bm2.bitmasked(ti.i, 4).place(z)