From c97e0f318727772bca902b7b58e8a8e3aa5ef4d8 Mon Sep 17 00:00:00 2001
From: xumingkuan <xumingkuan0721@126.com>
Date: Sat, 22 Aug 2020 17:14:13 +0800
Subject: [PATCH 1/3] [benchmark] [async] Add more statistics for async
 benchmark

---
 benchmarks/mpm2d.py            | 12 +++----
 python/taichi/lang/__init__.py | 60 +++++++++++++++++++++++-----------
 taichi/python/export_lang.cpp  |  5 +++
 3 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py
index c88a747513c5a..e2a30959bfe2b 100644
--- a/benchmarks/mpm2d.py
+++ b/benchmarks/mpm2d.py
@@ -120,7 +120,7 @@ def substep():
     compile_time = time.time()
     substep()
     compile_time = time.time() - compile_time
-    ti.stat_write_yaml('compilation_time(s)', compile_time)
+    ti.stat_write('compilation_time', compile_time)
     ti.get_runtime().sync()
     t = time.time()
     for frame in range(200):
@@ -130,8 +130,8 @@ def substep():
         # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
         # gui.show() # Change to gui.show(f'{frame:06d}.png') to write images to disk
     ti.get_runtime().sync()
-    avg = (time.time() - t) / 4000 * 1000  # miliseconds
-    ti.stat_write_yaml('running_time(ms)', avg)
+    avg = (time.time() - t) / 4000
+    ti.stat_write('running_time', avg)
 
 
 @ti.archs_excluding(ti.opengl)
@@ -253,7 +253,7 @@ def substep():
     compile_time = time.time()
     substep()
     compile_time = time.time() - compile_time
-    ti.stat_write_yaml('compilation_time(s)', compile_time)
+    ti.stat_write('compilation_time', compile_time)
     ti.get_runtime().sync()
     t = time.time()
     for frame in range(200):
@@ -263,5 +263,5 @@ def substep():
         # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
         # gui.show() # Change to gui.show(f'{frame:06d}.png') to write images to disk
     ti.get_runtime().sync()
-    avg = (time.time() - t) / 4000 * 1000  # miliseconds
-    ti.stat_write_yaml('running_time(ms)', avg)
+    avg = (time.time() - t) / 4000
+    ti.stat_write('running_time', avg)
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 5d9f0b4b4ca6f..de3fc8b0df28e 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -321,25 +321,45 @@ def visit(node):
 def benchmark(func, repeat=300, args=()):
     import taichi as ti
     import time
-    compile_time = time.time()
-    func(*args)
-    compile_time = time.time() - compile_time
-    ti.stat_write_yaml('compilation_time(s)', compile_time)
-    # The reason why we run 4 times is to warm up instruction/data caches.
-    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
-    for i in range(4):
-        func(*args)  # compile the kernel first
-    ti.sync()
-    t = time.time()
-    for n in range(repeat):
+    def run_benchmark():
+        compile_time = time.time()
         func(*args)
-    ti.get_runtime().sync()
-    elapsed = time.time() - t
-    avg = elapsed / repeat * 1000  # miliseconds
-    ti.stat_write_yaml('running_time(ms)', avg)
-
-
-def stat_write_yaml(key, value):
+        compile_time = time.time() - compile_time
+        ti.stat_write('compilation_time', compile_time)  # unit: seconds
+        codegen_stat = ti.core.stat()
+        for line in codegen_stat.split('\n'):
+            try:
+                a, b = line.strip().split(':')
+                b = int(float(b))
+            except (ValueError, OverflowError):
+                continue  # not a single 'name: number' entry; skip it
+            a = a.strip()
+            if a == 'codegen_kernel_statements':
+                ti.stat_write('instructions', b)
+            elif a == 'codegen_offloaded_tasks':
+                ti.stat_write('offloaded_tasks', b)
+            elif a == 'launched_kernels':
+                ti.stat_write('launched_kernels', b)
+        # The reason why we run 4 times is to warm up instruction/data caches.
+        # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
+        for i in range(4):
+            func(*args)  # warm-up run, excluded from the timing below
+        ti.sync()
+        t = time.time()
+        for n in range(repeat):
+            func(*args)
+        ti.get_runtime().sync()
+        elapsed = time.time() - t
+        avg = elapsed / repeat
+        ti.stat_write('running_time', avg)
+    ti.cfg.async_mode = False
+    run_benchmark()
+    if ti.is_extension_supported(ti.cfg.arch, ti.extension.async_mode):
+        ti.cfg.async_mode = True
+        run_benchmark()  # NOTE(review): async_mode stays True afterwards
+
+
+def stat_write(key, value):
     import taichi as ti
     import yaml
     case_name = os.environ.get('TI_CURRENT_BENCHMARK')
@@ -348,6 +368,7 @@ def stat_write_yaml(key, value):
     if case_name.startswith('benchmark_'):
         case_name = case_name[10:]
     arch_name = core.arch_name(ti.cfg.arch)
+    async_mode = 'async' if ti.cfg.async_mode else 'sync'  # YAML nesting level
     output_dir = os.environ.get('TI_BENCHMARK_OUTPUT_DIR', '.')
     filename = f'{output_dir}/benchmark.yml'
     try:
@@ -357,7 +378,8 @@ def stat_write_yaml(key, value):
         data = {}
     data.setdefault(key, {})
     data[key].setdefault(case_name, {})
-    data[key][case_name][arch_name] = value
+    data[key][case_name].setdefault(async_mode, {})
+    data[key][case_name][async_mode][arch_name] = value  # key/case/mode/arch
     with open(filename, 'w') as f:
         yaml.dump(data, f, Dumper=yaml.SafeDumper)
 
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 90f44e487708e..8b44712a95de6 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -595,6 +595,11 @@ void export_lang(py::module &m) {
   m.def("is_extension_supported", is_extension_supported);
 
   m.def("print_stat", [] { stat.print(); });
+  m.def("stat", [] {  // Returns the stat report as a string (read by ti.benchmark).
+    std::string result;
+    stat.print(&result);
+    return result;
+  });
 
   m.def("record_action_hint", [](std::string content) {
     ActionRecorder::get_instance().record("hint",

From ef4e9cd2265493e7cd555bf8c7008531487bbf3f Mon Sep 17 00:00:00 2001
From: xumingkuan <xumingkuan0721@126.com>
Date: Sat, 22 Aug 2020 17:28:05 +0800
Subject: [PATCH 2/3] Use ti.benchmark() in mpm2d.py

---
 benchmarks/mpm2d.py | 32 ++------------------------------
 1 file changed, 2 insertions(+), 30 deletions(-)

diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py
index e2a30959bfe2b..8f52f59fb7d70 100644
--- a/benchmarks/mpm2d.py
+++ b/benchmarks/mpm2d.py
@@ -117,21 +117,7 @@ def substep():
         F[i] = [[1, 0], [0, 1]]
         Jp[i] = 1
 
-    compile_time = time.time()
-    substep()
-    compile_time = time.time() - compile_time
-    ti.stat_write('compilation_time', compile_time)
-    ti.get_runtime().sync()
-    t = time.time()
-    for frame in range(200):
-        for s in range(20):
-            substep()
-        # colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-        # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-        # gui.show() # Change to gui.show(f'{frame:06d}.png') to write images to disk
-    ti.get_runtime().sync()
-    avg = (time.time() - t) / 4000
-    ti.stat_write('running_time', avg)
+    ti.benchmark(substep, repeat=4000)  # 4000 = 200 frames x 20 substeps
 
 
 @ti.archs_excluding(ti.opengl)
@@ -250,18 +236,4 @@ def substep():
         F[i] = [[1, 0], [0, 1]]
         Jp[i] = 1
 
-    compile_time = time.time()
-    substep()
-    compile_time = time.time() - compile_time
-    ti.stat_write('compilation_time', compile_time)
-    ti.get_runtime().sync()
-    t = time.time()
-    for frame in range(200):
-        for s in range(20):
-            substep()
-        # colors = np.array([0x068587, 0xED553B, 0xEEEEF0], dtype=np.uint32)
-        # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
-        # gui.show() # Change to gui.show(f'{frame:06d}.png') to write images to disk
-    ti.get_runtime().sync()
-    avg = (time.time() - t) / 4000
-    ti.stat_write('running_time', avg)
+    ti.benchmark(substep, repeat=4000)  # 4000 = 200 frames x 20 substeps

From 5e23b002481a806e22225d1b365d48f235f159ae Mon Sep 17 00:00:00 2001
From: Taichi Gardener <taichigardener@gmail.com>
Date: Sat, 22 Aug 2020 05:37:56 -0400
Subject: [PATCH 3/3] [skip ci] enforce code format

---
 python/taichi/lang/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index de3fc8b0df28e..71191f9f98002 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -321,6 +321,7 @@ def visit(node):
 def benchmark(func, repeat=300, args=()):
     import taichi as ti
     import time
+
     def run_benchmark():
         compile_time = time.time()
         func(*args)
@@ -352,6 +353,7 @@ def run_benchmark():
         elapsed = time.time() - t
         avg = elapsed / repeat
         ti.stat_write('running_time', avg)
+
     ti.cfg.async_mode = False
     run_benchmark()
     if ti.is_extension_supported(ti.cfg.arch, ti.extension.async_mode):