diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 3f292f7122988..98ab6258dd2fc 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -84,10 +84,10 @@ jobs:
           glewinfo
           ti diagnose
           ti changelog
-          ti test -vr2 -t2 -k "not ndarray"
+          ti test -vr2 -t2 -k "not ndarray and not torch"
           # ndarray test might OOM if run with -t2.
           # FIXME: unify this with presubmit.yml to avoid further divergence
-          ti test -vr2 -t1 -k "ndarray"
+          ti test -vr2 -t1 -k "ndarray or torch"
         env:
           PYTHON: ${{ matrix.python }}
 
@@ -267,8 +267,7 @@ jobs:
           7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm
           curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO
           7z x clang-10.0.0-win.zip -otaichi_clang
-          $env:PATH += ";C:\taichi_llvm\bin"
-          $env:PATH += ";C:\taichi_clang\bin"
+          $env:PATH = ";C:\taichi_llvm\bin;C:\taichi_clang\bin;" + $env:PATH
           clang --version
           cd D:\a\taichi\taichi
           python -m pip install -r requirements_dev.txt
@@ -293,8 +292,7 @@ jobs:
       - name: Test
         shell: powershell
         run: |
-          $env:PATH += ";C:\taichi_llvm\bin"
-          $env:PATH += ";C:\taichi_clang\bin"
+          $env:PATH = ";C:\taichi_llvm\bin;C:\taichi_clang\bin;" + $env:PATH
           python -c "import taichi"
           python examples/algorithm/laplace.py
           python bin/taichi diagnose
diff --git a/docs/lang/articles/get-started.md b/docs/lang/articles/get-started.md
index 70dc913a3037f..b6d250d157f26 100644
--- a/docs/lang/articles/get-started.md
+++ b/docs/lang/articles/get-started.md
@@ -16,7 +16,7 @@ python3 -m pip install taichi
 ```
 
 :::note
-Currently, Taichi only supports Python 3.6/3.7/3.8 (64-bit).
+Currently, Taichi only supports Python 3.6/3.7/3.8/3.9 (64-bit).
 :::
 
 import Tabs from '@theme/Tabs';
diff --git a/docs/lang/articles/misc/gui.md b/docs/lang/articles/misc/gui.md
index da421694b4ef9..fc321deb817dd 100644
--- a/docs/lang/articles/misc/gui.md
+++ b/docs/lang/articles/misc/gui.md
@@ -9,9 +9,10 @@ Taichi has a built-in GUI system to help users visualize results.
 
 ## Create a window
 
-`ti.GUI(name, res)` creates a window. If `res` is scalar, then width will be equal to height.
+[`ti.GUI(name, res)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=gui%20gui#taichi.misc.gui.GUI)
+creates a window.
 
-The following codes show how to create a window of resolution `640x360`:
+The following code shows how to create a window of resolution `640x360`:
 
 ```python
 gui = ti.GUI('Window Title', (640, 360))
@@ -33,7 +34,8 @@ while gui.running:
 
 ## Display a window
 
-`gui.show(filename)` helps display a window. If `filename` is specified, a screenshot will be saved to the file specified by the name. For example, the following saves frames of the window to `.png`s:
+[`gui.show(filename)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=show#taichi.misc.gui.GUI.show)
+helps display a window. If `filename` is specified, a screenshot will be saved to the path. For example, the following saves frames of the window to `.png`s:
 
     for frame in range(10000):
         render(img)
@@ -43,8 +45,29 @@ while gui.running:
 
 
 ## Paint on a window
+Taichi's GUI supports painting simple geometric objects, such as lines, triangles, rectangles, circles, and text.
 
-`gui.set_image(pixels)` sets an image to display on the window.
+:::note
+
+The position parameter of every drawing API expects input of 2-element tuples,
+whose values are the relative position of the object, ranging from 0.0 to 1.0.
+(0.0, 0.0) stands for the lower left corner of the window, and (1.0, 1.0) stands for the upper right corner.
+
+Acceptable inputs for positions are Taichi fields or NumPy arrays. Primitive arrays in Python are NOT acceptable.
+
+For simplicity, we use numpy arrays in the examples below.
+
+:::
+
+:::tip
+
+For a detailed API description, please click on the API code. For instance, click on
+`gui.get_image()` to see the description of how to get a GUI image.
+
+:::
+
+[`gui.set_image(pixels)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=set_image#taichi.misc.gui.GUI.set_image)
+sets an image to display on the window.
 
 The image pixels are set from the values of `img[i, j]`, where `i` indicates the horizontal coordinates (from left to right) and `j` the vertical coordinates (from bottom to top).
 
@@ -56,8 +79,7 @@ If the window size is `(x, y)`, then `img` must be one of:
 
 - `ti.field(shape=(x, y, 2))`, where `2` is for `(r, g)` channels
 
-- `ti.Vector.field(3, shape=(x, y))` `(r, g, b)` channels on each
-  component
+- `ti.Vector.field(3, shape=(x, y))` `(r, g, b)` channels on each component
 
 - `ti.Vector.field(2, shape=(x, y))` `(r, g)` channels on each component
 
@@ -79,11 +101,162 @@ The data type of `img` must be one of:
 
 - `float64`, range `[0, 1]`
 
+:::note
+
+When using `float32` or `float64` as the data type, `img` entries will be clipped into range [0, 1] for display.
+
+:::
+
+[`gui.get_image()`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=get_image#taichi.misc.gui.GUI.get_image)
+gets the 4-channel (RGBA) image shown in the current GUI system.
+
+[`gui.circle(pos)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=circle#taichi.misc.gui.GUI.circle)
+draws one solid circle.
+
+The color and radius of circles can be further specified with additional parameters.
+
+[`gui.circles(pos)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=circles#taichi.misc.gui.GUI.circles)
+draws solid circles.
+
+The color and radius of circles can be further specified with additional parameters. For a single color, use the `color` parameter.
+For multiple colors, use `palette` and `palette_indices` instead.
+
+:::note
+
+The unit of radius in GUI APIs is number of pixels.
+
+:::
+
+For example:
+```python
+gui.circles(pos, radius=3, color=0x068587)
+```
+draws circles at the positions in the `pos` array, all with a radius of 3 and a blue color.
+
+![circles](../static/assets/circles.png)
+```python
+gui.circles(pos, radius=3, palette=[0x068587, 0xED553B, 0xEEEEF0], palette_indices=material)
+```
+draws circles with a radius of 3 and three different colors selected by `material`, an integer array with the same size as
+`pos`. Each integer in `material` indicates which color the associated circle uses (i.e., the array [0, 1, 2] indicates that these three
+circles are colored by the first, second, and third color in `palette`, respectively).
+
+![circles](../static/assets/colored_circles.png)
+
+[`gui.line(begin, end)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=line#taichi.misc.gui.GUI.line)
+draws one line.
+
+The color and radius of lines can be further specified with additional parameters.
+
+[`gui.lines(begin, end)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=line#taichi.misc.gui.GUI.lines)
+draws lines.
+
+`begin` and `end` both require input of positions.
+
+The color and radius of lines can be further specified with additional parameters.
+
+For example:
+```python
+gui.lines(begin=X, end=Y, radius=2, color=0x068587)
+```
+draws line segments from X positions to Y positions with width of 2 and color in light blue.
+
+![lines](../static/assets/lines.png)
+
+[`gui.triangle(a, b, c)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=triangle#taichi.misc.gui.GUI.triangle)
+draws one solid triangle.
+
+The color of triangles can be further specified with additional parameters.
+
+[`gui.triangles(a, b, c)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=triangles#taichi.misc.gui.GUI.triangles)
+draws solid triangles.
+
+The color of triangles can be further specified with additional parameters.
+
+For example:
+```python
+gui.triangles(a=X, b=Y, c=Z, color=0xED553B)
+```
+draws triangles with color in red and three points positioned at X, Y, and Z.
+
+![triangles](../static/assets/triangles.png)
+
+[`gui.rect(topleft, bottomright)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=rect#taichi.misc.gui.GUI.rect)
+draws a hollow rectangle.
+
+The color and radius of the stroke of rectangle can be further specified with additional parameters.
+
+For example:
+```python
+gui.rect([0, 0], [0.5, 0.5], radius=1, color=0xED553B)
+```
+draws a rectangle of top left corner at [0, 0] and bottom right corner at [0.5, 0.5], with stroke of radius of 1 and color in red.
+
+![rect](../static/assets/rect.png)
+
+[`gui.arrows(origin, direction)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=arrows#taichi.misc.gui.GUI.arrows)
+draws arrows.
+
+`origin` and `direction` both require input of positions. `origin` refers to the positions of arrows' origins, `direction`
+refers to the directions where the arrows point to relative to their origins.
+
+The color and radius of arrows can be further specified with additional parameters.
+
+For example:
+```python
+x = numpy.array([[0.1, 0.1], [0.9, 0.1]])
+y = numpy.array([[0.3, 0.3], [-0.3, 0.3]])
+gui.arrows(x, y, radius=1, color=0xFFFFFF)
+```
+draws two arrows originating at [0.1, 0.1] and [0.9, 0.1], pointing in directions [0.3, 0.3] and [-0.3, 0.3] respectively, with radius of 1 and color in white.
+
+![arrows](../static/assets/arrows.png)
 
+[`gui.arrow_field(direction)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=arrow_field#taichi.misc.gui.GUI.arrow_field)
+draws a field of arrows.
 
-## Convert RGB to Hex
+The `direction` requires a field of `shape=(col, row, 2)` where `col` refers to the number of columns of arrow field and `row`
+refers to the number of rows of arrow field.
 
-`ti.rgb_to_hex(rgb)` can convert a (R, G, B) tuple of floats into a single integer value, e.g.,
+The color and bound of arrow field can be further specified with additional parameters.
+
+For example:
+```python
+gui.arrow_field(x, bound=0.5, color=0xFFFFFF) # x is a field of shape=(5, 5, 2)
+```
+draws a 5 by 5 field of arrows pointing in random directions.
+
+![arrow_field](../static/assets/arrow_field.png)
+
+[`gui.point_field(radius)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=point_field#taichi.misc.gui.GUI.point_field)
+draws a field of points.
+
+The `radius` requires a field of `shape=(col, row)` where `col` refers to the number of columns of the point field and `row`
+refers to the number of rows of the point field.
+
+The color and bound of point field can be further specified with additional parameters.
+
+For example:
+```python
+x = numpy.array([[3, 5, 7, 9], [9, 7, 5, 3], [6, 6, 6, 6]])
+gui.point_field(radius=x, bound=0.5, color=0xED553B)
+```
+draws a 3 by 4 point field of radius stored in the array.
+
+![point_field](../static/assets/point_field.png)
+
+[`gui.text(content, pos)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=text#taichi.misc.gui.GUI.text)
+draws a line of text on screen.
+
+The font size and color of text can be further specified with additional parameters.
+
+## RGB & Hex conversion
+
+[`ti.hex_to_rgb(hex)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=hex_to_rgb#taichi.misc.gui.hex_to_rgb)
+can convert a single integer value to a (R, G, B) tuple of floats.
+
+[`ti.rgb_to_hex(rgb)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=rgb#taichi.misc.gui.rgb_to_hex)
+can convert a (R, G, B) tuple of floats into a single integer value, e.g.,
 
 ```python
 rgb = (0.4, 0.8, 1.0)
@@ -96,7 +269,6 @@ hex = ti.rgb_to_hex(rgb)  # np.array([0x66ccff, 0x007fff])
 The return values can be used in GUI drawing APIs.
 
 
-
 ## Event processing
 
 Every event have a key and type.
@@ -136,7 +308,8 @@ gui.get_event(ti.GUI.PRESS)
 gui.get_event((ti.GUI.PRESS, ti.GUI.ESCAPE), (ti.GUI.RELEASE, ti.GUI.SPACE))
 ```
 
-`gui.running` can help check the state of the window. `ti.GUI.EXIT` occurs when you click on the close (X) button of a window.
+[`gui.running`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=running#taichi.misc.gui.GUI.running)
+can help check the state of the window. `ti.GUI.EXIT` occurs when you click on the close (X) button of a window.
  `gui.running` will obtain `False` when the GUI is being closed.
 
 For example, loop until the close button is clicked:
@@ -156,7 +329,8 @@ You can also close the window by manually setting `gui.running` to`False`:
         gui.set_image(pixels)
         gui.show()
 
-`gui.get_event(a, ...)` tries to pop an event from the queue, and stores it into `gui.event`.
+[`gui.get_event(a, ...)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=get_event#taichi.misc.gui.GUI.get_event)
+tries to pop an event from the queue, and stores it into `gui.event`.
 
 For example:
 
@@ -170,7 +344,8 @@ For example, loop until ESC is pressed:
         gui.set_image(img)
         gui.show()
 
-`gui.get_events(a, ...)` is basically the same as `gui.get_event`, except that it returns a generator of events instead of storing into `gui.event`:
+[`gui.get_events(a, ...)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=get_event#taichi.misc.gui.GUI.get_events)
+is basically the same as `gui.get_event`, except that it returns a generator of events instead of storing into `gui.event`:
 
     for e in gui.get_events():
         if e.key == ti.GUI.ESCAPE:
@@ -180,7 +355,8 @@ For example, loop until ESC is pressed:
         elif e.key in ['a', ti.GUI.LEFT]:
             ...
 
-`gui.is_pressed(key, ...)` can detect the keys you pressed. It must be used together with `gui.get_event`, or it won't be updated! For
+[`gui.is_pressed(key, ...)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=is_pressed#taichi.misc.gui.GUI.is_pressed)
+can detect the keys you pressed. It must be used together with `gui.get_event`, or it won't be updated! For
 example:
 
     while True:
@@ -190,11 +366,30 @@ example:
         elif gui.is_pressed('d', ti.GUI.RIGHT):
             print('Go right!')
 
-`gui.get_cursor_pos()` can return current cursor position within the window. For example:
+:::caution
+
+`gui.is_pressed()` must be used together with `gui.get_event()`, or it won't be updated!
+
+:::
+
+For example:
+
+```python
+while True:
+    gui.get_event() # must be called before is_pressed
+    if gui.is_pressed('a', ti.GUI.LEFT):
+        print('Go left!')
+    elif gui.is_pressed('d', ti.GUI.RIGHT):
+        print('Go right!')
+```
+
+[`gui.get_cursor_pos()`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=get_cursor#taichi.misc.gui.GUI.get_cursor_pos)
+can return current cursor position within the window. For example:
 
     mouse_x, mouse_y = gui.get_cursor_pos()
 
-`gui.fps_limit` sets the FPS limit for a window. For example, to cap FPS at 24, simply use `gui.fps_limit = 24`. This helps reduce the overload on your hardware especially when you're using OpenGL on your integrated GPU which could make desktop slow to response.
+[`gui.fps_limit`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=fps#taichi.misc.gui.GUI.fps_limit)
+sets the FPS limit for a window. For example, to cap FPS at 24, simply use `gui.fps_limit = 24`. This helps reduce the load on your hardware, especially when you're using OpenGL on your integrated GPU, which could make the desktop slow to respond.
 
 
 
@@ -202,22 +397,33 @@ example:
 
 Sometimes it's more intuitive to use widgets like slider or button to control the program variables instead of using chaotic keyboard bindings. Taichi GUI provides a set of widgets for that reason:
 
-For example:
+[`gui.slider(text, min, max)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=slider#taichi.misc.gui.GUI.slider)
+creates a slider following the text `{text}: {value:.3f}`.
 
-    radius = gui.slider('Radius', 1, 50)
+[`gui.label(text)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=label#taichi.misc.gui.GUI.label)
+displays the label as: `{text}: {value:.3f}`.
 
-    while gui.running:
-        print('The radius now is', radius.value)
-        ...
-        radius.value += 0.01
-        ...
-        gui.show()
+[`gui.button(text)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=button#taichi.misc.gui.GUI.button)
+creates a button with text on it.
+
+For example:
+```python
+radius = gui.slider('Radius', 1, 50)
+
+while gui.running:
+    print('The radius now is', radius.value)
+    ...
+    radius.value += 0.01
+    ...
+    gui.show()
+```
 
 
 
 ## Image I/O
 
-`ti.imwrite(img, filename)` can export a `np.ndarray` or Taichi field (`ti.Matrix.field`,  `ti.Vector.field`, or `ti.field`) to a specified location `filename`.
+[`ti.imwrite(img, filename)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=imwrite#taichi.misc.image.imwrite)
+can export a `np.ndarray` or Taichi field (`ti.Matrix.field`,  `ti.Vector.field`, or `ti.field`) to a specified location `filename`.
 
 Same as `ti.GUI.show(filename)`, the format of the exported image is determined by **the suffix of** `filename` as well. Now `ti.imwrite` supports exporting images to `png`, `img` and `jpg` and we recommend using `png`.
 
@@ -235,7 +441,7 @@ pixels = ti.field(dtype=type, shape=shape)
 @ti.kernel
 def draw():
     for i, j in pixels:
-        pixels[i, j] = ti.random() * 255    # integars between [0, 255] for ti.u8
+        pixels[i, j] = ti.random() * 255    # integers between [0, 255] for ti.u8
 
 draw()
 
@@ -246,8 +452,8 @@ Besides, for RGB or RGBA images, `ti.imwrite` needs to receive a field which has
 
 Generally the value of the pixels on each channel of a `png` image is an integer in \[0, 255\]. For this reason, `ti.imwrite` will **cast fields** which has different data types all **into integers between \[0, 255\]**. As a result, `ti.imwrite` has the following requirements for different data types of input fields:
 
-- For float-type (`ti.f16`, `ti.f32`, etc) input fields, **the value of each pixel should be float between \[0.0, 1.0\]**. Otherwise `ti.imwrite` will first clip them into \[0.0, 1.0\]. Then they are multiplied by 256 and casted to integers ranging from \[0, 255\].
-- For int-type (`ti.u8`, `ti.u16`, etc) input fields, **the value of each pixel can be any valid integer in its own bounds**. These integers in this field will be scaled to \[0, 255\] by being divided over the upper bound of its basic type accordingly.
+- For float-type (`ti.f16`, `ti.f32`, etc.) input fields, **the value of each pixel should be float between \[0.0, 1.0\]**. Otherwise `ti.imwrite` will first clip them into \[0.0, 1.0\]. Then they are multiplied by 256 and cast to integers ranging from \[0, 255\].
+- For int-type (`ti.u8`, `ti.u16`, etc.) input fields, **the value of each pixel can be any valid integer in its own bounds**. These integers in this field will be scaled to \[0, 255\] by being divided over the upper bound of its basic type accordingly.
 
 Here is another example:
 
@@ -271,3 +477,53 @@ draw()
 
 ti.imwrite(pixels, f"export_f32.png")
 ```
+
+[`ti.imread(filename)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=imread#taichi.misc.image.imread)
+loads an image from the target filename and returns it as a `np.ndarray(dtype=np.uint8)`.
+Each value in this returned field is an integer in [0, 255].
+
+[`ti.imshow(img, windname)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=imshow#taichi.misc.image.imshow)
+creates an instance of `ti.GUI` and shows the input image on the screen. It has the same logic as `ti.imwrite` for different data types.
+
+[`ti.imresize(img, w)`](https://api-docs.taichi.graphics/src/taichi.misc.html?highlight=imresize#taichi.misc.image.imresize)
+resizes the img specified.
+
+## Zero-copying frame buffer
+When the GUI resolution (window size) is large, it sometimes becomes difficult to achieve 60 FPS even without any kernel
+invocations between two frames.
+
+This is mainly due to the copy overhead, where Taichi GUI needs to copy the image buffer from one place to another.
+This process is necessary for the 2D drawing functions, such as `gui.circles`, to work. The larger the image shape is,
+the larger the overhead.
+
+Fortunately, sometimes your program only needs `gui.set_image` alone. In such cases, you can enable the `fast_gui` option
+for better performance. This mode allows Taichi GUI to directly write the image data to the frame buffer without additional
+copying, resulting in a much better FPS.
+
+```python
+gui = ti.GUI(title, res, fast_gui=True)
+```
+
+:::note
+
+Because of the zero-copying mechanism, the image passed into `gui.set_image` must already be in the display-compatible
+format. That is, this field must either be a `ti.Vector(3)` (RGB) or a `ti.Vector(4)` (RGBA). In addition, each channel
+must be of type `ti.f32`, `ti.f64` or `ti.u8`.
+
+:::
+
+:::note
+
+If possible, consider enabling this option, especially when `fullscreen=True`.
+
+:::
+
+:::caution
+
+Despite the performance boost, it has many limitations as trade-offs:
+
+`gui.set_image` is the only available paint API in this mode.
+
+`gui.set_image` will only take Taichi 3D or 4D vector fields (RGB or RGBA) as input.
+
+:::
diff --git a/docs/lang/articles/misc/install.md b/docs/lang/articles/misc/install.md
index 50ee3c14a66b5..9c06b15622ec4 100644
--- a/docs/lang/articles/misc/install.md
+++ b/docs/lang/articles/misc/install.md
@@ -6,20 +6,6 @@ sidebar_position: 0
 
 ### Linux issues
 
-- If Taichi crashes and reports `libtinfo.so.5 not found`:
-
-  - On Ubuntu, execute `sudo apt install libtinfo-dev`.
-
-  - On Arch Linux, first edit `/etc/pacman.conf`, and append these
-    lines:
-
-    ```
-    [archlinuxcn]
-    Server = https://mirrors.tuna.tsinghua.edu.cn/archlinuxcn/$arch
-    ```
-
-    Then execute `sudo pacman -Syy ncurses5-compat-libs`.
-
 - If Taichi crashes and reports
   `` /usr/lib/libstdc++.so.6: version `CXXABI_1.3.11' not found ``:
 
@@ -49,11 +35,11 @@ sidebar_position: 0
   ERROR: No matching distribution found for taichi
   ```
 
-  - Make sure you're using Python version 3.6/3.7/3.8:
+  - Make sure you're using Python version 3.6/3.7/3.8/3.9:
 
     ```bash
     python3 -c "print(__import__('sys').version[:3])"
-    # 3.6, 3.7 or 3.8
+    # 3.6, 3.7, 3.8 or 3.9
     ```
 
   - Make sure your Python executable is 64-bit:
@@ -74,7 +60,7 @@ sidebar_position: 0
   [E 05/14/20 10:46:49.911] Received signal 7 (Bus error)
   ```
 
-  This might be because that your NVIDIA GPU is pre-Pascal and it
+  This might be because your NVIDIA GPU is pre-Pascal, and it
   has limited support for [Unified
   Memory](https://www.nextplatform.com/2019/01/24/unified-memory-the-final-piece-of-the-gpu-programming-puzzle/).
 
diff --git a/docs/lang/articles/static/assets/arrow_field.png b/docs/lang/articles/static/assets/arrow_field.png
new file mode 100644
index 0000000000000..aec332feff18c
Binary files /dev/null and b/docs/lang/articles/static/assets/arrow_field.png differ
diff --git a/docs/lang/articles/static/assets/arrows.png b/docs/lang/articles/static/assets/arrows.png
new file mode 100644
index 0000000000000..a84b6f4069471
Binary files /dev/null and b/docs/lang/articles/static/assets/arrows.png differ
diff --git a/docs/lang/articles/static/assets/circles.png b/docs/lang/articles/static/assets/circles.png
new file mode 100644
index 0000000000000..e6665229a49c9
Binary files /dev/null and b/docs/lang/articles/static/assets/circles.png differ
diff --git a/docs/lang/articles/static/assets/colored_circles.png b/docs/lang/articles/static/assets/colored_circles.png
new file mode 100644
index 0000000000000..cc35c0626d8bb
Binary files /dev/null and b/docs/lang/articles/static/assets/colored_circles.png differ
diff --git a/docs/lang/articles/static/assets/lines.png b/docs/lang/articles/static/assets/lines.png
new file mode 100644
index 0000000000000..b64fc41927218
Binary files /dev/null and b/docs/lang/articles/static/assets/lines.png differ
diff --git a/docs/lang/articles/static/assets/point_field.png b/docs/lang/articles/static/assets/point_field.png
new file mode 100644
index 0000000000000..c6c185213b50d
Binary files /dev/null and b/docs/lang/articles/static/assets/point_field.png differ
diff --git a/docs/lang/articles/static/assets/rect.png b/docs/lang/articles/static/assets/rect.png
new file mode 100644
index 0000000000000..74c566248b3cc
Binary files /dev/null and b/docs/lang/articles/static/assets/rect.png differ
diff --git a/docs/lang/articles/static/assets/triangles.png b/docs/lang/articles/static/assets/triangles.png
new file mode 100644
index 0000000000000..32ffa160105b8
Binary files /dev/null and b/docs/lang/articles/static/assets/triangles.png differ
diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py
index 03f7b5fef83ab..555cfc2ec10ba 100644
--- a/python/taichi/__init__.py
+++ b/python/taichi/__init__.py
@@ -2,13 +2,16 @@
 
 import taichi.ad as ad
 from taichi._logging import *
-from taichi.core import *
+from taichi.core import (get_os_name, package_root, require_version,
+                         start_memory_monitoring)
+from taichi.core import ti_core as core
 from taichi.lang import *  # TODO(archibate): It's `taichi.lang.core` overriding `taichi.core`
 from taichi.main import main
 from taichi.misc import *
 from taichi.testing import *
 from taichi.tools import *
 from taichi.torch_io import from_torch, to_torch
+from taichi.type import *
 
 import taichi.ui as ui
 
diff --git a/python/taichi/ad.py b/python/taichi/ad.py
index ee6bba1f2011a..04a7ba61e6096 100644
--- a/python/taichi/ad.py
+++ b/python/taichi/ad.py
@@ -1,3 +1,6 @@
+from taichi.lang import impl
+
+
 def grad_replaced(func):
     """A decorator for python function to customize gradient with Taichi's autodiff
     system, e.g. `ti.Tape()` and `kernel.grad()`.
@@ -32,7 +35,6 @@ def grad_replaced(func):
         >>>     multiply_grad(a)"""
     def decorated(*args, **kwargs):
         # TODO [#3025]: get rid of circular imports and move this to the top.
-        from taichi.lang import impl
         impl.get_runtime().grad_replaced = True
         if impl.get_runtime().target_tape:
             impl.get_runtime().target_tape.insert(decorated, args)
diff --git a/python/taichi/core/__init__.py b/python/taichi/core/__init__.py
index 3296b46faeca5..a19fb81c1e88e 100644
--- a/python/taichi/core/__init__.py
+++ b/python/taichi/core/__init__.py
@@ -1,7 +1,3 @@
 from taichi.core.util import *
-# TODO: move this to taichi/__init__.py.
-#       This is blocked since we currently require importing this before taichi.lang
-#       but yapf refuses to give up formatting there.
-from taichi.type import *
 
 __all__ = [s for s in dir() if not s.startswith('_')]
diff --git a/python/taichi/diagnose.py b/python/taichi/diagnose.py
index 7c3449290257a..184e508e06da0 100644
--- a/python/taichi/diagnose.py
+++ b/python/taichi/diagnose.py
@@ -1,13 +1,13 @@
+import locale
+import os
+import platform
+import subprocess
+import sys
+
+
 def main():
     print('Taichi system diagnose:')
     print('')
-
-    import locale
-    import os
-    import platform
-    import subprocess
-    import sys
-
     executable = sys.executable
 
     print(f'python: {sys.version}')
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index a57fd880fdb7f..440b3ccf73f00 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -1,7 +1,13 @@
+import atexit
 import functools
 import os
+import shutil
+import tempfile
+import time
 from copy import deepcopy as _deepcopy
 
+import taichi.lang.linalg
+import taichi.lang.meta
 from taichi.core.util import locale_encode
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl, types
@@ -18,7 +24,8 @@
 from taichi.lang.ops import *
 from taichi.lang.quant_impl import quant
 from taichi.lang.runtime_ops import async_flush, sync
-from taichi.lang.sparse_matrix.sparse_matrix import SparseMatrix, SparseMatrixBuilder
+from taichi.lang.sparse_matrix.sparse_matrix import (SparseMatrix,
+                                                     SparseMatrixBuilder)
 from taichi.lang.sparse_matrix.sparse_solver import SparseSolver
 from taichi.lang.struct import Struct
 from taichi.lang.type_factory_impl import type_factory
@@ -31,9 +38,6 @@
 
 import taichi as ti
 
-# TODO(#2223): Remove
-core = _ti_core
-
 runtime = impl.get_runtime()
 
 i = axes(0)
@@ -301,10 +305,7 @@ def prepare_sandbox():
     Returns a temporary directory, which will be automatically deleted on exit.
     It may contain the taichi_core shared object or some misc. files.
     '''
-    import atexit
-    import shutil
-    from tempfile import mkdtemp
-    tmp_dir = mkdtemp(prefix='taichi-')
+    tmp_dir = tempfile.mkdtemp(prefix='taichi-')
     atexit.register(shutil.rmtree, tmp_dir)
     print(f'[Taichi] preparing sandbox at {tmp_dir}')
     os.mkdir(os.path.join(tmp_dir, 'runtime/'))
@@ -520,8 +521,7 @@ def polar_decompose(A, dt=None):
     """
     if dt is None:
         dt = impl.get_runtime().default_fp
-    from .linalg import polar_decompose
-    return polar_decompose(A, dt)
+    return taichi.lang.linalg.polar_decompose(A, dt)
 
 
 def svd(A, dt=None):
@@ -539,8 +539,7 @@ def svd(A, dt=None):
     """
     if dt is None:
         dt = impl.get_runtime().default_fp
-    from .linalg import svd
-    return svd(A, dt)
+    return taichi.lang.linalg.svd(A, dt)
 
 
 def eig(A, dt=None):
@@ -559,9 +558,8 @@ def eig(A, dt=None):
     """
     if dt is None:
         dt = impl.get_runtime().default_fp
-    from taichi.lang import linalg
     if A.n == 2:
-        return linalg.eig2x2(A, dt)
+        return taichi.lang.linalg.eig2x2(A, dt)
     raise Exception("Eigen solver only supports 2D matrices.")
 
 
@@ -582,9 +580,8 @@ def sym_eig(A, dt=None):
     assert all(A == A.transpose()), "A needs to be symmetric"
     if dt is None:
         dt = impl.get_runtime().default_fp
-    from taichi.lang import linalg
     if A.n == 2:
-        return linalg.sym_eig2x2(A, dt)
+        return taichi.lang.linalg.sym_eig2x2(A, dt)
     raise Exception("Symmetric eigen solver only supports 2D matrices.")
 
 
@@ -601,7 +598,7 @@ def randn(dt=None):
     """
     if dt is None:
         dt = impl.get_runtime().default_fp
-    from .random import randn
+    from taichi.lang._random import randn
     return randn(dt)
 
 
@@ -649,8 +646,7 @@ def Tape(loss, clear_gradients=True):
     if clear_gradients:
         clear_all_gradients()
 
-    from taichi.lang.meta import clear_loss
-    clear_loss(loss)
+    taichi.lang.meta.clear_loss(loss)
 
     return runtime.get_tape(loss)
 
@@ -671,8 +667,7 @@ def visit(node):
 
         places = tuple(places)
         if places:
-            from taichi.lang.meta import clear_gradients
-            clear_gradients(places)
+            taichi.lang.meta.clear_gradients(places)
 
     for root_fb in FieldsBuilder.finalized_roots():
         visit(root_fb)
@@ -685,8 +680,6 @@ def deactivate_all_snodes():
 
 
 def benchmark(func, repeat=300, args=()):
-    import time
-
     def run_benchmark():
         compile_time = time.time()
         func(*args)  # compile the kernel first
@@ -740,8 +733,8 @@ def benchmark_plot(fn=None,
                    bar_distance=0,
                    left_margin=0,
                    size=(12, 8)):
-    import matplotlib.pyplot as plt
-    import yaml
+    import matplotlib.pyplot as plt  # pylint: disable=C0415
+    import yaml  # pylint: disable=C0415
     if fn is None:
         fn = os.path.join(_ti_core.get_repo_dir(), 'benchmarks', 'output',
                           'benchmark.yml')
@@ -860,7 +853,7 @@ def benchmark_plot(fn=None,
 
 
 def stat_write(key, value):
-    import yaml
+    import yaml  # pylint: disable=C0415
     case_name = os.environ.get('TI_CURRENT_BENCHMARK')
     if case_name is None:
         return
diff --git a/python/taichi/lang/random.py b/python/taichi/lang/_random.py
similarity index 87%
rename from python/taichi/lang/random.py
rename to python/taichi/lang/_random.py
index 6d3c5a46368d8..af92b772083d8 100644
--- a/python/taichi/lang/random.py
+++ b/python/taichi/lang/_random.py
@@ -1,9 +1,11 @@
 import math
 
+from taichi.lang.kernel_impl import func
+
 import taichi as ti
 
 
-@ti.func
+@func
 def randn(dt):
     '''
     Generates a random number from standard normal distribution
diff --git a/python/taichi/lang/ast/checkers.py b/python/taichi/lang/ast/checkers.py
index f3dfb292c63cb..fb8a3d62ba418 100644
--- a/python/taichi/lang/ast/checkers.py
+++ b/python/taichi/lang/ast/checkers.py
@@ -1,5 +1,6 @@
 import ast
 
+import taichi.lang.kernel_impl
 from taichi.lang.shell import oinspect
 
 
@@ -68,8 +69,7 @@ def generic_visit(self, node):
             return
 
         if not (self.top_level or self.current_scope.allows_more_stmt):
-            import taichi as ti
-            raise ti.KernelDefError(
+            raise taichi.lang.kernel_impl.KernelDefError(
                 f'No more statements allowed, at {self.get_error_location(node)}'
             )
         old_top_level = self.top_level
@@ -96,8 +96,7 @@ def visit_For(self, node):
             is_static = False
         if not (self.top_level or self.current_scope.allows_for_loop
                 or is_static):
-            import taichi as ti
-            raise ti.KernelDefError(
+            raise taichi.lang.kernel_impl.KernelDefError(
                 f'No more for loops allowed, at {self.get_error_location(node)}'
             )
 
diff --git a/python/taichi/lang/ast/transformer.py b/python/taichi/lang/ast/transformer.py
index 9e83ffbbee046..01ac896f4a04f 100644
--- a/python/taichi/lang/ast/transformer.py
+++ b/python/taichi/lang/ast/transformer.py
@@ -1,8 +1,11 @@
 import ast
 
+import astor
 from taichi.lang import impl
 from taichi.lang.ast.symbol_resolver import ASTResolver
+from taichi.lang.ast_builder_utils import BuilderContext
 from taichi.lang.exception import TaichiSyntaxError
+from taichi.lang.stmt_builder import build_stmt
 
 import taichi as ti
 
@@ -26,12 +29,9 @@ def print_ast(tree, title=None):
             return
         if title is not None:
             ti.info(f'{title}:')
-        import astor
         print(astor.to_source(tree.body[0], indent_with='    '))
 
     def visit(self, tree):
-        from taichi.lang.ast_builder_utils import BuilderContext
-        from taichi.lang.stmt_builder import build_stmt
         self.print_ast(tree, 'Initial AST')
         ctx = BuilderContext(func=self.func,
                              excluded_parameters=self.excluded_parameters,
diff --git a/python/taichi/lang/ast_builder_utils.py b/python/taichi/lang/ast_builder_utils.py
index d8b563932e5c0..103bc3a4fd368 100644
--- a/python/taichi/lang/ast_builder_utils.py
+++ b/python/taichi/lang/ast_builder_utils.py
@@ -8,7 +8,7 @@ def __call__(self, ctx, node):
         method = getattr(self, 'build_' + node.__class__.__name__, None)
         if method is None:
             try:
-                import astpretty
+                import astpretty  # pylint: disable=C0415
                 error_msg = f'Unsupported node {node}:\n{astpretty.pformat(node)}'
             except:
                 error_msg = f'Unsupported node {node}'
diff --git a/python/taichi/lang/expr.py b/python/taichi/lang/expr.py
index 93093f6ffb3db..0f9b14957cc5b 100644
--- a/python/taichi/lang/expr.py
+++ b/python/taichi/lang/expr.py
@@ -1,3 +1,4 @@
+import numpy as np
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl
 from taichi.lang.common_ops import TaichiOperations
@@ -25,7 +26,6 @@ def __init__(self, *args, tb=None):
                 # assume to be constant
                 arg = args[0]
                 try:
-                    import numpy as np
                     if isinstance(arg, np.ndarray):
                         arg = arg.dtype(arg)
                 except:
diff --git a/python/taichi/lang/field.py b/python/taichi/lang/field.py
index 1cbffe75ef813..9d480e39bc0f6 100644
--- a/python/taichi/lang/field.py
+++ b/python/taichi/lang/field.py
@@ -230,7 +230,7 @@ def fill(self, val):
     def to_numpy(self, dtype=None):
         if dtype is None:
             dtype = to_numpy_type(self.dtype)
-        import numpy as np
+        import numpy as np  # pylint: disable=C0415
         arr = np.zeros(shape=self.shape, dtype=dtype)
         from taichi.lang.meta import tensor_to_ext_arr
         tensor_to_ext_arr(self, arr)
@@ -239,7 +239,7 @@ def to_numpy(self, dtype=None):
 
     @python_scope
     def to_torch(self, device=None):
-        import torch
+        import torch  # pylint: disable=C0415
         arr = torch.zeros(size=self.shape,
                           dtype=to_pytorch_type(self.dtype),
                           device=device)
diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py
index ff231de970622..a1b3b61d14690 100644
--- a/python/taichi/lang/impl.py
+++ b/python/taichi/lang/impl.py
@@ -19,6 +19,7 @@
                               python_scope, taichi_scope, to_pytorch_type)
 from taichi.misc.util import deprecated, get_traceback, warning
 from taichi.snode.fields_builder import FieldsBuilder
+from taichi.type.primitive_types import f32, f64, i32, i64, u32, u64
 
 import taichi as ti
 
@@ -284,8 +285,8 @@ def __init__(self, kernels=None):
         self.global_vars = []
         self.print_preprocessed = False
         self.experimental_real_function = False
-        self.default_fp = ti.f32
-        self.default_ip = ti.i32
+        self.default_fp = f32
+        self.default_ip = i32
         self.target_tape = None
         self.grad_replaced = False
         self.kernels = kernels or []
@@ -294,12 +295,12 @@ def get_num_compiled_functions(self):
         return len(self.compiled_functions) + len(self.compiled_grad_functions)
 
     def set_default_fp(self, fp):
-        assert fp in [ti.f32, ti.f64]
+        assert fp in [f32, f64]
         self.default_fp = fp
         default_cfg().default_fp = self.default_fp
 
     def set_default_ip(self, ip):
-        assert ip in [ti.i32, ti.i64]
+        assert ip in [i32, i64]
         self.default_ip = ip
         default_cfg().default_ip = self.default_ip
 
@@ -389,23 +390,23 @@ def _clamp_unsigned_to_range(npty, val):
 def make_constant_expr(val):
     _taichi_skip_traceback = 1
     if isinstance(val, (int, np.integer)):
-        if pytaichi.default_ip in {ti.i32, ti.u32}:
+        if pytaichi.default_ip in {i32, u32}:
             # It is not always correct to do such clamp without the type info on
             # the LHS, but at least this makes assigning constant to unsigned
             # int work. See https://github.com/taichi-dev/taichi/issues/2060
             return Expr(
                 _ti_core.make_const_expr_i32(
                     _clamp_unsigned_to_range(np.int32, val)))
-        elif pytaichi.default_ip in {ti.i64, ti.u64}:
+        elif pytaichi.default_ip in {i64, u64}:
             return Expr(
                 _ti_core.make_const_expr_i64(
                     _clamp_unsigned_to_range(np.int64, val)))
         else:
             assert False
     elif isinstance(val, (float, np.floating, np.ndarray)):
-        if pytaichi.default_fp == ti.f32:
+        if pytaichi.default_fp == f32:
             return Expr(_ti_core.make_const_expr_f32(val))
-        elif pytaichi.default_fp == ti.f64:
+        elif pytaichi.default_fp == f64:
             return Expr(_ti_core.make_const_expr_f64(val))
         else:
             assert False
diff --git a/python/taichi/lang/kernel_arguments.py b/python/taichi/lang/kernel_arguments.py
index deae64ec54ce6..c823c13b26f58 100644
--- a/python/taichi/lang/kernel_arguments.py
+++ b/python/taichi/lang/kernel_arguments.py
@@ -2,6 +2,7 @@
 from taichi.lang.any_array import AnyArray
 from taichi.lang.enums import Layout
 from taichi.lang.expr import Expr
+from taichi.lang.ndarray import ScalarNdarray
 from taichi.lang.snode import SNode
 from taichi.lang.sparse_matrix.sparse_matrix import SparseMatrixBuilder
 from taichi.lang.util import cook_dtype, to_taichi_type
@@ -47,7 +48,6 @@ def __init__(self, element_dim=None, layout=None):
 
     def extract(self, x):
         from taichi.lang.matrix import MatrixNdarray, VectorNdarray
-        from taichi.lang.ndarray import ScalarNdarray
         if isinstance(x, ScalarNdarray):
             self.check_element_dim(x, 0)
             return x.dtype, len(x.shape), (), Layout.AOS
diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py
index 54ccdba44b507..b1555eb1f800a 100644
--- a/python/taichi/lang/kernel_impl.py
+++ b/python/taichi/lang/kernel_impl.py
@@ -19,6 +19,9 @@
 
 import taichi as ti
 
+if util.has_pytorch():
+    import torch
+
 
 def _remove_indent(lines):
     lines = lines.split('\n')
@@ -492,7 +495,6 @@ def func__(*args):
                     if isinstance(v, Ndarray):
                         v = v.arr
                     has_external_arrays = True
-                    has_torch = util.has_pytorch()
                     is_numpy = isinstance(v, np.ndarray)
                     if is_numpy:
                         tmp = np.ascontiguousarray(v)
@@ -509,8 +511,7 @@ def call_back():
 
                             return call_back
 
-                        assert has_torch
-                        import torch
+                        assert util.has_pytorch()
                         assert isinstance(v, torch.Tensor)
                         tmp = v
                         taichi_arch = self.runtime.prog.config.arch
@@ -577,7 +578,6 @@ def call_back():
     def match_ext_arr(self, v):
         has_array = isinstance(v, np.ndarray)
         if not has_array and util.has_pytorch():
-            import torch
             has_array = isinstance(v, torch.Tensor)
         return has_array
 
@@ -762,9 +762,14 @@ def data_oriented(cls):
     Returns:
         The decorated class.
     """
-    def getattr(self, item):
+    def _getattr(self, item):
         _taichi_skip_traceback = 1
-        x = super(cls, self).__getattribute__(item)
+        method = getattr(cls, item, None)
+        is_property = method.__class__ == property
+        if is_property:
+            x = method.fget
+        else:
+            x = super(cls, self).__getattribute__(item)
         if hasattr(x, '_is_wrapped_kernel'):
             if inspect.ismethod(x):
                 wrapped = x.__func__
@@ -774,10 +779,14 @@ def getattr(self, item):
             if wrapped._is_classkernel:
                 ret = _BoundedDifferentiableMethod(self, wrapped)
                 ret.__name__ = wrapped.__name__
+                if is_property:
+                    return ret()
                 return ret
+        if is_property:
+            return x(self)
         return x
 
-    cls.__getattribute__ = getattr
+    cls.__getattribute__ = _getattr
     cls._data_oriented = True
 
     return cls
diff --git a/python/taichi/lang/linalg.py b/python/taichi/lang/linalg.py
index b8590409ac0e0..a3572f6b067d5 100644
--- a/python/taichi/lang/linalg.py
+++ b/python/taichi/lang/linalg.py
@@ -1,10 +1,11 @@
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang.impl import expr_init
+from taichi.lang.kernel_impl import func
 
 import taichi as ti
 
 
-@ti.func
+@func
 def polar_decompose2d(A, dt):
     """Perform polar decomposition (A=UP) for 2x2 matrix.
 
@@ -25,7 +26,7 @@ def polar_decompose2d(A, dt):
     return r, r.transpose() @ A
 
 
-@ti.func
+@func
 def polar_decompose3d(A, dt):
     """Perform polar decomposition (A=UP) for 3x3 matrix.
 
@@ -43,7 +44,7 @@ def polar_decompose3d(A, dt):
 
 
 # https://www.seas.upenn.edu/~cffjiang/research/svd/svd.pdf
-@ti.func
+@func
 def svd2d(A, dt):
     """Perform singular value decomposition (A=USV^T) for 2x2 matrix.
 
@@ -126,7 +127,7 @@ def svd3d(A, dt, iters=None):
     return U, sigma, V
 
 
-@ti.func
+@func
 def eig2x2(A, dt):
     """Compute the eigenvalues and right eigenvectors (Av=lambda v) of a 2x2 real matrix.
 
@@ -176,7 +177,7 @@ def eig2x2(A, dt):
     return eigenvalues, eigenvectors
 
 
-@ti.func
+@func
 def sym_eig2x2(A, dt):
     """Compute the eigenvalues and right eigenvectors (Av=lambda v) of a 2x2 real symmetric matrix.
 
@@ -212,7 +213,7 @@ def sym_eig2x2(A, dt):
     return eigenvalues, eigenvectors
 
 
-@ti.func
+@func
 def svd(A, dt):
     """Perform singular value decomposition (A=USV^T) for arbitrary size matrix.
 
@@ -236,7 +237,7 @@ def svd(A, dt):
         raise Exception("SVD only supports 2D and 3D matrices.")
 
 
-@ti.func
+@func
 def polar_decompose(A, dt):
     """Perform polar decomposition (A=UP) for arbitrary size matrix.
 
diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py
index a814ca11af858..510e626df0944 100644
--- a/python/taichi/lang/matrix.py
+++ b/python/taichi/lang/matrix.py
@@ -1306,7 +1306,7 @@ def to_numpy(self, keep_dims=False, as_vector=None, dtype=None):
             dtype = to_numpy_type(self.dtype)
         as_vector = self.m == 1 and not keep_dims
         shape_ext = (self.n, ) if as_vector else (self.n, self.m)
-        import numpy as np
+        import numpy as np  # pylint: disable=C0415
         arr = np.zeros(self.shape + shape_ext, dtype=dtype)
         from taichi.lang.meta import matrix_to_ext_arr
         matrix_to_ext_arr(self, arr, as_vector)
@@ -1324,7 +1324,7 @@ def to_torch(self, device=None, keep_dims=False):
         Returns:
             torch.tensor: The result torch tensor.
         """
-        import torch
+        import torch  # pylint: disable=C0415
         as_vector = self.m == 1 and not keep_dims
         shape_ext = (self.n, ) if as_vector else (self.n, self.m)
         arr = torch.empty(self.shape + shape_ext,
diff --git a/python/taichi/lang/ndarray.py b/python/taichi/lang/ndarray.py
index a902cc4b800d3..f0a59a564cdbb 100644
--- a/python/taichi/lang/ndarray.py
+++ b/python/taichi/lang/ndarray.py
@@ -1,9 +1,13 @@
+import numpy as np
 from taichi.core.util import ti_core as _ti_core
 from taichi.lang import impl
 from taichi.lang.enums import Layout
 from taichi.lang.util import (cook_dtype, has_pytorch, python_scope,
                               to_pytorch_type, to_taichi_type)
 
+if has_pytorch():
+    import torch
+
 
 class Ndarray:
     """Taichi ndarray class implemented with a torch tensor.
@@ -15,7 +19,6 @@ class Ndarray:
     def __init__(self, dtype, shape):
         assert has_pytorch(
         ), "PyTorch must be available if you want to create a Taichi ndarray."
-        import torch
         self.arr = torch.zeros(shape, dtype=to_pytorch_type(cook_dtype(dtype)))
         if impl.current_cfg().arch == _ti_core.Arch.cuda:
             self.arr = self.arr.cuda()
@@ -94,14 +97,12 @@ def from_numpy(self, arr):
         Args:
             arr (numpy.ndarray): The source numpy array.
         """
-        import numpy as np
         if not isinstance(arr, np.ndarray):
             raise TypeError(f"{np.ndarray} expected, but {type(arr)} provided")
         if tuple(self.arr.shape) != tuple(arr.shape):
             raise ValueError(
                 f"Mismatch shape: {tuple(self.arr.shape)} expected, but {tuple(arr.shape)} provided"
             )
-        import torch
         self.arr = torch.from_numpy(arr).to(self.arr.dtype)
 
 
diff --git a/python/taichi/lang/shell.py b/python/taichi/lang/shell.py
index f004db1e921d7..c21262917edaa 100644
--- a/python/taichi/lang/shell.py
+++ b/python/taichi/lang/shell.py
@@ -3,18 +3,17 @@
 import os
 import sys
 
+from taichi._logging import info, warn
 from taichi.core.util import ti_core as _ti_core
 
-import taichi as ti
-
 try:
     import sourceinspect as oinspect
 except ImportError:
-    ti.warn('`sourceinspect` not installed!')
-    ti.warn(
+    warn('`sourceinspect` not installed!')
+    warn(
         'Without this package Taichi may not function well in Python IDLE interactive shell, '
         'Blender scripting module and Python native shell.')
-    ti.warn('Please run `python3 -m pip install sourceinspect` to install.')
+    warn('Please run `python3 -m pip install sourceinspect` to install.')
     import inspect as oinspect
 
 pybuf_enabled = False
@@ -32,7 +31,7 @@ def _shell_pop_print(old_call):
         # zero-overhead!
         return old_call
 
-    ti.info('Graphical python shell detected, using wrapped sys.stdout')
+    info('Graphical python shell detected, using wrapped sys.stdout')
 
     @functools.wraps(old_call)
     def new_call(*args, **kwargs):
diff --git a/python/taichi/lang/sparse_matrix/sparse_matrix.py b/python/taichi/lang/sparse_matrix/sparse_matrix.py
index bacc9b71bd126..496c4be62c165 100644
--- a/python/taichi/lang/sparse_matrix/sparse_matrix.py
+++ b/python/taichi/lang/sparse_matrix/sparse_matrix.py
@@ -1,9 +1,13 @@
+import numpy as np
+from taichi.core.util import ti_core as _ti_core
+from taichi.lang.field import Field
+
+
 class SparseMatrix:
     def __init__(self, n=None, m=None, sm=None):
         if sm is None:
             self.n = n
             self.m = m if m else n
-            from taichi.core.util import ti_core as _ti_core
             self.matrix = _ti_core.create_sparse_matrix(n, m)
         else:
             self.n = sm.num_rows()
@@ -39,8 +43,6 @@ def transpose(self):
         return SparseMatrix(sm=sm)
 
     def __matmul__(self, other):
-        import numpy as np
-        from taichi.lang import Field
         if isinstance(other, SparseMatrix):
             assert self.m == other.n, f"Dimension mismatch between sparse matrices ({self.n}, {self.m}) and ({other.n}, {other.m})"
             sm = self.matrix.matmul(other.matrix)
@@ -71,7 +73,6 @@ def __init__(self, num_rows=None, num_cols=None, max_num_triplets=0):
         self.num_rows = num_rows
         self.num_cols = num_cols if num_cols else num_rows
         if num_rows is not None:
-            from taichi.core.util import ti_core as _ti_core
             self.ptr = _ti_core.create_sparse_matrix_builder(
                 num_rows, num_cols, max_num_triplets)
 
diff --git a/python/taichi/lang/sparse_matrix/sparse_solver.py b/python/taichi/lang/sparse_matrix/sparse_solver.py
index e535a59431e0d..fb8c326b13de0 100644
--- a/python/taichi/lang/sparse_matrix/sparse_solver.py
+++ b/python/taichi/lang/sparse_matrix/sparse_solver.py
@@ -1,3 +1,7 @@
+import numpy as np
+from taichi.core.util import ti_core as _ti_core
+from taichi.lang import Field
+from taichi.lang.impl import get_runtime
 from taichi.lang.sparse_matrix.sparse_matrix import SparseMatrix
 
 
@@ -6,8 +10,6 @@ def __init__(self, solver_type="LLT", ordering="AMD"):
         solver_type_list = ["LLT", "LDLT", "LU"]
         solver_ordering = ['AMD', 'COLAMD']
         if solver_type in solver_type_list and ordering in solver_ordering:
-            from taichi.core.util import ti_core as _ti_core
-            from taichi.lang.impl import get_runtime
             taichi_arch = get_runtime().prog.config.arch
             assert taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64, "SparseSolver only supports CPU for now."
             self.solver = _ti_core.make_sparse_solver(solver_type, ordering)
@@ -37,8 +39,6 @@ def factorize(self, sparse_matrix):
             self.type_assert(sparse_matrix)
 
     def solve(self, b):
-        import numpy as np
-        from taichi.lang import Field
         if isinstance(b, Field):
             return self.solver.solve(b.to_numpy())
         elif isinstance(b, np.ndarray):
diff --git a/python/taichi/lang/stmt_builder.py b/python/taichi/lang/stmt_builder.py
index 8fedd738d9ce8..60308cf4f424a 100644
--- a/python/taichi/lang/stmt_builder.py
+++ b/python/taichi/lang/stmt_builder.py
@@ -1,6 +1,7 @@
 import ast
 import copy
 
+import astor
 from taichi.lang import impl
 from taichi.lang.ast.symbol_resolver import ASTResolver
 from taichi.lang.ast_builder_utils import *
@@ -84,7 +85,6 @@ def build_Assert(ctx, node):
                 raise ValueError(
                     f"assert info must be constant, not {ast.dump(node.msg)}")
         else:
-            import astor
             msg = astor.to_source(node.test)
         node.test = build_expr(ctx, node.test)
 
@@ -97,7 +97,6 @@ def build_Assert(ctx, node):
 
     @staticmethod
     def build_Assign(ctx, node):
-        assert (len(node.targets) == 1)
         node.value = build_expr(ctx, node.value)
         node.targets = build_exprs(ctx, node.targets)
 
@@ -107,85 +106,93 @@ def build_Assign(ctx, node):
         if is_static_assign:
             return node
 
-        if isinstance(node.targets[0], ast.Tuple):
-            targets = node.targets[0].elts
-
-            # Create
-            stmts = []
-
-            holder = parse_stmt('__tmp_tuple = ti.expr_init_list(0, '
-                                f'{len(targets)})')
-            holder.value.args[0] = node.value
-
-            stmts.append(holder)
-
-            def tuple_indexed(i):
-                indexing = parse_stmt('__tmp_tuple[0]')
-                StmtBuilder.set_subscript_index(indexing.value,
-                                                parse_expr("{}".format(i)))
-                return indexing.value
-
-            for i, target in enumerate(targets):
-                is_local = isinstance(target, ast.Name)
-                if is_local and ctx.is_creation(target.id):
-                    var_name = target.id
-                    target.ctx = ast.Store()
-                    # Create, no AST resolution needed
-                    init = ast.Attribute(value=ast.Name(id='ti',
-                                                        ctx=ast.Load()),
-                                         attr='expr_init',
-                                         ctx=ast.Load())
-                    rhs = ast.Call(
-                        func=init,
-                        args=[tuple_indexed(i)],
-                        keywords=[],
-                    )
-                    ctx.create_variable(var_name)
-                    stmts.append(
-                        ast.Assign(targets=[target],
-                                   value=rhs,
-                                   type_comment=None))
-                else:
-                    # Assign
-                    target.ctx = ast.Load()
-                    func = ast.Attribute(value=target,
-                                         attr='assign',
-                                         ctx=ast.Load())
-                    call = ast.Call(func=func,
-                                    args=[tuple_indexed(i)],
-                                    keywords=[])
-                    stmts.append(ast.Expr(value=call))
-
-            for stmt in stmts:
-                ast.copy_location(stmt, node)
-            stmts.append(parse_stmt('del __tmp_tuple'))
-            return StmtBuilder.make_single_statement(stmts)
-        else:
-            is_local = isinstance(node.targets[0], ast.Name)
-            if is_local and ctx.is_creation(node.targets[0].id):
-                var_name = node.targets[0].id
-                # Create, no AST resolution needed
-                init = ast.Attribute(value=ast.Name(id='ti', ctx=ast.Load()),
-                                     attr='expr_init',
-                                     ctx=ast.Load())
-                rhs = ast.Call(
-                    func=init,
-                    args=[node.value],
-                    keywords=[],
-                )
-                ctx.create_variable(var_name)
-                return ast.copy_location(
-                    ast.Assign(targets=node.targets,
-                               value=rhs,
-                               type_comment=None), node)
+        # Collect every generated assign statement and merge them into one at
+        # the end; the list exists to support chained assignments.
+        # Ref https://github.com/taichi-dev/taichi/issues/2659.
+        assign_stmts = []
+        for node_target in node.targets:
+            if isinstance(node_target, ast.Tuple):
+                assign_stmts.append(
+                    StmtBuilder.build_assign_unpack(ctx, node, node_target))
             else:
-                # Assign
-                node.targets[0].ctx = ast.Load()
-                func = ast.Attribute(value=node.targets[0],
-                                     attr='assign',
-                                     ctx=ast.Load())
-                call = ast.Call(func=func, args=[node.value], keywords=[])
-                return ast.copy_location(ast.Expr(value=call), node)
+                assign_stmts.append(
+                    StmtBuilder.build_assign_basic(ctx, node, node_target,
+                                                   node.value))
+        return StmtBuilder.make_single_statement(assign_stmts)
+
+    @staticmethod
+    def build_assign_unpack(ctx, node, node_target):
+        """Build the unpack assignments like this: (target1, target2) = (value1, value2).
+        The function should be called only if the node target is a tuple.
+
+        Args:
+            ctx (ast_builder_utils.BuilderContext): The builder context.
+            node (ast.Assign): An assignment. targets is a list of nodes,
+            and value is a single node.
+            node_target (ast.Tuple): A list or tuple object. elts holds a
+            list of nodes representing the elements.
+        """
+
+        targets = node_target.elts
+
+        # Create
+        stmts = []
+
+        # Create a temp list and keep values in it, delete it after the initialization is finished.
+        holder = parse_stmt('__tmp_tuple = ti.expr_init_list(0, '
+                            f'{len(targets)})')
+        holder.value.args[0] = node.value
+
+        stmts.append(holder)
+
+        def tuple_indexed(i):
+            indexing = parse_stmt('__tmp_tuple[0]')
+            StmtBuilder.set_subscript_index(indexing.value, parse_expr(f"{i}"))
+            return indexing.value
+
+        # Generate assign statements for every target, then merge them into one.
+        for i, target in enumerate(targets):
+            stmts.append(
+                StmtBuilder.build_assign_basic(ctx, node, target,
+                                               tuple_indexed(i)))
+        stmts.append(parse_stmt('del __tmp_tuple'))
+        return StmtBuilder.make_single_statement(stmts)
+
+    @staticmethod
+    def build_assign_basic(ctx, node, target, value):
+        """Build basic assignment like this: target = value.
+
+        Args:
+            ctx (ast_builder_utils.BuilderContext): The builder context.
+            node (ast.Assign): An assignment. targets is a list of nodes,
+            and value is a single node.
+            target (ast.Name): A variable name. id holds the name as
+            a string.
+            value: A node representing the value.
+        """
+        is_local = isinstance(target, ast.Name)
+        if is_local and ctx.is_creation(target.id):
+            var_name = target.id
+            target.ctx = ast.Store()
+            # Create, no AST resolution needed
+            init = ast.Attribute(value=ast.Name(id='ti', ctx=ast.Load()),
+                                 attr='expr_init',
+                                 ctx=ast.Load())
+            rhs = ast.Call(
+                func=init,
+                args=[value],
+                keywords=[],
+            )
+            ctx.create_variable(var_name)
+            return ast.copy_location(
+                ast.Assign(targets=[target], value=rhs, type_comment=None),
+                node)
+        else:
+            # Assign
+            target.ctx = ast.Load()
+            func = ast.Attribute(value=target, attr='assign', ctx=ast.Load())
+            call = ast.Call(func=func, args=[value], keywords=[])
+            return ast.copy_location(ast.Expr(value=call), node)
 
     @staticmethod
     def build_Try(ctx, node):
diff --git a/python/taichi/lang/types.py b/python/taichi/lang/types.py
index b5e23fb7802f1..1299e378b43f5 100644
--- a/python/taichi/lang/types.py
+++ b/python/taichi/lang/types.py
@@ -1,5 +1,6 @@
 import numbers
 
+import taichi.lang.matrix
 from taichi.lang.exception import TaichiSyntaxError
 
 
@@ -19,15 +20,12 @@ def field(self, **kwargs):
 
 
 def matrix(m, n, dtype=None):
-    from taichi.lang.matrix import MatrixType
-    return MatrixType(m, n, dtype=dtype)
+    return taichi.lang.matrix.MatrixType(m, n, dtype=dtype)
 
 
 def vector(m, dtype=None):
-    from taichi.lang.matrix import MatrixType
-    return MatrixType(m, 1, dtype=dtype)
+    return taichi.lang.matrix.MatrixType(m, 1, dtype=dtype)
 
 
 def struct(**kwargs):
-    from taichi.lang.struct import StructType
-    return StructType(**kwargs)
+    return taichi.lang.struct.StructType(**kwargs)
diff --git a/python/taichi/main.py b/python/taichi/main.py
index e81dedb676cf8..1b616b5862792 100644
--- a/python/taichi/main.py
+++ b/python/taichi/main.py
@@ -4,12 +4,15 @@
 import random
 import runpy
 import shutil
+import subprocess
 import sys
-import time
+import timeit
 from collections import defaultdict
 from functools import wraps
 from pathlib import Path
 
+import numpy as np
+import pytest
 from colorama import Back, Fore, Style
 from taichi.core import ti_core as _ti_core
 from taichi.tools import video
@@ -19,8 +22,6 @@
 
 def timer(func):
     """Function decorator to benchmark a function runnign time."""
-    import timeit
-
     @wraps(func)
     def wrapper(*args, **kwargs):
         start = timeit.default_timer()
@@ -113,13 +114,11 @@ def _usage(self) -> str:
     def _exec_python_file(filename: str):
         """Execute a Python file based on filename."""
         # TODO: do we really need this?
-        import subprocess
         subprocess.call([sys.executable, filename] + sys.argv[1:])
 
     @staticmethod
     def _get_examples_dir() -> Path:
         """Get the path to the examples directory."""
-        import taichi as ti
 
         root_dir = ti.package_root()
         examples_dir = Path(root_dir) / 'examples'
@@ -499,7 +498,6 @@ def get_dats(dir):
         def plot_in_gui(scatter):
             import numpy as np
 
-            import taichi as ti
             gui = ti.GUI('Regression Test', (640, 480), 0x001122)
             print('[Hint] press SPACE to go for next display')
             for key, data in scatter.items():
@@ -561,12 +559,10 @@ def plot_in_gui(scatter):
 
     @staticmethod
     def _get_benchmark_baseline_dir():
-        import taichi as ti
         return os.path.join(_ti_core.get_repo_dir(), 'benchmarks', 'baseline')
 
     @staticmethod
     def _get_benchmark_output_dir():
-        import taichi as ti
         return os.path.join(_ti_core.get_repo_dir(), 'benchmarks', 'output')
 
     @register
@@ -602,7 +598,6 @@ def baseline(self, arguments: list = sys.argv[2:]):
         # Short circuit for testing
         if self.test_mode: return args
 
-        import shutil
         baseline_dir = TaichiMain._get_benchmark_baseline_dir()
         output_dir = TaichiMain._get_benchmark_output_dir()
         shutil.rmtree(baseline_dir, True)
@@ -612,9 +607,7 @@ def baseline(self, arguments: list = sys.argv[2:]):
     @staticmethod
     def _test_python(args):
         print("\nRunning Python tests...\n")
-        import pytest
 
-        import taichi as ti
         root_dir = ti.package_root()
         test_dir = os.path.join(root_dir, 'tests')
         pytest_args = []
@@ -676,8 +669,6 @@ def _test_python(args):
 
     @staticmethod
     def _test_cpp(args):
-        import taichi as ti
-
         # Cpp tests use the legacy non LLVM backend
         ti.reset()
         print("Running C++ tests...")
@@ -720,7 +711,6 @@ def benchmark(self, arguments: list = sys.argv[2:]):
         # Short circuit for testing
         if self.test_mode: return args
 
-        import shutil
         commit_hash = _ti_core.get_commit_hash()
         with os.popen('git rev-parse HEAD') as f:
             current_commit_hash = f.read().strip()
@@ -981,12 +971,7 @@ def repl(self, arguments: list = sys.argv[2:]):
         args = parser.parse_args(arguments)
 
         def local_scope():
-            import math
-            import time
-
-            import numpy as np
 
-            import taichi as ti
             try:
                 import IPython
                 IPython.embed()
diff --git a/python/taichi/misc/gui.py b/python/taichi/misc/gui.py
index 3a80b627e1b20..b41c49bef30cb 100644
--- a/python/taichi/misc/gui.py
+++ b/python/taichi/misc/gui.py
@@ -14,8 +14,8 @@ class GUI:
         name (str, optional): The name of the GUI to be constructed.
             Default is 'Taichi'.
         res (Union[int, List[int]], optional): The resolution of created
-            GUI. Default is 512*512.
-        background_color (int, optional): The background color of creted GUI.
+            GUI. Default is 512*512. If `res` is a scalar, the width will equal the height.
+        background_color (int, optional): The background color of created GUI.
             Default is 0x000000.
         show_gui (bool, optional): Specify whether to render the GUI. Default is True.
         fullscreen (bool, optional): Specify whether to render the GUI in
diff --git a/python/taichi/misc/util.py b/python/taichi/misc/util.py
index 905acbdf9d9ff..21e81253c3028 100644
--- a/python/taichi/misc/util.py
+++ b/python/taichi/misc/util.py
@@ -1,12 +1,13 @@
 import copy
-import inspect
+import functools
+import subprocess
 import sys
 import traceback
 
 from colorama import Fore, Style
 from taichi.core import ti_core as _ti_core
 
-import taichi
+import taichi as ti
 
 
 def config_from_dict(args):
@@ -133,8 +134,6 @@ def deprecated(old, new, warning_type=DeprecationWarning):
     Returns:
         Decorated fuction with warning message
     """
-    import functools
-
     def decorator(foo):
         @functools.wraps(foo)
         def wrapped(*args, **kwargs):
@@ -225,7 +224,6 @@ def dump_dot(filepath=None, rankdir=None, embed_states_threshold=0):
 
 def dot_to_pdf(dot, filepath):
     assert filepath.endswith('.pdf')
-    import subprocess
     p = subprocess.Popen(['dot', '-Tpdf'],
                          stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE)
@@ -239,7 +237,6 @@ def get_kernel_stats():
 
 
 def print_async_stats(include_kernel_profiler=False):
-    import taichi as ti
     if include_kernel_profiler:
         ti.print_kernel_profile_info()
         print()
diff --git a/python/taichi/testing.py b/python/taichi/testing.py
index 4e83bc17a9572..4cde7627e2780 100644
--- a/python/taichi/testing.py
+++ b/python/taichi/testing.py
@@ -1,6 +1,10 @@
 import copy
+import functools
 import itertools
+import os
+from tempfile import mkstemp
 
+import pytest
 from taichi.core import ti_core as _ti_core
 
 import taichi as ti
@@ -21,8 +25,6 @@ def get_rel_eps():
 
 def approx(expected, **kwargs):
     '''Tweaked pytest.approx for OpenGL low precisions'''
-    import pytest
-
     class boolean_integer:
         def __init__(self, value):
             self.value = value
@@ -48,8 +50,7 @@ def allclose(x, y, **kwargs):
 
 def make_temp_file(*args, **kwargs):
     '''Create a temporary file'''
-    import os
-    from tempfile import mkstemp
+
     fd, name = mkstemp(*args, **kwargs)
     os.close(fd)
     return name
@@ -110,8 +111,6 @@ def test(arch=None, exclude=None, require=None, **options):
         return lambda x: print('No supported arch found. Skipping')
 
     def decorator(foo):
-        import functools
-
         @functools.wraps(foo)
         def wrapped(*args, **kwargs):
             arch_params_sets = [arch, *_test_features.values()]
diff --git a/python/taichi/ui/gui.py b/python/taichi/ui/gui.py
index 9f9812f465445..7cce99fad4d09 100644
--- a/python/taichi/ui/gui.py
+++ b/python/taichi/ui/gui.py
@@ -1,4 +1,5 @@
 import pathlib
+from contextlib import contextmanager
 
 from taichi.core import ti_core as _ti_core
 from taichi.lang.impl import default_cfg
@@ -13,6 +14,30 @@ class Gui:
     def __init__(self, gui) -> None:
         self.gui = gui  #reference to a PyGui
 
+    @contextmanager
+    def sub_window(self, name, x, y, width, height):
+        """Creating a context manager for subwindow
+
+        Note:
+            All args of this method should align with `begin`.
+
+        Args:
+            x (float): The x-coordinate (between 0 and 1) of the top-left corner of the subwindow, relative to the full window.
+            y (float): The y-coordinate (between 0 and 1) of the top-left corner of the subwindow, relative to the full window.
+            width (float): The width of the subwindow relative to the full window.
+            height (float): The height of the subwindow relative to the full window.
+
+        Usage::
+
+            >>> with gui.sub_window(name, x, y, width, height) as g:
+            >>>     g.text("Hello, World!")
+        """
+        self.begin(name, x, y, width, height)
+        try:
+            yield self
+        finally:
+            self.end()
+
     def begin(self, name, x, y, width, height):
         """Creates a subwindow that holds imgui widgets.
 
diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
index 621f514b563b6..909eb7fdeef72 100644
--- a/taichi/backends/metal/kernel_manager.cpp
+++ b/taichi/backends/metal/kernel_manager.cpp
@@ -261,8 +261,6 @@ class SparseRuntimeMtlKernelBase : public CompiledMtlKernelBase {
 class ListgenOpMtlKernel : public SparseRuntimeMtlKernelBase {
  public:
   struct Params : public SparseRuntimeMtlKernelBase::Params {
-    const SNodeDescriptorsMap *snode_descriptors{nullptr};
-
     const SNode *snode() const {
       return kernel_attribs->runtime_list_op_attribs->snode;
     }
@@ -318,7 +316,6 @@ class CompiledTaichiKernel {
     std::string mtl_source_code;
     const TaichiKernelAttributes *ti_kernel_attribs;
     const KernelContextAttributes *ctx_attribs;
-    const SNodeDescriptorsMap *snode_descriptors;
     MTLDevice *device;
     MemoryPool *mem_pool;
     KernelProfilerBase *profiler;
@@ -359,7 +356,6 @@ class CompiledTaichiKernel {
         kparams.device = device;
         kparams.mtl_func = mtl_func.get();
         kparams.mem_pool = params.mem_pool;
-        kparams.snode_descriptors = params.snode_descriptors;
         kernel = std::make_unique<ListgenOpMtlKernel>(kparams);
       } else if (ktype == KernelTaskType::gc) {
         GcOpMtlKernel::Params kparams;
@@ -570,7 +566,6 @@ class KernelManager::Impl {
   explicit Impl(Params params)
       : config_(params.config),
         compiled_runtime_module_(params.compiled_runtime_module),
-        compiled_structs_(params.compiled_structs),
         mem_pool_(params.mem_pool),
         host_result_buffer_(params.host_result_buffer),
         profiler_(params.profiler),
@@ -583,18 +578,6 @@ class KernelManager::Impl {
     command_queue_ = new_command_queue(device_.get());
     TI_ASSERT(command_queue_ != nullptr);
     create_new_command_buffer();
-    if (compiled_structs_.root_size > 0) {
-      root_mem_ = std::make_unique<BufferMemoryView>(
-          compiled_structs_.root_size, mem_pool_);
-      root_buffer_ = new_mtl_buffer_no_copy(device_.get(), root_mem_->ptr(),
-                                            root_mem_->size());
-      TI_ASSERT(root_buffer_ != nullptr);
-      buffer_meta_data_.root_buffer_size = root_mem_->size();
-      TI_DEBUG("Metal root buffer size: {} bytes", root_mem_->size());
-      ActionRecorder::get_instance().record(
-          "allocate_root_buffer",
-          {ActionArg("size_in_bytes", (int64)root_mem_->size())});
-    }
 
     global_tmps_mem_ = std::make_unique<BufferMemoryView>(
         taichi_global_tmp_buffer_size, mem_pool_);
@@ -607,7 +590,6 @@ class KernelManager::Impl {
         device_.get(), global_tmps_mem_->ptr(), global_tmps_mem_->size());
     TI_ASSERT(global_tmps_buffer_ != nullptr);
 
-    TI_ASSERT(compiled_runtime_module_.runtime_size > 0);
     const size_t mem_pool_bytes =
         (config_->device_memory_GB * 1024 * 1024 * 1024ULL);
     runtime_mem_ = std::make_unique<BufferMemoryView>(
@@ -625,7 +607,7 @@ class KernelManager::Impl {
     ActionRecorder::get_instance().record(
         "allocate_runtime_buffer",
         {ActionArg("runtime_buffer_size_in_bytes", (int64)runtime_mem_->size()),
-         ActionArg("runtime_struct_size_in_bytes",
+         ActionArg("runtime_size_in_bytes",
                    (int64)compiled_runtime_module_.runtime_size),
          ActionArg("memory_pool_size", (int64)mem_pool_bytes)});
 
@@ -639,10 +621,29 @@ class KernelManager::Impl {
                                            print_mem_->size());
     TI_ASSERT(print_buffer_ != nullptr);
 
-    init_runtime(params.root_id);
+    init_runtime_buffer(compiled_runtime_module_);
     clear_print_assert_buffer();
   }
 
+  void add_compiled_snode_tree(const CompiledStructs &compiled_tree) {
+    if (compiled_tree.root_size > 0) {  // skip allocation for a zero-sized root
+      root_mem_ = std::make_unique<BufferMemoryView>(compiled_tree.root_size,
+                                                     mem_pool_);
+      root_buffer_ = new_mtl_buffer_no_copy(device_.get(), root_mem_->ptr(),
+                                            root_mem_->size());
+      TI_ASSERT(root_buffer_ != nullptr);
+      buffer_meta_data_.root_buffer_size += root_mem_->size();
+      TI_DEBUG("Metal root={} buffer_size={} bytes", compiled_tree.root_id,
+               root_mem_->size());
+      ActionRecorder::get_instance().record(
+          "allocate_root_buffer",
+          {ActionArg("root_id", (int64)compiled_tree.root_id),
+           ActionArg("size_in_bytes", (int64)root_mem_->size())});
+    }
+    // Fill this tree's SNode entries in the already-allocated runtime buffer.
+    init_snode_tree_sparse_runtime(compiled_tree);
+  }
+
   void register_taichi_kernel(const std::string &taichi_kernel_name,
                               const std::string &mtl_kernel_source_code,
                               const TaichiKernelAttributes &ti_kernel_attribs,
@@ -661,7 +662,6 @@ class KernelManager::Impl {
     params.mtl_source_code = mtl_kernel_source_code;
     params.ti_kernel_attribs = &ti_kernel_attribs;
     params.ctx_attribs = &ctx_attribs;
-    params.snode_descriptors = &compiled_structs_.snode_descriptors;
     params.device = device_.get();
     params.mem_pool = mem_pool_;
     params.profiler = profiler_;
@@ -750,14 +750,69 @@ class KernelManager::Impl {
   }
 
  private:
-  void init_runtime(int root_id) {
+  void init_runtime_buffer(const CompiledRuntimeModule &rtm_module) {
+    char *addr = runtime_mem_->ptr();
+    // init rand_seeds
+    // TODO(k-ye): Provide a way to use a fixed seed in dev mode.
+    std::mt19937 generator(
+        std::chrono::duration_cast<std::chrono::microseconds>(
+            std::chrono::system_clock::now().time_since_epoch())
+            .count());
+    std::uniform_int_distribution<uint32_t> distr(
+        0, std::numeric_limits<uint32_t>::max());
+    for (int i = 0; i < kNumRandSeeds; ++i) {
+      uint32_t *s = reinterpret_cast<uint32_t *>(addr);
+      *s = distr(generator);
+      addr += sizeof(uint32_t);
+    }
+    TI_DEBUG("Initialized random seeds size={}", rtm_module.rand_seeds_size);
+
     using namespace shaders;
-    char *addr = reinterpret_cast<char *>(runtime_mem_->ptr());
+    addr = runtime_mem_->ptr() + rtm_module.rand_seeds_size;
     const char *const addr_begin = addr;
-    const int max_snodes = compiled_structs_.max_snodes;
-    const auto &snode_descriptors = compiled_structs_.snode_descriptors;
-    // init snode_metas
     dev_runtime_mirror_.snode_metas = (SNodeMeta *)addr;
+    size_t addr_offset = sizeof(SNodeMeta) * kMaxNumSNodes;
+    addr += addr_offset;
+    TI_DEBUG("SNodeMeta, size={} accumulated={}", addr_offset,
+             (addr - addr_begin));
+    dev_runtime_mirror_.snode_extractors = (SNodeExtractors *)addr;
+    addr_offset = sizeof(SNodeExtractors) * kMaxNumSNodes;
+    addr += addr_offset;
+    TI_DEBUG("SNodeExtractors, size={} accumulated={}", addr_offset,
+             (addr - addr_begin));
+    dev_runtime_mirror_.snode_lists = (ListManagerData *)addr;
+    addr_offset = sizeof(ListManagerData) * kMaxNumSNodes;
+    addr += addr_offset;
+    TI_DEBUG("ListManagerData, size={} accumulated={}", addr_offset,
+             (addr - addr_begin));
+    dev_runtime_mirror_.snode_allocators = (NodeManagerData *)addr;
+    addr_offset = sizeof(NodeManagerData) * kMaxNumSNodes;
+    addr += addr_offset;
+    TI_DEBUG("NodeManagerData, size={} accumulated={}", addr_offset,
+             (addr - addr_begin));
+    dev_runtime_mirror_.ambient_indices = (NodeManagerData::ElemIndex *)addr;
+    addr_offset = sizeof(NodeManagerData::ElemIndex) * kMaxNumSNodes;
+    addr += addr_offset;
+    TI_DEBUG("SNode ambient elements, size={} accumulated={}", addr_offset,
+             (addr - addr_begin));
+
+    // Initialize the memory allocator
+    dev_mem_alloc_mirror_ = reinterpret_cast<MemoryAllocator *>(addr);
+    // Make sure the returned memory address is always greater than 1.
+    dev_mem_alloc_mirror_->next = shaders::MemoryAllocator::kInitOffset;
+    TI_DEBUG("Memory allocator, begin={} next={}", (addr - addr_begin),
+             dev_mem_alloc_mirror_->next);
+
+    mark_runtime_buffer_modified();
+  }
+
+  void init_snode_tree_sparse_runtime(const CompiledStructs &snode_tree) {
+    using namespace shaders;
+    const int max_snodes = snode_tree.max_snodes;
+    const auto &snode_descriptors = snode_tree.snode_descriptors;
+    char *addr = nullptr;
+    // init snode_metas
+    addr = (char *)dev_runtime_mirror_.snode_metas;
     for (int i = 0; i < max_snodes; ++i) {
       auto iter = snode_descriptors.find(i);
       if (iter == snode_descriptors.end()) {
@@ -798,12 +853,8 @@ class KernelManager::Impl {
           i, snode_type_name(sn_meta.snode->type), rtm_meta->element_stride,
           rtm_meta->num_slots, rtm_meta->mem_offset_in_parent);
     }
-    size_t addr_offset = sizeof(SNodeMeta) * kMaxNumSNodes;
-    addr += addr_offset;
-    TI_DEBUG("Initialized SNodeMeta, size={} accumulated={}", addr_offset,
-             (addr - addr_begin));
     // init snode_extractors
-    dev_runtime_mirror_.snode_extractors = (SNodeExtractors *)addr;
+    addr = (char *)dev_runtime_mirror_.snode_extractors;
     for (int i = 0; i < max_snodes; ++i) {
       auto iter = snode_descriptors.find(i);
       if (iter == snode_descriptors.end()) {
@@ -823,12 +874,8 @@ class KernelManager::Impl {
       }
       TI_DEBUG("");
     }
-    addr_offset = sizeof(SNodeExtractors) * kMaxNumSNodes;
-    addr += addr_offset;
-    TI_DEBUG("Initialized SNodeExtractors, size={} accumulated={}", addr_offset,
-             (addr - addr_begin));
     // init snode_lists
-    dev_runtime_mirror_.snode_lists = (ListManagerData *)addr;
+    addr = (char *)dev_runtime_mirror_.snode_lists;
     ListManagerData *const rtm_list_begin =
         reinterpret_cast<ListManagerData *>(addr);
     for (int i = 0; i < max_snodes; ++i) {
@@ -847,12 +894,8 @@ class KernelManager::Impl {
       TI_DEBUG("ListManagerData\n  id={}\n  num_elems_per_chunk={}\n", i,
                num_elems_per_chunk);
     }
-    addr_offset = sizeof(ListManagerData) * kMaxNumSNodes;
-    addr += addr_offset;
-    TI_DEBUG("Initialized ListManagerData, size={} accumulated={}", addr_offset,
-             (addr - addr_begin));
     // init snode_allocators
-    dev_runtime_mirror_.snode_allocators = (NodeManagerData *)addr;
+    addr = (char *)dev_runtime_mirror_.snode_allocators;
     auto init_node_mgr = [&snode_descriptors](const SNodeDescriptor &sn_desc,
                                               NodeManagerData *nm_data) {
       nm_data->data_list.element_stride = sn_desc.element_stride;
@@ -896,51 +939,6 @@ class KernelManager::Impl {
       init_node_mgr(sn_desc, nm_data);
       snode_id_to_nodemgrs.push_back(std::make_pair(i, nm_data));
     }
-    addr_offset = sizeof(NodeManagerData) * kMaxNumSNodes;
-    addr += addr_offset;
-    TI_DEBUG("Initialized NodeManagerData, size={} accumulated={}", addr_offset,
-             (addr - addr_begin));
-    // ambient_indices initialization has to be delayed, because it relies on
-    // the initialization of MemoryAllocator.
-    auto *const ambient_indices_begin =
-        reinterpret_cast<NodeManagerData::ElemIndex *>(addr);
-    dev_runtime_mirror_.ambient_indices = ambient_indices_begin;
-    addr_offset = sizeof(NodeManagerData::ElemIndex) * kMaxNumSNodes;
-    addr += addr_offset;
-    TI_DEBUG(
-        "Delayed the initialization of SNode ambient elements, size={} "
-        "accumulated={}",
-        addr_offset, (addr - addr_begin));
-    // init rand_seeds
-    // TODO(k-ye): Provide a way to use a fixed seed in dev mode.
-    std::mt19937 generator(
-        std::chrono::duration_cast<std::chrono::microseconds>(
-            std::chrono::system_clock::now().time_since_epoch())
-            .count());
-    const auto rand_seeds_begin = (addr - addr_begin);
-    buffer_meta_data_.randseedoffset_in_runtime_buffer = rand_seeds_begin;
-    std::uniform_int_distribution<uint32_t> distr(
-        0, std::numeric_limits<uint32_t>::max());
-    for (int i = 0; i < kNumRandSeeds; ++i) {
-      uint32_t *s = reinterpret_cast<uint32_t *>(addr);
-      *s = distr(generator);
-      addr += sizeof(uint32_t);
-    }
-    TI_DEBUG("Initialized random seeds, begin={} size={} accumuated={}",
-             rand_seeds_begin, kNumRandSeeds * sizeof(uint32_t),
-             (addr - addr_begin));
-    ActionRecorder::get_instance().record(
-        "initialize_runtime_buffer",
-        {
-            ActionArg("rand_seeds_begin", (int64)rand_seeds_begin),
-        });
-
-    // Initialize the memory allocator
-    auto *mem_alloc = reinterpret_cast<MemoryAllocator *>(addr);
-    // Make sure the retured memory address is always greater than 1.
-    mem_alloc->next = shaders::MemoryAllocator::kInitOffset;
-    TI_DEBUG("Initialized memory allocator, begin={} next={}",
-             (addr - addr_begin), mem_alloc->next);
 
     // Root list is static, so it can be initialized here once.
     ListgenElement root_elem;
@@ -949,20 +947,25 @@ class KernelManager::Impl {
       root_elem.coords.at[i] = 0;
     }
     ListManager root_lm;
-    root_lm.lm_data = rtm_list_begin + root_id;
-    root_lm.mem_alloc = mem_alloc;
+    root_lm.lm_data = rtm_list_begin + snode_tree.root_id;
+    root_lm.mem_alloc = dev_mem_alloc_mirror_;
     root_lm.append(root_elem);
 
     // Initialize all the ambient elements
+    auto *const ambient_indices_begin = dev_runtime_mirror_.ambient_indices;
     for (const auto &p : snode_id_to_nodemgrs) {
       NodeManager nm;
       nm.nm_data = p.second;
-      nm.mem_alloc = mem_alloc;
+      nm.mem_alloc = dev_mem_alloc_mirror_;
       const auto snode_id = p.first;
       ambient_indices_begin[snode_id] = nm.allocate();
       TI_DEBUG("AmbientIndex\n  id={}\n  mem_alloc->next={}\n", snode_id,
-               mem_alloc->next);
+               dev_mem_alloc_mirror_->next);
     }
+    mark_runtime_buffer_modified();
+  }
+
+  void mark_runtime_buffer_modified() {
     did_modify_range(runtime_buffer_.get(), /*location=*/0,
                      runtime_mem_->size());
   }
@@ -997,35 +1000,6 @@ class KernelManager::Impl {
     // print_runtime_debug();
   }
 
-  void print_runtime_debug() {
-    // If debugging is necessary, make sure this is called after
-    // blit_buffers_and_sync().
-    int *root_base = reinterpret_cast<int *>(root_mem_->ptr());
-    for (int i = 0; i < 10; ++i) {
-      TI_INFO("root[{}]={}", i, root_base[i]);
-    }
-
-    const auto &sn_descs = compiled_structs_.snode_descriptors;
-    for (int i = 0; i < compiled_structs_.max_snodes; ++i) {
-      auto iter = sn_descs.find(i);
-      if (iter == sn_descs.end()) {
-        continue;
-      }
-      // const SNodeDescriptor &sn_desc = iter->second;
-      shaders::ListManager lm;
-      lm.lm_data = (dev_runtime_mirror_.snode_lists + i);
-      lm.mem_alloc = dev_mem_alloc_mirror_;
-
-      shaders::NodeManagerData *nma =
-          (dev_runtime_mirror_.snode_allocators + i);
-      TI_INFO(
-          "ListManager for SNode={} num_active={} num_allocated={} "
-          "free_list_used={}",
-          i, lm.num_active(), nma->data_list.next, nma->free_list_used);
-    }
-    TI_INFO("");
-  }
-
   void check_assertion_failure() {
     // TODO: Copy this to program's result_buffer, and let the Taichi runtime
     // handle the assertion failures uniformly.
@@ -1138,19 +1112,20 @@ class KernelManager::Impl {
   MemoryPool *const mem_pool_;
   uint64_t *const host_result_buffer_;
   KernelProfilerBase *const profiler_;
-  nsobj_unique_ptr<MTLDevice> device_;
-  nsobj_unique_ptr<MTLCommandQueue> command_queue_;
-  nsobj_unique_ptr<MTLCommandBuffer> cur_command_buffer_;
-  std::size_t command_buffer_id_;
+  nsobj_unique_ptr<MTLDevice> device_{nullptr};
+  nsobj_unique_ptr<MTLCommandQueue> command_queue_{nullptr};
+  nsobj_unique_ptr<MTLCommandBuffer> cur_command_buffer_{nullptr};
+  std::size_t command_buffer_id_{0};
   std::unique_ptr<BufferMemoryView> root_mem_;
   nsobj_unique_ptr<MTLBuffer> root_buffer_;
-  std::unique_ptr<BufferMemoryView> global_tmps_mem_;
-  nsobj_unique_ptr<MTLBuffer> global_tmps_buffer_;
-  std::unique_ptr<BufferMemoryView> runtime_mem_;
-  nsobj_unique_ptr<MTLBuffer> runtime_buffer_;
+  std::unique_ptr<BufferMemoryView> global_tmps_mem_{nullptr};
+  nsobj_unique_ptr<MTLBuffer> global_tmps_buffer_{nullptr};
+  std::unique_ptr<BufferMemoryView> runtime_mem_{nullptr};
+  nsobj_unique_ptr<MTLBuffer> runtime_buffer_{nullptr};
+  int last_snode_id_used_in_runtime_{-1};
   // TODO: Rename these to 'print_assert_{mem|buffer}_'
-  std::unique_ptr<BufferMemoryView> print_mem_;
-  nsobj_unique_ptr<MTLBuffer> print_buffer_;
+  std::unique_ptr<BufferMemoryView> print_mem_{nullptr};
+  nsobj_unique_ptr<MTLBuffer> print_buffer_{nullptr};
   std::unordered_map<std::string, std::unique_ptr<CompiledTaichiKernel>>
       compiled_taichi_kernels_;
   PrintStringTable print_strtable_;
@@ -1176,6 +1151,10 @@ class KernelManager::Impl {
     TI_ERROR("Metal not supported on the current OS");
   }
 
+  void add_compiled_snode_tree(const CompiledStructs &) {
+    TI_ERROR("Metal not supported on the current OS");
+  }
+
   void register_taichi_kernel(const std::string &taichi_kernel_name,
                               const std::string &mtl_kernel_source_code,
                               const TaichiKernelAttributes &ti_kernel_attribs,
@@ -1217,6 +1196,7 @@ KernelManager::~KernelManager() {
 }
 
 void KernelManager::add_compiled_snode_tree(const CompiledStructs &snode_tree) {
+  impl_->add_compiled_snode_tree(snode_tree);
 }
 
 void KernelManager::register_taichi_kernel(
diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h
index 9098685bf9731..75f82bb8e7288 100644
--- a/taichi/backends/metal/kernel_manager.h
+++ b/taichi/backends/metal/kernel_manager.h
@@ -27,12 +27,10 @@ class KernelManager {
  public:
   struct Params {
     CompiledRuntimeModule compiled_runtime_module;
-    CompiledStructs compiled_structs;
     CompileConfig *config;
-    MemoryPool *mem_pool;
     uint64_t *host_result_buffer;
+    MemoryPool *mem_pool;
     KernelProfilerBase *profiler;
-    int root_id;
   };
 
   explicit KernelManager(Params params);
diff --git a/taichi/backends/metal/metal_program.cpp b/taichi/backends/metal/metal_program.cpp
index 98185879f9bf4..58f519f9bdde6 100644
--- a/taichi/backends/metal/metal_program.cpp
+++ b/taichi/backends/metal/metal_program.cpp
@@ -29,11 +29,18 @@ void MetalProgramImpl::materialize_runtime(MemoryPool *memory_pool,
                                            KernelProfilerBase *profiler,
                                            uint64 **result_buffer_ptr) {
   TI_ASSERT(*result_buffer_ptr == nullptr);
+  TI_ASSERT(metal_kernel_mgr_ == nullptr);
   *result_buffer_ptr = (uint64 *)memory_pool->allocate(
       sizeof(uint64) * taichi_result_buffer_entries, 8);
-  params_.mem_pool = memory_pool;
-  params_.profiler = profiler;
   compiled_runtime_module_ = metal::compile_runtime_module();
+
+  metal::KernelManager::Params params;
+  params.compiled_runtime_module = compiled_runtime_module_.value();
+  params.config = config;
+  params.host_result_buffer = *result_buffer_ptr;
+  params.mem_pool = memory_pool;
+  params.profiler = profiler;
+  metal_kernel_mgr_ = std::make_unique<metal::KernelManager>(std::move(params));
 }
 
 void MetalProgramImpl::materialize_snode_tree(
@@ -47,15 +54,7 @@ void MetalProgramImpl::materialize_snode_tree(
   auto *const root = tree->root();
 
   metal_compiled_structs_ = metal::compile_structs(*root);
-  if (metal_kernel_mgr_ == nullptr) {
-    params_.compiled_structs = metal_compiled_structs_.value();
-    params_.compiled_runtime_module = compiled_runtime_module_.value();
-    params_.config = config;
-    params_.host_result_buffer = result_buffer;
-    params_.root_id = root->id;
-    metal_kernel_mgr_ =
-        std::make_unique<metal::KernelManager>(std::move(params_));
-  }
+  metal_kernel_mgr_->add_compiled_snode_tree(metal_compiled_structs_.value());
 }
 
 std::unique_ptr<AotModuleBuilder> MetalProgramImpl::make_aot_module_builder() {
diff --git a/taichi/backends/metal/metal_program.h b/taichi/backends/metal/metal_program.h
index a3e616dd851fb..9fa9d18ff1569 100644
--- a/taichi/backends/metal/metal_program.h
+++ b/taichi/backends/metal/metal_program.h
@@ -1,4 +1,7 @@
 #pragma once
+
+#include <vector>
+
 #include "taichi/backends/metal/kernel_manager.h"
 #include "taichi/backends/metal/struct_metal.h"
 #include "taichi/system/memory_pool.h"
@@ -47,8 +50,8 @@ class MetalProgramImpl : public ProgramImpl {
   std::optional<metal::CompiledRuntimeModule> compiled_runtime_module_{
       std::nullopt};
   std::optional<metal::CompiledStructs> metal_compiled_structs_{std::nullopt};
+  std::vector<metal::CompiledStructs> compiled_snode_trees_;
   std::unique_ptr<metal::KernelManager> metal_kernel_mgr_{nullptr};
-  metal::KernelManager::Params params_;
 };
 
 }  // namespace lang
diff --git a/taichi/backends/metal/shaders/runtime_utils.metal.h b/taichi/backends/metal/shaders/runtime_utils.metal.h
index b15c2fb0c033f..ae29a1507366d 100644
--- a/taichi/backends/metal/shaders/runtime_utils.metal.h
+++ b/taichi/backends/metal/shaders/runtime_utils.metal.h
@@ -27,12 +27,12 @@
 // The actual Runtime struct has to be emitted by codegen, because it depends
 // on the number of SNodes.
 struct Runtime {
+  uint32_t *rand_seeds = nullptr;
   SNodeMeta *snode_metas = nullptr;
   SNodeExtractors *snode_extractors = nullptr;
   ListManagerData *snode_lists = nullptr;
   NodeManagerData *snode_allocators = nullptr;
   NodeManagerData::ElemIndex *ambient_indices = nullptr;
-  uint32_t *rand_seeds = nullptr;
 };
 
 #define METAL_BEGIN_RUNTIME_UTILS_DEF
diff --git a/taichi/backends/metal/struct_metal.cpp b/taichi/backends/metal/struct_metal.cpp
index 5ecae03b6d7b0..144153fd536cc 100644
--- a/taichi/backends/metal/struct_metal.cpp
+++ b/taichi/backends/metal/struct_metal.cpp
@@ -364,12 +364,12 @@ class RuntimeModuleCompiler {
     line_appender_.append_raw(shaders::kMetalRuntimeStructsSourceCode);
     emit("");
     emit("struct Runtime {{");
+    emit("  uint32_t rand_seeds[{}];", kNumRandSeeds);
     emit("  SNodeMeta snode_metas[{}];", kMaxNumSNodes);
     emit("  SNodeExtractors snode_extractors[{}];", kMaxNumSNodes);
     emit("  ListManagerData snode_lists[{}];", kMaxNumSNodes);
     emit("  NodeManagerData snode_allocators[{}];", kMaxNumSNodes);
     emit("  NodeManagerData::ElemIndex ambient_indices[{}];", kMaxNumSNodes);
-    emit("  uint32_t rand_seeds[{}];", kNumRandSeeds);
     emit("}};");
     emit("");
     line_appender_.append_raw(shaders::kMetalRuntimeUtilsSourceCode);
diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h
index 3451842b240aa..0ae9183586d1a 100644
--- a/taichi/program/program_impl.h
+++ b/taichi/program/program_impl.h
@@ -9,6 +9,7 @@
 
 namespace taichi {
 namespace lang {
+
 class ProgramImpl {
  public:
   // TODO: Make it safer, we exposed it for now as it's directly accessed
diff --git a/tests/python/test_assign.py b/tests/python/test_assign.py
new file mode 100644
index 0000000000000..d0297c1f63e64
--- /dev/null
+++ b/tests/python/test_assign.py
@@ -0,0 +1,59 @@
+import pytest
+
+import taichi as ti
+
+
+@ti.test(debug=True)
+def test_assign_basic():
+    @ti.kernel
+    def func_basic():
+        a = 1
+        assert a == 1
+
+    func_basic()
+
+
+@ti.test(debug=True)
+def test_assign_unpack():
+    @ti.kernel
+    def func_unpack():
+        (a, b) = (1, 2)
+        assert a == 1
+        assert b == 2
+
+    func_unpack()
+
+
+@ti.test(debug=True)
+def test_assign_chained():
+    @ti.kernel
+    def func_chained():
+        a = b = 1
+        assert a == 1
+        assert b == 1
+
+    func_chained()
+
+
+@ti.test(debug=True)
+def test_assign_chained_unpack():
+    @ti.kernel
+    def func_chained_unpack():
+        (a, b) = (c, d) = (1, 2)
+        assert a == 1
+        assert b == 2
+        assert c == 1
+        assert d == 2
+
+    func_chained_unpack()
+
+
+@ti.test(debug=True)
+def test_assign_assign():
+    @ti.kernel
+    def func_assign():
+        a = 0
+        a = 1
+        assert a == 1
+
+    func_assign()
diff --git a/tests/python/test_oop.py b/tests/python/test_oop.py
index 6cab7ebad8e8b..1ce87defd5a03 100644
--- a/tests/python/test_oop.py
+++ b/tests/python/test_oop.py
@@ -222,3 +222,22 @@ def hook(x: ti.template()):
     for i in range(32):
         for j in range(32):
             assert (solver.val[i, j] == 1.0)
+
+
+@ti.test()
+def test_oop_with_portery_decorator():
+    @ti.data_oriented
+    class TestPortery:
+        @property
+        @ti.kernel
+        def kernel_property(self) -> ti.i32:
+            return 42
+
+        @property
+        def raw_proterty(self):
+            return 3
+
+    a = TestPortery()
+    assert a.kernel_property == 42
+
+    assert a.raw_proterty == 3