diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b8b9cc98fbcb9..9d3e90d78fb7d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -48,10 +48,10 @@ jobs: run: | if [ -z "${{ github.event.action }}" ]; then # For nightly release, we only run on python 3.8 - echo '::set-output name=matrix::{"include":[{"name":"taichi-nightly","python":"3.8","conda_python":"py38"}]}"' + echo '::set-output name=matrix::{"include":[{"name":"taichi-nightly","python":"3.8","conda_python":"py38"},{"name":"taichi-nightly","python":"3.10","conda_python":"py310"}]}"' # M1 only supports py38 and py39(conda), so change matrix. - echo '::set-output name=matrix_osx::{"include":[{"name":"taichi-nightly","python":"3.8"}]}"' + echo '::set-output name=matrix_osx::{"include":[{"name":"taichi-nightly","python":"3.8"},{"name":"taichi-nightly","python":"3.10"}]}"' else # For production release, we run on four python versions. echo '::set-output name=matrix::{"include":[{"name":"taichi","python":"3.6","conda_python":"py36"},{"name":"taichi","python":"3.7","conda_python":"py37"},{"name":"taichi","python":"3.8","conda_python":"py38"},{"name":"taichi","python":"3.9","conda_python":"py39"}]}"' diff --git a/docs/lang/articles/advanced/sparse_matrix.md b/docs/lang/articles/advanced/sparse_matrix.md index 72c449ad423a2..5e69c1d7a457b 100644 --- a/docs/lang/articles/advanced/sparse_matrix.md +++ b/docs/lang/articles/advanced/sparse_matrix.md @@ -22,7 +22,7 @@ n = 4 K = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) @ti.kernel -def fill(A: ti.linalg.sparse_matrix_builder()): +def fill(A: ti.types.sparse_matrix_builder()): for i in range(n): A[i, i] += 1 # Only += and -= operators are supported for now. @@ -146,7 +146,7 @@ K = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) b = ti.field(ti.f32, shape=n) @ti.kernel -def fill(A: ti.linalg.sparse_matrix_builder(), b: ti.template(), interval: ti.i32): +def fill(A: ti.types.sparse_matrix_builder(), b: ti.template(), interval: ti.i32): for i in range(n): A[i, i] += 2.0 diff --git a/misc/spMv_linear_solve.py b/misc/spMv_linear_solve.py index 87bea0728673e..721648f45bb8c 100644 --- a/misc/spMv_linear_solve.py +++ b/misc/spMv_linear_solve.py @@ -9,7 +9,7 @@ @ti.kernel -def fill(A: ti.linalg.sparse_matrix_builder(), b: ti.template(), +def fill(A: ti.types.sparse_matrix_builder(), b: ti.template(), interval: ti.i32): for i in range(n): A[i, i] += 2.0 diff --git a/misc/sparse_matrix.py b/misc/sparse_matrix.py index 49cac7af58e87..a56054f7aedcd 100644 --- a/misc/sparse_matrix.py +++ b/misc/sparse_matrix.py @@ -9,8 +9,8 @@ @ti.kernel -def fill(A: ti.linalg.sparse_matrix_builder(), - b: ti.linalg.sparse_matrix_builder(), interval: ti.i32): +def fill(A: ti.types.sparse_matrix_builder(), + b: ti.types.sparse_matrix_builder(), interval: ti.i32): for i in range(n): if i > 0: A[i - 1, i] += -1.0 diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py index 94f94c411951a..7d834259bcba4 100644 --- a/python/taichi/__init__.py +++ b/python/taichi/__init__.py @@ -9,7 +9,7 @@ # Provide a shortcut to types since they're commonly used. 
from taichi.types.primitive_types import * -from taichi import ad, linalg, tools +from taichi import ad, experimental, linalg, tools from taichi.ui import GUI, hex_to_rgb, rgb_to_hex, ui # Issue#2223: Do not reorder, or we're busted with partially initialized module diff --git a/python/taichi/_version_check.py b/python/taichi/_version_check.py index 1718f70b5aa18..5958aecab2455 100644 --- a/python/taichi/_version_check.py +++ b/python/taichi/_version_check.py @@ -1,11 +1,15 @@ +import datetime import json import os import platform import threading +import uuid from urllib import request +from taichi._lib import core as _ti_core -def check_version(): + +def check_version(cur_uuid): # Check Taichi version for the user. major = _ti_core.get_version_major() minor = _ti_core.get_version_minor() @@ -35,7 +39,10 @@ def check_version(): payload['python'] = 'cp38' elif python_version.startswith('3.9.'): payload['python'] = 'cp39' + elif python_version.startswith('3.10.'): + payload['python'] = 'cp310' + payload['uuid'] = cur_uuid # We do not want request exceptions break users' usage of Taichi. try: payload = json.dumps(payload) @@ -50,39 +57,40 @@ def check_version(): return None +def write_version_info(response, cur_uuid, version_info_path, cur_date): + if response is None: + return + with open(version_info_path, 'w') as f: + f.write((cur_date).strftime('%Y-%m-%d')) + f.write('\n') + if response['status'] == 1: + f.write(response['latest_version']) + else: + f.write('0.0.0') + f.write('\n') + f.write(cur_uuid) + f.write('\n') + + def try_check_version(): try: os.makedirs(_ti_core.get_repo_dir(), exist_ok=True) - timestamp_path = os.path.join(_ti_core.get_repo_dir(), 'timestamp') + version_info_path = os.path.join(_ti_core.get_repo_dir(), + 'version_info') cur_date = datetime.date.today() - if os.path.exists(timestamp_path): - last_time = '' - with open(timestamp_path, 'r') as f: - last_time = f.readlines()[0].rstrip() + if os.path.exists(version_info_path): + with open(version_info_path, 'r') as f: + version_info_file = f.readlines() + last_time = version_info_file[0].rstrip() + cur_uuid = version_info_file[2].rstrip() if cur_date.strftime('%Y-%m-%d') > last_time: - response = check_version() - if response is None: - return - with open(timestamp_path, 'w') as f: - f.write((cur_date + - datetime.timedelta(days=7)).strftime('%Y-%m-%d')) - f.write('\n') - if response['status'] == 1: - f.write(response['latest_version']) - else: - f.write('0.0.0') + response = check_version(cur_uuid) + write_version_info(response, cur_uuid, version_info_path, + cur_date) else: - response = check_version() - if response is None: - return - with open(timestamp_path, 'w') as f: - f.write((cur_date + - datetime.timedelta(days=7)).strftime('%Y-%m-%d')) - f.write('\n') - if response['status'] == 1: - f.write(response['latest_version']) - else: - f.write('0.0.0') + cur_uuid = str(uuid.uuid4()) + response = check_version(cur_uuid) + write_version_info(response, cur_uuid, version_info_path, cur_date) # Wildcard exception to catch potential file writing errors. 
except: pass diff --git a/python/taichi/examples/simulation/implicit_mass_spring.py b/python/taichi/examples/simulation/implicit_mass_spring.py index 38deb2daa2f0c..698d73df096de 100644 --- a/python/taichi/examples/simulation/implicit_mass_spring.py +++ b/python/taichi/examples/simulation/implicit_mass_spring.py @@ -85,7 +85,7 @@ def init_edges(self): rest_len[idx] = (pos[idx1] - pos[idx2]).norm() @ti.kernel - def init_mass_sp(self, M: ti.linalg.sparse_matrix_builder()): + def init_mass_sp(self, M: ti.types.sparse_matrix_builder()): for i in range(self.NV): mass = self.mass[i] M[2 * i + 0, 2 * i + 0] += mass @@ -137,7 +137,7 @@ def compute_Jacobians(self): self.Jf[1] = ti.Matrix([[-self.kf, 0], [0, -self.kf]]) @ti.kernel - def assemble_K(self, K: ti.linalg.sparse_matrix_builder()): + def assemble_K(self, K: ti.types.sparse_matrix_builder()): for i in self.spring: idx1, idx2 = self.spring[i][0], self.spring[i][1] for m, n in ti.static(ti.ndrange(2, 2)): @@ -150,7 +150,7 @@ def assemble_K(self, K: ti.linalg.sparse_matrix_builder()): K[2 * (self.NV - 1) + m, 2 * (self.NV - 1) + n] += self.Jf[1][m, n] @ti.kernel - def assemble_D(self, D: ti.linalg.sparse_matrix_builder()): + def assemble_D(self, D: ti.types.sparse_matrix_builder()): for i in self.spring: idx1, idx2 = self.spring[i][0], self.spring[i][1] for m, n in ti.static(ti.ndrange(2, 2)): diff --git a/python/taichi/examples/simulation/stable_fluid.py b/python/taichi/examples/simulation/stable_fluid.py index d0a8f293bd6ad..864fd4204799a 100644 --- a/python/taichi/examples/simulation/stable_fluid.py +++ b/python/taichi/examples/simulation/stable_fluid.py @@ -69,7 +69,7 @@ def swap(self): if use_sparse_matrix: # use a sparse matrix to solve Poisson's pressure equation. @ti.kernel - def fill_laplacian_matrix(A: ti.linalg.sparse_matrix_builder()): + def fill_laplacian_matrix(A: ti.types.sparse_matrix_builder()): for i, j in ti.ndrange(res, res): row = i * res + j center = 0.0 diff --git a/python/taichi/experimental.py b/python/taichi/experimental.py new file mode 100644 index 0000000000000..13df4c4a9baf3 --- /dev/null +++ b/python/taichi/experimental.py @@ -0,0 +1,3 @@ +from taichi.lang.kernel_impl import real_func + +__all__ = ["real_func"] diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 58af5053fc5eb..002cc2de76780 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -456,8 +456,10 @@ def transform_as_kernel(): ctx.create_variable(arg.arg, ctx.global_vars[arg.arg]) elif isinstance(ctx.func.argument_annotations[i], annotations.sparse_matrix_builder): - ctx.create_variable(arg.arg, - kernel_arguments.decl_sparse_matrix()) + ctx.create_variable( + arg.arg, + kernel_arguments.decl_sparse_matrix( + to_taichi_type(ctx.arg_features[i]))) elif isinstance(ctx.func.argument_annotations[i], annotations.any_arr): ctx.create_variable( @@ -482,7 +484,7 @@ def transform_as_kernel(): transform_as_kernel() else: # ti.func - if impl.get_runtime().experimental_real_function: + if ctx.is_real_function: transform_as_kernel() else: len_args = len(args.args) @@ -524,12 +526,12 @@ def transform_as_kernel(): @staticmethod def build_Return(ctx, node): - if not impl.get_runtime().experimental_real_function: + if not ctx.is_real_function: if ctx.is_in_non_static_control_flow(): raise TaichiSyntaxError( "Return inside non-static if/for is not supported") build_stmt(ctx, node.value) - if ctx.is_kernel or impl.get_runtime().experimental_real_function: + if 
ctx.is_kernel or ctx.is_real_function: # TODO: check if it's at the end of a kernel, throw TaichiSyntaxError if not if node.value is not None: if ctx.func.return_type is None: @@ -557,7 +559,7 @@ def build_Return(ctx, node): # only need to replace the object part, i.e. args[0].value else: ctx.return_data = node.value.ptr - if not impl.get_runtime().experimental_real_function: + if not ctx.is_real_function: ctx.returned = True return None @@ -1109,13 +1111,11 @@ def build_If(ctx, node): @staticmethod def build_Expr(ctx, node): build_stmt(ctx, node.value) - if not isinstance( - node.value, - ast.Call) or not impl.get_runtime().experimental_real_function: + if not isinstance(node.value, ast.Call): return None is_taichi_function = getattr(node.value.func.ptr, '_is_taichi_function', False) - if is_taichi_function: + if is_taichi_function and node.value.func.ptr._is_real_function: func_call_result = node.value.ptr ctx.ast_builder.insert_expr_stmt(func_call_result.ptr) return None diff --git a/python/taichi/lang/ast/ast_transformer_utils.py b/python/taichi/lang/ast/ast_transformer_utils.py index bf39aa5874768..900351cc75555 100644 --- a/python/taichi/lang/ast/ast_transformer_utils.py +++ b/python/taichi/lang/ast/ast_transformer_utils.py @@ -114,7 +114,8 @@ def __init__(self, file=None, src=None, start_lineno=None, - ast_builder=None): + ast_builder=None, + is_real_function=False): self.func = func self.local_scopes = [] self.loop_scopes = [] @@ -140,6 +141,7 @@ def __init__(self, self.returned = False self.ast_builder = ast_builder self.visited_funcdef = False + self.is_real_function = is_real_function # e.g.: FunctionDef, Module, Global def variable_scope_guard(self): diff --git a/python/taichi/lang/expr.py b/python/taichi/lang/expr.py index 8c7f771666a99..05582cd6be9cb 100644 --- a/python/taichi/lang/expr.py +++ b/python/taichi/lang/expr.py @@ -3,7 +3,8 @@ from taichi.lang import impl from taichi.lang.common_ops import TaichiOperations from taichi.lang.exception import TaichiTypeError -from taichi.lang.util import is_taichi_class +from taichi.lang.util import is_taichi_class, to_numpy_type, to_taichi_type +from taichi.types.primitive_types import integer_types, real_types # Scalar, basic data type @@ -30,7 +31,7 @@ def __init__(self, *args, tb=None, dtype=None): "Only 0-dimensional numpy array can be used to initialize a scalar expression" ) arg = arg.dtype.type(arg) - self.ptr = impl.make_constant_expr(arg, dtype).ptr + self.ptr = make_constant_expr(arg, dtype).ptr else: assert False if self.tb: @@ -47,6 +48,50 @@ def __repr__(self): return '' +def _check_in_range(npty, val): + iif = np.iinfo(npty) + if not iif.min <= val <= iif.max: + # This isn't the case we want to deal with: |val| does't fall into the valid range of either + # the signed or the unsigned type. + raise TaichiTypeError( + f'Constant {val} has exceeded the range of {to_taichi_type(npty)}: [{iif.min}, {iif.max}]' + ) + + +def _clamp_unsigned_to_range(npty, val): + # npty: np.int32 or np.int64 + iif = np.iinfo(npty) + if iif.min <= val <= iif.max: + return val + cap = (1 << iif.bits) + assert 0 <= val < cap + new_val = val - cap + return new_val + + +def make_constant_expr(val, dtype): + if isinstance(val, (int, np.integer)): + constant_dtype = impl.get_runtime( + ).default_ip if dtype is None else dtype + if constant_dtype not in integer_types: + raise TaichiTypeError( + 'Integer literals must be annotated with a integer type. For type casting, use `ti.cast`.' 
+ ) + _check_in_range(to_numpy_type(constant_dtype), val) + return Expr( + _ti_core.make_const_expr_int( + constant_dtype, _clamp_unsigned_to_range(np.int64, val))) + if isinstance(val, (float, np.floating)): + constant_dtype = impl.get_runtime( + ).default_fp if dtype is None else dtype + if constant_dtype not in real_types: + raise TaichiTypeError( + 'Floating-point literals must be annotated with a floating-point type. For type casting, use `ti.cast`.' + ) + return Expr(_ti_core.make_const_expr_fp(constant_dtype, val)) + raise TaichiTypeError(f'Invalid constant scalar data type: {type(val)}') + + def make_var_list(size): exprs = [] for _ in range(size): diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 426c973809f88..742cccf31792c 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -4,12 +4,11 @@ import numpy as np from taichi._lib import core as _ti_core -from taichi._logging import error from taichi._snode.fields_builder import FieldsBuilder from taichi.lang._ndarray import ScalarNdarray from taichi.lang._ndrange import GroupedNDRange, _Ndrange from taichi.lang.any_array import AnyArray, AnyArrayAccess -from taichi.lang.exception import TaichiRuntimeError, TaichiTypeError +from taichi.lang.exception import TaichiRuntimeError from taichi.lang.expr import Expr, make_expr_group from taichi.lang.field import Field, ScalarField from taichi.lang.kernel_arguments import SparseMatrixProxy @@ -23,8 +22,7 @@ from taichi.lang.struct import Struct, StructField, _IntermediateStruct from taichi.lang.tape import TapeImpl from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class, - python_scope, taichi_scope, to_numpy_type, - to_taichi_type, warning) + python_scope, taichi_scope, warning) from taichi.types.primitive_types import f16, f32, f64, i32, i64 @@ -117,12 +115,6 @@ def begin_frontend_if(ast_builder, cond): ast_builder.begin_frontend_if(Expr(cond).ptr) -def wrap_scalar(x): - if type(x) in [int, float]: - return Expr(x) - return x - - @taichi_scope def subscript(value, *_indices, skip_reordered=False): if isinstance(value, np.ndarray): @@ -247,7 +239,6 @@ def __init__(self, kernels=None): self.current_kernel = None self.global_vars = [] self.matrix_fields = [] - self.experimental_real_function = False self.default_fp = f32 self.default_ip = i32 self.target_tape = None @@ -364,47 +355,6 @@ def get_runtime(): return pytaichi -def _check_in_range(npty, val): - iif = np.iinfo(npty) - if not iif.min <= val <= iif.max: - # This isn't the case we want to deal with: |val| does't fall into the valid range of either - # the signed or the unsigned type. 
- error( - f'Constant {val} has exceeded the range of {to_taichi_type(npty)}: [{iif.min}, {iif.max}]' - ) - - -def _clamp_unsigned_to_range(npty, val): - # npty: np.int32 or np.int64 - iif = np.iinfo(npty) - if iif.min <= val <= iif.max: - return val - cap = (1 << iif.bits) - assert 0 <= val < cap - new_val = val - cap - return new_val - - -@taichi_scope -def make_constant_expr_i32(val): - assert isinstance(val, (int, np.integer)) - return Expr(_ti_core.make_const_expr_int(i32, val)) - - -@taichi_scope -def make_constant_expr(val, dtype): - if isinstance(val, (int, np.integer)): - constant_dtype = pytaichi.default_ip if dtype is None else dtype - _check_in_range(to_numpy_type(constant_dtype), val) - return Expr( - _ti_core.make_const_expr_int( - constant_dtype, _clamp_unsigned_to_range(np.int64, val))) - if isinstance(val, (float, np.floating)): - constant_dtype = pytaichi.default_fp if dtype is None else dtype - return Expr(_ti_core.make_const_expr_fp(constant_dtype, val)) - raise TaichiTypeError(f'Invalid constant scalar data type: {type(val)}') - - def reset(): global pytaichi old_kernels = pytaichi.kernels diff --git a/python/taichi/lang/kernel_arguments.py b/python/taichi/lang/kernel_arguments.py index d8ad5dca414fc..aa1bf8a3c7324 100644 --- a/python/taichi/lang/kernel_arguments.py +++ b/python/taichi/lang/kernel_arguments.py @@ -1,6 +1,6 @@ import taichi.lang from taichi._lib import core as _ti_core -from taichi.lang import impl +from taichi.lang import impl, ops from taichi.lang.any_array import AnyArray from taichi.lang.enums import Layout from taichi.lang.expr import Expr @@ -10,30 +10,31 @@ class SparseMatrixEntry: - def __init__(self, ptr, i, j): + def __init__(self, ptr, i, j, dtype): self.ptr = ptr self.i = i self.j = j + self.dtype = dtype def _augassign(self, value, op): + call_func = f"insert_triplet_{self.dtype}" if op == 'Add': - taichi.lang.impl.call_internal("insert_triplet", self.ptr, self.i, - self.j, - taichi.lang.impl.ti_float(value)) + taichi.lang.impl.call_internal(call_func, self.ptr, self.i, self.j, + ops.cast(value, self.dtype)) elif op == 'Sub': - taichi.lang.impl.call_internal("insert_triplet", self.ptr, self.i, - self.j, - -taichi.lang.impl.ti_float(value)) + taichi.lang.impl.call_internal(call_func, self.ptr, self.i, self.j, + -ops.cast(value, self.dtype)) else: assert False, "Only operations '+=' and '-=' are supported on sparse matrices." 
class SparseMatrixProxy: - def __init__(self, ptr): + def __init__(self, ptr, dtype): self.ptr = ptr + self.dtype = dtype def subscript(self, i, j): - return SparseMatrixEntry(self.ptr, i, j) + return SparseMatrixEntry(self.ptr, i, j, self.dtype) def decl_scalar_arg(dtype): @@ -48,11 +49,13 @@ def decl_matrix_arg(matrixtype): for _ in range(matrixtype.n)]) -def decl_sparse_matrix(): +def decl_sparse_matrix(dtype): + value_type = cook_dtype(dtype) ptr_type = cook_dtype(u64) # Treat the sparse matrix argument as a scalar since we only need to pass in the base pointer arg_id = impl.get_runtime().prog.decl_arg(ptr_type, False) - return SparseMatrixProxy(_ti_core.make_arg_load_expr(arg_id, ptr_type)) + return SparseMatrixProxy(_ti_core.make_arg_load_expr(arg_id, ptr_type), + value_type) def decl_any_arr_arg(dtype, dim, element_shape, layout): diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 2378295b61ddc..ee6c549be3d92 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -27,7 +27,7 @@ import torch -def func(fn): +def func(fn, is_real_function=False): """Marks a function as callable in Taichi-scope. This decorator transforms a Python function into a Taichi one. Taichi @@ -35,6 +35,7 @@ def func(fn): Args: fn (Callable): The Python function to be decorated + is_real_function (bool): Whether the function is a real function Returns: Callable: The decorated function @@ -51,16 +52,21 @@ def func(fn): """ is_classfunc = _inside_class(level_of_class_stackframe=3) - fun = Func(fn, _classfunc=is_classfunc) + fun = Func(fn, _classfunc=is_classfunc, is_real_function=is_real_function) @functools.wraps(fn) def decorated(*args): return fun.__call__(*args) decorated._is_taichi_function = True + decorated._is_real_function = is_real_function return decorated +def real_func(fn): + return func(fn, is_real_function=True) + + def pyfunc(fn): """Marks a function as callable in both Taichi and Python scopes. 
@@ -92,7 +98,8 @@ def _get_tree_and_ctx(self, is_kernel=True, arg_features=None, args=None, - ast_builder=None): + ast_builder=None, + is_real_function=False): file = oinspect.getsourcefile(self.func) src, start_lineno = oinspect.getsourcelines(self.func) src = [textwrap.fill(line, tabsize=4, width=9999) for line in src] @@ -111,7 +118,7 @@ def _get_tree_and_ctx(self, if isinstance(func_body.returns, ast.Name): global_vars[func_body.returns.id] = self.return_type - if is_kernel or impl.get_runtime().experimental_real_function: + if is_kernel or is_real_function: # inject template parameters into globals for i in self.template_slot_locations: template_var_name = self.argument_names[i] @@ -126,19 +133,25 @@ def _get_tree_and_ctx(self, src=src, start_lineno=start_lineno, file=file, - ast_builder=ast_builder) + ast_builder=ast_builder, + is_real_function=is_real_function) class Func: function_counter = 0 - def __init__(self, _func, _classfunc=False, _pyfunc=False): + def __init__(self, + _func, + _classfunc=False, + _pyfunc=False, + is_real_function=False): self.func = _func self.func_id = Func.function_counter Func.function_counter += 1 self.compiled = None self.classfunc = _classfunc self.pyfunc = _pyfunc + self.is_real_function = is_real_function self.argument_annotations = [] self.argument_names = [] self.return_type = None @@ -158,7 +171,7 @@ def __call__(self, *args): "Taichi functions cannot be called from Python-scope.") return self.func(*args) - if impl.get_runtime().experimental_real_function: + if self.is_real_function: if impl.get_runtime().current_kernel.is_grad: raise TaichiSyntaxError( "Real function in gradient kernels unsupported.") @@ -174,9 +187,10 @@ def __call__(self, *args): self, is_kernel=False, args=args, - ast_builder=impl.get_runtime().prog.current_ast_builder()) + ast_builder=impl.get_runtime().prog.current_ast_builder(), + is_real_function=self.is_real_function) ret = transform_tree(tree, ctx) - if not impl.get_runtime().experimental_real_function: + if not self.is_real_function: if self.return_type and not ctx.returned: raise TaichiSyntaxError( "Function has a return type but does not have a return statement" @@ -185,7 +199,7 @@ def __call__(self, *args): def func_call_rvalue(self, key, args): # Skip the template args, e.g., |self| - assert impl.get_runtime().experimental_real_function + assert self.is_real_function non_template_args = [] for i, anno in enumerate(self.argument_annotations): if not isinstance(anno, template): @@ -196,7 +210,10 @@ def func_call_rvalue(self, key, args): self.taichi_functions[key.instance_id], non_template_args)) def do_compile(self, key, args): - tree, ctx = _get_tree_and_ctx(self, is_kernel=False, args=args) + tree, ctx = _get_tree_and_ctx(self, + is_kernel=False, + args=args, + is_real_function=self.is_real_function) fn = impl.get_runtime().prog.create_function(key) def func_body(): @@ -236,8 +253,7 @@ def extract_arguments(self): annotation = template() # TODO: pyfunc also need type annotation check when real function is enabled, # but that has to happen at runtime when we know which scope it's called from. 
- elif not self.pyfunc and impl.get_runtime( - ).experimental_real_function: + elif not self.pyfunc and self.is_real_function: raise TaichiSyntaxError( f'Taichi function `{self.func.__name__}` parameter `{arg_name}` must be type annotated' ) @@ -305,6 +321,8 @@ def extract_arg(arg, anno): element_dim] if layout == Layout.SOA else shape[ -element_dim:] return to_taichi_type(arg.dtype), len(shape), element_shape, layout + if isinstance(anno, sparse_matrix_builder): + return arg.dtype # Use '#' as a placeholder because other kinds of arguments are not involved in template instantiation return '#' @@ -457,7 +475,7 @@ def taichi_ast_generator(kernel_cxx): try: ctx.ast_builder = kernel_cxx.ast_builder() transform_tree(tree, ctx) - if not impl.get_runtime().experimental_real_function: + if not ctx.is_real_function: if self.return_type and not ctx.returned: raise TaichiSyntaxError( "Kernel has a return type but does not have a return statement" @@ -545,7 +563,7 @@ def func__(*args): provided) launch_ctx.set_arg_int(actual_argument_slot, int(v)) elif isinstance(needed, sparse_matrix_builder): - # Pass only the base pointer of the ti.linalg.sparse_matrix_builder() argument + # Pass only the base pointer of the ti.types.sparse_matrix_builder() argument launch_ctx.set_arg_int(actual_argument_slot, v._get_addr()) elif isinstance(needed, any_arr) and isinstance( v, taichi.lang._ndarray.Ndarray): diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index b79b8fcb539a4..02efa2e6aedbd 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -76,8 +76,8 @@ def __init__(self, n=1, m=1, dt=None, suppress_warning=False): mat.append( list([ impl.make_tensor_element_expr( - self.local_tensor_proxy, - (impl.make_constant_expr_i32(i), ), + self.local_tensor_proxy, (expr.Expr( + i, dtype=primitive_types.i32), ), (len(n), ), self.dynamic_index_stride) ])) else: # now init a Matrix @@ -119,8 +119,8 @@ def __init__(self, n=1, m=1, dt=None, suppress_warning=False): mat[i].append( impl.make_tensor_element_expr( self.local_tensor_proxy, - (impl.make_constant_expr_i32(i), - impl.make_constant_expr_i32(j)), + (expr.Expr(i, dtype=primitive_types.i32), + expr.Expr(j, dtype=primitive_types.i32)), (len(n), len(n[0])), self.dynamic_index_stride)) self.n = len(mat) diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index f682c39d26e5a..5ad52fd460a18 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -240,7 +240,6 @@ class _SpecialConfig: def __init__(self): self.log_level = 'info' self.gdb_trigger = False - self.experimental_real_function = False self.short_circuit_operators = False @@ -383,7 +382,6 @@ def init(arch=None, # submodule configurations (spec_cfg): env_spec.add('log_level', str) env_spec.add('gdb_trigger') - env_spec.add('experimental_real_function') env_spec.add('short_circuit_operators') # compiler configurations (ti.cfg): @@ -405,8 +403,6 @@ def init(arch=None, # dispatch configurations that are not in ti.cfg: if not _test_mode: _ti_core.set_core_trigger_gdb_when_crash(spec_cfg.gdb_trigger) - impl.get_runtime().experimental_real_function = \ - spec_cfg.experimental_real_function impl.get_runtime().short_circuit_operators = \ spec_cfg.short_circuit_operators _logging.set_logging_level(spec_cfg.log_level.lower()) diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py index 4dab4a0ab8ee8..928aec91d1382 100644 --- a/python/taichi/linalg/sparse_matrix.py +++ b/python/taichi/linalg/sparse_matrix.py 
@@ -143,9 +143,10 @@ def __init__(self, dtype=f32): self.num_rows = num_rows self.num_cols = num_cols if num_cols else num_rows + self.dtype = dtype if num_rows is not None: self.ptr = get_runtime().prog.create_sparse_matrix_builder( - num_rows, num_cols, max_num_triplets) + num_rows, num_cols, max_num_triplets, dtype) def _get_addr(self): """Get the address of the sparse matrix""" diff --git a/taichi/aot/module_loader.cpp b/taichi/aot/module_loader.cpp index d3436cbf3effc..9d168d3055da3 100644 --- a/taichi/aot/module_loader.cpp +++ b/taichi/aot/module_loader.cpp @@ -1,18 +1,26 @@ #include "taichi/aot/module_loader.h" +#include "taichi/backends/vulkan/aot_module_loader_impl.h" +#include "taichi/backends/metal/aot_module_loader_impl.h" + namespace taichi { namespace lang { namespace aot { -Kernel *ModuleLoader::get_kernel(const std::string &name) { - auto itr = loaded_kernels_.find(name); - if (itr != loaded_kernels_.end()) { - return itr->second.get(); +std::unique_ptr Module::load(const std::string &path, + Arch arch, + std::any mod_params) { + if (arch == Arch::vulkan) { +#ifdef TI_WITH_VULKAN + return vulkan::make_aot_module(mod_params); +#endif + } else if (arch == Arch::metal) { +#ifdef TI_WITH_METAL + return metal::make_aot_module(mod_params); +#endif + } else { + TI_NOT_IMPLEMENTED; } - auto k = make_new_kernel(name); - auto *kptr = k.get(); - loaded_kernels_[name] = std::move(k); - return kptr; } } // namespace aot diff --git a/taichi/aot/module_loader.h b/taichi/aot/module_loader.h index 303a3bffddbbd..634f8462e92ec 100644 --- a/taichi/aot/module_loader.h +++ b/taichi/aot/module_loader.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -13,10 +14,21 @@ namespace taichi { namespace lang { -class RuntimeContext; +struct RuntimeContext; namespace aot { +class TI_DLL_EXPORT Field { + public: + // Rule of 5 to make MSVC happy + Field() = default; + virtual ~Field() = default; + Field(const Field &) = delete; + Field &operator=(const Field &) = delete; + Field(Field &&) = default; + Field &operator=(Field &&) = default; +}; + class TI_DLL_EXPORT Kernel { public: // Rule of 5 to make MSVC happy @@ -37,30 +49,25 @@ class TI_DLL_EXPORT Kernel { virtual void launch(RuntimeContext *ctx) = 0; }; -class TI_DLL_EXPORT ModuleLoader { +class TI_DLL_EXPORT Module { public: // Rule of 5 to make MSVC happy - ModuleLoader() = default; - virtual ~ModuleLoader() = default; - ModuleLoader(const ModuleLoader &) = delete; - ModuleLoader &operator=(const ModuleLoader &) = delete; - ModuleLoader(ModuleLoader &&) = default; - ModuleLoader &operator=(ModuleLoader &&) = default; - - // TODO: Add method get_kernel(...) once the kernel field data will be - // generic/common across all backends. 
- - virtual bool get_field(const std::string &name, - aot::CompiledFieldData &field) = 0; - - /** - * @brief Get the kernel object - * - * @param name Name of the kernel - * @return Kernel* - */ - Kernel *get_kernel(const std::string &name); - + Module() = default; + virtual ~Module() = default; + Module(const Module &) = delete; + Module &operator=(const Module &) = delete; + Module(Module &&) = default; + Module &operator=(Module &&) = default; + + static std::unique_ptr load(const std::string &path, + Arch arch, + std::any mod_params); + + // Module metadata + virtual Arch arch() const = 0; + virtual uint64_t version() const = 0; + virtual std::unique_ptr get_kernel(const std::string &name) = 0; + virtual std::unique_ptr get_field(const std::string &name) = 0; virtual size_t get_root_size() const = 0; protected: diff --git a/taichi/backends/cpu/cpu_device.h b/taichi/backends/cpu/cpu_device.h index 2309d1b9edc5e..5d5ccfd5e5ff1 100644 --- a/taichi/backends/cpu/cpu_device.h +++ b/taichi/backends/cpu/cpu_device.h @@ -5,7 +5,7 @@ #include #include "taichi/common/core.h" -#include "taichi/backends/device.h" +#include "taichi/llvm/llvm_device.h" #include "taichi/system/virtual_memory.h" namespace taichi { @@ -75,7 +75,7 @@ class CpuStream : public Stream { void command_sync() override{TI_NOT_IMPLEMENTED}; }; -class CpuDevice : public Device { +class CpuDevice : public LlvmDevice { public: struct AllocInfo { void *ptr{nullptr}; diff --git a/taichi/backends/cuda/cuda_caching_allocator.cpp b/taichi/backends/cuda/cuda_caching_allocator.cpp index e857aabad37ce..cb037af88f14b 100644 --- a/taichi/backends/cuda/cuda_caching_allocator.cpp +++ b/taichi/backends/cuda/cuda_caching_allocator.cpp @@ -4,11 +4,12 @@ namespace taichi { namespace lang { namespace cuda { -CudaCachingAllocator::CudaCachingAllocator(Device *device) : device_(device) { +CudaCachingAllocator::CudaCachingAllocator(LlvmDevice *device) + : device_(device) { } uint64_t *CudaCachingAllocator::allocate( - const Device::LlvmRuntimeAllocParams ¶ms) { + const LlvmDevice::LlvmRuntimeAllocParams ¶ms) { uint64_t *ret{nullptr}; auto size_aligned = taichi::iroundup(params.size, taichi_page_size); auto it_blk = mem_blocks_.lower_bound(size_aligned); diff --git a/taichi/backends/cuda/cuda_caching_allocator.h b/taichi/backends/cuda/cuda_caching_allocator.h index 25bf804f05430..14af5a493792f 100644 --- a/taichi/backends/cuda/cuda_caching_allocator.h +++ b/taichi/backends/cuda/cuda_caching_allocator.h @@ -1,8 +1,8 @@ #pragma once -#include "taichi/backends/device.h" #include "taichi/common/core.h" #include "taichi/math/arithmetic.h" +#include "taichi/llvm/llvm_device.h" #include #include @@ -12,14 +12,14 @@ namespace cuda { class CudaCachingAllocator { public: - CudaCachingAllocator(Device *device); + CudaCachingAllocator(LlvmDevice *device); - uint64_t *allocate(const Device::LlvmRuntimeAllocParams ¶ms); + uint64_t *allocate(const LlvmDevice::LlvmRuntimeAllocParams ¶ms); void release(size_t sz, uint64_t *ptr); private: std::multimap mem_blocks_; - Device *device_{nullptr}; + LlvmDevice *device_{nullptr}; }; } // namespace cuda diff --git a/taichi/backends/cuda/cuda_device.h b/taichi/backends/cuda/cuda_device.h index 96d742ff43d52..039c17b012061 100644 --- a/taichi/backends/cuda/cuda_device.h +++ b/taichi/backends/cuda/cuda_device.h @@ -6,7 +6,7 @@ #include "taichi/backends/cuda/cuda_driver.h" #include "taichi/backends/cuda/cuda_caching_allocator.h" #include "taichi/backends/cuda/cuda_context.h" -#include "taichi/backends/device.h" +#include 
"taichi/llvm/llvm_device.h" namespace taichi { namespace lang { @@ -75,7 +75,7 @@ class CudaStream : public Stream { void command_sync() override{TI_NOT_IMPLEMENTED}; }; -class CudaDevice : public Device { +class CudaDevice : public LlvmDevice { public: struct AllocInfo { void *ptr{nullptr}; diff --git a/taichi/backends/cuda/cupti_toolkit.cpp b/taichi/backends/cuda/cupti_toolkit.cpp index 9a57b20363cbe..5c4244fba6222 100644 --- a/taichi/backends/cuda/cupti_toolkit.cpp +++ b/taichi/backends/cuda/cupti_toolkit.cpp @@ -22,7 +22,7 @@ enum class CuptiMetricsDefault : uint { CUPTI_METRIC_DEFAULT_TOTAL = 2 }; -constexpr const char *MetricListDefault[] = { +[[maybe_unused]] constexpr const char *MetricListDefault[] = { "smsp__cycles_elapsed.avg", // CUPTI_METRIC_KERNEL_ELAPSED_CLK_NUMS "smsp__cycles_elapsed.avg.per_second", // CUPTI_METRIC_CORE_FREQUENCY_HZS }; diff --git a/taichi/backends/cuda/cupti_toolkit.h b/taichi/backends/cuda/cupti_toolkit.h index f7e50632435ac..98a5c3309730f 100644 --- a/taichi/backends/cuda/cupti_toolkit.h +++ b/taichi/backends/cuda/cupti_toolkit.h @@ -38,8 +38,8 @@ class CuptiToolkit { void set_status(bool enable); private: - bool enabled_{false}; - CuptiConfig cupti_config_; + [[maybe_unused]] bool enabled_{false}; + [[maybe_unused]] CuptiConfig cupti_config_; CuptiImage cupti_image_; }; diff --git a/taichi/backends/device.cpp b/taichi/backends/device.cpp index 6dba7f1680eff..550f762b5ab4b 100644 --- a/taichi/backends/device.cpp +++ b/taichi/backends/device.cpp @@ -140,15 +140,6 @@ void Device::print_all_cap() const { } } -uint64_t *Device::allocate_llvm_runtime_memory_jit( - const LlvmRuntimeAllocParams ¶ms) { - params.runtime_jit->call( - "runtime_memory_allocate_aligned", params.runtime, params.size, - taichi_page_size); - return taichi_union_cast_with_different_sizes(fetch_result_uint64( - taichi_result_buffer_runtime_query_id, params.result_buffer)); -} - void GraphicsDevice::image_transition(DeviceAllocation img, ImageLayout old_layout, ImageLayout new_layout) { diff --git a/taichi/backends/device.h b/taichi/backends/device.h index 72c4736f98115..82b2788f43c64 100644 --- a/taichi/backends/device.h +++ b/taichi/backends/device.h @@ -407,25 +407,10 @@ class Device { AllocUsage usage{AllocUsage::Storage}; }; - struct LlvmRuntimeAllocParams : AllocParams { - bool use_cached{true}; - JITModule *runtime_jit{nullptr}; - LLVMRuntime *runtime{nullptr}; - uint64 *result_buffer{nullptr}; - }; - virtual DeviceAllocation allocate_memory(const AllocParams ¶ms) = 0; - virtual DeviceAllocation allocate_memory_runtime( - const LlvmRuntimeAllocParams ¶ms) { - TI_NOT_IMPLEMENTED - } - virtual void dealloc_memory(DeviceAllocation handle) = 0; - uint64_t *allocate_llvm_runtime_memory_jit( - const LlvmRuntimeAllocParams ¶ms); - virtual uint64_t get_memory_physical_pointer(DeviceAllocation handle) { TI_NOT_IMPLEMENTED } diff --git a/taichi/backends/metal/aot_module_loader_impl.cpp b/taichi/backends/metal/aot_module_loader_impl.cpp index 0975f52c818d0..a02192c6c7115 100644 --- a/taichi/backends/metal/aot_module_loader_impl.cpp +++ b/taichi/backends/metal/aot_module_loader_impl.cpp @@ -23,9 +23,9 @@ class KernelImpl : public aot::Kernel { const std::string kernel_name_; }; -class AotModuleLoaderImpl : public aot::ModuleLoader { +class AotModuleImpl : public aot::Module { public: - explicit AotModuleLoaderImpl(const AotModuleParams ¶ms) + explicit AotModuleImpl(const AotModuleParams ¶ms) : runtime_(params.runtime) { const std::string bin_path = fmt::format("{}/metadata.tcb", 
params.module_path); @@ -36,16 +36,26 @@ class AotModuleLoaderImpl : public aot::ModuleLoader { } } - bool get_field(const std::string &name, - aot::CompiledFieldData &field) override { - TI_ERROR("AOT: get_field for Metal not implemented yet"); - return false; + std::unique_ptr get_kernel(const std::string &name) override { + return make_new_kernel(name); + } + + std::unique_ptr get_field(const std::string &name) override { + TI_NOT_IMPLEMENTED; } size_t get_root_size() const override { return aot_data_.metadata.root_buffer_size; } + // Module metadata + Arch arch() const override { + return Arch::metal; + } + uint64_t version() const override { + TI_NOT_IMPLEMENTED; + } + private: std::unique_ptr make_new_kernel( const std::string &name) override { @@ -68,9 +78,9 @@ class AotModuleLoaderImpl : public aot::ModuleLoader { } // namespace -std::unique_ptr make_aot_module_loader( - const AotModuleParams ¶ms) { - return std::make_unique(params); +std::unique_ptr make_aot_module(std::any mod_params) { + AotModuleParams params = std::any_cast(mod_params); + return std::make_unique(params); } } // namespace metal diff --git a/taichi/backends/metal/aot_module_loader_impl.h b/taichi/backends/metal/aot_module_loader_impl.h index 86d9a0bab35b3..04cc71f494f78 100644 --- a/taichi/backends/metal/aot_module_loader_impl.h +++ b/taichi/backends/metal/aot_module_loader_impl.h @@ -17,9 +17,7 @@ struct AotModuleParams { KernelManager *runtime{nullptr}; }; -std::unique_ptr make_aot_module_loader( - const AotModuleParams ¶ms); - +std::unique_ptr make_aot_module(std::any mod_params); } // namespace metal } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/aot_module_loader_impl.cpp b/taichi/backends/vulkan/aot_module_loader_impl.cpp index ef85c8768715e..249d0fea570f0 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.cpp +++ b/taichi/backends/vulkan/aot_module_loader_impl.cpp @@ -26,76 +26,103 @@ class KernelImpl : public aot::Kernel { VkRuntime *const runtime_; const KernelHandle handle_; }; -} // namespace -AotModuleLoaderImpl::AotModuleLoaderImpl(const std::string &output_dir) { - const std::string bin_path = fmt::format("{}/metadata.tcb", output_dir); - read_from_binary_file(ti_aot_data_, bin_path); - for (int i = 0; i < ti_aot_data_.kernels.size(); ++i) { - auto k = ti_aot_data_.kernels[i]; - std::vector> spirv_sources_codes; - for (int j = 0; j < k.tasks_attribs.size(); ++j) { - std::vector res = read_spv_file(output_dir, k.tasks_attribs[j]); - spirv_sources_codes.push_back(res); +class AotModuleImpl : public aot::Module { + public: + explicit AotModuleImpl(const AotModuleParams ¶ms) + : runtime_(params.runtime) { + const std::string bin_path = + fmt::format("{}/metadata.tcb", params.module_path); + read_from_binary_file(ti_aot_data_, bin_path); + + for (int i = 0; i < ti_aot_data_.kernels.size(); ++i) { + auto k = ti_aot_data_.kernels[i]; + + std::vector> spirv_sources_codes; + for (int j = 0; j < k.tasks_attribs.size(); ++j) { + std::vector res = + read_spv_file(params.module_path, k.tasks_attribs[j]); + spirv_sources_codes.push_back(res); + } + ti_aot_data_.spirv_codes.push_back(spirv_sources_codes); } - ti_aot_data_.spirv_codes.push_back(spirv_sources_codes); } -} -std::vector AotModuleLoaderImpl::read_spv_file( - const std::string &output_dir, - const TaskAttributes &k) { - const std::string spv_path = fmt::format("{}/{}.spv", output_dir, k.name); - std::vector source_code; - std::ifstream fs(spv_path, std::ios_base::binary | std::ios::ate); - size_t size = 
fs.tellg(); - fs.seekg(0, std::ios::beg); - source_code.resize(size / sizeof(uint32_t)); - fs.read((char *)source_code.data(), size); - fs.close(); - return source_code; -} + std::unique_ptr get_kernel(const std::string &name) override { + return make_new_kernel(name); + } + + std::unique_ptr get_field(const std::string &name) override { + TI_NOT_IMPLEMENTED; + } + + size_t get_root_size() const override { + return ti_aot_data_.root_buffer_size; + } + + // Module metadata + Arch arch() const override { + return Arch::vulkan; + } + uint64_t version() const override { + TI_NOT_IMPLEMENTED; + } -bool AotModuleLoaderImpl::get_kernel(const std::string &name, - VkRuntime::RegisterParams &kernel) { - for (int i = 0; i < ti_aot_data_.kernels.size(); ++i) { - // Offloaded task names encode more than the name of the function, but for - // AOT, only use the name of the function which should be the first part of - // the struct - if (ti_aot_data_.kernels[i].name.rfind(name, 0) == 0) { - kernel.kernel_attribs = ti_aot_data_.kernels[i]; - kernel.task_spirv_source_codes = ti_aot_data_.spirv_codes[i]; - // We don't have to store the number of SNodeTree in |ti_aot_data_| yet, - // because right now we only support a single SNodeTree during AOT. - // TODO: Support multiple SNodeTrees in AOT. - kernel.num_snode_trees = 1; - return true; + private: + bool get_kernel_params_by_name(const std::string &name, + VkRuntime::RegisterParams &kernel) { + for (int i = 0; i < ti_aot_data_.kernels.size(); ++i) { + // Offloaded task names encode more than the name of the function, but for + // AOT, only use the name of the function which should be the first part + // of the struct + if (ti_aot_data_.kernels[i].name.rfind(name, 0) == 0) { + kernel.kernel_attribs = ti_aot_data_.kernels[i]; + kernel.task_spirv_source_codes = ti_aot_data_.spirv_codes[i]; + // We don't have to store the number of SNodeTree in |ti_aot_data_| yet, + // because right now we only support a single SNodeTree during AOT. + // TODO: Support multiple SNodeTrees in AOT. 
+ kernel.num_snode_trees = 1; + return true; + } } + return false; } - return false; -} + std::unique_ptr make_new_kernel( + const std::string &name) override { + VkRuntime::RegisterParams kparams; + if (!get_kernel_params_by_name(name, kparams)) { + TI_DEBUG("Failed to load kernel {}", name); + return nullptr; + } + auto handle = runtime_->register_taichi_kernel(kparams); + return std::make_unique(runtime_, handle); + } -std::unique_ptr AotModuleLoaderImpl::make_new_kernel( - const std::string &name) { - VkRuntime::RegisterParams kparams; - if (!get_kernel(name, kparams)) { - TI_DEBUG("Failed to load kernel {}", name); - return nullptr; + std::vector read_spv_file(const std::string &output_dir, + const TaskAttributes &k) { + const std::string spv_path = fmt::format("{}/{}.spv", output_dir, k.name); + std::vector source_code; + std::ifstream fs(spv_path, std::ios_base::binary | std::ios::ate); + size_t size = fs.tellg(); + fs.seekg(0, std::ios::beg); + source_code.resize(size / sizeof(uint32_t)); + fs.read((char *)source_code.data(), size); + fs.close(); + return source_code; } - auto handle = runtime_->register_taichi_kernel(kparams); - return std::make_unique(runtime_, handle); -} -bool AotModuleLoaderImpl::get_field(const std::string &name, - aot::CompiledFieldData &field) { - TI_ERROR("AOT: get_field for Vulkan not implemented yet"); - return false; -} + TaichiAotData ti_aot_data_; + VkRuntime *runtime_{nullptr}; +}; -size_t AotModuleLoaderImpl::get_root_size() const { - return ti_aot_data_.root_buffer_size; +} // namespace + +std::unique_ptr make_aot_module(std::any mod_params) { + AotModuleParams params = std::any_cast(mod_params); + return std::make_unique(params); } + } // namespace vulkan } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/aot_module_loader_impl.h b/taichi/backends/vulkan/aot_module_loader_impl.h index 48687b655e681..37e28d01f6388 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.h +++ b/taichi/backends/vulkan/aot_module_loader_impl.h @@ -15,27 +15,12 @@ namespace vulkan { class VkRuntime; -class TI_DLL_EXPORT AotModuleLoaderImpl : public aot::ModuleLoader { - public: - explicit AotModuleLoaderImpl(const std::string &output_dir); - - bool get_kernel(const std::string &name, VkRuntime::RegisterParams &kernel); - - bool get_field(const std::string &name, - aot::CompiledFieldData &field) override; - - size_t get_root_size() const override; - - private: - std::unique_ptr make_new_kernel( - const std::string &name) override; - std::vector read_spv_file(const std::string &output_dir, - const TaskAttributes &k); - - TaichiAotData ti_aot_data_; - VkRuntime *runtime_{nullptr}; +struct AotModuleParams { + std::string module_path; + VkRuntime *runtime{nullptr}; }; +std::unique_ptr make_aot_module(std::any mod_params); } // namespace vulkan } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/runtime.h b/taichi/backends/vulkan/runtime.h index aa92abe8af3c8..d862dcf5cf755 100644 --- a/taichi/backends/vulkan/runtime.h +++ b/taichi/backends/vulkan/runtime.h @@ -64,7 +64,7 @@ class CompiledTaichiKernel { TaichiKernelAttributes ti_kernel_attribs_; std::vector tasks_attribs_; - Device *device_; + [[maybe_unused]] Device *device_; InputBuffersMap input_buffers_; diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 7350433c94608..821ccb12b91f6 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -1517,8 +1517,6 @@ class TaskCodegen : 
public IRVisitor { task_attribs_.advisory_total_num_threads = total_num_cells; int num_cells = snode->num_cells_per_container; - int upper_level_cells = total_num_cells / num_cells; - TI_INFO("ListGen {} * {}", total_num_cells / num_cells, num_cells); auto listgen_buffer = diff --git a/taichi/common/exceptions.h b/taichi/common/exceptions.h index 600b0de12a3e7..8cdb4cfe40223 100644 --- a/taichi/common/exceptions.h +++ b/taichi/common/exceptions.h @@ -5,26 +5,27 @@ namespace lang { class IRModified {}; -class TaichiTypeError : public std::exception { +class TaichiExceptionImpl : public std::exception { std::string msg_; public: - TaichiTypeError(const std::string msg) : msg_(msg) { + TaichiExceptionImpl(const std::string msg) : msg_(msg) { } const char *what() const throw() override { return msg_.c_str(); } }; -class TaichiSyntaxError : public std::exception { - std::string msg_; +class TaichiTypeError : public TaichiExceptionImpl { + using TaichiExceptionImpl::TaichiExceptionImpl; +}; - public: - TaichiSyntaxError(const std::string msg) : msg_(msg) { - } - const char *what() const throw() override { - return msg_.c_str(); - } +class TaichiSyntaxError : public TaichiExceptionImpl { + using TaichiExceptionImpl::TaichiExceptionImpl; +}; + +class TaichiRuntimeError : public TaichiExceptionImpl { + using TaichiExceptionImpl::TaichiExceptionImpl; }; } // namespace lang diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 2e18ad7c76b05..cbbf7c089d0b2 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -909,19 +909,19 @@ void ASTBuilder::insert_expr_stmt(const Expr &val) { void ASTBuilder::create_scope(std::unique_ptr &list, LoopType tp) { TI_ASSERT(list == nullptr); - list = std::make_unique(); - if (!stack_.empty()) { - list->parent_stmt = get_last_stmt(); - } - stack_.push_back(list.get()); LoopState prev = loop_state_stack_.back(); if (tp == NotLoop) { loop_state_stack_.push_back(prev); - } else if (tp == For && prev == None) { + } else if (tp == For && stack_.size() == 1) { loop_state_stack_.push_back(Outermost); } else { loop_state_stack_.push_back(Inner); } + list = std::make_unique(); + if (!stack_.empty()) { + list->parent_stmt = get_last_stmt(); + } + stack_.push_back(list.get()); } void ASTBuilder::pop_scope() { diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp index a6be09c4a0372..b5b715884868c 100644 --- a/taichi/ir/snode.cpp +++ b/taichi/ir/snode.cpp @@ -50,7 +50,10 @@ SNode &SNode::create_node(std::vector axes, auto &new_node = insert_children(type); for (int i = 0; i < (int)axes.size(); i++) { - TI_ASSERT(sizes[i] > 0); + if (sizes[i] <= 0) { + throw TaichiRuntimeError( + "Every dimension of a Taichi field should be positive"); + } auto &ind = axes[i]; new_node.extractors[ind.value].activate( bit::log2int(bit::least_pot_bound(sizes[i]))); diff --git a/taichi/ir/statements.cpp b/taichi/ir/statements.cpp index 5f2b1821dc64b..2da47acc5ed28 100644 --- a/taichi/ir/statements.cpp +++ b/taichi/ir/statements.cpp @@ -305,11 +305,11 @@ MeshForStmt::MeshForStmt(mesh::Mesh *mesh, int num_cpu_threads, int block_dim) : mesh(mesh), - major_from_type(element_type), body(std::move(body)), bit_vectorize(bit_vectorize), num_cpu_threads(num_cpu_threads), - block_dim(block_dim) { + block_dim(block_dim), + major_from_type(element_type) { this->body->parent_stmt = this; TI_STMT_REG_FIELDS; } diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp index a979f55bf4ab9..dd9547589949b 100644 --- a/taichi/jit/jit_session.cpp +++ 
b/taichi/jit/jit_session.cpp @@ -33,6 +33,7 @@ std::unique_ptr JITSession::create(LlvmProgramImpl *llvm_prog, #else TI_ERROR("Llvm disabled"); #endif + return nullptr; } TLANG_NAMESPACE_END diff --git a/taichi/llvm/llvm_device.cpp b/taichi/llvm/llvm_device.cpp new file mode 100644 index 0000000000000..5643dd9dd16f2 --- /dev/null +++ b/taichi/llvm/llvm_device.cpp @@ -0,0 +1,16 @@ +#include "taichi/llvm/llvm_device.h" + +namespace taichi { +namespace lang { + +uint64_t *LlvmDevice::allocate_llvm_runtime_memory_jit( + const LlvmRuntimeAllocParams ¶ms) { + params.runtime_jit->call( + "runtime_memory_allocate_aligned", params.runtime, params.size, + taichi_page_size); + return taichi_union_cast_with_different_sizes(fetch_result_uint64( + taichi_result_buffer_runtime_query_id, params.result_buffer)); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_device.h b/taichi/llvm/llvm_device.h new file mode 100644 index 0000000000000..e692f68521459 --- /dev/null +++ b/taichi/llvm/llvm_device.h @@ -0,0 +1,27 @@ +#pragma once + +#include "taichi/backends/device.h" + +namespace taichi { +namespace lang { + +class LlvmDevice : public Device { + public: + struct LlvmRuntimeAllocParams : AllocParams { + bool use_cached{true}; + JITModule *runtime_jit{nullptr}; + LLVMRuntime *runtime{nullptr}; + uint64 *result_buffer{nullptr}; + }; + + virtual DeviceAllocation allocate_memory_runtime( + const LlvmRuntimeAllocParams ¶ms) { + TI_NOT_IMPLEMENTED; + } + + uint64_t *allocate_llvm_runtime_memory_jit( + const LlvmRuntimeAllocParams ¶ms); +}; + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index b28202da03b18..b0d7e83b790c2 100755 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -573,6 +573,11 @@ cpu::CpuDevice *LlvmProgramImpl::cpu_device() { return static_cast(device_.get()); } +LlvmDevice *LlvmProgramImpl::llvm_device() { + TI_ASSERT(dynamic_cast(device_.get())); + return static_cast(device_.get()); +} + DevicePtr LlvmProgramImpl::get_snode_tree_device_ptr(int tree_id) { DeviceAllocation tree_alloc = snode_tree_allocs_[tree_id]; return tree_alloc.get_ptr(); @@ -588,7 +593,7 @@ DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray( tlctx = llvm_context_host_.get(); } - return get_compute_device()->allocate_memory_runtime( + return llvm_device()->allocate_memory_runtime( {{alloc_size, /*host_write=*/false, /*host_read=*/false, /*export_sharing=*/false, AllocUsage::Storage}, config->ndarray_use_cached_allocator, diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index 21d3d89dd8a43..72ef3c111b42f 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -1,4 +1,5 @@ #pragma once +#include "taichi/llvm/llvm_device.h" #include "taichi/system/snode_tree_buffer_manager.h" #include "taichi/inc/constants.h" #include "taichi/program/compile_config.h" @@ -174,6 +175,7 @@ class LlvmProgramImpl : public ProgramImpl { std::shared_ptr device_{nullptr}; cuda::CudaDevice *cuda_device(); cpu::CpuDevice *cpu_device(); + LlvmDevice *llvm_device(); }; } // namespace lang } // namespace taichi diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index 196fda01fe989..a54923d353ade 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -16,7 +16,6 @@ Ndarray::Ndarray(Program *prog, const std::vector &shape) : dtype(type), shape(shape), - prog_impl_(prog->get_llvm_program_impl()), num_active_indices(shape.size()), 
nelement_(std::accumulate(std::begin(shape), std::end(shape), @@ -24,6 +23,7 @@ Ndarray::Ndarray(Program *prog, std::multiplies<>())), element_size_(data_type_size(dtype)), device_(prog->get_device_shared()), + prog_impl_(prog->get_llvm_program_impl()), rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, prog->result_buffer); diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 269ca30092642..48fd81c676000 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -81,8 +81,12 @@ Program::Program(Arch desired_arch) TI_ERROR("This taichi is not compiled with LLVM"); #endif } else if (config.arch == Arch::metal) { +#ifdef TI_WITH_METAL TI_ASSERT(metal::is_metal_api_available()); program_impl_ = std::make_unique(config); +#else + TI_ERROR("This taichi is not compiled with Metal") +#endif } else if (config.arch == Arch::vulkan) { #ifdef TI_WITH_VULKAN TI_ASSERT(vulkan::is_vulkan_api_available()); diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index a42ab8d205f71..4d2a26f32ce41 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -10,34 +10,53 @@ namespace lang { SparseMatrixBuilder::SparseMatrixBuilder(int rows, int cols, - int max_num_triplets) - : rows_(rows), cols_(cols), max_num_triplets_(max_num_triplets) { - data_.reserve(max_num_triplets * 3); - data_base_ptr_ = get_data_base_ptr(); -} - -void *SparseMatrixBuilder::get_data_base_ptr() { - return data_.data(); -} - -void SparseMatrixBuilder::print_triplets() { - fmt::print("n={}, m={}, num_triplets={} (max={})", rows_, cols_, + int max_num_triplets, + DataType dtype) + : rows_(rows), + cols_(cols), + max_num_triplets_(max_num_triplets), + dtype_(dtype) { + auto element_size = data_type_size(dtype); + TI_ASSERT((element_size == 4 || element_size == 8)); + data_base_ptr_ = + std::make_unique(max_num_triplets_ * 3 * element_size); +} + +template +void SparseMatrixBuilder::print_template() { + fmt::print("n={}, m={}, num_triplets={} (max={})\n", rows_, cols_, num_triplets_, max_num_triplets_); + T *data = reinterpret_cast(data_base_ptr_.get()); for (int64 i = 0; i < num_triplets_; i++) { - fmt::print("({}, {}) val={}", data_[i * 3], data_[i * 3 + 1], - taichi_union_cast(data_[i * 3 + 2])); + fmt::print("({}, {}) val={}\n", ((G *)data)[i * 3], ((G *)data)[i * 3 + 1], + taichi_union_cast(data[i * 3 + 2])); } fmt::print("\n"); } -SparseMatrix SparseMatrixBuilder::build() { - TI_ASSERT(built_ == false); - built_ = true; - using T = Eigen::Triplet; - std::vector triplets; +void SparseMatrixBuilder::print_triplets() { + auto element_size = data_type_size(dtype_); + switch (element_size) { + case 4: + print_template(); + break; + case 8: + print_template(); + break; + default: + TI_ERROR("Unsupported sparse matrix data type!"); + break; + } +} + +template +SparseMatrix SparseMatrixBuilder::build_template() { + using V = Eigen::Triplet; + std::vector triplets; + T *data = reinterpret_cast(data_base_ptr_.get()); for (int i = 0; i < num_triplets_; i++) { - triplets.push_back(T(data_[i * 3], data_[i * 3 + 1], - taichi_union_cast(data_[i * 3 + 2]))); + triplets.push_back(V(((G *)data)[i * 3], ((G *)data)[i * 3 + 1], + taichi_union_cast(data[i * 3 + 2]))); } SparseMatrix sm(rows_, cols_); sm.get_matrix().setFromTriplets(triplets.begin(), triplets.end()); @@ -45,6 +64,21 @@ SparseMatrix SparseMatrixBuilder::build() { return sm; } +SparseMatrix 
SparseMatrixBuilder::build() { + TI_ASSERT(built_ == false); + built_ = true; + auto element_size = data_type_size(dtype_); + switch (element_size) { + case 4: + return build_template(); + case 8: + return build_template(); + default: + TI_ERROR("Unsupported sparse matrix data type!"); + break; + } +} + void SparseMatrixBuilder::clear() { built_ = false; num_triplets_ = 0; diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index 2de867d1ce249..f5baa62539912 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -2,6 +2,8 @@ #include "taichi/common/core.h" #include "taichi/inc/constants.h" +#include "taichi/ir/type_utils.h" + #include "Eigen/Sparse" namespace taichi { @@ -11,9 +13,7 @@ class SparseMatrix; class SparseMatrixBuilder { public: - SparseMatrixBuilder(int rows, int cols, int max_num_triplets); - - void *get_data_base_ptr(); + SparseMatrixBuilder(int rows, int cols, int max_num_triplets, DataType dtype); void print_triplets(); @@ -21,14 +21,21 @@ class SparseMatrixBuilder { void clear(); + private: + template + void print_template(); + + template + SparseMatrix build_template(); + private: uint64 num_triplets_{0}; - void *data_base_ptr_{nullptr}; - std::vector data_; + std::unique_ptr data_base_ptr_{nullptr}; int rows_{0}; int cols_{0}; uint64 max_num_triplets_{0}; bool built_{false}; + DataType dtype_{PrimitiveType::f32}; }; class SparseMatrix { diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 0d52388c295e7..a8cbfa1bafe33 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -64,6 +64,8 @@ void export_lang(py::module &m) { PyExc_TypeError); py::register_exception(m, "TaichiSyntaxError", PyExc_SyntaxError); + py::register_exception(m, "TaichiRuntimeError", + PyExc_RuntimeError); py::enum_(m, "Arch", py::arithmetic()) #define PER_ARCH(x) .value(#x, Arch::x) #include "taichi/inc/archs.inc.h" @@ -357,10 +359,11 @@ void export_lang(py::module &m) { .def("create_function", &Program::create_function, py::return_value_policy::reference) .def("create_sparse_matrix_builder", - [](Program *program, int n, int m, uint64 max_num_entries) { + [](Program *program, int n, int m, uint64 max_num_entries, + DataType dtype) { TI_ERROR_IF(!arch_is_cpu(program->config.arch), "SparseMatrix only supports CPU for now."); - return SparseMatrixBuilder(n, m, max_num_entries); + return SparseMatrixBuilder(n, m, max_num_entries, dtype); }) .def("create_sparse_matrix", [](Program *program, int n, int m) { diff --git a/taichi/runtime/llvm/internal_functions.h b/taichi/runtime/llvm/internal_functions.h index b8451635204ed..591ccddc64bcf 100644 --- a/taichi/runtime/llvm/internal_functions.h +++ b/taichi/runtime/llvm/internal_functions.h @@ -9,6 +9,17 @@ } \ } while (0) +#define ATOMIC_INSERT(T) \ + do { \ + auto base_ptr = (int64 *)base_ptr_; \ + int64 *num_triplets = base_ptr; \ + auto data_base_ptr = *(T **)(base_ptr + 1); \ + auto triplet_id = atomic_add_i64(num_triplets, 1); \ + data_base_ptr[triplet_id * 3] = i; \ + data_base_ptr[triplet_id * 3 + 1] = j; \ + data_base_ptr[triplet_id * 3 + 2] = taichi_union_cast(value); \ + } while (0); + i32 do_nothing(RuntimeContext *context) { return 0; } @@ -20,20 +31,21 @@ i32 refresh_counter(RuntimeContext *context) { return 0; } -i32 insert_triplet(RuntimeContext *context, - int64 base_ptr_, - int i, - int j, - float value) { - auto base_ptr = (int64 *)base_ptr_; - - int64 *num_triplets = base_ptr; - auto data_base_ptr = *(int32 **)(base_ptr + 1); +i32 
insert_triplet_f32(RuntimeContext *context,
+                       int64 base_ptr_,
+                       int i,
+                       int j,
+                       float value) {
+  ATOMIC_INSERT(int32);
+  return 0;
+}
 
-  auto triplet_id = atomic_add_i64(num_triplets, 1);
-  data_base_ptr[triplet_id * 3] = i;
-  data_base_ptr[triplet_id * 3 + 1] = j;
-  data_base_ptr[triplet_id * 3 + 2] = taichi_union_cast<int32>(value);
+i32 insert_triplet_f64(RuntimeContext *context,
+                       int64 base_ptr_,
+                       int i,
+                       int j,
+                       float64 value) {
+  ATOMIC_INSERT(int64);
   return 0;
 }
diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp
index 88995f468a217..81daee000c54d 100644
--- a/taichi/transforms/lower_ast.cpp
+++ b/taichi/transforms/lower_ast.cpp
@@ -33,6 +33,7 @@ class LowerAST : public IRVisitor {
   Stmt *capturing_loop_;
   std::unordered_set<Stmt *> detected_fors_with_break_;
   Block *current_block_;
+  int current_block_depth_;
 
   FlattenContext make_flatten_ctx() {
     FlattenContext fctx;
@@ -43,7 +44,8 @@ class LowerAST : public IRVisitor {
  public:
   explicit LowerAST(const std::unordered_set<Stmt *> &_detected_fors_with_break)
       : detected_fors_with_break_(_detected_fors_with_break),
-        current_block_(nullptr) {
+        current_block_(nullptr),
+        current_block_depth_(0) {
     // TODO: change this to false
     allow_undefined_visitor = true;
     capturing_loop_ = nullptr;
@@ -53,9 +55,11 @@ class LowerAST : public IRVisitor {
     auto backup_block = this->current_block_;
     this->current_block_ = stmt_list;
     auto stmts = make_raw_pointer_list(stmt_list->statements);
+    current_block_depth_++;
     for (auto &stmt : stmts) {
       stmt->accept(this);
     }
+    current_block_depth_--;
     this->current_block_ = backup_block;
   }
@@ -201,8 +205,8 @@ class LowerAST : public IRVisitor {
     flatten_rvalue(begin, &fctx);
     flatten_rvalue(end, &fctx);
     bool is_good_range_for =
-        capturing_loop_ == nullptr || detected_fors_with_break_.find(stmt) ==
-                                          detected_fors_with_break_.end();
+        current_block_depth_ == 1 || detected_fors_with_break_.find(stmt) ==
+                                         detected_fors_with_break_.end();
     // #578: a good range for is a range for that doesn't contains a break
     // statement
     if (is_good_range_for) {
diff --git a/taichi/transforms/make_mesh_block_local.cpp b/taichi/transforms/make_mesh_block_local.cpp
index bb4699237eefa..98f5319141a71 100644
--- a/taichi/transforms/make_mesh_block_local.cpp
+++ b/taichi/transforms/make_mesh_block_local.cpp
@@ -400,7 +400,7 @@ void MakeMeshBlockLocal::fetch_mapping(
 
 MakeMeshBlockLocal::MakeMeshBlockLocal(OffloadedStmt *offload,
                                        const CompileConfig &config)
-    : offload_(offload), config_(config) {
+    : config_(config), offload_(offload) {
   // Step 0: simplify l2g + g2r -> l2r
   simplify_nested_conversion();
diff --git a/taichi/transforms/offload.cpp b/taichi/transforms/offload.cpp
index 00abf97230691..135e0c7053e16 100644
--- a/taichi/transforms/offload.cpp
+++ b/taichi/transforms/offload.cpp
@@ -672,7 +672,7 @@ class FixCrossOffloadReferences : public BasicStmtVisitor {
   }
 
  private:
-  const CompileConfig &config_;
+  [[maybe_unused]] const CompileConfig &config_;
   StmtToOffsetMap local_to_global_offset_;
   std::unordered_map<Stmt *, Stmt *> stmt_to_offloaded_;
   OffloadedRanges *const offloaded_ranges_;
diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp
index 5ffff6ae87600..45885d7e28bf8 100644
--- a/tests/cpp/aot/aot_save_load_test.cpp
+++ b/tests/cpp/aot/aot_save_load_test.cpp
@@ -4,13 +4,17 @@
 #include "taichi/program/program.h"
 #ifdef TI_WITH_VULKAN
 #include "taichi/backends/vulkan/aot_module_loader_impl.h"
+#include "taichi/backends/device.h"
+#include "taichi/backends/vulkan/vulkan_device.h"
+#include 
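
The `current_block_depth_` counter added to `LowerAST` above changes which range-fors are treated as the parallel outermost loop: a for that is syntactically outermost but wrapped in a non-loop scope such as an `if` now sits at depth 2 and is lowered serially, so a `break` inside it becomes legal. A small sketch of the pattern this enables, in the spirit of the new `test_break_in_outermost_for_not_in_outermost_scope` regression test below (function name and bounds are made up for illustration):

```python
import taichi as ti

ti.init(arch=ti.cpu)

@ti.kernel
def first_multiple_of_seven(start: ti.i32) -> ti.i32:
    found = -1
    if start > 0:
        # This for is outermost in the kernel body but sits inside an `if`,
        # so it is now lowered as a serial loop and may contain a break.
        for i in range(start, start + 7):
            if i % 7 == 0:
                found = i
                break
    return found

assert first_multiple_of_seven(20) == 21
```
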
"taichi/backends/vulkan/vulkan_device_creator.h" #include "taichi/backends/vulkan/vulkan_loader.h" +#include "taichi/backends/vulkan/vulkan_utils.h" #endif using namespace taichi; using namespace lang; -static void aot_save() { +[[maybe_unused]] static void aot_save() { auto program = Program(Arch::vulkan); program.config.advanced_optimization = false; @@ -91,19 +95,59 @@ TEST(AotSaveLoad, Vulkan) { aot_save(); - vulkan::AotModuleLoaderImpl aot_loader("."); - vulkan::VkRuntime::RegisterParams init_kernel, ret_kernel; + // API based on proposal https://github.com/taichi-dev/taichi/issues/3642 + // Initialize Vulkan program + taichi::uint64 *result_buffer{nullptr}; + taichi::lang::RuntimeContext host_ctx; + auto memory_pool = + std::make_unique(Arch::vulkan, nullptr); + result_buffer = (taichi::uint64 *)memory_pool->allocate( + sizeof(taichi::uint64) * taichi_result_buffer_entries, 8); + + // Create Taichi Device for computation + lang::vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = + taichi::lang::vulkan::VulkanEnvSettings::kApiVersion(); + auto embedded_device = + std::make_unique(evd_params); + + // Create Vulkan runtime + vulkan::VkRuntime::Params params; + params.host_result_buffer = result_buffer; + params.device = embedded_device->device(); + auto vulkan_runtime = + std::make_unique(std::move(params)); + + // Run AOT module loader + vulkan::AotModuleParams mod_params; + mod_params.module_path = "."; + mod_params.runtime = vulkan_runtime.get(); + + std::unique_ptr vk_module = + aot::Module::load(".", Arch::vulkan, mod_params); + EXPECT_TRUE(vk_module); + + // Retrieve kernels/fields/etc from AOT module to initialize runtime + auto root_size = vk_module->get_root_size(); + EXPECT_EQ(root_size, 64); + vulkan_runtime->add_root_buffer(root_size); - auto ret = aot_loader.get_kernel("init", init_kernel); - EXPECT_TRUE(ret); + auto init_kernel = vk_module->get_kernel("init"); + EXPECT_TRUE(init_kernel); - ret = aot_loader.get_kernel("ret", ret_kernel); - EXPECT_TRUE(ret); + auto ret_kernel = vk_module->get_kernel("ret"); + EXPECT_TRUE(ret_kernel); - ret = aot_loader.get_kernel("ret2", ret_kernel); - EXPECT_FALSE(ret); + auto ret2_kernel = vk_module->get_kernel("ret2"); + EXPECT_FALSE(ret2_kernel); - auto root_size = aot_loader.get_root_size(); - EXPECT_EQ(root_size, 64); + // Run kernels + init_kernel->launch(&host_ctx); + ret_kernel->launch(&host_ctx); + vulkan_runtime->synchronize(); + + // auto x_field = vk_module.get_field("x"); + // EXPECT_TRUE(x_field); + // x_field.copy_to(/*dst=*/x.get()); } #endif diff --git a/tests/cpp/transforms/extract_constant_test.cpp b/tests/cpp/transforms/extract_constant_test.cpp index b579f9178a429..4e4693d3bc501 100644 --- a/tests/cpp/transforms/extract_constant_test.cpp +++ b/tests/cpp/transforms/extract_constant_test.cpp @@ -26,7 +26,7 @@ TEST_F(ExtractConstantTest, ExtractConstant) { builder.set_insertion_point_to_loop_begin(for_stmt); auto *x = builder.create_local_var(get_data_type()); auto *x_v = builder.create_local_load(x); - auto *sum = builder.create_add(x_v, builder.get_int32(1)); + builder.create_add(x_v, builder.get_int32(1)); auto ir = builder.extract_ir(); ASSERT_TRUE(ir->is()); diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 52862e66c4328..bd732a8152146 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -18,21 +18,22 @@ 'atomic_min', 'atomic_or', 'atomic_sub', 'atomic_xor', 'axes', 'bit_cast', 'bit_shr', 'block_dim', 'block_local', 'cache_read_only', 'cast', 'cc', 
'ceil', 'clear_all_gradients', 'cos', 'cpu', 'cuda', 'data_oriented', - 'deactivate', 'deactivate_all_snodes', 'dx11', 'eig', 'exp', 'ext_arr', - 'extension', 'f16', 'f32', 'f64', 'field', 'float16', 'float32', 'float64', - 'floor', 'func', 'get_addr', 'global_thread_idx', 'gpu', 'grouped', - 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', - 'ik', 'ikl', 'il', 'init', 'int16', 'int32', 'int64', 'int8', 'is_active', - 'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l', - 'lang', 'length', 'linalg', 'log', 'max', 'mesh_local', 'mesh_patch_idx', - 'metal', 'min', 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl', - 'parallelize', 'polar_decompose', 'pow', 'profiler', 'randn', 'random', - 'raw_div', 'raw_mod', 'rescale_index', 'reset', 'rgb_to_hex', 'root', - 'round', 'rsqrt', 'select', 'set_logging_level', 'sin', - 'sparse_matrix_builder', 'sqrt', 'static', 'static_assert', 'static_print', - 'stop_grad', 'svd', 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools', - 'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64', - 'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero' + 'deactivate', 'deactivate_all_snodes', 'dx11', 'eig', 'exp', + 'experimental', 'ext_arr', 'extension', 'f16', 'f32', 'f64', 'field', + 'float16', 'float32', 'float64', 'floor', 'func', 'get_addr', + 'global_thread_idx', 'gpu', 'grouped', 'hex_to_rgb', 'i', 'i16', 'i32', + 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init', + 'int16', 'int32', 'int64', 'int8', 'is_active', 'is_logging_effective', + 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length', + 'linalg', 'log', 'max', 'mesh_local', 'mesh_patch_idx', 'metal', 'min', + 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl', 'parallelize', + 'polar_decompose', 'pow', 'profiler', 'randn', 'random', 'raw_div', + 'raw_mod', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round', + 'rsqrt', 'select', 'set_logging_level', 'sin', 'sparse_matrix_builder', + 'sqrt', 'static', 'static_assert', 'static_print', 'stop_grad', 'svd', + 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools', 'types', 'u16', + 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan', + 'wasm', 'x64', 'x86_64', 'zero' ] user_api[ti.Field] = [ 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_torch', 'parent', diff --git a/tests/python/test_field.py b/tests/python/test_field.py index 7f1bcd1fbcf0c..06af7ed63a0ac 100644 --- a/tests/python/test_field.py +++ b/tests/python/test_field.py @@ -181,3 +181,11 @@ def test_field_copy_from_with_non_filed_object(): other = np.zeros((2, 3)) with pytest.raises(TypeError): x.copy_from(other) + + +@test_utils.test() +def test_field_shape_0(): + with pytest.raises( + ti._lib.core.TaichiRuntimeError, + match="Every dimension of a Taichi field should be positive"): + x = ti.field(dtype=ti.f32, shape=0) diff --git a/tests/python/test_function.py b/tests/python/test_function.py index eac5d4395c627..adf27083336cf 100644 --- a/tests/python/test_function.py +++ b/tests/python/test_function.py @@ -4,11 +4,11 @@ from tests import test_utils -@test_utils.test(experimental_real_function=True, arch=[ti.cpu, ti.gpu]) +@test_utils.test(arch=[ti.cpu, ti.gpu]) def test_function_without_return(): x = ti.field(ti.i32, shape=()) - @ti.func + @ti.experimental.real_func def foo(val: ti.i32): x[None] += val @@ -22,13 +22,11 @@ def run(): assert x[None] == 42 -@test_utils.test(experimental_real_function=True, - arch=[ti.cpu, ti.gpu], - debug=True) +@test_utils.test(arch=[ti.cpu, 
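
`TaichiRuntimeError`, registered in `export_lang.cpp` above and surfaced as a Python `RuntimeError` subclass, is what the new `test_field_shape_0` catches. A hedged sketch of what user code sees when a field dimension is not positive:

```python
import taichi as ti

ti.init(arch=ti.cpu)

try:
    x = ti.field(dtype=ti.f32, shape=0)  # zero-sized dimension is rejected
except ti._lib.core.TaichiRuntimeError as e:
    # Message asserted by the new test:
    # "Every dimension of a Taichi field should be positive"
    print(e)
```
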
ti.gpu], debug=True) def test_function_with_return(): x = ti.field(ti.i32, shape=()) - @ti.func + @ti.experimental.real_func def foo(val: ti.i32) -> ti.i32: x[None] += val return val @@ -44,113 +42,25 @@ def run(): assert x[None] == 42 -# @test_utils.test(experimental_real_function=True, arch=[ti.cpu, ti.gpu]) -# def test_function_with_multiple_last_return(): -# x = ti.field(ti.i32, shape=()) -# -# @ti.func -# def foo(val: ti.i32) -> ti.i32: -# if x[None]: -# x[None] += val * 2 -# return val * 2 -# else: -# x[None] += val -# return val -# -# @ti.kernel -# def run(): -# a = foo(40) -# foo(1) -# assert a == 40 -# -# x[None] = 0 -# run() -# assert x[None] == 42 -# -# -# @test_utils.test(experimental_real_function=True, arch=[ti.cpu, ti.gpu]) -# def test_call_expressions(): -# x = ti.field(ti.i32, shape=()) -# -# @ti.func -# def foo(val: ti.i32) -> ti.i32: -# if x[None] > 10: -# x[None] += 1 -# x[None] += val -# return 0 -# -# @ti.kernel -# def run(): -# assert foo(15) == 0 -# assert foo(10) == 0 -# -# x[None] = 0 -# run() -# assert x[None] == 26 -# -# -# @test_utils.test(arch=ti.cpu, experimental_real_function=True) -# def test_failing_multiple_return(): -# x = ti.field(ti.i32, shape=()) -# -# @ti.func -# def foo(val: ti.i32) -> ti.i32: -# if x[None] > 10: -# if x[None] > 20: -# return 1 -# x[None] += 1 -# x[None] += val -# return 0 -# -# @ti.kernel -# def run(): -# assert foo(15) == 0 -# assert foo(10) == 0 -# assert foo(100) == 1 -# -# with pytest.raises(AssertionError): -# x[None] = 0 -# run() -# assert x[None] == 26 - -# -# @test_utils.test(experimental_real_function=True, arch=[ti.cpu, ti.gpu]) -# def test_python_function(): -# x = ti.field(ti.i32, shape=()) -# -# @ti.func -# def inc(val: ti.i32): -# x[None] += val -# -# def identity(x): -# return x -# -# @ti.data_oriented -# class A: -# def __init__(self): -# self.count = ti.field(ti.i32, shape=()) -# self.count[None] = 0 -# -# @ti.lang.kernel_impl.pyfunc -# def dec(self, val: ti.i32) -> ti.i32: -# self.count[None] += 1 -# x[None] -= val -# return self.count[None] -# -# @ti.kernel -# def run(self) -> ti.i32: -# a = self.dec(1) -# identity(2) -# inc(identity(3)) -# return a -# -# a = A() -# x[None] = 0 -# assert a.run() == 1 -# assert a.run() == 2 -# assert x[None] == 4 -# assert a.dec(4) == 3 -# assert x[None] == 0 +@test_utils.test(arch=[ti.cpu, ti.gpu]) +def test_call_expressions(): + x = ti.field(ti.i32, shape=()) + + @ti.experimental.real_func + def foo(val: ti.i32) -> ti.i32: + if x[None] > 10: + x[None] += 1 + x[None] += val + return 0 + + @ti.kernel + def run(): + assert foo(15) == 0 + assert foo(10) == 0 + + x[None] = 0 + run() + assert x[None] == 26 @test_utils.test(arch=[ti.cpu, ti.cuda], debug=True) @@ -220,7 +130,7 @@ def run_func(): run_func() -@test_utils.test(experimental_real_function=True, arch=[ti.cpu, ti.gpu]) +@test_utils.test(arch=[ti.cpu, ti.gpu]) def test_experimental_templates(): x = ti.field(ti.i32, shape=()) y = ti.field(ti.i32, shape=()) @@ -240,7 +150,7 @@ def run_kernel(): assert x[None] == 11 assert y[None] == 21 - @ti.func + @ti.experimental.real_func def inc(x: ti.template()): x[None] += 1 @@ -266,21 +176,21 @@ def verify(): verify() -@test_utils.test(experimental_real_function=True, arch=[ti.cpu, ti.gpu]) +@test_utils.test(arch=[ti.cpu, ti.gpu]) def test_missing_arg_annotation(): with pytest.raises(ti.TaichiSyntaxError, match='must be type annotated'): - @ti.func + @ti.experimental.real_func def add(a, b: ti.i32) -> ti.i32: return a + b -@test_utils.test(experimental_real_function=True, arch=[ti.cpu, 
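
These test changes replace the global `experimental_real_function=True` init flag with the per-function `@ti.experimental.real_func` decorator. A minimal sketch of the new spelling, using only constructs the updated tests themselves exercise (annotated arguments and return value, calls from a kernel):

```python
import taichi as ti

ti.init(arch=ti.cpu)

x = ti.field(ti.i32, shape=())

@ti.experimental.real_func
def accumulate(val: ti.i32) -> ti.i32:
    # Real functions require annotated arguments and an annotated return value.
    x[None] += val
    return val

@ti.kernel
def run() -> ti.i32:
    return accumulate(41) + accumulate(1)

x[None] = 0
assert run() == 42
assert x[None] == 42
```
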
ti.gpu]) +@test_utils.test(arch=[ti.cpu, ti.gpu]) def test_missing_return_annotation(): with pytest.raises(ti.TaichiCompilationError, match='return value must be annotated'): - @ti.func + @ti.experimental.real_func def add(a: ti.i32, b: ti.i32): return a + b diff --git a/tests/python/test_literal.py b/tests/python/test_literal.py index f62822b9c8239..88ac3827df213 100644 --- a/tests/python/test_literal.py +++ b/tests/python/test_literal.py @@ -23,7 +23,9 @@ def test_literal_multi_args_error(): def multi_args_error(): a = ti.i64(1, 2) - with pytest.raises(ti.TaichiSyntaxError): + with pytest.raises( + ti.TaichiSyntaxError, + match="Type annotation can only be given to a single literal."): multi_args_error() @@ -33,7 +35,9 @@ def test_literal_keywords_error(): def keywords_error(): a = ti.f64(1, x=2) - with pytest.raises(ti.TaichiSyntaxError): + with pytest.raises( + ti.TaichiSyntaxError, + match="Type annotation can only be given to a single literal."): keywords_error() @@ -44,5 +48,35 @@ def expr_error(): a = 1 b = ti.f16(a) - with pytest.raises(ti.TaichiSyntaxError): + with pytest.raises( + ti.TaichiSyntaxError, + match="Type annotation can only be given to a single literal."): expr_error() + + +@test_utils.test() +def test_literal_int_annotation_error(): + @ti.kernel + def int_annotation_error(): + a = ti.f32(0) + + with pytest.raises( + ti.TaichiTypeError, + match= + "Integer literals must be annotated with a integer type. For type casting, use `ti.cast`." + ): + int_annotation_error() + + +@test_utils.test() +def test_literal_float_annotation_error(): + @ti.kernel + def float_annotation_error(): + a = ti.i32(0.0) + + with pytest.raises( + ti.TaichiTypeError, + match= + "Floating-point literals must be annotated with a floating-point type. For type casting, use `ti.cast`." 
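
The sharpened messages above separate two failure modes: passing multiple arguments, keywords, or a non-literal expression to a type annotation is a `TaichiSyntaxError`, while annotating a literal with the wrong numeric family is a `TaichiTypeError`. A short sketch of the intended usage next to one of the failing cases the tests check:

```python
import taichi as ti

ti.init(arch=ti.cpu)

@ti.kernel
def ok() -> ti.f32:
    a = ti.i32(2)    # integer literal, integer annotation: fine
    b = ti.f32(0.5)  # float literal, float annotation: fine
    return a * b

print(ok())  # 1.0

@ti.kernel
def wrong_family():
    c = ti.f32(0)  # integer literal annotated with a float type

# wrong_family() raises ti.TaichiTypeError:
# "Integer literals must be annotated with a integer type. For type casting, use `ti.cast`."
```
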
+ ): + float_annotation_error() diff --git a/tests/python/test_loops.py b/tests/python/test_loops.py index 588bf302d34ca..81b0456df8cd3 100644 --- a/tests/python/test_loops.py +++ b/tests/python/test_loops.py @@ -172,3 +172,18 @@ def func(): x[None] = 1 func() assert x[None] == 1 + + +@test_utils.test() +def test_break_in_outermost_for_not_in_outermost_scope(): + @ti.kernel + def foo() -> ti.i32: + a = 0 + if True: + for i in range(1000): + if i == 100: + break + a += 1 + return a + + assert foo() == 100 diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py index 64576cefd8f25..179cbefedc31a 100644 --- a/tests/python/test_sparse_matrix.py +++ b/tests/python/test_sparse_matrix.py @@ -1,14 +1,20 @@ +import pytest + import taichi as ti from tests import test_utils +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_builder_deprecated_anno(): +def test_sparse_matrix_builder_deprecated_anno(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel - def fill(Abuilder: ti.linalg.sparse_matrix_builder()): + def fill(Abuilder: ti.types.sparse_matrix_builder()): for i, j in ti.ndrange(n, n): Abuilder[i, j] += i + j @@ -19,10 +25,14 @@ def fill(Abuilder: ti.linalg.sparse_matrix_builder()): assert A[i, j] == i + j +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_builder(): +def test_sparse_matrix_builder(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -36,10 +46,14 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[i, j] == i + j +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_shape(): +def test_sparse_matrix_shape(dtype): n, m = 8, 9 - Abuilder = ti.linalg.SparseMatrixBuilder(n, m, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + m, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -51,10 +65,14 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A.shape() == (n, m) +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_element_access(): +def test_sparse_matrix_element_access(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -67,10 +85,14 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[i, i] == i +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_element_modify(): +def test_sparse_matrix_element_modify(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -83,11 +105,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[0, 0] == 1024.0 +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_addition(): +def 
test_sparse_matrix_addition(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) - Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) + Bbuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -105,11 +134,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[i, j] == 2 * i +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_subtraction(): +def test_sparse_matrix_subtraction(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) - Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) + Bbuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -127,10 +163,14 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[i, j] == 2 * j +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_scalar_multiplication(): +def test_sparse_matrix_scalar_multiplication(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -145,10 +185,14 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert B[i, j] == 3 * (i + j) +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_transpose(): +def test_sparse_matrix_transpose(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -163,11 +207,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert B[i, j] == A[j, i] +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_elementwise_multiplication(): +def test_sparse_matrix_elementwise_multiplication(dtype): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) - Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) + Bbuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -185,11 +236,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[i, j] == (i + j) * (i - j) +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_multiplication(): +def test_sparse_matrix_multiplication(dtype): n = 2 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) - Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) + Bbuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -208,11 +266,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[1, 
1] == -1.0 +@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_nonsymmetric_multiplication(): +def test_sparse_matrix_nonsymmetric_multiplication(dtype): n, k, m = 2, 3, 4 - Abuilder = ti.linalg.SparseMatrixBuilder(n, k, max_num_triplets=100) - Bbuilder = ti.linalg.SparseMatrixBuilder(k, m, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + k, + max_num_triplets=100, + dtype=dtype) + Bbuilder = ti.linalg.SparseMatrixBuilder(k, + m, + max_num_triplets=100, + dtype=dtype) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(),
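
All of the arithmetic tests above now run for both `ti.f32` and `ti.f64` builders. A compact sketch combining a few of the exercised operations on `f64` builders; the operator spellings follow the existing `SparseMatrix` Python API and should be treated as assumptions where the corresponding assertion is not visible in this hunk:

```python
import taichi as ti

ti.init(arch=ti.cpu)

n = 4
A_builder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, dtype=ti.f64)
B_builder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, dtype=ti.f64)

@ti.kernel
def fill(A: ti.types.sparse_matrix_builder(),
         B: ti.types.sparse_matrix_builder()):
    for i in range(n):
        A[i, i] += 2.0
        B[i, i] += 1.0

fill(A_builder, B_builder)
A, B = A_builder.build(), B_builder.build()

C = A + B    # addition, as in test_sparse_matrix_addition
D = A - B    # subtraction
E = A * 3.0  # scalar multiplication
F = A @ B    # matrix-matrix product (assumed spelling; not shown in this hunk)

print(C[0, 0], D[0, 0], E[0, 0], F[0, 0])  # 3.0 1.0 6.0 2.0
```
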