From d34f521bf5f9280c6139483a3e45d39e4427453a Mon Sep 17 00:00:00 2001 From: weilinquan <352977670@qq.com> Date: Sat, 9 Dec 2023 10:10:06 +0800 Subject: [PATCH] [frontend] Add Initial Graph Infra. Co-authored-by: xtayex Co-authored-by: zhanghb97 --- examples/BuddyBert/import-bert.py | 8 +- examples/BuddyGraph/README.md | 23 + examples/BuddyGraph/import-dynamo-break.py | 63 + examples/BuddyLlama/import-llama2.py | 15 +- examples/BuddyLlama/llama-main.cpp | 3 - examples/BuddyPython/bert.py | 10 +- examples/BuddyPython/module_gen.py | 29 +- examples/BuddyResNet18/import-resnet18.py | 45 + examples/lit.cfg.py | 1 + frontend/Interfaces/buddy/LLM/TextContainer.h | 2 +- frontend/Python/frontend.py | 627 ++-- frontend/Python/graph/__init__.py | 23 + frontend/Python/graph/graph.py | 487 ++++ frontend/Python/graph/operation.py | 456 +++ frontend/Python/graph/transform/__init__.py | 1 + .../graph/transform/useless_op_eliminate.py | 66 + frontend/Python/graph/type.py | 79 + frontend/Python/ops/linalg.py | 2590 ++++++----------- frontend/Python/ops/math.py | 11 +- frontend/Python/ops/tosa.py | 583 +++- frontend/Python/ops/utils.py | 56 + requirements.txt | 1 + tests/Python/test_addmm.py | 8 +- tests/Python/test_amax.py | 8 +- tests/Python/test_arange.py | 13 +- tests/Python/test_arith_add.py | 13 +- tests/Python/test_arith_div.py | 12 +- tests/Python/test_arith_mul.py | 15 +- tests/Python/test_arith_sub.py | 8 +- tests/Python/test_bmm.py | 15 +- tests/Python/test_cat.py | 12 +- tests/Python/test_clone.py | 14 +- tests/Python/test_convert_element_type.py | 10 +- tests/Python/test_convolution_default.py | 42 + tests/Python/test_embedding.py | 31 +- tests/Python/test_exp.py | 8 +- tests/Python/test_full.py | 12 +- tests/Python/test_index.py | 12 +- tests/Python/{test_expand.py => test_iota.py} | 27 +- tests/Python/test_lt.py | 14 +- tests/Python/test_masked_fill.py | 12 +- tests/Python/test_max_pool2d.py | 44 + tests/Python/test_mean.py | 26 +- tests/Python/test_mm.py | 12 +- tests/Python/test_neg.py | 14 +- tests/Python/test_ones.py | 12 +- tests/Python/test_permute.py | 8 +- tests/Python/test_pow.py | 12 +- tests/Python/test_reciprocal.py | 36 + tests/Python/test_relu.py | 36 + tests/Python/test_reshape.py | 8 +- tests/Python/test_rsqrt.py | 15 +- tests/Python/test_rsub.py | 20 +- tests/Python/test_select.py | 8 +- tests/Python/test_sigmoid.py | 35 + tests/Python/test_silu.py | 12 +- tests/Python/test_slice.py | 12 +- tests/Python/test_softmax.py | 22 +- tests/Python/test_sqrt.py | 36 + tests/Python/test_squeeze.py | 12 +- tests/Python/test_sum.py | 8 +- tests/Python/test_t.py | 16 +- tests/Python/test_tanh.py | 8 +- tests/Python/test_to_copy.py | 12 +- tests/Python/test_transpose.py | 13 +- tests/Python/test_unsqueeze.py | 12 +- tests/Python/test_var_mean.py | 37 +- tests/Python/test_view.py | 18 +- tests/Python/test_where.py | 38 + 69 files changed, 3766 insertions(+), 2241 deletions(-) create mode 100644 examples/BuddyGraph/README.md create mode 100644 examples/BuddyGraph/import-dynamo-break.py create mode 100644 examples/BuddyResNet18/import-resnet18.py create mode 100644 frontend/Python/graph/__init__.py create mode 100644 frontend/Python/graph/graph.py create mode 100644 frontend/Python/graph/operation.py create mode 100644 frontend/Python/graph/transform/__init__.py create mode 100644 frontend/Python/graph/transform/useless_op_eliminate.py create mode 100644 frontend/Python/graph/type.py create mode 100644 frontend/Python/ops/utils.py create mode 100644 tests/Python/test_convolution_default.py 
rename tests/Python/{test_expand.py => test_iota.py} (52%) create mode 100644 tests/Python/test_max_pool2d.py create mode 100644 tests/Python/test_reciprocal.py create mode 100644 tests/Python/test_relu.py create mode 100644 tests/Python/test_sigmoid.py create mode 100644 tests/Python/test_sqrt.py create mode 100644 tests/Python/test_where.py diff --git a/examples/BuddyBert/import-bert.py b/examples/BuddyBert/import-bert.py index c2044cb037..92e8e055ea 100644 --- a/examples/BuddyBert/import-bert.py +++ b/examples/BuddyBert/import-bert.py @@ -46,12 +46,16 @@ "attention_mask": torch.tensor([[1 for _ in range(5)]], dtype=torch.int64), } with torch.no_grad(): - module, params = dynamo_compiler.importer(model, **inputs) + graphs = dynamo_compiler.importer(model, **inputs) +assert len(graphs) == 1 +graph = graphs[0] +params = dynamo_compiler.imported_params[graph] +graph.lower_to_top_level_ir(do_params_pack=True) current_path = os.path.dirname(os.path.abspath(__file__)) with open(Path(current_path) / "bert.mlir", "w") as module_file: - module_file.write(str(module)) + module_file.write(str(graph._imported_module)) float32_param = np.concatenate( [param.detach().numpy().reshape([-1]) for param in params[:-1]] diff --git a/examples/BuddyGraph/README.md b/examples/BuddyGraph/README.md new file mode 100644 index 0000000000..d7b977f57e --- /dev/null +++ b/examples/BuddyGraph/README.md @@ -0,0 +1,23 @@ +# Buddy Graph Representation Examples + +## Run the Examples + +0. Enter your Python Env +``` +(base)$ conda activate buddy +(buddy)$ ... +``` +1. Build Python Packages +2. Configure Python Path +``` +(buddy)$ cd buddy-mlir/build +(buddy)$ export BUDDY_MLIR_BUILD_DIR=$PWD +(buddy)$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build +(buddy)$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH} + +``` +3. Run the Examples +``` +(buddy)$ cd examples/BuddyGraph +(buddy)$ python import-dynamo-break.py +``` \ No newline at end of file diff --git a/examples/BuddyGraph/import-dynamo-break.py b/examples/BuddyGraph/import-dynamo-break.py new file mode 100644 index 0000000000..42bbed6030 --- /dev/null +++ b/examples/BuddyGraph/import-dynamo-break.py @@ -0,0 +1,63 @@ +# ===- import-dynamo-break.py -------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# The example for dynamo graph break, import, and execute. 
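+# TorchDynamo breaks the FX graph at the data-dependent `if` in the forward
+# function below, so the importer can yield more than one subgraph; the loop
+# at the end of this file lowers and prints each of them.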
+# +# ===--------------------------------------------------------------------------- + +import torch +import torch._dynamo as dynamo +from torch._inductor.decomposition import decompositions as inductor_decomp +from torch._functorch.aot_autograd import aot_autograd_decompositions + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +class TestModule(torch.nn.Module): + def __init__(self, *args, **kwargs) -> None: + super().__init__(*args, **kwargs) + + def forward(self, b, c): + if torch.nn.functional.silu(b)[0][0]: + return torch.add(b, c) + else: + return torch.matmul(b, c) + +# Define a PyTorch model and run it with PyTorch runtime. +model = TestModule() +a, b = torch.randn((1024, 1024)), torch.randn((1024, 1024)) +print(model(a, b)) + +# JIT Mode +# Initialize Buddy Dynamo Compiler to compile and execute the PyTorch model. +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=aot_autograd_decompositions +) +model_opt = torch.compile(model, backend=dynamo_compiler) +print(model_opt(a, b)) + +torch._dynamo.reset() + +# AOT Mode +# Import PyTorch model to Buddy Graph and MLIR/LLVM IR. +graphs = dynamo_compiler.importer( + model, a, b +) +for g in graphs: + g.lower_to_top_level_ir() + print(g._imported_module) diff --git a/examples/BuddyLlama/import-llama2.py b/examples/BuddyLlama/import-llama2.py index d5a3a29e1b..47eb9e61ec 100644 --- a/examples/BuddyLlama/import-llama2.py +++ b/examples/BuddyLlama/import-llama2.py @@ -19,11 +19,13 @@ # ===--------------------------------------------------------------------------- import os +import time import numpy import torch from transformers import LlamaForCausalLM, LlamaTokenizer from torch._functorch.aot_autograd import aot_autograd_decompositions +from torch._inductor.decomposition import decompositions as inductor_decomp from buddy.compiler.frontend import DynamoCompiler from buddy.compiler.ops import tosa @@ -44,19 +46,22 @@ # Initialize Dynamo Compiler with specific configurations as an importer. dynamo_compiler = DynamoCompiler( primary_registry=tosa.ops_registry, - aot_autograd_decomposition=aot_autograd_decompositions, + aot_autograd_decomposition=inductor_decomp, ) # Import the model into MLIR module and parameters. with torch.no_grad(): - gm, params = dynamo_compiler.importer( - model, torch.tensor([[1 for _ in range(40)]], dtype=torch.int64) - ) + data = torch.tensor([[1 for i in range(40)]], dtype=torch.int64) + graphs = dynamo_compiler.importer(model, data) +assert len(graphs) == 1 +graph = graphs[0] +params = dynamo_compiler.imported_params[graph] +graph.lower_to_top_level_ir(True) path_prefix = os.path.dirname(os.path.abspath(__file__)) # Write the MLIR module to the file. with open(os.path.join(path_prefix, "llama.mlir"), "w") as module_file: - print(gm, file=module_file) + print(graph._imported_module, file=module_file) # Concatenate all parameters into a single numpy array and write to a file. 
 all_param = numpy.concatenate(
diff --git a/examples/BuddyLlama/llama-main.cpp b/examples/BuddyLlama/llama-main.cpp
index 78b5cec027..55530a01c2 100644
--- a/examples/BuddyLlama/llama-main.cpp
+++ b/examples/BuddyLlama/llama-main.cpp
@@ -18,12 +18,9 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
-#include
-#include
 
 using namespace buddy;
 
diff --git a/examples/BuddyPython/bert.py b/examples/BuddyPython/bert.py
index 7f4f004359..e57dc991b1 100644
--- a/examples/BuddyPython/bert.py
+++ b/examples/BuddyPython/bert.py
@@ -15,6 +15,10 @@
 text = "Replace me by any text you'd like."
 encoded_text = tokenizer(text, return_tensors="pt")
 with torch.no_grad():
-    module, params = dynamo_compiler.importer(model, **encoded_text)
-    print(module)
-    print(params)
+    graphs = dynamo_compiler.importer(model, **encoded_text)
+
+graph = graphs[0]
+params = dynamo_compiler.imported_params[graph]
+graph.lower_to_top_level_ir(do_params_pack=True)
+print(graph._imported_module)
+print(params)
diff --git a/examples/BuddyPython/module_gen.py b/examples/BuddyPython/module_gen.py
index 10a1e2ee1c..e2c722cebf 100644
--- a/examples/BuddyPython/module_gen.py
+++ b/examples/BuddyPython/module_gen.py
@@ -43,23 +43,12 @@ def foo(x, y):
     aot_autograd_decomposition=inductor_decomp,
 )
 
-# The first way to generate an MLIR Module:
-# Pass the function and input data to the dynamo compiler's importer,
-# and accepts the generated module and weight parameters.
-module, params = dynamo_compiler.importer(foo, *(float32_in1, float32_in2))
-
-print(module)
-print(params)
-
-# The second way to generate an MLIR Module:
-# Execute the target function using a define-by-run style,
-# and get the module and weight parameters from the dynamo compiler's attribute.
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-
-foo_mlir(float32_in1, float32_in2)
-print(dynamo_compiler.imported_module)
-print(dynamo_compiler.imported_params)
-
-foo_mlir(int32_in1, int32_in2)
-print(dynamo_compiler.imported_module)
-print(dynamo_compiler.imported_params)
+# Pass the function and input data to the dynamo compiler's importer; the
+# importer first builds a Buddy Graph. Then, lower the graph to top-level IR
+# (tosa, linalg, etc.). Finally, accept the generated module and weight parameters.
+graphs = dynamo_compiler.importer(foo, *(float32_in1, float32_in2))
+graph = graphs[0]
+graph.lower_to_top_level_ir(do_params_pack=True)
+
+print(graph._imported_module)
+print(dynamo_compiler.imported_params[graph])
diff --git a/examples/BuddyResNet18/import-resnet18.py b/examples/BuddyResNet18/import-resnet18.py
new file mode 100644
index 0000000000..c58f4a604a
--- /dev/null
+++ b/examples/BuddyResNet18/import-resnet18.py
@@ -0,0 +1,45 @@
+# ===- import-resnet18.py ------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is an import example of the ResNet-18 model.
+# +# ===--------------------------------------------------------------------------- + +import torch +import torchvision +from torch._inductor.decomposition import decompositions as inductor_decomp + +from buddy.compiler.frontend import DynamoCompiler +from buddy.compiler.ops import tosa + + +model = torchvision.models.resnet18() +model = model.eval() + +# Initialize Dynamo Compiler with specific configurations as an importer. +dynamo_compiler = DynamoCompiler( + primary_registry=tosa.ops_registry, + aot_autograd_decomposition=inductor_decomp, +) + +data = torch.randn([1, 3, 224, 224]) +# Import the model into MLIR module and parameters. +with torch.no_grad(): + graphs = dynamo_compiler.importer(model, data) + +assert len(graphs) == 1 +graphs[0].lower_to_top_level_ir(do_params_pack=True) +print(graphs[0]._imported_module) diff --git a/examples/lit.cfg.py b/examples/lit.cfg.py index fa37a2e51f..724d9cdaad 100644 --- a/examples/lit.cfg.py +++ b/examples/lit.cfg.py @@ -36,6 +36,7 @@ # subdirectories contain auxiliary inputs for various tests in their parent # directories. config.excludes = [ + 'BuddyBert', 'BuddyLlama', 'BuddyBert', 'ConvOpt', diff --git a/frontend/Interfaces/buddy/LLM/TextContainer.h b/frontend/Interfaces/buddy/LLM/TextContainer.h index b5e307abdf..28432b3c19 100644 --- a/frontend/Interfaces/buddy/LLM/TextContainer.h +++ b/frontend/Interfaces/buddy/LLM/TextContainer.h @@ -325,7 +325,7 @@ template std::string Text::revertLlama() { const int CLS_ID = 1; const int SEP_ID = 2; - for (size_t i = 0; i < this->getSize(); i++) { + for (size_t i = 0; i < this->tokenCnt; i++) { int id = this->aligned[i]; if (id == PAD_ID || id == CLS_ID) continue; diff --git a/frontend/Python/frontend.py b/frontend/Python/frontend.py index 24002fd641..e89597800c 100644 --- a/frontend/Python/frontend.py +++ b/frontend/Python/frontend.py @@ -16,32 +16,44 @@ # # This is the entry of the Buddy Compiler frontend. # +# TODO[Low]: When integrating more frameworks, `frontend.py` acts as a unified +# entry and driver, separating out compilers/importers for various platforms +# (e.g. DynamoCompiler). +# # ===--------------------------------------------------------------------------- -import operator from typing import Any, List, Optional -import functools +import operator +import os +import ctypes +import platform -import mlir.dialects.func as func import mlir.ir as ir +import mlir.dialects.func as func +from mlir.passmanager import * +from mlir.execution_engine import * +from mlir import runtime as rt import torch import torch._dynamo as dynamo from torch._functorch.aot_autograd import aot_module_simplified import torch.utils._pytree as pytree -from .ops.math import ops_registry as math_ops_registry -from .ops.tosa import ops_registry as tosa_ops_registry from .ops.linalg import ops_registry as linalg_ops_registry +from .ops.tosa import ops_registry as tosa_ops_registry +from .ops.math import ops_registry as math_ops_registry +from .graph import Graph, TensorDType, TensorMeta +from .graph.operation import * +from .graph.transform import maxpool2d_simplify class DynamoCompiler: """ Dynamo Compiler is one of the frontends of Buddy Compiler. - Dynamo Compiler acts as a custom compiler for the Torch Dynamo framework, - which converts an FX Graph into an equivalent MLIR module. + Dynamo Compiler acts as a custom compiler for the TorchDynamo framework, + which converts an FX Graph into an equivalent Buddy Graph and MLIR module. Attributes: - imported_module: The imported MLIR module after compilation. 
+        imported_graphs: The imported graphs.
         imported_params: The imported parameters from the model.
     """
 
@@ -50,80 +62,279 @@ def __init__(
         func_name: str = "forward",
         primary_registry: Optional[dict] = None,
         aot_autograd_decomposition: Optional[dict] = None,
-        do_param_pack: bool = True,
     ) -> None:
         """
         Initializes the Dynamo Compiler.
 
         Args:
-            func_name (str, optional): The function name to be used.
+            func_name: The function name to be used.
             primary_registry (dict, optional): The primary operations registry.
             aot_autograd_decomposition (Optional[dict], optional):
-                The ahead-of-time autograd decomposition dictionary.
+            The ahead-of-time autograd decomposition dictionary.
+        Attributes:
+            _func_name: The function name to be used.
+            _aot_autograd_decomposition (Optional[dict], optional):
+            The ahead-of-time autograd decomposition dictionary.
+            _imported_graphs: The Buddy Graphs produced by the dynamo importer.
+            _ops_registry (dict, optional): The registry of lowering functions
+            for buddy operations.
+            _imported_params: The model params extracted from torch.
+            _ops_map: The mapping from torch aten ops to buddy ops.
+
+        """
         if primary_registry is None:
             primary_registry = {}
         self._func_name = func_name
         self._aot_autograd_decomposition = aot_autograd_decomposition
-        self._imported_module = None
-        self._imported_params = None
-        self._do_param_pack = do_param_pack
+        self._imported_graphs = []
         self._ops_registry = {}
+        self._imported_params = {}
         self._ops_registry.update(math_ops_registry)
         self._ops_registry.update(linalg_ops_registry)
         self._ops_registry.update(tosa_ops_registry)
         self._ops_registry.update(primary_registry)
+        self._ops_map = {
+            "output": OutputOp,
+            "placeholder": PlaceholderOp,
+            "arange.start": ArangeOp,
+            "arange.default": ArangeOp,
+            "unsqueeze.default": UnsqueezeOp,
+            "view.default": ViewOp,
+            "ones.default": OnesOp,
+            "full.default": FullOp,
+            "lt.Tensor": LessThanOp,
+            "embedding.default": EmbeddingOp,
+            "masked_fill.Scalar": MaskedFillOp,
+            "slice.Tensor": SliceOp,
+            "expand.default": ExpandOp,
+            "_to_copy.default": ToCopyOp,
+            "rsub.Scalar": RsubOp,
+            "pow.Tensor_Scalar": PowOp,
+            "mean.dim": MeanOp,
+            "rsqrt.default": RsqrtOp,
+            "mul.Tensor": MulOp,
+            "t.default": TOp,
+            "mm.default": MatmulOp,
+            "transpose.int": TransposeOp,
+            "index.Tensor": IndexOp,
+            "neg.default": NegOp,
+            "cat.default": CatOp,
+            "squeeze.dim": SqueezeOp,
+            "bmm.default": BatchMatmulOp,
+            "div.Tensor": DivOp,
+            "_softmax.default": SoftmaxOp,
+            "clone.default": CloneOp,
+            "silu.default": SiluOp,
+            "add.Tensor": AddOp,
+            "addmm.default": AddMMOp,
+            "permute.default": PermuteOp,
+            "convert_element_type.default": ConvertElementTypeOp,
+            "sum.dim_IntList": SumDimOp,
+            "tanh.default": TanhOp,
+            "sub.Tensor": SubOp,
+            "var_mean.correction": VarMeanOp,
+            "amax.default": AmaxOp,
+            "select.int": SelectOp,
+            "exp.default": ExpOp,
+            "erf.default": ErfOp,
+            "getitem": GetItemOp,
+            "convolution.default": Conv2dOp,
+            "max_pool2d_with_indices.default": MaxPool2dWithIndicesOp,
+            "relu.default": ReluOp,
+            "iota.default": IotaOp,
+            "sigmoid.default": SigmoidOp,
+            "scalar_tensor.default": ScalarTensorOp,
+            "where.self": WhereOp,
+            "sqrt.default": SqrtOp,
+            "reciprocal.default": ReciprocalOp,
+        }
 
     @property
-    def imported_module(self):
-        """Returns the imported MLIR module after compilation."""
-        return self._imported_module
+    def imported_graphs(self):
+        """Returns the imported buddy graphs after compilation."""
+        return self._imported_graphs
 
     @property
     def imported_params(self):
-        """Returns the imported parameters from the model."""
+        """Returns the imported model params after compilation."""
        return self._imported_params
 
+    def _torch_dtype_translate(self, dtype):
+        match dtype:
+            case "torch.int64":
+                return TensorDType.Int64
+            case "torch.int32":
+                return TensorDType.Int32
+            case "torch.float16":
+                return TensorDType.Float16
+            case "torch.float32":
+                return TensorDType.Float32
+            case "torch.float64":
+                return TensorDType.Float64
+            case "torch.bool":
+                return TensorDType.Bool
+            case _:
+                raise NotImplementedError(f"Unsupported dtype: {dtype}")
+
+    def _create_node(
+        self,
+        gm_node_name: str,
+        node_name: str,
+        node_input: Tuple,
+        node_users: List[str],
+        node_output_shape: list = [],
+        node_output_dtype: TensorDType = None,
+        node_kwargs: Optional[Dict] = None,
+    ):
+        """
+        Create a buddy op node from a torch aten op.
+
+        Args:
+            gm_node_name: The torch aten op name, used to map to a buddy op
+            class via _ops_map.
+            node_name: The op node name to be used.
+            node_input: The input arguments of the op node.
+            node_users: The names of the nodes that consume this node's
+            output.
+            node_output_shape: The list of the op node's output shape.
+            node_output_dtype: The TensorDType enum type of the op node's
+            output data type.
+            node_kwargs: The remaining keyword attributes of the op node.
+        """
+        op_class = self._ops_map[gm_node_name]
+        buddy_node = op_class()
+        buddy_node._name = node_name
+        if gm_node_name == "output":
+            for input_arg in node_input[0]:
+                buddy_node.add_argument(str(input_arg))
+            return buddy_node
+        for input_arg in node_input:
+            if isinstance(input_arg, torch.fx.Node):
+                buddy_node.add_argument(str(input_arg))
+                buddy_node.add_parent(str(input_arg))
+            elif isinstance(input_arg, torch.dtype):
+                buddy_node.add_argument(self._torch_dtype_translate(str(input_arg)))
+            else:
+                buddy_node.add_argument(input_arg)
+        for user in node_users:
+            buddy_node.add_children(user)
+        if node_kwargs is None:
+            node_kwargs = {}
+        buddy_node._keyword_arguments.update(node_kwargs)
+        buddy_node._tensor_meta["shape"] = node_output_shape
+        buddy_node._tensor_meta["dtype"] = node_output_dtype
+        return buddy_node
+
     def _compile_fx(
         self, gm: torch.fx.GraphModule, inputs: List[torch.Tensor]
     ) -> Any:
         """
-        Compiles the provided FX Graph to MLIR module.
+        Compiles the provided FX Graph to Buddy Graph.
 
         Args:
             gm (torch.fx.GraphModule): The GraphModule to be compiled.
             inputs (List[torch.Tensor]): The input tensors.
 
         Returns:
-            Any: The result of the ahead-of-time compiled module.
+            dynamo_run: The function of the ahead-of-time compiled module,
+            returned for torchdynamo's call.
         """
 
-        def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]):
-            """Compile a FX graph in Aten/Prims IR to MLIR."""
-            func_params = _inputs[: len(self.imported_params)]
-            func_inputs = _inputs[len(self.imported_params) :]
-
-            # Initializes the MLIR context.
-            ctx = ir.Context()
-            with ir.Location.unknown(ctx):
-                fx_importer = FXGraphImporter(
-                    _gm,
-                    func_params,
-                    func_inputs,
-                    self._do_param_pack,
-                    self._func_name,
-                    self._ops_registry,
-                )
-                self._imported_module = fx_importer.import_graph()
-            # TODO: Lower to LLVM dialect and use JIT engine to execute.
- return _gm.forward - params = { **dict(gm.named_parameters(remove_duplicate=False)), **dict(gm.named_buffers(remove_duplicate=False)), } params_flat, _ = pytree.tree_flatten(params) - self._imported_params = params_flat + + def _compiler(_gm: torch.fx.GraphModule, _inputs: List[torch.Tensor]): + """Compile a FX graph in Aten/Prims IR to MLIR.""" + nonlocal params_flat + func_inputs = [] + for inp in _inputs[len(params_flat) :]: + inp_shape = inp.shape + inp_dtype = self._torch_dtype_translate(str(inp.dtype)) + func_inputs.append(TensorMeta(inp_shape, inp_dtype)) + fake_params = [] + for param in params_flat: + param_dtype = self._torch_dtype_translate(str(param.dtype)) + fake_params.append(TensorMeta(param.shape, param_dtype)) + graph = Graph( + func_inputs, + fake_params, + self._ops_registry, + self._func_name, + ) + for gm_node in _gm.graph.nodes: + node_users = [] + for user in gm_node.users.keys(): + node_users.append(str(user)) + if gm_node.op == "placeholder": + node_dtype = self._torch_dtype_translate( + str(gm_node.meta["tensor_meta"].dtype) + ) + buddy_node = self._create_node( + gm_node.op, + gm_node.name, + gm_node.args, + node_users, + gm_node.meta["tensor_meta"].shape, + node_dtype, + ) + + elif gm_node.op == "output": + buddy_node = self._create_node( + gm_node.op, + gm_node.name, + gm_node.args, + node_users + ) + + elif gm_node.target is operator.getitem: + node_dtype = self._torch_dtype_translate( + str(gm_node.meta["tensor_meta"].dtype) + ) + buddy_node = self._create_node( + str(gm_node.target.__name__), + gm_node.name, + gm_node.args, + node_users, + gm_node.meta["tensor_meta"].shape, + node_dtype, + ) + + else: + tensor_meta = gm_node.meta.get("tensor_meta") + val = gm_node.meta.get("val") + num_returns = len(gm_node.target._schema.returns) + if num_returns == 1: + node_dtype = self._torch_dtype_translate( + str(tensor_meta.dtype) + ) + node_shape = tensor_meta.shape + elif num_returns > 1: + node_dtype = tuple( + [ + self._torch_dtype_translate(str(val_item.dtype)) + for val_item in val + ] + ) + node_shape = tuple([val_item.shape for val_item in val]) + else: + raise RuntimeError("Zero returns is not supported.") + + buddy_node = self._create_node( + str(gm_node.target.__name__), + gm_node.name, + gm_node.args, + node_users, + node_shape, + node_dtype, + node_kwargs=gm_node.kwargs, + ) + + graph.add_node(buddy_node) + transform_list = [maxpool2d_simplify] + graph.perform(transform_list) + self._imported_graphs.append(graph) + self._imported_params[graph] = params_flat + return self.dynamo_run() return aot_module_simplified( gm, @@ -143,11 +354,12 @@ def __call__( inputs (List[torch.Tensor]): The input tensors. Returns: - Any: The result of the ahead-of-time compiled module. + dynamo_run: The function of the ahead-of-time compiled module, + return for torchdynamo's call. """ return self._compile_fx(gm, inputs) - def importer(self, model, *args, **kwargs): + def importer(self, model, *args, **kwargs) -> List[Graph]: """ Imports the provided model as MLIR module and flat parameters. @@ -157,212 +369,145 @@ def importer(self, model, *args, **kwargs): kwargs: Keyword arguments for the model. Returns: - module: The imported MLIR module. - params: The imported flat parameters. + imported_graphs: The imported buddy graphs. 
""" model_opt = dynamo.optimize(self._compile_fx)(model) model_opt(*args, **kwargs) - module = self._imported_module - params = self._imported_params - return module, params - - -class FXGraphImporter: - """ - Imports an FX graph and generates an MLIR module in high-level dialects. - - Attributes: - _symbol_table (dict): A dictionary to keep track of the symbols. - _gm (torch.fx.GraphModule): The FX graph module to be imported. - _func_name (str): Name of the generated MLIR function. - _inputs (List[torch.Tensor]): Input tensor(s) of the FX graph. - _num_input_visited (int): Number of input nodes that have been visited. - _module (mlir.ir.Module): The generated MLIR module. - _ops_registry (dict): Registry for the candidate operations. - """ - - def __init__( - self, - gm: torch.fx.GraphModule, - params: List[torch.Tensor], - inputs: List[torch.Tensor], - do_param_pack: bool = True, - func_name: str = "forward", - ops_registry: Optional[dict] = None, - ): - """ - Initializes the FX Graph importer. - - Args: - gm (torch.fx.GraphModule): The FX graph that will be imported. - inputs (List[torch.Tensor]): Input tensor(s) of the FX graph. - func_name (str): Name of the generated MLIR function. - ops_registry (dict): Registry for the candidate operations. - """ - if ops_registry is None: - ops_registry = {} - self._symbol_table = {} - self._gm = gm - self._func_name = func_name - self._params = params - self._inputs = inputs - self._do_param_pack = do_param_pack - self._param_packs = [] - self._num_input_visited = 0 - self._module = ir.Module.create() - self._ops_registry = ops_registry - self._current_param_pack_offset = None - - def _torch_dtype_to_mlir_dtype(self, dtype: torch.dtype) -> ir.Type: - """ - Converts a torch dtype to the corresponding MLIR dtype. - - Args: - dtype (torch.dtype): The torch data type. + return self._imported_graphs - Returns: - mlir.ir.Type: The corresponding MLIR data type. - - Raises: - NotImplementedError: If the given dtype is not supported. - """ - match dtype: - case torch.int32: - return ir.IntegerType.get_signless(32) - case torch.int64: - return ir.IntegerType.get_signless(64) - case torch.float32: - return ir.F32Type.get() - case torch.bool: - return ir.IntegerType.get_signless(1) - case _: - raise NotImplementedError(f"Unsupported dtype {dtype}") - - def _pack_params(self) -> None: - dtypes = list(set([param.dtype for param in self._params])) - dtypes.sort(key=str) - self._current_param_pack_offset = {dtype: 0 for dtype in dtypes} - for dtype in dtypes: - params_of_dtype = [ - param for param in self._params if param.dtype == dtype - ] - param_total_size = 0 - for param in params_of_dtype: - param_total_size += functools.reduce( - lambda x, y: x * y, list(param.shape) - ) - mlir_dtype = self._torch_dtype_to_mlir_dtype(dtype) - self._param_packs.append( - ir.RankedTensorType.get([param_total_size], mlir_dtype) - ) - - def import_graph(self) -> ir.Module: + def dynamo_run(self): """ - Imports FX graph and generates an MLIR module in high-level dialects. + A callable method that wraps around the `exec_buddy_graph` method. Returns: - mlir.ir.Module: An MLIR module in high-level dialects. 
- """ - with ir.InsertionPoint(self._module.body): - arguments = [] - if self._do_param_pack: - self._pack_params() - arguments.extend(self._param_packs) - inputs = self._inputs - else: - inputs = self._params + self._inputs - for arg in inputs: - shape_list = list(arg.shape) - torch_dtype = arg.dtype - mlir_dtype = self._torch_dtype_to_mlir_dtype(torch_dtype) - tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) - arguments.append(tensor_arg) - - @func.FuncOp.from_py_func(*arguments, name=self._func_name) - def generated_func(*args): - args_list = list(args) - for node in self._gm.graph.nodes: - if not ( - node.op in ["output", "placeholder", "call_function"] - or node.target is operator.getitem - ): - continue - if node.op == "output": - output_node_args = node.args[0] - returns = [ - self._symbol_table.get((str(output_arg), 0)) - for output_arg in output_node_args - ] - self._symbol_table[("output", 0)] = returns - elif node.op == "placeholder": - self._import_placeholder(node, args_list) - elif node.target is operator.getitem: - self._symbol_table[ - (str(node.name), 0) - ] = self._symbol_table[ - (str(node.args[0]), node.args[1]) - ] - else: - self._import_op(node) - - return self._symbol_table.get(("output", 0)) - - return self._module - - def _import_placeholder( - self, node: torch.fx.Node, args_list: List[ir.BlockArgument] - ): + exec_buddy_graph: The function of the ahead-of-time compiled module, + return for torchdynamo's call. """ - Imports a placeholder node from the FX graph. - Args: - node (torch.fx.Node): The FX node representing the placeholder. - args_list (List[mlir.ir.BlockArgument]): List of input tensors. - """ - if self._num_input_visited < len(self._params): - dtype = node.meta["tensor_meta"].dtype - pack_of_dtype = None - for pack in args_list: - if ir.RankedTensorType( - pack.type - ).element_type == self._torch_dtype_to_mlir_dtype(dtype): - pack_of_dtype = pack - break - placeholder_name = self._ops_registry["param.extract"]( - node, self._current_param_pack_offset[dtype], pack_of_dtype - ).result - self._current_param_pack_offset[dtype] += functools.reduce( - lambda x, y: x * y, list(node.meta["tensor_meta"].shape) - ) - else: - if len(self._params) > 0: - placeholder_name = args_list[ - self._num_input_visited - - len(self._params) - + len(self._param_packs) - ] + def get_lib_extension(): + if platform.system() == "Linux": + return ".so" + elif platform.system() == "Darwin": + return ".dylib" else: - placeholder_name = args_list[self._num_input_visited] - - self._symbol_table[(str(node.name), 0)] = placeholder_name - self._num_input_visited += 1 - - def _import_op(self, node: torch.fx.Node): - """ - Imports an operation node from the FX graph. - - Args: - node (torch.fx.Node): The FX node representing the operation. - - """ - op_name = node.target.__name__ - op_ret: ir.Operation | ir.Value | tuple | ir.OpResult = ( - self._ops_registry[op_name](node, self._symbol_table) + raise RuntimeError("Unsupported platform") + + # Dynamo's graph break may import more than one graph. + graph = self._imported_graphs[-1] + graph.compile() + # Collect dependency libraries. 
+ lib_extension = get_lib_extension() + lib_names = ["libmlir_runner_utils", "libmlir_c_runner_utils", "libomp"] + path_prefix = os.path.dirname(os.path.abspath(__file__)) + lib_base_path = os.path.join(path_prefix, "../../../../llvm/build/lib/") + lib_base_path = os.path.abspath(lib_base_path) + shared_libs = [ + os.path.join(lib_base_path, lib_name + lib_extension) + for lib_name in lib_names + ] + # Define execution engine. + ee = ExecutionEngine( + graph._imported_module, opt_level=3, shared_libs=shared_libs ) - if isinstance(op_ret, tuple): - for i, operation in enumerate(op_ret): - self._symbol_table[(str(node.name), i)] = operation.result - elif isinstance(op_ret, ir.OpResult): - self._symbol_table[(str(node.name), 0)] = op_ret - else: - self._symbol_table[(str(node.name), 0)] = op_ret.result + + def cast_c_ptr(outdata_ptr, memref_ptr): + """ + Casts a C pointer (`outdata_ptr`) to the type of another C pointer + (`memref_ptr`). + + Args: + outdata_ptr: ctypes.POINTER + The C pointer whose type needs to be cast. + memref_ptr: ctypes.POINTER + The reference C pointer whose type will be used for casting. + + Returns: + ctypes.POINTER + A new C pointer with the type of `memref_ptr`, representing the + same memory location as `outdata_ptr`. + + Example: + outdata = ctypes.pointer(ctypes.c_int()) + memref = ctypes.pointer(ctypes.c_float()) + casted_ptr = cast_c_ptr(outdata, memref) + # Now `casted_ptr` points to the same memory location as `outdata`, + but with the type of `memref`. + """ + outdata_addr = ctypes.addressof(outdata_ptr.contents) + out_ptr = ctypes.cast(outdata_addr, type(memref_ptr)) + return out_ptr + + def move_c_ptr(outdata_ptr, memref_ptr): + """ + Moves a C pointer (`outdata_ptr`) to the next element in memory, + based on the size of the referenced type in another C pointer + (`memref_ptr`). + + Args: + outdata_ptr: ctypes.POINTER + The C pointer whose position needs to be moved. + memref_ptr: ctypes.POINTER + The reference C pointer whose type determines the size of each + element for the move. + + Returns: + ctypes.POINTER + A new C pointer pointing to the next element in memory, based on + the size of the type referenced by `memref_ptr`. + """ + elem_size = ctypes.sizeof(memref_ptr.contents) + outdata_addr = ctypes.addressof(outdata_ptr.contents) + out_ptr = ctypes.cast(outdata_addr + elem_size, type(memref_ptr)) + return out_ptr + + def exec_buddy_graph(*args): + """ + Execute a graph using TorchDynamo with the provided input tensors. + + Args: + *args: List[torch.Tensor] + Input tensors to be passed to the graph's function. + + Returns: + List[torch.Tensor] + The result of executing the graph, represented as a list of + output tensors. + """ + # A list of ctypes pointers representing memory references for input + # tensors. + input_memref = [ + ctypes.pointer( + ctypes.pointer( + rt.get_ranked_memref_descriptor(tensor.numpy()) + ) + ) + for tensor in args + ] + # A list of ctypes pointers representing memory references for + # output tensors. 
+ output_memref = [ + ctypes.pointer(ctypes.pointer(graph._output_descriptor())) + ] + args_memref = output_memref + input_memref + # Invoke the graph's function using the provided execution engine + # and memory references + ee.invoke(graph._func_name, *args_memref) + + output_tensor = [] + outdata_ptr = args_memref[0][0] + # Iterate through each output memory reference in the graph + for output_ptr in graph._output_memref: + # Cast the output data pointer to the type of the current output + # memory reference + data_ptr = cast_c_ptr(outdata_ptr, output_ptr[0]) + # Convert the C data pointer to a NumPy array and append it to + # the output_tensor list + output_tensor.append(rt.ranked_memref_to_numpy(data_ptr)) + # Move to the next element in memory based on the size of the + # current output type + outdata_ptr = move_c_ptr(outdata_ptr, output_ptr[0]) + # Convert each NumPy array to a PyTorch tensor and return the list + # of tensors + return [torch.from_numpy(tensor) for tensor in output_tensor] + + return exec_buddy_graph diff --git a/frontend/Python/graph/__init__.py b/frontend/Python/graph/__init__.py new file mode 100644 index 0000000000..bd927a3c0d --- /dev/null +++ b/frontend/Python/graph/__init__.py @@ -0,0 +1,23 @@ +# ===- __init__.py ------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# Init the packages in graph directory. +# +# ===--------------------------------------------------------------------------- + +from .graph import Graph +from .operation import * +from .type import TensorDType, TensorMeta diff --git a/frontend/Python/graph/graph.py b/frontend/Python/graph/graph.py new file mode 100644 index 0000000000..be2ce438c4 --- /dev/null +++ b/frontend/Python/graph/graph.py @@ -0,0 +1,487 @@ +# ===- graph.py ---------------------------------------------------------------- +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the graph level of the Buddy Compiler frontend. 
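+# A Graph records the imported op nodes, lowers them to top-level MLIR
+# dialects, and then drives the LLVM lowering pipeline so the module can be
+# JIT-executed.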
+# +# ===--------------------------------------------------------------------------- + +from typing import Any, List, Optional +from types import FunctionType +import ctypes +import functools + +import numpy as np +import mlir.ir as ir +import mlir.dialects.func as func +from mlir.passmanager import * +from mlir.execution_engine import * +from mlir import runtime as rt + +from .operation import * +from .type import * + + +def make_output_memref_descriptor(ranks, dtypes): + """ + Make an output memref descriptor for the given memref ranks and dtypes. + + Parameters: + - ranks: List[int] + A list of integers representing the ranks of each memref. + - dtypes: List[str] + A list of strings representing the data types of each memref. + + Returns: + ctypes.Structure + An output memref descriptor struct. + + Example: + ranks = [2, 3, 1] + dtypes = [np.float32, np.int64, np.bool] + descriptor = make_output_memref_descriptor(ranks, dtypes) + # Use the descriptor in your code + """ + memref_descriptor = [] + for i, rank, dtype in zip(range(len(ranks)), ranks, dtypes): + memref_descriptor.append( + (str(i), rt.make_nd_memref_descriptor(rank, dtype)) + ) + + class OutputDescriptor(ctypes.Structure): + """Builds an output struct descriptor for the multi memref.""" + + _fields_ = memref_descriptor + + return OutputDescriptor + + +class Graph: + """ + Graph is a graph-level expression for the Buddy Compiler frontends. + It acts as a model compute graph, which converts a Graph into an equivalent + MLIR module. + + Attributes: + - _body: List[Op] + The sequence of operation nodes in the graph. + - _inputs: List[TensorMeta] + The model inputs represented as TensorMeta objects. + - _fake_params: List[TensorMeta] + The fake parameters represented as TensorMeta objects. + - device: str + The hardware for graph runtime. + - _imported_module: Union[None, ImportedModuleType] + The imported MLIR module after compilation, if set. + - _ops_registry: dict + The ops lower strategy for the graph. + - _func_name: str + The function name for the MLIR module. + - _ctx: ir.Context + The context of the MLIR module. + - _output_memref: Union[None, ctypes.POINTER] + The memref pointer in the MLIR function output, if set. + - _output_descriptor: Union[None, OutputDescriptorType] + The output descriptor for the MLIR function, if set. + - ee_: Union[None, ExecutionEngineType] + The execution engine for the graph, if set. + """ + + def __init__( + self, + inputs: List[TensorMeta], + fake_params: List[TensorMeta], + ops_registry: dict, + func_name: str, + ) -> None: + """ + Initializes the Graph. + + Args: + inputs: List[TensorMeta] + The model inputs represented as TensorMeta objects. + fake_params: List[TensorMeta] + The fake parameters represented as TensorMeta objects. + ops_registry: dict + The ops lower strategy for the graph. + func_name: str + The function name for the MLIR module. + """ + self._body = [] + self._inputs = inputs + self.node_table: Dict[str, Op] = {} + self._fake_params = fake_params + self.device = "cpu" + self._imported_module = None + self._ops_registry = ops_registry + self._func_name = func_name + self._ctx = ir.Context() + self._output_memref = None + self._output_descriptor = None + self.execution_engine = None + + def add_node(self, node: Op): + """ + Adds an operation node to the graph's body. + + Parameters: + - node: Op + The operation node to be added to the graph. 
+ + Returns: + None + + Example: + graph_instance = Graph(inputs, fake_params, ops_registry, func_name) + op_node = Op() + graph_instance.add_node(op_node) + # The op_node is now part of the graph's body + """ + self._body.append(node) + self.node_table[node.name] = node + + def perform(self, func_list: List[FunctionType]): + for transform_func in func_list: + transform_func(self) + + def lower_to_top_level_ir(self, do_params_pack=False): + """ + Lowers the graph to top-level MLIR dialects. + + Parameters: + - do_params_pack: bool, optional (default=False) + Flag indicating whether to perform parameters packing to one memref. + + Returns: + None + + Example: + graph_instance = Graph(inputs, fake_params, ops_registry, func_name) + graph_instance.lower_to_top_level_ir(do_params_pack=True) + # The graph is now lowered to top-level MLIR dialects + """ + with ir.Location.unknown(self._ctx): + fx_importer = GraphImporter( + self._body, + self._fake_params, + self._inputs, + do_params_pack, + self._func_name, + self._ops_registry, + ) + self._imported_module = fx_importer.import_graph() + outputs = fx_importer.get_output_nodes() + self._output_memref = [] + output_ranks = [] + output_dtypes = [] + for out_node in outputs: + out_type = ir.RankedTensorType(out_node.type) + shape = list(out_type.shape) + dtype = out_type.element_type + match str(dtype): + case "i1": + np_type = np.dtype(np.bool_) + case "i32": + np_type = np.dtype(np.int32) + case "i64": + np_type = np.dtype(np.int64) + case "f32": + np_type = np.dtype(np.float32) + case _: + raise NotImplementedError(f"Unsupported dtype {dtype}") + self._output_memref.append( + ctypes.pointer( + ctypes.pointer( + rt.make_nd_memref_descriptor( + len(shape), rt.as_ctype(np_type) + )() + ) + ) + ) + output_ranks.append(len(shape)) + output_dtypes.append(rt.as_ctype(np_type)) + self._output_descriptor = make_output_memref_descriptor( + output_ranks, output_dtypes + ) + + def lower_to_llvm_ir(self): + """ + Lower graph to llvm ir. + """ + if self._imported_module is None: + self.lower_to_top_level_ir() + + with ir.Location.unknown(self._ctx): + pm = PassManager("builtin.module") + pm.add("func.func(tosa-to-linalg-named)") + pm.add("func.func(tosa-to-linalg)") + pm.add("func.func(tosa-to-tensor)") + pm.add("func.func(tosa-to-arith)") + pm.run(self._imported_module.operation) + pm.add("arith-expand") + pm.add("eliminate-empty-tensors") + pm.add("empty-tensor-to-alloc-tensor") + pm.add("convert-elementwise-to-linalg") + pm.add('one-shot-bufferize') + pm.add("func.func(convert-linalg-to-affine-loops)") + pm.add("affine-loop-fusion") + pm.add("func.func(affine-parallelize)") + pm.add("lower-affine") + pm.add("convert-scf-to-openmp") + pm.add("func-bufferize") + pm.add("arith-bufferize") + pm.add("func.func(tensor-bufferize)") + pm.add("func.func(buffer-deallocation)") + pm.add("func.func(finalizing-bufferize)") + pm.add("expand-strided-metadata") + pm.add("convert-vector-to-llvm") + pm.add("memref-expand") + pm.add("arith-expand") + pm.add("convert-arith-to-llvm") + pm.add("finalize-memref-to-llvm") + pm.add("convert-scf-to-cf") + pm.add("func.func(llvm-request-c-wrappers)") + pm.add("convert-openmp-to-llvm") + pm.add("convert-math-to-llvm") + pm.add("convert-math-to-libm") + pm.add("convert-func-to-llvm") + pm.add("reconcile-unrealized-casts") + pm.run(self._imported_module.operation) + + def compile(self): + """ + Compile graph from Buddy Graph to LLVM IR. 
+ """ + self.lower_to_top_level_ir() + self.lower_to_llvm_ir() + + +class GraphImporter: + """ + Imports an buddy graph and generates an MLIR module in high-level dialects. + + Attributes: + _symbol_table (dict): A dictionary to keep track of the symbols. + _body (List[Op]): The FX graph module to be imported. + _func_name (str): Name of the generated MLIR function. + _inputs (List[TensorMeta]): Input tensor(s) of the FX graph. + _num_input_visited (int): Number of input nodes that have been visited. + _module (mlir.ir.Module): The generated MLIR module. + _ops_registry (dict): Registry for the candidate operations. + """ + + def __init__( + self, + body: List[Op], + params: List[TensorMeta], + inputs: List[TensorMeta], + do_param_pack: bool, + func_name: str, + ops_registry: dict, + ): + """ + Initializes the buddy Graph importer. + + Args: + gm (Graph): The buddy graph that will be imported. + inputs (List[TensorMeta]): Input tensor(s) of the buddy graph. + func_name (str): Name of the generated MLIR function. + ops_registry (dict): Registry for the candidate operations. + """ + if ops_registry is None: + ops_registry = {} + self._symbol_table = {} + self._body = body + self._func_name = func_name + self._params = params + self._inputs = inputs + self._do_param_pack = do_param_pack + self._param_packs = [] + self._num_input_visited = 0 + self._module = ir.Module.create() + self._ops_registry = ops_registry + self._current_param_pack_offset = None + + def _str_to_mlir_dtype(self, dtype: str) -> ir.Type: + """ + Converts a str to the corresponding MLIR dtype. + + Args: + dtype (str): The tensor type. + + Returns: + mlir.ir.Type: The corresponding MLIR data type. + + Raises: + NotImplementedError: If the given dtype is not supported. + """ + match dtype: + case TensorDType.Int32: + return ir.IntegerType.get_signless(32) + case TensorDType.Int64: + return ir.IntegerType.get_signless(64) + case TensorDType.Float32: + return ir.F32Type.get() + case TensorDType.Bool: + return ir.IntegerType.get_signless(1) + case _: + raise NotImplementedError(f"Unsupported dtype {dtype}") + + def _pack_params(self) -> None: + """ + Packs parameters of the graph to one memref. + + Returns: + None + + Example: + graph_instance = Graph(inputs, fake_params, ops_registry, func_name) + graph_instance._pack_params() + # The parameters of the graph are now packed to one memref. + """ + dtypes = list(set([param.dtype for param in self._params])) + dtypes.sort(key=str) + self._current_param_pack_offset = {dtype: 0 for dtype in dtypes} + for dtype in dtypes: + params_of_dtype = [ + param for param in self._params if param.dtype == dtype + ] + param_total_size = 0 + for param in params_of_dtype: + param_total_size += functools.reduce( + lambda x, y: x * y, list(param.shape), 1 + ) + mlir_dtype = self._str_to_mlir_dtype(dtype) + self._param_packs.append( + ir.RankedTensorType.get([param_total_size], mlir_dtype) + ) + + def import_graph(self) -> ir.Module: + """ + Imports buddy graph and generates an MLIR module in high-level dialects. + + Returns: + mlir.ir.Module: An MLIR module in high-level dialects. 
+ """ + with ir.InsertionPoint(self._module.body): + arguments = [] + if self._do_param_pack: + self._pack_params() + arguments.extend(self._param_packs) + inputs = self._inputs + else: + inputs = self._params + self._inputs + for arg in inputs: + shape_list = list(arg.shape) + dtype = arg.dtype + mlir_dtype = self._str_to_mlir_dtype(dtype) + tensor_arg = ir.RankedTensorType.get(shape_list, mlir_dtype) + arguments.append(tensor_arg) + + @func.FuncOp.from_py_func(*arguments, name=self._func_name) + def generated_func(*args): + args_list = list(args) + for node in self._body: + if isinstance(node, OutputOp): + output_node_args = node.args + returns = [ + self._symbol_table.get((str(output_arg), 0)) + for output_arg in output_node_args + ] + self._symbol_table[("output", 0)] = returns + elif isinstance(node, PlaceholderOp): + self._import_placeholder(node, args_list) + elif isinstance(node, GetItemOp): + self._symbol_table[ + (str(node.name), 0) + ] = self._symbol_table[ + (str(node.args[0]), node.args[1]) + ] + else: + self._import_op(node) + + return self._symbol_table.get(("output", 0)) + + return self._module + + def _import_placeholder( + self, node: PlaceholderOp, args_list: List[ir.BlockArgument] + ): + """ + Imports a placeholder node from the Buddy graph. + + Parameters: + - node (PlaceholderOp): The PlaceholderOp node representing the + placeholder. + - args_list (List[mlir.ir.BlockArgument]): List of input memrefs. + + Returns: + None + """ + if self._num_input_visited < len(self._params) and self._do_param_pack: + dtype = node.tensor_meta["dtype"] + pack_of_dtype = None + for pack in args_list: + if ir.RankedTensorType( + pack.type + ).element_type == self._str_to_mlir_dtype(dtype): + pack_of_dtype = pack + break + placeholder_name = self._ops_registry["param.extract"]( + node, self._current_param_pack_offset[dtype], pack_of_dtype + ).result + self._current_param_pack_offset[dtype] += functools.reduce( + lambda x, y: x * y, list(node.tensor_meta["shape"]), 1 + ) + elif self._do_param_pack: + if len(self._params) > 0: + placeholder_name = args_list[ + self._num_input_visited + - len(self._params) + + len(self._param_packs) + ] + else: + placeholder_name = args_list[self._num_input_visited] + else: + placeholder_name = args_list[self._num_input_visited] + + self._symbol_table[(str(node.name), 0)] = placeholder_name + self._num_input_visited += 1 + + def _import_op(self, node: Op): + """ + Imports an operation node from the buddy graph. + + Args: + node (Op): The buddy node representing the operation. + + """ + op_name = node.__class__.__name__ + op_ret: ir.Operation | ir.Value | tuple | ir.OpResult = ( + self._ops_registry[op_name](node, self._symbol_table) + ) + if isinstance(op_ret, tuple): + for i, operation in enumerate(op_ret): + self._symbol_table[(str(node.name), i)] = operation.result + elif isinstance(op_ret, ir.OpResult): + self._symbol_table[(str(node.name), 0)] = op_ret + else: + self._symbol_table[(str(node.name), 0)] = op_ret.result + + def get_output_nodes(self): + """ + Get output nodes from the lowered mlir func. 
+ """ + return self._symbol_table.get(("output", 0)) diff --git a/frontend/Python/graph/operation.py b/frontend/Python/graph/operation.py new file mode 100644 index 0000000000..550f3f3211 --- /dev/null +++ b/frontend/Python/graph/operation.py @@ -0,0 +1,456 @@ +# ===- operation.py ------------------------------------------------------------ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# ===--------------------------------------------------------------------------- +# +# This is the operation structure of Buddy Compiler graph representation. +# +# ===--------------------------------------------------------------------------- + +from enum import Enum +from typing import Dict, Optional, List, Tuple + +from .type import TensorDType, TensorMeta + + +class OpType(Enum): + """ + Enum class for declaring operation types. + + Members: + - BroadcastType: int + Represents a broadcast operation. + - ElementwiseType: int + Represents an elementwise operation. + - ReshapeType: int + Represents a reshape operation. + - ReduceType: int + Represents a reduction operation. + - ConcatType: int + Represents a concatenation operation. + - PlaceholderType: int + Represents a placeholder operation. + - GetItemType: int + Represents an operation to retrieve an item. + + Note: The underlying values are integers for these operation types. + """ + + BroadcastType = 0 + ElementwiseType = 1 + ReshapeType = 2 + ReduceType = 3 + ConcatType = 4 + PlaceholderType = 5 + GetItemType = 6 + + +class Op: + """ + Base class for all operations in a computational graph. + + Attributes: + - _name: str + The unique name of the operation node. + - _arguments: list + The input arguments of the operation node. + - _keyword_arguments: dict + The keyword arguments of the operation node. + - _tensor_meta: dict + The metadata of the output tensor, including shape and data type. + - _op_type: OpType + The type of the operation node, as defined in the OpType enum. + """ + + def __init__(self) -> None: + """ + Initialize a new instance of the Op class. + """ + self._name = None + self._arguments = [] + self._keyword_arguments = {} + self._tensor_meta: List[TensorMeta] = {} + self._op_type: OpType = None + self._children: List[str] = [] + self._parents: List[str] = [] + + def add_argument(self, arg): + """ + Add an input argument to the operation node. + + Parameters: + - arg: Any + The input argument to be added. + """ + self._arguments.append(arg) + + def add_parent(self, parent: str): + """ + Add an parent node's name to the operation node. + + Parameters: + - parent: str + The parent node's name to be added. + """ + self._parents.append(parent) + + def add_children(self, child): + """ + Add an user node's name to the operation node. + + Parameters: + - user: str + The user node's name to be added. 
+ """ + self._children.append(child) + + @property + def args(self): + return self._arguments + + @property + def kwargs(self): + return self._keyword_arguments + + @property + def name(self): + return self._name + + @name.setter + def name(self, new_name): + self._name = new_name + + @property + def tensor_meta(self): + return self._tensor_meta + + +class PlaceholderOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class MatmulOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class GetItemOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.GetItemType + + +class OutputOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.GetItemType + + +class ArangeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class UnsqueezeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ViewOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class EmbeddingOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class OnesOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class FullOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + + +class LessThanOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class MaskedFillOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class SliceOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ToCopyOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class RsubOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class PowOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class MeanOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class RsqrtOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class MulOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class TransposeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class IndexOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class NegOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class CatOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ConcatType + + +class SqueezeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class BatchMatmulOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class DivOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class SoftmaxOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class CloneOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + 
+class SiluOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class AddOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class AddMMOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class AmaxOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class SubOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.BroadcastType + + +class ConvertElementTypeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class ExpOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class ExpandOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class PermuteOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ReshapeOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class SelectOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class SumDimOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class TanhOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class VarMeanOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + + +class TOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReshapeType + + +class ErfOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class Conv2dOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + self._layout = "NCHW_FCHW" + +class ReluOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class SigmoidOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class IotaOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + +class ScalarTensorOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.PlaceholderType + +class WhereOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + +class MaxPool2dWithIndicesOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + self._layout = "NCHW" + + +class MaxPool2dOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ReduceType + self._layout = "NCHW" + + +class ReciprocalOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType + + +class SqrtOp(Op): + def __init__(self) -> None: + super().__init__() + self._op_type = OpType.ElementwiseType diff --git a/frontend/Python/graph/transform/__init__.py b/frontend/Python/graph/transform/__init__.py new file mode 100644 index 0000000000..c4b7ac3d16 --- /dev/null +++ b/frontend/Python/graph/transform/__init__.py @@ -0,0 +1 @@ +from .useless_op_eliminate import maxpool2d_simplify \ No newline at end of file diff --git a/frontend/Python/graph/transform/useless_op_eliminate.py b/frontend/Python/graph/transform/useless_op_eliminate.py new file mode 100644 index 0000000000..1b3f592966 --- 
/dev/null
+++ b/frontend/Python/graph/transform/useless_op_eliminate.py
@@ -0,0 +1,66 @@
+# ===- useless_op_eliminate.py --------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# Simplify the maxpool2d with getitem op.
+#
+# ===---------------------------------------------------------------------------
+
+from .. import Graph
+from ..operation import *
+
+
+def maxpool2d_simplify(graph: Graph):
+    """
+    Fuse the maxpool op and getitem op to simplify the graph.
+
+    Args:
+        graph (Graph): The Graph to be simplified.
+    """
+    for i, node in enumerate(graph._body):
+        if isinstance(node, MaxPool2dWithIndicesOp):
+            getitem_num = 0
+            for user in node._children:
+                if isinstance(graph.node_table[user], GetItemOp):
+                    getitem_num += 1
+                    getitem_node = graph.node_table[user]
+            if (
+                getitem_num == 1
+                and len(node._children) == 1
+                and getitem_node.args[1] == 0
+            ):
+                new_node = MaxPool2dOp()
+                new_node.name = getitem_node.name
+                for arg in node.args:
+                    new_node.add_argument(arg)
+                for parent in node._parents:
+                    new_node.add_parent(parent)
+                for child in getitem_node._children:
+                    new_node.add_children(child)
+                new_node.tensor_meta["shape"] = getitem_node.tensor_meta[
+                    "shape"
+                ]
+                new_node.tensor_meta["dtype"] = getitem_node.tensor_meta[
+                    "dtype"
+                ]
+                new_node._layout = node._layout
+                del graph.node_table[node.name]
+                del graph.node_table[getitem_node.name]
+                graph.node_table[new_node.name] = new_node
+                del graph._body[i]
+                for j, op in enumerate(graph._body):
+                    if op == getitem_node:
+                        graph._body[j] = new_node
+                        break
diff --git a/frontend/Python/graph/type.py b/frontend/Python/graph/type.py
new file mode 100644
index 0000000000..5e1db3ed8a
--- /dev/null
+++ b/frontend/Python/graph/type.py
@@ -0,0 +1,79 @@
+# ===- type.py -----------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# This is the tensor type of the Buddy Compiler frontend.
+#
+# ===---------------------------------------------------------------------------
+
+from enum import Enum
+
+
+class TensorDType(Enum):
+    """
+    Enum class for declaring tensor data types.
+
+    Members:
+    - Int32: str
+        Represents the 32-bit integer data type.
+    - Int64: str
+        Represents the 64-bit integer data type.
+    - Float16: str
+        Represents the 16-bit floating-point data type.
+    - Float32: str
+        Represents the 32-bit floating-point data type.
+    - Float64: str
+        Represents the 64-bit floating-point data type.
+    - Bool: str
+        Represents the boolean data type.
+    """
+
+    Int32 = "int32"
+    Int64 = "int64"
+    Float16 = "float16"
+    Float32 = "float32"
+    Float64 = "float64"
+    Bool = "bool"
+
+
+class TensorMeta:
+    """
+    Store tensor metadata, including shape and data type, while overlooking raw
+    data.
+
+    Attributes:
+    - shape: tuple
+        Represents the shape of the tensor.
+    - dtype: str
+        Represents the data type of the tensor.
+
+    Methods:
+    - __init__(shape: tuple, dtype: str) -> None:
+        Initializes a new instance of the TensorMeta class with the specified
+        shape and data type.
+
+    Example:
+    meta = TensorMeta(shape=(3, 4), dtype='float32')
+    # Access metadata attributes: meta.shape, meta.dtype
+    """
+
+    def __init__(self, shape, dtype) -> None:
+        """
+        Initialize a new instance of the TensorMeta class.
+
+        Parameters:
+        - shape: tuple
+            Represents the shape of the tensor.
+        - dtype: str
+            Represents the data type of the tensor.
+        """
+        self.shape = shape
+        self.dtype = dtype
diff --git a/frontend/Python/ops/linalg.py b/frontend/Python/ops/linalg.py
index 6a6e161c93..0a22478e13 100644
--- a/frontend/Python/ops/linalg.py
+++ b/frontend/Python/ops/linalg.py
@@ -14,29 +14,70 @@
 #
 # ===---------------------------------------------------------------------------
 #
-# The registry of mappings from Torch node to MLIR linalg dialect operations.
+# The registry of mappings from Buddy Graph to MLIR linalg dialect operations.
 #
 # ===---------------------------------------------------------------------------
 
 from typing import Dict, Tuple, List
 
-import torch
-
 import mlir.ir as ir
 from mlir.dialects import tosa, linalg, arith, tensor, math
 import copy
 import numpy
 import functools
 
+from ..graph import *
+from ..graph.graph import TensorDType
+from .utils import *
+
+
+def add_op(node: AddOp, symbol_table: Dict[Tuple[str, int], ir.Operation]):
+    """
+    Import tensor add operation.
+    From buddy AddOp to MLIR TOSA `add` operation.
+
+    Note: if the second operand is a scalar, this function materializes it as
+    a splat constant tensor before the addition.
+
+    Args:
+        node: Containing information from the input graph node.
+        symbol_table: A dictionary mapping symbols to their corresponding
+        operations.
+
+    Returns:
+        op: The operation representing the result tensor of adding the two
+        input nodes.
+    """
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    shape = list(node.tensor_meta["shape"])
+    if isinstance(node.args[1], str):
+        input2 = symbol_table.get((str(node.args[1]), 0))
+    else:
+        data = [node.args[1]]
+        input2_shape = numpy.array(data).shape
+        tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype)
+        element = mlir_element_attr_get(dtype, node.args[1])
+        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+        input2 = arith.ConstantOp(tensor_type, attr).result
+    if input1 is None or input2 is None:
+        return
+    add_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype)
+    op = tosa.AddOp(
+        add_result_tensor_type,
+        input1,
+        input2,
+    )
+    return op.result
+
 
 def arange_op(
-    node: torch.fx.Node,
+    node: ArangeOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import tensor arange operation.
-    From PyTorch `aten.arange.default` and `aten.arange.start` operator to MLIR
-    arith `constant` operation.
+    From buddy ArangeOp to MLIR arith `constant` operation.
 
     Note: this function initializes an output tensor according to the input
     range.
@@ -49,51 +90,34 @@ def arange_op(
         op: The operation representing the result tensor of ranging the start
         and end from input node.
""" - if node.target.__name__ == "arange.start": + if len(node.args) == 2: start = int(node.args[0]) end = int(node.args[1]) - stride = int(node.meta["tensor_meta"].stride[0]) - dtype = str(node.meta["tensor_meta"].dtype) - shape = list(node.meta["tensor_meta"].shape) - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.array([i for i in range(start, end, stride)]), - signless=True, - type=tensor_type, - ) - op = arith.ConstantOp(tensor_type, attr) - - elif node.target.__name__ == "arange.default": + else: start = 0 end = int(node.args[0]) - stride = int(node.meta["tensor_meta"].stride[0]) - dtype = str(node.meta["tensor_meta"].dtype) - shape = list(node.meta["tensor_meta"].shape) - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.array([i for i in range(start, end, stride)]), - signless=True, - type=tensor_type, - ) - op = arith.ConstantOp(tensor_type, attr) + stride = 1 + dtype = node.tensor_meta["dtype"] + shape = list(node.tensor_meta["shape"]) + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(shape, dtype) + attr = ir.DenseElementsAttr.get( + numpy.array([i for i in range(start, end, stride)]), + signless=True, + type=tensor_type, + ) + op = arith.ConstantOp(tensor_type, attr) return op def unsqueeze_op( - node: torch.fx.Node, + node: UnsqueezeOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the unsqueeze operation. - From PyTorch `aten.unsqueeze.default` operator to MLIR TOSA `reshape` - operation. - - Note: "unsqueeze" means inserting a new dimension of size 1 at the specified - position. For more information, please refer to - https://pytorch.org/docs/stable/generated/torch.unsqueeze.html + From buddy UnsqueezeOp to MLIR TOSA `reshape` operation. Args: node: Containing information from the input graph node. @@ -118,12 +142,12 @@ def unsqueeze_op( def view_op( - node: torch.fx.Node, + node: ViewOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor view operation. - From PyTorch `aten.view.default` operator to MLIR TOSA `reshape` operation. + From buddy ViewOp to MLIR TOSA `reshape` operation. Note: If the new shape contains one and only one `-1`, the size of the new shape will be inferred automatically. @@ -160,13 +184,12 @@ def view_op( def embedding_op( - node: torch.fx.Node, + node: EmbeddingOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the embedding operation. - From PyTorch `aten.embedding.default` operator to MLIR linalg `generic` - operation. + From buddy EmbeddingOp to MLIR linalg `generic` operation. Note: In this op, input node1's value is as index to get input node2's row slice. 
@@ -180,52 +203,51 @@ def embedding_op( """ input1 = symbol_table.get((str(node.args[0]), 0)) input2 = symbol_table.get((str(node.args[1]), 0)) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation([0, 1, 2]) - op = linalg.GenericOp( - [tensor_type], - [input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(generic_map.get_submap([0, 1])), - ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2])), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] * 3 - ), - ) - block = ir.Block.create_at_start( - op.region, + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + generic_map = ir.AffineMap.get_permutation([0, 1, 2]) + op = linalg.GenericOp( + [tensor_type], + [input2], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - index1 = arith.IndexCastOp(ir.IndexType.get(), block.arguments[0]) - index2 = linalg.IndexOp(ir._i64Attr(2, None)) - value = tensor.ExtractOp(input1, [index1.result, index2.result]) - block.append(index1) - block.append(index2) - block.append(value) - block.append(linalg.YieldOp([value.result])) + ir.AffineMapAttr.get(generic_map.get_submap([0, 1])), + ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2])), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] * 3 + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input2.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + index1 = arith.IndexCastOp(ir.IndexType.get(), block.arguments[0]) + index2 = linalg.IndexOp(ir._i64Attr(2, None)) + value = tensor.ExtractOp(input1, [index1.result, index2.result]) + block.append(index1) + block.append(index2) + block.append(value) + block.append(linalg.YieldOp([value.result])) return op def ones_op( - node: torch.fx.Node, + node: OnesOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor ones operation. - From PyTorch `aten.ones.default` operator to MLIR arith `constant` - operation. + From buddy OnesOp to MLIR arith `constant` operation. Note: This op, input node1's value is as index to get input node2's row slice. @@ -238,30 +260,21 @@ def ones_op( op: The operation return the arith.constant op. 
""" output_shape = list(node.args[0]) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.bool": - element = ir.BoolAttr.get(1) - tensor_type = ir.RankedTensorType.get(output_shape, element.type) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - elif dtype == "torch.int64": - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(output_shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.ones(output_shape), signless=True, type=tensor_type - ) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, 1) + tensor_type = ir.RankedTensorType.get(output_shape, element.type) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) op = arith.ConstantOp(tensor_type, attr) return op - def full_op( - node: torch.fx.Node, + node: FullOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor full operation. - From PyTorch `aten.full.default` operator to MLIR arith `constant` - operation. + From buddy FullOp to MLIR arith `constant` operation. Note: This op, input node1's value is the shape of output tensor, input node2's value is the value of all elements in output tensor. @@ -275,39 +288,22 @@ def full_op( """ output_shape = list(node.args[0]) value = node.args[1] - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.bool": - element = ir.BoolAttr.get(bool(value)) - tensor_type = ir.RankedTensorType.get(output_shape, element.type) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - elif dtype == "torch.int64": - dtype = ir.IntegerType.get_signless(64) - tensor_type = ir.RankedTensorType.get(output_shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.full(output_shape, value, dtype=numpy.int64), - signless=True, - type=tensor_type, - ) - elif dtype == "torch.float32": - dtype = ir.F32Type.get() - tensor_type = ir.RankedTensorType.get(output_shape, dtype) - attr = ir.DenseElementsAttr.get( - numpy.full(output_shape, value, dtype=numpy.float32), - signless=True, - type=tensor_type, - ) + dtype = node.tensor_meta["dtype"] + element = mlir_element_attr_get(dtype, value) + tensor_type = ir.RankedTensorType.get(output_shape, element.type) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) op = arith.ConstantOp(tensor_type, attr) return op def lt_op( - node: torch.fx.Node, + node: LessThanOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor less than operation. - From PyTorch `aten.lt.Tensor` operator to MLIR arith `constant` operation. + From buddy LessThanOp to MLIR arith `constant` operation. Note: This op, campare two input nodes, and output bool tensor to represent compare result. 
@@ -321,93 +317,86 @@ def lt_op( """ input1 = symbol_table.get((str(node.args[0]), 0)) input2 = symbol_table.get((str(node.args[1]), 0)) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] value = ir.IntegerAttr.get(ir.IntegerType.get_signless(64), 2) shp1 = list(ir.RankedTensorType(ir.Value(input1).type).shape) shp2 = list(ir.RankedTensorType(ir.Value(input2).type).shape) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.IntegerType.get_signless(1) - ) - output = tensor.EmptyOp(output_shape, ir.IntegerType.get_signless(1)) - if len(shp1) < len(shp2): - if int(shp1[-1]) > 1 and shp2[-1] == 1: - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(shp2) + 1)] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [ - i - for i in range( - len(shp2) - len(shp1), len(shp2) - ) - ] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(0, len(shp2) - 1)] - + [len(shp2)] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(0, len(shp2))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(shp2) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + if len(shp1) < len(shp2): + if int(shp1[-1]) > 1 and shp2[-1] == 1: + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(shp2) + 1)] + ) + op = linalg.GenericOp( + [tensor_type], + [input1, input2], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [ + i + for i in range( + len(shp2) - len(shp1), len(shp2) + ) + ] ) - ] - ), + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(0, len(shp2) - 1)] + + [len(shp2)] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(0, len(shp2))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(shp2) + + [ir.Attribute.parse("#linalg.iterator_type")] + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input2.type).element_type, + ir.RankedTensorType(input2.type).element_type, + dtype, + ], + ) + if ( + str(ir.RankedTensorType(input2.type).element_type).find("i") + != -1 + ): + cmpop = arith.CmpIOp( + value, block.arguments[0], block.arguments[1] ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.IntegerType.get_signless(1), - ], + else: + cmpop = arith.CmpFOp( + value, block.arguments[0], block.arguments[1] ) - if ( - str(ir.RankedTensorType(input2.type).element_type).find("i") - != -1 - ): - cmpop = arith.CmpIOp( - value, block.arguments[0], block.arguments[1] - ) - else: - cmpop = arith.CmpFOp( - value, block.arguments[0], block.arguments[1] - ) - block.append(cmpop) - block.append(linalg.YieldOp([cmpop.result])) + block.append(cmpop) + block.append(linalg.YieldOp([cmpop.result])) return op def masked_fill_op( - node: torch.fx.Node, + node: MaskedFillOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor masked fill operation. 
- From PyTorch `aten.masked_fill.Scalar` operator to MLIR linalg `generic` - operation. + From buddy MaskedFillOp to MLIR linalg `generic` operation. Note: This op, input node2 is a bool tensor. Select input node1's value or input node3's value by true or false in input node2's value. @@ -423,71 +412,67 @@ def masked_fill_op( input2 = symbol_table.get((str(node.args[1]), 0)) if input1 is None or input2 is None: return - if str(node.args[0].meta["tensor_meta"].dtype) == "torch.float32": - value = float(node.args[2]) - attr = ir.FloatAttr.get(ir.F32Type.get(), value) - value = arith.ConstantOp(ir.F32Type.get(), attr) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + dtype = node.tensor_meta["dtype"] + value = node.args[2] + attr = mlir_element_attr_get(dtype, value) + dtype = mlir_element_type_get(dtype) + value = arith.ConstantOp(dtype, attr) + output_shape = list(node.tensor_meta["shape"]) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1, input2], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - select_op = arith.SelectOp( - block.arguments[1], value, block.arguments[0] - ) - block.append(select_op) - block.append(linalg.YieldOp([select_op.result])) + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(input2.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + select_op = arith.SelectOp(block.arguments[1], value, block.arguments[0]) + block.append(select_op) + block.append(linalg.YieldOp([select_op.result])) return op def slice_op( - node: torch.fx.Node, + node: SliceOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor slice operation. - From PyTorch `aten.slice.Tensor` operator to MLIR tensor `extract_slice` - operation. + From buddy SliceOp to MLIR tensor `extract_slice` operation. Note: This op, get the slice of input node1. 
Args: @@ -514,18 +499,14 @@ def slice_op( offset = [0 for x in input_shape] offset[dim] = start offset_attr = ir._denseI64ArrayAttr(offset, None) - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) size_attr = ir._denseI64ArrayAttr(output_shape, None) stride = [1 for x in output_shape] stride[dim] = step stride_attr = ir._denseI64ArrayAttr(stride, None) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.IntegerType.get_signless(1) - ) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) op = tensor.ExtractSliceOp( tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr @@ -535,13 +516,12 @@ def slice_op( def expand_op( - node: torch.fx.Node, + node: ExpandOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor expand operation. - From PyTorch `aten.expand.default` operator to MLIR tensor `extract_slice` - operation. + From buddy ExpandOp to MLIR tensor `extract_slice` operation. Note: This op, based on expand shape, create a new tensor and extract slice from origin tensor. @@ -559,26 +539,15 @@ def expand_op( if input1 is None: return input_shape = ir.RankedTensorType(input1.type).shape - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.bool": - empty_tensor = tensor.EmptyOp( - output_shape, ir.IntegerType.get_signless(1) - ) - elif dtype == "torch.float32": - empty_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get()) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + empty_tensor = tensor.EmptyOp(output_shape, dtype) if list(input_shape) == list(node.args[1]): offset_attr = ir._denseI64ArrayAttr([0 for x in input_shape], None) size_attr = ir._denseI64ArrayAttr(output_shape, None) stride_attr = ir._denseI64ArrayAttr([1 for x in input_shape], None) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.IntegerType.get_signless(1) - ) - elif dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) + tensor_type = ir.RankedTensorType.get(output_shape, dtype) extract_tensor = tensor.ExtractSliceOp( tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr ) @@ -602,16 +571,10 @@ def expand_op( [1] * (i + 1) + [x for x in output_shape[i + 1 :]], None ) stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) - if dtype == "torch.bool": - tensor_type = ir.RankedTensorType.get( - [1] * (i + 1) + [x for x in output_shape[i + 1 :]], - ir.IntegerType.get_signless(1), - ) - elif dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - [1] * (i + 1) + [x for x in output_shape[i + 1 :]], - ir.F32Type.get(), - ) + tensor_type = ir.RankedTensorType.get( + [1] * (i + 1) + [x for x in output_shape[i + 1 :]], + dtype, + ) extract_tensor = tensor.ExtractSliceOp( tensor_type, input1, @@ -639,12 +602,12 @@ def expand_op( def to_copy_op( - node: torch.fx.Node, + node: ToCopyOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor copy operation. - From PyTorch `aten._to_copy.default` operator to MLIR linalg `generic` + From buddy ToCopyOp to MLIR linalg `generic` operation. 
Note: This op, will convert input node's value type, such as float32 to @@ -660,10 +623,10 @@ def to_copy_op( input1 = symbol_table.get((str(node.args[0]), 0)) if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] - if dtype == "torch.bool": + if dtype == TensorDType.Bool: if str(ir.RankedTensorType(input1.type).element_type) == "f32": tensor_type = ir.RankedTensorType.get( output_shape, ir.IntegerType.get_signless(1) @@ -713,7 +676,7 @@ def to_copy_op( block.append(fptosi_op) block.append(trunc_op) block.append(linalg.YieldOp([trunc_op.result])) - elif dtype == "torch.float32": + elif dtype == TensorDType.Float32: if str(ir.RankedTensorType(input1.type).element_type) == "i1": tensor_type = ir.RankedTensorType.get( output_shape, ir.F32Type.get() @@ -764,12 +727,12 @@ def to_copy_op( def rsub_op( - node: torch.fx.Node, + node: RsubOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor rsub operation. - From PyTorch `aten.rsub.Scalar` operator to MLIR linalg `generic` operation. + From buddy RsubOp to MLIR linalg `generic` operation. Note: This op, compute input node1 rsub input node2 Args: @@ -782,20 +745,94 @@ def rsub_op( """ input1 = symbol_table.get((str(node.args[0]), 0)) value = node.args[1] - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if not isinstance(value, torch.fx.Node): - if dtype == "torch.float32": - value = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), value) - ) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + if not isinstance(value, str): + value = arith.ConstantOp( + mlir_dtype, mlir_element_attr_get(dtype, value) + ) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + if str(ir.RankedTensorType(input1.type).element_type).find("i") != -1: + sub_op = arith.SubIOp(value.result, block.arguments[0]) + else: + sub_op = arith.SubFOp(value.result, block.arguments[0]) + block.append(sub_op) + block.append(linalg.YieldOp([sub_op.result])) + + return op + + +def pow_op( + node: PowOp, + symbol_table: Dict[Tuple[str, int], ir.Operation], +): + """ + Import the tensor copy operation. + From buddy PowOp to MLIR linalg `generic` + operation. + + Note: This op, compute input node's power result. + Args: + node: Containing information from the input graph node. + symbol_table: A dictionary mapping symbols to their corresponding + operations. + + Returns: + op: The operation return the linalg.generic op. 
+ """ + input1 = symbol_table.get((str(node.args[0]), 0)) + if input1 is None: + return + value = node.args[1] + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + dtype = mlir_element_type_get(dtype) + if not isinstance(value, str): + if abs(int(value) - float(value)) < 1e-6: generic_map = ir.AffineMap.get_permutation( [i for i in range(len(output_shape))] ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() + tensor_type = ir.RankedTensorType.get(output_shape, dtype) + output = tensor.EmptyOp(output_shape, dtype) + value = arith.ConstantOp( + ir.IntegerType.get_signless(32), + ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value), ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) op = linalg.GenericOp( [tensor_type], [input1], @@ -826,23 +863,28 @@ def rsub_op( ir.RankedTensorType(output.result.type).element_type, ], ) - subf_op = arith.SubFOp(value.result, block.arguments[0]) - block.append(subf_op) - block.append(linalg.YieldOp([subf_op.result])) + if ( + str(ir.RankedTensorType(input1.type).element_type).find("i") + != -1 + ): + powi_op = math.IPowIOp(block.arguments[0], value.result) + else: + powi_op = math.FPowIOp(block.arguments[0], value.result) + block.append(powi_op) + block.append(linalg.YieldOp([powi_op.result])) return op -def pow_op( - node: torch.fx.Node, +def mean_op( + node: MeanOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor copy operation. - From PyTorch `aten.pow.Tensor_Scalar` operator to MLIR linalg `generic` - operation. + From buddy MeanOp to MLIR linalg `generic` operation. - Note: This op, compute input node's power result. + Note: This op, compute input node's mean result in a specified dim. Args: node: Containing information from the input graph node. 
         symbol_table: A dictionary mapping symbols to their corresponding
@@ -854,160 +896,91 @@ def pow_op(
         operations.
 
     Returns:
         op: The operation return the linalg.generic op.
     """
     input1 = symbol_table.get((str(node.args[0]), 0))
     if input1 is None:
         return
-    value = node.args[1]
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if not isinstance(value, torch.fx.Node):
-        if dtype == "torch.float32":
+    dims = list(node.args[1])
+    keep_dim = bool(node.args[2])
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    element = mlir_element_attr_get(dtype, 0.0)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+    output = arith.ConstantOp(tensor_type, attr)
+    assert len(dims) == 1
+    for dim in dims:
+        if dim < 0:
+            dim = len(list(ir.RankedTensorType(input1.type).shape)) + dim
+        if keep_dim:
             generic_map = ir.AffineMap.get_permutation(
-                [i for i in range(len(output_shape))]
+                [i for i in range(len(output_shape) + 1)]
             )
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
+            tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+            output_map = [i for i in range(len(output_shape))]
+            output_map[dim] = len(output_shape)
+            loop_type = [
+                ir.Attribute.parse("#linalg.iterator_type<parallel>")
+            ] * (len(output_shape) + 1)
+            loop_type[dim] = ir.Attribute.parse(
+                "#linalg.iterator_type<reduction>"
             )
-            output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-            if abs(int(value) - float(value)) < 1e-6:
-                value = arith.ConstantOp(
-                    ir.IntegerType.get_signless(32),
-                    ir.IntegerAttr.get(ir.IntegerType.get_signless(32), value),
-                )
-                op = linalg.GenericOp(
-                    [tensor_type],
-                    [input1],
-                    [output],
-                    ir.ArrayAttr.get(
-                        [
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                            ir.AffineMapAttr.get(
-                                generic_map.get_submap(
-                                    [i for i in range(len(output_shape))]
-                                )
-                            ),
-                        ]
-                    ),
-                    ir.ArrayAttr.get(
-                        [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                        * len(output_shape)
-                    ),
-                )
-                block = ir.Block.create_at_start(
-                    op.region,
-                    [
-                        ir.RankedTensorType(input1.type).element_type,
-                        ir.RankedTensorType(output.result.type).element_type,
-                    ],
-                )
-                fpowi_op = math.FPowIOp(block.arguments[0], value.result)
-                block.append(fpowi_op)
-                block.append(linalg.YieldOp([fpowi_op.result]))
-
-    return op
-
-
-def mean_op(
-    node: torch.fx.Node,
-    symbol_table: Dict[Tuple[str, int], ir.Operation],
-):
-    """
-    Import the tensor copy operation.
-    From PyTorch `aten.mean.dim` operator to MLIR linalg `generic` operation.
-
-    Note: This op, compute input node's mean result in a specified dim.
-    Args:
-        node: Containing information from the input graph node.
-        symbol_table: A dictionary mapping symbols to their corresponding
-        operations.
-
-    Returns:
-        op: The operation return the linalg.generic op.
- """ - input1 = symbol_table.get((str(node.args[0]), 0)) - if input1 is None: - return - dims = list(node.args[1]) - keep_dim = bool(node.args[2]) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - element = ir.FloatAttr.get(ir.F32Type.get(), 0.0) - attr = ir.DenseElementsAttr.get_splat(tensor_type, element) - output = arith.ConstantOp(tensor_type, attr) - - assert len(dims) == 1 - - for dim in dims: - if dim == -1: - dim = len(list(ir.RankedTensorType(input1.type).shape)) - 1 - if keep_dim: - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + 1)] - ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output_map = [i for i in range(len(output_shape))] - output_map[dim] = len(output_shape) - loop_type = [ - ir.Attribute.parse("#linalg.iterator_type") - ] * (len(output_shape) + 1) - loop_type[dim] = ir.Attribute.parse( - "#linalg.iterator_type" - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap(output_map) - ), - ] - ), - ir.ArrayAttr.get(loop_type), - ) - block = ir.Block.create_at_start( - op.region, + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap(output_map) + ), + ] + ), + ir.ArrayAttr.get(loop_type), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + value = arith.ConstantOp( + mlir_dtype, + mlir_element_attr_get( + dtype, list(ir.RankedTensorType(input1.type).shape)[dim] + ), + ) + if ( + str(ir.RankedTensorType(input1.type).element_type).find("i") + != -1 + ): + block_div_op = arith.DivSIOp(block.arguments[0], value.result) + block_add_op = arith.AddIOp( + block_div_op.result, block.arguments[1] ) - value = arith.ConstantOp( - ir.F32Type.get(), - ir.FloatAttr.get( - ir.F32Type.get(), - list(ir.RankedTensorType(input1.type).shape)[dim], - ), + else: + block_div_op = arith.DivFOp(block.arguments[0], value.result) + block_add_op = arith.AddFOp( + block_div_op.result, block.arguments[1] ) - divf_op = arith.DivFOp(block.arguments[0], value.result) - addf_op = arith.AddFOp(divf_op.result, block.arguments[1]) - block.append(value) - block.append(divf_op) - block.append(addf_op) - block.append(linalg.YieldOp([addf_op.result])) + block.append(value) + block.append(block_div_op) + block.append(block_add_op) + block.append(linalg.YieldOp([block_add_op.result])) return op def rsqrt_op( - node: torch.fx.Node, + node: RsqrtOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor rsqrt operation. - From PyTorch `aten.rsqrt.default` operator to MLIR linalg `generic` - operation. + From buddy RsqrtOp to MLIR linalg `generic` operation. Note: This op, compute input node's rsqrt result. 
Args: @@ -1023,59 +996,58 @@ def rsqrt_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - math_rsqrt_op = math.RsqrtOp(block.arguments[0]) - block.append(math_rsqrt_op) - block.append(linalg.YieldOp([math_rsqrt_op.result])) + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + math_rsqrt_op = math.RsqrtOp(block.arguments[0]) + block.append(math_rsqrt_op) + block.append(linalg.YieldOp([math_rsqrt_op.result])) return op def mul_op( - node: torch.fx.Node, + node: MulOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor mul operation. - From PyTorch `aten.mul.Tensor` operator to MLIR linalg `generic` operation. + From buddy MulOp to MLIR linalg `generic` operation. Note: This op, compute input node's mul result. Args: @@ -1087,257 +1059,38 @@ def mul_op( op: The operation return the linalg.generic op. 
""" assert len(node.args) == 2 - if isinstance(node.args[0], torch.fx.Node): - input1 = symbol_table.get((str(node.args[0]), 0)) - else: - input1 = node.args[0] - - if isinstance(node.args[1], torch.fx.Node): + input1 = symbol_table.get((str(node.args[0]), 0)) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + shape = list(node.tensor_meta["shape"]) + if isinstance(node.args[1], str): input2 = symbol_table.get((str(node.args[1]), 0)) else: - input2 = node.args[1] - + data = [node.args[1]] + input2_shape = numpy.array(data).shape + tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, node.args[1]) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + input2 = arith.ConstantOp(tensor_type, attr).result if input1 is None or input2 is None: return - - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - - if isinstance(node.args[0], torch.fx.Node): - if dtype == "torch.float32": - if not isinstance(node.args[1], torch.fx.Node): - input2 = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), input2) - ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - mulf_op = arith.MulFOp(block.arguments[0], input2.result) - block.append(mulf_op) - block.append(linalg.YieldOp([mulf_op.result])) - else: - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - input1_shape = list(ir.RankedTensorType(input1.type).shape) - if input1_shape != output_shape: - dims = [] - for i in range(len(input1_shape) - 1, -1, -1): - if ( - input1_shape[i] - != output_shape[ - len(output_shape) - (len(input1_shape) - i) - ] - ): - dims.append(i) - output1 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input1_map = [ - i - for i in range( - len(output_shape) - len(input1_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input1_map[i] = len(output_shape) + index - input1_map = generic_map.get_submap(input1_map) - input1_op = linalg.GenericOp( - [tensor_type], - [input1], - [output1], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input1_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - 
block.append(linalg.YieldOp([block.arguments[0]])) - input1 = input1_op.result - - input2_shape = list(ir.RankedTensorType(input2.type).shape) - if input2_shape != output_shape: - dims = [] - for i in range(len(input2_shape) - 1, -1, -1): - if ( - input2_shape[i] - != output_shape[ - len(output_shape) - (len(input2_shape) - i) - ] - ): - dims.append(i) - output2 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input2_map = [ - i - for i in range( - len(output_shape) - len(input2_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input2_map[i] = len(output_shape) + index - input2_map = generic_map.get_submap(input2_map) - input2_op = linalg.GenericOp( - [tensor_type], - [input2], - [output2], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input2_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input2_op.region, - [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) - input2 = input2_op.result - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - mulf_op = arith.MulFOp(block.arguments[0], block.arguments[1]) - block.append(mulf_op) - block.append(linalg.YieldOp([mulf_op.result])) - - return op + mul_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype) + op = tosa.MulOp( + mul_result_tensor_type, + input1, + input2, + ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0), + ) + return op.result def t_op( - node: torch.fx.Node, + node: TOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor tanspose operation. - From PyTorch `aten.t.default` operator to MLIR linalg `generic` operation. + From buddy TransposeOp to MLIR linalg `generic` operation. Note: This op, compute input node's transpose result. 
     Args:
@@ -1353,50 +1106,23 @@ def t_op(
     if input1 is None:
         return
-    input_shape = list(ir.RankedTensorType(input1.type).shape)
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if len(input_shape) == 2:
-        if dtype == "torch.float32":
-            tensor_type = ir.RankedTensorType.get(
-                output_shape, ir.F32Type.get()
-            )
-            output = tensor.EmptyOp(output_shape, ir.F32Type.get())
-            generic_map = ir.AffineMap.get_permutation([0, 1])
-            op = linalg.GenericOp(
-                [tensor_type],
-                [input1],
-                [output],
-                ir.ArrayAttr.get(
-                    [
-                        ir.AffineMapAttr.get(generic_map.get_submap([0, 1])),
-                        ir.AffineMapAttr.get(generic_map.get_submap([1, 0])),
-                    ]
-                ),
-                ir.ArrayAttr.get(
-                    [ir.Attribute.parse("#linalg.iterator_type<parallel>")]
-                    * len(output_shape)
-                ),
-            )
-            block = ir.Block.create_at_start(
-                op.region,
-                [
-                    ir.RankedTensorType(input1.type).element_type,
-                    ir.RankedTensorType(output.result.type).element_type,
-                ],
-            )
-            block.append(linalg.YieldOp([block.arguments[0]]))
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    perm = ir._denseI64ArrayAttr([1, 0], None)
+    output = tensor.EmptyOp(output_shape, mlir_dtype)
+    op = linalg.transpose(input=input1, outs=[output], permutation=perm)
 
-    return op
+    return op.result[0]
 
 
 def matmul_op(
-    node: torch.fx.Node,
+    node: MatmulOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor matmul operation.
-    From PyTorch `aten.mm.default` operator to MLIR linalg `matmul` operation.
+    From Buddy MatmulOp to MLIR linalg `matmul` operation.
 
     Note: This op, compute input node's matrix multiplication result.
     Args:
@@ -1413,25 +1139,24 @@
     if input1 is None or input2 is None:
         return
-    output_shape = list(node.meta["tensor_meta"].shape)
-    dtype = str(node.meta["tensor_meta"].dtype)
-    if dtype == "torch.float32":
-        tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
-        f32 = ir.F32Type.get()
-        element = ir.FloatAttr.get(f32, 0.0)
-        attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
-        matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result
-        op = linalg.matmul(input1, input2, outs=[matmul_result_buffer])
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    element = mlir_element_attr_get(dtype, 0.0)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+    matmul_result_buffer = arith.ConstantOp(tensor_type, attr).result
+    op = linalg.matmul(input1, input2, outs=[matmul_result_buffer])
     return op
 
 
 def transpose_op(
-    node: torch.fx.Node,
+    node: TransposeOp,
     symbol_table: Dict[Tuple[str, int], ir.Operation],
 ):
     """
     Import the tensor transpose operation.
-    From PyTorch `aten.transpose.int` operator to MLIR linalg `generic`
+    From buddy TransposeOp to MLIR linalg `transpose`
     operation.
 
     Note: This op, compute input node's transpose result.
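transpose_op below follows the same pattern t_op just switched to: instead of a hand-written linalg.generic, it computes a permutation and calls the linalg.transpose helper. The permutation for swapping dim1 and dim2 is plain index bookkeeping, shown here in isolation:

```python
# Pure-Python model of the permutation built in transpose_op below.
def swap_permutation(rank: int, dim1: int, dim2: int) -> list:
    perm = list(range(rank))
    perm[dim1], perm[dim2] = perm[dim2], perm[dim1]
    return perm


# Swapping dims 1 and 3 of a rank-4 tensor:
assert swap_permutation(4, 1, 3) == [0, 3, 2, 1]
```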
@@ -1449,51 +1174,25 @@ def transpose_op( return dim1 = int(node.args[1]) dim2 = int(node.args[2]) - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - input1_map = [i for i in range(len(output_shape))] - input1_map[dim1], input1_map[dim2] = input1_map[dim2], input1_map[dim1] - output_map = [i for i in range(len(output_shape))] - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(generic_map.get_submap(input1_map)), - ir.AffineMapAttr.get(generic_map.get_submap(output_map)), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + output_perm = [i for i in range(len(output_shape))] + output_perm[dim2], output_perm[dim1] = output_perm[dim1], output_perm[dim2] + perm = ir._denseI64ArrayAttr(output_perm, None) + output = tensor.EmptyOp(output_shape, mlir_dtype) + op = linalg.transpose(input=input1, outs=[output], permutation=perm) - return op + return op.result[0] def index_op( - node: torch.fx.Node, + node: IndexOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor index operation. - From PyTorch `aten.index.Tensor` operator to MLIR linalg `generic` + From buddy IndexOp to MLIR linalg `generic` operation. Note: This op, get input node slice result by input index. 
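The index_op lowering that follows emits a linalg.generic which walks the index tensors and gathers slices from the input. Its reference semantics, for the `len(indices) < rank(input)` case it handles, can be modeled with numpy:

```python
# Reference semantics of index_op for a single index tensor (numpy model).
import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 4)
idx = np.array([2, 0])  # fewer index tensors than input dimensions
out = x[idx]            # gathers rows 2 and 0; trailing dims pass through
assert out.shape == (2, 4)
```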
@@ -1511,70 +1210,66 @@ def index_op( return input1_shape = ir.RankedTensorType(input1.type).shape input2 = node.args[1] - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) if len(input2) < len(input1_shape): - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - loops = ir.RankedTensorType( - symbol_table.get((str(input2[0]), 0)).type - ).shape - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + loops = ir.RankedTensorType( + symbol_table.get((str(input2[0]), 0)).type + ).shape + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + input_map = [ + ir.AffineMapAttr.get( + generic_map.get_submap([j for j in range(len(loops))]) ) - input_map = [ - ir.AffineMapAttr.get( - generic_map.get_submap([j for j in range(len(loops))]) - ) - for i in range(len(input2)) - ] + [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [j for j in range(len(output_shape))] - ) - ) - ] - operands = [symbol_table.get((str(i), 0)) for i in input2] - op = linalg.GenericOp( - [tensor_type], - operands, - [output], - ir.ArrayAttr.get(input_map), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), + for i in range(len(input2)) + ] + [ + ir.AffineMapAttr.get( + generic_map.get_submap([j for j in range(len(output_shape))]) ) - arguments = [ - ir.RankedTensorType(i.type).element_type for i in operands - ] + [ir.RankedTensorType(output.result.type).element_type] - block = ir.Block.create_at_start(op.region, arguments) - index = [] - for i in block.arguments[:-1]: - indexcast_op = arith.IndexCastOp(ir.IndexType.get(), i) - block.append(indexcast_op) - index.append(indexcast_op.result) - for i in range(len(loops), len(output_shape) - len(input2) + 1): - index_op = linalg.IndexOp(ir._i64Attr(i, None)) - block.append(index_op) - index.append(index_op.result) - value = tensor.ExtractOp(input1, index) - block.append(value) - block.append(linalg.YieldOp([value.result])) + ] + operands = [symbol_table.get((str(i), 0)) for i in input2] + op = linalg.GenericOp( + [tensor_type], + operands, + [output], + ir.ArrayAttr.get(input_map), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + arguments = [ + ir.RankedTensorType(i.type).element_type for i in operands + ] + [ir.RankedTensorType(output.result.type).element_type] + block = ir.Block.create_at_start(op.region, arguments) + index = [] + for i in block.arguments[:-1]: + indexcast_op = arith.IndexCastOp(ir.IndexType.get(), i) + block.append(indexcast_op) + index.append(indexcast_op.result) + for i in range(len(loops), len(output_shape) - len(input2) + 1): + index_op = linalg.IndexOp(ir._i64Attr(i, None)) + block.append(index_op) + index.append(index_op.result) + value = tensor.ExtractOp(input1, index) + block.append(value) + block.append(linalg.YieldOp([value.result])) return op def neg_op( - node: torch.fx.Node, + node: NegOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor neg operation. - From PyTorch `aten.neg.default` operator to MLIR linalg `matmul` operation. 
+ From buddy NegOp to MLIR linalg `negf` operation.

Note: This operation computes the elementwise negation of the input tensor.

Args:
@@ -1589,59 +1284,22 @@ def neg_op(
input1 = symbol_table.get((str(node.args[0]), 0))
if input1 is None:
return
-
- output_shape = list(node.meta["tensor_meta"].shape)
- dtype = str(node.meta["tensor_meta"].dtype)
- if dtype == "torch.float32":
- tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
- output = tensor.EmptyOp(output_shape, ir.F32Type.get())
- generic_map = ir.AffineMap.get_permutation(
- [i for i in range(len(output_shape))]
- )
- op = linalg.GenericOp(
- [tensor_type],
- [input1],
- [output],
- ir.ArrayAttr.get(
- [
- ir.AffineMapAttr.get(
- generic_map.get_submap(
- [i for i in range(len(output_shape))]
- )
- ),
- ir.AffineMapAttr.get(
- generic_map.get_submap(
- [i for i in range(len(output_shape))]
- )
- ),
- ]
- ),
- ir.ArrayAttr.get(
- [ir.Attribute.parse("#linalg.iterator_type")]
- * len(output_shape)
- ),
- )
- block = ir.Block.create_at_start(
- op.region,
- [
- ir.RankedTensorType(input1.type).element_type,
- ir.RankedTensorType(output.result.type).element_type,
- ],
- )
- negf_op = arith.NegFOp(block.arguments[0])
- block.append(negf_op)
- block.append(linalg.YieldOp([negf_op.result]))
+ output_shape = list(node.tensor_meta["shape"])
+ dtype = node.tensor_meta["dtype"]
+ mlir_dtype = mlir_element_type_get(dtype)
+ output = tensor.EmptyOp(output_shape, mlir_dtype)
+ op = linalg.negf(input1, outs=output)

return op


def cat_op(
- node: torch.fx.Node,
+ node: CatOp,
symbol_table: Dict[Tuple[str, int], ir.Operation],
):
"""
Import the tensor concatenate operation.
- From PyTorch `aten.cat.default` operator to MLIR tensor `insert_slice`
+ From buddy CatOp to MLIR tensor `insert_slice`
operation.

Note: This operation concatenates the two input tensors along the given
dimension, as the sketch below illustrates.
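A rough NumPy analogy for this insert_slice-based concatenation (illustrative only, not part of the patch): allocate the output buffer, then write each input at an increasing offset along the concatenation dimension.

```python
import numpy as np

def concat_via_insert(a: np.ndarray, b: np.ndarray, dim: int) -> np.ndarray:
    # Allocate the output buffer, mirroring tensor.EmptyOp.
    out_shape = list(a.shape)
    out_shape[dim] += b.shape[dim]
    out = np.empty(out_shape, dtype=a.dtype)
    # Write the first input at offset 0, like the first tensor.InsertSliceOp.
    index = [slice(None)] * a.ndim
    index[dim] = slice(0, a.shape[dim])
    out[tuple(index)] = a
    # Advance the offset along `dim` and write the second input.
    index[dim] = slice(a.shape[dim], out_shape[dim])
    out[tuple(index)] = b
    return out

x = np.ones((2, 3), dtype=np.float32)
y = np.zeros((2, 2), dtype=np.float32)
assert concat_via_insert(x, y, 1).shape == (2, 5)
```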
@@ -1660,52 +1318,52 @@ def cat_op( if input1 is None or input2 is None: return - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) if dim < 0: dim = len(output_shape) + dim - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - offset = [0 for x in output_shape] - offset_attr = ir._denseI64ArrayAttr(offset, None) - input1_shape = ir.RankedTensorType(input1.type).shape - size_attr = ir._denseI64ArrayAttr(input1_shape, None) - stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) - insert_input1 = tensor.InsertSliceOp( - input1, - output.result, - [], - [], - [], - offset_attr, - size_attr, - stride_attr, - ) - offset[dim] += input1_shape[dim] - offset_attr = ir._denseI64ArrayAttr(offset, None) - input2_shape = ir.RankedTensorType(input2.type).shape - size_attr = ir._denseI64ArrayAttr(input2_shape, None) - insert_input2 = tensor.InsertSliceOp( - input2, - insert_input1.result, - [], - [], - [], - offset_attr, - size_attr, - stride_attr, - ) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + offset = [0 for x in output_shape] + offset_attr = ir._denseI64ArrayAttr(offset, None) + input1_shape = ir.RankedTensorType(input1.type).shape + size_attr = ir._denseI64ArrayAttr(input1_shape, None) + stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) + insert_input1 = tensor.InsertSliceOp( + input1, + output.result, + [], + [], + [], + offset_attr, + size_attr, + stride_attr, + ) + offset[dim] += input1_shape[dim] + offset_attr = ir._denseI64ArrayAttr(offset, None) + input2_shape = ir.RankedTensorType(input2.type).shape + size_attr = ir._denseI64ArrayAttr(input2_shape, None) + insert_input2 = tensor.InsertSliceOp( + input2, + insert_input1.result, + [], + [], + [], + offset_attr, + size_attr, + stride_attr, + ) return insert_input2 def squeeze_op( - node: torch.fx.Node, + node: SqueezeOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor squeeze operation. - From PyTorch `aten.squeeze.dim` operator to MLIR linalg `generic` operation. + From buddy SqueezeOp to MLIR linalg `generic` operation. Note: This op, reduce the input tensor's shape dims by specified dim. 
Args: @@ -1722,78 +1380,78 @@ def squeeze_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) + output_shape = list(node.tensor_meta["shape"]) input1_shape = ir.RankedTensorType(input1.type).shape if dim < 0: dim = len(input1_shape) + dim - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - if input1_shape[dim] != 1: - offset = [0 for x in output_shape] - offset_attr = ir._denseI64ArrayAttr(offset, None) - size_attr = ir._denseI64ArrayAttr(input1_shape, None) - stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) - op = tensor.InsertSliceOp( - input1, - output.result, - [], - [], - [], - offset_attr, - size_attr, - stride_attr, - ) - else: - output_map = ir.AffineMap.get( - len(output_shape), - 0, - [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))], - ) - input1_map = [] - loop_index = 0 - for i in range(len(input1_shape)): - if len(input1_map) == dim: - input1_map.append(ir.AffineExpr.get_constant(0)) - else: - input1_map.append(ir.AffineExpr.get_dim(loop_index)) - loop_index += 1 - input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get(output_map), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + if input1_shape[dim] != 1: + offset = [0 for x in output_shape] + offset_attr = ir._denseI64ArrayAttr(offset, None) + size_attr = ir._denseI64ArrayAttr(input1_shape, None) + stride_attr = ir._denseI64ArrayAttr([1] * len(offset), None) + op = tensor.InsertSliceOp( + input1, + output.result, + [], + [], + [], + offset_attr, + size_attr, + stride_attr, + ) + else: + output_map = ir.AffineMap.get( + len(output_shape), + 0, + [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))], + ) + input1_map = [] + loop_index = 0 + for i in range(len(input1_shape)): + if len(input1_map) == dim: + input1_map.append(ir.AffineExpr.get_constant(0)) + else: + input1_map.append(ir.AffineExpr.get_dim(loop_index)) + loop_index += 1 + input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) + ir.AffineMapAttr.get(input1_map), + ir.AffineMapAttr.get(output_map), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + block.append(linalg.YieldOp([block.arguments[0]])) return op def batch_matmul_op( - node: torch.fx.Node, + node: BatchMatmulOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor batch matmul operation. - From PyTorch `aten.bmm.default` operator to MLIR linalg `batch_matmul` + From buddy BatchMatmulOp to MLIR linalg `batch_matmul` operation. 
Note: This operation computes the batch matrix multiplication of the two
input tensors.
@@ -1811,45 +1469,25 @@
if input1 is None or input2 is None:
return

- output_shape = list(node.meta["tensor_meta"].shape)
- dtype = str(node.meta["tensor_meta"].dtype)
- if dtype == "torch.float32":
- tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get())
- output = tensor.EmptyOp(output_shape, ir.F32Type.get())
- # use linalg.generic implementation
- generic_map = ir.AffineMap.get_permutation([0, 1, 2])
- zero_fill = linalg.GenericOp(
- [tensor_type],
- [],
- [output],
- ir.ArrayAttr.get(
- [ir.AffineMapAttr.get(generic_map.get_submap([0, 1, 2]))]
- ),
- ir.ArrayAttr.get(
- [ir.Attribute.parse("#linalg.iterator_type")] * 3
- ),
- )
- block = ir.Block.create_at_start(
- zero_fill.region,
- [ir.RankedTensorType(output.result.type).element_type],
- )
- zero_op = arith.ConstantOp(
- ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0)
- )
- block.append(zero_op)
- block.append(linalg.YieldOp([zero_op.result]))
- op = linalg.batch_matmul(input1, input2, outs=[zero_fill.result])
+ output_shape = list(node.tensor_meta["shape"])
+ dtype = node.tensor_meta["dtype"]
+ mlir_dtype = mlir_element_type_get(dtype)
+ tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+ element = mlir_element_attr_get(dtype, 0)
+ attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+ zero_fill = arith.ConstantOp(tensor_type, attr).result
+ op = linalg.batch_matmul(input1, input2, outs=[zero_fill])

return op


def div_op(
- node: torch.fx.Node,
+ node: DivOp,
symbol_table: Dict[Tuple[str, int], ir.Operation],
):
"""
Import the tensor division operation.
- From PyTorch `aten.div.Tensor` operator to MLIR linalg `generic` operation.
+ From buddy DivOp to MLIR TOSA `mul` and `reciprocal` operations.

Note: This operation computes the elementwise division of the two inputs.

Args:
@@ -1861,258 +1499,38 @@ def div_op(
node: Containing information from the input graph node.
symbol_table: A dictionary mapping symbols to their corresponding
operations.

Returns:
op: The result of the tosa.mul operation.
""" assert len(node.args) == 2 - if isinstance(node.args[0], torch.fx.Node): - input1 = symbol_table.get((str(node.args[0]), 0)) - else: - input1 = node.args[0] - - if isinstance(node.args[1], torch.fx.Node): + input1 = symbol_table.get((str(node.args[0]), 0)) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + shape = list(node.tensor_meta["shape"]) + if isinstance(node.args[1], str): input2 = symbol_table.get((str(node.args[1]), 0)) else: - input2 = node.args[1] - + data = [node.args[1]] + input2_shape = numpy.array(data).shape + tensor_type = ir.RankedTensorType.get(input2_shape, mlir_dtype) + element = mlir_element_attr_get(dtype, node.args[1]) + attr = ir.DenseElementsAttr.get_splat(tensor_type, element) + input2 = arith.ConstantOp(tensor_type, attr).result if input1 is None or input2 is None: return - - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - - if isinstance(node.args[0], torch.fx.Node): - if dtype == "torch.float32": - if not isinstance(node.args[1], torch.fx.Node): - input2 = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), input2) - ) - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - divf_op = arith.DivFOp(block.arguments[0], input2.result) - block.append(divf_op) - block.append(linalg.YieldOp([divf_op.result])) - else: - tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - input1_shape = list(ir.RankedTensorType(input1.type).shape) - if input1_shape != output_shape: - dims = [] - for i in range(len(input1_shape) - 1, -1, -1): - if ( - input1_shape[i] - != output_shape[ - len(output_shape) - (len(input1_shape) - i) - ] - ): - dims.append(i) - output1 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input1_map = [ - i - for i in range( - len(output_shape) - len(input1_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input1_map[i] = len(output_shape) + index - input1_map = generic_map.get_submap(input1_map) - input1_op = linalg.GenericOp( - [tensor_type], - [input1], - [output1], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input1_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - 
block.append(linalg.YieldOp([block.arguments[0]])) - input1 = input1_op.result - - input2_shape = list(ir.RankedTensorType(input2.type).shape) - if input2_shape != output_shape: - dims = [] - for i in range(len(input2_shape) - 1, -1, -1): - if ( - input2_shape[i] - != output_shape[ - len(output_shape) - (len(input2_shape) - i) - ] - ): - dims.append(i) - output2 = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape) + len(dims))] - ) - input2_map = [ - i - for i in range( - len(output_shape) - len(input2_shape), - len(output_shape), - ) - ] - for index, i in enumerate(dims): - input2_map[i] = len(output_shape) + index - input2_map = generic_map.get_submap(input2_map) - input2_op = linalg.GenericOp( - [tensor_type], - [input2], - [output2], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input2_map), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(output_shape) - + [ - ir.Attribute.parse( - "#linalg.iterator_type" - ) - ] - * len(dims) - ), - ) - block = ir.Block.create_at_start( - input2_op.region, - [ - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType( - output.result.type - ).element_type, - ], - ) - block.append(linalg.YieldOp([block.arguments[0]])) - input2 = input2_op.result - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1, input2], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(input2.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - divf_op = arith.DivFOp(block.arguments[0], block.arguments[1]) - block.append(divf_op) - block.append(linalg.YieldOp([divf_op.result])) - - return op + div_result_tensor_type = ir.RankedTensorType.get(shape, mlir_dtype) + op = tosa.MulOp( + div_result_tensor_type, + input1, + tosa.ReciprocalOp(input2.type, input2).result, + ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0), + ) + return op.result def softmax_op( - node: torch.fx.Node, + node: SoftmaxOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor softmax operation. - From PyTorch `aten._softmax.default` operator to MLIR linalg `generic` - operation. + From buddy SoftmaxOp to MLIR linalg `generic` operation. Note: This op, compute input node's softmax result. 
Args: @@ -2129,266 +1547,109 @@ def softmax_op( dim = int(node.args[1]) if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] if dim < 0: dim += len(output_shape) - if dtype == "torch.float32": - max_tensor_shape = copy.deepcopy(output_shape) - max_tensor_shape[dim] = 1 - max_tensor_type = ir.RankedTensorType.get( - max_tensor_shape, ir.F32Type.get() - ) - max_tensor = tensor.EmptyOp(max_tensor_shape, ir.F32Type.get()) - max_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(max_tensor_shape)) - ] - max_tensor_map = ir.AffineMap.get( - len(max_tensor_shape), 0, max_tensor_map - ) - neg_inf_fill = linalg.GenericOp( - [max_tensor_type], - [], - [max_tensor], - ir.ArrayAttr.get([ir.AffineMapAttr.get(max_tensor_map)]), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(max_tensor_shape) - ), - ) - block = ir.Block.create_at_start( - neg_inf_fill.region, - [ir.RankedTensorType(max_tensor.result.type).element_type], - ) - neg_inf_op = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), float("-inf")) - ) - block.append(neg_inf_op) - block.append(linalg.YieldOp([neg_inf_op.result])) - - input1_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) - max_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - max_tensor_map[dim] = ir.AffineExpr.get_constant(0) - max_tensor_map = ir.AffineMap.get(len(output_shape), 0, max_tensor_map) - loop_type = [ - ir.Attribute.parse("#linalg.iterator_type") - ] * len(output_shape) - loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type") - max_tensor_op = linalg.GenericOp( - [max_tensor_type], - [input1], - [neg_inf_fill], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get(max_tensor_map), - ] - ), - ir.ArrayAttr.get(loop_type), - ) - block = ir.Block.create_at_start( - max_tensor_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(neg_inf_fill.result.type).element_type, - ], - ) - max_op = arith.MaximumFOp(block.arguments[0], block.arguments[1]) - block.append(max_op) - block.append(linalg.YieldOp([max_op.result])) - - exp_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get()) - exp_tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - input1_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map) - max_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - max_tensor_map[dim] = ir.AffineExpr.get_constant(0) - max_tensor_map = ir.AffineMap.get(len(output_shape), 0, max_tensor_map) - exp_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map) - exp_tensor_op = linalg.GenericOp( - [exp_tensor_type], - [input1, max_tensor_op.result], - [exp_tensor], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(input1_map), - ir.AffineMapAttr.get(max_tensor_map), - ir.AffineMapAttr.get(exp_tensor_map), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - exp_tensor_op.region, - [ - ir.RankedTensorType(input1.type).element_type, - 
ir.RankedTensorType(max_tensor_op.result.type).element_type,
- ir.RankedTensorType(exp_tensor.result.type).element_type,
- ],
- )
- sub_op = arith.SubFOp(block.arguments[0], block.arguments[1])
- exp_op = math.ExpOp(sub_op.result)
- block.append(sub_op)
- block.append(exp_op)
- block.append(linalg.YieldOp([exp_op.result]))
-
- reduce_sum_tensor_shape = copy.deepcopy(output_shape)
- reduce_sum_tensor_shape[dim] = 1
- reduce_sum_tensor = tensor.EmptyOp(
- reduce_sum_tensor_shape, ir.F32Type.get()
- )
- reduce_sum_tensor_type = ir.RankedTensorType.get(
- reduce_sum_tensor_shape, ir.F32Type.get()
- )
- reduce_sum_tensor_map = [
- ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
- ]
- reduce_sum_tensor_map = ir.AffineMap.get(
- len(output_shape), 0, reduce_sum_tensor_map
- )
- zero_fill_op = linalg.GenericOp(
- [reduce_sum_tensor_type],
- [],
- [reduce_sum_tensor.result],
- ir.ArrayAttr.get([ir.AffineMapAttr.get(reduce_sum_tensor_map)]),
- ir.ArrayAttr.get(
- [ir.Attribute.parse("#linalg.iterator_type")]
- * len(output_shape)
- ),
- )
- block = ir.Block.create_at_start(
- zero_fill_op.region,
- [ir.RankedTensorType(reduce_sum_tensor.result.type).element_type],
- )
- zero_op = arith.ConstantOp(
- ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 0)
- )
- block.append(zero_op)
- block.append(linalg.YieldOp([zero_op.result]))
-
- reduce_sum_tensor_shape = copy.deepcopy(output_shape)
- reduce_sum_tensor_shape[dim] = 1
- reduce_sum_tensor_type = ir.RankedTensorType.get(
- reduce_sum_tensor_shape, ir.F32Type.get()
- )
- exp_tensor_map = [
- ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
- ]
- exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map)
- reduce_sum_tensor_map = [
- ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
- ]
- reduce_sum_tensor_map[dim] = ir.AffineExpr.get_constant(0)
- reduce_sum_tensor_map = ir.AffineMap.get(
- len(output_shape), 0, reduce_sum_tensor_map
- )
- loop_type = [
- ir.Attribute.parse("#linalg.iterator_type")
- ] * len(output_shape)
- loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type")
- reduce_sum_tensor_op = linalg.GenericOp(
- [reduce_sum_tensor_type],
- [exp_tensor_op.result],
- [zero_fill_op.result],
- ir.ArrayAttr.get(
- [
- ir.AffineMapAttr.get(exp_tensor_map),
- ir.AffineMapAttr.get(reduce_sum_tensor_map),
- ]
- ),
- ir.ArrayAttr.get(loop_type),
- )
- block = ir.Block.create_at_start(
- reduce_sum_tensor_op.region,
+ mlir_dtype = mlir_element_type_get(dtype)
+ sum_tensor_shape = copy.deepcopy(output_shape)
+ sum_tensor_shape[dim] = 1
+ sum_tensor_type = ir.RankedTensorType.get(sum_tensor_shape, mlir_dtype)
+ element = mlir_element_attr_get(dtype, 0)
+ attr = ir.DenseElementsAttr.get_splat(sum_tensor_type, element)
+ sum_tensor = arith.ConstantOp(sum_tensor_type, attr).result
+ input1_map = [ir.AffineExpr.get_dim(i) for i in range(len(output_shape))]
+ input1_map = ir.AffineMap.get(len(output_shape), 0, input1_map)
+ sum_tensor_map = [
+ ir.AffineExpr.get_dim(i) for i in range(len(output_shape))
+ ]
+ sum_tensor_map[dim] = ir.AffineExpr.get_constant(0)
+ sum_tensor_map = ir.AffineMap.get(len(output_shape), 0, sum_tensor_map)
+ loop_type = [ir.Attribute.parse("#linalg.iterator_type")] * len(
+ output_shape
+ )
+
loop_type[dim] = ir.Attribute.parse("#linalg.iterator_type") + sum_tensor_op = linalg.GenericOp( + [sum_tensor_type], + [input1], + [sum_tensor], + ir.ArrayAttr.get( [ - ir.RankedTensorType(exp_tensor_op.result.type).element_type, - ir.RankedTensorType(zero_fill_op.result.type).element_type, - ], - ) - add_op = arith.AddFOp(block.arguments[0], block.arguments[1]) - block.append(add_op) - block.append(linalg.YieldOp([add_op.result])) - - reduce_sum_tensor_shape = copy.deepcopy(output_shape) - reduce_sum_tensor_shape[dim] = 1 - result_tensor_type = ir.RankedTensorType.get( - output_shape, ir.F32Type.get() - ) - result_tensor = tensor.EmptyOp(output_shape, ir.F32Type.get()) - exp_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - exp_tensor_map = ir.AffineMap.get(len(output_shape), 0, exp_tensor_map) - reduce_sum_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - reduce_sum_tensor_map[dim] = ir.AffineExpr.get_constant(0) - reduce_sum_tensor_map = ir.AffineMap.get( - len(output_shape), 0, reduce_sum_tensor_map - ) - result_tensor_map = [ - ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) - ] - result_tensor_map = ir.AffineMap.get( - len(output_shape), 0, result_tensor_map - ) - op = linalg.GenericOp( - [result_tensor_type], - [exp_tensor_op.result, reduce_sum_tensor_op.result], - [result_tensor.result], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get(exp_tensor_map), - ir.AffineMapAttr.get(reduce_sum_tensor_map), - ir.AffineMapAttr.get(result_tensor_map), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + ir.AffineMapAttr.get(input1_map), + ir.AffineMapAttr.get(sum_tensor_map), + ] + ), + ir.ArrayAttr.get(loop_type), + ) + block = ir.Block.create_at_start( + sum_tensor_op.region, + [ + mlir_dtype, + mlir_dtype, + ], + ) + exp_op = math.ExpOp(block.arguments[0]) + add_op = arith.AddFOp(exp_op.result, block.arguments[1]) + block.append(exp_op) + block.append(add_op) + block.append(linalg.YieldOp([add_op.result])) + result_tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + result_tensor = tensor.EmptyOp(output_shape, mlir_dtype) + result_tensor_map = [ + ir.AffineExpr.get_dim(i) for i in range(len(output_shape)) + ] + result_tensor_map = ir.AffineMap.get( + len(output_shape), 0, result_tensor_map + ) + op = linalg.GenericOp( + [result_tensor_type], + [input1, sum_tensor_op.result], + [result_tensor.result], + ir.ArrayAttr.get( [ - ir.RankedTensorType(exp_tensor_op.result.type).element_type, - ir.RankedTensorType( - reduce_sum_tensor_op.result.type - ).element_type, - ir.RankedTensorType(result_tensor.result.type).element_type, - ], - ) - div_op = arith.DivFOp(block.arguments[0], block.arguments[1]) - block.append(div_op) - block.append(linalg.YieldOp([div_op.result])) + ir.AffineMapAttr.get(input1_map), + ir.AffineMapAttr.get(sum_tensor_map), + ir.AffineMapAttr.get(result_tensor_map), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + mlir_dtype, + mlir_dtype, + mlir_dtype, + ], + ) + exp_op = math.ExpOp(block.arguments[0]) + div_op = arith.DivFOp(exp_op.result, block.arguments[1]) + block.append(exp_op) + block.append(div_op) + block.append(linalg.YieldOp([div_op.result])) return op def clone_op( - node: torch.fx.Node, + node: CloneOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ 
Import the tensor clone operation. - From PyTorch `aten.clone.default` operator to MLIR tensor `extract_slice` + From buddy CloneOp to MLIR tensor `extract_slice` operation. Note: This op, clone input tensor to a new tensor. @@ -2405,31 +1666,29 @@ def clone_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - offset = [0 for x in output_shape] - offset_attr = ir._denseI64ArrayAttr(offset, None) - size_attr = ir._denseI64ArrayAttr(output_shape, None) - stride = [1 for x in output_shape] - stride_attr = ir._denseI64ArrayAttr(stride, None) - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - - op = tensor.ExtractSliceOp( - tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr - ) + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + offset = [0 for x in output_shape] + offset_attr = ir._denseI64ArrayAttr(offset, None) + size_attr = ir._denseI64ArrayAttr(output_shape, None) + stride = [1 for x in output_shape] + stride_attr = ir._denseI64ArrayAttr(stride, None) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + op = tensor.ExtractSliceOp( + tensor_type, input1, [], [], [], offset_attr, size_attr, stride_attr + ) return op def silu_op( - node: torch.fx.Node, + node: SiluOp, symbol_table: Dict[Tuple[str, int], ir.Operation], ): """ Import the tensor silu activation operation. - From PyTorch `aten.silu.default` operator to MLIR linalg `generic` - operation. + From Buddy SiluOp to MLIR linalg `generic` operation. Note: This op, compute input node's silu activation result. Args: @@ -2445,63 +1704,61 @@ def silu_op( if input1 is None: return - output_shape = list(node.meta["tensor_meta"].shape) - dtype = str(node.meta["tensor_meta"].dtype) - if dtype == "torch.float32": - tensor_type = ir.RankedTensorType.get(output_shape, ir.F32Type.get()) - output = tensor.EmptyOp(output_shape, ir.F32Type.get()) - generic_map = ir.AffineMap.get_permutation( - [i for i in range(len(output_shape))] - ) - op = linalg.GenericOp( - [tensor_type], - [input1], - [output], - ir.ArrayAttr.get( - [ - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ir.AffineMapAttr.get( - generic_map.get_submap( - [i for i in range(len(output_shape))] - ) - ), - ] - ), - ir.ArrayAttr.get( - [ir.Attribute.parse("#linalg.iterator_type")] - * len(output_shape) - ), - ) - block = ir.Block.create_at_start( - op.region, + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1], + [output], + ir.ArrayAttr.get( [ - ir.RankedTensorType(input1.type).element_type, - ir.RankedTensorType(output.result.type).element_type, - ], - ) - neg_op = arith.NegFOp(block.arguments[0]) - exp_op = math.ExpOp(neg_op.result) - one_op = arith.ConstantOp( - ir.F32Type.get(), ir.FloatAttr.get(ir.F32Type.get(), 1) - ) - add_op = arith.AddFOp(one_op.result, exp_op.result) - div_op = arith.DivFOp(block.arguments[0], add_op.result) - block.append(neg_op) - block.append(exp_op) - block.append(one_op) - block.append(add_op) - block.append(div_op) - 
block.append(linalg.YieldOp([div_op.result]))
+ ir.AffineMapAttr.get(
+ generic_map.get_submap(
+ [i for i in range(len(output_shape))]
+ )
+ ),
+ ir.AffineMapAttr.get(
+ generic_map.get_submap(
+ [i for i in range(len(output_shape))]
+ )
+ ),
+ ]
+ ),
+ ir.ArrayAttr.get(
+ [ir.Attribute.parse("#linalg.iterator_type")]
+ * len(output_shape)
+ ),
+ )
+ block = ir.Block.create_at_start(
+ op.region,
+ [
+ ir.RankedTensorType(input1.type).element_type,
+ ir.RankedTensorType(output.result.type).element_type,
+ ],
+ )
+ neg_op = arith.NegFOp(block.arguments[0])
+ exp_op = math.ExpOp(neg_op.result)
+ one_op = arith.ConstantOp(mlir_dtype, mlir_element_attr_get(dtype, 1))
+ add_op = arith.AddFOp(one_op.result, exp_op.result)
+ div_op = arith.DivFOp(block.arguments[0], add_op.result)
+ block.append(neg_op)
+ block.append(exp_op)
+ block.append(one_op)
+ block.append(add_op)
+ block.append(div_op)
+ block.append(linalg.YieldOp([div_op.result]))

return op


def param_extract(
- node: torch.fx.Node,
+ node: PlaceholderOp,
offset,
params_mlir_node,
):
@@ -2519,12 +1776,12 @@ def param_extract(
op: The operation that returns the tensor.expand_shape op.
"""
dtype_mapping = {
- torch.float32: ir.F32Type.get(),
- torch.int64: ir.IntegerType.get_signless(64),
+ TensorDType.Float32: ir.F32Type.get(),
+ TensorDType.Int64: ir.IntegerType.get_signless(64),
}
- tensor_element_type = dtype_mapping[node.meta["tensor_meta"].dtype]
- output_shape = list(node.meta["tensor_meta"].shape)
- extract_size = functools.reduce(lambda x, y: x * y, output_shape)
+ tensor_element_type = dtype_mapping[node.tensor_meta["dtype"]]
+ output_shape = list(node.tensor_meta["shape"])
+ extract_size = functools.reduce(lambda x, y: x * y, output_shape, 1)
offset_attr = ir._denseI64ArrayAttr([offset], None)
size_attr = ir._denseI64ArrayAttr([extract_size], None)
stride = [1]
@@ -2540,7 +1797,7 @@ def param_extract(
size_attr,
stride_attr,
)
- if len(output_shape) == 1:
+ if len(output_shape) == 1 or len(output_shape) == 0:
return extract_slice_op
tensor_type = ir.RankedTensorType.get(output_shape, tensor_element_type)
axis = ir.ArrayAttr.get(
@@ -2553,36 +1810,123 @@
axis = ir.ArrayAttr.get([axis], None)
return tensor.ExpandShapeOp(tensor_type, extract_slice_op.result, axis)

+def where_op(
+ node: WhereOp,
+ symbol_table: Dict[Tuple[str, int], ir.Operation],
+):
+ """
+ Import the tensor where operation.
+ From Buddy WhereOp to MLIR linalg `generic` operation.
+
+ Note: This operation selects between the second and third inputs
+ element-wise, according to the condition tensor.
+ Args:
+ node: Containing information from the input graph node.
+ symbol_table: A dictionary mapping symbols to their corresponding
+ operations.
+
+ Returns:
+ op: The linalg.generic op that computes the selection.
+ """ + assert len(node.args) == 3 + input1 = symbol_table.get((str(node.args[0]), 0)) + input2 = symbol_table.get((str(node.args[1]), 0)) + input3 = symbol_table.get((str(node.args[2]), 0)) + if input1 is None or input2 is None or input3 is None: + return + + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + output = tensor.EmptyOp(output_shape, mlir_dtype) + generic_map = ir.AffineMap.get_permutation( + [i for i in range(len(output_shape))] + ) + op = linalg.GenericOp( + [tensor_type], + [input1, input3], + [output], + ir.ArrayAttr.get( + [ + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ir.AffineMapAttr.get( + generic_map.get_submap( + [i for i in range(len(output_shape))] + ) + ), + ] + ), + ir.ArrayAttr.get( + [ir.Attribute.parse("#linalg.iterator_type")] + * len(output_shape) + ), + ) + block = ir.Block.create_at_start( + op.region, + [ + ir.RankedTensorType(input1.type).element_type, + ir.RankedTensorType(input3.type).element_type, + ir.RankedTensorType(output.result.type).element_type, + ], + ) + select_op = arith.SelectOp(block.arguments[0], input2, block.arguments[1]) + block.append(select_op) + block.append(linalg.YieldOp([select_op.result])) + + return op + +def scalar_tensor_op(node: ScalarTensorOp, symbol_table): + """ + Import the tensor Scalar_Tensor operation. + From Buddy ScalarTensorOp to MLIR arith `ConstantOp` operation. + """ + assert len(node.args) == 1 + dtype = node.tensor_meta["dtype"] + attr = mlir_element_attr_get(dtype, node.args[0]) + op = arith.ConstantOp(dtype, attr) + + return op ops_registry = { - "arange.start": arange_op, - "arange.default": arange_op, - "unsqueeze.default": unsqueeze_op, - "view.default": view_op, - "ones.default": ones_op, - "full.default": full_op, - "lt.Tensor": lt_op, - "embedding.default": embedding_op, - "masked_fill.Scalar": masked_fill_op, - "slice.Tensor": slice_op, - "expand.default": expand_op, - "_to_copy.default": to_copy_op, - "rsub.Scalar": rsub_op, - "pow.Tensor_Scalar": pow_op, - "mean.dim": mean_op, - "rsqrt.default": rsqrt_op, - "mul.Tensor": mul_op, - "t.default": t_op, - "mm.default": matmul_op, - "transpose.int": transpose_op, - "index.Tensor": index_op, - "neg.default": neg_op, - "cat.default": cat_op, - "squeeze.dim": squeeze_op, - "bmm.default": batch_matmul_op, - "div.Tensor": div_op, - "_softmax.default": softmax_op, - "clone.default": clone_op, - "silu.default": silu_op, "param.extract": param_extract, + "MatmulOp": matmul_op, + "ArangeOp": arange_op, + "UnsqueezeOp": unsqueeze_op, + "ViewOp": view_op, + "EmbeddingOp": embedding_op, + "OnesOp": ones_op, + "FullOp": full_op, + "LessThanOp": lt_op, + "MaskedFillOp": masked_fill_op, + "SliceOp": slice_op, + "ExpandOp": expand_op, + "ToCopyOp": to_copy_op, + "RsubOp": rsub_op, + "PowOp": pow_op, + "MeanOp": mean_op, + "RsqrtOp": rsqrt_op, + "MulOp": mul_op, + "TOp": t_op, + "TransposeOp": transpose_op, + "IndexOp": index_op, + "NegOp": neg_op, + "CatOp": cat_op, + "SqueezeOp": squeeze_op, + "BatchMatmulOp": batch_matmul_op, + "DivOp": div_op, + "SoftmaxOp": softmax_op, + "CloneOp": clone_op, + "SiluOp": silu_op, + "AddOp": add_op, + "WhereOp": where_op, + "ScalarTensorOp": scalar_tensor_op, } diff --git a/frontend/Python/ops/math.py b/frontend/Python/ops/math.py index 
7e2de80b5f..19820c2b3b 100644
--- a/frontend/Python/ops/math.py
+++ b/frontend/Python/ops/math.py
@@ -22,11 +22,16 @@


def erf_op(node, symbol_table):
- input_ = symbol_table.get((str(node.args[0]), 0))
- op = math.ErfOp(input_)
+ input_tensor = symbol_table.get((str(node.args[0]), 0))
+ op = math.ErfOp(input_tensor)
return op


+def sqrt_op(node, symbol_table):
+ input_tensor = symbol_table.get((str(node.args[0]), 0))
+ return math.SqrtOp(input_tensor)
+
+
ops_registry = {
- "erf.default": erf_op,
+ "ErfOp": erf_op,
+ "SqrtOp": sqrt_op,
}
diff --git a/frontend/Python/ops/tosa.py b/frontend/Python/ops/tosa.py
index bf957002a9..8a0997a3a0 100644
--- a/frontend/Python/ops/tosa.py
+++ b/frontend/Python/ops/tosa.py
@@ -14,17 +14,52 @@
#
# ===---------------------------------------------------------------------------
#
-# The registry of mappings from Torch node to MLIR tosa dialect operations.
+# The registry of mappings from Buddy Graph to MLIR tosa dialect operations.
#
# ===---------------------------------------------------------------------------

-import torch
import array
from typing import Dict, List, Tuple, Union

+import numpy
import mlir.ir as ir
from mlir.dialects import tensor, tosa

+from ..graph import TensorDType
+from ..graph import (
+ AddOp,
+ PermuteOp,
+ AddMMOp,
+ BatchMatmulOp,
+ SubOp,
+ MulOp,
+ DivOp,
+ TanhOp,
+ ExpOp,
+ RsqrtOp,
+ AmaxOp,
+ ReshapeOp,
+ UnsqueezeOp,
+ SelectOp,
+ SliceOp,
+ ConvertElementTypeOp,
+ CloneOp,
+ VarMeanOp,
+ EmbeddingOp,
+ ExpandOp,
+ SumDimOp,
+ TOp,
+ TransposeOp,
+ MaxPool2dOp,
+ Conv2dOp,
+ ReluOp,
+ IotaOp,
+ SigmoidOp,
+ ReciprocalOp,
+ MeanOp,
+)
+from .utils import *
+

def _normalize_binary_operator_shape(shp1, shp2):
"""Normalize the shape of two input tensors according to the broadcasting
@@ -75,9 +110,8 @@ def _gen_arith_binary_op(input1, input2, op_func):
def _scalar_to_tensor(
scalar: Union[float, int], element_type: ir.Type, shape: List[int]
):
- """PyTorch allow the binary operation between tensor and scalar. But MLIR
- does not.
- So we need to convert scalars to the corresponding tensors."""
+ """Convert scalars to corresponding tensors since MLIR
+ doesn't support operations between scalars and tensors."""
element = (
ir.FloatAttr.get(element_type, float(scalar))
if str(element_type) == "f32"
@@ -128,11 +162,11 @@ def _normalize_binary_operator_args(arg1, arg2):


def addmm_op(
- node, symbol_table: Dict[Tuple[str, int], ir.Operation]
+ node: AddMMOp, symbol_table: Dict[Tuple[str, int], ir.Operation]
) -> ir.Operation:
"""
Import matrix multiplication operation.
- From PyTorch `aten.addmm.default` operator to MLIR TOSA `matmul` operation.
+ From buddy graph ir's `AddMMOp` operator to MLIR TOSA `matmul` operation.

Note: this function first reshapes the input matrices to 3D tensors
(since tosa.MatMulOp requires it). Then it multiplies these reshaped
@@ -146,8 +180,7 @@ def addmm_op(

Returns:
op: The operation representing the result of adding the matrix
- multiplication
- to the input tensor.
+ multiplication to the input tensor.
"""
# get input
input_ = symbol_table.get((str(node.args[0]), 0))
@@ -184,10 +217,11 @@ def addmm_op(
return op


-def bmm_op(node, symbol_table) -> ir.Operation:
+def bmm_op(node: BatchMatmulOp, symbol_table) -> ir.Operation:
"""
Import batch matrix multiplication operation.
- From PyTorch `aten.bmm.default` operator to MLIR TOSA `matmul` operation.
+ From buddy graph ir's `BatchMatmulOp` operator to MLIR TOSA `matmul`
+ operation.
""" input_ = symbol_table.get((str(node.args[0]), 0)) mat2 = symbol_table.get((str(node.args[1]), 0)) @@ -200,30 +234,30 @@ def bmm_op(node, symbol_table) -> ir.Operation: return op -def add_op(node, symbol_table): +def add_op(node: AddOp, symbol_table): """ Import tensor addition operation. - From PyTorch `aten.add.Tensor` operator to MLIR TOSA `add` operation. + From buddy graph ir's `AddOp` operator to MLIR TOSA `add` operation. """ input1 = symbol_table.get((str(node.args[0]), 0), node.args[0]) input2 = symbol_table.get((str(node.args[1]), 0), node.args[1]) return _gen_arith_binary_op(input1, input2, tosa.AddOp) -def sub_op(node, symbol_table): +def sub_op(node: SubOp, symbol_table): """ Import tensor subtraction operation. - From PyTorch `aten.sub.Tensor` operator to MLIR TOSA `sub` operation. + From buddy graph ir's `SubOp` operator to MLIR TOSA `sub` operation. """ input1 = symbol_table.get((str(node.args[0]), 0), node.args[0]) input2 = symbol_table.get((str(node.args[1]), 0), node.args[1]) return _gen_arith_binary_op(input1, input2, tosa.SubOp) -def mul_op(node, symbol_table): +def mul_op(node: MulOp, symbol_table): """ - Import tensor multiplication operation. - From PyTorch `aten.mul.Tensor` operator to MLIR TOSA `mul` operation. + Import tensor division operation. + From buddy graph ir's `DivOp` operator to MLIR TOSA `div` operation. """ def _inner_op(result_type, input1, input2): @@ -240,10 +274,10 @@ def _inner_op(result_type, input1, input2): return _gen_arith_binary_op(input1, input2, _inner_op) -def div_op(node, symbol_table): +def div_op(node: DivOp, symbol_table): """ Import tensor division operation. - From PyTorch `aten.div.Tensor` operator to MLIR TOSA `div` operation. + From buddy graph ir's `DivOp` operator to MLIR TOSA `div` operation. """ def _inner_op(result_type, input1, input2): @@ -260,10 +294,10 @@ def _inner_op(result_type, input1, input2): return _gen_arith_binary_op(input1, input2, _inner_op) -def tanh_op(node, symbol_table): +def tanh_op(node: TanhOp, symbol_table): """ Import elementwise tanh operation. - From PyTorch `aten.tanh.default` operator to MLIR TOSA `tanh` operation. + From buddy graph ir's `TanhOp` operator to MLIR TOSA `tanh` operation. """ input1 = symbol_table.get((str(node.args[0]), 0)) sizes = ir.RankedTensorType(input1.type).shape @@ -273,10 +307,10 @@ def tanh_op(node, symbol_table): return op -def exp_op(node, symbol_table): +def exp_op(node: ExpOp, symbol_table): """ Import elementwise exponential operation. - From PyTorch `aten.exp.default` operator to MLIR TOSA `exp` operation. + From buddy graph ir's `ExpOp` operator to MLIR TOSA `exp` operation. """ input1 = symbol_table.get((str(node.args[0]), 0)) sizes = ir.RankedTensorType(input1.type).shape @@ -286,10 +320,10 @@ def exp_op(node, symbol_table): return op -def rsqrt_op(node, symbol_table): +def rsqrt_op(node: RsqrtOp, symbol_table): """ Import elementwise reciprocal square root operation. - From PyTorch `aten.rsqrt.default` operator to MLIR TOSA `rsqrt` operation. + From buddy graph ir's `RsqrtOp` operator to MLIR TOSA `rsqrt` operation. """ input1 = symbol_table.get((str(node.args[0]), 0)) sizes = ir.RankedTensorType(input1.type).shape @@ -301,15 +335,11 @@ def rsqrt_op(node, symbol_table): return op -def amax_op(node, symbol_table): +def amax_op(node: AmaxOp, symbol_table): """ Import the amax operation. - From PyTorch `aten.amax.default` operator to MLIR TOSA `reduce_max` + From buddy graph ir's `AmaxOp` operator to MLIR TOSA `reduce_max` operation. 
- - Note: This conversion function returns the maximum value of each slice - of the input tensor in the given dimension(s). This is consistent - with PyTorch's `torch.amax` operator. """ input1 = symbol_table.get((str(node.args[0]), 0)) dim_val = node.args[1][0] @@ -321,10 +351,10 @@ def amax_op(node, symbol_table): return op -def reshape_op(node, symbol_table): +def reshape_op(node: ReshapeOp, symbol_table): """ Import the reshape operation. - From PyTorch `aten.reshape.default` operator to MLIR TOSA `reshape` + From buddy graph ir's `ReshapeOp` operator to MLIR TOSA `reshape` operation. Note: If the new shape contains one and only one `-1`, the size of the new @@ -362,34 +392,30 @@ def reshape_op(node, symbol_table): return op -def unsqueeze_op(node, symbol_table): +def unsqueeze_op(node: UnsqueezeOp, symbol_table): """ Import the unsqueeze operation. - From PyTorch `aten.unsqueeze.default` operator to MLIR TOSA `reshape` + From buddy graph ir's `UnsqueezeOp` operator to MLIR TOSA `reshape` operation. - - Note: "unsqueeze" means inserting a new dimension of size 1 at the specified - position. For more information, please refer to - https://pytorch.org/docs/stable/generated/torch.unsqueeze.html """ input_tensor = symbol_table.get((str(node.args[0]), 0)) dim = node.args[1] sizes = ir.RankedTensorType(input_tensor.type).shape - sizes.insert(dim, 1) + if dim == -1: + sizes.append(1) + else: + sizes.insert(dim, 1) new_shape_content = array.array("i", sizes) new_shape_content = memoryview(new_shape_content) op = tosa.ReshapeOp(input_tensor, new_shape_content) return op -def select_op(node, symbol_table): +def select_op(node: SelectOp, symbol_table): """ Import the select operation. - From PyTorch `aten.select.int` operator to MLIR TOSA `reshape` operation. - - Note: "select" means slicing the input tensor along the selected dimension - at the given index. For more information, please refer to - https://pytorch.org/docs/stable/generated/torch.select.html + From buddy graph ir's `SelectOp` operator to MLIR TOSA `reshape` + operation. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) dim = node.args[1] @@ -416,14 +442,11 @@ def select_op(node, symbol_table): return op -def slice_op(node, symbol_table): +def slice_op(node: SliceOp, symbol_table): """ Import the slice operation. - From PyTorch `aten.slice.Tensor` operator to MLIR tensor `extract_slice` + From buddy graph ir's `SliceOp` operator to MLIR TOSA `extract_slice` operation. - - Note: "slice" means slicing the input tensor along the selected dimension - from a given start index to an end index. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) dim = node.args[1] @@ -477,17 +500,19 @@ def slice_op(node, symbol_table): return op -def convert_element_type_op(node, symbol_table): +def convert_element_type_op(node: ConvertElementTypeOp, symbol_table): """ Import the element type conversion operation. - From PyTorch `prims.convert_element_type.default` operator to - MLIR TOSA `cast` operation. + From buddy graph ir's `ConvertElementTypeOp` operator to MLIR TOSA + `cast` operation. 
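+
+ Note: the mapping below currently covers f64, f32, and f16 floating-point
+ targets plus signless i32 and i1 (bool) integer targets.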
""" - # maintain a mapping of torch types and mlir types + # maintain a mapping of buddy dtype to mlir types types_mapping = { - torch.float64: ir.F64Type.get(), - torch.float32: ir.F32Type.get(), - torch.float16: ir.F16Type.get(), + TensorDType.Float64: ir.F64Type.get(), + TensorDType.Float32: ir.F32Type.get(), + TensorDType.Float16: ir.F16Type.get(), + TensorDType.Int32: ir.IntegerType.get_signless(32), + TensorDType.Bool: ir.IntegerType.get_signless(1), } input_tensor = symbol_table.get((str(node.args[0]), 0)) to_cast_type = types_mapping[node.args[1]] @@ -496,13 +521,13 @@ def convert_element_type_op(node, symbol_table): return tosa.CastOp(output_type, input_tensor) -def clone_op(node, symbol_table): +def clone_op(node: CloneOp, symbol_table): """ Import the clone operation. - From PyTorch `aten.clone.default` operator to MLIR TOSA `identity` + From buddy graph ir's `CloneOp` operator to MLIR TOSA `identity` operation. - Note: Since MLIR follow the SSA form, when using the `identity` operation, + Note: Since MLIR follows the SSA form, when using the `identity` operation, we actually deep-copies the original tensor. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) @@ -513,13 +538,16 @@ def clone_op(node, symbol_table): return tosa.IdentityOp(output_type, input_tensor) -def var_mean_op(node, symbol_table): +def var_mean_op(node: VarMeanOp, symbol_table): """ Import the variance & mean operation. - From PyTorch `aten.var_mean.default` operator to two MLIR TOSA `mul` + From buddy graph ir's `VarMeanOp` operator to two MLIR TOSA `mul` operation. - Note: The conversion procedure can be splited into two steps: + Note: By now, this conversion function follows PyTorch's `var_mean` + semantic. + + The conversion procedure can be splited into two steps: 1. In the first part, we calculate the mean value along the given dimension(s) in `mean_dim_op` function. We first reduce the input tensor along the given dimension(s) using tosa's `reduce_sum` @@ -667,10 +695,10 @@ def var_dim_op( return var_op, mean_op -def permute_op(node, symbol_table): +def permute_op(node: PermuteOp, symbol_table): """ Import the permute operation. - From PyTorch `aten.permute.default` operator to MLIR TOSA `transpose` + From buddy graph ir's `PermuteOp` operator to MLIR TOSA `transpose` operation. """ input_tensor = symbol_table.get((str(node.args[0]), 0)) @@ -693,10 +721,10 @@ def permute_op(node, symbol_table): return permute_op -def embedding_op(node, symbol_table): +def embedding_op(node: EmbeddingOp, symbol_table): """ Import the embedding operation. - From PyTorch `aten.embedding.default` operator to MLIR TOSA `reshape` + From buddy graph ir's `EmbeddingOp` operator to MLIR TOSA `reshape` operation. Note: Althought this conversion function will finally return a `reshape` @@ -754,10 +782,10 @@ def embedding_op(node, symbol_table): return op -def expand_op(node, symbol_table) -> ir.Operation: +def expand_op(node: ExpandOp, symbol_table) -> ir.Operation: """ Import the expand operation. - From PyTorch `aten.expand.default` operator to MLIR TOSA `add` operation. + From buddy graph ir's `ExpandOp` operator to MLIR TOSA `add` operation. Note: This conversion is implemented using the broadcast machanism of TOSA `add` operation. We allocate a tensor with the shape to expand and @@ -787,11 +815,10 @@ def expand_op(node, symbol_table) -> ir.Operation: return op -def sum_op(node, symbol_table): +def sum_op(node: SumDimOp, symbol_table): """ Import the sum operation. 
- From PyTorch `aten.sum.dim_IntList` operator to MLIR TOSA `reduce_sum`
- operation.
+ From buddy graph ir's `SumDimOp` operator to MLIR TOSA `reduce_sum`
+ operation.
"""
input_tensor = symbol_table.get((str(node.args[0]), 0))
reduce_sum_dims = node.args[1]
@@ -813,40 +840,37 @@
return reduce_sum_op


-def t_op(node, symbol_table):
+def t_op(node: TOp, symbol_table):
"""
Import the tensor transpose operation.
- From PyTorch `aten.t.default` operator to MLIR TOSA `reduce_sum` operation.
+ From buddy graph ir's `TOp` operator to MLIR TOSA `transpose` operation.
"""
assert len(node.args) == 1
input1 = symbol_table.get((str(node.args[0]), 0))
- if input1 is None:
- return
+ assert input1 is not None
input_shape = list(ir.RankedTensorType(input1.type).shape)
- output_shape = list(node.meta["tensor_meta"].shape)
- if len(input_shape) == 2:
- perm_const_op = tosa.ConstOp(
- ir.DenseElementsAttr.get(memoryview(array.array("i", [1, 0])))
- )
- result_element_type = ir.RankedTensorType(input1.type).element_type
- permute_result_type = ir.RankedTensorType.get(
- output_shape, result_element_type
- )
- op = tosa.TransposeOp(
- permute_result_type, input1, perm_const_op.results[0]
- )
+ output_shape = list(node.tensor_meta["shape"])
+ assert len(input_shape) == 2, "Input tensor must be 2D"
+ perm_const_op = tosa.ConstOp(
+ ir.DenseElementsAttr.get(memoryview(array.array("i", [1, 0])))
+ )
+ result_element_type = ir.RankedTensorType(input1.type).element_type
+ permute_result_type = ir.RankedTensorType.get(
+ output_shape, result_element_type
+ )
+ op = tosa.TransposeOp(permute_result_type, input1, perm_const_op.results[0])
return op


-def transpose_op(node, symbol_table):
+def transpose_op(node: TransposeOp, symbol_table):
"""
Import the tensor permute operation based on input dims.
- From PyTorch `aten.transpose.int` operator to MLIR TOSA `reduce_sum`
+ From buddy graph ir's `TransposeOp` operator to MLIR TOSA `transpose`
operation.
"""
- assert len(node.args) == 3
+ assert len(node.args) == 3, "transpose expects exactly 3 arguments"
input1 = symbol_table.get((str(node.args[0]), 0))
if input1 is None:
return
@@ -857,7 +881,7 @@
temp = perm_list[dim1]
perm_list[dim1] = perm_list[dim2]
perm_list[dim2] = temp
- output_shape = list(node.meta["tensor_meta"].shape)
+ output_shape = list(node.tensor_meta["shape"])
perm_const_op = tosa.ConstOp(
ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list)))
)
@@ -870,29 +894,352 @@
return op


+def maxpool2d_op(node: MaxPool2dOp, symbol_table):
+ """
+ Import the maxpool2d operation.
+ From Buddy MaxPool2dOp to MLIR TOSA `max_pool2d` operation.
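+
+ Note: tosa.max_pool2d expects NHWC layout, so when the node's layout is
+ NCHW the input is transposed to NHWC before pooling and the result is
+ transposed back to NCHW afterwards.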
+ """ + if len(node.args) == 5: + raise NotImplementedError + input1 = symbol_table.get((str(node.args[0]), 0)) + kernel = node.args[1] + stride = node.args[2] + if len(node.args) > 3: + pad = node.args[3] + else: + pad = [0 for _ in kernel] + dtype = node.tensor_meta["dtype"] + result_element_type = mlir_element_type_get(dtype) + if node._layout.find("NCHW") != -1: + perm_list = [0, 2, 3, 1] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + out_shape = list(ir.RankedTensorType(input1.type).shape) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + input1 = tosa.TransposeOp( + permute_result_type, input1, perm_const_op.results[0] + ).result + out_shape = node.tensor_meta["shape"] + if len(pad) == 1: + pad = [pad[0]] * 4 + elif len(pad) == 2: + pad = [pad[0]] * 2 + [pad[1]] * 2 + kernel_attr = ir._denseI64ArrayAttr(kernel, None) + stride_attr = ir._denseI64ArrayAttr(stride, None) + pad_attr = ir._denseI64ArrayAttr(pad, None) + if node._layout.find("NCHW") != -1: + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + out_shape = perm_shape + output = ir.RankedTensorType.get(out_shape, result_element_type) + op = tosa.MaxPool2dOp(output, input1, kernel_attr, stride_attr, pad_attr) + if node._layout.find("NCHW") != -1: + perm_list = [0, 3, 1, 2] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + perm_shape.append(out_shape[2]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + op = tosa.TransposeOp( + permute_result_type, op.result, perm_const_op.results[0] + ) + return op + + +def convolution2d_op(node: Conv2dOp, symbol_table): + """ + Import the convolution operation. + From Buddy Conv2dOp to MLIR TOSA `conv2d` operation. 
+ """ + assert len(node.args) == 9 + input1 = symbol_table.get((str(node.args[0]), 0)) + weight = symbol_table.get((str(node.args[1]), 0)) + is_kernel_transposed = node.args[6] + dtype = node.tensor_meta["dtype"] + result_element_type = mlir_element_type_get(dtype) + if node._layout.find("NCHW") != -1: + perm_list = [0, 2, 3, 1] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + out_shape = list(ir.RankedTensorType(input1.type).shape) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + input1 = tosa.TransposeOp( + permute_result_type, input1, perm_const_op.results[0] + ).result + if node._layout.find("FCHW") != -1: + perm_list = [0, 2, 3, 1] + perm_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list))) + ) + out_shape = list(ir.RankedTensorType(weight.type).shape) + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + permute_result_type = ir.RankedTensorType.get( + perm_shape, result_element_type + ) + weight = tosa.TransposeOp( + permute_result_type, weight, perm_const_op.results[0] + ).result + if is_kernel_transposed: + in_channels = list(ir.RankedTensorType(weight.type).shape)[0] + out_channels = list(ir.RankedTensorType(weight.type).shape)[1] + else: + in_channels = list(ir.RankedTensorType(weight.type).shape)[1] + out_channels = list(ir.RankedTensorType(weight.type).shape)[0] + if len(node._parents) == 2: + new_size_tensor_type = ir.RankedTensorType.get( + [out_channels], result_element_type + ) + element = mlir_element_attr_get(dtype, 0) + new_size_attr = ir.DenseElementsAttr.get_splat( + new_size_tensor_type, element + ) + bias_tensor = tosa.ConstOp(new_size_attr).results[0] + else: + bias_tensor = symbol_table.get((str(node.args[2]), 0)) + assert input1 != None and weight != None and bias_tensor != None + stride = node.args[3] + input_padding = node.args[4] + if len(input_padding) == 1: + input_padding = [input_padding[0]] * 4 + elif len(input_padding) == 2: + input_padding = [input_padding[0]] * 2 + [input_padding[1]] * 2 + dilation = node.args[5] + groups = node.args[8] + out_shape = node.tensor_meta["shape"] + if node._layout.find("NCHW") != -1: + perm_shape = [] + perm_shape.append(out_shape[0]) + perm_shape.append(out_shape[2]) + perm_shape.append(out_shape[3]) + perm_shape.append(out_shape[1]) + out_shape = perm_shape + output = ir.RankedTensorType.get(out_shape, result_element_type) + stride_attr = ir._denseI64ArrayAttr(stride, None) + assert groups == 1, 'tosa.conv2d only support one group' + if is_kernel_transposed: + if sum(input_padding) > 0 or sum(dilation) > len(dilation): + raise NotImplementedError + out_padding = node.args[7] + for i in range(len(out_padding), 4): + out_padding = [0] + out_padding + out_padding_attr = ir._denseI64ArrayAttr(out_padding, None) + out_shape_attr = ir._denseI64ArrayAttr(out_shape, None) + op = tosa.TransposeConv2DOp( + output, + input1, + weight, + bias_tensor, + out_padding_attr, + stride_attr, + out_shape_attr, + ) + else: + input_padding_attr = ir._denseI64ArrayAttr(input_padding, None) + dilation_attr = ir._denseI64ArrayAttr(dilation, None) + op = tosa.Conv2DOp( + output, + input1, + weight, + bias_tensor, + input_padding_attr, + stride_attr, + 
+            dilation_attr,
+        )
+    # Transpose the NHWC result back to the original NCHW layout.
+    if node._layout.find("NCHW") != -1:
+        perm_list = [0, 3, 1, 2]
+        perm_const_op = tosa.ConstOp(
+            ir.DenseElementsAttr.get(memoryview(array.array("i", perm_list)))
+        )
+        perm_shape = [out_shape[0], out_shape[3], out_shape[1], out_shape[2]]
+        permute_result_type = ir.RankedTensorType.get(
+            perm_shape, result_element_type
+        )
+        op = tosa.TransposeOp(
+            permute_result_type, op.result, perm_const_op.results[0]
+        )
+    return op
+
+
+def relu_op(node: ReluOp, symbol_table):
+    """
+    Import the tensor relu operation.
+    From Buddy ReluOp to MLIR TOSA `maximum` operation.
+    """
+    assert len(node.args) == 1
+    input1 = symbol_table.get((str(node.args[0]), 0))
+    if input1 is None:
+        return
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    # relu(x) is computed as maximum(x, 0) against a zero-splat constant.
+    element = mlir_element_attr_get(dtype, 0)
+    tensor_type = ir.RankedTensorType.get(output_shape, element.type)
+    attr = ir.DenseElementsAttr.get_splat(tensor_type, element)
+    zero_op = tosa.ConstOp(attr)
+    op = tosa.MaximumOp(tensor_type, input1, zero_op)
+
+    return op
+
+
+def iota_op(node: IotaOp, symbol_table):
+    """
+    Import the tensor iota operation.
+    From Buddy IotaOp to MLIR TOSA `ConstOp` operation.
+    """
+    assert len(node.args) == 1
+    output_shape = list(node.tensor_meta["shape"])
+    dtype = node.tensor_meta["dtype"]
+    start = node.kwargs["start"]
+    end = node.args[0]
+    step = node.kwargs["step"]
+    mlir_dtype = mlir_element_type_get(dtype)
+    tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype)
+    # The sequence is materialized at import time as a dense constant.
+    attr = ir.DenseElementsAttr.get(
+        numpy.arange(start, end, step),
+        type=tensor_type,
+    )
+    op = tosa.ConstOp(attr)
+
+    return op
+
+
+def sigmoid_op(node: SigmoidOp, symbol_table):
+    """
+    Import the tensor sigmoid operation.
+    From Buddy SigmoidOp to MLIR TOSA `SigmoidOp` operation.
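+
+    `tosa.sigmoid` computes 1 / (1 + exp(-x)) elementwise, so the node
+    maps one-to-one onto a single TOSA operation.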
+ """ + assert len(node.args) == 1 + input1 = symbol_table.get((str(node.args[0]), 0)) + if input1 is None: + return + output_shape = list(node.tensor_meta["shape"]) + dtype = node.tensor_meta["dtype"] + mlir_dtype = mlir_element_type_get(dtype) + tensor_type = ir.RankedTensorType.get(output_shape, mlir_dtype) + op = tosa.SigmoidOp(tensor_type, input1) + + return op + + +def reciprocal_op(node: ReciprocalOp, symbol_table): + input_tensor = symbol_table.get((str(node.args[0]), 0)) + return tosa.ReciprocalOp(input_tensor.type, input_tensor) + + +def mean_op(node: MeanOp, symbol_table): + input_tensor = symbol_table.get((str(node.args[0]), 0)) + keepdim = node.args[2] + dims = [x for x in node.args[1]] + if isinstance(dims, int): + dims = [dims] + + for dim_item_idx, _ in enumerate(dims): + if dims[dim_item_idx] < 0: + dims[dim_item_idx] += len( + ir.RankedTensorType(input_tensor.type).shape + ) + + reduce_sum_result = input_tensor + for dim_item in dims: + reduce_dim_attr = ir.IntegerAttr.get( + ir.IntegerType.get_signless(32), dim_item + ) + reduce_sum_op = tosa.ReduceSumOp(reduce_sum_result, reduce_dim_attr) + reduce_sum_result = reduce_sum_op.results[0] + + tensor_shp = ir.RankedTensorType(input_tensor.type).shape + dim_size = 1 + + for dim_item in dims: + dim_size *= tensor_shp[dim_item] + + denominator_const_op = tosa.ConstOp( + ir.DenseElementsAttr.get(memoryview(array.array("f", [dim_size]))) + ) + reciprocal_op = tosa.ReciprocalOp( + denominator_const_op.results[0].type, denominator_const_op + ) + + ret = tosa.MulOp( + reduce_sum_op.results[0].type, + reciprocal_op.results[0], + reduce_sum_op.results[0], + ir.IntegerAttr.get(ir.IntegerType.get_signless(8), 0), + ) + + if not keepdim: + result_shp = ir.RankedTensorType(ret.results[0].type).shape + result_shp = [siz for siz in result_shp if siz != 1] + ret = tosa.ReshapeOp( + ret.results[0], memoryview(array.array("i", result_shp)) + ) + + return ret + + ops_registry = { - "add.Tensor": add_op, - "mul.Tensor": mul_op, - "sub.Tensor": sub_op, - "sum.dim_IntList": sum_op, - "tanh.default": tanh_op, - "amax.default": amax_op, - "rsqrt.default": rsqrt_op, - "bmm.default": bmm_op, - "clone.default": clone_op, - "div.Tensor": div_op, - "exp.default": exp_op, - "expand.default": expand_op, - "var_mean.correction": var_mean_op, - "addmm.default": addmm_op, - "reshape.default": reshape_op, - "view.default": reshape_op, - "select.int": select_op, - "slice.Tensor": slice_op, - "embedding.default": embedding_op, - "convert_element_type.default": convert_element_type_op, - "permute.default": permute_op, - "unsqueeze.default": unsqueeze_op, - "t.default": t_op, - "transpose.int": transpose_op, + "AddOp": add_op, + "MulOp": mul_op, + "SubOp": sub_op, + "SumDimOp": sum_op, + "TanhOp": tanh_op, + "AmaxOp": amax_op, + "RsqrtOp": rsqrt_op, + "BatchMatmulOp": bmm_op, + "CloneOp": clone_op, + "DivOp": div_op, + "ExpOp": exp_op, + "ExpandOp": expand_op, + "VarMeanOp": var_mean_op, + "AddMMOp": addmm_op, + "ReshapeOp": reshape_op, + "ViewOp": reshape_op, + "SelectOp": select_op, + "SliceOp": slice_op, + "EmbeddingOp": embedding_op, + "ConvertElementTypeOp": convert_element_type_op, + "PermuteOp": permute_op, + "UnsqueezeOp": unsqueeze_op, + "TOp": t_op, + "TransposeOp": transpose_op, + "MaxPool2dOp": maxpool2d_op, + "Conv2dOp": convolution2d_op, + "ReluOp": relu_op, + "IotaOp": iota_op, + "SigmoidOp": sigmoid_op, + "ReciprocalOp": reciprocal_op, + "MeanOp": mean_op, } diff --git a/frontend/Python/ops/utils.py b/frontend/Python/ops/utils.py new file mode 
index 0000000000..337f5a6b49
--- /dev/null
+++ b/frontend/Python/ops/utils.py
@@ -0,0 +1,56 @@
+# ===- utils.py ----------------------------------------------------------------
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===---------------------------------------------------------------------------
+#
+# Utilities for MLIR element types and attributes.
+#
+# ===---------------------------------------------------------------------------
+
+from typing import Dict
+import mlir.ir as ir
+
+from ..graph import TensorDType
+
+
+def mlir_element_type_get(type_name):
+    """
+    Get the MLIR element type based on the TensorDType enum type.
+    Args:
+        type_name: The TensorDType enum type.
+    """
+    match type_name:
+        case TensorDType.Float32:
+            return ir.F32Type.get()
+        case TensorDType.Int64:
+            return ir.IntegerType.get_signless(64)
+        case TensorDType.Bool:
+            return ir.IntegerType.get_signless(1)
+
+
+def mlir_element_attr_get(type_name, value):
+    """
+    Get the MLIR element attribute based on TensorDType enum type and value.
+    Args:
+        type_name: The TensorDType enum type.
+        value: The concrete value for the MLIR element attribute.
+    """
+    match type_name:
+        case TensorDType.Float32:
+            return ir.FloatAttr.get(ir.F32Type.get(), value)
+        case TensorDType.Int64:
+            return ir.IntegerAttr.get(ir.IntegerType.get_signless(64), value)
+        case TensorDType.Bool:
+            return ir.IntegerAttr.get(ir.IntegerType.get_signless(1), value)
+
diff --git a/requirements.txt b/requirements.txt
index 45d5b1fa36..606179eb74 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,3 +6,4 @@ sentencepiece == 0.1.99
 accelerate
 protobuf
 pybind11 == 2.11.1
+torchvision
diff --git a/tests/Python/test_addmm.py b/tests/Python/test_addmm.py
index cb4459f450..563c874462 100644
--- a/tests/Python/test_addmm.py
+++ b/tests/Python/test_addmm.py
@@ -22,8 +22,11 @@ def foo(x, y, z):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, y, z):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_amax.py b/tests/Python/test_amax.py
index 3759b352c7..81944a2c27 100644
--- a/tests/Python/test_amax.py
+++ b/tests/Python/test_amax.py
@@ -22,8 +22,11 @@ def foo(x, dim):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, dim)
+graphs = dynamo_compiler.importer(foo, in1, dim)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, dim):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arange.py b/tests/Python/test_arange.py
index ac7fa3c45e..f7e1cd1c4f 100644
--- a/tests/Python/test_arange.py
+++ b/tests/Python/test_arange.py
@@ -2,10 +2,9 @@

 import torch
 import torch._dynamo as dynamo
-from torch._inductor.decomposition import decompositions as inductor_decomp
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,12 +15,15 @@ def foo(x):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +31,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_add.py b/tests/Python/test_arith_add.py
index 44db4609d7..9c6e9d3121 100644
--- a/tests/Python/test_arith_add.py
+++ b/tests/Python/test_arith_add.py
@@ -1,11 +1,10 @@
 # RUN: %PYTHON %s 2>&1 | FileCheck %s

 import torch
-import torch._dynamo as dynamo
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +16,15 @@ def foo(x, y):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +32,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_div.py b/tests/Python/test_arith_div.py
index afc222a154..cf5b29023b 100644
--- a/tests/Python/test_arith_div.py
+++ b/tests/Python/test_arith_div.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_mul.py b/tests/Python/test_arith_mul.py
index 9dc4dfbfff..b22c6ebfda 100644
--- a/tests/Python/test_arith_mul.py
+++ b/tests/Python/test_arith_mul.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -13,21 +13,24 @@ def foo(x, y):

 in1 = torch.randn(10)
-in2 = torch.randn(10)
+in2 = 2

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = arith.constant
 # CHECK: %{{.*}} = tosa.mul
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_arith_sub.py b/tests/Python/test_arith_sub.py
index 95b5475fc0..0f6238afa2 100644
--- a/tests/Python/test_arith_sub.py
+++ b/tests/Python/test_arith_sub.py
@@ -21,8 +21,11 @@ def foo(x, y):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_bmm.py b/tests/Python/test_bmm.py
index 403b0621b2..ec7c8b1601 100644
--- a/tests/Python/test_bmm.py
+++ b/tests/Python/test_bmm.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,17 +17,20 @@ def foo(x, y):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.matmul
+# CHECK: %{{.*}} = arith.constant
+# CHECK: %{{.*}} = linalg.batch_matmul
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_cat.py b/tests/Python/test_cat.py
index db9dacf11c..9c769ae656 100644
--- a/tests/Python/test_cat.py
+++ b/tests/Python/test_cat.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_clone.py b/tests/Python/test_clone.py
index 24fcd32254..3eabd7d647 100644
--- a/tests/Python/test_clone.py
+++ b/tests/Python/test_clone.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,17 +16,19 @@ def foo(x):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.identity
+# CHECK: %{{.*}} = tensor.extract_slice
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_convert_element_type.py b/tests/Python/test_convert_element_type.py
index 63cd1ddaea..ca88384633 100644
--- a/tests/Python/test_convert_element_type.py
+++ b/tests/Python/test_convert_element_type.py
@@ -13,7 +13,7 @@ def foo(x, to_cast_type):

 in1 = torch.randn(10).to(torch.float32)
-to_cast_type = torch.float16
+to_cast_type = torch.int32

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
@@ -21,8 +21,11 @@ def foo(x, to_cast_type):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, to_cast_type)
+graphs = dynamo_compiler.importer(foo, in1, to_cast_type)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, to_cast_type):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_convolution_default.py b/tests/Python/test_convolution_default.py
new file mode 100644
index 0000000000..fed1607c7a
--- /dev/null
+++ b/tests/Python/test_convolution_default.py
@@ -0,0 +1,42 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class Convolution(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.conv = torch.nn.Conv2d(3, 255, (5, 5), 3, 3, bias=False)
+
+    def forward(self, a):
+        return self.conv(a)
+
+
+model = Convolution()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+in1 = torch.randn((1, 3, 640, 480))
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = "tosa.const"()
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = tosa.conv2d
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_embedding.py b/tests/Python/test_embedding.py
index ee76d2068d..484bb617b5 100644
--- a/tests/Python/test_embedding.py
+++ b/tests/Python/test_embedding.py
@@ -22,8 +22,11 @@ def foo(weight, indices):
 weight = torch.randn(10, 5)
 indices = torch.randint(10, (3, 3)).to(torch.int32)

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(weight, indices)
+graphs = dynamo_compiler.importer(foo, weight, indices)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -34,16 +37,29 @@ def foo(weight, indices):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
-
 # test cast case
 weight = torch.randn(10, 5)
 indices = torch.randint(10, (3, 3)).to(torch.int64)
+graphs = dynamo_compiler.importer(foo, weight, indices)
+print(graphs)
+assert len(graphs) == 2
+graphs[0].lower_to_top_level_ir()
+print(graphs[0]._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: %{{.*}} = tosa.gather
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(weight, indices)
+graphs[1].lower_to_top_level_ir()
+print(graphs[1]._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -54,5 +70,4 @@ def foo(weight, indices):
 # CHECK: %{{.*}} = tosa.reshape
 # CHECK: return %{{.*}}
 # CHECK: }
-# CHECK: }
-print(dynamo_compiler.imported_module)
+# CHECK: }
\ No newline at end of file
diff --git a/tests/Python/test_exp.py b/tests/Python/test_exp.py
index 3fcff43613..7519a999b3 100644
--- a/tests/Python/test_exp.py
+++ b/tests/Python/test_exp.py
@@ -20,8 +20,11 @@ def foo(x):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +32,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_full.py b/tests/Python/test_full.py
index 33cdc2c1d3..0a5f5888b1 100644
--- a/tests/Python/test_full.py
+++ b/tests/Python/test_full.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_index.py b/tests/Python/test_index.py
index da31095c1a..c21ce1a5f6 100644
--- a/tests/Python/test_index.py
+++ b/tests/Python/test_index.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = torch.tensor([1])
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_expand.py b/tests/Python/test_iota.py
similarity index 52%
rename from tests/Python/test_expand.py
rename to tests/Python/test_iota.py
index 37e9aca383..d4e9d3e566 100644
--- a/tests/Python/test_expand.py
+++ b/tests/Python/test_iota.py
@@ -8,25 +8,28 @@
 from buddy.compiler.ops import tosa


-def foo(x, new_size):
-    return torch.ops.aten.expand(x, new_size)
+class foo(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)

-x = torch.randn(1, 3)
-new_size = (6, 3)
+    def forward(self, a):
+        return torch.arange(a)

-# Initialize the dynamo compiler.
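+# torch.arange is captured as a Buddy IotaOp and materialized as a
+# "tosa.const" at import time, which is what the CHECK lines below expect.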
+
+model = foo()
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )
-
-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, new_size)
-
+in1 = 40
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.add
-# CHECK: return %{{.*}} : tensor<6x3xf32>
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_lt.py b/tests/Python/test_lt.py
index a6f30b61cb..5cea5ce5f6 100644
--- a/tests/Python/test_lt.py
+++ b/tests/Python/test_lt.py
@@ -5,23 +5,26 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
-    return torch.ops.aten.lt(x,y)
+    return torch.ops.aten.lt(x, y)


 in1 = torch.ones([13], dtype=torch.int64)
 in2 = torch.ones([13, 1], dtype=torch.int64)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_masked_fill.py b/tests/Python/test_masked_fill.py
index 3802b3de7a..3abbe88cd8 100644
--- a/tests/Python/test_masked_fill.py
+++ b/tests/Python/test_masked_fill.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y, z):
@@ -18,12 +18,15 @@ def foo(x, y, z):
 in3 = 0
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -33,4 +36,3 @@ def foo(x, y, z):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_max_pool2d.py b/tests/Python/test_max_pool2d.py
new file mode 100644
index 0000000000..eecfc73d93
--- /dev/null
+++ b/tests/Python/test_max_pool2d.py
@@ -0,0 +1,44 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class TestModule(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.pool = torch.nn.MaxPool2d((5, 5), 3, (2, 2))
+
+    def forward(self, a):
+        return self.pool(a)
+
+
+model = TestModule()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+in1 = torch.randn((1, 3, 640, 480))
+
+model_opt = torch.compile(model, backend=dynamo_compiler)
+assert torch.allclose(model_opt(in1), model(in1), equal_nan=True)
+
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = tosa.max_pool2d
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.transpose
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_mean.py b/tests/Python/test_mean.py
index 781e494162..0595619d18 100644
--- a/tests/Python/test_mean.py
+++ b/tests/Python/test_mean.py
@@ -1,16 +1,14 @@
 # RUN: %PYTHON %s 2>&1 | FileCheck %s

 import torch
-import torch._dynamo as dynamo
 from torch._inductor.decomposition import decompositions as inductor_decomp
-from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import tosa


-def foo(x, y, z):
-    return torch.mean(x, y, z)
+def foo(x, y, keepdim):
+    return torch.mean(x, y, keepdim=keepdim)


 in1 = torch.ones([13, 13], dtype=torch.float32)
@@ -19,17 +17,25 @@

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
-    aot_autograd_decomposition=aot_autograd_decompositions,
+    aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+foo_mlir = torch.compile(foo, backend=dynamo_compiler)
+assert torch.allclose(
+    foo_mlir(in1, in2, keepdim=in3), foo(in1, in2, keepdim=in3), equal_nan=True
+)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = arith.constant
-# CHECK: %{{.*}} = linalg.generic
+# CHECK: %{{.*}} = tosa.reduce_sum
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: %{{.*}} = tosa.mul
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_mm.py b/tests/Python/test_mm.py
index 4440b4ad8c..4f7c41df3e 100644
--- a/tests/Python/test_mm.py
+++ b/tests/Python/test_mm.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_neg.py b/tests/Python/test_neg.py
index e2f9e6f3d7..78261085a4 100644
--- a/tests/Python/test_neg.py
+++ b/tests/Python/test_neg.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,18 +16,20 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
 # CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
+# CHECK: %{{.*}} = linalg.negf
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_ones.py b/tests/Python/test_ones.py
index 7343fd1026..4af4ead36e 100644
--- a/tests/Python/test_ones.py
+++ b/tests/Python/test_ones.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +32,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_permute.py b/tests/Python/test_permute.py
index d260df3c2f..7f1aad3e10 100644
--- a/tests/Python/test_permute.py
+++ b/tests/Python/test_permute.py
@@ -21,8 +21,11 @@ def foo(x, y):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, perm)
+graphs = dynamo_compiler.importer(foo, x, perm)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, y):
 # CHECK: return %{{.*}} : tensor<4x3x2xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_pow.py b/tests/Python/test_pow.py
index cfc47feb1e..d671563832 100644
--- a/tests/Python/test_pow.py
+++ b/tests/Python/test_pow.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
@@ -17,12 +17,15 @@ def foo(x, y):
 in2 = 2
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, y):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_reciprocal.py b/tests/Python/test_reciprocal.py
new file mode 100644
index 0000000000..9c31fb8b5b
--- /dev/null
+++ b/tests/Python/test_reciprocal.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import math
+
+
+def foo(x):
+    return torch.ops.aten.reciprocal(x)
+
+
+x = torch.randn(10, 3, 6)
+
+# Initialize the dynamo compiler.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=math.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+foo_mlir = torch.compile(foo, backend=dynamo_compiler)
+assert torch.allclose(foo_mlir(x), foo(x), equal_nan=True)
+
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_relu.py b/tests/Python/test_relu.py
new file mode 100644
index 0000000000..c6d6bc6aed
--- /dev/null
+++ b/tests/Python/test_relu.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class foo(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def forward(self, a):
+        return torch.relu(a)
+
+
+model = foo()
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+in1 = torch.randn((1, 3, 640, 480), device="cpu")
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.maximum
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_reshape.py b/tests/Python/test_reshape.py
index 56a194697e..989e0e4da5 100644
--- a/tests/Python/test_reshape.py
+++ b/tests/Python/test_reshape.py
@@ -21,8 +21,11 @@ def foo(x, new_shape):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, new_shape)
+graphs = dynamo_compiler.importer(foo, x, new_shape)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, new_shape):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_rsqrt.py b/tests/Python/test_rsqrt.py
index 8ca0cf929a..370334d661 100644
--- a/tests/Python/test_rsqrt.py
+++ b/tests/Python/test_rsqrt.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,17 +16,20 @@ def foo(x):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x)
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.rsqrt
+# CHECK: %{{.*}} = tensor.empty()
+# CHECK: %{{.*}} = linalg.generic
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_rsub.py b/tests/Python/test_rsub.py
index fc945970c9..99843af0e7 100644
--- a/tests/Python/test_rsub.py
+++ b/tests/Python/test_rsub.py
@@ -6,28 +6,32 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
-    return y-x
+    return torch.ops.aten.rsub(x, y)


 in1 = torch.ones([13, 13], dtype=torch.float32)
-in2 = torch.ones([13, 13], dtype=torch.float32)
+in2 = 2
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.sub
+# CHECK: %{{.*}} = arith.constant
+# CHECK: %{{.*}} = tensor.empty()
+# CHECK: %{{.*}} = linalg.generic
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_select.py b/tests/Python/test_select.py
index d94bd296a1..c54420a117 100644
--- a/tests/Python/test_select.py
+++ b/tests/Python/test_select.py
@@ -22,8 +22,11 @@ def foo(x, dim, index):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim, index)
+graphs = dynamo_compiler.importer(foo, x, dim, index)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, dim, index):
 # CHECK: return %{{.*}} : tensor<3x2xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_sigmoid.py b/tests/Python/test_sigmoid.py
new file mode 100644
index 0000000000..43f03cc11f
--- /dev/null
+++ b/tests/Python/test_sigmoid.py
@@ -0,0 +1,35 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import tosa
+
+
+class foo(torch.nn.Module):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def forward(self, a):
+        return torch.sigmoid(a)
+
+
+model = foo()
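+# torch.sigmoid is captured as a Buddy SigmoidOp and lowered to the
+# tosa.sigmoid operation, which the CHECK lines below verify.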
+dynamo_compiler = DynamoCompiler(
+    primary_registry=tosa.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+in1 = torch.randn((1, 3, 640, 480), device="cpu")
+graphs = dynamo_compiler.importer(model, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.sigmoid
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_silu.py b/tests/Python/test_silu.py
index dcd919ca53..2aa5047765 100644
--- a/tests/Python/test_silu.py
+++ b/tests/Python/test_silu.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_slice.py b/tests/Python/test_slice.py
index 61a8658e1b..acc0acaa25 100644
--- a/tests/Python/test_slice.py
+++ b/tests/Python/test_slice.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, dim, start_idx, end_idx):
@@ -19,12 +19,15 @@ def foo(x, dim, start_idx, end_idx):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim, start_idx, end_idx)
+graphs = dynamo_compiler.importer(foo, x, dim, start_idx, end_idx)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -32,4 +35,3 @@ def foo(x, dim, start_idx, end_idx):
 # CHECK: return %{{.*}} : tensor<3x2x2xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_softmax.py b/tests/Python/test_softmax.py
index d5e656de76..eca5b2c600 100644
--- a/tests/Python/test_softmax.py
+++ b/tests/Python/test_softmax.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,26 +16,22 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tensor.empty
+# CHECK: %{{.*}} = arith.constant
 # CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = tensor.empty
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = linalg.generic
-# CHECK: %{{.*}} = tensor.empty
+# CHECK: %{{.*}} = tensor.empty()
 # CHECK: %{{.*}} = linalg.generic
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_sqrt.py b/tests/Python/test_sqrt.py
new file mode 100644
index 0000000000..b929d11075
--- /dev/null
+++ b/tests/Python/test_sqrt.py
@@ -0,0 +1,36 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+from torch._inductor.decomposition import decompositions as inductor_decomp
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import math
+
+
+def foo(x):
+    return torch.ops.aten.sqrt(x)
+
+
+x = torch.randn(10, 3, 6)
+
+# Initialize the dynamo compiler.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=math.ops_registry,
+    aot_autograd_decomposition=inductor_decomp,
+)
+
+foo_mlir = torch.compile(foo, backend=dynamo_compiler)
+assert torch.allclose(foo_mlir(x), foo(x), equal_nan=True)
+
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = math.sqrt
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
diff --git a/tests/Python/test_squeeze.py b/tests/Python/test_squeeze.py
index f394ca8d72..e6b1b5c00e 100644
--- a/tests/Python/test_squeeze.py
+++ b/tests/Python/test_squeeze.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 in1 = torch.ones([1, 13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_sum.py b/tests/Python/test_sum.py
index 713910f15c..e97f942095 100644
--- a/tests/Python/test_sum.py
+++ b/tests/Python/test_sum.py
@@ -22,8 +22,11 @@ def foo(x, dim):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim)
+graphs = dynamo_compiler.importer(foo, x, dim)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -31,4 +34,3 @@ def foo(x, dim):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_t.py b/tests/Python/test_t.py
index 835bb4c2f2..09d44facc2 100644
--- a/tests/Python/test_t.py
+++ b/tests/Python/test_t.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,18 +16,20 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.float32)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = "tosa.const"
-# CHECK: %{{.*}} = tosa.transpose
+# CHECK: %{{.*}} = tensor.empty()
+# CHECK: %{{.*}} = linalg.transpose
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_tanh.py b/tests/Python/test_tanh.py
index b1875dfd51..b9ca6082cd 100644
--- a/tests/Python/test_tanh.py
+++ b/tests/Python/test_tanh.py
@@ -20,8 +20,11 @@ def foo(x):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x)
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -29,4 +32,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_to_copy.py b/tests/Python/test_to_copy.py
index 9632d9f5cb..0b6c2ad22a 100644
--- a/tests/Python/test_to_copy.py
+++ b/tests/Python/test_to_copy.py
@@ -6,7 +6,7 @@
 from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x):
@@ -16,12 +16,15 @@ def foo(x):
 in1 = torch.ones([13, 13], dtype=torch.bool)
 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=aot_autograd_decompositions,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1)
+graphs = dynamo_compiler.importer(foo, in1)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x):
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_transpose.py b/tests/Python/test_transpose.py
index d7e71be8e0..9769604f30 100644
--- a/tests/Python/test_transpose.py
+++ b/tests/Python/test_transpose.py
@@ -3,7 +3,6 @@
 import torch
 import torch._dynamo as dynamo
 from torch._inductor.decomposition import decompositions as inductor_decomp
-from torch._functorch.aot_autograd import aot_autograd_decompositions

 from buddy.compiler.frontend import DynamoCompiler
 from buddy.compiler.ops import tosa
@@ -19,17 +18,19 @@ def foo(x, y, z):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
     primary_registry=tosa.ops_registry,
-    aot_autograd_decomposition=aot_autograd_decompositions,
+    aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2, in3)
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = "tosa.const"()
 # CHECK: %{{.*}} = tosa.transpose
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_unsqueeze.py b/tests/Python/test_unsqueeze.py
index 577354b9f5..5cb4ee5527 100644
--- a/tests/Python/test_unsqueeze.py
+++ b/tests/Python/test_unsqueeze.py
@@ -5,7 +5,7 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, dim):
@@ -17,12 +17,15 @@ def foo(x, dim):

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x, dim)
+graphs = dynamo_compiler.importer(foo, x, dim)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -30,4 +33,3 @@ def foo(x, dim):
 # CHECK: return %{{.*}} : tensor<1x10xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_var_mean.py b/tests/Python/test_var_mean.py
index eb7f254e47..eae1c99839 100644
--- a/tests/Python/test_var_mean.py
+++ b/tests/Python/test_var_mean.py
@@ -24,8 +24,11 @@ def foo_keepdim(x):
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(x)
+graphs = dynamo_compiler.importer(foo, x)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
@@ -44,10 +47,33 @@ def foo_keepdim(x):
 # CHECK: return %{{.*}} : tensor, tensor
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)

-foo_keepdim_mlir = dynamo.optimize(dynamo_compiler)(foo_keepdim)
-foo_keepdim_mlir(x)
+graphs = dynamo_compiler.importer(foo_keepdim, x)
+assert len(graphs) == 2
+graphs[0].lower_to_top_level_ir()
+print(graphs[0]._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = tosa.reduce_sum
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: %{{.*}} = tosa.mul
+# CHECK: %{{.*}} = tosa.sub
+# CHECK: %{{.*}} = tosa.mul
+# CHECK: %{{.*}} = tosa.reduce_sum
+# CHECK: %{{.*}} = "tosa.const"
+# CHECK: %{{.*}} = tosa.reciprocal
+# CHECK: %{{.*}} = tosa.mul
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: %{{.*}} = tosa.reshape
+# CHECK: return %{{.*}} : tensor, tensor
+# CHECK: }
+# CHECK: }
+
+graphs[1].lower_to_top_level_ir()
+print(graphs[1]._imported_module)
+
 # CHECK: module {
 # CHECK-LABEL: func.func @forward
 # CHECK: %{{.*}} = tosa.reduce_sum
@@ -63,4 +89,3 @@ def foo_keepdim(x):
 # CHECK: return %{{.*}} : tensor<1x1x1xf32>, tensor<1x1x1xf32>
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_view.py b/tests/Python/test_view.py
index 44db4609d7..31eacddc77 100644
--- a/tests/Python/test_view.py
+++ b/tests/Python/test_view.py
@@ -5,29 +5,31 @@
 from torch._inductor.decomposition import decompositions as inductor_decomp

 from buddy.compiler.frontend import DynamoCompiler
-from buddy.compiler.ops import tosa
+from buddy.compiler.ops import linalg


 def foo(x, y):
-    return x + y
+    return torch.ops.aten.view(x, y)


 in1 = torch.randn(10)
-in2 = torch.randn(10)
+in2 = (2, 5)

 # Initialize the dynamo compiler.
 dynamo_compiler = DynamoCompiler(
-    primary_registry=tosa.ops_registry,
+    primary_registry=linalg.ops_registry,
     aot_autograd_decomposition=inductor_decomp,
 )

-foo_mlir = dynamo.optimize(dynamo_compiler)(foo)
-foo_mlir(in1, in2)
+graphs = dynamo_compiler.importer(foo, in1, in2)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)

 # CHECK: module {
 # CHECK-LABEL: func.func @forward
-# CHECK: %{{.*}} = tosa.add
+# CHECK: %{{.*}} = tosa.reshape
 # CHECK: return %{{.*}}
 # CHECK: }
 # CHECK: }
-print(dynamo_compiler.imported_module)
diff --git a/tests/Python/test_where.py b/tests/Python/test_where.py
new file mode 100644
index 0000000000..5266f00b74
--- /dev/null
+++ b/tests/Python/test_where.py
@@ -0,0 +1,38 @@
+# RUN: %PYTHON %s 2>&1 | FileCheck %s
+
+import torch
+import torch._dynamo as dynamo
+from torch._inductor.decomposition import decompositions as inductor_decomp
+from torch._functorch.aot_autograd import aot_autograd_decompositions
+
+from buddy.compiler.frontend import DynamoCompiler
+from buddy.compiler.ops import linalg
+
+
+def foo(x, y, z):
+    return torch.where(x, y, z)
+
+
+in1 = torch.ones([13, 13], dtype=torch.bool)
+in2 = 0
+in3 = torch.ones([13, 13], dtype=torch.float32)
+# Initialize the dynamo compiler.
+dynamo_compiler = DynamoCompiler(
+    primary_registry=linalg.ops_registry,
+    aot_autograd_decomposition=aot_autograd_decompositions,
+)
+
+graphs = dynamo_compiler.importer(foo, in1, in2, in3)
+assert len(graphs) == 1
+graph = graphs[0]
+graph.lower_to_top_level_ir()
+print(graph._imported_module)
+
+# CHECK: module {
+# CHECK-LABEL: func.func @forward
+# CHECK: %{{.*}} = arith.constant
+# CHECK: %{{.*}} = tensor.empty
+# CHECK: %{{.*}} = linalg.generic
+# CHECK: return %{{.*}}
+# CHECK: }
+# CHECK: }
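+# The scalar operand is materialized as an arith.constant and the
+# element-wise select is emitted as a linalg.generic, as the CHECK
+# lines above verify.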