Add AtenOp Benchmarking (#2495)
Summary:
As described in pytorch/pytorch#136168, I'm migrating the native PyTorch implementation-comparison benchmark ([the original operatorbench](https://github.com/pytorch/pytorch/blob/main/benchmarks/dynamo/microbenchmarks/operatorbench.py)) to TritonBench.

This PR adds an operator loader that can load the aten ops recorded from TorchBench, HuggingFace, and TIMM models. A benchmark class is created dynamically for each op, and its aten (eager) and Inductor implementations are then benchmarked against each other.
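
Internally, each benchmark class is assembled at runtime with `type(...)` and registered under a synthetic module. Below is a simplified, self-contained sketch of the pattern used in `torchbenchmark/operator_loader/__init__.py`; `DummyBenchmark` and `make_op_class` are illustrative stand-ins, not part of this PR:
```python
import sys
import types


class DummyBenchmark:
    """Stand-in for tritonbench's BenchmarkOperator base class."""


def make_op_class(op_name: str):
    def __init__(self):
        self.op_name = op_name

    def eager(self):
        return lambda: f"eager run of {self.op_name}"

    # Build a fresh class per aten op, mirroring create_operator_class().
    cls = type("Operator", (DummyBenchmark,), {"__init__": __init__, "eager": eager})

    # Give the class its own module so module-path-based registration works,
    # mirroring dynamically_create_aten_op_class().
    module_name = f"operator_loader.{op_name.replace('.', '_')}"
    module = types.ModuleType(module_name)
    module.Operator = cls
    sys.modules[module_name] = module
    cls.__module__ = module_name
    return cls


OpClass = make_op_class("aten._softmax.default")
print(OpClass().eager()())  # "eager run of aten._softmax.default"
```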

The files `torchbenchmark/operator_loader/operator_inp_utils.py` and `torchbenchmark/operator_loader/operatorbench.py`, as well as all config files under `torchbenchmark/operator_loader/operator_inp_logs/`, are copied from the original operatorbench.
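
Each log file lists operator overloads together with the argument signatures observed while tracing the models: a `cnt` line records how many times that call signature was seen, and `T([...], dtype)` (optionally with `stride=`) denotes a tensor of that shape and dtype. For example, from one of the copied configs:
```
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
```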

Example command:
```bash
python run_benchmark.py triton --op aten._softmax.default --num-inputs 1 --operator-loader --precision fp16
```
Example output:
```
Evaluating an op name into an OpOverload: The underlying op of 'aten.upsample_nearest2d_backward' has no overload name 'vec'
Evaluating an op name into an OpOverload: '_OpNamespace' 'aten' object has no attribute 'im2col_backward'
Evaluating an op name into an OpOverload: '_OpNamespace' 'aten' object has no attribute 'col2im_backward'
Evaluating an op name into an OpOverload: '_OpNamespace' 'aten' object has no attribute 'im2col_backward'
Evaluating an op name into an OpOverload: The underlying op of 'aten.upsample_bilinear2d_backward' has no overload name 'vec'
Evaluating an op name into an OpOverload: The underlying op of 'aten.upsample_nearest2d_backward' has no overload name 'vec'
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.20s/it]
  x_val    eager-latency    inductor-latency
-------  ---------------  ------------------
      0         0.090592            0.089632
      1         0.055808            0.038112
```
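
The loader can also be driven from Python; a minimal sketch using the `list_operators` helper added in `torchbenchmark/operator_loader/__init__.py` (assuming a TritonBench checkout on `PYTHONPATH`):
```python
# Enumerate the aten ops that have recorded inputs from TorchBench,
# HuggingFace, and TIMM models.
from torchbenchmark.operator_loader import list_operators

for op_name in sorted(list_operators()):
    print(op_name)  # e.g. "aten._softmax.default"
```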

Pull Request resolved: #2495

Reviewed By: xuzhao9

Differential Revision: D64200358

Pulled By: FindHao

fbshipit-source-id: f0121168b33247224bc905a1a88af69e4b13def6
FindHao authored and facebook-github-bot committed Oct 12, 2024
1 parent f9f52f6 commit 34d4f94
Showing 160 changed files with 28,510 additions and 3 deletions.
157 changes: 157 additions & 0 deletions torchbenchmark/operator_loader/__init__.py
@@ -0,0 +1,157 @@
import argparse
import sys
import types
from typing import Any, Generator, List, Optional

import torch
from torch._dynamo.backends.cudagraphs import cudagraphs_inner
from torch._inductor.compile_fx import compile_fx
from torch._inductor.utils import gen_gm_and_inputs
from torch._ops import OpOverload
from torch.utils._pytree import tree_map_only

from torchbenchmark.util.triton_op import (
    BenchmarkOperator,
    register_benchmark_mannually,
)

from .operator_inp_utils import aten, OperatorInputsLoader, to_channels_last

timm_loader = None
huggingface_loader = None
torchbench_loader = None


def maybe_load_operator_inputs_loader():
    global timm_loader, huggingface_loader, torchbench_loader
    if timm_loader is None:
        timm_loader = OperatorInputsLoader.get_timm_loader()
    if huggingface_loader is None:
        huggingface_loader = OperatorInputsLoader.get_huggingface_loader()
    if torchbench_loader is None:
        torchbench_loader = OperatorInputsLoader.get_torchbench_loader()


def parse_args(extra_args: Optional[List[str]] = None):
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--channel-list",
        action="store_true",
        help="Flag to enable channel list benchmarking.",
    )
    return parser.parse_known_args(extra_args)


def list_operators() -> List[str]:
    """In the original operator benchmark design, all operators are registered in the
    operator loader. We need to collect them here.
    """
    maybe_load_operator_inputs_loader()
    all_ops = (
        list(timm_loader.get_all_ops())
        + list(huggingface_loader.get_all_ops())
        + list(torchbench_loader.get_all_ops())
    )
    # remove duplicate operators
    all_ops_str = list(set(str(item) for item in all_ops))
    return all_ops_str


def load_opbench_by_name_from_loader(args: argparse.Namespace):
    all_ops_str = list_operators()
    if args.op not in all_ops_str:
        raise ValueError(f"{args.op} is not found in the operator loader.")
    # args.op is a string; evaluate it to get the actual operator overload.
    op_eval = eval(args.op)
    return dynamically_create_aten_op_class(op_eval)


def create_operator_class(op_eval: OpOverload):
    """Create a new benchmark class for the given operator overload."""

    def __init__(
        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
    ):
        BenchmarkOperator.__init__(self, tb_args, extra_args)
        native_args, _ = parse_args(extra_args)
        self.channel_list = native_args.channel_list
        self.device = tb_args.device
        self.huggingface_loader = huggingface_loader
        self.torchbench_loader = torchbench_loader
        self.timm_loader = timm_loader
        # CUDA graphs are already applied when building the input iterator (see
        # get_input_iter), so we don't use tritonbench's own CUDA graph support.
        self.use_cuda_graphs = False
        self.DEFAULT_PRECISION = "fp16"
        assert self.dtype in (
            torch.float16,
            torch.float32,
        ), f"AtenOpBenchmark only supports fp16 and fp32, but got {self.dtype}"
    def get_input_iter(self) -> Generator:
        inps_gens = [self.huggingface_loader, self.torchbench_loader, self.timm_loader]
        for inp_gen in inps_gens:
            for inp in inp_gen.get_inputs_for_operator(
                self.op_eval, self.dtype, self.device
            ):
                args, kwargs = inp
                if self.channel_list:
                    args, kwargs = tree_map_only(
                        torch.Tensor, to_channels_last, (args, kwargs)
                    )
                gm, gm_args = gen_gm_and_inputs(self.op_eval, args, kwargs)
                torch.jit._builtins._register_builtin(
                    torch.ops.aten.convolution_backward.default,
                    "aten::convolution_backward",
                )
                if self.device == "cuda":
                    cudagraph_eager = cudagraphs_inner(
                        gm, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.eager_op = cudagraph_eager
                    compiled_fn = compile_fx(gm, gm_args)
                    cudagraph_compiled = cudagraphs_inner(
                        compiled_fn, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.inductor_op = cudagraph_compiled
                else:
                    self.eager_op = gm
                    self.inductor_op = gm

                yield gm_args

    def eager(self, input):
        return lambda: self.eager_op(input)

    def inductor(self, input):
        return lambda: self.inductor_op(input)

    class_attrs = {
        "eager": eager,
        "inductor": inductor,
        "get_input_iter": get_input_iter,
        "__init__": __init__,
    }
    new_class = type("Operator", (BenchmarkOperator,), class_attrs)
    new_class.op_eval = op_eval
    return new_class


def dynamically_create_aten_op_class(op_eval: OpOverload):
    """
    To stay consistent with custom operators, we dynamically create a benchmark
    class for each aten operator here.
    """
    maybe_load_operator_inputs_loader()
    class_name = f"aten_{str(op_eval).replace('.', '_')}"
    module_name = f"torchbenchmark.operator_loader.{class_name}"
    # Create a new module for each operator.
    op_name_module = types.ModuleType(module_name)
    sys.modules[module_name] = op_name_module
    op_class = create_operator_class(op_eval)
    # __module__ must be set for _find_op_name_from_module_path to work.
    op_class.__module__ = module_name
    op_name_module.Operator = op_class
    # Because the class is dynamically created, the decorator can't derive the
    # desired module path, so register the benchmarks manually.
    register_benchmark_mannually(class_name, "eager", baseline=True)
    register_benchmark_mannually(class_name, "inductor")
    return op_class
@@ -0,0 +1,115 @@
Operator: aten._log_softmax.default
cnt: 1, ((T([1024, 30000], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([1024, 30000], f16), T([1024, 30000], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([2, 512, 128], f16), 1.0), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
cnt: 1, ((T([30000, 128], f16), T([30000, 128], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([128], f16), T([1024, 4096], f16), T([4096, 128], f16, stride=(1, 4096))), {})
cnt: 1, ((T([30000], f16), T([1024, 128], f16), T([128, 30000], f16, stride=(1, 128))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.clone.default
cnt: 2, ((T([2, 512], i64),), {})
Operator: aten.copy_.default
cnt: 2, ((T([2, 512], i64), T([2, 512], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 30000], f16), T([30000, 128], f16)), {})
cnt: 1, ((T([30000, 1024], f16, stride=(1, 30000)), T([1024, 128], f16)), {})
cnt: 1, ((T([1024, 128], f16), T([128, 4096], f16)), {})
cnt: 1, ((T([128, 1024], f16, stride=(1, 128)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 2, ((T([2, 512, 128], f16), 0.5), {})
cnt: 2, ((T([2, 512, 128], f16), 0.044715), {})
cnt: 2, ((T([2, 512, 128], f16), 0.7978845608028654), {})
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 2, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 2, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f16), T([1024, 30000], f16), T([1024], i64), None, 1, -100, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([1024, 30000], f16), T([1024], i64), None, 1, -100), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 2.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 30000], f16), [0], True), {})
cnt: 1, ((T([1024, 128], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
cnt: 1, ((T([2, 512, 128], f16),), {})
Operator: aten.tanh_backward.default
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
@@ -0,0 +1,110 @@
Operator: aten._log_softmax.default
cnt: 2, ((T([2, 512], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 2, ((T([2, 512], f16), T([2, 512], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([], f16), T([], f16)), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([2], f16), T([1024, 4096], f16), T([4096, 2], f16, stride=(1, 4096))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.cat.default
cnt: 1, (([T([2, 512, 1], f16), T([2, 512, 1], f16)], 2), {})
Operator: aten.clamp.default
cnt: 2, ((T([2], i64), 0, 512), {})
Operator: aten.clone.default
cnt: 1, ((T([2, 512], i64),), {})
cnt: 2, ((T([2], i64),), {})
Operator: aten.copy_.default
cnt: 1, ((T([2, 512], i64), T([2, 512], i64)), {})
cnt: 2, ((T([2], i64), T([2], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
cnt: 2, ((T([], f16), 2), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 2], f16), T([2, 4096], f16)), {})
cnt: 1, ((T([2, 1024], f16, stride=(1, 2)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 1, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 2, ((T([], f16), T([2, 512], f16), T([2], i64), None, 1, 512, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 2, ((T([2, 512], f16), T([2], i64), None, 1, 512), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.split.Tensor
cnt: 1, ((T([2, 512, 2], f16), 1, -1), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 2], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
Operator: aten.tanh_backward.default
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})