Add AtenOp Benchmarking (#2495)
Summary:
As described in pytorch/pytorch#136168, I'm migrating the native PyTorch implementation-comparison benchmark ([the original operatorbench](https://github.com/pytorch/pytorch/blob/main/benchmarks/dynamo/microbenchmarks/operatorbench.py)) to TritonBench.

This PR adds an operator loader that can load the aten ops recorded from TorchBench, HuggingFace, and TIMM models. A benchmark class is created dynamically for each op, and its aten (eager) and Inductor implementations are then benchmarked against each other.
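
Internally, each benchmark class is assembled at runtime with `type(...)` and registered under a synthetic module. Below is a simplified, self-contained sketch of the pattern used in `torchbenchmark/operator_loader/__init__.py`; `DummyBenchmark` and `make_op_class` are illustrative stand-ins, not part of this PR:
```python
import sys
import types


class DummyBenchmark:
    """Stand-in for tritonbench's BenchmarkOperator base class."""


def make_op_class(op_name: str):
    def __init__(self):
        self.op_name = op_name

    def eager(self):
        return lambda: f"eager run of {self.op_name}"

    # Build a fresh class per aten op, mirroring create_operator_class().
    cls = type("Operator", (DummyBenchmark,), {"__init__": __init__, "eager": eager})

    # Give the class its own module so module-path-based registration works,
    # mirroring dynamically_create_aten_op_class().
    module_name = f"operator_loader.{op_name.replace('.', '_')}"
    module = types.ModuleType(module_name)
    module.Operator = cls
    sys.modules[module_name] = module
    cls.__module__ = module_name
    return cls


OpClass = make_op_class("aten._softmax.default")
print(OpClass().eager()())  # "eager run of aten._softmax.default"
```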

The files `torchbenchmark/operator_loader/operator_inp_utils.py` and `torchbenchmark/operator_loader/operatorbench.py`, as well as all config files under `torchbenchmark/operator_loader/operator_inp_logs/`, are copied from the original operatorbench.
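
Each log file lists operator overloads together with the argument signatures observed while tracing the models: a `cnt` line records how many times that call signature was seen, and `T([...], dtype)` (optionally with `stride=`) denotes a tensor of that shape and dtype. For example, from one of the copied configs:
```
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
```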

Example command:
```bash
python run_benchmark.py triton --op aten._softmax.default --num-inputs 1 --operator-loader --precision fp16
```
Example output:
```
Evaluating an op name into an OpOverload: The underlying op of 'aten.upsample_nearest2d_backward' has no overload name 'vec'
Evaluating an op name into an OpOverload: '_OpNamespace' 'aten' object has no attribute 'im2col_backward'
Evaluating an op name into an OpOverload: '_OpNamespace' 'aten' object has no attribute 'col2im_backward'
Evaluating an op name into an OpOverload: '_OpNamespace' 'aten' object has no attribute 'im2col_backward'
Evaluating an op name into an OpOverload: The underlying op of 'aten.upsample_bilinear2d_backward' has no overload name 'vec'
Evaluating an op name into an OpOverload: The underlying op of 'aten.upsample_nearest2d_backward' has no overload name 'vec'
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.20s/it]
  x_val    eager-latency    inductor-latency
-------  ---------------  ------------------
      0         0.090592            0.089632
      1         0.055808            0.038112
```
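
The loader can also be driven from Python; a minimal sketch using the `list_operators` helper added in `torchbenchmark/operator_loader/__init__.py` (assuming a TritonBench checkout on `PYTHONPATH`):
```python
# Enumerate the aten ops that have recorded inputs from TorchBench,
# HuggingFace, and TIMM models.
from torchbenchmark.operator_loader import list_operators

for op_name in sorted(list_operators()):
    print(op_name)  # e.g. "aten._softmax.default"
```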

Pull Request resolved: #2495

Reviewed By: xuzhao9

Differential Revision: D64200358

Pulled By: FindHao

fbshipit-source-id: f0121168b33247224bc905a1a88af69e4b13def6
FindHao authored and facebook-github-bot committed Oct 12, 2024
1 parent f9f52f6 commit 34d4f94
Showing 160 changed files with 28,510 additions and 3 deletions.
157 changes: 157 additions & 0 deletions torchbenchmark/operator_loader/__init__.py
@@ -0,0 +1,157 @@
import argparse
import sys
import types
from typing import Any, Generator, List, Optional

import torch
from torch._dynamo.backends.cudagraphs import cudagraphs_inner
from torch._inductor.compile_fx import compile_fx
from torch._inductor.utils import gen_gm_and_inputs
from torch._ops import OpOverload
from torch.utils._pytree import tree_map_only

from torchbenchmark.util.triton_op import (
    BenchmarkOperator,
    register_benchmark_mannually,
)

from .operator_inp_utils import aten, OperatorInputsLoader, to_channels_last

timm_loader = None
huggingface_loader = None
torchbench_loader = None


def maybe_load_operator_inputs_loader():
    global timm_loader, huggingface_loader, torchbench_loader
    if timm_loader is None:
        timm_loader = OperatorInputsLoader.get_timm_loader()
    if huggingface_loader is None:
        huggingface_loader = OperatorInputsLoader.get_huggingface_loader()
    if torchbench_loader is None:
        torchbench_loader = OperatorInputsLoader.get_torchbench_loader()


def parse_args(extra_args: Optional[List[str]] = None):
    parser = argparse.ArgumentParser(allow_abbrev=False)
    parser.add_argument(
        "--channel-list",
        action="store_true",
        help="Flag to enable channel list benchmarking.",
    )
    return parser.parse_known_args(extra_args)


def list_operators() -> List[str]:
    """In the original operator benchmark design, all operators are registered in the
    operator loader. We need to collect them here.
    """
    maybe_load_operator_inputs_loader()
    all_ops = (
        list(timm_loader.get_all_ops())
        + list(huggingface_loader.get_all_ops())
        + list(torchbench_loader.get_all_ops())
    )
    # remove duplicate operators
    all_ops_str = list(set(str(item) for item in all_ops))
    return all_ops_str


def load_opbench_by_name_from_loader(args: argparse.Namespace):
    all_ops_str = list_operators()
    if args.op not in all_ops_str:
        raise ValueError(f"{args.op} is not found in the operator loader.")
    # args.op is a string; evaluate it to get the actual operator overload.
    op_eval = eval(args.op)
    return dynamically_create_aten_op_class(op_eval)


def create_operator_class(op_eval: OpOverload):
    """Create a new benchmark class for the given operator overload."""

    def __init__(
        self, tb_args: argparse.Namespace, extra_args: Optional[List[str]] = None
    ):
        BenchmarkOperator.__init__(self, tb_args, extra_args)
        native_args, _ = parse_args(extra_args)
        self.channel_list = native_args.channel_list
        self.device = tb_args.device
        self.huggingface_loader = huggingface_loader
        self.torchbench_loader = torchbench_loader
        self.timm_loader = timm_loader
        # CUDA graphs are already applied when building the input iterator (see
        # get_input_iter), so we don't use tritonbench's own CUDA graph support.
        self.use_cuda_graphs = False
        self.DEFAULT_PRECISION = "fp16"
        assert self.dtype in (
            torch.float16,
            torch.float32,
        ), f"AtenOpBenchmark only supports fp16 and fp32, but got {self.dtype}"
    def get_input_iter(self) -> Generator:
        inps_gens = [self.huggingface_loader, self.torchbench_loader, self.timm_loader]
        for inp_gen in inps_gens:
            for inp in inp_gen.get_inputs_for_operator(
                self.op_eval, self.dtype, self.device
            ):
                args, kwargs = inp
                if self.channel_list:
                    args, kwargs = tree_map_only(
                        torch.Tensor, to_channels_last, (args, kwargs)
                    )
                gm, gm_args = gen_gm_and_inputs(self.op_eval, args, kwargs)
                torch.jit._builtins._register_builtin(
                    torch.ops.aten.convolution_backward.default,
                    "aten::convolution_backward",
                )
                if self.device == "cuda":
                    cudagraph_eager = cudagraphs_inner(
                        gm, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.eager_op = cudagraph_eager
                    compiled_fn = compile_fx(gm, gm_args)
                    cudagraph_compiled = cudagraphs_inner(
                        compiled_fn, gm_args, copy_outputs=False, copy_inputs=False
                    )
                    self.inductor_op = cudagraph_compiled
                else:
                    self.eager_op = gm
                    self.inductor_op = gm

                yield gm_args

    def eager(self, input):
        return lambda: self.eager_op(input)

    def inductor(self, input):
        return lambda: self.inductor_op(input)

    class_attrs = {
        "eager": eager,
        "inductor": inductor,
        "get_input_iter": get_input_iter,
        "__init__": __init__,
    }
    new_class = type("Operator", (BenchmarkOperator,), class_attrs)
    new_class.op_eval = op_eval
    return new_class


def dynamically_create_aten_op_class(op_eval: OpOverload):
    """
    To stay consistent with custom operators, we dynamically create a benchmark
    class for each aten operator here.
    """
    maybe_load_operator_inputs_loader()
    class_name = f"aten_{str(op_eval).replace('.', '_')}"
    module_name = f"torchbenchmark.operator_loader.{class_name}"
    # Create a new module for each operator.
    op_name_module = types.ModuleType(module_name)
    sys.modules[module_name] = op_name_module
    op_class = create_operator_class(op_eval)
    # __module__ must be set for _find_op_name_from_module_path to work.
    op_class.__module__ = module_name
    op_name_module.Operator = op_class
    # Because the class is dynamically created, the decorator can't derive the
    # desired module path, so register the benchmarks manually.
    register_benchmark_mannually(class_name, "eager", baseline=True)
    register_benchmark_mannually(class_name, "inductor")
    return op_class
@@ -0,0 +1,115 @@
Operator: aten._log_softmax.default
cnt: 1, ((T([1024, 30000], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 1, ((T([1024, 30000], f16), T([1024, 30000], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([2, 512, 128], f16), 1.0), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
cnt: 1, ((T([30000, 128], f16), T([30000, 128], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([128], f16), T([1024, 4096], f16), T([4096, 128], f16, stride=(1, 4096))), {})
cnt: 1, ((T([30000], f16), T([1024, 128], f16), T([128, 30000], f16, stride=(1, 128))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.clone.default
cnt: 2, ((T([2, 512], i64),), {})
Operator: aten.copy_.default
cnt: 2, ((T([2, 512], i64), T([2, 512], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 30000], f16), T([30000, 128], f16)), {})
cnt: 1, ((T([30000, 1024], f16, stride=(1, 30000)), T([1024, 128], f16)), {})
cnt: 1, ((T([1024, 128], f16), T([128, 4096], f16)), {})
cnt: 1, ((T([128, 1024], f16, stride=(1, 128)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 2, ((T([2, 512, 128], f16), 0.5), {})
cnt: 2, ((T([2, 512, 128], f16), 0.044715), {})
cnt: 2, ((T([2, 512, 128], f16), 0.7978845608028654), {})
cnt: 4, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 2, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 2, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 1, ((T([], f16), T([1024, 30000], f16), T([1024], i64), None, 1, -100, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 1, ((T([1024, 30000], f16), T([1024], i64), None, 1, -100), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 3.0), {})
cnt: 1, ((T([2, 512, 128], f16), 2.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 30000], f16), [0], True), {})
cnt: 1, ((T([1024, 128], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
cnt: 1, ((T([2, 512, 128], f16),), {})
Operator: aten.tanh_backward.default
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
@@ -0,0 +1,110 @@
Operator: aten._log_softmax.default
cnt: 2, ((T([2, 512], f16), 1, False), {})
Operator: aten._log_softmax_backward_data.default
cnt: 2, ((T([2, 512], f16), T([2, 512], f16), 1, f16), {})
Operator: aten._softmax.default
cnt: 12, ((T([2, 64, 512, 512], f16), -1, False), {})
Operator: aten._softmax_backward_data.default
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 64, 512, 512], f16), -1, f16), {})
Operator: aten._to_copy.default
cnt: 1, ((T([2, 1, 1, 512], f32),), {'dtype': f16})
Operator: aten._unsafe_view.default
cnt: 36, ((T([2, 64, 512, 64], f16), [128, 512, 64]), {})
cnt: 12, ((T([2, 64, 64, 512], f16), [128, 64, 512]), {})
cnt: 12, ((T([128, 512, 512], f16), [2, 64, 512, 512]), {})
cnt: 12, ((T([128, 512, 64], f16), [2, 64, 512, 64]), {})
cnt: 36, ((T([2, 512, 64, 64], f16), [2, 512, 4096]), {})
cnt: 12, ((T([2, 512, 4096], f16), [1024, 4096]), {})
Operator: aten.add.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16)), {})
cnt: 12, ((T([2, 64, 512, 512], f16), T([2, 1, 1, 512], f16)), {})
cnt: 72, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16)), {})
cnt: 36, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
cnt: 12, ((T([2, 512, 16384], f16), 1.0), {})
cnt: 1, ((T([], f16), T([], f16)), {})
cnt: 99, ((T([4096], f16), T([4096], f16)), {})
cnt: 11, ((T([4096, 16384], f16), T([4096, 16384], f16)), {})
cnt: 11, ((T([16384], f16), T([16384], f16)), {})
cnt: 11, ((T([16384, 4096], f16), T([16384, 4096], f16)), {})
cnt: 44, ((T([4096, 4096], f16), T([4096, 4096], f16)), {})
Operator: aten.add_.Tensor
cnt: 1, ((T([2, 512, 128], f16), T([1, 512, 128], f16)), {})
Operator: aten.addmm.default
cnt: 1, ((T([4096], f16), T([1024, 128], f16), T([128, 4096], f16, stride=(1, 128))), {})
cnt: 48, ((T([4096], f16), T([1024, 4096], f16), T([4096, 4096], f16, stride=(1, 4096))), {})
cnt: 12, ((T([16384], f16), T([1024, 4096], f16), T([4096, 16384], f16, stride=(1, 4096))), {})
cnt: 12, ((T([4096], f16), T([1024, 16384], f16), T([16384, 4096], f16, stride=(1, 16384))), {})
cnt: 1, ((T([2], f16), T([1024, 4096], f16), T([4096, 2], f16, stride=(1, 4096))), {})
Operator: aten.bmm.default
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 512], f16, stride=(262144, 1, 512)), T([128, 512, 64], f16)), {})
cnt: 12, ((T([128, 512, 64], f16), T([128, 64, 512], f16, stride=(32768, 1, 64))), {})
cnt: 12, ((T([128, 64, 512], f16, stride=(32768, 1, 64)), T([128, 512, 512], f16)), {})
cnt: 12, ((T([128, 512, 512], f16), T([128, 512, 64], f16, stride=(32768, 1, 512))), {})
Operator: aten.cat.default
cnt: 1, (([T([2, 512, 1], f16), T([2, 512, 1], f16)], 2), {})
Operator: aten.clamp.default
cnt: 2, ((T([2], i64), 0, 512), {})
Operator: aten.clone.default
cnt: 1, ((T([2, 512], i64),), {})
cnt: 2, ((T([2], i64),), {})
Operator: aten.copy_.default
cnt: 1, ((T([2, 512], i64), T([2, 512], i64)), {})
cnt: 2, ((T([2], i64), T([2], i64)), {})
Operator: aten.div.Tensor
cnt: 24, ((T([2, 64, 512, 512], f16), 8.0), {})
cnt: 2, ((T([], f16), 2), {})
Operator: aten.embedding.default
cnt: 1, ((T([30000, 128], f16), T([2, 512], i64), 0), {})
cnt: 1, ((T([2, 128], f16), T([2, 512], i64, stride=(0, 1))), {})
cnt: 1, ((T([512, 128], f16), T([1, 512], i64)), {})
Operator: aten.embedding_dense_backward.default
cnt: 1, ((T([1, 512, 128], f16), T([1, 512], i64), 512, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64, stride=(0, 1)), 2, -1, False), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512], i64), 30000, 0, False), {})
Operator: aten.mm.default
cnt: 1, ((T([1024, 2], f16), T([2, 4096], f16)), {})
cnt: 1, ((T([2, 1024], f16, stride=(1, 2)), T([1024, 4096], f16)), {})
cnt: 12, ((T([1024, 4096], f16), T([4096, 16384], f16)), {})
cnt: 12, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 16384], f16)), {})
cnt: 12, ((T([1024, 16384], f16), T([16384, 4096], f16)), {})
cnt: 12, ((T([16384, 1024], f16, stride=(1, 16384)), T([1024, 4096], f16)), {})
cnt: 48, ((T([1024, 4096], f16), T([4096, 4096], f16)), {})
cnt: 48, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 4096], f16)), {})
cnt: 1, ((T([1024, 4096], f16), T([4096, 128], f16)), {})
cnt: 1, ((T([4096, 1024], f16, stride=(1, 4096)), T([1024, 128], f16)), {})
Operator: aten.mul.Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
Operator: aten.mul.Tensor
cnt: 1, ((T([2, 1, 1, 512], f16), -65504.0), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.5), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.044715), {})
cnt: 24, ((T([2, 512, 16384], f16), 0.7978845608028654), {})
cnt: 48, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})
Operator: aten.native_layer_norm.default
cnt: 1, ((T([2, 512, 128], f16), [128], T([128], f16), T([128], f16), 1e-12), {})
cnt: 24, ((T([2, 512, 4096], f16), [4096], T([4096], f16), T([4096], f16), 1e-12), {})
Operator: aten.native_layer_norm_backward.default
cnt: 24, ((T([2, 512, 4096], f16), T([2, 512, 4096], f16), [4096], T([2, 512, 1], f32), T([2, 512, 1], f32), T([4096], f16), T([4096], f16), [True, True, True]), {})
cnt: 1, ((T([2, 512, 128], f16), T([2, 512, 128], f16), [128], T([2, 512, 1], f32), T([2, 512, 1], f32), T([128], f16), T([128], f16), [True, True, True]), {})
Operator: aten.nll_loss_backward.default
cnt: 2, ((T([], f16), T([2, 512], f16), T([2], i64), None, 1, 512, T([], f16)), {})
Operator: aten.nll_loss_forward.default
cnt: 2, ((T([2, 512], f16), T([2], i64), None, 1, 512), {})
Operator: aten.pow.Tensor_Scalar
cnt: 12, ((T([2, 512, 16384], f16), 3.0), {})
cnt: 12, ((T([2, 512, 16384], f16), 2.0), {})
Operator: aten.rsub.Scalar
cnt: 1, ((T([2, 1, 1, 512], f16), 1.0), {})
Operator: aten.split.Tensor
cnt: 1, ((T([2, 512, 2], f16), 1, -1), {})
Operator: aten.sum.SymInt
cnt: 1, ((T([1024, 2], f16), [0], True), {})
cnt: 61, ((T([1024, 4096], f16), [0], True), {})
cnt: 12, ((T([1024, 16384], f16), [0], True), {})
cnt: 1, ((T([2, 512, 128], f16), [0], True), {})
Operator: aten.tanh.default
cnt: 12, ((T([2, 512, 16384], f16),), {})
Operator: aten.tanh_backward.default
cnt: 12, ((T([2, 512, 16384], f16), T([2, 512, 16384], f16)), {})