From 8438d9808c441265b870c0acc1352d6506b1d7e6 Mon Sep 17 00:00:00 2001
From: Connor Goggins
Date: Thu, 13 Feb 2020 16:51:02 -0800
Subject: [PATCH] Implement all miscellaneous ops (#17511)

* Initial commit - added first batch of misc ops

* Initial commit - added first batch of misc ops

* Added remaining misc ops, including Custom op logic

* Added more test cases, fixed lint errors

* Update documentation

* Added run_backward=True for ops supporting backwards runs

* Added issue link for bilinear UpSampling

* Added remaining misc ops, including Custom op logic

* Update documentation

* Updated alias map

* Fixed missing and incorrect alias issues

* Added remaining missing aliases

* Fixed Custom profile dump parsing and alias

* Switched to using sets for O(1) op membership checks

* Added fix for dtype issue in master
---
 .../opperf/nd_operations/misc_operators.py  | 124 ++++++++++++++++++
 benchmark/opperf/opperf.py                  |   4 +
 benchmark/opperf/rules/default_params.py    |  36 ++++-
 benchmark/opperf/utils/benchmark_utils.py   |   2 +-
 benchmark/opperf/utils/op_registry_utils.py |  70 ++++++----
 benchmark/opperf/utils/profiler_utils.py    |  11 +-
 6 files changed, 217 insertions(+), 30 deletions(-)
 create mode 100644 benchmark/opperf/nd_operations/misc_operators.py

diff --git a/benchmark/opperf/nd_operations/misc_operators.py b/benchmark/opperf/nd_operations/misc_operators.py
new file mode 100644
index 000000000000..5a0efc57de0d
--- /dev/null
+++ b/benchmark/opperf/nd_operations/misc_operators.py
@@ -0,0 +1,124 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Performance benchmark tests for MXNet NDArray Miscellaneous Operations.
+
+The following 16 miscellaneous operators are covered:
+
+['reset_arrays', 'multi_all_finite', 'multi_sum_sq', 'add_n', 'UpSampling', 'Custom', 'squeeze',
+'all_finite', 'clip', 'multi_lars', 'SequenceReverse', 'SequenceLast', 'SequenceMask', 'cast_storage',
+'cumsum', 'fill_element_0index']
+
+"""
+
+import mxnet as mx
+
+from benchmark.opperf.utils.benchmark_utils import run_op_benchmarks
+from benchmark.opperf.utils.op_registry_utils import get_remaining_miscellaneous_operators
+
+from benchmark.opperf.utils.benchmark_utils import run_performance_test
+from benchmark.opperf.utils.common_utils import merge_map_list
+from benchmark.opperf.rules.default_params import MX_OP_MODULE
+
+from benchmark.opperf.custom_operations.custom_operations import CustomAddOneProp
+
+
+def run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='native', warmup=25, runs=100):
+    """Runs benchmarks with the given context and precision (dtype) for all the miscellaneous
+    operators in MXNet.
+
+    Parameters
+    ----------
+    ctx: mx.ctx
+        Context to run benchmarks
+    dtype: str, default 'float32'
+        Precision to use for benchmarks
+    profiler: str, default 'native'
+        Type of Profiler to use (native/python)
+    warmup: int, default 25
+        Number of times to run for warmup
+    runs: int, default 100
+        Number of runs to capture benchmark results
+
+    Returns
+    -------
+    Dictionary of results. Key -> Name of the operator, Value -> Benchmark results.
+
+    """
+    # Individual tests for ops with positional args
+    array_ops_benchmark = run_performance_test([getattr(MX_OP_MODULE, "reset_arrays"),
+                                                getattr(MX_OP_MODULE, "multi_all_finite"),
+                                                getattr(MX_OP_MODULE, "multi_sum_sq")],
+                                               run_backward=False,
+                                               dtype=dtype,
+                                               ctx=ctx,
+                                               profiler=profiler,
+                                               inputs=[{"args": [(1024, 1024)],
+                                                        "num_arrays": 1},
+                                                       {"args": [(10000, 1)],
+                                                        "num_arrays": 1},
+                                                       {"args": [(10000, 10)],
+                                                        "num_arrays": 1}],
+                                               warmup=warmup,
+                                               runs=runs)
+    add_n_benchmark = run_performance_test([getattr(MX_OP_MODULE, "add_n")],
+                                           run_backward=True,
+                                           dtype=dtype,
+                                           ctx=ctx,
+                                           profiler=profiler,
+                                           inputs=[{"args": [(1024, 1024)]},
+                                                   {"args": [(10000, 1)]},
+                                                   {"args": [(10000, 10)]}],
+                                           warmup=warmup,
+                                           runs=runs)
+    # There are currently issues with UpSampling with bilinear interpolation.
+    # Tracked here: https://github.com/apache/incubator-mxnet/issues/9138
+    upsampling_benchmark = run_performance_test([getattr(MX_OP_MODULE, "UpSampling")],
+                                                run_backward=True,
+                                                dtype=dtype,
+                                                ctx=ctx,
+                                                profiler=profiler,
+                                                inputs=[{"args": (32, 3, 256, 256),
+                                                         "scale": 2,
+                                                         "sample_type": "nearest"},
+                                                        {"args": (32, 3, 10000, 1),
+                                                         "scale": 4,
+                                                         "sample_type": "nearest"}],
+                                                warmup=warmup,
+                                                runs=runs)
+    # Create and register CustomAddOne operator for use in Custom op testing
+    c = CustomAddOneProp()
+    c.create_operator(ctx, [(1024, 1024)], [dtype])
+    custom_benchmark = run_performance_test([getattr(MX_OP_MODULE, "Custom")],
+                                            run_backward=True,
+                                            dtype=dtype,
+                                            ctx=ctx,
+                                            profiler=profiler,
+                                            inputs=[{"args": [(1024, 1024)],
+                                                     "op_type": "CustomAddOne"},
+                                                    {"args": [(10000, 1)],
+                                                     "op_type": "CustomAddOne"},
+                                                    {"args": [(10000, 10)],
+                                                     "op_type": "CustomAddOne"}],
+                                            warmup=warmup,
+                                            runs=runs)
+
+    # Fetch remaining Miscellaneous Operators
+    mx_misc_ops = get_remaining_miscellaneous_operators()
+    # Run benchmarks
+    mx_misc_op_results = run_op_benchmarks(mx_misc_ops, dtype, ctx, profiler, warmup, runs)
+    return merge_map_list(array_ops_benchmark + add_n_benchmark + upsampling_benchmark + custom_benchmark + [mx_misc_op_results])

diff --git a/benchmark/opperf/opperf.py b/benchmark/opperf/opperf.py
index dc71190e8659..5b8c43f417da 100755
--- a/benchmark/opperf/opperf.py
+++ b/benchmark/opperf/opperf.py
@@ -44,6 +44,7 @@
 from benchmark.opperf.nd_operations.indexing_routines import run_indexing_routines_benchmarks
 from benchmark.opperf.nd_operations.nn_loss_operators import run_loss_operators_benchmarks
 from benchmark.opperf.nd_operations.linalg_operators import run_linalg_operators_benchmarks
+from benchmark.opperf.nd_operations.misc_operators import run_mx_misc_operators_benchmarks
 
 from benchmark.opperf.utils.common_utils import merge_map_list, save_to_file
 from benchmark.opperf.utils.op_registry_utils import get_operators_with_no_benchmark, \
@@ -114,6 +115,9 @@ def run_all_mxnet_operator_benchmarks(ctx=mx.cpu(), dtype='float32', profiler='n
     # Run all NN loss operations benchmarks with default input values
     mxnet_operator_benchmark_results.append(run_loss_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
+
+    # Run all Miscellaneous operations benchmarks with default input values
+    mxnet_operator_benchmark_results.append(run_mx_misc_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))
 
     # Run all Linear Algebra operations benchmarks with default input values
     mxnet_operator_benchmark_results.append(run_linalg_operators_benchmarks(ctx=ctx, dtype=dtype, profiler=profiler, warmup=warmup, runs=runs))

diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py
index 8f10e4ebbeb6..31940da8eb77 100644
--- a/benchmark/opperf/rules/default_params.py
+++ b/benchmark/opperf/rules/default_params.py
@@ -115,6 +115,22 @@
 DEFAULT_DATA_4d = [(1, 4, 2, 4), (10, 25, 10, 100)]
 DEFAULT_BLOCK_SIZE = [2, 5]
 
+# For miscellaneous operators
+DEFAULT_DATA_SQUEEZE = [(1, 1024, 1024), (32, 1, 256, 256)]
+DEFAULT_AXIS_SQUEEZE = [0, 1]
+DEFAULT_A_MIN = [0.1]
+DEFAULT_A_MAX = [0.9]
+DEFAULT_LRS = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_WSS = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_GSS = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_WDS = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_ETA = [.5]
+DEFAULT_STYPE = ['default', 'csr', 'row_sparse']
+DEFAULT_A = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_LHS_FEI = [(1024, 1024), (10000, 1), (10000, 100)]
+DEFAULT_MHS = [(1024,), (10000,), (10000,)]
+DEFAULT_RHS_FEI = [(1024,), (10000,), (10000,)]
+
 # For swapaxis operator
 DEFAULT_DIM_1 = [0]
 DEFAULT_DIM_2 = [1]
@@ -236,7 +252,22 @@
                    "axes": DEFAULT_AXES,
                    "act_type_leakyrelu": DEFAULT_ACT_TYPE_LR,
                    "label_softmax": DEFAULT_LABEL_SOFTMAX,
-                   "act_type_activation": DEFAULT_ACT_TYPE_ACTIVATION}
+                   "act_type_activation": DEFAULT_ACT_TYPE_ACTIVATION,
+                   "data_squeeze": DEFAULT_DATA_SQUEEZE,
+                   "axis_squeeze": DEFAULT_AXIS_SQUEEZE,
+                   "a_min": DEFAULT_A_MIN,
+                   "a_max": DEFAULT_A_MAX,
+                   "lrs": DEFAULT_LRS,
+                   "weights_sum_sq": DEFAULT_WSS,
+                   "grads_sum_sq": DEFAULT_GSS,
+                   "wds": DEFAULT_WDS,
+                   "eta": DEFAULT_ETA,
+                   "eps": DEFAULT_EPSILON,
+                   "stype": DEFAULT_STYPE,
+                   "a": DEFAULT_A,
+                   "lhs_fill_element_0index": DEFAULT_LHS_FEI,
+                   "rhs_fill_element_0index": DEFAULT_RHS_FEI,
+                   "mhs": DEFAULT_MHS}
 
 # These are names of MXNet operator parameters that are of type NDArray.
@@ -250,4 +281,5 @@
                           "weight", "weight32", "grad", "mean", "var", "mom", "n", "d",
                           "v", "z", "g", "delta", "args", "indices", "shape_like", "y",
                           "x", "condition", "a", "index", "ravel_data", "label", "grid",
-                          "A", "B", "C", "r1", "r2", "rois"]
+                          "A", "B", "C", "r1", "r2", "rois", "lrs", "wds", "weights_sum_sq",
+                          "grads_sum_sq", "mhs"]

diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
index 60914118a56e..29223ff40aa9 100644
--- a/benchmark/opperf/utils/benchmark_utils.py
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -26,7 +26,7 @@
 from benchmark.opperf.rules.default_params import PARAMS_OF_TYPE_NDARRAY
 from .profiler_utils import cpp_profile, python_profile
 
-no_backward = ['gather_nd', 'softmax_cross_entropy', 'linalg_gelqf', 'linalg_slogdet', 'moments']
+no_backward = ['gather_nd', 'softmax_cross_entropy', 'linalg_gelqf', 'linalg_slogdet', 'moments', 'SequenceLast']
 
 def _prepare_op_inputs(inputs, run_backward, dtype, ctx):
     mx.random.seed(41)

diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py
index d2598310e852..de7ad4dcc93f 100644
--- a/benchmark/opperf/utils/op_registry_utils.py
+++ b/benchmark/opperf/utils/op_registry_utils.py
@@ -113,15 +113,17 @@ def prepare_op_inputs(op, arg_params):
     inputs = []
 
     # 4d tensor is needed only by the following two ops
-    ops_4d = ['depth_to_space', 'space_to_depth']
+    ops_4d = {'depth_to_space', 'space_to_depth'}
 
     # 3d tensor is needed by the following ops
-    ops_3d = ['CTCLoss', 'ctc_loss']
+    ops_3d = {'CTCLoss', 'ctc_loss'}
 
     # For ops with args that need to change shape/value for different ops
-    custom_data = ['Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian']
+    custom_data = {'Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator',
+                   'sample_multinomial', 'linalg_maketrian', 'squeeze', 'fill_element_0index'}
 
-    int_only = ['random_randint']
+    int_only = {'random_randint'}
+    float_only = {'log_softmax', 'softmax', 'softmin'}
 
     # Prepare op to default input mapping
     arg_values = {}
@@ -133,7 +135,7 @@ def prepare_op_inputs(op, arg_params):
         # all remaining operators take int as well as float
         if op in int_only and arg_name == "dtype":
             arg_values[arg_name] = DEFAULTS_INPUTS["dtype_int"]
-        elif op.startswith(('random','sample')) and arg_name == "dtype":
+        elif (op.startswith(('random','sample')) or op in float_only) and arg_name == "dtype":
             arg_values[arg_name] = DEFAULTS_INPUTS["dtype_float"]
         elif "NDArray" in arg_type and op == "ravel_multi_index":
             arg_values[arg_name] = DEFAULTS_INPUTS["ravel_data"]
@@ -185,7 +187,7 @@ def get_all_unary_operators():
         {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
     # Cast operators (cast & amp_cast are unary)
-    cast_ops = ['cast', 'amp_cast']
+    cast_ops = {'cast', 'amp_cast'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
@@ -232,7 +234,7 @@ def get_all_misc_binary_operators():
 
     # Filter for miscellaneous binary operators
     binary_misc_mx_operators = {}
-    for op_name, op_params in mx_operators.items():
+    for op_name, _ in mx_operators.items():
         if "choose_element_0index" == op_name:
             binary_misc_mx_operators[op_name] = mx_operators[op_name]
         elif "reshape_like" == op_name:
@@ -270,7 +272,7 @@ def get_all_random_sampling_operators():
         {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
     # Additional Random Sampling ops which do not start with "random_" or "sample_"
-    additional_random_sampling_ops = ['GridGenerator', 'BilinearSampler']
+    additional_random_sampling_ops = {'GridGenerator', 'BilinearSampler'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
@@ -290,10 +292,10 @@ def get_all_linalg_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    other_linalg_ops = ['moments']
+    other_linalg_ops = {'moments'}
 
     # Already tested linalg_potrf independently
-    independently_tested = ['linalg_potrf']
+    independently_tested = {'linalg_potrf'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
@@ -333,7 +335,7 @@ def get_all_nn_activation_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    nn_activation_ops = ['Softmax', 'SoftmaxActivation', 'softmin', 'Activation', 'LeakyReLU', 'hard_sigmoid', 'softmax', 'log_softmax']
+    nn_activation_ops = {'Softmax', 'SoftmaxActivation', 'softmin', 'Activation', 'LeakyReLU', 'hard_sigmoid', 'softmax', 'log_softmax'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
@@ -353,17 +355,17 @@ def get_all_optimizer_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    optimizer_ops = ['mp_sgd_update', 'signum_update', 'rmspropalex_update', 'ftml_update', 'rmsprop_update',
+    optimizer_ops = {'mp_sgd_update', 'signum_update', 'rmspropalex_update', 'ftml_update', 'rmsprop_update',
                      'sgd_mom_update', 'signsgd_update', 'mp_sgd_mom_update', 'ftrl_update', 'sgd_update',
                      'adam_update', 'mp_nag_mom_update', 'nag_mom_update', 'lamb_update_phase1',
-                     'lamb_update_phase2']
+                     'lamb_update_phase2'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
 
     # Filter for Optimizer operators
     optimizer_mx_operators = {}
-    for op_name, op_params in mx_operators.items():
+    for op_name, _ in mx_operators.items():
         if op_name in optimizer_ops:
             optimizer_mx_operators[op_name] = mx_operators[op_name]
     return optimizer_mx_operators
@@ -375,14 +377,14 @@ def get_all_sorting_searching_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    sort_search_ops = ['sort', 'argsort', 'argmax', 'argmin', 'topk']
+    sort_search_ops = {'sort', 'argsort', 'argmax', 'argmin', 'topk'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
 
     # Filter for Sort and search operators
     sort_search_mx_operators = {}
-    for op_name, op_params in mx_operators.items():
+    for op_name, _ in mx_operators.items():
         if op_name in sort_search_ops:
             sort_search_mx_operators[op_name] = mx_operators[op_name]
     return sort_search_mx_operators
@@ -395,33 +397,53 @@ def get_all_rearrange_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    rearrange_ops = ['transpose','swapaxes','flip','depth_to_space','space_to_depth']
+    rearrange_ops = {'transpose','swapaxes','flip','depth_to_space','space_to_depth'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
 
     # Filter for Array Rearrange operators
     rearrange_mx_operators = {}
-    for op_name, op_params in mx_operators.items():
+    for op_name, _ in mx_operators.items():
         if op_name in rearrange_ops:
             rearrange_mx_operators[op_name] = mx_operators[op_name]
     return rearrange_mx_operators
 
-def get_all_indexing_routines():
-    """Gets all indexing routines registered with MXNet.
+def get_remaining_miscellaneous_operators():
+    """Gets the remaining miscellaneous operators registered with MXNet that are not covered by individual tests.
 
     Returns
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
+    misc_ops = {'squeeze', 'all_finite', 'clip', 'multi_lars', 'SequenceReverse', 'SequenceLast', 'SequenceMask', 'cast_storage', 'cumsum', 'fill_element_0index'}
+
+    # Get all mxnet operators
+    mx_operators = _get_all_mxnet_operators()
+
+    # Filter for Miscellaneous operators
+    misc_mx_operators = {}
+    for op_name, _ in mx_operators.items():
+        if op_name in misc_ops:
+            misc_mx_operators[op_name] = mx_operators[op_name]
+    return misc_mx_operators
+
+def get_all_indexing_routines():
+    """Gets all indexing routines registered with MXNet.
+
     # @ChaiBapchya unravel_index errors out on certain inputs
     # tracked here https://github.com/apache/incubator-mxnet/issues/16771
     # @ChaiBapchya scatter_nd errors with core dump
     # tracked here https://github.com/apache/incubator-mxnet/issues/17480
-    indexing_routines = ['slice', 'slice_axis', 'slice_like', 'take', 'one_hot',
-                         'where', 'ravel_multi_index', 'gather_nd', 'pick']
+    Returns
+    -------
+    {"operator_name": {"has_backward", "nd_op_handle", "params"}}
+    """
+    indexing_routines = {'slice', 'slice_axis', 'slice_like', 'take', 'one_hot',
+                         'where', 'ravel_multi_index', 'gather_nd', 'pick'}
+
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
@@ -440,14 +462,14 @@ def get_all_loss_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    loss_ops = ['smooth_l1', 'CTCLoss', 'ctc_loss', 'MakeLoss', 'softmax_cross_entropy']
+    loss_ops = {'smooth_l1', 'CTCLoss', 'ctc_loss', 'MakeLoss', 'softmax_cross_entropy'}
 
     # Get all mxnet operators
     mx_operators = _get_all_mxnet_operators()
 
     # Filter for NN Loss operators
     loss_mx_operators = {}
-    for op_name, op_params in mx_operators.items():
+    for op_name, _ in mx_operators.items():
         if op_name in loss_ops:
             loss_mx_operators[op_name] = mx_operators[op_name]
     return loss_mx_operators

diff --git a/benchmark/opperf/utils/profiler_utils.py b/benchmark/opperf/utils/profiler_utils.py
index fa959bf5a8b1..e795a3aaa535 100644
--- a/benchmark/opperf/utils/profiler_utils.py
+++ b/benchmark/opperf/utils/profiler_utils.py
@@ -48,8 +48,9 @@ def _get_operator_profile(operator_name, operator_profile_results):
     # alias map : dictionary of the form {"alias" : "registered_name"}
     # allows retrieving an alias operator's profile from the profiler results
     # TODO handling - "identity" : "_copy"
-    alias_map = {"broadcast_plus": "broadcast_add", "broadcast_minus": "broadcast_sub", "flatten": "Flatten", "max_axis": "max",
-                 "swapaxes": "SwapAxis", "flip": "reverse", "reshape": "Reshape", "crop": "slice", "sum_axis": "sum", "min_axis": "min", "CTCLoss": "ctc_loss"}
+    alias_map = {"broadcast_plus": "broadcast_add", "broadcast_minus": "broadcast_sub", "flatten": "Flatten", "max_axis": "max", "Custom": "CustomAddOne",
+                 "swapaxes": "SwapAxis", "flip": "reverse", "reshape": "Reshape", "crop": "slice", "sum_axis": "sum", "min_axis": "min", "ctc_loss": "CTCLoss",
+                 "fill_element_0index": "TernaryOp", "identity": "_copy", "ElementWiseSum": "add_n", "choose_element_0index": "pick", "stop_gradient": "BlockGrad"}
 
     op_name = None
@@ -135,7 +136,11 @@ def parse_profiler_dump(operator_name, profiler_dump):
     # String Patterns to look out for when parsing
     memory_profile_result_start = "Device Storage"  # Helps identify start of Memory profile
     c_api_profile_result_start = "MXNET_C_API"  # Helps identify end of Memory profile
-    operator_profile_result_start = "operator"  # Helps identify start of Operator profile
+
+    if operator_name == "Custom":
+        operator_profile_result_start = "Custom Operator"  # Helps identify start of Custom Operator profile
+    else:
+        operator_profile_result_start = "operator"  # Helps identify start of Operator profile
 
     memory_profile_results = []
     operator_profile_results = []
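
Usage sketch: a minimal example of invoking the new miscellaneous-op benchmarks standalone
once this patch is applied. It assumes MXNet is installed and the repository root (the
directory containing benchmark/) is on PYTHONPATH; the function name and its defaults come
straight from misc_operators.py above.

    import mxnet as mx

    from benchmark.opperf.nd_operations.misc_operators import run_mx_misc_operators_benchmarks

    # Benchmark the 16 miscellaneous operators on CPU with the native profiler.
    # Returns a dictionary mapping operator name -> benchmark results.
    results = run_mx_misc_operators_benchmarks(ctx=mx.cpu(), dtype='float32',
                                               profiler='native', warmup=25, runs=100)
    print(results)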