Fix ONNXRT calibration for Dml EP (#1526)
Signed-off-by: yuwenzho <yuwen.zhou@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
yuwenzho and pre-commit-ci[bot] authored Jan 18, 2024
1 parent d22df53 commit c58aeaa
Showing 3 changed files with 145 additions and 47 deletions.
neural_compressor/adaptor/onnxrt.py: 2 changes (1 addition, 1 deletion)
@@ -765,7 +765,7 @@ def _get_quantize_params(self, model, data_loader, quantize_config, iterations,
black_nodes=black_nodes,
white_nodes=white_nodes,
iterations=list(range(0, iterations)),
backend=self.backend if self.backend != "DmlExecutionProvider" else "CPUExecutionProvider",
backend=self.backend,
reduce_range=self.reduce_range,
**kwargs,
)
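The one-line change above forwards the requested execution provider (e.g. DmlExecutionProvider) to the calibration augmenter instead of replacing it with CPUExecutionProvider. As a rough illustration of where that backend string ends up, here is a hedged sketch of a calibration session created with a caller-chosen provider; the helper name and model path are hypothetical, and the session options mirror the ORT_DISABLE_ALL setting visible in calibration.py below:

import onnxruntime as ort

def make_calibration_session(model_path, backend):
    """Hypothetical helper: build the inference session used during calibration."""
    so = ort.SessionOptions()
    # Disable graph optimizations so the dumped intermediate tensors match the
    # original graph, as calibration.py does with ORT_DISABLE_ALL.
    so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL
    # Before this commit DmlExecutionProvider was swapped for CPUExecutionProvider
    # at this point; now the requested provider is passed through unchanged.
    return ort.InferenceSession(model_path, so, providers=[backend])

# e.g. make_calibration_session("augmented_model.onnx", "DmlExecutionProvider")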
neural_compressor/adaptor/ox_utils/calibration.py: 164 changes (128 additions, 36 deletions)
@@ -105,7 +105,7 @@ def dataloder_for_next_split_model(self):
"""Return dataloader for next split model for layer-wise quantization."""
return self._dataloder_for_next_split_model

def augment_graph(self, activation_only=False, weight_only=False):
def augment_graph(self):
"""Augment_graph.
Adds nodes to all quantization_candidates op type nodes in model and
@@ -118,7 +118,7 @@ def augment_graph(self, activation_only=False, weight_only=False):
self.dequantized_output.clear()
onnx_version = Version(onnx.__version__)
if onnx_version < ONNX18_VERSION:
logger.warning("Static quantization for NLP model is supported " "at onnx 1.8.0 and newer.")
logger.warning("Static quantization for NLP model is supported at onnx 1.8.0 and newer.")
if self.already_quantized and any(
[i.dims in [1, 2] for i in self.model_wrapper.initializer() if i.name.endswith("_scale")]
):
@@ -138,53 +138,43 @@ def augment_graph(self, activation_only=False, weight_only=False):
for augment_node_type in self.augment_nodes:
if augment_node_type not in ["DequantizeLinear"]: # pragma: no cover
raise ValueError(
"Unexpected augment_node {} only DequantizeLinear is " "supported".format(augment_node_type)
"Unexpected augment_node {} only DequantizeLinear is supported".format(augment_node_type)
)

if self.already_quantized:
# mapping between fp32 node and int8 node
new_white_nodes = []
for white_node in self.white_nodes:
new_white_node = white_node + "_quant"
assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node)
assert new_white_node in model_nodes_names, "no quantized {} in the graph".format(white_node)
new_white_nodes.append(new_white_node)
self.white_nodes = new_white_nodes

initializers = {i.name: i.data_type for i in model.graph.initializer}
node_outputs = []
for node in model.graph.node: # pylint: disable=no-member
node_outputs.extend(node.output)
should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or (
node.name in self.white_nodes
)
if should_be_dump:
if not weight_only and not activation_only:
tensors_to_dump.update([input for input in node.input if len(input) != 0])
tensors_to_dump.update([output for output in node.output if len(output) != 0])
tensors_to_dump.update(node.output)
elif weight_only:
for input in node.input:
if (
self.already_quantized
and input.replace("_dequantized", "_quantized") in initializers
and len(input) != 0
):
tensors_to_dump.add(input)
elif not self.already_quantized and input in initializers and len(input) != 0:
# add input tensors which should be dump
for input in node.input:
if len(input) != 0: # to prevent input is ""
initializer_tensor = self.model_wrapper.get_initializer(input)
if initializer_tensor is None:
tensors_to_dump.add(input)
elif activation_only:
if len(node.input[0]) != 0:
tensors_to_dump.update([node.input[0]])
# add output tensors which should be dump
tensors_to_dump.update([output for output in node.output if len(output) != 0])

model_inputs = [i.name for i in model.graph.input]
for tensor in tensors_to_dump:
if tensor not in node_outputs and tensor not in initializers and tensor not in model_inputs:
if tensor not in node_outputs and tensor not in model_inputs:
continue
if self.augment_nodes:
for augment_node_type in self.augment_nodes:
if augment_node_type in ["DequantizeLinear"]:
# insert DequantizeLinear node as output
if tensor.endswith("_scale") or tensor.endswith("_zero_point"):
if tensor.endswith("_scale") or tensor.endswith("_zero_point"): # pragma: no cover
continue

if not self.dynamically_quantized:
@@ -238,10 +228,18 @@ def augment_graph(self, activation_only=False, weight_only=False):
convert_attribute=False,
)

def get_intermediate_outputs(self, q_config=None):
"""Gather intermediate model outputs after running inference."""
def get_activation_tensors_calib_range(self, q_config=None):
"""Get calib ranges of activation tensors.
Args:
q_config (dict, optional): quantization config. Defaults to None.
Returns:
dict: calib ranges
"""
# conduct inference session and get intermediate outputs
so = onnxruntime.SessionOptions()
so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
if sys.version_info < (3, 11) and find_spec("onnxruntime_extensions"): # pragma: no cover
from onnxruntime_extensions import get_library_path

@@ -280,7 +278,7 @@ def get_intermediate_outputs(self, q_config=None):
assert node, "{} is neither an input nor an output of nodes in augmented model.".format(data_name)
name_to_node[data_name] = node.name

output_dicts = {}
activation_tensors_calib_range = {}
intermediate_tensor = {}
name_to_calibrator = {}
ort_inputs_for_next_split_model = []
@@ -294,8 +292,8 @@ def get_intermediate_outputs(self, q_config=None):
else:
ort_inputs.update({inputs_names[0]: to_numpy(inputs)})
else:
# skip check input length for layer-wise calibration
if not self.layer_wise:
# for layer-wise calibration
assert len_inputs == len(inputs), "number of input tensors must align with graph inputs"

if isinstance(inputs, dict):
@@ -335,14 +333,16 @@ def _collect_data(ort_inputs):
# per iteration in the future.
if calibrator.method_name == "minmax":
calibrator.collect(output)
output_dicts[node_output_names[output_idx]] = [list(calibrator.calib_range)]
activation_tensors_calib_range[node_output_names[output_idx]] = [
list(calibrator.calib_range)
]
name_to_calibrator[node_output_names[output_idx]] = calibrator
else:
intermediate_tensor.setdefault((node_output_names[output_idx], node_name), []).append(
output
)
elif q_config is None:
output_dicts.setdefault(node_output_names[output_idx], []).append(output)
activation_tensors_calib_range.setdefault(node_output_names[output_idx], []).append(output)

if self.layer_wise:
# for layer-wise calibration
@@ -369,12 +369,94 @@ def _collect_data(ort_inputs):
)
calibrator = CALIBRATOR[calib_method]()
calibrator.collect(datas)
output_dicts.setdefault(output_name, []).append(list(calibrator.calib_range))
activation_tensors_calib_range.setdefault(output_name, []).append(list(calibrator.calib_range))
calibrator.clear()
del calibrator

# set for layer-wise quant
self._dataloder_for_next_split_model = ort_inputs_for_next_split_model

return activation_tensors_calib_range

def get_weight_tensors_calib_range(self):
"""Get calib ranges of weight tensors.
Returns:
dict: calib ranges
"""
model_nodes_names = [node.name for node in self.model.graph.node]

# if augmented_model is not None, it means self.white_nodes is already updated in augment_graph func
# then skip update here
if self.already_quantized and self.augmented_model is None:
# mapping between fp32 node and int8 node
new_white_nodes = []
for white_node in self.white_nodes:
new_white_node = white_node + "_quant"
assert new_white_node in model_nodes_names, "no quantized {} in the " "graph".format(white_node)
new_white_nodes.append(new_white_node)
self.white_nodes = new_white_nodes

added_outputs = set()
initializer_tensors_to_dump = []
initializers = [init.name for init in self.model.graph.initializer]
for node in self.model.graph.node: # pylint: disable=no-member
should_be_dump = ((node.op_type in self.dump_op_types) and (node.name not in self.black_nodes)) or (
node.name in self.white_nodes
)
if should_be_dump:
for input in node.input:
if (
(self.already_quantized and input.replace("_dequantized", "_quantized") in initializers)
or (not self.already_quantized and input in initializers)
) and len(input) != 0:
added_outputs.add(input)

for tensor in added_outputs:
if tensor not in initializers:
continue
if self.augment_nodes:
for augment_node_type in self.augment_nodes:
if augment_node_type in ["DequantizeLinear"]:
if not (tensor.endswith("_scale") or tensor.endswith("_zero_point")):
initializer_tensors_to_dump.append(tensor)
else:
initializer_tensors_to_dump.append(tensor)

weight_tensors_calib_range = {}
for initializer_tensor_name in initializer_tensors_to_dump:
if self.layer_wise:
self.model_wrapper.load_model_initializer_by_tensor()
initializer_tensor = self.model_wrapper.get_initializer(initializer_tensor_name)

# double check initializer tensor is not None
if initializer_tensor is None: # pragma: no cover
continue

initializer_tensor = numpy_helper.to_array(
initializer_tensor,
base_dir=os.path.dirname(self.model_wrapper.model_path)
if self.model_wrapper.model_path is not None
else "",
)
calibrator = CALIBRATOR["minmax"]() # use minmax method to calibrate initializer tensors
calibrator.collect(initializer_tensor)
weight_tensors_calib_range[initializer_tensor_name] = [list(calibrator.calib_range)]
calibrator.clear()
del calibrator
return weight_tensors_calib_range

def get_intermediate_outputs(self, q_config=None, activation_only=False, weight_only=False):
"""Gather intermediate model outputs after running inference."""
output_dicts = {}
if not activation_only and not weight_only:
output_dicts = self.get_activation_tensors_calib_range(q_config)
output_dicts.update(self.get_weight_tensors_calib_range())
elif weight_only:
output_dicts = self.get_weight_tensors_calib_range()
elif activation_only:
output_dicts = self.get_activation_tensors_calib_range(q_config)

return list(output_dicts.keys()), output_dicts

def _dequantize(self, tensor, scale_tensor, zo_tensor):
@@ -472,7 +554,12 @@ def _map_calibration(self, node_output_names, output_dicts):
return final_dict

def dump_minmax(self, q_config):
"""Get min/max values of tensors."""
"""Get calib ranges of tensors."""
# pipeline of getting calib ranges of tensors during calibration:
# 1. augment_graph(): insert activation tensors to model output
# 2. get_intermediate_outputs():
# 2.1 get_activation_tensors_calib_range(): get calib ranges of activation tensors using the augment graph
# 2.2 get_weight_tensors_calib_range(): get calib ranges of weight tensors
self.augment_graph()
node_output_names, output_dicts = self.get_intermediate_outputs(q_config)
return self._map_calibration(node_output_names, output_dicts)
@@ -553,15 +640,20 @@ def dump_tensor(self, activation=True, weight=False, format=None):
self.already_quantized = True
self.dynamically_quantized = "DynamicQuantizeLinear" in [node.op_type for node in self.model.graph.node]
is_qdq = format == "qdq"
self.augment_graph(activation_only=not weight, weight_only=not activation)
_, output_dicts = self.get_intermediate_outputs()
if activation:
self.augment_graph() # add activation tensors to model output
_, output_dicts = self.get_intermediate_outputs(activation_only=not weight, weight_only=not activation)
iters = len(list(output_dicts.values())[-1])
map_node_activation = [{} for _ in range(iters)]
map_node_weight = {}
self.white_nodes = [node.replace("_quant", "") for node in self.white_nodes]
augmengted_wrapper = ONNXModel(self.augmented_model)
map_output = augmengted_wrapper.output_name_to_node
map_input = augmengted_wrapper.input_name_to_nodes

if activation and self.augmented_model is None:
raise ValueError("augmented model should not be None when dump activation tensors.")
# if activation tensors are not dumped, then use origin model wrapper
model_wrapper = ONNXModel(self.augmented_model) if activation else self.model_wrapper
map_output = model_wrapper.output_name_to_node
map_input = model_wrapper.input_name_to_nodes
model_output_names = [t.name for t in self.model.graph.output]
model_input_names = [t.name for t in self.model.graph.input]
model_initializer_names = [t.name for t in self.model.graph.initializer]
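To tie the calibration.py refactor together, here is a hedged usage sketch of the split calibration entry points, using only the method names and return shapes visible in the diff above; the `augment` object stands for an ONNXRTAugment instance built as in the tests below, and this is illustrative driver code rather than the library's own flow:

# Assumed setup, mirroring the tests:
#   augment = ONNXRTAugment(ONNXModel(model), data_reader, [], white_nodes=["conv"])

# Activation ranges need the augmented graph, so augment first:
augment.augment_graph()  # adds candidate activation tensors to the model outputs
act_ranges = augment.get_activation_tensors_calib_range()  # {tensor_name: [[min, max]], ...}

# Weight ranges are computed directly from the initializers with the minmax
# calibrator; no inference session is needed:
wt_ranges = augment.get_weight_tensors_calib_range()

# get_intermediate_outputs() dispatches between the two; with both flags left
# False it merges activation and weight ranges into a single dict:
names, ranges = augment.get_intermediate_outputs(activation_only=False, weight_only=False)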
test/adaptor/onnxrt_adaptor/test_onnxrt_augment.py: 26 changes (16 additions, 10 deletions)
@@ -330,7 +330,7 @@ def test_augment_graph(self):
attn_output_scale = generate_input_initializer([1], np.float32, "attn_output_scale")
Q_zo = helper.make_tensor_value_info("attn_output_zero_point", TensorProto.INT8, [1])
attn_output_zero_point = generate_input_initializer([1], np.int8, "attn_output_zero_point")
Output = helper.make_tensor_value_info("output", TensorProto.INT8, [13, 7])
Output = helper.make_tensor_value_info("attn_output_quantized", TensorProto.INT8, [13, 7])
attention_node = onnx.helper.make_node(
"QAttention",
[
@@ -386,15 +386,15 @@ def test_augment_graph(self):
augment.augment_nodes = ["DequantizeLinear"]
augment.already_quantized = True

augment.augment_graph(activation_only=True, weight_only=False)
augment.augment_graph()
augmented_model = augment.augmented_model

augmented_model_node_names = [node.name for node in augmented_model.graph.node]
augmented_model_outputs = [output.name for output in augmented_model.graph.output]
added_node_names = ["attention_quant", "attn_output_QuantizeLinear"]
added_outputs = ["input_quantized_output", "output"]
added_node_names = ["attention_quant", "attn_output_QuantizeLinear", "input_quantized_DequantizeLinear"]
added_outputs = ["attn_output_quantized", "input_quantized_output", "attn_output"]
self.assertEqual(len(augmented_model_node_names), 3)
self.assertEqual(len(augmented_model_outputs), 2)
self.assertEqual(len(augmented_model_outputs), 3)
for name in added_node_names:
self.assertTrue(name in augmented_model_node_names)
for output in added_outputs:
@@ -470,15 +470,21 @@ def test_augment_graph(self):
augment = ONNXRTAugment(ONNXModel(model), data_reader, [], white_nodes=["conv"])
augment.augment_nodes = ["DequantizeLinear"]
augment.already_quantized = True
augment.augment_graph(activation_only=True, weight_only=False)
augment.augment_graph()
augmented_model = augment.augmented_model

augmented_model_node_names = [node.name for node in augmented_model.graph.node]
augmented_model_outputs = [output.name for output in augmented_model.graph.output]
added_node_names = ["A_QuantizeLinear", "conv_quant", "D_DequantizeLinear", "A_quantized_DequantizeLinear"]
added_outputs = ["D", "A_quantized_output"]
self.assertEqual(len(augmented_model_node_names), 4)
self.assertEqual(len(augmented_model_outputs), 2)
added_node_names = [
"A_QuantizeLinear",
"conv_quant",
"D_DequantizeLinear",
"D_quantized_DequantizeLinear",
"A_quantized_DequantizeLinear",
]
added_outputs = ["D", "D_quantized_output", "A_quantized_output"]
self.assertEqual(len(augmented_model_node_names), 5)
self.assertEqual(len(augmented_model_outputs), 3)
for name in added_node_names:
self.assertTrue(name in augmented_model_node_names)
for output in added_outputs:
