Bug - Fix bugs in data diagnosis (#273)
**Description**

Fix bugs in data diagnosis.

**Major Revision**
- Fix the package import of `file_handler`.
- Handle monitor metrics, which carry no benchmark prefix (see the sketch below).
- Fix the `output_path` typo in `run()`.
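
To make the monitor-metrics fix concrete: `_get_metrics_by_benchmarks` groups metric names by the benchmark prefix before the first `/`, and monitor metrics such as `gpu_temperature` have no such prefix, so they are now skipped with a warning instead of corrupting the grouping. A minimal standalone sketch of this behavior (an approximation for illustration, not the committed code):

```python
import logging

logger = logging.getLogger(__name__)


def get_metrics_by_benchmarks(metrics_list):
    """Group metric names by the benchmark prefix before the first '/'."""
    benchmarks_metrics = {}
    for metric in metrics_list:
        if '/' not in metric:
            # Monitor metrics like 'gpu_temperature' have no benchmark prefix.
            logger.warning('%s does not have benchmark_name', metric)
        else:
            benchmark = metric.split('/')[0]
            benchmarks_metrics.setdefault(benchmark, set()).add(metric)
    return benchmarks_metrics


print(get_metrics_by_benchmarks(['gpu_temperature', 'gemm-flops/FP64']))
# -> {'gemm-flops': {'gemm-flops/FP64'}}
```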
yukirora authored and abuccts committed Dec 29, 2021
1 parent 91435b7 commit b214024
Showing 5 changed files with 64 additions and 20 deletions.
superbench/analyzer/data_diagnosis.py (19 additions, 12 deletions)
```diff
@@ -5,12 +5,13 @@
 import re
 from typing import Callable
+from pathlib import Path

 import pandas as pd

 from superbench.common.utils import logger
 from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
-import superbench.analyzer.file_handler as file_handler
+from superbench.analyzer import file_handler


 class DataDiagnosis():
```
```diff
@@ -31,10 +32,15 @@ def _get_metrics_by_benchmarks(self, metrics_list):
         """
         benchmarks_metrics = {}
         for metric in metrics_list:
-            benchmark = metric.split('/')[0]
-            if benchmark not in benchmarks_metrics:
-                benchmarks_metrics[benchmark] = set()
-            benchmarks_metrics[benchmark].add(metric)
+            if '/' not in metric:
+                logger.warning(
+                    'DataDiagnosis: get_metrics_by_benchmarks - {} does not have benchmark_name'.format(metric)
+                )
+            else:
+                benchmark = metric.split('/')[0]
+                if benchmark not in benchmarks_metrics:
+                    benchmarks_metrics[benchmark] = set()
+                benchmarks_metrics[benchmark].add(metric)
         return benchmarks_metrics

     def _check_rules(self, rule, name):
```
```diff
@@ -133,6 +139,7 @@ def _get_criteria(self, rule_file, baseline_file):
                         if re.search(metric_regex, metric):
                             self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric)
                             self._enable_metrics.append(metric)
+            self._enable_metrics.sort()
         except Exception as e:
             logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e)))
             return False
```
```diff
@@ -171,8 +178,8 @@ def _run_diagnosis_rules_for_single_node(self, node):
                         issue_label = True
         if issue_label:
             # Add category information
-            general_cat_str = ','.join(categories)
-            details_cat_str = ','.join(details)
+            general_cat_str = ','.join(sorted(list(categories)))
+            details_cat_str = ','.join(sorted((details)))
             details_row = [general_cat_str, details_cat_str]
         return details_row, summary_data_row
```

```diff
@@ -236,15 +243,15 @@ def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format
         try:
             self._raw_data_df = file_handler.read_raw_data(raw_data_file)
             self._metrics = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
-            logger.info('DataDiagnosis: Begin to processe {} nodes'.format(len(self._raw_data_df)))
+            logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
             data_not_accept_df, label_df = self.run_diagnosis_rules(rule_file, baseline_file)
             logger.info('DataDiagnosis: Processed finished')
-            outpout_path = ''
+            output_path = ''
             if output_format == 'excel':
-                output_path = output_dir + '/diagnosis_summary.xlsx'
-                file_handler.output_excel(self._raw_data_df, data_not_accept_df, outpout_path, self._sb_rules)
+                output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
+                file_handler.output_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
             elif output_format == 'json':
-                output_path = output_dir + '/diagnosis_summary.jsonl'
+                output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
                 file_handler.output_json_data_not_accept(data_not_accept_df, output_path)
             else:
                 logger.error('DataDiagnosis: output failed - unsupported output format')
```
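
As an aside on the `output_path` changes above: joining with `pathlib.Path` normalizes separators whether or not `output_dir` ends with a slash, which plain string concatenation does not. A small illustrative snippet (hypothetical paths, not part of the commit):

```python
from pathlib import Path

output_dir = '/tmp/results/'  # hypothetical directory with a trailing slash

# String concatenation can produce a doubled separator:
print(output_dir + '/diagnosis_summary.xlsx')
# -> /tmp/results//diagnosis_summary.xlsx

# pathlib joins cleanly regardless of the trailing slash:
print(str(Path(output_dir) / 'diagnosis_summary.xlsx'))
# -> /tmp/results/diagnosis_summary.xlsx
```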
File renamed without changes.
tests/analyzer/test_data_diagnosis.py (43 additions, 8 deletions)
```diff
@@ -18,9 +18,10 @@ class TestDataDiagnosis(unittest.TestCase):
     """Test for DataDiagnosis class."""
     def setUp(self):
         """Method called to prepare the test fixture."""
-        self.output_excel_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.xlsx'
-        self.test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml'
-        self.output_json_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.jsonl'
+        self.parent_path = Path(__file__).parent
+        self.output_excel_file = str(self.parent_path / 'diagnosis_summary.xlsx')
+        self.test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
+        self.output_json_file = str(self.parent_path / 'diagnosis_summary.jsonl')

     def tearDown(self):
         """Method called after the test method has been called and the result recorded."""
```
```diff
@@ -33,21 +34,31 @@ def test_data_diagnosis(self):
         """Test for rule-based data diagnosis."""
         # Test - read_raw_data and get_metrics_from_raw_data
         # Positive case
-        test_raw_data = str(Path(__file__).parent.resolve()) + '/test_results.jsonl'
-        test_rule_file = str(Path(__file__).parent.resolve()) + '/test_rules.yaml'
-        test_baseline_file = str(Path(__file__).parent.resolve()) + '/test_baseline.json'
+        test_raw_data = str(self.parent_path / 'test_results.jsonl')
+        test_rule_file = str(self.parent_path / 'test_rules.yaml')
+        test_baseline_file = str(self.parent_path / 'test_baseline.json')
         diag1 = DataDiagnosis()
         diag1._raw_data_df = file_handler.read_raw_data(test_raw_data)
         diag1._metrics = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df))
         assert (len(diag1._raw_data_df) == 3)
         # Negative case
-        test_raw_data_fake = str(Path(__file__).parent.resolve()) + '/test_results_fake.jsonl'
-        test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml'
+        test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
+        test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
         diag2 = DataDiagnosis()
         diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
         diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
         assert (len(diag2._raw_data_df) == 0)
         assert (len(diag2._metrics) == 0)
+        metric_list = [
+            'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
+            'bert_models/pytorch-bert-base/steptime_train_float32'
+        ]
+        self.assertDictEqual(
+            diag2._get_metrics_by_benchmarks(metric_list), {
+                'gemm-flops': {'gemm-flops/FP64'},
+                'bert_models': {'bert_models/pytorch-bert-base/steptime_train_float32'}
+            }
+        )
         # Test - read rules
         rules = file_handler.read_rules(test_rule_file_fake)
         assert (not rules)
```
```diff
@@ -176,3 +187,27 @@ def test_data_diagnosis(self):
             assert ('Category' in line)
             assert ('Defective Details' in line)
             assert ('Index' in line)
+
+    def test_data_diagnosis_run(self):
+        """Test for the run process of rule-based data diagnosis."""
+        test_raw_data = str(self.parent_path / 'test_results.jsonl')
+        test_rule_file = str(self.parent_path / 'test_rules.yaml')
+        test_baseline_file = str(self.parent_path / 'test_baseline.json')
+
+        # Test - output in excel
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'excel')
+        excel_file = pd.ExcelFile(self.output_excel_file, engine='openpyxl')
+        data_sheet_name = 'Not Accept'
+        data_not_accept_read_from_excel = excel_file.parse(data_sheet_name)
+        expect_result_file = pd.ExcelFile(str(self.parent_path / '../data/diagnosis_summary.xlsx'), engine='openpyxl')
+        expect_result = expect_result_file.parse(data_sheet_name)
+        pd.util.testing.assert_frame_equal(data_not_accept_read_from_excel, expect_result)
+        # Test - output in json
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json')
+        assert (Path(self.output_json_file).is_file())
+        with Path(self.output_json_file).open() as f:
+            data_not_accept_read_from_json = f.read()
+        expect_result_file = self.parent_path / '../data/diagnosis_summary.jsonl'
+        with Path(expect_result_file).open() as f:
+            expect_result = f.read()
+        assert (data_not_accept_read_from_json == expect_result)
```
tests/data/diagnosis_summary.jsonl (2 additions, 0 deletions)
```diff
@@ -0,0 +1,2 @@
+{"Category": "KernelLaunch", "Defective Details": "kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda x:x>0.05)", "kernel-launch/event_overhead:0": 15.7785234899, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": 0.0, "mem-bw/D2H_Mem_BW:1": 0.012345679, "mem-bw/D2H_Mem_BW:2": 0.0082304527, "mem-bw/D2H_Mem_BW:3": 0.012345679, "mem-bw/D2H_Mem_BW:4": 0.0, "mem-bw/D2H_Mem_BW:5": 0.0, "mem-bw/D2H_Mem_BW:6": -0.0164609053, "mem-bw/D2H_Mem_BW:7": 0.012345679, "mem-bw/H2D_Mem_BW:0": 0.0, "mem-bw/H2D_Mem_BW:1": 0.0078125, "mem-bw/H2D_Mem_BW:2": 0.015625, "mem-bw/H2D_Mem_BW:3": 0.01953125, "mem-bw/H2D_Mem_BW:4": 0.0234375, "mem-bw/H2D_Mem_BW:5": 0.0078125, "mem-bw/H2D_Mem_BW:6": -0.01171875, "mem-bw/H2D_Mem_BW:7": 0.01953125, "mem-bw/return_code": 0.0, "Index": "sb-validation-01"}
+{"Category": "FailedTest,Mem", "Defective Details": "mem-bw/D2H_Mem_BW:0_miss,mem-bw/D2H_Mem_BW:1_miss,mem-bw/D2H_Mem_BW:2_miss,mem-bw/D2H_Mem_BW:3_miss,mem-bw/D2H_Mem_BW:4_miss,mem-bw/D2H_Mem_BW:5_miss,mem-bw/D2H_Mem_BW:6_miss,mem-bw/D2H_Mem_BW:7_miss,mem-bw/H2D_Mem_BW:0_miss,mem-bw/H2D_Mem_BW:1_miss,mem-bw/H2D_Mem_BW:2_miss,mem-bw/H2D_Mem_BW:3_miss,mem-bw/H2D_Mem_BW:4_miss,mem-bw/H2D_Mem_BW:5_miss,mem-bw/H2D_Mem_BW:6_miss,mem-bw/H2D_Mem_BW:7_miss,mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)", "kernel-launch/event_overhead:0": 0.0, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": null, "mem-bw/D2H_Mem_BW:1": null, "mem-bw/D2H_Mem_BW:2": null, "mem-bw/D2H_Mem_BW:3": null, "mem-bw/D2H_Mem_BW:4": null, "mem-bw/D2H_Mem_BW:5": null, "mem-bw/D2H_Mem_BW:6": null, "mem-bw/D2H_Mem_BW:7": null, "mem-bw/H2D_Mem_BW:0": null, "mem-bw/H2D_Mem_BW:1": null, "mem-bw/H2D_Mem_BW:2": null, "mem-bw/H2D_Mem_BW:3": null, "mem-bw/H2D_Mem_BW:4": null, "mem-bw/H2D_Mem_BW:5": null, "mem-bw/H2D_Mem_BW:6": null, "mem-bw/H2D_Mem_BW:7": null, "mem-bw/return_code": 1.0, "Index": "sb-validation-03"}
```
Binary file added tests/data/diagnosis_summary.xlsx
