Bug - Fix bugs in data diagnosis (#273)
**Description**

Fix bugs in data diagnosis.

**Major Revision**
- Fix the package import of `file_handler`.
- Handle monitor metrics, which carry no benchmark prefix (see the sketch below).
- Fix the `output_path` typo in `run()`.
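
To make the monitor-metrics fix concrete: `_get_metrics_by_benchmarks` groups metric names by the benchmark prefix before the first `/`, and monitor metrics such as `gpu_temperature` have no such prefix, so they are now skipped with a warning instead of corrupting the grouping. A minimal standalone sketch of this behavior (an approximation for illustration, not the committed code):

```python
import logging

logger = logging.getLogger(__name__)


def get_metrics_by_benchmarks(metrics_list):
    """Group metric names by the benchmark prefix before the first '/'."""
    benchmarks_metrics = {}
    for metric in metrics_list:
        if '/' not in metric:
            # Monitor metrics like 'gpu_temperature' have no benchmark prefix.
            logger.warning('%s does not have benchmark_name', metric)
        else:
            benchmark = metric.split('/')[0]
            benchmarks_metrics.setdefault(benchmark, set()).add(metric)
    return benchmarks_metrics


print(get_metrics_by_benchmarks(['gpu_temperature', 'gemm-flops/FP64']))
# -> {'gemm-flops': {'gemm-flops/FP64'}}
```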
yukirora authored and abuccts committed Dec 29, 2021
1 parent 91435b7 commit b214024
Showing 5 changed files with 64 additions and 20 deletions.
superbench/analyzer/data_diagnosis.py (19 additions, 12 deletions)
```diff
@@ -5,12 +5,13 @@
 import re
 from typing import Callable
+from pathlib import Path

 import pandas as pd

 from superbench.common.utils import logger
 from superbench.analyzer.diagnosis_rule_op import RuleOp, DiagnosisRuleType
-import superbench.analyzer.file_handler as file_handler
+from superbench.analyzer import file_handler


 class DataDiagnosis():
```
```diff
@@ -31,10 +32,15 @@ def _get_metrics_by_benchmarks(self, metrics_list):
         """
         benchmarks_metrics = {}
         for metric in metrics_list:
-            benchmark = metric.split('/')[0]
-            if benchmark not in benchmarks_metrics:
-                benchmarks_metrics[benchmark] = set()
-            benchmarks_metrics[benchmark].add(metric)
+            if '/' not in metric:
+                logger.warning(
+                    'DataDiagnosis: get_metrics_by_benchmarks - {} does not have benchmark_name'.format(metric)
+                )
+            else:
+                benchmark = metric.split('/')[0]
+                if benchmark not in benchmarks_metrics:
+                    benchmarks_metrics[benchmark] = set()
+                benchmarks_metrics[benchmark].add(metric)
         return benchmarks_metrics

     def _check_rules(self, rule, name):
```
```diff
@@ -133,6 +139,7 @@ def _get_criteria(self, rule_file, baseline_file):
                         if re.search(metric_regex, metric):
                             self._sb_rules[rule]['metrics'][metric] = self._get_baseline_of_metric(baseline, metric)
                             self._enable_metrics.append(metric)
+            self._enable_metrics.sort()
         except Exception as e:
             logger.error('DataDiagnosis: get criteria failed - {}'.format(str(e)))
             return False
```
```diff
@@ -171,8 +178,8 @@ def _run_diagnosis_rules_for_single_node(self, node):
                         issue_label = True
         if issue_label:
             # Add category information
-            general_cat_str = ','.join(categories)
-            details_cat_str = ','.join(details)
+            general_cat_str = ','.join(sorted(list(categories)))
+            details_cat_str = ','.join(sorted((details)))
             details_row = [general_cat_str, details_cat_str]
         return details_row, summary_data_row
```

```diff
@@ -236,15 +243,15 @@ def run(self, raw_data_file, rule_file, baseline_file, output_dir, output_format
         try:
             self._raw_data_df = file_handler.read_raw_data(raw_data_file)
             self._metrics = self._get_metrics_by_benchmarks(list(self._raw_data_df.columns))
-            logger.info('DataDiagnosis: Begin to processe {} nodes'.format(len(self._raw_data_df)))
+            logger.info('DataDiagnosis: Begin to process {} nodes'.format(len(self._raw_data_df)))
             data_not_accept_df, label_df = self.run_diagnosis_rules(rule_file, baseline_file)
             logger.info('DataDiagnosis: Processed finished')
-            outpout_path = ''
+            output_path = ''
             if output_format == 'excel':
-                output_path = output_dir + '/diagnosis_summary.xlsx'
-                file_handler.output_excel(self._raw_data_df, data_not_accept_df, outpout_path, self._sb_rules)
+                output_path = str(Path(output_dir) / 'diagnosis_summary.xlsx')
+                file_handler.output_excel(self._raw_data_df, data_not_accept_df, output_path, self._sb_rules)
             elif output_format == 'json':
-                output_path = output_dir + '/diagnosis_summary.jsonl'
+                output_path = str(Path(output_dir) / 'diagnosis_summary.jsonl')
                 file_handler.output_json_data_not_accept(data_not_accept_df, output_path)
             else:
                 logger.error('DataDiagnosis: output failed - unsupported output format')
```
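
As an aside on the `output_path` changes above: joining with `pathlib.Path` normalizes separators whether or not `output_dir` ends with a slash, which plain string concatenation does not. A small illustrative snippet (hypothetical paths, not part of the commit):

```python
from pathlib import Path

output_dir = '/tmp/results/'  # hypothetical directory with a trailing slash

# String concatenation can produce a doubled separator:
print(output_dir + '/diagnosis_summary.xlsx')
# -> /tmp/results//diagnosis_summary.xlsx

# pathlib joins cleanly regardless of the trailing slash:
print(str(Path(output_dir) / 'diagnosis_summary.xlsx'))
# -> /tmp/results/diagnosis_summary.xlsx
```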
File renamed without changes.
tests/analyzer/test_data_diagnosis.py (43 additions, 8 deletions)
```diff
@@ -18,9 +18,10 @@ class TestDataDiagnosis(unittest.TestCase):
     """Test for DataDiagnosis class."""
     def setUp(self):
         """Method called to prepare the test fixture."""
-        self.output_excel_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.xlsx'
-        self.test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml'
-        self.output_json_file = str(Path(__file__).parent.resolve()) + '/diagnosis_summary.jsonl'
+        self.parent_path = Path(__file__).parent
+        self.output_excel_file = str(self.parent_path / 'diagnosis_summary.xlsx')
+        self.test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
+        self.output_json_file = str(self.parent_path / 'diagnosis_summary.jsonl')

     def tearDown(self):
         """Method called after the test method has been called and the result recorded."""
```
```diff
@@ -33,21 +34,31 @@ def test_data_diagnosis(self):
         """Test for rule-based data diagnosis."""
         # Test - read_raw_data and get_metrics_from_raw_data
         # Positive case
-        test_raw_data = str(Path(__file__).parent.resolve()) + '/test_results.jsonl'
-        test_rule_file = str(Path(__file__).parent.resolve()) + '/test_rules.yaml'
-        test_baseline_file = str(Path(__file__).parent.resolve()) + '/test_baseline.json'
+        test_raw_data = str(self.parent_path / 'test_results.jsonl')
+        test_rule_file = str(self.parent_path / 'test_rules.yaml')
+        test_baseline_file = str(self.parent_path / 'test_baseline.json')
         diag1 = DataDiagnosis()
         diag1._raw_data_df = file_handler.read_raw_data(test_raw_data)
         diag1._metrics = diag1._get_metrics_by_benchmarks(list(diag1._raw_data_df))
         assert (len(diag1._raw_data_df) == 3)
         # Negative case
-        test_raw_data_fake = str(Path(__file__).parent.resolve()) + '/test_results_fake.jsonl'
-        test_rule_file_fake = str(Path(__file__).parent.resolve()) + '/test_rules_fake.yaml'
+        test_raw_data_fake = str(self.parent_path / 'test_results_fake.jsonl')
+        test_rule_file_fake = str(self.parent_path / 'test_rules_fake.yaml')
         diag2 = DataDiagnosis()
         diag2._raw_data_df = file_handler.read_raw_data(test_raw_data_fake)
         diag2._metrics = diag2._get_metrics_by_benchmarks(list(diag2._raw_data_df))
         assert (len(diag2._raw_data_df) == 0)
         assert (len(diag2._metrics) == 0)
+        metric_list = [
+            'gpu_temperature', 'gpu_power_limit', 'gemm-flops/FP64',
+            'bert_models/pytorch-bert-base/steptime_train_float32'
+        ]
+        self.assertDictEqual(
+            diag2._get_metrics_by_benchmarks(metric_list), {
+                'gemm-flops': {'gemm-flops/FP64'},
+                'bert_models': {'bert_models/pytorch-bert-base/steptime_train_float32'}
+            }
+        )
         # Test - read rules
         rules = file_handler.read_rules(test_rule_file_fake)
         assert (not rules)
```
```diff
@@ -176,3 +187,27 @@ def test_data_diagnosis(self):
             assert ('Category' in line)
             assert ('Defective Details' in line)
             assert ('Index' in line)
+
+    def test_data_diagnosis_run(self):
+        """Test for the run process of rule-based data diagnosis."""
+        test_raw_data = str(self.parent_path / 'test_results.jsonl')
+        test_rule_file = str(self.parent_path / 'test_rules.yaml')
+        test_baseline_file = str(self.parent_path / 'test_baseline.json')
+
+        # Test - output in excel
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'excel')
+        excel_file = pd.ExcelFile(self.output_excel_file, engine='openpyxl')
+        data_sheet_name = 'Not Accept'
+        data_not_accept_read_from_excel = excel_file.parse(data_sheet_name)
+        expect_result_file = pd.ExcelFile(str(self.parent_path / '../data/diagnosis_summary.xlsx'), engine='openpyxl')
+        expect_result = expect_result_file.parse(data_sheet_name)
+        pd.util.testing.assert_frame_equal(data_not_accept_read_from_excel, expect_result)
+        # Test - output in json
+        DataDiagnosis().run(test_raw_data, test_rule_file, test_baseline_file, str(self.parent_path), 'json')
+        assert (Path(self.output_json_file).is_file())
+        with Path(self.output_json_file).open() as f:
+            data_not_accept_read_from_json = f.read()
+        expect_result_file = self.parent_path / '../data/diagnosis_summary.jsonl'
+        with Path(expect_result_file).open() as f:
+            expect_result = f.read()
+        assert (data_not_accept_read_from_json == expect_result)
```
tests/data/diagnosis_summary.jsonl (2 additions, 0 deletions)
```diff
@@ -0,0 +1,2 @@
+{"Category": "KernelLaunch", "Defective Details": "kernel-launch/event_overhead:0(B/L: 0.0060 VAL: 0.1000 VAR: 1577.85% Rule:lambda x:x>0.05)", "kernel-launch/event_overhead:0": 15.7785234899, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": 0.0, "mem-bw/D2H_Mem_BW:1": 0.012345679, "mem-bw/D2H_Mem_BW:2": 0.0082304527, "mem-bw/D2H_Mem_BW:3": 0.012345679, "mem-bw/D2H_Mem_BW:4": 0.0, "mem-bw/D2H_Mem_BW:5": 0.0, "mem-bw/D2H_Mem_BW:6": -0.0164609053, "mem-bw/D2H_Mem_BW:7": 0.012345679, "mem-bw/H2D_Mem_BW:0": 0.0, "mem-bw/H2D_Mem_BW:1": 0.0078125, "mem-bw/H2D_Mem_BW:2": 0.015625, "mem-bw/H2D_Mem_BW:3": 0.01953125, "mem-bw/H2D_Mem_BW:4": 0.0234375, "mem-bw/H2D_Mem_BW:5": 0.0078125, "mem-bw/H2D_Mem_BW:6": -0.01171875, "mem-bw/H2D_Mem_BW:7": 0.01953125, "mem-bw/return_code": 0.0, "Index": "sb-validation-01"}
+{"Category": "FailedTest,Mem", "Defective Details": "mem-bw/D2H_Mem_BW:0_miss,mem-bw/D2H_Mem_BW:1_miss,mem-bw/D2H_Mem_BW:2_miss,mem-bw/D2H_Mem_BW:3_miss,mem-bw/D2H_Mem_BW:4_miss,mem-bw/D2H_Mem_BW:5_miss,mem-bw/D2H_Mem_BW:6_miss,mem-bw/D2H_Mem_BW:7_miss,mem-bw/H2D_Mem_BW:0_miss,mem-bw/H2D_Mem_BW:1_miss,mem-bw/H2D_Mem_BW:2_miss,mem-bw/H2D_Mem_BW:3_miss,mem-bw/H2D_Mem_BW:4_miss,mem-bw/H2D_Mem_BW:5_miss,mem-bw/H2D_Mem_BW:6_miss,mem-bw/H2D_Mem_BW:7_miss,mem-bw/return_code(VAL: 1.0000 Rule:lambda x:x>0)", "kernel-launch/event_overhead:0": 0.0, "kernel-launch/event_overhead:1": -0.0016778523, "kernel-launch/event_overhead:2": -0.0654362416, "kernel-launch/event_overhead:3": -0.0771812081, "kernel-launch/event_overhead:4": -0.0067114094, "kernel-launch/event_overhead:5": -0.0117449664, "kernel-launch/event_overhead:6": -0.0402684564, "kernel-launch/event_overhead:7": -0.0100671141, "kernel-launch/return_code": 0.0, "kernel-launch/wall_overhead:0": 0.0, "kernel-launch/wall_overhead:1": 0.0, "kernel-launch/wall_overhead:2": 0.0194931774, "kernel-launch/wall_overhead:3": 0.022417154, "kernel-launch/wall_overhead:4": 0.0360623782, "kernel-launch/wall_overhead:5": -0.0194931774, "kernel-launch/wall_overhead:6": 0.0185185185, "kernel-launch/wall_overhead:7": 0.0438596491, "mem-bw/D2H_Mem_BW:0": null, "mem-bw/D2H_Mem_BW:1": null, "mem-bw/D2H_Mem_BW:2": null, "mem-bw/D2H_Mem_BW:3": null, "mem-bw/D2H_Mem_BW:4": null, "mem-bw/D2H_Mem_BW:5": null, "mem-bw/D2H_Mem_BW:6": null, "mem-bw/D2H_Mem_BW:7": null, "mem-bw/H2D_Mem_BW:0": null, "mem-bw/H2D_Mem_BW:1": null, "mem-bw/H2D_Mem_BW:2": null, "mem-bw/H2D_Mem_BW:3": null, "mem-bw/H2D_Mem_BW:4": null, "mem-bw/H2D_Mem_BW:5": null, "mem-bw/H2D_Mem_BW:6": null, "mem-bw/H2D_Mem_BW:7": null, "mem-bw/return_code": 1.0, "Index": "sb-validation-03"}
```
Binary file added tests/data/diagnosis_summary.xlsx
