
Commit 264a07f
Remove duplicates between all_columns and extra_columns, add unit tests.
ilectra committed Apr 11, 2024
1 parent 8906026 commit 264a07f
Showing 2 changed files with 98 additions and 16 deletions.
7 changes: 7 additions & 0 deletions post-processing/config_handler.py
@@ -154,6 +154,13 @@ def parse_columns(self):
            dict.fromkeys((self.plot_columns + self.filter_columns +
                           ([self.scaling_column.get("name")] if self.scaling_column else []))))

+       # remove duplicated columns from the extra_columns list
+       duplicates = set(self.all_columns) & set(self.extra_columns)
+       while len(duplicates) != 0:
+           for d in duplicates:
+               self.extra_columns.remove(d)
+           duplicates = set(self.all_columns) & set(self.extra_columns)
+
    def remove_redundant_types(self):
        """
        Check for columns that are no longer in use and remove them from the type dict.
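Why the removal loops instead of making a single pass: list.remove() deletes only the first matching element, so repeated entries in extra_columns (the case exercised by the new duplicated-columns test below) need repeated sweeps until the intersection is empty. A minimal standalone sketch of the same logic, with illustrative column names:

    # illustrative names, not taken from the repository's test data
    all_columns = ["tasks", "flops_value", "flops_unit"]
    extra_columns = ["spack_spec", "tasks", "tasks"]  # "tasks" requested twice

    duplicates = set(all_columns) & set(extra_columns)
    while len(duplicates) != 0:
        for d in duplicates:
            extra_columns.remove(d)  # removes the first occurrence only
        duplicates = set(all_columns) & set(extra_columns)

    print(extra_columns)  # ['spack_spec'] -- both "tasks" entries are gone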
107 changes: 91 additions & 16 deletions post-processing/test_post_processing.py
@@ -236,7 +236,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"fake_column": "int",
                                  "flops_value": "float",
-                                 "flops_unit": "str"}}))
+                                 "flops_unit": "str"},
+                "additional_columns_to_csv": []}))
    except KeyError as e:
        assert e.args[1] == ["fake_column"]
    else:
@@ -256,7 +257,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"tasks": "int",
                                  "flops_value": "float",
-                                 "flops_unit": "str"}}))
+                                 "flops_unit": "str"},
+                "additional_columns_to_csv": []}))
    except KeyError as e:
        assert e.args[1] == "!!"
    else:
@@ -276,7 +278,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"tasks": "int",
                                  "flops_value": "float",
-                                 "flops_unit": "str"}}))
+                                 "flops_unit": "str"},
+                "additional_columns_to_csv": []}))
    except ValueError:
        assert True
    else:
@@ -296,7 +299,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"tasks": "int",
                                  "flops_value": "float",
-                                 "flops_unit": "str"}}))
+                                 "flops_unit": "str"},
+                "additional_columns_to_csv": []}))
    except pd.errors.EmptyDataError:
        assert True
    else:
@@ -315,7 +319,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"tasks": "int",
                                  "flops_value": "float",
-                                 "flops_unit": "str"}}))
+                                 "flops_unit": "str"},
+                "additional_columns_to_csv": []}))
    except RuntimeError:
        assert True
    else:
@@ -334,7 +339,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"tasks": "int",
                                  "cpus_per_task": "int",
-                                 "extra_param": "int"}}))
+                                 "extra_param": "int"},
+                "additional_columns_to_csv": []}))
    except RuntimeError as e:
        # three param columns found in changed log
        EXPECTED_FIELDS = ["tasks", "cpus_per_task", "extra_param"]
@@ -356,7 +362,8 @@ def test_high_level_script(run_sombrero):
             "series": [],
             "column_types": {"job_completion_time": "datetime",
                              "flops_value": "float",
-                             "flops_unit": "str"}}))
+                             "flops_unit": "str"},
+            "additional_columns_to_csv": []}))
    # check returned subset is as expected
    assert len(df) == 2

@@ -374,7 +381,8 @@ def test_high_level_script(run_sombrero):
             "column_types": {"tasks": "int",
                              "cpus_per_task": "int",
                              "flops_value": "float",
-                             "flops_unit": "str"}}))
+                             "flops_unit": "str"},
+            "additional_columns_to_csv": []}))
    # check returned subset is as expected
    assert len(df) == 4

@@ -394,7 +402,8 @@ def test_high_level_script(run_sombrero):
                              "flops_value": "float",
                              "flops_unit": "str",
                              "cpus_per_task": "int",
-                             "OMP_NUM_THREADS": "int"}}))
+                             "OMP_NUM_THREADS": "int"},
+            "additional_columns_to_csv": []}))
    # check flops values are halved compared to previous df
    assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all()

@@ -413,7 +422,8 @@ def test_high_level_script(run_sombrero):
             "column_types": {"tasks": "int",
                              "flops_value": "float",
                              "flops_unit": "str",
-                             "cpus_per_task": "int"}}))
+                             "cpus_per_task": "int"},
+            "additional_columns_to_csv": []}))
    assert (dfs[dfs["cpus_per_task"] == 1]["flops_value"].values ==
            df[df["cpus_per_task"] == 1]["flops_value"].values /
            df[df["cpus_per_task"] == 1]["flops_value"].values).all()
@@ -437,7 +447,8 @@ def test_high_level_script(run_sombrero):
             "column_types": {"tasks": "int",
                              "flops_value": "float",
                              "flops_unit": "str",
-                             "cpus_per_task": "int"}}))
+                             "cpus_per_task": "int"},
+            "additional_columns_to_csv": []}))
    assert (dfs["flops_value"].values == df["flops_value"].values /
            df[(df["cpus_per_task"] == 1) & (df["tasks"] == 2)]["flops_value"].iloc[0]).all()

@@ -456,7 +467,8 @@ def test_high_level_script(run_sombrero):
             "column_types": {"tasks": "int",
                              "flops_value": "float",
                              "flops_unit": "str",
-                             "cpus_per_task": "int"}}))
+                             "cpus_per_task": "int"},
+            "additional_columns_to_csv": []}))
    # check flops values are halved compared to previous df
    assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all()

@@ -476,7 +488,8 @@ def test_high_level_script(run_sombrero):
                                  "flops_value": "float",
                                  "flops_unit": "str",
                                  "cpus_per_task": "int",
-                                 "OMP_NUM_THREADS": "str"}}))
+                                 "OMP_NUM_THREADS": "str"},
+                "additional_columns_to_csv": []}))
    except TypeError:
        assert True

@@ -496,7 +509,8 @@ def test_high_level_script(run_sombrero):
                 "column_types": {"tasks": "int",
                                  "flops_value": "float",
                                  "flops_unit": "str",
-                                 "cpus_per_task": "int"}}))
+                                 "cpus_per_task": "int"},
+                "additional_columns_to_csv": []}))
    except ValueError:
        assert True

@@ -514,7 +528,8 @@ def test_high_level_script(run_sombrero):
                 "series": [],
                 "column_types": {"tasks": "int",
                                  "flops_value": "float",
-                                 "flops_unit": "str"}}))
+                                 "flops_unit": "str"},
+                "additional_columns_to_csv": []}))
    except RuntimeError as e:
        # dataframe has records from both files
        assert len(e.args[1]) == 8
@@ -535,9 +550,69 @@ def test_high_level_script(run_sombrero):
             "column_types": {"tasks": "int",
                              "flops_value": "float",
                              "flops_unit": "str",
-                             "cpus_per_task": "int"}}))
+                             "cpus_per_task": "int"},
+            "additional_columns_to_csv": []}))

    EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
    # check returned subset is as expected
    assert df.columns.tolist() == EXPECTED_FIELDS
    assert len(df) == 1
+
+    # get filtered dataframe with extra columns for csv
+    df = PostProcessing(sombrero_log_path, save=True).run_post_processing(
+        ConfigHandler(
+            {"title": "Title",
+             "x_axis": {"value": "tasks",
+                        "units": {"custom": None}},
+             "y_axis": {"value": "flops_value",
+                        "units": {"column": "flops_unit"}},
+             "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]],
+                         "or": []},
+             "series": [],
+             "column_types": {"tasks": "int",
+                              "flops_value": "float",
+                              "flops_unit": "str",
+                              "cpus_per_task": "int"},
+             "additional_columns_to_csv": ["spack_spec"]}
+        ))
+
+    EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
+    # check returned subset is as expected
+    assert df.columns.tolist() == EXPECTED_FIELDS
+    assert len(df) == 1
+
+    EXPECTED_FIELDS.append("spack_spec")
+    # check subset written to csv is as expected
+    output_file = "output.csv"
+    df_saved = pd.read_csv(output_file, index_col=0)
+    assert df_saved.columns.tolist() == EXPECTED_FIELDS
+    assert len(df_saved) == 1
+
+    # get filtered dataframe with duplicated extra columns for csv
+    df = PostProcessing(sombrero_log_path, save=True).run_post_processing(
+        ConfigHandler(
+            {"title": "Title",
+             "x_axis": {"value": "tasks",
+                        "units": {"custom": None}},
+             "y_axis": {"value": "flops_value",
+                        "units": {"column": "flops_unit"}},
+             "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]],
+                         "or": []},
+             "series": [],
+             "column_types": {"tasks": "int",
+                              "flops_value": "float",
+                              "flops_unit": "str",
+                              "cpus_per_task": "int"},
+             "additional_columns_to_csv": ["tasks", "tasks"]}
+        ))
+
+    EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
+    # check returned subset is as expected
+    assert df.columns.tolist() == EXPECTED_FIELDS
+    assert len(df) == 1
+
+    # check subset written to csv is as expected
+    output_file = "output.csv"
+    df_saved = pd.read_csv(output_file, index_col=0)
+    assert df_saved.columns.tolist() == EXPECTED_FIELDS
+    assert len(df_saved) == 1
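Taken together, the new tests pin down the behaviour of additional_columns_to_csv: extra columns appear only in the saved CSV, never in the returned dataframe, and anything duplicated or already selected for plotting/filtering is dropped by parse_columns(). A usage sketch mirroring the calls above (the import paths and log path are assumptions, not confirmed by this commit):

    import pandas as pd

    from config_handler import ConfigHandler      # assumed import path
    from post_processing import PostProcessing    # assumed import path

    sombrero_log_path = "path/to/sombrero/perflogs"  # placeholder

    config = ConfigHandler(
        {"title": "Title",
         "x_axis": {"value": "tasks", "units": {"custom": None}},
         "y_axis": {"value": "flops_value", "units": {"column": "flops_unit"}},
         "filters": {"and": [], "or": []},
         "series": [],
         "column_types": {"tasks": "int",
                          "flops_value": "float",
                          "flops_unit": "str"},
         # "tasks" is already a plot column, so parse_columns() drops it here;
         # only "spack_spec" survives into the saved CSV
         "additional_columns_to_csv": ["spack_spec", "tasks"]})

    # save=True writes the filtered subset, plus surviving extra columns, to output.csv
    df = PostProcessing(sombrero_log_path, save=True).run_post_processing(config)
    print(pd.read_csv("output.csv", index_col=0).columns.tolist())
    # expected: ['tasks', 'flops_value', 'flops_unit', 'spack_spec']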
