diff --git a/post-processing/README.md b/post-processing/README.md
index 2258dbdc..b081f10e 100644
--- a/post-processing/README.md
+++ b/post-processing/README.md
@@ -39,7 +39,7 @@ python post_processing.py log_path config_path [-p plot_type]
 - `config_path` - Path to a configuration file containing plot details.
 - `plot_type` - (Optional.) Type of plot to be generated. (`Note: only a generic bar chart is currently implemented.`)
 
-Run `post_processing.py -h` for more information (including debugging flags).
+Run `post_processing.py -h` for more information (including debugging and file output flags).
 
 #### Streamlit
 
@@ -68,12 +68,13 @@ Before running post-processing, create a config file including all necessary information
   - `Format: [column_name, value]`
 - `column_types` - Pandas dtype for each relevant column (axes, units, filters, series). Specified with a dictionary.
   - `Accepted types: "str"/"string"/"object", "int"/"int64", "float"/"float64", "datetime"/"datetime64"`
+- `additional_columns_to_csv` - (Optional.) List of additional columns to export to a csv file, in addition to the ones above. These columns are not used in plotting. (Specify an empty list if no additional columns are required.)
 
 #### A Note on Replaced ReFrame Columns
 
-A perflog contains certain columns that will not be present in the DataFrame available to the graphing script. Currently, these columns are `display_name`, `extra_resources`, and `env_vars`. Removed columns should not be referenced in a plot config file.
+A perflog contains certain columns with complex information that has to be unpacked in order to be useful. Currently, such columns are `display_name`, `extra_resources`, `env_vars`, and `spack_spec_dict`. These columns are parsed during post-processing, removed from the DataFrame, and replaced by new columns containing the unpacked information. They will therefore not be present in the DataFrame available to the graphing script and should not be referenced in a plot config file.
 
-When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources` and `env_vars` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents).
+When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources`, `env_vars`, and `spack_spec_dict` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents).
 
 #### Complete Config Template
 
@@ -121,6 +122,10 @@ series:
 # accepted types: string/object, int, float, datetime
 column_types:
   <column_name>: <column_type>
+
+# optional (default: no extra columns exported to a csv file in addition to the ones above)
+additional_columns_to_csv:
+  <list of column names>
 ```
 
 #### Example Config
 
@@ -162,6 +167,9 @@ column_types:
   filter_col_1: "datetime"
   filter_col_2: "int"
   series_col: "str"
+
+additional_columns_to_csv:
+  ["additional_col_1", "additional_col_2"]
 ```
 
 #### X-axis Grouping
diff --git a/post-processing/config_handler.py b/post-processing/config_handler.py
index 8f8760e3..e4b7b3d2 100644
--- a/post-processing/config_handler.py
+++ b/post-processing/config_handler.py
@@ -25,6 +25,7 @@ def __init__(self, config: dict, template=False):
         self.filters = config.get("filters")
         self.series = config.get("series")
         self.column_types = config.get("column_types")
+        self.extra_columns = config.get("additional_columns_to_csv")
 
         # parse filter information
         self.and_filters = []
@@ -153,6 +154,13 @@ def parse_columns(self):
             dict.fromkeys((self.plot_columns + self.filter_columns +
                            ([self.scaling_column.get("name")] if self.scaling_column else []))))
 
+        # remove duplicated columns from the extra_columns list
+        duplicates = set(self.all_columns) & set(self.extra_columns)
+        while len(duplicates) != 0:
+            for d in duplicates:
+                self.extra_columns.remove(d)
+            duplicates = set(self.all_columns) & set(self.extra_columns)
+
     def remove_redundant_types(self):
         """
             Check for columns that are no longer in use and remove them from the type dict.
diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py
index ba80c659..2a724d93 100644
--- a/post-processing/post_processing.py
+++ b/post-processing/post_processing.py
@@ -2,6 +2,7 @@
 import operator as op
 import traceback
 from functools import reduce
+import os
 from pathlib import Path
 
 import pandas as pd
@@ -12,7 +13,7 @@
 
 class PostProcessing:
 
-    def __init__(self, log_path: Path, debug=False, verbose=False):
+    def __init__(self, log_path: Path, debug=False, verbose=False, save=False, plotting=True):
         """
             Initialise class.
 
@@ -20,11 +21,15 @@ def __init__(self, log_path: Path, debug=False, verbose=False):
             log_path: Path, path to performance log file or directory.
             debug: bool, flag to print additional information to console.
             verbose: bool, flag to print more additional information to console.
+            save: bool, flag to save the filtered DataFrame to a csv file.
+            plotting: bool, flag to generate a plot and save it to an html file.
         """
 
         # FIXME (issue #264): add proper logging
         self.debug = debug
         self.verbose = verbose
+        self.save = save
+        self.plotting = plotting
         # find and read perflogs
         self.original_df = PerflogHandler(log_path, self.debug).get_df()
         # copy original data for modification during post-processing
@@ -58,16 +63,18 @@ def run_post_processing(self, config: ConfigHandler):
         # scale y-axis
         self.transform_df_data(
             config.x_axis["value"], config.y_axis["value"], *config.get_y_scaling(), config.series_filters)
-
-        # FIXME (#issue #255): have an option to put this into a file (-s / --save flag?)
         if self.debug:
             print("Selected dataframe:")
-            print(self.df[self.mask][config.plot_columns])
+            print(self.df[self.mask][config.plot_columns + config.extra_columns])
+        if self.save:
+            self.df[self.mask][config.plot_columns + config.extra_columns].to_csv(
+                path_or_buf=os.path.join(Path(__file__).parent, 'output.csv'), index=True)  # Set index=False to exclude the DataFrame index from the CSV
 
         # call a plotting script
-        self.plot = plot_generic(
-            config.title, self.df[self.mask][config.plot_columns],
-            config.x_axis, config.y_axis, config.series_filters, self.debug)
+        if self.plotting:
+            self.plot = plot_generic(
+                config.title, self.df[self.mask][config.plot_columns],
+                config.x_axis, config.y_axis, config.series_filters, self.debug)
 
         # FIXME (#issue #255): maybe save this bit to a file as well for easier viewing
         if self.debug & self.verbose:
@@ -396,6 +403,11 @@ def read_args():
     parser.add_argument("-v", "--verbose", action="store_true",
                         help="verbose flag for printing more debug information \
                         (must be used in conjunction with the debug flag)")
+    parser.add_argument("-s", "--save", action="store_true",
+                        help="save flag for saving the filtered DataFrame to a csv file")
+    parser.add_argument("-np", "--no_plot", action="store_true",
+                        help="no-plot flag to disable generating and saving a plot")
+
 
     return parser.parse_args()
 
@@ -405,7 +417,7 @@ def main():
 
     args = read_args()
     try:
-        post = PostProcessing(args.log_path, args.debug, args.verbose)
+        post = PostProcessing(args.log_path, args.debug, args.verbose, args.save, not args.no_plot)
         config = ConfigHandler.from_path(args.config_path)
         post.run_post_processing(config)
 
diff --git a/post-processing/post_processing_config.yaml b/post-processing/post_processing_config.yaml
index 2e4b3521..c421b1b4 100644
--- a/post-processing/post_processing_config.yaml
+++ b/post-processing/post_processing_config.yaml
@@ -47,3 +47,8 @@ column_types:
   flops_unit: "str"
   system: "str"
   cpus_per_task: "int"
+
+# Optional setting to specify additional columns to export to a csv file, in addition to
+# the ones in axes/series/filters
+additional_columns_to_csv:
+  ["spack_spec"]
\ No newline at end of file
diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py
index e75b5548..f183bc23 100644
--- a/post-processing/test_post_processing.py
+++ b/post-processing/test_post_processing.py
@@ -236,7 +236,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"fake_column": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
     except KeyError as e:
         assert e.args[1] == ["fake_column"]
     else:
@@ -256,7 +257,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
     except KeyError as e:
         assert e.args[1] == "!!"
     else:
@@ -276,7 +278,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
     except ValueError:
         assert True
     else:
@@ -296,7 +299,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
     except pd.errors.EmptyDataError:
         assert True
     else:
@@ -315,7 +319,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
     except RuntimeError:
         assert True
     else:
@@ -334,7 +339,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"tasks": "int",
                                   "cpus_per_task": "int",
-                                  "extra_param": "int"}}))
+                                  "extra_param": "int"},
+                 "additional_columns_to_csv": []}))
     except RuntimeError as e:
         # three param columns found in changed log
         EXPECTED_FIELDS = ["tasks", "cpus_per_task", "extra_param"]
@@ -356,7 +362,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"job_completion_time": "datetime",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
 
     # check returned subset is as expected
     assert len(df) == 2
@@ -374,7 +381,8 @@ def test_high_level_script(run_sombrero):
                  "column_types": {"tasks": "int",
                                   "cpus_per_task": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
 
     # check returned subset is as expected
     assert len(df) == 4
@@ -394,7 +402,8 @@ def test_high_level_script(run_sombrero):
                                   "flops_value": "float",
                                   "flops_unit": "str",
                                   "cpus_per_task": "int",
-                                  "OMP_NUM_THREADS": "int"}}))
+                                  "OMP_NUM_THREADS": "int"},
+                 "additional_columns_to_csv": []}))
     # check flops values are halved compared to previous df
     assert (dfs["flops_value"].values ==
             df[df["cpus_per_task"] == 2]["flops_value"].values/2).all()
@@ -413,7 +422,8 @@ def test_high_level_script(run_sombrero):
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
                                   "flops_unit": "str",
-                                  "cpus_per_task": "int"}}))
+                                  "cpus_per_task": "int"},
+                 "additional_columns_to_csv": []}))
     assert (dfs[dfs["cpus_per_task"] == 1]["flops_value"].values ==
             df[df["cpus_per_task"] == 1]["flops_value"].values /
             df[df["cpus_per_task"] == 1]["flops_value"].values).all()
@@ -437,7 +447,8 @@ def test_high_level_script(run_sombrero):
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
                                   "flops_unit": "str",
-                                  "cpus_per_task": "int"}}))
+                                  "cpus_per_task": "int"},
+                 "additional_columns_to_csv": []}))
     assert (dfs["flops_value"].values ==
             df["flops_value"].values /
             df[(df["cpus_per_task"] == 1) & (df["tasks"] == 2)]["flops_value"].iloc[0]).all()
@@ -456,7 +467,8 @@ def test_high_level_script(run_sombrero):
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
                                   "flops_unit": "str",
-                                  "cpus_per_task": "int"}}))
+                                  "cpus_per_task": "int"},
+                 "additional_columns_to_csv": []}))
     # check flops values are halved compared to previous df
     assert (dfs["flops_value"].values ==
             df[df["cpus_per_task"] == 2]["flops_value"].values/2).all()
@@ -476,7 +488,8 @@ def test_high_level_script(run_sombrero):
                                   "flops_value": "float",
                                   "flops_unit": "str",
                                   "cpus_per_task": "int",
-                                  "OMP_NUM_THREADS": "str"}}))
+                                  "OMP_NUM_THREADS": "str"},
+                 "additional_columns_to_csv": []}))
     except TypeError:
         assert True
 
@@ -496,7 +509,8 @@ def test_high_level_script(run_sombrero):
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
                                   "flops_unit": "str",
-                                  "cpus_per_task": "int"}}))
+                                  "cpus_per_task": "int"},
+                 "additional_columns_to_csv": []}))
     except ValueError:
         assert True
 
@@ -514,7 +528,8 @@ def test_high_level_script(run_sombrero):
                  "series": [],
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
-                                  "flops_unit": "str"}}))
+                                  "flops_unit": "str"},
+                 "additional_columns_to_csv": []}))
     except RuntimeError as e:
         # dataframe has records from both files
         assert len(e.args[1]) == 8
@@ -535,9 +550,69 @@ def test_high_level_script(run_sombrero):
                  "column_types": {"tasks": "int",
                                   "flops_value": "float",
                                   "flops_unit": "str",
-                                  "cpus_per_task": "int"}}))
+                                  "cpus_per_task": "int"},
+                 "additional_columns_to_csv": []}))
 
     EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
     # check returned subset is as expected
     assert df.columns.tolist() == EXPECTED_FIELDS
     assert len(df) == 1
+
+    # get filtered dataframe with extra columns for csv
+    df = PostProcessing(sombrero_log_path, save=True).run_post_processing(
+        ConfigHandler(
+            {"title": "Title",
+             "x_axis": {"value": "tasks",
+                        "units": {"custom": None}},
+             "y_axis": {"value": "flops_value",
+                        "units": {"column": "flops_unit"}},
+             "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]],
+                         "or": []},
+             "series": [],
+             "column_types": {"tasks": "int",
+                              "flops_value": "float",
+                              "flops_unit": "str",
+                              "cpus_per_task": "int"},
+             "additional_columns_to_csv": ["spack_spec"]}
+        ))
+
+    EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
+    # check returned subset is as expected
+    assert df.columns.tolist() == EXPECTED_FIELDS
+    assert len(df) == 1
+
+    EXPECTED_FIELDS.append("spack_spec")
+    # check subset written to csv is as expected
+    output_file = "output.csv"
+    df_saved = pd.read_csv(output_file, index_col=0)
+    assert df_saved.columns.tolist() == EXPECTED_FIELDS
+    assert len(df_saved) == 1
+
+    # get filtered dataframe with duplicated extra columns for csv
+    df = PostProcessing(sombrero_log_path, save=True).run_post_processing(
+        ConfigHandler(
+            {"title": "Title",
+             "x_axis": {"value": "tasks",
+                        "units": {"custom": None}},
+             "y_axis": {"value": "flops_value",
+                        "units": {"column": "flops_unit"}},
+             "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]],
+                         "or": []},
+             "series": [],
+             "column_types": {"tasks": "int",
+                              "flops_value": "float",
+                              "flops_unit": "str",
+                              "cpus_per_task": "int"},
+             "additional_columns_to_csv": ["tasks", "tasks"]}
+        ))
+
+    EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"]
+    # check returned subset is as expected
+    assert df.columns.tolist() == EXPECTED_FIELDS
+    assert len(df) == 1
+
+    # check subset written to csv is as expected
+    output_file = "output.csv"
+    df_saved = pd.read_csv(output_file, index_col=0)
+    assert df_saved.columns.tolist() == EXPECTED_FIELDS
+    assert len(df_saved) == 1
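
For a quick end-to-end check of the new options, the behaviour added in this diff can also be driven from Python rather than the command line. The sketch below is illustrative only and not part of the change; the perflog and config paths are placeholders, and it assumes it is run from the `post-processing` directory so that `post_processing` and `config_handler` are importable. The equivalent CLI invocation would be `python post_processing.py <log_path> <config_path> -s -np`.

```python
# Illustrative sketch (not part of the diff): drive the new save/plotting options from Python.
# The paths below are placeholders; point them at a real perflog and plot config file.
from pathlib import Path

from config_handler import ConfigHandler
from post_processing import PostProcessing

log_path = Path("perflogs/")            # perflog file or directory (placeholder)
config_path = Path("plot_config.yaml")  # plot config with additional_columns_to_csv (placeholder)

# save=True writes the filtered plot columns plus any additional_columns_to_csv
# to output.csv next to post_processing.py; plotting=False skips the html plot.
post = PostProcessing(log_path, save=True, plotting=False)
df = post.run_post_processing(ConfigHandler.from_path(config_path))
print(df.head())
```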