From 438be0257c39cf81bb1260e2a881559c2c10df3c Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Thu, 1 Feb 2024 13:33:46 +0000 Subject: [PATCH 1/8] Instead of new axis, pick the columns to export from dataframe --- post-processing/post_processing.py | 7 ++++++- post-processing/post_processing_config.yaml | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 96f46e03..faabde65 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -61,6 +61,7 @@ def run_post_processing(self, log_path, config): print("") df = pd.DataFrame() + df_csv_export = pd.DataFrame() # put all perflog information in one dataframe for file in log_files: try: @@ -73,7 +74,11 @@ def run_post_processing(self, log_path, config): print("") if df.empty: raise FileNotFoundError(errno.ENOENT, "Could not find a valid perflog in path", log_path) - + # specify columns to export from dataframe to csv + for col in config["csv_export"]: + df_csv_export = pd.concat([df_csv_export, df[col]], axis=1, join='outer') + if self.debug & self.verbose: + print(df_csv_export) # get axis columns columns = [config["x_axis"]["value"], config["y_axis"]["value"]] if config["x_axis"]["units"].get("column"): diff --git a/post-processing/post_processing_config.yaml b/post-processing/post_processing_config.yaml index 2e4b3521..5b0f5d4b 100644 --- a/post-processing/post_processing_config.yaml +++ b/post-processing/post_processing_config.yaml @@ -47,3 +47,10 @@ column_types: flops_unit: "str" system: "str" cpus_per_task: "int" + +# Specify which columns to export to csv file +csv_export: + [tasks, + flops_value, + Triad_unit, + cpus_per_task] \ No newline at end of file From a49a30c04206fd7439902a7dfa08da4da4206a17 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Thu, 1 Feb 2024 14:23:22 +0000 Subject: [PATCH 2/8] Handling missing csv_export error --- post-processing/post_processing.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index faabde65..09920a91 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -75,10 +75,14 @@ def run_post_processing(self, log_path, config): if df.empty: raise FileNotFoundError(errno.ENOENT, "Could not find a valid perflog in path", log_path) # specify columns to export from dataframe to csv - for col in config["csv_export"]: - df_csv_export = pd.concat([df_csv_export, df[col]], axis=1, join='outer') - if self.debug & self.verbose: - print(df_csv_export) + if config.get("csv_export") is None: + raise KeyError("Missing csv_export (specify an empty list [] if none are required).") + else: + for col in config["csv_export"]: + df_csv_export = pd.concat([df_csv_export, df[col]], axis=1, join='outer') + if self.debug: + print("Selected dataframe to export CSV file:") + print(df_csv_export) # get axis columns columns = [config["x_axis"]["value"], config["y_axis"]["value"]] if config["x_axis"]["units"].get("column"): From 8b2c2ee0d661440610167456f777196e7b76874f Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Thu, 1 Feb 2024 14:40:30 +0000 Subject: [PATCH 3/8] CSV export to the perflogs folder --- post-processing/post_processing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 09920a91..658c4021 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py 
@@ -80,6 +80,7 @@ def run_post_processing(self, log_path, config): else: for col in config["csv_export"]: df_csv_export = pd.concat([df_csv_export, df[col]], axis=1, join='outer') + df_csv_export.to_csv(log_path+'/output.csv', index=False) # Set index=False to exclude the DataFrame index from the CSV if self.debug: print("Selected dataframe to export CSV file:") print(df_csv_export) From 464e2baec23e5b8c0e5d2e9102c63d8a3c8aadaa Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Tue, 6 Feb 2024 12:07:15 +0000 Subject: [PATCH 4/8] New version adapted to refactoring --- post-processing/config_handler.py | 1 + post-processing/post_processing.py | 9 +++++++-- post-processing/post_processing_config.yaml | 6 ++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/post-processing/config_handler.py b/post-processing/config_handler.py index c0fdc2ae..14c31c20 100644 --- a/post-processing/config_handler.py +++ b/post-processing/config_handler.py @@ -14,6 +14,7 @@ def __init__(self, config: dict): self.filters = config.get("filters") self.series = config.get("series") self.column_types = config.get("column_types") + self.extra_columns = config.get("additional_columns_to_csv") # parse filter information self.and_filters = [] diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index e9f5cc1b..384500eb 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -23,12 +23,13 @@ def __init__(self, log_path: Path, debug=False, verbose=False): # dataframe filters self.mask = pd.Series(self.df.index.notnull()) - def run_post_processing(self, config: ConfigHandler): + def run_post_processing(self, log_path: Path, config: ConfigHandler): """ Return a dataframe containing the information passed to a plotting script and produce relevant graphs. Args: + log_path: str, path to a log file or a directory containing log files. config: ConfigHandler, class containing configuration information for plotting. """ @@ -50,6 +51,10 @@ def run_post_processing(self, config: ConfigHandler): # FIXME (#issue #255): have an option to put this into a file (-s / --save flag?) 
print("Selected dataframe:") print(self.df[self.mask][config.plot_columns]) + if self.debug: + print("CSV dataframe:") + print(self.df[self.mask][config.plot_columns + config.extra_columns]) + self.df[self.mask][config.plot_columns + config.extra_columns].to_csv(str(log_path)+'/output.csv', index=True) # Set index=False to exclude the DataFrame index from the CSV # call a plotting script plot_generic( @@ -380,7 +385,7 @@ def main(): try: post = PostProcessing(args.log_path, args.debug, args.verbose) config = ConfigHandler.from_path(args.config_path) - post.run_post_processing(config) + post.run_post_processing(args.log_path,config) except Exception as e: print(type(e).__name__ + ":", e) diff --git a/post-processing/post_processing_config.yaml b/post-processing/post_processing_config.yaml index 5b0f5d4b..478359a3 100644 --- a/post-processing/post_processing_config.yaml +++ b/post-processing/post_processing_config.yaml @@ -49,8 +49,6 @@ column_types: cpus_per_task: "int" # Specify which columns to export to csv file -csv_export: - [tasks, - flops_value, - Triad_unit, +additional_columns_to_csv: + [flops_value, cpus_per_task] \ No newline at end of file From 493854cf8f63b2157234f342df5df008356a7180 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Mon, 12 Feb 2024 21:58:23 +0000 Subject: [PATCH 5/8] Removing log path from the run_post_processing --- post-processing/post_processing.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 384500eb..0026721a 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -22,8 +22,9 @@ def __init__(self, log_path: Path, debug=False, verbose=False): # for re-running post-processing with front-end # dataframe filters self.mask = pd.Series(self.df.index.notnull()) + self.log_path = log_path - def run_post_processing(self, log_path: Path, config: ConfigHandler): + def run_post_processing(self, config: ConfigHandler): """ Return a dataframe containing the information passed to a plotting script and produce relevant graphs. 
@@ -54,7 +55,7 @@ def run_post_processing(self, log_path: Path, config: ConfigHandler): if self.debug: print("CSV dataframe:") print(self.df[self.mask][config.plot_columns + config.extra_columns]) - self.df[self.mask][config.plot_columns + config.extra_columns].to_csv(str(log_path)+'/output.csv', index=True) # Set index=False to exclude the DataFrame index from the CSV + self.df[self.mask][config.plot_columns + config.extra_columns].to_csv(str(self.log_path)+'/output.csv', index=True) # Set index=False to exclude the DataFrame index from the CSV # call a plotting script plot_generic( @@ -385,7 +386,7 @@ def main(): try: post = PostProcessing(args.log_path, args.debug, args.verbose) config = ConfigHandler.from_path(args.config_path) - post.run_post_processing(args.log_path,config) + post.run_post_processing(config) except Exception as e: print(type(e).__name__ + ":", e) From 4c497968db89d4fa6640f9e54b86a2a82b2dbfa9 Mon Sep 17 00:00:00 2001 From: Kaan Olgu Date: Mon, 12 Feb 2024 22:02:41 +0000 Subject: [PATCH 6/8] Removing the log_path from the arguments of run_post_processing --- post-processing/post_processing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 0026721a..0bd3e436 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -30,7 +30,6 @@ def run_post_processing(self, config: ConfigHandler): and produce relevant graphs. Args: - log_path: str, path to a log file or a directory containing log files. config: ConfigHandler, class containing configuration information for plotting. """ From 89060260a8466971eff8d2f7d4d660f87970e930 Mon Sep 17 00:00:00 2001 From: Ilektra Christidi Date: Thu, 11 Apr 2024 17:48:09 +0100 Subject: [PATCH 7/8] Add flags for plotting and csv file saving Save output csv in current folder, independent of perflog path Update docs and example yaml --- post-processing/README.md | 14 +++++++--- post-processing/post_processing.py | 29 ++++++++++++++------- post-processing/post_processing_config.yaml | 6 ++--- 3 files changed, 33 insertions(+), 16 deletions(-) diff --git a/post-processing/README.md b/post-processing/README.md index 2258dbdc..b081f10e 100644 --- a/post-processing/README.md +++ b/post-processing/README.md @@ -39,7 +39,7 @@ python post_processing.py log_path config_path [-p plot_type] - `config_path` - Path to a configuration file containing plot details. - `plot_type` - (Optional.) Type of plot to be generated. (`Note: only a generic bar chart is currently implemented.`) -Run `post_processing.py -h` for more information (including debugging flags). +Run `post_processing.py -h` for more information (including debugging and file output flags). #### Streamlit @@ -68,12 +68,13 @@ Before running post-processing, create a config file including all necessary inf - `Format: [column_name, value]` - `column_types` - Pandas dtype for each relevant column (axes, units, filters, series). Specified with a dictionary. - `Accepted types: "str"/"string"/"object", "int"/"int64", "float"/"float64", "datetime"/"datetime64"` +- `additional_columns_to_csv` - (Optional.) List of additional columns to export to csv file, in addition to the ones above. Those columns are not used in plotting. (Specify an empty list if no additional columns are required.) #### A Note on Replaced ReFrame Columns -A perflog contains certain columns that will not be present in the DataFrame available to the graphing script. 
Currently, these columns are `display_name`, `extra_resources`, and `env_vars`. Removed columns should not be referenced in a plot config file. +A perflog contains certain columns with complex information that has to be unpacked in order to be useful. Currently, such columns are `display_name`, `extra_resources`, `env_vars`, and `spack_spec_dict`. Those columns are parsed by the postprocessing, removed from the DataFrame, and substituted by new columns with the unpacked information. Therefore they will not be present in the DataFrame available to the graphing script and should not be referenced in a plot config file. -When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources` and `env_vars` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents). +When the row contents of `display_name` are parsed, they are separated into their constituent benchmark names and parameters. This column is replaced with a new `test_name` column and new parameter columns (if present). Similarly, the `extra_resources`, `env_vars`, and `spack_spec_dict` columns are replaced with their respective dictionary row contents (keys become columns, values become row contents). #### Complete Config Template @@ -121,6 +122,10 @@ series: # accepted types: string/object, int, float, datetime column_types: : + +# optional (default: no extra columns exported to a csv file in addition to the ones above) +additional_columns_to_csv: + ``` #### Example Config @@ -162,6 +167,9 @@ column_types: filter_col_1: "datetime" filter_col_2: "int" series_col: "str" + +additional_columns_to_csv: + ["additional_col_1", "additional_col_2"] ``` #### X-axis Grouping diff --git a/post-processing/post_processing.py b/post-processing/post_processing.py index 2923aa7a..2a724d93 100644 --- a/post-processing/post_processing.py +++ b/post-processing/post_processing.py @@ -2,6 +2,7 @@ import operator as op import traceback from functools import reduce +import os from pathlib import Path import pandas as pd @@ -12,7 +13,7 @@ class PostProcessing: - def __init__(self, log_path: Path, debug=False, verbose=False): + def __init__(self, log_path: Path, debug=False, verbose=False, save=False, plotting=True): """ Initialise class. @@ -20,11 +21,15 @@ def __init__(self, log_path: Path, debug=False, verbose=False): log_path: Path, path to performance log file or directory. debug: bool, flag to print additional information to console. verbose: bool, flag to print more additional information to console. + save: bool, flag to save the filtered dataframe in csv file + plotting: bool, flag to generate and store a plot in html file """ # FIXME (issue #264): add proper logging self.debug = debug self.verbose = verbose + self.save = save + self.plotting = plotting # find and read perflogs self.original_df = PerflogHandler(log_path, self.debug).get_df() # copy original data for modification during post-processing @@ -58,19 +63,18 @@ def run_post_processing(self, config: ConfigHandler): # scale y-axis self.transform_df_data( config.x_axis["value"], config.y_axis["value"], *config.get_y_scaling(), config.series_filters) - - # FIXME (#issue #255): have an option to put this into a file (-s / --save flag?) 
if self.debug:
            print("Selected dataframe:")
            print(self.df[self.mask][config.plot_columns + config.extra_columns])
+        if self.save:
+            self.df[self.mask][config.plot_columns + config.extra_columns].to_csv(
+                path_or_buf=os.path.join(Path(__file__).parent, 'output.csv'), index=True)  # index=True keeps the dataframe index as the first csv column (read back with index_col=0)

         # call a plotting script
+        if self.plotting:
+            self.plot = plot_generic(
+                config.title, self.df[self.mask][config.plot_columns],
+                config.x_axis, config.y_axis, config.series_filters, self.debug)

         # FIXME (#issue #255): maybe save this bit to a file as well for easier viewing
         if self.debug & self.verbose:
@@ -399,6 +403,11 @@ def read_args():
     parser.add_argument("-v", "--verbose", action="store_true",
                         help="verbose flag for printing more debug information \
                               (must be used in conjunction with the debug flag)")
+    parser.add_argument("-s", "--save", action="store_true",
+                        help="save flag for writing the filtered dataframe to a csv file")
+    parser.add_argument("-np", "--no_plot", action="store_true",
+                        help="no-plot flag for disabling plot generation and saving")
+
     return parser.parse_args()


@@ -408,7 +417,7 @@ def main():
     args = read_args()

     try:
-        post = PostProcessing(args.log_path, args.debug, args.verbose)
+        post = PostProcessing(args.log_path, args.debug, args.verbose, args.save, not args.no_plot)
         config = ConfigHandler.from_path(args.config_path)
         post.run_post_processing(config)

diff --git a/post-processing/post_processing_config.yaml b/post-processing/post_processing_config.yaml
index 478359a3..c421b1b4 100644
--- a/post-processing/post_processing_config.yaml
+++ b/post-processing/post_processing_config.yaml
@@ -48,7 +48,7 @@ column_types:
   system: "str"
   cpus_per_task: "int"

-# Specify which columns to export to csv file
+# Optional setting to specify additional columns to export to csv file, in addition to
+# the ones in axes/series/filters
 additional_columns_to_csv:
-  [flops_value,
-  cpus_per_task]
\ No newline at end of file
+  ["spack_spec"]
\ No newline at end of file

From 264a07f0e18dd00ca778bc11335cc9550de43514 Mon Sep 17 00:00:00 2001
From: Ilektra Christidi
Date: Thu, 11 Apr 2024 19:02:34 +0100
Subject: [PATCH 8/8] Remove duplicates between all_columns and extra_columns,
 add unit tests.
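
The change below keeps extra_columns disjoint from all_columns, so no column is
exported to the csv file twice. A while loop is needed because list.remove()
only deletes the first occurrence of a value, so a repeated entry such as
["tasks", "tasks"] takes more than one pass to clear. A minimal equivalent
sketch (not the code in this patch) does the same in a single pass:

    # keep only the extra columns not already selected for plotting/filtering;
    # the list comprehension drops every occurrence at once
    extra_columns = [c for c in extra_columns if c not in all_columns]

Both versions preserve the order of the surviving extra columns.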
--- post-processing/config_handler.py | 7 ++ post-processing/test_post_processing.py | 107 ++++++++++++++++++++---- 2 files changed, 98 insertions(+), 16 deletions(-) diff --git a/post-processing/config_handler.py b/post-processing/config_handler.py index c880226e..e4b7b3d2 100644 --- a/post-processing/config_handler.py +++ b/post-processing/config_handler.py @@ -154,6 +154,13 @@ def parse_columns(self): dict.fromkeys((self.plot_columns + self.filter_columns + ([self.scaling_column.get("name")] if self.scaling_column else [])))) + # remove duplicated columns from the extra_columns list + duplicates = set(self.all_columns) & set(self.extra_columns) + while len(duplicates) != 0: + for d in duplicates: + self.extra_columns.remove(d) + duplicates = set(self.all_columns) & set(self.extra_columns) + def remove_redundant_types(self): """ Check for columns that are no longer in use and remove them from the type dict. diff --git a/post-processing/test_post_processing.py b/post-processing/test_post_processing.py index e75b5548..f183bc23 100644 --- a/post-processing/test_post_processing.py +++ b/post-processing/test_post_processing.py @@ -236,7 +236,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"fake_column": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) except KeyError as e: assert e.args[1] == ["fake_column"] else: @@ -256,7 +257,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"tasks": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) except KeyError as e: assert e.args[1] == "!!" else: @@ -276,7 +278,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"tasks": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) except ValueError: assert True else: @@ -296,7 +299,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"tasks": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) except pd.errors.EmptyDataError: assert True else: @@ -315,7 +319,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"tasks": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) except RuntimeError: assert True else: @@ -334,7 +339,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"tasks": "int", "cpus_per_task": "int", - "extra_param": "int"}})) + "extra_param": "int"}, + "additional_columns_to_csv": []})) except RuntimeError as e: # three param columns found in changed log EXPECTED_FIELDS = ["tasks", "cpus_per_task", "extra_param"] @@ -356,7 +362,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"job_completion_time": "datetime", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) # check returned subset is as expected assert len(df) == 2 @@ -374,7 +381,8 @@ def test_high_level_script(run_sombrero): "column_types": {"tasks": "int", "cpus_per_task": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) # check returned subset is as expected assert len(df) == 4 @@ -394,7 +402,8 @@ def test_high_level_script(run_sombrero): "flops_value": "float", "flops_unit": "str", "cpus_per_task": 
"int", - "OMP_NUM_THREADS": "int"}})) + "OMP_NUM_THREADS": "int"}, + "additional_columns_to_csv": []})) # check flops values are halved compared to previous df assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all() @@ -413,7 +422,8 @@ def test_high_level_script(run_sombrero): "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", - "cpus_per_task": "int"}})) + "cpus_per_task": "int"}, + "additional_columns_to_csv": []})) assert (dfs[dfs["cpus_per_task"] == 1]["flops_value"].values == df[df["cpus_per_task"] == 1]["flops_value"].values / df[df["cpus_per_task"] == 1]["flops_value"].values).all() @@ -437,7 +447,8 @@ def test_high_level_script(run_sombrero): "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", - "cpus_per_task": "int"}})) + "cpus_per_task": "int"}, + "additional_columns_to_csv": []})) assert (dfs["flops_value"].values == df["flops_value"].values / df[(df["cpus_per_task"] == 1) & (df["tasks"] == 2)]["flops_value"].iloc[0]).all() @@ -456,7 +467,8 @@ def test_high_level_script(run_sombrero): "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", - "cpus_per_task": "int"}})) + "cpus_per_task": "int"}, + "additional_columns_to_csv": []})) # check flops values are halved compared to previous df assert (dfs["flops_value"].values == df[df["cpus_per_task"] == 2]["flops_value"].values/2).all() @@ -476,7 +488,8 @@ def test_high_level_script(run_sombrero): "flops_value": "float", "flops_unit": "str", "cpus_per_task": "int", - "OMP_NUM_THREADS": "str"}})) + "OMP_NUM_THREADS": "str"}, + "additional_columns_to_csv": []})) except TypeError: assert True @@ -496,7 +509,8 @@ def test_high_level_script(run_sombrero): "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", - "cpus_per_task": "int"}})) + "cpus_per_task": "int"}, + "additional_columns_to_csv": []})) except ValueError: assert True @@ -514,7 +528,8 @@ def test_high_level_script(run_sombrero): "series": [], "column_types": {"tasks": "int", "flops_value": "float", - "flops_unit": "str"}})) + "flops_unit": "str"}, + "additional_columns_to_csv": []})) except RuntimeError as e: # dataframe has records from both files assert len(e.args[1]) == 8 @@ -535,9 +550,69 @@ def test_high_level_script(run_sombrero): "column_types": {"tasks": "int", "flops_value": "float", "flops_unit": "str", - "cpus_per_task": "int"}})) + "cpus_per_task": "int"}, + "additional_columns_to_csv": []})) EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"] # check returned subset is as expected assert df.columns.tolist() == EXPECTED_FIELDS assert len(df) == 1 + + # get filtered dataframe with extra columns for csv + df = PostProcessing(sombrero_log_path, save=True).run_post_processing( + ConfigHandler( + {"title": "Title", + "x_axis": {"value": "tasks", + "units": {"custom": None}}, + "y_axis": {"value": "flops_value", + "units": {"column": "flops_unit"}}, + "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]], + "or": []}, + "series": [], + "column_types": {"tasks": "int", + "flops_value": "float", + "flops_unit": "str", + "cpus_per_task": "int"}, + "additional_columns_to_csv": ["spack_spec"]} + )) + + EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"] + # check returned subset is as expected + assert df.columns.tolist() == EXPECTED_FIELDS + assert len(df) == 1 + + EXPECTED_FIELDS.append("spack_spec") + # check subset written to csv is as expected + output_file = "output.csv" + df_saved = pd.read_csv(output_file, 
index_col=0) + assert df_saved.columns.tolist() == EXPECTED_FIELDS + assert len(df_saved) == 1 + + # get filtered dataframe with duplicated extra columns for csv + df = PostProcessing(sombrero_log_path, save=True).run_post_processing( + ConfigHandler( + {"title": "Title", + "x_axis": {"value": "tasks", + "units": {"custom": None}}, + "y_axis": {"value": "flops_value", + "units": {"column": "flops_unit"}}, + "filters": {"and": [["tasks", ">", 1], ["cpus_per_task", "==", 2]], + "or": []}, + "series": [], + "column_types": {"tasks": "int", + "flops_value": "float", + "flops_unit": "str", + "cpus_per_task": "int"}, + "additional_columns_to_csv": ["tasks", "tasks"]} + )) + + EXPECTED_FIELDS = ["tasks", "flops_value", "flops_unit"] + # check returned subset is as expected + assert df.columns.tolist() == EXPECTED_FIELDS + assert len(df) == 1 + + # check subset written to csv is as expected + output_file = "output.csv" + df_saved = pd.read_csv(output_file, index_col=0) + assert df_saved.columns.tolist() == EXPECTED_FIELDS + assert len(df_saved) == 1
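
Usage note: with the full series applied, the command below writes the filtered
dataframe to output.csv next to post_processing.py and skips plot generation
(the -s/--save and -np/--no_plot flags are added in PATCH 7/8):

    python post_processing.py log_path config_path -s -np

A minimal sketch of reading the export back, mirroring the unit tests above
(output.csv is written with index=True, so the index is restored from the
first column):

    import pandas as pd

    # read the csv produced by post_processing.py --save;
    # index_col=0 recovers the dataframe index written by index=True
    df_saved = pd.read_csv("output.csv", index_col=0)
    print(df_saved.columns.tolist())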