feat: create tables for HEPData (#215)
* add the possibility to create HEPData tables via a new option
AndriiPovsten authored Jul 12, 2024
1 parent d8b529f commit ebdc634
Showing 5 changed files with 133 additions and 7 deletions.
34 changes: 34 additions & 0 deletions analyses/cms-open-data-ttbar/README.md
@@ -18,6 +18,7 @@ This directory is focused on running the CMS Open Data $t\bar{t}$ analysis throu
| models/ | Contains models used for ML inference task (when `USE_TRITON = False`) |
| utils/ | Contains code for bookkeeping and cosmetics, as well as some boilerplate. Also contains images used in notebooks. |
| utils/config.py | This is a general config file to handle different options for running the analysis. |
| utils/hepdata.py | Functions to create tables for submission to the [HEPData website](https://www.hepdata.net) (enable with `"HEPData": True` in `utils/config.py`) |

#### Instructions for paired notebook

@@ -51,3 +52,36 @@ argument is the appropriate reference file for the number of files per process a
For full usage help see the output of `python validate_histograms.py --help`.

`validate_histograms.py` can also be used to create new references by passing the `--dump-json` option.

#### HEPData creation and submission

For a proper submission, you need to edit `submission.yaml` with a proper description of the variables and of your table.
To submit the created histograms to HEPData, you'll need to install the necessary packages and make a few modifications to the `ttbar_analysis_pipeline.ipynb` notebook.
```console
pip install hepdata_lib hepdata-cli
```
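
Table creation is controlled by the new `preservation` option in `utils/config.py`, which the notebook checks via `utils.config["preservation"]["HEPData"]`. A minimal sketch of the entry to enable (it defaults to `False`; the surrounding sections of the config are unchanged):

```python
# utils/config.py (sketch of the relevant entry) -- turn on HEPData table creation
config = {
    # ... existing configuration sections ...
    "preservation": {
        "HEPData": True,  # default is False; set to True to create the tables
    },
}
```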
Next, modify the notebook to enable the submission in one run. You'll need to create a zip archive of your data for uploading.

```python
import shutil

folder_path = "hepdata_model"   # name of the folder that was created with the HEPData syntax
zip_filename = "hepdata_model"  # shutil.make_archive appends the ".zip" extension itself
temp_folder = "temp_folder"

# Copy the folder to a temporary location, leaving out unwanted files
shutil.copytree(folder_path, temp_folder, ignore=shutil.ignore_patterns('.ipynb_checkpoints'))
# Create hepdata_model.zip from the temporary folder
shutil.make_archive(zip_filename, 'zip', temp_folder)
# Remove the temporary folder
shutil.rmtree(temp_folder)
```

```python
from getpass import getpass
import os

# Read the upload password without echoing it
password = getpass("Enter your password: ")

# Path to the archive created above; replace the e-mail address with your own
command = "hepdata-cli upload '/home/cms-jovyan/analysis-grand-challenge/analyses/cms-open-data-ttbar/hepdata_model.zip' -e yourname.yoursurname@cern.ch"
os.system(f'echo {password} | {command}')  # the password is piped to the hepdata-cli prompt
```
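
Piping the password through `echo` puts it on a shell command line. As an alternative, here is a sketch using Python's `subprocess` (assuming, as above, that `hepdata-cli` reads the password from standard input), which keeps it out of the shell:

```python
import subprocess
from getpass import getpass

password = getpass("Enter your password: ")
archive = "/home/cms-jovyan/analysis-grand-challenge/analyses/cms-open-data-ttbar/hepdata_model.zip"

# Feed the password to hepdata-cli via stdin instead of the shell command line
subprocess.run(
    ["hepdata-cli", "upload", archive, "-e", "yourname.yoursurname@cern.ch"],
    input=password + "\n",
    text=True,
    check=True,
)
```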
If the submission is successful, you'll see your uploaded data at the provided link.

21 changes: 18 additions & 3 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -1210,9 +1210,9 @@
"source": [
"# obtain model prediction before and after fit\n",
"if USE_INFERENCE:\n",
- " model_prediction = cabinetry.model_utils.prediction(model_ml)\n",
+ " model_prediction_ml = cabinetry.model_utils.prediction(model_ml)\n",
" fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results)\n",
- " model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)"
+ " model_prediction_postfit_ml = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)"
]
},
{
@@ -1296,7 +1296,22 @@
],
"source": [
"if USE_INFERENCE:\n",
- " utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml)"
+ " utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit_ml, data_ml, config_ml)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54967894",
"metadata": {},
"outputs": [],
"source": [
"if utils.config[\"preservation\"][\"HEPData\"] is True:\n",
" import utils.hepdata\n",
" #Submission of model prediction\n",
" utils.hepdata.preparing_hep_data_format(model, model_prediction, \"hepdata_model\", cabinetry_config)\n",
" #Submission of model_ml prediction\n",
" utils.hepdata.preparing_hep_data_format(model_ml, model_prediction_ml,\"hepdata_model_ml\", config_ml)"
]
},
{
16 changes: 12 additions & 4 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
Expand Up @@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.15.2
+# jupytext_version: 1.16.2
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
@@ -723,13 +723,21 @@ def get_query(source):
# %%
# obtain model prediction before and after fit
if USE_INFERENCE:
-    model_prediction = cabinetry.model_utils.prediction(model_ml)
+    model_prediction_ml = cabinetry.model_utils.prediction(model_ml)
    fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results)
-    model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)
+    model_prediction_postfit_ml = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)

# %%
if USE_INFERENCE:
-    utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml)
+    utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit_ml, data_ml, config_ml)

# %%
if utils.config["preservation"]["HEPData"] is True:
    import utils.hepdata
    # Prepare HEPData tables for the model prediction
    utils.hepdata.preparing_hep_data_format(model, model_prediction, "hepdata_model", cabinetry_config)
    # Prepare HEPData tables for the model_ml prediction
    utils.hepdata.preparing_hep_data_format(model_ml, model_prediction_ml, "hepdata_model_ml", config_ml)

# %% [markdown]
# ### What is next?
3 changes: 3 additions & 0 deletions analyses/cms-open-data-ttbar/utils/config.py
@@ -196,4 +196,7 @@
            "Quark vs Gluon likelihood discriminator of the $b_{top-lep}$ Jet",
        ],
    },
    "preservation": {
        "HEPData": False
    }
}
66 changes: 66 additions & 0 deletions analyses/cms-open-data-ttbar/utils/hepdata.py
@@ -0,0 +1,66 @@
from hepdata_lib import Submission, Table, Variable, Uncertainty

def preparing_hep_data_format(model, model_prediction, path, config):
    submission = Submission()
    for i in range(1, len(model.config.channels) + 1):
        table = create_hep_data_table_with_config(i, model, model_prediction, config)
        submission.add_table(table)
    submission.add_additional_resource("Workspace file", "workspace.json", copy_file=True)
    submission.create_files(path, remove_old=True)

def create_hep_data_table_with_config(index, model, model_prediction, config):
    return create_hep_data_table(index, model, model_prediction, config)

def create_hep_data_table(index, model, model_prediction, config):
    output = {}

    for i_chan, channel in enumerate(model.config.channels):
        for j_sam, sample in enumerate(model.config.samples):
            yields = model_prediction.model_yields[i_chan][j_sam]
            uncertainties = model_prediction.total_stdev_model_bins[i_chan][j_sam]
            num_bins = len(yields)
            for k_bin in range(num_bins):
                key = f"{channel} {sample} bin{k_bin}"
                value = {"value": yields[k_bin], "symerror": uncertainties[k_bin]}
                output[key] = value

    independent_variables = []
    dependent_variables = []
    independent_variables_ml = []
    dependent_variables_ml = []

    for key in output.keys():
        columns = key.split()
        if f'4j{index}b' in key:
            independent_variables.append(f"{columns[0]} {columns[1]} {columns[-1]}")
            dependent_variables.append(' '.join(columns[2:-1]))
        elif f'Feature{index}' in key:
            independent_variables_ml.append(f"{columns[0]} {columns[-1]}")
            dependent_variables_ml.append(' '.join(columns[1:-1]))

    table_name = ""
    if independent_variables:
        table_name = f"4j{index}b Figure"
    elif independent_variables_ml:
        table_name = f"Feature{index} Figure"

    table = Table(table_name)

    # Create a single variable for the region corresponding to the feature index
    region = config['Regions'][index - 1]
    var = Variable(f"Region {index}", is_independent=True, is_binned=False, units=region['Variable'])
    var.values = [f"Feature{index} bin{k_bin}" for k_bin in range(len(model_prediction.model_yields[0][0]))]
    table.add_variable(var)

    # Add dependent variables and uncertainties
    for i, sample in enumerate(model.config.samples):
        data_var = Variable(sample, is_independent=False, is_binned=False, units="Number of jets")
        data_var.values = model_prediction.model_yields[index - 1][i]

        unc = Uncertainty("A symmetric error", is_symmetric=True)
        unc.values = model_prediction.total_stdev_model_bins[index - 1][i]

        data_var.add_uncertainty(unc)
        table.add_variable(data_var)

    return table
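
For reference, `preparing_hep_data_format` is the entry point the notebook calls when the `HEPData` flag is enabled. A minimal standalone sketch of its use (assuming the cabinetry `model`, its `model_prediction`, and the corresponding cabinetry configuration are already available, e.g. after running the fitting part of `ttbar_analysis_pipeline.py`):

```python
# Sketch: create HEPData tables outside the notebook.
# Assumes `model`, `model_prediction` and `cabinetry_config` already exist
# (as they do after the fitting part of the analysis pipeline).
import utils.hepdata

utils.hepdata.preparing_hep_data_format(
    model,             # statistical model built by cabinetry
    model_prediction,  # cabinetry.model_utils.prediction(model)
    "hepdata_model",   # output folder, zipped later for the upload
    cabinetry_config,  # configuration providing the "Regions" list
)
```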
