feat: create tables for HEPData (#215)
* add the possibility to create HEPData tables via a new option
AndriiPovsten authored Jul 12, 2024
1 parent d8b529f commit ebdc634
Showing 5 changed files with 133 additions and 7 deletions.
34 changes: 34 additions & 0 deletions analyses/cms-open-data-ttbar/README.md
@@ -18,6 +18,7 @@ This directory is focused on running the CMS Open Data $t\bar{t}$ analysis throu
| models/ | Contains models used for ML inference task (when `USE_TRITON = False`) |
| utils/ | Contains code for bookkeeping and cosmetics, as well as some boilerplate. Also contains images used in notebooks. |
| utils/config.py | This is a general config file to handle different options for running the analysis. |
| utils/hepdata.py | Functions to create tables for submission to the [HEPData website](https://www.hepdata.net) (enable with `"HEPData": True` in `utils/config.py`) |

#### Instructions for paired notebook

@@ -51,3 +52,36 @@ argument is the appropriate reference file for the number of files per process a
For full usage help see the output of `python validate_histograms.py --help`.

`validate_histograms.py` can also be used to create new references by passing the `--dump-json` option.

#### HEPData creation and submission

For a proper submission, you need to edit `submission.yaml` with a proper description of the variables and of your table.
To submit the created histograms to HEPData, you'll need to install the necessary packages and make a few modifications to the `ttbar_analysis_pipeline.ipynb` notebook.
```console
pip install hepdata_lib hepdata-cli
```
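
Table creation is controlled by the new `preservation` option in `utils/config.py`, which the notebook checks via `utils.config["preservation"]["HEPData"]`. A minimal sketch of the entry to enable (it defaults to `False`; the surrounding sections of the config are unchanged):

```python
# utils/config.py (sketch of the relevant entry) -- turn on HEPData table creation
config = {
    # ... existing configuration sections ...
    "preservation": {
        "HEPData": True,  # default is False; set to True to create the tables
    },
}
```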
Next, modify the notebook to enable the submission in one run. You'll need to create a zip archive of your data for uploading.

```python
import shutil

folder_path = "hepdata_model"   # name of the folder that was created with the HEPData syntax
zip_filename = "hepdata_model"  # shutil.make_archive appends the ".zip" extension itself
temp_folder = "temp_folder"

# Copy the folder to a temporary location, leaving out unwanted files
shutil.copytree(folder_path, temp_folder, ignore=shutil.ignore_patterns('.ipynb_checkpoints'))
# Create hepdata_model.zip from the temporary folder
shutil.make_archive(zip_filename, 'zip', temp_folder)
# Remove the temporary folder
shutil.rmtree(temp_folder)
```

```python
from getpass import getpass
import os

# Read the upload password without echoing it
password = getpass("Enter your password: ")

# Path to the archive created above; replace the e-mail address with your own
command = "hepdata-cli upload '/home/cms-jovyan/analysis-grand-challenge/analyses/cms-open-data-ttbar/hepdata_model.zip' -e yourname.yoursurname@cern.ch"
os.system(f'echo {password} | {command}')  # the password is piped to the hepdata-cli prompt
```
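
Piping the password through `echo` puts it on a shell command line. As an alternative, here is a sketch using Python's `subprocess` (assuming, as above, that `hepdata-cli` reads the password from standard input), which keeps it out of the shell:

```python
import subprocess
from getpass import getpass

password = getpass("Enter your password: ")
archive = "/home/cms-jovyan/analysis-grand-challenge/analyses/cms-open-data-ttbar/hepdata_model.zip"

# Feed the password to hepdata-cli via stdin instead of the shell command line
subprocess.run(
    ["hepdata-cli", "upload", archive, "-e", "yourname.yoursurname@cern.ch"],
    input=password + "\n",
    text=True,
    check=True,
)
```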
If the submission is successful, you'll see your uploaded data at the provided link.

21 changes: 18 additions & 3 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb
@@ -1210,9 +1210,9 @@
"source": [
"# obtain model prediction before and after fit\n",
"if USE_INFERENCE:\n",
- " model_prediction = cabinetry.model_utils.prediction(model_ml)\n",
+ " model_prediction_ml = cabinetry.model_utils.prediction(model_ml)\n",
" fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results)\n",
- " model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)"
+ " model_prediction_postfit_ml = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)"
]
},
{
@@ -1296,7 +1296,22 @@
],
"source": [
"if USE_INFERENCE:\n",
- " utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml)"
+ " utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit_ml, data_ml, config_ml)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "54967894",
"metadata": {},
"outputs": [],
"source": [
"if utils.config[\"preservation\"][\"HEPData\"] is True:\n",
" import utils.hepdata\n",
" #Submission of model prediction\n",
" utils.hepdata.preparing_hep_data_format(model, model_prediction, \"hepdata_model\", cabinetry_config)\n",
" #Submission of model_ml prediction\n",
" utils.hepdata.preparing_hep_data_format(model_ml, model_prediction_ml,\"hepdata_model_ml\", config_ml)"
]
},
{
16 changes: 12 additions & 4 deletions analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py
Expand Up @@ -6,7 +6,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
-# jupytext_version: 1.15.2
+# jupytext_version: 1.16.2
# kernelspec:
# display_name: Python 3 (ipykernel)
# language: python
@@ -723,13 +723,21 @@ def get_query(source):
# %%
# obtain model prediction before and after fit
if USE_INFERENCE:
-    model_prediction = cabinetry.model_utils.prediction(model_ml)
+    model_prediction_ml = cabinetry.model_utils.prediction(model_ml)
    fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results)
-    model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)
+    model_prediction_postfit_ml = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)

# %%
if USE_INFERENCE:
-    utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml)
+    utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit_ml, data_ml, config_ml)

# %%
if utils.config["preservation"]["HEPData"] is True:
    import utils.hepdata
    # Prepare HEPData tables for the model prediction
    utils.hepdata.preparing_hep_data_format(model, model_prediction, "hepdata_model", cabinetry_config)
    # Prepare HEPData tables for the model_ml prediction
    utils.hepdata.preparing_hep_data_format(model_ml, model_prediction_ml, "hepdata_model_ml", config_ml)

# %% [markdown]
# ### What is next?
3 changes: 3 additions & 0 deletions analyses/cms-open-data-ttbar/utils/config.py
@@ -196,4 +196,7 @@
            "Quark vs Gluon likelihood discriminator of the $b_{top-lep}$ Jet",
        ],
    },
    "preservation": {
        "HEPData": False
    }
}
66 changes: 66 additions & 0 deletions analyses/cms-open-data-ttbar/utils/hepdata.py
@@ -0,0 +1,66 @@
from hepdata_lib import Submission, Table, Variable, Uncertainty

def preparing_hep_data_format(model, model_prediction, path, config):
    submission = Submission()
    for i in range(1, len(model.config.channels) + 1):
        table = create_hep_data_table_with_config(i, model, model_prediction, config)
        submission.add_table(table)
    submission.add_additional_resource("Workspace file", "workspace.json", copy_file=True)
    submission.create_files(path, remove_old=True)

def create_hep_data_table_with_config(index, model, model_prediction, config):
    return create_hep_data_table(index, model, model_prediction, config)

def create_hep_data_table(index, model, model_prediction, config):
    output = {}

    for i_chan, channel in enumerate(model.config.channels):
        for j_sam, sample in enumerate(model.config.samples):
            yields = model_prediction.model_yields[i_chan][j_sam]
            uncertainties = model_prediction.total_stdev_model_bins[i_chan][j_sam]
            num_bins = len(yields)
            for k_bin in range(num_bins):
                key = f"{channel} {sample} bin{k_bin}"
                value = {"value": yields[k_bin], "symerror": uncertainties[k_bin]}
                output[key] = value

    independent_variables = []
    dependent_variables = []
    independent_variables_ml = []
    dependent_variables_ml = []

    for key in output.keys():
        columns = key.split()
        if f'4j{index}b' in key:
            independent_variables.append(f"{columns[0]} {columns[1]} {columns[-1]}")
            dependent_variables.append(' '.join(columns[2:-1]))
        elif f'Feature{index}' in key:
            independent_variables_ml.append(f"{columns[0]} {columns[-1]}")
            dependent_variables_ml.append(' '.join(columns[1:-1]))

    table_name = ""
    if independent_variables:
        table_name = f"4j{index}b Figure"
    elif independent_variables_ml:
        table_name = f"Feature{index} Figure"

    table = Table(table_name)

    # Create a single variable for the region corresponding to the feature index
    region = config['Regions'][index - 1]
    var = Variable(f"Region {index}", is_independent=True, is_binned=False, units=region['Variable'])
    var.values = [f"Feature{index} bin{k_bin}" for k_bin in range(len(model_prediction.model_yields[0][0]))]
    table.add_variable(var)

    # Add dependent variables and uncertainties
    for i, sample in enumerate(model.config.samples):
        data_var = Variable(sample, is_independent=False, is_binned=False, units="Number of jets")
        data_var.values = model_prediction.model_yields[index - 1][i]

        unc = Uncertainty("A symmetric error", is_symmetric=True)
        unc.values = model_prediction.total_stdev_model_bins[index - 1][i]

        data_var.add_uncertainty(unc)
        table.add_variable(data_var)

    return table
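
For reference, `preparing_hep_data_format` is the entry point the notebook calls when the `HEPData` flag is enabled. A minimal standalone sketch of its use (assuming the cabinetry `model`, its `model_prediction`, and the corresponding cabinetry configuration are already available, e.g. after running the fitting part of `ttbar_analysis_pipeline.py`):

```python
# Sketch: create HEPData tables outside the notebook.
# Assumes `model`, `model_prediction` and `cabinetry_config` already exist
# (as they do after the fitting part of the analysis pipeline).
import utils.hepdata

utils.hepdata.preparing_hep_data_format(
    model,             # statistical model built by cabinetry
    model_prediction,  # cabinetry.model_utils.prediction(model)
    "hepdata_model",   # output folder, zipped later for the upload
    cabinetry_config,  # configuration providing the "Regions" list
)
```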
