From ebdc63412976d4e458d4fa510b40fda9257f29b4 Mon Sep 17 00:00:00 2001 From: Andrii Povsten <80416008+AndriiPovsten@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:41:33 +0200 Subject: [PATCH] feat: create tables for HEPData (#215) * add possibility to create HEPData tables via new option --- analyses/cms-open-data-ttbar/README.md | 34 ++++++++++ .../ttbar_analysis_pipeline.ipynb | 21 +++++- .../ttbar_analysis_pipeline.py | 16 +++-- analyses/cms-open-data-ttbar/utils/config.py | 3 + analyses/cms-open-data-ttbar/utils/hepdata.py | 66 +++++++++++++++++++ 5 files changed, 133 insertions(+), 7 deletions(-) create mode 100644 analyses/cms-open-data-ttbar/utils/hepdata.py diff --git a/analyses/cms-open-data-ttbar/README.md b/analyses/cms-open-data-ttbar/README.md index caa8be30..214d98d7 100644 --- a/analyses/cms-open-data-ttbar/README.md +++ b/analyses/cms-open-data-ttbar/README.md @@ -18,6 +18,7 @@ This directory is focused on running the CMS Open Data $t\bar{t}$ analysis throu | models/ | Contains models used for ML inference task (when `USE_TRITON = False`) | | utils/ | Contains code for bookkeeping and cosmetics, as well as some boilerplate. Also contains images used in notebooks. | | utils/config.py | This is a general config file to handle different options for running the analysis. | +| utils/hepdata.py | Functions to create tables for submission to the [HEPData website](https://www.hepdata.net) (enable by setting `"HEPData": True` under `"preservation"` in `utils/config.py`) | #### Instructions for paired notebook @@ -51,3 +52,36 @@ argument is the appropriate reference file for the number of files per process a For full usage help see the output of `python validate_histograms.py --help`. `validate_histograms.py` can also be used to create new references by passing the `--dump-json` option. + +#### HEPData creation and submission +For a proper submission, you need to edit the `submission.yaml` file to add a proper description of the variables and of your table. 
+To submit the created histograms to HEPData, you'll need to install the necessary packages and make some modifications to the `ttbar_analysis_pipeline.ipynb` notebook. +``` console +pip install hepdata_lib hepdata-cli +``` +Next, modify the notebook to enable the submission in one run. You'll need to create a zip archive of your data for uploading. + +```python +import shutil +folder_path = "hepdata_model" #name of the folder which was created with hepdata syntax +zip_filename = "hepdata_model.zip" +temp_folder = "temp_folder" +# Create a temporary folder without unwanted files +shutil.copytree(folder_path, temp_folder, ignore=shutil.ignore_patterns('.ipynb_checkpoints')) +# Create the archive from the temporary folder (make_archive appends '.zip' itself, so the resulting file is hepdata_model.zip.zip) +shutil.make_archive(zip_filename, 'zip', temp_folder) +# Remove the temporary folder +shutil.rmtree(temp_folder) +``` + +```python +from getpass import getpass +import os +# Get the password securely +password = getpass("Enter your password: ") + +command = f"hepdata-cli upload '/home/cms-jovyan/analysis-grand-challenge/analyses/cms-open-data-ttbar/hepdata_model.zip.zip' -e yourname.yoursurname@cern.ch" +os.system(f'echo {password} | {command}') #insert your password in the active window +``` +If the submission is successful, you'll see your uploaded data at the provided link. 
+ diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb index 257de8d9..66d9a8c4 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.ipynb @@ -1210,9 +1210,9 @@ "source": [ "# obtain model prediction before and after fit\n", "if USE_INFERENCE:\n", - " model_prediction = cabinetry.model_utils.prediction(model_ml)\n", + " model_prediction_ml = cabinetry.model_utils.prediction(model_ml)\n", " fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results)\n", - " model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)" + " model_prediction_postfit_ml = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod)" ] }, { @@ -1296,7 +1296,22 @@ ], "source": [ "if USE_INFERENCE:\n", - " utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml)" + " utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit_ml, data_ml, config_ml)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54967894", + "metadata": {}, + "outputs": [], + "source": [ + "if utils.config[\"preservation\"][\"HEPData\"] is True:\n", + " import utils.hepdata\n", + " #Submission of model prediction\n", + " utils.hepdata.preparing_hep_data_format(model, model_prediction, \"hepdata_model\", cabinetry_config)\n", + " #Submission of model_ml prediction\n", + " utils.hepdata.preparing_hep_data_format(model_ml, model_prediction_ml,\"hepdata_model_ml\", config_ml)" ] }, { diff --git a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py index ca5642e5..81c33da4 100644 --- a/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py +++ b/analyses/cms-open-data-ttbar/ttbar_analysis_pipeline.py @@ -6,7 +6,7 @@ # extension: .py # format_name: percent # 
format_version: '1.3' -# jupytext_version: 1.15.2 +# jupytext_version: 1.16.2 # kernelspec: # display_name: Python 3 (ipykernel) # language: python @@ -723,13 +723,21 @@ def get_query(source): # %% # obtain model prediction before and after fit if USE_INFERENCE: - model_prediction = cabinetry.model_utils.prediction(model_ml) + model_prediction_ml = cabinetry.model_utils.prediction(model_ml) fit_results_mod = cabinetry.model_utils.match_fit_results(model_ml, fit_results) - model_prediction_postfit = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod) + model_prediction_postfit_ml = cabinetry.model_utils.prediction(model_ml, fit_results=fit_results_mod) # %% if USE_INFERENCE: - utils.plotting.plot_data_mc(model_prediction, model_prediction_postfit, data_ml, config_ml) + utils.plotting.plot_data_mc(model_prediction_ml, model_prediction_postfit_ml, data_ml, config_ml) + +# %% +if utils.config["preservation"]["HEPData"] is True: + import utils.hepdata + #Submission of model prediction + utils.hepdata.preparing_hep_data_format(model, model_prediction, "hepdata_model", cabinetry_config) + #Submission of model_ml prediction + utils.hepdata.preparing_hep_data_format(model_ml, model_prediction_ml,"hepdata_model_ml", config_ml) # %% [markdown] # ### What is next? 
diff --git a/analyses/cms-open-data-ttbar/utils/config.py b/analyses/cms-open-data-ttbar/utils/config.py index 119fd55e..3e1f0003 100644 --- a/analyses/cms-open-data-ttbar/utils/config.py +++ b/analyses/cms-open-data-ttbar/utils/config.py @@ -196,4 +196,7 @@ "Quark vs Gluon likelihood discriminator of the $b_{top-lep}$ Jet", ], }, + "preservation": { + "HEPData": False + } } diff --git a/analyses/cms-open-data-ttbar/utils/hepdata.py b/analyses/cms-open-data-ttbar/utils/hepdata.py new file mode 100644 index 00000000..dae520e6 --- /dev/null +++ b/analyses/cms-open-data-ttbar/utils/hepdata.py @@ -0,0 +1,66 @@ +from hepdata_lib import Submission, Table, Variable, Uncertainty + +def preparing_hep_data_format(model, model_prediction, path, config): + submission = Submission() + for i in range(1, len(model.config.channels) + 1): + table = create_hep_data_table_with_config(i, model, model_prediction, config) + submission.add_table(table) + submission.add_additional_resource("Workspace file", "workspace.json", copy_file=True) + submission.create_files(path, remove_old=True) + +def create_hep_data_table_with_config(index, model, model_prediction, config): + return create_hep_data_table(index, model, model_prediction, config) + +def create_hep_data_table(index, model, model_prediction, config): + output = {} + + for i_chan, channel in enumerate(model.config.channels): + for j_sam, sample in enumerate(model.config.samples): + yields = model_prediction.model_yields[i_chan][j_sam] + uncertainties = model_prediction.total_stdev_model_bins[i_chan][j_sam] + num_bins = len(yields) + for k_bin in range(num_bins): + key = f"{channel} {sample} bin{k_bin}" + value = {"value": yields[k_bin], "symerror": uncertainties[k_bin]} + output[key] = value + + independent_variables = [] + dependent_variables = [] + independent_variables_ml = [] + dependent_variables_ml = [] + + for key in output.keys(): + columns = key.split() + if f'4j{index}b' in key: + 
independent_variables.append(f"{columns[0]} {columns[1]} {columns[-1]}") + dependent_variables.append(' '.join(columns[2:-1])) + elif f'Feature{index}' in key: + independent_variables_ml.append(f"{columns[0]} {columns[-1]}") + dependent_variables_ml.append(' '.join(columns[1:-1])) + + table_name = "" + if independent_variables: + table_name = f"4j{index}b Figure" + elif independent_variables_ml: + table_name = f"Feature{index} Figure" + + table = Table(table_name) + + # Create the independent variable for this region; the number of bin labels is taken from this region's own yields (index - 1) so it matches the dependent variables below + region = config['Regions'][index - 1] + var = Variable(f"Region {index}", is_independent=True, is_binned=False, units=region['Variable']) + var.values = [f"Feature{index} bin{k_bin}" for k_bin in range(len(model_prediction.model_yields[index - 1][0]))] + table.add_variable(var) + + # Add dependent variables and uncertainties + for i, sample in enumerate(model.config.samples): + data_var = Variable(sample, is_independent=False, is_binned=False, units="Number of jets") + data_var.values = model_prediction.model_yields[index - 1][i] + + unc = Uncertainty("A symmetric error", is_symmetric=True) + unc.values = model_prediction.total_stdev_model_bins[index - 1][i] + + data_var.add_uncertainty(unc) + table.add_variable(data_var) + + return table