diff --git a/cmsdb/campaigns/run3_2022_postEE_nano_v12/data.py b/cmsdb/campaigns/run3_2022_postEE_nano_v12/data.py
index e343dcb6..63d3efe8 100644
--- a/cmsdb/campaigns/run3_2022_postEE_nano_v12/data.py
+++ b/cmsdb/campaigns/run3_2022_postEE_nano_v12/data.py
@@ -4,6 +4,8 @@
 CMS datasets from the 2022 post-EE data-taking campaign
 """
 
+from order import DatasetInfo
+
 import cmsdb.processes as procs
 
 from cmsdb.campaigns.run3_2022_postEE_nano_v12 import campaign_run3_2022_postEE_nano_v12 as cpn
@@ -120,11 +122,18 @@
     id=14783435,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022E-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=29,
-    n_events=12873327,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022E-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [],
+            },
+            n_files=29,  # 29-0
+            n_events=12873327,
+        ),
+    ),
     aux={
         "era": "E",
     },
@@ -135,11 +144,21 @@
     id=14784482,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022F-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=95,
-    n_events=38219969,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022F-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [
+                    "/store/data/Run2022F/MuonEG/NANOAOD/22Sep2023-v1/50000/4d76213a-ef14-411a-9558-559a6df3f978.root",  # empty  # noqa: E501
+                    "/store/data/Run2022F/MuonEG/NANOAOD/22Sep2023-v1/50000/4fb72196-3b02-4499-8f6c-a54e15692b32.root",  # empty  # noqa: E501
+                ],
+            },
+            n_files=93,  # 95-2
+            n_events=38219969,
+        ),
+    ),
     aux={
         "era": "F",
     },
@@ -150,11 +169,20 @@
     id=14784485,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022G-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=27,
-    n_events=6238527,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022G-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [
+                    "/store/data/Run2022G/MuonEG/NANOAOD/22Sep2023-v1/2520000/cd404eb6-8218-4787-b5ed-af6cd9fe3750.root",  # empty  # noqa: E501
+                ],
+            },
+            n_files=26,  # 27-1
+            n_events=6238527,
+        ),
+    ),
     aux={
         "era": "G",
     },
diff --git a/cmsdb/campaigns/run3_2022_preEE_nano_v12/data.py b/cmsdb/campaigns/run3_2022_preEE_nano_v12/data.py
index 8e0c414f..b87b07dd 100644
--- a/cmsdb/campaigns/run3_2022_preEE_nano_v12/data.py
+++ b/cmsdb/campaigns/run3_2022_preEE_nano_v12/data.py
@@ -4,6 +4,8 @@
 CMS datasets from the 2022 pre-EE data-taking campaign
 """
 
+from order import DatasetInfo
+
 import cmsdb.processes as procs
 
 from cmsdb.campaigns.run3_2022_preEE_nano_v12 import campaign_run3_2022_preEE_nano_v12 as cpn
@@ -120,11 +122,20 @@
     id=14783289,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022A-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=5,
-    n_events=12,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022A-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [
+                    "/store/data/Run2022A/MuonEG/NANOAOD/22Sep2023-v1/50000/9a127bdb-9522-4f49-b754-67bb9152c0b3.root",  # empty  # noqa: E501
+                ],
+            },
+            n_files=4,  # 5-1
+            n_events=12,
+        ),
+    ),
     aux={
         "era": "A",
     },
@@ -135,11 +146,20 @@
     id=14784076,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022B-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=7,
-    n_events=254803,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022B-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [
+                    "/store/data/Run2022B/MuonEG/NANOAOD/22Sep2023-v1/50000/947809ff-822e-4a3a-84a2-d3fe84fc2573.root",  # empty  # noqa: E501
+                ],
+            },
+            n_files=6,  # 7-1
+            n_events=254803,
+        ),
+    ),
     aux={
         "era": "B",
     },
@@ -150,11 +170,18 @@
     id=14784125,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022C-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=28,
-    n_events=15768439,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022C-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [],
+            },
+            n_files=28,  # 28-0
+            n_events=15768439,
+        ),
+    ),
     aux={
         "era": "C",
     },
@@ -165,11 +192,18 @@
     id=14784209,
     is_data=True,
     processes=[procs.data_muoneg],
-    keys=[
-        "/MuonEG/Run2022D-22Sep2023-v1/NANOAOD",  # noqa
-    ],
-    n_files=16,
-    n_events=8007031,
+    info=dict(
+        nominal=DatasetInfo(
+            keys=[
+                "/MuonEG/Run2022D-22Sep2023-v1/NANOAOD",  # noqa: E501
+            ],
+            aux={
+                "broken_files": [],
+            },
+            n_files=16,  # 16-0
+            n_events=8007031,
+        ),
+    ),
     aux={
         "era": "D",
     },
diff --git a/scripts/get_das_info.py b/scripts/get_das_info.py
index ac7ebe74..7733db2b 100644
--- a/scripts/get_das_info.py
+++ b/scripts/get_das_info.py
@@ -26,30 +26,33 @@ def get_generator_name(name: str) -> str:
     return ""
 
 
-def convert_default(data: dict, placeholder="PLACEHOLDER") -> str:
+def get_broken_files_str(data: dict, n_spaces: int = 20) -> str:
     """
-    Function that converts dataset info into one order Dataset per query
+    Function that returns a string representation of broken files
     """
-    generator = get_generator_name(data["name"])
-    return f"""cpn.add_dataset(
-    name="{placeholder}{generator}",
-    id={data['dataset_id']},
-    processes=[procs.{placeholder}],
-    keys=[
-        "{data['name']}",  # noqa
-    ],
-    n_files={data['nfiles']},
-    n_events={data['nevents']},
-)
-"""
+
+    broken_files_list = [
+        f'"{d}",  # broken  # noqa: E501' for d in data["broken_files"]
+    ] + [
+        f'"{d}",  # empty  # noqa: E501' for d in data["empty_files"] if d not in data["broken_files"]
+    ]
+
+    if not broken_files_list:
+        return ""
+    else:
+        return (
+            f"\n{' ' * n_spaces}" +
+            f"\n{' ' * n_spaces}".join(broken_files_list) +
+            f"\n{' ' * (n_spaces - 4)}"
+        )
 
 
-def convert_variation(data: dict, placeholder="PLACEHOLDER") -> str:
+def convert_default(data: dict, placeholder="PLACEHOLDER") -> str:
     """
-    Function that converts dataset info into one order Dataset per query. Stores the dataset info
-    in a dict with the dataset type as key.
+ Function that converts dataset info into one order Dataset per query """ generator = get_generator_name(data["name"]) + return f"""cpn.add_dataset( name="{placeholder}{generator}", id={data['dataset_id']}, @@ -57,9 +60,12 @@ def convert_variation(data: dict, placeholder="PLACEHOLDER") -> str: info=dict( nominal=DatasetInfo( keys=[ - "{data['name']}", # noqa + "{data['name']}", # noqa: E501 ], - n_files={data['nfiles']}, + aux={{ + "broken_files": [{get_broken_files_str(data)}], + }}, + n_files={data['nfiles_good']}, # {data["nfiles"]}-{data["nfiles_bad"]} n_events={data['nevents']}, ), ), @@ -128,9 +134,12 @@ def convert_top(data: dict, placeholder="PLACEHOLDER") -> str: info=dict( nominal=DatasetInfo( keys=[ - "{data['name']}", # noqa + "{data['name']}", # noqa: E501 ], - n_files={data['nfiles']}, + aux={{ + "broken_files": [{get_broken_files_str(data)}], + }}, + n_files={data['nfiles_good']}, # {data["nfiles"]}-{data["nfiles_bad"]} n_events={data['nevents']}, ), ), @@ -139,9 +148,12 @@ def convert_top(data: dict, placeholder="PLACEHOLDER") -> str: # comment out this dataset return f""" # {identifier}=DatasetInfo( # keys=[ - # "{data['name']}", # noqa + # "{data['name']}", # noqa: E501 # ], - # n_files={data['nfiles']}, + # aux={{ + # "broken_files": [{get_broken_files_str(data)}], + # }}, + # n_files={data['nfiles_good']}, # {data["nfiles"]}-{data["nfiles_bad"]} # n_events={data['nevents']}, # ),""" elif dataset_type == "ignore": @@ -150,9 +162,12 @@ def convert_top(data: dict, placeholder="PLACEHOLDER") -> str: # some known variation of the dataset return f""" {dataset_type}=DatasetInfo( keys=[ - "{data['name']}", # noqa + "{data['name']}", # noqa: E501 ], - n_files={data['nfiles']}, + aux={{ + "broken_files": [{get_broken_files_str(data)}], + }}, + n_files={data['nfiles_good']}, # {data["nfiles"]}-{data["nfiles_bad"]} n_events={data['nevents']}, ),""" @@ -168,25 +183,22 @@ def convert_minimal(data: dict) -> str: """ Function that only returns the dataset key + number of events. 
""" - return f"""{data['name']}\nFiles: {data['nfiles']}\nEvents: {data['nevents']}\n""" + return f"""{data['name']}\nFiles: {data['nfiles_good']}\nEvents: {data['nevents']}\n""" convert_functions = { "default": convert_default, - "variation": convert_variation, "keys": convert_keys, "top": convert_top, "minimal": convert_minimal, } -def get_das_info( - dataset: str, -) -> dict: +def load_das_info(dataset: str, add_file_info: bool = False) -> dict: from law.util import interruptable_popen # call dasgoclient command - cmd = f"dasgoclient -query='dataset={dataset}' -json" + cmd = f"dasgoclient -query='{'file ' if add_file_info else ''}dataset={dataset}' -json" code, out, _ = interruptable_popen( cmd, shell=True, @@ -196,16 +208,44 @@ def get_das_info( if code != 0: raise Exception(f"dasgoclient query failed:\n{out}") infos = json.loads(out) + + return infos + + +def get_das_info(dataset: str) -> dict: info_of_interest = {"name": dataset} - for info in infos: - dataset_info = info["dataset"][0] - # Get json format of single das_string gives multiple dictornaries with different info - # Avoid to print multiple infos twice and ask specificly for the kew of interest - if "dataset_info" in info["das"]["services"][0]: - info_of_interest["dataset_id"] = dataset_info.get("dataset_id", "") - elif "filesummaries" in info["das"]["services"][0]: - info_of_interest["nfiles"] = dataset_info.get("nfiles", "") - info_of_interest["nevents"] = dataset_info.get("nevents", "") + + file_infos = load_das_info(dataset, add_file_info=True) + + info_of_interest["dataset_id"] = file_infos[0]["file"][0]["dataset_id"] + + empty_files_filter = lambda info: info["file"][0]["nevents"] == 0 + broken_files_filter = lambda info: info["file"][0]["is_file_valid"] == 0 + + good_files = list(filter(lambda x: not broken_files_filter(x) and not empty_files_filter(x), file_infos)) + + dataset_id = {info["file"][0]["dataset_id"] for info in good_files} + if len(dataset_id) == 1: + info_of_interest["dataset_id"] = dataset_id.pop() + else: + raise ValueError(f"Multiple dataset IDs ({dataset_id}) found for dataset {dataset}") + + info_of_interest["nfiles"] = len(file_infos) + info_of_interest["nfiles_good"] = len(good_files) + info_of_interest["nevents"] = sum(info["file"][0]["nevents"] for info in good_files) + + empty_files = [ + info["file"][0]["name"] + for info in filter(empty_files_filter, file_infos) + ] + broken_files = [ + info["file"][0]["name"] + for info in filter(broken_files_filter, file_infos) + ] + info_of_interest["empty_files"] = empty_files + info_of_interest["broken_files"] = broken_files + + info_of_interest["nfiles_bad"] = len(set(empty_files + broken_files)) return info_of_interest @@ -215,8 +255,6 @@ def print_das_info( keys_of_interest: tuple | None = None, convert_function_str: str | None = None, ): - from law.util import interruptable_popen - # get the requested convert function convert_function = convert_functions[convert_function_str] @@ -224,7 +262,7 @@ def print_das_info( # set default keys of interest # NOTE: this attribute is currently not used keys_of_interest = keys_of_interest or ( - "name", "dataset_id", "nfiles", "nevents", + "name", "dataset_id", "nfiles", "nevents", "empty_files", "broken_files", ) wildcard = "*" in das_string @@ -234,16 +272,7 @@ def print_das_info( datasets.append(das_string) else: # using a wildcard leads to a different structer in json format - cmd = f"dasgoclient -query='dataset={das_string}' -json" - code, out, _ = interruptable_popen( - cmd, - shell=True, - 
-            stdout=subprocess.PIPE,
-            executable="/bin/bash",
-        )
-        if code != 0:
-            raise Exception(f"dasgoclient query failed:\n{out}")
-        infos = json.loads(out)
+        infos = load_das_info(das_string, add_file_info=False)
         for info in infos:
             dataset_name = info.get("dataset", [])[0].get("name", "")
             datasets.append(dataset_name)
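
Usage note (not part of the patch above): a minimal sketch of how the refactored helpers compose. It assumes scripts/get_das_info.py is on the import path and that dasgoclient is available in the environment; the dataset key is taken from the MuonEG era G stanza in the diff.

    # Query DAS at file granularity: get_das_info() separates good files from
    # broken/empty ones and records the bad paths under "broken_files".
    from get_das_info import get_das_info, convert_default

    data = get_das_info("/MuonEG/Run2022G-22Sep2023-v1/NANOAOD")

    # n_files counts only the good files (hence the "27-1" style comments in
    # the diff); the rendered stanza lists bad files in aux["broken_files"].
    print(convert_default(data))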