diff --git a/Configuration/PyReleaseValidation/python/MatrixReader.py b/Configuration/PyReleaseValidation/python/MatrixReader.py
index 7448cc2d7ba25..a4c63b599e16a 100644
--- a/Configuration/PyReleaseValidation/python/MatrixReader.py
+++ b/Configuration/PyReleaseValidation/python/MatrixReader.py
@@ -55,7 +55,8 @@ def reset(self, what='all'):
                          'relval_identity':'id-',
                          'relval_machine': 'mach-',
                          'relval_premix': 'premix-',
-                         'relval_nano':'nano-'
+                         'relval_nano':'nano-',
+                         'relval_data_highstats':'data-'
                          }

        self.files = ['relval_standard' ,
@@ -73,7 +74,8 @@ def reset(self, what='all'):
                      'relval_identity',
                      'relval_machine',
                      'relval_premix',
-                     'relval_nano'
+                     'relval_nano',
+                     'relval_data_highstats'
                      ]
        self.filesDefault = {'relval_standard':True ,
                             'relval_highstats':True ,
@@ -90,7 +92,8 @@ def reset(self, what='all'):
                             'relval_identity':False,
                             'relval_machine':True,
                             'relval_premix':True,
-                            'relval_nano':True
+                            'relval_nano':True,
+                            'relval_data_highstats':False
                             }

        self.relvalModule = None
diff --git a/Configuration/PyReleaseValidation/python/MatrixUtil.py b/Configuration/PyReleaseValidation/python/MatrixUtil.py
index 1988efd672466..afd4fa63eac2b 100644
--- a/Configuration/PyReleaseValidation/python/MatrixUtil.py
+++ b/Configuration/PyReleaseValidation/python/MatrixUtil.py
@@ -103,7 +103,7 @@ def selectedLS(list_runs=[],maxNum=-1,l_json=data_json2015):

 InputInfoNDefault=2000000
 class InputInfo(object):
-    def __init__(self,dataSet,dataSetParent='',label='',run=[],ls={},files=1000,events=InputInfoNDefault,split=10,location='CAF',ib_blacklist=None,ib_block=None) :
+    def __init__(self,dataSet,dataSetParent='',label='',run=[],ls={},files=1000,events=InputInfoNDefault,split=10,location='CAF',ib_blacklist=None,ib_block=None,skimEvents=False) :
         self.run = run
         self.ls = ls
         self.files = files
@@ -115,29 +115,39 @@ def __init__(self,dataSet,dataSetParent='',label='',run=[],ls={},files=1000,even
         self.ib_blacklist = ib_blacklist
         self.ib_block = ib_block
         self.dataSetParent = dataSetParent
-
+        self.skimEvents = skimEvents
+
     def das(self, das_options, dataset):
-        if len(self.run) != 0 or self.ls:
+        if not self.skimEvents and (len(self.run) != 0 or self.ls):
             queries = self.queries(dataset)
             if len(self.run) != 0:
-                command = ";".join(["dasgoclient %s --query '%s'" % (das_options, query) for query in queries])
+                command = ";".join(["dasgoclient %s --query '%s'" % (das_options, query) for query in queries])
             else:
                 lumis = self.lumis()
                 commands = []
                 while queries:
-                    commands.append("dasgoclient %s --query 'lumi,%s' --format json | das-selected-lumis.py %s " % (das_options, queries.pop(), lumis.pop()))
+                    commands.append("dasgoclient %s --query 'lumi,%s' --format json | das-selected-lumis.py %s " % (das_options, queries.pop(), lumis.pop()))
                 command = ";".join(commands)
             command = "({0})".format(command)
-        else:
+        elif not self.skimEvents:
             command = "dasgoclient %s --query '%s'" % (das_options, self.queries(dataset)[0])
-
+        elif self.skimEvents:
+            from os import getenv
+            if getenv("CMSSW_USE_IBEOS","false")=="true":
+                # to be assured that whatever happens the files are only those at CERN
+                command = "das-up-to-nevents.py -d %s -e %d -s T2_CH_CERN"%(dataset,self.events)
+            else:
+                command = "das-up-to-nevents.py -d %s -e %d"%(dataset,self.events)

         # Run filter on DAS output
         if self.ib_blacklist:
             command += " | grep -E -v "
             command += " ".join(["-e '{0}'".format(pattern) for pattern in self.ib_blacklist])
-        from os import getenv
-        if getenv("CMSSW_USE_IBEOS","false")=="true": return command + " | ibeos-lfn-sort"
-        return command + " | sort -u"
+        if not self.skimEvents: ## keep run-lumi sorting
+            from os import getenv
+            if getenv("CMSSW_USE_IBEOS","false")=="true": return command + " | ibeos-lfn-sort"
+            return command + " | sort -u"
+        else:
+            return command

     def lumiRanges(self):
         if len(self.run) != 0:
@@ -145,7 +155,7 @@ def lumiRanges(self):
         if self.ls :
             return "echo '{\n"+",".join(('"%d" : %s\n'%( int(x),self.ls[x]) for x in self.ls.keys()))+"}'"
         return None
-
+
     def lumis(self):
         query_lumis = []
         if self.ls:
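
For orientation, a minimal sketch of the two kinds of commands `InputInfo.das()` now builds. This is not part of the patch; the dataset name is only an example and the echoed commands are approximate, assuming the usual `InputInfo.queries()` behaviour:

```python
# Minimal sketch (illustration only) of the das() output in the two modes.
from Configuration.PyReleaseValidation.MatrixUtil import InputInfo

ds = '/Muon0/Run2024B-v1/RAW'

# Classic mode: run/lumi-driven dasgoclient query, piped to sort -u.
classic = InputInfo(dataSet=ds, run=[378985])
print(classic.das('--limit 0', ds))
# roughly: (dasgoclient --limit 0 --query 'file dataset=/Muon0/Run2024B-v1/RAW run=378985') | sort -u

# skimEvents mode: file selection is delegated to das-up-to-nevents.py,
# whose output is already run/lumi ordered, so no sorting is appended.
skim = InputInfo(dataSet=ds, events=150000, skimEvents=True)
print(skim.das('--limit 0', ds))
# roughly: das-up-to-nevents.py -d /Muon0/Run2024B-v1/RAW -e 150000
```
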
command + " | sort -u" + if not self.skimEvents: ## keep run-lumi sorting + from os import getenv + if getenv("CMSSW_USE_IBEOS","false")=="true": return command + " | ibeos-lfn-sort" + return command + " | sort -u" + else: + return command def lumiRanges(self): if len(self.run) != 0: @@ -145,7 +155,7 @@ def lumiRanges(self): if self.ls : return "echo '{\n"+",".join(('"%d" : %s\n'%( int(x),self.ls[x]) for x in self.ls.keys()))+"}'" return None - + def lumis(self): query_lumis = [] if self.ls: diff --git a/Configuration/PyReleaseValidation/python/relval_data_highstats.py b/Configuration/PyReleaseValidation/python/relval_data_highstats.py new file mode 100644 index 0000000000000..9d676ae1386ff --- /dev/null +++ b/Configuration/PyReleaseValidation/python/relval_data_highstats.py @@ -0,0 +1,30 @@ +# import the definition of the steps and input files: +from Configuration.PyReleaseValidation.relval_steps import * + +# here only define the workflows as a combination of the steps defined above: +workflows = Matrix() + +## Here we define higher (>50k events) stats data workflows +## not to be run as default. 150k, 250k, 500k or 1M events each + +## 2024 +base_wf_number_2024 = 2024.0 +offset_era = 0.1 # less than 10 eras +offset_pd = 0.001 # less than 100 pds +offset_events = 0.0001 # less than 10 event setups (50k,150k,250k,500k) + +for e_n,era in enumerate(eras_2024): + for p_n,pd in enumerate(pds_2024): + for e_key,evs in event_steps_dict.items(): + if "50k" == e_key: # already defined in relval_standard + continue + wf_number = base_wf_number_2024 + wf_number = wf_number + offset_era * e_n + wf_number = wf_number + offset_pd * p_n + wf_number = wf_number + offset_events * evs + wf_number = round(wf_number,6) + step_name = "Run" + pd + era.split("Run")[1] + "_" + e_key + workflows[wf_number] = ['',[step_name,'HLTDR3_2024','AODNANORUN3_reHLT_2024','HARVESTRUN3_2024']] + + + diff --git a/Configuration/PyReleaseValidation/python/relval_highstats.py b/Configuration/PyReleaseValidation/python/relval_highstats.py index c74abc4e8a5d9..6069eb9117384 100644 --- a/Configuration/PyReleaseValidation/python/relval_highstats.py +++ b/Configuration/PyReleaseValidation/python/relval_highstats.py @@ -89,3 +89,26 @@ +## 2024 Data Higher Stats Workflows +## with 150k, 250k, 500k or 1M events each + +base_wf_number_2024 = 2024.0 +offset_era = 0.1 # less than 10 eras +offset_pd = 0.001 # less than 100 pds +offset_events = 0.0001 # less than 10 event setups (50k,150k,250k,500k) + +for e_n,era in enumerate(eras_2024): + for p_n,pd in enumerate(pds_2024): + for e_key,evs in event_steps_dict.items(): + if "50k" in e_key: # already defined in relval_standard + continue + wf_number = base_wf_number_2024 + wf_number = wf_number + offset_era * e_n + wf_number = wf_number + offset_pd * p_n + wf_number = wf_number + offset_events * evs + wf_number = round(wf_number,6) + step_name = "Run" + pd + era.split("Run")[1] + "_" + e_key + workflows[wf_number] = ['',[step_name,'HLTDR3_2024','AODNANORUN3_reHLT_2024','HARVESTRUN3_2024']] + + + diff --git a/Configuration/PyReleaseValidation/python/relval_standard.py b/Configuration/PyReleaseValidation/python/relval_standard.py index 3ca6cd4e44bce..31a82e1dbc468 100644 --- a/Configuration/PyReleaseValidation/python/relval_standard.py +++ b/Configuration/PyReleaseValidation/python/relval_standard.py @@ -415,6 +415,7 @@ workflows[136.903] = ['', ['RunDoubleMuon2017B', 'TauEmbedding_Selection_2017', 'TauEmbedding_Cleaning_2017', 'TauEmbedding_GenPreHLT_2017', 'TauEmbedding_GenHLT_2017', 
diff --git a/Configuration/PyReleaseValidation/python/relval_standard.py b/Configuration/PyReleaseValidation/python/relval_standard.py
index 3ca6cd4e44bce..31a82e1dbc468 100644
--- a/Configuration/PyReleaseValidation/python/relval_standard.py
+++ b/Configuration/PyReleaseValidation/python/relval_standard.py
@@ -415,6 +415,7 @@
 workflows[136.903] = ['', ['RunDoubleMuon2017B', 'TauEmbedding_Selection_2017', 'TauEmbedding_Cleaning_2017', 'TauEmbedding_GenPreHLT_2017', 'TauEmbedding_GenHLT_2017', 'TauEmbedding_GenPostHLT_2017', 'TauEmbedding_Merging_2017']]
 workflows[136.904] = ['', ['RunDoubleMuon2018C', 'TauEmbedding_Selection_2018', 'TauEmbedding_Cleaning_2018', 'TauEmbedding_GenPreHLT_2018', 'TauEmbedding_GenHLT_2018', 'TauEmbedding_GenPostHLT_2018', 'TauEmbedding_Merging_2018']]
+
 ### run 2021 collisions ###
 workflows[139.001] = ['RunMinimumBias2021',['RunMinimumBias2021','HLTDR3_2022','RECODR3_reHLT_MinBiasOffline','HARVESTD2021MB_reHLT']]
 workflows[139.002] = ['',['RunZeroBias2021','HLTDR3_2022','RECODR3_reHLT_ZBOffline','HARVESTD2021ZB_reHLT']]
@@ -558,6 +559,21 @@
 workflows[142.901] = ['',['RunUPC2023','RECODR3_2024_UPC','HARVESTDPROMPTR3']]
 workflows[142.902] = ['',['RunUPC2023','RECODR3_2024_HIN','HARVESTDPROMPTR3']]

+## 2024 Data Workflows (default 50k-event versions)
+base_wf_number_2024 = 2024.0
+offset_era = 0.1   # less than 10 eras
+offset_pd = 0.001  # less than 100 pds
+
+for e_n,era in enumerate(eras_2024):
+    for p_n,pd in enumerate(pds_2024):
+        wf_number = base_wf_number_2024
+        wf_number = wf_number + offset_era * e_n
+        wf_number = wf_number + offset_pd * p_n
+        wf_number = wf_number + 0.0001 * 0.05 # offset_events for the 50k setup
+        wf_number = round(wf_number,6)
+        step_name = "Run" + pd + era.split("Run")[1] + "_50k"
+        workflows[wf_number] = ['',[step_name,'HLTDR3_2024','AODNANORUN3_reHLT_2024','HARVESTRUN3_2024']]
+
 ### fastsim ###
 workflows[5.1] = ['TTbarFS', ['TTbarFS','HARVESTFS']]
 workflows[5.2] = ['SingleMuPt10FS', ['SingleMuPt10FS','HARVESTFS']]
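
A quick sanity check (illustration only) that the event-count term keeps these default 50k numbers distinct from the high-stats ones defined in relval_data_highstats.py and relval_highstats.py:

```python
# The last digits encode the event setup and always stay below the 0.001
# pd offset, so no two (era, pd, events) combinations collide.
for evs in [0.05, 0.15, 0.25, 0.5, 1]:  # event_steps, in millions
    print(round(2024.0 + 0.0001 * evs, 6))
# 2024.000005, 2024.000015, 2024.000025, 2024.00005, 2024.0001
```
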
diff --git a/Configuration/PyReleaseValidation/python/relval_steps.py b/Configuration/PyReleaseValidation/python/relval_steps.py
index bea88e21c1a36..53a1dc00a7824 100644
--- a/Configuration/PyReleaseValidation/python/relval_steps.py
+++ b/Configuration/PyReleaseValidation/python/relval_steps.py
@@ -44,6 +44,10 @@

 steps = Steps()

+#### Event-count setups for the data workflows
+event_steps = [0.05,0.15,0.25,0.5,1] # in millions
+event_steps_k = ["50k","150k","250k","500k","1M"]
+event_steps_dict = dict(zip(event_steps_k,event_steps))
 #### Production test section ####
 steps['ProdMinBias']=merge([{'cfg':'MinBias_8TeV_pythia8_TuneCUETP8M1_cff','--relval':'9000,300'},step1Defaults])
 steps['ProdTTbar']=merge([{'cfg':'TTbar_8TeV_TuneCUETP8M1_cfi','--relval':'9000,100'},step1Defaults])
@@ -478,7 +482,13 @@
 # UL AOD
 steps['RunJetHT2018D_reminiaodUL']={'INPUT':InputInfo(dataSet='/JetHT/Run2018D-12Nov2019_UL2018-v4/AOD',label='2018DrmaodUL',events=100000,location='STD', ls=Run2018D)}

-#### run3 ####
+####################################
+#### Run3 ##########################
+####################################
+
+### 2022
+
+## Collisions at 900 GeV and ramp-up to 13.6 TeV
 Run2022A={353015: [[1, 100]]}
 steps['RunMinimumBias2022A']={'INPUT':InputInfo(dataSet='/MinimumBias/Run2022A-v1/RAW',label='2022A',events=100000,location='STD', ls=Run2022A)}
 steps['RunSingleMuon2022A']={'INPUT':InputInfo(dataSet='/SingleMuon/Run2022A-v1/RAW',label='2022A',events=100000,location='STD', ls=Run2022A)}
@@ -497,7 +507,7 @@
 steps['RunDoubleMuon2022A']={'INPUT':InputInfo(dataSet='/DoubleMuon/Run2022A-v1/RAW',label='2022A',events=100000,location='STD', ls=Run2022A)}
 steps['RunMuonEG2022A']={'INPUT':InputInfo(dataSet='/MuonEG/Run2022A-v1/RAW',label='2022A',events=100000,location='STD', ls=Run2022A)}

-Run2022B={355769: [[1, 106]]}
+Run2022B={355769: [[1, 106]]} ## this could be raised to "355769": [[1, 541]]
 steps['RunMinimumBias2022B']={'INPUT':InputInfo(dataSet='/MinimumBias/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}
 steps['RunSingleMuon2022B']={'INPUT':InputInfo(dataSet='/SingleMuon/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}
 steps['RunZeroBias2022B']={'INPUT':InputInfo(dataSet='/ZeroBias/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}
@@ -514,7 +524,6 @@
 steps['RunTau2022B']={'INPUT':InputInfo(dataSet='/Tau/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}
 steps['RunDoubleMuon2022B']={'INPUT':InputInfo(dataSet='/DoubleMuon/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}
 steps['RunMuonEG2022B']={'INPUT':InputInfo(dataSet='/MuonEG/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}
-#steps['RunParkingBPH2022B']={'INPUT':InputInfo(dataSet='/ParkingBPH/Run2022B-v1/RAW',label='2022B',events=100000,location='STD', ls=Run2022B)}

 Run2022C={356381: [[1, 1193]]}
 Run2022C_LS40={356381: [[1, 40]]}
@@ -576,7 +585,7 @@
 # reMINIAOD for 2022
 steps['RunJetMET2022D_reMINI']={'INPUT':InputInfo(dataSet='/JetMET/Run2022D-16Jun2023-v1/AOD',label='rmaod',events=100000,location='STD', ls=Run2022D_LS25)}

-#### run3 ####
+### 2023
 Run2023B={366727: [[1, 244]]}
 steps['RunMuon2023B']={'INPUT':InputInfo(dataSet='/Muon0/Run2023B-v1/RAW',label='2023B',events=100000,location='STD', ls=Run2023B)}
 steps['RunZeroBias2023B']={'INPUT':InputInfo(dataSet='/ZeroBias/Run2023B-v1/RAW',label='2023B',events=100000,location='STD', ls=Run2023B)}
@@ -625,6 +634,23 @@
 RunHI2023={375491: [[100, 100]]}
 steps['RunHIPhysicsRawPrime2023A']={'INPUT':InputInfo(dataSet='/HIPhysicsRawPrime0/HIRun2023A-v1/RAW',label='HI2023A',events=100000,location='STD', ls=RunHI2023)}

+### Golden Data Wfs
+# good runs are read directly from the latest golden json in
+# /eos/user/c/cmsdqm/www/CAF/certification/ (if available) or, as a
+# fallback, from https://cms-service-dqmdc.web.cern.ch/CAF/certification/
+
+### 2024
+# the number of events limits the files used as input
+
+pds_2024 = ['BTagMu', 'DisplacedJet', 'EGamma0', 'HcalNZS', 'JetMET0', 'Muon0', 'MuonEG', 'NoBPTX', 'ParkingDoubleMuonLowMass0', 'ParkingHH', 'ParkingLLP', 'ParkingSingleMuon0', 'ParkingVBF0', 'Tau', 'ZeroBias']
+eras_2024 = ['Run2024B', 'Run2024C', 'Run2024D', 'Run2024E', 'Run2024F']
+for era in eras_2024:
+    for pd in pds_2024:
+        dataset = "/" + pd + "/" + era + "-v1/RAW"
+        for e_key,evs in event_steps_dict.items():
+            step_name = "Run" + pd + era.split("Run")[1] + "_" + e_key
+            steps[step_name] = {'INPUT':InputInfo(dataSet=dataset,label=era.split("Run")[1],events=int(evs*1e6), skimEvents=True, location='STD')}
+
 # Highstat HLTPhysics
 Run2015DHS=selectedLS([258712,258713,258714,258741,258742,258745,258749,258750,259626,259637,259683,259685,259686,259721,259809,259810,259818,259820,259821,259822,259862,259890,259891])
 steps['RunHLTPhy2015DHS']={'INPUT':InputInfo(dataSet='/HLTPhysics/Run2015D-v1/RAW',label='2015DHS',events=100000,location='STD', ls=Run2015DHS)}
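
To make the generated names concrete, a small illustration (not part of the patch) of what the loop above registers, namely 5 eras x 15 PDs x 5 event setups = 375 input steps:

```python
# step-name construction, exactly as in the loop above
era, pd, e_key = 'Run2024F', 'Muon0', '1M'
print("Run" + pd + era.split("Run")[1] + "_" + e_key)  # RunMuon02024F_1M
# -> steps['RunMuon02024F_1M'] holds InputInfo(dataSet='/Muon0/Run2024F-v1/RAW',
#    label='2024F', events=1000000, skimEvents=True, location='STD')
```
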
@@ -2166,6 +2192,8 @@ def lhegensim2018ml(fragment,howMuch):
 steps['HLTDR3_2023B']=merge( [ {'-s':'L1REPACK:Full,HLT:@%s'%hltKey2024,},{'--conditions':'auto:run3_hlt_relval'},{'--era' : 'Run3'},steps['HLTD'] ] )

+steps['HLTDR3_2024']=merge( [ {'-s':'L1REPACK:Full,HLT:@%s'%hltKey2024,},{'--conditions':'auto:run3_hlt_relval'},{'--era' : 'Run3_2024'},steps['HLTD'] ] )
+
 steps['HLTDR3_HI2023ARawprime']=merge([{'-s':'L1REPACK:Full,HLT:HIon'},
                                        {'--conditions':'auto:run3_hlt_HIon'},
                                        {'--era' : 'Run3_pp_on_PbPb_approxSiStripClusters_2023'},
@@ -2696,10 +2724,11 @@ def lhegensim2018ml(fragment,howMuch):
 steps['RECODR3_2023']=merge([{'--era':'Run3_2023'},steps['RECODR3']])
 steps['RECODR3_2024']=merge([{'--era':'Run3_2024'},steps['RECODR3']])
-
+
 steps['RECODR3_reHLT_2022']=merge([{'--conditions':'auto:run3_data_relval', '--hltProcess':'reHLT'},steps['RECODR3']])
 steps['RECODR3_reHLT_2023']=merge([{'--conditions':'auto:run3_data_prompt_relval', '--hltProcess':'reHLT'},steps['RECODR3_2023']])
 steps['RECODR3_reHLT_2023B']=merge([{'--conditions':'auto:run3_data_prompt_relval', '--hltProcess':'reHLT'},steps['RECODR3']])
+steps['RECODR3_reHLT_2024']=merge([{'--conditions':'auto:run3_data_prompt_relval', '--hltProcess':'reHLT'},steps['RECODR3']])

 steps['RECODR3_2023_HIN']=merge([{'--conditions':'auto:run3_data_prompt', '-s':'RAW2DIGI,L1Reco,RECO,DQM:@commonFakeHLT+@standardDQMFakeHLT', '--repacked':'', '-n':1000},steps['RECODR3_2023']])
 steps['RECODR3_2023_UPC']=merge([{'--era':'Run3_2023_UPC'},steps['RECODR3_2023_HIN']])
@@ -3058,6 +3087,8 @@ def gen2023HiMix(fragment,howMuch):
 steps['RECOHIRUN3_reHLT_2023']=merge([{'-s':'RAW2DIGI,L1Reco,RECO,PAT,DQM:@standardDQM','--datatier':'RECO,MINIAOD,DQMIO','--eventcontent':'RECO,MINIAOD,DQM','--era':'Run3_pp_on_PbPb_approxSiStripClusters_2023','--conditions':'auto:run3_data_HIon'},steps['RECODR3_reHLT_2023']])

+steps['AODNANORUN3_reHLT_2024']=merge([{'-s':'RAW2DIGI,L1Reco,RECO,PAT,NANO,DQM:@standardDQM+@miniAODDQM+@nanoAODDQM','--datatier':'AOD,MINIAOD,NANOAOD,DQMIO','--eventcontent':'AOD,MINIAOD,NANOEDMAOD,DQM'},steps['RECODR3_reHLT_2024']])
+
 # patatrack validation in data
 steps['RecoData_Patatrack_AllGPU_Validation_2023'] = merge([{'-s':'RAW2DIGI:RawToDigi_pixelOnly+RawToDigi_ecalOnly+RawToDigi_hcalOnly,RECO:reconstruction_pixelTrackingOnly+reconstruction_ecalOnly+reconstruction_hcalOnly,DQM:@pixelTrackingOnlyDQM+@ecalOnly+@hcalOnly+@hcal2Only',
                                                             '--conditions':'auto:run3_data_prompt',
@@ -3788,6 +3819,7 @@ def gen2023HiMix(fragment,howMuch):
 steps['HARVESTRUN3_COS_2022']=merge([{'--data':'', '--scenario':'cosmics', '--era':'Run3', '-s':'HARVESTING:dqmHarvesting'},steps['HARVESTDRUN3']])
 steps['HARVESTRUN3_2023']=merge([{'--era':'Run3_2023', '-s':'HARVESTING:@standardDQM+@miniAODDQM+@nanoAODDQM'},steps['HARVESTRUN3_2022']])
 steps['HARVESTRUN3_2023B']=merge([{'--era':'Run3', '-s':'HARVESTING:@standardDQM+@miniAODDQM+@nanoAODDQM'},steps['HARVESTRUN3_2022']])
+steps['HARVESTRUN3_2024']=merge([{'--era':'Run3', '-s':'HARVESTING:@standardDQM+@miniAODDQM+@nanoAODDQM'},steps['HARVESTDRUN3']])

 steps['HARVESTRUN3_HI2023A']=merge([{'--era':'Run3_pp_on_PbPb_approxSiStripClusters_2023', '-s':'HARVESTING:@standardDQM+@miniAODDQM'},steps['HARVESTRUN3_2022']])
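
Putting the pieces together, each 2024 data workflow chains four of the steps defined above. A minimal sketch, assuming a CMSSW environment where relval_standard is importable (the number corresponds to BTagMu/Run2024B at 50k, see the arithmetic example earlier):

```python
from Configuration.PyReleaseValidation.relval_standard import workflows

print(workflows[2024.000005])
# ['', ['RunBTagMu2024B_50k',       # input files picked by das-up-to-nevents.py
#       'HLTDR3_2024',              # L1 repack + HLT re-run with era Run3_2024
#       'AODNANORUN3_reHLT_2024',   # RECO/PAT/NANO + DQM on the reHLT output
#       'HARVESTRUN3_2024']]        # DQM harvesting
```
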
executable="/bin/bash").decode('utf8') + return out.split("\n") + +def das_file_site(dataset, site): + cmd = "dasgoclient --query='file dataset=%s site=%s'"%(dataset,site) + out = das_do_command(cmd) + df = pd.DataFrame(out,columns=["file"]) + + return df + +def das_file_data(dataset,opt=""): + cmd = "dasgoclient --query='file dataset=%s %s| grep file.name, file.nevents'"%(dataset,opt) + + out = das_do_command(cmd) + out = [np.array(r.split(" "))[[0,3]] for r in out if len(r) > 0] + + df = pd.DataFrame(out,columns=["file","events"]) + df.events = df.events.values.astype(int) + + return df + +def das_lumi_data(dataset,opt=""): + cmd = "dasgoclient --query='file,lumi,run dataset=%s %s'"%(dataset,opt) + + out = das_do_command(cmd) + out = [r.split(" ") for r in out if len(r)>0] + + df = pd.DataFrame(out,columns=["file","run","lumis"]) + + return df + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + parser.add_argument('--dataset','-d', default=None, help="Dataset Name (e.g. '/DisplacedJet/Run2024C-v1/RAW' )",type=str,required=True) + parser.add_argument('--threshold','-t', help ="Event threshold per file",type=int,default=-1) + parser.add_argument('--events','-e', help ="Tot number of events targeted",type=int,default=-1) + parser.add_argument('--outfile','-o', help='Dump results to file', type=str, default=None) + parser.add_argument('--pandas', '-pd',action='store_true',help="Store the whole dataset (no event or threshold cut) in a csv") + parser.add_argument('--proxy','-p', help='Allow to parse a x509 proxy if needed', type=str, default=None) + parser.add_argument('--site','-s', help='Only data at specific site', type=str, default=None) + args = parser.parse_args() + + if args.proxy is not None: + os.environ["X509_USER_PROXY"] = args.proxy + elif "X509_USER_PROXY" not in os.environ: + print("No X509 proxy set. 
Exiting.") + sys.exit(1) + + dataset = args.dataset + events = args.events + threshold = args.threshold + outfile = args.outfile + site = args.site + + ## get the greatest golden json + year = dataset.split("Run")[1][2:4] # from 20XX to XX + PD = dataset.split("/")[1] + cert_type = "Collisions" + str(year) + if "Cosmics" in dataset: + cert_type = "Cosmics" + str(year) + elif "Commisioning" in dataset: + cert_type = "Commisioning2020" + elif "HI" in PD: + cert_type = "Collisions" + str(year) + "HI" + + cert_path = base_cert_path + cert_type + "/" + web_fallback = False + + if os.path.isdir(cert_path): + json_list = os.listdir(cert_path) + if len(json_list) == 0: + web_fallback == True + json_list = [c for c in json_list if "Golden" in c and "era" not in c] + json_list = [c for c in json_list if c.startswith("Cert_C") and c.endswith("json")] + else: + web_fallback = True + + if web_fallback: + cert_url = base_cert_url + cert_type + "/" + json_list = get_url_clean(cert_url).split("\n") + json_list = [c for c in json_list if "Golden" in c and "era" not in c] + json_list = [[cc for cc in c.split(" ") if cc.startswith("Cert_C") and cc.endswith("json")][0] for c in json_list] + + # the larger the better, assuming file naming schema + # Cert_X_RunStart_RunFinish_Type.json + run_ranges = [int(c.split("_")[3]) - int(c.split("_")[2]) for c in json_list] + latest_json = np.array(json_list[np.argmax(run_ranges)]).reshape(1,-1)[0].astype(str) + best_json = str(latest_json[0]) + if not web_fallback: + with open(cert_path + "/" + best_json) as js: + golden = json.load(js) + else: + golden = get_url_clean(cert_url + best_json) + golden = ast.literal_eval(golden) #converts string to dict + + # golden json with all the lumisections + golden_flat = {} + for k in golden: + R = [] + for r in golden[k]: + R = R + [f for f in range(r[0],r[1]+1)] + golden_flat[k] = R + + # building the dataframe, cleaning for bad lumis + df = das_lumi_data(dataset).merge(das_file_data(dataset),on="file",how="inner") # merge file informations with run and lumis + df = df[df["run"].isin(list(golden.keys()))] # skim for golden runs + df["lumis"] = [[int(ff) for ff in f.replace("[","").replace("]","").split(",")] for f in df.lumis.values] + df_rs = [] + for r in golden_flat: + cut = (df["run"] == r) + if not any(cut): + continue + + df_r = df[cut] + + # jumping low event content runs + if df_r["events"].sum() < threshold: + continue + + good_lumis = np.array([len([ll for ll in l if ll in golden_flat[r]]) for l in df_r.lumis]) + n_lumis = np.array([len(l) for l in df_r.lumis]) + df_rs.append(df_r[good_lumis==n_lumis]) + + if len(df_rs) == 0: + print("No intersection between:") + print(" - json : ", best_json) + print(" - dataset: ", dataset) + print("Exiting.") + sys.exit(1) + + df = pd.concat(df_rs) + df.loc[:,"min_lumi"] = [min(f) for f in df.lumis] + df.loc[:,"max_lumi"] = [max(f) for f in df.lumis] + df = df.sort_values(["run","min_lumi","max_lumi"]) + + if site is not None: + df = df.merge(das_file_site(dataset,site),on="file",how="inner") + + if args.pandas: + df.to_csv(dataset.replace("/","")+".csv") + + if events > 0: + df = df[df["events"] <= events] #jump too big files + df.loc[:,"sum_evs"] = df.loc[:,"events"].cumsum() + df = df[df["sum_evs"] < events] + + files = df.file + + if outfile is not None: + with open(outfile, 'w') as f: + for line in files: + f.write(f"{line}\n") + else: + print("\n".join(files)) + + sys.exit(0) + +