-
Notifications
You must be signed in to change notification settings - Fork 4.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a tool to convert data to the .raw format used as input by the HLT
- Loading branch information
Showing
3 changed files
with
361 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# convertToRaw | ||
|
||
Convert RAW data stored in one or more EDM .root files into the .raw file used as input by the HLT. | ||
|
||
``` | ||
usage: convertToRaw [-h] [-o PATH] [-f EVENTS] [-l EVENTS] [--one-file-per-lumi] FILES [FILES ...] | ||
Convert RAW data from .root format to .raw format. | ||
positional arguments: | ||
FILES input files in .root format | ||
optional arguments: | ||
-h, --help show this help message and exit | ||
-o PATH, --output PATH | ||
base path to store the output files; subdirectories based on the run number are automatically created (default: ) | ||
-f EVENTS, --events_per_file EVENTS | ||
split the output into files with at most EVENTS events (default: 50) | ||
-l EVENTS, --events_per_lumi EVENTS | ||
process at most EVENTS events in each lumisection (default: 11650) | ||
--one-file-per-lumi assume that lumisections are not split across files (and disable --events_per_lumi) (default: False) | ||
``` | ||
|
||
The default behaviour is to process a single luminosity section at a time, in order to support luminosity sections split across multiple files and a limit on the number of events in each lumisection. | ||
|
||
If neither of these features is needed (_i.e._ if lumisections are not split, and all events should be converted) the `--one-file-per-lumi` option can be used to process all data with a single job, speeding up the conversion considerably.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
# CMSSW configuration to convert EDM .root RAW data into the .raw file format
# used as input by the HLT. The converted .raw data will appear under a
# per-run subdirectory of the output path, e.g.
#   store/raw/Run2022A/MinimumBias/RAW/v1/000/run353087

import sys
import os
import FWCore.ParameterSet.Config as cms
import FWCore.ParameterSet.VarParsing as VarParsing

# a dummy process name: this job only repacks existing RAW data
process = cms.Process("FAKE")

process.maxEvents = cms.untracked.PSet(
    input = cms.untracked.int32(-1) # to be overwritten after parsing the command line options
)

process.source = cms.Source("PoolSource",
    fileNames = cms.untracked.vstring() # to be overwritten after parsing the command line options
)

# the EvFDaqDirector service manages the DAQ-style output directory layout
# (run directories, jsd files, EoLS/EoR json files) used by the writer below
process.EvFDaqDirector = cms.Service( "EvFDaqDirector",
    runNumber = cms.untracked.uint32( 0 ), # to be overwritten after parsing the command line options
    baseDir = cms.untracked.string( "" ), # to be overwritten after parsing the command line options
    buBaseDir = cms.untracked.string( "" ), # to be overwritten after parsing the command line options
    useFileBroker = cms.untracked.bool( False ),
    fileBrokerKeepAlive = cms.untracked.bool( True ),
    fileBrokerPort = cms.untracked.string( "8080" ),
    fileBrokerUseLocalLock = cms.untracked.bool( True ),
    fuLockPollInterval = cms.untracked.uint32( 2000 ),
    requireTransfersPSet = cms.untracked.bool( False ),
    selectedTransferMode = cms.untracked.string( "" ),
    mergingPset = cms.untracked.string( "" ),
    outputAdler32Recheck = cms.untracked.bool( False ),
)

# write the collected FED RAW data in the DAQ .raw file format
process.writer = cms.OutputModule("RawStreamFileWriterForBU",
    source = cms.InputTag('rawDataCollector'),
    numEventsPerFile = cms.uint32(0) # to be overwritten after parsing the command line options
)

process.endpath = cms.EndPath(process.writer)

process.load('FWCore.MessageService.MessageLogger_cfi')
process.MessageLogger.cerr.FwkReport.reportEvery = 0 # to be overwritten after parsing the command line options
|
||
# parse command line options
options = VarParsing.VarParsing ('python')

# remove the VarParsing built-in options that do not apply to this job,
# scrubbing them from VarParsing's internal bookkeeping structures
for name in ('filePrepend', 'maxEvents', 'outputFile', 'secondaryOutputFile',
             'section', 'tag', 'storePrepend', 'totalSections'):
    # these registries hold an entry for every registered option
    for registry in (options._register, options._beenSet, options._info, options._types):
        del registry[name]
    # these only hold an entry depending on the option's multiplicity/flags
    for registry in (options._singletons, options._lists,
                     options._noCommaSplit, options._noDefaultClear):
        if name in registry:
            del registry[name]
|
||
|
||
# register the options understood by this configuration, then parse the
# command line; all options are singletons
_singleton = VarParsing.VarParsing.multiplicity.singleton
_int = VarParsing.VarParsing.varType.int
_string = VarParsing.VarParsing.varType.string

for _name, _default, _type, _help in (
        ('runNumber',     0,           _int,    "Run number to use"),
        ('lumiNumber',    None,        _int,    "Luminosity section number to use"),
        ('eventsPerLumi', 11650,       _int,    "Number of events in the given luminosity section to process"),
        ('eventsPerFile', 50,          _int,    "Split the output into files with at most this number of events"),
        ('outputPath',    os.getcwd(), _string, "Output directory for the FED RAW data files")):
    options.register(_name, _default, _singleton, _type, _help)

options.parseArguments()
|
||
# check that the option values are valid; stop at the first invalid one
for invalid, message in (
        (options.runNumber == 0,     'Invalid run number'),
        (options.lumiNumber == 0,    'Invalid luminosity section number'),
        (options.eventsPerLumi == 0, 'Invalid number of events per luminosity section'),
        (options.eventsPerFile == 0, 'Invalid number of events per output file')):
    if invalid:
        sys.stderr.write(message + '\n')
        sys.exit(1)
|
||
# configure the job based on the command line options
process.source.fileNames = options.inputFiles
if options.lumiNumber is not None:
    # process only one lumisection, and cap the number of events read from it
    process.source.lumisToProcess = cms.untracked.VLuminosityBlockRange('%d:%d' % (options.runNumber, options.lumiNumber))
    process.maxEvents.input = options.eventsPerLumi
process.EvFDaqDirector.runNumber = options.runNumber
process.EvFDaqDirector.baseDir = options.outputPath
process.EvFDaqDirector.buBaseDir = options.outputPath
process.writer.numEventsPerFile = options.eventsPerFile
# log one report per output file
process.MessageLogger.cerr.FwkReport.reportEvery = options.eventsPerFile

# create the output directory, if it does not exist,
# along with the per-run subdirectory used by the EvFDaqDirector service
os.makedirs(options.outputPath, exist_ok=True)
os.makedirs('%s/run%06d' % (options.outputPath, options.runNumber), exist_ok=True)
# create an empty fu.lock file — presumably expected by EvFDaqDirector
# (cf. its fuLockPollInterval parameter); TODO confirm
open('%s/run%06d/fu.lock' % (options.outputPath, options.runNumber), 'w').close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,210 @@ | ||
#! /usr/bin/env python3 | ||
|
||
import argparse | ||
import glob | ||
import json | ||
import os, os.path | ||
import re | ||
import shutil | ||
import socket | ||
import subprocess | ||
import sys | ||
|
||
def cmsRun(config, **args):
    """Run ``cmsRun config arg=val ...``, echoing the command being executed.

    On failure, print whether cmsRun exited with an error code or was killed
    by a signal, then terminate this script with the same status.
    """
    cmd = [ 'cmsRun', config ] + [ arg + '=' + str(val) for (arg, val) in args.items() ]
    sys.stdout.write(' \\\n '.join(cmd))
    sys.stdout.write('\n\n')
    status = subprocess.run(cmd, stdout=None, stderr=None)

    # handle error conditions explicitly instead of calling
    # status.check_returncode(): check_returncode() raises CalledProcessError
    # on any nonzero status, which previously made these branches unreachable
    if status.returncode < 0:
        sys.stderr.write('error: cmsRun was killed by signal %d\n' % -status.returncode)
        sys.exit(status.returncode)
    elif status.returncode > 0:
        sys.stderr.write('error: cmsRun exited with error code %d\n' % status.returncode)
        sys.exit(status.returncode)
|
||
|
||
# default values
events_per_file = 50
events_per_lumi = 11650
output_directory = ''

# command line interface
parser = argparse.ArgumentParser(
    description='Convert RAW data from .root format to .raw format.',
    formatter_class = argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('files', type=str, metavar='FILES', nargs='+',
    help='input files in .root format')
parser.add_argument('-o', '--output', type=str, dest='output_directory', metavar='PATH', default='',
    help='base path to store the output files; subdirectories based on the run number are automatically created')
parser.add_argument('-f', '--events_per_file', type=int, dest='events_per_file', metavar='EVENTS', default=events_per_file,
    help='split the output into files with at most EVENTS events')
parser.add_argument('-l', '--events_per_lumi', type=int, dest='events_per_lumi', metavar='EVENTS', default=events_per_lumi,
    help='process at most EVENTS events in each lumisection')
parser.add_argument('--one-file-per-lumi', action='store_true', dest='one_file_per_lumi', default=False,
    help='assume that lumisections are not split across files (and disable --events_per_lumi)')

# parse the command line arguments and options
args = parser.parse_args()

# drop a single trailing slash from the output directory, if present
if args.output_directory.endswith('/') and args.output_directory:
    args.output_directory = args.output_directory[:-1]

# normalise the input file names: plain local paths (no scheme prefix,
# not LFNs under /store/, and actually present on disk) get a 'file:' prefix
files = [ f if (':' in f or f.startswith('/store/') or not os.path.exists(f)) else 'file:' + f for f in args.files ]
|
||
# extract the list of runs and lumisections in the input files

class FileInfo(object):
    """Accumulator for one (run, lumisection): event count plus the set of
    input files contributing to it."""
    def __init__(self):
        self.events = 0       # total number of events in this lumisection
        self.files = set()    # input files containing (part of) this lumisection

# header of the per-lumisection table printed by `edmFileUtil --eventsInLumis`
header = re.compile(r'^ +Run +Lumi +# Events$')
# a blank line marks the end of that table
empty = re.compile(r'^ *$')
# content[run][lumi] -> FileInfo
content = {}
|
||
for f in files:

    # collect the per-lumisection event counts with `edmFileUtil --eventsInLumis`
    output = subprocess.run(['edmFileUtil', '--eventsInLumis', f], capture_output=True, text=True)
    if output.returncode != 0:
        if output.returncode < 0:
            sys.stderr.write('error: edmFileUtil was killed by signal %d\n' % -output.returncode)
        else:
            sys.stderr.write('error: edmFileUtil exited with error code %d\n' % output.returncode)
        sys.stderr.write('\n')
        sys.stderr.write(output.stderr)
        sys.exit(output.returncode)

    # scan the tool's output for the runs/lumisections/events table:
    # it starts after the 'Run Lumi # Events' header and ends at the
    # first blank line
    parsing = False
    for line in output.stdout.splitlines():
        if not parsing:
            if header.match(line):
                # header found: the next lines are table rows
                parsing = True
            continue
        if empty.match(line):
            # blank line: the table is over
            parsing = False
            continue
        run, lumi, events = map(int, line.split())
        info = content.setdefault(run, {}).setdefault(lumi, FileInfo())
        info.events += events
        info.files.add(f)
|
||
# drop empty lumisections
for run_number, lumis in content.items():
    for lumi_number in [ls for ls, info in lumis.items() if info.events == 0]:
        del lumis[lumi_number]

# drop runs left without any lumisections
for run_number in [r for r, lumis in content.items() if not lumis]:
    del content[run_number]
|
||
# locate the CMSSW configuration file driven by this script,
# preferring the user's working area over the release area
config_name = 'HLTrigger/Tools/python/convertToRaw.py'
current_area = os.environ['CMSSW_BASE']
release_area = os.environ['CMSSW_RELEASE_BASE']

for area in (current_area, release_area):
    config_py = area + '/src/' + config_name
    if os.path.exists(config_py):
        break
else:
    sys.stderr.write('error: cannot find the configuration file %s\n' % config_name)
    sys.exit(1)
|
||
# convert the input data to FED RAW data format

# print a short summary of what was found in the input files
for run_number in sorted(content):
    for lumi_number in sorted(content[run_number]):
        info = content[run_number][lumi_number]
        print("events: %d" % info.events)
        print("file: %s" % ', '.join(info.files))
|
||
# convert each run: either in a single cmsRun job (--one-file-per-lumi), or
# one cmsRun job per lumisection followed by a merge of the per-lumi output
for run in sorted(content):

    # create the output directory structure, wiping any previous content
    run_path = args.output_directory + f'/run{run:06d}'
    shutil.rmtree(run_path, ignore_errors=True)
    os.makedirs(run_path)

    if args.one_file_per_lumi:
        # process the whole run with a single job
        lumis = sorted(content[run])
        print('found run %d, lumis %d-%d, with %d events' % (run, min(lumis), max(lumis), sum(content[run][lumi].events for lumi in lumis)))
        cmsRun(config_py, inputFiles = ','.join(files), runNumber = run, eventsPerFile = args.events_per_file, outputPath = args.output_directory)

    else:
        # process lumisections individually, then merge the output;
        # `summary` accumulates the run-level EoR (End of Run) bookkeeping
        summary = {
            'data': [0, 0, 0, 0], # [ 'events', 'files', 'lumisections', 'last lumisection' ]
            'definition': run_path + '/jsd/EoR.jsd',
            'source': socket.getfqdn() + '_' + str(os.getpid())
        }

        for lumi in sorted(content[run]):

            # process one lumisection in its own working directory
            print('found run %d, lumi %d, with %d events' % (run, lumi, content[run][lumi].events))
            lumi_path = args.output_directory + f'/run{run:06d}_ls{lumi:04d}'
            shutil.rmtree(lumi_path, ignore_errors=True)
            os.makedirs(lumi_path)
            cmsRun(config_py, inputFiles = ','.join(content[run][lumi].files), runNumber = run, lumiNumber = lumi, eventsPerLumi = args.events_per_lumi, eventsPerFile = args.events_per_file, outputPath = lumi_path)

            # merge this lumisection's data into the per-run directory

            # number of events expected to be processed:
            # a negative --events_per_lumi means "no limit"
            if args.events_per_lumi < 0:
                expected_events = content[run][lumi].events
            else:
                expected_events = min(args.events_per_lumi, content[run][lumi].events)

            # number of files expected to be created (ceiling division)
            expected_files = (expected_events + args.events_per_file - 1) // args.events_per_file

            # find the files produced by the conversion job and move them to the per-run path
            lumi_base_path = args.output_directory + f'/run{run:06d}_ls{lumi:04d}'
            lumi_path = lumi_base_path + f'/run{run:06d}'

            # jsd files: keep only the first copy, the others are identical
            # (NOTE(review): presumably every job writes the same jsd files — confirm)
            jsd_path = lumi_path + '/jsd'
            if not os.path.exists(run_path + '/jsd'):
                shutil.move(jsd_path, run_path)
            else:
                shutil.rmtree(jsd_path)

            # lumisection data and EoLS (End of Lumisection) files
            lumi_files = glob.glob(lumi_path + f'/run{run:06d}_ls{lumi:04d}_*')
            for f in lumi_files:
                shutil.move(f, run_path + '/')

            # read the partial EoR file written by this job and check that it
            # accounts for exactly the expected events, files and lumisection
            eor_file = lumi_path + f'/run{run:06d}_ls0000_EoR.jsn'
            with open(eor_file) as f:
                eor = json.load(f)
                produced_events = int(eor['data'][0])
                produced_files = int(eor['data'][1])
                produced_lumis = int(eor['data'][2])
                produced_last_lumi = int(eor['data'][3])
                assert produced_events == expected_events
                assert produced_files == expected_files
                assert produced_lumis == 1
                assert produced_last_lumi == lumi
                # fold this lumisection into the run-level summary
                summary['data'][0] += expected_events
                summary['data'][1] += expected_files
                summary['data'][2] += 1
                summary['data'][3] = lumi
            os.remove(eor_file)

            # remove the intermediate per-lumisection directory
            shutil.rmtree(lumi_base_path, ignore_errors=True)

        # write the final EoR file
        # implemented by hand instead of using json.dump() to match the style used by the DAQ tools
        eor_file = run_path + f'/run{run:06d}_ls0000_EoR.jsn'
        f = open(eor_file, 'w')
        f.write('{\n "data" : [ "%d", "%d", "%d", "%d" ],\n "definition" : "%s",\n "source" : "%s"\n}\n' % (summary['data'][0], summary['data'][1], summary['data'][2], summary['data'][3], summary['definition'], summary['source']))
        f.close()