diff --git a/HLTrigger/Configuration/scripts/hltFindDuplicates b/HLTrigger/Configuration/scripts/hltFindDuplicates
index 0dd2f41c92eee..6a771dd358fb3 100755
--- a/HLTrigger/Configuration/scripts/hltFindDuplicates
+++ b/HLTrigger/Configuration/scripts/hltFindDuplicates
@@ -1,11 +1,33 @@
-#! /usr/bin/env python3
+#!/usr/bin/env python3
+"""hltFindDuplicates: script to find duplicate modules of an HLT configuration.
+
+Input.
+  Path to a local cmsRun configuration file, or stdin.
+
+Output.
+  A directory containing
+   (1) the input cmsRun configuration, and
+   (2) text files listing the groups of duplicate modules.
+
+Examples.
+
+  # input: local configuration file
+  hltFindDuplicates tmp.py -o output_dir
+
+  # input: stdin
+  hltConfigFromDB --configName /dev/CMSSW_X_Y_0/GRun/Vn | hltFindDuplicates -o output_dir
+  hltGetConfiguration /dev/CMSSW_X_Y_0/GRun/Vn | hltFindDuplicates -o output_dir -x realData=0 globalTag=@
+"""
+import os
+import sys
+import argparse
+import re
+import itertools
+import shutil
 
-from __future__ import print_function
-import sys, imp, re, itertools
-from HLTrigger.Configuration.Tools.frozendict import frozendict
 import FWCore.ParameterSet.Config as cms
 
-debug = True
+from HLTrigger.Configuration.Tools.frozendict import frozendict
 
 whitelist_types = [
   'HLTPrescaler',
@@ -22,19 +44,21 @@ whitelist_labels = [
 def whitelist(module):
     return module.label in whitelist_labels or module.type in whitelist_types
 
+def iterate(arg):
+    return (not isinstance(arg, str) and '__iter__' in dir(arg))
 
 def freeze(arg):
     if type(arg) == dict:
-        return frozendict((k, freeze(v)) for (k, v) in arg.iteritems())
-    elif '__iter__' in dir(arg):
+        return frozendict((k, freeze(v)) for (k, v) in iter(arg.items()))
+    elif iterate(arg):
        return tuple( freeze(v) for v in arg )
     else:
        return arg
 
 def unfreeze(arg):
     if type(arg) == frozendict:
-        return dict((k, unfreeze(v)) for (k, v) in arg.iteritems())
-    elif '__iter__' in dir(arg):
+        return dict((k, unfreeze(v)) for (k, v) in iter(arg.items()))
+    elif iterate(arg):
        return list( unfreeze(v) for v in arg )
     else:
        return arg
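As an aside, a minimal sketch (not part of the patch) of why parameters are frozen before hashing: nested parameter structures become hashable values, so two modules with identical parameters collide on the same key regardless of dictionary ordering. The helper name freeze_sketch and the plain-tuple stand-in for the frozendict used by the script are illustrative only.

    # illustrative only: plain sorted tuples stand in for the frozendict used by the script
    def freeze_sketch(arg):
        if isinstance(arg, dict):
            return tuple(sorted((k, freeze_sketch(v)) for k, v in arg.items()))
        if isinstance(arg, (list, tuple)):
            return tuple(freeze_sketch(v) for v in arg)
        return arg

    params_a = {'p1': 'rawDataCollector', 'p5': [1, 2, 3]}
    params_b = {'p5': [1, 2, 3], 'p1': 'rawDataCollector'}   # same content, different order
    assert hash(freeze_sketch(params_a)) == hash(freeze_sketch(params_b))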
@@ -43,68 +67,87 @@ def pythonize(arg):
     if 'parameters_' in dir(arg):
         arg = arg.parameters_()
 
-    if 'value' in dir(arg):
+    elif 'value' in dir(arg):
         arg = arg.value()
 
     if type(arg) == dict:
-        return frozendict((k, pythonize(v)) for (k, v) in arg.iteritems())
-    elif '__iter__' in dir(arg):
+        return frozendict((k, pythonize(v)) for (k, v) in iter(arg.items()))
+    elif iterate(arg):
        return tuple( pythonize(v) for v in arg )
     else:
        return arg
 
+def mkdirp(dirpath):
+    try:
+        os.makedirs(dirpath)
+    except OSError:
+        if not os.path.isdir(dirpath):
+            raise
 
 class Module(object):
-    type = ''
-    label = ''
+    type   = ''
+    label  = ''
     params = frozendict()
-    hash = 0
+    hash   = 0
 
     def __init__(self, module):
-        self.label = module.label_()
-        self.type = module.type_()
+        self.label  = module.label_()
+        self.type   = module.type_()
         self.params = pythonize(module.parameters_())
-        self.__rehash()
+        self.__rehash(self.params)
+
+    def __str__(self):
+        return f'{self.label} (type: {self.type}): {self.params}'
 
     def key(self):
         return self.hash
 
-    def __rehash(self):
-        self.hash = (hash(self.type) << 4) + hash(self.params)
+    def __rehash(self, params):
+        self.hash = (hash(self.type) << 4) + hash(params)
 
-    def __check(self, value, group):
-        return type(value) is str and bool(group.match(value))
+    def __check(self, value, check):
+        if isinstance(value, list):
+            return any(self.__check(foo, check) for foo in value)
+        elif isinstance(value, dict):
+            return any(self.__check(value[foo], check) for foo in value)
+        else:
+            return isinstance(value, str) and bool(check.match(value))
 
     def __sub(self, value, group, label):
-        if type(value) is str:
+        if isinstance(value, list):
+            return [self.__sub(foo, group, label) for foo in value]
+        elif isinstance(value, dict):
+            return {foo:self.__sub(value[foo], group, label) for foo in value}
+        elif isinstance(value, str):
             return group.sub(r'%s\2' % label, value)
         else:
             return value
 
-    def apply_rename(self, groups):
+    def apply_rename(self, groups, verbosity_level):
         modified = False
         newparams = unfreeze(self.params)
 
-        for label, (group, check) in groups.iteritems():
-            for k, p in newparams.iteritems():
-                if '__iter__' in dir(p):
-                    if any(self.__check(v, check) for v in p):
-                        newparams[k] = tuple(self.__sub(v, check, label) for v in p)
-                        modified = True
-                else:
-                    if self.__check(p, check):
-                        newparams[k] = self.__sub(p, check, label)
-                        modified = True
-
-        if modified:
-            self.params = frozendict(newparams)
-            self.__rehash()
+        if verbosity_level > 2:
+            print('')
+            print(f'    {self.label} ({self.type})')
+            print(f'      parameters before: {newparams}')
+
+        for label, (group, check) in iter(groups.items()):
+            for k, p in iter(newparams.items()):
+                if self.__check(p, check):
+                    newparams[k] = self.__sub(p, check, label)
+                    modified = True
+
+        if verbosity_level > 2:
+            print(f'      parameters after:  {newparams}')
+            print(f'      modified = {modified}')
+
+        if modified:
+            self.__rehash(frozendict(newparams))
 
 class ModuleList(object):
     modules = []
+    hashToLabelDict = {}
 
     def append(self, module):
         m = Module(module)
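For context, a small sketch (not from the patch) of the renaming idea implemented by __check/__sub: once two modules are known duplicates, any reference to either label, optionally followed by an instance label (hence the '($|:)' group), is rewritten to a common group label, so the modules consuming them become identical in turn. The group label and parameter values below are hypothetical.

    import re

    group_label = 'P1 (d1x)'                      # hypothetical label of a duplicate group
    members = ['d1x', 'm1x']
    check = re.compile(r'^(%s)($|:)' % '|'.join(members))

    params = {'d2x': 'd1x:instance', 'm2x': 'm1x:instance'}
    params = {k: check.sub(r'%s\2' % group_label, v) for k, v in params.items()}
    print(params['d2x'] == params['m2x'])         # True: both now read 'P1 (d1x):instance'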
@@ -117,98 +160,196 @@ class ModuleList(object):
     def __init__(self, *args):
         for arg in args:
-            if '__iter__' in dir(arg):
+            if iterate(arg):
                 self.extend(arg)
             else:
                 self.append(arg)
 
+    def hash_label(self, hash_value):
+        return self.hashToLabelDict.get(hash_value, None)
+
     def sort(self):
         self.modules.sort(key = Module.key)
 
     def group(self):
         groups = dict()
         self.sort()
-        i = 0
         for v, g in itertools.groupby(self.modules, Module.key):
             group = list(g)
             if len(group) > 1:
-                i = i + 1
                 g = [ m.label for m in group ]
                 g.sort()
-                l = 'hltGroup%d' %i
+                # hash identifying the group (it is the same for every module in the group)
+                g_key = group[0].key()
+                if g_key not in self.hashToLabelDict:
+                    # label identifying this group of modules
+                    # (set only once so it cannot change from step to step)
+                    self.hashToLabelDict[g_key] = f'{group[0].type} ({g[0]})'
                 r = re.compile(r'^(%s)($|:)' % r'|'.join(g))
-                groups[l] = (g, r)
+                groups[g_key] = (g, r)
         return groups
 
-    def apply_rename(self, groups):
+    def apply_rename(self, groups, verbosity_level):
         for module in self.modules:
-            module.apply_rename(groups)
+            module.apply_rename(groups, verbosity_level)
 
-    def dump(self):
+    def dump(self, indent=0):
         for m in self.modules:
-            print("%s = (%s) {" % (m.label, m.type))
-            for k, v in m.params.iteritems():
-                print("\t%s = %s" % (k, v))
-            print('}')
-            print()
-
+            print(' '*indent + "%s = (%s) {" % (m.label, m.type))
+            for k, v in iter(m.params.items()):
+                print(' '*indent + "  %s = %s" % (k, v))
+            print(' '*indent + '}\n')
 
+def findDuplicates(process, output_dir, verbosity_level):
+    mkdirp(output_dir)
 
-def findDuplicates(process):
     modules = ModuleList(
-        process._Process__analyzers.itervalues(),
-        process._Process__producers.itervalues(),
-        process._Process__filters.itervalues()
+        iter(process.analyzers_().values()),
+        iter(process.producers_().values()),
+        iter(process.filters_().values())
     )
 
     oldups = 0
     groups = modules.group()
-    dups = sum(len(g[0]) for g in groups.itervalues()) - len(groups)
+    dups = sum(len(g[0]) for g in groups.values()) - len(groups)
     index = 1
-    while(dups != oldups):
-        if debug:
-            dump = open('step%d.sed' % index, 'w')
-            for target, (group, regexp) in groups.iteritems():
-                dump.write('s#\\<\\(%s\\)\\>#%s#g\n' % ('\\|'.join(group), target))
-            dump.close()
-            dump = open('step%d.txt' % index, 'w')
-            for target, (group, regexp) in groups.iteritems():
-                dump.write('#%s\n%s\n\n' % ( target, '\n'.join(group)))
-            dump.close()
-        print("found %3d duplicates in %3d groups" % (dups, len(groups)))
+    while dups != oldups:
+        groupLabelToHashDict = {modules.hash_label(group_hash):group_hash for group_hash in groups}
+
+        dump = open(os.path.join(output_dir, f'step{index}.sed'), 'w')
+        for group_label in sorted(groupLabelToHashDict.keys()):
+            (group, regexp) = groups[groupLabelToHashDict[group_label]]
+            dump.write('s#\\<\\(%s\\)\\>#%s#g\n' % ('\\|'.join(group), group_label))
+        dump.close()
+
+        dump = open(os.path.join(output_dir, f'step{index}.txt'), 'w')
+        first_entry = True
+        for group_label in sorted(groupLabelToHashDict.keys()):
+            (group, regexp) = groups[groupLabelToHashDict[group_label]]
+            dump.write('\n'*(not first_entry) + '# %s\n%s\n' % ( group_label, '\n'.join(group)))
+            first_entry = False
+        dump.close()
+
+        if verbosity_level > 0:
+            print(f"[step {index:>2d}] found {dups:>3d} duplicates in {len(groups):>3d} groups")
+
+        if verbosity_level > 2:
+            print(f'[step {index:>2d}] groups={groups}')
+            print(f'[step {index:>2d}] ---------------')
+            print(f'[step {index:>2d}] apply_rename ..')
+
         oldups = dups
-        modules.apply_rename(groups)
+        modules.apply_rename(groups, verbosity_level)
+
+        if verbosity_level > 2:
+            print()
+            print(f'    ------------------------')
+            print(f'    modules (after renaming)')
+            print(f'    ------------------------')
+            modules.dump(indent=14)
+
         groups = modules.group()
-        dups = sum(len(g[0]) for g in groups.itervalues()) - len(groups)
-        index = index + 1
+        dups = sum(len(g[0]) for g in groups.values()) - len(groups)
+        index += 1
 
-    dump = open('groups.sed', 'w')
-    for target, (group, regexp) in groups.iteritems():
-        dump.write('s#\\<\\(%s\\)\\>#%s#\n' % ('\\|'.join(group), target))
-    dump.close()
+    groupLabelToHashDict = {modules.hash_label(group_hash):group_hash for group_hash in groups}
 
-    dump = open('groups.txt', 'w')
-    for target, (group, regexp) in groups.iteritems():
-        dump.write('#%s\n%s\n\n' % ( target, '\n'.join(group)))
+    dump = open(os.path.join(output_dir, 'groups.sed'), 'w')
+    for group_label in sorted(groupLabelToHashDict.keys()):
+        (group, regexp) = groups[groupLabelToHashDict[group_label]]
+        dump.write('s#\\<\\(%s\\)\\>#%s#\n' % ('\\|'.join(group), group_label))
     dump.close()
 
+    dump = open(os.path.join(output_dir, 'groups.txt'), 'w')
+    first_entry = True
+    for group_label in sorted(groupLabelToHashDict.keys()):
+        (group, regexp) = groups[groupLabelToHashDict[group_label]]
+        dump.write('\n'*(not first_entry) + '# %s\n%s\n' % ( group_label, '\n'.join(group)))
+        first_entry = False
+    dump.close()
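A short sketch (not part of the patch) of the bookkeeping above: itertools.groupby only merges adjacent items, which is why the module list is sorted by hash before grouping, and the duplicate count subtracts one representative per group. The hash values below are made up.

    import itertools

    hashes = [3, 1, 2, 1, 3, 1]                      # hypothetical module hashes
    hashes.sort()                                    # groupby only merges adjacent equal keys
    groups = {}
    for key, members in itertools.groupby(hashes):
        members = list(members)
        if len(members) > 1:
            groups[key] = members                    # {1: [1, 1, 1], 3: [3, 3]}

    dups = sum(len(m) for m in groups.values()) - len(groups)
    print(dups)                                      # 3: every copy beyond the first per group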
+
+##
+## main
+##
+if __name__ == '__main__':
+
+    ### args
+    parser = argparse.ArgumentParser(
+        prog = './'+os.path.basename(__file__),
+        formatter_class = argparse.RawDescriptionHelpFormatter,
+        description = __doc__,
+        argument_default = argparse.SUPPRESS,
+    )
+
+    # menu: name of ConfDB config, or local cmsRun cfg file, or stdin
+    parser.add_argument('menu',
+                        nargs = '?',
+                        metavar = 'MENU',
+                        default = None,
+                        help = 'Path to cmsRun configuration file (if not specified, stdin is used)')
+
+    # output-dir: path to directory containing output files
+    parser.add_argument('-o', '--output-dir',
+                        metavar = 'OUTPUT_DIR',
+                        default = 'hltFindDuplicates_output',
+                        help = 'Path to directory containing output files')
+
+    # menu arguments: list of arguments to be applied to the cmsRun configuration file
+    # (via argparse, VarParsing, or similar)
+    parser.add_argument('-x', '--menu-args',
+                        nargs = '+',
+                        metavar = 'MENU_ARGS',
+                        default = [],
+                        help = 'List of arguments (each without whitespace) to be applied to the cmsRun configuration file')
+
+    # verbosity level: level of verbosity of stdout/stderr printouts
+    parser.add_argument('-v', '--verbosity-level',
+                        metavar = 'VERBOSITY_LEVEL',
+                        type = int,
+                        default = 1,
+                        help = 'Verbosity level')
+
+    # parse command line arguments and options
+    opts = parser.parse_args()
+
+    print('-'*25)
+    print('hltFindDuplicates')
+    print('-'*25)
+
+    # create new output directory
+    if os.path.exists(opts.output_dir):
+        log_msg = 'Failed to create output directory (a directory or file already exists under that path)'
+        raise RuntimeError(f'{log_msg}: {opts.output_dir}')
+
+    mkdirp(opts.output_dir)
+    output_config_filepath = os.path.join(opts.output_dir, 'config.py')
+
+    print(f'output directory: {opts.output_dir}')
+    print('-'*25)
+
+    # parse the HLT configuration from a local cfg file, or from standard input
+    hlt = {'process': None, 'fragment': None}
+
+    if opts.menu != None:
+        if not os.path.isfile(opts.menu):
+            raise RuntimeError(f'Invalid path to input file (file does not exist): {opts.menu}')
+        shutil.copyfile(opts.menu, output_config_filepath)
+    else:
+        with open(output_config_filepath, 'w') as config_file:
+            config_file.write(sys.stdin.read())
 
-def main():
-    # parse the HLT configuration from standard input or from the given file
-    hlt = imp.new_module('hlt')
-    try:
-        configname = sys.argv[1]
-    except:
-        config = sys.stdin
-    else:
-        config = open(configname)
-    exec(config, globals(), hlt.__dict__)
-    config.close()
-    findDuplicates(hlt.process)
+    sys.argv = [sys.argv[0], output_config_filepath] + opts.menu_args
+    exec(open(output_config_filepath).read(), globals(), hlt)
 
+    # find cms.Process object
+    process = None
+    if hlt['process'] != None:
+        process = hlt['process']
+    if hlt['fragment'] != None:
+        process = hlt['fragment']
 
-if __name__ == "__main__":
-    main()
+    if process == None or not isinstance(process, cms.Process):
+        raise RuntimeError('Failed to find object of type cms.Process !')
+
+    findDuplicates(process, output_dir=opts.output_dir, verbosity_level=opts.verbosity_level)
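For reference, a minimal sketch (not part of the patch) of the loading mechanism used in the main block above: the configuration text is executed with a plain dict as local namespace, and the cms.Process object is then looked up by name. The file name 'my_cfg.py' is hypothetical.

    import FWCore.ParameterSet.Config as cms

    namespace = {'process': None}
    with open('my_cfg.py') as config_file:          # hypothetical cmsRun configuration file
        exec(config_file.read(), globals(), namespace)

    if not isinstance(namespace['process'], cms.Process):
        raise RuntimeError('the configuration did not define a cms.Process named "process"')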
"${SCRAM_TEST_PATH}"/test_hltFindDuplicates_cfg.py -x="--mode=0" -v 2 \ + -o test_hltFindDuplicates_mode0_output &> test_hltFindDuplicates_mode0_log \ + || die 'Failure running hltFindDuplicates (mode: 0)' $? test_hltFindDuplicates_mode0_log + +cat <<@EOF > test_hltFindDuplicates_mode0_groups_expected +# A3 (d3x) +d3x +d3y +m3x +m3y + +# F2 (d2x) +d2x +d2y +m2x +m2y + +# P1 (d1x) +d1x +d1y +m1x +m1y +@EOF + +diff test_hltFindDuplicates_mode0_groups_expected test_hltFindDuplicates_mode0_output/groups.txt \ + || die "Unexpected differences in groups.txt output of hltFindDuplicates (mode: 0)" $? + +### +### test #2: "mode == 1" +### +rm -rf test_hltFindDuplicates_mode1_output + +hltFindDuplicates "${SCRAM_TEST_PATH}"/test_hltFindDuplicates_cfg.py -x="--mode=1" -v 2 \ + -o test_hltFindDuplicates_mode1_output &> test_hltFindDuplicates_mode1_log \ + || die 'Failure running hltFindDuplicates (mode: 1)' $? test_hltFindDuplicates_mode1_log + +cat <<@EOF > test_hltFindDuplicates_mode1_groups_expected +# A3 (d3x) +d3x +d3y +m3x + +# F2 (d2x) +d2x +d2y +m2x + +# P1 (d1x) +d1x +d1y +m1x +@EOF + +diff test_hltFindDuplicates_mode1_groups_expected test_hltFindDuplicates_mode1_output/groups.txt \ + || die "Unexpected differences in groups.txt output of hltFindDuplicates (mode: 1)" $? diff --git a/HLTrigger/Configuration/test/test_hltFindDuplicates_cfg.py b/HLTrigger/Configuration/test/test_hltFindDuplicates_cfg.py new file mode 100644 index 0000000000000..9ab81636a9e8e --- /dev/null +++ b/HLTrigger/Configuration/test/test_hltFindDuplicates_cfg.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python3 +""" +Configuration file to be used as input in unit tests of the utility hltFindDuplicates. + +The configuration is made of modules labelled "d*" and "m*". + +Details on the configuration. + - For each group of modules (d* and m*), + - modules are ordered in 3 levels (e.g. d1*, d2*, d3*), and + - for every level, there are two versions (*x and *y) of the module (e.g. d1x, d1y). + - The *x (*y) modules depend only on *x (*y) modules, and not on *y (*x) modules. + - The *2* modules depend on *1* modules. + - The *3* modules depend on *1* and *2* modules. + - The m* modules are the counterparts of the d* modules. + - The m* modules do not depend on d* modules (and viceversa). + - A given m{1,2,3}{x,y} module may or may not be a duplicate of the corresponding d* module. + +The --mode option determines how the ED modules are configured. + + - mode == 0: + the m* modules are duplicates of the corresponding d* modules. + + - mode == 1: + one parameter in m1y is changed compared to d1y + and this makes all the m*y modules unique, + while the m*x modules should ultimately + be identified as duplicates of the d*x modules. +""" +import FWCore.ParameterSet.Config as cms + +import os +import argparse + +parser = argparse.ArgumentParser( + prog = 'python3 '+os.path.basename(__file__), + formatter_class = argparse.RawDescriptionHelpFormatter, + description = __doc__, + argument_default = argparse.SUPPRESS, +) + +parser.add_argument("--mode", + type = int, + default = 0, + choices = [0,1], + help = "Choose how to configure the modules." 
diff --git a/HLTrigger/Configuration/test/test_hltFindDuplicates_cfg.py b/HLTrigger/Configuration/test/test_hltFindDuplicates_cfg.py
new file mode 100644
index 0000000000000..9ab81636a9e8e
--- /dev/null
+++ b/HLTrigger/Configuration/test/test_hltFindDuplicates_cfg.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python3
+"""
+Configuration file to be used as input in unit tests of the utility hltFindDuplicates.
+
+The configuration is made of modules labelled "d*" and "m*".
+
+Details on the configuration.
+ - For each group of modules (d* and m*),
+   - modules are ordered in 3 levels (e.g. d1*, d2*, d3*), and
+   - for every level, there are two versions (*x and *y) of the module (e.g. d1x, d1y).
+ - The *x (*y) modules depend only on *x (*y) modules, and not on *y (*x) modules.
+ - The *2* modules depend on *1* modules.
+ - The *3* modules depend on *1* and *2* modules.
+ - The m* modules are the counterparts of the d* modules.
+ - The m* modules do not depend on d* modules (and vice versa).
+ - A given m{1,2,3}{x,y} module may or may not be a duplicate of the corresponding d* module.
+
+The --mode option determines how the ED modules are configured.
+
+ - mode == 0:
+   the m* modules are duplicates of the corresponding d* modules.
+
+ - mode == 1:
+   one parameter in m1y is changed compared to d1y,
+   and this makes all the m*y modules unique,
+   while the m*x modules should ultimately
+   be identified as duplicates of the d*x modules.
+"""
+import FWCore.ParameterSet.Config as cms
+
+import os
+import argparse
+
+parser = argparse.ArgumentParser(
+    prog = 'python3 '+os.path.basename(__file__),
+    formatter_class = argparse.RawDescriptionHelpFormatter,
+    description = __doc__,
+    argument_default = argparse.SUPPRESS,
+)
+
+parser.add_argument("--mode",
+                    type = int,
+                    default = 0,
+                    choices = [0,1],
+                    help = "Choose how to configure the modules."
+)
+
+args,_ = parser.parse_known_args()
+
+process = cms.Process('TEST')
+
+### "d*" modules: the duplicates
+### - the *x (*y) modules depend only on *x (*y) modules, and not on *y (*x) modules
+### - the *2* modules depend on *1* modules
+### - the *3* modules depend on *1* and *2* modules
+process.d1x = cms.EDProducer('P1',
+    p1 = cms.InputTag('rawDataCollector'),
+    p2 = cms.bool(False),
+    p3 = cms.vbool(False, True),
+    p4 = cms.uint32(1),
+    p5 = cms.vuint32(1,2,3),
+    p6 = cms.int32(-1),
+    p7 = cms.vint32(-1,2,-3),
+    p8 = cms.double(1.1),
+    p9 = cms.vdouble(2.3, 4.5)
+)
+
+process.d1y = process.d1x.clone()
+
+process.d2x = cms.EDFilter('F2',
+    p1 = cms.vint32(1, 2, 3),
+    p2 = cms.VInputTag('d1x'),
+    p3 = cms.PSet(
+        theStrings = cms.vstring('keyword1', 'keyword2')
+    )
+)
+
+process.d2y = process.d2x.clone( p2 = ['d1y'] )
+
+process.d3x = cms.EDAnalyzer('A3',
+    p1 = cms.VPSet(
+        cms.PSet(
+            pset_a = cms.PSet(
+                tag1 = cms.InputTag('d1x')
+            ),
+            pset_b = cms.PSet(
+                tag2 = cms.InputTag('d2x')
+            ),
+        )
+    ),
+    p2 = cms.PSet(
+        p_a = cms.PSet(
+            p_b = cms.PSet(
+                p_c = cms.VInputTag('d2x', 'd1x')
+            )
+        )
+    )
+)
+
+process.d3y = process.d3x.clone()
+process.d3y.p1[0].pset_a.tag1 = 'd1y'
+process.d3y.p1[0].pset_b.tag2 = 'd2y'
+process.d3y.p2.p_a.p_b.p_c = ['d2y', 'd1y']
+
+### m* modules
+### - the m* modules are the counterparts of the d* modules
+### - m* modules do not depend on d* modules (and vice versa)
+### - if mode 1 is chosen,
+###   one parameter in m1y is changed compared to d1y,
+###   and this makes all the m*y modules unique,
+###   while the m*x modules should ultimately
+###   be flagged as duplicates of the d*x modules
+process.m1x = process.d1x.clone()
+
+if args.mode == 0:
+    process.m1y = process.d1y.clone()
+elif args.mode == 1:
+    process.m1y = process.d1y.clone( p2 = True )
+
+process.m2x = process.d2x.clone( p2 = ['m1x'] )
+process.m2y = process.d2y.clone( p2 = ['m1y'] )
+
+process.m3x = process.d3x.clone()
+process.m3x.p1[0].pset_a.tag1 = 'm1x'
+process.m3x.p1[0].pset_b.tag2 = 'm2x'
+process.m3x.p2.p_a.p_b.p_c = ['m2x', 'm1x']
+
+process.m3y = process.d3y.clone()
+process.m3y.p1[0].pset_a.tag1 = 'm1y'
+process.m3y.p1[0].pset_b.tag2 = 'm2y'
+process.m3y.p2.p_a.p_b.p_c = ['m2y', 'm1y']
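Finally, a minimal sketch (not part of the patch, assuming a CMSSW environment where FWCore.ParameterSet.Config is importable) of the mode-1 twist above: cloning with one overridden parameter yields a module that no longer matches its source, which is what keeps the m*y chain out of the d*y duplicate groups.

    import FWCore.ParameterSet.Config as cms

    d1y = cms.EDProducer('P1', p2 = cms.bool(False))
    m1y_mode0 = d1y.clone()                              # identical parameters: same duplicate group
    m1y_mode1 = d1y.clone(p2 = True)                     # one parameter differs: unique module
    print(d1y.dumpPython() == m1y_mode0.dumpPython())    # True
    print(d1y.dumpPython() == m1y_mode1.dumpPython())    # False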