From 94594d3f1c533a129a74da7485da72cd13fef7eb Mon Sep 17 00:00:00 2001
From: mauro verzetti
Date: Wed, 10 Oct 2018 17:33:32 +0200
Subject: [PATCH 01/13] bugfix

---
 macros/kmeans_reweight.py | 86 +++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 49 deletions(-)

diff --git a/macros/kmeans_reweight.py b/macros/kmeans_reweight.py
index 20a42473df1bd..6f3da450e8b6f 100644
--- a/macros/kmeans_reweight.py
+++ b/macros/kmeans_reweight.py
@@ -12,8 +12,11 @@
 parser.add_argument(
    '--nthreads', default=10, type=int
 )
+parser.add_argument(
+   '--test', action='store_true',
+)
 args = parser.parse_args()
-dataset = 'all'
+dataset = 'test' if args.test else 'all'
 
 import matplotlib.pyplot as plt
 #import ROOT
@@ -50,12 +53,16 @@
 
 from sklearn.cluster import KMeans
 clusterizer = KMeans(n_clusters=args.nbins, n_jobs=-2)
-clusterizer.fit(data[data.is_e][reweight_feats])
+clusterizer.fit(data[reweight_feats]) #fit(data[data.is_e][reweight_feats])
 
 data['cluster'] = clusterizer.predict(data[reweight_feats])
 weights = {}
 for cluster, group in data.groupby('cluster'):
-   weight = group.shape[0]/float(group.is_e.sum())
+   nbkg = np.invert(group.is_e).sum()
+   nsig = group.is_e.sum()
+   if not nbkg: raise RuntimeError('cluster %d has no background events, reduce the number of bins!' % cluster)
+   elif not nsig: raise RuntimeError('cluster %d has no electron events, reduce the number of bins!' % cluster)
+   weight = nbkg/float(nsig)
    weights[cluster] = weight
 
 from sklearn.externals import joblib
@@ -105,14 +112,16 @@
    except : pass
    plt.clf()
 
-
+#set_trace()
+from matplotlib.colors import LogNorm
 Z = apply_weight(Zlin, weights).reshape(xx.shape)
 plt.figure(figsize=[10, 8])
 plt.imshow(
    Z, interpolation='nearest',
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
-   cmap=plt.cm.inferno,
-   aspect='auto', origin='lower', norm=LogNorm())
+   cmap=plt.cm.seismic,
+   norm=LogNorm(vmin=0.01, vmax=100),
+   aspect='auto', origin='lower')
 plt.title('weight')
 plt.xlim(x_min, x_max)
 plt.ylim(y_min, y_max)
@@ -141,6 +150,7 @@
 plt.ylabel('Occurrence')
 plt.legend(loc='best')
 plt.ylim(0.5, entries.max()*1.2)
+#plt.xlim(entries.min(), entries.max()*1.2)
 plt.gca().set_xscale('log')
 plt.gca().set_yscale('log')
 plt.plot()
@@ -150,49 +160,27 @@
    except : pass
    plt.clf()
 
-# plots with unweighted events
-for plot in reweight_feats:
-   x_range = min(data[data.is_e][plot].min(),
-                 data[np.invert(data.is_e)][plot].min()), \
-             max(data[data.is_e][plot].max(),
-                 data[np.invert(data.is_e)][plot].max())
-   #if plot in cosmetics.ranges: x_range = cosmetics.ranges[plot]
-   plt.hist(
-      data[data.is_e][plot], bins=50, normed=True,
-      histtype='step', label='electrons', range=x_range,
-   )
-   plt.hist(
-      data[np.invert(data.is_e)][plot], bins=50, normed=True,
-      histtype='step', label='background', range=x_range,
-   )
-   plt.legend(loc='best')
-   plt.xlabel(plot if plot not in cosmetics.beauty else cosmetics.beauty[plot])
-   plt.ylabel('A.U.')
-   try : plt.savefig('%s/%s_unweighted_%s.png' % (plots, dataset, plot))
-   except : pass
-   try : plt.savefig('%s/%s_unweighted_%s.pdf' % (plots, dataset, plot))
-   except : pass
-   plt.clf()
-
-# plots with weighted events
-for plot in reweight_feats:
+for plot in reweight_feats+['trk_pt']:
    x_range = min(data[data.is_e][plot].min(),
                  data[np.invert(data.is_e)][plot].min()), \
              max(data[data.is_e][plot].max(),
                  data[np.invert(data.is_e)][plot].max())
-   if plot in cosmetics.ranges: x_range = cosmetics.ranges[plot]
-   plt.hist(
-      data[data.is_e][plot], bins=50, normed=True,
-      histtype='step', label='electrons', range=x_range, 
weights=data[data.is_e].weight - ) - plt.hist( - data[np.invert(data.is_e)][plot], bins=50, normed=True, - histtype='step', label='background', range=x_range, weights=data[np.invert(data.is_e)].weight - ) - plt.legend(loc='best') - plt.xlabel(plot if plot not in cosmetics.beauty else cosmetics.beauty[plot]) - plt.ylabel('A.U.') - try : plt.savefig('%s/%s_reweight_%s.png' % (plots, dataset, plot)) - except : pass - try : plt.savefig('%s/%s_reweight_%s.pdf' % (plots, dataset, plot)) - except : pass - plt.clf() + x_range = cosmetics.ranges.get(plot, x_range) + for name, weight in [ + ('unweighted', np.ones(data.shape[0])), + ('reweight', data.weight)]: + plt.hist( + data[data.is_e][plot], bins=50, normed=True, + histtype='step', label='electrons', range=x_range, weights=weight[data.is_e] + ) + plt.hist( + data[np.invert(data.is_e)][plot], bins=50, normed=True, + histtype='step', label='background', range=x_range, weights=weight[np.invert(data.is_e)] + ) + plt.legend(loc='best') + plt.xlabel(plot if plot not in cosmetics.beauty else cosmetics.beauty[plot]) + plt.ylabel('A.U.') + try : plt.savefig('%s/%s_%s_%s.png' % (plots, dataset, name, plot)) + except : pass + try : plt.savefig('%s/%s_%s_%s.pdf' % (plots, dataset, name, plot)) + except : pass + plt.clf() From b1ee63de23a36988098586f6cc260a15282e6ee8 Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Wed, 10 Oct 2018 17:34:34 +0200 Subject: [PATCH 02/13] update the job tag --- macros/datasets.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/macros/datasets.py b/macros/datasets.py index a730982c54a1d..ac5de0ad0693f 100644 --- a/macros/datasets.py +++ b/macros/datasets.py @@ -1,8 +1,8 @@ from glob import glob #A single place where to bookkeep the dataset file locations -tag = '2018Sep20' +#tag = '2018Sep20' +tag = '2018Oct05' input_files = { - 'test' : ['/eos/cms/store/cmst3/user/mverzett/BToKee_Pythia/crab_2018Sep20_BToKee_v1AssocByDR/180920_162924/0000/BToKee_assocByDR_10.root'] } all_sets = [] @@ -18,6 +18,7 @@ all_sets += files input_files['all'] = all_sets +input_files['test'] = all_sets[:10] dataset_names = { 'BToKee' : r'B $\to$ K ee', @@ -44,7 +45,7 @@ def get_data(dataset, columns, nthreads=2*multiprocessing.cpu_count(), exclude={ columns = [i for i in infiles[0]['features/tree'].keys() if i not in exclude] ret = None arrays = [i['features/tree'].arrays(columns, executor=thread_pool, blocking=False) for i in infiles] - ret = arrays[0]() + ret = arrays[0]() for arr in arrays[1:]: tmp = arr() for column in columns: From aafe8cd56def71cbddf5aa42692af9dcc51bbbad Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Wed, 10 Oct 2018 17:35:31 +0200 Subject: [PATCH 03/13] added new features --- macros/features.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/macros/features.py b/macros/features.py index 08e2c1bf47974..5fa087eba0f17 100644 --- a/macros/features.py +++ b/macros/features.py @@ -64,6 +64,72 @@ 'ktf_hcal_cluster_dphi', ] +new_features = [ + 'match_SC_EoverP', + 'match_SC_dEta', + 'match_SC_dPhi', + 'match_seed_EoverP', + 'match_seed_EoverPout', + 'match_seed_dEta', + 'match_seed_dPhi', + 'match_seed_dEta_vtx', + 'match_eclu_EoverP', + 'match_eclu_dEta', + 'match_eclu_dPhi', + + 'shape_sigmaEtaEta', + 'shape_sigmaIetaIeta', + 'shape_sigmaIphiIphi', + 'shape_e1x5', + 'shape_e2x5Max', + 'shape_e5x5', + 'shape_r9', + 'shape_HoverE', + 'shape_HoverEBc', + 'shape_hcalDepth1', + 'shape_hcalDepth2', + 'shape_hcalDepth1Bc', + 'shape_hcalDepth2Bc', + 
'shape_nHcalTowersBc', + 'shape_eLeft', + 'shape_eRight', + 'shape_eTop', + 'shape_eBottom', + 'shape_full5x5_sigmaEtaEta', + 'shape_full5x5_sigmaIetaIeta', + 'shape_full5x5_sigmaIphiIphi', + 'shape_full5x5_circularity', + 'shape_full5x5_e1x5', + 'shape_full5x5_e2x5Max', + 'shape_full5x5_e5x5', + 'shape_full5x5_r9', + 'shape_full5x5_HoverE', + 'shape_full5x5_HoverEBc', + 'shape_full5x5_hcalDepth1', + 'shape_full5x5_hcalDepth2', + 'shape_full5x5_hcalDepth1Bc', + 'shape_full5x5_hcalDepth2Bc', + 'shape_full5x5_eLeft', + 'shape_full5x5_eRight', + 'shape_full5x5_eTop', + 'shape_full5x5_eBottom', + + 'brem_frac', + 'brem_fracTrk', + 'brem_fracSC', + 'brem_N', + + 'sc_etaWidth', + 'sc_phiWidth', + 'sc_ps_EoverEraw', + 'sc_E', + 'sc_Et', + 'sc_eta', + 'sc_phi', + 'sc_RawE', + 'sc_Nclus', +] + seed_additional = ['preid_trk_ecal_match', 'preid_trkfilter_pass', 'preid_mva_pass'] id_additional = ['ele_mvaIdV1', 'ele_mvaIdV2'] From 2aa9f638f643668fc6c50a7b4361425f17b734ed Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Wed, 10 Oct 2018 17:36:36 +0200 Subject: [PATCH 04/13] minor updates and checkpoints --- macros/train_bdt_xgbo.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/macros/train_bdt_xgbo.py b/macros/train_bdt_xgbo.py index f27f348194e19..f1187cd54ec14 100644 --- a/macros/train_bdt_xgbo.py +++ b/macros/train_bdt_xgbo.py @@ -13,9 +13,12 @@ '--jobtag', default='', type=str ) +parser.add_argument( + '--test', action='store_true' +) + args = parser.parse_args() -dataset = 'all' -#dataset = 'test' +dataset = 'test' if args.test else 'all' import matplotlib.pyplot as plt import uproot @@ -27,10 +30,18 @@ from datasets import get_data, tag, kmeans_weighter, training_selection import os +if 'CMSSW_BASE' not in os.environ: + cmssw_path = dir_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0] + os.environ['CMSSW_BASE'] = cmssw_path + mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag) if not os.path.isdir(mods): os.makedirs(mods) +opti_dir = '%s/bdt_bo_%s' % (mods, args.what) +if not os.path.isdir(opti_dir): + os.makedirs(opti_dir) + plots = '%s/src/LowPtElectrons/LowPtElectrons/macros/plots/%s/' % (os.environ['CMSSW_BASE'], tag) if not os.path.isdir(plots): os.makedirs(plots) @@ -95,15 +106,17 @@ params = {'eval_metric':'auc', 'objective' :'binary:logitraw'} -model_default = xgb.train(params, xgtrain, num_boost_round=1000) +#model_default = xgb.train(params, xgtrain, num_boost_round=1000) #Next we try out the xgbo package. from xgbo import XgboClassifier -xgbo_classifier = XgboClassifier(out_dir='%s/ele_opti_%s' % (mods, dataset)) +xgbo_classifier = XgboClassifier( + out_dir=opti_dir, +) -# xgbo_classifier.optimize(xgtrain, init_points=5, n_iter=50, acq='ei') -xgbo_classifier.optimize(xgtrain, init_points=0, n_iter=1, acq='ei') +xgbo_classifier.optimize(xgtrain, init_points=5, n_iter=50, acq='ei') +#xgbo_classifier.optimize(xgtrain, init_points=0, n_iter=1, acq='ei') xgbo_classifier.fit(xgtrain, model="default") xgbo_classifier.fit(xgtrain, model="optimized") @@ -116,6 +129,8 @@ preds_early_stop = xgbo_classifier.predict(xgtest, model="default") preds_optimized = xgbo_classifier.predict(xgtest, model="optimized") +xgbo_classifier._bo.summary() + """ Finally, we want to plot some ROC curves. 
""" From aa1e81d4e98cd7c55a20b49e094720f7820e49b2 Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Wed, 10 Oct 2018 17:36:56 +0200 Subject: [PATCH 05/13] minor updates and checkpoints --- macros/train_nn_bo.py | 56 +++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/macros/train_nn_bo.py b/macros/train_nn_bo.py index e5f1276853c75..68c1ef6c75142 100644 --- a/macros/train_nn_bo.py +++ b/macros/train_nn_bo.py @@ -14,12 +14,15 @@ parser.add_argument( '--jobtag', default='', type=str ) +parser.add_argument( + '--test', action='store_true' +) parser.add_argument("--gpu", help="select specific GPU", type=int, metavar="OPT", default=-1) parser.add_argument("--gpufraction", help="select memory fraction for GPU", type=float, metavar="OPT", default=0.5) args = parser.parse_args() -dataset = 'all' +dataset = 'test' if args.test else 'all' #dataset = 'test' cmssw_path = dir_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0] @@ -65,6 +68,10 @@ if not os.path.isdir(mods): os.makedirs(mods) +opti_dir = '%s/nn_bo_%s' % (mods, args.what) +if not os.path.isdir(opti_dir): + os.makedirs(opti_dir) + from features import * if args.what == 'seeding': features = seed_features @@ -74,15 +81,23 @@ additional = seed_additional elif args.what == 'id': features = id_features - additional = id_additional + additional = id_additional+['gsf_ecal_cluster_ematrix'] else: raise ValueError() -data = pd.DataFrame( - get_data(dataset, features+labeling+additional) -) -data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons -data = data[training_selection(data)] +data_dict = get_data(dataset, features+labeling+additional) +multi_dim = data_dict.pop('gsf_ecal_cluster_ematrix', None) +data = pd.DataFrame(data_dict) +if args.what == 'id': + flattened = pd.DataFrame(multi_dim.reshape(multi_dim.shape[0], -1)) + new_features = ['crystal_%d' % i for i in range(len(flattened.columns))] + flattened.columns = new_features + features += new_features + data = pd.concat([data, flattened], axis=1) + +data_mask = (np.invert(data.is_e_not_matched) & training_selection(data)) +data = data[data_mask] +multi_dim = multi_dim[data_mask] data['training_out'] = -1 data['log_trkpt'] = np.log10(data.trk_pt) #convert bools to integers @@ -112,7 +127,7 @@ from sklearn.model_selection import train_test_split train, test = train_test_split(data, test_size=0.2, random_state=42) test.to_hdf( - '%s/nn_bo_%s_testdata.hdf' % (mods, args.what), + '%s/nn_bo_%s_testdata.hdf' % (opti_dir, args.what), 'data' ) @@ -161,22 +176,23 @@ def make_model(n_layers = 3, n_nodes = 2*len(features), dropout = 0.1): def train_model(**kwargs): print 'training:', kwargs train_hash = kwargs.__repr__().__hash__() - train_dir = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/train_bo_%d' % (os.environ['CMSSW_BASE'], train_hash) + train_dir = '%s/train_bo_%d' % (opti_dir, train_hash) if not os.path.isdir(train_dir): os.makedirs(train_dir) else: - os.makedirs('%s_clash' % train_dir) + train_dir = '%s_clash' % train_dir + os.makedirs(train_dir) with open('%s/hyperparameters.json' % train_dir, 'w') as j: j.write(json.dumps(kwargs)) learn_rate = 10.**kwargs['log_learn_rate'] batch_size = int(kwargs['batch_size']) - n_epochs = int(kwargs['n_epochs']) + n_epochs = 150 #int(kwargs['n_epochs']) del kwargs['log_learn_rate'] del kwargs['batch_size'] - del kwargs['n_epochs'] + #del kwargs['n_epochs'] model = make_model(**kwargs) @@ -206,8 +222,8 @@ def train_model(**kwargs): callbacks 
= DeepJet_callbacks( model, outputDir=train_dir, - stop_patience=30, - lr_patience = 7, + stop_patience=50, + lr_patience = 10, verbose=False ) @@ -227,23 +243,23 @@ def train_model(**kwargs): from xgbo import BayesianOptimization par_space = { 'n_layers' : (2, 10), - 'n_nodes' : (len(features)/2, 4*len(features)), + 'n_nodes' : (len(features)/3, 3*len(features)), 'dropout' : (0., 0.8), 'log_learn_rate' : (-4., -1), 'batch_size' : (500, 2000), - 'n_epochs' : (10, 150), + #'n_epochs' : (10, 150), } bo = BayesianOptimization( train_model, par_space, - verbose=1 + verbose=1, + checkpoints='%s/checkpoints.csv' % opti_dir ) -bo.init(3) #len(par_space)) -bo.maximize(3, 50) +bo.maximize(5, 50) -with open('%s/nn_bo.json' % mods, 'w') as j: +with open('%s/nn_bo.json' % opti_dir, 'w') as j: mpoint = bo.space.max_point() thash = mpoint['max_params'].__repr__().__hash__() info = mpoint['max_params'] From 4d1354bc97fdd2ccd3f547e90e084a99fec24f3e Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Wed, 10 Oct 2018 17:42:08 +0200 Subject: [PATCH 06/13] sample merging script --- production/merge_samples.py | 69 +++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 production/merge_samples.py diff --git a/production/merge_samples.py b/production/merge_samples.py new file mode 100644 index 0000000000000..71d9adae49fc5 --- /dev/null +++ b/production/merge_samples.py @@ -0,0 +1,69 @@ +#! /bin/env python + +import os +from argparse import ArgumentParser + +parser = ArgumentParser() +parser.add_argument('jobid') +parser.add_argument('--se', default='/eos/cms/store/cmst3/user/mverzett/') +parser.add_argument('--group', default=10, type=int) +args = parser.parse_args() + +import ROOT as R + +def syscall(cmd): + print 'Executing: %s' % cmd + retval = os.system(cmd) + if retval != 0: + raise RuntimeError('Command failed!') + +import sys +toolbar_width=50 +def progbar(frac): + done = int(frac*toolbar_width) + msg = "[%s%s] %d%%" % ( + '#'*done, " "*(toolbar_width-done), + int(frac*100) + ) + sys.stdout.write(msg) + sys.stdout.flush() + sys.stdout.write("\b" * len(msg)) # return to start of line, after '[' + +def hadd(ins, out, tree_path='features/tree'): + print 'producing', out,' ', + tc = R.TChain(tree_path) + for i in ins: + tc.Add(i) + + tf = R.TFile(out, 'RECREATE') + td = tf.mkdir(os.path.dirname(tree_path)) + td.cd() + otree = tc.CloneTree(0) + entries = tc.GetEntries() + print '(', entries, 'entries)' + for idx in xrange(entries): + if idx%500 == 0: progbar(idx/float(entries)) + tc.GetEntry(idx) + otree.Fill() + otree.AutoSave() + print '' + tf.Close() + +def chunks(l, n): + """Yield successive n-sized chunks from l.""" + for i in range(0, len(l), n): + yield l[i:i + n] + +from glob import glob +indirs = glob('%s/*/crab_%s_*' % (args.se, args.jobid)) + +for sample in indirs: + ins = glob('%s/*/*/*.root' % sample) + base_name = sample.split('_%s_' % args.jobid)[1] + ins_chuncks = [i for i in chunks(ins, args.group)] + for idx, chunk in enumerate(ins_chuncks): + hadd( + chunk, + '%s_%s_%d.root' % (base_name, args.jobid, idx) + ) + From 2f483c8fded8122219514dc97f466475139d2f23 Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Wed, 10 Oct 2018 17:44:49 +0200 Subject: [PATCH 07/13] NN evaluation script --- macros/eval_nn.py | 145 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 macros/eval_nn.py diff --git a/macros/eval_nn.py b/macros/eval_nn.py new file mode 100644 index 0000000000000..46b747a6e52ae --- /dev/null +++ 
b/macros/eval_nn.py
@@ -0,0 +1,145 @@
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+from argparse import ArgumentParser
+from cmsjson import CMSJson
+from pdb import set_trace
+import os
+
+parser = ArgumentParser()
+parser.add_argument(
+   'what', choices=['seeding', 'fullseeding', 'id'],
+)
+parser.add_argument('model')
+parser.add_argument('--dataset')
+
+parser.add_argument("--gpu",  help="select specific GPU", type=int, metavar="OPT", default=-1)
+parser.add_argument("--gpufraction",  help="select memory fraction for GPU", type=float, metavar="OPT", default=0.5)
+
+args = parser.parse_args()
+
+import pandas as pd
+import json
+import matplotlib.pyplot as plt
+from glob import glob
+if args.model.endswith('.csv'):
+   #TODO, make the BO part
+   bo = pd.read_csv(args.model)
+   best = bo.target.argmax()
+   pars = dict(bo.loc[best])
+   del pars['target']
+   base = os.path.dirname(args.model)
+   #unfortunately the hash does not mean anything :(
+   for jfile in glob('%s/train_bo_*/hyperparameters.json' % base):
+      #check for ~equality
+      jpars = json.load(open(jfile))
+      equals = all(
+         abs(pars[i] - jpars[i])/abs(pars[i]) < 10**-3
+         for i in pars
+      )
+      if equals:
+         break
+   else: #for else! like, the third time I use it!
+      raise RuntimeError('I cannot find the training dir')
+
+   train_dir = os.path.dirname(jfile)
+   model = '%s/KERAS_check_best_model.h5' % train_dir
+   dataset = glob('%s/*.hdf' % base)[0]
+   plots = base
+else:
+   model = args.model
+   dataset = args.dataset
+   plots = os.path.dirname(model)
+   if not dataset:
+      raise RuntimeError('You must specify a dataset if you are not running in Bayesian Optimization mode')
+
+cmssw_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0]
+os.environ['CMSSW_BASE'] = cmssw_path
+
+
+from keras import backend as K, callbacks
+from keras.models import load_model
+import tensorflow as tf
+if args.gpu<0:
+   import imp
+   try:
+      imp.find_module('setGPU')
+      import setGPU
+   except ImportError:
+      found = False
+else:
+   os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
+   os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
+   print('running on GPU '+str(args.gpu))
+
+if args.gpufraction>0 and args.gpufraction<1:
+   gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpufraction)
+   sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
+   K.set_session(sess)
+   print('using gpu memory fraction: '+str(args.gpufraction))
+
+#this should be outsourced
+from features import *
+if args.what == 'seeding':
+   features = seed_features
+   additional = seed_additional
+elif args.what == 'fullseeding':
+   features = fullseed_features
+   additional = seed_additional
+elif args.what == 'id':
+   features = id_features
+   additional = id_additional+['gsf_ecal_cluster_ematrix']
+else:
+   raise ValueError()
+
+model = load_model(model)
+test = pd.read_hdf(dataset, key='data')
+
+#
+# plot performance
+#
+from sklearn.metrics import roc_curve, roc_auc_score
+#this should go in the outsourced part as well!
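+# (Sketch for orientation, not part of the original macro: the crystal_%d
+# columns consumed below are produced at training time in train_nn_bo.py,
+# PATCH 05, by flattening the 2D gsf_ecal_cluster_ematrix image row-wise,
+# roughly:
+#    flattened = pd.DataFrame(multi_dim.reshape(multi_dim.shape[0], -1))
+#    flattened.columns = ['crystal_%d' % i for i in range(len(flattened.columns))]
+# Re-deriving the names from test.columns keeps evaluation in sync with
+# whatever crystal count was used at training time.)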
+if any('crystal' in i for i in test.columns):
+   ncrystals = len([i for i in test.columns if 'crystal' in i])
+   new_features = ['crystal_%d' % i for i in range(ncrystals)]
+   features += new_features
+training_out = model.predict(test[features].as_matrix())
+roc = roc_curve(
+   test.is_e.as_matrix().astype(int), 
+   training_out)[:2]
+auc_score = roc_auc_score(test.is_e, training_out)
+
+# make plots
+plt.figure(figsize=[8, 8])
+plt.title('%s training' % args.what)
+plt.plot(
+   np.arange(0,1,0.01),
+   np.arange(0,1,0.01),
+   'k--')
+plt.plot(*roc, label='Retraining (AUC: %.2f)' % auc_score)
+if args.what in ['seeding', 'fullseeding']:
+   eff = float((test.baseline & test.is_e).sum())/test.is_e.sum()
+   mistag = float((test.baseline & np.invert(test.is_e)).sum())/np.invert(test.is_e).sum()
+   plt.plot([mistag], [eff], 'o', label='baseline', markersize=5)
+elif args.what == 'id':
+   mva_v1 = roc_curve(test.is_e, test.ele_mvaIdV1)[:2]
+   mva_v2 = roc_curve(test.is_e, test.ele_mvaIdV2)[:2]
+   mva_v1_auc = roc_auc_score(test.is_e, test.ele_mvaIdV1)
+   mva_v2_auc = roc_auc_score(test.is_e, test.ele_mvaIdV2)
+   plt.plot(*mva_v1, label='MVA ID V1 (AUC: %.2f)' % mva_v1_auc)
+   plt.plot(*mva_v2, label='MVA ID V2 (AUC: %.2f)' % mva_v2_auc)
+else:
+   raise ValueError()
+
+plt.xlabel('Mistag Rate')
+plt.ylabel('Efficiency')
+plt.legend(loc='best')
+plt.xlim(0., 1)
+plt.savefig('%s/test_NN.png' % (plots))
+plt.savefig('%s/test_NN.pdf' % (plots))
+plt.gca().set_xscale('log')
+plt.xlim(1e-4, 1)
+plt.savefig('%s/test_log_NN.png' % (plots))
+plt.savefig('%s/test_log_NN.pdf' % (plots))
+plt.clf()

From 07f09c9e771679eaa7ca02d72e15b3c68189209c Mon Sep 17 00:00:00 2001
From: mauro verzetti
Date: Fri, 12 Oct 2018 15:36:35 +0200
Subject: [PATCH 08/13] update dataset, read files synchronously

---
 macros/datasets.py | 54 ++++++++++++++++++++++++++++++----------------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/macros/datasets.py b/macros/datasets.py
index ac5de0ad0693f..6cc6a3106786f 100644
--- a/macros/datasets.py
+++ b/macros/datasets.py
@@ -2,29 +2,27 @@
 #A single place where to bookkeep the dataset file locations
 #tag = '2018Sep20'
 tag = '2018Oct05'
-input_files = {
-}
-
-all_sets = []
-for dataset, name in [
-   ('BToKee_Pythia', 'BToKee'),
-   ('BToKstee_Pythia', 'BToKstee'),
-   ('Bu_KJPsi_ee_Pythia', 'BToKJPsiee'),
-   ('Bd_KstJPsi_ee_Pythia_GEN-SIM_18_07_01', 'BToKstJPsiee'),
-   ('Bu_KJPsi_ee_Pythia_GEN-SIM_18_06_4', 'BToKJPsiee')]:
-   if name not in input_files: input_files[name] = []
-   files = glob('/eos/cms/store/cmst3/user/mverzett/%s/crab_%s_*/*/*/*.root' % (dataset, tag))
-   input_files[name] += files
-   all_sets += files
+posix = '2018Oct0[589]' #in case of rescue submissions
+import os
+all_sets = glob('/eos/cms/store/cmst3/group/bpark/electron_training/*_%s_*.root' % posix)
+sets = set([os.path.basename(i).split('_')[0].split('Assoc')[0] for i in all_sets])
+sets = sorted(list(sets), key=lambda x: -len(x))
+input_files = {i : [] for i in sets}
 
 input_files['all'] = all_sets
-input_files['test'] = all_sets[:10]
+input_files['test'] = all_sets[:1]
+for inf in all_sets:
+   for name in sets:
+      if os.path.basename(inf).startswith(name):
+         input_files[name].append(inf)
+         break
+
 
 dataset_names = {
    'BToKee' : r'B $\to$ K ee',
-   'BToKstee' : r'B $\to$ K* ee',
-   'BToKJPsiee' : r'B $\to$ K J/$\Psi$(ee)',
-   'BToKstJPsiee' : r'B $\to$ K* J/$\Psi$(ee)',
+   #'BToKstee' : r'B $\to$ K* ee',
+   'BToJPsieeK' : r'B $\to$ K J/$\Psi$(ee)',
+   #'BToKstJPsiee' : r'B $\to$ K* J/$\Psi$(ee)',
 }
 
 import os
@@ -52,6 +50,26 @@ def 
get_data(dataset, columns, nthreads=2*multiprocessing.cpu_count(), exclude={
          ret[column] = np.concatenate((ret[column],tmp[column]))
    return ret
 
+def get_data_sync(dataset, columns, nthreads=2*multiprocessing.cpu_count(), exclude={}):
+   if dataset not in input_files:
+      raise ValueError('The dataset %s does not exist, I have %s' % (dataset, ', '.join(input_files.keys())))
+   infiles = [uproot.open(i) for i in input_files[dataset]]
+   if columns == 'all':
+      columns = [i for i in infiles[0]['features/tree'].keys() if i not in exclude]
+   try:
+      ret = infiles[0]['features/tree'].arrays(columns)
+   except:
+      raise RuntimeError('Failed to open %s properly' % infiles[0])
+   for infile in infiles[1:]:
+      try:
+         arrays = infile['features/tree'].arrays(columns)
+      except:
+         raise RuntimeError('Failed to open %s properly' % infile)
+      for column in columns:
+         ret[column] = np.concatenate((ret[column],arrays[column]))
+   return ret
+
+
 from sklearn.cluster import KMeans
 from sklearn.externals import joblib
 import json

From 0b1b6e887fb2c585e1978dca5d5abe782eba4793 Mon Sep 17 00:00:00 2001
From: mauro verzetti
Date: Fri, 12 Oct 2018 15:37:40 +0200
Subject: [PATCH 09/13] move to synchronous file reading, MinBatchKMeans (much faster) and weight the background sample

---
 macros/kmeans_reweight.py | 25 +++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/macros/kmeans_reweight.py b/macros/kmeans_reweight.py
index 6f3da450e8b6f..44809334ff80d 100644
--- a/macros/kmeans_reweight.py
+++ b/macros/kmeans_reweight.py
@@ -29,7 +29,7 @@
 from pdb import set_trace
 rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
 rc('text', usetex=True)
-from datasets import get_data, tag
+from datasets import get_data, tag, apply_weight, get_data_sync
 import os
 
 mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag)
@@ -40,9 +40,11 @@
 if not os.path.isdir(plots):
    os.makedirs(plots)
 
+print 'Getting data...'
 data = pd.DataFrame(
-   get_data(dataset, ['trk_pt', 'trk_eta', 'is_e', 'is_e_not_matched', 'is_other'])
+   get_data_sync(dataset, ['trk_pt', 'trk_eta', 'is_e', 'is_e_not_matched', 'is_other'])
 )
+print '...Done'
 data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons
 #remove things that do not yield tracks
 data = data[(data.trk_pt > 0) & (np.abs(data.trk_eta) < 2.4) & (data.trk_pt < 15)]
@@ -51,9 +53,11 @@
 overall_scale = data.shape[0]/float(data.is_e.sum())
 
 reweight_feats = ['log_trkpt', 'trk_eta']
-from sklearn.cluster import KMeans
-clusterizer = KMeans(n_clusters=args.nbins, n_jobs=-2)
+print 'clustering...'
+from sklearn.cluster import KMeans, MiniBatchKMeans
+clusterizer = MiniBatchKMeans(n_clusters=args.nbins, batch_size=3000) #n_jobs=3)
 clusterizer.fit(data[reweight_feats]) #fit(data[data.is_e][reweight_feats])
+global_ratio = float(data.is_e.sum())/np.invert(data.is_e).sum()
 
 data['cluster'] = clusterizer.predict(data[reweight_feats])
 weights = {}
@@ -62,7 +66,7 @@
    nsig = group.is_e.sum()
    if not nbkg: raise RuntimeError('cluster %d has no background events, reduce the number of bins!' % cluster)
    elif not nsig: raise RuntimeError('cluster %d has no electron events, reduce the number of bins!' % cluster)
-   weight = nbkg/float(nsig)
+   weight = float(nsig)/nbkg
    weights[cluster] = weight
 
 from sklearn.externals import joblib
@@ -74,11 +78,12 @@
 weights['features'] = reweight_feats
 with open('%s/kmeans_%s_weighter.json' % (mods, dataset), 'w') as ww:
    json.dump(weights, ww)
+print '...done'
 
 #vectorize(excluded={2})
-apply_weight = np.vectorize(lambda x, y: y.get(x), excluded={2})
-data['weight'] = data.is_e*apply_weight(data.cluster, weights)+np.invert(data.is_e)
+data['weight'] = np.invert(data.is_e)*apply_weight(data.cluster, weights)+data.is_e
 
+print 'time for plots!'
 # Step size of the mesh. Decrease to increase the quality of the VQ.
 h = .02     # point in the mesh [x_min, x_max]x[y_min, y_max].
 
@@ -120,7 +125,7 @@
    Z, interpolation='nearest',
    extent=(xx.min(), xx.max(), yy.min(), yy.max()),
    cmap=plt.cm.seismic,
-   norm=LogNorm(vmin=0.01, vmax=100),
+   norm=LogNorm(vmin=10**-4, vmax=10**4),
    aspect='auto', origin='lower')
 plt.title('weight')
 plt.xlim(x_min, x_max)
@@ -140,7 +145,7 @@
 entries, _, _ = plt.hist(
    data.weight,
    bins=np.logspace(
-      np.log(data.weight.min()),
+      np.log(max(data.weight.min(), 10**-5)),
       np.log(data.weight.max()),
       100
    ),
@@ -150,7 +155,7 @@
 plt.ylabel('Occurrence')
 plt.legend(loc='best')
 plt.ylim(0.5, entries.max()*1.2)
-#plt.xlim(entries.min(), entries.max()*1.2)
+plt.xlim(max(entries.min(), 10**-4), entries.max()*1.2)
 plt.gca().set_xscale('log')
 plt.gca().set_yscale('log')
 plt.plot()

From 5e7d163c55479f8d7447c8ea88b2d836f3965dd5 Mon Sep 17 00:00:00 2001
From: mauro verzetti
Date: Fri, 12 Oct 2018 15:37:58 +0200
Subject: [PATCH 10/13] more printout

---
 production/merge_samples.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/production/merge_samples.py b/production/merge_samples.py
index 71d9adae49fc5..d2d554f9ab7e7 100644
--- a/production/merge_samples.py
+++ b/production/merge_samples.py
@@ -55,10 +55,13 @@ def chunks(l, n):
       yield l[i:i + n]
 
 from glob import glob
+from pdb import set_trace
 indirs = glob('%s/*/crab_%s_*' % (args.se, args.jobid))
 
 for sample in indirs:
    ins = glob('%s/*/*/*.root' % sample)
+   print 'sample: ', sample
+   print 'found', len(ins), 'input files'
    base_name = sample.split('_%s_' % args.jobid)[1]
    ins_chuncks = [i for i in chunks(ins, args.group)]
    for idx, chunk in enumerate(ins_chuncks):

From 133772a950b881681dee323abc946cb9f8280d1f Mon Sep 17 00:00:00 2001
From: mauro verzetti
Date: Mon, 15 Oct 2018 11:45:14 +0200
Subject: [PATCH 11/13] bugfix for uproot readout

---
 production/merge_samples.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/production/merge_samples.py b/production/merge_samples.py
index d2d554f9ab7e7..18455ce83e205 100644
--- a/production/merge_samples.py
+++ b/production/merge_samples.py
@@ -45,7 +45,9 @@ def hadd(ins, out, tree_path='features/tree'):
       if idx%500 == 0: progbar(idx/float(entries))
       tc.GetEntry(idx)
       otree.Fill()
-   otree.AutoSave()
+   #otree.AutoSave()
+   otree.Write()
+   #tf.Write()
    print ''
    tf.Close()

From 82260ea782522c035bb379db1de92d81408d29cf Mon Sep 17 00:00:00 2001
From: mauro verzetti
Date: Mon, 15 Oct 2018 16:16:47 +0200
Subject: [PATCH 12/13] make input feature plots

---
 macros/basic_plots.py | 176 ++++++++++++++++++++++++++++++++++++++++++
 macros/cosmetics.py   |  68 +++++++++++-----
 2 files changed, 224 insertions(+), 20 deletions(-)
 create mode 100644 macros/basic_plots.py

diff --git a/macros/basic_plots.py b/macros/basic_plots.py
new file mode 100644
index 0000000000000..65838a96b7424
--- /dev/null
+++ b/macros/basic_plots.py
@@ -0,0 +1,176 @@
+import numpy as np
+import 
matplotlib
+matplotlib.use('Agg')
+import uproot
+import matplotlib.pyplot as plt
+import root_numpy
+import rootpy
+import rootpy.plotting as rplt
+import json
+import pandas as pd
+from matplotlib import rc
+from pdb import set_trace
+import os
+rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
+rc('text', usetex=True)
+from baseline import baseline
+import cosmetics
+
+debug = False
+print 'Getting the data'
+from datasets import dataset_names, tag, get_data_sync, kmeans_weighter, training_selection
+
+plots = '%s/src/LowPtElectrons/LowPtElectrons/macros/plots/%s/' % (os.environ['CMSSW_BASE'], tag)
+if not os.path.isdir(plots):
+   os.makedirs(plots)
+
+mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag)
+if not os.path.isdir(mods):
+   os.makedirs(mods)
+
+all_data = {}
+for dataset in dataset_names:
+   print 'loading', dataset
+   all_data[dataset] = pd.DataFrame(
+      get_data_sync(
+         dataset, 
+         ['is_e', 'is_e_not_matched', 'is_other',
+          'gen_pt', 'gen_eta', 'trk_pt'
+          ]
+         )
+      )
+
+plt.figure(figsize=[8,8])
+for to_plot, nbins in [
+   ('gen_pt', 30),
+   ('gen_eta', 30),
+   ('trk_pt', 30),]:
+   plt.clf()
+   for dataset, sample in all_data.iteritems():
+      electrons = sample[sample.is_e]
+      plt.hist(
+         electrons[to_plot], bins=nbins, 
+         range=cosmetics.ranges[to_plot],
+         histtype='step', normed=True,
+         label = dataset_names[dataset],
+         )
+   plt.xlabel(cosmetics.beauty[to_plot])
+   plt.ylabel('Fraction')
+   plt.legend(loc='best')
+   plt.plot()
+   plt.savefig('%s/electrons_%s.png' % (plots, to_plot))
+   plt.savefig('%s/electrons_%s.pdf' % (plots, to_plot))
+   plt.clf()
+
+   plt.clf()
+   for dataset, sample in all_data.iteritems():
+      electrons = sample[(sample.is_e) & (sample.trk_pt > 0)]
+      plt.hist(
+         electrons[to_plot], bins=nbins, 
+         range=cosmetics.ranges[to_plot],
+         histtype='step', normed=True,
+         label = dataset_names[dataset],
+         )
+   plt.xlabel(cosmetics.beauty[to_plot])
+   plt.ylabel('Fraction')
+   plt.legend(loc='best')
+   plt.plot()
+   plt.savefig('%s/electrons_withTrk_%s.png' % (plots, to_plot))
+   plt.savefig('%s/electrons_withTrk_%s.pdf' % (plots, to_plot))
+   plt.clf()
+
+
+from features import *
+features = id_features+new_features
+additional = id_additional
+
+multi_dim_branches = ['gsf_ecal_cluster_ematrix', 'ktf_ecal_cluster_ematrix']
+dict_data = get_data_sync(
+   'all', 
+   features+labeling+additional+multi_dim_branches
+)
+data = pd.DataFrame(
+   {i : dict_data[i] for i in dict_data if i not in multi_dim_branches}
+)
+multi_dim = {i : dict_data[i] for i in dict_data if i in multi_dim_branches}
+data_mask = np.invert(data.is_e_not_matched) & training_selection(data)
+for key in multi_dim:
+   multi_dim[key] = multi_dim[key][data_mask]
+data = data[data_mask] #remove non-matched electrons
+data['training_out'] = -1
+data['log_trkpt'] = np.log10(data.trk_pt)
+#convert bools to integers
+for c in features:
+   if data[c].dtype == np.dtype('bool'):
+      data[c] = data[c].astype(int)
+
+#apply pt-eta reweighting
+weights = kmeans_weighter(
+   data[['log_trkpt', 'trk_eta']],
+   '%s/kmeans_all_weighter.plk' % mods
+   )
+data['weight'] = weights*np.invert(data.is_e) + data.is_e
+print 'making plots'
+
+for feat in multi_dim_branches:
+   vals = {}
+   for dataset in [
+      {'name' : 'electrons', 
+       'mask' : data.is_e,
+       'weight' : data[data.is_e].weight},
+      {'name' : 'tracks', 
+       'mask' : np.invert(data.is_e),
+       'weight' : data[np.invert(data.is_e)].weight},
+      ]:
+      plt.clf()
+      plt.title(feat.replace('_', ' '))
+      masked = multi_dim[feat][dataset['mask']]
+      sum_val = 
masked.sum(axis=-1).sum(axis=-1) + mask = np.invert(sum_val == 0) + masked = masked[mask] + sum_val = sum_val[mask] + masked /= sum_val[:,None,None] + heatmap = np.average(masked, axis=0, weights=dataset['weight'][mask]) + vals[dataset['name']] = heatmap + plt.imshow(heatmap, cmap='viridis', interpolation='nearest') + plt.colorbar() + plt.savefig('%s/%s_%s.png' % (plots, dataset['name'], feat)) + plt.savefig('%s/%s_%s.pdf' % (plots, dataset['name'], feat)) + plt.clf() + #make ratios + ratio = (vals['electrons']/vals['tracks'])-1 + plt.clf() + plt.title(feat.replace('_', ' ')) + plt.imshow(ratio, cmap='RdBu', interpolation='nearest', vmin=-1, vmax=1) + plt.colorbar() + plt.savefig('%s/ratio_%s_%s.png' % (plots, dataset['name'], feat)) + plt.savefig('%s/ratio_%s_%s.pdf' % (plots, dataset['name'], feat)) + plt.clf() + +exit() + +for to_plot in features: + plt.clf() + electrons = data[data.is_e] + tracks = data[np.invert(data.is_e) & np.invert(data.is_e_not_matched)] + plt.hist( + electrons[to_plot], bins=50, + weights=electrons.weight, + range=cosmetics.ranges.get(to_plot, None), + histtype='step', normed=True, + label = 'Electrons', + ) + plt.hist( + tracks[to_plot], bins=50, + weights=tracks.weight, + range=cosmetics.ranges.get(to_plot, None), + histtype='step', normed=True, + label = 'Tracks', + ) + plt.xlabel(cosmetics.beauty.get(to_plot, to_plot.replace('_', ' '))) + plt.ylabel('Fraction') + plt.legend(loc='best') + plt.plot() + plt.savefig('%s/electrons_vs_tracks_%s.png' % (plots, to_plot)) + plt.savefig('%s/electrons_vs_tracks_%s.pdf' % (plots, to_plot)) + plt.clf() diff --git a/macros/cosmetics.py b/macros/cosmetics.py index 6f66a6b6b5026..6f5c54dc1c754 100644 --- a/macros/cosmetics.py +++ b/macros/cosmetics.py @@ -1,35 +1,63 @@ +from fnmatch import fnmatch class RangesByName(object): def __init__(self, rlist): self._rlist_ = rlist #list of (ending, range) + self._rlist_.sort(key=lambda x: -1*len(x[0])) def get(self, val, default=None): for ending, vrange in self._rlist_: - if val.endswith(ending): + if fnmatch(val, ending): return vrange return default def __getitem__(self, val): - self.get(val) + return self.get(val) ranges = RangesByName([ - ('_cluster_deta', (-2, 2)), - ('_pt', (0, 15)), - ('_eta' , (-3, 3)), - ('_inp' , (0, 20)), - ('_outp' , (0, 10)), - ('_chi2red' , (0, 6)), - ('_Deta' , (0, 0.2)), - ('_Dphi' , (-0.2, 0.2)), - ('_nhits' , (0, 50)), - ('_p' , (0, 20)), - ('_cluster_e', (0, 20)), - ('_cluster_ecorr', (0, 20)), - ('_cluster_eta', (-3, 3)), - ('_cluster_deta', (-1.5, 1.5)), - ('_cluster_dphi', (-1.5, 1.5)), - ## ('_cluster_covEtaEta', (-0.5, 0.5)), - ## ('_cluster_covEtaPhi', (-0.5, 0.5)), - ## ('_cluster_covPhiPhi', (-0.5, 0.5)), + ('*_cluster_deta', (-2, 2)), + ('*_pt', (0, 15)), + ('*_eta' , (-3, 3)), + ('*_inp' , (0, 20)), + ('*_outp' , (0, 10)), + ('*_chi2red' , (0, 6)), + ('*_Deta' , (0, 0.2)), + ('*_Dphi' , (-0.2, 0.2)), + ('*_nhits' , (0, 50)), + ('*_p' , (0, 20)), + ('*_cluster_e', (0, 20)), + ('*_cluster_ecorr', (0, 20)), + ('*_cluster_eta', (-3, 3)), + ('*_cluster_deta', (-1.5, 1.5)), + ('*_cluster_dphi', (-1.5, 1.5)), + ('*brem_frac', (-4,2)), + ('*_fracSC', (0.4, 1)), + ('*brem_fracTrk', (-4,2)), + ('*_covEtaEta', (-1, 1)), + ('*_covEtaPhi', (-1, 1)), + ('*_cluster_covPhiPhi', (-1, 1)), + ('*_EoverP', (0, 10)), + ('*_dEta', (-2, 2)), + ('*_dPhi', (-1, 1)), + ('*EoverPout', (0, 50)), + ('*dEta_vtx', (-5, 5)), + ('*sc_E', (0, 100)), + ('*sc_Et', (0, 200)), + ('*sc_RawE', (0, 100)), + ('*sc_etaWidth', (0, 1)), + ('*HoverE', (0,1)), + ('*HoverEBc', (0, 
0.4)), + ('*_e[0-9]x[0-5]*', (0, 10.)), + ('shape_e[BLRT]*', (0, 2.)), + ('*_full5x5_HoverE', (0, 2)), + ('*_full5x5_HoverEBc', (0, 1)), + ('*full5x5_e[BLRT]*', (0, 2)), + ('*_hcalDepth1*', (0, 1)), + ('*_hcalDepth2*', (0, 0.3)), + ('*_r9', (0, 0.5)), + ('*_sigmaEtaEta', (0., 0.1)), + ('*_sigmaIetaIeta', (0., 0.1)), + ('*_sigmaIphiIphi', (0., 0.08)), + ## ('*', ()), ]) beauty = { From 2cb77e13bd48eeb143284a1e34aea6d448ac8343 Mon Sep 17 00:00:00 2001 From: mauro verzetti Date: Tue, 16 Oct 2018 12:32:00 +0200 Subject: [PATCH 13/13] refactor code to avoid repetitions --- macros/datasets.py | 56 +++++++++++++++++++++++++++++- macros/features.py | 45 +++++++++++++++++++++++++ macros/train_bdt.py | 73 +++++++++++++--------------------------- macros/train_bdt_xgbo.py | 53 +++-------------------------- macros/train_nn.py | 54 ++++------------------------- macros/train_nn_bo.py | 59 ++++---------------------------- run/mc_features.py | 2 ++ 7 files changed, 143 insertions(+), 199 deletions(-) diff --git a/macros/datasets.py b/macros/datasets.py index 6cc6a3106786f..a364994900024 100644 --- a/macros/datasets.py +++ b/macros/datasets.py @@ -10,12 +10,12 @@ sets = sorted(list(sets), key=lambda x: -len(x)) input_files = {i : [] for i in sets} input_files['all'] = all_sets -input_files['test'] = all_sets[:1] for inf in all_sets: for name in sets: if os.path.basename(inf).startswith(name): input_files[name].append(inf) break +input_files['test'] = input_files['BToKee'][:1] dataset_names = { @@ -34,6 +34,16 @@ import uproot import numpy as np +def get_models_dir(): + if 'CMSSW_BASE' not in os.environ: + cmssw_path = dir_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0] + os.environ['CMSSW_BASE'] = cmssw_path + + mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag) + if not os.path.isdir(mods): + os.makedirs(mods) + return mods + def get_data(dataset, columns, nthreads=2*multiprocessing.cpu_count(), exclude={}): thread_pool = concurrent.futures.ThreadPoolExecutor(nthreads) if dataset not in input_files: @@ -93,3 +103,47 @@ def training_selection(df): 'ensures there is a GSF Track and a KTF track within eta/pt boundaries' return (df.trk_pt > 0) & (df.trk_pt < 15) & (np.abs(df.trk_eta) < 2.4) & (df.gsf_pt > 0) +import pandas as pd +import numpy as np +def pre_process_data(dataset, features, for_seeding=False): + mods = get_models_dir() + features = list(set(features+['trk_pt', 'gsf_pt', 'trk_eta'])) + data_dict = get_data_sync(dataset, features) + if 'gsf_ecal_cluster_ematrix' in features: + multi_dim = data_dict.pop('gsf_ecal_cluster_ematrix', None) + data = pd.DataFrame(data_dict) + if 'gsf_ecal_cluster_ematrix' in features: + flattened = pd.DataFrame(multi_dim.reshape(multi_dim.shape[0], -1)) + new_features = ['crystal_%d' % i for i in range(len(flattened.columns))] + flattened.columns = new_features + features += new_features + data = pd.concat([data, flattened], axis=1) + + data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons + data = data[training_selection(data)] + data['training_out'] = -1 + data['log_trkpt'] = np.log10(data.trk_pt) + + #apply pt-eta reweighting + ## from hep_ml.reweight import GBReweighter + ## from sklearn.externals import joblib + ## reweighter = joblib.load('%s/%s_reweighting.pkl' % (mods, dataset)) + ## weights = reweighter.predict_weights(data[['trk_pt', 'trk_eta']]) + weights = kmeans_weighter( + data[['log_trkpt', 'trk_eta']], + '%s/kmeans_%s_weighter.plk' % (mods, dataset) + ) 
+ data['weight'] = weights*np.invert(data.is_e) + data.is_e + + #add baseline seeding (for seeding only) + if for_seeding: + data['baseline'] = ( + data.preid_trk_ecal_match | + (np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass) + ) + + #convert bools to integers + for c in features: + if data[c].dtype == np.dtype('bool'): + data[c] = data[c].astype(int) + return data diff --git a/macros/features.py b/macros/features.py index 5fa087eba0f17..994794319061d 100644 --- a/macros/features.py +++ b/macros/features.py @@ -140,3 +140,48 @@ 'gen_phi', 'gen_charge', ] + +mva_id_inputs = [ + 'rho', + 'ele_pt', + 'sc_eta', + 'shape_full5x5_sigmaIetaIeta', + 'shape_full5x5_sigmaIphiIphi', + 'shape_full5x5_circularity', + 'shape_full5x5_r9', + 'sc_etaWidth', + 'sc_phiWidth', + 'shape_full5x5_HoverE', + 'trk_nhits', + 'trk_chi2red', + 'gsf_chi2red', + 'brem_frac', + 'gsf_nhits', + 'match_SC_EoverP', + 'match_eclu_EoverP', + 'match_SC_dEta', #should be abs + 'match_SC_dPhi', #should be abs + 'match_seed_dEta', #should be abs + 'sc_E', + 'trk_p', +#ele_expected_inner_hits gsfTrack.hitPattern.numberOfLostHits('MISSING_INNER_HITS') None None +#ele_conversionVertexFitProbability electronMVAVariableHelper:convVtxFitProb None None +#ele_IoEmIop 1.0/ecalEnergy-1.0/trackMomentumAtVtx.R None None +] + +def get_features(ftype): + if ftype == 'seeding': + features = seed_features + additional = seed_additional + elif ftype == 'fullseeding': + features = fullseed_features + additional = seed_additional + elif ftype == 'id': + features = id_features + additional = id_additional + elif ftype == 'mva_id': + features = mva_id_inputs + additional = id_additional + else: + raise ValueError('%s is not among the possible feature collection' % ftype) + return features, additional diff --git a/macros/train_bdt.py b/macros/train_bdt.py index 786f9d32896bc..95b4ea485dc85 100644 --- a/macros/train_bdt.py +++ b/macros/train_bdt.py @@ -7,7 +7,10 @@ parser = ArgumentParser() parser.add_argument( - 'what', choices=['seeding', 'fullseeding', 'id'], + 'what' +) +parser.add_argument( + '--test', action='store_true' ) parser.add_argument( '--jobtag', default='', type=str @@ -50,7 +53,7 @@ ) args = parser.parse_args() -dataset = 'all' +dataset = 'test' if args.test else 'all' #dataset = 'test' import matplotlib.pyplot as plt @@ -62,7 +65,7 @@ from matplotlib import rc rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) rc('text', usetex=True) -from datasets import get_data, tag, kmeans_weighter, training_selection +from datasets import tag, pre_process_data import os mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag) @@ -74,52 +77,12 @@ os.mkdirs(plots) from features import * - -if args.what == 'seeding': - features = seed_features - additional = seed_additional -elif args.what == 'fullseeding': - features = fullseed_features - additional = seed_additional -elif args.what == 'id': - features = id_features - additional = id_additional -else: - raise ValueError() +features, additional = get_features(args.what) fields = features+labeling+additional if 'gsf_pt' not in fields : fields += ['gsf_pt'] -data = pd.DataFrame( - get_data(dataset, fields) -) -data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons -#ensure that there is at least the GSF and a track within meaningful boundaries -data = data[training_selection(data)] -data['training_out'] = -1 -data['log_trkpt'] = np.log10(data.trk_pt) -#convert bools to integers -for c in 
features: - if data[c].dtype == np.dtype('bool'): - data[c] = data[c].astype(int) - - -#apply pt-eta reweighting -## from hep_ml.reweight import GBReweighter -## from sklearn.externals import joblib -## reweighter = joblib.load('%s/%s_reweighting.pkl' % (mods, dataset)) -## weights = reweighter.predict_weights(data[['trk_pt', 'trk_eta']]) -weights = kmeans_weighter( - data[['log_trkpt', 'trk_eta']], - '%s/kmeans_%s_weighter.plk' % (mods, dataset) - ) -data['weight'] = weights*data.is_e + np.invert(data.is_e) - -#add baseline seeding (for seeding only) -if args.what in ['seeding', 'fullseeding']: - data['baseline'] = ( - data.preid_trk_ecal_match | - (np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass) - ) + +data = pre_process_data(dataset, fields, args.what in ['seeding', 'fullseeding']) from sklearn.model_selection import train_test_split train_test, validation = train_test_split(data, test_size=0.2, random_state=42) @@ -170,8 +133,8 @@ rocs = {} for df, name in [ - (train, 'train'), - (test, 'test'), + ##(train, 'train'), + ##(test, 'test'), (validation, 'validation') ]: training_out = clf.predict_proba(df[features].as_matrix())[:, 1] @@ -194,17 +157,27 @@ if args.what in ['seeding', 'fullseeding']: eff = float((data.baseline & data.is_e).sum())/data.is_e.sum() mistag = float((data.baseline & np.invert(data.is_e)).sum())/np.invert(data.is_e).sum() - plt.plot([mistag], [eff], 'o', label='baseline', markersize=5) -elif args.what == 'id': + rocs['baseline'] = [[mistag], [eff]] + plt.plot([mistag], [eff], 'o', label='baseline', markersize=5) +elif 'id' in args.what: mva_v1 = roc_curve(validation.is_e, validation.ele_mvaIdV1)[:2] mva_v2 = roc_curve(validation.is_e, validation.ele_mvaIdV2)[:2] mva_v1_auc = roc_auc_score(validation.is_e, validation.ele_mvaIdV1) mva_v2_auc = roc_auc_score(validation.is_e, validation.ele_mvaIdV2) + rocs['mva_v1'] = mva_v1 + rocs['mva_v2'] = mva_v2 plt.plot(*mva_v1, label='MVA ID V1 (AUC: %.2f)' % mva_v1_auc) plt.plot(*mva_v2, label='MVA ID V2 (AUC: %.2f)' % mva_v2_auc) else: raise ValueError() +for key in rocs: + fpr, tpr = rocs[key] + rocs[key] = [list(fpr), list(tpr)] + +with open('%s/%s_%s_%s_ROCS.json' % (plots, dataset, args.jobtag, args.what), 'w') as rr: + rr.write(json.dumps(rocs)) + plt.xlabel('Mistag Rate') plt.ylabel('Efficiency') plt.legend(loc='best') diff --git a/macros/train_bdt_xgbo.py b/macros/train_bdt_xgbo.py index f1187cd54ec14..c049e32a82922 100644 --- a/macros/train_bdt_xgbo.py +++ b/macros/train_bdt_xgbo.py @@ -6,9 +6,7 @@ from pdb import set_trace parser = ArgumentParser() -parser.add_argument( - 'what', choices=['seeding', 'fullseeding', 'id'], -) +parser.add_argument('what') parser.add_argument( '--jobtag', default='', type=str ) @@ -27,16 +25,10 @@ from matplotlib import rc rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) rc('text', usetex=True) -from datasets import get_data, tag, kmeans_weighter, training_selection +from datasets import tag, pre_process_data, get_models_dir import os -if 'CMSSW_BASE' not in os.environ: - cmssw_path = dir_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0] - os.environ['CMSSW_BASE'] = cmssw_path - -mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag) -if not os.path.isdir(mods): - os.makedirs(mods) +mods = get_models_dir() opti_dir = '%s/bdt_bo_%s' % (mods, args.what) if not os.path.isdir(opti_dir): @@ -47,46 +39,11 @@ os.makedirs(plots) from features import * - -if args.what 
== 'seeding':
-   features = seed_features
-   additional = seed_additional
-elif args.what == 'fullseeding':
-   features = fullseed_features
-   additional = seed_additional
-elif args.what == 'id':
-   features = id_features
-   additional = id_additional
-else:
-   raise ValueError()
+features, additional = get_features(args.what)
 
 fields = features+labeling+additional
 if 'gsf_pt' not in fields : fields += ['gsf_pt']
-data = pd.DataFrame(
-   get_data(dataset, fields)
-)
-data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons
-data = data[training_selection(data)]
-data['training_out'] = -1
-data['log_trkpt'] = np.log10(data.trk_pt)
-#convert bools to integers
-for c in features:
-   if data[c].dtype == np.dtype('bool'):
-      data[c] = data[c].astype(int)
-
-#apply pt-eta reweighting
-weights = kmeans_weighter(
-   data[['log_trkpt', 'trk_eta']],
-   '%s/kmeans_%s_weighter.plk' % (mods, dataset)
-   )
-data['weight'] = weights*data.is_e + np.invert(data.is_e)
-
-#add baseline seeding (for seeding only)
-if args.what in ['seeding', 'fullseeding']:
-   data['baseline'] = (
-      data.preid_trk_ecal_match | 
-      (np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass)
-   )
+data = pre_process_data(dataset, fields, args.what in ['seeding', 'fullseeding'])
 
 from sklearn.model_selection import train_test_split
 train, test = train_test_split(data, test_size=0.2, random_state=42)
diff --git a/macros/train_nn.py b/macros/train_nn.py
index 464b46e3d5697..39236bc8cbd89 100644
--- a/macros/train_nn.py
+++ b/macros/train_nn.py
@@ -6,10 +6,7 @@
 from pdb import set_trace
 
 parser = ArgumentParser()
-parser.add_argument(
-   'what', choices=['seeding', 'fullseeding', 'id'],
-)
-
+parser.add_argument('what')
 parser.add_argument(
    '--jobtag', default='', type=str
 )
@@ -51,10 +48,10 @@
 from matplotlib import rc
 rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
 #rc('text', usetex=True)
-from datasets import get_data, tag, kmeans_weighter, training_selection
+from datasets import tag, pre_process_data
 import os
 
 cmssw_path = dir_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0]
 os.environ['CMSSW_BASE'] = cmssw_path
 mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag)
 if not os.path.isdir(mods):
    os.makedirs(mods)
@@ -72,48 +69,11 @@
    os.makedirs(plots)
 
 from features import *
-if args.what == 'seeding':
-   features = seed_features
-   additional = seed_additional
-elif args.what == 'fullseeding':
-   features = fullseed_features
-   additional = seed_additional
-elif args.what == 'id':
-   features = id_features
-   additional = id_additional
-else:
-   raise ValueError()
+features, additional = get_features(args.what)
 
-data = pd.DataFrame(
-   get_data(dataset, features+labeling+additional)
-)
-data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons
-data = data[training_selection(data)]
-data['training_out'] = -1
-data['log_trkpt'] = np.log10(data.trk_pt)
-#convert bools to integers
-for c in features:
-   if data[c].dtype == np.dtype('bool'):
-      data[c] = data[c].astype(int)
-
-
-#apply pt-eta reweighting
-## from hep_ml.reweight import GBReweighter
-## from sklearn.externals import joblib
-## reweighter = joblib.load('%s/%s_reweighting.pkl' % (mods, dataset))
-## weights = reweighter.predict_weights(data[['trk_pt', 'trk_eta']])
-weights = kmeans_weighter(
-   data[['log_trkpt', 'trk_eta']],
-   '%s/kmeans_%s_weighter.plk' % 
(mods, dataset) - ) -data['weight'] = weights*data.is_e + np.invert(data.is_e) - -#add baseline seeding (for seeding only) -if args.what in ['seeding', 'fullseeding']: - data['baseline'] = ( - data.preid_trk_ecal_match | - (np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass) - ) +fields = features+labeling+additional +if 'gsf_pt' not in fields : fields += ['gsf_pt'] +data = pre_process_data(dataset, fields, args.what in ['seeding', 'fullseeding']) from sklearn.model_selection import train_test_split train, test = train_test_split(data, test_size=0.2, random_state=42) diff --git a/macros/train_nn_bo.py b/macros/train_nn_bo.py index 68c1ef6c75142..6c69390f76149 100644 --- a/macros/train_nn_bo.py +++ b/macros/train_nn_bo.py @@ -7,9 +7,7 @@ import os parser = ArgumentParser() -parser.add_argument( - 'what', choices=['seeding', 'fullseeding', 'id'], -) +parser.add_argument('what') parser.add_argument( '--jobtag', default='', type=str @@ -58,7 +56,7 @@ from matplotlib import rc rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) #rc('text', usetex=True) -from datasets import get_data, tag, kmeans_weighter, training_selection +from datasets import tag, pre_process_data plots = '%s/src/LowPtElectrons/LowPtElectrons/macros/plots/%s/' % (os.environ['CMSSW_BASE'], tag) if not os.path.isdir(plots): @@ -73,56 +71,11 @@ os.makedirs(opti_dir) from features import * -if args.what == 'seeding': - features = seed_features - additional = seed_additional -elif args.what == 'fullseeding': - features = fullseed_features - additional = seed_additional -elif args.what == 'id': - features = id_features - additional = id_additional+['gsf_ecal_cluster_ematrix'] -else: - raise ValueError() - -data_dict = get_data(dataset, features+labeling+additional) -multi_dim = data_dict.pop('gsf_ecal_cluster_ematrix', None) -data = pd.DataFrame(data_dict) -if args.what == 'id': - flattened = pd.DataFrame(multi_dim.reshape(multi_dim.shape[0], -1)) - new_features = ['crystal_%d' % i for i in range(len(flattened.columns))] - flattened.columns = new_features - features += new_features - data = pd.concat([data, flattened], axis=1) - -data_mask = (np.invert(data.is_e_not_matched) & training_selection(data)) -data = data[data_mask] -multi_dim = multi_dim[data_mask] -data['training_out'] = -1 -data['log_trkpt'] = np.log10(data.trk_pt) -#convert bools to integers -for c in features: - if data[c].dtype == np.dtype('bool'): - data[c] = data[c].astype(int) - - -#apply pt-eta reweighting -## from hep_ml.reweight import GBReweighter -## from sklearn.externals import joblib -## reweighter = joblib.load('%s/%s_reweighting.pkl' % (mods, dataset)) -## weights = reweighter.predict_weights(data[['trk_pt', 'trk_eta']]) -weights = kmeans_weighter( - data[['log_trkpt', 'trk_eta']], - '%s/kmeans_%s_weighter.plk' % (mods, dataset) - ) -data['weight'] = weights*data.is_e + np.invert(data.is_e) +features, additional = get_features(args.what) -#add baseline seeding (for seeding only) -if args.what in ['seeding', 'fullseeding']: - data['baseline'] = ( - data.preid_trk_ecal_match | - (np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass) - ) +fields = features+labeling+additional +if 'gsf_pt' not in fields : fields += ['gsf_pt'] +data = pre_process_data(dataset, fields, args.what in ['seeding', 'fullseeding']) from sklearn.model_selection import train_test_split train, test = train_test_split(data, test_size=0.2, random_state=42) diff --git a/run/mc_features.py 
b/run/mc_features.py index 247dc125a5a02..c061c528471eb 100644 --- a/run/mc_features.py +++ b/run/mc_features.py @@ -225,6 +225,8 @@ ################################################################################ # Path and EndPath definitions, TFileService, OutputModule +process.dumper = cms.EDAnalyzer('EventContentDumper') +process.EIsequence *= process.dumper # ReReco and ntuplize process.raw2digi_step = cms.Path(process.RawToDigi) process.reconstruction_step = cms.Path(process.reconstruction)
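
Note on PATCH 09: the weighting convention is inverted there. Each KMeans cluster c in the (log trk_pt, trk_eta) plane gets weight N_sig(c)/N_bkg(c), applied to background tracks only, so that the weighted background matches the electron kinematics cluster by cluster. A toy illustration of the arithmetic (made-up numbers, not from the samples):

    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({
       'cluster': [0, 0, 0, 1, 1],
       'is_e':    [True, False, False, True, False],
    })
    weights = {}
    for cluster, group in toy.groupby('cluster'):
       nsig = group.is_e.sum()
       nbkg = np.invert(group.is_e).sum()
       weights[cluster] = float(nsig)/nbkg   # cluster 0: 1/2, cluster 1: 1/1
    # electrons keep weight 1, background takes its cluster weight,
    # mirroring data['weight'] = np.invert(is_e)*apply_weight(...) + is_e
    toy['weight'] = np.where(toy.is_e, 1., toy.cluster.map(weights))
    # the weighted background now integrates to the electron count per cluster
    assert (toy[~toy.is_e].groupby('cluster').weight.sum() ==
            toy[toy.is_e].groupby('cluster').size()).all()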
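Note on PATCH 13: after the refactoring, every training entry point is expected to follow the same call sequence. A minimal sketch of that flow, stitched together from the train_bdt.py hunks above ('id' and 'all' are stand-in arguments; everything else comes verbatim from the patch):

    from features import get_features, labeling
    from datasets import pre_process_data
    from sklearn.model_selection import train_test_split

    # resolve the feature collection by name
    # ('seeding', 'fullseeding', 'id' or 'mva_id')
    features, additional = get_features('id')
    fields = features+labeling+additional
    if 'gsf_pt' not in fields : fields += ['gsf_pt']

    # selection, log(trk_pt), kmeans reweighting and bool->int conversion
    # now live in one place; the last argument adds the 'baseline' column
    # needed only by the seeding trainings
    data = pre_process_data('all', fields, False)
    train, test = train_test_split(data, test_size=0.2, random_state=42)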