Skip to content

Commit

Permalink
refactor code to avoid repetition
Browse files Browse the repository at this point in the history
  • Loading branch information
mauro verzetti committed Oct 16, 2018
1 parent 82260ea commit 2cb77e1
Show file tree
Hide file tree
Showing 7 changed files with 143 additions and 199 deletions.
56 changes: 55 additions & 1 deletion macros/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@
sets = sorted(list(sets), key=lambda x: -len(x))
input_files = {i : [] for i in sets}
input_files['all'] = all_sets
input_files['test'] = all_sets[:1]
for inf in all_sets:
for name in sets:
if os.path.basename(inf).startswith(name):
input_files[name].append(inf)
break
input_files['test'] = input_files['BToKee'][:1]


dataset_names = {
Expand All @@ -34,6 +34,16 @@
import uproot
import numpy as np

def get_models_dir():
    """Return the models directory for the current ``tag``, creating it if needed.

    When ``CMSSW_BASE`` is not set (e.g. running outside of cmsenv), derive it
    from this file's location: everything up to ``src/LowPtElectrons`` is taken
    as the CMSSW area and exported so later lookups agree.

    Returns the directory path as a string (with a trailing slash).
    """
    if 'CMSSW_BASE' not in os.environ:
        # Drop the unused 'dir_path' alias the original carried; only the
        # CMSSW area prefix is needed.
        cmssw_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0]
        os.environ['CMSSW_BASE'] = cmssw_path

    # NOTE: 'tag' is a module-level constant defined elsewhere in this file.
    mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag)
    if not os.path.isdir(mods):
        os.makedirs(mods)
    return mods

def get_data(dataset, columns, nthreads=2*multiprocessing.cpu_count(), exclude={}):
thread_pool = concurrent.futures.ThreadPoolExecutor(nthreads)
if dataset not in input_files:
Expand Down Expand Up @@ -93,3 +103,47 @@ def training_selection(df):
'ensures there is a GSF Track and a KTF track within eta/pt boundaries'
return (df.trk_pt > 0) & (df.trk_pt < 15) & (np.abs(df.trk_eta) < 2.4) & (df.gsf_pt > 0)

import pandas as pd
import numpy as np
def pre_process_data(dataset, features, for_seeding=False):
    """Load *dataset*, clean it, and return a training-ready DataFrame.

    Steps: read the requested branches, flatten the ECAL crystal matrix (if
    requested) into scalar columns, drop non-matched electrons, apply the
    kinematic training selection, compute kmeans pt/eta weights, optionally
    add the baseline seeding decision, and cast bool columns to int.

    Parameters:
        dataset (str): key into the input-file catalogue used by get_data_sync.
        features (list[str]): branch names to read; 'trk_pt', 'gsf_pt' and
            'trk_eta' are always added since the selection/weighting need them.
        for_seeding (bool): if True, also compute the 'baseline' column.

    Returns:
        pandas.DataFrame with the requested (possibly extended) features plus
        'training_out', 'log_trkpt' and 'weight' columns.
    """
    mods = get_models_dir()
    # de-duplicate and force in the columns required below
    features = list(set(features+['trk_pt', 'gsf_pt', 'trk_eta']))
    data_dict = get_data_sync(dataset, features)
    if 'gsf_ecal_cluster_ematrix' in features:
        # multi-dimensional branch cannot live in a flat DataFrame; pop it out
        multi_dim = data_dict.pop('gsf_ecal_cluster_ematrix', None)
    data = pd.DataFrame(data_dict)
    if 'gsf_ecal_cluster_ematrix' in features:
        # flatten the per-event crystal energy matrix into crystal_<i> columns
        flattened = pd.DataFrame(multi_dim.reshape(multi_dim.shape[0], -1))
        new_features = ['crystal_%d' % i for i in range(len(flattened.columns))]
        flattened.columns = new_features
        features += new_features
        data = pd.concat([data, flattened], axis=1)

    data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons
    # keep only tracks/GSF within the meaningful training phase space
    data = data[training_selection(data)]
    data['training_out'] = -1
    data['log_trkpt'] = np.log10(data.trk_pt)

    #apply pt-eta reweighting
    ## from hep_ml.reweight import GBReweighter
    ## from sklearn.externals import joblib
    ## reweighter = joblib.load('%s/%s_reweighting.pkl' % (mods, dataset))
    ## weights = reweighter.predict_weights(data[['trk_pt', 'trk_eta']])
    weights = kmeans_weighter(
        data[['log_trkpt', 'trk_eta']],
        '%s/kmeans_%s_weighter.plk' % (mods, dataset)
        )
    # NOTE(review): here the kmeans weights are applied to the BACKGROUND
    # (not is_e) and signal gets weight 1, whereas the pre-refactor training
    # script applied them to the signal (weights*is_e + invert(is_e)).
    # Confirm which orientation is intended.
    data['weight'] = weights*np.invert(data.is_e) + data.is_e

    #add baseline seeding (for seeding only)
    if for_seeding:
        # baseline = ECAL-matched track, OR passes track filter + preid MVA
        data['baseline'] = (
            data.preid_trk_ecal_match |
            (np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass)
        )

    #convert bools to integers (BDT trainers expect numeric columns)
    for c in features:
        if data[c].dtype == np.dtype('bool'):
            data[c] = data[c].astype(int)
    return data
45 changes: 45 additions & 0 deletions macros/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,48 @@
'gen_phi',
'gen_charge',
]

# Input branches for the electron MVA ID training (ntuple column names).
# The trailing commented entries are variables not yet available in the ntuple.
mva_id_inputs = [
    'rho',
    'ele_pt',
    'sc_eta',
    'shape_full5x5_sigmaIetaIeta',
    'shape_full5x5_sigmaIphiIphi',
    'shape_full5x5_circularity',
    'shape_full5x5_r9',
    'sc_etaWidth',
    'sc_phiWidth',
    'shape_full5x5_HoverE',
    'trk_nhits',
    'trk_chi2red',
    'gsf_chi2red',
    'brem_frac',
    'gsf_nhits',
    'match_SC_EoverP',
    'match_eclu_EoverP',
    'match_SC_dEta',   # should be abs
    'match_SC_dPhi',   # should be abs
    'match_seed_dEta', # should be abs
    'sc_E',
    'trk_p',
    #ele_expected_inner_hits gsfTrack.hitPattern.numberOfLostHits('MISSING_INNER_HITS') None None
    #ele_conversionVertexFitProbability electronMVAVariableHelper:convVtxFitProb None None
    #ele_IoEmIop 1.0/ecalEnergy-1.0/trackMomentumAtVtx.R None None
]

def get_features(ftype):
    """Look up the (features, additional) column lists for a feature-set name.

    Known names: 'seeding', 'fullseeding', 'id', 'mva_id'.
    Raises ValueError for anything else.
    """
    catalog = {
        'seeding':     (seed_features,     seed_additional),
        'fullseeding': (fullseed_features, seed_additional),
        'id':          (id_features,       id_additional),
        'mva_id':      (mva_id_inputs,     id_additional),
    }
    if ftype not in catalog:
        raise ValueError('%s is not among the possible feature collection' % ftype)
    return catalog[ftype]
73 changes: 23 additions & 50 deletions macros/train_bdt.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@

parser = ArgumentParser()
parser.add_argument(
'what', choices=['seeding', 'fullseeding', 'id'],
'what'
)
parser.add_argument(
'--test', action='store_true'
)
parser.add_argument(
'--jobtag', default='', type=str
Expand Down Expand Up @@ -50,7 +53,7 @@
)

args = parser.parse_args()
dataset = 'all'
dataset = 'test' if args.test else 'all'
#dataset = 'test'

import matplotlib.pyplot as plt
Expand All @@ -62,7 +65,7 @@
from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
rc('text', usetex=True)
from datasets import get_data, tag, kmeans_weighter, training_selection
from datasets import tag, pre_process_data
import os

mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag)
Expand All @@ -74,52 +77,12 @@
os.makedirs(plots)  # fix: os.mkdirs does not exist; os.makedirs is the stdlib call

from features import *

if args.what == 'seeding':
features = seed_features
additional = seed_additional
elif args.what == 'fullseeding':
features = fullseed_features
additional = seed_additional
elif args.what == 'id':
features = id_features
additional = id_additional
else:
raise ValueError()
features, additional = get_features(args.what)

fields = features+labeling+additional
if 'gsf_pt' not in fields : fields += ['gsf_pt']
data = pd.DataFrame(
get_data(dataset, fields)
)
data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons
#ensure that there is at least the GSF and a track within meaningful boundaries
data = data[training_selection(data)]
data['training_out'] = -1
data['log_trkpt'] = np.log10(data.trk_pt)
#convert bools to integers
for c in features:
if data[c].dtype == np.dtype('bool'):
data[c] = data[c].astype(int)


#apply pt-eta reweighting
## from hep_ml.reweight import GBReweighter
## from sklearn.externals import joblib
## reweighter = joblib.load('%s/%s_reweighting.pkl' % (mods, dataset))
## weights = reweighter.predict_weights(data[['trk_pt', 'trk_eta']])
weights = kmeans_weighter(
data[['log_trkpt', 'trk_eta']],
'%s/kmeans_%s_weighter.plk' % (mods, dataset)
)
data['weight'] = weights*data.is_e + np.invert(data.is_e)

#add baseline seeding (for seeding only)
if args.what in ['seeding', 'fullseeding']:
data['baseline'] = (
data.preid_trk_ecal_match |
(np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass)
)

data = pre_process_data(dataset, fields, args.what in ['seeding', 'fullseeding'])

from sklearn.model_selection import train_test_split
train_test, validation = train_test_split(data, test_size=0.2, random_state=42)
Expand Down Expand Up @@ -170,8 +133,8 @@

rocs = {}
for df, name in [
(train, 'train'),
(test, 'test'),
##(train, 'train'),
##(test, 'test'),
(validation, 'validation')
]:
training_out = clf.predict_proba(df[features].as_matrix())[:, 1]
Expand All @@ -194,17 +157,27 @@
if args.what in ['seeding', 'fullseeding']:
eff = float((data.baseline & data.is_e).sum())/data.is_e.sum()
mistag = float((data.baseline & np.invert(data.is_e)).sum())/np.invert(data.is_e).sum()
plt.plot([mistag], [eff], 'o', label='baseline', markersize=5)
elif args.what == 'id':
rocs['baseline'] = [[mistag], [eff]]
plt.plot([mistag], [eff], 'o', label='baseline', markersize=5)
elif 'id' in args.what:
mva_v1 = roc_curve(validation.is_e, validation.ele_mvaIdV1)[:2]
mva_v2 = roc_curve(validation.is_e, validation.ele_mvaIdV2)[:2]
mva_v1_auc = roc_auc_score(validation.is_e, validation.ele_mvaIdV1)
mva_v2_auc = roc_auc_score(validation.is_e, validation.ele_mvaIdV2)
rocs['mva_v1'] = mva_v1
rocs['mva_v2'] = mva_v2
plt.plot(*mva_v1, label='MVA ID V1 (AUC: %.2f)' % mva_v1_auc)
plt.plot(*mva_v2, label='MVA ID V2 (AUC: %.2f)' % mva_v2_auc)
else:
raise ValueError()

for key in rocs:
fpr, tpr = rocs[key]
rocs[key] = [list(fpr), list(tpr)]

with open('%s/%s_%s_%s_ROCS.json' % (plots, dataset, args.jobtag, args.what), 'w') as rr:
rr.write(json.dumps(rocs))

plt.xlabel('Mistag Rate')
plt.ylabel('Efficiency')
plt.legend(loc='best')
Expand Down
53 changes: 5 additions & 48 deletions macros/train_bdt_xgbo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@
from pdb import set_trace

parser = ArgumentParser()
parser.add_argument(
'what', choices=['seeding', 'fullseeding', 'id'],
)
parser.add_argument('what')
parser.add_argument(
'--jobtag', default='', type=str
)
Expand All @@ -27,16 +25,10 @@
from matplotlib import rc
rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
rc('text', usetex=True)
from datasets import get_data, tag, kmeans_weighter, training_selection
from datasets import tag, pre_process_data, get_models_dir
import os

if 'CMSSW_BASE' not in os.environ:
cmssw_path = dir_path = os.path.dirname(os.path.realpath(__file__)).split('src/LowPtElectrons')[0]
os.environ['CMSSW_BASE'] = cmssw_path

mods = '%s/src/LowPtElectrons/LowPtElectrons/macros/models/%s/' % (os.environ['CMSSW_BASE'], tag)
if not os.path.isdir(mods):
os.makedirs(mods)
mods = get_models_dir()

opti_dir = '%s/bdt_bo_%s' % (mods, args.what)
if not os.path.isdir(opti_dir):
Expand All @@ -47,46 +39,11 @@
os.makedirs(plots)

from features import *

if args.what == 'seeding':
features = seed_features
additional = seed_additional
elif args.what == 'fullseeding':
features = fullseed_features
additional = seed_additional
elif args.what == 'id':
features = id_features
additional = id_additional
else:
raise ValueError()
features, additional = get_features(args.what)

fields = features+labeling+additional
if 'gsf_pt' not in fields : fields += ['gsf_pt']
data = pd.DataFrame(
get_data(dataset, fields)
)
data = data[np.invert(data.is_e_not_matched)] #remove non-matched electrons
data = data[training_selection(data)]
data['training_out'] = -1
data['log_trkpt'] = np.log10(data.trk_pt)
#convert bools to integers
for c in features:
if data[c].dtype == np.dtype('bool'):
data[c] = data[c].astype(int)

#apply pt-eta reweighting
weights = kmeans_weighter(
data[['log_trkpt', 'trk_eta']],
'%s/kmeans_%s_weighter.plk' % (mods, dataset)
)
data['weight'] = weights*data.is_e + np.invert(data.is_e)

#add baseline seeding (for seeding only)
if args.what in ['seeding', 'fullseeding']:
data['baseline'] = (
data.preid_trk_ecal_match |
(np.invert(data.preid_trk_ecal_match) & data.preid_trkfilter_pass & data.preid_mva_pass)
)
data = pre_process_data(dataset, fields, args.what in ['seeding', 'fullseeding'])

from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42)
Expand Down
Loading

0 comments on commit 2cb77e1

Please sign in to comment.