From e9eeca4ed20793e719434d2e0a6fc10b06439989 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 26 Oct 2020 18:27:47 +0100
Subject: [PATCH 01/16] adding validation tool (KS test)

---
 Production/scripts/validation_tool.py | 151 ++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100755 Production/scripts/validation_tool.py

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
new file mode 100755
index 00000000000..a90d6f1a615
--- /dev/null
+++ b/Production/scripts/validation_tool.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+import argparse
+parser = argparse.ArgumentParser('''
+The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
+If a KS test is below the threshold, a warning message is printed on screen.
+NOTE: pvalue = 99 means that one of the two histograms is empty.
+''')
+
+import ROOT
+import glob
+import json
+from collections import OrderedDict
+
+parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns')
+parser.add_argument('--output' , required = True, type = str, help = 'output file name')
+parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
+parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
+parser.add_argument('--nsplit' , default = 5 , type = str, help = 'number of chunks per file')
+parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
+
+parser.add_argument('--visual', action = 'store_true', help = 'Won\'t run the script in batch mode')
+parser.add_argument('--legend', action = 'store_true', help = 'Draw a TLegend on canvases')
+args = parser.parse_args()
+
+import os
+if not os.path.exists(args.pdf):
+    os.makedirs(args.pdf)
+
+ROOT.gROOT.SetBatch(not args.visual)
+ROOT.gStyle.SetOptStat(0)
+
+JSON_DICT = OrderedDict()
+OUTPUT_ROOT = ROOT.TFile.Open('{}'.format(args.output), 'RECREATE')
+OUTPUT_JSON = open(args.json, 'w')
+N_SPLITS = args.nsplit
+PVAL_THRESHOLD = args.pvthreshold
+
+## binning of tested variables (cannot use unbinned distributions with python before root 6.18)
+BINS = {
+    'tau_pt' : (50, 0, 5000),
+    'tau_eta' : (5, -2.5, 2.5),
+    'lepton_gen_match' : (20, -1, 19),
+    'sampleType': (20, -1, 19),
+    'dataset_id': (20, -1, 19),
+    'dataset_group_id': (20, -1, 19),
+}
+
+def groupby(dataframe, by):
+    _ = dataframe.Histo1D(by)
+    hist = _.GetValue()
+    hist.ClearUnderflowAndOverflow()
+    types = list(set([round(hist.GetBinCenter(jj)) for jj in range(hist.GetNbinsX()) if hist.GetBinContent(jj)]))
+
+    return {tt: dataframe.Filter('{} == {}'.format(by, tt)) for tt in types}
+
+def get_histos(dataframe, branch, norm = False):
+    size = int(dataframe.Count())
+    sub_size = 1 + size / N_SPLITS
+    subframes = [dataframe.Range(ii*sub_size, (ii+1)*sub_size) for ii in range(N_SPLITS)]
+
+    model = (branch, "") + BINS[branch]
+    hptrs = [sf.Histo1D(model, branch) for sf in subframes]
+    histos = [hh.GetValue().Clone() for hh in hptrs]
+
+    for hh in histos:
+        hh.SetTitle(branch)
+        hh.Sumw2()
+        hh.ClearUnderflowAndOverflow()
+        if norm and hh.Integral():
+            hh.Scale(1. / hh.Integral())
+
+    return histos
+
+def save_histos(histos, fdir, pvalues):
+    OUTPUT_ROOT.cd()
+    OUTPUT_ROOT.cd(fdir)
+
+    can = ROOT.TCanvas()
+    leg = ROOT.TLegend(0.9, 0.1, 1., 0.9, "p-values (KS with the first chunk)")
+
+    histos[0].GetYaxis().SetRangeUser(0, 1.1*max(hh.GetMaximum() for hh in histos))
+    histos[0].SetMarkerStyle(20)
+
+    for ii, hh in enumerate(histos):
+        hh.SetLineColor(ii+1)
+        hh.SetMarkerColor(ii+1)
+        leg.AddEntry(hh, 'chunk %d - pval = %.3f' %(ii, pvalues[ii]), 'lep')
+        hh.Draw("PE" + " SAME" * (ii != 0))
+        hh.Write()
+    if args.legend:
+        leg.Draw("SAME")
+    if args.pdf is not None:
+        can.SaveAs('{}/{}.pdf'.format(args.pdf, fdir.replace('/', '_')), 'pdf')
+    can.Write()
+    OUTPUT_ROOT.cd()
+
+def run_validation(dataframe, branches, pwd = ''):
+    OUTPUT_ROOT.cd()
+    OUTPUT_ROOT.mkdir(pwd)
+
+    if not pwd in JSON_DICT.keys():
+        JSON_DICT[pwd] = OrderedDict()
+
+    for branch in branches:
+        OUTPUT_ROOT.cd()
+        OUTPUT_ROOT.mkdir('/'.join([pwd, branch]))
+
+        histos = get_histos(dataframe, branch = branch, norm = True)
+
+        pvalues = [histos[0].KolmogorovTest(hh) if histos[0].Integral()*hh.Integral() else 99 for hh in histos]
+        if not histos[0].Integral():
+            print '[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd)
+
+        if not all([pv >= PVAL_THRESHOLD for pv in pvalues]):
+            print '[WARNING] KS test failed for step {} inside {}. p-values are:'.format(branch, pwd)
+            print '\t', pvalues
+
+        JSON_DICT[pwd][branch] = pvalues
+
+        save_histos(histos, fdir = '/'.join([pwd, branch]), pvalues = pvalues)
+
+if __name__ == '__main__':
+    print '[INFO] reading files', args.input
+    input_files = ROOT.std.vector('std::string')()
+    for file in glob.glob(args.input):
+        input_files.push_back(str(file))
+
+    main_dir = 'KS_test'
+
+    ## first, run on plain columns
+    dataframe = ROOT.RDataFrame('taus', input_files)
+    run_validation(dataframe = dataframe, pwd = main_dir, branches = ['lepton_gen_match', 'sampleType', 'dataset_group_id'])
+
+    ## then, group by tau type
+    tau_type_dataframes = groupby(dataframe = dataframe, by = 'lepton_gen_match')
+    for ii, df in tau_type_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'lepton_gen_match', str(ii)]), branches = ['tau_pt', 'tau_eta'])
+
+    ## then, group by sample type
+    sample_type_dataframes = groupby(dataframe = dataframe, by = 'sampleType')
+    for ii, df in sample_type_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'sampleType', str(ii)]), branches = ['tau_pt', 'tau_eta'])
+
+    ## then, group by dataset group id
+    group_id_dataframes = groupby(dataframe = dataframe, by = 'dataset_group_id')
+    for ii, df in group_id_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'dataset_group_id', str(ii)]), branches = ['tau_pt', 'tau_eta', 'dataset_id'])
+
+    OUTPUT_ROOT.Close()
+    json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
+    print '[INFO] all done. Files', args.output, 'and', args.json, 'have been created'
\ No newline at end of file
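The check at the heart of this patch is ROOT's `TH1::KolmogorovTest`, applied between chunk 0 and every other chunk of the dataframe. A minimal, self-contained sketch of that single call, outside of any RDataFrame machinery (a working PyROOT installation is assumed; histogram names and toy values are illustrative, not part of the patch):

```python
import ROOT

# Two toy chunks drawn from the same parent distribution: the binned KS
# test should then return a large p-value.
h0 = ROOT.TH1F('chunk_0', 'control chunk', 50, -5, 5)
h1 = ROOT.TH1F('chunk_1', 'tested chunk', 50, -5, 5)

rng = ROOT.TRandom3(1234)
for _ in range(10000):
    h0.Fill(rng.Gaus(0., 1.))
    h1.Fill(rng.Gaus(0., 1.))

# Normalise before testing, as validation_tool.py does when norm = True.
for hh in (h0, h1):
    hh.Sumw2()
    hh.Scale(1. / hh.Integral())

pvalue = h0.KolmogorovTest(h1)
print('p-value = %.3f' % pvalue)  # compared against --pvthreshold in the script
```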
From 0ef725bb86813b0a0f460477302db59f45ceca16 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 26 Oct 2020 18:45:47 +0100
Subject: [PATCH 02/16] adding a comment

---
 Production/scripts/validation_tool.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index a90d6f1a615..1109c1cf821 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -3,6 +3,7 @@
 parser = argparse.ArgumentParser('''
 The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
 If a KS test is below the threshold, a warning message is printed on screen.
+NOTE: the binning of each variable must be hard coded in the script (using the BINS dictionary)
 NOTE: pvalue = 99 means that one of the two histograms is empty.
 ''')
 
@@ -106,7 +107,7 @@ def run_validation(dataframe, branches, pwd = ''):
         OUTPUT_ROOT.mkdir('/'.join([pwd, branch]))
 
         histos = get_histos(dataframe, branch = branch, norm = True)
-
+
         pvalues = [histos[0].KolmogorovTest(hh) if histos[0].Integral()*hh.Integral() else 99 for hh in histos]
         if not histos[0].Integral():
             print '[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd)

From 5f7c1a91bd1fa2fd0254d30e8aeb25fdf8ce485d Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 26 Oct 2020 18:49:03 +0100
Subject: [PATCH 03/16] small fix

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 1109c1cf821..be76f30ab15 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -95,7 +95,7 @@ def save_histos(histos, fdir, pvalues):
     can.Write()
     OUTPUT_ROOT.cd()
 
-def run_validation(dataframe, branches, pwd = ''):
+def run_validation(dataframe, branches, pwd):
     OUTPUT_ROOT.cd()
     OUTPUT_ROOT.mkdir(pwd)
 

From eec9866fa4abd51383fa93b679118287edf911b1 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 10:35:01 +0100
Subject: [PATCH 04/16] integer types from groupby method

---
 Production/scripts/validation_tool.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index be76f30ab15..0817c5f420d 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -51,6 +51,7 @@ def groupby(dataframe, by):
     hist = _.GetValue()
     hist.ClearUnderflowAndOverflow()
     types = list(set([round(hist.GetBinCenter(jj)) for jj in range(hist.GetNbinsX()) if hist.GetBinContent(jj)]))
+    types = [int(tt) for tt in types]
 
     return {tt: dataframe.Filter('{} == {}'.format(by, tt)) for tt in types}
 

From cb5ae3e1281c9f798d890fc91a421eb34d9241e8 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 10:35:32 +0100
Subject: [PATCH 05/16] numbered histograms

---
 Production/scripts/validation_tool.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 0817c5f420d..4a883ecd1da 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -64,8 +64,9 @@ def get_histos(dataframe, branch, norm = False):
     hptrs = [sf.Histo1D(model, branch) for sf in subframes]
     histos = [hh.GetValue().Clone() for hh in hptrs]
 
-    for hh in histos:
+    for ii, hh in enumerate(histos):
         hh.SetTitle(branch)
+        hh.SetName('_'.join([hh.GetName(), ii]))
         hh.Sumw2()
         hh.ClearUnderflowAndOverflow()
         if norm and hh.Integral():

From e3a93ac91f46d27a70b9d4494bdc41a38fe9f641 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 10:37:16 +0100
Subject: [PATCH 06/16] adding dataset_id to main variables

---
 Production/scripts/validation_tool.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 4a883ecd1da..4dc943043a6 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -132,7 +132,7 @@ def run_validation(dataframe, branches, pwd):
 
     ## first, run on plain columns
     dataframe = ROOT.RDataFrame('taus', input_files)
-    run_validation(dataframe = dataframe, pwd = main_dir, branches = ['lepton_gen_match', 'sampleType', 'dataset_group_id'])
+    run_validation(dataframe = dataframe, pwd = main_dir, branches = ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id'])
 
     ## then, group by tau type
     tau_type_dataframes = groupby(dataframe = dataframe, by = 'lepton_gen_match')
@@ -149,6 +149,11 @@ def run_validation(dataframe, branches, pwd):
     for ii, df in group_id_dataframes.iteritems():
         run_validation(dataframe = df, pwd = '/'.join([main_dir, 'dataset_group_id', str(ii)]), branches = ['tau_pt', 'tau_eta', 'dataset_id'])
 
+    ## then, group by dataset id
+    group_id_dataframes = groupby(dataframe = dataframe, by = 'dataset_id')
+    for ii, df in group_id_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'dataset_id', str(ii)]), branches = ['tau_pt', 'tau_eta'])
+
     OUTPUT_ROOT.Close()
     json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
     print '[INFO] all done. Files', args.output, 'and', args.json, 'have been created'
\ No newline at end of file

From bd576113aa71ed040e619dfd177c3dfec56566e3 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 14:54:57 +0100
Subject: [PATCH 07/16] comment

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 4dc943043a6..7608c63ea9f 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -12,7 +12,7 @@
 import json
 from collections import OrderedDict
 
-parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns')
+parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
 parser.add_argument('--output' , required = True, type = str, help = 'output file name')
 parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
 parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
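The "cast fix" that follows in patch 08 repairs a plain Python error introduced by patch 05 above: `str.join` only accepts strings, so joining a histogram name with an integer chunk index raises a `TypeError`. A two-line illustration with hypothetical values:

```python
name, ii = 'tau_pt', 3
# '_'.join([name, ii]) raises TypeError, since str.join expects string items
print('_'.join([name, str(ii)]))  # -> 'tau_pt_3', as after the fix
```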
From b12d21a4f45a162ec0eb4f5cdeb5f1e82ca16a2c Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 14:55:35 +0100
Subject: [PATCH 08/16] cast fix

---
 Production/scripts/validation_tool.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 7608c63ea9f..2e3b285a845 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -63,10 +63,10 @@ def get_histos(dataframe, branch, norm = False):
     model = (branch, "") + BINS[branch]
     hptrs = [sf.Histo1D(model, branch) for sf in subframes]
     histos = [hh.GetValue().Clone() for hh in hptrs]
-
+
     for ii, hh in enumerate(histos):
         hh.SetTitle(branch)
-        hh.SetName('_'.join([hh.GetName(), ii]))
+        hh.SetName('_'.join([hh.GetName(), str(ii)]))
         hh.Sumw2()
         hh.ClearUnderflowAndOverflow()
         if norm and hh.Integral():

From 7f604244e967ba25d9b6ab50dc1820bb76537f1c Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 14:56:07 +0100
Subject: [PATCH 09/16] n splits default 100

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 2e3b285a845..74e2630e188 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -16,7 +16,7 @@
 parser.add_argument('--output' , required = True, type = str, help = 'output file name')
 parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
 parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
-parser.add_argument('--nsplit' , default = 5 , type = str, help = 'number of chunks per file')
+parser.add_argument('--nsplit' , default = 100 , type = str, help = 'number of chunks per file')
 parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
 
 parser.add_argument('--visual', action = 'store_true', help = 'Won\'t run the script in batch mode')

From 6869fd165a1489e473864169a05588e5b48748a7 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:05:34 +0100
Subject: [PATCH 10/16] python3-like print

---
 Production/scripts/validation_tool.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 74e2630e188..753ac5c9b70 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -112,18 +112,18 @@ def run_validation(dataframe, branches, pwd):
 
         pvalues = [histos[0].KolmogorovTest(hh) if histos[0].Integral()*hh.Integral() else 99 for hh in histos]
         if not histos[0].Integral():
-            print '[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd)
+            print ('[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd))
 
         if not all([pv >= PVAL_THRESHOLD for pv in pvalues]):
-            print '[WARNING] KS test failed for step {} inside {}. p-values are:'.format(branch, pwd)
-            print '\t', pvalues
+            print ('[WARNING] KS test failed for step {} inside {}. p-values are:'.format(branch, pwd))
+            print ('\t', pvalues)
 
         JSON_DICT[pwd][branch] = pvalues
 
         save_histos(histos, fdir = '/'.join([pwd, branch]), pvalues = pvalues)
 
 if __name__ == '__main__':
-    print '[INFO] reading files', args.input
+    print ('[INFO] reading files', args.input)
     input_files = ROOT.std.vector('std::string')()
     for file in glob.glob(args.input):
         input_files.push_back(str(file))

From 200cb398c46d84b2b86385c2132c54a92878985c Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:13:35 +0100
Subject: [PATCH 11/16] changing output files structure

---
 Production/scripts/validation_tool.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 753ac5c9b70..6465a0fd7a0 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -13,9 +13,7 @@
 from collections import OrderedDict
 
 parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
-parser.add_argument('--output' , required = True, type = str, help = 'output file name')
-parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
-parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
+parser.add_argument('--output' , required = True, type = str, help = 'output directory name')
 parser.add_argument('--nsplit' , default = 100 , type = str, help = 'number of chunks per file')
 parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
 
@@ -24,15 +22,16 @@
 args = parser.parse_args()
 
 import os
-if not os.path.exists(args.pdf):
-    os.makedirs(args.pdf)
+pdf_dir = '/'.join([args.output, 'pdf'])
+if not os.path.exists(pdf_dir):
+    os.makedirs(pdf_dir)
 
 ROOT.gROOT.SetBatch(not args.visual)
 ROOT.gStyle.SetOptStat(0)
 
 JSON_DICT = OrderedDict()
-OUTPUT_ROOT = ROOT.TFile.Open('{}'.format(args.output), 'RECREATE')
-OUTPUT_JSON = open(args.json, 'w')
+OUTPUT_ROOT = ROOT.TFile.Open('{}/histograms.root'.format(args.output), 'RECREATE')
+OUTPUT_JSON = open('{}/pvalues.json'.format(args.output), 'w')
 N_SPLITS = args.nsplit
 PVAL_THRESHOLD = args.pvthreshold
 
@@ -92,8 +91,7 @@ def save_histos(histos, fdir, pvalues):
         hh.Write()
     if args.legend:
         leg.Draw("SAME")
-    if args.pdf is not None:
-        can.SaveAs('{}/{}.pdf'.format(args.pdf, fdir.replace('/', '_')), 'pdf')
+    can.SaveAs('{}/pdf/{}.pdf'.format(args.output, fdir.replace('/', '_')), 'pdf')
     can.Write()
     OUTPUT_ROOT.cd()

From cf217ef9a8bf61d5c203f49c2e88628ba533d8ce Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:17:30 +0100
Subject: [PATCH 12/16] changing / to //

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 6465a0fd7a0..7bd43c721b2 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -56,7 +56,7 @@ def groupby(dataframe, by):
 
 def get_histos(dataframe, branch, norm = False):
     size = int(dataframe.Count())
-    sub_size = 1 + size / N_SPLITS
+    sub_size = 1 + size // N_SPLITS
     subframes = [dataframe.Range(ii*sub_size, (ii+1)*sub_size) for ii in range(N_SPLITS)]
 
     model = (branch, "") + BINS[branch]
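Patch 12 above and patch 13 below both concern Python 3 semantics: `/` is float division there, while `RDataFrame.Range` needs integer bounds, and the `RResultPtr` returned by `Count()` is dereferenced explicitly with `GetValue()`. A small illustration of the division half (plain Python, toy values):

```python
size, n_splits = 1000, 7
print(1 + size / n_splits)   # Python 3: 143.857..., unusable as a Range() bound
print(1 + size // n_splits)  # 143 on both Python 2 and 3, as the patch uses
```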
From 9175af58f6fa891a6effbb9ca5a960ac56bb709a Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:19:13 +0100
Subject: [PATCH 13/16] correct conversion from RDF.Count() to int

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 7bd43c721b2..2bc41bb3eb2 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -55,7 +55,7 @@ def groupby(dataframe, by):
     return {tt: dataframe.Filter('{} == {}'.format(by, tt)) for tt in types}
 
 def get_histos(dataframe, branch, norm = False):
-    size = int(dataframe.Count())
+    size = dataframe.Count().GetValue()
    sub_size = 1 + size // N_SPLITS
     subframes = [dataframe.Range(ii*sub_size, (ii+1)*sub_size) for ii in range(N_SPLITS)]

From 3c0beb5d728080e6639776adff4a5716d08f175e Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 2 Nov 2020 19:00:32 +0100
Subject: [PATCH 14/16] new version of validation tool

---
 Production/scripts/validation_tool.py | 248 +++++++++++++++-----------
 1 file changed, 143 insertions(+), 105 deletions(-)
 mode change 100755 => 100644 Production/scripts/validation_tool.py

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
old mode 100755
new mode 100644
index 2bc41bb3eb2..295bd71ebcf
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+from __future__ import print_function
+import sys; PYTHON_MAJOR = int(sys.version_info.major)
+dict_iterator = 'items' if PYTHON_MAJOR == 3 else 'iteritems'
+
+import ROOT
+import glob
+import json
+from collections import OrderedDict
+
 import argparse
 parser = argparse.ArgumentParser('''
 The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
 If a KS test is below the threshold, a warning message is printed on screen.
 NOTE: the binning of each variable must be hard coded in the script (using the BINS dictionary)
 NOTE: pvalue = 99 means that one of the two histograms is empty.
 ''')
 
-import ROOT
-import glob
-import json
-from collections import OrderedDict
-
parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
parser.add_argument('--output' , required = True, type = str, help = 'output directory name')
parser.add_argument('--nsplit' , default = 100 , type = int, help = 'number of chunks per file')
parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
parser.add_argument('--n_threads' , default = 1 , type = int, help = 'enable ROOT implicit multithreading')

parser.add_argument('--visual', action = 'store_true', help = 'Won\'t run the script in batch mode')
parser.add_argument('--legend', action = 'store_true', help = 'Draw a TLegend on canvases')
args = parser.parse_args()

ROOT.gROOT.SetBatch(not args.visual)
ROOT.gStyle.SetOptStat(0)

import os
pdf_dir = '/'.join([args.output, 'pdf'])
if not os.path.exists(pdf_dir):
    os.makedirs(pdf_dir)

JSON_DICT = OrderedDict()
OUTPUT_ROOT = ROOT.TFile.Open('{}/histograms.root'.format(args.output), 'RECREATE')
OUTPUT_JSON = open('{}/pvalues.json'.format(args.output), 'w')
N_SPLIT = args.nsplit
PVAL_THRESHOLD = args.pvthreshold

## binning of tested variables (cannot use unbinned distributions with python before root 6.18)
BINS = {
    'tau_pt' : (50, 0, 5000),
    'tau_eta' : (5, -3.2, 3.2),
    'lepton_gen_match' : (20, -1, 19),
    'sampleType': (20, -1, 19),
    'dataset_id': (20, -1, 19),
    'dataset_group_id': (20, -1, 19),
}

class Lazy_container:
    def __init__(self, ptr, hst = None):
        self.ptr = ptr
        self.hst = hst
    def load_histogram(self):
        self.hst = self.ptr.GetValue()

class Entry: ## 2D entry (chunk_id x variable)
    def __init__(self, var, histo, tdir = None):
        self.var = var
        self.hst = histo
        self.tdir = tdir if not tdir is None else self.var

    def run_KS_test(self, norm = True):
        self.chunks = [self.hst.ProjectionY('chunk_{}'.format(cc), cc+1, cc+1).Clone() for cc in range(N_SPLIT)]
        self.chunks[0].SetMarkerStyle(20)
        for jj, hh in enumerate(self.chunks):
            hh.SetTitle(self.tdir)
            hh.Sumw2()
            hh.SetLineColor(jj+1)
            if hh.Integral() and norm:
                hh.Scale(1. / hh.Integral())
        if not self.chunks[0].Integral():
            print ('[WARNING] control histogram is empty inside {}'.format(self.tdir))

        self.pvalues = [self.chunks[0].KolmogorovTest(hh) if self.chunks[0].Integral()*hh.Integral() else 99 for hh in self.chunks]
        if not all([pv >= PVAL_THRESHOLD for pv in self.pvalues]):
            print ('[WARNING] KS test failed for step {}. p-values are:'.format(self.tdir))
            print ('\t', self.pvalues)

    def save_data(self):
        OUTPUT_ROOT.cd()

        if not OUTPUT_ROOT.GetDirectory(self.tdir):
            OUTPUT_ROOT.mkdir(self.tdir)

        OUTPUT_ROOT.cd(self.tdir)

        can = ROOT.TCanvas()
        leg = ROOT.TLegend(0.9, 0.1, 1., 0.9, "p-values (KS with the first chunk)")
        for ii, hh in enumerate(self.chunks):
            hh.Write()
            hh.Draw('PE'+' SAME'*(ii != 0))
            leg.AddEntry(hh, 'chunk %d - pval = %.3f' %(ii, self.pvalues[ii]), 'lep')
        if args.legend:
            leg.Draw("SAME")

        can.SaveAs('{}/pdf/{}.pdf'.format(args.output, self.tdir.replace('/', '_')), 'pdf')
        can.Write()

        OUTPUT_ROOT.cd()

        json_here = JSON_DICT
        for here in self.tdir.split('/'):
            if not here in json_here.keys():
                json_here[here] = OrderedDict()
            json_here = json_here[here]
        json_here['pvalues'] = self.pvalues

def to_2D(histo, vbin):
    histo.GetZaxis().SetRange(vbin, vbin)
    return histo.Project3D('yx').Clone()

if __name__ == '__main__':
    print ('[INFO] reading files', args.input)

    if args.n_threads > 1:
        ROOT.ROOT.EnableImplicitMT(args.n_threads)

    input_files = ROOT.std.vector('std::string')()

    for file in glob.glob(args.input):
        input_files.push_back(str(file))

    model = lambda main, third = None: (main, '', N_SPLIT, 0, N_SPLIT)+BINS[main]+BINS[third] if not third is None else (main, '', N_SPLIT, 0, N_SPLIT)+BINS[main]

    dataframe = ROOT.RDataFrame('taus', input_files)
    dataframe = dataframe.Define('chunk_id', 'rdfentry_ % {}'.format(N_SPLIT))

    ## unbinned distributions
    ptr_lgm = Lazy_container(dataframe.Histo2D(model('lepton_gen_match'), 'chunk_id', 'lepton_gen_match'))
    ptr_st = Lazy_container(dataframe.Histo2D(model('sampleType') , 'chunk_id', 'sampleType' ))
    ptr_dgi = Lazy_container(dataframe.Histo2D(model('dataset_group_id'), 'chunk_id', 'dataset_group_id'))
    ptr_di = Lazy_container(dataframe.Histo2D(model('dataset_id') , 'chunk_id', 'dataset_id' ))

    ## binned distributions
    ptrs_tau_pt = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('tau_pt', third = binned_variable), 'chunk_id', 'tau_pt', binned_variable))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
    }
    ptrs_tau_eta = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('tau_eta', third = binned_variable), 'chunk_id', 'tau_eta', binned_variable))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
    }
    ptrs_dataset_id = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('dataset_id', third = binned_variable), 'chunk_id', 'dataset_id', binned_variable))
        for binned_variable in ['dataset_group_id']
    }

    lazy_containers = [ptr_lgm, ptr_st, ptr_dgi, ptr_di ] +\
                      [lc for lc in ptrs_tau_pt.values()] +\
                      [lc for lc in ptrs_tau_eta.values()] +\
                      [lc for lc in ptrs_dataset_id.values()]
    for lc in lazy_containers:
        lc.load_histogram()

    ## run validation
    entry_lgm = Entry(var = 'lepton_gen_match', histo = ptr_lgm.hst)
    entry_st = Entry(var = 'sampleType' , histo = ptr_st .hst)
    entry_dgi = Entry(var = 'dataset_group_id', histo = ptr_dgi.hst)
    entry_di = Entry(var = 'dataset_id' , histo = ptr_di .hst)

    entries_tau_pt = [
        Entry(var = 'tau_pt', histo = to_2D(ptrs_tau_pt[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'tau_pt']))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ] ; entries_tau_pt = [ee for ee in entries_tau_pt if ee.hst.GetEntries()]

    entries_tau_eta = [
        Entry(var = 'tau_eta', histo = to_2D(ptrs_tau_eta[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'tau_eta']))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ] ; entries_tau_eta = [ee for ee in entries_tau_eta if ee.hst.GetEntries()]

    entries_dataset_id = [
        Entry(var = 'dataset_id', histo = to_2D(ptrs_dataset_id[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'dataset_id']))
        for binned_variable in ['dataset_group_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ] ; entries_dataset_id = [ee for ee in entries_dataset_id if ee.hst.GetEntries()]

    entries = [entry_lgm, entry_st, entry_dgi, entry_di] +\
              [ee for ee in entries_tau_pt] +\
              [ee for ee in entries_tau_eta] +\
              [ee for ee in entries_dataset_id]

    for ee in entries:
        ee.run_KS_test()
        ee.save_data()

    OUTPUT_ROOT.Close()
    json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
    print ('[INFO] all done. Files saved in', args.output)
\ No newline at end of file

From 75a67353c0fbc895f5d07bc7550030a7e2c34ce7 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 3 Nov 2020 13:09:54 +0100
Subject: [PATCH 15/16] validation tool update

---
 Production/scripts/validation_tool.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 295bd71ebcf..839bd6cac68 100644
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -1,6 +1,4 @@
 from __future__ import print_function
-import sys; PYTHON_MAJOR = int(sys.version_info.major)
-dict_iterator = 'items' if PYTHON_MAJOR == 3 else 'iteritems'
 
 import ROOT
 import glob
 import json
 from collections import OrderedDict
@@ -39,7 +37,7 @@
 N_SPLIT = args.nsplit
 PVAL_THRESHOLD = args.pvthreshold
 
-## binning of tested variables (cannot use unbinned distributions with python before root 6.18)
+## binning of tested variables
 BINS = {
     'tau_pt' : (50, 0, 5000),
     'tau_eta' : (5, -3.2, 3.2),
@@ -64,7 +62,9 @@ def __init__(self, var, histo, tdir = None):
 
     def run_KS_test(self, norm = True):
         self.chunks = [self.hst.ProjectionY('chunk_{}'.format(cc), cc+1, cc+1).Clone() for cc in range(N_SPLIT)]
+
         self.chunks[0].SetMarkerStyle(20)
+
         for jj, hh in enumerate(self.chunks):
             hh.SetTitle(self.tdir)
             hh.Sumw2()
@@ -72,6 +72,8 @@ def run_KS_test(self, norm = True):
             if hh.Integral() and norm:
                 hh.Scale(1. / hh.Integral())
 
+        self.chunks[0].GetYaxis().SetRangeUser(0, 1.1*max(hh.GetMaximum() for hh in self.chunks))
+
         if not self.chunks[0].Integral():
             print ('[WARNING] control histogram is empty inside {}'.format(self.tdir))
 
@@ -127,7 +129,8 @@ def to_2D(histo, vbin):
 
     dataframe = ROOT.RDataFrame('taus', input_files)
-    dataframe = dataframe.Define('chunk_id', 'rdfentry_ % {}'.format(N_SPLIT))
+    tot_entries = dataframe.Count().GetValue()
+    dataframe = dataframe.Define('chunk_id', 'rdfentry_ * {} / {}'.format(N_SPLIT, tot_entries))
 
     ## unbinned distributions
     ptr_lgm = Lazy_container(dataframe.Histo2D(model('lepton_gen_match'), 'chunk_id', 'lepton_gen_match'))

From 1e7bf8ab0e8b433b8a87daa769194cfcea11db78 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Wed, 4 Nov 2020 12:04:53 +0100
Subject: [PATCH 16/16] update readme for validation tool

---
 README.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/README.md b/README.md
index 336ed911878..946f135a9f5 100644
--- a/README.md
+++ b/README.md
@@ -161,6 +161,35 @@ ShuffleMerge --cfg TauML/Analysis/config/testing_inputs.cfg --input tuples-v2 --
                     --n-threads 12 --disabled-branches "trainingWeight"
 ```
 
+#### Validation
+A validation can be run on shuffled samples to ensure that different parts of the training set have compatible distributions.
+To run the validation tool, a ROOT version greater than or equal to 6.16 is needed:
+```
+source /cvmfs/sft.cern.ch/lcg/views/LCG_97apython3/x86_64-centos7-clang10-opt/setup.sh
+```
+Then, run:
+```
+python TauMLTools/Production/scripts/validation_tool.py --input "/path/to/input/*.root" \
+                                                        --output output_directory \
+                                                        --n_threads n_threads \
+                                                        --legend > results.txt
+```
+The script will create the directory "output_directory" containing the results of the test.
+Validation is run on the following distributions with a Kolmogorov-Smirnov test:
+
+- dataset_id, dataset_group_id, lepton_gen_match, sampleType
+- tau_pt and tau_eta for each bin of the variables above
+- dataset_id for each bin of dataset_group_id
+
+If a KS test is not successful, a warning message is printed on screen.
+
+Optional arguments can be listed by running:
+```
+python TauMLTools/Production/scripts/validation_tool.py --help
+```
+
+A time benchmark is available [here](https://github.com/cms-tau-pog/TauMLTools/pull/31#issue-510206277).
+
 ### Production of flat inputs
 In this stage, `TauTuple`s are transformed into flat [TrainingTuples](https://github.com/cms-tau-pog/TauMLTools/blob/master/Analysis/interface/TrainingTuple.h) that are suitable as an input for the training.
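A note on consuming the output: `save_data` in patch 14 nests the JSON by the components of each entry's `tdir`, with a `pvalues` list at each leaf. One possible way to list the failing tests from `pvalues.json` is sketched below, under that assumption about the layout; the 0.05 cut mirrors the script's default `--pvthreshold`, and the path to the file is hypothetical:

```python
import json

def walk(node, path=()):
    # Recursively yield ('a/b/c', [p-values]) pairs from the nested dictionary.
    for key, value in node.items():
        if key == 'pvalues':
            yield '/'.join(path), value
        elif isinstance(value, dict):
            for found in walk(value, path + (key,)):
                yield found

with open('output_directory/pvalues.json') as jf:
    results = json.load(jf)

for tdir, pvalues in walk(results):
    # pvalue = 99 flags an empty histogram, not a genuine test result
    failures = [pv for pv in pvalues if pv != 99 and pv < 0.05]
    if failures:
        print(tdir, failures)
```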