From e9eeca4ed20793e719434d2e0a6fc10b06439989 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 26 Oct 2020 18:27:47 +0100
Subject: [PATCH 01/16] adding validation tool (KS test)

---
 Production/scripts/validation_tool.py | 151 ++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100755 Production/scripts/validation_tool.py

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
new file mode 100755
index 00000000000..a90d6f1a615
--- /dev/null
+++ b/Production/scripts/validation_tool.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python
+import argparse
+parser = argparse.ArgumentParser('''
+The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
+If a KS test is below the threshold, a warning message is printed on screen.
+NOTE: pvalue = 99 means that one of the two histograms is empty.
+''')
+
+import ROOT
+import glob
+import json
+from collections import OrderedDict
+
+parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns')
+parser.add_argument('--output' , required = True, type = str, help = 'output file name')
+parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
+parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
+parser.add_argument('--nsplit' , default = 5 , type = str, help = 'number of chunks per file')
+parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
+
+parser.add_argument('--visual', action = 'store_true', help = 'Won\'t run the script in batch mode')
+parser.add_argument('--legend', action = 'store_true', help = 'Draw a TLegend on canvases')
+args = parser.parse_args()
+
+import os
+if not os.path.exists(args.pdf):
+    os.makedirs(args.pdf)
+
+ROOT.gROOT.SetBatch(not args.visual)
+ROOT.gStyle.SetOptStat(0)
+
+JSON_DICT = OrderedDict()
+OUTPUT_ROOT = ROOT.TFile.Open('{}'.format(args.output), 'RECREATE')
+OUTPUT_JSON = open(args.json, 'w')
+N_SPLITS = args.nsplit
+PVAL_THRESHOLD = args.pvthreshold
+
+## binning of tested variables (cannot use unbinned distributions with python before root 6.18)
+BINS = {
+    'tau_pt' : (50, 0, 5000),
+    'tau_eta' : (5, -2.5, 2.5),
+    'lepton_gen_match' : (20, -1, 19),
+    'sampleType': (20, -1, 19),
+    'dataset_id': (20, -1, 19),
+    'dataset_group_id': (20, -1, 19),
+}
+
+def groupby(dataframe, by):
+    _ = dataframe.Histo1D(by)
+    hist = _.GetValue()
+    hist.ClearUnderflowAndOverflow()
+    types = list(set([round(hist.GetBinCenter(jj)) for jj in range(hist.GetNbinsX()) if hist.GetBinContent(jj)]))
+
+    return {tt: dataframe.Filter('{} == {}'.format(by, tt)) for tt in types}
+
+def get_histos(dataframe, branch, norm = False):
+    size = int(dataframe.Count())
+    sub_size = 1 + size / N_SPLITS
+    subframes = [dataframe.Range(ii*sub_size, (ii+1)*sub_size) for ii in range(N_SPLITS)]
+
+    model = (branch, "") + BINS[branch]
+    hptrs = [sf.Histo1D(model, branch) for sf in subframes]
+    histos = [hh.GetValue().Clone() for hh in hptrs]
+
+    for hh in histos:
+        hh.SetTitle(branch)
+        hh.Sumw2()
+        hh.ClearUnderflowAndOverflow()
+        if norm and hh.Integral():
+            hh.Scale(1. / hh.Integral())
+
+    return histos
+
+def save_histos(histos, fdir, pvalues):
+    OUTPUT_ROOT.cd()
+    OUTPUT_ROOT.cd(fdir)
+
+    can = ROOT.TCanvas()
+    leg = ROOT.TLegend(0.9, 0.1, 1., 0.9, "p-values (KS with the first chunk)")
+
+    histos[0].GetYaxis().SetRangeUser(0, 1.1*max(hh.GetMaximum() for hh in histos))
+    histos[0].SetMarkerStyle(20)
+
+    for ii, hh in enumerate(histos):
+        hh.SetLineColor(ii+1)
+        hh.SetMarkerColor(ii+1)
+        leg.AddEntry(hh, 'chunk %d - pval = %.3f' %(ii, pvalues[ii]), 'lep')
+        hh.Draw("PE" + " SAME" * (ii != 0))
+        hh.Write()
+    if args.legend:
+        leg.Draw("SAME")
+    if args.pdf is not None:
+        can.SaveAs('{}/{}.pdf'.format(args.pdf, fdir.replace('/', '_')), 'pdf')
+    can.Write()
+    OUTPUT_ROOT.cd()
+
+def run_validation(dataframe, branches, pwd = ''):
+    OUTPUT_ROOT.cd()
+    OUTPUT_ROOT.mkdir(pwd)
+
+    if not pwd in JSON_DICT.keys():
+        JSON_DICT[pwd] = OrderedDict()
+
+    for branch in branches:
+        OUTPUT_ROOT.cd()
+        OUTPUT_ROOT.mkdir('/'.join([pwd, branch]))
+
+        histos = get_histos(dataframe, branch = branch, norm = True)
+
+        pvalues = [histos[0].KolmogorovTest(hh) if histos[0].Integral()*hh.Integral() else 99 for hh in histos]
+        if not histos[0].Integral():
+            print '[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd)
+
+        if not all([pv >= PVAL_THRESHOLD for pv in pvalues]):
+            print '[WARNING] KS test failed for step {} inside {}. p-values are:'.format(branch, pwd)
+            print '\t', pvalues
+
+        JSON_DICT[pwd][branch] = pvalues
+
+        save_histos(histos, fdir = '/'.join([pwd, branch]), pvalues = pvalues)
+
+if __name__ == '__main__':
+    print '[INFO] reading files', args.input
+    input_files = ROOT.std.vector('std::string')()
+    for file in glob.glob(args.input):
+        input_files.push_back(str(file))
+
+    main_dir = 'KS_test'
+
+    ## first, run on plain columns
+    dataframe = ROOT.RDataFrame('taus', input_files)
+    run_validation(dataframe = dataframe, pwd = main_dir, branches = ['lepton_gen_match', 'sampleType', 'dataset_group_id'])
+
+    ## then, group by tau type
+    tau_type_dataframes = groupby(dataframe = dataframe, by = 'lepton_gen_match')
+    for ii, df in tau_type_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'lepton_gen_match', str(ii)]), branches = ['tau_pt', 'tau_eta'])
+
+    ## then, group by sample type
+    sample_type_dataframes = groupby(dataframe = dataframe, by = 'sampleType')
+    for ii, df in sample_type_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'sampleType', str(ii)]), branches = ['tau_pt', 'tau_eta'])
+
+    ## then, group by dataset group id
+    group_id_dataframes = groupby(dataframe = dataframe, by = 'dataset_group_id')
+    for ii, df in group_id_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'dataset_group_id', str(ii)]), branches = ['tau_pt', 'tau_eta', 'dataset_id'])
+
+    OUTPUT_ROOT.Close()
+    json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
+    print '[INFO] all done. Files', args.output, 'and', args.json, 'have been created'
\ No newline at end of file
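The check at the heart of this patch is ROOT's `TH1::KolmogorovTest`, applied between chunk 0 and every other chunk of the dataframe. A minimal, self-contained sketch of that single call, outside of any RDataFrame machinery (a working PyROOT installation is assumed; histogram names and toy values are illustrative, not part of the patch):

```python
import ROOT

# Two toy chunks drawn from the same parent distribution: the binned KS
# test should then return a large p-value.
h0 = ROOT.TH1F('chunk_0', 'control chunk', 50, -5, 5)
h1 = ROOT.TH1F('chunk_1', 'tested chunk', 50, -5, 5)

rng = ROOT.TRandom3(1234)
for _ in range(10000):
    h0.Fill(rng.Gaus(0., 1.))
    h1.Fill(rng.Gaus(0., 1.))

# Normalise before testing, as validation_tool.py does when norm = True.
for hh in (h0, h1):
    hh.Sumw2()
    hh.Scale(1. / hh.Integral())

pvalue = h0.KolmogorovTest(h1)
print('p-value = %.3f' % pvalue)  # compared against --pvthreshold in the script
```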
From 0ef725bb86813b0a0f460477302db59f45ceca16 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 26 Oct 2020 18:45:47 +0100
Subject: [PATCH 02/16] adding a comment

---
 Production/scripts/validation_tool.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index a90d6f1a615..1109c1cf821 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -3,6 +3,7 @@
 parser = argparse.ArgumentParser('''
 The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
 If a KS test is below the threshold, a warning message is printed on screen.
+NOTE: the binning of each variable must be hard coded in the script (using the BINS dictionary)
 NOTE: pvalue = 99 means that one of the two histograms is empty.
 ''')
 
@@ -106,7 +107,7 @@ def run_validation(dataframe, branches, pwd = ''):
         OUTPUT_ROOT.mkdir('/'.join([pwd, branch]))
 
         histos = get_histos(dataframe, branch = branch, norm = True)
-
+
         pvalues = [histos[0].KolmogorovTest(hh) if histos[0].Integral()*hh.Integral() else 99 for hh in histos]
         if not histos[0].Integral():
             print '[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd)

From 5f7c1a91bd1fa2fd0254d30e8aeb25fdf8ce485d Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 26 Oct 2020 18:49:03 +0100
Subject: [PATCH 03/16] small fix

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 1109c1cf821..be76f30ab15 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -95,7 +95,7 @@ def save_histos(histos, fdir, pvalues):
     can.Write()
     OUTPUT_ROOT.cd()
 
-def run_validation(dataframe, branches, pwd = ''):
+def run_validation(dataframe, branches, pwd):
     OUTPUT_ROOT.cd()
     OUTPUT_ROOT.mkdir(pwd)
 

From eec9866fa4abd51383fa93b679118287edf911b1 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 10:35:01 +0100
Subject: [PATCH 04/16] integer types from groupby method

---
 Production/scripts/validation_tool.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index be76f30ab15..0817c5f420d 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -51,6 +51,7 @@ def groupby(dataframe, by):
     hist = _.GetValue()
     hist.ClearUnderflowAndOverflow()
     types = list(set([round(hist.GetBinCenter(jj)) for jj in range(hist.GetNbinsX()) if hist.GetBinContent(jj)]))
+    types = [int(tt) for tt in types]
 
     return {tt: dataframe.Filter('{} == {}'.format(by, tt)) for tt in types}
 

From cb5ae3e1281c9f798d890fc91a421eb34d9241e8 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 10:35:32 +0100
Subject: [PATCH 05/16] numbered histograms

---
 Production/scripts/validation_tool.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 0817c5f420d..4a883ecd1da 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -64,8 +64,9 @@ def get_histos(dataframe, branch, norm = False):
     hptrs = [sf.Histo1D(model, branch) for sf in subframes]
     histos = [hh.GetValue().Clone() for hh in hptrs]
 
-    for hh in histos:
+    for ii, hh in enumerate(histos):
         hh.SetTitle(branch)
+        hh.SetName('_'.join([hh.GetName(), ii]))
         hh.Sumw2()
         hh.ClearUnderflowAndOverflow()
         if norm and hh.Integral():

From e3a93ac91f46d27a70b9d4494bdc41a38fe9f641 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 10:37:16 +0100
Subject: [PATCH 06/16] adding dataset_id to main variables

---
 Production/scripts/validation_tool.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 4a883ecd1da..4dc943043a6 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -132,7 +132,7 @@ def run_validation(dataframe, branches, pwd):
 
     ## first, run on plain columns
     dataframe = ROOT.RDataFrame('taus', input_files)
-    run_validation(dataframe = dataframe, pwd = main_dir, branches = ['lepton_gen_match', 'sampleType', 'dataset_group_id'])
+    run_validation(dataframe = dataframe, pwd = main_dir, branches = ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id'])
 
     ## then, group by tau type
     tau_type_dataframes = groupby(dataframe = dataframe, by = 'lepton_gen_match')
@@ -149,6 +149,11 @@ def run_validation(dataframe, branches, pwd):
     for ii, df in group_id_dataframes.iteritems():
         run_validation(dataframe = df, pwd = '/'.join([main_dir, 'dataset_group_id', str(ii)]), branches = ['tau_pt', 'tau_eta', 'dataset_id'])
 
+    ## then, group by dataset id
+    group_id_dataframes = groupby(dataframe = dataframe, by = 'dataset_id')
+    for ii, df in group_id_dataframes.iteritems():
+        run_validation(dataframe = df, pwd = '/'.join([main_dir, 'dataset_id', str(ii)]), branches = ['tau_pt', 'tau_eta'])
+
     OUTPUT_ROOT.Close()
     json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
     print '[INFO] all done. Files', args.output, 'and', args.json, 'have been created'
\ No newline at end of file

From bd576113aa71ed040e619dfd177c3dfec56566e3 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 14:54:57 +0100
Subject: [PATCH 07/16] comment

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 4dc943043a6..7608c63ea9f 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -12,7 +12,7 @@
 import json
 from collections import OrderedDict
 
-parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns')
+parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
 parser.add_argument('--output' , required = True, type = str, help = 'output file name')
 parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
 parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
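The "cast fix" that follows in patch 08 repairs a plain Python error introduced by patch 05 above: `str.join` only accepts strings, so joining a histogram name with an integer chunk index raises a `TypeError`. A two-line illustration with hypothetical values:

```python
name, ii = 'tau_pt', 3
# '_'.join([name, ii]) raises TypeError, since str.join expects string items
print('_'.join([name, str(ii)]))  # -> 'tau_pt_3', as after the fix
```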
From b12d21a4f45a162ec0eb4f5cdeb5f1e82ca16a2c Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 14:55:35 +0100
Subject: [PATCH 08/16] cast fix

---
 Production/scripts/validation_tool.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 7608c63ea9f..2e3b285a845 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -63,10 +63,10 @@ def get_histos(dataframe, branch, norm = False):
     model = (branch, "") + BINS[branch]
     hptrs = [sf.Histo1D(model, branch) for sf in subframes]
     histos = [hh.GetValue().Clone() for hh in hptrs]
-
+
     for ii, hh in enumerate(histos):
         hh.SetTitle(branch)
-        hh.SetName('_'.join([hh.GetName(), ii]))
+        hh.SetName('_'.join([hh.GetName(), str(ii)]))
         hh.Sumw2()
         hh.ClearUnderflowAndOverflow()
         if norm and hh.Integral():

From 7f604244e967ba25d9b6ab50dc1820bb76537f1c Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 14:56:07 +0100
Subject: [PATCH 09/16] n splits default 100

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 2e3b285a845..74e2630e188 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -16,7 +16,7 @@
 parser.add_argument('--output' , required = True, type = str, help = 'output file name')
 parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
 parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
-parser.add_argument('--nsplit' , default = 5 , type = str, help = 'number of chunks per file')
+parser.add_argument('--nsplit' , default = 100 , type = str, help = 'number of chunks per file')
 parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
 
 parser.add_argument('--visual', action = 'store_true', help = 'Won\'t run the script in batch mode')

From 6869fd165a1489e473864169a05588e5b48748a7 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:05:34 +0100
Subject: [PATCH 10/16] python3-like print

---
 Production/scripts/validation_tool.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 74e2630e188..753ac5c9b70 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -112,18 +112,18 @@ def run_validation(dataframe, branches, pwd):
 
         pvalues = [histos[0].KolmogorovTest(hh) if histos[0].Integral()*hh.Integral() else 99 for hh in histos]
         if not histos[0].Integral():
-            print '[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd)
+            print ('[WARNING] control histogram is empty for step {} inside {}'.format(branch, pwd))
 
         if not all([pv >= PVAL_THRESHOLD for pv in pvalues]):
-            print '[WARNING] KS test failed for step {} inside {}. p-values are:'.format(branch, pwd)
-            print '\t', pvalues
+            print ('[WARNING] KS test failed for step {} inside {}. p-values are:'.format(branch, pwd))
+            print ('\t', pvalues)
 
         JSON_DICT[pwd][branch] = pvalues
 
         save_histos(histos, fdir = '/'.join([pwd, branch]), pvalues = pvalues)
 
 if __name__ == '__main__':
-    print '[INFO] reading files', args.input
+    print ('[INFO] reading files', args.input)
     input_files = ROOT.std.vector('std::string')()
     for file in glob.glob(args.input):
         input_files.push_back(str(file))

From 200cb398c46d84b2b86385c2132c54a92878985c Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:13:35 +0100
Subject: [PATCH 11/16] changing output files structure

---
 Production/scripts/validation_tool.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 753ac5c9b70..6465a0fd7a0 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -13,9 +13,7 @@
 from collections import OrderedDict
 
 parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
-parser.add_argument('--output' , required = True, type = str, help = 'output file name')
-parser.add_argument('--pdf' , default = None, type = str, help = 'output pdf directory')
-parser.add_argument('--json' , required = True, type = str, help = 'output json file name')
+parser.add_argument('--output' , required = True, type = str, help = 'output directory name')
 parser.add_argument('--nsplit' , default = 100 , type = str, help = 'number of chunks per file')
 parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
 
@@ -24,15 +22,16 @@
 args = parser.parse_args()
 
 import os
-if not os.path.exists(args.pdf):
-    os.makedirs(args.pdf)
+pdf_dir = '/'.join([args.output, 'pdf'])
+if not os.path.exists(pdf_dir):
+    os.makedirs(pdf_dir)
 
 ROOT.gROOT.SetBatch(not args.visual)
 ROOT.gStyle.SetOptStat(0)
 
 JSON_DICT = OrderedDict()
-OUTPUT_ROOT = ROOT.TFile.Open('{}'.format(args.output), 'RECREATE')
-OUTPUT_JSON = open(args.json, 'w')
+OUTPUT_ROOT = ROOT.TFile.Open('{}/histograms.root'.format(args.output), 'RECREATE')
+OUTPUT_JSON = open('{}/pvalues.json'.format(args.output), 'w')
 N_SPLITS = args.nsplit
 PVAL_THRESHOLD = args.pvthreshold
 
@@ -92,8 +91,7 @@ def save_histos(histos, fdir, pvalues):
         hh.Write()
     if args.legend:
         leg.Draw("SAME")
-    if args.pdf is not None:
-        can.SaveAs('{}/{}.pdf'.format(args.pdf, fdir.replace('/', '_')), 'pdf')
+    can.SaveAs('{}/pdf/{}.pdf'.format(args.output, fdir.replace('/', '_')), 'pdf')
     can.Write()
     OUTPUT_ROOT.cd()

From cf217ef9a8bf61d5c203f49c2e88628ba533d8ce Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:17:30 +0100
Subject: [PATCH 12/16] changing / to //

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 6465a0fd7a0..7bd43c721b2 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -56,7 +56,7 @@ def groupby(dataframe, by):
 
 def get_histos(dataframe, branch, norm = False):
     size = int(dataframe.Count())
-    sub_size = 1 + size / N_SPLITS
+    sub_size = 1 + size // N_SPLITS
     subframes = [dataframe.Range(ii*sub_size, (ii+1)*sub_size) for ii in range(N_SPLITS)]
 
     model = (branch, "") + BINS[branch]
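Patch 12 above and patch 13 below both concern Python 3 semantics: `/` is float division there, while `RDataFrame.Range` needs integer bounds, and the `RResultPtr` returned by `Count()` is dereferenced explicitly with `GetValue()`. A small illustration of the division half (plain Python, toy values):

```python
size, n_splits = 1000, 7
print(1 + size / n_splits)   # Python 3: 143.857..., unusable as a Range() bound
print(1 + size // n_splits)  # 143 on both Python 2 and 3, as the patch uses
```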
From 9175af58f6fa891a6effbb9ca5a960ac56bb709a Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 27 Oct 2020 15:19:13 +0100
Subject: [PATCH 13/16] correct conversion from RDF.Count() to int

---
 Production/scripts/validation_tool.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 7bd43c721b2..2bc41bb3eb2 100755
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -55,7 +55,7 @@ def groupby(dataframe, by):
     return {tt: dataframe.Filter('{} == {}'.format(by, tt)) for tt in types}
 
 def get_histos(dataframe, branch, norm = False):
-    size = int(dataframe.Count())
+    size = dataframe.Count().GetValue()
    sub_size = 1 + size // N_SPLITS
     subframes = [dataframe.Range(ii*sub_size, (ii+1)*sub_size) for ii in range(N_SPLITS)]

From 3c0beb5d728080e6639776adff4a5716d08f175e Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Mon, 2 Nov 2020 19:00:32 +0100
Subject: [PATCH 14/16] new version of validation tool

---
 Production/scripts/validation_tool.py | 248 +++++++++++++++-----------
 1 file changed, 143 insertions(+), 105 deletions(-)
 mode change 100755 => 100644 Production/scripts/validation_tool.py

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
old mode 100755
new mode 100644
index 2bc41bb3eb2..295bd71ebcf
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+from __future__ import print_function
+import sys; PYTHON_MAJOR = int(sys.version_info.major)
+dict_iterator = 'items' if PYTHON_MAJOR == 3 else 'iteritems'
+
+import ROOT
+import glob
+import json
+from collections import OrderedDict
+
 import argparse
 parser = argparse.ArgumentParser('''
 The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
 If a KS test is below the threshold, a warning message is printed on screen.
 NOTE: the binning of each variable must be hard coded in the script (using the BINS dictionary)
 NOTE: pvalue = 99 means that one of the two histograms is empty.
 ''')
 
-import ROOT
-import glob
-import json
-from collections import OrderedDict
-
parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
parser.add_argument('--output' , required = True, type = str, help = 'output directory name')
parser.add_argument('--nsplit' , default = 100 , type = int, help = 'number of chunks per file')
parser.add_argument('--pvthreshold' , default = .05 , type = str, help = 'threshold of KS test (above = ok)')
parser.add_argument('--n_threads' , default = 1 , type = int, help = 'enable ROOT implicit multithreading')

parser.add_argument('--visual', action = 'store_true', help = 'Won\'t run the script in batch mode')
parser.add_argument('--legend', action = 'store_true', help = 'Draw a TLegend on canvases')
args = parser.parse_args()

ROOT.gROOT.SetBatch(not args.visual)
ROOT.gStyle.SetOptStat(0)

import os
pdf_dir = '/'.join([args.output, 'pdf'])
if not os.path.exists(pdf_dir):
    os.makedirs(pdf_dir)

JSON_DICT = OrderedDict()
OUTPUT_ROOT = ROOT.TFile.Open('{}/histograms.root'.format(args.output), 'RECREATE')
OUTPUT_JSON = open('{}/pvalues.json'.format(args.output), 'w')
N_SPLIT = args.nsplit
PVAL_THRESHOLD = args.pvthreshold

## binning of tested variables (cannot use unbinned distributions with python before root 6.18)
BINS = {
    'tau_pt' : (50, 0, 5000),
    'tau_eta' : (5, -3.2, 3.2),
    'lepton_gen_match' : (20, -1, 19),
    'sampleType': (20, -1, 19),
    'dataset_id': (20, -1, 19),
    'dataset_group_id': (20, -1, 19),
}

class Lazy_container:
    def __init__(self, ptr, hst = None):
        self.ptr = ptr
        self.hst = hst
    def load_histogram(self):
        self.hst = self.ptr.GetValue()

class Entry: ## 2D entry (chunk_id x variable)
    def __init__(self, var, histo, tdir = None):
        self.var = var
        self.hst = histo
        self.tdir = tdir if not tdir is None else self.var

    def run_KS_test(self, norm = True):
        self.chunks = [self.hst.ProjectionY('chunk_{}'.format(cc), cc+1, cc+1).Clone() for cc in range(N_SPLIT)]
        self.chunks[0].SetMarkerStyle(20)
        for jj, hh in enumerate(self.chunks):
            hh.SetTitle(self.tdir)
            hh.Sumw2()
            hh.SetLineColor(jj+1)
            if hh.Integral() and norm:
                hh.Scale(1. / hh.Integral())
        if not self.chunks[0].Integral():
            print ('[WARNING] control histogram is empty inside {}'.format(self.tdir))

        self.pvalues = [self.chunks[0].KolmogorovTest(hh) if self.chunks[0].Integral()*hh.Integral() else 99 for hh in self.chunks]
        if not all([pv >= PVAL_THRESHOLD for pv in self.pvalues]):
            print ('[WARNING] KS test failed for step {}. p-values are:'.format(self.tdir))
            print ('\t', self.pvalues)

    def save_data(self):
        OUTPUT_ROOT.cd()

        if not OUTPUT_ROOT.GetDirectory(self.tdir):
            OUTPUT_ROOT.mkdir(self.tdir)

        OUTPUT_ROOT.cd(self.tdir)

        can = ROOT.TCanvas()
        leg = ROOT.TLegend(0.9, 0.1, 1., 0.9, "p-values (KS with the first chunk)")
        for ii, hh in enumerate(self.chunks):
            hh.Write()
            hh.Draw('PE'+' SAME'*(ii != 0))
            leg.AddEntry(hh, 'chunk %d - pval = %.3f' %(ii, self.pvalues[ii]), 'lep')
        if args.legend:
            leg.Draw("SAME")

        can.SaveAs('{}/pdf/{}.pdf'.format(args.output, self.tdir.replace('/', '_')), 'pdf')
        can.Write()

        OUTPUT_ROOT.cd()

        json_here = JSON_DICT
        for here in self.tdir.split('/'):
            if not here in json_here.keys():
                json_here[here] = OrderedDict()
            json_here = json_here[here]
        json_here['pvalues'] = self.pvalues

def to_2D(histo, vbin):
    histo.GetZaxis().SetRange(vbin, vbin)
    return histo.Project3D('yx').Clone()

if __name__ == '__main__':
    print ('[INFO] reading files', args.input)

    if args.n_threads > 1:
        ROOT.ROOT.EnableImplicitMT(args.n_threads)

    input_files = ROOT.std.vector('std::string')()

    for file in glob.glob(args.input):
        input_files.push_back(str(file))

    model = lambda main, third = None: (main, '', N_SPLIT, 0, N_SPLIT)+BINS[main]+BINS[third] if not third is None else (main, '', N_SPLIT, 0, N_SPLIT)+BINS[main]

    dataframe = ROOT.RDataFrame('taus', input_files)
    dataframe = dataframe.Define('chunk_id', 'rdfentry_ % {}'.format(N_SPLIT))

    ## unbinned distributions
    ptr_lgm = Lazy_container(dataframe.Histo2D(model('lepton_gen_match'), 'chunk_id', 'lepton_gen_match'))
    ptr_st = Lazy_container(dataframe.Histo2D(model('sampleType') , 'chunk_id', 'sampleType' ))
    ptr_dgi = Lazy_container(dataframe.Histo2D(model('dataset_group_id'), 'chunk_id', 'dataset_group_id'))
    ptr_di = Lazy_container(dataframe.Histo2D(model('dataset_id') , 'chunk_id', 'dataset_id' ))

    ## binned distributions
    ptrs_tau_pt = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('tau_pt', third = binned_variable), 'chunk_id', 'tau_pt', binned_variable))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
    }
    ptrs_tau_eta = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('tau_eta', third = binned_variable), 'chunk_id', 'tau_eta', binned_variable))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
    }
    ptrs_dataset_id = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('dataset_id', third = binned_variable), 'chunk_id', 'dataset_id', binned_variable))
        for binned_variable in ['dataset_group_id']
    }

    lazy_containers = [ptr_lgm, ptr_st, ptr_dgi, ptr_di ] +\
                      [lc for lc in ptrs_tau_pt.values()] +\
                      [lc for lc in ptrs_tau_eta.values()] +\
                      [lc for lc in ptrs_dataset_id.values()]
    for lc in lazy_containers:
        lc.load_histogram()

    ## run validation
    entry_lgm = Entry(var = 'lepton_gen_match', histo = ptr_lgm.hst)
    entry_st = Entry(var = 'sampleType' , histo = ptr_st .hst)
    entry_dgi = Entry(var = 'dataset_group_id', histo = ptr_dgi.hst)
    entry_di = Entry(var = 'dataset_id' , histo = ptr_di .hst)

    entries_tau_pt = [
        Entry(var = 'tau_pt', histo = to_2D(ptrs_tau_pt[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'tau_pt']))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ] ; entries_tau_pt = [ee for ee in entries_tau_pt if ee.hst.GetEntries()]

    entries_tau_eta = [
        Entry(var = 'tau_eta', histo = to_2D(ptrs_tau_eta[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'tau_eta']))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ] ; entries_tau_eta = [ee for ee in entries_tau_eta if ee.hst.GetEntries()]

    entries_dataset_id = [
        Entry(var = 'dataset_id', histo = to_2D(ptrs_dataset_id[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'dataset_id']))
        for binned_variable in ['dataset_group_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ] ; entries_dataset_id = [ee for ee in entries_dataset_id if ee.hst.GetEntries()]

    entries = [entry_lgm, entry_st, entry_dgi, entry_di] +\
              [ee for ee in entries_tau_pt] +\
              [ee for ee in entries_tau_eta] +\
              [ee for ee in entries_dataset_id]

    for ee in entries:
        ee.run_KS_test()
        ee.save_data()

    OUTPUT_ROOT.Close()
    json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
    print ('[INFO] all done. Files saved in', args.output)
\ No newline at end of file

From 75a67353c0fbc895f5d07bc7550030a7e2c34ce7 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Tue, 3 Nov 2020 13:09:54 +0100
Subject: [PATCH 15/16] validation tool update

---
 Production/scripts/validation_tool.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Production/scripts/validation_tool.py b/Production/scripts/validation_tool.py
index 295bd71ebcf..839bd6cac68 100644
--- a/Production/scripts/validation_tool.py
+++ b/Production/scripts/validation_tool.py
@@ -1,6 +1,4 @@
 from __future__ import print_function
-import sys; PYTHON_MAJOR = int(sys.version_info.major)
-dict_iterator = 'items' if PYTHON_MAJOR == 3 else 'iteritems'
 
 import ROOT
 import glob
 import json
 from collections import OrderedDict
@@ -39,7 +37,7 @@
 N_SPLIT = args.nsplit
 PVAL_THRESHOLD = args.pvthreshold
 
-## binning of tested variables (cannot use unbinned distributions with python before root 6.18)
+## binning of tested variables
 BINS = {
     'tau_pt' : (50, 0, 5000),
     'tau_eta' : (5, -3.2, 3.2),
@@ -64,7 +62,9 @@ def __init__(self, var, histo, tdir = None):
 
     def run_KS_test(self, norm = True):
         self.chunks = [self.hst.ProjectionY('chunk_{}'.format(cc), cc+1, cc+1).Clone() for cc in range(N_SPLIT)]
+
         self.chunks[0].SetMarkerStyle(20)
+
         for jj, hh in enumerate(self.chunks):
             hh.SetTitle(self.tdir)
             hh.Sumw2()
@@ -72,6 +72,8 @@ def run_KS_test(self, norm = True):
             if hh.Integral() and norm:
                 hh.Scale(1. / hh.Integral())
 
+        self.chunks[0].GetYaxis().SetRangeUser(0, 1.1*max(hh.GetMaximum() for hh in self.chunks))
+
         if not self.chunks[0].Integral():
             print ('[WARNING] control histogram is empty inside {}'.format(self.tdir))
 
@@ -127,7 +129,8 @@ def to_2D(histo, vbin):
 
     dataframe = ROOT.RDataFrame('taus', input_files)
-    dataframe = dataframe.Define('chunk_id', 'rdfentry_ % {}'.format(N_SPLIT))
+    tot_entries = dataframe.Count().GetValue()
+    dataframe = dataframe.Define('chunk_id', 'rdfentry_ * {} / {}'.format(N_SPLIT, tot_entries))
 
     ## unbinned distributions
     ptr_lgm = Lazy_container(dataframe.Histo2D(model('lepton_gen_match'), 'chunk_id', 'lepton_gen_match'))

From 1e7bf8ab0e8b433b8a87daa769194cfcea11db78 Mon Sep 17 00:00:00 2001
From: Luca Guzzi
Date: Wed, 4 Nov 2020 12:04:53 +0100
Subject: [PATCH 16/16] update readme for validation tool

---
 README.md | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/README.md b/README.md
index 336ed911878..946f135a9f5 100644
--- a/README.md
+++ b/README.md
@@ -161,6 +161,35 @@ ShuffleMerge --cfg TauML/Analysis/config/testing_inputs.cfg --input tuples-v2 --
                     --n-threads 12 --disabled-branches "trainingWeight"
 ```
 
+#### Validation
+A validation can be run on shuffled samples to ensure that different parts of the training set have compatible distributions.
+To run the validation tool, a ROOT version greater than or equal to 6.16 is needed:
+```
+source /cvmfs/sft.cern.ch/lcg/views/LCG_97apython3/x86_64-centos7-clang10-opt/setup.sh
+```
+Then, run:
+```
+python TauMLTools/Production/scripts/validation_tool.py --input "/path/to/input/*.root" \
+                                                        --output output_directory \
+                                                        --n_threads n_threads \
+                                                        --legend > results.txt
+```
+The script will create the directory "output_directory" containing the results of the test.
+Validation is run on the following distributions with a Kolmogorov-Smirnov test:
+
+- dataset_id, dataset_group_id, lepton_gen_match, sampleType
+- tau_pt and tau_eta for each bin of the variables above
+- dataset_id for each bin of dataset_group_id
+
+If a KS test is not successful, a warning message is printed on screen.
+
+Optional arguments can be listed by running:
+```
+python TauMLTools/Production/scripts/validation_tool.py --help
+```
+
+A time benchmark is available [here](https://github.com/cms-tau-pog/TauMLTools/pull/31#issue-510206277).
+
 ### Production of flat inputs
 In this stage, `TauTuple`s are transformed into flat [TrainingTuples](https://github.com/cms-tau-pog/TauMLTools/blob/master/Analysis/interface/TrainingTuple.h) that are suitable as an input for the training.
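A note on consuming the output: `save_data` in patch 14 nests the JSON by the components of each entry's `tdir`, with a `pvalues` list at each leaf. One possible way to list the failing tests from `pvalues.json` is sketched below, under that assumption about the layout; the 0.05 cut mirrors the script's default `--pvthreshold`, and the path to the file is hypothetical:

```python
import json

def walk(node, path=()):
    # Recursively yield ('a/b/c', [p-values]) pairs from the nested dictionary.
    for key, value in node.items():
        if key == 'pvalues':
            yield '/'.join(path), value
        elif isinstance(value, dict):
            for found in walk(value, path + (key,)):
                yield found

with open('output_directory/pvalues.json') as jf:
    results = json.load(jf)

for tdir, pvalues in walk(results):
    # pvalue = 99 flags an empty histogram, not a genuine test result
    failures = [pv for pv in pvalues if pv != 99 and pv < 0.05]
    if failures:
        print(tdir, failures)
```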