
Commit 4fc9573
Merge pull request #31 from lguzzi/master
adding validation tool (KS test)
kandrosov authored Nov 4, 2020
2 parents 514fcbb + 1e7bf8a commit 4fc9573
Showing 2 changed files with 227 additions and 0 deletions.
198 changes: 198 additions & 0 deletions Production/scripts/validation_tool.py
@@ -0,0 +1,198 @@
from __future__ import print_function

import ROOT
import glob
import json
from collections import OrderedDict

import argparse
parser = argparse.ArgumentParser(description = '''
The script runs a binned KS test between different chunks of the same RDataFrame. For simplicity, all chunks are compared to the first one.
If a KS test is below the threshold, a warning message is printed on screen.
NOTE: the binning of each variable must be hard-coded in the script (using the BINS dictionary)
NOTE: pvalue = 99 means that one of the two histograms is empty.
''')

parser.add_argument('--input' , required = True, type = str, help = 'input file. Accepts glob patterns (use quotes)')
parser.add_argument('--output' , required = True, type = str, help = 'output directory name')
parser.add_argument('--nsplit' , default = 100 , type = int, help = 'number of chunks per file')
parser.add_argument('--pvthreshold' , default = .05 , type = float, help = 'threshold of the KS test (above = ok)')
parser.add_argument('--n_threads' , default = 1 , type = int, help = 'enable ROOT implicit multithreading')

parser.add_argument('--visual', action = 'store_true', help = 'Do not run the script in batch mode')
parser.add_argument('--legend', action = 'store_true', help = 'Draw a TLegend on canvases')
args = parser.parse_args()

ROOT.gROOT.SetBatch(not args.visual)
ROOT.gStyle.SetOptStat(0)

import os
pdf_dir = '/'.join([args.output, 'pdf'])
if not os.path.exists(pdf_dir):
    os.makedirs(pdf_dir)

JSON_DICT = OrderedDict()
OUTPUT_ROOT = ROOT.TFile.Open('{}/histograms.root'.format(args.output), 'RECREATE')
OUTPUT_JSON = open('{}/pvalues.json'.format(args.output), 'w')
N_SPLIT = args.nsplit
PVAL_THRESHOLD = args.pvthreshold

## binning of tested variables
BINS = {
'tau_pt' : (50, 0, 5000),
'tau_eta' : (5, -3.2, 3.2),
'lepton_gen_match' : (20, -1, 19),
'sampleType': (20, -1, 19),
'dataset_id': (20, -1, 19),
'dataset_group_id': (20, -1, 19),
}
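## each tuple is (nbins, min, max); model() appends it to the chunk_id axis definition of the histogram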

class Lazy_container:
    def __init__(self, ptr, hst = None):
        self.ptr = ptr
        self.hst = hst
    def load_histogram(self):
        self.hst = self.ptr.GetValue()

class Entry: ## 2D entry (chunk_id x variable)
    def __init__(self, var, histo, tdir = None):
        self.var = var
        self.hst = histo
        self.tdir = tdir if tdir is not None else self.var

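    ## compare each chunk to the first one with a binned KS test; p-values below PVAL_THRESHOLD trigger a warning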
    def run_KS_test(self, norm = True):
        self.chunks = [self.hst.ProjectionY('chunk_{}'.format(cc), cc+1, cc+1).Clone() for cc in range(N_SPLIT)]

        self.chunks[0].SetMarkerStyle(20)

        for jj, hh in enumerate(self.chunks):
            hh.SetTitle(self.tdir)
            hh.Sumw2()
            hh.SetLineColor(jj+1)
            if hh.Integral() and norm:
                hh.Scale(1. / hh.Integral())

        self.chunks[0].GetYaxis().SetRangeUser(0, 1.1*max(hh.GetMaximum() for hh in self.chunks))

        if not self.chunks[0].Integral():
            print('[WARNING] control histogram is empty inside {}'.format(self.tdir))

        self.pvalues = [self.chunks[0].KolmogorovTest(hh) if self.chunks[0].Integral()*hh.Integral() else 99 for hh in self.chunks]
        if not all(pv >= PVAL_THRESHOLD for pv in self.pvalues):
            print('[WARNING] KS test failed for step {}. p-values are:'.format(self.tdir))
            print('\t', self.pvalues)

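    ## write the chunk histograms, the overlay canvas and the p-values to the output ROOT file and the JSON summary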
    def save_data(self):
        OUTPUT_ROOT.cd()

        if not OUTPUT_ROOT.GetDirectory(self.tdir):
            OUTPUT_ROOT.mkdir(self.tdir)

        OUTPUT_ROOT.cd(self.tdir)

        can = ROOT.TCanvas()
        leg = ROOT.TLegend(0.9, 0.1, 1., 0.9, "p-values (KS with the first chunk)")
        for ii, hh in enumerate(self.chunks):
            hh.Write()
            hh.Draw('PE'+' SAME'*(ii != 0))
            leg.AddEntry(hh, 'chunk %d - pval = %.3f' %(ii, self.pvalues[ii]), 'lep')
        if args.legend:
            leg.Draw("SAME")

        can.SaveAs('{}/pdf/{}.pdf'.format(args.output, self.tdir.replace('/', '_')), 'pdf')
        can.Write()

        OUTPUT_ROOT.cd()

        json_here = JSON_DICT
        for here in self.tdir.split('/'):
            if here not in json_here:
                json_here[here] = OrderedDict()
            json_here = json_here[here]
        json_here['pvalues'] = self.pvalues

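## select a single bin of the TH3 z axis (the binning variable) and return the corresponding (chunk_id, variable) projection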
def to_2D(histo, vbin):
    histo.GetZaxis().SetRange(vbin, vbin)
    return histo.Project3D('yx').Clone()

if __name__ == '__main__':
    print('[INFO] reading files', args.input)

    if args.n_threads > 1:
        ROOT.ROOT.EnableImplicitMT(args.n_threads)

    input_files = ROOT.std.vector('std::string')()

    for file in glob.glob(args.input):
        input_files.push_back(str(file))

    model = lambda main, third = None: \
        (main, '', N_SPLIT, 0, N_SPLIT) + BINS[main] + (BINS[third] if third is not None else ())

    dataframe = ROOT.RDataFrame('taus', input_files)
    tot_entries = dataframe.Count().GetValue()
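    ## chunk_id is computed in C++ with integer arithmetic, so it runs from 0 to N_SPLIT-1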
    dataframe = dataframe.Define('chunk_id', 'rdfentry_ * {} / {}'.format(N_SPLIT, tot_entries))

    ## unbinned distributions
    ptr_lgm = Lazy_container(dataframe.Histo2D(model('lepton_gen_match'), 'chunk_id', 'lepton_gen_match'))
    ptr_st  = Lazy_container(dataframe.Histo2D(model('sampleType')      , 'chunk_id', 'sampleType'      ))
    ptr_dgi = Lazy_container(dataframe.Histo2D(model('dataset_group_id'), 'chunk_id', 'dataset_group_id'))
    ptr_di  = Lazy_container(dataframe.Histo2D(model('dataset_id')      , 'chunk_id', 'dataset_id'      ))

    ## binned distributions
    ptrs_tau_pt = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('tau_pt', third = binned_variable), 'chunk_id', 'tau_pt', binned_variable))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
    }
    ptrs_tau_eta = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('tau_eta', third = binned_variable), 'chunk_id', 'tau_eta', binned_variable))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
    }
    ptrs_dataset_id = {
        binned_variable: Lazy_container(dataframe.Histo3D(model('dataset_id', third = binned_variable), 'chunk_id', 'dataset_id', binned_variable))
        for binned_variable in ['dataset_group_id']
    }

    lazy_containers = [ptr_lgm, ptr_st, ptr_dgi, ptr_di] + \
        list(ptrs_tau_pt.values()) + \
        list(ptrs_tau_eta.values()) + \
        list(ptrs_dataset_id.values())

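    ## the first GetValue() call triggers the RDataFrame event loop; all booked histograms are filled in a single pass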
    for lc in lazy_containers:
        lc.load_histogram()

    ## run validation
    entry_lgm = Entry(var = 'lepton_gen_match', histo = ptr_lgm.hst)
    entry_st  = Entry(var = 'sampleType'      , histo = ptr_st .hst)
    entry_dgi = Entry(var = 'dataset_group_id', histo = ptr_dgi.hst)
    entry_di  = Entry(var = 'dataset_id'      , histo = ptr_di .hst)

    entries_tau_pt = [
        Entry(var = 'tau_pt', histo = to_2D(ptrs_tau_pt[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'tau_pt']))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ]
    entries_tau_pt = [ee for ee in entries_tau_pt if ee.hst.GetEntries()]

    entries_tau_eta = [
        Entry(var = 'tau_eta', histo = to_2D(ptrs_tau_eta[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'tau_eta']))
        for binned_variable in ['lepton_gen_match', 'sampleType', 'dataset_group_id', 'dataset_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ]
    entries_tau_eta = [ee for ee in entries_tau_eta if ee.hst.GetEntries()]

    entries_dataset_id = [
        Entry(var = 'dataset_id', histo = to_2D(ptrs_dataset_id[binned_variable].hst, jj+1), tdir = '/'.join([binned_variable, str(bb), 'dataset_id']))
        for binned_variable in ['dataset_group_id']
        for jj, bb in enumerate(range(*BINS[binned_variable][1:]))
    ]
    entries_dataset_id = [ee for ee in entries_dataset_id if ee.hst.GetEntries()]

    entries = [entry_lgm, entry_st, entry_dgi, entry_di] + \
        entries_tau_pt + \
        entries_tau_eta + \
        entries_dataset_id

    for ee in entries:
        ee.run_KS_test()
        ee.save_data()

    OUTPUT_ROOT.Close()
    json.dump(JSON_DICT, OUTPUT_JSON, indent = 4)
    print('[INFO] all done. Files saved in', args.output)
29 changes: 29 additions & 0 deletions README.md
@@ -161,6 +161,35 @@ ShuffleMerge --cfg TauML/Analysis/config/testing_inputs.cfg --input tuples-v2 --
--n-threads 12 --disabled-branches "trainingWeight"
```

#### Validation
A validation can be run on the shuffled samples to ensure that different parts of the training set have compatible distributions.
To run the validation tool, a ROOT version greater than or equal to 6.16 is required:
```
source /cvmfs/sft.cern.ch/lcg/views/LCG_97apython3/x86_64-centos7-clang10-opt/setup.sh
```
Then, run:
```
python TauMLTools/Production/scripts/validation_tool.py --input "/path/to/input/*.root" \
--output output_directory \
--n_threads n_threads \
--legend > results.txt
```
The script creates the directory "output_directory" containing the results of the test.
The following distributions are validated with a Kolmogorov-Smirnov test:

- dataset_id, dataset_group_id, lepton_gen_match, sampleType
- tau_pt and tau_eta for each bin of the variables above
- dataset_id for each bin of dataset_group_id

If a KS test is not successful, a warning message is printed on screen.
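The p-values are also saved to `pvalues.json` in the output directory, mirroring the directory structure of `histograms.root`. As a minimal sketch (assuming the nested JSON layout written by `validation_tool.py`, where each tested distribution ends in a `pvalues` list), the failing tests can be collected like this:
```
import json

def failing_tests(node, threshold=0.05, path=''):
    """Recursively collect KS p-values below the threshold."""
    if 'pvalues' in node:
        # pvalue == 99 marks an empty histogram, not a real test result
        bad = [pv for pv in node['pvalues'] if pv != 99 and pv < threshold]
        if bad:
            print(path, bad)
    for key, child in node.items():
        if isinstance(child, dict):
            failing_tests(child, threshold, path + '/' + key)

with open('output_directory/pvalues.json') as jf:
    failing_tests(json.load(jf))
```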

The full list of optional arguments is available by running:
```
python TauMLTools/Production/scripts/validation_tool.py --help
```

A time benchmark is available [here](https://github.com/cms-tau-pog/TauMLTools/pull/31#issue-510206277).

### Production of flat inputs

In this stage, `TauTuple`s are transformed into flat [TrainingTuples](https://github.com/cms-tau-pog/TauMLTools/blob/master/Analysis/interface/TrainingTuple.h) that are suitable as an input for the training.
