Skip to content

Commit

Permalink
[RF] Skip out-of-range events in RDataFrame to RooDataSet Helper
Browse files Browse the repository at this point in the history
The RDataFrameHelper should be consistent with creating a RooDataSet
from a TTree, meaning out-of-range events should be skipped. This is
implemented in this commit, borrowing the logic from
`RooTreeDataStore::loadValues()`. A unit test is also implemented.

The previous logic of just taking just all values to fill the dataset
was very dangerous, because these values then clipped to the RooRealVar
limits and biased the number of events observed at the boundaries.

Closes root-project#11017.
  • Loading branch information
guitargeek committed Jul 21, 2022
1 parent 453e639 commit e444120
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 2 deletions.
38 changes: 36 additions & 2 deletions roofit/RDataFrameHelpers/inc/RooAbsDataHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <RooArgSet.h>
#include <RooDataSet.h>
#include <RooDataHist.h>
#include <RooMsgService.h>

#include <ROOT/RDataFrame.hxx>
#include <ROOT/RDF/ActionHelpers.hxx>
Expand Down Expand Up @@ -71,6 +72,7 @@ class RooAbsDataHelper : public ROOT::Detail::RDF::RActionImpl<RooAbsDataHelper<
private:
std::shared_ptr<DataSet_t> _dataset;
std::mutex _mutex_dataset;
std::size_t _numInvalid = 0;

std::vector<std::vector<double>> _events; // One vector of values per data-processing slot
const std::size_t _eventSize; // Number of variables in dataset
Expand Down Expand Up @@ -146,6 +148,11 @@ class RooAbsDataHelper : public ROOT::Detail::RDF::RActionImpl<RooAbsDataHelper<
FillDataSet(vector, _eventSize);
vector.clear();
}

if (_numInvalid>0) {
const auto prefix = std::string(_dataset->ClassName()) + "Helper::Finalize(" + _dataset->GetName() + ") ";
oocoutW(nullptr, DataHandling) << prefix << "Ignored " << _numInvalid << " out-of-range events\n";
}
}


Expand All @@ -163,10 +170,37 @@ class RooAbsDataHelper : public ROOT::Detail::RDF::RActionImpl<RooAbsDataHelper<
const RooArgSet& argSet = *_dataset->get();

for (std::size_t i = 0; i < events.size(); i += eventSize) {

// Creating a RooDataSet from an RDataFrame should be consistent with the
// creation from a TTree. The construction from a TTree discards entries
// outside the variable definition range, so we have to do that too (see
// also RooTreeDataStore::loadValues).

bool allOK = true;
for (std::size_t j=0; j < eventSize; ++j) {
static_cast<RooAbsRealLValue*>(argSet[j])->setVal(events[i+j]);
auto * destArg = static_cast<RooAbsRealLValue*>(argSet[j]);
double sourceVal = events[i+j];

if (!destArg->inRange(sourceVal, nullptr)) {
_numInvalid++ ;
allOK = false;
const auto prefix = std::string(_dataset->ClassName()) + "Helper::FillDataSet(" + _dataset->GetName() + ") ";
if (_numInvalid < 5) {
// Unlike in the TreeVectorStore case, we don't log the event
// number here because we don't know it anyway, because of
// RDataFrame slots and multithreading.
oocoutI(nullptr, DataHandling) << prefix << "Skipping event because " << destArg->GetName()
<< " cannot accommodate the value " << sourceVal << "\n";
} else if (_numInvalid == 5) {
oocoutI(nullptr, DataHandling) << prefix << "Skipping ...\n";
}
break ;
}
destArg->setVal(sourceVal);
}
if(allOK) {
_dataset->add(argSet);
}
_dataset->add(argSet);
}
}
};
Expand Down
51 changes: 51 additions & 0 deletions roofit/RDataFrameHelpers/test/testActionHelpers.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,12 @@
/// \author Stephan Hageboeck (CERN)

#include <RooAbsDataHelper.h>
#include <RooGlobalFunc.h>
#include <RooMsgService.h>

#include <TROOT.h>
#include <TRandom.h>
#include <TSystem.h>
#include <Rtypes.h>

#include "gtest/gtest.h"
Expand Down Expand Up @@ -76,3 +80,50 @@ TEST(RooAbsDataHelper, MTConstruction)
EXPECT_NEAR(rooDataHist->moment(y, 2.), 0.25, 1.E-2); // Variance is affected in a binned distribution
}


/// This test verifies that out-of-range events are correctly skipped,
/// consistent with the construction of a RooDataSet from a TTree.
TEST(RooAbsDataHelper, SkipEventsOutOfRange) {

RooMsgService::instance().getStream(0).removeTopic(RooFit::DataHandling);
RooMsgService::instance().getStream(1).removeTopic(RooFit::DataHandling);

std::size_t nEvents = 100;
const char * filename = "testRooAbsDataHelper_SkipEventsOutOfRange_tree.root";
const char * treename = "tree";

{
// Create the ROOT file with the dataset
ROOT::RDataFrame rdf(nEvents);
auto rdf_x = rdf.Define("x", [](){ return gRandom->Gaus(0.0, 1.0); });
rdf_x.Snapshot(treename, filename);
// We can't reuse the same RDataFrame now, because when we rerun the event
// loop it would generate new random values. So this scope ends here and we
// open a new RDF from the file later.
}

// Open dataset with RDataFrame and TTree
std::unique_ptr<TFile> file{TFile::Open(filename, "READ")};
auto tree = file->Get<TTree>(treename);
ROOT::RDataFrame rdf(treename, filename);

// Create RooFit variable
RooRealVar x{"x", "x", 0.0, -2.0, 2.0};

// Create a RooDataset from the TTree, and one from the RDataFrame
RooDataSet dataSetTree{"dataSetTree", "dataSetTree", tree, x};
auto dataSetRDF = rdf.Book<double>(RooDataSetHelper("dataSetRDF", "dataSetRDF", RooArgSet(x)), {"x"});

// Check if in the creation of the datasets, the entries outside the
// variable range were sucessfully discarded.
double nPassing = *rdf.Filter("x >= -2 && x <= 2.0").Count();

EXPECT_EQ(dataSetRDF->numEntries(), nPassing);
EXPECT_EQ(dataSetTree.numEntries(), nPassing);

file.reset(); // Close file
gSystem->Unlink(filename); // delete temporary file

RooMsgService::instance().getStream(0).addTopic(RooFit::DataHandling);
RooMsgService::instance().getStream(1).addTopic(RooFit::DataHandling);
}

0 comments on commit e444120

Please sign in to comment.