From 48389504d81ab1c230dd337c5b52b47295a4aaa2 Mon Sep 17 00:00:00 2001 From: Jonas Rembser Date: Tue, 23 Jul 2024 12:34:49 +0200 Subject: [PATCH] [RF] Support weighted filling from RDF to RooFit in RooAbsDataHelper Support weighted filling from RDF to RooFit in RooAbsDataHelper and also implement a test for it. Closes #7223. --- .../RDataFrameHelpers/inc/RooAbsDataHelper.h | 4 +- .../test/testActionHelpers.cxx | 245 +++++++++++------- roofit/roofitcore/inc/RooAbsDataFiller.h | 6 +- roofit/roofitcore/src/RooAbsDataFiller.cxx | 48 +++- 4 files changed, 196 insertions(+), 107 deletions(-) diff --git a/roofit/RDataFrameHelpers/inc/RooAbsDataHelper.h b/roofit/RDataFrameHelpers/inc/RooAbsDataHelper.h index a113751f88004..87bcf71122dba 100644 --- a/roofit/RDataFrameHelpers/inc/RooAbsDataHelper.h +++ b/roofit/RDataFrameHelpers/inc/RooAbsDataHelper.h @@ -31,6 +31,7 @@ /// - Book the helper as an RDataFrame action. Here, the RDataFrame column types have to be passed as template /// parameters. /// - Pass the column names to the Book action. These are matched by position to the variables of the dataset. +/// If there is one more column name than variables in the dataset, the last column's values will be used as weights. /// /// All arguments passed to are forwarded to RooDataSet::RooDataSet() / RooDataHist::RooDataHist(). 
/// @@ -42,7 +43,8 @@ /// RooDataSetHelper{"dataset", // Name (directly forwarded to RooDataSet::RooDataSet()) /// "Title of dataset", // Title ( ~ " ~ ) /// RooArgSet(x, y) }, // Variables to create in dataset -/// {"x", "y"} // Column names from RDataFrame +/// {"x", "y", "weight"} // Column names from RDataFrame +/// // (this example uses an additional column for the weight) /// ); /// /// ``` diff --git a/roofit/RDataFrameHelpers/test/testActionHelpers.cxx b/roofit/RDataFrameHelpers/test/testActionHelpers.cxx index 02089f0f26245..96fd796abadc8 100644 --- a/roofit/RDataFrameHelpers/test/testActionHelpers.cxx +++ b/roofit/RDataFrameHelpers/test/testActionHelpers.cxx @@ -14,116 +14,177 @@ #include "gtest/gtest.h" -TEST(RooAbsDataHelper, MTConstruction) +namespace { + +constexpr std::size_t nEvent = 200000; + +constexpr double targetXMean = 0.; +constexpr double targetYMean = 1.; +constexpr double targetXVar = 100. / 12.; +constexpr double targetYVar = 4. / 12.; + +auto makeDataFrame() { #ifdef R__USE_IMT - ROOT::EnableImplicitMT(4); + ROOT::EnableImplicitMT(4); #endif - // We create an RDataFrame with two columns filled with 2 million random numbers. - constexpr std::size_t nEvent = 200000; - ROOT::RDataFrame d(nEvent); - auto dd = - d.DefineSlot("x", [=](unsigned int /*slot*/, ULong64_t entry) { return -5. + 10. * ((double)entry) / nEvent; }, + return ROOT::RDataFrame{nEvent} + .DefineSlot("x", [=](unsigned int /*slot*/, ULong64_t entry) { return -5. + 10. * ((double)entry) / nEvent; }, {"rdfentry_"}) - .DefineSlot("y", [=](unsigned int /*slot*/, ULong64_t entry) { return 0. + 2. * ((double)entry) / nEvent; }, - {"rdfentry_"}); - auto meanX = dd.Mean("x"); - auto meanY = dd.Mean("y"); - - constexpr double targetXMean = 0.; - constexpr double targetYMean = 1.; - constexpr double targetXVar = 100./12.; - constexpr double targetYVar = 4./12.; - - // We create RooFit variables that will represent the dataset. 
- RooRealVar x("x", "x", -5., 5.); - RooRealVar y("y", "y", -50., 50.); - x.setBins(10); - y.setBins(100); - - - auto rooDataSet = dd.Book( - RooDataSetHelper("dataset", // Name - "Title of dataset", // Title - RooArgSet(x, y) // Variables in this dataset - ), - {"x", "y"} // Column names in RDataFrame. - ); - - RooDataHistHelper rdhMaker{"datahist", // Name - "Title of data hist", // Title - RooArgSet(x, y) // Variables in this dataset - }; - - // Then, we move it into the RDataFrame action: - auto rooDataHist = dd.Book(std::move(rdhMaker), {"x", "y"}); - - - - // Run it and inspect the results - // ------------------------------- - EXPECT_NEAR(meanX.GetValue(), targetXMean, 1.E-4); - EXPECT_NEAR(meanY.GetValue(), targetYMean, 1.E-4); - - ASSERT_EQ(rooDataSet->numEntries(), nEvent); - EXPECT_NEAR(rooDataSet->sumEntries(), nEvent, nEvent * 1.E-9); - EXPECT_NEAR(rooDataSet->mean(x), targetXMean, 1.E-4); - EXPECT_NEAR(rooDataSet->moment(x, 2.), targetXVar, targetXVar * 1.E-4); - EXPECT_NEAR(rooDataSet->mean(y), targetYMean, 1.E-4); - EXPECT_NEAR(rooDataSet->moment(y, 2.), targetYVar, targetYVar * 1.E-4); - - EXPECT_NEAR(rooDataHist->sumEntries(), nEvent, nEvent * 1.E-9); - EXPECT_NEAR(rooDataHist->mean(x), targetXMean, 1.E-4); - EXPECT_NEAR(rooDataHist->moment(x, 2.), 8.25, 1.E-2); // Variance is affected in a binned distribution - EXPECT_NEAR(rooDataHist->mean(y), targetYMean, 1.E-4); - EXPECT_NEAR(rooDataHist->moment(y, 2.), 0.25, 1.E-2); // Variance is affected in a binned distribution + .DefineSlot("y", [=](unsigned int /*slot*/, ULong64_t entry) { return 0. + 2. * ((double)entry) / nEvent; }, + {"rdfentry_"}) + .DefineSlot("w", [=](unsigned int /*slot*/, ULong64_t /*entry*/) { return 0.5; }, {"rdfentry_"}); +} + +RooArgSet makeVariablesSet() +{ + // We create RooFit variables that will represent the dataset. 
+ auto x = std::make_unique("x", "x", -5., 5.); + auto y = std::make_unique("y", "y", -50., 50.); + x->setBins(10); + y->setBins(100); + + RooArgSet out; + out.addOwned(std::move(x)); + out.addOwned(std::move(y)); + return out; +} + +} // namespace + +TEST(RooAbsDataHelper, RooDataSet) +{ + auto dd = makeDataFrame(); + RooArgSet vars = makeVariablesSet(); + RooRealVar &x = static_cast(vars["x"]); + RooRealVar &y = static_cast(vars["y"]); + + RooDataSetHelper rdsMaker{"dataset", "Title of dataset", vars}; + + auto rooDataSet = dd.Book(std::move(rdsMaker), {"x", "y"}); + + double sumEntries = nEvent; + + ASSERT_EQ(rooDataSet->numEntries(), nEvent); + EXPECT_NEAR(rooDataSet->sumEntries(), sumEntries, sumEntries * 1.E-9); + EXPECT_NEAR(rooDataSet->mean(x), targetXMean, 1.E-4); + EXPECT_NEAR(rooDataSet->moment(x, 2.), targetXVar, targetXVar * 1.E-4); + EXPECT_NEAR(rooDataSet->mean(y), targetYMean, 1.E-4); + EXPECT_NEAR(rooDataSet->moment(y, 2.), targetYVar, targetYVar * 1.E-4); +} + +TEST(RooAbsDataHelper, RooDataSetWeighted) +{ + auto dd = makeDataFrame(); + RooArgSet vars = makeVariablesSet(); + RooRealVar &x = static_cast(vars["x"]); + RooRealVar &y = static_cast(vars["y"]); + + RooDataSetHelper rdsMaker{"dataset", "Title of dataset", vars, RooFit::WeightVar("w")}; + + auto rooDataSet = dd.Book(std::move(rdsMaker), {"x", "y", "w"}); + + auto sumEntriesResult = dd.Sum("w"); + + double sumEntries = sumEntriesResult.GetValue(); + + ASSERT_EQ(rooDataSet->numEntries(), nEvent); + EXPECT_NEAR(rooDataSet->sumEntries(), sumEntries, sumEntries * 1.E-9); + EXPECT_NEAR(rooDataSet->mean(x), targetXMean, 1.E-4); + EXPECT_NEAR(rooDataSet->moment(x, 2.), targetXVar, targetXVar * 1.E-4); + EXPECT_NEAR(rooDataSet->mean(y), targetYMean, 1.E-4); + EXPECT_NEAR(rooDataSet->moment(y, 2.), targetYVar, targetYVar * 1.E-4); } +TEST(RooAbsDataHelper, RooDataHist) +{ + auto dd = makeDataFrame(); + RooArgSet vars = makeVariablesSet(); + RooRealVar &x = static_cast(vars["x"]); + RooRealVar &y = 
static_cast(vars["y"]); + + RooDataHistHelper rdhMaker{"datahist", "Title of data hist", vars}; + + // Then, we move it into the RDataFrame action: + auto rooDataHist = dd.Book(std::move(rdhMaker), {"x", "y"}); + + double sumEntries = nEvent; + + EXPECT_NEAR(rooDataHist->sumEntries(), sumEntries, sumEntries * 1.E-9); + EXPECT_NEAR(rooDataHist->mean(x), targetXMean, 1.E-4); + EXPECT_NEAR(rooDataHist->moment(x, 2.), 8.25, 1.E-2); // Variance is affected in a binned distribution + EXPECT_NEAR(rooDataHist->mean(y), targetYMean, 1.E-4); + EXPECT_NEAR(rooDataHist->moment(y, 2.), 0.25, 1.E-2); // Variance is affected in a binned distribution +} + +TEST(RooAbsDataHelper, RooDataHistWeighted) +{ + auto dd = makeDataFrame(); + RooArgSet vars = makeVariablesSet(); + RooRealVar &x = static_cast(vars["x"]); + RooRealVar &y = static_cast(vars["y"]); + + RooDataHistHelper rdhMaker{"datahist", "Title of data hist", vars}; + + // Then, we move it into the RDataFrame action: + auto rooDataHist = dd.Book(std::move(rdhMaker), {"x", "y", "w"}); + + auto sumEntriesResult = dd.Sum("w"); + + double sumEntries = sumEntriesResult.GetValue(); + + EXPECT_NEAR(rooDataHist->sumEntries(), sumEntries, sumEntries * 1.E-9); + EXPECT_NEAR(rooDataHist->mean(x), targetXMean, 1.E-4); + EXPECT_NEAR(rooDataHist->moment(x, 2.), 8.25, 1.E-2); // Variance is affected in a binned distribution + EXPECT_NEAR(rooDataHist->mean(y), targetYMean, 1.E-4); + EXPECT_NEAR(rooDataHist->moment(y, 2.), 0.25, 1.E-2); // Variance is affected in a binned distribution +} /// This test verifies that out-of-range events are correctly skipped, /// consistent with the construction of a RooDataSet from a TTree. 
-TEST(RooAbsDataHelper, SkipEventsOutOfRange) { +TEST(RooAbsDataHelper, SkipEventsOutOfRange) +{ - RooMsgService::instance().getStream(0).removeTopic(RooFit::DataHandling); - RooMsgService::instance().getStream(1).removeTopic(RooFit::DataHandling); + RooMsgService::instance().getStream(0).removeTopic(RooFit::DataHandling); + RooMsgService::instance().getStream(1).removeTopic(RooFit::DataHandling); - std::size_t nEvents = 100; - const char * filename = "testRooAbsDataHelper_SkipEventsOutOfRange_tree.root"; - const char * treename = "tree"; + std::size_t nEvents = 100; + const char *filename = "testRooAbsDataHelper_SkipEventsOutOfRange_tree.root"; + const char *treename = "tree"; - { - // Create the ROOT file with the dataset - ROOT::RDataFrame rdf(nEvents); - auto rdf_x = rdf.Define("x", [](){ return gRandom->Gaus(0.0, 1.0); }); - rdf_x.Snapshot(treename, filename); - // We can't reuse the same RDataFrame now, because when we rerun the event - // loop it would generate new random values. So this scope ends here and we - // open a new RDF from the file later. - } + { + // Create the ROOT file with the dataset + ROOT::RDataFrame rdf(nEvents); + auto rdf_x = rdf.Define("x", []() { return gRandom->Gaus(0.0, 1.0); }); + rdf_x.Snapshot(treename, filename); + // We can't reuse the same RDataFrame now, because when we rerun the event + // loop it would generate new random values. So this scope ends here and we + // open a new RDF from the file later. 
+ } - // Open dataset with RDataFrame and TTree - std::unique_ptr file{TFile::Open(filename, "READ")}; - auto tree = file->Get(treename); - ROOT::RDataFrame rdf(treename, filename); + // Open dataset with RDataFrame and TTree + std::unique_ptr file{TFile::Open(filename, "READ")}; + auto tree = file->Get(treename); + ROOT::RDataFrame rdf(treename, filename); - // Create RooFit variable - RooRealVar x{"x", "x", 0.0, -2.0, 2.0}; + // Create RooFit variable + RooRealVar x{"x", "x", 0.0, -2.0, 2.0}; - // Create a RooDataset from the TTree, and one from the RDataFrame - RooDataSet dataSetTree{"dataSetTree", "dataSetTree", x, RooFit::Import(*tree)}; - auto dataSetRDF = rdf.Book(RooDataSetHelper("dataSetRDF", "dataSetRDF", RooArgSet(x)), {"x"}); + // Create a RooDataset from the TTree, and one from the RDataFrame + RooDataSet dataSetTree{"dataSetTree", "dataSetTree", x, RooFit::Import(*tree)}; + auto dataSetRDF = rdf.Book(RooDataSetHelper("dataSetRDF", "dataSetRDF", RooArgSet(x)), {"x"}); - // Check if in the creation of the datasets, the entries outside the - // variable range were successfully discarded. - double nPassing = *rdf.Filter("x >= -2 && x <= 2.0").Count(); + // Check if in the creation of the datasets, the entries outside the + // variable range were successfully discarded. 
+ double nPassing = *rdf.Filter("x >= -2 && x <= 2.0").Count(); - EXPECT_EQ(dataSetRDF->numEntries(), nPassing); - EXPECT_EQ(dataSetTree.numEntries(), nPassing); + EXPECT_EQ(dataSetRDF->numEntries(), nPassing); + EXPECT_EQ(dataSetTree.numEntries(), nPassing); - file.reset(); // Close file - gSystem->Unlink(filename); // delete temporary file + file.reset(); // Close file + gSystem->Unlink(filename); // delete temporary file - RooMsgService::instance().getStream(0).addTopic(RooFit::DataHandling); - RooMsgService::instance().getStream(1).addTopic(RooFit::DataHandling); + RooMsgService::instance().getStream(0).addTopic(RooFit::DataHandling); + RooMsgService::instance().getStream(1).addTopic(RooFit::DataHandling); } diff --git a/roofit/roofitcore/inc/RooAbsDataFiller.h b/roofit/roofitcore/inc/RooAbsDataFiller.h index c996720b303da..274549b78a5e2 100644 --- a/roofit/roofitcore/inc/RooAbsDataFiller.h +++ b/roofit/roofitcore/inc/RooAbsDataFiller.h @@ -53,11 +53,15 @@ class RooAbsDataFiller { protected: void FillAbsData(const std::vector &events, unsigned int eventSize); - std::mutex _mutex_dataset; + std::mutex _mutexDataset; std::size_t _numInvalid = 0; std::vector> _events; // One vector of values per data-processing slot std::size_t _eventSize; // Number of variables in dataset + std::size_t _nValues; // Number of variables in dataframe + + bool _isWeighted = false; + bool _isDataHist = false; }; } // namespace Detail diff --git a/roofit/roofitcore/src/RooAbsDataFiller.cxx b/roofit/roofitcore/src/RooAbsDataFiller.cxx index e4b21c901a760..d32fc1e0f8bc6 100644 --- a/roofit/roofitcore/src/RooAbsDataFiller.cxx +++ b/roofit/roofitcore/src/RooAbsDataFiller.cxx @@ -29,7 +29,11 @@ RooAbsDataFiller::RooAbsDataFiller() void RooAbsDataFiller::Initialize() { - _eventSize = GetAbsData().get()->size(); + RooAbsData &absData = GetAbsData(); + + _eventSize = absData.get()->size(); + _isWeighted = absData.isWeighted(); + _isDataHist = std::string{absData.ClassName()} != "RooDataSet"; } 
/// Append all `events` to the internal RooDataSet or increment the bins of a RooDataHist at the given locations. @@ -46,8 +50,15 @@ void RooAbsDataFiller::FillAbsData(const std::vector &events, unsigned i RooAbsData &absData = GetAbsData(); const RooArgSet &argSet = *absData.get(); + // Relevant for weighted RooDataSet + RooRealVar *weightVar = !_isDataHist && _isWeighted ? static_cast(absData).weightVar() : nullptr; + for (std::size_t i = 0; i < events.size(); i += eventSize) { + // The RooDataHist has no dedicated RooRealVar for the weight. So we just + // use a double. + double weightVal = 1.0; + // Creating a RooDataSet from an RDataFrame should be consistent with the // creation from a TTree. The construction from a TTree discards entries // outside the variable definition range, so we have to do that too (see @@ -55,10 +66,15 @@ void RooAbsDataFiller::FillAbsData(const std::vector &events, unsigned i bool allOK = true; for (std::size_t j = 0; j < eventSize; ++j) { - auto *destArg = static_cast(argSet[j]); + RooAbsRealLValue *destArg = nullptr; + if (j < argSet.size()) { + destArg = static_cast(argSet[j]); + } else { + destArg = weightVar; + } double sourceVal = events[i + j]; - if (!destArg->inRange(sourceVal, nullptr)) { + if (destArg && !destArg->inRange(sourceVal, nullptr)) { _numInvalid++; allOK = false; const auto prefix = std::string(absData.ClassName()) + "Helper::FillAbsData(" + absData.GetName() + ") "; @@ -73,10 +89,14 @@ void RooAbsDataFiller::FillAbsData(const std::vector &events, unsigned i } break; } - destArg->setVal(sourceVal); + if (destArg) { + destArg->setVal(sourceVal); + } else { + weightVal = sourceVal; + } } if (allOK) { - absData.add(argSet); + absData.add(argSet, weightVar ? 
weightVar->getVal() : weightVal); } } } @@ -87,7 +107,7 @@ void RooAbsDataFiller::Finalize() RooAbsData &absData = GetAbsData(); for (auto &vector : _events) { - FillAbsData(vector, _eventSize); + FillAbsData(vector, _nValues); vector.clear(); } @@ -97,17 +117,19 @@ void RooAbsDataFiller::Finalize() } } -void RooAbsDataFiller::ExecImpl(std::size_t nValues, std::vector& vector) +void RooAbsDataFiller::ExecImpl(std::size_t nValues, std::vector &vector) { - if (nValues != _eventSize) { - throw std::invalid_argument(std::string("RooDataSet can hold ") + std::to_string(_eventSize) + - " variables per event, but RDataFrame passed " + + if (nValues != _eventSize && !(_isWeighted && nValues == _eventSize + 1)) { + throw std::invalid_argument(std::string("RooAbsData can hold ") + std::to_string(_eventSize) + + " variables per event (plus an optional weight in case of weighted data), but RDataFrame passed " + std::to_string(nValues) + " columns."); } - if (vector.size() > 1024 && _mutex_dataset.try_lock()) { - const std::lock_guard guard(_mutex_dataset, std::adopt_lock_t()); - FillAbsData(vector, _eventSize); + _nValues = nValues; + + if (vector.size() > 1024 && _mutexDataset.try_lock()) { + const std::lock_guard guard(_mutexDataset, std::adopt_lock_t()); + FillAbsData(vector, _nValues); vector.clear(); } }