Skip to content

Commit

Permalink
Better reporting of CPU timing
Browse files Browse the repository at this point in the history
The Timing service reported CPU time for child processes as part of the total CPU time, but not as part of the loop time.
The timing summary did not account for child processes at all.
  • Loading branch information
Dr15Jones committed Aug 18, 2021
1 parent 294a387 commit 9be29e8
Show file tree
Hide file tree
Showing 5 changed files with 211 additions and 21 deletions.
2 changes: 2 additions & 0 deletions FWCore/Framework/interface/SystemTimeKeeper.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

// user include files
#include "FWCore/Utilities/interface/CPUTimer.h"
#include "FWCore/Utilities/interface/ChildrenCPUTimer.h"
#include "FWCore/Utilities/interface/WallclockTimer.h"

// forward declarations
Expand Down Expand Up @@ -106,6 +107,7 @@ namespace edm {
std::vector<std::vector<std::string>> m_modulesOnPaths;

CPUTimer m_processingLoopTimer;
ChildrenCPUTimer m_processingLoopChildrenTimer;
ProcessContext const* m_processContext;

unsigned int m_minModuleID;
Expand Down
12 changes: 9 additions & 3 deletions FWCore/Framework/src/SystemTimeKeeper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,9 +202,15 @@ void SystemTimeKeeper::restartModuleEvent(StreamContext const& iStream, ModuleCa
}
}

void SystemTimeKeeper::startProcessingLoop() { m_processingLoopTimer.start(); }
void SystemTimeKeeper::startProcessingLoop() {
m_processingLoopTimer.start();
m_processingLoopChildrenTimer.start();
}

void SystemTimeKeeper::stopProcessingLoop() { m_processingLoopTimer.stop(); }
void SystemTimeKeeper::stopProcessingLoop() {
m_processingLoopTimer.stop();
m_processingLoopChildrenTimer.stop();
}

static void fillPathSummary(unsigned int iStartIndex,
unsigned int iEndIndex,
Expand Down Expand Up @@ -248,7 +254,7 @@ void SystemTimeKeeper::fillTriggerTimingReport(TriggerTimingReport& rep) const {
sumEventTime += stream.realTime();
}
rep.eventSummary.realTime = m_processingLoopTimer.realTime();
rep.eventSummary.cpuTime = m_processingLoopTimer.cpuTime();
rep.eventSummary.cpuTime = m_processingLoopTimer.cpuTime() + m_processingLoopChildrenTimer.cpuTime();
rep.eventSummary.sumStreamRealTime = sumEventTime;
}

Expand Down
49 changes: 31 additions & 18 deletions FWCore/Services/plugins/Timing.cc
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,8 @@ namespace edm {
double curr_job_cpu_; // seconds
std::atomic<double> extra_job_cpu_; //seconds
//use last run time for determining end of processing
std::atomic<double> last_run_time_;
std::atomic<double> last_run_cpu_;
std::atomic<double> end_loop_time_;
std::atomic<double> end_loop_cpu_;
std::vector<double> curr_events_time_; // seconds
bool summary_only_;
bool report_summary_;
Expand Down Expand Up @@ -161,6 +161,16 @@ namespace edm {
return static_cast<double>(t.tv_sec) + (static_cast<double>(t.tv_usec) * 1E-6);
}

static double getChildrenCPU() {
struct rusage usage;

getrusage(RUSAGE_CHILDREN, &usage);
double totalCPUTime = (double)usage.ru_utime.tv_sec + (double(usage.ru_utime.tv_usec) * 1E-6);
totalCPUTime += (double)usage.ru_stime.tv_sec + (double(usage.ru_stime.tv_usec) * 1E-6);

return totalCPUTime;
}

static double getCPU() {
struct rusage usage;
getrusage(RUSAGE_SELF, &usage);
Expand Down Expand Up @@ -208,8 +218,8 @@ namespace edm {
: curr_job_time_(0.),
curr_job_cpu_(0.),
extra_job_cpu_(0.0),
last_run_time_(0.0),
last_run_cpu_(0.0),
end_loop_time_(0.0),
end_loop_cpu_(0.0),
curr_events_time_(),
summary_only_(iPS.getUntrackedParameter<bool>("summaryOnly")),
report_summary_(iPS.getUntrackedParameter<bool>("useJobReport")),
Expand All @@ -228,6 +238,10 @@ namespace edm {
nSubProcesses_{0} {
iRegistry.watchPreBeginJob(this, &Timing::preBeginJob);
iRegistry.watchPostBeginJob(this, &Timing::postBeginJob);
iRegistry.preEndJobSignal_.connect([this]() {
end_loop_time_ = getTime();
end_loop_cpu_ = getCPU();
});
iRegistry.watchPostEndJob(this, &Timing::postEndJob);

iRegistry.watchPreEvent(this, &Timing::preEvent);
Expand Down Expand Up @@ -311,11 +325,6 @@ namespace edm {
countSubProcessesPostEvent_.emplace_back(std::make_unique<std::atomic<unsigned int>>(0));
}
});

iRegistry.postGlobalEndRunSignal_.connect([this](edm::GlobalContext const&) {
last_run_time_ = getTime();
last_run_cpu_ = getCPU();
});
}

Timing::~Timing() {}
Expand Down Expand Up @@ -383,6 +392,8 @@ namespace edm {

double total_job_cpu = job_end_cpu + extra_job_cpu_;

const double job_end_children_cpu = getChildrenCPU();

const double total_initialization_time = curr_job_time_ - jobStartTime();
const double total_initialization_cpu = curr_job_cpu_;

Expand All @@ -395,10 +406,10 @@ namespace edm {
double min_event_time = *(std::min_element(min_events_time_.begin(), min_events_time_.end()));
double max_event_time = *(std::max_element(max_events_time_.begin(), max_events_time_.end()));

auto total_loop_time = last_run_time_ - curr_job_time_;
auto total_loop_cpu = last_run_cpu_ + extra_job_cpu_ - curr_job_cpu_;
auto total_loop_time = end_loop_time_ - curr_job_time_;
auto total_loop_cpu = end_loop_cpu_ + extra_job_cpu_ - curr_job_cpu_;

if (last_run_time_ == 0.0) {
if (end_loop_time_ == 0.0) {
total_loop_time = 0.0;
total_loop_cpu = 0.0;
}
Expand Down Expand Up @@ -427,14 +438,15 @@ namespace edm {
<< " - Total loop: " << total_loop_time << "\n"
<< " - Total init: " << total_initialization_time << "\n"
<< " - Total job: " << total_job_time << "\n"
<< " - EventSetup Lock: " << accumulatedTimeForLock_ << "\n"
<< " - EventSetup Get: " << accumulatedTimeForGet_ << "\n"
<< " - EventSetup Lock: " << accumulatedTimeForLock_ << "\n"
<< " - EventSetup Get: " << accumulatedTimeForGet_ << "\n"
<< " Event Throughput: " << event_throughput << " ev/s\n"
<< " CPU Summary: \n"
<< " - Total loop: " << total_loop_cpu << "\n"
<< " - Total init: " << total_initialization_cpu << "\n"
<< " - Total extra: " << extra_job_cpu_ << "\n"
<< " - Total job: " << total_job_cpu << "\n"
<< " - Total loop: " << total_loop_cpu << "\n"
<< " - Total init: " << total_initialization_cpu << "\n"
<< " - Total extra: " << extra_job_cpu_ << "\n"
<< " - Total children: " << job_end_children_cpu << "\n"
<< " - Total job: " << total_job_cpu << "\n"
<< " Processing Summary: \n"
<< " - Number of Events: " << total_event_count_ << "\n"
<< " - Number of Global Begin Lumi Calls: " << begin_lumi_count_ << "\n"
Expand All @@ -450,6 +462,7 @@ namespace edm {
reportData.insert(std::make_pair("EventThroughput", d2str(event_throughput)));
reportData.insert(std::make_pair("TotalJobTime", d2str(total_job_time)));
reportData.insert(std::make_pair("TotalJobCPU", d2str(total_job_cpu)));
reportData.insert(std::make_pair("TotalJobChildrenCPU", d2str(job_end_children_cpu)));
reportData.insert(std::make_pair("TotalLoopTime", d2str(total_loop_time)));
reportData.insert(std::make_pair("TotalLoopCPU", d2str(total_loop_cpu)));
reportData.insert(std::make_pair("TotalInitTime", d2str(total_initialization_time)));
Expand Down
59 changes: 59 additions & 0 deletions FWCore/Utilities/interface/ChildrenCPUTimer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#ifndef FWCore_Utilities_ChildrenCPUTimer_h
#define FWCore_Utilities_ChildrenCPUTimer_h
// -*- C++ -*-
//
// Package: Utilities
// Class : ChildrenCPUTimer
//
/**\class ChildrenCPUTimer ChildrenCPUTimer.h FWCore/Utilities/interface/ChildrenCPUTimer.h
Description: Timer which measures the CPU time for child processes
Usage:
<usage>
*/
//
// Original Author: Chris Jones
// Created: Sun Apr 16 20:32:13 EDT 2006
//

// system include files
#include <sys/time.h>

// user include files

// forward declarations
namespace edm {
class ChildrenCPUTimer {
public:
ChildrenCPUTimer();
~ChildrenCPUTimer();
ChildrenCPUTimer(ChildrenCPUTimer&&) = default;
ChildrenCPUTimer(const ChildrenCPUTimer&) = delete;
ChildrenCPUTimer& operator=(const ChildrenCPUTimer&) = delete;

// ---------- const member functions ---------------------
double cpuTime() const;

// ---------- static member functions --------------------

// ---------- member functions ---------------------------
void start();
double stop(); //returns delta time
void reset();

void add(double t);

private:
double calculateDeltaTime() const;

// ---------- member data --------------------------------
enum State { kRunning, kStopped } state_;
struct timeval startCPUTime_;

double accumulatedCPUTime_;
};
} // namespace edm

#endif
110 changes: 110 additions & 0 deletions FWCore/Utilities/src/ChildrenCPUTimer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// -*- C++ -*-
//
// Package: Utilities
// Class : ChildrenCPUTimer
//
// Implementation:
// <Notes on implementation>
//
// Original Author: Chris Jones
// Created: Sun Apr 16 20:32:20 EDT 2006
//

// system include files
#include <sys/resource.h>
#include <cerrno>

// user include files
#include "FWCore/Utilities/interface/ChildrenCPUTimer.h"
#include "FWCore/Utilities/interface/Exception.h"

//
// constants, enums and typedefs
//
using namespace edm;

//
// static data member definitions
//

//
// constructors and destructor
//
ChildrenCPUTimer::ChildrenCPUTimer() : state_(kStopped), startCPUTime_(), accumulatedCPUTime_(0) {
startCPUTime_.tv_sec = 0;
startCPUTime_.tv_usec = 0;
}

// ChildrenCPUTimer::ChildrenCPUTimer(ChildrenCPUTimer const& rhs) {
// // do actual copying here;
// }

ChildrenCPUTimer::~ChildrenCPUTimer() {}

//
// assignment operators
//
// ChildrenCPUTimer const& ChildrenCPUTimer::operator=(ChildrenCPUTimer const& rhs) {
// //An exception safe implementation is
// ChildrenCPUTimer temp(rhs);
// swap(rhs);
//
// return *this;
// }

//
// member functions
//
void ChildrenCPUTimer::start() {
if (kStopped == state_) {
rusage theUsage;
if (0 != getrusage(RUSAGE_CHILDREN, &theUsage)) {
throw cms::Exception("ChildrenCPUTimerFailed") << errno;
}
startCPUTime_.tv_sec = theUsage.ru_stime.tv_sec + theUsage.ru_utime.tv_sec;
startCPUTime_.tv_usec = theUsage.ru_stime.tv_usec + theUsage.ru_utime.tv_usec;
state_ = kRunning;
}
}

double ChildrenCPUTimer::stop() {
if (kRunning == state_) {
auto t = calculateDeltaTime();
accumulatedCPUTime_ += t;

state_ = kStopped;
return t;
}
return 0.;
}

void ChildrenCPUTimer::reset() { accumulatedCPUTime_ = 0; }

void ChildrenCPUTimer::add(double t) { accumulatedCPUTime_ += t; }

double ChildrenCPUTimer::calculateDeltaTime() const {
double returnValue;
double const microsecToSec = 1E-6;

rusage theUsage;
if (0 != getrusage(RUSAGE_CHILDREN, &theUsage)) {
throw cms::Exception("CPUTimerFailed") << errno;
}

returnValue = theUsage.ru_stime.tv_sec + theUsage.ru_utime.tv_sec - startCPUTime_.tv_sec +
microsecToSec * (theUsage.ru_stime.tv_usec + theUsage.ru_utime.tv_usec - startCPUTime_.tv_usec);
return returnValue;
}
//
// const member functions
//
double ChildrenCPUTimer::cpuTime() const {
if (kStopped == state_) {
return accumulatedCPUTime_;
}
return accumulatedCPUTime_ + calculateDeltaTime();
}

//
// static member functions
//

0 comments on commit 9be29e8

Please sign in to comment.