Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GPU backend option for TensorFlow session #40551

Merged
merged 7 commits into from
Feb 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions PhysicsTools/TensorFlow/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
<use name="FWCore/Framework"/>
<use name="FWCore/MessageLogger"/>
<use name="FWCore/Utilities"/>
<use name="FWCore/ServiceRegistry"/>
<export>
<lib name="1"/>
</export>
19 changes: 15 additions & 4 deletions PhysicsTools/TensorFlow/interface/TensorFlow.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@

namespace tensorflow {

// Computation backend requested for a TensorFlow session:
// cpu   - CPU only (GPU devices disallowed)
// cuda  - require an Nvidia GPU (setBackend throws if none is available)
// rocm  - AMD GPU (not yet supported by this TF build; setBackend throws)
// intel - Intel GPU (not yet supported by this TF build; setBackend throws)
// best  - use an Nvidia GPU when available, otherwise fall back to CPU
enum class Backend { cpu, cuda, rocm, intel, best };

typedef std::pair<std::string, Tensor> NamedTensor;
typedef std::vector<NamedTensor> NamedTensorList;

Expand All @@ -39,6 +41,10 @@ namespace tensorflow {
// since the threading configuration is done per run() call as of 2.1
void setThreading(SessionOptions& sessionOptions, int nThreads, const std::string& singleThreadPool);

// Set the backend option cpu/cuda
// The gpu memory is set to "allow_growth" to avoid TF getting all the CUDA memory at once.
void setBackend(SessionOptions& sessionOptions, Backend backend = Backend::cpu);

// loads a meta graph definition saved at exportDir using the SavedModel interface for a tag and
// predefined sessionOptions
// transfers ownership
Expand All @@ -52,11 +58,13 @@ namespace tensorflow {
// transfers ownership
MetaGraphDef* loadMetaGraphDef(const std::string& exportDir,
const std::string& tag = kSavedModelTagServe,
Backend backend = Backend::cpu,
int nThreads = 1);

// deprecated in favor of loadMetaGraphDef
MetaGraphDef* loadMetaGraph(const std::string& exportDir,
const std::string& tag = kSavedModelTagServe,
Backend backend = Backend::cpu,
int nThreads = 1);

// loads a graph definition saved as a protobuf file at pbFile
Expand All @@ -67,9 +75,9 @@ namespace tensorflow {
// transfers ownership
Session* createSession(SessionOptions& sessionOptions);

// return a new, empty session with nThreads
// return a new, empty session with nThreads and selected backend
// transfers ownership
Session* createSession(int nThreads = 1);
Session* createSession(Backend backend = Backend::cpu, int nThreads = 1);

// return a new session that will contain an already loaded meta graph whose exportDir must be
// given in order to load and initialize the variables, sessionOptions are predefined
Expand All @@ -83,7 +91,10 @@ namespace tensorflow {
// in order to load and initialize the variables, threading options are inferred from nThreads
// an error is thrown when metaGraphDef is a nullptr or when the graph has no nodes
// transfers ownership
Session* createSession(const MetaGraphDef* metaGraphDef, const std::string& exportDir, int nThreads = 1);
Session* createSession(const MetaGraphDef* metaGraphDef,
const std::string& exportDir,
Backend backend = Backend::cpu,
int nThreads = 1);

// return a new session that will contain an already loaded graph def, sessionOptions are predefined
// an error is thrown when graphDef is a nullptr or when the graph has no nodes
Expand All @@ -94,7 +105,7 @@ namespace tensorflow {
// inferred from nThreads
// an error is thrown when graphDef is a nullptr or when the graph has no nodes
// transfers ownership
Session* createSession(const GraphDef* graphDef, int nThreads = 1);
Session* createSession(const GraphDef* graphDef, Backend backend = Backend::cpu, int nThreads = 1);

// closes a session, calls its destructor, resets the pointer, and returns true on success
bool closeSession(Session*& session);
Expand Down
2 changes: 1 addition & 1 deletion PhysicsTools/TensorFlow/plugins/TfGraphDefProducer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ TfGraphDefProducer::TfGraphDefProducer(const edm::ParameterSet& iConfig)
// ------------ method called to produce the data ------------
TfGraphDefProducer::ReturnType TfGraphDefProducer::produce(const TfGraphRecord& iRecord) {
  // Load the graph definition from the configured protobuf file and wrap it
  // together with a session created with the default options
  // (CPU backend, single thread — see tensorflow::createSession defaults).
  auto* graph = tensorflow::loadGraphDef(filename_);
  return std::make_unique<TfGraphDefWrapper>(tensorflow::createSession(graph), graph);
}

void TfGraphDefProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
Expand Down
78 changes: 71 additions & 7 deletions PhysicsTools/TensorFlow/src/TensorFlow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
*/

#include "PhysicsTools/TensorFlow/interface/TensorFlow.h"

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
#include "FWCore/Utilities/interface/ResourceInformation.h"

namespace tensorflow {

Expand All @@ -25,6 +26,65 @@ namespace tensorflow {
setThreading(sessionOptions, nThreads);
}

void setBackend(SessionOptions& sessionOptions, Backend backend) {
  /*
   * The TensorFlow backend configures the available devices using options provided in the sessionOptions proto.
   * // Options from https://github.com/tensorflow/tensorflow/blob/c53dab9fbc9de4ea8b1df59041a5ffd3987328c3/tensorflow/core/protobuf/config.proto
   *
   * If device_count["GPU"] = 0, GPUs are not used.
   * The visible_device_list configuration is used to map the `visible` devices (from CUDA_VISIBLE_DEVICES) to `virtual` devices.
   * If Backend::cpu is requested, the GPU device is disallowed by the device_count configuration.
   * If Backend::cuda is requested:
   *  - if ResourceInformation shows an available Nvidia GPU device:
   *    the device is used with the memory_growth configuration (not allocating all cuda memory at once).
   *  - if no device is present: an exception is raised.
   * If Backend::best is requested, the Nvidia GPU is used when available, otherwise the CPU.
   */

  edm::Service<edm::ResourceInformation> ri;

  // enable only the first GPU in the CUDA_VISIBLE_DEVICES list, with incremental memory allocation
  const auto enableFirstCudaDevice = [&sessionOptions]() {
    (*sessionOptions.config.mutable_device_count())["GPU"] = 1;
    sessionOptions.config.mutable_gpu_options()->set_visible_device_list("0");
    // Do not allocate all the memory on the GPU at the beginning.
    sessionOptions.config.mutable_gpu_options()->set_allow_growth(true);
  };

  // disable any GPU usage
  const auto disableGpuDevices = [&sessionOptions]() {
    (*sessionOptions.config.mutable_device_count())["GPU"] = 0;
    sessionOptions.config.mutable_gpu_options()->set_visible_device_list("");
  };

  if (backend == Backend::cpu) {
    disableGpuDevices();
  }
  // NVidia GPU, required: throw if none is available in this job
  else if (backend == Backend::cuda) {
    if (not ri->nvidiaDriverVersion().empty()) {
      enableFirstCudaDevice();
    } else {
      edm::Exception ex(edm::errors::UnavailableAccelerator);
      ex << "Cuda backend requested, but no NVIDIA GPU available in the job";
      ex.addContext("Calling tensorflow::setBackend()");
      throw ex;
    }
  }
  // ROCm and Intel GPU are still not supported
  else if ((backend == Backend::rocm) || (backend == Backend::intel)) {
    edm::Exception ex(edm::errors::UnavailableAccelerator);
    ex << "ROCm/Intel GPU backend requested, but TF is not compiled yet for this platform";
    ex.addContext("Calling tensorflow::setBackend()");
    throw ex;
  }
  // Get NVidia GPU if possible or fall back to CPU
  else if (backend == Backend::best) {
    if (not ri->nvidiaDriverVersion().empty()) {
      enableFirstCudaDevice();
    } else {
      // Just CPU support
      disableGpuDevices();
    }
  }
}

MetaGraphDef* loadMetaGraphDef(const std::string& exportDir, const std::string& tag, SessionOptions& sessionOptions) {
// objects to load the graph
Status status;
Expand All @@ -49,19 +109,20 @@ namespace tensorflow {
return loadMetaGraphDef(exportDir, tag, sessionOptions);
}

// loads a meta graph definition saved at exportDir for a tag, with threading options
// inferred from nThreads and the device configuration set from the requested backend
// transfers ownership
MetaGraphDef* loadMetaGraphDef(const std::string& exportDir, const std::string& tag, Backend backend, int nThreads) {
  // create session options and set thread and backend options
  SessionOptions sessionOptions;
  setThreading(sessionOptions, nThreads);
  setBackend(sessionOptions, backend);

  return loadMetaGraphDef(exportDir, tag, sessionOptions);
}

// deprecated in favor of loadMetaGraphDef; logs a deprecation notice and forwards
// all arguments (including the backend selection) to loadMetaGraphDef
// transfers ownership
MetaGraphDef* loadMetaGraph(const std::string& exportDir, const std::string& tag, Backend backend, int nThreads) {
  edm::LogInfo("PhysicsTools/TensorFlow")
      << "tensorflow::loadMetaGraph() is deprecated, use tensorflow::loadMetaGraphDef() instead";

  return loadMetaGraphDef(exportDir, tag, backend, nThreads);
}

GraphDef* loadGraphDef(const std::string& pbFile) {
Expand Down Expand Up @@ -95,10 +156,11 @@ namespace tensorflow {
return session;
}

// returns a new, empty session configured with nThreads and the selected backend
// transfers ownership
Session* createSession(Backend backend, int nThreads) {
  // create session options and set thread and backend options
  SessionOptions sessionOptions;
  setThreading(sessionOptions, nThreads);
  setBackend(sessionOptions, backend);

  return createSession(sessionOptions);
}
Expand Down Expand Up @@ -152,10 +214,11 @@ namespace tensorflow {
return session;
}

// returns a new session containing an already loaded meta graph, whose exportDir is
// needed to load and initialize the variables; threading options are inferred from
// nThreads and the device configuration is set from the requested backend
// transfers ownership
Session* createSession(const MetaGraphDef* metaGraphDef, const std::string& exportDir, Backend backend, int nThreads) {
  // create session options and set thread and backend options
  SessionOptions sessionOptions;
  setThreading(sessionOptions, nThreads);
  setBackend(sessionOptions, backend);

  return createSession(metaGraphDef, exportDir, sessionOptions);
}
Expand Down Expand Up @@ -186,10 +249,11 @@ namespace tensorflow {
return session;
}

// returns a new session containing an already loaded graph def; threading options are
// inferred from nThreads and the device configuration is set from the requested backend
// transfers ownership
Session* createSession(const GraphDef* graphDef, Backend backend, int nThreads) {
  // create session options and set thread and backend options
  SessionOptions sessionOptions;
  setThreading(sessionOptions, nThreads);
  setBackend(sessionOptions, backend);

  return createSession(graphDef, sessionOptions);
}
Expand Down
128 changes: 128 additions & 0 deletions PhysicsTools/TensorFlow/test/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,38 +2,166 @@
<use name="tensorflow-cc"/>
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFHelloWorldCUDA" file="testRunner.cpp,testHelloWorldCUDA.cc">
<use name="tensorflow-cc"/>
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>


<bin name="testTFMetaGraphLoading" file="testRunner.cpp,testMetaGraphLoading.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFMetaGraphLoadingCUDA" file="testRunner.cpp,testMetaGraphLoadingCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFGraphLoading" file="testRunner.cpp,testGraphLoading.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFGraphLoadingCUDA" file="testRunner.cpp,testGraphLoadingCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFConstSession" file="testRunner.cpp,testConstSession.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFConstSessionCUDA" file="testRunner.cpp,testConstSessionCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFSessionCache" file="testRunner.cpp,testSessionCache.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFSessionCacheCUDA" file="testRunner.cpp,testSessionCacheCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFThreadPools" file="testRunner.cpp,testThreadPools.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>

<iftool name="cuda">
<bin name="testTFThreadPoolsCUDA" file="testRunner.cpp,testThreadPoolsCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<bin name="testTFVisibleDevices" file="testRunner.cpp,testVisibleDevices.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
</bin>


<iftool name="cuda">
<bin name="testTFVisibleDevicesCUDA" file="testRunner.cpp,testVisibleDevicesCUDA.cc">
<use name="boost_filesystem"/>
<use name="cppunit"/>
<use name="PhysicsTools/TensorFlow"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="catch2"/>
<use name="FWCore/ParameterSet"/>
<use name="FWCore/ParameterSetReader"/>
<use name="FWCore/PluginManager"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/Utilities"/>
<use name="cuda"/>
</bin>
</iftool>

<!-- <ifarchitecture name="!_ppc64le_">
<bin name="testTFAOT" file="testRunner.cpp,testAOT.cc">
<flags DNN_NAME="testAOT_add" />
Expand Down
Loading