Import CUDA interaction layer from Patatrack
makortel committed Dec 3, 2019
1 parent 91be707 commit 3854fbb
Showing 118 changed files with 7,312 additions and 137 deletions.
5 changes: 5 additions & 0 deletions CUDADataFormats/Common/BuildFile.xml
@@ -0,0 +1,5 @@
<use name="HeterogeneousCore/CUDAUtilities"/>

<export>
<lib name="1"/>
</export>
55 changes: 55 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProduct.h
@@ -0,0 +1,55 @@
#ifndef CUDADataFormats_Common_CUDAProduct_h
#define CUDADataFormats_Common_CUDAProduct_h

#include <memory>

#include "CUDADataFormats/Common/interface/CUDAProductBase.h"

namespace edm {
template <typename T>
class Wrapper;
}
namespace impl {
class CUDAScopedContextGetterBase;
}

/**
* The purpose of this class is to wrap CUDA data into edm::Event in a
* way that enforces correct use of the various utilities.
*
* Non-default construction has to be done through CUDAScopedContext
* (in order to properly register the CUDA event).
*
* The default constructor is needed only for ROOT dictionary generation.
*
* The CUDA event is in practice needed only for stream-stream
* synchronization, but an object with a long-enough lifetime has to
* own it, and this class is a fairly natural place. If the overhead
* turns out to be too large, the events can be created only where
* synchronization between streams is actually needed.
*/
template <typename T>
class CUDAProduct : public CUDAProductBase {
public:
CUDAProduct() = default; // Needed only for ROOT dictionary generation

CUDAProduct(const CUDAProduct&) = delete;
CUDAProduct& operator=(const CUDAProduct&) = delete;
CUDAProduct(CUDAProduct&&) = default;
CUDAProduct& operator=(CUDAProduct&&) = default;

private:
friend class impl::CUDAScopedContextGetterBase;
friend class CUDAScopedContextProduce;
friend class edm::Wrapper<CUDAProduct<T>>;

explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data)
: CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {}

template <typename... Args>
explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args)
: CUDAProductBase(device, std::move(stream)), data_(std::forward<Args>(args)...) {}

T data_; //!
};

#endif
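
For orientation, here is a rough sketch of how a producer is expected to hand data to the framework through this wrapper. It is not part of the commit; the CUDAScopedContextProduce type and its wrap() member are assumed from the test in CUDADataFormats/Common/test/test_CUDAProduct.cc further below, and the surrounding function is purely illustrative.

// Hypothetical producer-side sketch; only CUDAProduct<T> is from this
// commit, the rest is illustrative and may not match the real API.
#include <memory>
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"

std::unique_ptr<CUDAProduct<int>> produceOnGPU(CUDAScopedContextProduce& ctx) {
  int result = 42;  // stand-in for data computed asynchronously on the GPU
  // wrap() records the device, the CUDA stream and the CUDA event so that
  // downstream consumers can synchronize with the work queued to ctx.stream().
  return ctx.wrap(std::move(result));
}
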
90 changes: 90 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -0,0 +1,90 @@
#ifndef CUDADataFormats_Common_CUDAProductBase_h
#define CUDADataFormats_Common_CUDAProductBase_h

#include <atomic>
#include <memory>

#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"

namespace impl {
class CUDAScopedContextBase;
}

/**
* Base class for all instantiations of CUDAProduct<T> to hold the
* non-T-dependent members.
*/
class CUDAProductBase {
public:
CUDAProductBase() = default; // Needed only for ROOT dictionary generation
~CUDAProductBase();

CUDAProductBase(const CUDAProductBase&) = delete;
CUDAProductBase& operator=(const CUDAProductBase&) = delete;
CUDAProductBase(CUDAProductBase&& other)
: stream_{std::move(other.stream_)},
event_{std::move(other.event_)},
mayReuseStream_{other.mayReuseStream_.load()},
device_{other.device_} {}
CUDAProductBase& operator=(CUDAProductBase&& other) {
stream_ = std::move(other.stream_);
event_ = std::move(other.event_);
mayReuseStream_ = other.mayReuseStream_.load();
device_ = other.device_;
return *this;
}

bool isValid() const { return stream_.get() != nullptr; }
bool isAvailable() const;

int device() const { return device_; }

// cudaStream_t is a pointer to a thread-safe object, to which
// mutable access is needed even if the CUDAProductBase itself were
// const. Therefore it is ok to return a non-const pointer from a
// const method here.
cudaStream_t stream() const { return stream_.get(); }

// cudaEvent_t is a pointer to a thread-safe object, to which
// mutable access is needed even if the CUDAProductBase itself were
// const. Therefore it is ok to return a non-const pointer from a
// const method here.
cudaEvent_t event() const { return event_ ? event_.get() : nullptr; }

protected:
explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream)
: stream_{std::move(stream)}, device_{device} {}

private:
friend class impl::CUDAScopedContextBase;
friend class CUDAScopedContextProduce;

// The following functions are intended to be used only from CUDAScopedContext
void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); }
const cudautils::SharedStreamPtr& streamPtr() const { return stream_; }

bool mayReuseStream() const {
bool expected = true;
bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
// If the current thread is the one flipping the flag, it may
// reuse the stream.
return changed;
}

// The cudaStream_t is really shared among the edm::Event products,
// so a shared_ptr is used here as well
cudautils::SharedStreamPtr stream_; //!
// shared_ptr because of caching in CUDAEventCache
cudautils::SharedEventPtr event_; //!

// This flag tells whether the CUDA stream may be reused by a
// consumer or not. The goal is to have a "chain" of modules queue
// their work to the same CUDA stream.
mutable std::atomic<bool> mayReuseStream_ = true; //!

// The CUDA device associated with this product
int device_ = -1; //!
};

#endif
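
The mayReuseStream_ flag and the CUDA event together implement the synchronization policy described in the comments above: a chain of modules can keep queuing work to the same stream, while an independent consumer makes its own stream wait on the product's event. A compressed sketch of the consumer-side decision, assuming it is driven by the CUDAScopedContext classes (illustration only, not code from this commit):

// Illustrative consumer-side synchronization; uses only the public
// stream()/event() accessors of CUDAProductBase.
#include <cuda_runtime_api.h>

void synchronizeWith(const CUDAProductBase& product, cudaStream_t consumerStream) {
  if (product.stream() == consumerStream) {
    return;  // same stream: the work is already ordered
  }
  if (cudaEvent_t ev = product.event()) {
    // Make the consumer's stream wait for the producer's work without
    // blocking the CPU.
    cudaStreamWaitEvent(consumerStream, ev, 0);
  }
}
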
27 changes: 27 additions & 0 deletions CUDADataFormats/Common/src/CUDAProductBase.cc
@@ -0,0 +1,27 @@
#include "CUDADataFormats/Common/interface/CUDAProductBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h"

bool CUDAProductBase::isAvailable() const {
// In the absence of an event, the product was already available at
// the end of the producer's produce() call.
if (not event_) {
return true;
}
return cudautils::eventIsOccurred(event_.get());
}

CUDAProductBase::~CUDAProductBase() {
// Make sure that the production of the product on the GPU is
// complete before the product is destructed. This ensures that the
// EDM stream does not move on to the next event before all
// asynchronous processing of the current event has completed.
if (event_) {
// TODO: a callback notifying a WaitingTaskHolder (or similar)
// would avoid blocking the CPU, but would also require more work.
//
// Intentionally not checking the return value to avoid throwing
// exceptions. If this call were to fail, we would see failures
// elsewhere as well.
cudaEventSynchronize(event_.get());
}
}
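
cudautils::eventIsOccurred is not shown in this excerpt; a plausible implementation, assuming it is a thin wrapper around cudaEventQuery, could look like the following (a sketch only; the real code lives in HeterogeneousCore/CUDAUtilities and may differ):

// Assumed sketch of cudautils::eventIsOccurred, not part of this diff.
#include <cuda_runtime_api.h>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

namespace cudautils {
  inline bool eventIsOccurred(cudaEvent_t event) {
    const auto ret = cudaEventQuery(event);
    if (ret == cudaSuccess) {
      return true;  // all work recorded before the event has completed
    }
    if (ret == cudaErrorNotReady) {
      return false;  // the work is still in flight
    }
    cudaCheck(ret);  // any other status is a real error
    return false;
  }
}  // namespace cudautils
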
5 changes: 5 additions & 0 deletions CUDADataFormats/Common/test/BuildFile.xml
@@ -0,0 +1,5 @@
<bin file="test*.cc" name="testCUDADataFormatsCommon">
<use name="HeterogeneousCore/CUDACore"/>
<use name="catch2"/>
<use name="cuda"/>
</bin>
68 changes: 68 additions & 0 deletions CUDADataFormats/Common/test/test_CUDAProduct.cc
@@ -0,0 +1,68 @@
#include "catch.hpp"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h"

#include <cuda_runtime_api.h>

namespace cudatest {
class TestCUDAScopedContext {
public:
static CUDAScopedContextProduce make(int dev, bool createEvent) {
cudautils::SharedEventPtr event;
if (createEvent) {
event = cudautils::getCUDAEventCache().getCUDAEvent();
}
return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event));
}
};
} // namespace cudatest

TEST_CASE("Use of CUDAProduct template", "[CUDACore]") {
SECTION("Default constructed") {
auto foo = CUDAProduct<int>();
REQUIRE(!foo.isValid());

auto bar = std::move(foo);
}

if (not hasCUDADevices()) {
return;
}

constexpr int defaultDevice = 0;
cudaCheck(cudaSetDevice(defaultDevice));
{
auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true);
std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
auto& data = *dataPtr;

SECTION("Construct from CUDAScopedContext") {
REQUIRE(data.isValid());
REQUIRE(data.device() == defaultDevice);
REQUIRE(data.stream() == ctx.stream());
REQUIRE(data.event() != nullptr);
}

SECTION("Move constructor") {
auto data2 = CUDAProduct<int>(std::move(data));
REQUIRE(data2.isValid());
REQUIRE(!data.isValid());
}

SECTION("Move assignment") {
CUDAProduct<int> data2;
data2 = std::move(data);
REQUIRE(data2.isValid());
REQUIRE(!data.isValid());
}
}

cudaCheck(cudaSetDevice(defaultDevice));
cudaCheck(cudaDeviceSynchronize());
// Note: CUDA resources are cleaned up by the destructors of the global cache objects
}
2 changes: 2 additions & 0 deletions CUDADataFormats/Common/test/test_main.cc
@@ -0,0 +1,2 @@
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
28 changes: 26 additions & 2 deletions FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
@@ -24,9 +24,9 @@

#include "tbb/task_arena.h"

namespace edm {
#include "FWCore/Concurrency/interface/WaitingTask.h"

class WaitingTask;
namespace edm {
class WaitingTaskHolder;

class WaitingTaskWithArenaHolder {
@@ -72,5 +72,29 @@ namespace edm {
WaitingTask* m_task;
std::shared_ptr<tbb::task_arena> m_arena;
};

template <typename F>
auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) {
return [holder = std::move(h), func = std::forward<F>(f)]() mutable {
try {
func(holder);
} catch (...) {
holder.doneWaiting(std::current_exception());
}
};
}

template <typename ALLOC, typename F>
auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) {
return make_waiting_task(
std::forward<ALLOC>(iAlloc),
[holder = h, func = make_lambda_with_holder(h, std::forward<F>(f))](std::exception_ptr const* excptr) mutable {
if (excptr) {
holder.doneWaiting(*excptr);
return;
}
func();
});
}
} // namespace edm
#endif
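
The two helpers added above reduce boilerplate in ExternalWork-style modules: the generated waiting task first checks for an upstream exception and otherwise runs the user's functor with the holder, forwarding any exception thrown inside it to doneWaiting(). A hedged usage sketch of make_lambda_with_holder follows; the surrounding function is illustrative and not part of this commit.

// Hypothetical completion callback built with make_lambda_with_holder;
// in real use it would be registered with whatever signals the end of
// the asynchronous (e.g. GPU) work.
#include <exception>
#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"

void buildCallbackSketch(edm::WaitingTaskWithArenaHolder holder) {
  auto callback = edm::make_lambda_with_holder(
      std::move(holder), [](edm::WaitingTaskWithArenaHolder h) {
        // Runs when the asynchronous work has finished; exceptions thrown
        // here are routed to h.doneWaiting() by the wrapper.
        h.doneWaiting(std::exception_ptr{});
      });
  callback();  // invoked directly here just to demonstrate the flow
}
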
12 changes: 12 additions & 0 deletions HeterogeneousCore/CUDACore/BuildFile.xml
@@ -0,0 +1,12 @@
<use name="FWCore/Concurrency"/>
<use name="FWCore/Framework"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/ParameterSet"/>
<use name="CUDADataFormats/Common"/>
<use name="DataFormats/Provenance"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda"/>

<export>
<lib name="1"/>
</export>