Import CUDA interaction layer from Patatrack
makortel committed Dec 3, 2019
1 parent 91be707 commit 3854fbb
Showing 118 changed files with 7,312 additions and 137 deletions.
5 changes: 5 additions & 0 deletions CUDADataFormats/Common/BuildFile.xml
@@ -0,0 +1,5 @@
<use name="HeterogeneousCore/CUDAUtilities"/>

<export>
<lib name="1"/>
</export>
55 changes: 55 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProduct.h
@@ -0,0 +1,55 @@
#ifndef CUDADataFormats_Common_CUDAProduct_h
#define CUDADataFormats_Common_CUDAProduct_h

#include <memory>

#include "CUDADataFormats/Common/interface/CUDAProductBase.h"

namespace edm {
template <typename T>
class Wrapper;
}
namespace impl {
class CUDAScopedContextGetterBase;
}

/**
* The purpose of this class is to wrap CUDA data into edm::Event in a
* way that enforces correct use of the various utilities.
*
* Non-default construction has to be done through CUDAScopedContext
* (in order to properly register the CUDA event).
*
* The default constructor is needed only for ROOT dictionary generation.
*
* The CUDA event is in practice needed only for stream-stream
* synchronization, but an object with a long-enough lifetime has to
* own it, and this class is a fairly natural place. If the overhead
* turns out to be too large, the events can be created only where
* synchronization between streams is actually needed.
*/
template <typename T>
class CUDAProduct : public CUDAProductBase {
public:
CUDAProduct() = default; // Needed only for ROOT dictionary generation

CUDAProduct(const CUDAProduct&) = delete;
CUDAProduct& operator=(const CUDAProduct&) = delete;
CUDAProduct(CUDAProduct&&) = default;
CUDAProduct& operator=(CUDAProduct&&) = default;

private:
friend class impl::CUDAScopedContextGetterBase;
friend class CUDAScopedContextProduce;
friend class edm::Wrapper<CUDAProduct<T>>;

explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data)
: CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {}

template <typename... Args>
explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args)
: CUDAProductBase(device, std::move(stream)), data_(std::forward<Args>(args)...) {}

T data_; //!
};

#endif
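
For orientation, here is a rough sketch of how a producer is expected to hand data to the framework through this wrapper. It is not part of the commit; the CUDAScopedContextProduce type and its wrap() member are assumed from the test in CUDADataFormats/Common/test/test_CUDAProduct.cc further below, and the surrounding function is purely illustrative.

// Hypothetical producer-side sketch; only CUDAProduct<T> is from this
// commit, the rest is illustrative and may not match the real API.
#include <memory>
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"

std::unique_ptr<CUDAProduct<int>> produceOnGPU(CUDAScopedContextProduce& ctx) {
  int result = 42;  // stand-in for data computed asynchronously on the GPU
  // wrap() records the device, the CUDA stream and the CUDA event so that
  // downstream consumers can synchronize with the work queued to ctx.stream().
  return ctx.wrap(std::move(result));
}
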
90 changes: 90 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProductBase.h
@@ -0,0 +1,90 @@
#ifndef CUDADataFormats_Common_CUDAProductBase_h
#define CUDADataFormats_Common_CUDAProductBase_h

#include <atomic>
#include <memory>

#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"

namespace impl {
class CUDAScopedContextBase;
}

/**
* Base class for all instantiations of CUDAProduct<T> to hold the
* non-T-dependent members.
*/
class CUDAProductBase {
public:
CUDAProductBase() = default; // Needed only for ROOT dictionary generation
~CUDAProductBase();

CUDAProductBase(const CUDAProductBase&) = delete;
CUDAProductBase& operator=(const CUDAProductBase&) = delete;
CUDAProductBase(CUDAProductBase&& other)
: stream_{std::move(other.stream_)},
event_{std::move(other.event_)},
mayReuseStream_{other.mayReuseStream_.load()},
device_{other.device_} {}
CUDAProductBase& operator=(CUDAProductBase&& other) {
stream_ = std::move(other.stream_);
event_ = std::move(other.event_);
mayReuseStream_ = other.mayReuseStream_.load();
device_ = other.device_;
return *this;
}

bool isValid() const { return stream_.get() != nullptr; }
bool isAvailable() const;

int device() const { return device_; }

// cudaStream_t is a pointer to a thread-safe object, to which
// mutable access is needed even if the CUDAProductBase itself were
// const. Therefore it is ok to return a non-const pointer from a
// const method here.
cudaStream_t stream() const { return stream_.get(); }

// cudaEvent_t is a pointer to a thread-safe object, to which
// mutable access is needed even if the CUDAProductBase itself were
// const. Therefore it is ok to return a non-const pointer from a
// const method here.
cudaEvent_t event() const { return event_ ? event_.get() : nullptr; }

protected:
explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream)
: stream_{std::move(stream)}, device_{device} {}

private:
friend class impl::CUDAScopedContextBase;
friend class CUDAScopedContextProduce;

// The following functions are intended to be used only from CUDAScopedContext
void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); }
const cudautils::SharedStreamPtr& streamPtr() const { return stream_; }

bool mayReuseStream() const {
bool expected = true;
bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
// If the current thread is the one flipping the flag, it may
// reuse the stream.
return changed;
}

// The cudaStream_t is really shared among the edm::Event products,
// so a shared_ptr is used here as well
cudautils::SharedStreamPtr stream_; //!
// shared_ptr because of caching in CUDAEventCache
cudautils::SharedEventPtr event_; //!

// This flag tells whether the CUDA stream may be reused by a
// consumer or not. The goal is to have a "chain" of modules queue
// their work to the same CUDA stream.
mutable std::atomic<bool> mayReuseStream_ = true; //!

// The CUDA device associated with this product
int device_ = -1; //!
};

#endif
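
The mayReuseStream_ flag and the CUDA event together implement the synchronization policy described in the comments above: a chain of modules can keep queuing work to the same stream, while an independent consumer makes its own stream wait on the product's event. A compressed sketch of the consumer-side decision, assuming it is driven by the CUDAScopedContext classes (illustration only, not code from this commit):

// Illustrative consumer-side synchronization; uses only the public
// stream()/event() accessors of CUDAProductBase.
#include <cuda_runtime_api.h>

void synchronizeWith(const CUDAProductBase& product, cudaStream_t consumerStream) {
  if (product.stream() == consumerStream) {
    return;  // same stream: the work is already ordered
  }
  if (cudaEvent_t ev = product.event()) {
    // Make the consumer's stream wait for the producer's work without
    // blocking the CPU.
    cudaStreamWaitEvent(consumerStream, ev, 0);
  }
}
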
27 changes: 27 additions & 0 deletions CUDADataFormats/Common/src/CUDAProductBase.cc
@@ -0,0 +1,27 @@
#include "CUDADataFormats/Common/interface/CUDAProductBase.h"
#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h"

bool CUDAProductBase::isAvailable() const {
// In the absence of an event, the product was already available at
// the end of the producer's produce() call.
if (not event_) {
return true;
}
return cudautils::eventIsOccurred(event_.get());
}

CUDAProductBase::~CUDAProductBase() {
// Make sure that the production of the product on the GPU is
// complete before the product is destructed. This ensures that the
// EDM stream does not move on to the next event before all
// asynchronous processing of the current event has completed.
if (event_) {
// TODO: a callback notifying a WaitingTaskHolder (or similar)
// would avoid blocking the CPU, but would also require more work.
//
// Intentionally not checking the return value to avoid throwing
// exceptions. If this call were to fail, we would see failures
// elsewhere as well.
cudaEventSynchronize(event_.get());
}
}
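
cudautils::eventIsOccurred is not shown in this excerpt; a plausible implementation, assuming it is a thin wrapper around cudaEventQuery, could look like the following (a sketch only; the real code lives in HeterogeneousCore/CUDAUtilities and may differ):

// Assumed sketch of cudautils::eventIsOccurred, not part of this diff.
#include <cuda_runtime_api.h>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

namespace cudautils {
  inline bool eventIsOccurred(cudaEvent_t event) {
    const auto ret = cudaEventQuery(event);
    if (ret == cudaSuccess) {
      return true;  // all work recorded before the event has completed
    }
    if (ret == cudaErrorNotReady) {
      return false;  // the work is still in flight
    }
    cudaCheck(ret);  // any other status is a real error
    return false;
  }
}  // namespace cudautils
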
5 changes: 5 additions & 0 deletions CUDADataFormats/Common/test/BuildFile.xml
@@ -0,0 +1,5 @@
<bin file="test*.cc" name="testCUDADataFormatsCommon">
<use name="HeterogeneousCore/CUDACore"/>
<use name="catch2"/>
<use name="cuda"/>
</bin>
68 changes: 68 additions & 0 deletions CUDADataFormats/Common/test/test_CUDAProduct.cc
@@ -0,0 +1,68 @@
#include "catch.hpp"

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h"

#include <cuda_runtime_api.h>

namespace cudatest {
class TestCUDAScopedContext {
public:
static CUDAScopedContextProduce make(int dev, bool createEvent) {
cudautils::SharedEventPtr event;
if (createEvent) {
event = cudautils::getCUDAEventCache().getCUDAEvent();
}
return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event));
}
};
} // namespace cudatest

TEST_CASE("Use of CUDAProduct template", "[CUDACore]") {
SECTION("Default constructed") {
auto foo = CUDAProduct<int>();
REQUIRE(!foo.isValid());

auto bar = std::move(foo);
}

if (not hasCUDADevices()) {
return;
}

constexpr int defaultDevice = 0;
cudaCheck(cudaSetDevice(defaultDevice));
{
auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true);
std::unique_ptr<CUDAProduct<int>> dataPtr = ctx.wrap(10);
auto& data = *dataPtr;

SECTION("Construct from CUDAScopedContext") {
REQUIRE(data.isValid());
REQUIRE(data.device() == defaultDevice);
REQUIRE(data.stream() == ctx.stream());
REQUIRE(data.event() != nullptr);
}

SECTION("Move constructor") {
auto data2 = CUDAProduct<int>(std::move(data));
REQUIRE(data2.isValid());
REQUIRE(!data.isValid());
}

SECTION("Move assignment") {
CUDAProduct<int> data2;
data2 = std::move(data);
REQUIRE(data2.isValid());
REQUIRE(!data.isValid());
}
}

cudaCheck(cudaSetDevice(defaultDevice));
cudaCheck(cudaDeviceSynchronize());
// Note: CUDA resources are cleaned up by the destructors of the global cache objects
}
2 changes: 2 additions & 0 deletions CUDADataFormats/Common/test/test_main.cc
@@ -0,0 +1,2 @@
#define CATCH_CONFIG_MAIN
#include "catch.hpp"
28 changes: 26 additions & 2 deletions FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h
@@ -24,9 +24,9 @@

#include "tbb/task_arena.h"

namespace edm {
#include "FWCore/Concurrency/interface/WaitingTask.h"

class WaitingTask;
namespace edm {
class WaitingTaskHolder;

class WaitingTaskWithArenaHolder {
@@ -72,5 +72,29 @@ namespace edm {
WaitingTask* m_task;
std::shared_ptr<tbb::task_arena> m_arena;
};

template <typename F>
auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) {
return [holder = std::move(h), func = std::forward<F>(f)]() mutable {
try {
func(holder);
} catch (...) {
holder.doneWaiting(std::current_exception());
}
};
}

template <typename ALLOC, typename F>
auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) {
return make_waiting_task(
std::forward<ALLOC>(iAlloc),
[holder = h, func = make_lambda_with_holder(h, std::forward<F>(f))](std::exception_ptr const* excptr) mutable {
if (excptr) {
holder.doneWaiting(*excptr);
return;
}
func();
});
}
} // namespace edm
#endif
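
The two helpers added above reduce boilerplate in ExternalWork-style modules: the generated waiting task first checks for an upstream exception and otherwise runs the user's functor with the holder, forwarding any exception thrown inside it to doneWaiting(). A hedged usage sketch of make_lambda_with_holder follows; the surrounding function is illustrative and not part of this commit.

// Hypothetical completion callback built with make_lambda_with_holder;
// in real use it would be registered with whatever signals the end of
// the asynchronous (e.g. GPU) work.
#include <exception>
#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"

void buildCallbackSketch(edm::WaitingTaskWithArenaHolder holder) {
  auto callback = edm::make_lambda_with_holder(
      std::move(holder), [](edm::WaitingTaskWithArenaHolder h) {
        // Runs when the asynchronous work has finished; exceptions thrown
        // here are routed to h.doneWaiting() by the wrapper.
        h.doneWaiting(std::exception_ptr{});
      });
  callback();  // invoked directly here just to demonstrate the flow
}
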
12 changes: 12 additions & 0 deletions HeterogeneousCore/CUDACore/BuildFile.xml
@@ -0,0 +1,12 @@
<use name="FWCore/Concurrency"/>
<use name="FWCore/Framework"/>
<use name="FWCore/ServiceRegistry"/>
<use name="FWCore/ParameterSet"/>
<use name="CUDADataFormats/Common"/>
<use name="DataFormats/Provenance"/>
<use name="HeterogeneousCore/CUDAServices"/>
<use name="cuda"/>

<export>
<lib name="1"/>
</export>