Skip to content

Commit

Permalink
Synchronise with CMSSW_11_0_0_pre13
Browse files Browse the repository at this point in the history
  • Loading branch information
fwyzard committed Nov 30, 2019
2 parents 91be707 + a2ead18 commit 62a224f
Show file tree
Hide file tree
Showing 453 changed files with 41,735 additions and 687 deletions.
8 changes: 8 additions & 0 deletions CUDADataFormats/BeamSpot/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<use name="rootcore"/>
<use name="CUDADataFormats/Common"/>
<use name="DataFormats/Common"/>
<use name="HeterogeneousCore/CUDAUtilities"/>

<export>
<lib name="1"/>
</export>
32 changes: 32 additions & 0 deletions CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h

#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

#include <cuda_runtime.h>

// Device-side beamspot: owns a single `Data` record in CUDA device
// memory, filled by copying a host-side record (see the non-default
// constructor, defined in src/BeamSpotCUDA.cc).
class BeamSpotCUDA {
public:
  // alignas(128) doesn't really make sense as there is only one
  // beamspot per event?
  //
  // Plain-float aggregate describing the beamspot; layout is shared
  // between host and device, so keep it trivially copyable.
  struct Data {
    float x, y, z;  // position
    // TODO: add covariance matrix

    float sigmaZ;
    float beamWidthX, beamWidthY;
    float dxdz, dydz;  // beam slopes
    float emittanceX, emittanceY;
    float betaStar;
  };

  // Needed only for ROOT dictionary generation / default-constructed products.
  BeamSpotCUDA() = default;
  // Allocates device memory and copies *data_h to it asynchronously on
  // `stream`; the host buffer must stay valid until the copy completes.
  BeamSpotCUDA(Data const* data_h, cudaStream_t stream);

  // Pointer to the record in device memory (nullptr if default-constructed).
  Data const* data() const { return data_d_.get(); }

private:
  cudautils::device::unique_ptr<Data> data_d_;  // owning device allocation
};

#endif
9 changes: 9 additions & 0 deletions CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"

BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) {
data_d_ = cudautils::make_device_unique<Data>(stream);
cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream));
}
8 changes: 8 additions & 0 deletions CUDADataFormats/BeamSpot/src/classes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef CUDADataFormats_BeamSpot_classes_h
#define CUDADataFormats_BeamSpot_classes_h

#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "DataFormats/Common/interface/Wrapper.h"

#endif
4 changes: 4 additions & 0 deletions CUDADataFormats/BeamSpot/src/classes_def.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<lcgdict>
<class name="CUDAProduct<BeamSpotCUDA>" persistent="false"/>
<class name="edm::Wrapper<CUDAProduct<BeamSpotCUDA>>" persistent="false"/>
</lcgdict>
5 changes: 5 additions & 0 deletions CUDADataFormats/Common/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<use name="HeterogeneousCore/CUDAUtilities"/>

<export>
<lib name="1"/>
</export>
12 changes: 12 additions & 0 deletions CUDADataFormats/Common/interface/ArrayShadow.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef CUDADataFormatsCommonArrayShadow_H
#define CUDADataFormatsCommonArrayShadow_H
#include <array>

// "Shadow" of a std::array-like type A: exposes the same element type
// and element count, but stores the elements in a plain C array.
// Presumably used where the aggregate must be a raw array (e.g. for raw
// memory transfers to the device) — TODO confirm against callers.
template <typename A>
struct ArrayShadow {
  // Element type mirrored from A.
  using T = typename A::value_type;
  // The elements themselves; count matches A exactly.
  T data[std::tuple_size<A>::value];
  // Element count, usable in constant expressions.
  static constexpr auto size() { return std::tuple_size<A>::value; }
};

#endif
55 changes: 55 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProduct.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
#ifndef CUDADataFormats_Common_CUDAProduct_h
#define CUDADataFormats_Common_CUDAProduct_h

#include <memory>

#include "CUDADataFormats/Common/interface/CUDAProductBase.h"

namespace edm {
template <typename T>
class Wrapper;
}
namespace impl {
class CUDAScopedContextGetterBase;
}

/**
* The purpose of this class is to wrap CUDA data to edm::Event in a
* way which forces correct use of various utilities.
*
* The non-default construction has to be done with CUDAScopedContext
* (in order to properly register the CUDA event).
*
* The default constructor is needed only for the ROOT dictionary generation.
*
* The CUDA event is in practice needed only for stream-stream
* synchronization, but someone with long-enough lifetime has to own
* it. Here is a somewhat natural place. If overhead is too much, we
* can use them only where synchronization between streams is needed.
*/
template <typename T>
class CUDAProduct : public CUDAProductBase {
public:
  CUDAProduct() = default;  // Needed only for ROOT dictionary generation

  // Copying is forbidden: the wrapped CUDA data has single-owner
  // semantics within the event product.
  CUDAProduct(const CUDAProduct&) = delete;
  CUDAProduct& operator=(const CUDAProduct&) = delete;
  CUDAProduct(CUDAProduct&&) = default;
  CUDAProduct& operator=(CUDAProduct&&) = default;

private:
  // Only the scoped-context helpers (which register the CUDA event
  // correctly) and the ROOT wrapper may construct a non-empty product.
  friend class impl::CUDAScopedContextGetterBase;
  friend class CUDAScopedContextProduce;
  friend class edm::Wrapper<CUDAProduct<T>>;

  // Wraps an already-constructed payload, taking ownership by move.
  explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data)
      : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {}

  // Constructs the payload in place from the forwarded arguments.
  template <typename... Args>
  explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args)
      : CUDAProductBase(device, std::move(stream)), data_(std::forward<Args>(args)...) {}

  T data_;  //!  (transient: not persisted by ROOT)
};

#endif
90 changes: 90 additions & 0 deletions CUDADataFormats/Common/interface/CUDAProductBase.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#ifndef CUDADataFormats_Common_CUDAProductBase_h
#define CUDADataFormats_Common_CUDAProductBase_h

#include <atomic>
#include <memory>

#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"

namespace impl {
class CUDAScopedContextBase;
}

/**
* Base class for all instantiations of CUDA<T> to hold the
* non-T-dependent members.
*/
class CUDAProductBase {
public:
  CUDAProductBase() = default;  // Needed only for ROOT dictionary generation
  ~CUDAProductBase();

  // Copying is forbidden; products are moved between owners.
  CUDAProductBase(const CUDAProductBase&) = delete;
  CUDAProductBase& operator=(const CUDAProductBase&) = delete;
  // Moves are noexcept: every member operation below (shared_ptr moves,
  // atomic load/store, int copy) is non-throwing, and declaring this lets
  // containers and std::move_if_noexcept-based code move instead of copy.
  CUDAProductBase(CUDAProductBase&& other) noexcept
      : stream_{std::move(other.stream_)},
        event_{std::move(other.event_)},
        mayReuseStream_{other.mayReuseStream_.load()},
        device_{other.device_} {}
  CUDAProductBase& operator=(CUDAProductBase&& other) noexcept {
    stream_ = std::move(other.stream_);
    event_ = std::move(other.event_);
    mayReuseStream_ = other.mayReuseStream_.load();
    device_ = other.device_;
    return *this;
  }

  // A product is valid once it has been associated with a CUDA stream.
  bool isValid() const { return stream_.get() != nullptr; }
  bool isAvailable() const;

  int device() const { return device_; }

  // cudaStream_t is a pointer to a thread-safe object, for which a
  // mutable access is needed even if the CUDAScopedContext itself
  // would be const. Therefore it is ok to return a non-const
  // pointer from a const method here.
  cudaStream_t stream() const { return stream_.get(); }

  // cudaEvent_t is a pointer to a thread-safe object, for which a
  // mutable access is needed even if the CUDAScopedContext itself
  // would be const. Therefore it is ok to return a non-const
  // pointer from a const method here.
  cudaEvent_t event() const { return event_ ? event_.get() : nullptr; }

protected:
  explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream)
      : stream_{std::move(stream)}, device_{device} {}

private:
  friend class impl::CUDAScopedContextBase;
  friend class CUDAScopedContextProduce;

  // The following functions are intended to be used only from CUDAScopedContext
  void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); }
  const cudautils::SharedStreamPtr& streamPtr() const { return stream_; }

  // Atomically hands the stream to exactly one caller: only the thread
  // that flips the flag from true to false may reuse the stream.
  bool mayReuseStream() const {
    bool expected = true;
    bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
    // If the current thread is the one flipping the flag, it may
    // reuse the stream.
    return changed;
  }

  // The cudaStream_t is really shared among edm::Event products, so
  // using shared_ptr also here
  cudautils::SharedStreamPtr stream_;  //!
  // shared_ptr because of caching in CUDAEventCache
  cudautils::SharedEventPtr event_;  //!

  // This flag tells whether the CUDA stream may be reused by a
  // consumer or not. The goal is to have a "chain" of modules to
  // queue their work to the same stream.
  mutable std::atomic<bool> mayReuseStream_ = true;  //!

  // The CUDA device associated with this product
  int device_ = -1;  //!
};

#endif
Loading

0 comments on commit 62a224f

Please sign in to comment.