diff --git a/CUDADataFormats/BeamSpot/BuildFile.xml b/CUDADataFormats/BeamSpot/BuildFile.xml new file mode 100644 index 0000000000000..75f3d15738429 --- /dev/null +++ b/CUDADataFormats/BeamSpot/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h new file mode 100644 index 0000000000000..36b152b64dfc1 --- /dev/null +++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h @@ -0,0 +1,32 @@ +#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h +#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +#include + +class BeamSpotCUDA { +public: + // alignas(128) doesn't really make sense as there is only one + // beamspot per event? + struct Data { + float x, y, z; // position + // TODO: add covariance matrix + + float sigmaZ; + float beamWidthX, beamWidthY; + float dxdz, dydz; + float emittanceX, emittanceY; + float betaStar; + }; + + BeamSpotCUDA() = default; + BeamSpotCUDA(Data const* data_h, cudaStream_t stream); + + Data const* data() const { return data_d_.get(); } + +private: + cudautils::device::unique_ptr data_d_; +}; + +#endif diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc new file mode 100644 index 0000000000000..a297ae11dc327 --- /dev/null +++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc @@ -0,0 +1,9 @@ +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" + +BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cudaStream_t stream) { + data_d_ = cudautils::make_device_unique(stream); + cudaCheck(cudaMemcpyAsync(data_d_.get(), data_h, sizeof(Data), cudaMemcpyHostToDevice, stream)); +} diff --git a/CUDADataFormats/BeamSpot/src/classes.h b/CUDADataFormats/BeamSpot/src/classes.h new file mode 100644 index 0000000000000..62f990c0ba3b3 --- /dev/null +++ b/CUDADataFormats/BeamSpot/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_BeamSpot_classes_h +#define CUDADataFormats_BeamSpot_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/BeamSpot/src/classes_def.xml b/CUDADataFormats/BeamSpot/src/classes_def.xml new file mode 100644 index 0000000000000..29a0eafa04005 --- /dev/null +++ b/CUDADataFormats/BeamSpot/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml new file mode 100644 index 0000000000000..98033aab4d99d --- /dev/null +++ b/CUDADataFormats/Common/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/interface/ArrayShadow.h b/CUDADataFormats/Common/interface/ArrayShadow.h new file mode 100644 index 0000000000000..2e1b85cbfd04d --- /dev/null +++ b/CUDADataFormats/Common/interface/ArrayShadow.h @@ -0,0 +1,12 @@ +#ifndef CUDADataFormatsCommonArrayShadow_H +#define CUDADataFormatsCommonArrayShadow_H +#include + +template +struct ArrayShadow { + using T = typename A::value_type; + constexpr static auto size() { return std::tuple_size::value; } + T data[std::tuple_size::value]; +}; + +#endif diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h new file mode 100644 index 
0000000000000..75c9c80e7f206 --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -0,0 +1,55 @@ +#ifndef CUDADataFormats_Common_CUDAProduct_h +#define CUDADataFormats_Common_CUDAProduct_h + +#include + +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" + +namespace edm { + template + class Wrapper; +} +namespace impl { + class CUDAScopedContextGetterBase; +} + +/** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with CUDAScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can use them only where synchronization between streams is needed. + */ +template +class CUDAProduct : public CUDAProductBase { +public: + CUDAProduct() = default; // Needed only for ROOT dictionary generation + + CUDAProduct(const CUDAProduct&) = delete; + CUDAProduct& operator=(const CUDAProduct&) = delete; + CUDAProduct(CUDAProduct&&) = default; + CUDAProduct& operator=(CUDAProduct&&) = default; + +private: + friend class impl::CUDAScopedContextGetterBase; + friend class CUDAScopedContextProduce; + friend class edm::Wrapper>; + + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data) + : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {} + + template + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args) + : CUDAProductBase(device, std::move(stream)), data_(std::forward(args)...) {} + + T data_; //! +}; + +#endif diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h new file mode 100644 index 0000000000000..219b7e619de7f --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -0,0 +1,90 @@ +#ifndef CUDADataFormats_Common_CUDAProductBase_h +#define CUDADataFormats_Common_CUDAProductBase_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" + +namespace impl { + class CUDAScopedContextBase; +} + +/** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. + */ +class CUDAProductBase { +public: + CUDAProductBase() = default; // Needed only for ROOT dictionary generation + ~CUDAProductBase(); + + CUDAProductBase(const CUDAProductBase&) = delete; + CUDAProductBase& operator=(const CUDAProductBase&) = delete; + CUDAProductBase(CUDAProductBase&& other) + : stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} {} + CUDAProductBase& operator=(CUDAProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + + bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; + + int device() const { return device_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. 
Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + + // cudaEvent_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaEvent_t event() const { return event_ ? event_.get() : nullptr; } + +protected: + explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream) + : stream_{std::move(stream)}, device_{device} {} + +private: + friend class impl::CUDAScopedContextBase; + friend class CUDAScopedContextProduce; + + // The following functions are intended to be used only from CUDAScopedContext + void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + // The cudaStream_t is really shared among edm::Event products, so + // using shared_ptr also here + cudautils::SharedStreamPtr stream_; //! + // shared_ptr because of caching in CUDAEventCache + cudautils::SharedEventPtr event_; //! + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + + // The CUDA device associated with this product + int device_ = -1; //! +}; + +#endif diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h new file mode 100644 index 0000000000000..907b7647a3452 --- /dev/null +++ b/CUDADataFormats/Common/interface/HeterogeneousSoA.h @@ -0,0 +1,187 @@ +#ifndef CUDADataFormatsCommonHeterogeneousSoA_H +#define CUDADataFormatsCommonHeterogeneousSoA_H + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +// a heterogeneous unique pointer... +template +class HeterogeneousSoA { +public: + using Product = T; + + HeterogeneousSoA() = default; // make root happy + ~HeterogeneousSoA() = default; + HeterogeneousSoA(HeterogeneousSoA &&) = default; + HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default; + + explicit HeterogeneousSoA(cudautils::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} + explicit HeterogeneousSoA(cudautils::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} + explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {} + + auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + + auto const &operator*() const { return *get(); } + + auto const *operator-> () const { return get(); } + + auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } + + auto &operator*() { return *get(); } + + auto *operator-> () { return get(); } + + // in reality valid only for GPU version... 
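An illustrative sketch, not part of this patch: how a producer would hand GPU data to the event through the CUDAProduct wrapper described above. The producer function, its name, and constructing CUDAScopedContextProduce directly from edm::StreamID are assumptions; ctx.wrap() mirrors its use in test_CUDAProduct.cc further down in this patch.

#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
#include "CUDADataFormats/Common/interface/CUDAProduct.h"
#include "FWCore/Framework/interface/Event.h"
#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"

// hypothetical produce() body of a stream producer
void produceSketch(edm::Event& iEvent) {
  CUDAScopedContextProduce ctx{iEvent.streamID()};  // picks a device, a CUDA stream and a CUDA event

  BeamSpotCUDA::Data bs_h;                 // filled from the beam spot record (omitted here)
  BeamSpotCUDA bs_d(&bs_h, ctx.stream());  // asynchronous H2D copy on the context stream

  // wrap() records the CUDA event so that consumers on other streams can synchronize on it
  iEvent.put(ctx.wrap(std::move(bs_d)));
}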
+ cudautils::host::unique_ptr toHostAsync(cudaStream_t stream) const { + assert(dm_ptr); + auto ret = cudautils::make_host_unique(stream); + cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream)); + return ret; + } + +private: + // a union wan't do it, a variant will not be more efficienct + cudautils::device::unique_ptr dm_ptr; //! + cudautils::host::unique_ptr hm_ptr; //! + std::unique_ptr std_ptr; //! +}; + +namespace cudaCompat { + + struct GPUTraits { + template + using unique_ptr = cudautils::device::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cudautils::make_device_unique(stream); + } + + template + static auto make_unique(size_t size, cudaStream_t stream) { + return cudautils::make_device_unique(size, stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cudautils::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cudautils::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cudautils::make_device_unique(size, stream); + } + }; + + struct HostTraits { + template + using unique_ptr = cudautils::host::unique_ptr; + + template + static auto make_unique(cudaStream_t stream) { + return cudautils::make_host_unique(stream); + } + + template + static auto make_host_unique(cudaStream_t stream) { + return cudautils::make_host_unique(stream); + } + + template + static auto make_device_unique(cudaStream_t stream) { + return cudautils::make_device_unique(stream); + } + + template + static auto make_device_unique(size_t size, cudaStream_t stream) { + return cudautils::make_device_unique(size, stream); + } + }; + + struct CPUTraits { + template + using unique_ptr = std::unique_ptr; + + template + static auto make_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + + template + static auto make_host_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(cudaStream_t) { + return std::make_unique(); + } + + template + static auto make_device_unique(size_t size, cudaStream_t) { + return std::make_unique(size); + } + }; + +} // namespace cudaCompat + +// a heterogeneous unique pointer (of a different sort) ... +template +class HeterogeneousSoAImpl { +public: + template + using unique_ptr = typename Traits::template unique_ptr; + + HeterogeneousSoAImpl() = default; // make root happy + ~HeterogeneousSoAImpl() = default; + HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default; + HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default; + + explicit HeterogeneousSoAImpl(unique_ptr &&p) : m_ptr(std::move(p)) {} + explicit HeterogeneousSoAImpl(cudaStream_t stream); + + T const *get() const { return m_ptr.get(); } + + T *get() { return m_ptr.get(); } + + cudautils::host::unique_ptr toHostAsync(cudaStream_t stream) const; + +private: + unique_ptr m_ptr; //! +}; + +template +HeterogeneousSoAImpl::HeterogeneousSoAImpl(cudaStream_t stream) { + m_ptr = Traits::template make_unique(stream); +} + +// in reality valid only for GPU version... 
+template +cudautils::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(stream); + cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream)); + return ret; +} + +template +using HeterogeneousSoAGPU = HeterogeneousSoAImpl; +template +using HeterogeneousSoACPU = HeterogeneousSoAImpl; +template +using HeterogeneousSoAHost = HeterogeneousSoAImpl; + +#endif diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h new file mode 100644 index 0000000000000..17ad98ba403a4 --- /dev/null +++ b/CUDADataFormats/Common/interface/HostProduct.h @@ -0,0 +1,29 @@ +#ifndef CUDADataFormatsCommonHostProduct_H +#define CUDADataFormatsCommonHostProduct_H + +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +// a heterogeneous unique pointer... +template +class HostProduct { +public: + HostProduct() = default; // make root happy + ~HostProduct() = default; + HostProduct(HostProduct&&) = default; + HostProduct& operator=(HostProduct&&) = default; + + explicit HostProduct(cudautils::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} + explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {} + + auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); } + + auto const& operator*() const { return *get(); } + + auto const* operator-> () const { return get(); } + +private: + cudautils::host::unique_ptr hm_ptr; //! + std::unique_ptr std_ptr; //! +}; + +#endif diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc new file mode 100644 index 0000000000000..72302d3165676 --- /dev/null +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -0,0 +1,27 @@ +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" + +bool CUDAProductBase::isAvailable() const { + // In absence of event, the product was available already at the end + // of produce() of the producer. + if (not event_) { + return true; + } + return cudautils::eventIsOccurred(event_.get()); +} + +CUDAProductBase::~CUDAProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + if (event_) { + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. 
+ cudaEventSynchronize(event_.get()); + } +} diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml new file mode 100644 index 0000000000000..5e804fe80a736 --- /dev/null +++ b/CUDADataFormats/Common/test/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc new file mode 100644 index 0000000000000..e674ca2bf694f --- /dev/null +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -0,0 +1,66 @@ +#include "catch.hpp" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" + +#include + +namespace cudatest { + class TestCUDAScopedContext { + public: + static CUDAScopedContextProduce make(int dev, bool createEvent) { + cudautils::SharedEventPtr event; + if (createEvent) { + event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + } + }; +} // namespace cudatest + +TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { + SECTION("Default constructed") { + auto foo = CUDAProduct(); + REQUIRE(!foo.isValid()); + + auto bar = std::move(foo); + } + + exitSansCUDADevices(); + + constexpr int defaultDevice = 0; + cudaCheck(cudaSetDevice(defaultDevice)); + { + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + std::unique_ptr> dataPtr = ctx.wrap(10); + auto& data = *dataPtr; + + SECTION("Construct from CUDAScopedContext") { + REQUIRE(data.isValid()); + REQUIRE(data.device() == defaultDevice); + REQUIRE(data.stream() == ctx.stream()); + REQUIRE(data.event() != nullptr); + } + + SECTION("Move constructor") { + auto data2 = CUDAProduct(std::move(data)); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + + SECTION("Move assignment") { + CUDAProduct data2; + data2 = std::move(data); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + } + + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + // Note: CUDA resources are cleaned up by the destructors of the global cache objects +} diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/CUDADataFormats/Common/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/CUDADataFormats/EcalRecHitSoA/BuildFile.xml b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml new file mode 100644 index 0000000000000..794d2bf7abead --- /dev/null +++ b/CUDADataFormats/EcalRecHitSoA/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h new file mode 100644 index 0000000000000..e11c13ebdf4c2 --- /dev/null +++ b/CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h @@ -0,0 +1,73 @@ +#ifndef CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_soa_h +#define CUDADataFormats_EcalRecHitSoA_interface_EcalUncalibratedRecHit_soa_h + +#include +#include + +#include 
"DataFormats/EcalDigi/interface/EcalDataFrame.h" + +#include "CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" + +namespace ecal { + + namespace Tag { + + struct soa {}; + struct ptr {}; + + } // namespace Tag + + template + struct type_wrapper { + //#ifndef ECAL_MULTIFIT_DONOT_USE_PINNED_MEM + // using type = std::vector>; + //#else + using type = std::vector; + //#endif + }; + + template + struct type_wrapper { + using type = T*; + }; + + template + struct UncalibratedRecHit { + UncalibratedRecHit() = default; + UncalibratedRecHit(const UncalibratedRecHit&) = default; + UncalibratedRecHit& operator=(const UncalibratedRecHit&) = default; + + UncalibratedRecHit(UncalibratedRecHit&&) = default; + UncalibratedRecHit& operator=(UncalibratedRecHit&&) = default; + + // TODO: std::array causes root's dictionary problems + typename type_wrapper::type amplitudesAll; + // typename type_wrapper, L>::type amplitudesAll; + typename type_wrapper::type amplitude; + typename type_wrapper::type chi2; + typename type_wrapper::type pedestal; + typename type_wrapper::type jitter; + typename type_wrapper::type jitterError; + typename type_wrapper::type did; + typename type_wrapper::type flags; + + template + typename std::enable_if::value, void>::type resize(size_t size) { + amplitudesAll.resize(size * EcalDataFrame::MAXSAMPLES); + amplitude.resize(size); + pedestal.resize(size); + chi2.resize(size); + did.resize(size); + flags.resize(size); + jitter.resize(size); + jitterError.resize(size); + } + }; + + using SoAUncalibratedRecHitCollection = UncalibratedRecHit; + +} // namespace ecal + +#endif // RecoLocalCalo_EcalRecAlgos_interface_EcalUncalibratedRecHit_soa_h diff --git a/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h new file mode 100644 index 0000000000000..5667a9225f29d --- /dev/null +++ b/CUDADataFormats/EcalRecHitSoA/interface/RecoTypes.h @@ -0,0 +1,13 @@ +#ifndef CUDADataFormats_EcalRecHitSoA_interface_RecoTypes +#define CUDADataFormats_EcalRecHitSoA_interface_RecoTypes + +namespace ecal { + namespace reco { + + using ComputationScalarType = float; + using StorageScalarType = float; + + } // namespace reco +} // namespace ecal + +#endif diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes.h b/CUDADataFormats/EcalRecHitSoA/src/classes.h new file mode 100644 index 0000000000000..8ad6b8d684b9a --- /dev/null +++ b/CUDADataFormats/EcalRecHitSoA/src/classes.h @@ -0,0 +1,2 @@ +#include "DataFormats/Common/interface/Wrapper.h" +#include "CUDADataFormats/EcalRecHitSoA/interface/EcalUncalibratedRecHit_soa.h" diff --git a/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml new file mode 100644 index 0000000000000..461460835a723 --- /dev/null +++ b/CUDADataFormats/EcalRecHitSoA/src/classes_def.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml new file mode 100644 index 0000000000000..5e401d215c4eb --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h new file mode 100644 index 0000000000000..d3650e164d44e --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -0,0 +1,73 @@ +#ifndef 
CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h +#define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +#include + +class SiPixelClustersCUDA { +public: + SiPixelClustersCUDA() = default; + explicit SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream); + ~SiPixelClustersCUDA() = default; + + SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; + SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; + SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; + SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; + + void setNClusters(uint32_t nClusters) { nClusters_h = nClusters; } + + uint32_t nClusters() const { return nClusters_h; } + + uint32_t *moduleStart() { return moduleStart_d.get(); } + uint32_t *clusInModule() { return clusInModule_d.get(); } + uint32_t *moduleId() { return moduleId_d.get(); } + uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } + + uint32_t const *moduleStart() const { return moduleStart_d.get(); } + uint32_t const *clusInModule() const { return clusInModule_d.get(); } + uint32_t const *moduleId() const { return moduleId_d.get(); } + uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } + + uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } + uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } + uint32_t const *c_moduleId() const { return moduleId_d.get(); } + uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } + + class DeviceConstView { + public: + // DeviceConstView() = default; + + __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } + __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } + __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } + __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } + + friend SiPixelClustersCUDA; + + // private: + uint32_t const *moduleStart_; + uint32_t const *clusInModule_; + uint32_t const *moduleId_; + uint32_t const *clusModuleStart_; + }; + + DeviceConstView *view() const { return view_d.get(); } + +private: + cudautils::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cudautils::device::unique_ptr clusInModule_d; // number of clusters found in each module + cudautils::device::unique_ptr moduleId_d; // module id of each module + + // originally from rechits + cudautils::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module + + cudautils::device::unique_ptr view_d; // "me" pointer + + uint32_t nClusters_h; +}; + +#endif diff --git a/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h b/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h new file mode 100644 index 0000000000000..1430606ab6678 --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h @@ -0,0 +1,32 @@ +#ifndef CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h +#define CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h + +#include + +namespace pixelGPUConstants { +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumberOfHits = 24 * 1024; +#else + constexpr 
uint32_t maxNumberOfHits = + 48 * 1024; // data at pileup 50 has 18300 +/- 3500 hits; 40000 is around 6 sigma away +#endif +} // namespace pixelGPUConstants + +namespace gpuClustering { +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxHitsInIter() { return 64; } +#else + // optimized for real data PU 50 + constexpr uint32_t maxHitsInIter() { return 160; } +#endif + constexpr uint32_t maxHitsInModule() { return 1024; } + + constexpr uint32_t MaxNumModules = 2000; + constexpr int32_t MaxNumClustersPerModules = maxHitsInModule(); + constexpr uint32_t MaxHitsInModule = maxHitsInModule(); // as above + constexpr uint32_t MaxNumClusters = pixelGPUConstants::maxNumberOfHits; + constexpr uint16_t InvId = 9999; // must be > MaxNumModules + +} // namespace gpuClustering + +#endif // CUDADataFormats_SiPixelCluster_interface_gpuClusteringConstants_h diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc new file mode 100644 index 0000000000000..c814cd4a2e131 --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -0,0 +1,21 @@ +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" + +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cudaStream_t stream) { + moduleStart_d = cudautils::make_device_unique(maxClusters + 1, stream); + clusInModule_d = cudautils::make_device_unique(maxClusters, stream); + moduleId_d = cudautils::make_device_unique(maxClusters, stream); + clusModuleStart_d = cudautils::make_device_unique(maxClusters + 1, stream); + + auto view = cudautils::make_host_unique(stream); + view->moduleStart_ = moduleStart_d.get(); + view->clusInModule_ = clusInModule_d.get(); + view->moduleId_ = moduleId_d.get(); + view->clusModuleStart_ = clusModuleStart_d.get(); + + view_d = cudautils::make_device_unique(stream); + cudautils::copyAsync(view_d, view, stream); +} diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h b/CUDADataFormats/SiPixelCluster/src/classes.h new file mode 100644 index 0000000000000..08d46244adc7d --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_SiPixelCluster_classes_h +#define CUDADataFormats_SiPixelCluster_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml new file mode 100644 index 0000000000000..ba0706ac4b8aa --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml new file mode 100644 index 0000000000000..ee357e2d4e157 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h new file mode 100644 index 0000000000000..7c18d58a3fc12 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h @@ -0,0 +1,41 @@ +#ifndef 
CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h + +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" + +#include + +class SiPixelDigiErrorsCUDA { +public: + SiPixelDigiErrorsCUDA() = default; + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream); + ~SiPixelDigiErrorsCUDA() = default; + + SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default; + SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + GPU::SimpleVector* error() { return error_d.get(); } + GPU::SimpleVector const* error() const { return error_d.get(); } + GPU::SimpleVector const* c_error() const { return error_d.get(); } + + using HostDataError = + std::pair, cudautils::host::unique_ptr>; + HostDataError dataErrorToHostAsync(cudaStream_t stream) const; + + void copyErrorToHostAsync(cudaStream_t stream); + +private: + cudautils::device::unique_ptr data_d; + cudautils::device::unique_ptr> error_d; + cudautils::host::unique_ptr> error_h; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h new file mode 100644 index 0000000000000..47efe634ad93d --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -0,0 +1,98 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +#include + +class SiPixelDigisCUDA { +public: + SiPixelDigisCUDA() = default; + explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + ~SiPixelDigisCUDA() = default; + + SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; + SiPixelDigisCUDA &operator=(const SiPixelDigisCUDA &) = delete; + SiPixelDigisCUDA(SiPixelDigisCUDA &&) = default; + SiPixelDigisCUDA &operator=(SiPixelDigisCUDA &&) = default; + + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; } + uint32_t nDigis() const { return nDigis_h; } + + uint16_t *xx() { return xx_d.get(); } + uint16_t *yy() { return yy_d.get(); } + uint16_t *adc() { return adc_d.get(); } + uint16_t *moduleInd() { return moduleInd_d.get(); } + int32_t *clus() { return clus_d.get(); } + uint32_t *pdigi() { return pdigi_d.get(); } + uint32_t *rawIdArr() { return rawIdArr_d.get(); } + + uint16_t const *xx() const { return xx_d.get(); } + uint16_t const *yy() const { return yy_d.get(); } + uint16_t const *adc() const { return adc_d.get(); } + uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } 
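A minimal host-side usage sketch, not part of the patch, for the accessors above; the kernels that fill the columns live elsewhere in the reconstruction code, and the *ToHostAsync helpers and the DeviceConstView are declared just below.

#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

void digisSketch(size_t maxFedWords, uint32_t nModules, uint32_t nDigis, cudaStream_t stream) {
  SiPixelDigisCUDA digis(maxFedWords, stream);  // allocates every device column on 'stream'

  // ... raw-to-digi and clustering kernels fill digis.xx(), digis.yy(), digis.adc(),
  //     digis.moduleInd() and digis.clus() asynchronously on the same stream ...

  digis.setNModulesDigis(nModules, nDigis);   // counts produced by those kernels

  auto adc_h = digis.adcToHostAsync(stream);  // pinned-host copy of the ADC column
  cudaCheck(cudaStreamSynchronize(stream));   // adc_h is usable only after the stream syncs
}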
+ + uint16_t const *c_xx() const { return xx_d.get(); } + uint16_t const *c_yy() const { return yy_d.get(); } + uint16_t const *c_adc() const { return adc_d.get(); } + uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + + cudautils::host::unique_ptr adcToHostAsync(cudaStream_t stream) const; + cudautils::host::unique_ptr clusToHostAsync(cudaStream_t stream) const; + cudautils::host::unique_ptr pdigiToHostAsync(cudaStream_t stream) const; + cudautils::host::unique_ptr rawIdArrToHostAsync(cudaStream_t stream) const; + + class DeviceConstView { + public: + // DeviceConstView() = default; + + __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } + __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } + __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } + __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } + + friend class SiPixelDigisCUDA; + + // private: + uint16_t const *xx_; + uint16_t const *yy_; + uint16_t const *adc_; + uint16_t const *moduleInd_; + int32_t const *clus_; + }; + + const DeviceConstView *view() const { return view_d.get(); } + +private: + // These are consumed by downstream device code + cudautils::device::unique_ptr xx_d; // local coordinates of each pixel + cudautils::device::unique_ptr yy_d; // + cudautils::device::unique_ptr adc_d; // ADC of each pixel + cudautils::device::unique_ptr moduleInd_d; // module id of each pixel + cudautils::device::unique_ptr clus_d; // cluster id of each pixel + cudautils::device::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? 
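An illustrative CUDA kernel, not part of this patch, reading the digi SoA through the DeviceConstView defined above; the __ldg in the accessors keeps the loads on the read-only cache path. The kernel name and the reduction it performs are made up for the example.

__global__ void sumADC(SiPixelDigisCUDA::DeviceConstView const* view, uint32_t nDigis, unsigned long long* sum) {
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < nDigis)
    atomicAdd(sum, static_cast<unsigned long long>(view->adc(i)));
}

// launched on the product's stream, passing the "me" pointer:
//   sumADC<<<blocks, threads, 0, stream>>>(digis.view(), digis.nDigis(), sum_d);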
+ cudautils::device::unique_ptr pdigi_d; + cudautils::device::unique_ptr rawIdArr_d; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc new file mode 100644 index 0000000000000..7640348c15f08 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc @@ -0,0 +1,42 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +#include + +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cudaStream_t stream) + : formatterErrors_h(std::move(errors)) { + error_d = cudautils::make_device_unique>(stream); + data_d = cudautils::make_device_unique(maxFedWords, stream); + + cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); + + error_h = cudautils::make_host_unique>(stream); + GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->empty()); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cudautils::copyAsync(error_d, error_h, stream); +} + +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) { + cudautils::copyAsync(error_h, error_d, stream); +} + +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const { + // On one hand size() could be sufficient. On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. 
+ auto data = cudautils::make_host_unique(error_h->capacity(), stream); + + // but transfer only the required amount + if (not error_h->empty()) { + cudautils::copyAsync(data, data_d, error_h->size(), stream); + } + auto err = *error_h; + err.set_data(data.get()); + return HostDataError(std::move(err), std::move(data)); +} diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc new file mode 100644 index 0000000000000..a8aab7ab5a4b8 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -0,0 +1,50 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" + +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) { + xx_d = cudautils::make_device_unique(maxFedWords, stream); + yy_d = cudautils::make_device_unique(maxFedWords, stream); + adc_d = cudautils::make_device_unique(maxFedWords, stream); + moduleInd_d = cudautils::make_device_unique(maxFedWords, stream); + clus_d = cudautils::make_device_unique(maxFedWords, stream); + + pdigi_d = cudautils::make_device_unique(maxFedWords, stream); + rawIdArr_d = cudautils::make_device_unique(maxFedWords, stream); + + auto view = cudautils::make_host_unique(stream); + view->xx_ = xx_d.get(); + view->yy_ = yy_d.get(); + view->adc_ = adc_d.get(); + view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); + + view_d = cudautils::make_device_unique(stream); + cudautils::copyAsync(view_d, view, stream); +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, adc_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, clus_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, pdigi_d, nDigis(), stream); + return ret; +} + +cudautils::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream); + return ret; +} diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h new file mode 100644 index 0000000000000..41b135640b883 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -0,0 +1,9 @@ +#ifndef CUDADataFormats_SiPixelDigi_classes_h +#define CUDADataFormats_SiPixelDigi_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml new file mode 100644 index 0000000000000..9d6816ed3b14c --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/CUDADataFormats/Track/BuildFile.xml 
b/CUDADataFormats/Track/BuildFile.xml new file mode 100644 index 0000000000000..e3f9a0910bbd8 --- /dev/null +++ b/CUDADataFormats/Track/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h new file mode 100644 index 0000000000000..bd4ec059f6e9c --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h @@ -0,0 +1,74 @@ +#ifndef CUDADataFormatsTrackTrackHeterogeneous_H +#define CUDADataFormatsTrackTrackHeterogeneous_H + +#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" + +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" + +namespace trackQuality { + enum Quality : uint8_t { bad = 0, dup, loose, strict, tight, highPurity }; +} + +template +class TrackSoAT { +public: + static constexpr int32_t stride() { return S; } + + using Quality = trackQuality::Quality; + using hindex_type = uint16_t; + using HitContainer = OneToManyAssoc; + + // Always check quality is at least loose! + // CUDA does not support enums in __lgc ... + eigenSoA::ScalarSoA m_quality; + constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i)); } + constexpr Quality &quality(int32_t i) { return (Quality &)(m_quality(i)); } + constexpr Quality const *qualityData() const { return (Quality const *)(m_quality.data()); } + constexpr Quality *qualityData() { return (Quality *)(m_quality.data()); } + + // this is chi2/ndof as not necessarely all hits are used in the fit + eigenSoA::ScalarSoA chi2; + + constexpr int nHits(int i) const { return detIndices.size(i); } + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + TrajectoryStateSoA stateAtBS; + eigenSoA::ScalarSoA eta; + eigenSoA::ScalarSoA pt; + constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } + constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } + constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } + constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } + + // state at the detector of the outermost hit + // representation to be decided... 
+ // not yet filled on GPU + // TrajectoryStateSoA stateAtOuterDet; + + HitContainer hitIndices; + HitContainer detIndices; + + // total number of tracks (including those not fitted) + uint32_t m_nTracks; +}; + +namespace pixelTrack { + +#ifdef GPU_SMALL_EVENTS + constexpr uint32_t maxNumber() { return 2 * 1024; } +#else + constexpr uint32_t maxNumber() { return 32 * 1024; } +#endif + + using TrackSoA = TrackSoAT; + using TrajectoryState = TrajectoryStateSoA; + using HitContainer = TrackSoA::HitContainer; + using Quality = trackQuality::Quality; + +} // namespace pixelTrack + +using PixelTrackHeterogeneous = HeterogeneousSoA; + +#endif // CUDADataFormatsTrackTrackSoA_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h new file mode 100644 index 0000000000000..7cd2e93fb914e --- /dev/null +++ b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h @@ -0,0 +1,59 @@ +#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H +#define CUDADataFormatsTrackTrajectoryStateSOA_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" + +template +struct TrajectoryStateSoA { + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + + using Vector5d = Eigen::Matrix; + using Matrix5d = Eigen::Matrix; + + static constexpr int32_t stride() { return S; } + + eigenSoA::MatrixSoA state; + eigenSoA::MatrixSoA covariance; + + template + __host__ __device__ inline void copyFromCircle( + V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { + state(i) << cp.template cast(), lp.template cast(); + state(i)(2) *= b; + auto cov = covariance(i); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { + state(i) = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + covariance(i)(ind++) = cov(j, k); + } + + template + __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { + v = state(i).template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = covariance(i)(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = covariance(i)(ind++); + } + } +}; + +#endif // CUDADataFormatsTrackTrajectoryStateSOA_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h new file mode 100644 index 0000000000000..699e45ede05d4 --- /dev/null +++ b/CUDADataFormats/Track/src/classes.h @@ -0,0 +1,10 @@ +#ifndef CUDADataFormats__src_classes_h +#define CUDADataFormats__src_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Common/interface/ArrayShadow.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml new file mode 100644 index 0000000000000..a4c2e766582dd --- /dev/null +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml new file mode 
100644 index 0000000000000..598b345d4709d --- /dev/null +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu new file mode 100644 index 0000000000000..d6ff539a642b0 --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu @@ -0,0 +1 @@ +#include "TrajectoryStateSOA_t.h" diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h new file mode 100644 index 0000000000000..03c51c39acdfb --- /dev/null +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -0,0 +1,75 @@ +#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h" + +using Vector5d = Eigen::Matrix; +using Matrix5d = Eigen::Matrix; + +__host__ __device__ Matrix5d loadCov(Vector5d const& e) { + Matrix5d cov; + for (int i = 0; i < 5; ++i) + cov(i, i) = e(i) * e(i); + for (int i = 0; i < 5; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(cov(i, i) * cov(j, j)); // this makes the matrix pos defined + cov(i, j) = (i + j) % 2 ? -0.4 * v : 0.1 * v; + cov(j, i) = cov(i, j); + } + } + return cov; +} + +using TS = TrajectoryStateSoA<128>; + +__global__ void testTSSoA(TS* pts, int n) { + assert(n <= 128); + + Vector5d par0; + par0 << 0.2, 0.1, 3.5, 0.8, 0.1; + Vector5d e0; + e0 << 0.01, 0.01, 0.035, -0.03, -0.01; + auto cov0 = loadCov(e0); + + TS& ts = *pts; + + int first = threadIdx.x + blockIdx.x * blockDim.x; + + for (int i = first; i < n; i += blockDim.x * gridDim.x) { + ts.copyFromDense(par0, cov0, i); + Vector5d par1; + Matrix5d cov1; + ts.copyToDense(par1, cov1, i); + Vector5d delV = par1 - par0; + Matrix5d delM = cov1 - cov0; + for (int j = 0; j < 5; ++j) { + assert(std::abs(delV(j)) < 1.e-5); + for (auto k = j; k < 5; ++k) { + assert(cov0(k, j) == cov0(j, k)); + assert(cov1(k, j) == cov1(j, k)); + assert(std::abs(delM(k, j)) < 1.e-5); + } + } + } +} + +#ifdef __CUDACC__ +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#endif + +int main() { +#ifdef __CUDACC__ + exitSansCUDADevices(); +#endif + + TS ts; + +#ifdef __CUDACC__ + TS* ts_d; + cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); + testTSSoA<<<1, 64>>>(ts_d, 128); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); + cudaCheck(cudaDeviceSynchronize()); +#else + testTSSoA(&ts, 128); +#endif +} diff --git a/CUDADataFormats/TrackingRecHit/BuildFile.xml b/CUDADataFormats/TrackingRecHit/BuildFile.xml new file mode 100644 index 0000000000000..8dc569d40b6c4 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/BuildFile.xml @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h new file mode 100644 index 0000000000000..f6b715b3e743e --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h @@ -0,0 +1 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h 
b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h new file mode 100644 index 0000000000000..aa551f21b4aad --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h @@ -0,0 +1,150 @@ +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" + +template +class TrackingRecHit2DHeterogeneous { +public: + template + using unique_ptr = typename Traits::template unique_ptr; + + using Hist = TrackingRecHit2DSOAView::Hist; + + TrackingRecHit2DHeterogeneous() = default; + + explicit TrackingRecHit2DHeterogeneous(uint32_t nHits, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream); + + ~TrackingRecHit2DHeterogeneous() = default; + + TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete; + TrackingRecHit2DHeterogeneous& operator=(const TrackingRecHit2DHeterogeneous&) = delete; + TrackingRecHit2DHeterogeneous(TrackingRecHit2DHeterogeneous&&) = default; + TrackingRecHit2DHeterogeneous& operator=(TrackingRecHit2DHeterogeneous&&) = default; + + TrackingRecHit2DSOAView* view() { return m_view.get(); } + TrackingRecHit2DSOAView const* view() const { return m_view.get(); } + + auto nHits() const { return m_nHits; } + + auto hitsModuleStart() const { return m_hitsModuleStart; } + auto hitsLayerStart() { return m_hitsLayerStart; } + auto phiBinner() { return m_hist; } + auto iphi() { return m_iphi; } + + // only the local coord and detector index + cudautils::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; + cudautils::host::unique_ptr detIndexToHostAsync(cudaStream_t stream) const; + cudautils::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; + +private: + static constexpr uint32_t n16 = 4; + static constexpr uint32_t n32 = 9; + static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious + + unique_ptr m_store16; //! + unique_ptr m_store32; //! + + unique_ptr m_HistStore; //! + unique_ptr m_AverageGeometryStore; //! + + unique_ptr m_view; //! + + uint32_t m_nHits; + + uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! + + // needed as kernel params... 
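Illustrative sketch, not part of the patch: building the GPU rec-hit container and pulling the local coordinates back to the host. cpeParams_d and hitsModuleStart_d are assumed to come from the CPE conditions payload and from the cluster product, the filling kernels live elsewhere in the codebase, and the TrackingRecHit2DCUDA alias is defined at the end of this header.

#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

void hitsSketch(uint32_t nHits,
                pixelCPEforGPU::ParamsOnGPU const* cpeParams_d,
                uint32_t const* hitsModuleStart_d,
                cudaStream_t stream) {
  TrackingRecHit2DCUDA hits(nHits, cpeParams_d, hitsModuleStart_d, stream);

  // ... rec-hit building kernels fill hits.view() asynchronously on 'stream' ...

  // 4 * nHits floats: xl, yl, xerr, yerr, packed as laid out in the constructor below
  auto lc_h = hits.localCoordToHostAsync(stream);
  cudaCheck(cudaStreamSynchronize(stream));  // lc_h is valid only after the copy completes
}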
+ Hist* m_hist; + uint32_t* m_hitsLayerStart; + int16_t* m_iphi; +}; + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nHits, + pixelCPEforGPU::ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : m_nHits(nHits), m_hitsModuleStart(hitsModuleStart) { + auto view = Traits::template make_host_unique(stream); + + view->m_nHits = nHits; + m_view = Traits::template make_device_unique(stream); + m_AverageGeometryStore = Traits::template make_device_unique(stream); + view->m_averageGeometry = m_AverageGeometryStore.get(); + view->m_cpeParams = cpeParams; + view->m_hitsModuleStart = hitsModuleStart; + + // if empy do not bother + if (0 == nHits) { + if +#ifndef __CUDACC__ + constexpr +#endif + (std::is_same::value) { + cudautils::copyAsync(m_view, view, stream); + } else { + m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + } + return; + } + + // the single arrays are not 128 bit alligned... + // the hits are actually accessed in order only in building + // if ordering is relevant they may have to be stored phi-ordered by layer or so + // this will break 1to1 correspondence with cluster and module locality + // so unless proven VERY inefficient we keep it ordered as generated + m_store16 = Traits::template make_device_unique(nHits * n16, stream); + m_store32 = Traits::template make_device_unique(nHits * n32 + 11, stream); + m_HistStore = Traits::template make_device_unique(stream); + + auto get16 = [&](int i) { return m_store16.get() + i * nHits; }; + auto get32 = [&](int i) { return m_store32.get() + i * nHits; }; + + // copy all the pointers + m_hist = view->m_hist = m_HistStore.get(); + + view->m_xl = get32(0); + view->m_yl = get32(1); + view->m_xerr = get32(2); + view->m_yerr = get32(3); + + view->m_xg = get32(4); + view->m_yg = get32(5); + view->m_zg = get32(6); + view->m_rg = get32(7); + + m_iphi = view->m_iphi = reinterpret_cast(get16(0)); + + view->m_charge = reinterpret_cast(get32(8)); + view->m_xsize = reinterpret_cast(get16(2)); + view->m_ysize = reinterpret_cast(get16(3)); + view->m_detInd = get16(1); + + m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(n32)); + + // transfer view + if +#ifndef __CUDACC__ + constexpr +#endif + (std::is_same::value) { + cudautils::copyAsync(m_view, view, stream); + } else { + m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version + } +} + +using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCUDA = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous; +using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous; + +#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h new file mode 100644 index 0000000000000..8e6d99e81238a --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h @@ -0,0 +1,100 @@ +#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h +#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h + +#include + +#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h" + +namespace pixelCPEforGPU { + struct ParamsOnGPU; +} + +class TrackingRecHit2DSOAView { +public: + static constexpr uint32_t maxHits() { return gpuClustering::MaxNumClusters; } + using hindex_type = uint16_t; // if above is <=2^16 + + using Hist = HistoContainer; + + using AverageGeometry = phase1PixelTopology::AverageGeometry; + + template + friend class TrackingRecHit2DHeterogeneous; + + __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } + + __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } + __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } + __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } + __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } + + __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } + __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } + __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } + __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } + + __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } + __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } + __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } + __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } + __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } + __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } + __device__ __forceinline__ float& rGlobal(int i) { return m_rg[i]; } + __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } + + __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } + __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } + + __device__ __forceinline__ int32_t& charge(int i) { return m_charge[i]; } + __device__ __forceinline__ int32_t charge(int i) const { return __ldg(m_charge + i); } + __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } + __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } + __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } + __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } + __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } + __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } + + __device__ __forceinline__ pixelCPEforGPU::ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } + + __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } + + __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } + __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } + + __device__ __forceinline__ Hist& phiBinner() { return *m_hist; } + __device__ __forceinline__ Hist const& phiBinner() const { return *m_hist; } + + __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } + __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } + +private: + // local coord + float *m_xl, *m_yl; + float *m_xerr, *m_yerr; + + // 
global coord + float *m_xg, *m_yg, *m_zg, *m_rg; + int16_t* m_iphi; + + // cluster properties + int32_t* m_charge; + int16_t* m_xsize; + int16_t* m_ysize; + uint16_t* m_detInd; + + // supporting objects + AverageGeometry* m_averageGeometry; // owned (corrected for beam spot: not sure where to host it otherwise) + pixelCPEforGPU::ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned + uint32_t const* m_hitsModuleStart; // forwarded from clusters + + uint32_t* m_hitsLayerStart; + + Hist* m_hist; + + uint32_t m_nHits; +}; + +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc new file mode 100644 index 0000000000000..e6f223bfec4e3 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DCUDA.cc @@ -0,0 +1,19 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +template <> +cudautils::host::unique_ptr TrackingRecHit2DCUDA::localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(4 * nHits(), stream); + cudautils::copyAsync(ret, m_store32, 4 * nHits(), stream); + return ret; +} + +template <> +cudautils::host::unique_ptr TrackingRecHit2DCUDA::hitsModuleStartToHostAsync(cudaStream_t stream) const { + auto ret = cudautils::make_host_unique(2001, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), m_hitsModuleStart, 4 * 2001, cudaMemcpyDefault, stream)); + return ret; +} diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h new file mode 100644 index 0000000000000..90cfd0945d76e --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -0,0 +1,9 @@ +#ifndef CUDADataFormats_SiPixelCluster_src_classes_h +#define CUDADataFormats_SiPixelCluster_src_classes_h + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif // CUDADataFormats_SiPixelCluster_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml new file mode 100644 index 0000000000000..4e8325ddce87e --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -0,0 +1,10 @@ + + + + + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml new file mode 100644 index 0000000000000..74f2818790d0f --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -0,0 +1,3 @@ + + + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp new file mode 100644 index 0000000000000..42be4bc6991e1 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp @@ -0,0 +1,29 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace testTrackingRecHit2D { + 
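  // Illustrative sketch only (the kernel name is made up, not part of this
  // test): kernels in the companion .cu file receive the device-side
  // TrackingRecHit2DSOAView* and use its accessors directly, e.g.
  //
  //   __global__ void fillLocal(TrackingRecHit2DSOAView* pview) {
  //     auto& view = *pview;
  //     int i = blockIdx.x * blockDim.x + threadIdx.x;
  //     if (i >= int(view.nHits()))
  //       return;
  //     view.xLocal(i) = 0.f;  // non-const accessors return references
  //     view.yLocal(i) = 0.f;
  //   }
  //
  // The host code below only deals with the owning TrackingRecHit2DCUDA
  // wrapper; the raw view pointer returned by view() lives in device memory.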
+ void runKernels(TrackingRecHit2DSOAView* hits); + +} + +int main() { + exitSansCUDADevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // inner scope to deallocate memory before destroying the stream + { + auto nHits = 200; + TrackingRecHit2DCUDA tkhit(nHits, nullptr, nullptr, stream); + + testTrackingRecHit2D::runKernels(tkhit.view()); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu new file mode 100644 index 0000000000000..6b55f8a8f98c5 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu @@ -0,0 +1,31 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h" + +namespace testTrackingRecHit2D { + + __global__ void fill(TrackingRecHit2DSOAView* phits) { + assert(phits); + auto& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } + + __global__ void verify(TrackingRecHit2DSOAView const* phits) { + assert(phits); + auto const& hits = *phits; + assert(hits.nHits() == 200); + + int i = threadIdx.x; + if (i > 200) + return; + } + + void runKernels(TrackingRecHit2DSOAView* hits) { + assert(hits); + fill<<<1, 1024>>>(hits); + verify<<<1, 1024>>>(hits); + } + +} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml new file mode 100644 index 0000000000000..e3f9a0910bbd8 --- /dev/null +++ b/CUDADataFormats/Vertex/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h new file mode 100644 index 0000000000000..d12ed5f3d98de --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h @@ -0,0 +1,14 @@ +#ifndef CUDADataFormatsVertexZVertexHeterogeneous_H +#define CUDADataFormatsVertexZVertexHeterogeneous_H + +#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" + +using ZVertexHeterogeneous = HeterogeneousSoA; +#ifndef __CUDACC__ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +using ZVertexCUDAProduct = CUDAProduct; +#endif + +#endif diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoA.h b/CUDADataFormats/Vertex/interface/ZVertexSoA.h new file mode 100644 index 0000000000000..cd1f8aea4e340 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoA.h @@ -0,0 +1,26 @@ +#ifndef CUDADataFormatsVertexZVertexSoA_H +#define CUDADataFormatsVertexZVertexSoA_H + +#include +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" + +// SOA for vertices +// These vertices are clusterized and fitted only along the beam line (z) +// to obtain their global coordinate the beam spot position shall be added (eventually correcting for the beam angle as well) +struct ZVertexSoA { + static constexpr uint32_t MAXTRACKS = 32 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + int16_t idv[MAXTRACKS]; // vertex index for each associated (original) track (-1 == not associate) + float zv[MAXVTX]; // output z-posistion of found vertices + float wv[MAXVTX]; // output weight (1/error^2) on the above + float chi2[MAXVTX]; // vertices chi2 + float ptv2[MAXVTX]; // vertices pt^2 + int32_t ndof[MAXVTX]; // vertices number of dof (reused as workspace for the 
number of nearest neighbours) + uint16_t sortInd[MAXVTX]; // sorted index (by pt2) ascending + uint32_t nvFinal; // the number of vertices + + __host__ __device__ void init() { nvFinal = 0; } +}; + +#endif // CUDADataFormatsVertexZVertexSoA.H diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h new file mode 100644 index 0000000000000..f1144d1e3014e --- /dev/null +++ b/CUDADataFormats/Vertex/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats__src_classes_h +#define CUDADataFormats__src_classes_h + +#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml new file mode 100644 index 0000000000000..c43814eb03def --- /dev/null +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h b/CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h new file mode 100644 index 0000000000000..56301421f325c --- /dev/null +++ b/CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h @@ -0,0 +1,17 @@ +#ifndef CalibTracker_Records_SiPixelGainCalibrationForHLTGPURcd_h +#define CalibTracker_Records_SiPixelGainCalibrationForHLTGPURcd_h + +#include "FWCore/Framework/interface/EventSetupRecordImplementation.h" +#include "FWCore/Framework/interface/DependentRecordImplementation.h" + +#include "CondFormats/DataRecord/interface/SiPixelGainCalibrationForHLTRcd.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" + +#include "boost/mpl/vector.hpp" + +class SiPixelGainCalibrationForHLTGPURcd + : public edm::eventsetup::DependentRecordImplementation< + SiPixelGainCalibrationForHLTGPURcd, + boost::mpl::vector > {}; + +#endif diff --git a/CalibTracker/Records/src/SiPixelGainCalibrationForHLTGPURcd.cc b/CalibTracker/Records/src/SiPixelGainCalibrationForHLTGPURcd.cc new file mode 100644 index 0000000000000..e6020eca80b1f --- /dev/null +++ b/CalibTracker/Records/src/SiPixelGainCalibrationForHLTGPURcd.cc @@ -0,0 +1,5 @@ +#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" +#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h" +#include "FWCore/Utilities/interface/typelookup.h" + +EVENTSETUP_RECORD_REG(SiPixelGainCalibrationForHLTGPURcd); diff --git a/CalibTracker/SiPixelESProducers/BuildFile.xml b/CalibTracker/SiPixelESProducers/BuildFile.xml index e9d22b32f0afb..02a36e17ed732 100644 --- a/CalibTracker/SiPixelESProducers/BuildFile.xml +++ b/CalibTracker/SiPixelESProducers/BuildFile.xml @@ -7,7 +7,9 @@ + + diff --git a/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h b/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h new file mode 100644 index 0000000000000..8bfefee5c3387 --- /dev/null +++ b/CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h @@ -0,0 +1,32 @@ +#ifndef CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h +#define CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h + +#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" + +class SiPixelGainCalibrationForHLT; +class SiPixelGainForHLTonGPU; +struct SiPixelGainForHLTonGPU_DecodingStructure; +class 
TrackerGeometry; + +class SiPixelGainCalibrationForHLTGPU { +public: + explicit SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT &gains, const TrackerGeometry &geom); + ~SiPixelGainCalibrationForHLTGPU(); + + const SiPixelGainForHLTonGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + const SiPixelGainForHLTonGPU *getCPUProduct() const { return gainForHLTonHost_; } + const SiPixelGainCalibrationForHLT *getOriginalProduct() { return gains_; } + +private: + const SiPixelGainCalibrationForHLT *gains_ = nullptr; + SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr; + struct GPUData { + ~GPUData(); + SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr; + SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr; + }; + CUDAESProduct gpuData_; +}; + +#endif // CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h diff --git a/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml b/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml index 44db9d9ba0582..57bf68a1b7518 100644 --- a/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml +++ b/CalibTracker/SiPixelESProducers/plugins/BuildFile.xml @@ -6,6 +6,8 @@ + + diff --git a/CalibTracker/SiPixelESProducers/plugins/SiPixelGainCalibrationForHLTGPUESProducer.cc b/CalibTracker/SiPixelESProducers/plugins/SiPixelGainCalibrationForHLTGPUESProducer.cc new file mode 100644 index 0000000000000..bf8a0b2c5a75f --- /dev/null +++ b/CalibTracker/SiPixelESProducers/plugins/SiPixelGainCalibrationForHLTGPUESProducer.cc @@ -0,0 +1,47 @@ +#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" +#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h" +#include "CondFormats/DataRecord/interface/SiPixelGainCalibrationForHLTRcd.h" +#include "FWCore/Framework/interface/ESProducer.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/ModuleFactory.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" + +#include + +class SiPixelGainCalibrationForHLTGPUESProducer : public edm::ESProducer { +public: + explicit SiPixelGainCalibrationForHLTGPUESProducer(const edm::ParameterSet& iConfig); + std::unique_ptr produce(const SiPixelGainCalibrationForHLTGPURcd& iRecord); + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + edm::ESGetToken gainsToken_; + edm::ESGetToken geometryToken_; +}; + +SiPixelGainCalibrationForHLTGPUESProducer::SiPixelGainCalibrationForHLTGPUESProducer(const edm::ParameterSet& iConfig) { + setWhatProduced(this).setConsumes(gainsToken_).setConsumes(geometryToken_); +} + +void SiPixelGainCalibrationForHLTGPUESProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + descriptions.add("siPixelGainCalibrationForHLTGPU", desc); +} + +std::unique_ptr SiPixelGainCalibrationForHLTGPUESProducer::produce( + const SiPixelGainCalibrationForHLTGPURcd& iRecord) { + auto gains = iRecord.getHandle(gainsToken_); + auto geom = iRecord.getHandle(geometryToken_); + return std::make_unique(*gains, *geom); +} + +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Utilities/interface/typelookup.h" +#include 
"FWCore/Framework/interface/eventsetuprecord_registration_macro.h" + +DEFINE_FWK_EVENTSETUP_MODULE(SiPixelGainCalibrationForHLTGPUESProducer); diff --git a/CalibTracker/SiPixelESProducers/src/ES_SiPixelGainCalibrationForHLTGPU.cc b/CalibTracker/SiPixelESProducers/src/ES_SiPixelGainCalibrationForHLTGPU.cc new file mode 100644 index 0000000000000..80932fb468f71 --- /dev/null +++ b/CalibTracker/SiPixelESProducers/src/ES_SiPixelGainCalibrationForHLTGPU.cc @@ -0,0 +1,4 @@ +#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" +#include "FWCore/Utilities/interface/typelookup.h" + +TYPELOOKUP_DATA_REG(SiPixelGainCalibrationForHLTGPU); diff --git a/CalibTracker/SiPixelESProducers/src/SiPixelGainCalibrationForHLTGPU.cc b/CalibTracker/SiPixelESProducers/src/SiPixelGainCalibrationForHLTGPU.cc new file mode 100644 index 0000000000000..e4f278c28ec69 --- /dev/null +++ b/CalibTracker/SiPixelESProducers/src/SiPixelGainCalibrationForHLTGPU.cc @@ -0,0 +1,104 @@ +#include + +#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelGainCalibrationForHLT.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h" +#include "Geometry/CommonDetUnit/interface/GeomDetType.h" +#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(const SiPixelGainCalibrationForHLT& gains, + const TrackerGeometry& geom) + : gains_(&gains) { + // bizzarre logic (looking for fist strip-det) don't ask + auto const& dus = geom.detUnits(); + unsigned m_detectors = dus.size(); + for (unsigned int i = 1; i < 7; ++i) { + if (geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]) != dus.size() && + dus[geom.offsetDU(GeomDetEnumerators::tkDetEnum[i])]->type().isTrackerStrip()) { + if (geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]) < m_detectors) + m_detectors = geom.offsetDU(GeomDetEnumerators::tkDetEnum[i]); + } + } + + /* + std::cout << "caching calibs for " << m_detectors << " pixel detectors of size " << gains.data().size() << std::endl; + std::cout << "sizes " << sizeof(char) << ' ' << sizeof(uint8_t) << ' ' << sizeof(SiPixelGainForHLTonGPU::DecodingStructure) << std::endl; + */ + + cudaCheck(cudaMallocHost((void**)&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU))); + gainForHLTonHost_->v_pedestals = + (SiPixelGainForHLTonGPU_DecodingStructure*)this->gains_->data().data(); // so it can be used on CPU as well... + + // do not read back from the (possibly write-combined) memory buffer + auto minPed = gains.getPedLow(); + auto maxPed = gains.getPedHigh(); + auto minGain = gains.getGainLow(); + auto maxGain = gains.getGainHigh(); + auto nBinsToUseForEncoding = 253; + + // we will simplify later (not everything is needed....) 
+ gainForHLTonHost_->minPed_ = minPed; + gainForHLTonHost_->maxPed_ = maxPed; + gainForHLTonHost_->minGain_ = minGain; + gainForHLTonHost_->maxGain_ = maxGain; + + gainForHLTonHost_->numberOfRowsAveragedOver_ = 80; + gainForHLTonHost_->nBinsToUseForEncoding_ = nBinsToUseForEncoding; + gainForHLTonHost_->deadFlag_ = 255; + gainForHLTonHost_->noisyFlag_ = 254; + + gainForHLTonHost_->pedPrecision = static_cast(maxPed - minPed) / nBinsToUseForEncoding; + gainForHLTonHost_->gainPrecision = static_cast(maxGain - minGain) / nBinsToUseForEncoding; + + /* + std::cout << "precisions g " << gainForHLTonHost_->pedPrecision << ' ' << gainForHLTonHost_->gainPrecision << std::endl; + */ + + // fill the index map + auto const& ind = gains.getIndexes(); + /* + std::cout << ind.size() << " " << m_detectors << std::endl; + */ + + for (auto i = 0U; i < m_detectors; ++i) { + auto p = std::lower_bound( + ind.begin(), ind.end(), dus[i]->geographicalId().rawId(), SiPixelGainCalibrationForHLT::StrictWeakOrdering()); + assert(p != ind.end() && p->detid == dus[i]->geographicalId()); + assert(p->iend <= gains.data().size()); + assert(p->iend >= p->ibegin); + assert(0 == p->ibegin % 2); + assert(0 == p->iend % 2); + assert(p->ibegin != p->iend); + assert(p->ncols > 0); + gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(p->ibegin, p->iend), p->ncols); + // if (ind[i].detid!=dus[i]->geographicalId()) std::cout << ind[i].detid<<"!="<geographicalId() << std::endl; + // gainForHLTonHost_->rangeAndCols[i] = std::make_pair(SiPixelGainForHLTonGPU::Range(ind[i].ibegin,ind[i].iend), ind[i].ncols); + } +} + +SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); } + +SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() { + cudaCheck(cudaFree(gainForHLTonGPU)); + cudaCheck(cudaFree(gainDataOnGPU)); +} + +const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cudaStream_t cudaStream) const { + const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { + cudaCheck(cudaMalloc((void**)&data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU))); + cudaCheck(cudaMalloc((void**)&data.gainDataOnGPU, this->gains_->data().size())); + // gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory + cudaCheck(cudaMemcpyAsync( + data.gainDataOnGPU, this->gains_->data().data(), this->gains_->data().size(), cudaMemcpyDefault, stream)); + + cudaCheck(cudaMemcpyAsync( + data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals), + &(data.gainDataOnGPU), + sizeof(SiPixelGainForHLTonGPU_DecodingStructure*), + cudaMemcpyDefault, + stream)); + }); + return data.gainForHLTonGPU; +} diff --git a/CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h b/CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h new file mode 100644 index 0000000000000..8ce3924e54609 --- /dev/null +++ b/CondFormats/SiPixelObjects/interface/SiPixelGainForHLTonGPU.h @@ -0,0 +1,63 @@ +#ifndef CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h +#define CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h + +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" + +struct SiPixelGainForHLTonGPU_DecodingStructure { + uint8_t gain; + uint8_t ped; +}; + +// copy of SiPixelGainCalibrationForHLT +class 
SiPixelGainForHLTonGPU { +public: + using DecodingStructure = SiPixelGainForHLTonGPU_DecodingStructure; + + using Range = std::pair; + + inline __host__ __device__ std::pair getPedAndGain( + uint32_t moduleInd, int col, int row, bool& isDeadColumn, bool& isNoisyColumn) const { + auto range = rangeAndCols[moduleInd].first; + auto nCols = rangeAndCols[moduleInd].second; + + // determine what averaged data block we are in (there should be 1 or 2 of these depending on if plaquette is 1 by X or 2 by X + unsigned int lengthOfColumnData = (range.second - range.first) / nCols; + unsigned int lengthOfAveragedDataInEachColumn = 2; // we always only have two values per column averaged block + unsigned int numberOfDataBlocksToSkip = row / numberOfRowsAveragedOver_; + + auto offset = range.first + col * lengthOfColumnData + lengthOfAveragedDataInEachColumn * numberOfDataBlocksToSkip; + + assert(offset < range.second); + assert(offset < 3088384); + assert(0 == offset % 2); + + DecodingStructure const* __restrict__ lp = v_pedestals; + auto s = lp[offset / 2]; + + isDeadColumn = (s.ped & 0xFF) == deadFlag_; + isNoisyColumn = (s.ped & 0xFF) == noisyFlag_; + + return std::make_pair(decodePed(s.ped & 0xFF), decodeGain(s.gain & 0xFF)); + } + + constexpr float decodeGain(unsigned int gain) const { return gain * gainPrecision + minGain_; } + constexpr float decodePed(unsigned int ped) const { return ped * pedPrecision + minPed_; } + + DecodingStructure* v_pedestals; + std::pair rangeAndCols[2000]; + + float minPed_, maxPed_, minGain_, maxGain_; + + float pedPrecision, gainPrecision; + + unsigned int numberOfRowsAveragedOver_; // this is 80!!!! + unsigned int nBinsToUseForEncoding_; + unsigned int deadFlag_; + unsigned int noisyFlag_; +}; + +#endif // CondFormats_SiPixelObjects_SiPixelGainForHLTonGPU_h diff --git a/Configuration/Applications/python/ConfigBuilder.py b/Configuration/Applications/python/ConfigBuilder.py index 608651f121385..4ae4b2df7a1d7 100644 --- a/Configuration/Applications/python/ConfigBuilder.py +++ b/Configuration/Applications/python/ConfigBuilder.py @@ -921,6 +921,8 @@ def define_Configs(self): self.loadAndRemember('SimGeneral.HepPDTESSource.'+self._options.particleTable+'_cfi') self.loadAndRemember('FWCore/MessageService/MessageLogger_cfi') + # Eventually replace with some more generic file to load + self.loadAndRemember('HeterogeneousCore/CUDAServices/CUDAService_cfi') self.ALCADefaultCFF="Configuration/StandardSequences/AlCaRecoStreams_cff" self.GENDefaultCFF="Configuration/StandardSequences/Generator_cff" diff --git a/Configuration/ProcessModifiers/python/gpu_cff.py b/Configuration/ProcessModifiers/python/gpu_cff.py new file mode 100644 index 0000000000000..993f71804fbc1 --- /dev/null +++ b/Configuration/ProcessModifiers/python/gpu_cff.py @@ -0,0 +1,5 @@ +import FWCore.ParameterSet.Config as cms + +# This modifier is for replacing CPU modules with GPU counterparts + +gpu = cms.Modifier() diff --git a/Configuration/ProcessModifiers/python/pixelNtupleFit_cff.py b/Configuration/ProcessModifiers/python/pixelNtupleFit_cff.py new file mode 100644 index 0000000000000..db8a2ac229a02 --- /dev/null +++ b/Configuration/ProcessModifiers/python/pixelNtupleFit_cff.py @@ -0,0 +1,5 @@ +import FWCore.ParameterSet.Config as cms + +# This modifier is for replacing the default pixel track "fitting" with eihter Riemann or BrokenLine fit + +pixelNtupleFit = cms.Modifier() diff --git a/Configuration/PyReleaseValidation/python/relval_2017.py b/Configuration/PyReleaseValidation/python/relval_2017.py index 
68bb0323078a1..7917a21310dd6 100644 --- a/Configuration/PyReleaseValidation/python/relval_2017.py +++ b/Configuration/PyReleaseValidation/python/relval_2017.py @@ -20,10 +20,18 @@ # 2018 (ele guns 10, 35, 1000; pho guns 10, 35; mu guns 1, 10, 100, 1000, QCD 3TeV, QCD Flat) # 2018 (ZMM, TTbar, ZEE, MinBias, TTbar PU, ZEE PU, TTbar design) # (TTbar trackingOnly, pixelTrackingOnly) -# he collapse: TTbar, TTbar PU, TTbar design -# ParkingBPH: TTbar +# (HE collapse: TTbar, TTbar PU, TTbar design) +# (ParkingBPH: TTbar) +# (Patatrack pixel-only: ZMM - on CPU, on GPU, both, auto) +# (Patatrack pixel-only: TTbar - on CPU, on GPU, both, auto) +# (Patatrack ECAL-only: TTbar - on CPU, on GPU, both, auto) +# (Patatrack HCAL-only: TTbar - on CPU, on GPU, both, auto) # 2021 (ZMM, TTbar, ZEE, MinBias, TTbar PU, TTbar PU premix, ZEE PU, TTbar design) # (TTbar trackingMkFit) +# (Patatrack pixel-only: ZMM - on CPU, on GPU, both, auto) +# (Patatrack pixel-only: TTbar - on CPU, on GPU, both, auto) +# (Patatrack ECAL-only: TTbar - on CPU, on GPU, both, auto) +# (Patatrack HCAL-only: TTbar - on CPU, on GPU, both, auto) # 2023 (TTbar, TTbar PU, TTbar PU premix) # 2024 (TTbar, TTbar PU, TTbar PU premix) numWFIB = [10001.0,10002.0,10003.0,10004.0,10005.0,10006.0,10007.0,10008.0,10009.0,10059.0,10071.0, @@ -34,8 +42,16 @@ 10824.1,10824.5, 10824.6,11024.6,11224.6, 10824.8, + 10842.501,10842.502, # 10842.503,10842.504, + 10824.501,10824.502, # 10824.503,10824.504, + # 10824.511,10824.512,10824.513,10824.514, + # 10824.521,10824.522,10824.523,10824.524, 11650.0,11634.0,11646.0,11640.0,11834.0,11834.99,11846.0,12024.0, 11634.7, + 11650.501,11650.502, # 11650.503,11650.504, + 11634.501,11634.502, # 11634.503,11634.504, + # 11634.511,11634.512,11634.513,11634.514, + # 11634.521,11634.522,11634.523,11634.524, 12434.0,12634.0,12634.99, 12834.0,13034.0,13034.99] for numWF in numWFIB: diff --git a/Configuration/PyReleaseValidation/python/relval_standard.py b/Configuration/PyReleaseValidation/python/relval_standard.py index 95b6e0d7a8a52..5afa5be092aa2 100644 --- a/Configuration/PyReleaseValidation/python/relval_standard.py +++ b/Configuration/PyReleaseValidation/python/relval_standard.py @@ -399,6 +399,14 @@ workflows[136.895] = ['',['RunDisplacedJet2018D','HLTDR2_2018','RECODR2_2018reHLT_skimDisplacedJet_Prompt','HARVEST2018_Prompt']] workflows[136.896] = ['',['RunCharmonium2018D','HLTDR2_2018','RECODR2_2018reHLT_skimCharmonium_Prompt','HARVEST2018_Prompt']] +### run 2018D pixel tracks ### +workflows[136.8855] = ['',['RunHLTPhy2018D','HLTDR2_2018','RECODR2_2018reHLT_Prompt_pixelTrackingOnly','HARVEST2018_pixelTrackingOnly']] +workflows[136.885501] = ['',['RunHLTPhy2018D','HLTDR2_2018','RECODR2_2018reHLT_Patatrack_PixelOnlyCPU','HARVEST2018_pixelTrackingOnly']] +workflows[136.885502] = ['',['RunHLTPhy2018D','HLTDR2_2018','RECODR2_2018reHLT_Patatrack_PixelOnlyGPU','HARVEST2018_pixelTrackingOnly']] +workflows[136.8885] = ['',['RunJetHT2018D','HLTDR2_2018','RECODR2_2018reHLT_Prompt_pixelTrackingOnly','HARVEST2018_pixelTrackingOnly']] +workflows[136.888501] = ['',['RunJetHT2018D','HLTDR2_2018','RECODR2_2018reHLT_Patatrack_PixelOnlyCPU','HARVEST2018_pixelTrackingOnly']] +workflows[136.888502] = ['',['RunJetHT2018D','HLTDR2_2018','RECODR2_2018reHLT_Patatrack_PixelOnlyGPU','HARVEST2018_pixelTrackingOnly']] + # multi-run harvesting workflows[137.8] = ['',['RunEGamma2018C','HLTDR2_2018','RECODR2_2018reHLT_skimEGamma_Offline_L1TEgDQM', 
'RunEGamma2018D','HLTDR2_2018','RECODR2_2018reHLT_skimEGamma_Prompt_L1TEgDQM','HARVEST2018_L1TEgDQM_MULTIRUN']] diff --git a/Configuration/PyReleaseValidation/python/relval_steps.py b/Configuration/PyReleaseValidation/python/relval_steps.py index 46dae0224829a..8af413eb4bdfa 100644 --- a/Configuration/PyReleaseValidation/python/relval_steps.py +++ b/Configuration/PyReleaseValidation/python/relval_steps.py @@ -2110,6 +2110,12 @@ def gen2018HiMix(fragment,howMuch): '--era' :'Run2_2016' } +step3_pixelNtupleFit = { + '--procModifiers': 'pixelNtupleFit', +} +step3_gpu = { + '--procModifiers': 'gpu', +} step3_trackingLowPU = { '--era': 'Run2_2016_trackingLowPU' } @@ -2243,6 +2249,9 @@ def gen2018HiMix(fragment,howMuch): steps['RECODR2_2018reHLT_Prompt']=merge([{'--conditions':'auto:run2_data_promptlike'},steps['RECODR2_2018reHLT']]) steps['RECODR2_2018reHLT_ZBPrompt']=merge([{'--conditions':'auto:run2_data_promptlike','-s':'RAW2DIGI,L1Reco,RECO,EI,PAT,ALCA:SiStripCalZeroBias+SiStripCalMinBias+TkAlMinBias+EcalESAlign,DQM:@rerecoZeroBias+@ExtraHLT+@miniAODDQM'},steps['RECODR2_2018reHLT']]) +steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']=merge([{'-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,DQM:@pixelTrackingOnlyDQM'},steps['RECODR2_2018reHLT_Prompt']]) +steps['RECODR2_2018reHLT_Patatrack_PixelOnlyCPU']=merge([step3_pixelNtupleFit, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) +steps['RECODR2_2018reHLT_Patatrack_PixelOnlyGPU']=merge([step3_gpu, steps['RECODR2_2018reHLT_Prompt_pixelTrackingOnly']]) steps['RECODR2_2018reHLT_Offline']=merge([{'--conditions':'auto:run2_data'},steps['RECODR2_2018reHLT']]) steps['RECODR2_2018reHLT_ZBOffline']=merge([{'--conditions':'auto:run2_data','-s':'RAW2DIGI,L1Reco,RECO,EI,PAT,ALCA:SiStripCalZeroBias+SiStripCalMinBias+TkAlMinBias+EcalESAlign,DQM:@rerecoZeroBias+@ExtraHLT+@miniAODDQM'},steps['RECODR2_2018reHLT']]) steps['RECODR2_2018reHLT_skimEGamma_Prompt_L1TEgDQM']=merge([{'--conditions':'auto:run2_data_promptlike'},steps['RECODR2_2018reHLT_skimEGamma_L1TEgDQM']]) @@ -2579,6 +2588,7 @@ def gen2018HiMix(fragment,howMuch): steps['HARVEST2018_L1TEgDQM_Prompt'] = merge([ {'-s':'HARVESTING:@standardDQMFakeHLT+@miniAODDQM+@L1TEgamma'}, steps['HARVEST2018_Prompt'] ]) steps['HARVEST2018_L1TMuDQM'] = merge([ {'-s':'HARVESTING:@standardDQMFakeHLT+@miniAODDQM+@L1TMuon'}, steps['HARVEST2018'] ]) steps['HARVEST2018_L1TMuDQM_Prompt'] = merge([ {'-s':'HARVESTING:@standardDQMFakeHLT+@miniAODDQM+@L1TMuon'}, steps['HARVEST2018_Prompt'] ]) +steps['HARVEST2018_pixelTrackingOnly'] = merge([ {'-s':'HARVESTING:@pixelTrackingOnlyDQM'}, steps['HARVEST2018'] ]) steps['HARVEST2018_hBStar'] = merge([ {'--era' : 'Run2_2018_highBetaStar'}, steps['HARVEST2018'] ]) steps['HARVEST2018_HEfail'] = merge([ {'--conditions':'auto:run2_data_promptlike_HEfail'}, steps['HARVEST2018'] ]) steps['HARVEST2018_BadHcalMitig'] = merge([ {'--era' : 'Run2_2018,pf_badHcalMitigation','--conditions':'auto:run2_data_promptlike_HEfail'}, steps['HARVEST2018'] ]) diff --git a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py index 1ba1874bad75c..8ecd4285ce005 100644 --- a/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py +++ b/Configuration/PyReleaseValidation/python/upgradeWorkflowComponents.py @@ -272,6 +272,72 @@ def condition_(self, fragment, stepList, key, hasHarvest): '--customise': 'RecoTracker/MkFit/customizeInitialStepToMkFit.customizeInitialStepToMkFit' } +# 
Patatrack workflows +class UpgradeWorkflowPatatrack(UpgradeWorkflow): + def condition(self, fragment, stepList, key, hasHarvest): + is_2018_ttbar = ('2018' in key and fragment=="TTbar_13") + is_2021_ttbar = ('2021' in key and fragment=="TTbar_14TeV") + is_2018_zmumu = ('2018' in key and fragment=="ZMM_13") + is_2021_zmumu = ('2021' in key and fragment=="ZMM_14") + result = any((is_2018_ttbar, is_2021_ttbar, is_2018_zmumu, is_2021_zmumu)) and hasHarvest and self.condition_(fragment, stepList, key, hasHarvest) + if result: + # skip ALCA and Nano + skipList = [s for s in stepList if (("ALCA" in s) or ("Nano" in s))] + for skip in skipList: + stepList.remove(skip) + return result + def condition_(self, fragment, stepList, key, hasHarvest): + return True + +class UpgradeWorkflowPatatrack_PixelOnlyCPU(UpgradeWorkflowPatatrack): + def setup_(self, step, stepName, stepDict, k, properties): + if 'Reco' in step: stepDict[stepName][k] = merge([self.step3, stepDict[step][k]]) + elif 'HARVEST' in step: stepDict[stepName][k] = merge([{'-s': 'HARVESTING:@trackingOnlyValidation+@pixelTrackingOnlyDQM'}, stepDict[step][k]]) + def condition_(self, fragment, stepList, key, hasHarvest): + return '2018' in key or '2021' in key +upgradeWFs['PatatrackPixelOnlyCPU'] = UpgradeWorkflowPatatrack_PixelOnlyCPU( + steps = [ + 'RecoFull', + 'HARVESTFull', + 'RecoFullGlobal', + 'HARVESTFullGlobal', + ], + PU = [], + suffix = 'Patatrack_PixelOnlyCPU', + offset = 0.501, +) +upgradeWFs['PatatrackPixelOnlyCPU'].step3 = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', + '--datatier': 'GEN-SIM-RECO,DQMIO', + '--eventcontent': 'RECOSIM,DQM', + '--procModifiers': 'pixelNtupleFit' +} + +class UpgradeWorkflowPatatrack_PixelOnlyGPU(UpgradeWorkflowPatatrack): + def setup_(self, step, stepName, stepDict, k, properties): + if 'Reco' in step: stepDict[stepName][k] = merge([self.step3, stepDict[step][k]]) + elif 'HARVEST' in step: stepDict[stepName][k] = merge([{'-s': 'HARVESTING:@trackingOnlyValidation+@pixelTrackingOnlyDQM'}, stepDict[step][k]]) + def condition_(self, fragment, stepList, key, hasHarvest): + return '2018' in key or '2021' in key +upgradeWFs['PatatrackPixelOnlyGPU'] = UpgradeWorkflowPatatrack_PixelOnlyGPU( + steps = [ + 'RecoFull', + 'HARVESTFull', + 'RecoFullGlobal', + 'HARVESTFullGlobal', + ], + PU = [], + suffix = 'Patatrack_PixelOnlyGPU', + offset = 0.502, +) +upgradeWFs['PatatrackPixelOnlyGPU'].step3 = { + '-s': 'RAW2DIGI:RawToDigi_pixelOnly,RECO:reconstruction_pixelTrackingOnly,VALIDATION:@pixelTrackingOnlyValidation,DQM:@pixelTrackingOnlyDQM', + '--datatier': 'GEN-SIM-RECO,DQMIO', + '--eventcontent': 'RECOSIM,DQM', + '--procModifiers': 'gpu' +} +# end of Patatrack workflows + class UpgradeWorkflow_ProdLike(UpgradeWorkflow): def setup_(self, step, stepName, stepDict, k, properties): if 'Reco' in step: diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py index 5eead9b7c72bb..d6ac829a1bfb8 100644 --- a/Configuration/StandardSequences/python/RawToDigi_cff.py +++ b/Configuration/StandardSequences/python/RawToDigi_cff.py @@ -3,7 +3,7 @@ # This object is used to selectively make changes for different running # scenarios. In this case it makes changes for Run 2. 
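# Note (context for the change below): siPixelDigis_cff is expected to provide
# a siPixelDigisTask containing both the CPU and the GPU digi producers, with
# siPixelDigis itself becoming a switch between the two implementations; that
# is why the raw-data input label is configured further down on the CPU branch
# (siPixelDigis.cpu.InputLabel) rather than on siPixelDigis directly.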
-from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import * +from EventFilter.SiPixelRawToDigi.siPixelDigis_cff import * from EventFilter.SiStripRawToDigi.SiStripDigis_cfi import * @@ -46,7 +46,7 @@ from EventFilter.CTPPSRawToDigi.ctppsRawToDigi_cff import * RawToDigiTask = cms.Task(L1TRawToDigiTask, - siPixelDigis, + siPixelDigisTask, siStripDigis, ecalDigis, ecalPreshowerDigis, @@ -61,14 +61,14 @@ ) RawToDigi = cms.Sequence(RawToDigiTask) -RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigis, siStripDigis]) +RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigisTask, siStripDigis]) RawToDigi_noTk = cms.Sequence(RawToDigiTask_noTk) -RawToDigiTask_pixelOnly = cms.Task(siPixelDigis) +RawToDigiTask_pixelOnly = cms.Task(siPixelDigisTask, scalersRawToDigi) RawToDigi_pixelOnly = cms.Sequence(RawToDigiTask_pixelOnly) scalersRawToDigi.scalersInputTag = 'rawDataCollector' -siPixelDigis.InputLabel = 'rawDataCollector' +siPixelDigis.cpu.InputLabel = 'rawDataCollector' #false by default anyways ecalDigis.DoRegional = False ecalDigis.InputLabel = 'rawDataCollector' ecalPreshowerDigis.sourceTag = 'rawDataCollector' diff --git a/Configuration/StandardSequences/python/Reconstruction_cff.py b/Configuration/StandardSequences/python/Reconstruction_cff.py index 4b606b213d1cb..52bfc33d5a91e 100644 --- a/Configuration/StandardSequences/python/Reconstruction_cff.py +++ b/Configuration/StandardSequences/python/Reconstruction_cff.py @@ -16,7 +16,7 @@ siPixelClusterShapeCachePreSplitting = siPixelClusterShapeCache.clone( src = 'siPixelClustersPreSplitting' - ) +) # Global reco from RecoEcal.Configuration.RecoEcal_cff import * @@ -197,9 +197,9 @@ reconstruction_trackingOnly = cms.Sequence(localreco*globalreco_tracking) reconstruction_pixelTrackingOnly = cms.Sequence( pixeltrackerlocalreco* - offlineBeamSpot* siPixelClusterShapeCachePreSplitting* - recopixelvertexing + recopixelvertexing, + offlineBeamSpotTask ) #need a fully expanded sequence copy diff --git a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py index 15ceaf93ed20a..cff85e56d94f7 100644 --- a/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py +++ b/DQM/TrackingMonitorClient/python/pixelTrackingEffFromHitPattern_cff.py @@ -21,7 +21,10 @@ def _layers(suffix, quant, histoPostfix): ] pixelTrackingEffFromHitPattern = DQMEDHarvester("DQMGenericClient", - subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/HitEffFromHitPattern*"), + subDirs = cms.untracked.vstring("Tracking/PixelTrackParameters/pixelTracks/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/dzPV0p1/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/pt_0to1/HitEffFromHitPattern*", + "Tracking/PixelTrackParameters/pt_1/HitEffFromHitPattern*"), efficiency = cms.vstring( _layers("PU", "GoodNumVertices", "") + _layers("BX", "BX", "VsBX") + diff --git a/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py new file mode 100644 index 0000000000000..2558e88d26012 --- /dev/null +++ b/DQM/TrackingMonitorClient/python/pixelVertexResolutionClient_cfi.py @@ -0,0 +1,7 @@ +import FWCore.ParameterSet.Config as cms + +from DQM.TrackingMonitorClient.primaryVertexResolutionClient_cfi import primaryVertexResolutionClient as _primaryVertexResolutionClient + +pixelVertexResolutionClient = _primaryVertexResolutionClient.clone( + subDirs = ["OfflinePixelPV/Resolution/*"] +) 
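# Note on the Task/Sequence pattern used throughout the monitoring
# configuration that follows: the helper modules (goodPixelVertices and the
# pt/vertex track selectors) are collected in a cms.Task, and the cms.Sequence
# lists only the monitors while carrying the Task as its second argument,
# schematically
#
#   helpers = cms.Task(goodPixelVertices, pixelTracksPt0to1)
#   monitoringSeq = cms.Sequence(pixelTracksMonitor + pixelTracksMonitorPt0to1, helpers)
#
# so the framework runs the selectors and the vertex filter only because the
# monitors consume their products, without listing them explicitly in any path.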
diff --git a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py index a075f671f05ce..d5deba78b46c8 100644 --- a/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py +++ b/DQM/TrackingMonitorSource/python/pixelTracksMonitoring_cff.py @@ -1,23 +1,77 @@ import FWCore.ParameterSet.Config as cms import DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi -pixelTracksMonitoring = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() -pixelTracksMonitoring.FolderName = 'Tracking/PixelTrackParameters' -pixelTracksMonitoring.TrackProducer = 'pixelTracks' -pixelTracksMonitoring.allTrackProducer = 'pixelTracks' -pixelTracksMonitoring.beamSpot = 'offlineBeamSpot' -pixelTracksMonitoring.primaryVertex = 'pixelVertices' -pixelTracksMonitoring.pvNDOF = 1 -pixelTracksMonitoring.doAllPlots = True -pixelTracksMonitoring.doLumiAnalysis = True -pixelTracksMonitoring.doProfilesVsLS = True -pixelTracksMonitoring.doDCAPlots = True -pixelTracksMonitoring.doProfilesVsLS = True -pixelTracksMonitoring.doPlotsVsGoodPVtx = True -pixelTracksMonitoring.doEffFromHitPatternVsPU = False -pixelTracksMonitoring.doEffFromHitPatternVsBX = False -pixelTracksMonitoring.doEffFromHitPatternVsLUMI = False -pixelTracksMonitoring.doPlotsVsGoodPVtx = True -pixelTracksMonitoring.doPlotsVsLUMI = True -pixelTracksMonitoring.doPlotsVsBX = True +pixelTracksMonitor = DQM.TrackingMonitor.TrackerCollisionTrackingMonitor_cfi.TrackerCollisionTrackMon.clone() +pixelTracksMonitor.FolderName = 'Tracking/PixelTrackParameters/pixelTracks' +pixelTracksMonitor.TrackProducer = 'pixelTracks' +pixelTracksMonitor.allTrackProducer = 'pixelTracks' +pixelTracksMonitor.beamSpot = 'offlineBeamSpot' +pixelTracksMonitor.primaryVertex = 'pixelVertices' +pixelTracksMonitor.pvNDOF = 1 +pixelTracksMonitor.doAllPlots = True +pixelTracksMonitor.doLumiAnalysis = True +pixelTracksMonitor.doProfilesVsLS = True +pixelTracksMonitor.doDCAPlots = True +pixelTracksMonitor.doProfilesVsLS = True +pixelTracksMonitor.doPlotsVsGoodPVtx = True +pixelTracksMonitor.doEffFromHitPatternVsPU = False +pixelTracksMonitor.doEffFromHitPatternVsBX = False +pixelTracksMonitor.doEffFromHitPatternVsLUMI = False +pixelTracksMonitor.doPlotsVsGoodPVtx = True +pixelTracksMonitor.doPlotsVsLUMI = True +pixelTracksMonitor.doPlotsVsBX = True +_trackSelector = cms.EDFilter('TrackSelector', + src = cms.InputTag('pixelTracks'), + cut = cms.string("") +) + +pixelTracksPt0to1 = _trackSelector.clone(cut = "pt >= 0 & pt < 1 ") +pixelTracksPt1 = _trackSelector.clone(cut = "pt >= 1 ") +from DQM.TrackingMonitorSource.TrackCollections2monitor_cff import highPurityPV0p1 as _highPurityPV0p1 +pixelTracksPV0p1 = _highPurityPV0p1.clone( + src = "pixelTracks", + quality = "", + vertexTag = "goodPixelVertices" +) + +pixelTracksMonitorPt0to1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPt0to1", + FolderName = "Tracking/PixelTrackParameters/pt_0to1" +) +pixelTracksMonitorPt1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPt1", + FolderName = "Tracking/PixelTrackParameters/pt_1" +) +pixelTracksMonitorPV0p1 = pixelTracksMonitor.clone( + TrackProducer = "pixelTracksPV0p1", + FolderName = "Tracking/PixelTrackParameters/dzPV0p1" +) + + +from CommonTools.ParticleFlow.goodOfflinePrimaryVertices_cfi import goodOfflinePrimaryVertices as _goodOfflinePrimaryVertices +goodPixelVertices = _goodOfflinePrimaryVertices.clone( + src = "pixelVertices", +) + +from 
DQM.TrackingMonitor.primaryVertexResolution_cfi import primaryVertexResolution as _primaryVertexResolution +pixelVertexResolution = _primaryVertexResolution.clone( + vertexSrc = "goodPixelVertices", + rootFolder = "OfflinePixelPV/Resolution", +) + +pixelTracksMonitoringTask = cms.Task( + goodPixelVertices, + pixelTracksPt0to1, + pixelTracksPt1, + pixelTracksPV0p1, +) + +pixelTracksMonitoring = cms.Sequence( + pixelTracksMonitor + + pixelTracksMonitorPt0to1 + + pixelTracksMonitorPt1 + + pixelTracksMonitorPV0p1 + + pixelVertexResolution, + pixelTracksMonitoringTask +) diff --git a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py index fba049f84295b..a4d8e88aa9a40 100644 --- a/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py +++ b/DQMOffline/Configuration/python/DQMOffline_SecondStep_cff.py @@ -117,6 +117,7 @@ from DQM.CTPPS.ctppsDQM_cff import * from Validation.RecoTau.DQMSequences_cfi import * from DQM.TrackingMonitorClient.pixelTrackingEffFromHitPattern_cff import * +from DQM.TrackingMonitorClient.pixelVertexResolutionClient_cfi import * DQMHarvestDCS = cms.Sequence ( dqmDcsInfoClient ) @@ -175,7 +176,8 @@ DQMHarvestTracking = cms.Sequence( TrackingOfflineDQMClient * dqmFastTimerServiceClient ) -DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern ) +DQMHarvestPixelTracking = cms.Sequence( pixelTrackingEffFromHitPattern * + pixelVertexResolutionClient ) DQMHarvestOuterTracker = cms.Sequence( dqmDcsInfoClient * diff --git a/DQMOffline/Configuration/python/DQMOffline_cff.py b/DQMOffline/Configuration/python/DQMOffline_cff.py index d729c97e2c7c8..a54a84f0d04dd 100644 --- a/DQMOffline/Configuration/python/DQMOffline_cff.py +++ b/DQMOffline/Configuration/python/DQMOffline_cff.py @@ -138,10 +138,12 @@ #DQMOfflineCommon from DQM.TrackingMonitorSource.pixelTracksMonitoring_cff import * +from DQMOffline.RecoB.PixelVertexMonitor_cff import * from DQM.SiOuterTracker.OuterTrackerSourceConfig_cff import * from Validation.RecoTau.DQMSequences_cfi import * -DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring ) +DQMOfflinePixelTracking = cms.Sequence( pixelTracksMonitoring * + pixelPVMonitor ) DQMOuterTracker = cms.Sequence( DQMOfflineDCS * OuterTrackerSource * diff --git a/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py new file mode 100644 index 0000000000000..3c2e3d7d6700e --- /dev/null +++ b/DQMOffline/RecoB/python/PixelVertexMonitor_cff.py @@ -0,0 +1,7 @@ +import FWCore.ParameterSet.Config as cms + +from DQMOffline.RecoB.PrimaryVertexMonitor_cff import pvMonitor as _pvMonitor +pixelPVMonitor = _pvMonitor.clone( + TopFolderName = "OfflinePixelPV", + vertexLabel = "pixelVertices", +) diff --git a/DataFormats/CaloRecHit/test/BuildFile.xml b/DataFormats/CaloRecHit/test/BuildFile.xml index 983e853f47698..6daf8cf086086 100644 --- a/DataFormats/CaloRecHit/test/BuildFile.xml +++ b/DataFormats/CaloRecHit/test/BuildFile.xml @@ -1,13 +1,14 @@ - - - - - + + + + + + - + diff --git a/DataFormats/CaloRecHit/test/test_calo_rechit.cu b/DataFormats/CaloRecHit/test/test_calo_rechit.cu index 9c24668240425..301db13ba508f 100644 --- a/DataFormats/CaloRecHit/test/test_calo_rechit.cu +++ b/DataFormats/CaloRecHit/test/test_calo_rechit.cu @@ -1,10 +1,11 @@ +#include +#include + #include #include -#include -#include - #include "DataFormats/CaloRecHit/interface/CaloRecHit.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" __global__ 
void kernel_test_calo_rechit(CaloRecHit* other) { CaloRecHit rh{DetId(0), 10, 1, 0, 0}; @@ -42,12 +43,9 @@ void test_calo_rechit() { } int main(int argc, char** argv) { - int nDevices; - cudaGetDeviceCount(&nDevices); - std::cout << "nDevices = " << nDevices << std::endl; + exitSansCUDADevices(); - if (nDevices > 0) - test_calo_rechit(); + test_calo_rechit(); std::cout << "all good!" << std::endl; return 0; diff --git a/DataFormats/DetId/test/BuildFile.xml b/DataFormats/DetId/test/BuildFile.xml index 0cccd9fb0d26b..376a8bdc397ad 100644 --- a/DataFormats/DetId/test/BuildFile.xml +++ b/DataFormats/DetId/test/BuildFile.xml @@ -1,6 +1,7 @@ - + + diff --git a/DataFormats/DetId/test/test_detid.cu b/DataFormats/DetId/test/test_detid.cu index 1c0d4ec13b89e..16379427851c4 100644 --- a/DataFormats/DetId/test/test_detid.cu +++ b/DataFormats/DetId/test/test_detid.cu @@ -1,10 +1,12 @@ -#include +#include +#include + #include +#include -#include -#include #include "DataFormats/DetId/interface/DetId.h" #include "DataFormats/HcalDetId/interface/HcalDetId.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" __global__ void test_gen_detid(DetId* id, uint32_t const rawid) { DetId did{rawid}; @@ -27,11 +29,8 @@ void test_detid() { } int main(int argc, char** argv) { - int nDevices; - cudaGetDeviceCount(&nDevices); - std::cout << "nDevices = " << nDevices << std::endl; + exitSansCUDADevices(); // test det id functionality - if (nDevices > 0) - test_detid(); + test_detid(); } diff --git a/DataFormats/GeometrySurface/interface/SOARotation.h b/DataFormats/GeometrySurface/interface/SOARotation.h index 331a56b7ecf57..d75efef4736cb 100644 --- a/DataFormats/GeometrySurface/interface/SOARotation.h +++ b/DataFormats/GeometrySurface/interface/SOARotation.h @@ -100,6 +100,34 @@ class SOAFrame { uz += pz; } + constexpr inline void toGlobal(T cxx, T cxy, T cyy, T *gl) const { + auto const &r = rot; + gl[0] = r.xx() * (r.xx() * cxx + r.yx() * cxy) + r.yx() * (r.xx() * cxy + r.yx() * cyy); + gl[1] = r.xx() * (r.xy() * cxx + r.yy() * cxy) + r.yx() * (r.xy() * cxy + r.yy() * cyy); + gl[2] = r.xy() * (r.xy() * cxx + r.yy() * cxy) + r.yy() * (r.xy() * cxy + r.yy() * cyy); + gl[3] = r.xx() * (r.xz() * cxx + r.yz() * cxy) + r.yx() * (r.xz() * cxy + r.yz() * cyy); + gl[4] = r.xy() * (r.xz() * cxx + r.yz() * cxy) + r.yy() * (r.xz() * cxy + r.yz() * cyy); + gl[5] = r.xz() * (r.xz() * cxx + r.yz() * cxy) + r.yz() * (r.xz() * cxy + r.yz() * cyy); + } + + constexpr inline void toLocal(T const *ge, T &lxx, T &lxy, T &lyy) const { + auto const &r = rot; + + T cxx = ge[0]; + T cyx = ge[1]; + T cyy = ge[2]; + T czx = ge[3]; + T czy = ge[4]; + T czz = ge[5]; + + lxx = r.xx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.xy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.xz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lxy = r.yx() * (r.xx() * cxx + r.xy() * cyx + r.xz() * czx) + + r.yy() * (r.xx() * cyx + r.xy() * cyy + r.xz() * czy) + r.yz() * (r.xx() * czx + r.xy() * czy + r.xz() * czz); + lyy = r.yx() * (r.yx() * cxx + r.yy() * cyx + r.yz() * czx) + + r.yy() * (r.yx() * cyx + r.yy() * cyy + r.yz() * czy) + r.yz() * (r.yx() * czx + r.yy() * czy + r.yz() * czz); + } + constexpr inline T x() const { return px; } constexpr inline T y() const { return py; } constexpr inline T z() const { return pz; } diff --git a/DataFormats/GeometrySurface/test/BuildFile.xml b/DataFormats/GeometrySurface/test/BuildFile.xml index 050cdb4c8f19d..5f4db224a639b 100644 --- a/DataFormats/GeometrySurface/test/BuildFile.xml 
+++ b/DataFormats/GeometrySurface/test/BuildFile.xml @@ -13,3 +13,16 @@ + + + + + + + + + + + + + diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu new file mode 100644 index 0000000000000..9af9f5bef600a --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformKernel.cu @@ -0,0 +1,40 @@ +#include +#include +#include + +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + +__global__ void toGlobal(SOAFrame const* frame, + float const* xl, + float const* yl, + float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= n) + return; + + frame[0].toGlobal(xl[i], yl[i], x[i], y[i], z[i]); + frame[0].toGlobal(le[3 * i], le[3 * i + 1], le[3 * i + 2], ge + 6 * i); +} + +void toGlobalWrapper(SOAFrame const* frame, + float const* xl, + float const* yl, + float* x, + float* y, + float* z, + float const* le, + float* ge, + uint32_t n) { + int threadsPerBlock = 256; + int blocksPerGrid = (n + threadsPerBlock - 1) / threadsPerBlock; + std::cout << "CUDA toGlobal kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads" + << std::endl; + + cudautils::launch(toGlobal, {blocksPerGrid, threadsPerBlock}, frame, xl, yl, x, y, z, le, ge, n); +} diff --git a/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp new file mode 100644 index 0000000000000..8c18054e0deb5 --- /dev/null +++ b/DataFormats/GeometrySurface/test/gpuFrameTransformTest.cpp @@ -0,0 +1,114 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "DataFormats/GeometrySurface/interface/GloballyPositioned.h" +#include "DataFormats/GeometrySurface/interface/SOARotation.h" +#include "DataFormats/GeometrySurface/interface/TkRotation.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +void toGlobalWrapper(SOAFrame const *frame, + float const *xl, + float const *yl, + float *x, + float *y, + float *z, + float const *le, + float *ge, + uint32_t n); + +int main(void) { + exitSansCUDADevices(); + + typedef float T; + typedef TkRotation Rotation; + typedef SOARotation SRotation; + typedef GloballyPositioned Frame; + typedef SOAFrame SFrame; + typedef typename Frame::PositionType Position; + typedef typename Frame::GlobalVector GlobalVector; + typedef typename Frame::GlobalPoint GlobalPoint; + typedef typename Frame::LocalVector LocalVector; + typedef typename Frame::LocalPoint LocalPoint; + + constexpr uint32_t size = 10000; + constexpr uint32_t size32 = size * sizeof(float); + + float xl[size], yl[size]; + float x[size], y[size], z[size]; + + // errors + float le[3 * size]; + float ge[6 * size]; + + auto d_xl = cudautils::make_device_unique(size, nullptr); + auto d_yl = cudautils::make_device_unique(size, nullptr); + + auto d_x = cudautils::make_device_unique(size, nullptr); + auto d_y = cudautils::make_device_unique(size, nullptr); + auto d_z = cudautils::make_device_unique(size, nullptr); + + auto d_le = cudautils::make_device_unique(3 * size, nullptr); + auto d_ge = cudautils::make_device_unique(6 * size, nullptr); + + double a = 0.01; + double ca = std::cos(a); + double sa = std::sin(a); + + 
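  // r1 below is a rotation by the angle a about the global z axis, so f1 is a
  // translated and rotated copy of the global frame.  The test copies the
  // corresponding SOAFrame to the device, transforms `size` local points (and
  // their 2x2 local covariances, packed per hit as cxx, cxy, cyy in `le`) to
  // global coordinates on the GPU, and compares the positions with the CPU
  // GloballyPositioned::toGlobal, printing the maximum deviation ("max eps");
  // the transformed covariances are copied back into `ge` but not checked here.
  // SOAFrame::toGlobal(cxx, cxy, cyy, ge) fills the six independent elements of
  // the symmetric 3x3 global covariance (packed as xx, xy, yy, xz, yz, zz), the
  // counterpart of the toLocal added to SOARotation.h in this same PR.
  // Note that the initialisation loop below writes le[2 * i + 1], where
  // le[3 * i + 1] (the cxy term) was presumably intended.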
Rotation r1(ca, sa, 0, -sa, ca, 0, 0, 0, 1); + Frame f1(Position(2, 3, 4), r1); + std::cout << "f1.position() " << f1.position() << std::endl; + std::cout << "f1.rotation() " << '\n' << f1.rotation() << std::endl; + + SFrame sf1(f1.position().x(), f1.position().y(), f1.position().z(), f1.rotation()); + + auto d_sf = cudautils::make_device_unique(sizeof(SFrame), nullptr); + cudaCheck(cudaMemcpy(d_sf.get(), &sf1, sizeof(SFrame), cudaMemcpyHostToDevice)); + + for (auto i = 0U; i < size; ++i) { + xl[i] = yl[i] = 0.1f * float(i) - float(size / 2); + le[3 * i] = 0.01f; + le[3 * i + 2] = (i > size / 2) ? 1.f : 0.04f; + le[2 * i + 1] = 0.; + } + std::random_shuffle(xl, xl + size); + std::random_shuffle(yl, yl + size); + + cudaCheck(cudaMemcpy(d_xl.get(), xl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_yl.get(), yl, size32, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_le.get(), le, 3 * size32, cudaMemcpyHostToDevice)); + + toGlobalWrapper((SFrame const *)(d_sf.get()), + d_xl.get(), + d_yl.get(), + d_x.get(), + d_y.get(), + d_z.get(), + d_le.get(), + d_ge.get(), + size); + cudaCheck(cudaMemcpy(x, d_x.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(y, d_y.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(z, d_z.get(), size32, cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(ge, d_ge.get(), 6 * size32, cudaMemcpyDeviceToHost)); + + float eps = 0.; + for (auto i = 0U; i < size; ++i) { + auto gp = f1.toGlobal(LocalPoint(xl[i], yl[i])); + eps = std::max(eps, std::abs(x[i] - gp.x())); + eps = std::max(eps, std::abs(y[i] - gp.y())); + eps = std::max(eps, std::abs(z[i] - gp.z())); + } + + std::cout << "max eps " << eps << std::endl; + + return 0; +} diff --git a/DataFormats/HcalDetId/test/BuildFile.xml b/DataFormats/HcalDetId/test/BuildFile.xml index e49b14a5cf4bd..722976b366f1c 100644 --- a/DataFormats/HcalDetId/test/BuildFile.xml +++ b/DataFormats/HcalDetId/test/BuildFile.xml @@ -2,5 +2,6 @@ + diff --git a/DataFormats/HcalDetId/test/test_hcal_detid.cu b/DataFormats/HcalDetId/test/test_hcal_detid.cu index 3f885984a9778..1696d645ffb01 100644 --- a/DataFormats/HcalDetId/test/test_hcal_detid.cu +++ b/DataFormats/HcalDetId/test/test_hcal_detid.cu @@ -1,10 +1,12 @@ -#include +#include +#include + #include +#include -#include -#include #include "DataFormats/DetId/interface/DetId.h" #include "DataFormats/HcalDetId/interface/HcalDetId.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" __global__ void test_gen_detid(DetId *id) { DetId did; @@ -63,17 +65,13 @@ void test_hcal_detid() { } int main(int argc, char **argv) { - int nDevices; - cudaGetDeviceCount(&nDevices); - std::cout << "nDevices = " << nDevices << std::endl; + exitSansCUDADevices(); // test det id functionality - if (nDevices > 0) - test_detid(); + test_detid(); // test hcal det ids - if (nDevices > 0) - test_hcal_detid(); + test_hcal_detid(); return 0; } diff --git a/DataFormats/HcalDigi/test/BuildFile.xml b/DataFormats/HcalDigi/test/BuildFile.xml index 5fad363ed130e..c4a84c577a380 100644 --- a/DataFormats/HcalDigi/test/BuildFile.xml +++ b/DataFormats/HcalDigi/test/BuildFile.xml @@ -1,14 +1,15 @@ - - - - - + + + + + - - - - + + + + + diff --git a/DataFormats/HcalDigi/test/test_hcal_digi.cu b/DataFormats/HcalDigi/test/test_hcal_digi.cu index 295607e5abb62..d423ab835e352 100644 --- a/DataFormats/HcalDigi/test/test_hcal_digi.cu +++ b/DataFormats/HcalDigi/test/test_hcal_digi.cu @@ -1,15 +1,17 @@ -#include +#include +#include + #include +#include -#include -#include +#include 
"DataFormats/Common/interface/DataFrame.h" #include "DataFormats/DetId/interface/DetId.h" #include "DataFormats/HcalDetId/interface/HcalDetId.h" #include "DataFormats/HcalDigi/interface/HBHEDataFrame.h" +#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h" #include "DataFormats/HcalDigi/interface/QIE10DataFrame.h" #include "DataFormats/HcalDigi/interface/QIE11DataFrame.h" -#include "DataFormats/HcalDigi/interface/HcalDigiCollections.h" -#include "DataFormats/Common/interface/DataFrame.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" __global__ void kernel_test_hcal_qiesample(HcalQIESample *sample, uint16_t value) { printf("kernel: testing hcal qie sampel\n"); @@ -69,7 +71,6 @@ void test_hcal_qie1011_digis() { constexpr int samples = 10; constexpr int detid = 2; HcalDataFrameContainer coll{samples, detid}; - TDF *d_dfs; uint16_t *d_data; uint32_t *d_out; uint32_t h_out[size], h_test_out[size]; @@ -162,22 +163,18 @@ void test_hcal_qie8_hbhedf() { } int main(int argc, char **argv) { - int nDevices; - cudaGetDeviceCount(&nDevices); - std::cout << "nDevices = " << nDevices << std::endl; - - if (nDevices > 0) { - // qie8 - test_hcal_qiesample(); - test_hcal_qie8_hbhedf(); - test_hcal_qie8_digis(); - test_hcal_qie8_digis(); - test_hcal_qie8_digis(); - - // qie1011 - test_hcal_qie1011_digis(); - test_hcal_qie1011_digis(); - } + exitSansCUDADevices(); + + // qie8 + test_hcal_qiesample(); + test_hcal_qie8_hbhedf(); + test_hcal_qie8_digis(); + test_hcal_qie8_digis(); + test_hcal_qie8_digis(); + + // qie1011 + test_hcal_qie1011_digis(); + test_hcal_qie1011_digis(); return 0; } diff --git a/DataFormats/HcalRecHit/test/BuildFile.xml b/DataFormats/HcalRecHit/test/BuildFile.xml index d0eacceebeddd..90bac3b024c0e 100644 --- a/DataFormats/HcalRecHit/test/BuildFile.xml +++ b/DataFormats/HcalRecHit/test/BuildFile.xml @@ -1,15 +1,16 @@ - - - - - + + + + + - - - - - + + + + + + diff --git a/DataFormats/HcalRecHit/test/test_hcal_reco.cu b/DataFormats/HcalRecHit/test/test_hcal_reco.cu index 4af8c3c065e2c..dfd5a7aead864 100644 --- a/DataFormats/HcalRecHit/test/test_hcal_reco.cu +++ b/DataFormats/HcalRecHit/test/test_hcal_reco.cu @@ -10,6 +10,7 @@ #include "DataFormats/HcalRecHit/interface/HORecHit.h" #include "DataFormats/HcalRecHit/interface/HFQIE10Info.h" #include "DataFormats/HcalRecHit/interface/HBHEChannelInfo.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" template __global__ void kernel_test_hcal_rechits(T *other) { @@ -109,18 +110,13 @@ void test_hcal_hbhechinfo() { } int main(int argc, char **argv) { - int nDevices; - cudaGetDeviceCount(&nDevices); - std::cout << "nDevices = " << nDevices << std::endl; + exitSansCUDADevices(); - if (nDevices > 0) { - test_hcal_rechits(); - test_hcal_rechits(); - test_hcal_rechits(); - test_hcal_hbhechinfo(); - - std::cout << "all good" << std::endl; - } + test_hcal_rechits(); + test_hcal_rechits(); + test_hcal_rechits(); + test_hcal_hbhechinfo(); + std::cout << "all good" << std::endl; return 0; } diff --git a/DataFormats/Math/BuildFile.xml b/DataFormats/Math/BuildFile.xml index 6aa1d86287860..83d06125a017c 100644 --- a/DataFormats/Math/BuildFile.xml +++ b/DataFormats/Math/BuildFile.xml @@ -1,6 +1,7 @@ - - + + + - + diff --git a/DataFormats/Math/interface/choleskyInversion.h b/DataFormats/Math/interface/choleskyInversion.h new file mode 100644 index 0000000000000..eba2a17648008 --- /dev/null +++ b/DataFormats/Math/interface/choleskyInversion.h @@ -0,0 +1,333 @@ +#ifndef 
DataFormat_Math_choleskyInversion_h +#define DataFormat_Math_choleskyInversion_h + +#include + +#include + +/** + * fully inlined specialized code to perform the inversion of a + * positive defined matrix of rank up to 6. + * + * originally by + * @author Manuel Schiller + * @date Aug 29 2008 + * + * + */ +namespace choleskyInversion { + + template + inline constexpr void invert11(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + dst(0, 0) = F(1.0) / src(0, 0); + } + + template + inline constexpr void invert22(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0) * src(1, 0) * luc0; + auto luc2 = F(1.0) / (src(1, 1) - luc1); + + auto li21 = luc1 * luc0 * luc2; + + dst(0, 0) = li21 + luc0; + dst(1, 0) = -src(1, 0) * luc0 * luc2; + dst(1, 1) = luc2; + } + + template + inline constexpr void invert33(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + (luc2 * luc4) * luc4); + luc5 = F(1.0) / luc5; + + auto li21 = -luc0 * luc1; + auto li32 = -(luc2 * luc4); + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + + dst(0, 0) = luc5 * li31 * li31 + li21 * li21 * luc2 + luc0; + dst(1, 0) = luc5 * li31 * li32 + li21 * luc2; + dst(1, 1) = luc5 * li32 * li32 + luc2; + dst(2, 0) = luc5 * li31; + dst(2, 1) = luc5 * li32; + dst(2, 2) = luc5; + } + + template + inline constexpr void invert44(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + + dst(0, 0) = luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc9 * li43 * li43 + luc5; + dst(3, 0) = luc9 * li41; + dst(3, 1) = luc9 * li42; + dst(3, 2) = luc9 * li43; + dst(3, 3) = luc9; + } + + template + inline constexpr void invert55(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto 
luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + dst(0, 0) = luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc14 * li54 * li54 + luc9; + dst(4, 0) = luc14 * li51; + dst(4, 1) = luc14 * li52; + dst(4, 2) = luc14 * li53; + dst(4, 3) = luc14 * li54; + dst(4, 4) = luc14; + } + + template + inline __attribute__((always_inline)) constexpr void invert66(M1 const& src, M2& dst) { + using F = decltype(src(0, 0)); + auto luc0 = F(1.0) / src(0, 0); + auto luc1 = src(1, 0); + auto luc2 = src(1, 1) - luc0 * luc1 * luc1; + luc2 = F(1.0) / luc2; + auto luc3 = src(2, 0); + auto luc4 = (src(2, 1) - luc0 * luc1 * luc3); + auto luc5 = src(2, 2) - (luc0 * luc3 * luc3 + luc2 * luc4 * luc4); + luc5 = F(1.0) / luc5; + auto luc6 = src(3, 0); + auto luc7 = (src(3, 1) - luc0 * luc1 * luc6); + auto luc8 = (src(3, 2) - luc0 * luc3 * luc6 - luc2 * luc4 * luc7); + auto luc9 = src(3, 3) - (luc0 * luc6 * luc6 + luc2 * luc7 * luc7 + luc8 * (luc8 * luc5)); + luc9 = F(1.0) / luc9; + auto luc10 = src(4, 0); + auto luc11 = (src(4, 1) - luc0 * luc1 * luc10); + auto luc12 = (src(4, 2) - luc0 * luc3 * luc10 - luc2 * luc4 * luc11); + auto luc13 = (src(4, 3) - luc0 * luc6 * luc10 - luc2 * luc7 * luc11 - luc5 * luc8 * luc12); + auto luc14 = + src(4, 4) - (luc0 * luc10 * luc10 + luc2 * luc11 * luc11 + luc5 * luc12 * luc12 + luc9 * luc13 * luc13); + luc14 = F(1.0) / luc14; + auto luc15 = src(5, 0); + auto luc16 = (src(5, 1) - luc0 * luc1 * luc15); + auto luc17 = (src(5, 2) - luc0 * luc3 * luc15 - luc2 * luc4 * luc16); + auto luc18 = (src(5, 3) - luc0 * luc6 * luc15 - luc2 * luc7 * luc16 - luc5 * luc8 * luc17); + auto luc19 = + (src(5, 4) - luc0 * luc10 * luc15 - luc2 * luc11 * luc16 - luc5 * luc12 * luc17 - luc9 * luc13 * luc18); + auto luc20 = src(5, 5) - (luc0 * luc15 * 
luc15 + luc2 * luc16 * luc16 + luc5 * luc17 * luc17 + + luc9 * luc18 * luc18 + luc14 * luc19 * luc19); + luc20 = F(1.0) / luc20; + + auto li21 = -luc1 * luc0; + auto li32 = -luc2 * luc4; + auto li31 = (luc1 * (luc2 * luc4) - luc3) * luc0; + auto li43 = -(luc8 * luc5); + auto li42 = (luc4 * luc8 * luc5 - luc7) * luc2; + auto li41 = (-luc1 * (luc2 * luc4) * (luc8 * luc5) + luc1 * (luc2 * luc7) + luc3 * (luc8 * luc5) - luc6) * luc0; + auto li54 = -luc13 * luc9; + auto li53 = (luc13 * luc8 * luc9 - luc12) * luc5; + auto li52 = (-luc4 * luc8 * luc13 * luc5 * luc9 + luc4 * luc12 * luc5 + luc7 * luc13 * luc9 - luc11) * luc2; + auto li51 = (luc1 * luc4 * luc8 * luc13 * luc2 * luc5 * luc9 - luc13 * luc8 * luc3 * luc9 * luc5 - + luc12 * luc4 * luc1 * luc2 * luc5 - luc13 * luc7 * luc1 * luc9 * luc2 + luc11 * luc1 * luc2 + + luc12 * luc3 * luc5 + luc13 * luc6 * luc9 - luc10) * + luc0; + + auto li65 = -luc19 * luc14; + auto li64 = (luc19 * luc14 * luc13 - luc18) * luc9; + auto li63 = (-luc8 * luc13 * (luc19 * luc14) * luc9 + luc8 * luc9 * luc18 + luc12 * (luc19 * luc14) - luc17) * luc5; + auto li62 = (luc4 * (luc8 * luc9) * luc13 * luc5 * (luc19 * luc14) - luc18 * luc4 * (luc8 * luc9) * luc5 - + luc19 * luc12 * luc4 * luc14 * luc5 - luc19 * luc13 * luc7 * luc14 * luc9 + luc17 * luc4 * luc5 + + luc18 * luc7 * luc9 + luc19 * luc11 * luc14 - luc16) * + luc2; + auto li61 = + (-luc19 * luc13 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 * luc14 + + luc18 * luc8 * luc4 * luc1 * luc2 * luc5 * luc9 + luc19 * luc12 * luc4 * luc1 * luc2 * luc5 * luc14 + + luc19 * luc13 * luc7 * luc1 * luc2 * luc9 * luc14 + luc19 * luc13 * luc8 * luc3 * luc5 * luc9 * luc14 - + luc17 * luc4 * luc1 * luc2 * luc5 - luc18 * luc7 * luc1 * luc2 * luc9 - luc19 * luc11 * luc1 * luc2 * luc14 - + luc18 * luc8 * luc3 * luc5 * luc9 - luc19 * luc12 * luc3 * luc5 * luc14 - luc19 * luc13 * luc6 * luc9 * luc14 + + luc16 * luc1 * luc2 + luc17 * luc3 * luc5 + luc18 * luc6 * luc9 + luc19 * luc10 * luc14 - luc15) * + luc0; + + dst(0, 0) = + luc20 * li61 * li61 + luc14 * li51 * li51 + luc9 * li41 * li41 + luc5 * li31 * li31 + luc2 * li21 * li21 + luc0; + dst(1, 0) = luc20 * li61 * li62 + luc14 * li51 * li52 + luc9 * li41 * li42 + luc5 * li31 * li32 + luc2 * li21; + dst(1, 1) = luc20 * li62 * li62 + luc14 * li52 * li52 + luc9 * li42 * li42 + luc5 * li32 * li32 + luc2; + dst(2, 0) = luc20 * li61 * li63 + luc14 * li51 * li53 + luc9 * li41 * li43 + luc5 * li31; + dst(2, 1) = luc20 * li62 * li63 + luc14 * li52 * li53 + luc9 * li42 * li43 + luc5 * li32; + dst(2, 2) = luc20 * li63 * li63 + luc14 * li53 * li53 + luc9 * li43 * li43 + luc5; + dst(3, 0) = luc20 * li61 * li64 + luc14 * li51 * li54 + luc9 * li41; + dst(3, 1) = luc20 * li62 * li64 + luc14 * li52 * li54 + luc9 * li42; + dst(3, 2) = luc20 * li63 * li64 + luc14 * li53 * li54 + luc9 * li43; + dst(3, 3) = luc20 * li64 * li64 + luc14 * li54 * li54 + luc9; + dst(4, 0) = luc20 * li61 * li65 + luc14 * li51; + dst(4, 1) = luc20 * li62 * li65 + luc14 * li52; + dst(4, 2) = luc20 * li63 * li65 + luc14 * li53; + dst(4, 3) = luc20 * li64 * li65 + luc14 * li54; + dst(4, 4) = luc20 * li65 * li65 + luc14; + dst(5, 0) = luc20 * li61; + dst(5, 1) = luc20 * li62; + dst(5, 2) = luc20 * li63; + dst(5, 3) = luc20 * li64; + dst(5, 4) = luc20 * li65; + dst(5, 5) = luc20; + } + + template + inline constexpr void symmetrize11(M& dst) {} + template + inline constexpr void symmetrize22(M& dst) { + dst(0, 1) = dst(1, 0); + } + template + inline constexpr void symmetrize33(M& dst) { + symmetrize22(dst); + dst(0, 2) = dst(2, 0); + 
dst(1, 2) = dst(2, 1); + } + template + inline constexpr void symmetrize44(M& dst) { + symmetrize33(dst); + dst(0, 3) = dst(3, 0); + dst(1, 3) = dst(3, 1); + dst(2, 3) = dst(3, 2); + } + template + inline constexpr void symmetrize55(M& dst) { + symmetrize44(dst); + dst(0, 4) = dst(4, 0); + dst(1, 4) = dst(4, 1); + dst(2, 4) = dst(4, 2); + dst(3, 4) = dst(4, 3); + } + template + inline constexpr void symmetrize66(M& dst) { + symmetrize55(dst); + dst(0, 5) = dst(5, 0); + dst(1, 5) = dst(5, 1); + dst(2, 5) = dst(5, 2); + dst(3, 5) = dst(5, 3); + dst(4, 5) = dst(5, 4); + } + + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { dst = src.inverse(); } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { invert11(src, dst); } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert22(src, dst); + symmetrize22(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert33(src, dst); + symmetrize33(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert44(src, dst); + symmetrize44(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert55(src, dst); + symmetrize55(dst); + } + }; + template + struct Inverter { + static constexpr void eval(M1 const& src, M2& dst) { + invert66(src, dst); + symmetrize66(dst); + } + }; + + // Eigen interface + template + inline constexpr void invert(Eigen::DenseBase const& src, Eigen::DenseBase& dst) { + using M1 = Eigen::DenseBase; + using M2 = Eigen::DenseBase; + Inverter::eval(src, dst); + } + +} // namespace choleskyInversion + +#endif // DataFormat_Math_choleskyInversion_h diff --git a/DataFormats/Math/test/BuildFile.xml b/DataFormats/Math/test/BuildFile.xml index 6b1112e30472c..a312bfd89f06a 100644 --- a/DataFormats/Math/test/BuildFile.xml +++ b/DataFormats/Math/test/BuildFile.xml @@ -1,27 +1,31 @@ - - - - + + + + - - - - + + + + - + + - + + - + + - + + @@ -29,75 +33,96 @@ - + - + + - + + - + + - + + - + + + + + - + - + - + + - + + - + + + + - + + + - - + + - - + + - - + + + + + + + + diff --git a/DataFormats/Math/test/CholeskyInvert_t.cpp b/DataFormats/Math/test/CholeskyInvert_t.cpp new file mode 100644 index 0000000000000..4c0b064da6ed1 --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cpp @@ -0,0 +1,136 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride >; + +// generate matrices +template +void genMatrix(M& m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} 
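For reference, the call pattern for the new `choleskyInversion::invert` is the same on CPU and GPU: pass a symmetric positive-definite Eigen matrix (or a strided `Eigen::Map` of one, as in the SOA layout used by these tests) as source and destination. The sketch below is illustrative only — the function name `checkInvert3x3` and the matrix values are hypothetical and not part of this test — and assumes only Eigen plus the header added above; it cross-checks the specialized 3×3 path against Eigen's generic `inverse()`.

```cpp
// Minimal, self-contained sanity check (illustrative, not part of the benchmark):
// invert a small symmetric positive-definite matrix with choleskyInversion::invert
// and compare the result with Eigen's generic inverse().
#include <cassert>

#include <Eigen/Dense>

#include "DataFormats/Math/interface/choleskyInversion.h"

inline void checkInvert3x3() {
  using M3 = Eigen::Matrix<double, 3, 3>;

  // Symmetric and diagonally dominant, hence positive definite.
  M3 m;
  m << 4.0, 1.0, 0.5,
       1.0, 3.0, 0.2,
       0.5, 0.2, 2.0;

  M3 inv;
  choleskyInversion::invert(m, inv);  // dispatches to invert33(...) + symmetrize33(...)

  // Both results should agree to numerical precision for a well-conditioned matrix.
  M3 ref = m.inverse();
  assert((inv - ref).cwiseAbs().maxCoeff() < 1e-12);

  // Inverting the inverse must reproduce the original matrix (up to rounding).
  M3 back;
  choleskyInversion::invert(inv, back);
  assert((back - m).cwiseAbs().maxCoeff() < 1e-9);
}
```

In-place use, `choleskyInversion::invert(m, m)`, is what the loops below rely on; it works because each `invertNN` reads all the needed `src` entries into local variables before writing any `dst` entry.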
+ +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << " in " << (soa ? "SOA" : "AOS") + << " mode" << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + constexpr unsigned int SIZE = 4 * 1024; + + alignas(128) MX mm[stride()]; // just storage in case of SOA + double* __restrict__ p = (double*)__builtin_assume_aligned(mm, 128); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto& m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto& m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep +#pragma clang loop vectorize(enable) interleave(enable) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto& m : mm) { + choleskyInversion::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + double DNNK = NKK; + std::cout << "x86 computation took " << std::chrono::duration_cast(delta2).count() / DNNK + << ' ' << " ms" << std::endl; +} + +int main() { + go<2>(false); + go<3>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<3>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/CholeskyInvert_t.cu b/DataFormats/Math/test/CholeskyInvert_t.cu new file mode 100644 index 0000000000000..f2e440d6009ff --- /dev/null +++ b/DataFormats/Math/test/CholeskyInvert_t.cu @@ -0,0 +1,214 @@ +// nvcc -O3 CholeskyDecomp_t.cu --expt-relaxed-constexpr -gencode arch=compute_61,code=sm_61 --compiler-options="-Ofast -march=native" +// add -DDOPROF to run nvprof --metrics all + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "DataFormats/Math/interface/choleskyInversion.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" + +constexpr int stride() { return 5 * 1024; } +template +using MXN = Eigen::Matrix; +template +using MapMX = Eigen::Map, 0, Eigen::Stride>; + +template +__global__ void invertSOA(double *__restrict__ p, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + MapMX m(p + i); + choleskyInversion::invert(m, m); +} + +template +__global__ void invert(M *mm, unsigned int n) { + auto i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) + return; + + auto &m = mm[i]; + choleskyInversion::invert(m, m); +} + +template +__global__ void invertSeq(M *mm, unsigned int n) { + if (threadIdx.x != 0) + return; + auto 
first = blockIdx.x * blockDim.x; + auto last = std::min(first + blockDim.x, n); + + for (auto i = first; i < last; ++i) { + auto &m = mm[i]; + choleskyInversion::invert(m, m); + } +} + +// generate matrices +template +void genMatrix(M &m) { + using T = typename std::remove_reference::type; + int n = M::ColsAtCompileTime; + std::mt19937 eng; + // std::mt19937 eng2; + std::uniform_real_distribution rgen(0., 1.); + + // generate first diagonal elemets + for (int i = 0; i < n; ++i) { + double maxVal = i * 10000 / (n - 1) + 1; // max condition is 10^4 + m(i, i) = maxVal * rgen(eng); + } + for (int i = 0; i < n; ++i) { + for (int j = 0; j < i; ++j) { + double v = 0.3 * std::sqrt(m(i, i) * m(j, j)); // this makes the matrix pos defined + m(i, j) = v * rgen(eng); + m(j, i) = m(i, j); + } + } +} + +template +void go(bool soa) { + constexpr unsigned int DIM = N; + using MX = MXN; + std::cout << "testing Matrix of dimension " << DIM << " size " << sizeof(MX) << std::endl; + + auto start = std::chrono::high_resolution_clock::now(); + auto delta = start - start; + auto delta1 = delta; + auto delta2 = delta; + + if (cudautils::cudaDeviceCount() == 0) { + std::cerr << "No CUDA devices on this system" + << "\n"; + exit(EXIT_FAILURE); + } + + constexpr unsigned int SIZE = 4 * 1024; + + MX mm[stride()]; // just storage in case of SOA + double *__restrict__ p = (double *)(mm); + + if (soa) { + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + genMatrix(m); + } + } else { + for (auto &m : mm) + genMatrix(m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (soa) + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + else + for (auto &m : mm) { + choleskyInversion::invert(m, m); + choleskyInversion::invert(m, m); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + auto m_d = cudautils::make_device_unique(DIM * DIM * stride(), nullptr); + cudaCheck(cudaMemcpy(m_d.get(), (double const *)(mm), stride() * sizeof(MX), cudaMemcpyHostToDevice)); + + constexpr int NKK = +#ifdef DOPROF + 2; +#else + 1000; +#endif + for (int kk = 0; kk < NKK; ++kk) { + int threadsPerBlock = 128; + int blocksPerGrid = SIZE / threadsPerBlock; + + delta -= (std::chrono::high_resolution_clock::now() - start); + + if (soa) + cudautils::launch(invertSOA, {blocksPerGrid, threadsPerBlock}, m_d.get(), SIZE); + else + cudautils::launch(invert, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); + + delta += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + if (!soa) { + delta1 -= (std::chrono::high_resolution_clock::now() - start); + +#ifndef DOPROF + cudautils::launch(invertSeq, {blocksPerGrid, threadsPerBlock}, (MX *)(m_d.get()), SIZE); + cudaCheck(cudaMemcpy(&mm, m_d.get(), stride() * sizeof(MX), cudaMemcpyDeviceToHost)); +#endif + delta1 += (std::chrono::high_resolution_clock::now() - start); + + if (0 == kk) + std::cout << mm[SIZE / 2](1, 1) << std::endl; + } + + delta2 -= (std::chrono::high_resolution_clock::now() - start); + if (soa) +#pragma GCC ivdep + for (unsigned int i = 0; i < SIZE; ++i) { + MapMX m(p + i); + choleskyInversion::invert(m, m); + } + else +#pragma GCC ivdep + for (auto &m : mm) { + choleskyInversion::invert(m, m); + } + + delta2 += (std::chrono::high_resolution_clock::now() - start); + } + + std::cout << mm[SIZE / 2](1, 1) << std::endl; + + 
double DNNK = NKK; + std::cout << "cuda/cudaSeq/x86 computation took " + << std::chrono::duration_cast(delta).count() / DNNK << ' ' + << std::chrono::duration_cast(delta1).count() / DNNK << ' ' + << std::chrono::duration_cast(delta2).count() / DNNK << ' ' << " ms" + << std::endl; +} + +int main() { + exitSansCUDADevices(); + + go<2>(false); + go<4>(false); + go<5>(false); + go<6>(false); + + go<2>(true); + go<4>(true); + go<5>(true); + go<6>(true); + + // go<10>(); + return 0; +} diff --git a/DataFormats/Math/test/cudaAtan2Test.cu b/DataFormats/Math/test/cudaAtan2Test.cu index ecc0be911c777..52901dd7e480e 100644 --- a/DataFormats/Math/test/cudaAtan2Test.cu +++ b/DataFormats/Math/test/cudaAtan2Test.cu @@ -25,10 +25,13 @@ end #include #include #include - -#include "cuda/api_wrappers.h" +#include #include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" constexpr float xmin = -100.001; // avoid 0 constexpr float incr = 0.04; @@ -62,15 +65,13 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - // atan2 delta -= (std::chrono::high_resolution_clock::now() - start); - auto diff_d = cuda::memory::device::make_unique(current_device, 3); + auto diff_d = cudautils::make_device_unique(3, nullptr); int diffs[3]; - cuda::memory::device::zero(diff_d.get(), 3 * 4); + cudaCheck(cudaMemset(diff_d.get(), 0, 3 * 4)); // Launch the diff CUDA Kernel dim3 threadsPerBlock(32, 32, 1); @@ -79,9 +80,9 @@ void go() { std::cout << "CUDA kernel 'diff' launch with " << blocksPerGrid.x << " blocks of " << threadsPerBlock.y << " threads\n"; - cuda::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); + cudautils::launch(diffAtan, {blocksPerGrid, threadsPerBlock}, diff_d.get()); - cuda::memory::copy(diffs, diff_d.get(), 3 * 4); + cudaCheck(cudaMemcpy(diffs, diff_d.get(), 3 * 4, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); float mdiff = diffs[0] * 1.e-7; @@ -95,26 +96,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + exitSansCUDADevices(); try { go<3>(); go<5>(); go<7>(); go<9>(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) 
{ std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/cudaMathTest.cu b/DataFormats/Math/test/cudaMathTest.cu index 6aeaa0f2ededb..6d2a289877d7c 100644 --- a/DataFormats/Math/test/cudaMathTest.cu +++ b/DataFormats/Math/test/cudaMathTest.cu @@ -25,12 +25,7 @@ end #include #include #include - -#include "cuda/api_wrappers.h" - -#include -#include -#include +#include #ifdef __CUDACC__ #define inline __host__ __device__ inline @@ -40,6 +35,14 @@ end #include #endif +#include "DataFormats/Math/interface/approx_log.h" +#include "DataFormats/Math/interface/approx_exp.h" +#include "DataFormats/Math/interface/approx_atan2.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/launch.h" + std::mt19937 eng; std::mt19937 eng2; std::uniform_real_distribution rgen(0., 1.); @@ -85,8 +88,6 @@ void go() { auto start = std::chrono::high_resolution_clock::now(); auto delta = start - start; - auto current_device = cuda::device::current::get(); - int numElements = 200000; size_t size = numElements * sizeof(float); std::cout << "[Vector of " << numElements << " elements]\n"; @@ -100,12 +101,12 @@ void go() { std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); }); delta -= (std::chrono::high_resolution_clock::now() - start); - auto d_A = cuda::memory::device::make_unique(current_device, numElements); - auto d_B = cuda::memory::device::make_unique(current_device, numElements); - auto d_C = cuda::memory::device::make_unique(current_device, numElements); + auto d_A = cudautils::make_device_unique(numElements, nullptr); + auto d_B = cudautils::make_device_unique(numElements, nullptr); + auto d_C = cudautils::make_device_unique(numElements, nullptr); - cuda::memory::copy(d_A.get(), h_A.get(), size); - cuda::memory::copy(d_B.get(), h_B.get(), size); + cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda alloc+copy took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -116,19 +117,21 @@ void go() { std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n"; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cudautils::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::launch(vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); + cudautils::launch( + vectorOp, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda computation took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; delta -= (std::chrono::high_resolution_clock::now() - start); - cuda::memory::copy(h_C.get(), d_C.get(), size); + cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), 
size, cudaMemcpyDeviceToHost)); delta += (std::chrono::high_resolution_clock::now() - start); std::cout << "cuda copy back took " << std::chrono::duration_cast(delta).count() << " ms" << std::endl; @@ -178,27 +181,15 @@ void go() { } int main() { - int count = 0; - auto status = cudaGetDeviceCount(&count); - if (status != cudaSuccess) { - std::cerr << "Failed to initialise the CUDA runtime, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } - if (count == 0) { - std::cerr << "No CUDA devices on this system, the test will be skipped." - << "\n"; - exit(EXIT_SUCCESS); - } + exitSansCUDADevices(); try { go(); go(); go(); - go(); - } catch (cuda::runtime_error &ex) { - std::cerr << "CUDA error: " << ex.what() << std::endl; + } catch (std::runtime_error &ex) { + std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl; exit(EXIT_FAILURE); } catch (...) { std::cerr << "A non-CUDA error occurred" << std::endl; diff --git a/DataFormats/Math/test/testAtan2.cpp b/DataFormats/Math/test/testAtan2.cpp index 207e69c848de9..11c951fc2e06b 100644 --- a/DataFormats/Math/test/testAtan2.cpp +++ b/DataFormats/Math/test/testAtan2.cpp @@ -195,6 +195,10 @@ void testIntPhi() { } int main() { + constexpr float p2i = ((int)(std::numeric_limits::max()) + 1) / M_PI; + constexpr float i2p = M_PI / ((int)(std::numeric_limits::max()) + 1); + std::cout << std::hexfloat << "p2i i2p " << p2i << " " << i2p << std::defaultfloat << std::endl; + std::cout << unsafe_atan2f<5>(0.f, 0.f) << " " << std::atan2(0., 0.) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, 0.5f) << " " << std::atan2(0.5, 0.5) << std::endl; std::cout << unsafe_atan2f<5>(0.5f, -0.5f) << " " << std::atan2(0.5, -0.5) << std::endl; diff --git a/DataFormats/SiPixelCluster/interface/SiPixelCluster.h b/DataFormats/SiPixelCluster/interface/SiPixelCluster.h index ab4ae1add2132..5dfb8671c0a38 100644 --- a/DataFormats/SiPixelCluster/interface/SiPixelCluster.h +++ b/DataFormats/SiPixelCluster/interface/SiPixelCluster.h @@ -21,6 +21,7 @@ #include #include #include +#include class PixelDigi; @@ -189,6 +190,10 @@ class SiPixelCluster { float getSplitClusterErrorX() const { return err_x; } float getSplitClusterErrorY() const { return err_y; } + // the original id (they get sorted) + auto originalId() const { return theOriginalClusterId; } + void setOriginalId(uint16_t id) { theOriginalClusterId = id; } + private: std::vector thePixelOffset; std::vector thePixelADC; @@ -198,6 +203,8 @@ class SiPixelCluster { uint8_t thePixelRowSpan = 0; // Span pixel index in the x direction (low edge). uint8_t thePixelColSpan = 0; // Span pixel index in the y direction (left edge). 
+ uint16_t theOriginalClusterId = std::numeric_limits::max(); + float err_x = -99999.9f; float err_y = -99999.9f; }; diff --git a/DataFormats/SiPixelCluster/src/classes_def.xml b/DataFormats/SiPixelCluster/src/classes_def.xml index 55c9fd8538417..d43f062877eb0 100644 --- a/DataFormats/SiPixelCluster/src/classes_def.xml +++ b/DataFormats/SiPixelCluster/src/classes_def.xml @@ -4,6 +4,7 @@ + diff --git a/DataFormats/SiPixelDigi/interface/PixelErrors.h b/DataFormats/SiPixelDigi/interface/PixelErrors.h new file mode 100644 index 0000000000000..073b9962deaaa --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/PixelErrors.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_SiPixelDigi_interface_PixelErrors_h +#define DataFormats_SiPixelDigi_interface_PixelErrors_h + +#include +#include + +#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" +#include "FWCore/Utilities/interface/typedefs.h" + +// Better ideas for the placement of these? + +struct PixelErrorCompact { + uint32_t rawId; + uint32_t word; + uint8_t errorType; + uint8_t fedId; +}; + +using PixelFormatterErrors = std::map>; + +#endif // DataFormats_SiPixelDigi_interface_PixelErrors_h diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h new file mode 100644 index 0000000000000..ee1227ed4fae1 --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h @@ -0,0 +1,28 @@ +#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h +#define DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h + +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" + +#include +#include + +class SiPixelDigiErrorsSoA { +public: + SiPixelDigiErrorsSoA() = default; + explicit SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err); + ~SiPixelDigiErrorsSoA() = default; + + auto size() const { return error_.size(); } + + const PixelFormatterErrors *formatterErrors() const { return formatterErrors_; } + + const PixelErrorCompact &error(size_t i) const { return error_[i]; } + + const std::vector &errorVector() const { return error_; } + +private: + std::vector error_; + const PixelFormatterErrors *formatterErrors_ = nullptr; +}; + +#endif diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h new file mode 100644 index 0000000000000..50e863f03ff02 --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h @@ -0,0 +1,33 @@ +#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h +#define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h + +#include +#include + +class SiPixelDigisSoA { +public: + SiPixelDigisSoA() = default; + explicit SiPixelDigisSoA( + size_t nDigis, const uint32_t* pdigi, const uint32_t* rawIdArr, const uint16_t* adc, const int32_t* clus); + ~SiPixelDigisSoA() = default; + + auto size() const { return pdigi_.size(); } + + uint32_t pdigi(size_t i) const { return pdigi_[i]; } + uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; } + uint16_t adc(size_t i) const { return adc_[i]; } + int32_t clus(size_t i) const { return clus_[i]; } + + const std::vector& pdigiVector() const { return pdigi_; } + const std::vector& rawIdArrVector() const { return rawIdArr_; } + const std::vector& adcVector() const { return adc_; } + const std::vector& clusVector() const { return clus_; } + +private: + std::vector pdigi_; + std::vector rawIdArr_; + std::vector adc_; + std::vector clus_; +}; + +#endif diff --git 
a/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc new file mode 100644 index 0000000000000..a93bd7d3774f3 --- /dev/null +++ b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc @@ -0,0 +1,10 @@ +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" + +#include + +SiPixelDigiErrorsSoA::SiPixelDigiErrorsSoA(size_t nErrors, + const PixelErrorCompact *error, + const PixelFormatterErrors *err) + : error_(error, error + nErrors), formatterErrors_(err) { + assert(error_.size() == nErrors); +} diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc new file mode 100644 index 0000000000000..992c98f450616 --- /dev/null +++ b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc @@ -0,0 +1,12 @@ +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" + +#include + +SiPixelDigisSoA::SiPixelDigisSoA( + size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus) + : pdigi_(pdigi, pdigi + nDigis), + rawIdArr_(rawIdArr, rawIdArr + nDigis), + adc_(adc, adc + nDigis), + clus_(clus, clus + nDigis) { + assert(pdigi_.size() == nDigis); +} diff --git a/DataFormats/SiPixelDigi/src/classes.h b/DataFormats/SiPixelDigi/src/classes.h index 2f36b72ca7df8..ba68d3289e8cd 100644 --- a/DataFormats/SiPixelDigi/src/classes.h +++ b/DataFormats/SiPixelDigi/src/classes.h @@ -5,6 +5,8 @@ #include "DataFormats/SiPixelDigi/interface/PixelDigiCollection.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigi.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigiError.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" diff --git a/DataFormats/SiPixelDigi/src/classes_def.xml b/DataFormats/SiPixelDigi/src/classes_def.xml index de7779a5c00ea..8cabbd3f3f06e 100755 --- a/DataFormats/SiPixelDigi/src/classes_def.xml +++ b/DataFormats/SiPixelDigi/src/classes_def.xml @@ -49,4 +49,10 @@ + + + + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml index f92aa68373927..212738e941533 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml @@ -1,4 +1,7 @@ + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc new file mode 100644 index 0000000000000..270598b0528b8 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc @@ -0,0 +1,183 @@ +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetIdCollection.h" +#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include 
"FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include + +class SiPixelDigiErrorsFromSoA : public edm::stream::EDProducer<> { +public: + explicit SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT digiErrorSoAGetToken_; + + edm::EDPutTokenT> errorPutToken_; + edm::EDPutTokenT tkErrorPutToken_; + edm::EDPutTokenT userErrorPutToken_; + edm::EDPutTokenT> disabledChannelPutToken_; + + edm::ESWatcher cablingWatcher_; + std::unique_ptr cabling_; + const std::string cablingMapLabel_; + + const std::vector tkerrorlist_; + const std::vector usererrorlist_; + + const bool usePhase1_; +}; + +SiPixelDigiErrorsFromSoA::SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig) + : digiErrorSoAGetToken_{consumes(iConfig.getParameter("digiErrorSoASrc"))}, + errorPutToken_{produces>()}, + tkErrorPutToken_{produces()}, + userErrorPutToken_{produces("UserErrorModules")}, + disabledChannelPutToken_{produces>()}, + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + tkerrorlist_(iConfig.getParameter>("ErrorList")), + usererrorlist_(iConfig.getParameter>("UserErrorList")), + usePhase1_(iConfig.getParameter("UsePhase1")) {} + +void SiPixelDigiErrorsFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA")); + desc.add("CablingMapLabel", "")->setComment("CablingMap label"); + desc.add("UsePhase1", false)->setComment("## Use phase1"); + desc.add>("ErrorList", std::vector{29}) + ->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); + desc.add>("UserErrorList", std::vector{40}) + ->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // pack errors into collection + + // initialize cabling map or update if necessary + if (cablingWatcher_.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:") << cabling_->version(); + } + + const auto& digiErrors = iEvent.get(digiErrorSoAGetToken_); + + edm::DetSetVector errorcollection{}; + DetIdCollection tkerror_detidcollection{}; + DetIdCollection usererror_detidcollection{}; + edmNew::DetSetVector disabled_channelcollection{}; + + PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 + const PixelDataFormatter::Errors* formatterErrors = digiErrors.formatterErrors(); + assert(formatterErrors != nullptr); + auto errors = *formatterErrors; // make a copy + PixelDataFormatter::DetErrors nodeterrors; + + auto size = digiErrors.size(); + for (auto i = 0U; i < 
size; i++) { + PixelErrorCompact err = digiErrors.error(i); + if (err.errorType != 0) { + SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); + errors[err.rawId].push_back(error); + } + } + + constexpr uint32_t dummydetid = 0xffffffff; + typedef PixelDataFormatter::Errors::iterator IE; + for (IE is = errors.begin(); is != errors.end(); is++) { + uint32_t errordetid = is->first; + if (errordetid == dummydetid) { // errors given dummy detId must be sorted by Fed + nodeterrors.insert(nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end()); + } else { + edm::DetSet& errorDetSet = errorcollection.find_or_insert(errordetid); + errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); + // Fill detid of the detectors where there is error AND the error number is listed + // in the configurable error list in the job option cfi. + // Code needs to be here, because there can be a set of errors for each + // entry in the for loop over PixelDataFormatter::Errors + + std::vector disabledChannelsDetSet; + + for (auto const& aPixelError : errorDetSet) { + // For the time being, we extend the error handling functionality with ErrorType 25 + // In the future, we should sort out how the usage of tkerrorlist can be generalized + if (aPixelError.getType() == 25) { + int fedId = aPixelError.getFedId(); + const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); + if (fed) { + cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); + const sipixelobjects::PixelFEDLink* link = fed->link(linkId); + if (link) { + // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it + // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme + PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; + for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { + const sipixelobjects::PixelROC* roc = link->roc(iRoc); + if (roc->idInDetUnit() < ch.roc_first) + ch.roc_first = roc->idInDetUnit(); + if (roc->idInDetUnit() > ch.roc_last) + ch.roc_last = roc->idInDetUnit(); + } + if (ch.roc_first < ch.roc_last) + disabledChannelsDetSet.push_back(ch); + } + } + } else { + // fill list of detIds to be turned off by tracking + if (!tkerrorlist_.empty()) { + auto it_find = std::find(tkerrorlist_.begin(), tkerrorlist_.end(), aPixelError.getType()); + if (it_find != tkerrorlist_.end()) { + tkerror_detidcollection.push_back(errordetid); + } + } + } + + // fill list of detIds with errors to be studied + if (!usererrorlist_.empty()) { + auto it_find = std::find(usererrorlist_.begin(), usererrorlist_.end(), aPixelError.getType()); + if (it_find != usererrorlist_.end()) { + usererror_detidcollection.push_back(errordetid); + } + } + + } // loop on DetSet of errors + + if (!disabledChannelsDetSet.empty()) { + disabled_channelcollection.insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); + } + + } // if error assigned to a real DetId + } // loop on errors in event for this FED + + edm::DetSet& errorDetSet = errorcollection.find_or_insert(dummydetid); + errorDetSet.data = nodeterrors; + + iEvent.emplace(errorPutToken_, std::move(errorcollection)); + iEvent.emplace(tkErrorPutToken_, std::move(tkerror_detidcollection)); + iEvent.emplace(userErrorPutToken_, std::move(usererror_detidcollection)); + iEvent.emplace(disabledChannelPutToken_, std::move(disabled_channelcollection)); +} + +DEFINE_FWK_MODULE(SiPixelDigiErrorsFromSoA); diff --git 
a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc new file mode 100644 index 0000000000000..ad6c46082be8b --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -0,0 +1,78 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiErrorGetToken_; + edm::EDPutTokenT digiErrorPutToken_; + + cudautils::host::unique_ptr data_; + GPU::SimpleVector error_; + const PixelFormatterErrors* formatterErrors_ = nullptr; +}; + +SiPixelDigiErrorsSoAFromCUDA::SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig) + : digiErrorGetToken_(consumes>(iConfig.getParameter("src"))), + digiErrorPutToken_(produces()) {} + +void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_); + + auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream()); + error_ = std::move(tmp.first); + data_ = std::move(tmp.second); + formatterErrors_ = &(gpuDigiErrors.formatterErrors()); +} + +void SiPixelDigiErrorsSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... 
+ iEvent.emplace(digiErrorPutToken_, error_.size(), error_.data(), formatterErrors_); + + error_ = GPU::make_SimpleVector(0, nullptr); + data_.reset(); + formatterErrors_ = nullptr; +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigiErrorsSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc new file mode 100644 index 0000000000000..7794032154e98 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -0,0 +1,83 @@ +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer { +public: + explicit SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigisSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiGetToken_; + edm::EDPutTokenT digiPutToken_; + + cudautils::host::unique_ptr pdigi_; + cudautils::host::unique_ptr rawIdArr_; + cudautils::host::unique_ptr adc_; + cudautils::host::unique_ptr clus_; + + int nDigis_; +}; + +SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig) + : digiGetToken_(consumes>(iConfig.getParameter("src"))), + digiPutToken_(produces()) {} + +void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, + const edm::EventSetup& iSetup, + edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + + nDigis_ = gpuDigis.nDigis(); + pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); + rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); + adc_ = gpuDigis.adcToHostAsync(ctx.stream()); + clus_ = gpuDigis.clusToHostAsync(ctx.stream()); +} + +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). 
There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... + iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); + + pdigi_.reset(); + rawIdArr_.reset(); + adc_.reset(); + clus_.reset(); +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigisSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index 12ff657cefd8e..50c8f0fcabd3c 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -1,7 +1,24 @@ import FWCore.ParameterSet.Config as cms -import EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi +from EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi import siPixelRawToDigi as _siPixelRawToDigi -siPixelDigis = EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi.siPixelRawToDigi.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelDigis = SwitchProducerCUDA( + cpu = _siPixelRawToDigi.clone() +) from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel -phase1Pixel.toModify(siPixelDigis, UsePhase1=True) +phase1Pixel.toModify(siPixelDigis.cpu, UsePhase1=True) + +from Configuration.ProcessModifiers.gpu_cff import gpu +gpu.toModify(siPixelDigis, + cuda = cms.EDAlias( + siPixelDigiErrors = cms.VPSet( + cms.PSet(type = cms.string("DetIdedmEDCollection")), + cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")), + cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector")) + ), + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("PixelDigiedmDetSetVector")) + ) + ) +) diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py new file mode 100644 index 0000000000000..31ba8596bddc6 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -0,0 +1,30 @@ +import FWCore.ParameterSet.Config as cms + +from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis +from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA + +siPixelDigisTask = cms.Task(siPixelDigis) + +siPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone() + +from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel +phase1Pixel.toModify(siPixelDigiErrors, UsePhase1=True) + +siPixelDigisTaskCUDA = cms.Task( + siPixelDigisSoA, + siPixelDigiErrorsSoA, + siPixelDigiErrors +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelDigisTask_gpu = siPixelDigisTask.copy() +_siPixelDigisTask_gpu.add(siPixelDigisTaskCUDA) 
+gpu.toReplaceWith(siPixelDigisTask, _siPixelDigisTask_gpu) diff --git a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h index efc7d9e6cde0b..44f7b1ca14944 100644 --- a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h +++ b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h @@ -24,9 +24,9 @@ #include "tbb/task_arena.h" -namespace edm { +#include "FWCore/Concurrency/interface/WaitingTask.h" - class WaitingTask; +namespace edm { class WaitingTaskHolder; class WaitingTaskWithArenaHolder { @@ -72,5 +72,29 @@ namespace edm { WaitingTask* m_task; std::shared_ptr m_arena; }; + + template + auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) { + return [holder = std::move(h), func = std::forward(f)]() mutable { + try { + func(holder); + } catch (...) { + holder.doneWaiting(std::current_exception()); + } + }; + } + + template + auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) { + return make_waiting_task( + std::forward(iAlloc), + [holder = h, func = make_lambda_with_holder(h, std::forward(f))](std::exception_ptr const* excptr) mutable { + if (excptr) { + holder.doneWaiting(*excptr); + return; + } + func(); + }); + } } // namespace edm #endif diff --git a/FastSimulation/Tracking/python/SeedingMigration.py b/FastSimulation/Tracking/python/SeedingMigration.py index 751670daa50c8..3a982eba55e36 100644 --- a/FastSimulation/Tracking/python/SeedingMigration.py +++ b/FastSimulation/Tracking/python/SeedingMigration.py @@ -13,8 +13,9 @@ def _hitSetProducerToFactoryPSet(producer): "PixelTripletLargeTipEDProducer": "PixelTripletLargeTipGenerator", "MultiHitFromChi2EDProducer": "MultiHitGeneratorFromChi2", "CAHitTripletEDProducer": "CAHitTripletGenerator", - "CAHitQuadrupletEDProducer": "CAHitQuadrupletGenerator", - } + "CAHitQuadrupletEDProducer": "CAHitQuadrupletGenerator", + "CAHitNtupletHeterogeneousEDProducer": "CAHitQuadrupletGenerator", + } ret = cms.PSet() _copy(producer, ret) ret.ComponentName = cms.string(_map[producer._TypedParameterizable__type]); diff --git a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h index cefdbe4b3296a..409ebec3cb43f 100644 --- a/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h +++ b/Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h @@ -2,6 +2,7 @@ #define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h #include +#include namespace phase1PixelTopology { @@ -20,6 +21,96 @@ namespace phase1PixelTopology { constexpr uint32_t numPixsInModule = uint32_t(numRowsInModule) * uint32_t(numColsInModule); + constexpr uint32_t numberOfModules = 1856; + constexpr uint32_t numberOfLayers = 10; + constexpr uint32_t layerStart[numberOfLayers + 1] = {0, + 96, + 320, + 672, // barrel + 1184, + 1296, + 1408, // positive endcap + 1520, + 1632, + 1744, // negative endcap + numberOfModules}; + constexpr char const* layerName[numberOfLayers] = { + "BL1", + "BL2", + "BL3", + "BL4", // barrel + "E+1", + "E+2", + "E+3", // positive endcap + "E-1", + "E-2", + "E-3" // negative endcap + }; + + constexpr uint32_t numberOfModulesInBarrel = 1184; + constexpr uint32_t numberOfLaddersInBarrel = numberOfModulesInBarrel / 8; + + template + constexpr auto map_to_array_helper(Function f, std::index_sequence) + -> std::array::type, sizeof...(Indices)> { + return {{f(Indices)...}}; + } + + template + constexpr auto map_to_array(Function f) -> std::array::type, 
N> { + return map_to_array_helper(f, std::make_index_sequence{}); + } + + constexpr uint32_t findMaxModuleStride() { + bool go = true; + int n = 2; + while (go) { + for (uint8_t i = 1; i < 11; ++i) { + if (layerStart[i] % n != 0) { + go = false; + break; + } + } + if (!go) + break; + n *= 2; + } + return n / 2; + } + + constexpr uint32_t maxModuleStride = findMaxModuleStride(); + + constexpr uint8_t findLayer(uint32_t detId) { + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint8_t findLayerFromCompact(uint32_t detId) { + detId *= maxModuleStride; + for (uint8_t i = 0; i < 11; ++i) + if (detId < layerStart[i + 1]) + return i; + return 11; + } + + constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride; + constexpr std::array layer = map_to_array(findLayerFromCompact); + + constexpr bool validateLayerIndex() { + bool res = true; + for (auto i = 0U; i < numberOfModules; ++i) { + auto j = i / maxModuleStride; + res &= (layer[j] < 10); + res &= (i >= layerStart[layer[j]]); + res &= (i < layerStart[layer[j] + 1]); + } + return res; + } + + static_assert(validateLayerIndex(), "layer from detIndex algo is buggy"); + // this is for the ROC n<512 (upgrade 1024) constexpr inline uint16_t divu52(uint16_t n) { n = n >> 2; @@ -31,9 +122,11 @@ namespace phase1PixelTopology { } constexpr inline bool isEdgeX(uint16_t px) { return (px == 0) | (px == lastRowInModule); } + constexpr inline bool isEdgeY(uint16_t py) { return (py == 0) | (py == lastColInModule); } constexpr inline uint16_t toRocX(uint16_t px) { return (px < numRowsInRoc) ? px : px - numRowsInRoc; } + constexpr inline uint16_t toRocY(uint16_t py) { auto roc = divu52(py); return py - 52 * roc; @@ -64,6 +157,18 @@ namespace phase1PixelTopology { return py + shift; } + //FIXME move it elsewhere? 
+ struct AverageGeometry { + static constexpr auto numberOfLaddersInBarrel = phase1PixelTopology::numberOfLaddersInBarrel; + float ladderZ[numberOfLaddersInBarrel]; + float ladderX[numberOfLaddersInBarrel]; + float ladderY[numberOfLaddersInBarrel]; + float ladderR[numberOfLaddersInBarrel]; + float ladderMinZ[numberOfLaddersInBarrel]; + float ladderMaxZ[numberOfLaddersInBarrel]; + float endCapZ[2]; // just for pos and neg Layer1 + }; + } // namespace phase1PixelTopology #endif // Geometry_TrackerGeometryBuilder_phase1PixelTopology_h diff --git a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp index 9a00efbff9a9a..8dfae57b685b4 100644 --- a/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp +++ b/Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp @@ -142,5 +142,19 @@ int main() { assert(std::get<1>(ori) == bp); } + for (auto i = 0U; i < phase1PixelTopology::numberOfLayers; ++i) { + std::cout << "layer " << i << ", \"" << phase1PixelTopology::layerName[i] << "\", [" + << phase1PixelTopology::layerStart[i] << ", " << phase1PixelTopology::layerStart[i + 1] << ")" + << std::endl; + } + + for (auto i = 0U; i < phase1PixelTopology::numberOfModules; ++i) { + int layer = phase1PixelTopology::layer[i / phase1PixelTopology::maxModuleStride]; + //std::cout << "module " << i << ": " << "layer " << layer << ", \"" << phase1PixelTopology::layerName[layer] << "\", [" << phase1PixelTopology::layerStart[layer] << ", " << phase1PixelTopology::layerStart[layer+1] << ")" << std::endl; + assert(layer < 10); + assert(i >= phase1PixelTopology::layerStart[layer]); + assert(i < phase1PixelTopology::layerStart[layer + 1]); + } + return 0; } diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml new file mode 100644 index 0000000000000..d78c8a28f0470 --- /dev/null +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md new file mode 100644 index 0000000000000..3948ae7e59f79 --- /dev/null +++ b/HeterogeneousCore/CUDACore/README.md @@ -0,0 +1,1003 @@ +# CUDA algorithms in CMSSW + +## Outline + +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* [Examples](#examples) + * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output) + * [Producer with CUDA output](#producer-with-cuda-output) + * [Producer with CUDA input](#producer-with-cuda-input) + * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) + * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork) + * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) + * [Analyzer with CUDA input](#analyzer-with-cuda-input) + * [Configuration](#configuration) + * [GPU-only configuration](#gpu-only-configuration) + * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules) +* [More details](#more-details) + * [Device choice](#device-choice) + * [Data model](#data-model) + * [CUDA EDProducer](#cuda-edproducer) + * [Class declaration](#class-declaration) + * [Memory 
allocation](#memory-allocation) + * [Caching allocator](#caching-allocator) + * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr) + * [CUDA API](#cuda-api) + * [Setting the current device](#setting-the-current-device) + * [Getting input](#getting-input) + * [Calling the CUDA kernels](#calling-the-cuda-kernels) + * [Putting output](#putting-output) + * [`ExternalWork` extension](#externalwork-extension) + * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks) + * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) + * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) + * [CUDA ESProduct](#cuda-esproduct) + +## Introduction + +This page documents the CUDA integration within CMSSW + +### Design goals + +1. Provide a mechanism for a chain of modules to share a resource + * Resource can be e.g. CUDA device memory or a CUDA stream +2. Minimize data movements between the CPU and the device +3. Support multiple devices +4. Allow the same job configuration to be used on all hardware combinations + +### Overall guidelines + +1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. + * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. + * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event + * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) + * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` + * With the latter the `assert()` calls in CUDA code are disabled by + default, but can be enabled by defining a `GPU_DEBUG` macro + (before the aforementioned include) +2. Synchronization needs should be fulfilled with + [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) + extension to EDProducers + * `ExternalWork` can be used to replace one synchronization point + (e.g. between device kernels and copying a known amount of data + back to CPU). + * For further synchronization points (e.g. copying data whose + amount is known only at the device side), split the work to + multiple `ExternalWork` producers. This approach has the added + benefit that e.g. data transfers to CPU become on-demand automatically + * A general breakdown of the possible steps: + * Convert input legacy CPU data format to CPU SoA + * Transfer input CPU SoA to GPU + * Launch kernels + * Transfer the number of output elements to CPU + * Transfer the output data from GPU to CPU SoA + * Convert the output SoA to legacy CPU data formats +3. Within `acquire()`/`produce()`, the current CUDA device is set + implicitly and the CUDA stream is provided by the system (with + `CUDAScopedContextAcquire`/`CUDAScopedContextProduce`) + * It is strongly recommended to use the provided CUDA stream for all operations + * If that is not feasible for some reason, the provided CUDA + stream must synchronize with the work queued on other CUDA + streams (with CUDA events and `cudaStreamWaitEvent()`) +4. Outside of `acquire()`/`produce()`, CUDA API functions may be + called only if `CUDAService::enabled()` returns `true`. 
+ * With point 3 it follows that in these cases multiple devices have + to be dealt with explicitly, as well as CUDA streams + +## Sub-packages +* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components +* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations +* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data + +## Examples + +### Isolated producer (no CUDA input nor output) + +```cpp +class IsolatedProducerCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + IsolatedProducerGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo_.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA output + +```cpp +class ProducerOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + ProducerOutputGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT> outputToken_; + CUDAContextState ctxState_; +}; +... 
+void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work has finished +void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +### Producer with CUDA input + +```cpp +class ProducerInputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. 
+ iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +See [further below](#setting-the-current-device) for the conditions +when the `CUDAScopedContextAcquire` constructor reuses the CUDA stream. Note +that the `CUDAScopedContextAcquire` constructor taking `edm::StreamID` is +allowed, it will just always create a new CUDA stream. + + +### Producer with CUDA input and output (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc) + + +### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + void addMoreWork(edm::WaitingTaskWithArenaHolder waitingTashHolder); + + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... 
+void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Push a functor on top of "a stack of tasks" to be run as a next + // task after the work queued above before produce(). In this case ctx + // is a context constructed by the calling TBB task, and therefore the + // current device and CUDA stream have been already set up. The ctx + // internally holds the WaitingTaskWithArenaHolder for the next task. + + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + addMoreWork(ctx); + }); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work queued in acquire() has finished +void ProducerInputOutputCUDA::addMoreWork(CUDAScopedContextTask& ctx) { + // Current device and CUDA stream have already been set + + // Queues more asynchronous data transfer and kernels to the CUDA + // stream returned by CUDAScopedContextTask::stream() + gpuAlgo.makeMoreAsync(ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work queued in addMoreWork() has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc) + + +### Producer with CUDA input and output (without ExternalWork) + +If the producer does not need to transfer anything back to CPU (like +the number of output elements), the `ExternalWork` extension is not +needed as there is no need to synchronize. + +```cpp +class ProducerInputOutputCUDA: public edm::global::EDProducer<> { +public: + ... + void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... 
+void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextProduce ctx{inputDataWrapped}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextProduce holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextProduce::stream(). Here makeAsync() also + // returns data in GPU memory that is passed to the constructor of + // OutputData. CUDAScopedContextProduce::emplace() wraps the OutputData to + // CUDAProduct. CUDAProduct stores also the current + // device and the CUDA stream since those will be needed in the + // consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo_.makeAsync(inputData, ctx.stream())); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc) + + +### Analyzer with CUDA input + +An analyzer with CUDA input is similar to a [producer with CUDA +input](#producer-with-cuda-input). Note that currently we do not have +a mechanism for portable configurations with analyzers (like +[`SwitchProducer`](#automatic-switching-between-cpu-and-gpu-modules) +for producers). This means that a configuration with a CUDA analyzer +can only run on a machine with CUDA device(s). + +```cpp +class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> { +public: + ... + void analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + AnalyzerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; +}; +... +void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextAnalyze ctx{inputDataWrapped}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAnalyze holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAnalyze::stream() + gpuAlgo_.analyzeAsync(inputData, otherInputData, ctx.stream()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc) + + +### Configuration + +#### GPU-only configuration + +For a GPU-only configuration there is nothing special to be done, just +construct the Paths/Sequences/Tasks from the GPU modules.
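+A minimal sketch of such a configuration is shown below. The module
+labels and plugin names (`fooCUDA`, `FooProducerCUDA`,
+`FooProducerFromCUDA`) follow the switching example in the next
+section and are placeholders, not modules provided by this package:
+
+```python
+import FWCore.ParameterSet.Config as cms
+
+process = cms.Process("GPUOnly")
+
+# GPU producer, and the module converting its output back to the
+# legacy data format
+process.fooCUDA = cms.EDProducer("FooProducerCUDA")
+process.foo = cms.EDProducer("FooProducerFromCUDA",
+    src = cms.InputTag("fooCUDA")
+)
+
+# The GPU producer goes to a Task, so the framework runs it only when
+# its output is needed
+process.fooTaskCUDA = cms.Task(process.fooCUDA)
+process.fooPath = cms.Path(process.foo, process.fooTaskCUDA)
+```
+
+Such a configuration, of course, runs only on machines with CUDA
+device(s); the switching mechanism described next lifts that
+restriction.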
+ +#### Automatic switching between CPU and GPU modules + +The `SwitchProducer` mechanism can be used to switch automatically +between CPU and GPU modules based on the availability of GPUs on the +machine where the configuration is done. Framework decides at the +beginning of the job which of the modules to run for a given module +label. + +Framework requires that the modules in the switch must produce the +same types of output products (the closer the actual results are the +better, but the framework can not enforce that). This means that for a +chain of GPU modules, it is the module that transforms the SoA data +format back to the legacy data formats (possibly, but not necessarily, +transferring the SoA data from GPU to CPU) that should be switched +between the legacy CPU module. The rest of the GPU modules should be +placed to a `Task`, in which case framework runs them only if their +output is needed by another module. + +```python +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +process.foo = SwitchProducerCUDA( + cpu = cms.EDProducer("FooProducer"), # legacy CPU + cuda = cms.EDProducer("FooProducerFromCUDA", + src="fooCUDA" + ) +) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") + +process.fooTaskCUDA = cms.Task(process.fooCUDA) +process.fooTask = cms.Task( + process.foo, + process.fooTaskCUDA +) +``` + +For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py). + + + + + +## More details + +### Device choice + +As discussed above, with `SwitchProducer` the choice between CPU and +GPU modules is done at the beginning of the job. + +For multi-GPU setup the device is chosen in the first CUDA module in a +chain of modules by one of the constructors of +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce` +```cpp +// In ExternalWork acquire() +CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; + +// In normal produce() (or filter()) +CUDAScopedContextProduce ctx{iEvent.streamID()}; +``` +As the choice is still the static EDM stream to device assignment, the +EDM stream ID is needed. The logic will likely evolve in the future to +be more dynamic, and likely the device choice has to be made for the +full event. + +### Data model + +The "GPU data product" should be a class/struct containing smart +pointer(s) to device data (see [Memory allocation](#memory-allocation)). +When putting the data to event, the data is wrapped to +`CUDAProduct` template, which holds +* the GPU data product + * must be moveable, but no other restrictions +* the current device where the data was produced, and the CUDA stream the data was produced with +* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) + +Note that the `CUDAProduct` wrapper can be constructed only with +`CUDAScopedContextProduce::wrap()`, and the data `T` can be obtained +from it only with +`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +as described further below. When putting the data product directly to +`edm::Event`, also `CUDASCopedContextProduce::emplace()` can be used. + +The GPU data products that depend on the CUDA runtime should be placed +under `CUDADataFormats` package, using the same name for sub-package +that would be used in `DataFormats`. Everything else, e.g. SoA for +CPU, should go under `DataFormats` as usual. + + +### CUDA EDProducer + +#### Class declaration + +The CUDA producers are normal EDProducers. 
The `ExternalWork` +extension should be used if a synchronization between the GPU and CPU +is needed, e.g. when transferring data from GPU to CPU. + +#### Memory allocation + +##### Caching allocator + +The memory allocations should be done dynamically with the following functions +```cpp +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +cudautils::device::unique_ptr device_buffer = cudautils::make_device_unique(50, cudaStream); +cudautils::host::unique_ptr host_buffer = cudautils::make_host_unique(50, cudaStream); +``` + +in the `acquire()` and `produce()` functions. The same +`cudaStream_t` object that is used for transfers and kernels +should be passed to the allocator. + +The allocator is based on `cub::CachingDeviceAllocator`. The memory is +guaranteed to be reserved +* for the host: up to the destructor of the `unique_ptr` +* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished + +##### Non-cached pinned host `unique_ptr` + +In producers transferring data to GPU one may want to pinned host +memory allocated with `cudaHostAllocWriteCombined`. As of now we don't +want to include the flag dimension to the caching allocator. The CUDA +API wrapper library does not support allocation flags, so we add our +own `unique_ptr` for that. + +```cpp +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" + +cudautils::host::noncached_unique_ptr host_buffer = cudautils::make_host_noncached_unique(50, flags); +``` +The `flags` is passed directly to `cudaHostAlloc()`. + +##### CUDA API + +The `cudaMalloc()` etc may be used outside of the event loop, but that +should be limited to only relatively small allocations in order to +allow as much re-use of device memory as possible. + +If really needed, the `cudaMalloc()` etc may be used also within the +event loop, but then the cost of allocation and implicit +synchronization should be explicitly amortized e.g. by caching. + +#### Setting the current device + +A CUDA producer should construct `CUDAScopedContextAcquire` in +`acquire()` (`CUDAScopedContextProduce` `produce()` if not using +`ExternalWork`) either with `edm::StreamID`, or with a +`CUDAProduct` read as an input. + +```cpp +// From edm::StreamID +CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; +// or +CUDAScopedContextProduce ctx{iEvent.streamID()}; + + +// From CUDAProduct +CUDAProduct const& cclus = iEvent.get(srcToken_); +CUDAScopedContextAcquire ctx{cclus, ...}; +// or +CUDAScopedContextProduce ctx{cclus}; +``` + +A CUDA analyzer should construct `CUDAScopedContextAnalyze` with a +`CUDAProduct` read as an input. + +```cpp +CUDAProduct const& cclus = iEvent.get(srcToken_); +CUDAScopedContextAnalyze ctx{cclus}; +``` + +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze` work in the RAII way and does the following +* Sets the current device for the current scope + - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream + - If constructed from the `CUDAProduct`, uses the same device and possibly the same CUDA stream as was used to produce the `CUDAProduct` + * The CUDA stream is reused if this producer is the first consumer + of the `CUDAProduct`, otherwise a new CUDA stream is created. 
+ This approach is simple compromise to automatically express the work of + parallel producers in different CUDA streams, and at the same + time allow a chain of producers to queue their work to the same + CUDA stream. +* Gives access to the CUDA stream the algorithm should use to queue asynchronous work +* `CUDAScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor) +* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) +* Needed to get `CUDAProduct` from the event + * `CUDAScopedContextProduce` is needed to put `CUDAProduct` to the event + +In case of multiple input products, from possibly different CUDA +streams and/or CUDA devices, this approach gives the developer full +control in which of them the kernels of the algorithm should be run. + +#### Getting input + +The real product (`T`) can be obtained from `CUDAProduct` only with +the help of +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. + +```cpp +// From CUDAProduct +CUDAProduct cclus = iEvent.get(srcToken_); +GPUClusters const& clus = ctx.get(cclus); + +// Directly from Event +GPUClusters const& clus = ctx.get(iEvent, srcToken_); +``` + +This step is needed to +* check that the data are on the same CUDA device + * if not, throw an exception (with unified memory could prefetch instead) +* if the CUDA streams are different, synchronize between them + +#### Calling the CUDA kernels + +It is usually best to wrap the CUDA kernel calls to a separate class, +and then call methods of that class from the EDProducer. The only +requirement is that the CUDA stream where to queue the operations +should be the one from the +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. + +```cpp +gpuAlgo.makeClustersAsync(..., ctx.stream()); +``` + +If necessary, different CUDA streams may be used internally, but they +should to be made to synchronize with the provided CUDA stream with +CUDA events and `cudaStreamWaitEvent()`. + + +#### Putting output + +The GPU data needs to be wrapped to `CUDAProduct` template with +`CUDAScopedContextProduce::wrap()` or `CUDAScopedContextProduce::emplace()` + +```cpp +GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); +std::unique_ptr> ret = ctx.wrap(clusters); +iEvent.put(std::move(ret)); + +// or with one line +iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); + +// or avoid one unique_ptr with emplace +edm::PutTokenT> putToken_ = produces>(); // in constructor +... +ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); +``` + +This step is needed to +* store the current device and CUDA stream into `CUDAProduct` +* record the CUDA event needed for CUDA stream synchronization + +#### `ExternalWork` extension + +Everything above works both with and without `ExternalWork`. + +Without `ExternalWork` the `EDProducer`s act similar to TBB +flowgraph's "streaming node". In other words, they just queue more +asynchronous work to the CUDA stream in their `produce()`. + +The `ExternalWork` is needed when one would otherwise call +`cudeStreamSynchronize()`. For example transferring something to CPU +needed for downstream DQM, or queueing more asynchronous work. With +`ExternalWork` an `acquire()` method needs to be implemented that gets +an `edm::WaitingTaskWithArenaHolder` parameter. 
The +`edm::WaitingTaskWithArenaHolder` should then be passed to the +constructor of `CUDAScopedContextAcquire` along + +```cpp +void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& cclus = iEvent.get(token_); + CUDAScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well + ... +``` + +When constructed this way, `CUDAScopedContextAcquire` registers a +callback function to the CUDA stream in its destructor to call +`waitingTaskHolder.doneWaiting()`. + +A GPU->GPU producer needs a `CUDAScopedContext` also in its +`produce()`. The device and CUDA stream are transferred via +`CUDAContextState` member variable: + +```cpp +class FooProducerCUDA ... { + ... + CUDAContextState ctxState_; +}; + +void acquire(...) { + ... + FooProducerCUDA::CUDAScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_}; + ... +} + +void produce(...( { + ... + FooProducerCUDA::CUDAScopedContextProduce ctx{ctxState_}; +} +``` + +The `CUDAScopedContextAcquire` saves its state to the `ctxState_` in +the destructor, and `CUDAScopedContextProduce` then restores the +context. + +#### Module-internal chain of CPU and GPU tasks + +Technically `ExternalWork` works such that the framework calls +`acquire()` with a `edm::WaitingTaskWithArenaHolder` that holds an +`edm::WaitingTask` (that inherits from `tbb::task`) for calling +`produce()` in a `std::shared_ptr` semantics: spawn the task when +reference count hits `0`. It is also possible to create a longer chain +of such tasks, alternating between CPU and GPU work. This mechanism +can also be used to re-run (part of) the GPU work. + +The "next tasks" to run are essentially structured as a stack, such +that +- `CUDAScopedContextAcquire`/`CUDAScopedContextTask::pushNextTask()` + pushes a new functor on top of the stack +- Completion of both the asynchronous work and the queueing function + pops the top task of the stack and enqueues it (so that TBB + eventually runs the task) + * Technically the task is made eligible to run when all copies of + `edm::WaitingTaskWithArenaHolder` of the acquire() (or "previous" + function) have either been destructed or their `doneWaiting()` has + been called + * The code calling `acquire()` or the functor holds one copy of + `edm::WaitingTaskWithArenaHolder` so it is guaranteed that the + next function will not run before the earlier one has finished + + +Below is an example how to push a functor on top of the stack of tasks +to run next (following the example of the previous section) +```cpp +void FooProducerCUDA::acquire(...) { + ... + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... + }); + ... +} +``` + +In this case the `ctx`argument to the function is a +`CUDAScopedContexTask` object constructed by the TBB task calling the +user-given function. It follows that the current device and CUDA +stream have been set up already. The `pushNextTask()` can be called +many times. On each invocation the `pushNextTask()` pushes a new task +on top of the stack (i.e. in front of the chain). It follows that in +```cpp +void FooProducerCUDA::acquire(...) { + ... + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 1 + }); + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 2 + }); + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 3 + }); + ... +} +``` +the functions will be run in the order 3, 2, 1. 
+ +**Note** that the `CUDAService` is **not** available (nor is any other +service) in these intermediate tasks. In the near future memory +allocations etc. will be made possible by taking them out from the +`CUDAService`. + +The `CUDAScopedContextAcquire`/`CUDAScopedContextTask` have also a +more generic member function, `replaceWaitingTaskHolder()`, that can +be used to just replace the currently-hold +`edm::WaitingTaskWithArenaHolder` (that will get notified by the +callback function) with anything. In this case the caller is +responsible of creating the task(s) and setting up the chain of them. + + +#### Transferring GPU data to CPU + +The GPU->CPU data transfer needs synchronization to ensure the CPU +memory to have all data before putting that to the event. This means +the `ExternalWork` needs to be used along +* In `acquire()` + * (allocate CPU memory buffers) + * Queue all GPU->CPU transfers asynchronously +* In `produce()` + * If needed, read additional CPU products (e.g. from `edm::Ref`s) + * Reformat data back to legacy data formats + * Note: `CUDAScopedContextProduce` is **not** needed in `produce()` + +#### Synchronizing between CUDA streams + +In case the producer needs input data that were produced in two (or +more) CUDA streams, these streams have to be synchronized. Here this +synchronization is achieved with CUDA events. + +Each `CUDAProduct` constains also a CUDA event object. The call to +`CUDAScopedContextProduce::wrap()` will *record* the event in the CUDA +stream. This means that when all work queued to the CUDA stream up to +that point has been finished, the CUDA event becomes *occurred*. Then, +in +`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +if the `CUDAProduct` to get from has a different CUDA stream than +the +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`, +`cudaStreamWaitEvent(stream, event)` is called. This means that all +subsequent work queued to the CUDA stream will wait for the CUDA event +to become occurred. Therefore this subsequent work can assume that the +to-be-getted CUDA product exists. + + +### CUDA ESProduct + +Conditions data can be transferred to the device with the following +pattern. + +1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload") +2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory +3. The wrapper should have a function returning the payload on the + device memory. The function should transfer the data to the device + asynchronously with the help of `CUDAESProduct`. + +#### Example + +```cpp +#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" + +// Declare the struct for the payload to be transferred. Here the +// example is an array with (potentially) dynamic size. Note that all of +// below becomes simpler if the array has compile-time size. +struct ESProductExampleCUDA { + float *someData; + unsigned int size; +}; + +// Declare the wrapper ESProduct. The corresponding ESProducer should +// produce objects of this type. 
+class ESProductExampleCUDAWrapper { +public: + // Constructor takes the standard CPU ESProduct, and transforms the + // necessary data to array(s) in pinned host memory + ESProductExampleCUDAWrapper(ESProductExample const&); + + // Deallocates all pinned host memory + ~ESProductExampleCUDAWrapper(); + + // Function to return the actual payload on the memory of the current device + ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const; + +private: + // Holds the data in pinned CPU memory + float *someData_; + unsigned int size_; + + // Helper struct to hold all information that has to be allocated and + // deallocated per device + struct GPUData { + // Destructor should free all member pointers + ~GPUData(); + // internal pointers are on device, struct itself is on CPU + ESProductExampleCUDA *esproductHost = nullptr; + // internal pounters and struct are on device + ESProductExampleCUDA *esproductDevice = nullptr; + }; + + // Helper that takes care of complexity of transferring the data to + // multiple devices + CUDAESProduct gpuData_; +}; + +ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) { + cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS)); + // fill someData_ and size_ from cpuProduct +} + +ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const { + // CUDAESProduct essentially holds an array of GPUData objects, + // one per device. If the data have already been transferred to the + // current device (or the transfer has been queued), the helper just + // returns a reference to that GPUData object. Otherwise, i.e. data are + // not yet on the current device, the helper calls the lambda to do the + // necessary memory allocations and to queue the transfers. + auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) { + // Allocate memory. Currently this can be with the CUDA API, + // sometime we'll migrate to the caching allocator. Assumption is + // that IOV changes are rare enough that adding global synchronization + // points is not that bad (for now). + + // Allocate the payload object on pinned host memory. + cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA))); + // Allocate the payload array(s) on device memory. + cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS)); + + // Allocate the payload object on the device memory. + cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductDevice))); + + // Complete the host-side information on the payload + data.cablingMapHost->size = this->size_; + + + // Transfer the payload, first the array(s) ... + cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream)); + // ... 
and then the payload object + cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproduceHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream)); +}); + + // Returns the payload object on the memory of the current device + return data.esproductDevice; +} + +// Destructor frees all member pointers +ESProductExampleCUDA::GPUData::~GPUData() { + if(esproductHost != nullptr) { + cudaCheck(cudaFree(esproductHost->someData)); + cudaCheck(cudaFreeHost(esproductHost)); + } + cudaCheck(cudaFree(esProductDevice)); +} + +``` diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextState.h b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h new file mode 100644 index 0000000000000..b3c20dcb73159 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h @@ -0,0 +1,57 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h +#define HeterogeneousCore_CUDACore_CUDAContextState_h + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +#include + +/** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. + */ +class CUDAContextState { +public: + CUDAContextState() = default; + ~CUDAContextState() = default; + + CUDAContextState(const CUDAContextState&) = delete; + CUDAContextState& operator=(const CUDAContextState&) = delete; + CUDAContextState(CUDAContextState&&) = delete; + CUDAContextState& operator=(CUDAContextState&& other) = delete; + +private: + friend class CUDAScopedContextAcquire; + friend class CUDAScopedContextProduce; + friend class CUDAScopedContextTask; + + void set(int device, cudautils::SharedStreamPtr stream) { + throwIfStream(); + device_ = device; + stream_ = std::move(stream); + } + + int device() const { return device_; } + + const cudautils::SharedStreamPtr& streamPtr() const { + throwIfNoStream(); + return stream_; + } + + cudautils::SharedStreamPtr releaseStreamPtr() { + throwIfNoStream(); + // This function needs to effectively reset stream_ (i.e. stream_ + // must be empty after this function). This behavior ensures that + // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the next event), and is checked at run time. 
+ return std::move(stream_); + } + + void throwIfStream() const; + void throwIfNoStream() const; + + cudautils::SharedStreamPtr stream_; + int device_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h new file mode 100644 index 0000000000000..b8b230e510fa3 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h @@ -0,0 +1,100 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAESProduct_h +#define HeterogeneousCore_CUDACore_CUDAESProduct_h + +#include +#include +#include +#include + +#include "FWCore/Concurrency/interface/hardware_pause.h" +#include "FWCore/Utilities/interface/thread_safety_macros.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" + +template +class CUDAESProduct { +public: + CUDAESProduct() : gpuDataPerDevice_(cudautils::cudaDeviceCount()) { + for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { + gpuDataPerDevice_[i].m_event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + } + ~CUDAESProduct() = default; + + // transferAsync should be a function of (T&, cudaStream_t) + // which enqueues asynchronous transfers (possibly kernels as well) + // to the CUDA stream + template + const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { + auto device = cudautils::currentDevice(); + + auto& data = gpuDataPerDevice_[device]; + + // If GPU data has already been filled, we can return it + // immediately + if (not data.m_filled.load()) { + // It wasn't, so need to fill it + std::scoped_lock lk{data.m_mutex}; + + if (data.m_filled.load()) { + // Other thread marked it filled while we were locking the mutex, so we're free to return it + return data.m_data; + } + + if (data.m_fillingStream != nullptr) { + // Someone else is filling + + // Check first if the recorded event has occurred + if (cudautils::eventIsOccurred(data.m_event.get())) { + // It was, so data is accessible from all CUDA streams on + // the device. Set the 'filled' for all subsequent calls and + // return the value + auto should_be_false = data.m_filled.exchange(true); + assert(not should_be_false); + data.m_fillingStream = nullptr; + } else if (data.m_fillingStream != cudaStream) { + // Filling is still going on. For other CUDA stream, add + // wait on the CUDA stream and return the value. Subsequent + // work queued on the stream will wait for the event to + // occur (i.e. transfer to finish). + cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0), + "Failed to make a stream to wait for an event"); + } + // else: filling is still going on. But for the same CUDA + // stream (which would be a bit strange but fine), we can just + // return as all subsequent work should be enqueued to the + // same CUDA stream (or stream to be explicitly synchronized + // by the caller) + } else { + // Now we can be sure that the data is not yet on the GPU, and + // this thread is the first to try that. 
+ transferAsync(data.m_data, cudaStream); + assert(data.m_fillingStream == nullptr); + data.m_fillingStream = cudaStream; + // Now the filling has been enqueued to the cudaStream, so we + // can return the GPU data immediately, since all subsequent + // work must be either enqueued to the cudaStream, or the cudaStream + // must be synchronized by the caller + } + } + + return data.m_data; + } + +private: + struct Item { + mutable std::mutex m_mutex; + CMS_THREAD_GUARD(m_mutex) mutable cudautils::SharedEventPtr m_event; + // non-null if some thread is already filling (cudaStream_t is just a pointer) + CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr; + mutable std::atomic m_filled = false; // easy check if data has been filled already or not + CMS_THREAD_GUARD(m_mutex) mutable T m_data; + }; + + std::vector gpuDataPerDevice_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h new file mode 100644 index 0000000000000..758218bb958a2 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -0,0 +1,252 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h +#define HeterogeneousCore_CUDACore_CUDAScopedContext_h + +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/EDPutToken.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +namespace cudatest { + class TestCUDAScopedContext; +} + +namespace impl { + // This class is intended to be derived by other CUDAScopedContext*, not for general use + class CUDAScopedContextBase { + public: + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + protected: + // The constructors set the current device device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). 
+ explicit CUDAScopedContextBase(edm::StreamID streamID); + + explicit CUDAScopedContextBase(const CUDAProductBase& data); + + explicit CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream); + + private: + int currentDevice_; + cudautils::SharedStreamPtr stream_; + }; + + class CUDAScopedContextGetterBase : public CUDAScopedContextBase { + public: + template + const T& get(const CUDAProduct& data) { + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + CUDAScopedContextGetterBase(Args&&... args) : CUDAScopedContextBase(std::forward(args)...) {} + + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; + + class CUDAScopedContextHolderHelper { + public: + CUDAScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, CUDAContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; +} // namespace impl + +/** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
+ */ +class CUDAScopedContextAcquire : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) + explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to create a new CUDA stream, and the context is needed after acquire() + explicit CUDAScopedContextAcquire(edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + CUDAContextState& state) + : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) + explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() + explicit CUDAScopedContextAcquire(const CUDAProductBase& data, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + CUDAContextState& state) + : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + ~CUDAScopedContextAcquire(); + + template + void pushNextTask(F&& f) { + if (contextState_ == nullptr) + throwNoState(); + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + +private: + void throwNoState(); + + impl::CUDAScopedContextHolderHelper holderHelper_; + CUDAContextState* contextState_ = nullptr; +}; + +/** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ +class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to create a new CUDA stream (non-ExternalWork module) + explicit CUDAScopedContextProduce(edm::StreamID streamID) : CUDAScopedContextGetterBase(streamID) {} + + /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) + explicit CUDAScopedContextProduce(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} + + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit CUDAScopedContextProduce(CUDAContextState& state) + : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + ~CUDAScopedContextProduce(); + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + // + // CUDAProduct constructor records CUDA event to the CUDA + // stream. The event will become "occurred" after all work queued + // to the stream before this point has been finished. + std::unique_ptr> ret(new CUDAProduct(device(), streamPtr(), std::move(data))); + createEventIfStreamBusy(); + ret->setEvent(event_); + return ret; + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) {
+    auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward(args)...);
+    createEventIfStreamBusy();
+    const_cast(*ret).setEvent(event_);
+    return ret;
+  }
+
+private:
+  friend class cudatest::TestCUDAScopedContext;
+
+  // This constructor is only meant for testing
+  explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
+      : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
+
+  void createEventIfStreamBusy();
+
+  cudautils::SharedEventPtr event_;
+};
+
+/**
+ * The aim of this class is to do necessary per-task "initialization" for tasks created in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+class CUDAScopedContextTask : public impl::CUDAScopedContextBase {
+public:
+  /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+  explicit CUDAScopedContextTask(CUDAContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+      : CUDAScopedContextBase(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
+        holderHelper_{std::move(waitingTaskHolder)},
+        contextState_{state} {}
+
+  ~CUDAScopedContextTask();
+
+  template
+  void pushNextTask(F&& f) {
+    holderHelper_.pushNextTask(std::forward(f), contextState_);
+  }
+
+  void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+    holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+  }
+
+private:
+  impl::CUDAScopedContextHolderHelper holderHelper_;
+  CUDAContextState const* contextState_;
+};
+
+/**
+ * The aim of this class is to do necessary per-event "initialization" in analyze():
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
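+ *
+ * A minimal usage sketch (illustrative only; MyAnalyzer and srcToken_ are
+ * hypothetical names, not part of this package):
+ * \code
+ * void MyAnalyzer::analyze(edm::Event const& iEvent, edm::EventSetup const&) {
+ *   CUDAScopedContextAnalyze ctx{iEvent.get(srcToken_)};
+ *   auto const& deviceData = ctx.get(iEvent, srcToken_);
+ *   // ... launch kernels or copies on ctx.stream() ...
+ * }
+ * \endcode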
+ */ +class CUDAScopedContextAnalyze : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to (possibly) re-use a CUDA stream + explicit CUDAScopedContextAnalyze(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} +}; + +namespace impl { + template + void CUDAScopedContextHolderHelper::pushNextTask(F&& f, CUDAContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(CUDAScopedContextTask{state, std::move(h)}); + })}); + } +} // namespace impl + +#endif diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py new file mode 100644 index 0000000000000..ded114e2fddfe --- /dev/null +++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py @@ -0,0 +1,34 @@ +import FWCore.ParameterSet.Config as cms + +_cuda_enabled_cached = None + +def _switch_cuda(): + global _cuda_enabled_cached + if _cuda_enabled_cached is None: + import os + _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0) + return (_cuda_enabled_cached, 2) + +class SwitchProducerCUDA(cms.SwitchProducer): + def __init__(self, **kargs): + super(SwitchProducerCUDA,self).__init__( + dict(cpu = cms.SwitchProducer.getCpu(), + cuda = _switch_cuda), + **kargs + ) +cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA") + +if __name__ == "__main__": + import unittest + + class TestSwitchProducerCUDA(unittest.TestCase): + def testPickle(self): + import pickle + sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar")) + pkl = pickle.dumps(sp) + unpkl = pickle.loads(pkl) + self.assertEqual(unpkl.cpu.type_(), "Foo") + self.assertEqual(unpkl.cuda.type_(), "Bar") + + unittest.main() + diff --git a/HeterogeneousCore/CUDACore/src/CUDAContextState.cc b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc new file mode 100644 index 0000000000000..bcdbae89d9094 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc @@ -0,0 +1,14 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "FWCore/Utilities/interface/Exception.h" + +void CUDAContextState::throwIfStream() const { + if (stream_) { + throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state"; + } +} + +void CUDAContextState::throwIfNoStream() const { + if (not stream_) { + throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state"; + } +} diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc new file mode 100644 index 0000000000000..df56c318e22fa --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -0,0 +1,134 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "chooseCUDADevice.h" + +namespace { + struct CallbackData { + 
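+    // Bundles what the stream callback below needs: the framework task to
+    // signal and the device on which the asynchronous work was queued.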
+    edm::WaitingTaskWithArenaHolder holder;
+    int device;
+  };
+
+  void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) {
+    std::unique_ptr guard{reinterpret_cast(data)};
+    edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder;
+    int device = guard->device;
+    if (status == cudaSuccess) {
+      LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream "
+                                    << streamId;
+      waitingTaskHolder.doneWaiting(nullptr);
+    } else {
+      // wrap the exception in a try-catch block to let GDB "catch throw" break on it
+      try {
+        auto error = cudaGetErrorName(status);
+        auto message = cudaGetErrorString(status);
+        throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device
+                                          << " error " << error << ": " << message;
+      } catch (cms::Exception&) {
+        waitingTaskHolder.doneWaiting(std::current_exception());
+      }
+    }
+  }
+}  // namespace
+
+namespace impl {
+  CUDAScopedContextBase::CUDAScopedContextBase(edm::StreamID streamID)
+      : currentDevice_(cudacore::chooseCUDADevice(streamID)) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+    stream_ = cudautils::getCUDAStreamCache().getCUDAStream();
+  }
+
+  CUDAScopedContextBase::CUDAScopedContextBase(const CUDAProductBase& data) : currentDevice_(data.device()) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+    if (data.mayReuseStream()) {
+      stream_ = data.streamPtr();
+    } else {
+      stream_ = cudautils::getCUDAStreamCache().getCUDAStream();
+    }
+  }
+
+  CUDAScopedContextBase::CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream)
+      : currentDevice_(device), stream_(std::move(stream)) {
+    cudaCheck(cudaSetDevice(currentDevice_));
+  }
+
+  ////////////////////
+
+  void CUDAScopedContextGetterBase::synchronizeStreams(int dataDevice,
+                                                       cudaStream_t dataStream,
+                                                       bool available,
+                                                       cudaEvent_t dataEvent) {
+    if (dataDevice != device()) {
+      // Eventually replace with prefetch to current device (assuming unified memory works)
+      // If we won't go to unified memory, need to figure out something else...
+      throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
+    }
+
+    if (dataStream != stream()) {
+      // Different streams, need to synchronize
+      if (not available) {
+        // The event has not yet occurred, so synchronization is needed here.
+        // Synchronization is done by making the CUDA stream wait for an
+        // event, so all subsequent work in the stream will run only after
+        // the event has "occurred" (i.e. the data product became available).
+        cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream wait for an event");
+      }
+    }
+  }
+
+  void CUDAScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
+    cudaCheck(
+        cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0));
+  }
+}  // namespace impl
+
+////////////////////
+
+CUDAScopedContextAcquire::~CUDAScopedContextAcquire() {
+  holderHelper_.enqueueCallback(device(), stream());
+  if (contextState_) {
+    contextState_->set(device(), std::move(streamPtr()));
+  }
+}
+
+void CUDAScopedContextAcquire::throwNoState() {
+  throw cms::Exception("LogicError")
+      << "Calling CUDAScopedContextAcquire::pushNextTask() requires CUDAScopedContextAcquire to be constructed with "
+         "CUDAContextState, but that was not the case";
+}
+
+////////////////////
+
+CUDAScopedContextProduce::~CUDAScopedContextProduce() {
+  if (event_) {
+    cudaCheck(cudaEventRecord(event_.get(), stream()));
+  }
+}
+
+void CUDAScopedContextProduce::createEventIfStreamBusy() {
+  if (event_) {
+    return;
+  }
+  auto ret = cudaStreamQuery(stream());
+  if (ret == cudaSuccess) {
+    return;
+  }
+  if (ret != cudaErrorNotReady) {
+    // cudaErrorNotReady indicates that the stream is busy, and thus
+    // is not an error
+    cudaCheck(ret);
+  }
+
+  event_ = cudautils::getCUDAEventCache().getCUDAEvent();
+}
+
+////////////////////
+
+CUDAScopedContextTask::~CUDAScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); }
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
new file mode 100644
index 0000000000000..7e9ac2faed380
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -0,0 +1,18 @@
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id) {
+    edm::Service cudaService;
+
+    // For starters we "statically" assign the device based on
+    // edm::Stream number. This is suboptimal if the number of
+    // edm::Streams is not a multiple of the number of CUDA devices
+    // (and even then there is no load balancing).
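+    //
+    // As a concrete (illustrative) example: with 4 edm::Streams and 3
+    // devices, the modulo assignment below maps streams 0,1,2,3 to
+    // devices 0,1,2,0, so device 0 serves two streams while devices 1
+    // and 2 serve one each.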
+ // + // TODO: improve the "assignment" logic + return id % cudaService->numberOfDevices(); + } +} // namespace cudacore diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h new file mode 100644 index 0000000000000..bb09c302af7f5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h @@ -0,0 +1,10 @@ +#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h +#define HeterogeneousCore_CUDACore_chooseCUDADevice_h + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id); +} + +#endif diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml new file mode 100644 index 0000000000000..a6f34c70e8822 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu new file mode 100644 index 0000000000000..bd9ce4f29fba3 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -0,0 +1,134 @@ +/** + * The purpose of this test program is to ensure that the logic for + * CUDA event use in CUDAProduct and CUDAScopedContext + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" + +namespace { + constexpr int ARRAY_SIZE = 20000000; + constexpr int NLOOPS = 10; +} // namespace + +__global__ void kernel_looping(float *point, unsigned int num) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + + for (int iloop = 0; iloop < NLOOPS; ++iloop) { + for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x) { + point[offset] += 1; + } + } +} + +int main() { + exitSansCUDADevices(); + + constexpr bool debug = false; + + float *dev_points1; + float *host_points1; + cudaStream_t stream1, stream2; + cudaEvent_t event1, event2; + + cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float))); + cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float))); + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + cudaEventCreate(&event1); + cudaEventCreate(&event2); + + for (size_t j = 0; j < ARRAY_SIZE; ++j) { + host_points1[j] = static_cast(j); + } + + cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1)); + kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE); + if (debug) + std::cout << "Kernel launched on stream1" << std::endl; + + auto status = cudaStreamQuery(stream1); + if (debug) + std::cout << "Stream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + cudaEventRecord(event1, stream1); + status = cudaEventQuery(event1); + if (debug) + std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaErrorNotReady); + + status = cudaStreamQuery(stream2); + if (debug) + std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaSuccess); + if (debug) { + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? 
" << (status == cudaSuccess) + << std::endl; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + } + + cudaStreamWaitEvent(stream2, event1, 0); + if (debug) + std::cout << "\nStream2 waiting for event1" << std::endl; + status = cudaStreamQuery(stream2); + if (debug) + std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaErrorNotReady); + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + if (debug) + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaErrorNotReady); + if (debug) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + } + + status = cudaStreamQuery(stream1); + if (debug) { + std::cout << "\nStream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) + << std::endl; + std::cout << "Synchronizing stream1" << std::endl; + } + assert(status == cudaErrorNotReady); + cudaStreamSynchronize(stream1); + if (debug) + std::cout << "Synchronized stream1" << std::endl; + + status = cudaEventQuery(event1); + if (debug) + std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaSuccess); + status = cudaEventQuery(event2); + if (debug) + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? 
" << (status == cudaSuccess) + << std::endl; + assert(status == cudaSuccess); + + cudaFree(dev_points1); + cudaFreeHost(host_points1); + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + cudaEventDestroy(event1); + cudaEventDestroy(event2); + + return 0; +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc new file mode 100644 index 0000000000000..507824f8bfdb7 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -0,0 +1,132 @@ +#include "catch.hpp" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/Concurrency/interface/WaitingTask.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" + +#include "test_CUDAScopedContextKernels.h" + +namespace cudatest { + class TestCUDAScopedContext { + public: + static CUDAScopedContextProduce make(int dev, bool createEvent) { + cudautils::SharedEventPtr event; + if (createEvent) { + event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + } + }; +} // namespace cudatest + +namespace { + std::unique_ptr> produce(int device, int* d, int* h) { + auto ctx = cudatest::TestCUDAScopedContext::make(device, true); + cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream())); + testCUDAScopedContextKernels_single(d, ctx.stream()); + return ctx.wrap(d); + } +} // namespace + +TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { + exitSansCUDADevices(); + + constexpr int defaultDevice = 0; + { + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + + SECTION("Construct from device ID") { REQUIRE(cudautils::currentDevice() == defaultDevice); } + + SECTION("Wrap T to CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); + REQUIRE(dataPtr.get() != nullptr); + REQUIRE(dataPtr->device() == ctx.device()); + REQUIRE(dataPtr->stream() == ctx.stream()); + } + + SECTION("Construct from from CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + + CUDAScopedContextProduce ctx2{data}; + REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(ctx2.stream() == data.stream()); + + // Second use of a product should lead to new stream + CUDAScopedContextProduce ctx3{data}; + REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(ctx3.stream() != data.stream()); + } + + SECTION("Storing state in CUDAContextState") { + CUDAContextState ctxstate; + { // acquire + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + edm::WaitingTaskWithArenaHolder dummy{ + edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr) {})}; + CUDAScopedContextAcquire ctx2{data, std::move(dummy), 
ctxstate}; + } + + { // produce + CUDAScopedContextProduce ctx2{ctxstate}; + REQUIRE(cudautils::currentDevice() == ctx.device()); + REQUIRE(ctx2.stream() == ctx.stream()); + } + } + + SECTION("Joining multiple CUDA streams") { + cudautils::ScopedSetDevice setDeviceForThisScope(defaultDevice); + + // Mimick a producer on the first CUDA stream + int h_a1 = 1; + auto d_a1 = cudautils::make_device_unique(nullptr); + auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1); + + // Mimick a producer on the second CUDA stream + int h_a2 = 2; + auto d_a2 = cudautils::make_device_unique(nullptr); + auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2); + + REQUIRE(wprod1->stream() != wprod2->stream()); + + // Mimick a third producer "joining" the two streams + CUDAScopedContextProduce ctx2{*wprod1}; + + auto prod1 = ctx2.get(*wprod1); + auto prod2 = ctx2.get(*wprod2); + + auto d_a3 = cudautils::make_device_unique(nullptr); + testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); + cudaCheck(cudaStreamSynchronize(ctx2.stream())); + REQUIRE(wprod2->isAvailable()); + REQUIRE(cudautils::eventIsOccurred(wprod2->event())); + + h_a1 = 0; + h_a2 = 0; + int h_a3 = 0; + + cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + + REQUIRE(h_a1 == 2); + REQUIRE(h_a2 == 4); + REQUIRE(h_a3 == 6); + } + } + + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + // Note: CUDA resources are cleaned up by the destructors of the global cache objects +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu new file mode 100644 index 0000000000000..330e83dfd4960 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu @@ -0,0 +1,13 @@ +#include "test_CUDAScopedContextKernels.h" + +namespace { + __global__ void single_mul(int *d) { d[0] = d[0] * 2; } + + __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; } +} // namespace + +void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); } + +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) { + join_add<<<1, 1, 0, stream>>>(d1, d2, d3); +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h new file mode 100644 index 0000000000000..527a4ce71e1cb --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h +#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h + +#include + +void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream); +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream); + +#endif diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc new file mode 100644 index 0000000000000..2e1027598a4de --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_main.cc @@ -0,0 +1,31 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h" +#include 
"FWCore/PluginManager/interface/PluginManager.h" +#include "FWCore/PluginManager/interface/standard.h" +#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h" + +class ServiceRegistryListener : public Catch::TestEventListenerBase { +public: + using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor + + void testRunStarting(Catch::TestRunInfo const& testRunInfo) override { + edmplugin::PluginManager::configure(edmplugin::standard::config()); + + const std::string config{ + R"_(import FWCore.ParameterSet.Config as cms +process = cms.Process('Test') +process.CUDAService = cms.Service('CUDAService') +)_"}; + + std::unique_ptr params; + edm::makeParameterSets(config, params); + edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params))); + operate_.reset(new edm::ServiceRegistry::Operate(tempToken)); + } + +private: + std::unique_ptr operate_; +}; +CATCH_REGISTER_LISTENER(ServiceRegistryListener); diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml new file mode 100644 index 0000000000000..9320cad14f285 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml new file mode 100644 index 0000000000000..041ed25ba134a --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp new file mode 100644 index 0000000000000..5a65575873116 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp @@ -0,0 +1,23 @@ +// C++ standard headers +#include +#include + +// CUDA headers +#include + +// CMSSW headers +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +int main() { + int devices = 0; + cudaCheck(cudaGetDeviceCount(&devices)); + + for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + std::cout << std::setw(4) << i << " " << std::setw(2) << properties.major << "." << properties.minor << " " + << properties.name << std::endl; + } + + return 0; +} diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp new file mode 100644 index 0000000000000..d901e1850bceb --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#include + +int main() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. Currently that would be easiest to achieve + // in CUDAService though. 
+ for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if (properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h new file mode 100644 index 0000000000000..625ce40fdcdc9 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_CUDAServices_CUDAService_h +#define HeterogeneousCore_CUDAServices_CUDAService_h + +#include +#include + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace edm { + class ParameterSet; + class ActivityRegistry; + class ConfigurationDescriptions; +} // namespace edm + +/** + * TODO: + * - CUDA stream management? + * * Not really needed until we want to pass CUDA stream objects from one module to another + * * Which is not really needed until we want to go for "streaming mode" + * * Until that framework's inter-module synchronization is safe (but not necessarily optimal) + * - Management of (preallocated) memory? + */ +class CUDAService { +public: + CUDAService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~CUDAService(); + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + bool enabled() const { return enabled_; } + + int numberOfDevices() const { return numberOfDevices_; } + + // major, minor + std::pair computeCapability(int device) { return computeCapabilities_.at(device); } + + // Returns the id of device with most free memory. If none is found, returns -1. + int deviceWithMostFreeMemory() const; + +private: + int numberOfDevices_ = 0; + std::vector> computeCapabilities_; + bool enabled_ = false; +}; + +#endif diff --git a/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h new file mode 100644 index 0000000000000..b563b98b516cf --- /dev/null +++ b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDAServices_numberOfCUDADevices_h +#define HeterogeneousCore_CUDAServices_numberOfCUDADevices_h + +// Returns the number of CUDA devices +// The difference wrt. the standard CUDA function is that if +// CUDAService is disabled, this function returns 0. 
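+//
+// A minimal usage sketch (illustrative only):
+//   if (numberOfCUDADevices() > 0) {
+//     // schedule the CUDA version of the work
+//   }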
+int numberOfCUDADevices(); + +#endif diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml index afcf86afdef75..81d4f20331ce3 100644 --- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml @@ -1,18 +1,15 @@ -#Skip building plugins by dropping all files for none-AMD64 build - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc new file mode 100644 index 0000000000000..6d8527935e334 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc @@ -0,0 +1,107 @@ +#include + +#include + +#include "DataFormats/Provenance/interface/ModuleDescription.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace edm { + class StreamContext; +} + +class CUDAMonitoringService { +public: + CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~CUDAMonitoringService() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void postModuleConstruction(edm::ModuleDescription const& desc); + void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc); + void postEvent(edm::StreamContext const& sc); + +private: + int numberOfDevices_ = 0; +}; + +CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { + // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor + edm::Service cudaService; + if (!cudaService->enabled()) + return; + numberOfDevices_ = cudaService->numberOfDevices(); + + if (config.getUntrackedParameter("memoryConstruction")) { + registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction); + } + if (config.getUntrackedParameter("memoryBeginStream")) { + registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream); + } + if (config.getUntrackedParameter("memoryPerEvent")) { + registry.watchPostEvent(this, &CUDAMonitoringService::postEvent); + } +} + +void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.addUntracked("memoryConstruction", false) + ->setComment("Print memory information for each device after the construction of each module"); + desc.addUntracked("memoryBeginStream", true) + ->setComment("Print memory information for each device after the beginStream() of each module"); + desc.addUntracked("memoryPerEvent", true) + ->setComment("Print memory information for each device after each event"); + + descriptions.add("CUDAMonitoringService", desc); + descriptions.setComment( + "The memory information is the global state of the device. 
This gets confusing if there are multiple processes " + "running on the same device. Probably the information retrieval should be re-thought?"); +} + +// activity handlers +namespace { + template + void dumpUsedMemory(T& log, int num) { + int old = 0; + cudaCheck(cudaGetDevice(&old)); + for (int i = 0; i < num; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + log << "\n" + << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20) + << " MB total"; + } + cudaCheck(cudaSetDevice(old)); + } +} // namespace + +void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after event"; + dumpUsedMemory(log, numberOfDevices_); +} + +DEFINE_FWK_SERVICE(CUDAMonitoringService); diff --git a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc index ec8c4deac4d4d..29fa1ab959025 100644 --- a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc +++ b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc @@ -41,6 +41,7 @@ #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/ProductKindOfType.h" #include "FWCore/Utilities/interface/TimeOfDay.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" using namespace std::string_literals; @@ -285,9 +286,8 @@ class NVProfilerService { std::vector highlightModules_; const bool showModulePrefetching_; - bool skipFirstEvent_; + const bool skipFirstEvent_; - unsigned int concurrentStreams_; std::atomic globalFirstEventDone_ = false; std::vector> streamFirstEventDone_; std::vector event_; // per-stream event ranges @@ -295,49 +295,22 @@ class NVProfilerService { // use a tbb::concurrent_vector rather than an std::vector because its final size is not known tbb::concurrent_vector global_modules_; // global per-module events -private: - struct Domains { - nvtxDomainHandle_t global; - std::vector stream; - - Domains(NVProfilerService* service) { - global = nvtxDomainCreate("EDM Global"); - allocate_streams(service->concurrentStreams_); - } - - ~Domains() { - nvtxDomainDestroy(global); - for (unsigned int sid = 0; sid < stream.size(); ++sid) { - nvtxDomainDestroy(stream[sid]); - } - } - - void allocate_streams(unsigned int streams) { - stream.resize(streams); - for (unsigned int sid = 0; sid < streams; ++sid) { - stream[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str()); - } - } - }; - - // allow access to concurrentStreams_ - friend struct Domains; - - tbb::enumerable_thread_specific domains_; - - nvtxDomainHandle_t global_domain() { return domains_.local().global; } - - nvtxDomainHandle_t stream_domain(unsigned int sid) { return 
domains_.local().stream.at(sid); } + nvtxDomainHandle_t global_domain_; // NVTX domain for global EDM transitions + std::vector stream_domain_; // NVTX domains for per-EDM-stream transitions }; NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) : highlightModules_(config.getUntrackedParameter>("highlightModules")), showModulePrefetching_(config.getUntrackedParameter("showModulePrefetching")), - skipFirstEvent_(config.getUntrackedParameter("skipFirstEvent")), - concurrentStreams_(0), - domains_(this) { + skipFirstEvent_(config.getUntrackedParameter("skipFirstEvent")) { + // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor + edm::Service cudaService; + std::sort(highlightModules_.begin(), highlightModules_.end()); + // create the NVTX domain for global EDM transitions + global_domain_ = nvtxDomainCreate("EDM Global"); + // enables profile collection; if profiling is already enabled it has no effect if (not skipFirstEvent_) { cudaProfilerStart(); @@ -491,7 +464,13 @@ NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::Activ registry.watchPostEventReadFromSource(this, &NVProfilerService::postEventReadFromSource); } -NVProfilerService::~NVProfilerService() { cudaProfilerStop(); } +NVProfilerService::~NVProfilerService() { + for (unsigned int sid = 0; sid < stream_domain_.size(); ++sid) { + nvtxDomainDestroy(stream_domain_[sid]); + } + nvtxDomainDestroy(global_domain_); + cudaProfilerStop(); +} void NVProfilerService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -517,17 +496,20 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { out << "preallocate: " << bounds.maxNumberOfConcurrentRuns() << " concurrent runs, " << bounds.maxNumberOfConcurrentLuminosityBlocks() << " luminosity sections, " << bounds.maxNumberOfStreams() << " streams\nrunning on" << bounds.maxNumberOfThreads() << " threads"; - nvtxDomainMark(global_domain(), out.str().c_str()); + nvtxDomainMark(global_domain_, out.str().c_str()); - concurrentStreams_ = bounds.maxNumberOfStreams(); - for (auto& domain : domains_) { - domain.allocate_streams(concurrentStreams_); + auto concurrentStreams = bounds.maxNumberOfStreams(); + // create the NVTX domains for per-EDM-stream transitions + stream_domain_.resize(concurrentStreams); + for (unsigned int sid = 0; sid < concurrentStreams; ++sid) { + stream_domain_[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str()); } - event_.resize(concurrentStreams_); - stream_modules_.resize(concurrentStreams_); + + event_.resize(concurrentStreams); + stream_modules_.resize(concurrentStreams); if (skipFirstEvent_) { globalFirstEventDone_ = false; - std::vector> tmp(concurrentStreams_); + std::vector> tmp(concurrentStreams); for (auto& element : tmp) std::atomic_init(&element, false); streamFirstEventDone_ = std::move(tmp); @@ -536,86 +518,86 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { void NVProfilerService::preBeginJob(edm::PathsAndConsumesOfModulesBase const& pathsAndConsumes, edm::ProcessContext const& pc) { - nvtxDomainMark(global_domain(), "preBeginJob"); + nvtxDomainMark(global_domain_, "preBeginJob"); // FIXME this probably works only in the absence of subprocesses // size() + 1 because pathsAndConsumes.allModules() does not include the source unsigned int modules = pathsAndConsumes.allModules().size() + 1; 
global_modules_.resize(modules, nvtxInvalidRangeId); - for (unsigned int sid = 0; sid < concurrentStreams_; ++sid) { + for (unsigned int sid = 0; sid < stream_modules_.size(); ++sid) { stream_modules_[sid].resize(modules, nvtxInvalidRangeId); } } void NVProfilerService::postBeginJob() { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainMark(global_domain(), "postBeginJob"); + nvtxDomainMark(global_domain_, "postBeginJob"); } } void NVProfilerService::postEndJob() { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainMark(global_domain(), "postEndJob"); + nvtxDomainMark(global_domain_, "postEndJob"); } } void NVProfilerService::preSourceEvent(edm::StreamID sid) { if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "source"); + nvtxDomainRangePush(stream_domain_[sid], "source"); } } void NVProfilerService::postSourceEvent(edm::StreamID sid) { if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preSourceLumi(edm::LuminosityBlockIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "source lumi"); + nvtxDomainRangePush(global_domain_, "source lumi"); } } void NVProfilerService::postSourceLumi(edm::LuminosityBlockIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preSourceRun(edm::RunIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "source run"); + nvtxDomainRangePush(global_domain_, "source run"); } } void NVProfilerService::postSourceRun(edm::RunIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preOpenFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), ("open file "s + lfn).c_str()); + nvtxDomainRangePush(global_domain_, ("open file "s + lfn).c_str()); } } void NVProfilerService::postOpenFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preCloseFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), ("close file "s + lfn).c_str()); + nvtxDomainRangePush(global_domain_, ("close file "s + lfn).c_str()); } } void NVProfilerService::postCloseFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } @@ -626,7 +608,7 @@ void NVProfilerService::preModuleBeginStream(edm::StreamContext const& sc, edm:: auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " begin stream"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -634,7 +616,7 @@ void NVProfilerService::postModuleBeginStream(edm::StreamContext const& sc, edm: auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = 
mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -646,7 +628,7 @@ void NVProfilerService::preModuleEndStream(edm::StreamContext const& sc, edm::Mo auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " end stream"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -654,124 +636,124 @@ void NVProfilerService::postModuleEndStream(edm::StreamContext const& sc, edm::M auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } void NVProfilerService::preGlobalBeginRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global begin run"); + nvtxDomainRangePush(global_domain_, "global begin run"); } } void NVProfilerService::postGlobalBeginRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preGlobalEndRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global end run"); + nvtxDomainRangePush(global_domain_, "global end run"); } } void NVProfilerService::postGlobalEndRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preStreamBeginRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream begin run"); + nvtxDomainRangePush(stream_domain_[sid], "stream begin run"); } } void NVProfilerService::postStreamBeginRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preStreamEndRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream end run"); + nvtxDomainRangePush(stream_domain_[sid], "stream end run"); } } void NVProfilerService::postStreamEndRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preGlobalBeginLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global begin lumi"); + nvtxDomainRangePush(global_domain_, "global begin lumi"); } } void NVProfilerService::postGlobalBeginLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } 
void NVProfilerService::preGlobalEndLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global end lumi"); + nvtxDomainRangePush(global_domain_, "global end lumi"); } } void NVProfilerService::postGlobalEndLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preStreamBeginLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream begin lumi"); + nvtxDomainRangePush(stream_domain_[sid], "stream begin lumi"); } } void NVProfilerService::postStreamBeginLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preStreamEndLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); - nvtxDomainRangePush(stream_domain(sid), "stream end lumi"); + nvtxDomainRangePush(stream_domain_[sid], "stream end lumi"); } void NVProfilerService::postStreamEndLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preEvent(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - event_[sid] = nvtxDomainRangeStartColor(stream_domain(sid), "event", nvtxDarkGreen); + event_[sid] = nvtxDomainRangeStartColor(stream_domain_[sid], "event", nvtxDarkGreen); } } void NVProfilerService::postEvent(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangeEnd(stream_domain(sid), event_[sid]); + nvtxDomainRangeEnd(stream_domain_[sid], event_[sid]); event_[sid] = nvtxInvalidRangeId; } else { streamFirstEventDone_[sid] = true; @@ -787,7 +769,7 @@ void NVProfilerService::postEvent(edm::StreamContext const& sc) { void NVProfilerService::prePathEvent(edm::StreamContext const& sc, edm::PathContext const& pc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainMark(global_domain(), ("before path "s + pc.pathName()).c_str()); + nvtxDomainMark(global_domain_, ("before path "s + pc.pathName()).c_str()); } } @@ -796,7 +778,7 @@ void NVProfilerService::postPathEvent(edm::StreamContext const& sc, edm::HLTPathStatus const& hlts) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainMark(global_domain(), ("after path "s + pc.pathName()).c_str()); + nvtxDomainMark(global_domain_, ("after path "s + pc.pathName()).c_str()); } } @@ -807,7 +789,7 @@ void NVProfilerService::preModuleEventPrefetching(edm::StreamContext const& sc, auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " prefetching"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label)); } } @@ -815,7 +797,7 @@ void NVProfilerService::postModuleEventPrefetching(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not 
skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -826,14 +808,14 @@ void NVProfilerService::preModuleConstruction(edm::ModuleDescription const& desc global_modules_.grow_to_at_least(mid + 1); auto const& label = desc.moduleLabel(); auto const& msg = label + " construction"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleConstruction(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -843,14 +825,14 @@ void NVProfilerService::preModuleBeginJob(edm::ModuleDescription const& desc) { auto mid = desc.id(); auto const& label = desc.moduleLabel(); auto const& msg = label + " begin job"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleBeginJob(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -860,14 +842,14 @@ void NVProfilerService::preModuleEndJob(edm::ModuleDescription const& desc) { auto mid = desc.id(); auto const& label = desc.moduleLabel(); auto const& msg = label + " end job"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleEndJob(edm::ModuleDescription const& desc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -879,7 +861,7 @@ void NVProfilerService::preModuleEventAcquire(edm::StreamContext const& sc, edm: auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " acquire"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -887,7 +869,7 @@ void NVProfilerService::postModuleEventAcquire(edm::StreamContext const& sc, edm auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -898,7 +880,7 @@ void NVProfilerService::preModuleEvent(edm::StreamContext const& sc, edm::Module auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); 
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColor(label)); } } @@ -906,7 +888,7 @@ void NVProfilerService::postModuleEvent(edm::StreamContext const& sc, edm::Modul auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -919,7 +901,7 @@ void NVProfilerService::preModuleEventDelayedGet(edm::StreamContext const& sc, e auto const & label = mcc.moduleDescription()->moduleLabel(); auto const & msg = label + " delayed get"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColorLight(label)); } */ } @@ -929,7 +911,7 @@ void NVProfilerService::postModuleEventDelayedGet(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } */ @@ -943,7 +925,7 @@ void NVProfilerService::preEventReadFromSource(edm::StreamContext const& sc, edm auto const & label = mcc.moduleDescription()->moduleLabel(); auto const & msg = label + " read from source"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label)); } */ } @@ -953,7 +935,7 @@ void NVProfilerService::postEventReadFromSource(edm::StreamContext const& sc, ed auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } */ @@ -966,7 +948,7 @@ void NVProfilerService::preModuleStreamBeginRun(edm::StreamContext const& sc, ed auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream begin run"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -974,7 +956,7 @@ void NVProfilerService::postModuleStreamBeginRun(edm::StreamContext const& sc, e auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -986,7 +968,7 @@ void NVProfilerService::preModuleStreamEndRun(edm::StreamContext const& sc, edm: auto 
const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream end run"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -994,7 +976,7 @@ void NVProfilerService::postModuleStreamEndRun(edm::StreamContext const& sc, edm auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1006,7 +988,7 @@ void NVProfilerService::preModuleStreamBeginLumi(edm::StreamContext const& sc, e auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream begin lumi"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -1014,7 +996,7 @@ void NVProfilerService::postModuleStreamBeginLumi(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1026,7 +1008,7 @@ void NVProfilerService::preModuleStreamEndLumi(edm::StreamContext const& sc, edm auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream end lumi"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -1034,7 +1016,7 @@ void NVProfilerService::postModuleStreamEndLumi(edm::StreamContext const& sc, ed auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1044,14 +1026,14 @@ void NVProfilerService::preModuleGlobalBeginRun(edm::GlobalContext const& gc, ed auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global begin run"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalBeginRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1061,14 +1043,14 @@ void NVProfilerService::preModuleGlobalEndRun(edm::GlobalContext const& gc, 
edm: auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global end run"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalEndRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1078,14 +1060,14 @@ void NVProfilerService::preModuleGlobalBeginLumi(edm::GlobalContext const& gc, e auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global begin lumi"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalBeginLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1095,14 +1077,14 @@ void NVProfilerService::preModuleGlobalEndLumi(edm::GlobalContext const& gc, edm auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global end lumi"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalEndLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1113,14 +1095,14 @@ void NVProfilerService::preSourceConstruction(edm::ModuleDescription const& desc global_modules_.grow_to_at_least(mid + 1); auto const& label = desc.moduleLabel(); auto const& msg = label + " construction"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postSourceConstruction(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } diff --git a/HeterogeneousCore/CUDAServices/plugins/plugins.cc b/HeterogeneousCore/CUDAServices/plugins/plugins.cc new file mode 100644 index 0000000000000..d8aefa42e9c99 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/plugins/plugins.cc @@ -0,0 +1,4 @@ +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + 
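[Reviewer note, not part of the patch] The NVProfilerService hunks above are a mechanical change from the global_domain() / stream_domain(sid) accessors to direct use of the global_domain_ / stream_domain_[sid] members; in every case a pre* hook opens an NVTX range in the corresponding domain and caches the returned range id, and the matching post* hook closes that range and resets the slot to nvtxInvalidRangeId. The sketch below illustrates the start/end pairing with the plain NVTX C API; the wrapper name rangeStartColor and the domain name are illustrative only, and nvtxDomainRangeStartColor in the diff is assumed to be a similar thin wrapper around nvtxDomainRangeStartEx.

#include <nvToolsExt.h>
#include <cstdint>

// Hypothetical wrapper, assumed to be similar to nvtxDomainRangeStartColor in NVProfilerService
inline nvtxRangeId_t rangeStartColor(nvtxDomainHandle_t domain, const char* message, uint32_t argbColor) {
  nvtxEventAttributes_t attributes = {};
  attributes.version = NVTX_VERSION;
  attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
  attributes.colorType = NVTX_COLOR_ARGB;
  attributes.color = argbColor;
  attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
  attributes.message.ascii = message;
  return nvtxDomainRangeStartEx(domain, &attributes);
}

void rangeLifetimeExample() {
  // what a pre* hook does: open a colored range in a domain and keep the returned id
  nvtxDomainHandle_t domain = nvtxDomainCreateA("EDM Global");  // e.g. the domain assumed to be held by global_domain_
  nvtxRangeId_t range = rangeStartColor(domain, "someModule construction", 0xff00ff00);

  // ... the framework transition being profiled runs here ...

  // what the matching post* hook does: close the range in the same domain
  nvtxDomainRangeEnd(domain, range);
  nvtxDomainDestroy(domain);
}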
+DEFINE_FWK_SERVICE(CUDAService); diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh new file mode 100755 index 0000000000000..bde3e26382976 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh @@ -0,0 +1,10 @@ +#! /bin/bash -e + +# move to the .../src directory +cd $CMSSW_BASE/src/ + +# check out all packages containing .cu files +git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg + +# rebuild all checked out packages +scram b -j diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh new file mode 100755 index 0000000000000..f3335f4cd409f --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh @@ -0,0 +1,19 @@ +#! /bin/bash +TOOL=$CMSSW_BASE/config/toolbox/$SCRAM_ARCH/tools/selected/cuda.xml + +# enumerate the supported streaming multiprocessor (sm) compute capabilities +DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u) +CAPS=$(echo $DOTS | sed -e's#\.*##g') + +# remove existing capabilities +sed -i $TOOL -e'\##d' + +# add support for the capabilities found on this machine +for CAP in $CAPS; do + sed -i $TOOL -e"\##a\ " +done + +# reconfigure the cuda.xml tool +scram setup cuda + +echo "SCRAM configured to support CUDA streaming multiprocessor architectures $DOTS" diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py new file mode 100755 index 0000000000000..331ddd30f73bd --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from __future__ import print_function +import re +import sys +import argparse + +def main(opts): + device = [] + host = [] + + device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes") + host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes") + + f = open(opts.file) + for line in f: + m = device_re.search(line) + if m: + device.append(m.group("bytes")) + continue + m = host_re.search(line) + if m: + host.append(m.group("bytes")) + f.close() + + print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device)) + print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file. + +To use, run the job once with "process.CUDAService.allocator.debug = +True" and direct the output to a file. Then run this script by passing +the file as an argument, and copy the output of this script back to +the configuration file.""") + parser.add_argument("file", type=str, help="Log file to parse") + opts = parser.parse_args() + main(opts) diff --git a/HeterogeneousCore/CUDAServices/scripts/nvprof-remote b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote new file mode 100755 index 0000000000000..3b010c005291f --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote @@ -0,0 +1,23 @@ +#! /bin/bash + +# find the CMSSW release +if [ -z "$CMSSW_BASE" ]; then + export CMSSW_BASE=$(readlink -f $(dirname $0)/../..)
+fi + +# load the CMS environment +source $(< "$CMSSW_BASE"/config/scram_basedir)/cmsset_default.sh + +# load the CMSSW release environment +eval `cd "$CMSSW_BASE"; scram runtime -sh 2> /dev/null` + +# log the commands being run +{ + date + echo "cwd: $PWD" + echo "cmd: $0 $@" + echo +} >> $CMSSW_BASE/tmp/nvprof.log + +# run the CUDA profiler +nvprof "$@" diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc new file mode 100644 index 0000000000000..1568e5bb508eb --- /dev/null +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -0,0 +1,387 @@ +#include +#include +#include + +#include + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h" + +void setCudaLimit(cudaLimit limit, const char* name, size_t request) { + // read the current device + int device; + cudaCheck(cudaGetDevice(&device)); + // try to set the requested limit + auto result = cudaDeviceSetLimit(limit, request); + if (cudaErrorUnsupportedLimit == result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\""; + return; + } + // read back the limit value + size_t value; + cudaCheck(cudaDeviceGetLimit(&value, limit)); + if (cudaSuccess != result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to " + << request << ", current value is " << value; + } else if (value != request) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value + << " instead of requested " << request; + } +} + +constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { + switch (major * 10 + minor) { + // Fermi architecture + case 20: // SM 2.0: GF100 class + return 32; + case 21: // SM 2.1: GF10x class + return 48; + + // Kepler architecture + case 30: // SM 3.0: GK10x class + case 32: // SM 3.2: GK10x class + case 35: // SM 3.5: GK11x class + case 37: // SM 3.7: GK21x class + return 192; + + // Maxwell architecture + case 50: // SM 5.0: GM10x class + case 52: // SM 5.2: GM20x class + case 53: // SM 5.3: GM20x class + return 128; + + // Pascal architecture + case 60: // SM 6.0: GP100 class + return 64; + case 61: // SM 6.1: GP10x class + case 62: // SM 6.2: GP10x class + return 128; + + // Volta architecture + case 70: // SM 7.0: GV100 class + case 72: // SM 7.2: GV11b class + return 64; + + // Turing architecture + case 75: // SM 7.5: TU10x class + return 64; + + // unknown architecture, return a default value + default: + return 64; + } +} + +namespace { + template
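[Reviewer note, not part of the patch] A minimal sketch of how the two helpers defined above might be used when CUDAService enumerates and configures devices: getCudaCoresPerSM converts the compute capability reported by cudaGetDeviceProperties into a per-multiprocessor core count, and setCudaLimit applies a configurable cudaLimit while warning if the device rejects it. The function name and the printf-FIFO size are illustrative only; cudaCheck and edm::LogInfo are taken from the headers already included above.

#include <cuda_runtime.h>

// Illustrative call pattern only, not the actual CUDAService constructor code
void describeAndConfigureDevice(int device, size_t printfFifoSize) {
  cudaCheck(cudaSetDevice(device));

  cudaDeviceProp properties;
  cudaCheck(cudaGetDeviceProperties(&properties, device));

  // total CUDA cores = multiprocessors on the device x cores per multiprocessor
  unsigned int cores = properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor);
  edm::LogInfo("CUDAService") << "CUDA device " << device << ": " << cores << " CUDA cores";

  // apply a configurable limit; setCudaLimit logs a warning if the device does not support it
  setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
}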