diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml index 5406d1355533f..1bf72a85ddc0a 100644 --- a/CUDADataFormats/SiPixelCluster/BuildFile.xml +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -2,6 +2,7 @@ + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index 4ecdf14d8d33c..7f461bef6d2f9 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -5,16 +5,34 @@ #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + #include -class SiPixelClustersCUDA { +GENERATE_SOA_LAYOUT(SiPixelClustersCUDALayout, + SOA_COLUMN(uint32_t, moduleStart), + SOA_COLUMN(uint32_t, clusInModule), + SOA_COLUMN(uint32_t, moduleId), + SOA_COLUMN(uint32_t, clusModuleStart)) + +using SiPixelClustersCUDASoA = SiPixelClustersCUDALayout<>; +using SiPixelClustersCUDASOAView = SiPixelClustersCUDALayout<>::View; +using SiPixelClustersCUDASOAConstView = SiPixelClustersCUDALayout<>::ConstView; + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition, i.e., +// by adding a public class attribute like: +// cms::cuda::PortableDeviceCollection> collection; +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +class SiPixelClustersCUDA : public cms::cuda::PortableDeviceCollection> { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream); ~SiPixelClustersCUDA() = default; - SiPixelClustersCUDA(const SiPixelClustersCUDA &) = delete; - SiPixelClustersCUDA &operator=(const SiPixelClustersCUDA &) = delete; + explicit SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) + : PortableDeviceCollection>(maxModules + 1, stream) {} + SiPixelClustersCUDA(SiPixelClustersCUDA &&) = default; SiPixelClustersCUDA &operator=(SiPixelClustersCUDA &&) = default; @@ -26,41 +44,7 @@ class SiPixelClustersCUDA { uint32_t nClusters() const { return nClusters_h; } int32_t offsetBPIX2() const { return offsetBPIX2_h; } - uint32_t *moduleStart() { return moduleStart_d.get(); } - uint32_t *clusInModule() { return clusInModule_d.get(); } - uint32_t *moduleId() { return moduleId_d.get(); } - uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } - - uint32_t const *moduleStart() const { return moduleStart_d.get(); } - uint32_t const *clusInModule() const { return clusInModule_d.get(); } - uint32_t const *moduleId() const { return moduleId_d.get(); } - uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } - - class SiPixelClustersCUDASOAView { - public: - __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_ + i); } - __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_ + i); } - __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_ + i); } - __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_ + i); } - - uint32_t const *moduleStart_; - uint32_t const *clusInModule_; - uint32_t const *moduleId_; - uint32_t const *clusModuleStart_; - }; - - SiPixelClustersCUDASOAView 
const *view() const { return view_d.get(); } - private: - cms::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - cms::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - cms::cuda::device::unique_ptr moduleId_d; // module id of each module - - // originally from rechits - cms::cuda::device::unique_ptr clusModuleStart_d; // index of the first cluster of each module - - cms::cuda::device::unique_ptr view_d; // "me" pointer - uint32_t nClusters_h = 0; int32_t offsetBPIX2_h = 0; }; diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc deleted file mode 100644 index c8a340d2162f9..0000000000000 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ /dev/null @@ -1,19 +0,0 @@ -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxModules, cudaStream_t stream) - : moduleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)), - clusInModule_d(cms::cuda::make_device_unique(maxModules, stream)), - moduleId_d(cms::cuda::make_device_unique(maxModules, stream)), - clusModuleStart_d(cms::cuda::make_device_unique(maxModules + 1, stream)) { - auto view = cms::cuda::make_host_unique(stream); - view->moduleStart_ = moduleStart_d.get(); - view->clusInModule_ = clusInModule_d.get(); - view->moduleId_ = moduleId_d.get(); - view->clusModuleStart_ = clusModuleStart_d.get(); - - view_d = cms::cuda::make_device_unique(stream); - cms::cuda::copyAsync(view_d, view, stream); -} diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml index 0806768a9b657..784f42c4441a4 100644 --- a/CUDADataFormats/SiPixelDigi/BuildFile.xml +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -3,6 +3,7 @@ + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index cf6b51687982f..5888cd04a6128 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -6,17 +6,32 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" - -class SiPixelDigisCUDA { +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(SiPixelDigisSoALayout, + SOA_COLUMN(int32_t, clus), + SOA_COLUMN(uint32_t, pdigi), + SOA_COLUMN(uint32_t, rawIdArr), + SOA_COLUMN(uint16_t, adc), + SOA_COLUMN(uint16_t, xx), + SOA_COLUMN(uint16_t, yy), + SOA_COLUMN(uint16_t, moduleId)) + +using SiPixelDigisCUDASOA = SiPixelDigisSoALayout<>; +using SiPixelDigisCUDASOAView = SiPixelDigisCUDASOA::View; +using SiPixelDigisCUDASOAConstView = SiPixelDigisCUDASOA::ConstView; + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition. 
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +class SiPixelDigisCUDA : public cms::cuda::PortableDeviceCollection> { public: - using StoreType = uint16_t; SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) + : PortableDeviceCollection>(maxFedWords + 1, stream) {} ~SiPixelDigisCUDA() = default; - SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete; - SiPixelDigisCUDA &operator=(const SiPixelDigisCUDA &) = delete; SiPixelDigisCUDA(SiPixelDigisCUDA &&) = default; SiPixelDigisCUDA &operator=(SiPixelDigisCUDA &&) = default; @@ -28,17 +43,7 @@ class SiPixelDigisCUDA { uint32_t nModules() const { return nModules_h; } uint32_t nDigis() const { return nDigis_h; } - cms::cuda::host::unique_ptr copyAllToHostAsync(cudaStream_t stream) const; - - SiPixelDigisCUDASOAView view() { return m_view; } - SiPixelDigisCUDASOAView const view() const { return m_view; } - private: - // These are consumed by downstream device code - cms::cuda::device::unique_ptr m_store; - - SiPixelDigisCUDASOAView m_view; - uint32_t nModules_h = 0; uint32_t nDigis_h = 0; }; diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h deleted file mode 100644 index 78406cd241473..0000000000000 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h +++ /dev/null @@ -1,112 +0,0 @@ -#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h -#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDASOAView_h - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" - -#include - -class SiPixelDigisCUDASOAView { -public: - friend class SiPixelDigisCUDA; - - template - friend class SiPixelRecHitSoAFromLegacyT; - - enum class StorageLocation { - kCLUS = 0, - kPDIGI = 2, - kRAWIDARR = 4, - kADC = 6, - kXX = 7, - kYY = 8, - kMODULEIND = 9, - kMAX = 10 - }; - /* - ============================================================================================================================ - | CLUS | PDIGI | RAWIDARR | ADC | XX | YY | MODULEIND | - ============================================================================================================================ - | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | 7: N*16 | 8: N*16 | 9: N*16 | - ============================================================================================================================ - */ - // These are for CPU output - // we don't copy local x and y coordinates and module index - enum class StorageLocationHost { kCLUS = 0, kPDIGI = 2, kRAWIDARR = 4, kADC = 6, kMAX = 7 }; - /* - ======================================================================================== - | CLUS | PDIGI | RAWIDARR | ADC | - ======================================================================================== - | 0: N*32 | 2: N*32 | 4: N*32 | 6: N*16 | - ======================================================================================== - */ - - SiPixelDigisCUDASOAView() = default; - - template - SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocation s) { - xx_ = getColumnAddress(StorageLocation::kXX, store, maxFedWords); - yy_ = getColumnAddress(StorageLocation::kYY, store, maxFedWords); - adc_ = 
getColumnAddress(StorageLocation::kADC, store, maxFedWords); - moduleInd_ = getColumnAddress(StorageLocation::kMODULEIND, store, maxFedWords); - clus_ = getColumnAddress(StorageLocation::kCLUS, store, maxFedWords); - pdigi_ = getColumnAddress(StorageLocation::kPDIGI, store, maxFedWords); - rawIdArr_ = getColumnAddress(StorageLocation::kRAWIDARR, store, maxFedWords); - } - - template - SiPixelDigisCUDASOAView(StoreType& store, int maxFedWords, StorageLocationHost s) { - adc_ = getColumnAddress(StorageLocationHost::kADC, store, maxFedWords); - clus_ = getColumnAddress(StorageLocationHost::kCLUS, store, maxFedWords); - pdigi_ = getColumnAddress(StorageLocationHost::kPDIGI, store, maxFedWords); - rawIdArr_ = getColumnAddress(StorageLocationHost::kRAWIDARR, store, maxFedWords); - } - - __device__ __forceinline__ uint16_t xx(int i) const { return __ldg(xx_ + i); } - __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_ + i); } - __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_ + i); } - __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_ + i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_ + i); } - __device__ __forceinline__ uint32_t pdigi(int i) const { return __ldg(pdigi_ + i); } - __device__ __forceinline__ uint32_t rawIdArr(int i) const { return __ldg(rawIdArr_ + i); } - - const uint16_t* xx() const { return xx_; } - const uint16_t* yy() const { return yy_; } - const uint16_t* adc() const { return adc_; } - const uint16_t* moduleInd() const { return moduleInd_; } - const int32_t* clus() const { return clus_; } - const uint32_t* pdigi() const { return pdigi_; } - const uint32_t* rawIdArr() const { return rawIdArr_; } - - uint16_t* xx() { return xx_; } - uint16_t* yy() { return yy_; } - uint16_t* adc() { return adc_; } - uint16_t* moduleInd() { return moduleInd_; } - int32_t* clus() { return clus_; } - uint32_t* pdigi() { return pdigi_; } - uint32_t* rawIdArr() { return rawIdArr_; } - -private: - uint16_t* xx_; // local coordinates of each pixel - uint16_t* yy_; - uint16_t* adc_; // ADC of each pixel - uint16_t* moduleInd_; // module id of each pixel - int32_t* clus_; // cluster id of each pixel - uint32_t* pdigi_; - uint32_t* rawIdArr_; - - template - ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) { - return reinterpret_cast(store.get() + static_cast(column) * roundFor128ByteAlignment(size)); - } - - static int roundFor128ByteAlignment(int size) { - constexpr int mul = 128 / sizeof(uint16_t); - return ((size + mul - 1) / mul) * mul; - }; -}; - -#endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc deleted file mode 100644 index 9a7f8ae8bdad5..0000000000000 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include - -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream) - : m_store(cms::cuda::make_device_unique( - SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) * - static_cast(SiPixelDigisCUDASOAView::StorageLocation::kMAX), - stream)), - m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) { - 
assert(maxFedWords != 0); } - -cms::cuda::host::unique_ptr SiPixelDigisCUDA::copyAllToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique( - m_view.roundFor128ByteAlignment(nDigis()) * static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - m_view.clus(), - m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) * - static_cast(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX), - cudaMemcpyDeviceToHost, - stream)); - return ret; -} diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml index e3f9a0910bbd8..cf07e3b540f24 100644 --- a/CUDADataFormats/Track/BuildFile.xml +++ b/CUDADataFormats/Track/BuildFile.xml @@ -2,6 +2,7 @@ + diff --git a/CUDADataFormats/Track/README.md b/CUDADataFormats/Track/README.md new file mode 100644 index 0000000000000..8f66d9e4c4467 --- /dev/null +++ b/CUDADataFormats/Track/README.md @@ -0,0 +1,50 @@ +# Track CUDA Data Formats + +`CUDADataFormat`s meant to be used on Host (CPU) or Device (CUDA GPU) for +storing information about `Track`s created during the Pixel-local Reconstruction +chain. It stores data in an SoA manner. It combines the data contained in the +deprecated `TrackSoAHeterogeneousT` and `TrajectoryStateSoAT` classes. + +The host format inherits from `CUDADataFormats/Common/interface/PortableHostCollection.h`, +while the device format inherits from `CUDADataFormats/Common/interface/PortableDeviceCollection.h`. + +Both formats use the same SoA Layout (`TrackSoAHeterogeneousLayout`) which is generated +via the `GENERATE_SOA_LAYOUT` macro in the `PixelTrackUtilities.h` file. + +## Notes + +- `hitIndices` and `detIndices`, instances of `HitContainer`, have been added into the +layout as `SOA_SCALAR`s, meaning that they manage their own data independently from the SoA +`Layout`. This could be improved in the future, if `HitContainer` (aka a `OneToManyAssoc` of fixed size) +is replaced, but there don't seem to be any conflicts in including it in the `Layout` like this. +- Host and Device classes should **not** be created via inheritance, as is done here, +but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). + +## TrackSoAHeterogeneousHost + +The version of the data format to be used for storing `Track` data on the CPU. +Instances of this class are to be used for: + +- Having a place to copy data to host from device, via `cudaMemcpy`, or +- Running host-side algorithms using data stored in an SoA manner. + +## TrackSoAHeterogeneousDevice + +The version of the data format to be used for storing `Track` data on the GPU. + +Instances of `TrackSoAHeterogeneousDevice` are to be created on the host and +used on the device only. To do so, the instance's `view()` method is to be called +to pass a `View` to any kernel launched. Accessing data from the `view()` is not +possible on the host side. + +## Utilities + +`PixelTrackUtilities.h` contains a collection of methods that were originally +defined as class methods of either `TrackSoAHeterogeneousT` or `TrajectoryStateSoAT`, +and have been adapted to operate on `View` instances so that they are callable +from within `__global__` kernels, on both CPU and GPU. + +## Use case + +See `test/TrackSoAHeterogeneous_test.cpp` for a simple example of instantiation, +processing and copying from device to host. 
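For orientation only (not part of the patch): a minimal host-side sketch of how the two Track collections introduced above are meant to be used together. It mirrors the copy pattern of `TrackSoAHeterogeneous_test.cpp` added later in this PR; the function name is hypothetical, and `pixelTopology::Phase1` is one of the traits the PR instantiates.

```cpp
// Minimal usage sketch (assumption: follows TrackSoAHeterogeneous_test.cpp from this PR).
#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h"
#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"

void copyTracksToHost(cudaStream_t stream) {
  // Device collection: the constructor allocates the SoA buffer on the GPU.
  TrackSoAHeterogeneousDevice<pixelTopology::Phase1> tracks_d(stream);

  // Kernels take the (const) view by value, e.g.:
  //   someKernel<<<blocks, threads, 0, stream>>>(tracks_d.view());

  // Host collection with the same Layout, used as the copy destination.
  TrackSoAHeterogeneousHost<pixelTopology::Phase1> tracks_h(stream);

  // A single async memcpy moves the whole SoA buffer, since both sides share the Layout.
  cudaCheck(cudaMemcpyAsync(tracks_h.buffer().get(),
                            tracks_d.const_buffer().get(),
                            tracks_d.bufferSize(),
                            cudaMemcpyDeviceToHost,
                            stream));
  cudaCheck(cudaStreamSynchronize(stream));

  // Host-side access then goes through the host view, e.g. tracks_h.view()[0].pt().
}
```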
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h deleted file mode 100644 index f9e9b3a37c63f..0000000000000 --- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h -#define CUDADataFormats_Track_PixelTrackHeterogeneous_h - -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -template -using PixelTrackHeterogeneousT = HeterogeneousSoA>; - -#endif // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h diff --git a/CUDADataFormats/Track/interface/PixelTrackUtilities.h b/CUDADataFormats/Track/interface/PixelTrackUtilities.h new file mode 100644 index 0000000000000..6d7ea258be8d2 --- /dev/null +++ b/CUDADataFormats/Track/interface/PixelTrackUtilities.h @@ -0,0 +1,243 @@ +#ifndef CUDADataFormats_Track_PixelTrackUtilities_h +#define CUDADataFormats_Track_PixelTrackUtilities_h + +#include +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +namespace pixelTrack { + + enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; + constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; + const std::string qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; + inline Quality qualityByName(std::string const &name) { + auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; + return static_cast(qp); + } + +} // namespace pixelTrack + +template +struct TrackSoA { + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; + static constexpr int32_t H = TrackerTraits::avgHitsPerTrack; + // Aliases in order to not confuse the GENERATE_SOA_LAYOUT + // macro with weird colons and angled brackets. + using Vector5f = Eigen::Matrix; + using Vector15f = Eigen::Matrix; + using Quality = pixelTrack::Quality; + + using hindex_type = uint32_t; + + using HitContainer = cms::cuda::OneToManyAssoc; + + GENERATE_SOA_LAYOUT(TrackSoALayout, + SOA_COLUMN(Quality, quality), + SOA_COLUMN(float, chi2), + SOA_COLUMN(int8_t, nLayers), + SOA_COLUMN(float, eta), + SOA_COLUMN(float, pt), + SOA_EIGEN_COLUMN(Vector5f, state), + SOA_EIGEN_COLUMN(Vector15f, covariance), + SOA_SCALAR(int, nTracks), + SOA_SCALAR(HitContainer, hitIndices), + SOA_SCALAR(HitContainer, detIndices)) +}; + +// Methods that operate on View and ConstView of the TrackSoA, and cannot be class methods. 
+ +template +struct TracksUtilities { + using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + using hindex_type = typename TrackSoA::hindex_type; + + // State at the Beam spot + // phi,tip,1/pt,cotan(theta),zip + static __host__ __device__ inline float charge(const TrackSoAConstView &tracks, int32_t i) { + return std::copysign(1.f, tracks[i].state()(2)); + } + + static constexpr __host__ __device__ inline float phi(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(0); + } + + static constexpr __host__ __device__ inline float tip(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(1); + } + + static constexpr __host__ __device__ inline float zip(const TrackSoAConstView &tracks, int32_t i) { + return tracks[i].state()(4); + } + + static constexpr __host__ __device__ inline bool isTriplet(const TrackSoAConstView &tracks, int i) { + return tracks[i].nLayers() == 3; + } + + template + static constexpr __host__ __device__ inline void copyFromCircle( + TrackSoAView &tracks, V3 const &cp, M3 const &ccov, V2 const &lp, M2 const &lcov, float b, int32_t i) { + tracks[i].state() << cp.template cast(), lp.template cast(); + + tracks[i].state()(2) = tracks[i].state()(2) * b; + auto cov = tracks[i].covariance(); + cov(0) = ccov(0, 0); + cov(1) = ccov(0, 1); + cov(2) = b * float(ccov(0, 2)); + cov(4) = cov(3) = 0; + cov(5) = ccov(1, 1); + cov(6) = b * float(ccov(1, 2)); + cov(8) = cov(7) = 0; + cov(9) = b * b * float(ccov(2, 2)); + cov(11) = cov(10) = 0; + cov(12) = lcov(0, 0); + cov(13) = lcov(0, 1); + cov(14) = lcov(1, 1); + } + + template + static constexpr __host__ __device__ inline void copyFromDense(TrackSoAView &tracks, + V5 const &v, + M5 const &cov, + int32_t i) { + tracks[i].state() = v.template cast(); + for (int j = 0, ind = 0; j < 5; ++j) + for (auto k = j; k < 5; ++k) + tracks[i].covariance()(ind++) = cov(j, k); + } + + template + static constexpr __host__ __device__ inline void copyToDense(const TrackSoAConstView &tracks, + V5 &v, + M5 &cov, + int32_t i) { + v = tracks[i].state().template cast(); + for (int j = 0, ind = 0; j < 5; ++j) { + cov(j, j) = tracks[i].covariance()(ind++); + for (auto k = j + 1; k < 5; ++k) + cov(k, j) = cov(j, k) = tracks[i].covariance()(ind++); + } + } + + static constexpr __host__ __device__ inline int computeNumberOfLayers(const TrackSoAConstView &tracks, int32_t i) { + auto pdet = tracks.detIndices().begin(i); + int nl = 1; + auto ol = pixelTopology::getLayer(*pdet); + for (; pdet < tracks.detIndices().end(i); ++pdet) { + auto il = pixelTopology::getLayer(*pdet); + if (il != ol) + ++nl; + ol = il; + } + return nl; + } + + static constexpr __host__ __device__ inline int nHits(const TrackSoAConstView &tracks, int i) { + return tracks.detIndices().size(i); + } +}; + +namespace pixelTrack { + + template + struct QualityCutsT {}; + + template + struct QualityCutsT> { + using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + using tracksHelper = TracksUtilities; + // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) + float chi2Coeff[4]; + float chi2MaxPt; // GeV + float chi2Scale; + + struct Region { + float maxTip; // cm + float minPt; // GeV + float maxZip; // cm + }; + + Region triplet; + Region quadruplet; + + __device__ __forceinline__ bool 
isHP(const TrackSoAConstView &tracks, int nHits, int it) const { + // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) + // default cuts: + // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm + // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm + // (see CAHitNtupletGeneratorGPU.cc) + auto const ®ion = (nHits > 3) ? quadruplet : triplet; + return (std::abs(tracksHelper::tip(tracks, it)) < region.maxTip) and (tracks.pt(it) > region.minPt) and + (std::abs(tracksHelper::zip(tracks, it)) < region.maxZip); + } + + __device__ __forceinline__ bool strictCut(const TrackSoAConstView &tracks, int it) const { + auto roughLog = [](float x) { + // max diff [0.5,12] at 1.25 0.16143 + // average diff 0.0662998 + union IF { + uint32_t i; + float f; + }; + IF z; + z.f = x; + uint32_t lsb = 1 < 21; + z.i += lsb; + z.i >>= 21; + auto f = z.i & 3; + int ex = int(z.i >> 2) - 127; + + // log2(1+0.25*f) + // averaged over bins + const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; + return float(ex) + frac[f]; + }; + + float pt = std::min(tracks.pt(it), chi2MaxPt); + float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); + if (tracks.chi2(it) >= chi2Cut) { +#ifdef NTUPLE_FIT_DEBUG + printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks.pt(it), tracks.eta(it), tracks.chi2(it)); +#endif + return true; + } + return false; + } + }; + + template + struct QualityCutsT> { + using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; + using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + using tracksHelper = TracksUtilities; + + float maxChi2; + float minPt; + float maxTip; + float maxZip; + + __device__ __forceinline__ bool isHP(const TrackSoAConstView &tracks, int nHits, int it) const { + return (std::abs(tracksHelper::tip(tracks, it)) < maxTip) and (tracks.pt(it) > minPt) and + (std::abs(tracksHelper::zip(tracks, it)) < maxZip); + } + __device__ __forceinline__ bool strictCut(const TrackSoAConstView &tracks, int it) const { + return tracks.chi2(it) >= maxChi2; + } + }; + +} // namespace pixelTrack + +template +using TrackLayout = typename TrackSoA::template TrackSoALayout<>; +template +using TrackSoAView = typename TrackSoA::template TrackSoALayout<>::View; +template +using TrackSoAConstView = typename TrackSoA::template TrackSoALayout<>::ConstView; + +template struct TracksUtilities; +template struct TracksUtilities; + +#endif diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..1938991e071e1 --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h @@ -0,0 +1,36 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousDevice_H +#define CUDADataFormats_Track_TrackHeterogeneousDevice_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition. 
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class TrackSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + using cms::cuda::PortableDeviceCollection>::view; + using cms::cuda::PortableDeviceCollection>::const_view; + using cms::cuda::PortableDeviceCollection>::buffer; + using cms::cuda::PortableDeviceCollection>::bufferSize; + + TrackSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousDevice(cudaStream_t stream) + : cms::cuda::PortableDeviceCollection>(TrackerTraits::maxNumberOfTuples, stream) {} +}; + +namespace pixelTrack { + + using TrackSoADevicePhase1 = TrackSoAHeterogeneousDevice; + using TrackSoADevicePhase2 = TrackSoAHeterogeneousDevice; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..af8af2a40a52e --- /dev/null +++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Track_TrackHeterogeneousHost_H +#define CUDADataFormats_Track_TrackHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +// TODO: The class is created via inheritance of the PortableHostCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class TrackSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; //TODO: this could be made configurable at runtime + explicit TrackSoAHeterogeneousHost() : cms::cuda::PortableHostCollection>(S) {} + + using cms::cuda::PortableHostCollection>::view; + using cms::cuda::PortableHostCollection>::const_view; + using cms::cuda::PortableHostCollection>::buffer; + using cms::cuda::PortableHostCollection>::bufferSize; + + // Constructor which specifies the SoA size + explicit TrackSoAHeterogeneousHost(cudaStream_t stream) + : cms::cuda::PortableHostCollection>(S, stream) {} +}; + +namespace pixelTrack { + + using TrackSoAHostPhase1 = TrackSoAHeterogeneousHost; + using TrackSoAHostPhase2 = TrackSoAHeterogeneousHost; + +} // namespace pixelTrack + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h deleted file mode 100644 index b5b1df0d5118a..0000000000000 --- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h +++ /dev/null @@ -1,195 +0,0 @@ -#ifndef CUDADataFormats_Track_TrackHeterogeneousT_H -#define CUDADataFormats_Track_TrackHeterogeneousT_H - -#include -#include - -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" - -namespace pixelTrack { - - enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality }; - constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)}; - const std::string 
qualityName[qualitySize]{"bad", "edup", "dup", "loose", "strict", "tight", "highPurity"}; - inline Quality qualityByName(std::string const &name) { - auto qp = std::find(qualityName, qualityName + qualitySize, name) - qualityName; - return static_cast(qp); - } - -} // namespace pixelTrack - -template -class TrackSoAHeterogeneousT { -public: - static constexpr int32_t S = TrackerTraits::maxNumberOfTuples; - static constexpr int32_t H = TrackerTraits::maxHitsOnTrack; // Average hits rather than max? - static constexpr int32_t stride() { return S; } - - using hindex_type = uint32_t; //TrackerTraits::hindex_type ? - - using Quality = pixelTrack::Quality; - using HitContainer = cms::cuda::OneToManyAssoc; - - // Always check quality is at least loose! - // CUDA does not support enums in __lgc ... -protected: - eigenSoA::ScalarSoA quality_; - -public: - constexpr Quality quality(int32_t i) const { return (Quality)(quality_(i)); } - constexpr Quality &quality(int32_t i) { return (Quality &)(quality_(i)); } - constexpr Quality const *qualityData() const { return (Quality const *)(quality_.data()); } - constexpr Quality *qualityData() { return (Quality *)(quality_.data()); } - - // this is chi2/ndof as not necessarely all hits are used in the fit - eigenSoA::ScalarSoA chi2; - - eigenSoA::ScalarSoA nLayers; - - constexpr int nTracks() const { return nTracks_; } - constexpr void setNTracks(int n) { nTracks_ = n; } - - constexpr int nHits(int i) const { return detIndices.size(i); } - - constexpr bool isTriplet(int i) const { return nLayers(i) == 3; } - - constexpr int computeNumberOfLayers(int32_t i) const { - // layers are in order and we assume tracks are either forward or backward - auto pdet = detIndices.begin(i); - int nl = 1; - auto ol = pixelTopology::getLayer(*pdet); - for (; pdet < detIndices.end(i); ++pdet) { - auto il = pixelTopology::getLayer(*pdet); - if (il != ol) - ++nl; - ol = il; - } - return nl; - } - - // State at the Beam spot - // phi,tip,1/pt,cotan(theta),zip - TrajectoryStateSoAT stateAtBS; - eigenSoA::ScalarSoA eta; - eigenSoA::ScalarSoA pt; - constexpr float charge(int32_t i) const { return std::copysign(1.f, stateAtBS.state(i)(2)); } - constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); } - constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); } - constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); } - - // state at the detector of the outermost hit - // representation to be decided... 
- // not yet filled on GPU - // TrajectoryStateSoA stateAtOuterDet; - - HitContainer hitIndices; - HitContainer detIndices; - -private: - int nTracks_; -}; - -namespace pixelTrack { - - template - using TrackSoAT = TrackSoAHeterogeneousT; - - template - using HitContainerT = typename TrackSoAHeterogeneousT::HitContainer; - - //Used only to ease classes definitions - using TrackSoAPhase1 = TrackSoAHeterogeneousT; - using TrackSoAPhase2 = TrackSoAHeterogeneousT; - - template - struct QualityCutsT {}; - - template - struct QualityCutsT> { - // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3]))) - float chi2Coeff[4]; - float chi2MaxPt; // GeV - float chi2Scale; - - struct Region { - float maxTip; // cm - float minPt; // GeV - float maxZip; // cm - }; - - Region triplet; - Region quadruplet; - - __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, - int nHits, - int it) const { - // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip) - // default cuts: - // - for triplets: |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm - // - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm - // (see CAHitNtupletGeneratorGPU.cc) - auto const ®ion = (nHits > 3) ? quadruplet : triplet; - return (std::abs(tracks->tip(it)) < region.maxTip) and (tracks->pt(it) > region.minPt) and - (std::abs(tracks->zip(it)) < region.maxZip); - } - - __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, - int it) const { - auto roughLog = [](float x) { - // max diff [0.5,12] at 1.25 0.16143 - // average diff 0.0662998 - union IF { - uint32_t i; - float f; - }; - IF z; - z.f = x; - uint32_t lsb = 1 < 21; - z.i += lsb; - z.i >>= 21; - auto f = z.i & 3; - int ex = int(z.i >> 2) - 127; - - // log2(1+0.25*f) - // averaged over bins - const float frac[4] = {0.160497f, 0.452172f, 0.694562f, 0.901964f}; - return float(ex) + frac[f]; - }; - - float pt = std::min(tracks->pt(it), chi2MaxPt); - float chi2Cut = chi2Scale * (chi2Coeff[0] + roughLog(pt) * chi2Coeff[1]); - if (tracks->chi2(it) >= chi2Cut) { -#ifdef NTUPLE_FIT_DEBUG - printf("Bad chi2 %d pt %f eta %f chi2 %f\n", it, tracks->pt(it), tracks->eta(it), tracks->chi2(it)); -#endif - return true; - } - return false; - } - }; - - template - struct QualityCutsT> { - float maxChi2; - float minPt; - float maxTip; - float maxZip; - - __device__ __forceinline__ bool isHP(TrackSoAHeterogeneousT const *__restrict__ tracks, - int nHits, - int it) const { - return (std::abs(tracks->tip(it)) < maxTip) and (tracks->pt(it) > minPt) and (std::abs(tracks->zip(it)) < maxZip); - } - __device__ __forceinline__ bool strictCut(TrackSoAHeterogeneousT const *__restrict__ tracks, - int it) const { - return tracks->chi2(it) >= maxChi2; - } - }; - -} // namespace pixelTrack - -#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h b/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h deleted file mode 100644 index 64fcd573a6991..0000000000000 --- a/CUDADataFormats/Track/interface/TrajectoryStateSoAT.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CUDADataFormats_Track_TrajectoryStateSOAT_H -#define CUDADataFormats_Track_TrajectoryStateSOAT_H - -#include -#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h" - -template -struct TrajectoryStateSoAT { - using Vector5f = Eigen::Matrix; - using Vector15f = Eigen::Matrix; - - using Vector5d = Eigen::Matrix; - using Matrix5d 
= Eigen::Matrix; - - static constexpr int32_t stride() { return S; } - - eigenSoA::MatrixSoA state; - eigenSoA::MatrixSoA covariance; - - template - __host__ __device__ inline void copyFromCircle( - V3 const& cp, M3 const& ccov, V2 const& lp, M2 const& lcov, float b, int32_t i) { - state(i) << cp.template cast(), lp.template cast(); - state(i)(2) *= b; - auto cov = covariance(i); - cov(0) = ccov(0, 0); - cov(1) = ccov(0, 1); - cov(2) = b * float(ccov(0, 2)); - cov(4) = cov(3) = 0; - cov(5) = ccov(1, 1); - cov(6) = b * float(ccov(1, 2)); - cov(8) = cov(7) = 0; - cov(9) = b * b * float(ccov(2, 2)); - cov(11) = cov(10) = 0; - cov(12) = lcov(0, 0); - cov(13) = lcov(0, 1); - cov(14) = lcov(1, 1); - } - - template - __host__ __device__ inline void copyFromDense(V5 const& v, M5 const& cov, int32_t i) { - state(i) = v.template cast(); - for (int j = 0, ind = 0; j < 5; ++j) - for (auto k = j; k < 5; ++k) - covariance(i)(ind++) = cov(j, k); - } - - template - __host__ __device__ inline void copyToDense(V5& v, M5& cov, int32_t i) const { - v = state(i).template cast(); - for (int j = 0, ind = 0; j < 5; ++j) { - cov(j, j) = covariance(i)(ind++); - for (auto k = j + 1; k < 5; ++k) - cov(k, j) = cov(j, k) = covariance(i)(ind++); - } - } -}; - -#endif // CUDADataFormats_Track_TrajectoryStateSOAT_H diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h index 97c116f6c88d3..2e07adddcddd0 100644 --- a/CUDADataFormats/Track/src/classes.h +++ b/CUDADataFormats/Track/src/classes.h @@ -3,7 +3,10 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_Track_src_classes_h diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml index 5216c19dded65..5e3116609330a 100644 --- a/CUDADataFormats/Track/src/classes_def.xml +++ b/CUDADataFormats/Track/src/classes_def.xml @@ -1,15 +1,15 @@ - - - - - + + + + + - - - - - + + + + + diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml index fc78783db473b..32256c87ed577 100644 --- a/CUDADataFormats/Track/test/BuildFile.xml +++ b/CUDADataFormats/Track/test/BuildFile.xml @@ -1,19 +1,22 @@ - - - - + - - - - - - + + + + + + + + + + + + diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp deleted file mode 100644 index 9708b689dd05b..0000000000000 --- a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_t.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" - -#include -#include - -int main() { - // test quality - - auto q = pixelTrack::qualityByName("tight"); - assert(pixelTrack::Quality::tight == q); - q = pixelTrack::qualityByName("toght"); - assert(pixelTrack::Quality::notQuality == q); - - for (uint32_t i = 0; i < pixelTrack::qualitySize; ++i) { - auto const qt = static_cast(i); - auto q = pixelTrack::qualityByName(pixelTrack::qualityName[i]); - assert(qt == q); - } - - return 0; -} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp new file mode 100644 index 0000000000000..dafa75e2e18d7 --- /dev/null +++ 
b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cpp @@ -0,0 +1,73 @@ +/** + Simple test for the pixelTrack::TrackSoA data structure + which inherits from PortableDeviceCollection. + + Creates an instance of the class (automatically allocates + memory on device), passes the view of the SoA data to + the CUDA kernels which: + - Fill the SoA with data. + - Verify that the data written is correct. + + Then, the SoA data are copied back to a host-side collection + (tracks_h), whose view, built on the same Layout, is used + to access and print the data on the host. + */ + +#include +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackSoA { + + template + void runKernels(TrackSoAView &tracks_view, cudaStream_t stream); +} + +int main() { + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + + // Inner scope to deallocate memory before destroying the stream + { + // Instantiate tracks on device. PortableDeviceCollection allocates + // SoA on device automatically. + TrackSoAHeterogeneousDevice tracks_d(stream); + testTrackSoA::runKernels(tracks_d.view(), stream); + + // Instantiate tracks on host. This is where the data will be + // copied to from the device. + TrackSoAHeterogeneousHost tracks_h(stream); + + cudaCheck(cudaMemcpyAsync( + tracks_h.buffer().get(), tracks_d.const_buffer().get(), tracks_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaStreamSynchronize(stream)); + + // Print results + std::cout << "pt" + << "\t" + << "eta" + << "\t" + << "chi2" + << "\t" + << "quality" + << "\t" + << "nLayers" + << "\t" + << "hitIndices off" << std::endl; + + for (int i = 0; i < 10; ++i) { + std::cout << tracks_h.view()[i].pt() << "\t" << tracks_h.view()[i].eta() << "\t" << tracks_h.view()[i].chi2() + << "\t" << (int)tracks_h.view()[i].quality() << "\t" << (int)tracks_h.view()[i].nLayers() << "\t" + << tracks_h.view().hitIndices().off[i] << std::endl; + } + } + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu new file mode 100644 index 0000000000000..8e8595eb43e94 --- /dev/null +++ b/CUDADataFormats/Track/test/TrackSoAHeterogeneous_test.cu @@ -0,0 +1,63 @@ +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "HeterogeneousCore/CUDAUtilities/interface/OneToManyAssoc.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace testTrackSoA { + + // Kernel which fills the TrackSoAView with data + // to test writing to it + template + __global__ void fill(TrackSoAView tracks_view) { + int i = threadIdx.x; + if (i == 0) { + tracks_view.nTracks() = 420; + } + + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + tracks_view[j].pt() = (float)j; + tracks_view[j].eta() = (float)j; + tracks_view[j].chi2() = (float)j; + tracks_view[j].quality() = (pixelTrack::Quality)(j % 256); + tracks_view[j].nLayers() = j % 128; + tracks_view.hitIndices().off[j] = j; + } + } + + // Kernel which reads from the TrackSoAView to verify + // that it was written correctly from the fill kernel + template + __global__ void 
verify(TrackSoAConstView tracks_view) { + int i = threadIdx.x; + + if (i == 0) { + printf("SoA size: % d, block dims: % d\n", tracks_view.metadata().size(), blockDim.x); + assert(tracks_view.nTracks() == 420); + } + for (int j = i; j < tracks_view.metadata().size(); j += blockDim.x) { + assert(abs(tracks_view[j].pt() - (float)j) < .0001); + assert(abs(tracks_view[j].eta() - (float)j) < .0001); + assert(abs(tracks_view[j].chi2() - (float)j) < .0001); + assert(tracks_view[j].quality() == (pixelTrack::Quality)(j % 256)); + assert(tracks_view[j].nLayers() == j % 128); + assert(tracks_view.hitIndices().off[j] == j); + } + } + + // Host function which invokes the two kernels above + template + void runKernels(TrackSoAView& tracks_view, cudaStream_t stream) { + fill<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + + verify<<<1, 1024, 0, stream>>>(tracks_view); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaDeviceSynchronize()); + } + + template void runKernels(TrackSoAView& tracks_view, + cudaStream_t stream); + template void runKernels(TrackSoAView& tracks_view, + cudaStream_t stream); + +} // namespace testTrackSoA diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h index 97b88873c2613..6ba0eaa5c986e 100644 --- a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h +++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h @@ -1,7 +1,11 @@ -#include "CUDADataFormats/Track/interface/TrajectoryStateSoAT.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" using Vector5d = Eigen::Matrix; using Matrix5d = Eigen::Matrix; +using helper = TracksUtilities; __host__ __device__ Matrix5d loadCov(Vector5d const& e) { Matrix5d cov; @@ -17,26 +21,21 @@ __host__ __device__ Matrix5d loadCov(Vector5d const& e) { return cov; } -using TS = TrajectoryStateSoAT<128>; - -__global__ void testTSSoA(TS* pts, int n) { - assert(n <= 128); - +template +__global__ void testTSSoA(TrackSoAView ts) { Vector5d par0; par0 << 0.2, 0.1, 3.5, 0.8, 0.1; Vector5d e0; e0 << 0.01, 0.01, 0.035, -0.03, -0.01; auto cov0 = loadCov(e0); - TS& ts = *pts; - int first = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = first; i < n; i += blockDim.x * gridDim.x) { - ts.copyFromDense(par0, cov0, i); + for (int i = first; i < ts.metadata().size(); i += blockDim.x * gridDim.x) { + helper::copyFromDense(ts, par0, cov0, i); Vector5d par1; Matrix5d cov1; - ts.copyToDense(par1, cov1, i); + helper::copyToDense(ts, par1, cov1, i); Vector5d delV = par1 - par0; Matrix5d delM = cov1 - cov0; for (int j = 0; j < 5; ++j) { @@ -58,18 +57,29 @@ __global__ void testTSSoA(TS* pts, int n) { int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); #endif - TS ts; +#ifdef __CUDACC__ + // Since we are going to copy data from ts_d to ts_h, we + // need to initialize the Host collection with a stream. + TrackSoAHeterogeneousHost ts_h(stream); + TrackSoAHeterogeneousDevice ts_d(stream); +#else + // If CUDA is not available, Host collection must not be initialized + // with a stream. 
+ TrackSoAHeterogeneousHost ts_h; +#endif #ifdef __CUDACC__ - TS* ts_d; - cudaCheck(cudaMalloc(&ts_d, sizeof(TS))); - testTSSoA<<<1, 64>>>(ts_d, 128); + testTSSoA<<<1, 64, 0, stream>>>(ts_d.view()); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpyAsync( + ts_h.buffer().get(), ts_d.const_buffer().get(), ts_d.bufferSize(), cudaMemcpyDeviceToHost, stream)); cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault)); - cudaCheck(cudaDeviceSynchronize()); + cudaCheck(cudaStreamSynchronize(stream)); #else - testTSSoA(&ts, 128); + testTSSoA(ts_h.view()); #endif } diff --git a/CUDADataFormats/TrackingRecHit/BuildFile.xml b/CUDADataFormats/TrackingRecHit/BuildFile.xml index 4cda8ebd306b0..e67c2227feef9 100644 --- a/CUDADataFormats/TrackingRecHit/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/BuildFile.xml @@ -3,6 +3,7 @@ + diff --git a/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h b/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h index b3bdade5ec97c..13322ce3952b7 100644 --- a/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h +++ b/CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h @@ -12,4 +12,9 @@ struct SiPixelHitStatus { uint8_t qBin : 3; // ∈[0,1,...,7] }; +struct SiPixelHitStatusAndCharge { + SiPixelHitStatus status; + uint32_t charge : 24; +}; + #endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h deleted file mode 100644 index ad78daa8354e2..0000000000000 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h +++ /dev/null @@ -1,384 +0,0 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" -#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" -#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "DataFormats/Common/interface/CMS_CLASS_VERSION.h" - -namespace { - enum class Storage32 { - kXLocal = 0, - kYLocal = 1, - kXerror = 2, - kYerror = 3, - kCharge = 4, - kXGlobal = 5, - kYGlobal = 6, - kZGlobal = 7, - kRGlobal = 8, - kPhiStorage = 9, - kLayers = 10 - }; - - enum class Storage16 { - kDetId = 0, - kPhi = 1, - kXSize = 2, - kYSize = 3, - }; -} // namespace - -template -class TrackingRecHit2DHeterogeneousT { -public: - template - friend class TrackingRecHit2DHostT; - - template - using unique_ptr = typename Traits::template unique_ptr; - - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - using PhiBinner = typename TrackingRecHit2DSOAView::PhiBinner; - using AverageGeometry = typename TrackingRecHit2DSOAView::AverageGeometry; - - TrackingRecHit2DHeterogeneousT() = default; - - explicit TrackingRecHit2DHeterogeneousT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream = nullptr); - - explicit TrackingRecHit2DHeterogeneousT(cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream = nullptr); - ~TrackingRecHit2DHeterogeneousT() = default; - - TrackingRecHit2DHeterogeneousT(const TrackingRecHit2DHeterogeneousT&) = delete; - TrackingRecHit2DHeterogeneousT& operator=(const 
TrackingRecHit2DHeterogeneousT&) = delete; - TrackingRecHit2DHeterogeneousT(TrackingRecHit2DHeterogeneousT&&) = default; - TrackingRecHit2DHeterogeneousT& operator=(TrackingRecHit2DHeterogeneousT&&) = default; - - TrackingRecHit2DSOAView* view() { return m_view.get(); } - TrackingRecHit2DSOAView const* view() const { return m_view.get(); } - - auto nHits() const { return m_nHits; } - auto offsetBPIX2() const { return m_offsetBPIX2; } - - auto hitsModuleStart() const { return m_hitsModuleStart; } - auto hitsLayerStart() { return m_hitsLayerStart; } - auto phiBinner() { return m_phiBinner; } - auto phiBinnerStorage() { return m_phiBinnerStorage; } - auto iphi() { return m_iphi; } - - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; - -protected: - static constexpr uint32_t n16 = 4; // number of elements in m_store16 - static constexpr uint32_t n32 = 10; // number of elements in m_store32 - static_assert(sizeof(uint32_t) == sizeof(float)); // just stating the obvious - static_assert(n32 == static_cast(Storage32::kLayers)); - unique_ptr m_store16; //! - unique_ptr m_store32; //! - - unique_ptr m_PhiBinnerStore; //! - unique_ptr m_AverageGeometryStore; //! - - unique_ptr m_view; //! - - uint32_t m_nHits; - int32_t m_offsetBPIX2; - - uint32_t const* m_hitsModuleStart; // needed for legacy, this is on GPU! - - // needed as kernel params... - PhiBinner* m_phiBinner; - typename PhiBinner::index_type* m_phiBinnerStorage; - uint32_t* m_hitsLayerStart; - int16_t* m_iphi; -}; - -//Inherit and overload only what we need to overload, remember to use this-> -//GPU -template -class TrackingRecHit2DGPUT : public TrackingRecHit2DHeterogeneousT { -public: - using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; - - cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; -}; - -//CPU -template -class TrackingRecHit2DCPUT : public TrackingRecHit2DHeterogeneousT { -public: - using TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT; - - cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store16ToHostAsync(cudaStream_t stream) const; - cms::cuda::host::unique_ptr store32ToHostAsync(cudaStream_t stream) const; -}; - -//HOST -template -class TrackingRecHit2DHostT : public TrackingRecHit2DHeterogeneousT { -public: - ~TrackingRecHit2DHostT() = default; - TrackingRecHit2DHostT() = default; - - explicit TrackingRecHit2DHostT(uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream = nullptr) - : TrackingRecHit2DHeterogeneousT( - nHits, offsetBPIX2, cpeParams, hitsModuleStart, stream) {} - - explicit TrackingRecHit2DHostT(cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream = nullptr) - : TrackingRecHit2DHeterogeneousT( - store32, store16, modules, nHits, stream) {} - - explicit TrackingRecHit2DHostT(uint32_t nHits, - int32_t offsetBPIX2, - 
pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneousT const* input); -}; - -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" - -template -TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( - uint32_t nHits, - int32_t offsetBPIX2, - pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream) - : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - - auto view = Traits::template make_host_unique(stream); - - view->m_nHits = nHits; - m_view = Traits::template make_unique(stream); // leave it on host and pass it by value? - m_AverageGeometryStore = Traits::template make_unique(stream); - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - } - return; - } - - // the single arrays are not 128 bit alligned... - // the hits are actually accessed in order only in building - // if ordering is relevant they may have to be stored phi-ordered by layer or so - // this will break 1to1 correspondence with cluster and module locality - // so unless proven VERY inefficient we keep it ordered as generated - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32 + TrackerTraits::numberOfLayers + 1, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == - sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - view->m_detInd = get16(Storage16::kDetId); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast(get32(Storage32::kLayers)); - - // transfer view - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - } -} - -template -TrackingRecHit2DHostT::TrackingRecHit2DHostT( - uint32_t nHits, - int32_t offsetBPIX2, - 
pixelCPEforGPU::ParamsOnGPUT const* cpeParams, - uint32_t const* hitsModuleStart, - cudaStream_t stream, - TrackingRecHit2DHeterogeneousT const* input) { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - - this->m_nHits = nHits; - this->m_offsetBPIX2 = offsetBPIX2; - this->m_hitsModuleStart = hitsModuleStart; - - auto view = cms::cuda::make_host_unique(stream); - - view->m_nHits = nHits; - this->m_view = - cms::cuda::make_host_unique(stream); // leave it on host and pass it by value? - this->m_AverageGeometryStore = cms::cuda::make_host_unique(stream); - view->m_averageGeometry = this->m_AverageGeometryStore.get(); - view->m_cpeParams = cpeParams; - view->m_hitsModuleStart = hitsModuleStart; - - // if empy do not bother - if (0 == nHits) { - this->m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version - return; - } - - this->m_store32 = cms::cuda::make_host_unique(5 * input->nHits(), stream); - cms::cuda::copyAsync(this->m_store32, input->m_store32, 5 * input->nHits(), stream); - - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == sizeof(float)); - static_assert(sizeof(typename TrackingRecHit2DSOAView::hindex_type) == - sizeof(typename TrackingRecHit2DSOAView::PhiBinner::index_type)); - - auto get32 = [&](Storage32 i) { return this->m_store32.get() + static_cast(i) * nHits; }; - - // copy all the pointers - this->m_phiBinner = view->m_phiBinner = this->m_PhiBinnerStore.get(); - this->m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - - this->m_view = std::move(view); -} - -//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases -template -TrackingRecHit2DHeterogeneousT::TrackingRecHit2DHeterogeneousT( - cms::cuda::host::unique_ptr& store32, - cms::cuda::host::unique_ptr& store16, - uint32_t* modules, - int nHits, - cudaStream_t stream) - : m_nHits(nHits), m_hitsModuleStart(modules) { - auto view = Traits::template make_host_unique(stream); - - m_view = Traits::template make_unique(stream); - - view->m_nHits = nHits; - - if (0 == nHits) { - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view = std::move(view); - } - return; - } - - m_store16 = Traits::template make_unique(nHits * n16, stream); - m_store32 = Traits::template make_unique(nHits * n32, stream); - m_PhiBinnerStore = Traits::template make_unique(stream); - m_AverageGeometryStore = Traits::template make_unique(stream); - - view->m_averageGeometry = m_AverageGeometryStore.get(); - view->m_hitsModuleStart = m_hitsModuleStart; - - //store transfer - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_store16, store16, static_cast(n16 * nHits), stream); - cms::cuda::copyAsync(m_store32, store32, static_cast(n32 * nHits), stream); - - } else { - std::copy(store32.get(), store32.get() + nHits * n32, m_store32.get()); // want to copy it - std::copy(store16.get(), store16.get() + nHits * n16, m_store16.get()); - } - - //getters - auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast(i) * nHits; }; - auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast(i) * nHits; }; - - //Store 32 - view->m_xl = get32(Storage32::kXLocal); - view->m_yl = get32(Storage32::kYLocal); - view->m_xerr = 
get32(Storage32::kXerror); - view->m_yerr = get32(Storage32::kYerror); - view->m_chargeAndStatus = reinterpret_cast(get32(Storage32::kCharge)); - view->m_xg = get32(Storage32::kXGlobal); - view->m_yg = get32(Storage32::kYGlobal); - view->m_zg = get32(Storage32::kZGlobal); - view->m_rg = get32(Storage32::kRGlobal); - - m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get(); - m_phiBinnerStorage = view->m_phiBinnerStorage = - reinterpret_cast(get32(Storage32::kPhiStorage)); - - //Store 16 - view->m_detInd = get16(Storage16::kDetId); - m_iphi = view->m_iphi = reinterpret_cast(get16(Storage16::kPhi)); - view->m_xsize = reinterpret_cast(get16(Storage16::kXSize)); - view->m_ysize = reinterpret_cast(get16(Storage16::kYSize)); - - // transfer view - if constexpr (std::is_same_v) { - cms::cuda::copyAsync(m_view, view, stream); - } else { - m_view = std::move(view); - } -} - -//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. -using TrackingRecHit2DGPUPhase1 = TrackingRecHit2DGPUT; -using TrackingRecHit2DCPUPhase1 = TrackingRecHit2DCPUT; -using TrackingRecHit2DHostPhase1 = TrackingRecHit2DHostT; - -using TrackingRecHit2DGPUPhase2 = TrackingRecHit2DGPUT; -using TrackingRecHit2DCPUPhase2 = TrackingRecHit2DCPUT; -using TrackingRecHit2DHostPhase2 = TrackingRecHit2DHostT; - -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneousT_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h deleted file mode 100644 index 8fd2bc54cfad7..0000000000000 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReducedT_h - -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h" -#include "CUDADataFormats/Common/interface/HostProduct.h" - -// a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction -template -class TrackingRecHit2DReducedT { - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; - -public: - using HLPstorage = HostProduct; - using HIDstorage = HostProduct; - - template - TrackingRecHit2DReducedT(UP32&& istore32, UP16&& istore16, int nhits) - : m_store32(std::move(istore32)), m_store16(std::move(istore16)), m_nHits(nhits) { - auto get32 = [&](int i) { return const_cast(m_store32.get()) + i * nhits; }; - - // copy all the pointers (better be in sync with the producer store) - - m_view.m_xl = get32(0); - m_view.m_yl = get32(1); - m_view.m_xerr = get32(2); - m_view.m_yerr = get32(3); - m_view.m_chargeAndStatus = reinterpret_cast(get32(4)); - m_view.m_detInd = const_cast(m_store16.get()); - } - - // view only! 
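// (this overload only copies the SoA view; m_store32/m_store16 stay empty, so the
//  caller must keep the original storage alive for the lifetime of the reduced object)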
- TrackingRecHit2DReducedT(TrackingRecHit2DSOAView const& iview, int nhits) : m_view(iview), m_nHits(nhits) {} - - TrackingRecHit2DReducedT() = default; - ~TrackingRecHit2DReducedT() = default; - - TrackingRecHit2DReducedT(const TrackingRecHit2DReducedT&) = delete; - TrackingRecHit2DReducedT& operator=(const TrackingRecHit2DReducedT&) = delete; - TrackingRecHit2DReducedT(TrackingRecHit2DReducedT&&) = default; - TrackingRecHit2DReducedT& operator=(TrackingRecHit2DReducedT&&) = default; - - TrackingRecHit2DSOAView& view() { return m_view; } - TrackingRecHit2DSOAView const& view() const { return m_view; } - - auto nHits() const { return m_nHits; } - -private: - TrackingRecHit2DSOAView m_view; - - HLPstorage m_store32; - HIDstorage m_store16; - - int m_nHits; -}; - -using TrackingRecHit2DReducedPhase1 = TrackingRecHit2DReducedT; -using TrackingRecHit2DReducedPhase2 = TrackingRecHit2DReducedT; - -#endif diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h deleted file mode 100644 index 59b7cb1337fdf..0000000000000 --- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h +++ /dev/null @@ -1,131 +0,0 @@ -#ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h -#define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h - -#include - -#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -#include "CUDADataFormats/TrackingRecHit/interface/SiPixelHitStatus.h" - -namespace pixelCPEforGPU { - template - struct ParamsOnGPUT; -} - -template -class TrackingRecHit2DSOAViewT { -public: - using Status = SiPixelHitStatus; - static_assert(sizeof(Status) == sizeof(uint8_t)); - - using hindex_type = typename TrackerTraits::hindex_type; - using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry - using AverageGeometry = pixelTopology::AverageGeometryT; - using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; - - template - friend class TrackingRecHit2DHeterogeneousT; - template - friend class TrackingRecHit2DHostT; - // template - // friend class TrackingRecHit2DReducedT; - - __device__ __forceinline__ uint32_t nHits() const { return m_nHits; } - - __device__ __forceinline__ float& xLocal(int i) { return m_xl[i]; } - __device__ __forceinline__ float xLocal(int i) const { return __ldg(m_xl + i); } - __device__ __forceinline__ float& yLocal(int i) { return m_yl[i]; } - __device__ __forceinline__ float yLocal(int i) const { return __ldg(m_yl + i); } - - __device__ __forceinline__ float& xerrLocal(int i) { return m_xerr[i]; } - __device__ __forceinline__ float xerrLocal(int i) const { return __ldg(m_xerr + i); } - __device__ __forceinline__ float& yerrLocal(int i) { return m_yerr[i]; } - __device__ __forceinline__ float yerrLocal(int i) const { return __ldg(m_yerr + i); } - - __device__ __forceinline__ float& xGlobal(int i) { return m_xg[i]; } - __device__ __forceinline__ float xGlobal(int i) const { return __ldg(m_xg + i); } - __device__ __forceinline__ float& yGlobal(int i) { return m_yg[i]; } - __device__ __forceinline__ float yGlobal(int i) const { return __ldg(m_yg + i); } - __device__ __forceinline__ float& zGlobal(int i) { return m_zg[i]; } - __device__ __forceinline__ float zGlobal(int i) const { return __ldg(m_zg + i); } - 
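// (note on the accessor pattern of this legacy view: the non-const overloads return
//  references so producer kernels can fill the SoA in place, while the const overloads
//  read through __ldg() so consumers fetch via the read-only data cache)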
__device__ __forceinline__ float& rGlobal(int i) { return m_rg[i]; } - __device__ __forceinline__ float rGlobal(int i) const { return __ldg(m_rg + i); } - - __device__ __forceinline__ int16_t& iphi(int i) { return m_iphi[i]; } - __device__ __forceinline__ int16_t iphi(int i) const { return __ldg(m_iphi + i); } - - __device__ __forceinline__ void setChargeAndStatus(int i, uint32_t ich, Status is) { - ich = std::min(ich, chargeMask()); - uint32_t w = *reinterpret_cast(&is); - ich |= (w << 24); - m_chargeAndStatus[i] = ich; - } - - __device__ __forceinline__ uint32_t charge(int i) const { return __ldg(m_chargeAndStatus + i) & 0xFFFFFF; } - - __device__ __forceinline__ Status status(int i) const { - uint8_t w = __ldg(m_chargeAndStatus + i) >> 24; - return *reinterpret_cast(&w); - } - - __device__ __forceinline__ int16_t& clusterSizeX(int i) { return m_xsize[i]; } - __device__ __forceinline__ int16_t clusterSizeX(int i) const { return __ldg(m_xsize + i); } - __device__ __forceinline__ int16_t& clusterSizeY(int i) { return m_ysize[i]; } - __device__ __forceinline__ int16_t clusterSizeY(int i) const { return __ldg(m_ysize + i); } - __device__ __forceinline__ uint16_t& detectorIndex(int i) { return m_detInd[i]; } - __device__ __forceinline__ uint16_t detectorIndex(int i) const { return __ldg(m_detInd + i); } - - __device__ __forceinline__ ParamsOnGPU const& cpeParams() const { return *m_cpeParams; } - - __device__ __forceinline__ uint32_t hitsModuleStart(int i) const { return __ldg(m_hitsModuleStart + i); } - - __device__ __forceinline__ uint32_t* hitsLayerStart() { return m_hitsLayerStart; } - __device__ __forceinline__ uint32_t const* hitsLayerStart() const { return m_hitsLayerStart; } - - __device__ __forceinline__ PhiBinner& phiBinner() { return *m_phiBinner; } - __device__ __forceinline__ PhiBinner const& phiBinner() const { return *m_phiBinner; } - - __device__ __forceinline__ AverageGeometry& averageGeometry() { return *m_averageGeometry; } - __device__ __forceinline__ AverageGeometry const& averageGeometry() const { return *m_averageGeometry; } - - __device__ __forceinline__ bool clusterCut(int i, int o, bool debug = false) const { return false; } - __device__ __forceinline__ bool zSizeCut(int i, int o, bool debug = false) const { return false; } - -private: - // local coord - float *m_xl, *m_yl; - float *m_xerr, *m_yerr; - - // global coord - float *m_xg, *m_yg, *m_zg, *m_rg; - int16_t* m_iphi; - - // cluster properties - static constexpr uint32_t chargeMask() { return (1 << 24) - 1; } - uint32_t* m_chargeAndStatus; - int16_t* m_xsize; - int16_t* m_ysize; - uint16_t* m_detInd; - - // supporting objects - // m_averageGeometry is corrected for beam spot, not sure where to host it otherwise - AverageGeometry* m_averageGeometry; // owned by TrackingRecHit2DHeterogeneous - ParamsOnGPU const* m_cpeParams; // forwarded from setup, NOT owned - uint32_t const* m_hitsModuleStart; // forwarded from clusters - - uint32_t* m_hitsLayerStart; - - PhiBinner* m_phiBinner; - typename PhiBinner::index_type* m_phiBinnerStorage; - - uint32_t m_nHits; -}; - -#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DSOAView_h diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h new file mode 100644 index 0000000000000..a64f017876439 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h @@ -0,0 +1,94 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsDevice_h +#define 
CUDADataFormats_RecHits_TrackingRecHitsDevice_h + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackingRecHitSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + using hitSoA = TrackingRecHitSoA; + //Need to decorate the class with the inherited portable accessors being now a template + using cms::cuda::PortableDeviceCollection>::view; + using cms::cuda::PortableDeviceCollection>::const_view; + using cms::cuda::PortableDeviceCollection>::buffer; + using cms::cuda::PortableDeviceCollection>::bufferSize; + + TrackingRecHitSoADevice() = default; // cms::cuda::Product needs this + + using AverageGeometry = typename hitSoA::AverageGeometry; + using ParamsOnGPU = typename hitSoA::ParamsOnGPU; + using PhiBinnerStorageType = typename hitSoA::PhiBinnerStorageType; + using PhiBinner = typename hitSoA::PhiBinner; + // Constructor which specifies the SoA size + explicit TrackingRecHitSoADevice(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : cms::cuda::PortableDeviceCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + hitsModuleStart_(hitsModuleStart), + offsetBPIX2_(offsetBPIX2) { + phiBinner_ = &(view().phiBinner()); + cudaCheck(cudaMemcpyAsync(&(view().nHits()), &nHits, sizeof(uint32_t), cudaMemcpyDefault, stream)); + // hitsModuleStart is on Device + cudaCheck(cudaMemcpyAsync(view().hitsModuleStart().data(), + hitsModuleStart, + sizeof(uint32_t) * int(TrackerTraits::numberOfModules + 1), + cudaMemcpyDefault, + stream)); + cudaCheck(cudaMemcpyAsync(&(view().offsetBPIX2()), &offsetBPIX2, sizeof(int32_t), cudaMemcpyDefault, stream)); + + // cpeParams argument is a pointer to device memory, copy + // its contents into the Layout. + cudaCheck(cudaMemcpyAsync(&(view().cpeParams()), cpeParams, int(sizeof(ParamsOnGPU)), cudaMemcpyDefault, stream)); + } + + uint32_t nHits() const { return nHits_; } //go to size of view + + cms::cuda::host::unique_ptr localCoordToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(4 * nHits(), stream); + size_t rowSize = sizeof(float) * nHits(); + cudaCheck(cudaMemcpyAsync(ret.get(), view().xLocal(), rowSize * 4, cudaMemcpyDefault, stream)); + + return ret; + } //move to utilities + + cms::cuda::host::unique_ptr hitsModuleStartToHostAsync(cudaStream_t stream) const { + auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); + cudaCheck(cudaMemcpyAsync(ret.get(), + view().hitsModuleStart().data(), + sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), + cudaMemcpyDefault, + stream)); + return ret; + } + + auto phiBinnerStorage() { return phiBinnerStorage_; } + auto hitsModuleStart() const { return hitsModuleStart_; } + uint32_t offsetBPIX2() const { return offsetBPIX2_; } + auto phiBinner() { return phiBinner_; } + +private: + uint32_t nHits_; //Needed for the host SoA size + + //TODO: this is used not that much from the hits (only once in BrokenLineFit), would make sens to remove it from this class. 
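// (cpeParams_ below only caches the device pointer passed to the constructor; the
//  parameters themselves have already been copied into the cpeParams() scalar of the SoA)
//
// Minimal usage sketch, assuming device-resident cpeParams_d / hitsModuleStart_d and a
// user kernel fillHits taking a TrackingRecHitSoAView (illustration only, not part of this file):
//
//   TrackingRecHitSoADevice<pixelTopology::Phase1> hits_d(nHits, offsetBPIX2, cpeParams_d, hitsModuleStart_d, stream);
//   fillHits<<<blocks, threads, 0, stream>>>(hits_d.view());
//   auto lc_h = hits_d.localCoordToHostAsync(stream);  // 4 * nHits floats: xLocal, yLocal, xerrLocal, yerrLocal
//   cudaCheck(cudaStreamSynchronize(stream));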
+ ParamsOnGPU const* cpeParams_; + uint32_t const* hitsModuleStart_; + uint32_t offsetBPIX2_; + + PhiBinnerStorageType* phiBinnerStorage_; + PhiBinner* phiBinner_; +}; + +//Classes definition for Phase1/Phase2, to make the classes_def lighter. Not actually used in the code. +using TrackingRecHitSoADevicePhase1 = TrackingRecHitSoADevice; +using TrackingRecHitSoADevicePhase2 = TrackingRecHitSoADevice; + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h new file mode 100644 index 0000000000000..f8bbe61f4a781 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h @@ -0,0 +1,80 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsHost_h +#define CUDADataFormats_RecHits_TrackingRecHitsHost_h + +#include + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +template +class TrackingRecHitSoAHost : public cms::cuda::PortableHostCollection> { +public: + using hitSoA = TrackingRecHitSoA; + //Need to decorate the class with the inherited portable accessors being now a template + using cms::cuda::PortableHostCollection>::view; + using cms::cuda::PortableHostCollection>::const_view; + using cms::cuda::PortableHostCollection>::buffer; + using cms::cuda::PortableHostCollection>::bufferSize; + + TrackingRecHitSoAHost() = default; + + using AverageGeometry = typename hitSoA::AverageGeometry; + using ParamsOnGPU = typename hitSoA::ParamsOnGPU; + using PhiBinnerStorageType = typename hitSoA::PhiBinnerStorageType; + using PhiBinner = typename hitSoA::PhiBinner; + + // This SoA Host is used basically only for DQM + // so we just need a slim constructor + explicit TrackingRecHitSoAHost(uint32_t nHits) + : cms::cuda::PortableHostCollection>(nHits) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, cudaStream_t stream) + : cms::cuda::PortableHostCollection>(nHits, stream) {} + + explicit TrackingRecHitSoAHost(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart) + : cms::cuda::PortableHostCollection>(nHits), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + view().nHits() = nHits; + std::copy(hitsModuleStart, hitsModuleStart + TrackerTraits::numberOfModules + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } + + explicit TrackingRecHitSoAHost(uint32_t nHits, + int32_t offsetBPIX2, + ParamsOnGPU const* cpeParams, + uint32_t const* hitsModuleStart, + cudaStream_t stream) + : cms::cuda::PortableHostCollection>(nHits, stream), + nHits_(nHits), + cpeParams_(cpeParams), + offsetBPIX2_(offsetBPIX2) { + view().nHits() = nHits; + std::copy(hitsModuleStart, hitsModuleStart + TrackerTraits::numberOfModules + 1, view().hitsModuleStart().begin()); + memcpy(&(view().cpeParams()), cpeParams, sizeof(ParamsOnGPU)); + view().offsetBPIX2() = offsetBPIX2; + } + + uint32_t nHits() const { return nHits_; } + uint32_t offsetBPIX2() const { return offsetBPIX2_; } + auto phiBinnerStorage() { return phiBinnerStorage_; } + +private: + uint32_t nHits_; //Needed for the host SoA size + ParamsOnGPU const* cpeParams_; + uint32_t offsetBPIX2_; + + 
PhiBinnerStorageType* phiBinnerStorage_; +}; + +using TrackingRecHitSoAHostPhase1 = TrackingRecHitSoAHost; +using TrackingRecHitSoAHostPhase2 = TrackingRecHitSoAHost; + +#endif // CUDADataFormats_Track_TrackHeterogeneousT_H diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h new file mode 100644 index 0000000000000..7e28cb97becc8 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h @@ -0,0 +1,66 @@ +#ifndef CUDADataFormats_RecHits_TrackingRecHitsUtilities_h +#define CUDADataFormats_RecHits_TrackingRecHitsUtilities_h + +#include +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" +#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h" +#include "DataFormats/SoATemplate/interface/SoALayout.h" +#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "SiPixelHitStatus.h" + +template +struct TrackingRecHitSoA { + using hindex_type = typename TrackerTraits::hindex_type; + using PhiBinner = cms::cuda::HistoContainer; //28 for phase2 geometry + + using PhiBinnerStorageType = typename PhiBinner::index_type; + using AverageGeometry = pixelTopology::AverageGeometryT; + using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; + + using HitLayerStartArray = std::array; + using HitModuleStartArray = std::array; + + //Is it better to have two split? + GENERATE_SOA_LAYOUT(TrackingRecHitSoALayout, + SOA_COLUMN(float, xLocal), + SOA_COLUMN(float, yLocal), + SOA_COLUMN(float, xerrLocal), + SOA_COLUMN(float, yerrLocal), + SOA_COLUMN(float, xGlobal), + SOA_COLUMN(float, yGlobal), + SOA_COLUMN(float, zGlobal), + SOA_COLUMN(float, rGlobal), + SOA_COLUMN(int16_t, iphi), + SOA_COLUMN(SiPixelHitStatusAndCharge, chargeAndStatus), + SOA_COLUMN(int16_t, clusterSizeX), + SOA_COLUMN(int16_t, clusterSizeY), + SOA_COLUMN(uint16_t, detectorIndex), + + SOA_SCALAR(uint32_t, nHits), + SOA_SCALAR(int32_t, offsetBPIX2), + //These above could be separated in a specific + //layout since they don't depends on the template + //for the moment I'm keeping them here + SOA_COLUMN(PhiBinnerStorageType, phiBinnerStorage), + SOA_SCALAR(HitModuleStartArray, hitsModuleStart), + SOA_SCALAR(HitLayerStartArray, hitsLayerStart), + SOA_SCALAR(ParamsOnGPU, cpeParams), + SOA_SCALAR(AverageGeometry, averageGeometry), + SOA_SCALAR(PhiBinner, phiBinner)); +}; + +template +using TrackingRecHitLayout = typename TrackingRecHitSoA::template TrackingRecHitSoALayout<>; +template +using TrackingRecHitSoAView = typename TrackingRecHitSoA::template TrackingRecHitSoALayout<>::View; +template +using TrackingRecHitSoAConstView = + typename TrackingRecHitSoA::template TrackingRecHitSoALayout<>::ConstView; + +#endif diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc deleted file mode 100644 index 05c3eba3d8bde..0000000000000 --- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc +++ /dev/null @@ -1,49 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -template 
-cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::localCoordToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(5 * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store32, 5 * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store32ToHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(this->n32) * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store32, static_cast(this->n32) * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::store16ToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(static_cast(this->n16) * this->nHits(), stream); - cms::cuda::copyAsync(ret, this->m_store16, static_cast(this->n16) * this->nHits(), stream); - return ret; -} - -template -cms::cuda::host::unique_ptr TrackingRecHit2DGPUT::hitsModuleStartToHostAsync( - cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(TrackerTraits::numberOfModules + 1, stream); - cudaCheck(cudaMemcpyAsync(ret.get(), - this->m_hitsModuleStart, - sizeof(uint32_t) * (TrackerTraits::numberOfModules + 1), - cudaMemcpyDefault, - stream)); - return ret; -} - -template class TrackingRecHit2DGPUT; -template class TrackingRecHit2DGPUT; - -template class TrackingRecHit2DCPUT; -template class TrackingRecHit2DCPUT; - -template class TrackingRecHit2DHostT; -template class TrackingRecHit2DHostT; diff --git a/CUDADataFormats/TrackingRecHit/src/classes.h b/CUDADataFormats/TrackingRecHit/src/classes.h index b9a20695712e3..1f494d0517450 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes.h +++ b/CUDADataFormats/TrackingRecHit/src/classes.h @@ -2,8 +2,8 @@ #define CUDADataFormats_TrackingRecHit_src_classes_h #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Wrapper.h" #endif // CUDADataFormats_TrackingRecHit_src_classes_h diff --git a/CUDADataFormats/TrackingRecHit/src/classes_def.xml b/CUDADataFormats/TrackingRecHit/src/classes_def.xml index 4287860ee8495..6c2389e829549 100644 --- a/CUDADataFormats/TrackingRecHit/src/classes_def.xml +++ b/CUDADataFormats/TrackingRecHit/src/classes_def.xml @@ -1,22 +1,16 @@ - - - - - - - - - - - - + + - - + + - - + + + + + + diff --git a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml index f064563aa7051..7baacbac416a1 100644 --- a/CUDADataFormats/TrackingRecHit/test/BuildFile.xml +++ b/CUDADataFormats/TrackingRecHit/test/BuildFile.xml @@ -1,6 +1,7 @@ + - + diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h deleted file mode 100644 index b2da57c2471ae..0000000000000 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDAImpl_t.h +++ /dev/null @@ -1,26 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" - -namespace testTrackingRecHit2D { - - template - __global__ void fill(TrackingRecHit2DSOAViewT* phits) { - assert(phits); - auto& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - 
return; - } - - template - __global__ void verify(TrackingRecHit2DSOAViewT const* phits) { - assert(phits); - auto const& hits = *phits; - assert(hits.nHits() == 200); - - int i = threadIdx.x; - if (i > 200) - return; - } -} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp deleted file mode 100644 index 0d910273933dc..0000000000000 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" - -namespace testTrackingRecHit2D { - - template - void runKernels(TrackingRecHit2DSOAViewT* hits); -} // namespace testTrackingRecHit2D - -int main() { - cms::cudatest::requireDevices(); - - cudaStream_t stream; - cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - - auto nHits = 200; - // inner scope to deallocate memory before destroying the stream - { - TrackingRecHit2DGPUT tkhit(nHits, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhit.view()); - - TrackingRecHit2DGPUT tkhitPhase2(nHits, 0, nullptr, nullptr, stream); - testTrackingRecHit2D::runKernels(tkhitPhase2.view()); - - TrackingRecHit2DHostT tkhitH(nHits, 0, nullptr, nullptr, stream, &tkhit); - cudaStreamSynchronize(stream); - assert(tkhitH.view()); - assert(tkhitH.view()->nHits() == unsigned(nHits)); - - TrackingRecHit2DHostT tkhitHPhase2(nHits, 0, nullptr, nullptr, stream, &tkhitPhase2); - cudaStreamSynchronize(stream); - assert(tkhitHPhase2.view()); - assert(tkhitHPhase2.view()->nHits() == unsigned(nHits)); - } - - cudaCheck(cudaStreamDestroy(stream)); - - return 0; -} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu deleted file mode 100644 index e902ea971edf3..0000000000000 --- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cu +++ /dev/null @@ -1,15 +0,0 @@ -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "TrackingRecHit2DCUDAImpl_t.h" - -namespace testTrackingRecHit2D { - - template - void runKernels(TrackingRecHit2DSOAViewT* hits) { - assert(hits); - fill<<<1, 1024>>>(hits); - verify<<<1, 1024>>>(hits); - } - - template void runKernels(TrackingRecHit2DSOAViewT* hits); - template void runKernels(TrackingRecHit2DSOAViewT* hits); -} // namespace testTrackingRecHit2D diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp new file mode 100644 index 0000000000000..146bb9133d9d8 --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cpp @@ -0,0 +1,50 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include 
"Geometry/CommonTopologies/interface/SimplePixelTopology.h" + +namespace testTrackingRecHitSoA { + + template + void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream); + +} + +int main() { + using ParamsOnGPU = TrackingRecHitSoADevice::ParamsOnGPU; + cms::cudatest::requireDevices(); + + cudaStream_t stream; + cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamDefault)); + + // inner scope to deallocate memory before destroying the stream + { + uint32_t nHits = 2000; + int32_t offset = 100; + uint32_t moduleStart[1856]; + + for (size_t i = 0; i < 1856; i++) { + moduleStart[i] = i * 2; + } + ParamsOnGPU* cpeParams_d; + cudaCheck(cudaMalloc(&cpeParams_d, sizeof(ParamsOnGPU))); + TrackingRecHitSoADevice tkhit(nHits, offset, cpeParams_d, &moduleStart[0], stream); + + testTrackingRecHitSoA::runKernels(tkhit, stream); + printf("tkhit hits %d \n", tkhit.nHits()); + auto test = tkhit.localCoordToHostAsync(stream); + printf("test[9] %.2f\n", test[9]); + + auto ret = tkhit.hitsModuleStartToHostAsync(stream); + printf("mods[9] %d\n", ret[9]); + cudaCheck(cudaFree(cpeParams_d)); + } + + cudaCheck(cudaStreamDestroy(stream)); + + return 0; +} diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu new file mode 100644 index 0000000000000..490f30fa6b7bd --- /dev/null +++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHitSoA_test.cu @@ -0,0 +1,64 @@ +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + +namespace testTrackingRecHitSoA { + + template + __global__ void fill(TrackingRecHitSoAView soa) { + int i = threadIdx.x; + int j = blockIdx.x; + if (i == 0 and j == 0) { + soa.offsetBPIX2() = 22; + soa[10].xLocal() = 1.11; + } + + soa[i].iphi() = i % 10; + soa.hitsLayerStart()[j] = j; + __syncthreads(); + } + + template + __global__ void show(TrackingRecHitSoAView soa) { + int i = threadIdx.x; + int j = blockIdx.x; + + if (i == 0 and j == 0) { + printf("nbins = %d \n", soa.phiBinner().nbins()); + printf("offsetBPIX %d ->%d \n", i, soa.offsetBPIX2()); + printf("nHits %d ->%d \n", i, soa.nHits()); + printf("hitsModuleStart %d ->%d \n", i, soa.hitsModuleStart().at(28)); + } + + if (i < 10) // can be increased to soa.nHits() for debugging + printf("iPhi %d ->%d \n", i, soa[i].iphi()); + + if (j * blockDim.x + i < 10) // can be increased to soa.phiBinner().nbins() for debugging + printf(">bin size %d ->%d \n", j * blockDim.x + i, soa.phiBinner().size(j * blockDim.x + i)); + __syncthreads(); + } + + template + void runKernels(TrackingRecHitSoADevice& hits, cudaStream_t stream) { + printf("> RUN!\n"); + fill<<<10, 100, 0, stream>>>(hits.view()); + + cudaCheck(cudaDeviceSynchronize()); + cms::cuda::fillManyFromVector(hits.phiBinner(), + 10, + hits.view().iphi(), + hits.view().hitsLayerStart().data(), + 2000, + 256, + hits.view().phiBinnerStorage(), + stream); + cudaCheck(cudaDeviceSynchronize()); + show<<<10, 1000, 0, stream>>>(hits.view()); + cudaCheck(cudaDeviceSynchronize()); + } + + template void runKernels(TrackingRecHitSoADevice& hits, + cudaStream_t stream); + template void runKernels(TrackingRecHitSoADevice& hits, + cudaStream_t stream); + +} // namespace testTrackingRecHitSoA diff --git a/CUDADataFormats/Vertex/BuildFile.xml b/CUDADataFormats/Vertex/BuildFile.xml index f61e4aff7403f..c6b918ec4b12b 100644 --- a/CUDADataFormats/Vertex/BuildFile.xml +++ b/CUDADataFormats/Vertex/BuildFile.xml @@ -3,6 
+3,7 @@ + diff --git a/CUDADataFormats/Vertex/README.md b/CUDADataFormats/Vertex/README.md new file mode 100644 index 0000000000000..3e495d15f776e --- /dev/null +++ b/CUDADataFormats/Vertex/README.md @@ -0,0 +1,45 @@ +# Vertex CUDA Data Formats + +`CUDADataFormat`s meant to be used on Host (CPU) or Device (CUDA GPU) for +storing information about vertices created during the Pixel-local Reconstruction +chain. It stores data in an SoA manner. It contains the data that was previously +contained in the deprecated `ZVertexSoA` class. + +The host format is inheriting from `CUDADataFormats/Common/interface/PortableHostCollection.h`, +while the device format is inheriting from `CUDADataFormats/Common/interface/PortableDeviceCollection.h` + +Both formats use the same SoA Layout (`ZVertexSoAHeterogeneousLayout`) which is generated +via the `GENERATE_SOA_LAYOUT` macro in the `ZVertexUtilities.h` file. + +## Notes + +- Initially, `ZVertexSoA` had distinct array sizes for each attribute (e.g. `zv` was `MAXVTX` elements +long, `ndof` was `MAXTRACKS` elements long). All columns are now of uniform `MAXTRACKS` size, +meaning that there will be some wasted space (appx. 190kB). +- Host and Device classes should **not** be created via inheritance, as they're done here, +but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). + +## ZVertexHeterogeneousHost + +The version of the data format to be used for storing vertex data on the CPU. +Instances of this class are to be used for: + +- Having a place to copy data to host from device, via `cudaMemcpy`, or +- Running host-side algorithms using data stored in an SoA manner. + +## ZVertexHeterogeneousDevice + +The version of the data format to be used for storing vertex data on the GPU. + +Instances of `ZVertexHeterogeneousDevice` are to be created on host and be +used on device only. To do so, the instance's `view()` method is to be called +to pass a `View` to any kernel launched. Accessing data from the `view()` is not +possible on the host side. + +## Utilities + +Apart from `ZVertexSoAHeterogeneousLayout`, `ZVertexUtilities.h` also contains +a collection of methods which were originally +defined as class methods inside the `ZVertexSoA` class +which have been adapted to operate on `View` instances, so that they are callable +from within `__global__` kernels, on both CPU and CPU. diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h new file mode 100644 index 0000000000000..ae662d7fd5f9a --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h @@ -0,0 +1,22 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" + +// TODO: The class is created via inheritance of the PortableDeviceCollection. +// This is generally discouraged, and should be done via composition. 
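// (a composition-based alternative would instead hold the collection as a data member, e.g.
//  cms::cuda::PortableDeviceCollection<ZVertexSoAHeterogeneousLayout<>> collection_;
//  sketch only, not what this patch implements)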
+// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class ZVertexSoAHeterogeneousDevice : public cms::cuda::PortableDeviceCollection> { +public: + ZVertexSoAHeterogeneousDevice() = default; // cms::cuda::Product needs this + + // Constructor which specifies the SoA size + explicit ZVertexSoAHeterogeneousDevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +using ZVertexSoADevice = ZVertexSoAHeterogeneousDevice; + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousDevice_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h new file mode 100644 index 0000000000000..6b62d615e1d11 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h @@ -0,0 +1,24 @@ +#ifndef CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H +#define CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H + +#include + +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" + +// TODO: The class is created via inheritance of the PortableHostCollection. +// This is generally discouraged, and should be done via composition. +// See: https://github.com/cms-sw/cmssw/pull/40465#discussion_r1067364306 +template +class ZVertexSoAHeterogeneousHost : public cms::cuda::PortableHostCollection> { +public: + explicit ZVertexSoAHeterogeneousHost() : cms::cuda::PortableHostCollection>(S) {} + + // Constructor which specifies the SoA size and CUDA stream + explicit ZVertexSoAHeterogeneousHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +using ZVertexSoAHost = ZVertexSoAHeterogeneousHost; + +#endif // CUDADataFormats_Vertex_ZVertexHeterogeneousHost_H diff --git a/CUDADataFormats/Vertex/interface/ZVertexUtilities.h b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h new file mode 100644 index 0000000000000..2403652377971 --- /dev/null +++ b/CUDADataFormats/Vertex/interface/ZVertexUtilities.h @@ -0,0 +1,35 @@ +#ifndef CUDADataFormats_Vertex_ZVertexUtilities_h +#define CUDADataFormats_Vertex_ZVertexUtilities_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +GENERATE_SOA_LAYOUT(ZVertexSoAHeterogeneousLayout, + SOA_COLUMN(int16_t, idv), + SOA_COLUMN(float, zv), + SOA_COLUMN(float, wv), + SOA_COLUMN(float, chi2), + SOA_COLUMN(float, ptv2), + SOA_COLUMN(int32_t, ndof), + SOA_COLUMN(uint16_t, sortInd), + SOA_SCALAR(uint32_t, nvFinal)) + +// Previous ZVertexSoA class methods. +// They operate on View and ConstView of the ZVertexSoA. 
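// End-to-end sketch of the pattern described in the Vertex README above, assuming an
// existing cudaStream_t stream and a user kernel fitVertices taking a ZVertexSoAView
// (illustration only; buffer()/bufferSize() are the PortableCollection helpers used
// elsewhere in this PR):
//
//   ZVertexSoADevice vertices_d(stream);          // device SoA, MAXTRACKS elements
//   fitVertices<<<blocks, threads, 0, stream>>>(vertices_d.view());
//   ZVertexSoAHost vertices_h(stream);            // pinned host SoA with the same layout
//   cudaCheck(cudaMemcpyAsync(vertices_h.buffer().get(), vertices_d.buffer().get(),
//                             vertices_d.bufferSize(), cudaMemcpyDeviceToHost, stream));
//   cudaCheck(cudaStreamSynchronize(stream));
//   uint32_t nv = vertices_h.view().nvFinal();    // number of reconstructed vertices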
+namespace zVertex { + // Common types for both Host and Device code + using ZVertexSoALayout = ZVertexSoAHeterogeneousLayout<>; + using ZVertexSoAView = ZVertexSoAHeterogeneousLayout<>::View; + using ZVertexSoAConstView = ZVertexSoAHeterogeneousLayout<>::ConstView; + + namespace utilities { + + static constexpr uint32_t MAXTRACKS = 128 * 1024; + static constexpr uint32_t MAXVTX = 1024; + + __host__ __device__ inline void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } + + } // namespace utilities +} // namespace zVertex + +#endif diff --git a/CUDADataFormats/Vertex/src/classes.h b/CUDADataFormats/Vertex/src/classes.h index 7931beaa8f4bd..0340affffa06c 100644 --- a/CUDADataFormats/Vertex/src/classes.h +++ b/CUDADataFormats/Vertex/src/classes.h @@ -1,7 +1,8 @@ #ifndef CUDADataFormats_Vertex_src_classes_h #define CUDADataFormats_Vertex_src_classes_h -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "CUDADataFormats/Common/interface/Product.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml index ea633080af9af..63bd5a1cc94a7 100644 --- a/CUDADataFormats/Vertex/src/classes_def.xml +++ b/CUDADataFormats/Vertex/src/classes_def.xml @@ -1,6 +1,7 @@ - - - - + + + + + diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc index 9e054ecd17898..c13aa5eb47b42 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareRecHitsSoA.cc @@ -18,7 +18,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" // Geometry #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -30,8 +31,8 @@ template class SiPixelCompareRecHitsSoA : public DQMEDAnalyzer { public: - using HitSoA = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitSoA = TrackingRecHitSoAView; + using HitsOnHost = TrackingRecHitSoAHost; explicit SiPixelCompareRecHitsSoA(const edm::ParameterSet&); ~SiPixelCompareRecHitsSoA() override = default; @@ -43,8 +44,8 @@ class SiPixelCompareRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; - const edm::EDGetTokenT tokenSoAHitsGPU_; + const edm::EDGetTokenT tokenSoAHitsHost_; //these two are both on Host but originally they have been + const edm::EDGetTokenT tokenSoAHitsDevice_; //produced on Host or on Device const std::string topFolderName_; const float mind2cut_; static constexpr uint32_t invalidHit_ = std::numeric_limits::max(); @@ -82,8 +83,8 @@ template SiPixelCompareRecHitsSoA::SiPixelCompareRecHitsSoA(const edm::ParameterSet& iConfig) : geomToken_(esConsumes()), topoToken_(esConsumes()), - tokenSoAHitsCPU_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), - 
tokenSoAHitsGPU_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), + tokenSoAHitsHost_(consumes(iConfig.getParameter("pixelHitsSrcCPU"))), + tokenSoAHitsDevice_(consumes(iConfig.getParameter("pixelHitsSrcGPU"))), topFolderName_(iConfig.getParameter("topFolderName")), mind2cut_(iConfig.getParameter("minD2cut")) {} // @@ -100,39 +101,41 @@ void SiPixelCompareRecHitsSoA::dqmBeginRun(const edm::Run& iRun, const edm::E // template void SiPixelCompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { - const auto& rhsoaHandleCPU = iEvent.getHandle(tokenSoAHitsCPU_); - const auto& rhsoaHandleGPU = iEvent.getHandle(tokenSoAHitsGPU_); - if (not rhsoaHandleCPU or not rhsoaHandleGPU) { + const auto& rhsoaHandleHost = iEvent.getHandle(tokenSoAHitsHost_); + const auto& rhsoaHandleDevice = iEvent.getHandle(tokenSoAHitsDevice_); + if (not rhsoaHandleHost or not rhsoaHandleDevice) { edm::LogWarning out("SiPixelCompareRecHitSoA"); - if (not rhsoaHandleCPU) { - out << "reference (cpu) rechits not found; "; + if (not rhsoaHandleHost) { + out << "reference (Host) rechits not found; "; } - if (not rhsoaHandleGPU) { - out << "target (gpu) rechits not found; "; + if (not rhsoaHandleDevice) { + out << "target (Device) rechits not found; "; } out << "the comparison will not run."; return; } - auto const& rhsoaCPU = *rhsoaHandleCPU; - const HitSoA* soa2dCPU = rhsoaCPU.view(); - auto const& rhsoaGPU = *rhsoaHandleGPU; - const HitSoA* soa2dGPU = rhsoaGPU.view(); + auto const& rhsoaHost = *rhsoaHandleHost; + auto const& rhsoaDevice = *rhsoaHandleDevice; - uint32_t nHitsCPU = soa2dCPU->nHits(); - uint32_t nHitsGPU = soa2dGPU->nHits(); - hnHits_->Fill(nHitsCPU, nHitsGPU); + auto const& soa2dHost = rhsoaHost.const_view(); + auto const& soa2dDevice = rhsoaDevice.const_view(); + + uint32_t nHitsHost = soa2dHost.nHits(); + uint32_t nHitsDevice = soa2dDevice.nHits(); + + hnHits_->Fill(nHitsHost, nHitsDevice); auto detIds = tkGeom_->detUnitIds(); - for (uint32_t i = 0; i < nHitsCPU; i++) { + for (uint32_t i = 0; i < nHitsHost; i++) { float minD = mind2cut_; uint32_t matchedHit = invalidHit_; - uint16_t indCPU = soa2dCPU->detectorIndex(i); - float xLocalCPU = soa2dCPU->xLocal(i); - float yLocalCPU = soa2dCPU->yLocal(i); - for (uint32_t j = 0; j < nHitsGPU; j++) { - if (soa2dGPU->detectorIndex(j) == indCPU) { - float dx = xLocalCPU - soa2dGPU->xLocal(j); - float dy = yLocalCPU - soa2dGPU->yLocal(j); + uint16_t indHost = soa2dHost[i].detectorIndex(); + float xLocalHost = soa2dHost[i].xLocal(); + float yLocalHost = soa2dHost[i].yLocal(); + for (uint32_t j = 0; j < nHitsDevice; j++) { + if (soa2dDevice.detectorIndex(j) == indHost) { + float dx = xLocalHost - soa2dDevice[j].xLocal(); + float dy = yLocalHost - soa2dDevice[j].yLocal(); float distance = dx * dx + dy * dy; if (distance < minD) { minD = distance; @@ -140,46 +143,46 @@ void SiPixelCompareRecHitsSoA::analyze(const edm::Event& iEvent, const edm::E } } } - DetId id = detIds[indCPU]; - uint32_t chargeCPU = soa2dCPU->charge(i); - int16_t sizeXCPU = std::ceil(float(std::abs(soa2dCPU->clusterSizeX(i)) / 8.)); - int16_t sizeYCPU = std::ceil(float(std::abs(soa2dCPU->clusterSizeY(i)) / 8.)); - uint32_t chargeGPU = 0; - int16_t sizeXGPU = -99; - int16_t sizeYGPU = -99; - float xLocalGPU = -999.; - float yLocalGPU = -999.; + DetId id = detIds[indHost]; + uint32_t chargeHost = soa2dHost[i].chargeAndStatus().charge; + int16_t sizeXHost = std::ceil(float(std::abs(soa2dHost[i].clusterSizeX()) / 8.)); + int16_t sizeYHost = 
std::ceil(float(std::abs(soa2dHost[i].clusterSizeY()) / 8.)); + uint32_t chargeDevice = 0; + int16_t sizeXDevice = -99; + int16_t sizeYDevice = -99; + float xLocalDevice = -999.; + float yLocalDevice = -999.; if (matchedHit != invalidHit_) { - chargeGPU = soa2dGPU->charge(matchedHit); - sizeXGPU = std::ceil(float(std::abs(soa2dGPU->clusterSizeX(matchedHit)) / 8.)); - sizeYGPU = std::ceil(float(std::abs(soa2dGPU->clusterSizeY(matchedHit)) / 8.)); - xLocalGPU = soa2dGPU->xLocal(matchedHit); - yLocalGPU = soa2dGPU->yLocal(matchedHit); + chargeDevice = soa2dDevice[matchedHit].chargeAndStatus().charge; + sizeXDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeX()) / 8.)); + sizeYDevice = std::ceil(float(std::abs(soa2dDevice[matchedHit].clusterSizeY()) / 8.)); + xLocalDevice = soa2dDevice[matchedHit].xLocal(); + yLocalDevice = soa2dDevice[matchedHit].yLocal(); } switch (id.subdetId()) { case PixelSubdetector::PixelBarrel: - hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeCPU, chargeGPU); - hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXCPU, sizeXGPU); - hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYCPU, sizeYGPU); - hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalCPU, xLocalGPU); - hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalCPU, yLocalGPU); - hBchargeDiff_->Fill(chargeCPU - chargeGPU); - hBsizeXDiff_->Fill(sizeXCPU - sizeXGPU); - hBsizeYDiff_->Fill(sizeYCPU - sizeYGPU); - hBposXDiff_->Fill(micron_ * (xLocalCPU - xLocalGPU)); - hBposYDiff_->Fill(micron_ * (yLocalCPU - yLocalGPU)); + hBchargeL_[tTopo_->pxbLayer(id) - 1]->Fill(chargeHost, chargeDevice); + hBsizexL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeXHost, sizeXDevice); + hBsizeyL_[tTopo_->pxbLayer(id) - 1]->Fill(sizeYHost, sizeYDevice); + hBposxL_[tTopo_->pxbLayer(id) - 1]->Fill(xLocalHost, xLocalDevice); + hBposyL_[tTopo_->pxbLayer(id) - 1]->Fill(yLocalHost, yLocalDevice); + hBchargeDiff_->Fill(chargeHost - chargeDevice); + hBsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hBsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hBposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + hBposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); break; case PixelSubdetector::PixelEndcap: - hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeCPU, chargeGPU); - hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXCPU, sizeXGPU); - hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYCPU, sizeYGPU); - hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalCPU, xLocalGPU); - hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalCPU, yLocalGPU); - hFchargeDiff_->Fill(chargeCPU - chargeGPU); - hFsizeXDiff_->Fill(sizeXCPU - sizeXGPU); - hFsizeYDiff_->Fill(sizeYCPU - sizeYGPU); - hFposXDiff_->Fill(micron_ * (xLocalCPU - xLocalGPU)); - hFposYDiff_->Fill(micron_ * (yLocalCPU - yLocalGPU)); + hFchargeD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(chargeHost, chargeDevice); + hFsizexD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeXHost, sizeXDevice); + hFsizeyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(sizeYHost, sizeYDevice); + hFposxD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(xLocalHost, xLocalDevice); + hFposyD_[tTopo_->pxfSide(id) - 1][tTopo_->pxfDisk(id) - 1]->Fill(yLocalHost, yLocalDevice); + hFchargeDiff_->Fill(chargeHost - chargeDevice); + hFsizeXDiff_->Fill(sizeXHost - sizeXDevice); + hFsizeYDiff_->Fill(sizeYHost - sizeYDevice); + hFposXDiff_->Fill(micron_ * (xLocalHost - xLocalDevice)); + 
hFposYDiff_->Fill(micron_ * (yLocalHost - yLocalDevice)); break; } } @@ -197,47 +200,47 @@ void SiPixelCompareRecHitsSoA::bookHistograms(DQMStore::IBooker& iBook, // clang-format off //Global - hnHits_ = iBook.book2I("nHits", "CPUvsGPU RecHits per event;#CPU RecHits;#GPU RecHits", 200, 0, 5000,200, 0, 5000); + hnHits_ = iBook.book2I("nHits", "HostvsDevice RecHits per event;#Host RecHits;#Device RecHits", 200, 0, 5000,200, 0, 5000); //Barrel Layer for(unsigned int il=0;ilnumberOfLayers(PixelSubdetector::PixelBarrel);il++){ - hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("CPUvsGPU RecHits Charge Barrel Layer%d;CPU Charge;GPU Charge",il+1), 250, 0, 100000, 250, 0, 100000); - hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("CPUvsGPU RecHits SizeX Barrel Layer%d;CPU SizeX;GPU SizeX",il+1), 30, 0, 30, 30, 0, 30); - hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("CPUvsGPU RecHits SizeY Barrel Layer%d;CPU SizeY;GPU SizeY",il+1), 30, 0, 30, 30, 0, 30); - hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("CPUvsGPU RecHits x-pos in Barrel Layer%d;CPU pos x;GPU pos x",il+1), 200, -5, 5, 200,-5,5); - hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("CPUvsGPU RecHits y-pos in Barrel Layer%d;CPU pos y;GPU pos y",il+1), 200, -5, 5, 200,-5,5); + hBchargeL_[il] = iBook.book2I(Form("recHitsBLay%dCharge",il+1), Form("HostvsDevice RecHits Charge Barrel Layer%d;Host Charge;Device Charge",il+1), 250, 0, 100000, 250, 0, 100000); + hBsizexL_[il] = iBook.book2I(Form("recHitsBLay%dSizex",il+1), Form("HostvsDevice RecHits SizeX Barrel Layer%d;Host SizeX;Device SizeX",il+1), 30, 0, 30, 30, 0, 30); + hBsizeyL_[il] = iBook.book2I(Form("recHitsBLay%dSizey",il+1), Form("HostvsDevice RecHits SizeY Barrel Layer%d;Host SizeY;Device SizeY",il+1), 30, 0, 30, 30, 0, 30); + hBposxL_[il] = iBook.book2D(Form("recHitsBLay%dPosx",il+1), Form("HostvsDevice RecHits x-pos in Barrel Layer%d;Host pos x;Device pos x",il+1), 200, -5, 5, 200,-5,5); + hBposyL_[il] = iBook.book2D(Form("recHitsBLay%dPosy",il+1), Form("HostvsDevice RecHits y-pos in Barrel Layer%d;Host pos y;Device pos y",il+1), 200, -5, 5, 200,-5,5); } //Endcaps //Endcaps Disk for(int is=0;is<2;is++){ int sign=is==0? 
-1:1; for(unsigned int id=0;idnumberOfLayers(PixelSubdetector::PixelEndcap);id++){ - hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("CPUvsGPU RecHits Charge Endcaps Disk%+d;CPU Charge;GPU Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); - hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("CPUvsGPU RecHits SizeX Endcaps Disk%+d;CPU SizeX;GPU SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); - hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("CPUvsGPU RecHits SizeY Endcaps Disk%+d;CPU SizeY;GPU SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); - hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("CPUvsGPU RecHits x-pos Endcaps Disk%+d;CPU pos x;GPU pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); - hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("CPUvsGPU RecHits y-pos Endcaps Disk%+d;CPU pos y;GPU pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFchargeD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dCharge",id*sign+sign), Form("HostvsDevice RecHits Charge Endcaps Disk%+d;Host Charge;Device Charge",id*sign+sign), 250, 0, 100000, 250, 0, 100000); + hFsizexD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizex",id*sign+sign), Form("HostvsDevice RecHits SizeX Endcaps Disk%+d;Host SizeX;Device SizeX",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFsizeyD_[is][id] = iBook.book2I(Form("recHitsFDisk%+dSizey",id*sign+sign), Form("HostvsDevice RecHits SizeY Endcaps Disk%+d;Host SizeY;Device SizeY",id*sign+sign), 30, 0, 30, 30, 0, 30); + hFposxD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosx",id*sign+sign), Form("HostvsDevice RecHits x-pos Endcaps Disk%+d;Host pos x;Device pos x",id*sign+sign), 200, -5, 5, 200, -5, 5); + hFposyD_[is][id] = iBook.book2D(Form("recHitsFDisk%+dPosy",id*sign+sign), Form("HostvsDevice RecHits y-pos Endcaps Disk%+d;Host pos y;Device pos y",id*sign+sign), 200, -5, 5, 200, -5, 5); } } //1D differences - hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits in BPix; rechit charge difference (CPU - GPU)", 101, -50.5, 50.5); - hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (CPU - GPU)", 101, -50.5, 50.5); - hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (CPU - GPU)", 21, -10.5, 10.5); - hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (CPU - GPU)", 21, -10.5, 10.5); - hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (CPU - GPU)", 21, -10.5, 10.5); - hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (CPU - GPU)", 21, -10.5, 10.5); - hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (CPU - GPU)", 1000, -10, 10); - hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (CPU - GPU)", 1000, -10, 10); - hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (CPU - GPU)", 1000, -10, 10); - hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (CPU - GPU)", 1000, -10, 10); + hBchargeDiff_ = iBook.book1D("rechitChargeDiffBpix","Charge differnce of rechits 
in BPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hFchargeDiff_ = iBook.book1D("rechitChargeDiffFpix","Charge differnce of rechits in FPix; rechit charge difference (Host - Device)", 101, -50.5, 50.5); + hBsizeXDiff_ = iBook.book1D("rechitsizeXDiffBpix","SizeX difference of rechits in BPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hFsizeXDiff_ = iBook.book1D("rechitsizeXDiffFpix","SizeX difference of rechits in FPix; rechit sizex difference (Host - Device)", 21, -10.5, 10.5); + hBsizeYDiff_ = iBook.book1D("rechitsizeYDiffBpix","SizeY difference of rechits in BPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hFsizeYDiff_ = iBook.book1D("rechitsizeYDiffFpix","SizeY difference of rechits in FPix; rechit sizey difference (Host - Device)", 21, -10.5, 10.5); + hBposXDiff_ = iBook.book1D("rechitsposXDiffBpix","x-position difference of rechits in BPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hFposXDiff_ = iBook.book1D("rechitsposXDiffFpix","x-position difference of rechits in FPix; rechit x-pos difference (Host - Device)", 1000, -10, 10); + hBposYDiff_ = iBook.book1D("rechitsposYDiffBpix","y-position difference of rechits in BPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); + hFposYDiff_ = iBook.book1D("rechitsposYDiffFpix","y-position difference of rechits in FPix; rechit y-pos difference (Host - Device)", 1000, -10, 10); } template void SiPixelCompareRecHitsSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { // monitorpixelRecHitsSoA edm::ParameterSetDescription desc; - desc.add("pixelHitsSrcCPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@cpu")); + desc.add("pixelHitsSrcCPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@Host")); desc.add("pixelHitsSrcGPU", edm::InputTag("siPixelRecHitsPreSplittingSoA@cuda")); - desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareGPUvsCPU"); + desc.add("topFolderName", "SiPixelHeterogeneous/PixelRecHitsCompareDevicevsHost"); desc.add("minD2cut", 0.0001); descriptions.addWithDefaultLabel(desc); } diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc index fde8e892c560c..ecac8989df441 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareTrackSoA.cc @@ -20,7 +20,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" // for string manipulations #include @@ -65,7 +66,7 @@ namespace { template class SiPixelCompareTrackSoA : public DQMEDAnalyzer { public: - using PixelTrackSoA = PixelTrackHeterogeneousT; + using PixelTrackSoA = TrackSoAHeterogeneousHost; explicit SiPixelCompareTrackSoA(const edm::ParameterSet&); ~SiPixelCompareTrackSoA() override = default; @@ -133,6 +134,7 @@ SiPixelCompareTrackSoA::SiPixelCompareTrackSoA(const edm::ParameterSet& iConf // template void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::EventSetup& iSetup) { + using helper = TracksUtilities; const auto& tsoaHandleCPU = iEvent.getHandle(tokenSoATrackCPU_); const auto& tsoaHandleGPU = iEvent.getHandle(tokenSoATrackGPU_); if (not tsoaHandleCPU or not tsoaHandleGPU) { @@ 
-147,12 +149,12 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve return; } - auto const& tsoaCPU = *tsoaHandleCPU->get(); - auto const& tsoaGPU = *tsoaHandleGPU->get(); - auto maxTracksCPU = tsoaCPU.stride(); //this should be same for both? - auto maxTracksGPU = tsoaGPU.stride(); //this should be same for both? - auto const* qualityCPU = tsoaCPU.qualityData(); - auto const* qualityGPU = tsoaGPU.qualityData(); + auto const& tsoaCPU = *tsoaHandleCPU; + auto const& tsoaGPU = *tsoaHandleGPU; + auto maxTracksCPU = tsoaCPU.view().metadata().size(); //this should be same for both? + auto maxTracksGPU = tsoaGPU.view().metadata().size(); //this should be same for both? + auto const* qualityCPU = tsoaCPU.view().quality(); + auto const* qualityGPU = tsoaGPU.view().quality(); int32_t nTracksCPU = 0; int32_t nTracksGPU = 0; int32_t nLooseAndAboveTracksCPU = 0; @@ -162,9 +164,9 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve //Loop over GPU tracks and store the indices of the loose tracks. Whats happens if useQualityCut_ is false? std::vector looseTrkidxGPU; for (int32_t jt = 0; jt < maxTracksGPU; ++jt) { - if (tsoaGPU.nHits(jt) == 0) + if (helper::nHits(tsoaGPU.view(), jt) == 0) break; // this is a guard - if (!(tsoaGPU.pt(jt) > 0.)) + if (!(tsoaGPU.view()[jt].pt() > 0.)) continue; nTracksGPU++; if (useQualityCut_ && qualityGPU[jt] < minQuality_) @@ -175,9 +177,18 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve //Now loop over CPU tracks//nested loop for loose gPU tracks for (int32_t it = 0; it < maxTracksCPU; ++it) { - if (tsoaCPU.nHits(it) == 0) + int nHitsCPU = helper::nHits(tsoaCPU.view(), it); + + if (nHitsCPU == 0) break; // this is a guard - if (!(tsoaCPU.pt(it) > 0.)) + + float ptCPU = tsoaCPU.view()[it].pt(); + float etaCPU = tsoaCPU.view()[it].eta(); + float phiCPU = helper::phi(tsoaCPU.view(), it); + float zipCPU = helper::zip(tsoaCPU.view(), it); + float tipCPU = helper::tip(tsoaCPU.view(), it); + + if (!(ptCPU > 0.)) continue; nTracksCPU++; if (useQualityCut_ && qualityCPU[it] < minQuality_) @@ -187,12 +198,11 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve const int32_t notFound = -1; int32_t closestTkidx = notFound; float mindr2 = dr2cut_; - float etacpu = tsoaCPU.eta(it); - float phicpu = tsoaCPU.phi(it); + for (auto gid : looseTrkidxGPU) { - float etagpu = tsoaGPU.eta(gid); - float phigpu = tsoaGPU.phi(gid); - float dr2 = reco::deltaR2(etacpu, phicpu, etagpu, phigpu); + float etaGPU = tsoaGPU.view()[gid].eta(); + float phiGPU = helper::phi(tsoaGPU.view(), gid); + float dr2 = reco::deltaR2(etaCPU, phiCPU, etaGPU, phiGPU); if (dr2 > dr2cut_) continue; // this is arbitrary if (mindr2 > dr2) { @@ -201,31 +211,31 @@ void SiPixelCompareTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve } } - hpt_eta_tkAllCPU_->Fill(etacpu, tsoaCPU.pt(it)); //all CPU tk - hphi_z_tkAllCPU_->Fill(phicpu, tsoaCPU.zip(it)); + hpt_eta_tkAllCPU_->Fill(etaCPU, ptCPU); //all CPU tk + hphi_z_tkAllCPU_->Fill(phiCPU, zipCPU); if (closestTkidx == notFound) continue; nLooseAndAboveTracksCPU_matchedGPU++; - hchi2_->Fill(tsoaCPU.chi2(it), tsoaGPU.chi2(closestTkidx)); - hCharge_->Fill(tsoaCPU.charge(it), tsoaGPU.charge(closestTkidx)); - hnHits_->Fill(tsoaCPU.nHits(it), tsoaGPU.nHits(closestTkidx)); - hnLayers_->Fill(tsoaCPU.nLayers(it), tsoaGPU.nLayers(closestTkidx)); - hpt_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - hptLogLog_->Fill(tsoaCPU.pt(it), tsoaGPU.pt(closestTkidx)); - 
heta_->Fill(etacpu, tsoaGPU.eta(closestTkidx)); - hphi_->Fill(phicpu, tsoaGPU.phi(closestTkidx)); - hz_->Fill(tsoaCPU.zip(it), tsoaGPU.zip(closestTkidx)); - htip_->Fill(tsoaCPU.tip(it), tsoaGPU.tip(closestTkidx)); - hptdiffMatched_->Fill(tsoaCPU.pt(it) - tsoaGPU.pt(closestTkidx)); - hCurvdiffMatched_->Fill((tsoaCPU.charge(it) / tsoaCPU.pt(it)) - - (tsoaGPU.charge(closestTkidx) / tsoaGPU.pt(closestTkidx))); - hetadiffMatched_->Fill(etacpu - tsoaGPU.eta(closestTkidx)); - hphidiffMatched_->Fill(reco::deltaPhi(phicpu, tsoaGPU.phi(closestTkidx))); - hzdiffMatched_->Fill(tsoaCPU.zip(it) - tsoaGPU.zip(closestTkidx)); - htipdiffMatched_->Fill(tsoaCPU.tip(it) - tsoaGPU.tip(closestTkidx)); - hpt_eta_tkAllCPUMatched_->Fill(etacpu, tsoaCPU.pt(it)); //matched to gpu - hphi_z_tkAllCPUMatched_->Fill(phicpu, tsoaCPU.zip(it)); + hchi2_->Fill(tsoaCPU.view()[it].chi2(), tsoaGPU.view()[closestTkidx].chi2()); + hCharge_->Fill(helper::charge(tsoaCPU.view(), it), helper::charge(tsoaGPU.view(), closestTkidx)); + hnHits_->Fill(helper::nHits(tsoaCPU.view(), it), helper::nHits(tsoaGPU.view(), closestTkidx)); + hnLayers_->Fill(tsoaCPU.view()[it].nLayers(), tsoaGPU.view()[closestTkidx].nLayers()); + hpt_->Fill(tsoaCPU.view()[it].pt(), tsoaGPU.view()[closestTkidx].pt()); + hptLogLog_->Fill(tsoaCPU.view()[it].pt(), tsoaGPU.view()[closestTkidx].pt()); + heta_->Fill(etaCPU, tsoaGPU.view()[closestTkidx].eta()); + hphi_->Fill(etaCPU, helper::phi(tsoaGPU.view(), closestTkidx)); + hz_->Fill(zipCPU, helper::zip(tsoaGPU.view(), closestTkidx)); + htip_->Fill(tipCPU, helper::tip(tsoaGPU.view(), closestTkidx)); + hptdiffMatched_->Fill(tsoaCPU.view()[it].pt() - tsoaGPU.view()[closestTkidx].pt()); + hCurvdiffMatched_->Fill((helper::charge(tsoaCPU.view(), it) / tsoaCPU.view()[it].pt()) - + (helper::charge(tsoaGPU.view(), closestTkidx) / tsoaGPU.view()[closestTkidx].pt())); + hetadiffMatched_->Fill(etaCPU - tsoaGPU.view()[closestTkidx].eta()); + hphidiffMatched_->Fill(reco::deltaPhi(etaCPU, helper::phi(tsoaGPU.view(), closestTkidx))); + hzdiffMatched_->Fill(zipCPU - helper::zip(tsoaGPU.view(), closestTkidx)); + htipdiffMatched_->Fill(tipCPU - helper::tip(tsoaGPU.view(), closestTkidx)); + hpt_eta_tkAllCPUMatched_->Fill(etaCPU, tsoaCPU.view()[it].pt()); //matched to gpu + hphi_z_tkAllCPUMatched_->Fill(etaCPU, zipCPU); } hnTracks_->Fill(nTracksCPU, nTracksGPU); hnLooseAndAboveTracks_->Fill(nLooseAndAboveTracksCPU, nLooseAndAboveTracksGPU); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc index 52e8396a49022..555542eb56995 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertexSoA.cc @@ -2,7 +2,7 @@ // Package: SiPixelCompareVertexSoA // Class: SiPixelCompareVertexSoA // -/**\class SiPixelCompareVertexSoA SiPixelCompareVertexSoA.cc +/**\class SiPixelCompareVertexSoA SiPixelCompareVertexSoA.cc */ // // Author: Suvankar Roy Chowdhury @@ -18,7 +18,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelCompareVertexSoA : public DQMEDAnalyzer { @@ -31,8 +31,8 @@ class SiPixelCompareVertexSoA : public DQMEDAnalyzer { static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - const edm::EDGetTokenT tokenSoAVertexCPU_; - const edm::EDGetTokenT tokenSoAVertexGPU_; + const edm::EDGetTokenT tokenSoAVertexCPU_; + const edm::EDGetTokenT tokenSoAVertexGPU_; const edm::EDGetTokenT tokenBeamSpot_; const std::string topFolderName_; const float dzCut_; @@ -53,9 +53,10 @@ class SiPixelCompareVertexSoA : public DQMEDAnalyzer { // constructors // +// Note tokenSoAVertexGPU_ contains data copied from device to host, hence is a HostCollection SiPixelCompareVertexSoA::SiPixelCompareVertexSoA(const edm::ParameterSet& iConfig) - : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), - tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), + : tokenSoAVertexCPU_(consumes(iConfig.getParameter("pixelVertexSrcCPU"))), + tokenSoAVertexGPU_(consumes(iConfig.getParameter("pixelVertexSrcGPU"))), tokenBeamSpot_(consumes(iConfig.getParameter("beamSpotSrc"))), topFolderName_(iConfig.getParameter("topFolderName")), dzCut_(iConfig.getParameter("dzCut")) {} @@ -78,10 +79,10 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event return; } - auto const& vsoaCPU = *vsoaHandleCPU->get(); - int nVerticesCPU = vsoaCPU.nvFinal; - auto const& vsoaGPU = *vsoaHandleGPU->get(); - int nVerticesGPU = vsoaGPU.nvFinal; + auto const& vsoaCPU = *vsoaHandleCPU; + int nVerticesCPU = vsoaCPU.view().nvFinal(); + auto const& vsoaGPU = *vsoaHandleGPU; + int nVerticesGPU = vsoaGPU.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; @@ -97,22 +98,22 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event } for (int ivc = 0; ivc < nVerticesCPU; ivc++) { - auto sic = vsoaCPU.sortInd[ivc]; - auto zc = vsoaCPU.zv[sic]; + auto sic = vsoaCPU.view()[ivc].sortInd(); + auto zc = vsoaCPU.view()[sic].zv(); auto xc = x0 + dxdz * zc; auto yc = y0 + dydz * zc; zc += z0; - auto ndofCPU = vsoaCPU.ndof[sic]; - auto chi2CPU = vsoaCPU.chi2[sic]; + auto ndofCPU = vsoaCPU.view()[sic].ndof(); + auto chi2CPU = vsoaCPU.view()[sic].chi2(); const int32_t notFound = -1; int32_t closestVtxidx = notFound; float mindz = dzCut_; for (int ivg = 0; ivg < nVerticesGPU; ivg++) { - auto sig = vsoaGPU.sortInd[ivg]; - auto zgc = vsoaGPU.zv[sig] + z0; + auto sig = vsoaGPU.view()[ivg].sortInd(); + auto zgc = vsoaGPU.view()[sig].zv() + z0; auto zDist = std::abs(zc - zgc); //insert some matching condition if (zDist > dzCut_) @@ -125,12 +126,12 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event if (closestVtxidx == notFound) continue; - auto zg = vsoaGPU.zv[closestVtxidx]; + auto zg = vsoaGPU.view()[closestVtxidx].zv(); auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofGPU = vsoaGPU.ndof[closestVtxidx]; - auto chi2GPU = vsoaGPU.chi2[closestVtxidx]; + auto ndofGPU = vsoaGPU.view()[closestVtxidx].ndof(); + auto chi2GPU = vsoaGPU.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); hy_->Fill(yc - y0, yg - y0); @@ -140,7 +141,7 @@ void SiPixelCompareVertexSoA::analyze(const edm::Event& iEvent, const edm::Event hzdiff_->Fill(zc - zg); hchi2_->Fill(chi2CPU, chi2GPU); hchi2oNdof_->Fill(chi2CPU / ndofCPU, chi2GPU / ndofGPU); - hptv2_->Fill(vsoaCPU.ptv2[sic], vsoaGPU.ptv2[closestVtxidx]); + hptv2_->Fill(vsoaCPU.view()[sic].ptv2(), vsoaGPU.view()[closestVtxidx].ptv2()); hntrks_->Fill(ndofCPU + 1, ndofGPU + 1); } hnVertex_->Fill(nVerticesCPU, nVerticesGPU); diff --git 
a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc index ba68a8182e261..0844bd865ca1f 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorRecHitsSoA.cc @@ -19,7 +19,8 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" // Geometry #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "DataFormats/TrackerCommon/interface/TrackerTopology.h" @@ -31,8 +32,8 @@ template class SiPixelMonitorRecHitsSoA : public DQMEDAnalyzer { public: - using HitSoA = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DCPUT; + using HitSoA = TrackingRecHitSoAView; + using HitsOnHost = TrackingRecHitSoAHost; explicit SiPixelMonitorRecHitsSoA(const edm::ParameterSet&); ~SiPixelMonitorRecHitsSoA() override = default; @@ -44,7 +45,7 @@ class SiPixelMonitorRecHitsSoA : public DQMEDAnalyzer { private: const edm::ESGetToken geomToken_; const edm::ESGetToken topoToken_; - const edm::EDGetTokenT tokenSoAHitsCPU_; + const edm::EDGetTokenT tokenSoAHitsCPU_; const std::string topFolderName_; const TrackerGeometry* tkGeom_ = nullptr; const TrackerTopology* tTopo_ = nullptr; @@ -101,21 +102,21 @@ void SiPixelMonitorRecHitsSoA::analyze(const edm::Event& iEvent, const edm::E return; } auto const& rhsoa = *rhsoaHandle; - const HitSoA* soa2d = rhsoa.view(); + auto const& soa2d = rhsoa.const_view(); - uint32_t nHits_ = soa2d->nHits(); + uint32_t nHits_ = soa2d.nHits(); hnHits->Fill(nHits_); auto detIds = tkGeom_->detUnitIds(); for (uint32_t i = 0; i < nHits_; i++) { - DetId id = detIds[soa2d->detectorIndex(i)]; - float xG = soa2d->xGlobal(i); - float yG = soa2d->yGlobal(i); - float zG = soa2d->zGlobal(i); - float rG = soa2d->rGlobal(i); - float fphi = short2phi(soa2d->iphi(i)); - uint32_t charge = soa2d->charge(i); - int16_t sizeX = std::ceil(float(std::abs(soa2d->clusterSizeX(i)) / 8.)); - int16_t sizeY = std::ceil(float(std::abs(soa2d->clusterSizeY(i)) / 8.)); + DetId id = detIds[soa2d[i].detectorIndex()]; + float xG = soa2d[i].xGlobal(); + float yG = soa2d[i].yGlobal(); + float zG = soa2d[i].zGlobal(); + float rG = soa2d[i].rGlobal(); + float fphi = short2phi(soa2d[i].iphi()); + uint32_t charge = soa2d[i].chargeAndStatus().charge; + int16_t sizeX = std::ceil(float(std::abs(soa2d[i].clusterSizeX()) / 8.)); + int16_t sizeY = std::ceil(float(std::abs(soa2d[i].clusterSizeY()) / 8.)); hBFposZP->Fill(zG, fphi); int16_t ysign = yG >= 0 ? 
1 : -1; hBFposZR->Fill(zG, rG * ysign); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc index 491c8f1be238a..3deb289888477 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorTrackSoA.cc @@ -20,14 +20,15 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" // for string manipulations #include template class SiPixelMonitorTrackSoA : public DQMEDAnalyzer { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using PixelTrackHeterogeneous = TrackSoAHeterogeneousHost; explicit SiPixelMonitorTrackSoA(const edm::ParameterSet&); ~SiPixelMonitorTrackSoA() override = default; void bookHistograms(DQMStore::IBooker& ibooker, edm::Run const& iRun, edm::EventSetup const& iSetup) override; @@ -81,23 +82,24 @@ void SiPixelMonitorTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve return; } - auto const& tsoa = *((tsoaHandle.product())->get()); - auto maxTracks = tsoa.stride(); - auto const* quality = tsoa.qualityData(); + using helper = TracksUtilities; + auto const& tsoa = *tsoaHandle.product(); + auto maxTracks = tsoa.view().metadata().size(); + auto const* quality = tsoa.view().quality(); int32_t nTracks = 0; int32_t nLooseAndAboveTracks = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - auto nLayers = tsoa.nLayers(it); + auto nHits = helper::nHits(tsoa.const_view(), it); + auto nLayers = tsoa.view()[it].nLayers(); if (nHits == 0) break; // this is a guard - float pt = tsoa.pt(it); + float pt = tsoa.view()[it].pt(); if (!(pt > 0.)) continue; // fill the quality for all tracks - pixelTrack::Quality qual = tsoa.quality(it); + pixelTrack::Quality qual = quality[it]; hquality->Fill(int(qual)); nTracks++; @@ -105,11 +107,11 @@ void SiPixelMonitorTrackSoA::analyze(const edm::Event& iEvent, const edm::Eve continue; // fill parameters only for quality >= loose - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); - float zip = tsoa.zip(it); - float eta = tsoa.eta(it); - float tip = tsoa.tip(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = helper::phi(tsoa.const_view(), it); + float zip = helper::zip(tsoa.const_view(), it); + float eta = tsoa.view()[it].eta(); + float tip = helper::tip(tsoa.const_view(), it); hchi2->Fill(chi2); hChi2VsPhi->Fill(phi, chi2); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc index 13cf991b54c82..4287babcf4964 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoA.cc @@ -21,7 +21,7 @@ #include "DQMServices/Core/interface/MonitorElement.h" #include "DQMServices/Core/interface/DQMEDAnalyzer.h" #include "DQMServices/Core/interface/DQMStore.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" class SiPixelMonitorVertexSoA : public DQMEDAnalyzer { @@ -34,7 +34,7 @@ class SiPixelMonitorVertexSoA : public DQMEDAnalyzer { static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions); private: - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT tokenSoAVertex_; edm::EDGetTokenT tokenBeamSpot_; std::string topFolderName_; MonitorElement* hnVertex; @@ -52,7 +52,7 @@ class SiPixelMonitorVertexSoA : public DQMEDAnalyzer { // SiPixelMonitorVertexSoA::SiPixelMonitorVertexSoA(const edm::ParameterSet& iConfig) { - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); tokenBeamSpot_ = consumes(iConfig.getParameter("beamSpotSrc")); topFolderName_ = iConfig.getParameter("topFolderName"); } @@ -67,8 +67,8 @@ void SiPixelMonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::Event return; } - auto const& vsoa = *((vsoaHandle.product())->get()); - int nVertices = vsoa.nvFinal; + auto const& vsoa = *vsoaHandle; + int nVertices = vsoa.view().nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -83,8 +83,8 @@ void SiPixelMonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::Event } for (int iv = 0; iv < nVertices; iv++) { - auto si = vsoa.sortInd[iv]; - auto z = vsoa.zv[si]; + auto si = vsoa.view()[iv].sortInd(); + auto z = vsoa.view()[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; @@ -92,10 +92,10 @@ void SiPixelMonitorVertexSoA::analyze(const edm::Event& iEvent, const edm::Event hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.ndof[si]; - hchi2->Fill(vsoa.chi2[si]); - hchi2oNdof->Fill(vsoa.chi2[si] / ndof); - hptv2->Fill(vsoa.ptv2[si]); + auto ndof = vsoa.view()[si].ndof(); + hchi2->Fill(vsoa.view()[si].chi2()); + hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); + hptv2->Fill(vsoa.view()[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 0702bc4830c7c..5b23f2dbda104 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -10,6 +10,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer { public: @@ -27,7 +28,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer> digiGetToken_; edm::EDPutTokenT digiPutToken_; - cms::cuda::host::unique_ptr store_; + cms::cuda::PortableHostCollection> digis_h_; int nDigis_; }; @@ -48,29 +49,25 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, // Do the transfer in a CUDA stream parallel to the computation CUDA stream cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); + const auto& digis_d = ctx.get(iEvent, digiGetToken_); - nDigis_ = gpuDigis.nDigis(); - store_ = gpuDigis.copyAllToHostAsync(ctx.stream()); + nDigis_ = digis_d.nDigis(); + nDigis_ = digis_d.nDigis(); + digis_h_ = cms::cuda::PortableHostCollection>(digis_d.view().metadata().size(), ctx.stream()); + cudaCheck(cudaMemcpyAsync(digis_h_.buffer().get(), + digis_d.const_buffer().get(), + digis_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); } void 
SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - // The following line copies the data from the pinned host memory to - // regular host memory. In principle that feels unnecessary (why not - // just use the pinned host memory?). There are a few arguments for - // doing it though - // - Now can release the pinned host memory back to the (caching) allocator - // * if we'd like to keep the pinned memory, we'd need to also - // keep the CUDA stream around as long as that, or allow pinned - // host memory to be allocated without a CUDA stream - // - What if a CPU algorithm would produce the same SoA? We can't - // use cudaMallocHost without a GPU... - - auto tmp_view = SiPixelDigisCUDASOAView(store_, nDigis_, SiPixelDigisCUDASOAView::StorageLocationHost::kMAX); - - iEvent.emplace(digiPutToken_, nDigis_, tmp_view.pdigi(), tmp_view.rawIdArr(), tmp_view.adc(), tmp_view.clus()); - - store_.reset(); + iEvent.emplace(digiPutToken_, + nDigis_, + digis_h_.view().pdigi(), + digis_h_.view().rawIdArr(), + digis_h_.view().adc(), + digis_h_.view().clus()); } // define as framework plugin diff --git a/Geometry/CommonTopologies/interface/SimplePixelTopology.h b/Geometry/CommonTopologies/interface/SimplePixelTopology.h index c991d09666297..304e8a1255cce 100644 --- a/Geometry/CommonTopologies/interface/SimplePixelTopology.h +++ b/Geometry/CommonTopologies/interface/SimplePixelTopology.h @@ -28,6 +28,8 @@ namespace pixelTopology { constexpr int16_t phi0p07 = 730; // round(730.12648...) = phi2short(0.07); constexpr int16_t phi0p09 = 900; + constexpr uint16_t last_barrel_layer = 3; // this is common between all the topologies + template constexpr auto map_to_array_helper(Function f, std::index_sequence) -> std::array, sizeof...(Indices)> { @@ -292,10 +294,11 @@ namespace pixelTopology { static constexpr uint32_t maxCellTracks = 302; static constexpr uint32_t maxHitsOnTrack = 15; static constexpr uint32_t maxHitsOnTrackForFullFit = 6; - static constexpr uint32_t avgHitsPerTrack = 9; + static constexpr uint32_t avgHitsPerTrack = 7; static constexpr uint32_t maxCellsPerHit = 256; static constexpr uint32_t avgTracksPerHit = 10; static constexpr uint32_t maxNumberOfTuples = 256 * 1024; + //this is well above thanks to maxNumberOfTuples static constexpr uint32_t maxHitsForContainers = avgHitsPerTrack * maxNumberOfTuples; static constexpr uint32_t maxNumberOfDoublets = 5 * 512 * 1024; static constexpr uint32_t maxNumOfActiveDoublets = maxNumberOfDoublets / 8; @@ -308,9 +311,9 @@ namespace pixelTopology { static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; - static constexpr uint32_t last_bpix1_detIndex = 108; - static constexpr uint32_t last_bpix2_detIndex = 324; - static constexpr uint32_t last_barrel_detIndex = 504; + static constexpr uint16_t last_bpix1_detIndex = 108; + static constexpr uint16_t last_bpix2_detIndex = 324; + static constexpr uint16_t last_barrel_detIndex = 504; static constexpr uint32_t maxPixInModule = 6000; @@ -383,7 +386,7 @@ namespace pixelTopology { static constexpr uint32_t maxCellTracks = 48; static constexpr uint32_t maxHitsOnTrack = 10; static constexpr uint32_t maxHitsOnTrackForFullFit = 6; - static constexpr uint32_t avgHitsPerTrack = 4; + static constexpr uint32_t avgHitsPerTrack = 5; static constexpr uint32_t maxCellsPerHit = 256; static constexpr uint32_t avgTracksPerHit = 6; static constexpr uint32_t maxNumberOfTuples = 32 * 1024; @@ -399,9 +402,9 @@ 
namespace pixelTopology { static constexpr uint32_t getDoubletsFromHistoMaxBlockSize = 64; // for both x and y static constexpr uint32_t getDoubletsFromHistoMinBlocksPerMP = 16; - static constexpr uint32_t last_bpix1_detIndex = 96; - static constexpr uint32_t last_bpix2_detIndex = 320; - static constexpr uint32_t last_barrel_detIndex = 1184; + static constexpr uint16_t last_bpix1_detIndex = 96; + static constexpr uint16_t last_bpix2_detIndex = 320; + static constexpr uint16_t last_barrel_detIndex = 1184; static constexpr uint32_t maxPixInModule = 6000; diff --git a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py index 2f0e84337cb70..5aca943b85192 100644 --- a/HLTrigger/Configuration/python/customizeHLTforCMSSW.py +++ b/HLTrigger/Configuration/python/customizeHLTforCMSSW.py @@ -258,6 +258,12 @@ def customizeHLTfor40334(process): return process +def customizeHLTfor40465(process): + try: + process.hltSiPixelRecHitsSoA.cpu.hltSiPixelRecHitsFromLegacy[0].type = 'pixelTopologyPhase1TrackingRecHitSoAHost' + except: + pass + return process # CMSSW version specific customizations def customizeHLTforCMSSW(process, menuType="GRun"): @@ -266,9 +272,10 @@ def customizeHLTforCMSSW(process, menuType="GRun"): # add call to action function in proper order: newest last! # process = customiseFor12718(process) - + process = customizeHLTfor38761(process) process = customizeHLTfor40264(process) process = customizeHLTfor40334(process) + process = customizeHLTfor40465(process) return process diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index df168da110301..76cc641d365c5 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -32,6 +32,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" +#include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "RecoTracker/Record/interface/CkfComponentsRecord.h" @@ -272,9 +273,15 @@ void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& cms::cuda::ScopedContextProduce ctx{ctxState_}; if (nDigis_ == 0) { - // default construct collections and place them in event - ctx.emplace(iEvent, digiPutToken_, SiPixelDigisCUDA{}); - ctx.emplace(iEvent, clusterPutToken_, SiPixelClustersCUDA{}); + // Cannot use the default constructor here, as it would not allocate memory. + // In the case of no digis, clusters_d are not being instantiated, but are + // still used downstream to initialize TrackingRecHitSoADevice. If there + // are no valid pointers to clusters' Collection columns, instantiation + // of TrackingRecHits fail. 
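
The comment above spells out why the empty-event path still allocates sized collections: a default-constructed portable collection owns no buffer, so its column accessors hand out null pointers, and the downstream TrackingRecHit construction would then dereference them. Below is a minimal, host-only sketch of that failure mode; ToyClusterCollection and the module count are illustrative stand-ins, not CMSSW types (the real SiPixelClustersCUDA allocates device memory on a CUDA stream via PortableDeviceCollection).

#include <cassert>
#include <cstdint>
#include <memory>

// Toy stand-in for a portable SoA collection backed by one owned buffer.
class ToyClusterCollection {
public:
  ToyClusterCollection() = default;  // owns nothing: every column is nullptr
  explicit ToyClusterCollection(std::size_t nModules)
      : size_(nModules + 1), buffer_(std::make_unique<uint32_t[]>(2 * (nModules + 1))) {}

  // column accessors, null when default-constructed
  uint32_t* clusInModule() { return buffer_ ? buffer_.get() : nullptr; }
  uint32_t* clusModuleStart() { return buffer_ ? buffer_.get() + size_ : nullptr; }

private:
  std::size_t size_ = 0;
  std::unique_ptr<uint32_t[]> buffer_;
};

int main() {
  ToyClusterCollection empty;                // analogue of a default-constructed collection
  ToyClusterCollection sizedButEmpty(1856);  // analogue of constructing with the module count

  assert(empty.clusModuleStart() == nullptr);          // consumers would crash on this
  assert(sizedButEmpty.clusModuleStart() != nullptr);  // safe to pass on, even with zero clusters
}

This is the effect the added lines just below achieve by constructing SiPixelDigisCUDA(nDigis_, ctx.stream()) and SiPixelClustersCUDA(numberOfModules, ctx.stream()) even when nDigis_ is zero: every column pointer stays valid for the downstream consumers.
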
Example: workflow 11604.0 + SiPixelDigisCUDA digis_d = SiPixelDigisCUDA(nDigis_, ctx.stream()); + SiPixelClustersCUDA clusters_d = SiPixelClustersCUDA(pixelTopology::Phase1::numberOfModules, ctx.stream()); + ctx.emplace(iEvent, digiPutToken_, std::move(digis_d)); + ctx.emplace(iEvent, clusterPutToken_, std::move(clusters_d)); if (includeErrors_) { ctx.emplace(iEvent, digiErrorPutToken_, SiPixelDigiErrorsCUDA{}); } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index bc9be260deb20..293d4422e8458 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -318,12 +318,7 @@ namespace pixelgpudetails { const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds, - uint16_t *xx, - uint16_t *yy, - uint16_t *adc, - uint32_t *pdigi, - uint32_t *rawIdArr, - uint16_t *moduleId, + SiPixelDigisCUDASOAView digisView, cms::cuda::SimpleVector *err, bool useQualityInfo, bool includeErrors) { @@ -332,17 +327,18 @@ namespace pixelgpudetails { int32_t first = threadIdx.x + blockIdx.x * blockDim.x; for (int32_t iloop = first, nend = wordCounter; iloop < nend; iloop += blockDim.x * gridDim.x) { auto gIndex = iloop; - xx[gIndex] = 0; - yy[gIndex] = 0; - adc[gIndex] = 0; + auto dvgi = digisView[gIndex]; + dvgi.xx() = 0; + dvgi.yy() = 0; + dvgi.adc() = 0; bool skipROC = false; uint8_t fedId = fedIds[gIndex / 2]; // +1200; // initialize (too many coninue below) - pdigi[gIndex] = 0; - rawIdArr[gIndex] = 0; - moduleId[gIndex] = gpuClustering::invalidModuleId; + dvgi.pdigi() = 0; + dvgi.rawIdArr() = 0; + dvgi.moduleId() = gpuClustering::invalidModuleId; uint32_t ww = word[gIndex]; // Array containing 32 bit raw data if (ww == 0) { @@ -433,12 +429,12 @@ namespace pixelgpudetails { } pixelgpudetails::Pixel globalPix = frameConversion(barrel, side, layer, detId.rocInDet, localPix); - xx[gIndex] = globalPix.row; // origin shifting by 1 0-159 - yy[gIndex] = globalPix.col; // origin shifting by 1 0-415 - adc[gIndex] = sipixelconstants::getADC(ww); - pdigi[gIndex] = pixelgpudetails::pack(globalPix.row, globalPix.col, adc[gIndex]); - moduleId[gIndex] = detId.moduleId; - rawIdArr[gIndex] = rawId; + dvgi.xx() = globalPix.row; // origin shifting by 1 0-159 + dvgi.yy() = globalPix.col; // origin shifting by 1 0-415 + dvgi.adc() = sipixelconstants::getADC(ww); + dvgi.pdigi() = pixelgpudetails::pack(globalPix.row, globalPix.col, dvgi.adc()); + dvgi.moduleId() = detId.moduleId; + dvgi.rawIdArr() = rawId; } // end of loop (gIndex < end) } // end of Raw to Digi kernel @@ -451,7 +447,6 @@ namespace pixelgpudetails { constexpr int nMaxModules = TrackerTraits::numberOfModules; constexpr int startBPIX2 = TrackerTraits::layerStart[1]; - assert(nMaxModules < TrackerTraits::numberOfModules); assert(startBPIX2 < nMaxModules); assert(nMaxModules < 4096); // easy to extend at least till 32*1024 assert(nMaxModules > 1024); @@ -549,7 +544,8 @@ namespace pixelgpudetails { #endif // since wordCounter != 0 we're not allocating 0 bytes, - digis_d = SiPixelDigisCUDA(wordCounter, stream); + // digis_d = SiPixelDigisCUDA(wordCounter, stream); + digis_d = SiPixelDigisCUDA(size_t(wordCounter), stream); if (includeErrors) { digiErrors_d = SiPixelDigiErrorsCUDA(wordCounter, std::move(errors), stream); } @@ -578,12 +574,7 @@ namespace pixelgpudetails { wordCounter, word_d.get(), fedId_d.get(), - digis_d.view().xx(), 
- digis_d.view().yy(), - digis_d.view().adc(), - digis_d.view().pdigi(), - digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), + digis_d.view(), digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors); @@ -594,12 +585,7 @@ namespace pixelgpudetails { wordCounter, word_d.get(), fedId_d.get(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), - digis_d.view().pdigi(), - digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), + digis_d.view(), digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors); @@ -621,25 +607,25 @@ namespace pixelgpudetails { int blocks = (std::max(int(wordCounter), int(Phase1::numberOfModules)) + threadsPerBlock - 1) / threadsPerBlock; if (isRun2) - gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), + gpuCalibPixel::calibDigis<<>>(digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), gains, wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); else - gpuCalibPixel::calibDigis<<>>(digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - digis_d.view().adc(), + gpuCalibPixel::calibDigis<<>>(digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + digis_d->adc(), gains, wordCounter, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -652,7 +638,7 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), wordCounter); + digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); threadsPerBlock = 256 + 128; /// should be larger than 6000/16 aka (maxPixInModule/maxiter in the kernel) @@ -661,14 +647,14 @@ namespace pixelgpudetails { std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + findClus<<>>(digis_d->rawIdArr(), + digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -678,12 +664,12 @@ namespace pixelgpudetails { // apply charge cut clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + digis_d->moduleId(), + digis_d->adc(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -694,8 +680,10 @@ namespace pixelgpudetails { // synchronization/ExternalWork auto nModules_Clusters_d = cms::cuda::make_device_unique(3, stream); // MUST be ONE block - fillHitsModuleStart<<<1, 1024, 0, stream>>>( - clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); + fillHitsModuleStart<<<1, 1024, 
0, stream>>>(clusters_d->clusInModule(), + clusters_d->clusModuleStart(), + clusters_d->moduleStart(), + nModules_Clusters_d.get()); // copy to host nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); @@ -723,15 +711,12 @@ namespace pixelgpudetails { nDigis = numDigis; digis_d = SiPixelDigisCUDA(numDigis, stream); - cudaCheck( - cudaMemcpyAsync(digis_d.view().moduleInd(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(digis_d.view().adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck( - cudaMemcpyAsync(digis_d.view().pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); - cudaCheck( - cudaMemcpyAsync(digis_d.view().rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->moduleId(), moduleIds, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->xx(), xDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->yy(), yDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->adc(), adcDigis, sizeof(uint16_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->pdigi(), packedData, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); + cudaCheck(cudaMemcpyAsync(digis_d->rawIdArr(), rawIds, sizeof(uint32_t) * numDigis, cudaMemcpyDefault, stream)); clusters_d = SiPixelClustersCUDA(Phase2::numberOfModules, stream); @@ -740,12 +725,12 @@ namespace pixelgpudetails { int threadsPerBlock = 512; int blocks = (int(numDigis) + threadsPerBlock - 1) / threadsPerBlock; - gpuCalibPixel::calibDigisPhase2<<>>(digis_d.view().moduleInd(), - digis_d.view().adc(), + gpuCalibPixel::calibDigisPhase2<<>>(digis_d->moduleId(), + digis_d->adc(), numDigis, - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.clusModuleStart()); + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->clusModuleStart()); cudaCheck(cudaGetLastError()); @@ -755,12 +740,12 @@ namespace pixelgpudetails { #endif countModules<<>>( - digis_d.view().moduleInd(), clusters_d.moduleStart(), digis_d.view().clus(), numDigis); + digis_d->moduleId(), clusters_d->moduleStart(), digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) cudaCheck(cudaMemcpyAsync( - &(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); + &(nModules_Clusters_h[0]), clusters_d->moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream)); threadsPerBlock = 256; blocks = Phase2::numberOfModules; @@ -769,14 +754,14 @@ namespace pixelgpudetails { cudaCheck(cudaStreamSynchronize(stream)); std::cout << "CUDA findClus kernel launch with " << blocks << " blocks of " << threadsPerBlock << " threads\n"; #endif - findClus<<>>(digis_d.view().rawIdArr(), - digis_d.view().moduleInd(), - digis_d.view().xx(), - digis_d.view().yy(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + findClus<<>>(digis_d->rawIdArr(), + digis_d->moduleId(), + digis_d->xx(), + digis_d->yy(), + clusters_d->moduleStart(), + 
clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -788,12 +773,12 @@ namespace pixelgpudetails { // apply charge cut clusterChargeCut<<>>(clusterThresholds, - digis_d.view().moduleInd(), - digis_d.view().adc(), - clusters_d.moduleStart(), - clusters_d.clusInModule(), - clusters_d.moduleId(), - digis_d.view().clus(), + digis_d->moduleId(), + digis_d->adc(), + clusters_d->moduleStart(), + clusters_d->clusInModule(), + clusters_d->moduleId(), + digis_d->clus(), numDigis); cudaCheck(cudaGetLastError()); @@ -805,8 +790,10 @@ namespace pixelgpudetails { std::cout << "CUDA fillHitsModuleStart kernel launch \n"; #endif - fillHitsModuleStart<<<1, 1024, 0, stream>>>( - clusters_d.clusInModule(), clusters_d.clusModuleStart(), clusters_d.moduleStart(), nModules_Clusters_d.get()); + fillHitsModuleStart<<<1, 1024, 0, stream>>>(clusters_d->clusInModule(), + clusters_d->clusModuleStart(), + clusters_d->moduleStart(), + nModules_Clusters_d.get()); nModules_Clusters_h = cms::cuda::make_host_unique(3, stream); cudaCheck(cudaMemcpyAsync( diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index cb5b4b2f2c387..38d9ed1ad77e3 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -12,7 +12,7 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" -// #define GPU_DEBUG 1 +// #define GPU_DEBUG namespace { template @@ -42,7 +42,7 @@ namespace { namespace pixelgpudetails { template - TrackingRecHit2DGPUT PixelRecHitGPUKernel::makeHitsAsync( + TrackingRecHitSoADevice PixelRecHitGPUKernel::makeHitsAsync( SiPixelDigisCUDA const& digis_d, SiPixelClustersCUDA const& clusters_d, BeamSpotCUDA const& bs_d, @@ -51,8 +51,8 @@ namespace pixelgpudetails { using namespace gpuPixelRecHits; auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPUT hits_d( - nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + TrackingRecHitSoADevice hits_d( + nHits, clusters_d.offsetBPIX2(), cpeParams, clusters_d->clusModuleStart(), stream); int activeModulesWithDigis = digis_d.nModules(); // protect from empty events @@ -61,11 +61,10 @@ namespace pixelgpudetails { int blocks = activeModulesWithDigis; #ifdef GPU_DEBUG - std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; #endif getHits<<>>( - cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.view(), hits_d.view()); + cpeParams, bs_d.data(), digis_d.view(), digis_d.nDigis(), clusters_d.const_view(), hits_d.view()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -74,16 +73,16 @@ namespace pixelgpudetails { // assuming full warp of threads is better than a smaller number... 
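
The kernels launched just below derive the per-layer hit ranges from the per-module cluster offsets: hitsLayerStart[l] is essentially clusModuleStart evaluated at the first module index of layer l, and the phi binner is then filled layer by layer from those ranges. The following is a compact host-side sketch of that bookkeeping with toy sizes and counts, not the CUDA kernels themselves; the prefix sum stands in for what fillHitsModuleStart computes on the device.

#include <array>
#include <cstdint>
#include <cstdio>
#include <numeric>

int main() {
  // Toy geometry: 8 modules grouped into 3 layers; layerStart[l] is the first
  // module index of layer l, with a closing sentinel at the end.
  constexpr int nModules = 8;
  constexpr int nLayers = 3;
  constexpr std::array<uint32_t, nLayers + 1> layerStart{0, 3, 6, 8};

  // Clusters (future hits) per module, as produced by the clusterizer.
  std::array<uint32_t, nModules> clusInModule{4, 2, 0, 5, 1, 3, 2, 2};

  // Exclusive prefix sum -> clusModuleStart: index of the first hit of each module.
  std::array<uint32_t, nModules + 1> clusModuleStart{};
  std::partial_sum(clusInModule.begin(), clusInModule.end(), clusModuleStart.begin() + 1);

  // What setHitsLayerStart amounts to: one lookup per layer boundary.
  std::array<uint32_t, nLayers + 1> hitsLayerStart{};
  for (int l = 0; l <= nLayers; ++l)
    hitsLayerStart[l] = clusModuleStart[layerStart[l]];

  for (int l = 0; l < nLayers; ++l)
    std::printf("layer %d: hits [%u, %u)\n", l, hitsLayerStart[l], hitsLayerStart[l + 1]);
}

cms::cuda::fillManyFromVector then bins the hits in phi using exactly these per-layer ranges, reading iphi through the generated SoA view.
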
if (nHits) { setHitsLayerStart - <<<1, 32, 0, stream>>>(clusters_d.clusModuleStart(), cpeParams, hits_d.hitsLayerStart()); + <<<1, 32, 0, stream>>>(clusters_d->clusModuleStart(), cpeParams, hits_d.view().hitsLayerStart().data()); cudaCheck(cudaGetLastError()); constexpr auto nLayers = TrackerTraits::numberOfLayers; cms::cuda::fillManyFromVector(hits_d.phiBinner(), nLayers, - hits_d.iphi(), - hits_d.hitsLayerStart(), + hits_d.view().iphi(), + hits_d.view().hitsLayerStart().data(), nHits, 256, - hits_d.phiBinnerStorage(), + hits_d.view().phiBinnerStorage(), stream); cudaCheck(cudaGetLastError()); @@ -93,6 +92,11 @@ namespace pixelgpudetails { } } +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); + std::cout << "PixelRecHitGPUKernel -> DONE!" << std::endl; +#endif + return hits_d; } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h index 0a3c2b647f22e..25cc724cd4c4a 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.h @@ -8,9 +8,9 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" -//#define GPU_DEBUG 1 +//#define GPU_DEBUG namespace pixelgpudetails { template @@ -26,11 +26,11 @@ namespace pixelgpudetails { using ParamsOnGPU = pixelCPEforGPU::ParamsOnGPUT; - TrackingRecHit2DGPUT makeHitsAsync(SiPixelDigisCUDA const& digis_d, - SiPixelClustersCUDA const& clusters_d, - BeamSpotCUDA const& bs_d, - ParamsOnGPU const* cpeParams, - cudaStream_t stream) const; + TrackingRecHitSoADevice makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, + BeamSpotCUDA const& bs_d, + ParamsOnGPU const* cpeParams, + cudaStream_t stream) const; }; } // namespace pixelgpudetails diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc index b23fa7dcc11ed..3bf0cf670a577 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -39,7 +39,7 @@ class SiPixelRecHitCUDAT : public edm::global::EDProducer<> { const edm::EDGetTokenT> tBeamSpot; const edm::EDGetTokenT> token_; const edm::EDGetTokenT> tokenDigi_; - const edm::EDPutTokenT>> tokenHit_; + const edm::EDPutTokenT>> tokenHit_; const pixelgpudetails::PixelRecHitGPUKernel gpuAlgo_; }; @@ -50,7 +50,7 @@ SiPixelRecHitCUDAT::SiPixelRecHitCUDAT(const edm::ParameterSet& i tBeamSpot(consumes>(iConfig.getParameter("beamSpot"))), token_(consumes>(iConfig.getParameter("src"))), 
tokenDigi_(consumes>(iConfig.getParameter("src"))), - tokenHit_(produces>>()) {} + tokenHit_(produces>>()) {} template void SiPixelRecHitCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 1428efe06a1d1..c639d5cc4fefa 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -4,7 +4,7 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" @@ -33,7 +33,7 @@ class SiPixelRecHitFromCUDAT : public edm::stream::EDProducer static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); using HMSstorage = HostProduct; - using HitsOnGPU = TrackingRecHit2DGPUT; + using HitsOnGPU = TrackingRecHitSoADevice; private: void acquire(edm::Event const& iEvent, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index 8bcb218255548..15bc0c8df70b5 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -4,7 +4,8 @@ #include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" @@ -32,7 +33,8 @@ class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer; - using TrackingRecHit2DSOAView = TrackingRecHit2DSOAViewT; + using HitsOnHost = TrackingRecHitSoAHost; + using HitsOnDevice = TrackingRecHitSoADevice; private: void acquire(edm::Event const& iEvent, @@ -40,21 +42,18 @@ class SiPixelRecHitSoAFromCUDAT : public edm::stream::EDProducer>> hitsTokenGPU_; // CUDA hits - const edm::EDPutTokenT> hitsPutTokenCPU_; + const edm::EDGetTokenT> hitsTokenGPU_; // CUDA hits + const edm::EDPutTokenT hitsPutTokenCPU_; const edm::EDPutTokenT hostPutToken_; uint32_t nHits_; - - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + HitsOnHost hits_h_; }; template SiPixelRecHitSoAFromCUDAT::SiPixelRecHitSoAFromCUDAT(const edm::ParameterSet& iConfig) : hitsTokenGPU_(consumes(iConfig.getParameter("pixelRecHitSrc"))), - hitsPutTokenCPU_(produces>()), + hitsPutTokenCPU_(produces()), hostPutToken_(produces()) {} template @@ -69,18 +68,18 @@ template void SiPixelRecHitSoAFromCUDAT::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product> const& inputDataWrapped = iEvent.get(hitsTokenGPU_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(hitsTokenGPU_); 
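
The acquire() body around this point replaces the old per-column host copies with a single cudaMemcpyAsync of the collection's one contiguous buffer into an equally sized host collection. A self-contained CUDA sketch of that single-buffer transfer pattern follows; the two-column payload (xLocal, iphi) and its packing are hypothetical stand-ins for the real hit SoA, which also carries alignment padding and is managed by the Portable*Collection wrappers rather than raw cudaMalloc/cudaMallocHost.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

#define CUDA_CHECK(x)                                            \
  do {                                                           \
    cudaError_t err = (x);                                       \
    if (err != cudaSuccess) {                                    \
      std::printf("CUDA error: %s\n", cudaGetErrorString(err));  \
      return 1;                                                  \
    }                                                            \
  } while (0)

int main() {
  constexpr int n = 1024;
  // One contiguous buffer holding two columns back to back.
  const size_t bufferSize = n * sizeof(float) + n * sizeof(uint16_t);

  cudaStream_t stream;
  CUDA_CHECK(cudaStreamCreate(&stream));

  unsigned char *d_buf = nullptr, *h_buf = nullptr;
  CUDA_CHECK(cudaMalloc(reinterpret_cast<void**>(&d_buf), bufferSize));
  CUDA_CHECK(cudaMallocHost(reinterpret_cast<void**>(&h_buf), bufferSize));  // pinned, so the async copy overlaps

  CUDA_CHECK(cudaMemsetAsync(d_buf, 0, bufferSize, stream));  // stand-in for the GPU producer filling the SoA

  // The whole collection moves in a single device-to-host transfer,
  // regardless of how many columns it contains.
  CUDA_CHECK(cudaMemcpyAsync(h_buf, d_buf, bufferSize, cudaMemcpyDeviceToHost, stream));
  CUDA_CHECK(cudaStreamSynchronize(stream));  // wait before reading; the ExternalWork callback plays this role in the framework

  // Columns are recovered by offset into the host buffer.
  const float* xLocal = reinterpret_cast<const float*>(h_buf);
  const uint16_t* iphi = reinterpret_cast<const uint16_t*>(h_buf + n * sizeof(float));
  std::printf("first hit: xLocal=%f iphi=%hu\n", xLocal[0], iphi[0]);

  CUDA_CHECK(cudaFreeHost(h_buf));
  CUDA_CHECK(cudaFree(d_buf));
  CUDA_CHECK(cudaStreamDestroy(stream));
  return 0;
}
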
cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); nHits_ = inputData.nHits(); + hits_h_ = HitsOnHost(nHits_, ctx.stream()); + cudaCheck(cudaMemcpyAsync(hits_h_.buffer().get(), + inputData.const_buffer().get(), + inputData.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // Copy data from Device to Host LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits"; - - if (0 == nHits_) - return; - store32_ = inputData.store32ToHostAsync(ctx.stream()); - store16_ = inputData.store16ToHostAsync(ctx.stream()); - hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream()); } template @@ -88,10 +87,10 @@ void SiPixelRecHitSoAFromCUDAT::produce(edm::Event& iEvent, edm:: auto hmsp = std::make_unique(TrackerTraits::numberOfModules + 1); if (nHits_ > 0) - std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + TrackerTraits::numberOfModules + 1, hmsp.get()); + std::copy(hits_h_.view().hitsModuleStart().begin(), hits_h_.view().hitsModuleStart().end(), hmsp.get()); iEvent.emplace(hostPutToken_, std::move(hmsp)); - iEvent.emplace(hitsPutTokenCPU_, store32_, store16_, hitsModuleStart_.get(), nHits_); + iEvent.emplace(hitsPutTokenCPU_, std::move(hits_h_)); } using SiPixelRecHitSoAFromCUDA = SiPixelRecHitSoAFromCUDAT; diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index 1edc7870f4800..dfc18d31154f2 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -3,7 +3,8 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" #include "CUDADataFormats/Common/interface/HostProduct.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -35,8 +36,9 @@ class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - using HitModuleStart = std::array; + using HitModuleStart = std::array; using HMSstorage = HostProduct; + using HitsOnHost = TrackingRecHitSoAHost; private: void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; @@ -45,7 +47,7 @@ class SiPixelRecHitSoAFromLegacyT : public edm::global::EDProducer<> { const edm::ESGetToken cpeToken_; const edm::EDGetTokenT bsGetToken_; const edm::EDGetTokenT clusterToken_; // Legacy Clusters - const edm::EDPutTokenT> tokenHit_; + const edm::EDPutTokenT tokenHit_; const edm::EDPutTokenT tokenModuleStart_; const bool convert2Legacy_; }; @@ -56,7 +58,7 @@ SiPixelRecHitSoAFromLegacyT::SiPixelRecHitSoAFromLegacyT(const ed cpeToken_(esConsumes(edm::ESInputTag("", iConfig.getParameter("CPE")))), bsGetToken_{consumes(iConfig.getParameter("beamSpot"))}, clusterToken_{consumes(iConfig.getParameter("src"))}, - tokenHit_{produces>()}, + tokenHit_{produces()}, tokenModuleStart_{produces()}, convert2Legacy_(iConfig.getParameter("convertToLegacy")) { if (convert2Legacy_) @@ 
-99,12 +101,11 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, iEvent.getByToken(clusterToken_, hclusters); auto const& input = *hclusters; - constexpr int maxModules = TrackerTraits::numberOfModules; + constexpr int nModules = TrackerTraits::numberOfModules; constexpr int startBPIX2 = pixelTopology::layerStart(1); // allocate a buffer for the indices of the clusters - auto hmsp = std::make_unique(maxModules + 1); - // hitsModuleStart is a non-owning pointer to the buffer + auto hmsp = std::make_unique(nModules + 1); auto hitsModuleStart = hmsp.get(); // wrap the buffer in a HostProduct auto hms = std::make_unique(std::move(hmsp)); @@ -114,28 +115,19 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, // legacy output auto legacyOutput = std::make_unique(); - // storage - std::vector xx; - std::vector yy; - std::vector adc; - std::vector moduleInd; - std::vector clus; - std::vector, SiPixelCluster>> clusterRef; constexpr uint32_t maxHitsInModule = gpuClustering::maxHitsInModule(); - HitModuleStart moduleStart_; // index of the first pixel of each module - HitModuleStart clusInModule_; - memset(&clusInModule_, 0, sizeof(HitModuleStart)); // needed?? - memset(&moduleStart_, 0, sizeof(HitModuleStart)); - assert(gpuClustering::maxNumModules + 1 == clusInModule_.size()); - assert(0 == clusInModule_[gpuClustering::maxNumModules]); - uint32_t moduleId_; - moduleStart_[1] = 0; // we run sequentially.... + cms::cuda::PortableHostCollection> clusters_h(nModules + 1); + + memset(clusters_h.view().clusInModule(), 0, (nModules + 1) * sizeof(uint32_t)); // needed?? + memset(clusters_h.view().moduleStart(), 0, (nModules + 1) * sizeof(uint32_t)); + memset(clusters_h.view().moduleId(), 0, (nModules + 1) * sizeof(uint32_t)); + memset(clusters_h.view().clusModuleStart(), 0, (nModules + 1) * sizeof(uint32_t)); - SiPixelClustersCUDA::SiPixelClustersCUDASOAView clusterView{ - moduleStart_.data(), clusInModule_.data(), &moduleId_, hitsModuleStart}; + assert(0 == clusters_h.view()[nModules].clusInModule()); + clusters_h.view()[1].moduleStart() = 0; // fill cluster arrays int numberOfClusters = 0; @@ -144,33 +136,33 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto gind = genericDet->index(); - assert(gind < maxModules); + assert(gind < nModules); auto const nclus = dsv.size(); - clusInModule_[gind] = nclus; + clusters_h.view()[gind].clusInModule() = nclus; numberOfClusters += nclus; } - hitsModuleStart[0] = 0; - - for (int i = 1, n = maxModules + 1; i < n; ++i) - hitsModuleStart[i] = hitsModuleStart[i - 1] + clusInModule_[i - 1]; + clusters_h.view()[0].clusModuleStart() = 0; - assert(numberOfClusters == int(hitsModuleStart[maxModules])); + for (int i = 1; i < nModules + 1; ++i) { + clusters_h.view()[i].clusModuleStart() = + clusters_h.view()[i - 1].clusModuleStart() + clusters_h.view()[i - 1].clusInModule(); + } + assert((uint32_t)numberOfClusters == clusters_h.view()[nModules].clusModuleStart()); // output SoA // element 96 is the start of BPIX2 (i.e. 
the number of clusters in BPIX1) - - auto output = std::make_unique>( - numberOfClusters, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); + HitsOnHost output( + numberOfClusters, clusters_h.view()[startBPIX2].clusModuleStart(), &cpeView, clusters_h.view().clusModuleStart()); if (0 == numberOfClusters) { - iEvent.put(std::move(output)); + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); return; } if (convert2Legacy_) - legacyOutput->reserve(maxModules, numberOfClusters); + legacyOutput->reserve(nModules, numberOfClusters); int numberOfDetUnits = 0; int numberOfHits = 0; @@ -180,16 +172,17 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, DetId detIdObject(detid); const GeomDetUnit* genericDet = geom_->idToDetUnit(detIdObject); auto const gind = genericDet->index(); - assert(gind < maxModules); + assert(gind < nModules); const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); auto const nclus = dsv.size(); - assert(clusInModule_[gind] == nclus); + + assert(clusters_h.view()[gind].clusInModule() == nclus); if (0 == nclus) continue; // is this really possible? - auto const fc = hitsModuleStart[gind]; - auto const lc = hitsModuleStart[gind + 1]; + auto const fc = clusters_h.view()[gind].clusModuleStart(); + auto const lc = clusters_h.view()[gind + 1].clusModuleStart(); assert(lc > fc); LogDebug("SiPixelRecHitSoAFromLegacy") << "in det " << gind << ": conv " << nclus << " hits from " << dsv.size() << " legacy clusters" << ' ' << fc << ',' << lc; @@ -198,25 +191,30 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, printf( "WARNING: too many clusters %d in Module %d. Only first %d Hits converted\n", nclus, gind, maxHitsInModule); - // fill digis - xx.clear(); - yy.clear(); - adc.clear(); - moduleInd.clear(); - clus.clear(); + // count digis + uint32_t ndigi = 0; + for (auto const& clust : dsv) { + assert(clust.size() > 0); + ndigi += clust.size(); + } + + cms::cuda::PortableHostCollection> digis_h(ndigi); + clusterRef.clear(); - moduleId_ = gind; + clusters_h.view()[0].moduleId() = gind; + uint32_t ic = 0; - uint32_t ndigi = 0; + ndigi = 0; + //filling digis for (auto const& clust : dsv) { assert(clust.size() > 0); for (int i = 0, nd = clust.size(); i < nd; ++i) { auto px = clust.pixel(i); - xx.push_back(px.x); - yy.push_back(px.y); - adc.push_back(px.adc); - moduleInd.push_back(gind); - clus.push_back(ic); + digis_h.view()[ndigi].xx() = px.x; + digis_h.view()[ndigi].yy() = px.y; + digis_h.view()[ndigi].adc() = px.adc; + digis_h.view()[ndigi].moduleId() = gind; + digis_h.view()[ndigi].clus() = ic; ++ndigi; } @@ -225,25 +223,19 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, ic++; } assert(nclus == ic); - assert(clus.size() == ndigi); + numberOfHits += nclus; // filled creates view - SiPixelDigisCUDASOAView digiView; - digiView.xx_ = xx.data(); - digiView.yy_ = yy.data(); - digiView.adc_ = adc.data(); - digiView.moduleInd_ = moduleInd.data(); - digiView.clus_ = clus.data(); - digiView.pdigi_ = nullptr; - digiView.rawIdArr_ = nullptr; - assert(digiView.adc(0) != 0); + assert(digis_h.view()[0].adc() != 0); // we run on blockId.x==0 - gpuPixelRecHits::getHits(&cpeView, &bsHost, digiView, ndigi, &clusterView, output->view()); + + gpuPixelRecHits::getHits(&cpeView, &bsHost, digis_h.view(), ndigi, clusters_h.view(), output.view()); for (auto h = fc; h < lc; ++h) if (h - fc < maxHitsInModule) - assert(gind == output->view()->detectorIndex(h)); + assert(gind 
== output.view()[h].detectorIndex()); else - assert(gpuClustering::invalidModuleId == output->view()->detectorIndex(h)); + assert(gpuClustering::invalidModuleId == output.view()[h].detectorIndex()); + if (convert2Legacy_) { SiPixelRecHitCollectionNew::FastFiller recHitsOnDetUnit(*legacyOutput, detid); for (auto h = fc; h < lc; ++h) { @@ -253,8 +245,9 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, break; assert(ih < clusterRef.size()); - LocalPoint lp(output->view()->xLocal(h), output->view()->yLocal(h)); - LocalError le(output->view()->xerrLocal(h), 0, output->view()->yerrLocal(h)); + LocalPoint lp(output.view()[h].xLocal(), output.view()[h].yLocal()); + LocalError le(output.view()[h].xerrLocal(), 0, output.view()[h].yerrLocal()); + SiPixelRecHitQuality::QualWordType rqw = 0; SiPixelRecHit hit(lp, le, rqw, *genericDet, clusterRef[ih]); recHitsOnDetUnit.push_back(hit); @@ -267,24 +260,28 @@ void SiPixelRecHitSoAFromLegacyT::produce(edm::StreamID streamID, // fill data structure to support CA constexpr auto nLayers = TrackerTraits::numberOfLayers; for (auto i = 0U; i < nLayers + 1; ++i) { - output->hitsLayerStart()[i] = hitsModuleStart[cpeView.layerGeometry().layerStart[i]]; + output.view().hitsLayerStart()[i] = clusters_h.view()[cpeView.layerGeometry().layerStart[i]].clusModuleStart(); LogDebug("SiPixelRecHitSoAFromLegacy") << "Layer n." << i << " - starting at module: " << cpeView.layerGeometry().layerStart[i] << " - starts ad cluster: " << output->hitsLayerStart()[i] << "\n"; } - cms::cuda::fillManyFromVector(output->phiBinner(), + cms::cuda::fillManyFromVector(&(output.view().phiBinner()), nLayers, - output->iphi(), - output->hitsLayerStart(), - numberOfHits, + output.view().iphi(), + output.view().hitsLayerStart().data(), + output.view().nHits(), 256, - output->phiBinnerStorage()); + output.view().phiBinnerStorage()); LogDebug("SiPixelRecHitSoAFromLegacy") << "created HitSoa for " << numberOfClusters << " clusters in " << numberOfDetUnits << " Dets" << "\n"; - iEvent.put(std::move(output)); + + // copy pointer to data (SoA view) to allocated buffer + memcpy(hitsModuleStart, clusters_h.view().clusModuleStart(), nModules * sizeof(uint32_t)); + + iEvent.emplace(tokenHit_, std::move(output)); if (convert2Legacy_) iEvent.put(std::move(legacyOutput)); } diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h index f0798cc74a975..09d0b55030d9c 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h @@ -7,11 +7,11 @@ #include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" //#define GPU_DEBUG 1 namespace gpuPixelRecHits { @@ -19,20 +19,17 @@ namespace gpuPixelRecHits { template __global__ void getHits(pixelCPEforGPU::ParamsOnGPUT const* __restrict__ cpeParams, BeamSpotPOD const* __restrict__ bs, - SiPixelDigisCUDASOAView const digis, + 
SiPixelDigisCUDASOAConstView digis, int numElements, - SiPixelClustersCUDA::SiPixelClustersCUDASOAView const* __restrict__ pclusters, - TrackingRecHit2DSOAViewT* phits) { + SiPixelClustersCUDASOAConstView clusters, + TrackingRecHitSoAView hits) { // FIXME // the compiler seems NOT to optimize loads from views (even in a simple test case) // The whole gimnastic here of copying or not is a pure heuristic exercise that seems to produce the fastest code with the above signature // not using views (passing a gazzilion of array pointers) seems to produce the fastest code (but it is harder to mantain) - assert(phits); assert(cpeParams); - auto& hits = *phits; - auto const& clusters = *pclusters; // copy average geometry corrected by beamspot . FIXME (move it somewhere else???) if (0 == blockIdx.x) { auto& agc = hits.averageGeometry(); @@ -51,7 +48,6 @@ namespace gpuPixelRecHits { if (0 == threadIdx.x) { agc.endCapZ[0] = ag.endCapZ[0] - bs->z; agc.endCapZ[1] = ag.endCapZ[1] - bs->z; - // printf("endcapZ %f %f\n",agc.endCapZ[0],agc.endCapZ[1]); } } @@ -64,23 +60,22 @@ namespace gpuPixelRecHits { // as usual one block per module __shared__ ClusParams clusParams; - auto me = clusters.moduleId(blockIdx.x); - int nclus = clusters.clusInModule(me); + auto me = clusters[blockIdx.x].moduleId(); + int nclus = clusters[me].clusInModule(); if (0 == nclus) return; -// #ifdef GPU_DEBUG -// if (threadIdx.x == 0) { -// auto k = clusters.moduleStart(1 + blockIdx.x); -// while (digis.moduleInd(k) == invalidModuleId) -// ++k; -// assert(digis.moduleInd(k) == me); -// } -// #endif #ifdef GPU_DEBUG + if (threadIdx.x == 0) { + auto k = clusters[1 + blockIdx.x].moduleStart(); + while (digis[k].moduleId() == invalidModuleId) + ++k; + assert(digis[k].moduleId() == me); + } + if (me % 100 == 1) if (threadIdx.x == 0) - printf("hitbuilder: %d clusters in module %d. will write at %d\n", nclus, me, clusters.clusModuleStart(me)); + printf("hitbuilder: %d clusters in module %d. 
will write at %d\n", nclus, me, clusters[me].clusModuleStart()); #endif for (int startClus = 0, endClus = nclus; startClus < endClus; startClus += MaxHitsInIter) { @@ -108,21 +103,21 @@ namespace gpuPixelRecHits { __syncthreads(); // one thread per "digi" - auto first = clusters.moduleStart(1 + blockIdx.x) + threadIdx.x; + auto first = clusters[1 + blockIdx.x].moduleStart() + threadIdx.x; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleId(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); atomicMin(&clusParams.minRow[cl], x); atomicMax(&clusParams.maxRow[cl], x); atomicMin(&clusParams.minCol[cl], y); @@ -133,20 +128,20 @@ namespace gpuPixelRecHits { auto pixmx = cpeParams->detParams(me).pixmx; for (int i = first; i < numElements; i += blockDim.x) { - auto id = digis.moduleInd(i); + auto id = digis[i].moduleId(); if (id == invalidModuleId) continue; // not valid if (id != me) break; // end of module - auto cl = digis.clus(i); + auto cl = digis[i].clus(); if (cl < startClus || cl >= lastClus) continue; cl -= startClus; assert(cl >= 0); assert(cl < MaxHitsInIter); - auto x = digis.xx(i); - auto y = digis.yy(i); - auto ch = digis.adc(i); + auto x = digis[i].xx(); + auto y = digis[i].yy(); + auto ch = digis[i].adc(); atomicAdd(&clusParams.charge[cl], ch); ch = std::min(ch, pixmx); if (clusParams.minRow[cl] == x) @@ -163,30 +158,31 @@ namespace gpuPixelRecHits { // next one cluster per thread... 
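Aside, not part of the patch: the digis[i].xx() and clusters[me].clusInModule() calls above are the element accessors of the generated SoA views, replacing the per-index methods of the old hand-written SOAView classes removed in this hunk. A minimal host-side sketch of the same idiom follows; sumAdcInModule is an illustrative helper, and it assumes only what is visible in this PR (operator[] yielding an element with per-column getters, metadata().size() giving the number of rows).

    #include <cstdint>

    #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"

    // Illustrative only: sum the ADC of all digis belonging to one module,
    // reading the columns through the generated const view's element accessors.
    inline uint64_t sumAdcInModule(SiPixelDigisCUDASOAConstView const& digis, uint16_t module) {
      uint64_t sum = 0;
      for (int32_t i = 0; i < digis.metadata().size(); ++i) {
        if (digis[i].moduleId() != module)
          continue;              // digi belongs to another module
        sum += digis[i].adc();   // same accessor pattern as in getHits() above
      }
      return sum;
    }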
- first = clusters.clusModuleStart(me) + startClus; + first = clusters[me].clusModuleStart() + startClus; for (int ic = threadIdx.x; ic < nClusInIter; ic += blockDim.x) { auto h = first + ic; // output index in global memory assert(h < hits.nHits()); - assert(h < clusters.clusModuleStart(me + 1)); + assert(h < clusters[me + 1].clusModuleStart()); pixelCPEforGPU::position(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); pixelCPEforGPU::errorFromDB(cpeParams->commonParams(), cpeParams->detParams(me), clusParams, ic); // store it - hits.setChargeAndStatus(h, clusParams.charge[ic], clusParams.status[ic]); - hits.detectorIndex(h) = me; + hits[h].chargeAndStatus().charge = clusParams.charge[ic]; + hits[h].chargeAndStatus().status = clusParams.status[ic]; + hits[h].detectorIndex() = me; float xl, yl; - hits.xLocal(h) = xl = clusParams.xpos[ic]; - hits.yLocal(h) = yl = clusParams.ypos[ic]; + hits[h].xLocal() = xl = clusParams.xpos[ic]; + hits[h].yLocal() = yl = clusParams.ypos[ic]; - hits.clusterSizeX(h) = clusParams.xsize[ic]; - hits.clusterSizeY(h) = clusParams.ysize[ic]; + hits[h].clusterSizeX() = clusParams.xsize[ic]; + hits[h].clusterSizeY() = clusParams.ysize[ic]; - hits.xerrLocal(h) = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; - hits.yerrLocal(h) = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; + hits[h].xerrLocal() = clusParams.xerr[ic] * clusParams.xerr[ic] + cpeParams->detParams(me).apeXX; + hits[h].yerrLocal() = clusParams.yerr[ic] * clusParams.yerr[ic] + cpeParams->detParams(me).apeYY; // keep it local for computations float xg, yg, zg; @@ -197,12 +193,12 @@ namespace gpuPixelRecHits { yg -= bs->y; zg -= bs->z; - hits.xGlobal(h) = xg; - hits.yGlobal(h) = yg; - hits.zGlobal(h) = zg; + hits[h].xGlobal() = xg; + hits[h].yGlobal() = yg; + hits[h].zGlobal() = zg; - hits.rGlobal(h) = std::sqrt(xg * xg + yg * yg); - hits.iphi(h) = unsafe_atan2s<7>(yg, xg); + hits[h].rGlobal() = std::sqrt(xg * xg + yg * yg); + hits[h].iphi() = unsafe_atan2s<7>(yg, xg); } __syncthreads(); } // end loop on batches diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index ec3e068bca422..7284dab68f05e 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -56,7 +56,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("pixelTopologyPhase1TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) @@ -64,7 +64,7 @@ phase2_tracker.toModify(siPixelRecHitsPreSplittingSoA, cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHit2DCPUT")), + cms.PSet(type = cms.string("pixelTopologyPhase2TrackingRecHitSoAHost")), cms.PSet(type = cms.string("uintAsHostProduct")) ))) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc index ef73c625ebfa8..ac58a494cdf58 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc @@ -1,9 +1,7 @@ #include +#include // needed here by soa layout #include "CUDADataFormats/Common/interface/Product.h" 
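Aside, not part of the patch: a minimal sketch of what consuming the new host-side track SoA looks like downstream, using only accessors that appear in this PR (buffer(), view().nTracks(), view()[i].pt(), view()[i].chi2()). dumpTracks is an illustrative function name, and TrackerTraits stands for whichever pixel topology the producer was instantiated with.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"

    // Illustrative only: print a few per-track columns from a host-side track SoA.
    template <typename TrackerTraits>
    void dumpTracks(TrackSoAHeterogeneousHost<TrackerTraits> const& tracks_h) {
      assert(tracks_h.buffer());            // the collection owns a valid host buffer
      auto const& view = tracks_h.view();
      auto const nTracks = view.nTracks();  // scalar column of the SoA
      for (int32_t it = 0; it < int32_t(nTracks); ++it)
        printf("track %d: pt %f chi2 %f\n", it, double(view[it].pt()), double(view[it].chi2()));
    }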
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/ConsumesCollector.h" #include "FWCore/Framework/interface/Event.h" @@ -20,10 +18,21 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + template class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; + + using VertexSoAHost = ZVertexSoAHost; + using VertexSoADevice = ZVertexSoADevice; + explicit PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig); ~PixelTrackDumpCUDAT() override = default; @@ -32,23 +41,21 @@ class PixelTrackDumpCUDAT : public edm::global::EDAnalyzer<> { private: void analyze(edm::StreamID streamID, edm::Event const& iEvent, const edm::EventSetup& iSetup) const override; const bool m_onGPU; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDGetTokenT> tokenGPUVertex_; - edm::EDGetTokenT tokenSoATrack_; - edm::EDGetTokenT tokenSoAVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDGetTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenSoATrack_; + edm::EDGetTokenT tokenSoAVertex_; }; template PixelTrackDumpCUDAT::PixelTrackDumpCUDAT(const edm::ParameterSet& iConfig) : m_onGPU(iConfig.getParameter("onGPU")) { if (m_onGPU) { - tokenGPUTrack_ = - consumes>(iConfig.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = - consumes>(iConfig.getParameter("pixelVertexSrc")); + tokenGPUTrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } else { tokenSoATrack_ = consumes(iConfig.getParameter("pixelTrackSrc")); - tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); + tokenSoAVertex_ = consumes(iConfig.getParameter("pixelVertexSrc")); } } @@ -71,19 +78,19 @@ void PixelTrackDumpCUDAT::analyze(edm::StreamID streamID, cms::cuda::ScopedContextProduce ctx{hTracks}; auto const& tracks = ctx.get(hTracks); - auto const* tsoa = tracks.get(); - assert(tsoa); + auto const* tsoa = &tracks; + assert(tsoa->buffer()); auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_)); - auto const* vsoa = vertices.get(); - assert(vsoa); + auto const* vsoa = &vertices; + assert(vsoa->buffer()); } else { - auto const* tsoa = iEvent.get(tokenSoATrack_).get(); - assert(tsoa); + auto const& tsoa = iEvent.get(tokenSoATrack_); + assert(tsoa.buffer()); - auto const* vsoa = iEvent.get(tokenSoAVertex_).get(); - assert(vsoa); + auto const& vsoa = iEvent.get(tokenSoAVertex_); + assert(vsoa.buffer()); } } diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc index 6a0f918b0d979..358d0b7b63e0c 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc @@ -27,20 
+27,24 @@ #include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "storeTracks.h" #include "CUDADataFormats/Common/interface/HostProduct.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" + /** * This class creates "leagcy" reco::Track * objects from the output of SoA CA. */ template class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using tracksHelpers = TracksUtilities; public: using IndToEdm = std::vector; @@ -50,7 +54,6 @@ class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { static void fillDescriptions(edm::ConfigurationDescriptions &descriptions); - // using HitModuleStart = std::array; using HMSstorage = HostProduct; private: @@ -58,7 +61,7 @@ class PixelTrackProducerFromSoAT : public edm::global::EDProducer<> { // Event Data tokens const edm::EDGetTokenT tBeamSpot_; - const edm::EDGetTokenT tokenTrack_; + const edm::EDGetTokenT tokenTrack_; const edm::EDGetTokenT cpuHits_; const edm::EDGetTokenT hmsToken_; // Event Setup tokens @@ -139,6 +142,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, std::vector hitmap; auto const &rcs = rechits.data(); auto nhits = rcs.size(); + hitmap.resize(nhits, nullptr); auto const *hitsModuleStart = iEvent.get(hmsToken_).get(); @@ -152,6 +156,7 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, auto i = fc[detI] + clus.pixelCluster().originalId(); if (i >= hitmap.size()) hitmap.resize(i + 256, nullptr); // only in case of hit overflow in one module + assert(nullptr == hitmap[i]); hitmap[i] = &h; } @@ -159,12 +164,10 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, std::vector hits; hits.reserve(5); - const auto &tsoa = *iEvent.get(tokenTrack_); - - auto const *quality = tsoa.qualityData(); - auto const &fit = tsoa.stateAtBS; - auto const &hitIndices = tsoa.hitIndices; - auto nTracks = tsoa.nTracks(); + auto const &tsoa = iEvent.get(tokenTrack_); + auto const *quality = tsoa.view().quality(); + auto const &hitIndices = tsoa.view().hitIndices(); + auto nTracks = tsoa.view().nTracks(); tracks.reserve(nTracks); @@ -173,19 +176,20 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, //sort index by pt std::vector sortIdxs(nTracks); std::iota(sortIdxs.begin(), sortIdxs.end(), 0); - std::sort( - sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { return tsoa.pt(i1) > tsoa.pt(i2); }); + std::sort(sortIdxs.begin(), sortIdxs.end(), [&](int32_t const i1, int32_t const i2) { + return tsoa.view()[i1].pt() > tsoa.view()[i2].pt(); + }); //store the index of the SoA: indToEdm[index_SoAtrack] -> index_edmTrack (if it exists) indToEdm.resize(sortIdxs.size(), -1); for (const auto &it : sortIdxs) { - auto nHits = tsoa.nHits(it); + auto nHits = tracksHelpers::nHits(tsoa.view(), it); assert(nHits >= 3); auto q = quality[it]; if (q < minQuality_) continue; - if (tsoa.nLayers(it) < minNumberOfHits_) + if (nHits < minNumberOfHits_) //move to nLayers? 
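Aside, not part of the patch: the per-track helpers that used to be member functions (nHits, phi, tip, copyToDense) now live in TracksUtilities<TrackerTraits> and take the view as their first argument, as in the selection loop above. The sketch below illustrates that calling convention; countSelected, minQuality and minHits are illustrative names, the view type is left as a template parameter rather than spelling out the exact alias, and pixelTrack::Quality is assumed to be reachable through the PixelTrackUtilities.h header used elsewhere in this diff.

    #include <cstdint>

    #include "CUDADataFormats/Track/interface/PixelTrackUtilities.h"

    // Illustrative only: count tracks passing a quality and hit-multiplicity cut,
    // mixing a direct column read (quality()) with a TracksUtilities free helper (nHits).
    template <typename TrackerTraits, typename TrackSoAConstView>
    int countSelected(TrackSoAConstView const& tracks, pixelTrack::Quality minQuality, int minHits) {
      using helpers = TracksUtilities<TrackerTraits>;
      int selected = 0;
      auto const nTracks = tracks.nTracks();
      for (int32_t it = 0; it < int32_t(nTracks); ++it) {
        if (tracks[it].quality() < minQuality)
          continue;                              // same ordering-based cut as in the producer above
        if (helpers::nHits(tracks, it) < minHits)
          continue;
        ++selected;
      }
      return selected;
    }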
continue; indToEdm[it] = nt; ++nt; @@ -197,12 +201,12 @@ void PixelTrackProducerFromSoAT::produce(edm::StreamID streamID, // mind: this values are respect the beamspot! - float chi2 = tsoa.chi2(it); - float phi = tsoa.phi(it); + float chi2 = tsoa.view()[it].chi2(); + float phi = tracksHelpers::phi(tsoa.view(), it); riemannFit::Vector5d ipar, opar; riemannFit::Matrix5d icov, ocov; - fit.copyToDense(ipar, icov, it); + tracksHelpers::template copyToDense(tsoa.view(), ipar, icov, it); riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov); LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.); diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index 0675effd091e8..82c21da184ab9 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,8 +1,11 @@ #include +#include // needed here by soa layout #include "CUDADataFormats/Common/interface/Product.h" #include "CUDADataFormats/Common/interface/HostProduct.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -21,8 +24,8 @@ template class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - using TrackSoA = pixelTrack::TrackSoAT; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; public: explicit PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig); @@ -36,16 +39,15 @@ class PixelTrackSoAFromCUDAT : public edm::stream::EDProducer edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - edm::EDGetTokenT> tokenCUDA_; - edm::EDPutTokenT tokenSOA_; + edm::EDGetTokenT> tokenCUDA_; + edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + TrackSoAHost tracks_h_; }; template PixelTrackSoAFromCUDAT::PixelTrackSoAFromCUDAT(const edm::ParameterSet& iConfig) - : tokenCUDA_(consumes>(iConfig.getParameter("src"))), - tokenSOA_(produces()) {} + : tokenCUDA_(consumes(iConfig.getParameter("src"))), tokenSOA_(produces()) {} template void PixelTrackSoAFromCUDAT::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -59,19 +61,22 @@ template void PixelTrackSoAFromCUDAT::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); + cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_); cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - auto const& inputData = ctx.get(inputDataWrapped); - - soa_ = inputData.toHostAsync(ctx.stream()); + auto const& tracks_d = ctx.get(inputDataWrapped); // Tracks on device + tracks_h_ = TrackSoAHost(ctx.stream()); // Create an instance of Tracks on Host, using the stream + cudaCheck(cudaMemcpyAsync(tracks_h_.buffer().get(), + tracks_d.const_buffer().get(), + tracks_d.bufferSize(), + cudaMemcpyDeviceToHost, + ctx.stream())); // 
Copy data from Device to Host } template void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - auto const& tsoa = *soa_; - auto maxTracks = tsoa.stride(); + auto maxTracks = tracks_h_.view().metadata().size(); + auto nTracks = tracks_h_.view().nTracks(); - auto nTracks = tsoa.nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 @@ -84,8 +89,8 @@ void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::Eve int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = TracksUtilities::nHits(tracks_h_.view(), it); + assert(nHits == int(tracks_h_.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... nt++; @@ -94,9 +99,8 @@ void PixelTrackSoAFromCUDAT::produce(edm::Event& iEvent, edm::Eve #endif // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, std::move(soa_)); - - assert(!soa_); + iEvent.emplace(tokenSOA_, std::move(tracks_h_)); + assert(!tracks_h_.buffer()); } using PixelTrackSoAFromCUDA = PixelTrackSoAFromCUDAT; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc index d6a9db4953be1..be92f2d5d0fa2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cc @@ -1,7 +1,7 @@ #include "BrokenLineFitOnGPU.h" template -void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(HitsView const* hv, +void HelixFitOnGPU::launchBrokenLineKernelsOnCPU(const TrackingRecHitSoAConstView &hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples) { assert(tuples_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index b1ee028b8863e..c5c9ac7fc6345 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -2,7 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" template -void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, +void HelixFitOnGPU::launchBrokenLineKernels(const TrackingRecHitSoAConstView& hv, uint32_t hitsInFit, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h index 4d1d57c4e27a8..e347b0c000dc3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.h @@ -8,7 +8,7 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -17,23 +17,18 @@ #include "HelixFitOnGPU.h" template -using HitsOnGPU = TrackingRecHit2DSOAViewT; +using Tuples = typename TrackSoA::HitContainer; template -using Tuples = pixelTrack::HitContainerT; -template -using OutputSoA = pixelTrack::TrackSoAT; +using OutputSoAView = TrackSoAView; template using TupleMultiplicity = 
caStructures::TupleMultiplicityT; -// using tindex_type = typename TrackerTraits::tindex_type; -// constexpr auto invalidTkId = std::numeric_limits::max(); - // #define BL_DUMP_HITS template __global__ void kernel_BLFastFit(Tuples const *__restrict__ foundNtuplets, TupleMultiplicity const *__restrict__ tupleMultiplicity, - HitsOnGPU const *__restrict__ hhp, + TrackingRecHitSoAConstView hh, typename TrackerTraits::tindex_type *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, @@ -46,7 +41,6 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found assert(hitsInFit <= nHitsL); assert(nHitsL <= nHitsH); - assert(hhp); assert(phits); assert(pfast_fit); assert(foundNtuplets); @@ -100,9 +94,9 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found // #define YERR_FROM_DC #ifdef YERR_FROM_DC // try to compute more precise error in y - auto dx = hhp->xGlobal(hitId[hitsInFit - 1]) - hhp->xGlobal(hitId[0]); - auto dy = hhp->yGlobal(hitId[hitsInFit - 1]) - hhp->yGlobal(hitId[0]); - auto dz = hhp->zGlobal(hitId[hitsInFit - 1]) - hhp->zGlobal(hitId[0]); + auto dx = hh[hitId[hitsInFit - 1]].xGlobal() - hh[hitId[0]].xGlobal(); + auto dy = hh[hitId[hitsInFit - 1]].yGlobal() - hh[hitId[0]].yGlobal(); + auto dz = hh[hitId[hitsInFit - 1]].zGlobal() - hh[hitId[0]].zGlobal(); float ux, uy, uz; #endif @@ -118,8 +112,8 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found float ge[6]; #ifdef YERR_FROM_DC - auto const &dp = hhp->cpeParams().detParams(hhp->detectorIndex(hit)); - auto status = hhp->status(hit); + auto const &dp = hh.cpeParams().detParams(hh.detectorIndex(hit)); + auto status = hh[hit].chargeAndStatus().status; int qbin = CPEFastParametrisation::kGenErrorQBins - 1 - status.qBin; assert(qbin >= 0 && qbin < 5); bool nok = (status.isBigY | status.isOneY); @@ -136,12 +130,10 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found yerr *= dp.yfact[qbin]; // inflate yerr *= yerr; yerr += dp.apeYY; - yerr = nok ? hhp->yerrLocal(hit) : yerr; - dp.frame.toGlobal(hhp->xerrLocal(hit), 0, yerr, ge); + yerr = nok ? 
hh[hit].yerrLocal() : yerr; + dp.frame.toGlobal(hh[hit].xerrLocal(), 0, yerr, ge); #else - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); + hh.cpeParams().detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); #endif #ifdef BL_DUMP_HITS @@ -151,16 +143,16 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found local_idx, tkid, hit, - hhp->detectorIndex(hit), + hh[hit].detectorIndex(), i, - hhp->xGlobal(hit), - hhp->yGlobal(hit), - hhp->zGlobal(hit)); + hh[hit].xGlobal(), + hh[hit].yGlobal(), + hh[hit].zGlobal()); printf("Error: hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n", i, ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]); } #endif - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } brokenline::fastFit(hits, fast_fit); @@ -176,12 +168,14 @@ __global__ void kernel_BLFastFit(Tuples const *__restrict__ found template __global__ void kernel_BLFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, double bField, - OutputSoA *results, + OutputSoAView results_view, typename TrackerTraits::tindex_type const *__restrict__ ptkids, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit) { - assert(results); + assert(results_view.pt()); + assert(results_view.eta()); + assert(results_view.chi2()); assert(pfast_fit); constexpr auto invalidTkId = std::numeric_limits::max(); @@ -209,10 +203,11 @@ __global__ void kernel_BLFit(TupleMultiplicity const *__restrict_ brokenline::lineFit(hits_ge, fast_fit, bField, data, line); brokenline::circleFit(hits, hits_ge, fast_fit, bField, data, circle); - results->stateAtBS.copyFromCircle(circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); - results->pt(tkid) = float(bField) / float(std::abs(circle.par(2))); - results->eta(tkid) = asinhf(line.par(0)); - results->chi2(tkid) = (circle.chi2 + line.chi2) / (2 * N - 5); + TracksUtilities::copyFromCircle( + results_view, circle.par, circle.cov, line.par, line.cov, 1.f / float(bField), tkid); + results_view[tkid].pt() = float(bField) / float(std::abs(circle.par(2))); + results_view[tkid].eta() = asinhf(line.par(0)); + results_view[tkid].chi2() = (circle.chi2 + line.chi2) / (2 * N - 5); #ifdef BROKENLINE_DEBUG if (!(circle.chi2 >= 0) || !(line.chi2 >= 0)) diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml index 95c443c3b51e7..de2a40fc8b0f0 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml @@ -1,5 +1,6 @@ + diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc index fade739410e2f..122f4af710966 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc @@ -17,19 +17,23 @@ #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" + #include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h" #include "CAHitNtupletGeneratorOnGPU.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include 
"CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" + +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" template class CAHitNtupletCUDAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using HitsConstView = TrackingRecHitSoAConstView; + using HitsOnGPU = TrackingRecHitSoADevice; //TODO move to OnDevice + using HitsOnCPU = TrackingRecHitSoAHost; //TODO move to OnHost + + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnGPU = TrackingRecHit2DGPUT; - using HitsOnCPU = TrackingRecHit2DCPUT; using GPUAlgo = CAHitNtupletGeneratorOnGPU; public: @@ -48,9 +52,9 @@ class CAHitNtupletCUDAT : public edm::global::EDProducer<> { edm::ESGetToken tokenField_; edm::EDGetTokenT> tokenHitGPU_; - edm::EDPutTokenT> tokenTrackGPU_; + edm::EDPutTokenT> tokenTrackGPU_; edm::EDGetTokenT tokenHitCPU_; - edm::EDPutTokenT tokenTrackCPU_; + edm::EDPutTokenT tokenTrackCPU_; GPUAlgo gpuAlgo_; }; @@ -60,10 +64,10 @@ CAHitNtupletCUDAT::CAHitNtupletCUDAT(const edm::ParameterSet& iCo : onGPU_(iConfig.getParameter("onGPU")), tokenField_(esConsumes()), gpuAlgo_(iConfig, consumesCollector()) { if (onGPU_) { tokenHitGPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackGPU_ = produces>(); + tokenTrackGPU_ = produces>(); } else { tokenHitCPU_ = consumes(iConfig.getParameter("pixelRecHitSrc")); - tokenTrackCPU_ = produces(); + tokenTrackCPU_ = produces(); } } @@ -95,13 +99,14 @@ void CAHitNtupletCUDAT::produce(edm::StreamID streamID, auto bf = 1. / es.getData(tokenField_).inverseBzAtOriginInGeV(); if (onGPU_) { - auto hHits = iEvent.getHandle(tokenHitGPU_); - cms::cuda::ScopedContextProduce ctx{*hHits}; - auto const& hits = ctx.get(*hHits); - ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream())); + auto const& hits = iEvent.get(tokenHitGPU_); + + cms::cuda::ScopedContextProduce ctx{hits}; + auto& hits_d = ctx.get(hits); + ctx.emplace(iEvent, tokenTrackGPU_, gpuAlgo_.makeTuplesAsync(hits_d, bf, ctx.stream())); } else { - auto const& hits = iEvent.get(tokenHitCPU_); - iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits, bf)); + auto& hits_h = iEvent.get(tokenHitCPU_); + iEvent.emplace(tokenTrackCPU_, gpuAlgo_.makeTuples(hits_h, bf)); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 75fbbffb49190..f826b1b5c89da 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -14,7 +14,9 @@ void CAHitNtupletGeneratorKernelsCPU::printCounters(Counters cons } template -void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { +void CAHitNtupletGeneratorKernelsCPU::buildDoublets(const HitsConstView &hh, + int32_t offsetBPIX2, + cudaStream_t stream) { using namespace gpuPixelDoublets; using GPUCACell = GPUCACellT; @@ -26,7 +28,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con auto nhits = hh.nHits(); #ifdef NTUPLE_DEBUG - std::cout << "building Doublets out of " << nhits << " Hits. BPIX2 offset is " << hh.offsetBPIX2() << std::endl; + std::cout << "building Doublets out of " << nhits << " Hits. 
BPIX2 offset is " << offsetBPIX2 << std::endl; #endif // use "nhits" to heuristically dimension the workspace @@ -35,7 +37,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con //this->device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); this->device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); assert(this->device_isOuterHitOfCell_.get()); - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; auto cellStorageSize = TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + TrackerTraits::maxNumOfActiveDoublets * sizeof(CellTracks); @@ -68,28 +70,22 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU con this->device_nCells_, this->device_theCellNeighbors_.get(), this->device_theCellTracks_.get(), - hh.view(), + hh, this->isOuterHitOfCell_, nActualPairs, this->params_.cellCuts_); } template -void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsCPU::launchKernels(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); - - assert(tuples_d && quality_d); - // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&tracks_view.hitIndices(), cudaStream); - auto nhits = hh.nHits(); + uint32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. N hits " << nhits << std::endl; @@ -103,7 +99,7 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU con kernel_connect(this->device_hitTuple_apc_, this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh, this->device_theCells_.get(), this->device_nCells_, this->device_theCellNeighbors_.get(), @@ -112,91 +108,83 @@ void CAHitNtupletGeneratorKernelsCPU::launchKernels(HitsOnCPU con if (nhits > 1 && this->params_.earlyFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); } - kernel_find_ntuplets(hh.view(), + kernel_find_ntuplets(hh, + tracks_view, this->device_theCells_.get(), this->device_nCells_, this->device_theCellTracks_.get(), - tuples_d, this->device_hitTuple_apc_, - quality_d, this->params_.caParams_); if (this->params_.doStats_) kernel_mark_used(this->device_theCells_.get(), this->device_nCells_); - cms::cuda::finalizeBulk(this->device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk(this->device_hitTuple_apc_, &tracks_view.hitIndices()); - kernel_fillHitDetIndices(tuples_d, hh.view(), detId_d); - kernel_fillNLayers(tracks_d, this->device_hitTuple_apc_); + kernel_fillHitDetIndices(tracks_view, hh); + kernel_fillNLayers(tracks_view, this->device_hitTuple_apc_); // remove duplicates (tracks that share a doublet) kernel_earlyDuplicateRemover( - this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); - kernel_countMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + 
kernel_countMultiplicity(tracks_view, this->device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); - kernel_fillMultiplicity(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + kernel_fillMultiplicity(tracks_view, this->device_tupleMultiplicity_.get()); if (nhits > 1 && this->params_.lateFishbone_) { gpuPixelDoublets::fishbone( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); } } template -void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsCPU::classifyTuples(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - int32_t nhits = hh.nHits(); - - auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); + int32_t nhits = hh.metadata().size(); // classify tracks based on kinematics - kernel_classifyTracks(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + kernel_classifyTracks(tracks_view, this->params_.qualityCuts_); if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks - kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, quality_d); + kernel_fishboneCleaner(this->device_theCells_.get(), this->device_nCells_, tracks_view); } // remove duplicates (tracks that share a doublet) kernel_fastDuplicateRemover( - this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); // fill hit->track "map" if (this->params_.doSharedHitCut_ || this->params_.doStats_) { - kernel_countHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + kernel_countHitInTracks(tracks_view, this->device_hitToTuple_.get()); cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); - kernel_fillHitInTracks(tuples_d, quality_d, this->device_hitToTuple_.get()); + kernel_fillHitInTracks(tracks_view, this->device_hitToTuple_.get()); } // remove duplicates (tracks that share at least one hit) if (this->params_.doSharedHitCut_) { - kernel_rejectDuplicate(tracks_d, - quality_d, + kernel_rejectDuplicate(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); - kernel_sharedHitCleaner(hh.view(), - tracks_d, - quality_d, + kernel_sharedHitCleaner(hh, + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); if (this->params_.useSimpleTripletCleaner_) { - kernel_simpleTripletCleaner(tracks_d, - quality_d, + kernel_simpleTripletCleaner(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); } else { - kernel_tripletCleaner(tracks_d, - quality_d, + kernel_tripletCleaner(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); @@ -205,7 +193,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co if (this->params_.doStats_) { std::lock_guard guard(lock_stat); - kernel_checkOverflows(tuples_d, + kernel_checkOverflows(tracks_view, this->device_tupleMultiplicity_.get(), this->device_hitToTuple_.get(), this->device_hitTuple_apc_, @@ -223,7 +211,7 @@ void 
CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co // counters (add flag???) std::lock_guard guard(lock_stat); kernel_doStatsForHitInTracks(this->device_hitToTuple_.get(), this->counters_); - kernel_doStatsForTracks(tuples_d, quality_d, this->counters_); + kernel_doStatsForTracks(tracks_view, this->counters_); } #ifdef DUMP_GPU_TK_TUPLES @@ -232,8 +220,7 @@ void CAHitNtupletGeneratorKernelsCPU::classifyTuples(HitsOnCPU co { std::lock_guard guard(lock); ++iev; - kernel_print_found_ntuplets( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 0, 1000000, iev); + kernel_print_found_ntuplets(hh, tracks_view, this->device_hitToTuple_.get(), 0, 1000000, iev); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 59ae2041b44aa..cd15b96bcd5fc 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -5,20 +5,16 @@ // #define GPU_DEBUG template -void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsGPU::launchKernels(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace gpuPixelDoublets; using namespace caHitNtupletGeneratorKernels; - // these are pointer on GPU! - auto *tuples_d = &tracks_d->hitIndices; - auto *detId_d = &tracks_d->detIndices; - auto *quality_d = tracks_d->qualityData(); // zero tuples - cms::cuda::launchZero(tuples_d, cudaStream); + cms::cuda::launchZero(&(tracks_view.hitIndices()), cudaStream); //TODO test .data() - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); #ifdef NTUPLE_DEBUG std::cout << "start tuple building. 
N hits " << nhits << std::endl; @@ -45,7 +41,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con kernel_connect <<>>(this->device_hitTuple_apc_, this->device_hitToTuple_apc_, // needed only to be reset, ready for next kernel - hh.view(), + hh, this->device_theCells_.get(), this->device_nCells_, this->device_theCellNeighbors_.get(), @@ -63,19 +59,18 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); fishbone<<>>( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, false); cudaCheck(cudaGetLastError()); } blockSize = 64; numberOfBlocks = (3 * this->params_.cellCuts_.maxNumberOfDoublets_ / 4 + blockSize - 1) / blockSize; - kernel_find_ntuplets<<>>(hh.view(), + kernel_find_ntuplets<<>>(hh, + tracks_view, this->device_theCells_.get(), this->device_nCells_, this->device_theCellTracks_.get(), - tuples_d, this->device_hitTuple_apc_, - quality_d, this->params_.caParams_); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -94,21 +89,23 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con blockSize = 128; numberOfBlocks = (HitContainer::ctNOnes() + blockSize - 1) / blockSize; - cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, tuples_d); + cms::cuda::finalizeBulk<<>>(this->device_hitTuple_apc_, + &tracks_view.hitIndices()); //TODO test .data() #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - kernel_fillHitDetIndices<<>>(tuples_d, hh.view(), detId_d); + kernel_fillHitDetIndices<<>>(tracks_view, hh); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - kernel_fillNLayers<<>>(tracks_d, this->device_hitTuple_apc_); + kernel_fillNLayers + <<>>(tracks_view, this->device_hitTuple_apc_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG @@ -120,7 +117,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_earlyDuplicateRemover<<>>( - this->device_theCells_.get(), this->device_nCells_, tracks_d, quality_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -130,10 +127,10 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con blockSize = 128; numberOfBlocks = (3 * TrackerTraits::maxNumberOfTuples / 4 + blockSize - 1) / blockSize; kernel_countMultiplicity - <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + <<>>(tracks_view, this->device_tupleMultiplicity_.get()); cms::cuda::launchFinalize(this->device_tupleMultiplicity_.get(), cudaStream); kernel_fillMultiplicity - <<>>(tuples_d, quality_d, this->device_tupleMultiplicity_.get()); + <<>>(tracks_view, this->device_tupleMultiplicity_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -149,7 +146,7 @@ void CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con dim3 blks(1, numberOfBlocks, 1); dim3 thrs(stride, blockSize, 1); fishbone<<>>( - hh.view(), this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); + hh, this->device_theCells_.get(), this->device_nCells_, this->isOuterHitOfCell_, nhits, true); cudaCheck(cudaGetLastError()); } @@ -157,14 +154,13 @@ void 
CAHitNtupletGeneratorKernelsGPU::launchKernels(HitsOnCPU con cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif - - // free space asap - // this->device_isOuterHitOfCell_.reset(); } template -void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { - int32_t nhits = hh.nHits(); +void CAHitNtupletGeneratorKernelsGPU::buildDoublets(const HitsConstView &hh, + int32_t offsetBPIX2, + cudaStream_t stream) { + int32_t nhits = hh.metadata().size(); using namespace gpuPixelDoublets; @@ -174,7 +170,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con using CellTracks = typename GPUCACell::CellTracks; using OuterHitOfCellContainer = typename GPUCACell::OuterHitOfCellContainer; - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; @@ -187,10 +183,10 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con // in principle we can use "nhits" to heuristically dimension the workspace... this->device_isOuterHitOfCell_ = - cms::cuda::make_device_unique(std::max(1, nhits - hh.offsetBPIX2()), stream); + cms::cuda::make_device_unique(std::max(1, nhits - offsetBPIX2), stream); assert(this->device_isOuterHitOfCell_.get()); - this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; + this->isOuterHitOfCell_ = OuterHitOfCell{this->device_isOuterHitOfCell_.get(), offsetBPIX2}; this->cellStorage_ = cms::cuda::make_device_unique(TrackerTraits::maxNumOfActiveDoublets * sizeof(CellNeighbors) + @@ -203,7 +199,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con { int threadsPerBlock = 128; // at least one block! - int blocks = (std::max(1, nhits - hh.offsetBPIX2()) + threadsPerBlock - 1) / threadsPerBlock; + int blocks = (std::max(1, nhits - offsetBPIX2) + threadsPerBlock - 1) / threadsPerBlock; initDoublets<<>>(this->isOuterHitOfCell_, nhits, this->device_theCellNeighbors_.get(), @@ -236,7 +232,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con this->device_nCells_, this->device_theCellNeighbors_.get(), this->device_theCellTracks_.get(), - hh.view(), + hh, this->isOuterHitOfCell_, nActualPairs, this->params_.cellCuts_); @@ -249,36 +245,32 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU con } template -void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, - TkSoA *tracks_d, +void CAHitNtupletGeneratorKernelsGPU::classifyTuples(const HitsConstView &hh, + TkSoAView &tracks_view, cudaStream_t cudaStream) { using namespace caHitNtupletGeneratorKernels; - // these are pointer on GPU! 
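Aside, not part of the patch: with the SoA migration the kernels above receive the view objects by value (small wrappers around the device pointers) instead of a pointer to a SoA struct that is dereferenced inside the kernel. A toy kernel with that signature style follows; copyDetIndex and the out buffer are illustrative, and only accessors that appear in this PR are used.

    #include <cstdint>

    // Illustrative only: a kernel taking the hits const view by value and copying
    // one column into a plain device array.
    template <typename HitsConstView>
    __global__ void copyDetIndex(HitsConstView hh, uint16_t* __restrict__ out) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < hh.metadata().size())
        out[i] = hh[i].detectorIndex();   // element accessor, as in gpuPixelRecHits::getHits
    }

    // launch, e.g. with the stream used elsewhere in this file:
    //   copyDetIndex<<<(nhits + 127) / 128, 128, 0, cudaStream>>>(hh, out_d);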
- auto const *tuples_d = &tracks_d->hitIndices; - auto *quality_d = tracks_d->qualityData(); - - int32_t nhits = hh.nHits(); + int32_t nhits = hh.metadata().size(); auto blockSize = 64; // classify tracks based on kinematics auto numberOfBlocks = this->nQuadrupletBlocks(blockSize); kernel_classifyTracks - <<>>(tuples_d, tracks_d, this->params_.qualityCuts_, quality_d); + <<>>(tracks_view, this->params_.qualityCuts_); if (this->params_.lateFishbone_) { // apply fishbone cleaning to good tracks numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_fishboneCleaner - <<>>(this->device_theCells_.get(), this->device_nCells_, quality_d); + <<>>(this->device_theCells_.get(), this->device_nCells_, tracks_view); cudaCheck(cudaGetLastError()); } // mark duplicates (tracks that share a doublet) numberOfBlocks = this->nDoubletBlocks(blockSize); kernel_fastDuplicateRemover<<>>( - this->device_theCells_.get(), this->device_nCells_, tracks_d, this->params_.dupPassThrough_); + this->device_theCells_.get(), this->device_nCells_, tracks_view, this->params_.dupPassThrough_); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -289,7 +281,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co assert(this->hitToTupleView_.offSize > nhits); numberOfBlocks = this->nQuadrupletBlocks(blockSize); kernel_countHitInTracks - <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); + <<>>(tracks_view, this->device_hitToTuple_.get()); //CHECK cudaCheck(cudaGetLastError()); assert((this->hitToTupleView_.assoc == this->device_hitToTuple_.get()) && (this->hitToTupleView_.offStorage == this->device_hitToTupleStorage_.get()) && @@ -297,7 +289,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co cms::cuda::launchFinalize(this->hitToTupleView_, cudaStream); cudaCheck(cudaGetLastError()); kernel_fillHitInTracks - <<>>(tuples_d, quality_d, this->device_hitToTuple_.get()); + <<>>(tracks_view, this->device_hitToTuple_.get()); cudaCheck(cudaGetLastError()); #ifdef GPU_DEBUG cudaCheck(cudaDeviceSynchronize()); @@ -308,32 +300,25 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co // mark duplicates (tracks that share at least one hit) numberOfBlocks = (this->hitToTupleView_.offSize + blockSize - 1) / blockSize; - kernel_rejectDuplicate - <<>>(tracks_d, - quality_d, - this->params_.minHitsForSharingCut_, - this->params_.dupPassThrough_, - this->device_hitToTuple_.get()); + kernel_rejectDuplicate<<>>( + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); kernel_sharedHitCleaner - <<>>(hh.view(), - tracks_d, - quality_d, + <<>>(hh, + tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); if (this->params_.useSimpleTripletCleaner_) { kernel_simpleTripletCleaner - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); } else { kernel_tripletCleaner - <<>>(tracks_d, - quality_d, + <<>>(tracks_view, this->params_.minHitsForSharingCut_, this->params_.dupPassThrough_, this->device_hitToTuple_.get()); @@ -347,7 +332,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co if (this->params_.doStats_) { numberOfBlocks = (std::max(nhits, int(this->params_.cellCuts_.maxNumberOfDoublets_)) + blockSize - 1) / blockSize; kernel_checkOverflows - <<>>(tuples_d, + <<>>(tracks_view, this->device_tupleMultiplicity_.get(), 
this->device_hitToTuple_.get(), this->device_hitTuple_apc_, @@ -370,7 +355,7 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co cudaCheck(cudaGetLastError()); numberOfBlocks = (3 * TrackerTraits::maxNumberOfQuadruplets / 4 + blockSize - 1) / blockSize; kernel_doStatsForTracks - <<>>(tuples_d, quality_d, this->counters_); + <<>>(tracks_view, this->counters_); //why sometimes yes and some no? cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG @@ -385,14 +370,13 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU co std::lock_guard guard(lock); ++iev; for (int k = 0; k < 20000; k += 500) { - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), k, k + 500, iev); - cudaDeviceSynchronize(); + kernel_print_found_ntuplets + <<<1, 32, 0, cudaStream>>>(hh, tracks_view, this->device_hitToTuple_.get(), k, k + 500, iev); + cudaCheck(cudaStreamSynchronize(cudaStream)); } - kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>( - hh.view(), tuples_d, tracks_d, quality_d, this->device_hitToTuple_.get(), 20000, 1000000, iev); - cudaDeviceSynchronize(); - // cudaStreamSynchronize(cudaStream); + kernel_print_found_ntuplets + <<<1, 32, 0, cudaStream>>>(hh, tracks_view, this->device_hitToTuple_.get(), 20000, 1000000, iev); + cudaCheck(cudaStreamSynchronize(cudaStream)); } #endif } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index b595106299d71..f019283b90469 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -3,9 +3,15 @@ // #define GPU_DEBUG -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" #include "gpuPixelDoublets.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" + // #define DUMP_GPU_TK_TUPLES namespace caHitNtupletGenerator { @@ -201,8 +207,9 @@ class CAHitNtupletGeneratorKernels { template using unique_ptr = typename Traits::template unique_ptr; - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; + using HitsView = TrackingRecHitSoAView; + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = TrackSoAView; using HitToTuple = caStructures::HitToTupleT; using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -216,8 +223,7 @@ class CAHitNtupletGeneratorKernels { using CACell = GPUCACellT; using Quality = pixelTrack::Quality; - using TkSoA = pixelTrack::TrackSoAT; - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename TrackSoA::HitContainer; CAHitNtupletGeneratorKernels(Params const& params) : params_(params), paramsMaxDoubletes3Quarters_(3 * params.cellCuts_.maxNumberOfDoublets_ / 4) {} @@ -226,11 +232,11 @@ class CAHitNtupletGeneratorKernels { TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); } - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); - void 
classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); + void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void buildDoublets(const HitsConstView& hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); void cleanup(cudaStream_t cudaStream); @@ -283,20 +289,24 @@ class CAHitNtupletGeneratorKernels { template class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; - using HitContainer = pixelTrack::HitContainerT; + using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitContainer = typename TrackSoA::HitContainer; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; using HitToTuple = caStructures::HitToTupleT; using CellTracksVector = caStructures::CellTracksVectorT; using TupleMultiplicity = caStructures::TupleMultiplicityT; - using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = TrackSoAView; public: - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void buildDoublets(const HitsConstView& hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); static void printCounters(Counters const* counters); }; @@ -304,19 +314,24 @@ class CAHitNtupletGeneratorKernelsGPU : public CAHitNtupletGeneratorKernels class CAHitNtupletGeneratorKernelsCPU : public CAHitNtupletGeneratorKernels { using CAHitNtupletGeneratorKernels::CAHitNtupletGeneratorKernels; - using HitsOnCPU = TrackingRecHit2DHeterogeneousT; - using TkSoA = pixelTrack::TrackSoAT; + using Counters = caHitNtupletGenerator::Counters; + using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitContainer = typename TrackSoA::HitContainer; + using CellNeighborsVector = caStructures::CellNeighborsVectorT; using HitToTuple = caStructures::HitToTupleT; using CellTracksVector = caStructures::CellTracksVectorT; using TupleMultiplicity = caStructures::TupleMultiplicityT; - using CAParams = caHitNtupletGenerator::CAParamsT; + + using HitsConstView = TrackingRecHitSoAConstView; + using TkSoAView = TrackSoAView; public: - void launchKernels(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void classifyTuples(HitsOnCPU const& hh, TkSoA* tuples_d, cudaStream_t cudaStream); - void buildDoublets(HitsOnCPU const& hh, cudaStream_t stream); + void launchKernels(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void classifyTuples(const HitsConstView& hh, TkSoAView& track_view, cudaStream_t cudaStream); + void buildDoublets(const HitsConstView& hh, int32_t offsetBPIX2, cudaStream_t stream); void allocateOnGPU(int32_t nHits, cudaStream_t stream); static void printCounters(Counters const* counters); }; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h 
b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h index 03112e0f3fc48..85386305eca6a 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsImpl.h @@ -15,6 +15,9 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" + #include "CAStructures.h" #include "CAHitNtupletGeneratorKernels.h" #include "GPUCACell.h" @@ -28,8 +31,6 @@ namespace caHitNtupletGeneratorKernels { constexpr float nSigma2 = 25.f; //all of these below are mostly to avoid brining around the relative namespace - template - using HitsView = TrackingRecHit2DSOAViewT; template using HitToTuple = caStructures::HitToTupleT; @@ -49,13 +50,13 @@ namespace caHitNtupletGeneratorKernels { using Quality = pixelTrack::Quality; template - using TkSoA = pixelTrack::TrackSoAT; + using TkSoAView = TrackSoAView; template - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename TrackSoA::HitContainer; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template using QualityCuts = pixelTrack::QualityCutsT; @@ -66,7 +67,7 @@ namespace caHitNtupletGeneratorKernels { using Counters = caHitNtupletGenerator::Counters; template - __global__ void kernel_checkOverflows(HitContainer const *foundNtuplets, + __global__ void kernel_checkOverflows(TkSoAView tracks_view, TupleMultiplicity const *tupleMultiplicity, HitToTuple const *hitToTuple, cms::cuda::AtomicPairCounter *apc, @@ -99,16 +100,16 @@ namespace caHitNtupletGeneratorKernels { nHits, hitToTuple->totOnes()); if (apc->get().m < TrackerTraits::maxNumberOfQuadruplets) { - assert(foundNtuplets->size(apc->get().m) == 0); - assert(foundNtuplets->size() == apc->get().n); + assert(tracks_view.hitIndices().size(apc->get().m) == 0); + assert(tracks_view.hitIndices().size() == apc->get().n); } } - for (int idx = first, nt = foundNtuplets->nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { - if (foundNtuplets->size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit - printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx)); - assert(foundNtuplets->size(idx) <= TrackerTraits::maxHitsOnTrack); - for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih) + for (int idx = first, nt = tracks_view.hitIndices().nOnes(); idx < nt; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) > TrackerTraits::maxHitsOnTrack) // current real limit + printf("ERROR %d, %d\n", idx, tracks_view.hitIndices().size(idx)); + assert(ftracks_view.hitIndices().size(idx) <= TrackerTraits::maxHitsOnTrack); + for (auto ih = tracks_view.hitIndices().begin(idx); ih != tracks_view.hitIndices().end(idx); ++ih) assert(int(*ih) < nHits); } #endif @@ -168,7 +169,7 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_fishboneCleaner(GPUCACellT const *cells, uint32_t const *__restrict__ nCells, - Quality *quality) { + TkSoAView tracks_view) { constexpr auto reject = pixelTrack::Quality::dup; auto first = threadIdx.x + blockIdx.x * blockDim.x; @@ -178,7 +179,7 @@ namespace caHitNtupletGeneratorKernels { continue; for (auto it : thisCell.tracks()) - quality[it] = reject; + tracks_view[it].quality() = reject; } } @@ -187,14 +188,11 @@ 
namespace caHitNtupletGeneratorKernels { template __global__ void kernel_earlyDuplicateRemover(GPUCACellT const *cells, uint32_t const *__restrict__ nCells, - TkSoA const *__restrict__ ptracks, - Quality *quality, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected constexpr auto reject = pixelTrack::Quality::edup; /// cannot be loose - auto const &tracks = *ptracks; - assert(nCells); auto first = threadIdx.x + blockIdx.x * blockDim.x; for (int idx = first, nt = (*nCells); idx < nt; idx += gridDim.x * blockDim.x) { @@ -207,7 +205,7 @@ namespace caHitNtupletGeneratorKernels { // find maxNl for (auto it : thisCell.tracks()) { - auto nl = tracks.nLayers(it); + auto nl = tracks_view[it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -216,8 +214,8 @@ namespace caHitNtupletGeneratorKernels { // maxNl = std::min(4, maxNl); for (auto it : thisCell.tracks()) { - if (tracks.nLayers(it) < maxNl) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].nLayers() < maxNl) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } } @@ -226,7 +224,7 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_fastDuplicateRemover(GPUCACellT const *__restrict__ cells, uint32_t const *__restrict__ nCells, - TkSoA *__restrict__ tracks, + TkSoAView tracks_view, bool dupPassThrough) { // quality to mark rejected auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; @@ -243,45 +241,37 @@ namespace caHitNtupletGeneratorKernels { float mc = maxScore; uint16_t im = tkNotFound; - /* chi2 penalize higher-pt tracks (try rescale it?) - auto score = [&](auto it) { - return tracks->nLayers(it) < 4 ? - std::abs(tracks->tip(it)) : // tip for triplets - tracks->chi2(it); //chi2 for quads - }; - */ - - auto score = [&](auto it) { return std::abs(tracks->tip(it)); }; + auto score = [&](auto it) { return std::abs(TracksUtilities::tip(tracks_view, it)); }; // full crazy combinatorics // full crazy combinatorics int ntr = thisCell.tracks().size(); for (int i = 0; i < ntr - 1; ++i) { auto it = thisCell.tracks()[i]; - auto qi = tracks->quality(it); + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks->stateAtBS.state(it)(2); - auto e2opi = tracks->stateAtBS.covariance(it)(9); - auto cti = tracks->stateAtBS.state(it)(3); - auto e2cti = tracks->stateAtBS.covariance(it)(12); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); for (auto j = i + 1; j < ntr; ++j) { auto jt = thisCell.tracks()[j]; - auto qj = tracks->quality(jt); + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks->stateAtBS.state(jt)(2); - auto ctj = tracks->stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks->stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks->stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; if ((qj < qi) || (qj == qi && score(it) < score(jt))) - tracks->quality(jt) = reject; + tracks_view[jt].quality() = reject; else { - tracks->quality(it) = reject; + tracks_view[it].quality() = reject; break; } } 
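The hunks above all apply the same migration: instead of receiving a TkSoA const * plus a separate Quality * array (and dereferencing it with the now-removed auto const &tracks = *ptracks;), the kernels take a TkSoAView by value and read or write per-track fields through element accessors such as tracks_view[it].quality() and tracks_view[it].state()(2), with free helpers like TracksUtilities::tip(tracks_view, it) replacing the old member calls. A minimal, self-contained CUDA sketch of that calling convention follows; TrackViewSketch, its Element proxy and markLowQuality are hypothetical stand-ins written only for illustration, not the generated SoA classes used in this pull request.

#include <cstdint>
#include <cuda_runtime.h>

// Hand-rolled stand-in for a generated SoA view: a few raw column pointers plus a
// size, passed to the kernel by value (no extra device-pointer indirection).
struct TrackViewSketch {
  float* pt;         // transverse-momentum column
  uint8_t* quality;  // quality-flag column
  int nTracks;

  // Proxy so that view[i].pt() / view[i].quality() reads like the views above.
  struct Element {
    float& ptRef;
    uint8_t& qualityRef;
    __device__ float& pt() const { return ptRef; }
    __device__ uint8_t& quality() const { return qualityRef; }
  };
  __device__ Element operator[](int i) const { return Element{pt[i], quality[i]}; }
};

// Grid-stride kernel in the same style as the cleaners above: flag every track
// below ptMin as rejected (quality = 0). Plain store of a constant, no race.
__global__ void markLowQuality(TrackViewSketch view, float ptMin) {
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int i = first; i < view.nTracks; i += gridDim.x * blockDim.x) {
    if (view[i].pt() < ptMin)
      view[i].quality() = 0;
  }
}

int main() {
  constexpr int n = 1024;
  TrackViewSketch view{nullptr, nullptr, n};
  cudaMalloc(&view.pt, n * sizeof(float));
  cudaMalloc(&view.quality, n * sizeof(uint8_t));
  cudaMemset(view.pt, 0, n * sizeof(float));         // all pt = 0, so everything is rejected
  cudaMemset(view.quality, 1, n * sizeof(uint8_t));  // start as "accepted"
  markLowQuality<<<(n + 255) / 256, 256>>>(view, 0.5f);
  cudaDeviceSynchronize();
  cudaFree(view.pt);
  cudaFree(view.quality);
  return 0;
}

Passing the view by value is cheap (a handful of column pointers and a size), which is why the kernels above can drop both the pointer parameter and the local dereference without changing their launch configuration.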
@@ -290,8 +280,8 @@ namespace caHitNtupletGeneratorKernels { // find maxQual auto maxQual = reject; // no duplicate! for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > maxQual) - maxQual = tracks->quality(it); + if (tracks_view[it].quality() > maxQual) + maxQual = tracks_view[it].quality(); } if (maxQual <= loose) @@ -299,7 +289,7 @@ namespace caHitNtupletGeneratorKernels { // find min score for (auto it : thisCell.tracks()) { - if (tracks->quality(it) == maxQual && score(it) < mc) { + if (tracks_view[it].quality() == maxQual && score(it) < mc) { mc = score(it); im = it; } @@ -310,8 +300,8 @@ namespace caHitNtupletGeneratorKernels { // mark all other duplicates (not yet, keep it loose) for (auto it : thisCell.tracks()) { - if (tracks->quality(it) > loose && it != im) - tracks->quality(it) = loose; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > loose && it != im) + tracks_view[it].quality() = loose; //no race: simple assignment of the same constant } } } @@ -319,14 +309,13 @@ namespace caHitNtupletGeneratorKernels { template __global__ void kernel_connect(cms::cuda::AtomicPairCounter *apc1, cms::cuda::AtomicPairCounter *apc2, // just to zero them, - Hits const *__restrict__ hhp, + HitsConstView hh, GPUCACellT *cells, uint32_t const *__restrict__ nCells, CellNeighborsVector *cellNeighbors, OuterHitOfCell const isOuterHitOfCell, CAParams params) { using Cell = GPUCACellT; - auto const &hh = *hhp; auto firstCellIndex = threadIdx.y + blockIdx.y * blockDim.y; auto first = threadIdx.x; @@ -383,16 +372,14 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_find_ntuplets(Hits const *__restrict__ hhp, + __global__ void kernel_find_ntuplets(HitsConstView hh, + TkSoAView tracks_view, GPUCACellT *__restrict__ cells, uint32_t const *nCells, CellTracksVector *cellTracks, - HitContainer *foundNtuplets, cms::cuda::AtomicPairCounter *apc, - Quality *__restrict__ quality, CAParams params) { // recursive: not obvious to widen - auto const &hh = *hhp; using Cell = GPUCACellT; @@ -422,8 +409,15 @@ namespace caHitNtupletGeneratorKernels { bool bpix1Start = params.startAt0(pid); - thisCell.template find_ntuplets( - hh, cells, *cellTracks, *foundNtuplets, *apc, quality, stack, params.minHitsPerNtuplet_, bpix1Start); + thisCell.template find_ntuplets(hh, + cells, + *cellTracks, + tracks_view.hitIndices(), + *apc, + tracks_view.quality(), + stack, + params.minHitsPerNtuplet_, + bpix1Start); assert(stack.empty()); } @@ -441,17 +435,16 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + __global__ void kernel_countMultiplicity(TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); if (nhits > TrackerTraits::maxHitsOnTrack) // current limit printf("wrong mult %d %d\n", it, nhits); assert(nhits <= 
TrackerTraits::maxHitsOnTrack); @@ -460,17 +453,16 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets, - Quality const *__restrict__ quality, + __global__ void kernel_fillMultiplicity(TkSoAView tracks_view, TupleMultiplicity *tupleMultiplicity) { auto first = blockIdx.x * blockDim.x + threadIdx.x; - for (int it = first, nt = foundNtuplets->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = foundNtuplets->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits < 3) continue; - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); if (nhits > TrackerTraits::maxHitsOnTrack) printf("wrong mult %d %d\n", it, nhits); assert(nhits <= TrackerTraits::maxHitsOnTrack); @@ -478,22 +470,21 @@ namespace caHitNtupletGeneratorKernels { } } + ///TODO : why there was quality here? template - __global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples, - TkSoA const *__restrict__ tracks, - QualityCuts cuts, - Quality *__restrict__ quality) { + __global__ void kernel_classifyTracks(TkSoAView tracks_view, QualityCuts cuts) { + // Quality *__restrict__ quality) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int it = first, nt = tuples->nOnes(); it < nt; it += gridDim.x * blockDim.x) { - auto nhits = tuples->size(it); + for (int it = first, nt = tracks_view.hitIndices().nOnes(); it < nt; it += gridDim.x * blockDim.x) { + auto nhits = tracks_view.hitIndices().size(it); if (nhits == 0) break; // guard // if duplicate: not even fit - if (quality[it] == pixelTrack::Quality::edup) + if (tracks_view[it].quality() == pixelTrack::Quality::edup) continue; - assert(quality[it] == pixelTrack::Quality::bad); + assert(tracks_view[it].quality() == pixelTrack::Quality::bad); // mark doublets as bad if (nhits < 3) @@ -502,101 +493,91 @@ namespace caHitNtupletGeneratorKernels { // if the fit has any invalid parameters, mark it as bad bool isNaN = false; for (int i = 0; i < 5; ++i) { - isNaN |= std::isnan(tracks->stateAtBS.state(it)(i)); + isNaN |= std::isnan(tracks_view[it].state()(i)); } if (isNaN) { #ifdef NTUPLE_DEBUG - printf("NaN in fit %d size %d chi2 %f\n", it, tuples->size(it), tracks->chi2(it)); + printf("NaN in fit %d size %d chi2 %f\n", it, tracks_view.hitIndices().size(it), tracks_view[it].chi2()); #endif continue; } - quality[it] = pixelTrack::Quality::strict; + tracks_view[it].quality() = pixelTrack::Quality::strict; - if (cuts.strictCut(tracks, it)) + if (cuts.strictCut(tracks_view, it)) continue; - quality[it] = pixelTrack::Quality::tight; + tracks_view[it].quality() = pixelTrack::Quality::tight; - if (cuts.isHP(tracks, nhits, it)) - quality[it] = pixelTrack::Quality::highPurity; + if (cuts.isHP(tracks_view, nhits, it)) + tracks_view[it].quality() = pixelTrack::Quality::highPurity; } } template - __global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - Counters *counters) { + __global__ void kernel_doStatsForTracks(TkSoAView tracks_view, Counters *counters) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if 
(tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; //guard - if (quality[idx] < pixelTrack::Quality::loose) + if (tracks_view[idx].quality() < pixelTrack::Quality::loose) continue; atomicAdd(&(counters->nLooseTracks), 1); - if (quality[idx] < pixelTrack::Quality::strict) + if (tracks_view[idx].quality() < pixelTrack::Quality::strict) continue; atomicAdd(&(counters->nGoodTracks), 1); } } template - __global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - HitToTuple *hitToTuple) { + __global__ void kernel_countHitInTracks(TkSoAView tracks_view, HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->count(*h); } } template - __global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples, - Quality const *__restrict__ quality, - HitToTuple *hitToTuple) { + __global__ void kernel_fillHitInTracks(TkSoAView tracks_view, HitToTuple *hitToTuple) { int first = blockDim.x * blockIdx.x + threadIdx.x; - for (int idx = first, ntot = tuples->nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - if (tuples->size(idx) == 0) + for (int idx = first, ntot = tracks_view.hitIndices().nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + if (tracks_view.hitIndices().size(idx) == 0) break; // guard - for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h) + for (auto h = tracks_view.hitIndices().begin(idx); h != tracks_view.hitIndices().end(idx); ++h) hitToTuple->fill(*h, idx); } } template - __global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples, - HitsView const *__restrict__ hhp, - HitContainer *__restrict__ hitDetIndices) { + __global__ void kernel_fillHitDetIndices(TkSoAView tracks_view, HitsConstView hh) { int first = blockDim.x * blockIdx.x + threadIdx.x; // copy offsets - for (int idx = first, ntot = tuples->totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { - hitDetIndices->off[idx] = tuples->off[idx]; + for (int idx = first, ntot = tracks_view.hitIndices().totOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { + tracks_view.detIndices().off[idx] = tracks_view.hitIndices().off[idx]; } // fill hit indices - auto const &hh = *hhp; auto nhits = hh.nHits(); - for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) { - assert(tuples->content[idx] < nhits); - hitDetIndices->content[idx] = hh.detectorIndex(tuples->content[idx]); + for (int idx = first, ntot = tracks_view.hitIndices().size(); idx < ntot; idx += gridDim.x * blockDim.x) { + assert(tracks_view.hitIndices().content[idx] < nhits); + tracks_view.detIndices().content[idx] = hh[tracks_view.hitIndices().content[idx]].detectorIndex(); } } template - __global__ void kernel_fillNLayers(TkSoA *__restrict__ ptracks, cms::cuda::AtomicPairCounter *apc) { - auto &tracks = *ptracks; + __global__ void kernel_fillNLayers(TkSoAView tracks_view, cms::cuda::AtomicPairCounter *apc) { auto first = 
blockIdx.x * blockDim.x + threadIdx.x; // clamp the number of tracks to the capacity of the SoA - auto ntracks = std::min(apc->get().m, tracks.stride() - 1); + auto ntracks = std::min(apc->get().m, tracks_view.metadata().size() - 1); if (0 == first) - tracks.setNTracks(ntracks); + tracks_view.nTracks() = ntracks; for (int idx = first, nt = ntracks; idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + auto nHits = TracksUtilities::nHits(tracks_view, idx); assert(nHits >= 3); - tracks.nLayers(idx) = tracks.computeNumberOfLayers(idx); + tracks_view[idx].nLayers() = TracksUtilities::computeNumberOfLayers(tracks_view, idx); } } @@ -677,8 +658,7 @@ namespace caHitNtupletGeneratorKernels { // mostly for very forward triplets..... template - __global__ void kernel_rejectDuplicate(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_rejectDuplicate(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -686,50 +666,43 @@ namespace caHitNtupletGeneratorKernels { auto const reject = dupPassThrough ? pixelTrack::Quality::loose : pixelTrack::Quality::dup; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { if (hitToTuple.size(idx) < 2) continue; - /* chi2 is bad for large pt - auto score = [&](auto it, auto nl) { - return nl < 4 ? std::abs(tracks.tip(it)) : // tip for triplets - tracks.chi2(it); //chi2 - }; - */ - auto score = [&](auto it, auto nl) { return std::abs(tracks.tip(it)); }; + auto score = [&](auto it, auto nl) { return std::abs(TracksUtilities::tip(tracks_view, it)); }; // full combinatorics for (auto ip = hitToTuple.begin(idx); ip < hitToTuple.end(idx) - 1; ++ip) { auto const it = *ip; - auto qi = quality[it]; + auto qi = tracks_view[it].quality(); if (qi <= reject) continue; - auto opi = tracks.stateAtBS.state(it)(2); - auto e2opi = tracks.stateAtBS.covariance(it)(9); - auto cti = tracks.stateAtBS.state(it)(3); - auto e2cti = tracks.stateAtBS.covariance(it)(12); - auto nli = tracks.nLayers(it); + auto opi = tracks_view[it].state()(2); + auto e2opi = tracks_view[it].covariance()(9); + auto cti = tracks_view[it].state()(3); + auto e2cti = tracks_view[it].covariance()(12); + auto nli = tracks_view[it].nLayers(); for (auto jp = ip + 1; jp < hitToTuple.end(idx); ++jp) { auto const jt = *jp; - auto qj = quality[jt]; + auto qj = tracks_view[jt].quality(); if (qj <= reject) continue; - auto opj = tracks.stateAtBS.state(jt)(2); - auto ctj = tracks.stateAtBS.state(jt)(3); - auto dct = nSigma2 * (tracks.stateAtBS.covariance(jt)(12) + e2cti); + auto opj = tracks_view[jt].state()(2); + auto ctj = tracks_view[jt].state()(3); + auto dct = nSigma2 * (tracks_view[jt].covariance()(12) + e2cti); if ((cti - ctj) * (cti - ctj) > dct) continue; - auto dop = nSigma2 * (tracks.stateAtBS.covariance(jt)(9) + e2opi); + auto dop = nSigma2 * (tracks_view[jt].covariance()(9) + e2opi); if ((opi - opj) * (opi - opj) > dop) continue; - auto nlj = tracks.nLayers(jt); + auto nlj = tracks_view[jt].nLayers(); if (nlj < nli || (nlj == nli && (qj < qi || (qj == qi && score(it, nli) < score(jt, nlj))))) - quality[jt] = reject; + tracks_view[jt].quality() = reject; else { - quality[it] = reject; + tracks_view[it].quality() = reject; break; } } @@ -738,9 +711,8 @@ namespace caHitNtupletGeneratorKernels { } template - __global__ void 
kernel_sharedHitCleaner(HitsView const *__restrict__ hhp, - TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_sharedHitCleaner(HitsConstView hh, + TkSoAView tracks_view, int nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -750,9 +722,7 @@ namespace caHitNtupletGeneratorKernels { auto const longTqual = pixelTrack::Quality::highPurity; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; - auto const &hh = *hhp; int l1end = hh.hitsLayerStart()[1]; int first = blockDim.x * blockIdx.x + threadIdx.x; @@ -764,10 +734,10 @@ namespace caHitNtupletGeneratorKernels { // find maxNl for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] < longTqual) + if (tracks_view[*it].quality() < longTqual) continue; - // if (tracks.nHits(*it)==3) continue; - auto nl = tracks.nLayers(*it); + // if (tracks_view[*it].nHits()==3) continue; + auto nl = tracks_view[*it].nLayers(); maxNl = std::max(nl, maxNl); } @@ -779,21 +749,20 @@ namespace caHitNtupletGeneratorKernels { // kill all tracks shorter than maxHl (only triplets??? for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - auto nl = tracks.nLayers(*it); + auto nl = tracks_view[*it].nLayers(); //checking if shared hit is on bpix1 and if the tuple is short enough if (idx < l1end and nl > nmin) continue; - if (nl < maxNl && quality[*it] > reject) - quality[*it] = reject; + if (nl < maxNl && tracks_view[*it].quality() > reject) + tracks_view[*it].quality() = reject; } } } template - __global__ void kernel_tripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_tripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -803,7 +772,6 @@ namespace caHitNtupletGeneratorKernels { auto const good = pixelTrack::Quality::strict; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -816,9 +784,9 @@ namespace caHitNtupletGeneratorKernels { // check if only triplets for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) { - if (quality[*it] <= good) + if (tracks_view[*it].quality() <= good) continue; - onlyTriplets &= tracks.isTriplet(*it); + onlyTriplets &= TracksUtilities::isTriplet(tracks_view, *it); if (!onlyTriplets) break; } @@ -830,8 +798,8 @@ namespace caHitNtupletGeneratorKernels { // for triplets choose best tip! (should we first find best quality???) 
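kernel_sharedHitCleaner, kernel_tripletCleaner and kernel_simpleTripletCleaner share one arbitration pattern: for every hit, walk the tracks associated to it through the HitToTuple histogram, pick the best surviving candidate (smallest |tip| among those at or above a quality threshold), and demote the others to the reject quality. The sketch below isolates that two-pass pattern; HitToTracksSketch, demoteAmbiguities and the CSR-style off/trackIdx layout are assumptions made for the example (a stand-in for the one-to-many associator, whose real interface is not reproduced here), and the quality codes are plain integers with reject < good.

#include <cfloat>
#include <cmath>
#include <cstdint>
#include <cuda_runtime.h>

// Stand-in for the hit-to-track association used by the cleaners above: tracks
// sharing hit h are trackIdx[off[h] .. off[h+1]).
struct HitToTracksSketch {
  const uint32_t* off;       // nHits + 1 offsets
  const uint32_t* trackIdx;  // flattened track indices
  int nHits;
};

// For every hit shared by >= 2 tracks: keep the candidate with the smallest score
// (here |tip|, as in the cleaners) among those at or above `good`, demote the rest.
__global__ void demoteAmbiguities(HitToTracksSketch assoc,
                                  const float* __restrict__ tip,
                                  uint8_t* quality,
                                  uint8_t good,
                                  uint8_t reject) {
  constexpr uint32_t notFound = 0xffffffffu;
  int first = blockIdx.x * blockDim.x + threadIdx.x;
  for (int h = first; h < assoc.nHits; h += gridDim.x * blockDim.x) {
    uint32_t begin = assoc.off[h];
    uint32_t end = assoc.off[h + 1];
    if (end - begin < 2)
      continue;  // hit used by a single candidate: nothing to arbitrate

    // pass 1: best (lowest |tip|) track among those still at or above `good`
    float best = FLT_MAX;
    uint32_t keep = notFound;
    for (uint32_t p = begin; p < end; ++p) {
      uint32_t t = assoc.trackIdx[p];
      float s = fabsf(tip[t]);
      if (quality[t] >= good && s < best) {
        best = s;
        keep = t;
      }
    }
    if (keep == notFound)
      continue;

    // pass 2: demote every other still-accepted track sharing this hit
    for (uint32_t p = begin; p < end; ++p) {
      uint32_t t = assoc.trackIdx[p];
      if (t != keep && quality[t] > reject)
        quality[t] = reject;  // only ever stores the same constant
    }
  }
}

As in the kernels above, the demotions only ever store the same constant value, which is the property the original code relies on when it annotates these writes with "no race: simple assignment of the same constant".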
for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(TracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(TracksUtilities::tip(tracks_view, it)); im = it; } } @@ -842,16 +810,15 @@ namespace caHitNtupletGeneratorKernels { // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } template - __global__ void kernel_simpleTripletCleaner(TkSoA const *__restrict__ ptracks, - Quality *__restrict__ quality, + __global__ void kernel_simpleTripletCleaner(TkSoAView tracks_view, uint16_t nmin, bool dupPassThrough, HitToTuple const *__restrict__ phitToTuple) { @@ -861,7 +828,6 @@ namespace caHitNtupletGeneratorKernels { auto const good = pixelTrack::Quality::loose; auto &hitToTuple = *phitToTuple; - auto const &tracks = *ptracks; int first = blockDim.x * blockIdx.x + threadIdx.x; for (int idx = first, ntot = hitToTuple.nOnes(); idx < ntot; idx += gridDim.x * blockDim.x) { @@ -874,8 +840,8 @@ namespace caHitNtupletGeneratorKernels { // choose best tip! (should we first find best quality???) for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] >= good && std::abs(tracks.tip(it)) < mc) { - mc = std::abs(tracks.tip(it)); + if (tracks_view[it].quality() >= good && std::abs(TracksUtilities::tip(tracks_view, it)) < mc) { + mc = std::abs(TracksUtilities::tip(tracks_view, it)); im = it; } } @@ -886,53 +852,50 @@ namespace caHitNtupletGeneratorKernels { // mark worse ambiguities for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) { auto const it = *ip; - if (quality[it] > reject && tracks.isTriplet(it) && it != im) - quality[it] = reject; //no race: simple assignment of the same constant + if (tracks_view[it].quality() > reject && TracksUtilities::isTriplet(tracks_view, it) && + it != im) + tracks_view[it].quality() = reject; //no race: simple assignment of the same constant } } // loop over hits } template - __global__ void kernel_print_found_ntuplets(HitsView const *__restrict__ hhp, - HitContainer const *__restrict__ ptuples, - TkSoA const *__restrict__ ptracks, - Quality const *__restrict__ quality, + __global__ void kernel_print_found_ntuplets(HitsConstView hh, + TkSoAView tracks_view, HitToTuple const *__restrict__ phitToTuple, int32_t firstPrint, int32_t lastPrint, int iev) { constexpr auto loose = pixelTrack::Quality::loose; - auto const &hh = *hhp; - auto const &foundNtuplets = *ptuples; - auto const &tracks = *ptracks; + int first = firstPrint + blockDim.x * blockIdx.x + threadIdx.x; - for (int i = first, np = std::min(lastPrint, foundNtuplets.nOnes()); i < np; i += blockDim.x * gridDim.x) { - auto nh = foundNtuplets.size(i); + for (int i = first, np = std::min(lastPrint, tracks_view.hitIndices().nOnes()); i < np; + i += blockDim.x * gridDim.x) { + auto nh = tracks_view.hitIndices().size(i); if (nh < 3) continue; - if (quality[i] < loose) + if (tracks_view[i].quality() < loose) continue; printf("TK: %d %d %d %d %f %f %f %f %f %f %f %.3f %.3f %.3f %.3f %.3f %.3f %.3f\n", 10000 * iev + i, - 
int(quality[i]), + int(tracks_view[i].quality()), nh, - tracks.nLayers(i), - tracks.charge(i), - tracks.pt(i), - tracks.eta(i), - tracks.phi(i), - tracks.tip(i), - tracks.zip(i), - // asinhf(fit_results[i].par(3)), - tracks.chi2(i), - hh.zGlobal(*foundNtuplets.begin(i)), - hh.zGlobal(*(foundNtuplets.begin(i) + 1)), - hh.zGlobal(*(foundNtuplets.begin(i) + 2)), - nh > 3 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 3))) : 0, - nh > 4 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 4))) : 0, - nh > 5 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + 5))) : 0, - nh > 6 ? hh.zGlobal(int(*(foundNtuplets.begin(i) + nh - 1))) : 0); + tracks_view[i].nLayers(), + TracksUtilities::charge(tracks_view, i), + tracks_view[i].pt(), + tracks_view[i].eta(), + TracksUtilities::phi(tracks_view, i), + TracksUtilities::tip(tracks_view, i), + TracksUtilities::zip(tracks_view, i), + tracks_view[i].chi2(), + hh[*tracks_view.hitIndices().begin(i)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 1)].zGlobal(), + hh[*(tracks_view.hitIndices().begin(i) + 2)].zGlobal(), + nh > 3 ? hh[int(*(tracks_view.hitIndices().begin(i) + 3))].zGlobal() : 0, + nh > 4 ? hh[int(*(tracks_view.hitIndices().begin(i) + 4))].zGlobal() : 0, + nh > 5 ? hh[int(*(tracks_view.hitIndices().begin(i) + 5))].zGlobal() : 0, + nh > 6 ? hh[int(*(tracks_view.hitIndices().begin(i) + nh - 1))].zGlobal() : 0); } } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index 6d9ac785155d2..f499a6c90d384 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -21,6 +21,12 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "TrackingTools/DetLayers/interface/BarrelDetLayer.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" + #include "CAHitNtupletGeneratorOnGPU.h" namespace { @@ -66,25 +72,25 @@ namespace { (float)cfg.getParameter("dcaCutOuterTriplet")}}; }; - static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + static constexpr pixelTrack::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { auto coeff = pset.getParameter>("chi2Coeff"); auto ptMax = pset.getParameter("chi2MaxPt"); coeff[1] = (coeff[1] - coeff[0]) / log2(ptMax); - return QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut - {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, - // max pT used to determine the chi2 cut - (float)ptMax, - // chi2 scale factor: 8 for broken line fit, ?? for Riemann fit - (float)pset.getParameter("chi2Scale"), - // regional cuts for triplets - {(float)pset.getParameter("tripletMaxTip"), - (float)pset.getParameter("tripletMinPt"), - (float)pset.getParameter("tripletMaxZip")}, - // regional cuts for quadruplets - {(float)pset.getParameter("quadrupletMaxTip"), - (float)pset.getParameter("quadrupletMinPt"), - (float)pset.getParameter("quadrupletMaxZip")}}; + return pixelTrack::QualityCutsT{// polynomial coefficients for the pT-dependent chi2 cut + {(float)coeff[0], (float)coeff[1], 0.f, 0.f}, + // max pT used to determine the chi2 cut + (float)ptMax, + // chi2 scale factor: 8 for broken line fit, ?? 
for Riemann fit + (float)pset.getParameter("chi2Scale"), + // regional cuts for triplets + {(float)pset.getParameter("tripletMaxTip"), + (float)pset.getParameter("tripletMinPt"), + (float)pset.getParameter("tripletMaxZip")}, + // regional cuts for quadruplets + {(float)pset.getParameter("quadrupletMaxTip"), + (float)pset.getParameter("quadrupletMinPt"), + (float)pset.getParameter("quadrupletMaxZip")}}; } }; @@ -101,8 +107,8 @@ namespace { {(bool)cfg.getParameter("includeFarForwards")}}; } - static constexpr QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { - return QualityCutsT{ + static constexpr pixelTrack::QualityCutsT makeQualityCuts(edm::ParameterSet const& pset) { + return pixelTrack::QualityCutsT{ (float)pset.getParameter("maxChi2"), (float)pset.getParameter("minPt"), (float)pset.getParameter("maxTip"), @@ -274,37 +280,30 @@ void CAHitNtupletGeneratorOnGPU::endJob() { } template -PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuplesAsync( +TrackSoAHeterogeneousDevice CAHitNtupletGeneratorOnGPU::makeTuplesAsync( HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const { using HelixFitOnGPU = HelixFitOnGPU; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = TrackSoAHeterogeneousDevice; using GPUKernels = CAHitNtupletGeneratorKernelsGPU; - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); - - auto* soa = tracks.get(); - assert(soa); - cudaCheck(cudaGetLastError()); + TrackSoA tracks(stream); GPUKernels kernels(m_params); kernels.setCounters(m_counters); kernels.allocateOnGPU(hits_d.nHits(), stream); - cudaCheck(cudaGetLastError()); - kernels.buildDoublets(hits_d, stream); - cudaCheck(cudaGetLastError()); + kernels.buildDoublets(hits_d.view(), hits_d.offsetBPIX2(), stream); - kernels.launchKernels(hits_d, soa, stream); - cudaCheck(cudaGetLastError()); + kernels.launchKernels(hits_d.view(), tracks.view(), stream); HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { fitter.launchRiemannKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } else { fitter.launchBrokenLineKernels(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets, stream); } - kernels.classifyTuples(hits_d, soa, stream); + kernels.classifyTuples(hits_d.view(), tracks.view(), stream); #ifdef GPU_DEBUG cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); @@ -315,47 +314,43 @@ PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU -PixelTrackHeterogeneousT CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_d, - float bfield) const { +TrackSoAHeterogeneousHost CAHitNtupletGeneratorOnGPU::makeTuples(HitsOnCPU const& hits_h, + float bfield) const { using HelixFitOnGPU = HelixFitOnGPU; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TrackSoA = TrackSoAHeterogeneousHost; using CPUKernels = CAHitNtupletGeneratorKernelsCPU; - PixelTrackHeterogeneous tracks(std::make_unique()); - - auto* soa = tracks.get(); - assert(soa); + TrackSoA tracks; CPUKernels kernels(m_params); kernels.setCounters(m_counters); - kernels.allocateOnGPU(hits_d.nHits(), nullptr); + kernels.allocateOnGPU(hits_h.nHits(), nullptr); - kernels.buildDoublets(hits_d, nullptr); - kernels.launchKernels(hits_d, soa, nullptr); + kernels.buildDoublets(hits_h.view(), hits_h.offsetBPIX2(), nullptr); + kernels.launchKernels(hits_h.view(), 
tracks.view(), nullptr); - if (0 == hits_d.nHits()) + if (0 == hits_h.nHits()) return tracks; // now fit HelixFitOnGPU fitter(bfield, m_params.fitNas4_); - fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa); + fitter.allocateOnGPU(kernels.tupleMultiplicity(), tracks.view()); if (m_params.useRiemannFit_) { - fitter.launchRiemannKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); + fitter.launchRiemannKernelsOnCPU(hits_h.view(), hits_h.nHits(), TrackerTraits::maxNumberOfQuadruplets); } else { - fitter.launchBrokenLineKernelsOnCPU(hits_d.view(), hits_d.nHits(), TrackerTraits::maxNumberOfQuadruplets); + fitter.launchBrokenLineKernelsOnCPU(hits_h.view(), hits_h.nHits(), TrackerTraits::maxNumberOfQuadruplets); } - kernels.classifyTuples(hits_d, soa, nullptr); + kernels.classifyTuples(hits_h.view(), tracks.view(), nullptr); #ifdef GPU_DEBUG std::cout << "finished building pixel tracks on CPU" << std::endl; #endif // check that the fixed-size SoA does not overflow - auto const& tsoa = *soa; - auto maxTracks = tsoa.stride(); - auto nTracks = tsoa.nTracks(); + auto maxTracks = tracks.view().metadata().size(); + auto nTracks = tracks.view().nTracks(); assert(nTracks < maxTracks); if (nTracks == maxTracks - 1) { edm::LogWarning("PixelTracks") << "Unsorted reconstructed pixel tracks truncated to " << maxTracks - 1 diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h index 745579b960b76..8ee65736541f3 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h @@ -2,8 +2,14 @@ #define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +// #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +// #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" + +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoAHost.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitSoADevice.h" #include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" @@ -24,20 +30,20 @@ namespace edm { template class CAHitNtupletGeneratorOnGPU { public: - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - - using HitsView = TrackingRecHit2DSOAViewT; - using HitsOnGPU = TrackingRecHit2DGPUT; - using HitsOnCPU = TrackingRecHit2DCPUT; - using hindex_type = typename HitsView::hindex_type; + using HitsView = TrackingRecHitSoAView; + using HitsConstView = TrackingRecHitSoAConstView; + using HitsOnGPU = TrackingRecHitSoADevice; //TODO move to OnDevice + using HitsOnCPU = TrackingRecHitSoAHost; //TODO move to OnHost + using hindex_type = typename TrackingRecHitSoA::hindex_type; using HitToTuple = caStructures::HitToTupleT; using TupleMultiplicity = caStructures::TupleMultiplicityT; using OuterHitOfCell = caStructures::OuterHitOfCellT; using GPUCACell = GPUCACellT; - using OutputSoA = pixelTrack::TrackSoAT; - using HitContainer = typename 
OutputSoA::HitContainer; + using TrackSoAHost = TrackSoAHeterogeneousHost; + using TrackSoADevice = TrackSoAHeterogeneousDevice; + using HitContainer = typename TrackSoA::HitContainer; using Tuple = HitContainer; using CellNeighborsVector = caStructures::CellNeighborsVectorT; @@ -56,21 +62,20 @@ class CAHitNtupletGeneratorOnGPU { static void fillDescriptions(edm::ParameterSetDescription& desc); static void fillDescriptionsCommon(edm::ParameterSetDescription& desc); - //static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; } void beginJob(); void endJob(); - PixelTrackHeterogeneous makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; + TrackSoADevice makeTuplesAsync(HitsOnGPU const& hits_d, float bfield, cudaStream_t stream) const; - PixelTrackHeterogeneous makeTuples(HitsOnCPU const& hits_d, float bfield) const; + TrackSoAHost makeTuples(HitsOnCPU const& hits_d, float bfield) const; private: - void buildDoublets(HitsOnGPU const& hh, cudaStream_t stream) const; + void buildDoublets(const HitsConstView& hh, cudaStream_t stream) const; - void hitNtuplets(HitsOnGPU const& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); + void hitNtuplets(const HitsConstView& hh, const edm::EventSetup& es, bool useRiemannFit, cudaStream_t cudaStream); - void launchKernels(HitsOnGPU const& hh, bool useRiemannFit, cudaStream_t cudaStream) const; + void launchKernels(const HitsConstView& hh, bool useRiemannFit, cudaStream_t cudaStream) const; Params m_params; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h index 965889abcb268..2f8ae9105ac55 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h @@ -9,12 +9,12 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" #include "CAStructures.h" @@ -31,14 +31,14 @@ class GPUCACellT { using CellNeighborsVector = caStructures::CellNeighborsVectorT; using CellTracksVector = caStructures::CellTracksVectorT; - using Hits = TrackingRecHit2DSOAViewT; + using HitsConstView = TrackingRecHitSoAConstView; using hindex_type = typename TrackerTraits::hindex_type; using tindex_type = typename TrackerTraits::tindex_type; static constexpr auto invalidHitId = std::numeric_limits::max(); using TmpTuple = cms::cuda::VecArray; - using HitContainer = pixelTrack::HitContainerT; + using HitContainer = typename TrackSoA::HitContainer; using Quality = pixelTrack::Quality; static constexpr auto bad = pixelTrack::Quality::bad; @@ -48,7 +48,7 @@ class GPUCACellT { __device__ __forceinline__ void init(CellNeighborsVector& cellNeighbors, CellTracksVector& cellTracks, - Hits const& hh, + const HitsConstView& hh, int layerPairId, hindex_type innerHitId, hindex_type outerHitId) { @@ -59,8 +59,8 @@ class GPUCACellT { theFishboneId = invalidHitId; // optimization that depends on access pattern - theInnerZ = 
hh.zGlobal(innerHitId); - theInnerR = hh.rGlobal(innerHitId); + theInnerZ = hh[innerHitId].zGlobal(); + theInnerR = hh[innerHitId].rGlobal(); // link to default empty theOuterNeighbors = &cellNeighbors[0]; @@ -115,22 +115,26 @@ class GPUCACellT { __device__ __forceinline__ CellTracks const& tracks() const { return *theTracks; } __device__ __forceinline__ CellNeighbors& outerNeighbors() { return *theOuterNeighbors; } __device__ __forceinline__ CellNeighbors const& outerNeighbors() const { return *theOuterNeighbors; } - __device__ __forceinline__ float inner_x(Hits const& hh) const { return hh.xGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_x(Hits const& hh) const { return hh.xGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_y(Hits const& hh) const { return hh.yGlobal(theInnerHitId); } - __device__ __forceinline__ float outer_y(Hits const& hh) const { return hh.yGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_z(Hits const& hh) const { return theInnerZ; } + __device__ __forceinline__ float inner_x(const HitsConstView& hh) const { return hh[theInnerHitId].xGlobal(); } + __device__ __forceinline__ float outer_x(const HitsConstView& hh) const { return hh[theOuterHitId].xGlobal(); } + __device__ __forceinline__ float inner_y(const HitsConstView& hh) const { return hh[theInnerHitId].yGlobal(); } + __device__ __forceinline__ float outer_y(const HitsConstView& hh) const { return hh[theOuterHitId].yGlobal(); } + __device__ __forceinline__ float inner_z(const HitsConstView& hh) const { return theInnerZ; } // { return hh.zGlobal(theInnerHitId); } // { return theInnerZ; } - __device__ __forceinline__ float outer_z(Hits const& hh) const { return hh.zGlobal(theOuterHitId); } - __device__ __forceinline__ float inner_r(Hits const& hh) const { return theInnerR; } + __device__ __forceinline__ float outer_z(const HitsConstView& hh) const { return hh[theOuterHitId].zGlobal(); } + __device__ __forceinline__ float inner_r(const HitsConstView& hh) const { return theInnerR; } // { return hh.rGlobal(theInnerHitId); } // { return theInnerR; } - __device__ __forceinline__ float outer_r(Hits const& hh) const { return hh.rGlobal(theOuterHitId); } + __device__ __forceinline__ float outer_r(const HitsConstView& hh) const { return hh[theOuterHitId].rGlobal(); } - __device__ __forceinline__ auto inner_iphi(Hits const& hh) const { return hh.iphi(theInnerHitId); } - __device__ __forceinline__ auto outer_iphi(Hits const& hh) const { return hh.iphi(theOuterHitId); } + __device__ __forceinline__ auto inner_iphi(const HitsConstView& hh) const { return hh[theInnerHitId].iphi(); } + __device__ __forceinline__ auto outer_iphi(const HitsConstView& hh) const { return hh[theOuterHitId].iphi(); } - __device__ __forceinline__ float inner_detIndex(Hits const& hh) const { return hh.detectorIndex(theInnerHitId); } - __device__ __forceinline__ float outer_detIndex(Hits const& hh) const { return hh.detectorIndex(theOuterHitId); } + __device__ __forceinline__ float inner_detIndex(const HitsConstView& hh) const { + return hh[theInnerHitId].detectorIndex(); + } + __device__ __forceinline__ float outer_detIndex(const HitsConstView& hh) const { + return hh[theOuterHitId].detectorIndex(); + } constexpr unsigned int inner_hit_id() const { return theInnerHitId; } constexpr unsigned int outer_hit_id() const { return theOuterHitId; } @@ -142,7 +146,7 @@ class GPUCACellT { theOuterHitId); } - __device__ bool check_alignment(Hits const& hh, + __device__ bool check_alignment(const HitsConstView& hh, 
GPUCACellT const& otherCell, const float ptmin, const float hardCurvCut, @@ -189,7 +193,7 @@ class GPUCACellT { return tan_12_13_half_mul_distance_13_squared * pMin <= thetaCut * distance_13_squared * radius_diff; } - __device__ inline bool dcaCut(Hits const& hh, + __device__ inline bool dcaCut(const HitsConstView& hh, GPUCACellT const& otherCell, const float region_origin_radius_plus_tolerance, const float maxCurv) const { @@ -226,7 +230,7 @@ class GPUCACellT { return std::abs(eq.dca0()) < region_origin_radius_plus_tolerance * std::abs(eq.curvature()); } - __device__ inline bool hole0(Hits const& hh, GPUCACellT const& innerCell) const { + __device__ inline bool hole0(const HitsConstView& hh, GPUCACellT const& innerCell) const { using namespace phase1PixelTopology; int p = innerCell.inner_iphi(hh); @@ -247,7 +251,7 @@ class GPUCACellT { return gap; } - __device__ inline bool hole4(Hits const& hh, GPUCACellT const& innerCell) const { + __device__ inline bool hole4(const HitsConstView& hh, GPUCACellT const& innerCell) const { using namespace phase1PixelTopology; int p = outer_iphi(hh); @@ -274,7 +278,7 @@ class GPUCACellT { // the visit of the graph based on the neighborhood connections between cells. template - __device__ inline void find_ntuplets(Hits const& hh, + __device__ inline void find_ntuplets(const HitsConstView& hh, GPUCACellT* __restrict__ cells, CellTracksVector& cellTracks, HitContainer& foundNtuplets, @@ -356,14 +360,14 @@ class GPUCACellT { __device__ __forceinline__ bool unused() const { return 0 == (uint16_t(StatusBit::kUsed) & theStatus_); } __device__ __forceinline__ void setStatusBits(StatusBit mask) { theStatus_ |= uint16_t(mask); } - __device__ __forceinline__ void setFishbone(hindex_type id, float z, Hits const& hh) { + __device__ __forceinline__ void setFishbone(hindex_type id, float z, const HitsConstView& hh) { // make it deterministic: use the farther apart (in z) auto old = theFishboneId; - while ( - old != - atomicCAS(&theFishboneId, - old, - (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh.zGlobal(old) - theInnerZ)) ? id : old)) + while (old != + atomicCAS( + &theFishboneId, + old, + (invalidHitId == old || std::abs(z - theInnerZ) > std::abs(hh[old].zGlobal() - theInnerZ)) ? 
id : old)) old = theFishboneId; } __device__ __forceinline__ auto fishboneId() const { return theFishboneId; } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc index c300329a82208..befd30ffab7b2 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc @@ -2,17 +2,16 @@ #include "HelixFitOnGPU.h" template -void HelixFitOnGPU::allocateOnGPU( - Tuples const *tuples, - caStructures::TupleMultiplicityT const *tupleMultiplicity, - pixelTrack::TrackSoAT *helix_fit_results) { - tuples_ = tuples; +void HelixFitOnGPU::allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, + OutputSoAView &helix_fit_results) { + tuples_ = &helix_fit_results.hitIndices(); tupleMultiplicity_ = tupleMultiplicity; outputSoa_ = helix_fit_results; assert(tuples_); assert(tupleMultiplicity_); - assert(outputSoa_); + assert(outputSoa_.chi2()); + assert(outputSoa_.pt()); } template diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h index 78bec6f5e2a87..88dc882ce5de9 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h @@ -1,8 +1,8 @@ #ifndef RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h #define RecoPixelVertexing_PixelTriplets_plugins_HelixFitOnGPU_h -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h" #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h" @@ -43,10 +43,13 @@ namespace riemannFit { template class HelixFitOnGPU { public: - using HitsView = TrackingRecHit2DSOAViewT; + using TrackingRecHitSoAs = TrackingRecHitSoA; - using Tuples = pixelTrack::HitContainerT; - using OutputSoA = pixelTrack::TrackSoAT; + using HitView = TrackingRecHitSoAView; + using HitConstView = TrackingRecHitSoAConstView; + + using Tuples = typename TrackSoA::HitContainer; + using OutputSoAView = TrackSoAView; using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -54,13 +57,16 @@ class HelixFitOnGPU { ~HelixFitOnGPU() { deallocateOnGPU(); } void setBField(double bField) { bField_ = bField; } - void launchRiemannKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); - void launchBrokenLineKernels(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchRiemannKernels(const HitConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t cudaStream); + void launchBrokenLineKernels(const HitConstView &hv, + uint32_t nhits, + uint32_t maxNumberOfTuples, + cudaStream_t cudaStream); - void launchRiemannKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void launchBrokenLineKernelsOnCPU(HitsView const *hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchRiemannKernelsOnCPU(const HitConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples); + void launchBrokenLineKernelsOnCPU(const HitConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples); - void allocateOnGPU(Tuples const *tuples, TupleMultiplicity const *tupleMultiplicity, OutputSoA *outputSoA); + 
void allocateOnGPU(TupleMultiplicity const *tupleMultiplicity, OutputSoAView &helix_fit_results); void deallocateOnGPU(); private: @@ -69,7 +75,7 @@ class HelixFitOnGPU { // fowarded Tuples const *tuples_ = nullptr; TupleMultiplicity const *tupleMultiplicity_ = nullptr; - OutputSoA *outputSoa_; + OutputSoAView outputSoa_; float bField_; const bool fitNas4_; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc index e4a7de6adaf4c..2678f60f75b3f 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cc @@ -1,7 +1,7 @@ #include "RiemannFitOnGPU.h" template -void HelixFitOnGPU::launchRiemannKernelsOnCPU(HitsView const *hv, +void HelixFitOnGPU::launchRiemannKernelsOnCPU(const TrackingRecHitSoAConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples) { assert(tuples_); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu index 3d6b2d570077e..99c55992bbf71 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu @@ -2,7 +2,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" template -void HelixFitOnGPU::launchRiemannKernels(HitsView const *hv, +void HelixFitOnGPU::launchRiemannKernels(const TrackingRecHitSoAConstView &hv, uint32_t nhits, uint32_t maxNumberOfTuples, cudaStream_t stream) { diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h index 18dd205cd13c3..96cccf0d0cc0b 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.h @@ -6,7 +6,8 @@ #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -15,11 +16,9 @@ #include "HelixFitOnGPU.h" template -using HitsOnGPU = TrackingRecHit2DSOAViewT; +using Tuples = typename TrackSoA::HitContainer; template -using Tuples = pixelTrack::HitContainerT; -template -using OutputSoA = pixelTrack::TrackSoAT; +using OutputSoAView = TrackSoAView; template using TupleMultiplicity = caStructures::TupleMultiplicityT; @@ -27,7 +26,7 @@ template __global__ void kernel_FastFit(Tuples const *__restrict__ foundNtuplets, TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, - HitsOnGPU const *__restrict__ hhp, + TrackingRecHitSoAConstView hh, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit, @@ -68,14 +67,10 @@ __global__ void kernel_FastFit(Tuples const *__restrict__ foundNt auto const *hitId = foundNtuplets->begin(tkid); for (unsigned int i = 0; i < hitsInFit; ++i) { auto hit = hitId[i]; - // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]); float ge[6]; - hhp->cpeParams() - .detParams(hhp->detectorIndex(hit)) - .frame.toGlobal(hhp->xerrLocal(hit), 0, hhp->yerrLocal(hit), ge); - // printf("Error: %d: %f,%f,%f,%f,%f,%f\n",hhp->detInd_d[hit],ge[0],ge[1],ge[2],ge[3],ge[4],ge[5]); + 
hh.cpeParams().detParams(hh[hit].detectorIndex()).frame.toGlobal(hh[hit].xerrLocal(), 0, hh[hit].yerrLocal(), ge); - hits.col(i) << hhp->xGlobal(hit), hhp->yGlobal(hit), hhp->zGlobal(hit); + hits.col(i) << hh[hit].xGlobal(), hh[hit].yGlobal(), hh[hit].zGlobal(); hits_ge.col(i) << ge[0], ge[1], ge[2], ge[3], ge[4], ge[5]; } riemannFit::fastFit(hits, fast_fit); @@ -133,13 +128,12 @@ template __global__ void kernel_LineFit(TupleMultiplicity const *__restrict__ tupleMultiplicity, uint32_t nHits, double bField, - OutputSoA *results, + OutputSoAView results_view, double *__restrict__ phits, float *__restrict__ phits_ge, double *__restrict__ pfast_fit_input, riemannFit::CircleFit *__restrict__ circle_fit, uint32_t offset) { - assert(results); assert(circle_fit); assert(N <= nHits); @@ -154,7 +148,7 @@ __global__ void kernel_LineFit(TupleMultiplicity const *__restric break; // get it for the ntuple container (one to one to helix) - auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); + int32_t tkid = *(tupleMultiplicity->begin(nHits) + tuple_idx); riemannFit::Map3xNd hits(phits + local_idx); riemannFit::Map4d fast_fit(pfast_fit_input + local_idx); @@ -164,11 +158,16 @@ __global__ void kernel_LineFit(TupleMultiplicity const *__restric riemannFit::fromCircleToPerigee(circle_fit[local_idx]); - results->stateAtBS.copyFromCircle( - circle_fit[local_idx].par, circle_fit[local_idx].cov, line_fit.par, line_fit.cov, 1.f / float(bField), tkid); - results->pt(tkid) = bField / std::abs(circle_fit[local_idx].par(2)); - results->eta(tkid) = asinhf(line_fit.par(0)); - results->chi2(tkid) = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); + TracksUtilities::copyFromCircle(results_view, + circle_fit[local_idx].par, + circle_fit[local_idx].cov, + line_fit.par, + line_fit.cov, + 1.f / float(bField), + tkid); + results_view[tkid].pt() = bField / std::abs(circle_fit[local_idx].par(2)); + results_view[tkid].eta() = asinhf(line_fit.par(0)); + results_view[tkid].chi2() = (circle_fit[local_idx].chi2 + line_fit.chi2) / (2 * N - 5); #ifdef RIEMANN_DEBUG printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n", diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h index d4b3282574ec3..f32adf9f6e770 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuFishbone.h @@ -27,10 +27,10 @@ namespace gpuPixelDoublets { template using OuterHitOfCell = caStructures::OuterHitOfCellT; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template - __global__ void fishbone(Hits const* __restrict__ hhp, + __global__ void fishbone(HitsConstView hh, GPUCACellT* cells, uint32_t const* __restrict__ nCells, OuterHitOfCell const isOuterHitOfCellWrap, @@ -38,8 +38,6 @@ namespace gpuPixelDoublets { bool checkTrack) { constexpr auto maxCellsPerHit = GPUCACellT::maxCellsPerHit; - auto const& hh = *hhp; - auto const isOuterHitOfCell = isOuterHitOfCellWrap.container; int32_t offset = isOuterHitOfCellWrap.offset; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h index deed54ca02b5b..740b63ac774a5 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h @@ -7,22 +7,6 @@ namespace gpuPixelDoublets { - template - using CellNeighbors = caStructures::CellNeighborsT; - 
template - using CellTracks = caStructures::CellTracksT; - template - using CellNeighborsVector = caStructures::CellNeighborsVectorT; - template - using CellTracksVector = caStructures::CellTracksVectorT; - template - using OuterHitOfCell = caStructures::OuterHitOfCellT; - template - using Hits = typename GPUCACellT::Hits; - - // end constants - // clang-format on - template __global__ void initDoublets(OuterHitOfCell isOuterHitOfCell, int nHits, @@ -59,11 +43,10 @@ namespace gpuPixelDoublets { uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAViewT const* __restrict__ hhp, + HitsConstView hh, OuterHitOfCell isOuterHitOfCell, int nActualPairs, CellCutsT cuts) { - auto const& __restrict__ hh = *hhp; doubletsFromHisto( nActualPairs, cells, nCells, cellNeighbors, cellTracks, hh, isOuterHitOfCell, cuts); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h index 0f3d786a8e476..eaaefb42b74ae 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoubletsAlgos.h @@ -7,7 +7,7 @@ #include #include -#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h" +#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHitsUtilities.h" #include "DataFormats/Math/interface/approx_atan2.h" #include "HeterogeneousCore/CUDAUtilities/interface/VecArray.h" #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" @@ -32,11 +32,11 @@ namespace gpuPixelDoublets { template using OuterHitOfCell = caStructures::OuterHitOfCellT; template - using Hits = typename GPUCACellT::Hits; + using HitsConstView = typename GPUCACellT::HitsConstView; template struct CellCutsT { - using H = Hits; + using H = HitsConstView; using T = TrackerTraits; const uint32_t maxNumberOfDoublets_; @@ -45,21 +45,21 @@ namespace gpuPixelDoublets { const bool doPtCut_; const bool idealConditions_; //this is actually not used by phase2 - __device__ __forceinline__ bool zSizeCut(H const& hh, int i, int o) const { - auto mi = hh.detectorIndex(i); + __device__ __forceinline__ bool zSizeCut(H hh, int i, int o) const { + const uint32_t mi = hh[i].detectorIndex(); bool innerB1 = mi < T::last_bpix1_detIndex; bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + auto mes = (!innerB1) || isOuterLadder ? 
hh[i].clusterSizeY() : -1; if (mes < 0) return false; - auto mo = hh.detectorIndex(o); - auto so = hh.clusterSizeY(o); + const uint32_t mo = hh[o].detectorIndex(); + auto so = hh[o].clusterSizeY(); - auto dz = hh.zGlobal(i) - hh.zGlobal(o); - auto dr = hh.rGlobal(i) - hh.rGlobal(o); + auto dz = hh[i].zGlobal() - hh[o].zGlobal(); + auto dr = hh[i].rGlobal() - hh[o].rGlobal(); auto innerBarrel = mi < T::last_barrel_detIndex; auto onlyBarrel = mo < T::last_barrel_detIndex; @@ -72,14 +72,8 @@ namespace gpuPixelDoublets { : innerBarrel && std::abs(mes - int(std::abs(dz / dr) * T::dzdrFact + 0.5f)) > T::maxDYPred; } - __device__ __forceinline__ bool clusterCut(H const& hh, int i, int o) const { - auto mo = hh.detectorIndex(o); - bool outerFwd = (mo >= T::last_barrel_detIndex); - - if (!outerFwd) - return false; - - auto mi = hh.detectorIndex(i); + __device__ __forceinline__ bool clusterCut(H hh, int i) const { + const uint32_t mi = hh[i].detectorIndex(); bool innerB1orB2 = mi < T::last_bpix2_detIndex; if (!innerB1orB2) @@ -87,13 +81,13 @@ namespace gpuPixelDoublets { bool innerB1 = mi < T::last_bpix1_detIndex; bool isOuterLadder = idealConditions_ ? true : 0 == (mi / 8) % 2; - auto mes = (!innerB1) || isOuterLadder ? hh.clusterSizeY(i) : -1; + auto mes = (!innerB1) || isOuterLadder ? hh[i].clusterSizeY() : -1; - if (innerB1 && outerFwd) // B1 and F1 + if (innerB1) // B1 if (mes > 0 && mes < T::minYsizeB1) return true; // only long cluster (5*8) bool innerB2 = (mi >= T::last_bpix1_detIndex) && (mi < T::last_bpix2_detIndex); //FIXME number - if (innerB2 && outerFwd) // B2 and F1 + if (innerB2) // B2 and F1 if (mes > 0 && mes < T::minYsizeB2) return true; @@ -101,19 +95,13 @@ namespace gpuPixelDoublets { } }; - // template - // struct CellCutsT : public CellCutsCommon {}; - // - // template <> - // struct CellCutsT : public CellCutsCommon {}; - template __device__ __forceinline__ void doubletsFromHisto(uint32_t nPairs, GPUCACellT* cells, uint32_t* nCells, CellNeighborsVector* cellNeighbors, CellTracksVector* cellTracks, - TrackingRecHit2DSOAViewT const& __restrict__ hh, + HitsConstView hh, OuterHitOfCell isOuterHitOfCell, CellCutsT const& cuts) { // ysize cuts (z in the barrel) times 8 @@ -124,10 +112,10 @@ namespace gpuPixelDoublets { const bool doPtCut = cuts.doPtCut_; const uint32_t maxNumOfDoublets = cuts.maxNumberOfDoublets_; - using PhiBinner = typename TrackingRecHit2DSOAViewT::PhiBinner; + using PhiBinner = typename TrackingRecHitSoA::PhiBinner; auto const& __restrict__ phiBinner = hh.phiBinner(); - uint32_t const* __restrict__ offsets = hh.hitsLayerStart(); + uint32_t const* __restrict__ offsets = hh.hitsLayerStart().data(); assert(offsets); auto layerSize = [=](uint8_t li) { return offsets[li + 1] - offsets[li]; }; @@ -168,18 +156,15 @@ namespace gpuPixelDoublets { assert(outer > inner); auto hoff = PhiBinner::histOff(outer); - auto fo = __ldg(phiBinner.begin(hoff)); //first hit on outer for the cluster cut auto i = (0 == pairLayerId) ? 
j : j - innerLayerCumulativeSize[pairLayerId - 1]; i += offsets[inner]; - // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j); - assert(i >= offsets[inner]); assert(i < offsets[inner + 1]); // found hit corresponding to our cuda thread, now do the job - if (hh.detectorIndex(i) > gpuClustering::maxNumModules) + if (hh[i].detectorIndex() > gpuClustering::maxNumModules) continue; // invalid /* maybe clever, not effective when zoCut is on @@ -188,16 +173,16 @@ namespace gpuPixelDoublets { if ( ((inner<3) & (outer>3)) && bpos!=fpos) continue; */ - auto mez = hh.zGlobal(i); + auto mez = hh[i].zGlobal(); if (mez < TrackerTraits::minz[pairLayerId] || mez > TrackerTraits::maxz[pairLayerId]) continue; - if (doClusterCut && cuts.clusterCut(hh, i, fo)) + if (doClusterCut && outer > pixelTopology::last_barrel_layer && cuts.clusterCut(hh, i)) continue; - auto mep = hh.iphi(i); - auto mer = hh.rGlobal(i); + auto mep = hh[i].iphi(); + auto mer = hh[i].rGlobal(); // all cuts: true if fails constexpr float z0cut = TrackerTraits::z0Cut; // cm @@ -208,13 +193,13 @@ namespace gpuPixelDoublets { auto ptcut = [&](int j, int16_t idphi) { auto r2t4 = minRadius2T4; auto ri = mer; - auto ro = hh.rGlobal(j); + auto ro = hh[j].rGlobal(); auto dphi = short2phi(idphi); return dphi * dphi * (r2t4 - ri * ro) > (ro - ri) * (ro - ri); }; auto z0cutoff = [&](int j) { - auto zo = hh.zGlobal(j); - auto ro = hh.rGlobal(j); + auto zo = hh[j].zGlobal(); + auto ro = hh[j].rGlobal(); auto dr = ro - mer; return dr > TrackerTraits::maxr[pairLayerId] || dr < 0 || std::abs((mez * ro - mer * zo)) > z0cut * dr; }; @@ -245,14 +230,14 @@ namespace gpuPixelDoublets { auto oi = __ldg(p); assert(oi >= offsets[outer]); assert(oi < offsets[outer + 1]); - auto mo = hh.detectorIndex(oi); + auto mo = hh[oi].detectorIndex(); if (mo > gpuClustering::maxNumModules) continue; // invalid if (doZ0Cut && z0cutoff(oi)) continue; - auto mop = hh.iphi(oi); + auto mop = hh[oi].iphi(); uint16_t idphi = std::min(std::abs(int16_t(mop - mep)), std::abs(int16_t(mep - mop))); if (idphi > iphicut) continue; diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index d480d7408b9e2..522b186f3351b 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -26,4 +26,5 @@ + diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc index 024c95398b988..b51bd73350940 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerCUDA.cc @@ -18,13 +18,19 @@ #include "FWCore/Utilities/interface/RunningAverage.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" + #include "gpuVertexFinder.h" #undef PIXVERTEX_DEBUG_PRODUCE template class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; + using TracksSoADevice = TrackSoAHeterogeneousDevice; + using TracksSoAHost = TrackSoAHeterogeneousHost; using GPUAlgo = gpuVertexFinder::Producer; public: @@ -40,10 +46,10 @@ 
class PixelVertexProducerCUDAT : public edm::global::EDProducer<> { bool onGPU_; - edm::EDGetTokenT> tokenGPUTrack_; - edm::EDPutTokenT tokenGPUVertex_; - edm::EDGetTokenT tokenCPUTrack_; - edm::EDPutTokenT tokenCPUVertex_; + edm::EDGetTokenT> tokenGPUTrack_; + edm::EDPutTokenT> tokenGPUVertex_; + edm::EDGetTokenT tokenCPUTrack_; + edm::EDPutTokenT tokenCPUVertex_; const GPUAlgo gpuAlgo_; @@ -67,12 +73,11 @@ PixelVertexProducerCUDAT::PixelVertexProducerCUDAT(const edm::Par ptMax_(conf.getParameter("PtMax")) // 75. GeV { if (onGPU_) { - tokenGPUTrack_ = - consumes>(conf.getParameter("pixelTrackSrc")); - tokenGPUVertex_ = produces(); + tokenGPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); + tokenGPUVertex_ = produces(); } else { tokenCPUTrack_ = consumes(conf.getParameter("pixelTrackSrc")); - tokenCPUVertex_ = produces(); + tokenCPUVertex_ = produces(); } } @@ -104,23 +109,20 @@ template void PixelVertexProducerCUDAT::produceOnGPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle> hTracks; - iEvent.getByToken(tokenGPUTrack_, hTracks); + using TracksSoA = TrackSoAHeterogeneousDevice; + auto hTracks = iEvent.getHandle(tokenGPUTrack_); cms::cuda::ScopedContextProduce ctx{*hTracks}; - auto const* tracks = ctx.get(*hTracks).get(); - - assert(tracks); + auto& tracks = ctx.get(*hTracks); - ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks, ptMin_, ptMax_)); + ctx.emplace(iEvent, tokenGPUVertex_, gpuAlgo_.makeAsync(ctx.stream(), tracks.view(), ptMin_, ptMax_)); } template void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto const* tracks = iEvent.get(tokenCPUTrack_).get(); - assert(tracks); + auto& tracks = iEvent.get(tokenCPUTrack_); #ifdef PIXVERTEX_DEBUG_PRODUCE auto const& tsoa = *tracks; @@ -129,8 +131,8 @@ void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamI int32_t nt = 0; for (int32_t it = 0; it < maxTracks; ++it) { - auto nHits = tsoa.nHits(it); - assert(nHits == int(tsoa.hitIndices.size(it))); + auto nHits = TracksUtilities::nHits(tracks.view(), it); + assert(nHits == int(tracks.view().hitIndices().size(it))); if (nHits == 0) break; // this is a guard: maybe we need to move to nTracks... 
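      // [Editor's sketch — not part of the original patch] The pair of changed lines just above
      // shows the accessor migration applied throughout this PR: member functions of the old
      // track SoA are replaced by free helpers that take a const view. Assuming the helper is the
      // TracksUtilities specialization for this producer's TrackerTraits (template argument
      // assumed here), the before/after forms read roughly:
      //
      //   auto nHits = tsoa.nHits(it);                                            // legacy SoA member
      //   auto nHits = TracksUtilities<TrackerTraits>::nHits(tracks.view(), it);  // const view + helper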
nt++; @@ -138,7 +140,7 @@ void PixelVertexProducerCUDAT::produceOnCPU(edm::StreamID streamI std::cout << "found " << nt << " tracks in cpu SoA for Vertexing at " << tracks << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks, ptMin_, ptMax_)); + iEvent.emplace(tokenCPUVertex_, gpuAlgo_.make(tracks.view(), ptMin_, ptMax_)); } template diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc index 8cceeaa42cc10..91de2bdb6992b 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexProducerFromSoA.cc @@ -1,4 +1,5 @@ -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" #include "DataFormats/Common/interface/OrphanHandle.h" #include "DataFormats/TrackReco/interface/Track.h" @@ -35,17 +36,17 @@ class PixelVertexProducerFromSoA : public edm::global::EDProducer<> { private: void produce(edm::StreamID streamID, edm::Event &iEvent, const edm::EventSetup &iSetup) const override; - edm::EDGetTokenT tokenVertex_; + edm::EDGetTokenT tokenVertex_; edm::EDGetTokenT tokenBeamSpot_; edm::EDGetTokenT tokenTracks_; edm::EDGetTokenT tokenIndToEdm_; }; PixelVertexProducerFromSoA::PixelVertexProducerFromSoA(const edm::ParameterSet &conf) - : tokenVertex_(consumes(conf.getParameter("src"))), - tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), - tokenTracks_(consumes(conf.getParameter("TrackCollection"))), - tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { + : tokenVertex_(consumes(conf.getParameter("src"))), + tokenBeamSpot_(consumes(conf.getParameter("beamSpot"))), + tokenTracks_(consumes(conf.getParameter("TrackCollection"))), + tokenIndToEdm_(consumes(conf.getParameter("TrackCollection"))) { produces(); } @@ -81,9 +82,9 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv dydz = bs.dydz(); } - auto const &soa = *(iEvent.get(tokenVertex_).get()); + auto const &soa = iEvent.get(tokenVertex_); - int nv = soa.nvFinal; + int nv = soa.view().nvFinal(); #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "converting " << nv << " vertices " @@ -92,20 +93,20 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv std::set uind; // for verifing index consistency for (int j = nv - 1; j >= 0; --j) { - auto i = soa.sortInd[j]; // on gpu sorted in ascending order.... + auto i = soa.view()[j].sortInd(); // on gpu sorted in ascending order.... assert(i < nv); uind.insert(i); assert(itrk.empty()); - auto z = soa.zv[i]; + auto z = soa.view()[i].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; z += z0; reco::Vertex::Error err; - err(2, 2) = 1.f / soa.wv[i]; + err(2, 2) = 1.f / soa.view()[i].wv(); err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) 
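      // [Editor's note — illustrative, not part of the original patch] The loop below associates
      // EDM tracks to vertex i by a full linear scan over all track indices; only the way the
      // per-track vertex id is read changes with the SoA migration:
      //
      //   soa.idv[k]           // before: bare member array of ZVertexSoA
      //   soa.view()[k].idv()  // after:  element accessor on the host collection's view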
for (auto k = 0U; k < indToEdm.size(); ++k) {
-      if (soa.idv[k] == int16_t(i))
+      if (soa.view()[k].idv() == int16_t(i))
         itrk.push_back(k);
     }
     auto nt = itrk.size();
@@ -119,7 +120,7 @@ void PixelVertexProducerFromSoA::produce(edm::StreamID streamID, edm::Event &iEv
       itrk.clear();
       continue;
     }  // remove outliers
-    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.chi2[i], soa.ndof[i], nt);
+    (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt);
     auto &v = (*vertexes).back();
     v.reserve(itrk.size());
     for (auto it : itrk) {
diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
index dc125878b1058..b13b6c96f0bd3 100644
--- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
+++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc
@@ -2,7 +2,8 @@
 #include "CUDADataFormats/Common/interface/Product.h"
 #include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h"
+#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h"
 #include "DataFormats/Common/interface/Handle.h"
 #include "FWCore/Framework/interface/ESHandle.h"
 #include "FWCore/Framework/interface/Event.h"
@@ -30,15 +31,15 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer
                edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
   void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
-  edm::EDGetTokenT> tokenCUDA_;
-  edm::EDPutTokenT tokenSOA_;
+  edm::EDGetTokenT> tokenCUDA_;
+  edm::EDPutTokenT tokenSOA_;
-  cms::cuda::host::unique_ptr m_soa;
+  ZVertexSoAHost zvertex_h;
 };
 PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig)
-    : tokenCUDA_(consumes>(iConfig.getParameter("src"))),
-      tokenSOA_(produces()) {}
+    : tokenCUDA_(consumes>(iConfig.getParameter("src"))),
+      tokenSOA_(produces()) {}
 void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
@@ -50,16 +51,20 @@ void PixelVertexSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& de
 void PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent,
                                      edm::EventSetup const& iSetup,
                                      edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  auto const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  cms::cuda::Product const& inputDataWrapped = iEvent.get(tokenCUDA_);
   cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  m_soa = inputData.toHostAsync(ctx.stream());
+  auto const& zvertex_d = ctx.get(inputDataWrapped);  // Vertices on the device
+  zvertex_h = ZVertexSoAHost(ctx.stream());  // Create an instance of the vertex SoA on the host, using the stream
+  cudaCheck(cudaMemcpyAsync(zvertex_h.buffer().get(),
+                            zvertex_d.const_buffer().get(),
+                            zvertex_d.bufferSize(),
+                            cudaMemcpyDeviceToHost,
+                            ctx.stream()));  // Copy data from Device to Host
 }
 void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
   // No copies....
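  // [Editor's note — illustrative, not part of the original patch] zvertex_h was filled by the
  // asynchronous copy issued in acquire(); with the ExternalWork pattern used by this producer,
  // the framework calls produce() only after that stream work has completed, so the host
  // collection can be moved into the event without further synchronization. A minimal sketch of
  // the same device-to-host pattern (hypothetical names, mirroring the calls in acquire() above):
  //
  //   auto host = ZVertexSoAHost(stream);
  //   cudaCheck(cudaMemcpyAsync(host.buffer().get(), device.const_buffer().get(),
  //                             device.bufferSize(), cudaMemcpyDeviceToHost, stream));
  //   // ...once the stream work is known to be done:
  //   event.emplace(token, std::move(host));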
- iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(zvertex_h)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h new file mode 100644 index 0000000000000..223c3d7e94785 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h @@ -0,0 +1,23 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoADevice_h +#define RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoADevice_h + +#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h" + +template +class PixelVertexWorkSpaceSoADevice : public cms::cuda::PortableDeviceCollection> { +public: + PixelVertexWorkSpaceSoADevice() = default; + + // Constructor which specifies the SoA size and CUDA stream + explicit PixelVertexWorkSpaceSoADevice(cudaStream_t stream) + : PortableDeviceCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using PixelVertexWorkSpaceSoADevice = PixelVertexWorkSpaceSoADevice; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h new file mode 100644 index 0000000000000..6c424fcec8a30 --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h @@ -0,0 +1,22 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoAHost_h +#define RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpaceSoAHost_h + +#include "CUDADataFormats/Common/interface/PortableHostCollection.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h" + +template +class PixelVertexWorkSpaceSoAHost : public cms::cuda::PortableHostCollection> { +public: + explicit PixelVertexWorkSpaceSoAHost() : PortableHostCollection>(S) {} + // Constructor which specifies the SoA size and CUDA stream + explicit PixelVertexWorkSpaceSoAHost(cudaStream_t stream) + : PortableHostCollection>(S, stream) {} +}; + +namespace gpuVertexFinder { + namespace workSpace { + using PixelVertexWorkSpaceSoAHost = PixelVertexWorkSpaceSoAHost; + } +} // namespace gpuVertexFinder +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h new file mode 100644 index 0000000000000..f5859319c0b6b --- /dev/null +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h @@ -0,0 +1,35 @@ +#ifndef RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpace_h +#define RecoPixelVertexing_PixelVertexFinding_PixelVertexWorkSpace_h + +#include +#include "DataFormats/SoATemplate/interface/SoALayout.h" + +// Intermediate data used in the vertex reco algos +// For internal use only +GENERATE_SOA_LAYOUT(PixelVertexWSSoALayout, + SOA_COLUMN(uint16_t, itrk), // index of original track + SOA_COLUMN(float, zt), // input track z at bs + SOA_COLUMN(float, ezt2), // input error^2 on the above + SOA_COLUMN(float, ptt2), // input pt^2 on the above + SOA_COLUMN(uint8_t, izt), // interized 
z-position of input tracks + SOA_COLUMN(int32_t, iv), // vertex index for each associated track + SOA_SCALAR(uint32_t, ntrks), // number of "selected tracks" + SOA_SCALAR(uint32_t, nvIntermediate)) // the number of vertices after splitting pruning etc. + +// Methods that operate on View and ConstView of the WorkSpaceSoALayout. +namespace gpuVertexFinder { + namespace workSpace { + using PixelVertexWorkSpaceSoALayout = PixelVertexWSSoALayout<>; + using PixelVertexWorkSpaceSoAView = PixelVertexWSSoALayout<>::View; + using PixelVertexWorkSpaceSoAConstView = PixelVertexWSSoALayout<>::ConstView; + + namespace utilities { + __host__ __device__ inline void init(PixelVertexWorkSpaceSoAView &workspace_view) { + workspace_view.ntrks() = 0; + workspace_view.nvIntermediate() = 0; + } + } // namespace utilities + } // namespace workSpace +} // namespace gpuVertexFinder + +#endif diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h index f71aa56842a67..915e48e867d95 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksByDensity.h @@ -17,8 +17,8 @@ namespace gpuVertexFinder { // // based on Rodrighez&Laio algo // - __device__ __forceinline__ void clusterTracksByDensity(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __device__ __forceinline__ void clusterTracksByDensity(VtxSoAView& pdata, + WsSoAView& pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -32,21 +32,24 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(ezt2); + assert(izt); + assert(nn); + assert(iv); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -63,7 +66,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); @@ -197,7 +200,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
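      // [Editor's sketch — not part of the original patch] The capacity constant checked just
      // above is assumed to come from ZVertexUtilities.h, which presumably replaces the old
      // ZVertices/ZVertexSoA statics with namespace-level constants, along the lines of:
      //
      //   namespace zVertex {
      //     namespace utilities {
      //       constexpr uint32_t MAXTRACKS = ...;  // same capacities as the legacy ZVertexSoA
      //       constexpr uint32_t MAXVTX = ...;
      //     }  // namespace utilities
      //   }  // namespace zVertex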
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { @@ -219,8 +222,8 @@ namespace gpuVertexFinder { printf("found %d proto vertices\n", foundClusters); } - __global__ void clusterTracksByDensityKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void clusterTracksByDensityKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h index a11283a7b2065..f92d9a1d0113d 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... // enough for <10K tracks we have - __global__ void clusterTracksDBSCAN(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksDBSCAN(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,21 +28,23 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(iv); + assert(nn); + assert(ezt2); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -59,7 +61,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -214,7 +216,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h index 66d246fcfa4fa..21182690ec7e8 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksIterative.h @@ -14,8 +14,8 @@ namespace gpuVertexFinder { // this algo does not really scale as it works in a single block... 
// enough for <10K tracks we have - __global__ void clusterTracksIterative(ZVertices* pdata, - WorkSpace* pws, + __global__ void clusterTracksIterative(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -28,21 +28,23 @@ namespace gpuVertexFinder { auto er2mx = errmax * errmax; - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt; - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(nn); + assert(iv); + assert(ezt2); using Hist = cms::cuda::HistoContainer; __shared__ Hist hist; @@ -59,7 +61,7 @@ namespace gpuVertexFinder { // fill hist (bin shall be wider than "eps") for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - assert(i < ZVertices::MAXTRACKS); + assert(i < zVertex::utilities::MAXTRACKS); int iz = int(zt[i] * 10.); // valid if eps<=0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; @@ -185,7 +187,7 @@ namespace gpuVertexFinder { } __syncthreads(); - assert(foundClusters < ZVertices::MAXVTX); + assert(foundClusters < zVertex::utilities::MAXVTX); // propagate the negative id to all the tracks in the cluster. 
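      // [Editor's note — illustrative, not part of the original patch] As elsewhere in this
      // migration, the __global__ kernels now receive VtxSoAView / WsSoAView by value (a view is
      // just a small bundle of column pointers plus the size), while the __device__ helpers such
      // as fitVertices() and splitVertices() take them by reference; this is why the old
      // assert(pdata) null-pointer check is dropped in favour of asserts on the individual
      // column pointers.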
for (auto i = threadIdx.x; i < nt; i += blockDim.x) { diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h index 0acf67244528a..a89064b7f2ac0 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuFitVertices.h @@ -12,28 +12,25 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void fitVertices(ZVertices* pdata, - WorkSpace* pws, + __device__ __forceinline__ void fitVertices(VtxSoAView& pdata, + WsSoAView& pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; - uint32_t& nvIntermediate = ws.nvIntermediate; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); + uint32_t& nvIntermediate = ws.nvIntermediate(); - int32_t* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; - - assert(pdata); - assert(zt); + int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); assert(nvFinal <= nvIntermediate); nvFinal = nvIntermediate; @@ -101,8 +98,8 @@ namespace gpuVertexFinder { printf("and %d noise\n", noise); } - __global__ void fitVerticesKernel(ZVertices* pdata, - WorkSpace* pws, + __global__ void fitVerticesKernel(VtxSoAView pdata, + WsSoAView pws, float chi2Max // for outlier rejection ) { fitVertices(pdata, pws, chi2Max); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h index 93f78d498b26f..2e2e2353f6b30 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSortByPt2.h @@ -15,29 +15,29 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void sortByPt2(ZVertices* pdata, WorkSpace* pws) { - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ ptt2 = ws.ptt2; - uint32_t const& nvFinal = data.nvFinal; + __device__ __forceinline__ void sortByPt2(VtxSoAView& pdata, WsSoAView& pws) { + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ ptt2 = ws.ptt2(); + uint32_t const& nvFinal = data.nvFinal(); - int32_t const* __restrict__ iv = ws.iv; - float* __restrict__ ptv2 = data.ptv2; - uint16_t* __restrict__ sortInd = data.sortInd; + int32_t const* __restrict__ iv = ws.iv(); + float* __restrict__ ptv2 = data.ptv2(); + uint16_t* __restrict__ sortInd = data.sortInd(); - // if (threadIdx.x == 0) - // printf("sorting %d vertices\n",nvFinal); + assert(ptv2); + assert(sortInd); if (nvFinal < 1) return; // fill indexing for (auto i = threadIdx.x; i < nt; i += blockDim.x) { - data.idv[ws.itrk[i]] = iv[i]; + data[ws[i].itrk()].idv() = iv[i]; } - // can be done asynchronoisly at the end of previous event + // can be done asynchronously at the end of 
previous event for (auto i = threadIdx.x; i < nvFinal; i += blockDim.x) { ptv2[i] = 0; } @@ -66,7 +66,7 @@ namespace gpuVertexFinder { #endif } - __global__ void sortByPt2Kernel(ZVertices* pdata, WorkSpace* pws) { sortByPt2(pdata, pws); } + __global__ void sortByPt2Kernel(VtxSoAView pdata, WsSoAView pws) { sortByPt2(pdata, pws); } } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h index 0fe8bd882dcc5..7f18d58d11454 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuSplitVertices.h @@ -12,24 +12,26 @@ namespace gpuVertexFinder { - __device__ __forceinline__ void splitVertices(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __device__ __forceinline__ void splitVertices(VtxSoAView& pdata, WsSoAView& pws, float maxChi2) { constexpr bool verbose = false; // in principle the compiler should optmize out if false - auto& __restrict__ data = *pdata; - auto& __restrict__ ws = *pws; - auto nt = ws.ntrks; - float const* __restrict__ zt = ws.zt; - float const* __restrict__ ezt2 = ws.ezt2; - float* __restrict__ zv = data.zv; - float* __restrict__ wv = data.wv; - float const* __restrict__ chi2 = data.chi2; - uint32_t& nvFinal = data.nvFinal; + auto& __restrict__ data = pdata; + auto& __restrict__ ws = pws; + auto nt = ws.ntrks(); + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + float* __restrict__ zv = data.zv(); + float* __restrict__ wv = data.wv(); + float const* __restrict__ chi2 = data.chi2(); + uint32_t& nvFinal = data.nvFinal(); - int32_t const* __restrict__ nn = data.ndof; - int32_t* __restrict__ iv = ws.iv; + int32_t const* __restrict__ nn = data.ndof(); + int32_t* __restrict__ iv = ws.iv(); - assert(pdata); assert(zt); + assert(wv); + assert(chi2); + assert(nn); // one vertex per block for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { @@ -120,7 +122,7 @@ namespace gpuVertexFinder { // get a new global vertex __shared__ uint32_t igv; if (0 == threadIdx.x) - igv = atomicAdd(&ws.nvIntermediate, 1); + igv = atomicAdd(&ws.nvIntermediate(), 1); __syncthreads(); for (auto k = threadIdx.x; k < nq; k += blockDim.x) { if (1 == newV[k]) @@ -130,7 +132,7 @@ namespace gpuVertexFinder { } // loop on vertices } - __global__ void splitVerticesKernel(ZVertices* pdata, WorkSpace* pws, float maxChi2) { + __global__ void splitVerticesKernel(VtxSoAView pdata, WsSoAView pws, float maxChi2) { splitVertices(pdata, pws, maxChi2); } diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 74bcd26f8a79c..950a31f8ac48a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,5 +1,12 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" + +#include "PixelVertexWorkSpaceUtilities.h" +#include "PixelVertexWorkSpaceSoAHost.h" +#include "PixelVertexWorkSpaceSoADevice.h" + #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" #include "gpuClusterTracksIterative.h" @@ -20,28 +27,23 @@ namespace gpuVertexFinder { template __global__ void loadTracks( - pixelTrack::TrackSoAT const* ptracks, ZVertexSoA* soa, WorkSpace* pws, 
float ptMin, float ptMax) { - assert(ptracks); - assert(soa); - auto const& tracks = *ptracks; - auto const& fit = tracks.stateAtBS; - auto const* quality = tracks.qualityData(); - + TrackSoAConstView tracks_view, VtxSoAView soa, WsSoAView pws, float ptMin, float ptMax) { + auto const* quality = tracks_view.quality(); + using helper = TracksUtilities; auto first = blockIdx.x * blockDim.x + threadIdx.x; - - for (int idx = first, nt = tracks.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { - auto nHits = tracks.nHits(idx); + for (int idx = first, nt = tracks_view.nTracks(); idx < nt; idx += gridDim.x * blockDim.x) { + auto nHits = helper::nHits(tracks_view, idx); assert(nHits >= 3); // initialize soa... - soa->idv[idx] = -1; + soa[idx].idv() = -1; - if (tracks.isTriplet(idx)) + if (helper::isTriplet(tracks_view, idx)) continue; // no triplets if (quality[idx] < pixelTrack::Quality::highPurity) continue; - auto pt = tracks.pt(idx); + auto pt = tracks_view[idx].pt(); if (pt < ptMin) continue; @@ -49,19 +51,19 @@ namespace gpuVertexFinder { // clamp pt pt = std::min(pt, ptMax); - auto& data = *pws; - auto it = atomicAdd(&data.ntrks, 1); - data.itrk[it] = idx; - data.zt[it] = tracks.zip(idx); - data.ezt2[it] = fit.covariance(idx)(14); - data.ptt2[it] = pt * pt; + auto& data = pws; + auto it = atomicAdd(&data.ntrks(), 1); + data[it].itrk() = idx; + data[it].zt() = helper::zip(tracks_view, idx); + data[it].ezt2() = tracks_view[idx].covariance()(14); + data[it].ptt2() = pt * pt; } } // #define THREE_KERNELS #ifndef THREE_KERNELS - __global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderOneKernel(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -78,8 +80,8 @@ namespace gpuVertexFinder { sortByPt2(pdata, pws); } #else - __global__ void vertexFinderKernel1(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, + __global__ void vertexFinderKernel1(VtxSoAView pdata, + WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" @@ -90,7 +92,7 @@ namespace gpuVertexFinder { fitVertices(pdata, pws, maxChi2ForFirstFit); } - __global__ void vertexFinderKernel2(gpuVertexFinder::ZVertices* pdata, gpuVertexFinder::WorkSpace* pws) { + __global__ void vertexFinderKernel2(VtxSoAView pdata, WsSoAView pws) { fitVertices(pdata, pws, maxChi2ForFinalFit); __syncthreads(); sortByPt2(pdata, pws); @@ -99,44 +101,42 @@ namespace gpuVertexFinder { template #ifdef __CUDACC__ - ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, - pixelTrack::TrackSoAT const* tksoa, - float ptMin, - float ptMax) const { + ZVertexSoADevice Producer::makeAsync(cudaStream_t stream, + const TrackSoAConstView& tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + ZVertexSoADevice vertices(stream); #else - - ZVertexHeterogeneous Producer::make(pixelTrack::TrackSoAT const* tksoa, - float ptMin, - float ptMax) const { - + ZVertexSoAHost Producer::make(const TrackSoAConstView& tracks_view, + float ptMin, + float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on CPU" << std::endl; #endif // 
PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); + ZVertexSoAHost vertices; #endif - assert(tksoa); - auto* soa = vertices.get(); - assert(soa); + auto soa = vertices.view(); + + assert(vertices.buffer()); #ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); + auto ws_d = gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoADevice(stream); #else - auto ws_d = std::make_unique(); + auto ws_d = gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAHost(); #endif #ifdef __CUDACC__ - init<<<1, 1, 0, stream>>>(soa, ws_d.get()); + init<<<1, 1, 0, stream>>>(soa, ws_d.view()); auto blockSize = 128; - auto numberOfBlocks = (pixelTrack::TrackSoAT::stride() + blockSize - 1) / blockSize; - loadTracks<<>>(tksoa, soa, ws_d.get(), ptMin, ptMax); + auto numberOfBlocks = (tracks_view.metadata().size() + blockSize - 1) / blockSize; + loadTracks<<>>(tracks_view, soa, ws_d.view(), ptMin, ptMax); cudaCheck(cudaGetLastError()); #else - init(soa, ws_d.get()); - loadTracks(tksoa, soa, ws_d.get(), ptMin, ptMax); + init(soa, ws_d.view()); + loadTracks(tracks_view, soa, ws_d.view(), ptMin, ptMax); #endif #ifdef __CUDACC__ @@ -148,50 +148,51 @@ namespace gpuVertexFinder { if (oneKernel_) { // implemented only for density clustesrs #ifndef THREE_KERNELS - vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderOneKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); #else - vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + vertexFinderKernel1<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); cudaCheck(cudaGetLastError()); // one block per vertex... - splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + vertexFinderKernel2<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); #endif } else { // five kernels if (useDensity_) { - clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensityKernel<<<1, maxThreadsForPrint, 0, stream>>>( + soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), minT, eps, errmax, chi2max); } cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFirstFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFirstFit); cudaCheck(cudaGetLastError()); // one block per vertex... 
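      // [Editor's note — illustrative, not part of the original patch] "one block per vertex"
      // refers to the grid-stride loop inside gpuSplitVertices.h,
      //
      //   for (auto kv = blockIdx.x; kv < nvFinal; kv += gridDim.x) { ... }
      //
      // so each block owns one proto-vertex at a time while its threads cooperate on the tracks
      // attached to it; the launch below only needs to provide enough blocks, it does not have
      // to match the number of vertices exactly.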
- splitVerticesKernel<<>>(soa, ws_d.get(), maxChi2ForSplit); + splitVerticesKernel<<>>(soa, ws_d.view(), maxChi2ForSplit); cudaCheck(cudaGetLastError()); - fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get(), maxChi2ForFinalFit); + fitVerticesKernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view(), maxChi2ForFinalFit); cudaCheck(cudaGetLastError()); - sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.get()); + sortByPt2Kernel<<<1, maxThreadsForPrint, 0, stream>>>(soa, ws_d.view()); } cudaCheck(cudaGetLastError()); #else // __CUDACC__ if (useDensity_) { - clusterTracksByDensity(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksByDensity(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useDBSCAN_) { - clusterTracksDBSCAN(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksDBSCAN(soa, ws_d.view(), minT, eps, errmax, chi2max); } else if (useIterative_) { - clusterTracksIterative(soa, ws_d.get(), minT, eps, errmax, chi2max); + clusterTracksIterative(soa, ws_d.view(), minT, eps, errmax, chi2max); } #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "found " << (*ws_d).nvIntermediate << " vertices " << std::endl; + std::cout << "found " << ws_d.view().nvIntermediate() << " vertices " << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - fitVertices(soa, ws_d.get(), maxChi2ForFirstFit); + fitVertices(soa, ws_d.view(), maxChi2ForFirstFit); // one block per vertex! - splitVertices(soa, ws_d.get(), maxChi2ForSplit); - fitVertices(soa, ws_d.get(), maxChi2ForFinalFit); - sortByPt2(soa, ws_d.get()); + splitVertices(soa, ws_d.view(), maxChi2ForSplit); + fitVertices(soa, ws_d.view(), maxChi2ForFinalFit); + sortByPt2(soa, ws_d.view()); #endif return vertices; @@ -199,5 +200,4 @@ namespace gpuVertexFinder { template class Producer; template class Producer; - } // namespace gpuVertexFinder diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h index 6128939f6eb87..d5157fec14053 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.h @@ -4,45 +4,29 @@ #include #include -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "PixelVertexWorkSpaceUtilities.h" +#include "PixelVertexWorkSpaceSoAHost.h" +#include "PixelVertexWorkSpaceSoADevice.h" namespace gpuVertexFinder { - using ZVertices = ZVertexSoA; - // workspace used in the vertex reco algos - struct WorkSpace { - static constexpr uint32_t MAXTRACKS = ZVertexSoA::MAXTRACKS; - static constexpr uint32_t MAXVTX = ZVertexSoA::MAXVTX; + using VtxSoAView = zVertex::ZVertexSoAView; + using WsSoAView = gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAView; - uint32_t ntrks; // number of "selected tracks" - uint32_t itrk[MAXTRACKS]; // index of original track - float zt[MAXTRACKS]; // input track z at bs - float ezt2[MAXTRACKS]; // input error^2 on the above - float ptt2[MAXTRACKS]; // input pt^2 on the above - uint8_t izt[MAXTRACKS]; // interized z-position of input tracks - int32_t iv[MAXTRACKS]; // vertex index for each associated track - - uint32_t 
nvIntermediate; // the number of vertices after splitting pruning etc. - - __host__ __device__ void init() { - ntrks = 0; - nvIntermediate = 0; - } - }; - - __global__ void init(ZVertexSoA* pdata, WorkSpace* pws) { - pdata->init(); - pws->init(); + __global__ void init(VtxSoAView pdata, WsSoAView pws) { + zVertex::utilities::init(pdata); + gpuVertexFinder::workSpace::utilities::init(pws); } template class Producer { - public: - using ZVertices = ZVertexSoA; - using WorkSpace = gpuVertexFinder::WorkSpace; - using TkSoA = pixelTrack::TrackSoAT; + using TkSoAConstView = TrackSoAConstView; + public: Producer(bool oneKernel, bool useDensity, bool useDBSCAN, @@ -63,8 +47,8 @@ namespace gpuVertexFinder { ~Producer() = default; - ZVertexHeterogeneous makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const; - ZVertexHeterogeneous make(TkSoA const* tksoa, float ptMin, float ptMax) const; + ZVertexSoADevice makeAsync(cudaStream_t stream, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; + ZVertexSoAHost make(const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; private: const bool oneKernel_; diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..ff3048c03f6a4 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,17 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/allocate_device.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +// PixelTrackUtilities only included in order to compile SoALayout with Eigen columns +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousDevice.h" + +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceUtilities.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHost.h" +#include "RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoADevice.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -23,22 +34,22 @@ #ifdef ONE_KERNEL #ifdef __CUDACC__ -__global__ void vertexFinderOneKernel(gpuVertexFinder::ZVertices* pdata, - gpuVertexFinder::WorkSpace* pws, +__global__ void vertexFinderOneKernel(gpuVertexFinder::VtxSoAView pdata, + gpuVertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + gpuVertexFinder::clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); __syncthreads(); - fitVertices(pdata, pws, 50.); + gpuVertexFinder::fitVertices(pdata, pws, 50.); __syncthreads(); - splitVertices(pdata, pws, 9.f); + gpuVertexFinder::splitVertices(pdata, pws, 9.f); __syncthreads(); - fitVertices(pdata, pws, 5000.); + gpuVertexFinder::fitVertices(pdata, pws, 5000.); __syncthreads(); 
-    sortByPt2(pdata, pws);
+    gpuVertexFinder::sortByPt2(pdata, pws);
   }
 #endif
 #endif
@@ -101,25 +112,23 @@ struct ClusterGenerator {
   std::exponential_distribution ptGen;
 };
-// a macro SORRY
-#define LOC_ONGPU(M) ((char*)(onGPU_d.get()) + offsetof(gpuVertexFinder::ZVertices, M))
-#define LOC_WS(M) ((char*)(ws_d.get()) + offsetof(gpuVertexFinder::WorkSpace, M))
-
-__global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder::WorkSpace const* pws) {
-  auto const& __restrict__ data = *pdata;
-  auto const& __restrict__ ws = *pws;
-  printf("nt,nv %d %d,%d\n", ws.ntrks, data.nvFinal, ws.nvIntermediate);
+__global__ void print(gpuVertexFinder::VtxSoAView pdata, gpuVertexFinder::WsSoAView pws) {
+  auto& __restrict__ ws = pws;
+  printf("nt,nv %d %d,%d\n", ws.ntrks(), pdata.nvFinal(), ws.nvIntermediate());
 }
 int main() {
 #ifdef __CUDACC__
+  cudaStream_t stream;
   cms::cudatest::requireDevices();
+  cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
-  auto onGPU_d = cms::cuda::make_device_unique(1, nullptr);
-  auto ws_d = cms::cuda::make_device_unique(1, nullptr);
+  ZVertexSoADevice onGPU_d(stream);
+  gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoADevice ws_d(stream);
 #else
-  auto onGPU_d = std::make_unique();
-  auto ws_d = std::make_unique();
+
+  ZVertexSoAHost onGPU_d;
+  gpuVertexFinder::workSpace::PixelVertexWorkSpaceSoAHost ws_d;
 #endif
   Event ev;
@@ -135,24 +144,26 @@ int main() {
     gen(ev);
 #ifdef __CUDACC__
-    init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+    gpuVertexFinder::init<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
 #else
-    onGPU_d->init();
-    ws_d->init();
+    gpuVertexFinder::init(onGPU_d.view(), ws_d.view());
 #endif
     std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl;
     auto nt = ev.ztrack.size();
 #ifdef __CUDACC__
-    cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
-    cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+    cudaCheck(cudaMemcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice));
+    cudaCheck(
+        cudaMemcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice));
+    cudaCheck(
+        cudaMemcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
+    cudaCheck(
+        cudaMemcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice));
 #else
-    ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t));
-    ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
-    ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
-    ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
+    ::memcpy(&ws_d.view().ntrks(), &nt, sizeof(uint32_t));
+    ::memcpy(ws_d.view().zt(), ev.ztrack.data(), sizeof(float) * ev.ztrack.size());
+    ::memcpy(ws_d.view().ezt2(), ev.eztrack.data(), sizeof(float) * ev.eztrack.size());
+    ::memcpy(ws_d.view().ptt2(), ev.pttrack.data(), sizeof(float) * ev.eztrack.size());
 #endif
     std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl;
@@ -168,30 +179,30 @@ int main() {
       uint32_t nv = 0;
 #ifdef __CUDACC__
-      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
 #ifdef ONE_KERNEL
-      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
 #else
-      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
+      cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
 #endif
-      print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get());
+      print<<<1, 1, 0, stream>>>(onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
       cudaDeviceSynchronize();
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f);
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      print(onGPU_d.get(), ws_d.get());
-      CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]);
-      print(onGPU_d.get(), ws_d.get());
-      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
-      nv = onGPU_d->nvFinal;
+      print(onGPU_d.view(), ws_d.view());
+      CLUSTERIZE(onGPU_d.view(), ws_d.view(), kk, par[0], par[1], par[2]);
+      print(onGPU_d.view(), ws_d.view());
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
+      nv = onGPU_d.view().nvFinal();
 #endif
       if (nv == 0) {
@@ -221,18 +232,18 @@ int main() {
       nn = hnn;
       ind = hind;
 #else
-      zv = onGPU_d->zv;
-      wv = onGPU_d->wv;
-      ptv2 = onGPU_d->ptv2;
-      nn = onGPU_d->ndof;
-      ind = onGPU_d->sortInd;
+      zv = onGPU_d.view().zv();
+      wv = onGPU_d.view().wv();
+      ptv2 = onGPU_d.view().ptv2();
+      nn = onGPU_d.view().ndof();
+      ind = onGPU_d.view().sortInd();
 #endif
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
       for (auto j = 0U; j < nv; ++j)
@@ -244,14 +255,14 @@ int main() {
       }
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f);
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 50.f);
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 50.f);
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 50.f);
+      nv = onGPU_d.view().nvFinal();
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
       for (auto j = 0U; j < nv; ++j)
@@ -264,26 +275,26 @@ int main() {
 #ifdef __CUDACC__
       // one vertex per block!!!
-      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
-      cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.view(), ws_d.view(), 9.f);
+      cudaCheck(cudaMemcpy(&nv, &ws_d.view().nvIntermediate(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
-      nv = ws_d->nvIntermediate;
+      gpuVertexFinder::splitVertices(onGPU_d.view(), ws_d.view(), 9.f);
+      nv = ws_d.view().nvIntermediate();
 #endif
       std::cout << "after split " << nv << std::endl;
 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.view(), ws_d.view(), 5000.f);
       cudaCheck(cudaGetLastError());
-      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.view(), ws_d.view());
       cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(&nv, &onGPU_d.view().nvFinal(), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
-      sortByPt2(onGPU_d.get(), ws_d.get());
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+      gpuVertexFinder::fitVertices(onGPU_d.view(), ws_d.view(), 5000.f);
+      gpuVertexFinder::sortByPt2(onGPU_d.view(), ws_d.view());
+      nv = onGPU_d.view().nvFinal();
+      memcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float));
 #endif
       if (nv == 0) {
@@ -292,12 +303,12 @@ int main() {
       }
 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(zv, onGPU_d.view().zv(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(wv, onGPU_d.view().wv(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(chi2, onGPU_d.view().chi2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ptv2, onGPU_d.view().ptv2(), nv * sizeof(float), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(nn, onGPU_d.view().ndof(), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+      cudaCheck(cudaMemcpy(ind, onGPU_d.view().sortInd(), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif
       for (auto j = 0U; j < nv; ++j)
         if (nn[j] > 0)
diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
index c11b53538c5b0..4637bac6fa580 100644
--- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
+++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducer.cc
@@ -45,12 +45,13 @@
#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h" #include "DataFormats/GeometrySurface/interface/Plane.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" -#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "MagneticField/Records/interface/IdealMagneticFieldRecord.h" #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h" -#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoA.h" -#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h" + +#include "CUDADataFormats/Track/interface/PixelTrackUtilities.h" +#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h" +#include "CUDADataFormats/Vertex/interface/ZVertexUtilities.h" +#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" namespace L2TauTagNNv1 { constexpr int nCellEta = 5; @@ -145,10 +146,9 @@ struct L2TauNNProducerCacheData { }; class L2TauNNProducer : public edm::stream::EDProducer> { - using TrackSoA = pixelTrack::TrackSoAT; - using PixelTrackHeterogeneous = PixelTrackHeterogeneousT; - public: + using TrackSoAHost = pixelTrack::TrackSoAHostPhase1; + struct caloRecHitCollections { const HBHERecHitCollection* hbhe; const HORecHitCollection* ho; @@ -182,16 +182,17 @@ class L2TauNNProducer : public edm::stream::EDProducer& allTaus, - const TrackSoA& patatracks_tsoa, - const ZVertexSoA& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, + const ZVertexSoAHost& patavtx_soa, const reco::BeamSpot& beamspot, const MagneticField* magfi); - void selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const TrackSoA& patatracks_tsoa, + void selectGoodTracksAndVertices(const ZVertexSoAHost& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood); + std::pair impactParameter(int it, - const TrackSoA& patatracks_tsoa, + const TrackSoAHost& patatracks_tsoa, float patatrackPhi, const reco::BeamSpot& beamspot, const MagneticField* magfi); @@ -210,8 +211,8 @@ class L2TauNNProducer : public edm::stream::EDProducer eeToken_; const edm::ESGetToken geometryToken_; const edm::ESGetToken bFieldToken_; - const edm::EDGetTokenT pataVerticesToken_; - const edm::EDGetTokenT pataTracksToken_; + const edm::EDGetTokenT pataVerticesToken_; + const edm::EDGetTokenT pataTracksToken_; const edm::EDGetTokenT beamSpotToken_; const unsigned int maxVtx_; const float fractionSumPt2_; @@ -295,7 +296,7 @@ L2TauNNProducer::L2TauNNProducer(const edm::ParameterSet& cfg, const L2TauNNProd eeToken_(consumes(cfg.getParameter("eeInput"))), geometryToken_(esConsumes()), bFieldToken_(esConsumes()), - pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), + pataVerticesToken_(consumes(cfg.getParameter("pataVertices"))), pataTracksToken_(consumes(cfg.getParameter("pataTracks"))), beamSpotToken_(consumes(cfg.getParameter("BeamSpot"))), maxVtx_(cfg.getParameter("maxVtx")), @@ -572,32 +573,33 @@ void L2TauNNProducer::fillCaloRecHits(tensorflow::Tensor& cellGridMatrix, } } -void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa, - const TrackSoA& patatracks_tsoa, +void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoAHost& patavtx_soa, + const TrackSoAHost& patatracks_tsoa, std::vector& trkGood, std::vector& vtxGood) { - const auto maxTracks = patatracks_tsoa.stride(); - const int nv = patavtx_soa.nvFinal; + using patatrackHelpers = TracksUtilities; + const auto maxTracks = 
+  const int nv = patavtx_soa.view().nvFinal();
   trkGood.clear();
   trkGood.reserve(maxTracks);
   vtxGood.clear();
   vtxGood.reserve(nv);
-  auto const* quality = patatracks_tsoa.qualityData();
+  auto const* quality = patatracks_tsoa.view().quality();
   // No need to sort either as the algorithm is just using the max (not even the location, just the max value of pt2sum).
   std::vector pTSquaredSum(nv, 0);
   std::vector nTrkAssociated(nv, 0);
   for (int32_t trk_idx = 0; trk_idx < maxTracks; ++trk_idx) {
-    auto nHits = patatracks_tsoa.nHits(trk_idx);
+    auto nHits = patatrackHelpers::nHits(patatracks_tsoa.view(), trk_idx);
     if (nHits == 0) {
       break;
     }
-    int vtx_ass_to_track = patavtx_soa.idv[trk_idx];
+    int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv();
    if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) {
-      auto patatrackPt = patatracks_tsoa.pt[trk_idx];
+      auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt();
      ++nTrkAssociated[vtx_ass_to_track];
-      if (patatrackPt >= trackPtMin_ && patatracks_tsoa.chi2(trk_idx) <= trackChi2Max_) {
+      if (patatrackPt >= trackPtMin_ && patatracks_tsoa.const_view()[trk_idx].chi2() <= trackChi2Max_) {
        patatrackPt = std::min(patatrackPt, trackPtMax_);
        pTSquaredSum[vtx_ass_to_track] += patatrackPt * patatrackPt;
      }
@@ -609,7 +611,7 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
   if (nv > 0) {
     const auto minFOM_fromFrac = (*std::max_element(pTSquaredSum.begin(), pTSquaredSum.end())) * fractionSumPt2_;
     for (int j = nv - 1; j >= 0 && vtxGood.size() < maxVtx_; --j) {
-      auto vtx_idx = patavtx_soa.sortInd[j];
+      auto vtx_idx = patavtx_soa.view()[j].sortInd();
      assert(vtx_idx < nv);
      if (nTrkAssociated[vtx_idx] >= 2 && pTSquaredSum[vtx_idx] >= minFOM_fromFrac &&
          pTSquaredSum[vtx_idx] > minSumPt2_) {
@@ -620,15 +622,14 @@ void L2TauNNProducer::selectGoodTracksAndVertices(const ZVertexSoA& patavtx_soa,
 }
 std::pair L2TauNNProducer::impactParameter(int it,
-                                           const TrackSoA& patatracks_tsoa,
+                                           const TrackSoAHost& patatracks_tsoa,
                                            float patatrackPhi,
                                            const reco::BeamSpot& beamspot,
                                            const MagneticField* magfi) {
-  auto const& fit = patatracks_tsoa.stateAtBS;
   /* dxy and dz */
   riemannFit::Vector5d ipar, opar;
   riemannFit::Matrix5d icov, ocov;
-  fit.copyToDense(ipar, icov, it);
+  TracksUtilities::copyToDense(patatracks_tsoa.view(), ipar, icov, it);
   riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
   LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);
   float sp = std::sin(patatrackPhi);
@@ -653,11 +654,12 @@ std::pair L2TauNNProducer::impactParameter(int it,
 void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
                                      const std::vector& allTaus,
-                                     const TrackSoA& patatracks_tsoa,
-                                     const ZVertexSoA& patavtx_soa,
+                                     const TrackSoAHost& patatracks_tsoa,
+                                     const ZVertexSoAHost& patavtx_soa,
                                      const reco::BeamSpot& beamspot,
                                      const MagneticField* magfi) {
   using NNInputs = L2TauTagNNv1::NNInputs;
+  using patatrackHelpers = TracksUtilities;
   float deta, dphi;
   int eta_idx = 0;
   int phi_idx = 0;
@@ -678,19 +680,19 @@ void L2TauNNProducer::fillPatatracks(tensorflow::Tensor& cellGridMatrix,
     const float tauPhi = allTaus[tau_idx]->phi();
     for (const auto it : trkGood) {
-      const float patatrackPt = patatracks_tsoa.pt[it];
+      const float patatrackPt = patatracks_tsoa.const_view()[it].pt();
      if (patatrackPt <= 0)
        continue;
-      const float patatrackPhi = patatracks_tsoa.phi(it);
-      const float patatrackEta = patatracks_tsoa.eta(it);
-      const float patatrackCharge = patatracks_tsoa.charge(it);
-      const float patatrackChi2OverNdof = patatracks_tsoa.chi2(it);
-      const auto nHits = patatracks_tsoa.nHits(it);
+      const float patatrackPhi = patatrackHelpers::phi(patatracks_tsoa.const_view(), it);
+      const float patatrackEta = patatracks_tsoa.const_view()[it].eta();
+      const float patatrackCharge = patatrackHelpers::charge(patatracks_tsoa.const_view(), it);
+      const float patatrackChi2OverNdof = patatracks_tsoa.view()[it].chi2();
+      const auto nHits = patatrackHelpers::nHits(patatracks_tsoa.const_view(), it);
      if (nHits <= 0)
        continue;
      const int patatrackNdof = 2 * std::min(6, nHits) - 5;
-      const int vtx_idx_assTrk = patavtx_soa.idv[it];
+      const int vtx_idx_assTrk = patavtx_soa.view()[it].idv();
      if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) {
        std::tie(deta, dphi, eta_idx, phi_idx) =
            getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4());
@@ -766,8 +768,8 @@ void L2TauNNProducer::produce(edm::Event& event, const edm::EventSetup& eventset
   const auto eeCal = event.getHandle(eeToken_);
   const auto hbhe = event.getHandle(hbheToken_);
   const auto ho = event.getHandle(hoToken_);
-  const auto& patatracks_SoA = *event.get(pataTracksToken_);
-  const auto& vertices_SoA = *event.get(pataVerticesToken_);
+  auto const& patatracks_SoA = event.get(pataTracksToken_);
+  auto const& vertices_SoA = event.get(pataVerticesToken_);
   const auto bsHandle = event.getHandle(beamSpotToken_);
   auto const fieldESH = eventsetup.getHandle(bFieldToken_);
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
index 9023640f62d5a..8225885068cef 100644
--- a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -1,4 +1,4 @@
-#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousHost.h"
 #include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/GeometrySurface/interface/Plane.h"
 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
@@ -46,7 +46,7 @@ class SeedProducerFromSoAT : public edm::global::EDProducer<> {
   // Event data tokens
   const edm::EDGetTokenT tBeamSpot_;
-  const edm::EDGetTokenT> tokenTrack_;
+  const edm::EDGetTokenT> tokenTrack_;
   // Event setup tokens
   const edm::ESGetToken idealMagneticFieldToken_;
   const edm::ESGetToken trackerDigiGeometryToken_;
@@ -84,6 +84,8 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID,
   // std::cout << "Converting gpu helix to trajectory seed" << std::endl;
   auto result = std::make_unique();
+  using trackHelper = TracksUtilities;
+
   auto const& fieldESH = iSetup.getHandle(idealMagneticFieldToken_);
   auto const& tracker = iSetup.getHandle(trackerDigiGeometryToken_);
   auto const& dus = tracker->detUnits();
@@ -95,16 +97,15 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID,
   // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
   GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
-  const auto& tsoa = *(iEvent.get(tokenTrack_));
+  auto const& tsoa = iEvent.get(tokenTrack_);
-  auto const* quality = tsoa.qualityData();
-  auto const& fit = tsoa.stateAtBS;
-  auto const& detIndices = tsoa.detIndices;
-  auto maxTracks = tsoa.stride();
+  auto const* quality = tsoa.view().quality();
+  auto const& detIndices = tsoa.view().detIndices();
+  auto maxTracks = tsoa.view().metadata().size();
   int32_t nt = 0;
   for (int32_t it = 0; it < maxTracks; ++it) {
-    auto nHits = tsoa.nHits(it);
+    auto nHits = trackHelper::nHits(tsoa.view(), it);
     if (nHits == 0)
       break;  // this is a guard: maybe we need to move to nTracks...
@@ -126,11 +127,11 @@ void SeedProducerFromSoAT::produce(edm::StreamID streamID,
       // mind: these values are with respect to the beamspot!
-      float phi = tsoa.phi(it);
+      float phi = trackHelper::phi(tsoa.view(), it);
       riemannFit::Vector5d ipar, opar;
       riemannFit::Matrix5d icov, ocov;
-      fit.copyToDense(ipar, icov, it);
+      trackHelper::copyToDense(tsoa.view(), ipar, icov, it);
       riemannFit::transformToPerigeePlane(ipar, icov, opar, ocov);
       LocalTrajectoryParameters lpar(opar(0), opar(1), opar(2), opar(3), opar(4), 1.);