From 0444702d2d884e853c30d9ad7e207e8dfb737ecd Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Fri, 5 Jul 2019 11:59:12 +0200
Subject: [PATCH] Port the whole pixel workflow to new heterogeneous framework
 (cms-patatrack#384)

  - port the whole pixel workflow to new heterogeneous framework
  - implement a legacy cluster to SoA converter for the pixel RecHits
  - update the vertex producer to run on CPU as well as GPU
---
 CUDADataFormats/Track/BuildFile.xml           |  10 +
 .../Track/interface/PixelTrackHeterogeneous.h |  79 ++
 .../Track/interface/TrajectoryStateSoA.h      |  65 ++
 CUDADataFormats/Track/src/classes.h           |  10 +
 CUDADataFormats/Track/src/classes_def.xml     |   8 +
 CUDADataFormats/Track/test/BuildFile.xml      |  13 +
 .../Track/test/TrajectoryStateSOA_t.cpp       |   1 +
 .../Track/test/TrajectoryStateSOA_t.cu        |   1 +
 .../Track/test/TrajectoryStateSOA_t.h         |  77 ++
 .../customizePixelTracksForProfiling.py       |  55 +-
 .../PixelTrackFitting/interface/FitUtils.h    |  37 +-
 .../PixelTrackFitting/plugins/BuildFile.xml   |   1 +
 .../plugins/PixelTrackDumpCUDA.cc             |  94 ++
 .../plugins/PixelTrackProducerFromSoA.cc      | 214 ++++
 .../plugins/PixelTrackSoAFromCUDA.cc          |  95 ++
 .../PixelTrackFitting/plugins/storeTracks.h   |  10 +-
 .../python/PixelTracks_cff.py                 |  24 +-
 .../test/testEigenJacobian.cpp                |   2 +-
 .../plugins/BrokenLineFitOnGPU.cu             |  69 +-
 .../PixelTriplets/plugins/BuildFile.xml       |   1 +
 .../PixelTriplets/plugins/CAHitNtupletCUDA.cc |  82 ++
 .../plugins/CAHitNtupletGeneratorKernels.cu   | 933 ++++++++++++++++++
 .../plugins/CAHitNtupletGeneratorKernels.h    | 186 ++++
 .../CAHitNtupletGeneratorKernelsAlloc.cu      |  40 +
 .../plugins/CAHitNtupletGeneratorOnGPU.cc     | 164 +++
 .../plugins/CAHitNtupletGeneratorOnGPU.h      |  73 ++
 .../PixelTriplets/plugins/GPUCACell.h         |  12 +-
 .../PixelTriplets/plugins/HelixFitOnGPU.cc    |  10 +-
 .../PixelTriplets/plugins/HelixFitOnGPU.h     |  23 +-
 .../PixelTriplets/plugins/RiemannFitOnGPU.cu  |  67 +-
 .../plugins/SeedProducerFromSoA.cc            | 177 ++++
 31 files changed, 2495 insertions(+), 138 deletions(-)
 create mode 100644 CUDADataFormats/Track/BuildFile.xml
 create mode 100644 CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
 create mode 100644 CUDADataFormats/Track/interface/TrajectoryStateSoA.h
 create mode 100644 CUDADataFormats/Track/src/classes.h
 create mode 100644 CUDADataFormats/Track/src/classes_def.xml
 create mode 100644 CUDADataFormats/Track/test/BuildFile.xml
 create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
 create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
 create mode 100644 CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
 create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
 create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
 create mode 100644 RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
 create mode 100644 RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
diff --git a/CUDADataFormats/Track/BuildFile.xml b/CUDADataFormats/Track/BuildFile.xml
new file mode 100644
index 0000000000000..521ea8fe29753
--- /dev/null
+++ b/CUDADataFormats/Track/BuildFile.xml
@@ -0,0 +1,10 @@
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+<use name="FWCore/ServiceRegistry"/>
+<use name="FWCore/ParameterSetReader"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="HeterogeneousCore/CUDAUtilities"/>
+<use name="eigen"/>
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
new file mode 100644
index 0000000000000..a576604b6e935
--- /dev/null
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -0,0 +1,79 @@
+#ifndef CUDADataFormatsTrackTrackHeterogeneous_H
+#define CUDADataFormatsTrackTrackHeterogeneous_H
+
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
+
+#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+
+namespace trackQuality {
+  enum Quality : uint8_t { bad=0, dup, loose, strict, tight, highPurity };
+}
+
+template <int32_t S>
+class TrackSoAT {
+public:
+
+  static constexpr int32_t stride() { return S; }
+
+  using Quality = trackQuality::Quality;
+  using hindex_type = uint16_t;
+  using HitContainer = OneToManyAssoc<hindex_type, S, 5 * S>;
+
+  // Always check quality is at least loose!
+  // CUDA does not support enums  in __lgc ...
+  eigenSoA::ScalarSoA<uint8_t, S> m_quality;
+  constexpr Quality quality(int32_t i) const { return (Quality)(m_quality(i));}
+  constexpr Quality & quality(int32_t i) { return (Quality&)(m_quality(i));}
+  constexpr Quality const * qualityData() const { return (Quality const *)(m_quality.data());}
+  constexpr Quality * qualityData() { return (Quality*)(m_quality.data());}
+
+
+  // this is chi2/ndof as not necessarely all hits are used in the fit  
+  eigenSoA::ScalarSoA<float, S> chi2;
+
+  constexpr int nHits(int i) const { return detIndices.size(i);}
+
+  // State at the Beam spot
+  // phi,tip,1/pt,cotan(theta),zip
+  TrajectoryStateSoA<S> stateAtBS;
+  eigenSoA::ScalarSoA<float, S> eta;
+  eigenSoA::ScalarSoA<float, S> pt;
+  constexpr float charge(int32_t i) const { return std::copysign(1.f,stateAtBS.state(i)(2)); }
+  constexpr float phi(int32_t i) const { return stateAtBS.state(i)(0); }
+  constexpr float tip(int32_t i) const { return stateAtBS.state(i)(1); }
+  constexpr float zip(int32_t i) const { return stateAtBS.state(i)(4); }
+
+  // state at the detector of the outermost hit
+  // representation to be decided...
+  // not yet filled on GPU
+  // TrajectoryStateSoA<S> stateAtOuterDet;
+
+  HitContainer hitIndices;
+  HitContainer detIndices;
+  
+  // total number of tracks (including those not fitted)
+  uint32_t m_nTracks;
+
+};
+
+namespace pixelTrack{
+
+#ifdef GPU_SMALL_EVENTS
+  constexpr uint32_t maxNumber() { return 2 * 1024;}
+#else
+  constexpr uint32_t maxNumber() { return 32 * 1024;}
+#endif
+
+  using TrackSoA = TrackSoAT<maxNumber()>;
+  using TrajectoryState = TrajectoryStateSoA<maxNumber()>;
+  using HitContainer = TrackSoA::HitContainer;
+  using Quality = trackQuality::Quality;
+
+}
+
+using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+
+
+#endif // CUDADataFormatsTrackTrackSoA_H
+
diff --git a/CUDADataFormats/Track/interface/TrajectoryStateSoA.h b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h
new file mode 100644
index 0000000000000..a6553ff96cb0b
--- /dev/null
+++ b/CUDADataFormats/Track/interface/TrajectoryStateSoA.h
@@ -0,0 +1,65 @@
+#ifndef CUDADataFormatsTrackTrajectoryStateSOA_H
+#define CUDADataFormatsTrackTrajectoryStateSOA_H
+
+#include <Eigen/Dense>
+#include "HeterogeneousCore/CUDAUtilities/interface/eigenSoA.h"
+
+template <int32_t S>
+struct TrajectoryStateSoA {
+
+  using Vector5f = Eigen::Matrix<float, 5, 1>;
+  using Vector15f = Eigen::Matrix<float, 15, 1>;
+
+  using Vector5d = Eigen::Matrix<double, 5, 1>;
+  using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+
+  static constexpr int32_t stride() { return S; }
+
+  eigenSoA::MatrixSoA<Vector5f,S> state;
+  eigenSoA::MatrixSoA<Vector15f,S> covariance;
+
+
+  template<typename V3, typename M3, typename V2, typename M2>
+  __host__ __device__
+  void copyFromCircle(V3 const & cp, M3 const & ccov, V2 const & lp, M2 const & lcov, float b, int32_t i) {
+     state(i) << cp.template cast<float>(), lp.template cast<float>();
+     state(i)(2) *=b;
+     auto cov =  covariance(i);
+     cov(0) = ccov(0,0);
+     cov(1) = ccov(0,1);
+     cov(2) = b*float(ccov(0,2));
+     cov(4) = cov(3) = 0;
+     cov(5) = ccov(1,1);
+     cov(6) = b*float(ccov(1,2));
+     cov(8) = cov(7) = 0;
+     cov(9) = b*b*float(ccov(2,2));
+     cov(11) = cov(10) = 0;
+     cov(12) = lcov(0,0);
+     cov(13) = lcov(0,1);
+     cov(14) = lcov(1,1);
+  }
+
+
+  template<typename V5, typename M5>
+  __host__ __device__
+  void copyFromDense(V5 const & v, M5 const & cov, int32_t i) {
+     state(i) = v.template cast<float>();
+     for(int j=0, ind=0; j<5; ++j) for (auto k=j;k<5;++k) covariance(i)(ind++) = cov(j,k); 
+  }
+
+  template<typename V5, typename M5>
+  __host__ __device__
+  void copyToDense(V5 & v, M5 & cov, int32_t i) const {
+     v = state(i).template cast<typename V5::Scalar>();
+     for(int j=0, ind=0; j<5; ++j) {
+        cov(j,j) = covariance(i)(ind++); 
+        for (auto k=j+1;k<5;++k) cov(k,j)=cov(j,k) = covariance(i)(ind++);
+     }
+  }
+
+};
+
+#endif // CUDADataFormatsTrackTrajectoryStateSOA_H
+
+
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
new file mode 100644
index 0000000000000..7f89096977e64
--- /dev/null
+++ b/CUDADataFormats/Track/src/classes.h
@@ -0,0 +1,10 @@
+#ifndef CUDADataFormats__src_classes_h
+#define CUDADataFormats__src_classes_h
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/Common/interface/ArrayShadow.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif  
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
new file mode 100644
index 0000000000000..a4c2e766582dd
--- /dev/null
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -0,0 +1,8 @@
+<lcgdict>
+  <class name="CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<HeterogeneousSoA<pixelTrack::TrackSoA>>>" persistent="false"/>
+  <class name="HeterogeneousSoA<pixelTrack::TrackSoA>" persistent="false"/>
+  <class name="edm::Wrapper<HeterogeneousSoA<pixelTrack::TrackSoA>>" persistent="false"/>
+  <class name="ArrayShadow<std::array<unsigned int,2001>>" persistent="false"/>
+  <class name="edm::Wrapper<ArrayShadow<std::array<unsigned int,2001>>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/Track/test/BuildFile.xml b/CUDADataFormats/Track/test/BuildFile.xml
new file mode 100644
index 0000000000000..598b345d4709d
--- /dev/null
+++ b/CUDADataFormats/Track/test/BuildFile.xml
@@ -0,0 +1,13 @@
+<use name="HeterogeneousCore/CUDAUtilities"/>
+
+<bin file="TrajectoryStateSOA_t.cpp" name="cpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
+<bin file="TrajectoryStateSOA_t.cu" name="gpuTrajectoryStateSOA_t">
+  <use name="eigen"/>
+  <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
+  <flags CXXFLAGS="-g -DGPU_DEBUG"/>
+</bin>
+
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
new file mode 100644
index 0000000000000..d6ff539a642b0
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cpp
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
new file mode 100644
index 0000000000000..d6ff539a642b0
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.cu
@@ -0,0 +1 @@
+#include "TrajectoryStateSOA_t.h"
diff --git a/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
new file mode 100644
index 0000000000000..adefb57d7bbe5
--- /dev/null
+++ b/CUDADataFormats/Track/test/TrajectoryStateSOA_t.h
@@ -0,0 +1,77 @@
+#include "CUDADataFormats/Track/interface/TrajectoryStateSoA.h"
+
+using Vector5d = Eigen::Matrix<double, 5, 1>;
+using Matrix5d = Eigen::Matrix<double, 5, 5>;
+
+__host__ __device__
+Matrix5d loadCov(Vector5d const & e) {
+  Matrix5d cov;
+  for (int i=0; i<5; ++i) cov(i,i) = e(i)*e(i);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < i; ++j) {
+      double v = 0.3*std::sqrt( cov(i,i) * cov(j,j) ); // this makes the matrix pos defined
+      cov(i,j) = (i+j)%2 ? -0.4*v  : 0.1*v;
+      cov(j,i) = cov(i,j);
+    }
+   }
+  return cov;
+}
+
+
+using TS = TrajectoryStateSoA<128>;
+
+__global__ void testTSSoA(TS * pts, int n) {
+
+  assert(n<=128);
+
+  Vector5d par0; par0 << 0.2,0.1,3.5,0.8,0.1;
+  Vector5d e0; e0 << 0.01,0.01,0.035,-0.03,-0.01;
+  auto cov0 = loadCov(e0);
+
+  TS & ts = *pts;
+
+  int first = threadIdx.x + blockIdx.x * blockDim.x;
+
+  for (int i = first; i < n; i += blockDim.x * gridDim.x) {
+    ts.copyFromDense(par0,cov0,i);
+    Vector5d par1; Matrix5d cov1;
+    ts.copyToDense(par1,cov1,i);
+    Vector5d delV = par1-par0;
+    Matrix5d delM = cov1-cov0;
+    for(int j=0; j<5; ++j) {
+      assert(std::abs(delV(j))<1.e-5);
+      for (auto k=j;k<5;++k) {
+        assert(cov0(k,j)==cov0(j,k));
+        assert(cov1(k,j)==cov1(j,k));
+        assert(std::abs(delM(k,j))<1.e-5);
+      }
+    }
+
+  }
+}
+
+#ifdef __CUDACC__
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#endif
+
+int main() {
+#ifdef __CUDACC__
+  exitSansCUDADevices();
+#endif
+
+
+  TS ts;
+
+#ifdef __CUDACC__
+  TS * ts_d;
+  cudaCheck(cudaMalloc(&ts_d, sizeof(TS)));
+  testTSSoA<<<1, 64>>>(ts_d,128);
+  cudaCheck(cudaGetLastError());
+  cudaCheck(cudaMemcpy(&ts, ts_d, sizeof(TS), cudaMemcpyDefault));
+  cudaCheck(cudaDeviceSynchronize());
+#else
+  testTSSoA(&ts,128);
+#endif
+
+}
diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py
index 1021918c0ce6c..ce97de6650244 100644
--- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py
+++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py
@@ -1,6 +1,41 @@
 import FWCore.ParameterSet.Config as cms
 
-def customizePixelTracksForProfiling(process):
+def customizePixelTracksForProfilingGPUOnly(process):
+    process.MessageLogger.cerr.FwkReport.reportEvery = 100
+
+    process.Raw2Hit = cms.Path(process.offlineBeamSpot+process.offlineBeamSpotCUDA+process.siPixelClustersCUDAPreSplitting+process.siPixelRecHitsCUDAPreSplitting)
+
+    process.load('RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi')
+    process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexCUDA_cfi')
+    process.TVreco = cms.Path(process.caHitNtupletCUDA+process.pixelVertexCUDA)
+
+    process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco)
+    return process
+
+def customizePixelTracksForProfilingSoAonCPU(process):
+    process = customizePixelTracksForProfilingGPUOnly(process)
+
+    process.pixelVertexSoA = process.pixelVertexCUDA.clone()
+    process.pixelVertexSoA.onGPU = False
+    process.pixelVertexSoA.pixelTrackSrc = 'pixelTrackSoA'
+    process.TVSoAreco = cms.Path(process.caHitNtupletCUDA+process.pixelTrackSoA+process.pixelVertexSoA)
+
+    process.schedule = cms.Schedule(process.Raw2Hit, process.TVSoAreco)
+
+    return process
+
+def customizePixelTracksForProfilingEnableTransfer(process):
+    process = customizePixelTracksForProfilingGPUOnly(process)
+
+    process.load('RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi')
+    process.load('RecoPixelVertexing.PixelVertexFinding.pixelVertexSoA_cfi')
+    process.toSoA = cms.Path(process.pixelTrackSoA+process.pixelVertexSoA)
+
+    process.schedule = cms.Schedule(process.Raw2Hit, process.TVreco, process.toSoA)
+    return process
+
+def customizePixelTracksForProfilingEnableConversion(process):
+    # use old trick of output path
     process.MessageLogger.cerr.FwkReport.reportEvery = 100
 
     process.out = cms.OutputModule("AsciiOutputModule",
@@ -17,21 +52,3 @@ def customizePixelTracksForProfiling(process):
 
     return process
 
-def customizePixelTracksForProfilingDisableConversion(process):
-    process = customizePixelTracksForProfiling(process)
-
-    # Disable conversions to legacy
-    process.pixelTracksHitQuadruplets.gpuEnableConversion = False
-    process.pixelTracks.gpuEnableConversion = False
-    process.pixelVertices.gpuEnableConversion = False
-
-    return process
-
-def customizePixelTracksForProfilingDisableTransfer(process):
-    process = customizePixelTracksForProfilingDisableConversion(process)
-
-    # Disable "unnecessary" transfers to CPU
-    process.pixelTracksHitQuadruplets.gpuEnableTransfer = False
-    process.pixelVertices.gpuEnableTransfer = False
-
-    return process
diff --git a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
index e92c46f654615..e44a58f676106 100644
--- a/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
+++ b/RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h
@@ -189,18 +189,43 @@ namespace Rfit {
     circle.par = par_pak;
   }
 
+  /*!
+    \brief Transform circle parameter from (X0,Y0,R) to (phi,Tip,q/R) and
+    consequently covariance matrix.
+    \param circle_uvr parameter (X0,Y0,R), covariance matrix to
+    be transformed and particle charge.
+  */
+  __host__ __device__ inline void fromCircleToPerigee(circle_fit& circle) {
+    Vector3d par_pak;
+    const double temp0 = circle.par.head(2).squaredNorm();
+    const double temp1 = sqrt(temp0);
+    par_pak << atan2(circle.q * circle.par(0), -circle.q * circle.par(1)), circle.q * (temp1 - circle.par(2)),
+           circle.q/circle.par(2);
+    
+      const double temp2 = sqr(circle.par(0)) * 1. / temp0;
+      const double temp3 = 1. / temp1 * circle.q;
+      Matrix3d J4;
+      J4 << -circle.par(1) * temp2 * 1. / sqr(circle.par(0)), temp2 * 1. / circle.par(0), 0., circle.par(0) * temp3,
+          circle.par(1) * temp3, -circle.q, 0., 0., -circle.q/(circle.par(2)*circle.par(2));
+      circle.cov = J4 * circle.cov * J4.transpose();
+    
+    circle.par = par_pak;
+  }
+
+
+
   // transformation between the "perigee" to cmssw localcoord frame
   // the plane of the latter is the perigee plane...
-  // from   //!<(phi,Tip,pt,cotan(theta)),Zip)
+  // from   //!<(phi,Tip,q/pt,cotan(theta)),Zip)
   // to q/p,dx/dz,dy/dz,x,z
-  template<typename V5, typename M5>
-  __host__ __device__ inline void transformToPerigeePlane(V5 const & ip, M5 const & icov, V5 & op, M5 & ocov, double charge) {
+  template<typename VI5, typename MI5, typename VO5, typename MO5>
+  __host__ __device__ inline void transformToPerigeePlane(VI5 const & ip, MI5 const & icov, VO5 & op, MO5 & ocov) {
 
     auto sinTheta2 = 1./(1.+ip(3)*ip(3));
     auto sinTheta = std::sqrt(sinTheta2);
     auto cosTheta = ip(3)*sinTheta;
 
-    op(0) = charge*sinTheta/ip(2);
+    op(0) = sinTheta*ip(2);
     op(1) = 0.;
     op(2) = -ip(3);
     op(3) = ip(1);
@@ -208,8 +233,8 @@ namespace Rfit {
 
     Matrix5d J = Matrix5d::Zero();
 
-    J(0,2) = -charge*sinTheta/(ip(2)*ip(2));
-    J(0,3) = -charge*sinTheta2*cosTheta/ip(2);
+    J(0,2) = sinTheta;
+    J(0,3) = -sinTheta2*cosTheta*ip(2);
     J(1,0) = 1.;
     J(2,3) = -1.;
     J(3,1) = 1.;
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml
index 62a8e8541aa64..8c0261ee0d999 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/BuildFile.xml
@@ -2,6 +2,7 @@
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/Producer"/>
 <use name="HeterogeneousCore/Product"/>
+<use name="CUDADataFormats/Track"/>
 <use name="RecoPixelVertexing/PixelTrackFitting"/>
 <library file="*.cc" name="RecoPixelVertexingPixelTrackFittingPlugins">
   <flags EDM_PLUGIN="1"/>
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
new file mode 100644
index 0000000000000..025e7abd99cf1
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackDumpCUDA.cc
@@ -0,0 +1,94 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDAnalyzer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+#include "CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+
+
+class PixelTrackDumpCUDA : public edm::global::EDAnalyzer<> {
+public:
+  explicit PixelTrackDumpCUDA(const edm::ParameterSet& iConfig);
+  ~PixelTrackDumpCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const override;
+  const bool m_onGPU;
+  edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenGPUTrack_;
+  edm::EDGetTokenT<CUDAProduct<ZVertexHeterogeneous>> tokenGPUVertex_;
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenSoATrack_;
+  edm::EDGetTokenT<ZVertexHeterogeneous> tokenSoAVertex_;
+
+
+};
+
+PixelTrackDumpCUDA::PixelTrackDumpCUDA(const edm::ParameterSet& iConfig) :
+  m_onGPU(iConfig.getParameter<bool>("onGPU")) {
+  if (m_onGPU) {
+    tokenGPUTrack_ = consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenGPUVertex_ = consumes<CUDAProduct<ZVertexHeterogeneous>>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+  } else {
+    tokenSoATrack_ = consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelTrackSrc"));
+    tokenSoAVertex_ = consumes<ZVertexHeterogeneous>(iConfig.getParameter<edm::InputTag>("pixelVertexSrc"));
+  }
+}
+
+void PixelTrackDumpCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+
+   desc.add<bool>("onGPU",true);
+   desc.add<edm::InputTag>("pixelTrackSrc", edm::InputTag("caHitNtupletCUDA"));
+   desc.add<edm::InputTag>("pixelVertexSrc", edm::InputTag("pixelVertexCUDA"));
+   descriptions.add("pixelTrackDumpCUDA", desc);
+}
+
+void PixelTrackDumpCUDA::analyze(edm::StreamID streamID, edm::Event const & iEvent, const edm::EventSetup& iSetup) const {
+  if (m_onGPU) {
+
+    auto const & hTracks = iEvent.get(tokenGPUTrack_);
+    CUDAScopedContextProduce ctx{hTracks};
+
+    auto const& tracks = ctx.get(hTracks);
+    auto const * tsoa = tracks.get();
+    assert(tsoa);
+
+    auto const& vertices = ctx.get(iEvent.get(tokenGPUVertex_));
+    auto const * vsoa = vertices.get();
+    assert(vsoa);
+
+  } else {
+    auto const * tsoa = iEvent.get(tokenSoATrack_).get();
+    assert(tsoa);
+
+    auto const * vsoa = iEvent.get(tokenSoAVertex_).get();
+    assert(vsoa);
+
+  }
+
+}
+
+
+DEFINE_FWK_MODULE(PixelTrackDumpCUDA);
+
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
new file mode 100644
index 0000000000000..284bcfc2ebb51
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackProducerFromSoA.cc
@@ -0,0 +1,214 @@
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/Common/interface/OrphanHandle.h"
+#include "DataFormats/TrackReco/interface/Track.h"
+#include "DataFormats/TrackReco/interface/TrackExtra.h"
+#include "DataFormats/TrackReco/interface/TrackFwd.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "DataFormats/GeometrySurface/interface/Plane.h"
+#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
+#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+
+#include "storeTracks.h"
+#include "CUDADataFormats/Common/interface/ArrayShadow.h"
+
+
+/**
+ * This class creates "leagcy"  reco::Track
+ * objects from the output of GPU CA. 
+ */
+class PixelTrackProducerFromSoA : public edm::global::EDProducer<> {
+public:
+
+  using IndToEdm = std::vector<uint16_t>;
+
+  explicit PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig);
+  ~PixelTrackProducerFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+  using HitModuleStart = std::array<uint32_t,gpuClustering::MaxNumModules + 1>;
+  using HMSstorage = ArrayShadow<HitModuleStart>;
+
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+  edm::EDGetTokenT<SiPixelRecHitCollectionNew> cpuHits_;
+  edm::EDGetTokenT<HMSstorage> hmsToken_;
+
+  int32_t minNumberOfHits_;
+};
+
+PixelTrackProducerFromSoA::PixelTrackProducerFromSoA(const edm::ParameterSet &iConfig) :
+      tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("trackSrc"))),
+      cpuHits_(consumes<SiPixelRecHitCollectionNew>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
+      hmsToken_(consumes<HMSstorage>(iConfig.getParameter<edm::InputTag>("pixelRecHitLegacySrc"))),
+      minNumberOfHits_(iConfig.getParameter<int>("minNumberOfHits"))
+{
+    produces<reco::TrackCollection>();
+    produces<TrackingRecHitCollection>();
+    produces<reco::TrackExtraCollection>();
+    produces<IndToEdm>();
+}
+
+void PixelTrackProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("trackSrc", edm::InputTag("pixelTrackSoA"));
+  desc.add<edm::InputTag>("pixelRecHitLegacySrc", edm::InputTag("siPixelRecHitsLegacyPreSplitting"));
+  desc.add<int>("minNumberOfHits", 0);
+
+  descriptions.addWithDefaultLabel(desc);
+}
+
+
+void PixelTrackProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  // std::cout << "Converting gpu helix in reco tracks" << std::endl;
+
+  auto indToEdmP = std::make_unique<IndToEdm>();
+  auto & indToEdm = *indToEdmP;
+
+  edm::ESHandle<MagneticField> fieldESH;
+  iSetup.get<IdealMagneticFieldRecord>().get(fieldESH);
+
+  pixeltrackfitting::TracksWithRecHits tracks;
+  edm::ESHandle<TrackerTopology> httopo;
+  iSetup.get<TrackerTopologyRcd>().get(httopo);
+
+
+  edm::Handle<reco::BeamSpot> bsHandle;
+  iEvent.getByToken(tBeamSpot_, bsHandle);
+  const auto &bsh = *bsHandle;
+  // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
+  GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
+
+  edm::Handle<HMSstorage> hhms;
+  iEvent.getByToken(hmsToken_,hhms);
+  auto const & hitsModuleStart = *hhms;
+
+  auto fc = hitsModuleStart.data;
+
+  edm::Handle<SiPixelRecHitCollectionNew> gh;
+  iEvent.getByToken(cpuHits_, gh);
+  auto const &rechits = *gh;
+  std::vector<TrackingRecHit const*> hitmap;
+  auto const &rcs = rechits.data();
+  auto nhits = rcs.size();
+  hitmap.resize(nhits,nullptr);
+  for (auto const &h : rcs) {
+    auto const &thit = static_cast<BaseTrackerRecHit const &>(h);
+    auto detI = thit.det()->index();
+    auto const &clus = thit.firstClusterRef();
+    assert(clus.isPixel());
+    auto i = fc[detI] + clus.pixelCluster().originalId();
+    assert(i < nhits);
+    assert(nullptr==hitmap[i]);
+    hitmap[i] = &h;
+  }
+
+  std::vector<const TrackingRecHit *> hits;
+  hits.reserve(5);
+
+  const auto & tsoa = *iEvent.get(tokenTrack_);
+
+  auto const * quality = tsoa.qualityData();
+  auto const & fit = tsoa.stateAtBS;
+  auto const & hitIndices = tsoa.hitIndices; 
+  auto maxTracks =tsoa.stride();
+
+  int32_t nt = 0;
+ 
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
+    indToEdm.push_back(-1);
+    auto q = quality[it];
+    if (q != trackQuality::loose)
+      continue;                           // FIXME
+    if (nHits< minNumberOfHits_) continue;
+    indToEdm.back() = nt;
+    ++nt;
+
+    hits.resize(nHits);
+    auto b = hitIndices.begin(it);
+    for (int iHit = 0; iHit < nHits; ++iHit)
+      hits[iHit] =  hitmap[*(b+iHit)];
+
+    // mind: this values are respect the beamspot!
+
+    float chi2 = tsoa.chi2(it);
+    float phi = tsoa.phi(it);
+
+    Rfit::Vector5d ipar,opar;
+    Rfit::Matrix5d icov,ocov;
+    fit.copyToDense(ipar,icov,it);
+    Rfit::transformToPerigeePlane(ipar,icov,opar,ocov);
+
+    LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.);
+    AlgebraicSymMatrix55 m;
+    for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j);
+
+    float sp = std::sin(phi);
+    float cp = std::cos(phi);
+    Surface::RotationType rot(
+                              sp, -cp,    0,
+                               0,   0, -1.f,
+                              cp,  sp,    0);
+
+    Plane impPointPlane(bs,rot);
+    GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()), 
+                                  impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product());
+    JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product());
+
+    AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m);
+
+    int ndof = 2*hits.size()-5;
+    chi2 = chi2*ndof; // FIXME
+    GlobalPoint vv = gp.position();
+    math::XYZPoint  pos( vv.x(), vv.y(), vv.z() );
+    GlobalVector pp = gp.momentum();
+    math::XYZVector mom( pp.x(), pp.y(), pp.z() );
+
+    auto track =  std::make_unique<reco::Track> ( chi2, ndof, pos, mom,
+                  gp.charge(), CurvilinearTrajectoryError(mo));
+    // filter???
+    tracks.emplace_back(track.release(), hits);
+  }
+  // std::cout << "processed " << nt << " good tuples " << tracks.size() << "out of " << indToEdm.size() << std::endl;
+
+  // store tracks
+  storeTracks(iEvent, tracks, *httopo);
+  iEvent.put(std::move(indToEdmP));
+}
+
+
+DEFINE_FWK_MODULE(PixelTrackProducerFromSoA);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
new file mode 100644
index 0000000000000..c8dc04633f832
--- /dev/null
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc
@@ -0,0 +1,95 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/Framework/interface/stream/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+
+
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+class PixelTrackSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
+public:
+  explicit PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig);
+  ~PixelTrackSoAFromCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void acquire(edm::Event const& iEvent,
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
+  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
+
+
+  edm::EDGetTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenCUDA_;
+  edm::EDPutTokenT<PixelTrackHeterogeneous> tokenSOA_;
+
+  cudautils::host::unique_ptr<pixelTrack::TrackSoA> m_soa;
+
+};
+
+PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) :
+  tokenCUDA_(consumes<CUDAProduct<PixelTrackHeterogeneous>>(iConfig.getParameter<edm::InputTag>("src"))),
+  tokenSOA_(produces<PixelTrackHeterogeneous>())
+{}
+
+
+void PixelTrackSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+
+  edm::ParameterSetDescription desc;
+
+   desc.add<edm::InputTag>("src", edm::InputTag("caHitNtupletCUDA"));
+   descriptions.add("pixelTrackSoA", desc);
+
+}
+
+
+void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent,
+               edm::EventSetup const& iSetup,
+               edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+  CUDAProduct<PixelTrackHeterogeneous> const& inputDataWrapped = iEvent.get(tokenCUDA_);
+  CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
+  auto const& inputData = ctx.get(inputDataWrapped);
+
+  m_soa = inputData.toHostAsync(ctx.stream());
+
+}
+
+void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
+
+  /*
+  auto const & tsoa = *m_soa;
+  auto maxTracks = tsoa.stride();
+  std::cout << "size of SoA" << sizeof(tsoa) << " stride " << maxTracks << std::endl;
+
+  int32_t nt = 0;
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    assert(nHits==int(tsoa.hitIndices.size(it)));
+    if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
+    nt++;
+  }
+  std::cout << "found " << nt << " tracks in cpu SoA at " << &tsoa << std::endl;
+  */
+
+  // DO NOT  make a copy  (actually TWO....)
+  iEvent.emplace(tokenSOA_,PixelTrackHeterogeneous(std::move(m_soa)));
+
+  assert(!m_soa);
+}
+
+
+DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA);
diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h
index 48abab5237587..13bdee8164780 100644
--- a/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h
+++ b/RecoPixelVertexing/PixelTrackFitting/plugins/storeTracks.h
@@ -16,8 +16,8 @@
 #include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
 #include "Geometry/Records/interface/TrackerTopologyRcd.h"
 
-template<typename Ev>
-void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHits, const TrackerTopology& ttopo)
+template<typename Ev, typename TWH>
+void storeTracks(Ev & ev, const TWH& tracksWithHits, const TrackerTopology& ttopo)
 {
   auto tracks = std::make_unique<reco::TrackCollection>();
   auto recHits = std::make_unique<TrackingRecHitCollection>();
@@ -27,12 +27,12 @@ void storeTracks(Ev & ev, const pixeltrackfitting::TracksWithTTRHs& tracksWithHi
 
   for (int i = 0; i < nTracks; i++)
   {
-    reco::Track* track =  tracksWithHits.at(i).first;
-    const SeedingHitSet& hits = tracksWithHits.at(i).second;
+    reco::Track* track =  tracksWithHits[i].first;
+    const auto & hits = tracksWithHits[i].second;
 
     for (unsigned int k = 0; k < hits.size(); k++)
     {
-      TrackingRecHit *hit = hits[k]->hit()->clone();
+      auto * hit = hits[k]->clone();
 
       track->appendHitPattern(*hit, ttopo);
       recHits->push_back(hit);
diff --git a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
index ef6d5d16fb329..ab7738826b1c2 100644
--- a/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
+++ b/RecoPixelVertexing/PixelTrackFitting/python/PixelTracks_cff.py
@@ -51,11 +51,6 @@
     SeedComparitorPSet = dict(clusterShapeCacheSrc = 'siPixelClusterShapeCachePreSplitting')
 )
 
-from Configuration.ProcessModifiers.gpu_cff import gpu
-from RecoPixelVertexing.PixelTriplets.caHitQuadrupletHeterogeneousEDProducer_cfi import caHitQuadrupletHeterogeneousEDProducer as _caHitQuadrupletHeterogeneousEDProducer
-gpu.toReplaceWith(pixelTracksHitQuadruplets, _caHitQuadrupletHeterogeneousEDProducer)
-gpu.toModify(pixelTracksHitQuadruplets, trackingRegions = "pixelTracksTrackingRegions")
-
 # for trackingLowPU
 pixelTracksHitTriplets = _pixelTripletHLTEDProducer.clone(
     doublets = "pixelTracksHitDoublets",
@@ -70,10 +65,6 @@
 )
 trackingLowPU.toModify(pixelTracks, SeedingHitSets = "pixelTracksHitTriplets")
 
-from Configuration.ProcessModifiers.gpu_cff import gpu
-from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromCUDA_cfi import pixelTrackProducerFromCUDA as _pixelTrackProducerFromCUDA
-gpu.toReplaceWith(pixelTracks, _pixelTrackProducerFromCUDA)
-
 pixelTracksTask = cms.Task(
     pixelTracksTrackingRegions,
     pixelFitterByHelixProjections,
@@ -94,4 +85,19 @@
 _pixelTracksTask_ntupleFit.replace(pixelFitterByHelixProjections, pixelNtupletsFitter)
 ntupleFit.toReplaceWith(pixelTracksTask, _pixelTracksTask_ntupleFit)
 
+
+from Configuration.ProcessModifiers.gpu_cff import gpu
+from RecoPixelVertexing.PixelTriplets.caHitNtupletCUDA_cfi import caHitNtupletCUDA
+from RecoPixelVertexing.PixelTrackFitting.pixelTrackSoA_cfi import pixelTrackSoA
+from RecoPixelVertexing.PixelTrackFitting.pixelTrackProducerFromSoA_cfi import pixelTrackProducerFromSoA as _pixelTrackFromSoA
+_pixelTracksGPUTask = cms.Task(
+  caHitNtupletCUDA,
+  pixelTrackSoA,
+  pixelTracks # FromSoA
+)
+
+gpu.toReplaceWith(pixelTracksTask, _pixelTracksGPUTask)
+gpu.toReplaceWith(pixelTracks,_pixelTrackFromSoA)
+
+
 pixelTracksSequence = cms.Sequence(pixelTracksTask)
diff --git a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp
index dc12de88001cd..d294e4cc6c1d6 100644
--- a/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp
+++ b/RecoPixelVertexing/PixelTrackFitting/test/testEigenJacobian.cpp
@@ -85,7 +85,7 @@ int main() {
 
   // Matrix5d covf = transfFast(cov0,par0);
 
-  Rfit::transformToPerigeePlane(par0,cov0,par1,cov1,charge);
+  Rfit::transformToPerigeePlane(par0,cov0,par1,cov1);
 
   std::cout << "cov1\n" << cov1 << std::endl;
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
index cb8f151233385..bdfb835a02f33 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu
@@ -17,14 +17,15 @@
 #include "HelixFitOnGPU.h"
 
 using HitsOnGPU = TrackingRecHit2DSOAView;
-using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
+using Tuples = pixelTrack::HitContainer;
+using OutputSoA = pixelTrack::TrackSoA;
 
 using namespace Eigen;
 
 // #define BL_DUMP_HITS
 
 template <int N>
-__global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets,
+__global__ void kernelBLFastFit(Tuples const *__restrict__ foundNtuplets,
                                 CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
                                 HitsOnGPU const *__restrict__ hhp,
                                 double *__restrict__ phits,
@@ -55,10 +56,10 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found
     return;
 
   // get it from the ntuple container (one to one to helix)
-  auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start);
-  assert(helix_start < foundNtuplets->nbins());
+  auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start);
+  assert(tkid < foundNtuplets->nbins());
 
-  assert(foundNtuplets->size(helix_start) == nHits);
+  assert(foundNtuplets->size(tkid) == nHits);
 
   Rfit::Map3xNd<N> hits(phits + local_start);
   Rfit::Map4d fast_fit(pfast_fit + local_start);
@@ -68,11 +69,11 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found
   __shared__ int done;
   done = 0;
   __syncthreads();
-  bool dump = (foundNtuplets->size(helix_start) == 5 && 0 == atomicAdd(&done, 1));
+  bool dump = (foundNtuplets->size(tkid) == 5 && 0 == atomicAdd(&done, 1));
 #endif
 
   // Prepare data structure
-  auto const *hitId = foundNtuplets->begin(helix_start);
+  auto const *hitId = foundNtuplets->begin(tkid);
   for (unsigned int i = 0; i < hitsInFit; ++i) {
     auto hit = hitId[i];
     float ge[6];
@@ -80,14 +81,14 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found
 #ifdef BL_DUMP_HITS
     if (dump) {
       printf("Hit global: %d: %d hits.col(%d) << %f,%f,%f\n",
-             helix_start,
+             tkid,
              hhp->detectorIndex(hit),
              i,
              hhp->xGlobal(hit),
              hhp->yGlobal(hit),
              hhp->zGlobal(hit));
       printf("Error: %d: %d  hits_ge.col(%d) << %e,%e,%e,%e,%e,%e\n",
-             helix_start,
+             tkid,
              hhp->detetectorIndex(hit),
              i,
              ge[0],
@@ -113,7 +114,7 @@ __global__ void kernelBLFastFit(TuplesOnGPU::Container const *__restrict__ found
 template <int N>
 __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
                             double B,
-                            Rfit::helix_fit *results,
+                            OutputSoA *results,
                             double *__restrict__ phits,
                             float *__restrict__ phits_ge,
                             double *__restrict__ pfast_fit,
@@ -133,7 +134,7 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t
     return;
 
   // get it for the ntuple container (one to one to helix)
-  auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start);
+  auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start);
 
   Rfit::Map3xNd<N> hits(phits + local_start);
   Rfit::Map4d fast_fit(pfast_fit + local_start);
@@ -148,41 +149,31 @@ __global__ void kernelBLFit(CAConstants::TupleMultiplicity const *__restrict__ t
   BrokenLine::prepareBrokenLineData(hits, fast_fit, B, data);
   BrokenLine::BL_Line_fit(hits_ge, fast_fit, B, data, line);
   BrokenLine::BL_Circle_fit(hits, hits_ge, fast_fit, B, data, circle);
-  Jacob << 1, 0, 0, 0, 1, 0, 0, 0, -B / std::copysign(Rfit::sqr(circle.par(2)), circle.par(2));
-  circle.par(2) = B / std::abs(circle.par(2));
-  circle.cov = Jacob * circle.cov * Jacob.transpose();
 
-  // Grab helix_fit from the proper location in the output vector
-  auto &helix = results[helix_start];
-  helix.par << circle.par, line.par;
-
-  helix.cov = Rfit::Matrix5d::Zero();
-  helix.cov.block(0, 0, 3, 3) = circle.cov;
-  helix.cov.block(3, 3, 2, 2) = line.cov;
-
-  helix.q = circle.q;
-  helix.chi2_circle = circle.chi2;
-  helix.chi2_line = line.chi2;
+  results->stateAtBS.copyFromCircle(circle.par,circle.cov,line.par,line.cov,1.f/float(B),tkid);
+  results->pt(tkid) =  float(B)/float(std::abs(circle.par(2)));
+  results->eta(tkid) =  asinhf(line.par(0));
+  results->chi2(tkid) = (circle.chi2+line.chi2)/(2*N-5);
 
 #ifdef BROKENLINE_DEBUG
   if (!(circle.chi2 >= 0) || !(line.chi2 >= 0))
-    printf("kernelBLFit failed! %f/%f\n", helix.chi2_circle, helix.chi2_line);
+    printf("kernelBLFit failed! %f/%f\n", circle.chi2, line.chi2);
   printf("kernelBLFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",
          N,
          nHits,
-         helix_start,
+         tkid,
          circle.par(0),
          circle.par(1),
          circle.par(2));
-  printf("kernelBLHits line.par(0,1): %d %f,%f\n", helix_start, line.par(0), line.par(1));
+  printf("kernelBLHits line.par(0,1): %d %f,%f\n", tkid, line.par(0), line.par(1));
   printf("kernelBLHits chi2 cov %f/%f  %e,%e,%e,%e,%e\n",
-         helix.chi2_circle,
-         helix.chi2_line,
-         helix.cov(0, 0),
-         helix.cov(1, 1),
-         helix.cov(2, 2),
-         helix.cov(3, 3),
-         helix.cov(4, 4));
+         circle.chi2,
+         line.chi2,
+         circle.cov(0, 0),
+         circle.cov(1, 1),
+         circle.cov(2, 2),
+         line.cov(0, 0),
+         line.cov(1, 1));
 #endif
 }
 
@@ -218,7 +209,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh,
 
     kernelBLFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                   bField_,
-                                                                  helix_fit_results_d,
+                                                                  outputSoa_d,
                                                                   hitsGPU_.get(),
                                                                   hits_geGPU_.get(),
                                                                   fast_fit_resultsGPU_.get(),
@@ -239,7 +230,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh,
 
     kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                   bField_,
-                                                                  helix_fit_results_d,
+                                                                  outputSoa_d,
                                                                   hitsGPU_.get(),
                                                                   hits_geGPU_.get(),
                                                                   fast_fit_resultsGPU_.get(),
@@ -261,7 +252,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh,
 
       kernelBLFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                     bField_,
-                                                                    helix_fit_results_d,
+                                                                    outputSoa_d,
                                                                     hitsGPU_.get(),
                                                                     hits_geGPU_.get(),
                                                                     fast_fit_resultsGPU_.get(),
@@ -282,7 +273,7 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsOnCPU const &hh,
 
       kernelBLFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                     bField_,
-                                                                    helix_fit_results_d,
+                                                                    outputSoa_d,
                                                                     hitsGPU_.get(),
                                                                     hits_geGPU_.get(),
                                                                     fast_fit_resultsGPU_.get(),
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
index 341a108348337..6d15cc6883098 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
+++ b/RecoPixelVertexing/PixelTriplets/plugins/BuildFile.xml
@@ -12,6 +12,7 @@
 <use name="RecoPixelVertexing/PixelTriplets"/>
 <use name="RecoTracker/TkSeedingLayers"/>
 <use name="RecoTracker/TkTrackingRegions"/>
+<use name="CUDADataFormats/Track"/>
 
 <library file="*.cu *.cc" name="RecoPixelVertexingPixelTripletsPlugins">
   <flags EDM_PLUGIN="1"/>
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
new file mode 100644
index 0000000000000..ba8a3e1052e7b
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletCUDA.cc
@@ -0,0 +1,82 @@
+#include <cuda_runtime.h>
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "FWCore/Utilities/interface/RunningAverage.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "RecoTracker/TkMSParametrization/interface/PixelRecoUtilities.h"
+
+#include "CAHitNtupletGeneratorOnGPU.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+
+
+class CAHitNtupletCUDA : public edm::global::EDProducer<> {
+public:
+  explicit CAHitNtupletCUDA(const edm::ParameterSet& iConfig);
+  ~CAHitNtupletCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  edm::EDGetTokenT<CUDAProduct<TrackingRecHit2DCUDA>> tokenHit_;
+  edm::EDPutTokenT<CUDAProduct<PixelTrackHeterogeneous>> tokenTrack_;
+
+  CAHitNtupletGeneratorOnGPU gpuAlgo_;
+
+};
+
+CAHitNtupletCUDA::CAHitNtupletCUDA(const edm::ParameterSet& iConfig) :
+      tokenHit_(consumes<CUDAProduct<TrackingRecHit2DCUDA>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
+      tokenTrack_(produces<CUDAProduct<PixelTrackHeterogeneous>>()),
+      gpuAlgo_(iConfig, consumesCollector()) {}
+
+
+void CAHitNtupletCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+
+  desc.add<edm::InputTag>("pixelRecHitSrc", edm::InputTag("siPixelRecHitsCUDAPreSplitting"));
+
+  CAHitNtupletGeneratorOnGPU::fillDescriptions(desc);
+  auto label = "caHitNtupletCUDA";
+  descriptions.add(label, desc);
+}
+
+void CAHitNtupletCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& es) const {
+
+  edm::Handle<CUDAProduct<TrackingRecHit2DCUDA>>  hHits;
+  iEvent.getByToken(tokenHit_, hHits);
+
+  CUDAScopedContextProduce ctx{*hHits};
+  auto const& hits = ctx.get(*hHits);
+
+  auto bf = 1./PixelRecoUtilities::fieldInInvGev(es);
+
+  ctx.emplace(
+      iEvent,
+      tokenTrack_,
+      std::move(gpuAlgo_.makeTuplesAsync(hits, bf, ctx.stream()))
+   );
+
+}
+
+
+
+DEFINE_FWK_MODULE(CAHitNtupletCUDA);
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
new file mode 100644
index 0000000000000..cedef59f78f91
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu
@@ -0,0 +1,933 @@
+//
+// Original Author: Felice Pantaleo, CERN
+//
+
+// #define NTUPLE_DEBUG
+
+#include <cmath>
+#include <cstdint>
+
+#include <cuda_runtime.h>
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
+#include "CAConstants.h"
+#include "CAHitNtupletGeneratorKernels.h"
+#include "GPUCACell.h"
+#include "gpuFishbone.h"
+#include "gpuPixelDoublets.h"
+
+using namespace gpuPixelDoublets;
+
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DCUDA;
+
+  using HitToTuple = CAConstants::HitToTuple;
+  using TupleMultiplicity = CAConstants::TupleMultiplicity;
+
+  using Quality = pixelTrack::Quality;
+  using TkSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+
+__global__ void kernel_checkOverflows(HitContainer const * foundNtuplets,
+                                      CAConstants::TupleMultiplicity * tupleMultiplicity,
+                                      AtomicPairCounter *apc,
+                                      GPUCACell const *__restrict__ cells,
+                                      uint32_t const *__restrict__ nCells,
+                                      CellNeighborsVector const *cellNeighbors,
+                                      CellTracksVector const *cellTracks,
+                                      GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell,
+                                      uint32_t nHits,
+                                      CAHitNtupletGeneratorKernels::Counters *counters) {
+  auto idx = threadIdx.x + blockIdx.x * blockDim.x;
+
+  auto &c = *counters;
+  // counters once per event
+  if (0 == idx) {
+    atomicAdd(&c.nEvents, 1);
+    atomicAdd(&c.nHits, nHits);
+    atomicAdd(&c.nCells, *nCells);
+    atomicAdd(&c.nTuples, apc->get().m);
+    atomicAdd(&c.nFitTracks,tupleMultiplicity->size());
+  }
+
+#ifdef NTUPLE_DEBUG
+  if (0 == idx) {
+    printf("number of found cells %d, found tuples %d with total hits %d out of %d\n",
+           *nCells,
+           apc->get().m,
+           apc->get().n,
+           nHits);
+    if (apc->get().m < CAConstants::maxNumberOfQuadruplets()) {
+      assert(foundNtuplets->size(apc->get().m) == 0);
+      assert(foundNtuplets->size() == apc->get().n);
+    }
+  }
+
+  if (idx < foundNtuplets->nbins()) {
+    if (foundNtuplets->size(idx) > 5)
+      printf("ERROR %d, %d\n", idx, foundNtuplets->size(idx));
+    assert(foundNtuplets->size(idx) < 6);
+    for (auto ih = foundNtuplets->begin(idx); ih != foundNtuplets->end(idx); ++ih)
+      assert(*ih < nHits);
+  }
+#endif
+
+  if (0 == idx) {
+    if (apc->get().m >= CAConstants::maxNumberOfQuadruplets())
+      printf("Tuples overflow\n");
+    if (*nCells >= CAConstants::maxNumberOfDoublets())
+      printf("Cells overflow\n");
+  }
+
+  if (idx < (*nCells)) {
+    auto &thisCell = cells[idx];
+    if (thisCell.outerNeighbors().full())  //++tooManyNeighbors[thisCell.theLayerPairId];
+      printf("OuterNeighbors overflow %d in %d\n", idx, thisCell.theLayerPairId);
+    if (thisCell.tracks().full())  //++tooManyTracks[thisCell.theLayerPairId];
+      printf("Tracks overflow %d in %d\n", idx, thisCell.theLayerPairId);
+    if (thisCell.theDoubletId < 0)
+      atomicAdd(&c.nKilledCells, 1);
+    if (0==thisCell.theUsed)
+      atomicAdd(&c.nEmptyCells, 1);
+    if (thisCell.tracks().empty())
+      atomicAdd(&c.nZeroTrackCells, 1);
+  }
+  if (idx < nHits) {
+    if (isOuterHitOfCell[idx].full())  // ++tooManyOuterHitOfCell;
+      printf("OuterHitOfCell overflow %d\n", idx);
+  }
+}
+
+
+__global__ void kernel_fishboneCleaner(GPUCACell const *cells,
+                                       uint32_t const *__restrict__ nCells,
+                                       Quality *quality) {
+  constexpr auto bad = trackQuality::bad;
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (cellIndex >= (*nCells))
+    return;
+  auto const &thisCell = cells[cellIndex];
+  if (thisCell.theDoubletId >= 0)
+    return;
+
+  for (auto it : thisCell.tracks())
+    quality[it] = bad;
+}
+
+__global__ void kernel_earlyDuplicateRemover(GPUCACell const *cells,
+                                            uint32_t const *__restrict__ nCells,
+                                            HitContainer *foundNtuplets,
+                                            Quality *quality) {
+  // constexpr auto bad = trackQuality::bad;
+  constexpr auto dup = trackQuality::dup;
+  // constexpr auto loose = trackQuality::loose;
+
+  assert(nCells);
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (cellIndex >= (*nCells))
+    return;
+  auto const &thisCell = cells[cellIndex];
+  if (thisCell.theDoubletId < 0)
+    return;
+
+  uint32_t maxNh = 0;
+
+  // find maxNh
+  for (auto it : thisCell.tracks()) {
+    auto nh = foundNtuplets->size(it);
+    maxNh = std::max(nh, maxNh);
+  }
+
+  for (auto it : thisCell.tracks()) {
+    if (foundNtuplets->size(it) != maxNh)
+      quality[it] = dup;  //no race:  simple assignment of the same constant
+  }
+
+}
+
+
+__global__ void kernel_fastDuplicateRemover(GPUCACell const * __restrict__ cells,
+                                            uint32_t const *__restrict__ nCells,
+                                            HitContainer const * __restrict__ foundNtuplets,
+                                            TkSoA * __restrict__ tracks) {
+  constexpr auto bad = trackQuality::bad;
+  constexpr auto dup = trackQuality::dup;
+  constexpr auto loose = trackQuality::loose;
+
+  assert(nCells);
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (cellIndex >= (*nCells))
+    return;
+  auto const &thisCell = cells[cellIndex];
+  if (thisCell.theDoubletId < 0)
+    return;
+
+  float mc = 10000.f;
+  uint16_t im = 60000;
+
+  auto score = [&](auto it) {
+    return std::abs(tracks->tip(it));  // tip
+    // return tracks->chi2(it);  //chi2
+  };
+
+  // find min socre
+  for (auto it : thisCell.tracks()) {
+    if (tracks->quality(it) == loose && score(it) < mc) {
+      mc = score(it);
+      im = it;
+    }
+  }
+  // mark all other duplicates
+  for (auto it : thisCell.tracks()) {
+    if (tracks->quality(it) != bad && it != im)
+        tracks->quality(it) = dup;  //no race:  simple assignment of the same constant
+  }
+}
+
+
+__global__ void kernel_connect(AtomicPairCounter *apc1,
+                               AtomicPairCounter *apc2,  // just to zero them,
+                               GPUCACell::Hits const *__restrict__ hhp,
+                               GPUCACell *cells,
+                               uint32_t const *__restrict__ nCells,
+                               CellNeighborsVector *cellNeighbors,
+                               GPUCACell::OuterHitOfCell const *__restrict__ isOuterHitOfCell,
+                               float hardCurvCut,
+                               float ptmin,
+                               float CAThetaCutBarrel,
+                               float CAThetaCutForward,
+                               float dcaCutInnerTriplet,
+                               float dcaCutOuterTriplet) {
+  auto const &hh = *hhp;
+
+  auto cellIndex = threadIdx.y + blockIdx.y * blockDim.y;
+  auto first = threadIdx.x;
+  auto stride = blockDim.x;
+
+  if (0 == (cellIndex + first)) {
+    (*apc1) = 0;
+    (*apc2) = 0;
+  }  // ready for next kernel
+
+  if (cellIndex >= (*nCells))
+    return;
+  auto & thisCell = cells[cellIndex];
+  //if (thisCell.theDoubletId < 0 || thisCell.theUsed>1)
+  //  return;
+  auto innerHitId = thisCell.get_inner_hit_id();
+  auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
+  auto vi = isOuterHitOfCell[innerHitId].data();
+
+  constexpr uint32_t last_bpix1_detIndex = 96;
+  constexpr uint32_t last_barrel_detIndex = 1184;
+  auto ri = thisCell.get_inner_r(hh);
+  auto zi = thisCell.get_inner_z(hh);
+
+  auto ro = thisCell.get_outer_r(hh);
+  auto zo = thisCell.get_outer_z(hh);
+  auto isBarrel = thisCell.get_inner_detIndex(hh) < last_barrel_detIndex;
+
+  for (auto j = first; j < numberOfPossibleNeighbors; j += stride) {
+    auto otherCell = __ldg(vi + j);
+    auto & oc = cells[otherCell];
+    // if (cells[otherCell].theDoubletId < 0 ||
+    //    cells[otherCell].theUsed>1 )
+    //  continue;
+    auto r1 = oc.get_inner_r(hh);
+    auto z1 = oc.get_inner_z(hh);
+    // auto isBarrel = oc.get_outer_detIndex(hh) < last_barrel_detIndex;
+    bool aligned = GPUCACell::areAlignedRZ(r1,
+                                z1,
+                                ri,
+                                zi,
+                                ro,
+                                zo,
+                                ptmin,
+                                isBarrel ? CAThetaCutBarrel : CAThetaCutForward);  // 2.f*thetaCut); // FIXME tune cuts
+    if(aligned &&
+      thisCell.dcaCut(hh,oc,
+                      oc.get_inner_detIndex(hh) < last_bpix1_detIndex ? dcaCutInnerTriplet : dcaCutOuterTriplet,
+                      hardCurvCut)
+       ) {  // FIXME tune cuts
+      oc.addOuterNeighbor(cellIndex, *cellNeighbors);
+      thisCell.theUsed |= 1;
+      oc.theUsed |= 1;
+    }
+  } // loop on inner cells
+}
+
+__global__ void kernel_find_ntuplets(GPUCACell::Hits const *__restrict__ hhp,
+                                     GPUCACell *__restrict__ cells,
+                                     uint32_t const *nCells,
+                                     CellTracksVector *cellTracks,
+                                     HitContainer *foundNtuplets,
+                                     AtomicPairCounter *apc,
+                                     Quality * __restrict__ quality,
+                                     unsigned int minHitsPerNtuplet) {
+  // recursive: not obvious to widen
+  auto const &hh = *hhp;
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  if (cellIndex >= (*nCells))
+    return;
+  auto &thisCell = cells[cellIndex];
+
+  if (thisCell.theDoubletId < 0)
+    return;
+
+  auto pid = thisCell.theLayerPairId;
+  auto doit = minHitsPerNtuplet>3 ? pid<3 : pid<8 || pid >12;
+  if (doit) {
+    GPUCACell::TmpTuple stack;
+    stack.reset();
+    thisCell.find_ntuplets(hh,
+                           cells,
+                           *cellTracks,
+                           *foundNtuplets,
+                           *apc,
+                           quality,
+                           stack,
+                           minHitsPerNtuplet, 
+                           pid<3);
+    assert(stack.size() == 0);
+    // printf("in %d found quadruplets: %d\n", cellIndex, apc->get());
+  }
+
+}
+
+
+__global__ void kernel_mark_used(GPUCACell::Hits const *__restrict__ hhp,
+                                     GPUCACell *__restrict__ cells,
+                                     uint32_t const *nCells) {
+
+  // auto const &hh = *hhp;
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  if (cellIndex >= (*nCells))
+    return;
+  auto &thisCell = cells[cellIndex];
+  if (!thisCell.tracks().empty())
+    thisCell.theUsed |= 2;
+
+}
+
+
+__global__ void kernel_countMultiplicity(HitContainer const *__restrict__ foundNtuplets,
+                                         Quality const * __restrict__ quality,
+                                         CAConstants::TupleMultiplicity *tupleMultiplicity) {
+  auto it = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (it >= foundNtuplets->nbins())
+    return;
+
+  auto nhits = foundNtuplets->size(it);
+  if (nhits < 3)
+    return;
+  if (quality[it] == trackQuality::dup) return;
+  assert(quality[it] == trackQuality::bad);
+  if (nhits>5) printf("wrong mult %d %d\n",it,nhits);
+  assert(nhits<8);
+  tupleMultiplicity->countDirect(nhits);
+}
+
+
+
+__global__ void kernel_fillMultiplicity(HitContainer const *__restrict__ foundNtuplets,
+                                        Quality const * __restrict__ quality,
+                                        CAConstants::TupleMultiplicity *tupleMultiplicity) {
+  auto it = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (it >= foundNtuplets->nbins())
+    return;
+
+  auto nhits = foundNtuplets->size(it);
+  if (nhits < 3)
+    return;
+  if (quality[it] == trackQuality::dup) return;
+  if (nhits>5) printf("wrong mult %d %d\n",it,nhits);
+  assert(nhits<8);
+  tupleMultiplicity->fillDirect(nhits, it);
+}
+
+
+
+__global__ void kernel_classifyTracks(HitContainer const *__restrict__ tuples,
+                                      TkSoA const * __restrict__ tracks,
+                                      CAHitNtupletGeneratorKernels::QualityCuts cuts,
+                                      Quality *__restrict__ quality) {
+  auto idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= tuples->nbins()) {
+    return;
+  }
+  if (tuples->size(idx) == 0) {
+    return;
+  }
+
+  // if duplicate: not even fit
+  if (quality[idx] == trackQuality::dup) return;
+
+  assert(quality[idx] == trackQuality::bad);
+
+  // mark doublets as bad
+  if (tuples->size(idx) < 3) {
+    return;
+  }
+
+  // if the fit has any invalid parameters, mark it as bad
+  bool isNaN = false;
+  for (int i = 0; i < 5; ++i) {
+    isNaN |= isnan(tracks->stateAtBS.state(idx)(i));
+  }
+  if (isNaN) {
+#ifdef NTUPLE_DEBUG
+    printf("NaN in fit %d size %d chi2 %f\n",
+           idx,
+           tuples->size(idx),
+           tracks->chi2(idx)
+    );
+#endif
+    return;
+  }
+
+  // compute a pT-dependent chi2 cut
+  // default parameters:
+  //   - chi2MaxPt = 10 GeV
+  //   - chi2Coeff = { 0.68177776, 0.74609577, -0.08035491, 0.00315399 }
+  //   - chi2Scale = 30 for broken line fit, 45 for Riemann fit
+  // (see CAHitNtupletGeneratorGPU.cc)
+  float pt = std::min<float>(tracks->pt(idx), cuts.chi2MaxPt);
+  float chi2Cut = cuts.chi2Scale *
+                  (cuts.chi2Coeff[0] + pt * (cuts.chi2Coeff[1] + pt * (cuts.chi2Coeff[2] + pt * cuts.chi2Coeff[3])));
+  // above number were for Quads not normalized so for the time being just multiple by ndof for Quads  (triplets to be understood)
+  if (3.f*tracks->chi2(idx) >= chi2Cut) {
+#ifdef NTUPLE_DEBUG
+    printf("Bad fit %d size %d pt %f eta %f chi2 %f\n",
+           idx,
+           tuples->size(idx), 
+           tracks->pt(idx),
+           tracks->eta(idx),
+           3.f*tracks->chi2(idx)
+    );
+#endif
+    return;
+  }
+
+  // impose "region cuts" based on the fit results (phi, Tip, pt, cotan(theta)), Zip)
+  // default cuts:
+  //   - for triplets:    |Tip| < 0.3 cm, pT > 0.5 GeV, |Zip| < 12.0 cm
+  //   - for quadruplets: |Tip| < 0.5 cm, pT > 0.3 GeV, |Zip| < 12.0 cm
+  // (see CAHitNtupletGeneratorGPU.cc)
+  auto const &region = (tuples->size(idx) > 3) ? cuts.quadruplet : cuts.triplet;
+  bool isOk = (std::abs(tracks->tip(idx)) < region.maxTip) and (tracks->pt(idx) > region.minPt) and
+              (std::abs(tracks->zip(idx)) < region.maxZip);
+
+  if (isOk) {
+    quality[idx] = trackQuality::loose;
+  }
+}
+
+__global__ void kernel_doStatsForTracks(HitContainer const *__restrict__ tuples,
+                                        Quality const *__restrict__ quality,
+                                        CAHitNtupletGeneratorKernels::Counters *counters) {
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (tuples->size(idx) == 0)
+      continue;
+    if (quality[idx] != trackQuality::loose)
+      continue;
+    atomicAdd(&(counters->nGoodTracks), 1);
+  }
+}
+
+
+__global__ void kernel_countHitInTracks(HitContainer const *__restrict__ tuples,
+                                        Quality const *__restrict__ quality,
+                                        CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) {
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (tuples->size(idx) == 0)
+      continue;
+    if (quality[idx] != trackQuality::loose)
+      continue;
+    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
+      hitToTuple->countDirect(*h);
+  }
+}
+
+__global__ void kernel_fillHitInTracks(HitContainer const *__restrict__ tuples,
+                                       Quality const *__restrict__ quality,
+                                       CAHitNtupletGeneratorKernels::HitToTuple *hitToTuple) {
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int idx = first, ntot = tuples->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (tuples->size(idx) == 0)
+      continue;
+    if (quality[idx] != trackQuality::loose)
+      continue;
+    for (auto h = tuples->begin(idx); h != tuples->end(idx); ++h)
+      hitToTuple->fillDirect(*h, idx);
+  }
+}
+
+__global__ void kernel_fillHitDetIndices(HitContainer const *__restrict__ tuples,
+                                         TrackingRecHit2DSOAView const *__restrict__ hhp,
+                                         HitContainer *__restrict__ hitDetIndices) {
+
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  // copy offsets
+  for (int idx = first, ntot = tuples->totbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    hitDetIndices->off[idx] = tuples->off[idx];
+  }
+  // fill hit indices
+  auto const & hh = *hhp;
+  auto nhits = hh.nHits();
+  for (int idx = first, ntot = tuples->size(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    assert(tuples->bins[idx]<nhits);
+    hitDetIndices->bins[idx] = hh.detectorIndex(tuples->bins[idx]);
+  }
+}
+
+void CAHitNtupletGeneratorKernels::fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tracks_d, cudaStream_t cudaStream) {
+  auto blockSize=128;
+  auto numberOfBlocks = (HitContainer::capacity() + blockSize - 1) / blockSize;
+
+  kernel_fillHitDetIndices<<<numberOfBlocks,blockSize,0,cudaStream>>>(&tracks_d->hitIndices, hh.view(), &tracks_d->detIndices);
+  cudaCheck(cudaGetLastError());
+#ifdef GPU_DEBUG
+    cudaDeviceSynchronize();
+    cudaCheck(cudaGetLastError());
+#endif
+}
+
+
+__global__ void kernel_doStatsForHitInTracks(CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ hitToTuple,
+                                             CAHitNtupletGeneratorKernels::Counters *counters) {
+  auto &c = *counters;
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int idx = first, ntot = hitToTuple->nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (hitToTuple->size(idx) == 0)
+      continue;
+    atomicAdd(&c.nUsedHits, 1);
+    if (hitToTuple->size(idx) > 1)
+      atomicAdd(&c.nDupHits, 1);
+  }
+}
+
+
+__global__ void kernel_tripletCleaner(TrackingRecHit2DSOAView const *__restrict__ hhp,
+                                      HitContainer const *__restrict__ ptuples,
+                                      TkSoA const * __restrict__ ptracks,
+                                      Quality *__restrict__ quality,
+                                      CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple) {
+  constexpr auto bad = trackQuality::bad;
+  constexpr auto dup = trackQuality::dup;
+  // constexpr auto loose = trackQuality::loose;
+
+  auto &hitToTuple = *phitToTuple;
+  auto const &foundNtuplets = *ptuples;
+  auto const & tracks = *ptracks;
+
+  //  auto const & hh = *hhp;
+  // auto l1end = hh.hitsLayerStart_d[1];
+
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+
+  for (int idx = first, ntot = hitToTuple.nbins(); idx < ntot; idx += gridDim.x * blockDim.x) {
+    if (hitToTuple.size(idx) < 2)
+      continue;
+
+    float mc = 10000.f;
+    uint16_t im = 60000;
+    uint32_t maxNh = 0;
+
+    // find maxNh
+    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
+      uint32_t nh = foundNtuplets.size(*it);
+      maxNh = std::max(nh, maxNh);
+    }
+    // kill all tracks shorter than maxHn (only triplets???)
+    for (auto it = hitToTuple.begin(idx); it != hitToTuple.end(idx); ++it) {
+      uint32_t nh = foundNtuplets.size(*it);
+      if (maxNh != nh)
+        quality[*it] = dup;
+    }
+
+    if (maxNh > 3)
+      continue;
+    // if (idx>=l1end) continue;  // only for layer 1
+    // for triplets choose best tip!
+    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
+      auto const it = *ip;
+      if (quality[it] != bad && std::abs(tracks.tip(it)) < mc) {
+        mc = std::abs(tracks.tip(it));
+        im = it;
+      }
+    }
+    // mark duplicates
+    for (auto ip = hitToTuple.begin(idx); ip != hitToTuple.end(idx); ++ip) {
+      auto const it = *ip;
+      if (quality[it] != bad && it != im)
+        quality[it] = dup;  //no race:  simple assignment of the same constant
+    }
+  }  // loop over hits
+}
+
+__global__ void kernel_print_found_ntuplets(TrackingRecHit2DSOAView const *__restrict__ hhp,
+                                      HitContainer const *__restrict__ ptuples,
+                                      TkSoA const * __restrict__ ptracks,
+                                      Quality const *__restrict__ quality,
+                                      CAHitNtupletGeneratorKernels::HitToTuple const *__restrict__ phitToTuple,
+                                      uint32_t maxPrint, int iev) {
+  auto const & foundNtuplets = *ptuples;
+  auto const & tracks = *ptracks;
+  int first = blockDim.x * blockIdx.x + threadIdx.x;
+  for (int i = first; i < std::min(maxPrint, foundNtuplets.nbins()); i+=blockDim.x*gridDim.x) {
+    auto nh = foundNtuplets.size(i);
+    if (nh<3) continue;
+    printf("TK: %d %d %d %f %f %f %f %f %f %f %d %d %d %d %d\n",
+           10000*iev+i,
+           int(quality[i]),
+           nh,
+           tracks.charge(i),
+           tracks.pt(i),
+           tracks.eta(i),
+           tracks.phi(i),
+           tracks.tip(i),
+           tracks.zip(i),
+//           asinhf(fit_results[i].par(3)),
+           tracks.chi2(i),
+           *foundNtuplets.begin(i),
+           *(foundNtuplets.begin(i) + 1),
+           *(foundNtuplets.begin(i) + 2),
+           nh>3 ? int(*(foundNtuplets.begin(i) + 3)):-1,
+           nh>4 ? int(*(foundNtuplets.begin(i) + 4)):-1
+          );
+  }
+}
+
+
+void CAHitNtupletGeneratorKernels::launchKernels(
+    HitsOnCPU const &hh,
+    TkSoA * tracks_d,
+    cudaStream_t cudaStream) {
+
+  auto maxNumberOfDoublets_ = CAConstants::maxNumberOfDoublets();
+
+  // these are pointer on GPU!
+  auto * tuples_d = &tracks_d->hitIndices; 
+  auto * quality_d = (Quality*)(&tracks_d->m_quality);  
+
+  auto nhits = hh.nHits();
+  assert(nhits <= pixelGPUConstants::maxNumberOfHits);
+
+  // std::cout << "N hits " << nhits << std::endl;
+  // if (nhits<2) std::cout << "too few hits " << nhits << std::endl;
+
+  //
+  // applying conbinatoric cleaning such as fishbone at this stage is too expensive
+  //
+
+  auto nthTot = 64;
+  auto stride = 4;
+  auto blockSize = nthTot / stride;
+  auto numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize;
+  auto rescale = numberOfBlocks / 65536;
+  blockSize *= (rescale + 1);
+  numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize;
+  assert(numberOfBlocks < 65536);
+  assert(blockSize > 0 && 0 == blockSize % 16);
+  dim3 blks(1, numberOfBlocks, 1);
+  dim3 thrs(stride, blockSize, 1);
+
+  kernel_connect<<<blks, thrs, 0, cudaStream>>>(
+      device_hitTuple_apc_,
+      device_hitToTuple_apc_,  // needed only to be reset, ready for next kernel
+      hh.view(),
+      device_theCells_.get(),
+      device_nCells_,
+      device_theCellNeighbors_,
+      device_isOuterHitOfCell_.get(),
+      m_params.hardCurvCut_,
+      m_params.ptmin_,
+      m_params.CAThetaCutBarrel_,
+      m_params.CAThetaCutForward_,
+      m_params.dcaCutInnerTriplet_,
+      m_params.dcaCutOuterTriplet_);
+  cudaCheck(cudaGetLastError());
+
+
+  if (nhits > 1 && m_params.earlyFishbone_) {
+    auto nthTot = 128;
+    auto stride = 16;
+    auto blockSize = nthTot / stride;
+    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
+    dim3 blks(1, numberOfBlocks, 1);
+    dim3 thrs(stride, blockSize, 1);
+    fishbone<<<blks, thrs, 0, cudaStream>>>(
+        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, false);
+    cudaCheck(cudaGetLastError());
+  }
+
+
+  blockSize = 64;
+  numberOfBlocks = (maxNumberOfDoublets_ + blockSize - 1) / blockSize;
+  kernel_find_ntuplets<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
+                                                                     device_theCells_.get(),
+                                                                     device_nCells_,
+                                                                     device_theCellTracks_,
+                                                                     tuples_d,
+                                                                     device_hitTuple_apc_,
+                                                                     quality_d,
+                                                                     m_params.minHitsPerNtuplet_);
+  cudaCheck(cudaGetLastError());
+
+  if (m_params.doStats_)
+    kernel_mark_used<<<numberOfBlocks, blockSize, 0, cudaStream>>>(hh.view(),
+                                                                   device_theCells_.get(),
+                                                                   device_nCells_);
+  cudaCheck(cudaGetLastError());
+  
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+
+  blockSize = 128;
+  numberOfBlocks = (HitContainer::totbins() + blockSize - 1) / blockSize;
+  cudautils::finalizeBulk<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitTuple_apc_, tuples_d);
+
+  // remove duplicates (tracks that share a doublet)
+  numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize;
+  kernel_earlyDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      device_theCells_.get(), device_nCells_, tuples_d, quality_d);
+  cudaCheck(cudaGetLastError());
+
+  blockSize = 128;
+  numberOfBlocks = (CAConstants::maxTuples() + blockSize - 1) / blockSize;
+  kernel_countMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_tupleMultiplicity_.get());
+  cudautils::launchFinalize(device_tupleMultiplicity_.get(), device_tmws_, cudaStream);
+  kernel_fillMultiplicity<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, device_tupleMultiplicity_.get());
+  cudaCheck(cudaGetLastError());
+
+  if (nhits > 1 && m_params.lateFishbone_) {
+    auto nthTot = 128;
+    auto stride = 16;
+    auto blockSize = nthTot / stride;
+    auto numberOfBlocks = (nhits + blockSize - 1) / blockSize;
+    dim3 blks(1, numberOfBlocks, 1);
+    dim3 thrs(stride, blockSize, 1);
+    fishbone<<<blks, thrs, 0, cudaStream>>>(
+        hh.view(), device_theCells_.get(), device_nCells_, device_isOuterHitOfCell_.get(), nhits, true);
+    cudaCheck(cudaGetLastError());
+  }
+
+  if (m_params.doStats_) {
+    numberOfBlocks = (std::max(nhits, maxNumberOfDoublets_) + blockSize - 1) / blockSize;
+    kernel_checkOverflows<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d,
+                                                                        device_tupleMultiplicity_.get(),
+                                                                        device_hitTuple_apc_,
+                                                                        device_theCells_.get(),
+                                                                        device_nCells_,
+                                                                        device_theCellNeighbors_,
+                                                                        device_theCellTracks_,
+                                                                        device_isOuterHitOfCell_.get(),
+                                                                        nhits,
+                                                                        counters_);
+    cudaCheck(cudaGetLastError());
+  }
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+}
+
+void CAHitNtupletGeneratorKernels::buildDoublets(HitsOnCPU const &hh, cuda::stream_t<> &stream) {
+  auto nhits = hh.nHits();
+
+#ifdef NTUPLE_DEBUG
+  std::cout << "building Doublets out of " << nhits << " Hits" << std::endl;
+#endif
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+  // in principle we can use "nhits" to heuristically dimension the workspace...
+  edm::Service<CUDAService> cs;
+  device_isOuterHitOfCell_ = cs->make_device_unique<GPUCACell::OuterHitOfCell[]>(std::max(1U,nhits), stream);
+  assert(device_isOuterHitOfCell_.get());
+  {
+    int threadsPerBlock = 128;
+    // at least one block!
+    int blocks = ( std::max(1U,nhits) + threadsPerBlock - 1) / threadsPerBlock;
+    gpuPixelDoublets::initDoublets<<<blocks, threadsPerBlock, 0, stream.id()>>>(device_isOuterHitOfCell_.get(),
+                                                                                nhits,
+                                                                                device_theCellNeighbors_,
+                                                                                device_theCellNeighborsContainer_.get(),
+                                                                                device_theCellTracks_,
+                                                                                device_theCellTracksContainer_.get());
+    cudaCheck(cudaGetLastError());
+  }
+
+  device_theCells_ = cs->make_device_unique<GPUCACell[]>(CAConstants::maxNumberOfDoublets(), stream);
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+  if (0 == nhits)
+    return;  // protect against empty events
+
+  // FIXME avoid magic numbers
+  auto nActualPairs=gpuPixelDoublets::nPairs;
+  if (!m_params.includeJumpingForwardDoublets_) nActualPairs = 15;
+  if (m_params.minHitsPerNtuplet_>3) {
+    nActualPairs = 13;
+  }
+
+  assert(nActualPairs<=gpuPixelDoublets::nPairs);
+  int stride = 1;
+  int threadsPerBlock = gpuPixelDoublets::getDoubletsFromHistoMaxBlockSize / stride;
+  int blocks = (2 * nhits + threadsPerBlock - 1) / threadsPerBlock;
+  dim3 blks(1, blocks, 1);
+  dim3 thrs(stride, threadsPerBlock, 1);
+  gpuPixelDoublets::getDoubletsFromHisto<<<blks, thrs, 0, stream.id()>>>(device_theCells_.get(),
+                                                                         device_nCells_,
+                                                                         device_theCellNeighbors_,
+                                                                         device_theCellTracks_,
+                                                                         hh.view(),
+                                                                         device_isOuterHitOfCell_.get(),
+                                                                         nActualPairs,
+                                                                         m_params.idealConditions_,
+                                                                         m_params.doClusterCut_,
+                                                                         m_params.doZCut_,
+                                                                         m_params.doPhiCut_);
+  cudaCheck(cudaGetLastError());
+
+#ifdef GPU_DEBUG
+  cudaDeviceSynchronize();
+  cudaCheck(cudaGetLastError());
+#endif
+
+}
+
+void CAHitNtupletGeneratorKernels::classifyTuples(HitsOnCPU const &hh,
+                                                     TkSoA * tracks_d,
+                                                     cudaStream_t cudaStream) {
+  // these are pointer on GPU!
+  auto const * tuples_d = &tracks_d->hitIndices;
+  auto * quality_d = (Quality*)(&tracks_d->m_quality);
+
+  auto blockSize = 64;
+
+  // classify tracks based on kinematics
+  auto numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize;
+  kernel_classifyTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tuples_d, tracks_d, m_params.cuts_, quality_d);
+  cudaCheck(cudaGetLastError());
+
+  if (m_params.lateFishbone_) {
+    // apply fishbone cleaning to good tracks
+    numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize;
+    kernel_fishboneCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        device_theCells_.get(), device_nCells_, quality_d);
+    cudaCheck(cudaGetLastError());
+  }
+
+  // remove duplicates (tracks that share a doublet)
+  numberOfBlocks = (CAConstants::maxNumberOfDoublets() + blockSize - 1) / blockSize;
+  kernel_fastDuplicateRemover<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      device_theCells_.get(), device_nCells_, tuples_d, tracks_d);
+  cudaCheck(cudaGetLastError());
+
+  if (m_params.minHitsPerNtuplet_<4 || m_params.doStats_) {
+    // fill hit->track "map"
+    numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize;
+    kernel_countHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+      tuples_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+    cudautils::launchFinalize(device_hitToTuple_.get(), device_tmws_, cudaStream);
+    cudaCheck(cudaGetLastError());
+    kernel_fillHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        tuples_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+  }
+  if (m_params.minHitsPerNtuplet_<4) {
+    // remove duplicates (tracks that share a hit)
+    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    kernel_tripletCleaner<<<numberOfBlocks, blockSize, 0, cudaStream>>>(
+        hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get());
+    cudaCheck(cudaGetLastError());
+  }
+  if (m_params.doStats_) {
+    // counters (add flag???)
+    numberOfBlocks = (HitToTuple::capacity() + blockSize - 1) / blockSize;
+    kernel_doStatsForHitInTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(device_hitToTuple_.get(), counters_);
+    cudaCheck(cudaGetLastError());
+    numberOfBlocks = (CAConstants::maxNumberOfQuadruplets() + blockSize - 1) / blockSize;
+    kernel_doStatsForTracks<<<numberOfBlocks, blockSize, 0, cudaStream>>>(tuples_d, quality_d, counters_);
+    cudaCheck(cudaGetLastError());
+  }
+#ifdef GPU_DEBUG
+    cudaDeviceSynchronize();
+    cudaCheck(cudaGetLastError());
+#endif
+
+#ifdef    DUMP_GPU_TK_TUPLES
+  static std::atomic<int> iev(0);
+  ++iev;
+  kernel_print_found_ntuplets<<<1, 32, 0, cudaStream>>>(hh.view(), tuples_d, tracks_d, quality_d, device_hitToTuple_.get(), 100,iev);
+#endif
+
+}
+
+__global__ void kernel_printCounters(CAHitNtupletGeneratorKernels::Counters const *counters) {
+  auto const &c = *counters;
+  printf(
+      "||Counters | nEvents | nHits | nCells | nTuples | nFitTacks  |  nGoodTracks | nUsedHits | nDupHits | nKilledCells | "
+      "nEmptyCells | nZeroTrackCells ||\n");
+  printf("Counters Raw %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld %lld\n",
+         c.nEvents,
+         c.nHits,
+         c.nCells,
+         c.nTuples,
+         c.nGoodTracks,
+         c.nFitTracks,
+         c.nUsedHits,
+         c.nDupHits,
+         c.nKilledCells,
+         c.nEmptyCells,
+         c.nZeroTrackCells);
+  printf("Counters Norm %lld ||  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.1f|  %.3f|  %.3f||\n",
+         c.nEvents,
+         c.nHits / double(c.nEvents),
+         c.nCells / double(c.nEvents),
+         c.nTuples / double(c.nEvents),
+         c.nFitTracks / double(c.nEvents),
+         c.nGoodTracks / double(c.nEvents),
+         c.nUsedHits / double(c.nEvents),
+         c.nDupHits / double(c.nEvents),
+         c.nKilledCells / double(c.nEvents),
+         c.nEmptyCells / double(c.nCells),
+         c.nZeroTrackCells / double(c.nCells));
+}
+
+void CAHitNtupletGeneratorKernels::printCounters(Counters const * counters) { 
+   kernel_printCounters<<<1, 1>>>(counters);
+}
+
+
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
new file mode 100644
index 0000000000000..147ba98310c14
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h
@@ -0,0 +1,186 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
+#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
+
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "GPUCACell.h"
+
+// #define DUMP_GPU_TK_TUPLES
+
+
+class CAHitNtupletGeneratorKernels {
+public:
+  // counters
+  struct Counters {
+    unsigned long long nEvents;
+    unsigned long long nHits;
+    unsigned long long nCells;
+    unsigned long long nTuples;
+    unsigned long long nFitTracks;
+    unsigned long long nGoodTracks;
+    unsigned long long nUsedHits;
+    unsigned long long nDupHits;
+    unsigned long long nKilledCells;
+    unsigned long long nEmptyCells;
+    unsigned long long nZeroTrackCells;
+  };
+
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DCUDA;
+
+  using HitToTuple = CAConstants::HitToTuple;
+  using TupleMultiplicity = CAConstants::TupleMultiplicity;
+
+  using Quality = pixelTrack::Quality;
+  using TkSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+
+
+  struct QualityCuts {
+    // chi2 cut = chi2Scale * (chi2Coeff[0] + pT/GeV * (chi2Coeff[1] + pT/GeV * (chi2Coeff[2] + pT/GeV * chi2Coeff[3])))
+    float chi2Coeff[4];
+    float chi2MaxPt;    // GeV
+    float chi2Scale;
+
+    struct region {
+      float maxTip;     // cm
+      float minPt;      // GeV
+      float maxZip;     // cm
+    };
+
+    region triplet;
+    region quadruplet;
+  };
+
+
+  // params
+  struct Params {
+    Params(uint32_t minHitsPerNtuplet,
+                                  bool useRiemannFit,
+                                  bool fit5as4,
+                                  bool includeJumpingForwardDoublets,
+                                  bool earlyFishbone,
+                                  bool lateFishbone,
+                                  bool idealConditions,
+                                  bool doStats,
+                                  bool doClusterCut,
+                                  bool doZCut,
+                                  bool doPhiCut,
+                                  float ptmin,
+                                  float CAThetaCutBarrel,
+                                  float CAThetaCutForward,
+                                  float hardCurvCut,
+                                  float dcaCutInnerTriplet,
+                                  float dcaCutOuterTriplet,
+                                  QualityCuts const& cuts)
+      : minHitsPerNtuplet_(minHitsPerNtuplet),
+        useRiemannFit_(useRiemannFit),
+        fit5as4_(fit5as4),
+        includeJumpingForwardDoublets_(includeJumpingForwardDoublets),
+        earlyFishbone_(earlyFishbone),
+        lateFishbone_(lateFishbone),
+        idealConditions_(idealConditions),
+        doStats_(doStats),
+        doClusterCut_(doClusterCut),
+        doZCut_(doZCut),
+        doPhiCut_(doPhiCut),
+        ptmin_(ptmin),
+        CAThetaCutBarrel_(CAThetaCutBarrel),
+        CAThetaCutForward_(CAThetaCutForward),
+        hardCurvCut_(hardCurvCut),
+        dcaCutInnerTriplet_(dcaCutInnerTriplet),
+        dcaCutOuterTriplet_(dcaCutOuterTriplet),
+        cuts_(cuts) { }
+
+  const uint32_t minHitsPerNtuplet_;
+  const bool useRiemannFit_;
+  const bool fit5as4_;
+  const bool includeJumpingForwardDoublets_;
+  const bool earlyFishbone_;
+  const bool lateFishbone_;
+  const bool idealConditions_;
+  const bool doStats_;
+  const bool doClusterCut_;
+  const bool doZCut_;
+  const bool doPhiCut_;
+  const float ptmin_;
+  const float CAThetaCutBarrel_;
+  const float CAThetaCutForward_;
+  const float hardCurvCut_;
+  const float dcaCutInnerTriplet_;
+  const float dcaCutOuterTriplet_;
+
+  // quality cuts
+  QualityCuts cuts_
+  {
+    // polynomial coefficients for the pT-dependent chi2 cut
+    { 0.68177776, 0.74609577, -0.08035491, 0.00315399 },
+    // max pT used to determine the chi2 cut
+    10.,
+    // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+    30.,
+    // regional cuts for triplets
+    {
+      0.3,  // |Tip| < 0.3 cm
+      0.5,  // pT > 0.5 GeV
+      12.0  // |Zip| < 12.0 cm
+    },
+    // regional cuts for quadruplets
+    {
+      0.5,  // |Tip| < 0.5 cm
+      0.3,  // pT > 0.3 GeV
+      12.0  // |Zip| < 12.0 cm
+    }
+   };
+  
+  }; // Params
+
+
+  CAHitNtupletGeneratorKernels(Params const & params) : m_params(params){}
+  ~CAHitNtupletGeneratorKernels() = default;
+
+  TupleMultiplicity const* tupleMultiplicity() const { return device_tupleMultiplicity_.get(); }
+
+  void launchKernels(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream);
+
+  void classifyTuples(HitsOnCPU const& hh, TkSoA * tuples_d, cudaStream_t cudaStream);
+
+  void fillHitDetIndices(HitsOnCPU const &hh, TkSoA * tuples_d, cudaStream_t cudaStream);
+
+  void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream);
+  void allocateOnGPU(cuda::stream_t<>& stream);
+  void cleanup(cudaStream_t cudaStream);
+
+  static void printCounters(Counters const * counters);
+  Counters* counters_ = nullptr;
+
+
+private:
+
+  // workspace
+  CAConstants::CellNeighborsVector* device_theCellNeighbors_ = nullptr;
+  cudautils::device::unique_ptr<CAConstants::CellNeighbors[]> device_theCellNeighborsContainer_;
+  CAConstants::CellTracksVector* device_theCellTracks_ = nullptr;
+  cudautils::device::unique_ptr<CAConstants::CellTracks[]> device_theCellTracksContainer_;
+
+  cudautils::device::unique_ptr<GPUCACell[]> device_theCells_;
+  cudautils::device::unique_ptr<GPUCACell::OuterHitOfCell[]> device_isOuterHitOfCell_;
+  uint32_t* device_nCells_ = nullptr;
+
+  cudautils::device::unique_ptr<HitToTuple> device_hitToTuple_;
+  AtomicPairCounter* device_hitToTuple_apc_ = nullptr;
+
+  AtomicPairCounter* device_hitTuple_apc_ = nullptr;
+
+  cudautils::device::unique_ptr<TupleMultiplicity> device_tupleMultiplicity_;
+  
+  uint8_t * device_tmws_;
+
+  cudautils::device::unique_ptr<AtomicPairCounter::c_type[]> device_storage_;
+  // params
+  Params const & m_params;
+};
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu
new file mode 100644
index 0000000000000..126b6237bd0d7
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cu
@@ -0,0 +1,40 @@
+#include "CAHitNtupletGeneratorKernels.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+void CAHitNtupletGeneratorKernels::allocateOnGPU(cuda::stream_t<>& stream) {
+  //////////////////////////////////////////////////////////
+  // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
+  //////////////////////////////////////////////////////////
+
+  edm::Service<CUDAService> cs;
+
+  /* not used at the moment 
+  cudaCheck(cudaMalloc(&device_theCellNeighbors_, sizeof(CAConstants::CellNeighborsVector)));
+  cudaCheck(cudaMemset(device_theCellNeighbors_, 0, sizeof(CAConstants::CellNeighborsVector)));
+  cudaCheck(cudaMalloc(&device_theCellTracks_, sizeof(CAConstants::CellTracksVector)));
+  cudaCheck(cudaMemset(device_theCellTracks_, 0, sizeof(CAConstants::CellTracksVector)));
+  */
+
+  device_hitToTuple_ = cs->make_device_unique<HitToTuple>(stream);
+
+  device_tupleMultiplicity_ = cs->make_device_unique<TupleMultiplicity>(stream);
+
+  auto storageSize = 3+(std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize())+sizeof(AtomicPairCounter::c_type))/sizeof(AtomicPairCounter::c_type);
+
+  device_storage_ = cs->make_device_unique<AtomicPairCounter::c_type[]>(storageSize,stream);
+  
+  device_hitTuple_apc_ = (AtomicPairCounter*)device_storage_.get();
+  device_hitToTuple_apc_ = (AtomicPairCounter*)device_storage_.get()+1;
+  device_nCells_ = (uint32_t *)(device_storage_.get()+2);
+  device_tmws_ = (uint8_t*)(device_storage_.get()+3);
+
+  assert(device_tmws_+std::max(TupleMultiplicity::wsSize(), HitToTuple::wsSize()) <= (uint8_t*)(device_storage_.get()+storageSize));
+
+  cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream.id()));
+  cudautils::launchZero(device_tupleMultiplicity_.get(), stream.id());
+  cudautils::launchZero(device_hitToTuple_.get(), stream.id());  // we may wish to keep it in the edm...
+}
+
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
new file mode 100644
index 0000000000000..08cafc7e8fc09
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc
@@ -0,0 +1,164 @@
+//
+// Original Author: Felice Pantaleo, CERN
+//
+
+#include <array>
+#include <cassert>
+#include <functional>
+#include <vector>
+
+#include "DataFormats/Common/interface/Handle.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/EDMException.h"
+#include "FWCore/Utilities/interface/isFinite.h"
+#include "TrackingTools/DetLayers/interface/BarrelDetLayer.h"
+
+#include "CAHitNtupletGeneratorOnGPU.h"
+
+namespace {
+
+  template <typename T>
+  T sqr(T x) {
+    return x * x;
+  }
+
+  CAHitNtupletGeneratorKernels::QualityCuts makeQualityCuts(edm::ParameterSet const& pset) {
+    auto coeff = pset.getParameter<std::vector<double>>("chi2Coeff");
+    if (coeff.size() != 4) {
+      throw edm::Exception(edm::errors::Configuration, "CAHitNtupletGeneratorOnGPU.trackQualityCuts.chi2Coeff must have 4 elements");
+    }
+    return CAHitNtupletGeneratorKernels::QualityCuts {
+      // polynomial coefficients for the pT-dependent chi2 cut
+      { (float) coeff[0], (float) coeff[1], (float) coeff[2], (float) coeff[3] },
+      // max pT used to determine the chi2 cut
+      (float) pset.getParameter<double>("chi2MaxPt"),
+      // chi2 scale factor: 30 for broken line fit, 45 for Riemann fit
+      (float) pset.getParameter<double>("chi2Scale"),
+      // regional cuts for triplets
+      {
+        (float) pset.getParameter<double>("tripletMaxTip"),
+        (float) pset.getParameter<double>("tripletMinPt"),
+        (float) pset.getParameter<double>("tripletMaxZip")
+      },
+      // regional cuts for quadruplets
+      {
+        (float) pset.getParameter<double>("quadrupletMaxTip"),
+        (float) pset.getParameter<double>("quadrupletMinPt"),
+        (float) pset.getParameter<double>("quadrupletMaxZip")
+      }
+    };
+  }
+
+}  // namespace
+
+using namespace std;
+
+CAHitNtupletGeneratorOnGPU::CAHitNtupletGeneratorOnGPU(const edm::ParameterSet &cfg, edm::ConsumesCollector &iC)
+    : m_params(cfg.getParameter<unsigned int>("minHitsPerNtuplet"),
+              cfg.getParameter<bool>("useRiemannFit"),
+              cfg.getParameter<bool>("fit5as4"),
+              cfg.getParameter<bool>("includeJumpingForwardDoublets"),
+              cfg.getParameter<bool>("earlyFishbone"),
+              cfg.getParameter<bool>("lateFishbone"),
+              cfg.getParameter<bool>("idealConditions"),
+              cfg.getParameter<bool>("fillStatistics"),
+              cfg.getParameter<bool>("doClusterCut"),
+              cfg.getParameter<bool>("doZCut"),
+              cfg.getParameter<bool>("doPhiCut"),
+              cfg.getParameter<double>("ptmin"),
+              cfg.getParameter<double>("CAThetaCutBarrel"),
+              cfg.getParameter<double>("CAThetaCutForward"),
+              cfg.getParameter<double>("hardCurvCut"),
+              cfg.getParameter<double>("dcaCutInnerTriplet"),
+              cfg.getParameter<double>("dcaCutOuterTriplet"),
+              makeQualityCuts(cfg.getParameterSet("trackQualityCuts"))) {
+
+#ifdef    DUMP_GPU_TK_TUPLES
+      printf("TK: %s %s % %s %s %s %s %s %s %s %s %s %s %s %s %s\n",
+             "tid", "qual", "nh","charge","pt","eta","phi","tip","zip","chi2",
+             "h1","h2","h3","h4","h5");
+#endif
+
+  cudaCheck(cudaMalloc(&m_counters, sizeof(Counters)));
+  cudaCheck(cudaMemset(m_counters, 0, sizeof(Counters)));
+
+}
+
+CAHitNtupletGeneratorOnGPU::~CAHitNtupletGeneratorOnGPU(){
+ if (m_params.doStats_) {
+    // crash on multi-gpu processes
+    CAHitNtupletGeneratorKernels::printCounters(m_counters);
+  }
+  cudaFree(m_counters);
+}
+
+
+void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription &desc) {
+  // 87 cm/GeV = 1/(3.8T * 0.3)
+  // take less than radius given by the hardPtCut and reject everything below
+  // auto hardCurvCut = 1.f/(0.35 * 87.f);
+  desc.add<double>("ptmin", 0.9f)->setComment("Cut on minimum pt");
+  desc.add<double>("CAThetaCutBarrel", 0.002f)->setComment("Cut on RZ alignement for Barrel");
+  desc.add<double>("CAThetaCutForward", 0.003f)->setComment("Cut on RZ alignment for Forward");
+  desc.add<double>("hardCurvCut", 1.f / (0.35 * 87.f))->setComment("Cut on minimum curvature");
+  desc.add<double>("dcaCutInnerTriplet", 0.15f)->setComment("Cut on origin radius when the inner hit is on BPix1");
+  desc.add<double>("dcaCutOuterTriplet", 0.25f)->setComment("Cut on origin radius when the outer hit is on BPix1");
+  desc.add<bool>("earlyFishbone", true);
+  desc.add<bool>("lateFishbone", false);
+  desc.add<bool>("idealConditions", true);
+  desc.add<bool>("fillStatistics", false);
+  desc.add<unsigned int>("minHitsPerNtuplet", 4);
+  desc.add<bool>("includeJumpingForwardDoublets", false);
+  desc.add<bool>("fit5as4", true);
+  desc.add<bool>("doClusterCut", true);
+  desc.add<bool>("doZCut", true);
+  desc.add<bool>("doPhiCut", true);
+  desc.add<bool>("useRiemannFit", false)->setComment("true for Riemann, false for BrokenLine");
+
+  edm::ParameterSetDescription trackQualityCuts;
+  trackQualityCuts.add<double>("chi2MaxPt", 10.)->setComment("max pT used to determine the pT-dependent chi2 cut");
+  trackQualityCuts.add<std::vector<double>>("chi2Coeff", { 0.68177776, 0.74609577, -0.08035491, 0.00315399 })
+    ->setComment("Polynomial coefficients to derive the pT-dependent chi2 cut");
+  trackQualityCuts.add<double>("chi2Scale", 30.)->setComment("Factor to multiply the pT-dependent chi2 cut (currently: 30 for the broken line fit, 45 for the Riemann fit)");
+  trackQualityCuts.add<double>("tripletMinPt", 0.5)->setComment("Min pT for triplets, in GeV");
+  trackQualityCuts.add<double>("tripletMaxTip", 0.3)->setComment("Max |Tip| for triplets, in cm");
+  trackQualityCuts.add<double>("tripletMaxZip", 12.)->setComment("Max |Zip| for triplets, in cm");
+  trackQualityCuts.add<double>("quadrupletMinPt", 0.3)->setComment("Min pT for quadruplets, in GeV");
+  trackQualityCuts.add<double>("quadrupletMaxTip", 0.5)->setComment("Max |Tip| for quadruplets, in cm");
+  trackQualityCuts.add<double>("quadrupletMaxZip", 12.)->setComment("Max |Zip| for quadruplets, in cm");
+  desc.add<edm::ParameterSetDescription>("trackQualityCuts", trackQualityCuts)
+    ->setComment("Quality cuts based on the results of the track fit:\n  - apply a pT-dependent chi2 cut;\n  - apply \"region cuts\" based on the fit results (pT, Tip, Zip).");
+}
+
+
+PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
+                                                           float bfield,
+                                                           cuda::stream_t<>& stream) const {
+  edm::Service<CUDAService> cs;
+  PixelTrackHeterogeneous tracks(cs->make_device_unique<pixelTrack::TrackSoA>(stream));
+
+  auto * soa = tracks.get();
+  
+  CAHitNtupletGeneratorKernels kernels(m_params);
+  kernels.counters_ = m_counters;
+  HelixFitOnGPU fitter(bfield,m_params.fit5as4_);
+
+  kernels.allocateOnGPU(stream);
+  fitter.allocateOnGPU(&(soa->hitIndices), kernels.tupleMultiplicity(), soa);
+
+  kernels.buildDoublets(hits_d, stream);
+  kernels.launchKernels(hits_d, soa, stream.id());
+  kernels.fillHitDetIndices(hits_d, soa, stream.id());  // in principle needed only if Hits not "available"
+  if (m_params.useRiemannFit_) {
+    fitter.launchRiemannKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
+  } else {
+    fitter.launchBrokenLineKernels(hits_d, hits_d.nHits(), CAConstants::maxNumberOfQuadruplets(), stream);
+  }
+  kernels.classifyTuples(hits_d, soa, stream.id());
+
+  return tracks;
+}
+
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
new file mode 100644
index 0000000000000..169f591c48e45
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.h
@@ -0,0 +1,73 @@
+#ifndef RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h
+#define RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h
+
+#include <cuda_runtime.h>
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+
+
+#include "DataFormats/SiPixelDetId/interface/PixelSubdetector.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
+
+#include "CAHitNtupletGeneratorKernels.h"
+#include "HelixFitOnGPU.h"
+
+// FIXME  (split header???)
+#include "GPUCACell.h"
+
+namespace edm {
+  class Event;
+  class EventSetup;
+  class ParameterSetDescription;
+}  // namespace edm
+
+class CAHitNtupletGeneratorOnGPU {
+public:
+  using HitsOnGPU = TrackingRecHit2DSOAView;
+  using HitsOnCPU = TrackingRecHit2DCUDA;
+  using hindex_type = TrackingRecHit2DSOAView::hindex_type;
+
+  using Quality = pixelTrack::Quality;
+  using OutputSoA = pixelTrack::TrackSoA;
+  using HitContainer = pixelTrack::HitContainer;
+  using Tuple = HitContainer;
+
+public:
+  CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector&& iC)
+      : CAHitNtupletGeneratorOnGPU(cfg, iC) {}
+  CAHitNtupletGeneratorOnGPU(const edm::ParameterSet& cfg, edm::ConsumesCollector& iC);
+
+  ~CAHitNtupletGeneratorOnGPU();
+
+  static void fillDescriptions(edm::ParameterSetDescription& desc);
+  static const char* fillDescriptionsLabel() { return "caHitNtupletOnGPU"; }
+
+  PixelTrackHeterogeneous makeTuplesAsync(TrackingRecHit2DCUDA const& hits_d,
+                                float bfield,
+                                cuda::stream_t<>& stream) const;
+ 
+private:
+
+  void buildDoublets(HitsOnCPU const& hh, cuda::stream_t<>& stream) const;
+
+  void hitNtuplets(HitsOnCPU const& hh,
+                   const edm::EventSetup& es,
+                   bool useRiemannFit,
+                   cuda::stream_t<>& cudaStream);
+
+  void cleanup(cudaStream_t stream);
+
+  void launchKernels(HitsOnCPU const& hh, bool useRiemannFit, cuda::stream_t<>& cudaStream) const;
+
+
+  CAHitNtupletGeneratorKernels::Params m_params;
+
+  using Counters = CAHitNtupletGeneratorKernels::Counters;
+  CAHitNtupletGeneratorKernels::Counters * m_counters = nullptr;
+
+};
+
+#endif  // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorOnGPU_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
index 2beb8b889c94c..f1709f7ae7063 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -15,7 +15,8 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoPixelVertexing/PixelTriplets/interface/CircleEq.h"
-#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h"
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CAConstants.h"
 
 class GPUCACell {
 public:
@@ -33,10 +34,9 @@ class GPUCACell {
 
   using TmpTuple = GPU::VecArray<uint32_t, 6>;
 
-  using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
-
-  using Quality = pixelTuplesHeterogeneousProduct::Quality;
-  static constexpr auto bad = pixelTuplesHeterogeneousProduct::bad;
+  using HitContainer = pixelTrack::HitContainer;
+  using Quality = trackQuality::Quality;
+  static constexpr auto bad = trackQuality::bad;
 
   GPUCACell() = default;
 #ifdef __CUDACC__
@@ -249,7 +249,7 @@ class GPUCACell {
   __device__ inline void find_ntuplets(Hits const& hh,
                                        GPUCACell* __restrict__ cells,
                                        CellTracksVector& cellTracks,
-                                       TuplesOnGPU::Container& foundNtuplets,
+                                       HitContainer & foundNtuplets,
                                        AtomicPairCounter& apc,
                                        Quality* __restrict__ quality,
                                        TmpTuple& tmpNtuplet,
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
index a374c975ef4b6..c071cdd347808 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.cc
@@ -1,16 +1,16 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HelixFitOnGPU.h"
 
-void HelixFitOnGPU::allocateOnGPU(TuplesOnGPU::Container const* tuples,
-                                  TupleMultiplicity const* tupleMultiplicity,
-                                  Rfit::helix_fit* helix_fit_results) {
+void HelixFitOnGPU::allocateOnGPU(Tuples const *tuples,
+                     TupleMultiplicity const *tupleMultiplicity,
+                     OutputSoA *helix_fit_results) {
   tuples_d = tuples;
   tupleMultiplicity_d = tupleMultiplicity;
-  helix_fit_results_d = helix_fit_results;
+  outputSoa_d = helix_fit_results;
 
   assert(tuples_d);
   assert(tupleMultiplicity_d);
-  assert(helix_fit_results_d);
+  assert(outputSoa_d);
 }
 
 void HelixFitOnGPU::deallocateOnGPU() {}
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
index 022a6ba5ae623..c9688fc43418c 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/HelixFitOnGPU.h
@@ -4,10 +4,9 @@
 #include <cuda/api_wrappers.h>
 
 #include "RecoPixelVertexing/PixelTrackFitting/interface/FitResult.h"
-#include "RecoPixelVertexing/PixelTriplets/plugins/pixelTuplesHeterogeneousProduct.h"
-
-class TrackingRecHit2DSOAView;
-class TrackingRecHit2DCUDA;
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DCUDA.h"
+#include "CAConstants.h"
 
 namespace Rfit {
   // in case of memory issue can be made smaller
@@ -38,10 +37,12 @@ class HelixFitOnGPU {
   using HitsOnGPU = TrackingRecHit2DSOAView;
   using HitsOnCPU = TrackingRecHit2DCUDA;
 
-  using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
+  using Tuples = pixelTrack::HitContainer;
+  using OutputSoA = pixelTrack::TrackSoA;
+
   using TupleMultiplicity = CAConstants::TupleMultiplicity;
 
-  explicit HelixFitOnGPU(bool fit5as4) : fit5as4_(fit5as4) {}
+  explicit HelixFitOnGPU(float bf, bool fit5as4) : bField_(bf), fit5as4_(fit5as4) {}
   ~HelixFitOnGPU() { deallocateOnGPU(); }
 
   void setBField(double bField) { bField_ = bField; }
@@ -54,19 +55,19 @@ class HelixFitOnGPU {
                                uint32_t maxNumberOfTuples,
                                cuda::stream_t<> &cudaStream);
 
-  void allocateOnGPU(TuplesOnGPU::Container const *tuples,
+  void allocateOnGPU(Tuples const *tuples,
                      TupleMultiplicity const *tupleMultiplicity,
-                     Rfit::helix_fit *helix_fit_results);
+                     OutputSoA * outputSoA);
   void deallocateOnGPU();
 
 private:
   static constexpr uint32_t maxNumberOfConcurrentFits_ = Rfit::maxNumberOfConcurrentFits();
 
   // fowarded
-  TuplesOnGPU::Container const *tuples_d = nullptr;
+  Tuples const *tuples_d = nullptr;
   TupleMultiplicity const *tupleMultiplicity_d = nullptr;
-  double bField_;
-  Rfit::helix_fit *helix_fit_results_d = nullptr;
+  OutputSoA * outputSoa_d;
+  float bField_;
 
   const bool fit5as4_;
 };
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
index 4aea729e913a6..38f51e676a9ca 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RiemannFitOnGPU.cu
@@ -17,12 +17,13 @@
 #include "HelixFitOnGPU.h"
 
 using HitsOnGPU = TrackingRecHit2DSOAView;
-using TuplesOnGPU = pixelTuplesHeterogeneousProduct::TuplesOnGPU;
+using Tuples = pixelTrack::HitContainer;
+using OutputSoA = pixelTrack::TrackSoA;
 
 using namespace Eigen;
 
 template <int N>
-__global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNtuplets,
+__global__ void kernelFastFit(Tuples const *__restrict__ foundNtuplets,
                               CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
                               uint32_t nHits,
                               HitsOnGPU const *__restrict__ hhp,
@@ -51,17 +52,17 @@ __global__ void kernelFastFit(TuplesOnGPU::Container const *__restrict__ foundNt
     return;
 
   // get it from the ntuple container (one to one to helix)
-  auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start);
-  assert(helix_start < foundNtuplets->nbins());
+  auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start);
+  assert(tkid < foundNtuplets->nbins());
 
-  assert(foundNtuplets->size(helix_start) == nHits);
+  assert(foundNtuplets->size(tkid) == nHits);
 
   Rfit::Map3xNd<N> hits(phits + local_start);
   Rfit::Map4d fast_fit(pfast_fit + local_start);
   Rfit::Map6xNf<N> hits_ge(phits_ge + local_start);
 
   // Prepare data structure
-  auto const *hitId = foundNtuplets->begin(helix_start);
+  auto const *hitId = foundNtuplets->begin(tkid);
   for (unsigned int i = 0; i < hitsInFit; ++i) {
     auto hit = hitId[i];
     // printf("Hit global: %f,%f,%f\n", hhp->xg_d[hit],hhp->yg_d[hit],hhp->zg_d[hit]);
@@ -102,7 +103,7 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict
     return;
 
   // get it for the ntuple container (one to one to helix)
-  auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start);
+  auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start);
 
   Rfit::Map3xNd<N> hits(phits + local_start);
   Rfit::Map4d fast_fit(pfast_fit_input + local_start);
@@ -116,7 +117,7 @@ __global__ void kernelCircleFit(CAConstants::TupleMultiplicity const *__restrict
   circle_fit[local_start] = Rfit::Circle_fit(hits.block(0, 0, 2, N), hits_cov, fast_fit, rad, B, true);
 
 #ifdef RIEMANN_DEBUG
-//  printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", helix_start,
+//  printf("kernelCircleFit circle.par(0,1,2): %d %f,%f,%f\n", tkid,
 //         circle_fit[local_start].par(0), circle_fit[local_start].par(1), circle_fit[local_start].par(2));
 #endif
 }
@@ -125,7 +126,7 @@ template <int N>
 __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__ tupleMultiplicity,
                               uint32_t nHits,
                               double B,
-                              Rfit::helix_fit *results,
+                              OutputSoA *results,
                               double *__restrict__ phits,
                               float *__restrict__ phits_ge,
                               double *__restrict__ pfast_fit_input,
@@ -144,7 +145,7 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__
     return;
 
   // get it for the ntuple container (one to one to helix)
-  auto helix_start = *(tupleMultiplicity->begin(nHits) + tuple_start);
+  auto tkid = *(tupleMultiplicity->begin(nHits) + tuple_start);
 
   Rfit::Map3xNd<N> hits(phits + local_start);
   Rfit::Map4d fast_fit(pfast_fit_input + local_start);
@@ -152,39 +153,31 @@ __global__ void kernelLineFit(CAConstants::TupleMultiplicity const *__restrict__
 
   auto const &line_fit = Rfit::Line_fit(hits, hits_ge, circle_fit[local_start], fast_fit, B, true);
 
-  par_uvrtopak(circle_fit[local_start], B, true);
+  Rfit::fromCircleToPerigee(circle_fit[local_start]);
 
-  // Grab helix_fit from the proper location in the output vector
-  auto &helix = results[helix_start];
-  helix.par << circle_fit[local_start].par, line_fit.par;
-
-  // TODO: pass properly error booleans
-
-  helix.cov = Rfit::Matrix5d::Zero();
-  helix.cov.block(0, 0, 3, 3) = circle_fit[local_start].cov;
-  helix.cov.block(3, 3, 2, 2) = line_fit.cov;
-
-  helix.q = circle_fit[local_start].q;
-  helix.chi2_circle = circle_fit[local_start].chi2;
-  helix.chi2_line = line_fit.chi2;
+  results->stateAtBS.copyFromCircle(circle_fit[local_start].par,circle_fit[local_start].cov,
+                                   line_fit.par,line_fit.cov,1.f/float(B),tkid);
+  results->pt(tkid) =  B/std::abs(circle_fit[local_start].par(2));
+  results->eta(tkid) =  asinhf(line_fit.par(0));
+  results->chi2(tkid) = (circle_fit[local_start].chi2+line_fit.chi2)/(2*N-5);
 
 #ifdef RIEMANN_DEBUG
   printf("kernelLineFit size %d for %d hits circle.par(0,1,2): %d %f,%f,%f\n",
          N,
          nHits,
-         helix_start,
+         tkid,
          circle_fit[local_start].par(0),
          circle_fit[local_start].par(1),
          circle_fit[local_start].par(2));
-  printf("kernelLineFit line.par(0,1): %d %f,%f\n", helix_start, line_fit.par(0), line_fit.par(1));
+  printf("kernelLineFit line.par(0,1): %d %f,%f\n", tkid, line_fit.par(0), line_fit.par(1));
   printf("kernelLineFit chi2 cov %f/%f %e,%e,%e,%e,%e\n",
-         helix.chi2_circle,
-         helix.chi2_line,
-         helix.cov(0, 0),
-         helix.cov(1, 1),
-         helix.cov(2, 2),
-         helix.cov(3, 3),
-         helix.cov(4, 4));
+         circle_fit[local_start].chi2,
+         line_fit.chi2,
+         circle_fit[local_start].cov(0, 0),
+         circle_fit[local_start].cov(1, 1),
+         circle_fit[local_start].cov(2, 2),
+         line_fit.cov(0, 0),
+         line_fit.cov(1, 1));
 #endif
 }
 
@@ -234,7 +227,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh,
     kernelLineFit<3><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                     3,
                                                                     bField_,
-                                                                    helix_fit_results_d,
+                                                                    outputSoa_d,
                                                                     hitsGPU_.get(),
                                                                     hits_geGPU_.get(),
                                                                     fast_fit_resultsGPU_.get(),
@@ -266,7 +259,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh,
     kernelLineFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                     4,
                                                                     bField_,
-                                                                    helix_fit_results_d,
+                                                                    outputSoa_d,
                                                                     hitsGPU_.get(),
                                                                     hits_geGPU_.get(),
                                                                     fast_fit_resultsGPU_.get(),
@@ -299,7 +292,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh,
       kernelLineFit<4><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                       5,
                                                                       bField_,
-                                                                      helix_fit_results_d,
+                                                                      outputSoa_d,
                                                                       hitsGPU_.get(),
                                                                       hits_geGPU_.get(),
                                                                       fast_fit_resultsGPU_.get(),
@@ -331,7 +324,7 @@ void HelixFitOnGPU::launchRiemannKernels(HitsOnCPU const &hh,
       kernelLineFit<5><<<numberOfBlocks, blockSize, 0, stream.id()>>>(tupleMultiplicity_d,
                                                                       5,
                                                                       bField_,
-                                                                      helix_fit_results_d,
+                                                                      outputSoa_d,
                                                                       hitsGPU_.get(),
                                                                       hits_geGPU_.get(),
                                                                       fast_fit_resultsGPU_.get(),
diff --git a/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
new file mode 100644
index 0000000000000..b10de55871185
--- /dev/null
+++ b/RecoTracker/TkSeedGenerator/plugins/SeedProducerFromSoA.cc
@@ -0,0 +1,177 @@
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "DataFormats/TrackerCommon/interface/TrackerTopology.h"
+#include "DataFormats/TrajectoryState/interface/LocalTrajectoryParameters.h"
+#include "DataFormats/GeometrySurface/interface/Plane.h"
+#include "DataFormats/TrajectorySeed/interface/TrajectorySeedCollection.h"
+#include "DataFormats/TrackingRecHit/interface/InvalidTrackingRecHit.h"
+#include "FWCore/Framework/interface/ESHandle.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/EventSetup.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/ConsumesCollector.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "FWCore/PluginManager/interface/ModuleDef.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "FWCore/Utilities/interface/EDGetToken.h"
+#include "Geometry/Records/interface/TrackerTopologyRcd.h"
+#include "MagneticField/Records/interface/IdealMagneticFieldRecord.h"
+#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "Geometry/CommonDetUnit/interface/GeomDet.h"
+#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
+#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
+#include "TrackingTools/MaterialEffects/interface/PropagatorWithMaterial.h"
+#include "TrackingTools/Records/interface/TrackingComponentsRecord.h"
+#include "TrackingTools/AnalyticalJacobians/interface/JacobianLocalToCurvilinear.h"
+#include "TrackingTools/TrajectoryParametrization/interface/GlobalTrajectoryParameters.h"
+#include "TrackingTools/TrajectoryParametrization/interface/CurvilinearTrajectoryError.h"
+#include "TrackingTools/TrajectoryState/interface/TrajectoryStateTransform.h"
+
+#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"
+#include "RecoPixelVertexing/PixelTrackFitting/interface/FitUtils.h"
+
+/*
+  produces seeds directly from cuda produced tuples
+*/
+class SeedProducerFromSoA : public edm::global::EDProducer<> {
+public:
+
+  explicit SeedProducerFromSoA(const edm::ParameterSet &iConfig);
+  ~SeedProducerFromSoA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions &descriptions);
+
+private:
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+  edm::EDGetTokenT<reco::BeamSpot> tBeamSpot_;
+  edm::EDGetTokenT<PixelTrackHeterogeneous> tokenTrack_;
+
+  int32_t minNumberOfHits_;
+};
+
+SeedProducerFromSoA::SeedProducerFromSoA(const edm::ParameterSet &iConfig) :
+      tBeamSpot_(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
+      tokenTrack_(consumes<PixelTrackHeterogeneous>(iConfig.getParameter<edm::InputTag>("src"))),
+      minNumberOfHits_(iConfig.getParameter<int>("minNumberOfHits"))
+
+{
+    produces<TrajectorySeedCollection>();
+}
+
+void SeedProducerFromSoA::fillDescriptions(edm::ConfigurationDescriptions &descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("src", edm::InputTag("pixelTrackSoA"));
+  desc.add<int>("minNumberOfHits", 0);
+
+  descriptions.addWithDefaultLabel(desc);
+}
+
+void SeedProducerFromSoA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+
+  // std::cout << "Converting gpu helix to trajectory seed" << std::endl;
+  auto result = std::make_unique<TrajectorySeedCollection>();
+
+
+  edm::ESHandle<MagneticField> fieldESH;
+  iSetup.get<IdealMagneticFieldRecord>().get(fieldESH);
+
+  edm::ESHandle<TrackerGeometry> tracker;
+  iSetup.get<TrackerDigiGeometryRecord>().get(tracker);
+  auto const & dus = tracker->detUnits();
+
+  edm::ESHandle<Propagator>  propagatorHandle;
+  iSetup.get<TrackingComponentsRecord>().get("PropagatorWithMaterial",propagatorHandle);
+  const Propagator*  propagator = &(*propagatorHandle);
+
+  edm::ESHandle<TrackerTopology> httopo;
+  iSetup.get<TrackerTopologyRcd>().get(httopo);
+
+
+  const auto &bsh = iEvent.get(tBeamSpot_);
+  // std::cout << "beamspot " << bsh.x0() << ' ' << bsh.y0() << ' ' << bsh.z0() << std::endl;
+  GlobalPoint bs(bsh.x0(), bsh.y0(), bsh.z0());
+
+  const auto & tsoa = *(iEvent.get(tokenTrack_));
+
+  auto const * quality = tsoa.qualityData();
+  auto const & fit = tsoa.stateAtBS;
+  auto const & detIndices = tsoa.detIndices;
+  auto maxTracks = tsoa.stride();
+
+  int32_t nt = 0;
+  for (int32_t it = 0; it < maxTracks; ++it) {
+    auto nHits = tsoa.nHits(it);
+    if (nHits == 0) break;  // this is a guard: maybe we need to move to nTracks...
+
+    auto q = quality[it];
+    if (q != trackQuality::loose)
+      continue;                           // FIXME
+   if (nHits< minNumberOfHits_) continue;
+    ++nt;
+
+    // fill hits with invalid just to hold the detId
+    auto b = detIndices.begin(it);
+    edm::OwnVector<TrackingRecHit> hits;
+    for (int iHit = 0; iHit < nHits; ++iHit) {
+      auto const * det =  dus[*(b+iHit)];
+      // FIXME at some point get a proper type ...
+      hits.push_back(new InvalidTrackingRecHit(*det,TrackingRecHit::bad));
+    }
+
+
+   // mind: this values are respect the beamspot!
+
+    float phi = tsoa.phi(it);
+
+    Rfit::Vector5d ipar,opar;
+    Rfit::Matrix5d icov,ocov;
+    fit.copyToDense(ipar,icov,it);
+    Rfit::transformToPerigeePlane(ipar,icov,opar,ocov);
+
+    LocalTrajectoryParameters lpar(opar(0),opar(1),opar(2),opar(3),opar(4),1.);
+    AlgebraicSymMatrix55 m;
+    for(int i=0; i<5; ++i) for (int j=i; j<5; ++j) m(i,j) = ocov(i,j);
+
+    float sp = std::sin(phi);
+    float cp = std::cos(phi);
+    Surface::RotationType rot(
+                              sp, -cp,    0,
+                               0,   0, -1.f,
+                              cp,  sp,    0);
+
+    Plane impPointPlane(bs,rot);
+    GlobalTrajectoryParameters gp(impPointPlane.toGlobal(lpar.position()),
+                                  impPointPlane.toGlobal(lpar.momentum()),lpar.charge(),fieldESH.product());
+
+    JacobianLocalToCurvilinear jl2c(impPointPlane,lpar,*fieldESH.product());
+
+    AlgebraicSymMatrix55 mo = ROOT::Math::Similarity(jl2c.jacobian(),m);
+
+    FreeTrajectoryState fts(gp, CurvilinearTrajectoryError(mo));
+
+    auto const & lastHit = hits.back();
+
+    TrajectoryStateOnSurface outerState = propagator->propagate(fts, *lastHit.surface());
+
+    if (!outerState.isValid()){
+      edm::LogError("SeedFromGPU")<<" was trying to create a seed from:\n"<<fts<<"\n propagating to: " 
+                                         << lastHit.geographicalId().rawId();
+      continue;
+    }
+
+    auto const & pTraj = trajectoryStateTransform::persistentState(outerState, lastHit.geographicalId().rawId());
+
+    result->emplace_back(pTraj, hits, alongMomentum);
+
+  }
+
+  iEvent.put(std::move(result));
+}
+
+DEFINE_FWK_MODULE(SeedProducerFromSoA);