From abf1a46a184edd35ad3f5791be03b9e94f9afda7 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente <vincenzo.innocente@cern.ch>
Date: Wed, 8 Aug 2018 19:17:46 +0200
Subject: [PATCH] Pixel doublets on GPU (#118)

Pixel doublets (actually CACells) are created on GPU and fed to CA.
The whole workflow up to quadruplets candidates is now fully on GPU.
---
 .../CUDAUtilities/interface/GPUVecArray.h     |  14 +-
 .../SiPixelRecHits/plugins/PixelRecHits.cu    |   1 +
 .../siPixelRecHitsHeterogeneousProduct.h      |   2 +
 .../CAHitNtupletHeterogeneousEDProducer.cc    |  54 +-
 .../plugins/CAHitQuadrupletGeneratorGPU.cc    | 487 +++++------------
 .../plugins/CAHitQuadrupletGeneratorGPU.cu    | 497 ++++--------------
 .../plugins/CAHitQuadrupletGeneratorGPU.h     |  57 +-
 .../PixelTriplets/plugins/GPUCACell.h         |  74 +--
 .../PixelTriplets/plugins/RecHitsMap.h        |  77 +++
 .../PixelTriplets/plugins/gpuPixelDoublets.h  | 233 ++++----
 10 files changed, 510 insertions(+), 986 deletions(-)
 create mode 100644 RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
index 601eb6db0305a..7ea4f6422d7fa 100644
--- a/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
+++ b/HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h
@@ -5,7 +5,6 @@
 #ifndef GPU_VECARRAY_H_
 #define GPU_VECARRAY_H_
 
-
 #include <cuda.h>
 #include <cuda_runtime.h>
 
@@ -24,8 +23,6 @@ template <class T, int maxSize> struct VecArray {
     }
   }
 
-
-
   template <class... Ts> constexpr int emplace_back_unsafe(Ts &&... args) {
     auto previousSize = m_size;
     m_size++;
@@ -38,9 +35,7 @@ template <class T, int maxSize> struct VecArray {
     }
   }
 
-
   __inline__ constexpr T & back() const {
-
     if (m_size > 0) {
       return m_data[m_size - 1];
     } else
@@ -95,10 +90,15 @@ template <class T, int maxSize> struct VecArray {
 
   __inline__ constexpr void resize(int size) { m_size = size; }
 
+  __inline__ constexpr bool empty() const { return 0 == m_size; }
+
+  __inline__ constexpr bool full() const { return maxSize == m_size; }
 
-  int m_size;
+  int m_size = 0;
 
   T m_data[maxSize];
 };
+
 }
-#endif
+
+#endif // GPU_VECARRAY_H_
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 1e3d6a72d0a94..3f0c5c81b9140 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -129,6 +129,7 @@ namespace pixelgpudetails {
 #endif
     auto nhits = input.nClusters;
     cpu_ = std::make_unique<HitsOnCPU>(nhits);
+    cudaCheck(cudaMemcpyAsync(cpu_->detInd.data(), gpu_.detInd_d, nhits*sizeof(int16_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(cpu_->charge.data(), gpu_.charge_d, nhits*sizeof(int32_t), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(cpu_->xl.data(), gpu_.xl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
     cudaCheck(cudaMemcpyAsync(cpu_->yl.data(), gpu_.yl_d, nhits*sizeof(float), cudaMemcpyDefault, stream.id()));
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index 23ca8805991bd..864f7f4f7b2aa 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -39,6 +39,7 @@ namespace siPixelRecHitsHeterogeneousProduct {
     HitsOnCPU() = default;
 
     explicit HitsOnCPU(uint32_t nhits) :
+      detInd(nhits),
       charge(nhits),
       xl(nhits),
       yl(nhits),
@@ -50,6 +51,7 @@ namespace siPixelRecHitsHeterogeneousProduct {
     { }
 
     uint32_t hitsModuleStart[2001];
+    std::vector<uint16_t,  CUDAHostAllocator<uint16_t>> detInd;
     std::vector<int32_t,  CUDAHostAllocator<int32_t>> charge;
     std::vector<float,    CUDAHostAllocator<float>> xl, yl;
     std::vector<float,    CUDAHostAllocator<float>> xe, ye;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
index a4183d17e070f..1d945aa1e0c44 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletHeterogeneousEDProducer.cc
@@ -98,45 +98,42 @@ void CAHitNtupletHeterogeneousEDProducer::beginStreamGPUCuda(
 void CAHitNtupletHeterogeneousEDProducer::acquireGPUCuda(
     const edm::HeterogeneousEvent &iEvent, const edm::EventSetup &iSetup,
     cuda::stream_t<> &cudaStream) {
+
+  seedingHitSets_ = std::make_unique<RegionsSeedingHitSets>();
+
+
+  // FIXME: move directly to region or similar...
   edm::Handle<IntermediateHitDoublets> hdoublets;
   iEvent.getByToken(doubletToken_, hdoublets);
   const auto &regionDoublets = *hdoublets;
+  assert(regionDoublets.regionSize()<=1);
 
-  const SeedingLayerSetsHits &seedingLayerHits =
-      regionDoublets.seedingLayerHits();
-  if (seedingLayerHits.numberOfLayersInSet() <
-      CAHitQuadrupletGeneratorGPU::minLayers) {
-    throw cms::Exception("LogicError")
-        << "CAHitNtupletHeterogeneousEDProducer expects "
-           "SeedingLayerSetsHits::numberOfLayersInSet() to be >= "
-        << CAHitQuadrupletGeneratorGPU::minLayers << ", got "
-        << seedingLayerHits.numberOfLayersInSet()
-        << ". This is likely caused by a configuration error of this module, "
-           "HitPairEDProducer, or SeedingLayersEDProducer.";
+  if (regionDoublets.empty()) {
+    emptyRegionDoublets = true;
+    return;
   }
 
-  seedingHitSets_ = std::make_unique<RegionsSeedingHitSets>();
+  const TrackingRegion &region = (*regionDoublets.begin()).region();
+
 
   edm::Handle<siPixelRecHitsHeterogeneousProduct::GPUProduct> gh;
   iEvent.getByToken<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>(tGpuHits, gh);
   auto const & gHits = *gh;
 //  auto nhits = gHits.nHits;
 
-  GPUGenerator_.buildDoublets(gHits,0.06f,cudaStream.id());
+  // move inside hitNtuplets???
+  GPUGenerator_.buildDoublets(gHits,cudaStream.id());
 
-  if (regionDoublets.empty()) {
-    emptyRegionDoublets = true;
-  } else {
-    seedingHitSets_->reserve(regionDoublets.regionSize(), localRA_.upper());
-    GPUGenerator_.initEvent(iEvent.event(), iSetup);
+  seedingHitSets_->reserve(regionDoublets.regionSize(), localRA_.upper());
+  GPUGenerator_.initEvent(iEvent.event(), iSetup);
 
-    LogDebug("CAHitNtupletHeterogeneousEDProducer")
+  LogDebug("CAHitNtupletHeterogeneousEDProducer")
         << "Creating ntuplets for " << regionDoublets.regionSize()
         << " regions, and " << regionDoublets.layerPairsSize()
         << " layer pairs";
 
-    GPUGenerator_.hitNtuplets(regionDoublets, iSetup, seedingLayerHits, cudaStream.id());
-  }
+  GPUGenerator_.hitNtuplets(region, gHits, iSetup, cudaStream.id());
+  
 }
 
 void CAHitNtupletHeterogeneousEDProducer::produceGPUCuda(
@@ -147,16 +144,18 @@ void CAHitNtupletHeterogeneousEDProducer::produceGPUCuda(
     edm::Handle<IntermediateHitDoublets> hdoublets;
     iEvent.getByToken(doubletToken_, hdoublets);
     const auto &regionDoublets = *hdoublets;
-    const SeedingLayerSetsHits &seedingLayerHits =
-        regionDoublets.seedingLayerHits();
-    int index = 0;
+
+    edm::Handle<HeterogeneousProduct> gh;
+    iEvent.getByToken(tGpuHits, gh);
+    auto const & rechits = gh->get<siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit>().getProduct<HeterogeneousDevice::kCPU>();
+
     std::vector<OrderedHitSeeds> ntuplets(regionDoublets.regionSize());
-    for (auto &ntuplet : ntuplets)
-      ntuplet.reserve(localRA_.upper());
+    for (auto &ntuplet : ntuplets) ntuplet.reserve(localRA_.upper());
+    int index = 0;
     for (const auto &regionLayerPairs : regionDoublets) {
       const TrackingRegion &region = regionLayerPairs.region();
       auto seedingHitSetsFiller = seedingHitSets_->beginRegion(&region);
-      GPUGenerator_.fillResults(regionDoublets, ntuplets, iSetup, seedingLayerHits, cudaStream.id());
+      GPUGenerator_.fillResults(region, rechits.collection, ntuplets, iSetup, cudaStream.id());
       fillNtuplets(seedingHitSetsFiller, ntuplets[index]);
       ntuplets[index].clear();
       index++;
@@ -188,6 +187,7 @@ void CAHitNtupletHeterogeneousEDProducer::produceCPU(
     iEvent.put(std::move(seedingHitSets));
     return;
   }
+
   seedingHitSets->reserve(regionDoublets.regionSize(), localRA_.upper());
   CPUGenerator_.initEvent(iEvent.event(), iSetup);
 
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc
index 1230d29f123ff..e9116fcd58eac 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cc
@@ -30,30 +30,23 @@ constexpr unsigned int CAHitQuadrupletGeneratorGPU::minLayers;
 CAHitQuadrupletGeneratorGPU::CAHitQuadrupletGeneratorGPU(
     const edm::ParameterSet &cfg,
     edm::ConsumesCollector &iC)
-    : extraHitRPhitolerance(cfg.getParameter<double>(
-          "extraHitRPhitolerance")), // extra window in
-                                     // ThirdHitPredictionFromCircle range
-                                     // (divide by R to get phi)
-      maxChi2(cfg.getParameter<edm::ParameterSet>("maxChi2")),
-      fitFastCircle(cfg.getParameter<bool>("fitFastCircle")),
-      fitFastCircleChi2Cut(cfg.getParameter<bool>("fitFastCircleChi2Cut")),
-      useBendingCorrection(cfg.getParameter<bool>("useBendingCorrection")),
-      caThetaCut(cfg.getParameter<double>("CAThetaCut")),
-      caPhiCut(cfg.getParameter<double>("CAPhiCut")),
-      caHardPtCut(cfg.getParameter<double>("CAHardPtCut")) {
-  edm::ParameterSet comparitorPSet =
-      cfg.getParameter<edm::ParameterSet>("SeedComparitorPSet");
-  std::string comparitorName =
-      comparitorPSet.getParameter<std::string>("ComponentName");
+  : extraHitRPhitolerance(cfg.getParameter<double>("extraHitRPhitolerance")), // extra window in ThirdHitPredictionFromCircle range (divide by R to get phi)
+    maxChi2(cfg.getParameter<edm::ParameterSet>("maxChi2")),
+    fitFastCircle(cfg.getParameter<bool>("fitFastCircle")),
+    fitFastCircleChi2Cut(cfg.getParameter<bool>("fitFastCircleChi2Cut")),
+    useBendingCorrection(cfg.getParameter<bool>("useBendingCorrection")),
+    caThetaCut(cfg.getParameter<double>("CAThetaCut")),
+    caPhiCut(cfg.getParameter<double>("CAPhiCut")),
+    caHardPtCut(cfg.getParameter<double>("CAHardPtCut"))
+{
+  edm::ParameterSet comparitorPSet = cfg.getParameter<edm::ParameterSet>("SeedComparitorPSet");
+  std::string comparitorName = comparitorPSet.getParameter<std::string>("ComponentName");
   if (comparitorName != "none") {
-    theComparitor.reset(SeedComparitorFactory::get()->create(
-        comparitorName, comparitorPSet, iC));
+    theComparitor.reset(SeedComparitorFactory::get()->create(comparitorName, comparitorPSet, iC));
   }
-
 }
 
-void CAHitQuadrupletGeneratorGPU::fillDescriptions(
-    edm::ParameterSetDescription &desc) {
+void CAHitQuadrupletGeneratorGPU::fillDescriptions(edm::ParameterSetDescription &desc) {
   desc.add<double>("extraHitRPhitolerance", 0.1);
   desc.add<bool>("fitFastCircle", false);
   desc.add<bool>("fitFastCircleChi2Cut", false);
@@ -61,10 +54,9 @@ void CAHitQuadrupletGeneratorGPU::fillDescriptions(
   desc.add<double>("CAThetaCut", 0.00125);
   desc.add<double>("CAPhiCut", 10);
   desc.add<double>("CAHardPtCut", 0);
-  desc.addOptional<bool>("CAOnlyOneLastHitPerLayerFilter")
-      ->setComment(
-          "Deprecated and has no effect. To be fully removed later when the "
-          "parameter is no longer used in HLT configurations.");
+  desc.addOptional<bool>("CAOnlyOneLastHitPerLayerFilter")->setComment(
+      "Deprecated and has no effect. To be fully removed later when the "
+      "parameter is no longer used in HLT configurations.");
   edm::ParameterSetDescription descMaxChi2;
   descMaxChi2.add<double>("pt1", 0.2);
   descMaxChi2.add<double>("pt2", 1.5);
@@ -75,13 +67,11 @@ void CAHitQuadrupletGeneratorGPU::fillDescriptions(
 
   edm::ParameterSetDescription descComparitor;
   descComparitor.add<std::string>("ComponentName", "none");
-  descComparitor.setAllowAnything(); // until we have moved SeedComparitor too
-                                     // to EDProducers
+  descComparitor.setAllowAnything();    // until we have moved SeedComparitor to EDProducers too
   desc.add<edm::ParameterSetDescription>("SeedComparitorPSet", descComparitor);
 }
 
-void CAHitQuadrupletGeneratorGPU::initEvent(const edm::Event &ev,
-                                            const edm::EventSetup &es) {
+void CAHitQuadrupletGeneratorGPU::initEvent(edm::Event const& ev, edm::EventSetup const& es) {
   if (theComparitor)
     theComparitor->init(ev, es);
 }
@@ -90,356 +80,121 @@ CAHitQuadrupletGeneratorGPU::~CAHitQuadrupletGeneratorGPU() {
     deallocateOnGPU();
 }
 
-namespace {
-void createGraphStructure(const SeedingLayerSetsHits &layers, CAGraph &g,
-                          GPULayerHits *h_layers_, unsigned int maxNumberOfHits_,
-                          float *h_x_, float *h_y_, float *h_z_) {
-  for (unsigned int i = 0; i < layers.size(); i++) {
-    for (unsigned int j = 0; j < 4; ++j) {
-      auto vertexIndex = 0;
-      auto foundVertex = std::find(g.theLayers.begin(), g.theLayers.end(),
-                                   layers[i][j].name());
-      if (foundVertex == g.theLayers.end()) {
-        g.theLayers.emplace_back(layers[i][j].name(),
-                                 layers[i][j].hits().size());
-        vertexIndex = g.theLayers.size() - 1;
-
-      } else {
-        vertexIndex = foundVertex - g.theLayers.begin();
-      }
-      if (j == 0) {
-
-        if (std::find(g.theRootLayers.begin(), g.theRootLayers.end(),
-                      vertexIndex) == g.theRootLayers.end()) {
-          g.theRootLayers.emplace_back(vertexIndex);
-        }
-      }
-    }
-  }
-}
-void clearGraphStructure(const SeedingLayerSetsHits &layers, CAGraph &g) {
-  g.theLayerPairs.clear();
-  for (unsigned int i = 0; i < g.theLayers.size(); i++) {
-    g.theLayers[i].theInnerLayers.clear();
-    g.theLayers[i].theInnerLayerPairs.clear();
-    g.theLayers[i].theOuterLayers.clear();
-    g.theLayers[i].theOuterLayerPairs.clear();
-    for (auto &v : g.theLayers[i].isOuterHitOfCell)
-      v.clear();
-  }
+void CAHitQuadrupletGeneratorGPU::hitNtuplets(
+    TrackingRegion const& region,
+    HitsOnCPU const& hh,
+    edm::EventSetup const& es,
+    cudaStream_t cudaStream)
+{
+  hitsOnCPU = &hh;
+  int index = 0;
+  launchKernels(region, index, hh, cudaStream);
 }
-void fillGraph(const SeedingLayerSetsHits &layers,
-               const IntermediateHitDoublets::RegionLayerSets &regionLayerPairs,
-               CAGraph &g, std::vector<const HitDoublets *> &hitDoublets) {
-
-  for (unsigned int i = 0; i < layers.size(); i++) {
-    for (unsigned int j = 0; j < 4; ++j) {
-      auto vertexIndex = 0;
-      auto foundVertex = std::find(g.theLayers.begin(), g.theLayers.end(),
-                                   layers[i][j].name());
-      if (foundVertex == g.theLayers.end()) {
-        vertexIndex = g.theLayers.size() - 1;
-      } else {
-        vertexIndex = foundVertex - g.theLayers.begin();
-      }
 
-      if (j > 0) {
-        auto innerVertex = std::find(g.theLayers.begin(), g.theLayers.end(),
-                                     layers[i][j - 1].name());
+void CAHitQuadrupletGeneratorGPU::fillResults(
+    const TrackingRegion &region, SiPixelRecHitCollectionNew const & rechits,
+    std::vector<OrderedHitSeeds> &result, const edm::EventSetup &es,
+    cudaStream_t cudaStream)
+{
+  hitmap_.clear();
+  auto const & rcs = rechits.data();
+  for (auto const & h : rcs) hitmap_.add(h, &h);
 
-        CALayerPair tmpInnerLayerPair(innerVertex - g.theLayers.begin(),
-                                      vertexIndex);
+  assert(hitsOnCPU);
+  auto nhits = hitsOnCPU->nHits;
+  int index = 0;
 
-        if (std::find(g.theLayerPairs.begin(), g.theLayerPairs.end(),
-                      tmpInnerLayerPair) == g.theLayerPairs.end()) {
-          auto found = std::find_if(
-              regionLayerPairs.begin(), regionLayerPairs.end(),
-              [&](const IntermediateHitDoublets::LayerPairHitDoublets &pair) {
-                return pair.innerLayerIndex() == layers[i][j - 1].index() &&
-                       pair.outerLayerIndex() == layers[i][j].index();
-              });
-          if (found != regionLayerPairs.end()) {
-            hitDoublets.emplace_back(&(found->doublets()));
-            g.theLayerPairs.push_back(tmpInnerLayerPair);
-            g.theLayers[vertexIndex].theInnerLayers.push_back(
-                innerVertex - g.theLayers.begin());
-            innerVertex->theOuterLayers.push_back(vertexIndex);
-            g.theLayers[vertexIndex].theInnerLayerPairs.push_back(
-                g.theLayerPairs.size() - 1);
-            innerVertex->theOuterLayerPairs.push_back(g.theLayerPairs.size() -
-                                                      1);
-          }
-        }
+  auto const & foundQuads = fetchKernelResult(index, cudaStream);
+  unsigned int numberOfFoundQuadruplets = foundQuads.size();
+  const QuantityDependsPtEval maxChi2Eval = maxChi2.evaluator(es);
+
+  // re-used throughout
+  std::array<float, 4> bc_r;
+  std::array<float, 4> bc_z;
+  std::array<float, 4> bc_errZ2;
+  std::array<GlobalPoint, 4> gps;
+  std::array<GlobalError, 4> ges;
+  std::array<bool, 4> barrels;
+  std::array<BaseTrackerRecHit const*, 4> phits;
+
+  // loop over quadruplets
+  for (unsigned int quadId = 0; quadId < numberOfFoundQuadruplets; ++quadId) {
+    auto isBarrel = [](const unsigned id) -> bool {
+      return id == PixelSubdetector::PixelBarrel;
+    };
+    bool bad = false;
+    for (unsigned int i = 0; i < 4; ++i) {
+      auto k = foundQuads[quadId][i];
+      assert(k<int(nhits));
+      auto hp = hitmap_.get((*hitsOnCPU).detInd[k],(*hitsOnCPU).mr[k], (*hitsOnCPU).mc[k]);
+      if (hp==nullptr) {
+        bad=true;
+        break;
       }
-    }
-  }
+      phits[i] = static_cast<BaseTrackerRecHit const *>(hp);
+      auto const &ahit = *phits[i];
+      gps[i] = ahit.globalPosition();
+      ges[i] = ahit.globalPositionError();
+      barrels[i] = isBarrel(ahit.geographicalId().subdetId());
 
-}
-} // namespace
-
-void CAHitQuadrupletGeneratorGPU::hitNtuplets(
-    const IntermediateHitDoublets &regionDoublets,
-    const edm::EventSetup &es,
-    const SeedingLayerSetsHits &layers, cudaStream_t cudaStream) {
-  CAGraph g;
-
-  hitDoublets.resize(regionDoublets.regionSize());
-
-  for (unsigned int lpIdx = 0; lpIdx < maxNumberOfLayerPairs_; ++lpIdx) {
-    h_doublets_[lpIdx].size = 0;
-  }
-  numberOfRootLayerPairs_ = 0;
-  numberOfLayerPairs_ = 0;
-  numberOfLayers_ = 0;
-
-  for (unsigned int layerIdx = 0; layerIdx < maxNumberOfLayers_; ++layerIdx) {
-    h_layers_[layerIdx].size = 0;
-  }
-
-  int index = 0;
-  for (const auto &regionLayerPairs : regionDoublets) {
-    const TrackingRegion &region = regionLayerPairs.region();
-    hitDoublets[index].clear();
-    if (index == 0) {
-      createGraphStructure(layers, g, h_layers_, maxNumberOfHits_, h_x_, h_y_, h_z_);
-    } else {
-      clearGraphStructure(layers, g);
     }
-
-    fillGraph(layers, regionLayerPairs, g, hitDoublets[index]);
-    numberOfLayers_ = g.theLayers.size();
-    numberOfLayerPairs_ = hitDoublets[index].size();
-    std::vector<bool> layerAlreadyParsed(g.theLayers.size(), false);
-
-    for (unsigned int i = 0; i < numberOfLayerPairs_; ++i) {
-      h_doublets_[i].size = hitDoublets[index][i]->size();
-      h_doublets_[i].innerLayerId = g.theLayerPairs[i].theLayers[0];
-      h_doublets_[i].outerLayerId = g.theLayerPairs[i].theLayers[1];
-
-      if (layerAlreadyParsed[h_doublets_[i].innerLayerId] == false) {
-        layerAlreadyParsed[h_doublets_[i].innerLayerId] = true;
-        h_layers_[h_doublets_[i].innerLayerId].size =
-            hitDoublets[index][i]->innerLayer().hits().size();
-        h_layers_[h_doublets_[i].innerLayerId].layerId =
-            h_doublets_[i].innerLayerId;
-
-        for (unsigned int l = 0; l < h_layers_[h_doublets_[i].innerLayerId].size;
-             ++l) {
-          auto hitId =
-              h_layers_[h_doublets_[i].innerLayerId].layerId * maxNumberOfHits_ +
-              l;
-          h_x_[hitId] =
-              hitDoublets[index][i]->innerLayer().hits()[l]->globalPosition().x();
-          h_y_[hitId] =
-              hitDoublets[index][i]->innerLayer().hits()[l]->globalPosition().y();
-          h_z_[hitId] =
-              hitDoublets[index][i]->innerLayer().hits()[l]->globalPosition().z();
-        }
-      }
-      if (layerAlreadyParsed[h_doublets_[i].outerLayerId] == false) {
-        layerAlreadyParsed[h_doublets_[i].outerLayerId] = true;
-        h_layers_[h_doublets_[i].outerLayerId].size =
-            hitDoublets[index][i]->outerLayer().hits().size();
-        h_layers_[h_doublets_[i].outerLayerId].layerId =
-            h_doublets_[i].outerLayerId;
-        for (unsigned int l = 0; l < h_layers_[h_doublets_[i].outerLayerId].size;
-             ++l) {
-          auto hitId =
-              h_layers_[h_doublets_[i].outerLayerId].layerId * maxNumberOfHits_ +
-              l;
-          h_x_[hitId] =
-              hitDoublets[index][i]->outerLayer().hits()[l]->globalPosition().x();
-          h_y_[hitId] =
-              hitDoublets[index][i]->outerLayer().hits()[l]->globalPosition().y();
-          h_z_[hitId] =
-              hitDoublets[index][i]->outerLayer().hits()[l]->globalPosition().z();
-        }
+    if (bad) continue;
+
+    // TODO:
+    // - if we decide to always do the circle fit for 4 hits, we don't
+    //   need ThirdHitPredictionFromCircle for the curvature; then we
+    //   could remove extraHitRPhitolerance configuration parameter
+    ThirdHitPredictionFromCircle predictionRPhi(gps[0], gps[2],
+        extraHitRPhitolerance);
+    const float curvature = predictionRPhi.curvature(
+        ThirdHitPredictionFromCircle::Vector2D(gps[1].x(), gps[1].y()));
+    const float abscurv = std::abs(curvature);
+    const float thisMaxChi2 = maxChi2Eval.value(abscurv);
+    if (theComparitor) {
+      SeedingHitSet tmpTriplet(phits[0],  phits[1],  phits[3]);
+      if (!theComparitor->compatible(tmpTriplet)) {
+        continue;
       }
+    }
 
-      for (unsigned int rl : g.theRootLayers) {
-        if (rl == h_doublets_[i].innerLayerId) {
-          auto rootlayerPairId = numberOfRootLayerPairs_;
-          h_rootLayerPairs_[rootlayerPairId] = i;
-          numberOfRootLayerPairs_++;
-        }
-      }
-      auto numberOfDoublets = hitDoublets[index][i]->size();
-      if(numberOfDoublets > maxNumberOfDoublets_)
-      {
-          edm::LogError("CAHitQuadrupletGeneratorGPU")<<" too many doublets: " << numberOfDoublets << " max is " <<  maxNumberOfDoublets_;
-          return;
-      }
-      for (unsigned int l = 0; l < numberOfDoublets; ++l) {
-        auto hitId = i * maxNumberOfDoublets_ * 2 + 2 * l;
-        h_indices_[hitId] = hitDoublets[index][i]->innerHitId(l);
-        h_indices_[hitId + 1] = hitDoublets[index][i]->outerHitId(l);
+    float chi2 = std::numeric_limits<float>::quiet_NaN();
+    // TODO: Do we have any use case to not use bending correction?
+    if (useBendingCorrection) {
+      // Following PixelFitterByConformalMappingAndLine
+      const float simpleCot = (gps.back().z() - gps.front().z()) /
+        (gps.back().perp() - gps.front().perp());
+      const float pt = 1.f / PixelRecoUtilities::inversePt(abscurv, es);
+      for (int i = 0; i < 4; ++i) {
+        const GlobalPoint &point = gps[i];
+        const GlobalError &error = ges[i];
+        bc_r[i] = sqrt(sqr(point.x() - region.origin().x()) +
+            sqr(point.y() - region.origin().y()));
+        bc_r[i] += pixelrecoutilities::LongitudinalBendingCorrection(pt, es)(
+            bc_r[i]);
+        bc_z[i] = point.z() - region.origin().z();
+        bc_errZ2[i] =
+          (barrels[i]) ? error.czz() : error.rerr(point) * sqr(simpleCot);
       }
+      RZLine rzLine(bc_r, bc_z, bc_errZ2, RZLine::ErrZ2_tag());
+      chi2 = rzLine.chi2();
+    } else {
+      RZLine rzLine(gps, ges, barrels);
+      chi2 = rzLine.chi2();
     }
-
-
-
-    for (unsigned int j = 0; j < numberOfLayerPairs_; ++j) {
-      tmp_layerDoublets_[j] = h_doublets_[j];
-      tmp_layerDoublets_[j].indices = &d_indices_[j * maxNumberOfDoublets_ * 2];
-      cudaMemcpyAsync(&d_indices_[j * maxNumberOfDoublets_ * 2],
-                      &h_indices_[j * maxNumberOfDoublets_ * 2],
-                      tmp_layerDoublets_[j].size * 2 * sizeof(int),
-                      cudaMemcpyHostToDevice, cudaStream);
+    if (edm::isNotFinite(chi2) || chi2 > thisMaxChi2) {
+      continue;
     }
-
-    for (unsigned int j = 0; j < numberOfLayers_; ++j) {
-        if(h_layers_[j].size > maxNumberOfHits_)
-        {
-            edm::LogError("CAHitQuadrupletGeneratorGPU")<<" too many hits: " << h_layers_[j].size << " max is " << maxNumberOfHits_;
-            return;
-        }
-      tmp_layers_[j] = h_layers_[j];
-      tmp_layers_[j].x = &d_x_[maxNumberOfHits_ * j];
-
-      cudaMemcpyAsync(&d_x_[maxNumberOfHits_ * j], &h_x_[j * maxNumberOfHits_],
-                      tmp_layers_[j].size * sizeof(float),
-                      cudaMemcpyHostToDevice, cudaStream);
-
-      tmp_layers_[j].y = &d_y_[maxNumberOfHits_ * j];
-      cudaMemcpyAsync(&d_y_[maxNumberOfHits_ * j], &h_y_[j * maxNumberOfHits_],
-                      tmp_layers_[j].size * sizeof(float),
-                      cudaMemcpyHostToDevice, cudaStream);
-
-      tmp_layers_[j].z = &d_z_[maxNumberOfHits_ * j];
-
-      cudaMemcpyAsync(&d_z_[maxNumberOfHits_ * j], &h_z_[j * maxNumberOfHits_],
-                      tmp_layers_[j].size * sizeof(float),
-                      cudaMemcpyHostToDevice, cudaStream);
+    // TODO: Do we have any use case to not use circle fit? Maybe
+    // HLT where low-pT inefficiency is not a problem?
+    if (fitFastCircle) {
+      FastCircleFit c(gps, ges);
+      chi2 += c.chi2();
+      if (edm::isNotFinite(chi2))
+        continue;
+      if (fitFastCircleChi2Cut && chi2 > thisMaxChi2)
+        continue;
     }
+    result[index].emplace_back(phits[0],  phits[1],  phits[2],  phits[3]);
 
-    cudaMemcpyAsync(d_rootLayerPairs_, h_rootLayerPairs_,
-                    numberOfRootLayerPairs_ * sizeof(unsigned int),
-                    cudaMemcpyHostToDevice, cudaStream);
-    cudaMemcpyAsync(d_doublets_, tmp_layerDoublets_,
-                    numberOfLayerPairs_ * sizeof(GPULayerDoublets),
-                    cudaMemcpyHostToDevice, cudaStream);
-    cudaMemcpyAsync(d_layers_, tmp_layers_, numberOfLayers_ * sizeof(GPULayerHits),
-                    cudaMemcpyHostToDevice, cudaStream);
-
-    launchKernels(region, index, cudaStream);
-  }
-}
-
-void CAHitQuadrupletGeneratorGPU::fillResults(
-    const IntermediateHitDoublets &regionDoublets,
-    std::vector<OrderedHitSeeds> &result, const edm::EventSetup &es,
-    const SeedingLayerSetsHits &layers, cudaStream_t cudaStream)
-{
-    int index = 0;
-
-    for (const auto &regionLayerPairs : regionDoublets) {
-      const TrackingRegion &region = regionLayerPairs.region();
-      auto foundQuads = fetchKernelResult(index, cudaStream);
-      unsigned int numberOfFoundQuadruplets = foundQuads.size();
-      const QuantityDependsPtEval maxChi2Eval = maxChi2.evaluator(es);
-
-      // re-used thoughout
-      std::array<float, 4> bc_r;
-      std::array<float, 4> bc_z;
-      std::array<float, 4> bc_errZ2;
-      std::array<GlobalPoint, 4> gps;
-      std::array<GlobalError, 4> ges;
-      std::array<bool, 4> barrels;
-      // Loop over quadruplets
-      for (unsigned int quadId = 0; quadId < numberOfFoundQuadruplets; ++quadId) {
-        auto isBarrel = [](const unsigned id) -> bool {
-          return id == PixelSubdetector::PixelBarrel;
-        };
-        for (unsigned int i = 0; i < 3; ++i) {
-          auto layerPair = foundQuads[quadId][i].first;
-          auto doubletId = foundQuads[quadId][i].second;
-
-          auto const &ahit =
-              hitDoublets[index][layerPair]->hit(doubletId, HitDoublets::inner);
-          gps[i] = ahit->globalPosition();
-          ges[i] = ahit->globalPositionError();
-          barrels[i] = isBarrel(ahit->geographicalId().subdetId());
-
-        }
-        auto layerPair = foundQuads[quadId][2].first;
-        auto doubletId = foundQuads[quadId][2].second;
-
-        auto const &ahit =
-            hitDoublets[index][layerPair]->hit(doubletId, HitDoublets::outer);
-        gps[3] = ahit->globalPosition();
-        ges[3] = ahit->globalPositionError();
-        barrels[3] = isBarrel(ahit->geographicalId().subdetId());
-
-        // TODO:
-        // - if we decide to always do the circle fit for 4 hits, we don't
-        //   need ThirdHitPredictionFromCircle for the curvature; then we
-        //   could remove extraHitRPhitolerance configuration parameter
-        ThirdHitPredictionFromCircle predictionRPhi(gps[0], gps[2],
-                                                    extraHitRPhitolerance);
-        const float curvature = predictionRPhi.curvature(
-            ThirdHitPredictionFromCircle::Vector2D(gps[1].x(), gps[1].y()));
-        const float abscurv = std::abs(curvature);
-        const float thisMaxChi2 = maxChi2Eval.value(abscurv);
-        if (theComparitor) {
-          SeedingHitSet tmpTriplet(
-              hitDoublets[index][foundQuads[quadId][0].first]->hit(
-                  foundQuads[quadId][0].second, HitDoublets::inner),
-              hitDoublets[index][foundQuads[quadId][2].first]->hit(
-                  foundQuads[quadId][2].second, HitDoublets::inner),
-              hitDoublets[index][foundQuads[quadId][2].first]->hit(
-                  foundQuads[quadId][2].second, HitDoublets::outer));
-          if (!theComparitor->compatible(tmpTriplet)) {
-            continue;
-          }
-        }
-
-        float chi2 = std::numeric_limits<float>::quiet_NaN();
-        // TODO: Do we have any use case to not use bending correction?
-        if (useBendingCorrection) {
-          // Following PixelFitterByConformalMappingAndLine
-          const float simpleCot = (gps.back().z() - gps.front().z()) /
-                                  (gps.back().perp() - gps.front().perp());
-          const float pt = 1.f / PixelRecoUtilities::inversePt(abscurv, es);
-          for (int i = 0; i < 4; ++i) {
-            const GlobalPoint &point = gps[i];
-            const GlobalError &error = ges[i];
-            bc_r[i] = sqrt(sqr(point.x() - region.origin().x()) +
-                           sqr(point.y() - region.origin().y()));
-            bc_r[i] += pixelrecoutilities::LongitudinalBendingCorrection(pt, es)(
-                bc_r[i]);
-            bc_z[i] = point.z() - region.origin().z();
-            bc_errZ2[i] =
-                (barrels[i]) ? error.czz() : error.rerr(point) * sqr(simpleCot);
-          }
-          RZLine rzLine(bc_r, bc_z, bc_errZ2, RZLine::ErrZ2_tag());
-          chi2 = rzLine.chi2();
-        } else {
-          RZLine rzLine(gps, ges, barrels);
-          chi2 = rzLine.chi2();
-        }
-        if (edm::isNotFinite(chi2) || chi2 > thisMaxChi2) {
-          continue;
-        }
-        // TODO: Do we have any use case to not use circle fit? Maybe
-        // HLT where low-pT inefficiency is not a problem?
-        if (fitFastCircle) {
-          FastCircleFit c(gps, ges);
-          chi2 += c.chi2();
-          if (edm::isNotFinite(chi2))
-            continue;
-          if (fitFastCircleChi2Cut && chi2 > thisMaxChi2)
-            continue;
-        }
-        result[index].emplace_back(
-            hitDoublets[index][foundQuads[quadId][0].first]->hit(
-                foundQuads[quadId][0].second, HitDoublets::inner),
-            hitDoublets[index][foundQuads[quadId][1].first]->hit(
-                foundQuads[quadId][1].second, HitDoublets::inner),
-            hitDoublets[index][foundQuads[quadId][2].first]->hit(
-                foundQuads[quadId][2].second, HitDoublets::inner),
-            hitDoublets[index][foundQuads[quadId][2].first]->hit(
-                foundQuads[quadId][2].second, HitDoublets::outer));
-      }
-      index++;
-  }
+  } // end loop over quads
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
index 6e367e59e320d..2f8966c1a42c4 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.cu
@@ -6,361 +6,99 @@
 #include "CAHitQuadrupletGeneratorGPU.h"
 #include "GPUCACell.h"
 #include "gpuPixelDoublets.h"
+#include <cstdint>
+
+using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
 
 __global__ void
-kernel_debug(unsigned int numberOfLayerPairs_, unsigned int numberOfLayers_,
-             const GPULayerDoublets *gpuDoublets,
-             const GPULayerHits *gpuHitsOnLayers, GPUCACell *cells,
-             GPU::VecArray<unsigned int, 512> *isOuterHitOfCell,
-             GPU::SimpleVector<Quadruplet> *foundNtuplets,
-             float ptmin, float region_origin_x, float region_origin_y,
-             float region_origin_radius, const float thetaCut,
-             const float phiCut, const float hardPtCut,
-             unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {
-  if (threadIdx.x == 0 and blockIdx.x == 0)
-    foundNtuplets->reset();
-
-  printf("kernel_debug_create: theEvent contains numberOfLayerPairs_: %d\n",
-         numberOfLayerPairs_);
-  for (unsigned int layerPairIndex = 0; layerPairIndex < numberOfLayerPairs_;
-       ++layerPairIndex) {
-
-    int outerLayerId = gpuDoublets[layerPairIndex].outerLayerId;
-    int innerLayerId = gpuDoublets[layerPairIndex].innerLayerId;
-    int numberOfDoublets = gpuDoublets[layerPairIndex].size;
-    printf(
-        "kernel_debug_create: layerPairIndex: %d inner %d outer %d size %u\n",
-        layerPairIndex, innerLayerId, outerLayerId, numberOfDoublets);
-
-    auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_;
-    auto globalFirstHitIdx = outerLayerId * maxNumberOfHits_;
-    printf("kernel_debug_create: theIdOfThefirstCellInLayerPair: %d "
-           "globalFirstHitIdx %d\n",
-           globalFirstDoubletIdx, globalFirstHitIdx);
-
-    for (unsigned int i = 0; i < gpuDoublets[layerPairIndex].size; i++) {
-
-      auto globalCellIdx = i + globalFirstDoubletIdx;
-      auto &thisCell = cells[globalCellIdx];
-      auto outerHitId = gpuDoublets[layerPairIndex].indices[2 * i + 1];
-      thisCell.init(&gpuDoublets[layerPairIndex], gpuHitsOnLayers,
-                    layerPairIndex, globalCellIdx,
-                    gpuDoublets[layerPairIndex].indices[2 * i], outerHitId,
-                    region_origin_x, region_origin_y);
-
-      isOuterHitOfCell[globalFirstHitIdx + outerHitId].push_back(
-          globalCellIdx);
-    }
-  }
+kernel_checkOverflows(GPU::SimpleVector<Quadruplet> *foundNtuplets,
+               GPUCACell *cells, uint32_t const * nCells,
+               GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell,
+               uint32_t nHits) {
+
+ auto idx = threadIdx.x + blockIdx.x * blockDim.x;
+ #ifdef GPU_DEBUG
+ if (0==idx)
+   printf("number of found cells %d\n",*nCells);
+ #endif
+ if (idx < (*nCells) ) {
+   auto &thisCell = cells[idx];
+   if (thisCell.theOuterNeighbors.full()) //++tooManyNeighbors[thisCell.theLayerPairId];
+     printf("OuterNeighbors overflow %d in %d\n",idx,thisCell.theLayerPairId);
+ }
+ if (idx < nHits) {
+   if (isOuterHitOfCell[idx].full()) // ++tooManyOuterHitOfCell;
+     printf("OuterHitOfCell overflow %d\n", idx); 
+ }
 
-  // for(unsigned int layerIndex = 0; layerIndex < numberOfLayers_;++layerIndex )
-  // {
-  //     auto numberOfHitsOnLayer = gpuHitsOnLayers[layerIndex].size;
-  //     for(unsigned hitId = 0; hitId < numberOfHitsOnLayer; hitId++)
-  //     {
-  //
-  //         if(isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].size()>0)
-  //         {
-  //             printf("\nlayer %d hit %d is outer hit of %d
-  //             cells\n",layerIndex, hitId,
-  //             isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].size());
-  //             printf("\n\t%f %f %f
-  //             \n",gpuHitsOnLayers[layerIndex].x[hitId],gpuHitsOnLayers[layerIndex].y[hitId],gpuHitsOnLayers[layerIndex].z[hitId]);
-  //
-  //             for(unsigned cell = 0; cell<
-  //             isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].size();
-  //             cell++)
-  //             {
-  //                 printf("cell %d\n",
-  //                 isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].m_data[cell]);
-  //                 auto& thisCell =
-  //                 cells[isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].m_data[cell]];
-  //                             float x1, y1, z1, x2, y2, z2;
-  //
-  //                             x1 = thisCell.get_inner_x();
-  //                             y1 = thisCell.get_inner_y();
-  //                             z1 = thisCell.get_inner_z();
-  //                             x2 = thisCell.get_outer_x();
-  //                             y2 = thisCell.get_outer_y();
-  //                             z2 = thisCell.get_outer_z();
-  //                 printf("\n\tDEBUG cellid %d innerhit outerhit (xyz) (%f %f
-  //                 %f), (%f %f
-  //                 %f)\n",isOuterHitOfCell[layerIndex*maxNumberOfHits_+hitId].m_data[cell],
-  //                 x1,y1,z1,x2,y2,z2);
-  //             }
-  //         }
-  //     }
-  // }
-
-  // starting connect
-
-  for (unsigned int layerPairIndex = 0; layerPairIndex < numberOfLayerPairs_;
-       ++layerPairIndex) {
-
-    int outerLayerId = gpuDoublets[layerPairIndex].outerLayerId;
-    int innerLayerId = gpuDoublets[layerPairIndex].innerLayerId;
-    int numberOfDoublets = gpuDoublets[layerPairIndex].size;
-    printf("kernel_debug_connect: connecting layerPairIndex: %d inner %d outer "
-           "%d size %u\n",
-           layerPairIndex, innerLayerId, outerLayerId, numberOfDoublets);
-
-    auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_;
-    auto globalFirstHitIdx = innerLayerId * maxNumberOfHits_;
-    //        printf("kernel_debug_connect: theIdOfThefirstCellInLayerPair: %d
-    //        globalFirstHitIdx %d\n", globalFirstDoubletIdx,
-    //        globalFirstHitIdx);
-
-    for (unsigned int i = 0; i < numberOfDoublets; i++) {
-
-      auto globalCellIdx = i + globalFirstDoubletIdx;
-
-      auto &thisCell = cells[globalCellIdx];
-      auto innerHitId = thisCell.get_inner_hit_id();
-      auto numberOfPossibleNeighbors =
-          isOuterHitOfCell[globalFirstHitIdx + innerHitId].size();
-      //            if(numberOfPossibleNeighbors>0)
-      //            printf("kernel_debug_connect: cell: %d has %d possible
-      //            neighbors\n", globalCellIdx, numberOfPossibleNeighbors);
-      float x1, y1, z1, x2, y2, z2;
-
-      x1 = thisCell.get_inner_x();
-      y1 = thisCell.get_inner_y();
-      z1 = thisCell.get_inner_z();
-      x2 = thisCell.get_outer_x();
-      y2 = thisCell.get_outer_y();
-      z2 = thisCell.get_outer_z();
-      printf("\n\n\nDEBUG cellid %d innerhit outerhit (xyz) (%f %f %f), (%f %f "
-             "%f)\n",
-             globalCellIdx, x1, y1, z1, x2, y2, z2);
-
-      for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
-        unsigned int otherCell =
-            isOuterHitOfCell[globalFirstHitIdx + innerHitId][j];
-
-        float x3, y3, z3, x4, y4, z4;
-        x3 = cells[otherCell].get_inner_x();
-        y3 = cells[otherCell].get_inner_y();
-        z3 = cells[otherCell].get_inner_z();
-        x4 = cells[otherCell].get_outer_x();
-        y4 = cells[otherCell].get_outer_y();
-        z4 = cells[otherCell].get_outer_z();
-
-        printf("kernel_debug_connect: checking compatibility with %d \n",
-               otherCell);
-        printf("DEBUG \tinnerhit outerhit (xyz) (%f %f %f), (%f %f %f)\n", x3,
-               y3, z3, x4, y4, z4);
-
-        if (thisCell.check_alignment_and_tag(
-                cells, otherCell, ptmin, region_origin_x, region_origin_y,
-                region_origin_radius, thetaCut, phiCut, hardPtCut)) {
-
-          printf("kernel_debug_connect: \t\tcell %d is outer neighbor of %d \n",
-                 globalCellIdx, otherCell);
-
-          cells[otherCell].theOuterNeighbors.push_back(globalCellIdx);
-        }
-      }
-    }
-  }
 }
 
-__global__ void debug_input_data(unsigned int numberOfLayerPairs_,
-                                 const GPULayerDoublets *gpuDoublets,
-                                 const GPULayerHits *gpuHitsOnLayers,
-                                 float ptmin, float region_origin_x,
-                                 float region_origin_y,
-                                 float region_origin_radius,
-                                 unsigned int maxNumberOfHits_) {
-  printf("GPU: Region ptmin %f , region_origin_x %f , region_origin_y %f , "
-         "region_origin_radius  %f \n",
-         ptmin, region_origin_x, region_origin_y, region_origin_radius);
-  printf("GPU: numberOfLayerPairs_: %d\n", numberOfLayerPairs_);
-
-  for (unsigned int layerPairIndex = 0; layerPairIndex < numberOfLayerPairs_;
-       ++layerPairIndex) {
-    printf("\t numberOfDoublets: %d \n", gpuDoublets[layerPairIndex].size);
-    printf("\t innerLayer: %d outerLayer: %d \n",
-           gpuDoublets[layerPairIndex].innerLayerId,
-           gpuDoublets[layerPairIndex].outerLayerId);
-
-    for (unsigned int cellIndexInLayerPair = 0;
-         cellIndexInLayerPair < gpuDoublets[layerPairIndex].size;
-         ++cellIndexInLayerPair) {
-
-      if (cellIndexInLayerPair < 5) {
-        auto innerhit =
-            gpuDoublets[layerPairIndex].indices[2 * cellIndexInLayerPair];
-        auto innerX = gpuHitsOnLayers[gpuDoublets[layerPairIndex].innerLayerId]
-                          .x[innerhit];
-        auto innerY = gpuHitsOnLayers[gpuDoublets[layerPairIndex].innerLayerId]
-                          .y[innerhit];
-        auto innerZ = gpuHitsOnLayers[gpuDoublets[layerPairIndex].innerLayerId]
-                          .z[innerhit];
-
-        auto outerhit =
-            gpuDoublets[layerPairIndex].indices[2 * cellIndexInLayerPair + 1];
-        auto outerX = gpuHitsOnLayers[gpuDoublets[layerPairIndex].outerLayerId]
-                          .x[outerhit];
-        auto outerY = gpuHitsOnLayers[gpuDoublets[layerPairIndex].outerLayerId]
-                          .y[outerhit];
-        auto outerZ = gpuHitsOnLayers[gpuDoublets[layerPairIndex].outerLayerId]
-                          .z[outerhit];
-        printf("\t \t %d innerHit: %d %f %f %f outerHit: %d %f %f %f\n",
-               cellIndexInLayerPair, innerhit, innerX, innerY, innerZ, outerhit,
-               outerX, outerY, outerZ);
-      }
-    }
-  }
-}
-
-template <int maxNumberOfQuadruplets_>
-__global__ void kernel_debug_find_ntuplets(
-    unsigned int numberOfRootLayerPairs_, const GPULayerDoublets *gpuDoublets,
-    GPUCACell *cells,
-    GPU::VecArray<Quadruplet, maxNumberOfQuadruplets_> *foundNtuplets,
-    unsigned int *rootLayerPairs, unsigned int minHitsPerNtuplet,
-    unsigned int maxNumberOfDoublets_) {
-  printf("numberOfRootLayerPairs_ = %d", numberOfRootLayerPairs_);
-  for (int rootLayerPair = 0; rootLayerPair < numberOfRootLayerPairs_;
-       ++rootLayerPair) {
-    unsigned int rootLayerPairIndex = rootLayerPairs[rootLayerPair];
-    auto globalFirstDoubletIdx = rootLayerPairIndex * maxNumberOfDoublets_;
-
-    GPU::VecArray<unsigned int, 3> stack;
-    for (int i = 0; i < gpuDoublets[rootLayerPairIndex].size; i++) {
-      auto globalCellIdx = i + globalFirstDoubletIdx;
-      stack.reset();
-      stack.push_back(globalCellIdx);
-      cells[globalCellIdx].find_ntuplets(cells, foundNtuplets, stack,
-                                         minHitsPerNtuplet);
-    }
-    printf("found quadruplets: %d", foundNtuplets->size());
-  }
-}
-
-__global__ void kernel_create(
-    const unsigned int numberOfLayerPairs_, const GPULayerDoublets *gpuDoublets,
-    const GPULayerHits *gpuHitsOnLayers, GPUCACell *cells,
-    GPU::VecArray<unsigned int, 512> *isOuterHitOfCell,
-    GPU::SimpleVector<Quadruplet> *foundNtuplets,
-    const float region_origin_x, const float region_origin_y,
-    unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {
-
-  unsigned int layerPairIndex = blockIdx.y;
-  unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
-  if (cellIndexInLayerPair == 0 && layerPairIndex == 0) {
-    foundNtuplets->reset();
-  }
-
-  if (layerPairIndex < numberOfLayerPairs_) {
-    int outerLayerId = gpuDoublets[layerPairIndex].outerLayerId;
-    auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_;
-    auto globalFirstHitIdx = outerLayerId * maxNumberOfHits_;
-
-    for (unsigned int i = cellIndexInLayerPair;
-         i < gpuDoublets[layerPairIndex].size; i += gridDim.x * blockDim.x) {
-      auto globalCellIdx = i + globalFirstDoubletIdx;
-      auto &thisCell = cells[globalCellIdx];
-      auto outerHitId = gpuDoublets[layerPairIndex].indices[2 * i + 1];
-      thisCell.init(&gpuDoublets[layerPairIndex], gpuHitsOnLayers,
-                    layerPairIndex, globalCellIdx,
-                    gpuDoublets[layerPairIndex].indices[2 * i], outerHitId,
-                    region_origin_x, region_origin_y);
-
-      isOuterHitOfCell[globalFirstHitIdx + outerHitId].push_back(
-          globalCellIdx);
-    }
-  }
-}
 
 __global__ void
-kernel_connect(unsigned int numberOfLayerPairs_,
-               const GPULayerDoublets *gpuDoublets, GPUCACell *cells,
-               GPU::VecArray< unsigned int, 512> *isOuterHitOfCell,
-               float ptmin, float region_origin_x, float region_origin_y,
+kernel_connect(GPU::SimpleVector<Quadruplet> *foundNtuplets,
+               GPUCACell *cells, uint32_t const * nCells,
+               GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell,
+               float ptmin, 
                float region_origin_radius, const float thetaCut,
                const float phiCut, const float hardPtCut,
                unsigned int maxNumberOfDoublets_, unsigned int maxNumberOfHits_) {
-  unsigned int layerPairIndex = blockIdx.y;
-  unsigned int cellIndexInLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
-  if (layerPairIndex < numberOfLayerPairs_) {
-    int innerLayerId = gpuDoublets[layerPairIndex].innerLayerId;
-    auto globalFirstDoubletIdx = layerPairIndex * maxNumberOfDoublets_;
-    auto globalFirstHitIdx = innerLayerId * maxNumberOfHits_;
-
-    for (int i = cellIndexInLayerPair; i < gpuDoublets[layerPairIndex].size;
-         i += gridDim.x * blockDim.x) {
-      auto globalCellIdx = i + globalFirstDoubletIdx;
-
-      auto &thisCell = cells[globalCellIdx];
-      auto innerHitId = thisCell.get_inner_hit_id();
-      auto numberOfPossibleNeighbors =
-          isOuterHitOfCell[globalFirstHitIdx + innerHitId].size();
-      for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
-        unsigned int otherCell =
-            isOuterHitOfCell[globalFirstHitIdx + innerHitId][j];
-
-        if (thisCell.check_alignment_and_tag(
-                cells, otherCell, ptmin, region_origin_x, region_origin_y,
-                region_origin_radius, thetaCut, phiCut, hardPtCut)) {
-          cells[otherCell].theOuterNeighbors.push_back(globalCellIdx);
-        }
-      }
-    }
+
+  float region_origin_x =0.;
+  float region_origin_y =0.;
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+
+  if (0==cellIndex) foundNtuplets->reset(); // ready for next kernel
+
+  if (cellIndex >= (*nCells) ) return;
+  auto &thisCell = cells[cellIndex];
+  auto innerHitId = thisCell.get_inner_hit_id();
+  auto numberOfPossibleNeighbors = isOuterHitOfCell[innerHitId].size();
+  for (auto j = 0; j < numberOfPossibleNeighbors; ++j) {
+     auto otherCell = isOuterHitOfCell[innerHitId][j];
+
+     if (thisCell.check_alignment_and_tag(
+                 cells, otherCell, ptmin, region_origin_x, region_origin_y,
+                  region_origin_radius, thetaCut, phiCut, hardPtCut)
+        ) {
+          cells[otherCell].theOuterNeighbors.push_back(cellIndex);
+     }
   }
 }
 
 __global__ void kernel_find_ntuplets(
-    unsigned int numberOfRootLayerPairs_, const GPULayerDoublets *gpuDoublets,
-    GPUCACell *cells,
+    GPUCACell *cells, uint32_t const * nCells,
     GPU::SimpleVector<Quadruplet> *foundNtuplets,
-    unsigned int *rootLayerPairs, unsigned int minHitsPerNtuplet,
+    unsigned int minHitsPerNtuplet,
     unsigned int maxNumberOfDoublets_)
 {
-  if (blockIdx.y < numberOfRootLayerPairs_) {
-    unsigned int cellIndexInRootLayerPair = threadIdx.x + blockIdx.x * blockDim.x;
-    unsigned int rootLayerPairIndex = rootLayerPairs[blockIdx.y];
-    auto globalFirstDoubletIdx = rootLayerPairIndex * maxNumberOfDoublets_;
-    GPU::VecArray<unsigned int, 3> stack;
-    for (int i = cellIndexInRootLayerPair;
-         i < gpuDoublets[rootLayerPairIndex].size;
-         i += gridDim.x * blockDim.x) {
-      auto globalCellIdx = i + globalFirstDoubletIdx;
-      stack.reset();
-      stack.push_back_unsafe(globalCellIdx);
-      cells[globalCellIdx].find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet);
-    }
-  }
+
+  auto cellIndex = threadIdx.x + blockIdx.x * blockDim.x;
+  if (cellIndex >= (*nCells) ) return;
+  auto &thisCell = cells[cellIndex];
+  if (thisCell.theLayerPairId!=0 && thisCell.theLayerPairId!=3 && thisCell.theLayerPairId!=8) return; // inner layer is 0 FIXME
+  GPU::VecArray<unsigned int, 3> stack;
+  stack.reset();
+  thisCell.find_ntuplets(cells, foundNtuplets, stack, minHitsPerNtuplet);
+  assert(stack.size()==0);
+  // printf("in %d found quadruplets: %d\n", cellIndex, foundNtuplets->size());
 }
 
-template <int maxNumberOfDoublets_>
 __global__ void
-kernel_print_found_ntuplets(GPU::SimpleVector<Quadruplet> *foundNtuplets) {
-  for (int i = 0; i < foundNtuplets->size(); ++i) {
-    printf("\nquadruplet %d: %d %d, %d %d, %d %d\n", i,
-           (*foundNtuplets)[i].layerPairsAndCellId[0].x,
-           (*foundNtuplets)[i].layerPairsAndCellId[0].y -
-               maxNumberOfDoublets_ *
-                   ((*foundNtuplets)[i].layerPairsAndCellId[0].x),
-           (*foundNtuplets)[i].layerPairsAndCellId[1].x,
-           (*foundNtuplets)[i].layerPairsAndCellId[1].y -
-               maxNumberOfDoublets_ *
-                   (*foundNtuplets)[i].layerPairsAndCellId[1].x,
-           (*foundNtuplets)[i].layerPairsAndCellId[2].x,
-           (*foundNtuplets)[i].layerPairsAndCellId[2].y -
-               maxNumberOfDoublets_ *
-                   ((*foundNtuplets)[i].layerPairsAndCellId[2].x));
+kernel_print_found_ntuplets(GPU::SimpleVector<Quadruplet> *foundNtuplets, int maxPrint) {
+  for (int i = 0; i < std::min(maxPrint,foundNtuplets->size()); ++i) {
+    printf("\nquadruplet %d: %d %d %d %d\n", i,
+           (*foundNtuplets)[i].hitId[0],
+           (*foundNtuplets)[i].hitId[1],
+           (*foundNtuplets)[i].hitId[2],
+           (*foundNtuplets)[i].hitId[3]
+          );
+         
   }
 }
 
 void CAHitQuadrupletGeneratorGPU::deallocateOnGPU()
 {
-  cudaFreeHost(h_indices_);
-  cudaFreeHost(h_doublets_);
-  cudaFreeHost(h_x_);
-  cudaFreeHost(h_y_);
-  cudaFreeHost(h_z_);
-  cudaFreeHost(h_rootLayerPairs_);
   for (size_t i = 0; i < h_foundNtupletsVec_.size(); ++i)
   {
     cudaFreeHost(h_foundNtupletsVec_[i]);
@@ -368,44 +106,21 @@ void CAHitQuadrupletGeneratorGPU::deallocateOnGPU()
     cudaFree(d_foundNtupletsVec_[i]);
     cudaFree(d_foundNtupletsData_[i]);
   }
-  cudaFreeHost(tmp_layers_);
-  cudaFreeHost(tmp_layerDoublets_);
-  cudaFreeHost(h_layers_);
-
-  cudaFree(d_indices_);
-  cudaFree(d_doublets_);
-  cudaFree(d_layers_);
-  cudaFree(d_x_);
-  cudaFree(d_y_);
-  cudaFree(d_z_);
-  cudaFree(d_rootLayerPairs_);
+
   cudaFree(device_theCells_);
   cudaFree(device_isOuterHitOfCell_);
+  cudaFree(device_nCells_);
 }
 
 void CAHitQuadrupletGeneratorGPU::allocateOnGPU()
 {
-  cudaCheck(cudaMallocHost(&h_doublets_, maxNumberOfLayerPairs_ * sizeof(GPULayerDoublets)));
-  cudaCheck(cudaMallocHost(&h_indices_, maxNumberOfLayerPairs_ * maxNumberOfDoublets_ * 2 * sizeof(int)));
-  cudaCheck(cudaMallocHost(&h_x_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float)));
-  cudaCheck(cudaMallocHost(&h_y_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float)));
-  cudaCheck(cudaMallocHost(&h_z_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float)));
-  cudaCheck(cudaMallocHost(&h_rootLayerPairs_, maxNumberOfRootLayerPairs_ * sizeof(int)));
-
-  cudaCheck(cudaMalloc(&d_indices_, maxNumberOfLayerPairs_ * maxNumberOfDoublets_ * 2 * sizeof(int)));
-  cudaCheck(cudaMalloc(&d_doublets_, maxNumberOfLayerPairs_ * sizeof(GPULayerDoublets)));
-  cudaCheck(cudaMalloc(&d_layers_, maxNumberOfLayers_ * sizeof(GPULayerHits)));
-  cudaCheck(cudaMalloc(&d_x_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float)));
-  cudaCheck(cudaMalloc(&d_y_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float)));
-  cudaCheck(cudaMalloc(&d_z_, maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(float)));
-  cudaCheck(cudaMalloc(&d_rootLayerPairs_, maxNumberOfRootLayerPairs_ * sizeof(unsigned int)));
-
   //////////////////////////////////////////////////////////
   // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER)
   //////////////////////////////////////////////////////////
 
   cudaCheck(cudaMalloc(&device_theCells_,
              maxNumberOfLayerPairs_ * maxNumberOfDoublets_ * sizeof(GPUCACell)));
+  cudaCheck(cudaMalloc(&device_nCells_,sizeof(uint32_t)));
 
   cudaCheck(cudaMalloc(&device_isOuterHitOfCell_,
              maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>)));
@@ -429,40 +144,47 @@ void CAHitQuadrupletGeneratorGPU::allocateOnGPU()
     cudaCheck(cudaMemcpy(d_foundNtupletsVec_[i], & tmp_foundNtuplets, sizeof(GPU::SimpleVector<Quadruplet>), cudaMemcpyDefault));
   }
 
-  cudaCheck(cudaMallocHost(&tmp_layers_, maxNumberOfLayers_ * sizeof(GPULayerHits)));
-  cudaCheck(cudaMallocHost(&tmp_layerDoublets_,maxNumberOfLayerPairs_ * sizeof(GPULayerDoublets)));
-  cudaCheck(cudaMallocHost(&h_layers_, maxNumberOfLayers_ * sizeof(GPULayerHits)));
 }
 
 void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
-                                                int regionIndex, cudaStream_t cudaStream)
+                                                int regionIndex, HitsOnCPU const & hh,
+                                                cudaStream_t cudaStream)
 {
   assert(regionIndex < maxNumberOfRegions_);
-  dim3 numberOfBlocks_create(64, numberOfLayerPairs_);
-  dim3 numberOfBlocks_connect(32, numberOfLayerPairs_);
-  dim3 numberOfBlocks_find(16, numberOfRootLayerPairs_);
+  assert(0==regionIndex);
+
   h_foundNtupletsVec_[regionIndex]->reset();
-  kernel_create<<<numberOfBlocks_create, 32, 0, cudaStream>>>(
-      numberOfLayerPairs_, d_doublets_, d_layers_, device_theCells_,
-      device_isOuterHitOfCell_, d_foundNtupletsVec_[regionIndex],
-      region.origin().x(), region.origin().y(), maxNumberOfDoublets_,
-      maxNumberOfHits_);
-  cudaCheck(cudaGetLastError());
 
-  kernel_connect<<<numberOfBlocks_connect, 512, 0, cudaStream>>>(
-      numberOfLayerPairs_, d_doublets_, device_theCells_,
+  auto nhits = hh.nHits;
+
+  auto numberOfBlocks = (maxNumberOfDoublets_ + 512 - 1)/512;
+  kernel_connect<<<numberOfBlocks, 512, 0, cudaStream>>>(
+      d_foundNtupletsVec_[regionIndex], // needed only to be reset, ready for next kernel
+      device_theCells_, device_nCells_,
       device_isOuterHitOfCell_,
-      region.ptMin(), region.origin().x(), region.origin().y(),
+      region.ptMin(), 
       region.originRBound(), caThetaCut, caPhiCut, caHardPtCut,
-      maxNumberOfDoublets_, maxNumberOfHits_);
+      maxNumberOfDoublets_, maxNumberOfHits_
+  );
   cudaCheck(cudaGetLastError());
 
-  kernel_find_ntuplets<<<numberOfBlocks_find, 1024, 0, cudaStream>>>(
-      numberOfRootLayerPairs_, d_doublets_, device_theCells_,
+  kernel_find_ntuplets<<<numberOfBlocks, 512, 0, cudaStream>>>(
+      device_theCells_, device_nCells_,
       d_foundNtupletsVec_[regionIndex],
-      d_rootLayerPairs_, 4, maxNumberOfDoublets_);
+      4, maxNumberOfDoublets_);
   cudaCheck(cudaGetLastError());
 
+
+  numberOfBlocks = (std::max(int(nhits),maxNumberOfDoublets_) + 512 - 1)/512;
+  kernel_checkOverflows<<<numberOfBlocks, 512, 0, cudaStream>>>(
+                        d_foundNtupletsVec_[regionIndex],
+                        device_theCells_, device_nCells_,
+                        device_isOuterHitOfCell_, nhits
+                       );
+
+
+  // kernel_print_found_ntuplets<<<1,1,0, cudaStream>>>(d_foundNtupletsVec_[regionIndex],10);
+
   cudaCheck(cudaMemcpyAsync(h_foundNtupletsVec_[regionIndex], d_foundNtupletsVec_[regionIndex],
                             sizeof(GPU::SimpleVector<Quadruplet>),
                             cudaMemcpyDeviceToHost, cudaStream));
@@ -473,34 +195,29 @@ void CAHitQuadrupletGeneratorGPU::launchKernels(const TrackingRegion &region,
 
 }
 
-std::vector<std::array<std::pair<int, int>, 3>>
+std::vector<std::array<int, 4>>
 CAHitQuadrupletGeneratorGPU::fetchKernelResult(int regionIndex, cudaStream_t cudaStream)
 {
+  assert(0==regionIndex);
   h_foundNtupletsVec_[regionIndex]->set_data(h_foundNtupletsData_[regionIndex]);
   // this lazily resets temporary memory for the next event, and is not needed for reading the output
   cudaCheck(cudaMemsetAsync(device_isOuterHitOfCell_, 0,
                             maxNumberOfLayers_ * maxNumberOfHits_ * sizeof(GPU::VecArray<unsigned int, maxCellsPerHit_>),
                             cudaStream));
-  std::vector<std::array<std::pair<int, int>, 3>> quadsInterface;
-  for (int i = 0; i < h_foundNtupletsVec_[regionIndex]->size(); ++i) {
-    auto const& layerPairsAndCellId = (*h_foundNtupletsVec_[regionIndex])[i].layerPairsAndCellId;
-    std::array<std::pair<int, int>, 3> tmpQuad = {
-        {std::make_pair(layerPairsAndCellId[0].x, layerPairsAndCellId[0].y - maxNumberOfDoublets_ * layerPairsAndCellId[0].x),
-         std::make_pair(layerPairsAndCellId[1].x, layerPairsAndCellId[1].y - maxNumberOfDoublets_ * layerPairsAndCellId[1].x),
-         std::make_pair(layerPairsAndCellId[2].x, layerPairsAndCellId[2].y - maxNumberOfDoublets_ * layerPairsAndCellId[2].x)}};
+  cudaCheck(cudaMemsetAsync(device_nCells_,0,sizeof(uint32_t),cudaStream));
 
-    quadsInterface.push_back(tmpQuad);
+  std::vector<std::array<int, 4>> quadsInterface(h_foundNtupletsVec_[regionIndex]->size());
+  for (int i = 0; i < h_foundNtupletsVec_[regionIndex]->size(); ++i) {
+    for (int j = 0; j<4; ++j) quadsInterface[i][j] = (*h_foundNtupletsVec_[regionIndex])[i].hitId[j];
   }
   return quadsInterface;
 }
 
-void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream) {
+void CAHitQuadrupletGeneratorGPU::buildDoublets(HitsOnCPU const & hh, cudaStream_t stream) {
    auto nhits = hh.nHits;
 
-  float phiCut=0.06;
   int threadsPerBlock = 256;
-  int blocks = (nhits + threadsPerBlock - 1) / threadsPerBlock;
-
-  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(hh.gpu_d,phiCut);
+  int blocks = (3*nhits + threadsPerBlock - 1) / threadsPerBlock;
+  gpuPixelDoublets::getDoubletsFromHisto<<<blocks, threadsPerBlock, 0, stream>>>(device_theCells_,device_nCells_,hh.gpu_d, device_isOuterHitOfCell_);
   cudaCheck(cudaGetLastError());
 }
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
index a7f33219f66a1..07aabbe2cacbb 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitQuadrupletGeneratorGPU.h
@@ -18,12 +18,12 @@
 #include "RecoTracker/TkSeedGenerator/interface/FastCircleFit.h"
 #include "RecoTracker/TkSeedingLayers/interface/SeedComparitor.h"
 #include "RecoTracker/TkSeedingLayers/interface/SeedComparitorFactory.h"
+#include "RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h"
+
 
 #include "GPUCACell.h"
-#include "GPUHitsAndDoublets.h"
 
 class TrackingRegion;
-class SeedingLayerSetsHits;
 
 namespace edm {
     class Event;
@@ -37,8 +37,6 @@ class CAHitQuadrupletGeneratorGPU {
     using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU;
     using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU;
 
-    typedef LayerHitMapCache LayerCacheType;
-
     static constexpr unsigned int minLayers = 4;
     typedef OrderedHitSeeds ResultType;
 
@@ -54,21 +52,21 @@ class CAHitQuadrupletGeneratorGPU {
 
     void initEvent(const edm::Event& ev, const edm::EventSetup& es);
 
-    void buildDoublets(HitsOnCPU const & hh, float phicut, cudaStream_t stream);
+    void buildDoublets(HitsOnCPU const & hh, cudaStream_t stream);
 
-    void hitNtuplets(const IntermediateHitDoublets& regionDoublets,
+    void hitNtuplets(const TrackingRegion &region, HitsOnCPU const & hh,
                      const edm::EventSetup& es,
-                     const SeedingLayerSetsHits& layers, cudaStream_t stream);
-    void fillResults(const IntermediateHitDoublets& regionDoublets,
+                     cudaStream_t stream);
+    void fillResults(const TrackingRegion &region, SiPixelRecHitCollectionNew const & rechits,
                      std::vector<OrderedHitSeeds>& result,
                      const edm::EventSetup& es,
-                     const SeedingLayerSetsHits& layers, cudaStream_t stream);
+                     cudaStream_t stream);
 
     void allocateOnGPU();
     void deallocateOnGPU();
 
 private:
-    LayerCacheType theLayerCache;
+//    LayerCacheType theLayerCache;
 
     std::unique_ptr<SeedComparitor> theComparitor;
 
@@ -136,10 +134,11 @@ class CAHitQuadrupletGeneratorGPU {
         const bool enabled_;
     };
 
-    void  launchKernels(const TrackingRegion &, int, cudaStream_t);
-    std::vector<std::array<std::pair<int,int> ,3>> fetchKernelResult(int, cudaStream_t);
+    void  launchKernels(const TrackingRegion &, int, HitsOnCPU const & hh, cudaStream_t);
+    std::vector<std::array<int,4>> fetchKernelResult(int, cudaStream_t);
+
+
     const float extraHitRPhitolerance;
-    std::vector<std::vector<const HitDoublets *>> hitDoublets;
 
     const QuantityDependsPt maxChi2;
     const bool fitFastCircle;
@@ -151,29 +150,14 @@ class CAHitQuadrupletGeneratorGPU {
     const float caHardPtCut = 0.f;
 
     static constexpr int maxNumberOfQuadruplets_ = 10000;
-    static constexpr int maxCellsPerHit_ = 512;
+    static constexpr int maxCellsPerHit_ = 2048; // 512;
     static constexpr int maxNumberOfLayerPairs_ = 13;
-    static constexpr unsigned int maxNumberOfRootLayerPairs_ = 13;
     static constexpr int maxNumberOfLayers_ = 10;
     static constexpr int maxNumberOfDoublets_ = 262144;
-    static constexpr int maxNumberOfHits_ = 10000;
-    static constexpr int maxNumberOfRegions_ = 30;
-
-    unsigned int numberOfRootLayerPairs_ = 0;
-    unsigned int numberOfLayerPairs_ = 0;
-    unsigned int numberOfLayers_ = 0;
-
-    GPULayerDoublets* h_doublets_ = nullptr;
-    GPULayerHits* h_layers_ = nullptr;
-
-    unsigned int* h_indices_ = nullptr;
-    float *h_x_=nullptr, *h_y_=nullptr, *h_z_=nullptr;
-    float *d_x_=nullptr, *d_y_=nullptr, *d_z_=nullptr;
-    unsigned int* d_rootLayerPairs_ = nullptr;
-    GPULayerHits* d_layers_ = nullptr;
-    GPULayerDoublets* d_doublets_ = nullptr;
-    unsigned int* d_indices_ = nullptr;
-    unsigned int* h_rootLayerPairs_ = nullptr;
+    static constexpr int maxNumberOfHits_ = 20000;
+    static constexpr int maxNumberOfRegions_ = 2;
+
+
     std::vector<GPU::SimpleVector<Quadruplet>*> h_foundNtupletsVec_;
     std::vector<Quadruplet*> h_foundNtupletsData_;
 
@@ -182,9 +166,12 @@ class CAHitQuadrupletGeneratorGPU {
 
     GPUCACell* device_theCells_ = nullptr;
     GPU::VecArray< unsigned int, maxCellsPerHit_>* device_isOuterHitOfCell_ = nullptr;
+    uint32_t* device_nCells_ = nullptr;
+
+    HitsOnCPU const * hitsOnCPU=nullptr;
+
+    RecHitsMap<TrackingRecHit const *> hitmap_ = RecHitsMap<TrackingRecHit const *>(nullptr);
 
-    GPULayerHits* tmp_layers_ = nullptr;
-    GPULayerDoublets* tmp_layerDoublets_ = nullptr;
 };
 
 #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitQuadrupletGeneratorGPU_h
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
index 14d8ee833ce71..e8d389f00712b 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/GPUCACell.h
@@ -5,43 +5,43 @@
 #define GPU_CACELL_H_
 
 #include "GPUHitsAndDoublets.h"
+#include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
 #include <cuda.h>
+
 struct Quadruplet {
-  int2 layerPairsAndCellId[3];
+  int hitId[4];
 };
 
+
 class GPUCACell {
 public:
   __host__ __device__  GPUCACell() {}
 
-  __host__ __device__ void init(const GPULayerDoublets *doublets,
-                                const GPULayerHits *hitsOnLayer,
-                                int layerPairId, int doubletId, int innerHitId,
-                                int outerHitId, float regionX, float regionY) {
 
+__host__ __device__ void init(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh,
+                              int layerPairId, int doubletId, int innerHitId,int outerHitId) {
     theInnerHitId = innerHitId;
     theOuterHitId = outerHitId;
     theDoubletId = doubletId;
     theLayerPairId = layerPairId;
 
-    auto innerLayerId = doublets->innerLayerId;
-    auto outerLayerId = doublets->outerLayerId;
-
-    theInnerX = hitsOnLayer[innerLayerId].x[innerHitId];
-    theOuterX = hitsOnLayer[outerLayerId].x[outerHitId];
+    theInnerX = hh.xg_d[innerHitId];
+    theOuterX = hh.xg_d[outerHitId];
 
-    theInnerY = hitsOnLayer[innerLayerId].y[innerHitId];
-    theOuterY = hitsOnLayer[outerLayerId].y[outerHitId];
+    theInnerY = hh.yg_d[innerHitId];
+    theOuterY = hh.yg_d[outerHitId];
 
-    theInnerZ = hitsOnLayer[innerLayerId].z[innerHitId];
-    theOuterZ = hitsOnLayer[outerLayerId].z[outerHitId];
-    theInnerR = hypot(theInnerX - regionX, theInnerY - regionY);
-    theOuterR = hypot(theOuterX - regionX, theOuterY - regionY);
+    theInnerZ = hh.zg_d[innerHitId];
+    theOuterZ = hh.zg_d[outerHitId];
+    theInnerR = hh.rg_d[innerHitId];
+    theOuterR = hh.rg_d[outerHitId];
     theOuterNeighbors.reset();
   }
 
+
+
   constexpr float get_inner_x() const { return theInnerX; }
   constexpr float get_outer_x() const { return theOuterX; }
   constexpr float get_inner_y() const { return theInnerY; }
@@ -201,57 +201,31 @@ class GPUCACell {
     // the ntuplets is then saved if the number of hits it contains is greater
     // than a threshold
 
+    tmpNtuplet.push_back_unsafe(theInnerHitId);
+    assert(tmpNtuplet.size()<=3);
 
-    if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet - 1) {
+    if ((unsigned int)(tmpNtuplet.size()) >= minHitsPerNtuplet-1) {
       Quadruplet tmpQuadruplet;
-      for (unsigned int i = 0; i < minHitsPerNtuplet - 1; ++i) {
-        tmpQuadruplet.layerPairsAndCellId[i].x = cells[tmpNtuplet[i]].theLayerPairId;
-        tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i];
+      for (unsigned int i = 0; i < minHitsPerNtuplet-1; ++i) {
+        tmpQuadruplet.hitId[i] = tmpNtuplet[i];
       }
+      tmpQuadruplet.hitId[minHitsPerNtuplet-1] = theOuterHitId;
       foundNtuplets->push_back(tmpQuadruplet);
     }
     else {
       for (int j = 0; j < theOuterNeighbors.size(); ++j) {
         auto otherCell = theOuterNeighbors[j];
-        tmpNtuplet.push_back_unsafe(otherCell);
         cells[otherCell].find_ntuplets(cells, foundNtuplets, tmpNtuplet,
                                        minHitsPerNtuplet);
-        tmpNtuplet.pop_back();
       }
     }
+    tmpNtuplet.pop_back();
+    assert(tmpNtuplet.size()<3);
   }
 
 #endif
-  template <int maxNumberOfQuadruplets>
-  __host__ inline void find_ntuplets_host(
-      const GPUCACell *cells,
-      GPU::VecArray<Quadruplet, maxNumberOfQuadruplets> *foundNtuplets,
-      GPU::VecArray<unsigned int, 3> &tmpNtuplet,
-      const unsigned int minHitsPerNtuplet) const {
-
-    Quadruplet tmpQuadruplet;
-    if (tmpNtuplet.size() >= minHitsPerNtuplet - 1) {
-      for (int i = 0; i < minHitsPerNtuplet - 1; ++i) {
-        tmpQuadruplet.layerPairsAndCellId[i].x =
-            cells[tmpNtuplet[i]].theLayerPairId;
 
-        tmpQuadruplet.layerPairsAndCellId[i].y = tmpNtuplet[i];
-      }
-      foundNtuplets->push_back(tmpQuadruplet);
 
-    }
-
-    else {
-      for (int j = 0; j < theOuterNeighbors.size(); ++j) {
-        auto otherCell = theOuterNeighbors[j];
-        tmpNtuplet.push_back_unsafe(otherCell);
-        cells[otherCell].find_ntuplets_host(cells, foundNtuplets, tmpNtuplet,
-                                            minHitsPerNtuplet);
-
-        tmpNtuplet.pop_back();
-      }
-    }
-  }
   GPU::VecArray< unsigned int, 40> theOuterNeighbors;
 
   int theDoubletId;
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h
new file mode 100644
index 0000000000000..566a15591472c
--- /dev/null
+++ b/RecoPixelVertexing/PixelTriplets/plugins/RecHitsMap.h
@@ -0,0 +1,77 @@
+#ifndef RecHitsMap_H
+#define RecHitsMap_H
+  // store T for each cluster...
+
+#include "DataFormats/TrackerRecHit2D/interface/BaseTrackerRecHit.h"
+#include <cstdint>
+#include <unordered_map>
+
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+
+
+//FIXME move it to a better place...
+template<typename T>
+class RecHitsMap {
+public: 
+
+     explicit RecHitsMap(T const & d=T()) : dummy(d){}
+
+     void clear() {m_map.clear();}
+
+     void error(const GeomDetUnit& gd) const {edm::LogError("RecHitMap") << "hit not found in det " << gd.index();  }
+     void error(uint32_t ind) const {edm::LogError("RecHitMap") << "hit not found in det " << ind;  }
+
+     // does not work for matched hits... (easy to extend)
+     void add(TrackingRecHit const & hit, T const & v) {
+       auto const & thit = static_cast<BaseTrackerRecHit const&>(hit);
+       auto const & clus = thit.firstClusterRef();
+
+       if (clus.isPixel()) 
+          add(clus.pixelCluster(), *thit.detUnit(),v);
+       else
+          add(clus.stripCluster(), *thit.detUnit(),v);
+     }
+
+     template<typename Cluster>
+     void add(const Cluster& cluster, const GeomDetUnit& gd, T const & v) { m_map[encode(cluster,gd)] = v; }
+
+     template<typename Cluster>
+     T const & get(const Cluster& cluster, const GeomDetUnit& gd) const {
+       auto p = m_map.find(encode(cluster,gd));
+       if (p!=m_map.end()) { return (*p).second; }
+       error(gd);
+       return dummy;
+     }
+
+     T const & get(uint32_t ind, uint16_t mr, uint16_t mc) const {
+       auto p = m_map.find(encode(ind,mr,mc));
+       if (p!=m_map.end()) { return (*p).second; }
+       error(ind);
+       return dummy;
+     }
+
+     static uint64_t encode(uint32_t ind, uint16_t mr, uint16_t mc) {
+          uint64_t u1 = ind;                            
+          uint64_t u2 = mr;
+          uint64_t u3 = mc;
+          return (u1<<32) | (u2<<16) | u3;
+     }
+
+     static uint64_t encode(const SiPixelCluster& cluster, const GeomDetUnit& det) {
+          uint64_t u1 = det.index();
+          uint64_t u2 = cluster.minPixelRow();
+          uint64_t u3 = cluster.minPixelCol();
+          return (u1<<32) | (u2<<16) | u3;
+     }
+     static uint64_t encode(const SiStripCluster& cluster, const GeomDetUnit& det) {
+          uint64_t u1 = det.index();
+          uint64_t u2 = cluster.firstStrip();
+       	  return (u1<<32) | u2;
+     }
+
+
+     std::unordered_map<uint64_t, T > m_map;
+     T dummy;
+  };
+
+#endif
diff --git a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
index f09ae6aba5efb..6290c47b9e1ef 100644
--- a/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
+++ b/RecoPixelVertexing/PixelTriplets/plugins/gpuPixelDoublets.h
@@ -11,151 +11,162 @@
 #include "DataFormats/Math/interface/approx_atan2.h"
 #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h"
 
+#include "GPUCACell.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h"
+
 namespace gpuPixelDoublets {
 
+  constexpr uint32_t MaxNumOfDoublets = 1024*1024*256;
+
+  template<typename Hist>
   __device__
-  std::pair<int,int>
-  findPhiLimits(int16_t phiMe, int16_t * iphi, uint16_t * index, uint16_t size, int16_t iphicut) {
+  void doubletsFromHisto(uint8_t const * layerPairs, uint32_t nPairs, GPUCACell * cells, uint32_t * nCells,
+                         int16_t const * iphi, Hist const * hist, uint32_t const * offsets,
+                         siPixelRecHitsHeterogeneousProduct::HitsOnGPU const & hh,
+                         GPU::VecArray< unsigned int, 2048>  * isOuterHitOfCell,
+                         int16_t const * phicuts, float const * minz, float const * maxz, float const * maxr) {
+
+    auto layerSize = [=](uint8_t li) { return offsets[li+1]-offsets[li]; };
+
+    // to be optimized later
+    uint32_t innerLayerCumulativeSize[64];
+    assert(nPairs<=64);
+    innerLayerCumulativeSize[0] = layerSize(layerPairs[0]);
+    for (uint32_t i=1; i<nPairs; ++i) {
+       innerLayerCumulativeSize[i] = innerLayerCumulativeSize[i-1] + layerSize(layerPairs[2*i]);
+    }
 
-  assert(iphicut>0);
+    auto ntot = innerLayerCumulativeSize[nPairs-1];
 
-  // find extreemes in top
-  int16_t minPhi = phiMe-iphicut;
-  int16_t maxPhi = phiMe+iphicut;
 
-  // std::cout << "\n phi min/max " << phiMe << ' ' << minPhi << ' ' << maxPhi << std::endl;
+    auto idx = blockIdx.x*blockDim.x + threadIdx.x;
+  for(auto j=idx;j<ntot;j+=blockDim.x*gridDim.x) {
+    auto j = idx; 
 
-  // guess and adjust
-  auto findLimit = [&](int16_t mPhi) {
-    int jm = float(0.5f*size)*(1.f+float(mPhi)/float(std::numeric_limits<short>::max()));
-    // std::cout << "jm for " << mPhi << ' ' << jm << std::endl;
-    jm = std::min(size-1,std::max(0,jm));
-    bool notDone=true;
-    while(jm>0 && mPhi<iphi[index[--jm]]){notDone=false;}
-    if (notDone) while(jm<size && mPhi>iphi[index[++jm]]){}
-    jm = std::min(size-1,std::max(0,jm));
-    return jm;
-  };
+    uint32_t pairLayerId=0;
+    while(j>=innerLayerCumulativeSize[pairLayerId++]);  --pairLayerId; // move to lower_bound ??
 
-  auto jmin = findLimit(minPhi);
-  auto jmax = findLimit(maxPhi);
+    assert(pairLayerId<nPairs);
+    assert(j<innerLayerCumulativeSize[pairLayerId]);
+    assert(0==pairLayerId || j>=innerLayerCumulativeSize[pairLayerId-1]);
 
+    uint8_t inner = layerPairs[2*pairLayerId];
+    uint8_t outer = layerPairs[2*pairLayerId+1];
+    assert(outer>inner);
 
-  /*
-  std::cout << "j min/max " << jmin << ' ' << jmax << std::endl;
-  std::cout << "found min/max " << iphi[index[jmin]] << ' ' << iphi[index[jmax]] << std::endl;
-  std::cout << "found min/max +1 " << iphi[index[jmin+1]] << ' ' << iphi[index[jmax+1]] << std::endl;
-  std::cout << "found min/max -1 " << iphi[index[jmin-1]] << ' ' << iphi[index[jmax-1]] << std::endl;
-  */
+    auto i = (0==pairLayerId) ? j :  j-innerLayerCumulativeSize[pairLayerId-1];
+    i += offsets[inner];
 
-  return std::make_pair(jmin,jmax);
-  }
+    // printf("Hit in Layer %d %d %d %d\n", i, inner, pairLayerId, j);
 
+    assert(i>=offsets[inner]);
+    assert(i<offsets[inner+1]);
 
-  __global__
-  void getDoubletsFromSorted(int16_t * iphi, uint16_t * index, uint32_t * offsets, float phiCut) {
-    auto iphicut = phi2short(phiCut);
-    auto i = blockIdx.x*blockDim.x + threadIdx.x;
-    if (i>=offsets[9]) {
-      // get rid of last layer
-      return;
-    }
-
-    assert(0==offsets[0]);
-    int top = (i>offsets[5]) ? 5: 0;
-    while (i>=offsets[++top]){};
-    assert(top<10);
-    auto bottom = top-1;
-    if (bottom == 3 or bottom == 6) {
-      // do not have UP... (9 we got rid already)
-      return;
-    }
-    assert(i >= offsets[bottom]);
-    assert(i < offsets[top]);
-
-    if (index[i]>= (offsets[top]-offsets[bottom])) {
-      printf("index problem: %d %d %d %d %d\n",i, offsets[top], offsets[bottom], offsets[top]-offsets[bottom], index[i]);
-      return;
-    }
+    // found hit corresponding to our cuda thread!!!!!
+    // do the job
 
-    assert(index[i]<offsets[top]-offsets[bottom]);
-
-    int16_t phiMe = iphi[offsets[bottom]+index[i]];
-
-    auto size = offsets[top+1] - offsets[top];
-    assert(size<std::numeric_limits<uint16_t>::max());
-
-    auto jLimits = findPhiLimits(phiMe, iphi+offsets[top],index+offsets[top],size,iphicut);
-
-    auto slidingWindow = [&](uint16_t mysize, uint16_t mymin,uint16_t mymax) {
-      auto topPhi = iphi+offsets[top];
-      uint16_t imax =  std::numeric_limits<uint16_t>::max();
-      uint16_t offset = (mymin>mymax) ? imax-(mysize-1) : 0;
-      int n=0;
-      for (uint16_t i = mymin+offset; i!=mymax; i++) {
-        assert(i<=imax);
-        uint16_t k = (i>mymax) ? i-offset : i;
-        assert(k<mysize);
-        assert(k>=mymin || k<mymax);
-        if (int16_t(topPhi[k]-phiMe)>2*iphicut && int16_t(phiMe-topPhi[k])>2*iphicut)
-          printf("deltaPhi problem: %d %d %d %d, deltas %d:%d cut %d\n",i,k,phiMe,topPhi[k],int16_t(topPhi[k]-phiMe),int16_t(phiMe-topPhi[k]),iphicut);
-        n++;
-      }
-      int tot = (mymin>mymax) ? (mysize-mymin)+mymax : mymax-mymin;
-      assert(n==tot);
+    auto mep = iphi[i];
+    auto mez = hh.zg_d[i];
+    auto mer = hh.rg_d[i];
+    auto cutoff = [&](int j) { return 
+        abs(hh.zg_d[j]-mez) > maxz[pairLayerId] ||
+      	abs(hh.zg_d[j]-mez) < minz[pairLayerId] ||
+        hh.rg_d[j]-mer > maxr[pairLayerId];
     };
 
-    slidingWindow(size,jLimits.first,jLimits.second);
-  }
-
-  template<typename Hist>
-  __device__
-  void doubletsFromHisto(int16_t const * iphi, Hist const * hist, uint32_t const * offsets, float phiCut) {
-    auto iphicut = phi2short(phiCut);
-    auto i = blockIdx.x*blockDim.x + threadIdx.x;
-    if (i>=offsets[9]) {
-      // get rid of last layer
-      return;
-    }
+    constexpr float z0cut = 12.f;
+    auto z0cutoff = [&](int j) {
+      auto zo =	hh.zg_d[j];
+      auto ro = hh.rg_d[j]; 
+      auto dr = ro-mer;
+      return dr > maxr[pairLayerId] || 
+             dr<0 || std::abs((mez*ro - mer*zo)) > z0cut*dr;
+    };
 
-    assert(0==offsets[0]);
-    int top = (i>offsets[5]) ? 5: 0;
-    while (i>=offsets[++top]){};
-    assert(top<10);
-    auto bottom = top-1;
-    if (bottom==3 || bottom==6) {
-      // do not have UP... (9 we got rid already)
-      return;
-    }
-    assert(i>=offsets[bottom]);
-    assert(i<offsets[top]);
+    auto iphicut = phicuts[pairLayerId];
 
-    auto mep = iphi[i];
-    auto kl = hist[top].bin(mep-iphicut);
-    auto kh = hist[top].bin(mep+iphicut);
+    auto kl = hist[outer].bin(int16_t(mep-iphicut));
+    auto kh = hist[outer].bin(int16_t(mep+iphicut));
     auto incr = [](auto & k) { return k = (k+1)%Hist::nbins();};
     int tot  = 0;
     int nmin = 0;
     auto khh = kh;
     incr(khh);
+    
+    int tooMany=0;
     for (auto kk=kl; kk!=khh; incr(kk)) {
-      if (kk!=kl && kk!=kh) nmin+=hist[top].size(kk);
-      for (auto p=hist[top].begin(kk); p<hist[top].end(kk); ++p) {
-        if (std::min(std::abs(int16_t(iphi[*p]-mep)), std::abs(int16_t(mep-iphi[*p]))) > iphicut)
+      if (kk!=kl && kk!=kh) nmin+=hist[outer].size(kk);
+      for (auto p=hist[outer].begin(kk); p<hist[outer].end(kk); ++p) {
+        auto oi=*p;
+        assert(oi>=offsets[outer]);
+        assert(oi<offsets[outer+1]);
+
+        if (std::min(std::abs(int16_t(iphi[oi]-mep)), std::abs(int16_t(mep-iphi[oi]))) > iphicut)
           continue;
+        if (z0cutoff(oi)) continue;
+        auto ind = atomicInc(nCells,MaxNumOfDoublets);
+        // int layerPairId, int doubletId, int innerHitId,int outerHitId)
+        cells[ind].init(hh,pairLayerId,ind,i,oi);
+        isOuterHitOfCell[oi].push_back(ind);
+        if (isOuterHitOfCell[oi].full()) ++tooMany;
         ++tot;
       }
     }
-    if (0==hist[top].nspills) assert(tot>=nmin);
+    if (tooMany>0) printf("OuterHitOfCell full for %d in layer %d/%d, %d:%d   %d,%d\n", i, inner,outer, kl,kh,nmin,tot);
+
+    if (hist[outer].nspills>0)
+      printf("spill bin to be checked in %d %d\n",outer,hist[outer].nspills);
+
+    // if (0==hist[outer].nspills) assert(tot>=nmin);
     // look in spill bin as well....
-  }
 
+
+  }  // loop in block...
+  }
   __global__
-  void getDoubletsFromHisto(siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp, float phiCut) {
+  void getDoubletsFromHisto(GPUCACell * cells, uint32_t * nCells, siPixelRecHitsHeterogeneousProduct::HitsOnGPU const * hhp,                
+                            GPU::VecArray< unsigned int, 2048> *isOuterHitOfCell) {
+
+    uint8_t const layerPairs[2*13] = {0,1 ,1,2 ,2,3 
+                                     // ,0,4 ,1,4 ,2,4 ,4,5 ,5,6  
+                                     ,0,7 ,1,7 ,2,7 ,7,8 ,8,9
+                                     ,0,4 ,1,4 ,2,4 ,4,5 ,5,6
+                                     };
+
+    const int16_t phi0p05 = phi2short(0.05);
+    const int16_t phi0p06 = phi2short(0.06);
+    const int16_t phi0p07 = phi2short(0.07);
+
+    int16_t const phicuts[13] { phi0p05, phi0p05, phi0p06
+                               ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
+                               ,phi0p07, phi0p06, phi0p06, phi0p05, phi0p05
+                              };
+
+    float const minz[13] = { 0., 0., 0.
+                            ,0., 0., 0., 0., 0.
+      	       	       	    ,0., 0., 0., 0., 0.
+                           };
+
+    float const	maxz[13] = { 20.,15.,12.
+                            ,30.,20.,20., 50., 50.
+       	       	       	    ,30.,20.,20., 50., 50.
+                           };
+
+    float const maxr[13] = { 20., 20., 20.
+                            ,9., 7., 6., 5., 5.
+      	       	       	    ,9., 7., 6., 5., 5.
+                           };
+
+
     auto const & hh = *hhp;
-    doubletsFromHisto(hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,phiCut);
+    doubletsFromHisto(layerPairs, 13, cells, nCells, 
+                      hh.iphi_d,hh.hist_d,hh.hitsLayerStart_d,
+                      hh, isOuterHitOfCell,
+                      phicuts, minz, maxz, maxr);
   }
 
+
+
 } // namespace end
 
 #endif // RecoLocalTracker_SiPixelRecHits_plugins_gpuPixelDouplets_h