diff --git a/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h b/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h
index c0cf5b1ea132b..27b03cbe45246 100644
--- a/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h
+++ b/RecoVertex/PrimaryVertexProducer/interface/fitterCUDA.h
@@ -1,6 +1,6 @@
 #ifndef fitterCUDA_h
 #define fitterCUDA_h
-//#include "CUDADataFormats/Track/interface/TrackForPVHeterogeneous.h"
+#include "CUDADataFormats/Track/interface/TrackForPVSoAT.h"
 #include "TrackingTools/TransientTrack/interface/TransientTrack.h"
 #include "RecoVertex/VertexPrimitives/interface/TransientVertex.h"
 //#include "RecoVertex/VertexTools/interface/VertexDistanceXY.h"
@@ -31,15 +31,12 @@ namespace fitterCUDA {
 
 
   //void wrapper(unsigned int ntracks, TrackForPV::TrackForPVSoA* tracks, cudaStream_t stream);
-  std::vector<TransientVertex> wrapper(
-    algo algorithm,
-    std::vector<std::vector<reco::TransientTrack> >&& clusters,
-    reco::BeamSpot beamSpot,
-    VertexState beamVertexState,
-    bool f4D,
-    bool validBS,
-    bool weightFit,
-    bool fVerbose );
+  void wrapper(
+    unsigned int ntracks,
+    TrackForPV::TrackForPVSoA* GPUtracksObject,
+    TrackForPV::VertexForPVSoA* GPUverticesObject,
+    algo algorithm
+  );
 }
 
 #endif
diff --git a/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc b/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc
index 0116363d5d31b..9218e37912930 100644
--- a/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc
+++ b/RecoVertex/PrimaryVertexProducer/plugins/PrimaryVertexProducerCUDA.cc
@@ -398,9 +398,93 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
   clusterizerCUDA::bigKernelWrapper(ntracks, GPUtracksObject, GPUverticesObject, GPUbeta.get(), osumtkwt.get(), cParams, cudaStreamDefault);
 //  std::cout << "End kernel" << std::endl;
   
-   
+
+
+  ////////////////////////////////////////////////////////////////////
+  ////////////////////// Fitting on GPU //////////////////////////////
+  ////////////////////////////////////////////////////////////////////
+
+  //JS_EDIT: fitting moved here before copies back to the cpu
+  clusterizerCUDA::verticesAndClusterize(ntracks, GPUtracksObject, GPUverticesObject, cParams, cudaStreamDefault);
+
+
+  //change to just first algo with beamspot constraint
+  for (std::vector<algo>::const_iterator algorithm = algorithms.begin(); algorithm != algorithms.end(); algorithm++) {
+    auto result = std::make_unique<reco::VertexCollection>();
+    reco::VertexCollection& vColl = (*result);
+    std::vector<TransientVertex> pvs;
+    //outsource move algo to fitterCUDA::algo, less data movement
+    fitterCUDA::algo algorithm_for_fitter;
+    algorithm_for_fitter.fitter = (*algorithm).fitter;
+    algorithm_for_fitter.vertexSelector = (*algorithm).vertexSelector;
+    algorithm_for_fitter.label = (*algorithm).label;
+    algorithm_for_fitter.useBeamConstraint = (*algorithm).useBeamConstraint;
+    algorithm_for_fitter.minNdof = (*algorithm).minNdof;
+
+    fitterCUDA::wrapper(ntracks, GPUtracksObject, GPUverticesObject, algorithm_for_fitter);
+
+    //copy over back to CPU, keep conditionals below the same
+    //conversion happens here//
+
+    //POST_CONDITIONS
+
+    //if (fVerbose) std::cout << "PrimaryVertexProducerCUDAAlgorithm::vertices  candidates =" << pvs.size() << std::endl;
+    //if (clusters.size() > 2 && clusters.size() > 2 * GPUverticesObject->nTrueVertex(0))
+    //  edm::LogWarning("PrimaryVertexProducerCUDA")
+    //      << "more than half of candidate vertices lost " << GPUverticesObject->nTrueVertex(0) << ' ' << clusters.size();
+
+    //should already be sorted
+    //if (GPUverticesObject->nTrueVertex(0) > 1) sort(GPUverticesObject->order(0), GPUverticesObject->order(GPUverticesObject->nTrueVertex(0)-1), VertexHigherPtSquared());
+    for (unsigned int i = 0; i < GPUverticesObject->nTrueVertex(0); i++) {
+      auto iv = GPUverticesObject->order(i);
+    //for (TrackForPV::VertexForPvSoA::const_iterator iv = GPUverticesObject->order(0); iv != GPUverticesObject->order(GPUverticesObject->nTrueVertex(0)-1); iv++) {
+      vColl.push_back(*iv);
+    }
+
+    if (vColl.empty()) {
+      GlobalError bse(beamSpot.rotatedCovariance3D());
+      if ((bse.cxx() <= 0.) || (bse.cyy() <= 0.) || (bse.czz() <= 0.)) {
+        AlgebraicSymMatrix33 we;
+        we(0, 0) = 10000;
+        we(1, 1) = 10000;
+        we(2, 2) = 10000;
+        vColl.push_back(reco::Vertex(beamSpot.position(), we, 0., 0., 0));
+        if (fVerbose) {
+          std::cout << "RecoVertex/PrimaryVertexProducerCUDA: "
+                    << "Beamspot with invalid errors " << bse.matrix() << std::endl;
+          std::cout << "Will put Vertex derived from dummy-fake BeamSpot into Event.\n";
+        }
+      } else {
+        vColl.push_back(reco::Vertex(beamSpot.position(), beamSpot.rotatedCovariance3D(), 0., 0., 0));
+        if (fVerbose) {
+          std::cout << "RecoVertex/PrimaryVertexProducerCUDA: "
+                    << " will put Vertex derived from BeamSpot into Event.\n";
+        }
+      }
+    }
+
+    if (fVerbose) {
+      int ivtx = 0;
+      for (reco::VertexCollection::const_iterator v = vColl.begin(); v != vColl.end(); ++v) {
+        std::cout << "recvtx " << ivtx++ << "#trk " << std::setw(3) << v->tracksSize() << " chi2 " << std::setw(4)
+                  << v->chi2() << " ndof " << std::setw(3) << v->ndof() << " x " << std::setw(6) << v->position().x()
+                  << " dx " << std::setw(6) << v->xError() << " y " << std::setw(6) << v->position().y() << " dy "
+                  << std::setw(6) << v->yError() << " z " << std::setw(6) << v->position().z() << " dz " << std::setw(6)
+                  << v->zError();
+        if (f4D) {
+          std::cout << " t " << std::setw(6) << v->t() << " dt " << std::setw(6) << v->tError();
+        }
+        std::cout << std::endl;
+      }
+    }
+    iEvent.put(std::move(result), algorithm->label);
+  }
+
+
+
+
+
   ///// TODO:: update this when we put the fitter into GPU as well ////
-  
   //cudaCheck(cudaFree(GPUverticesObject));
   //cudaCheck(cudaFree(CPUtracksObject));
   //cudaCheck(cudaFree(GPUtracksObject));
@@ -422,21 +506,27 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
  // cudaCheck(cudaFree(GPUtracksObject));
   //cudaCheck(cudaFree(d_obj_ptr));  
 //  //std::cout << "Finished copying 2" << std::endl;
-  std::vector<TransientVertex> pv = clusterizerCUDA::vertices(ntracks, CPUtracksObject, CPUverticesObject, cParams, t_tks, CPUbeta.get());
+
+
+  //JS_EDIT: COMMENT BC CHANGED
+//  std::vector<TransientVertex> pv = clusterizerCUDA::vertices(ntracks, CPUtracksObject, CPUverticesObject, cParams, t_tks, CPUbeta.get());
   // clusterize tracks in Z
-  std::vector<std::vector<reco::TransientTrack> >&& clusters = clusterizerCUDA::clusterize(pv, cParams);
+  //std::vector<std::vector<reco::TransientTrack> >&& clusters = clusterizerCUDA::clusterize(pv, cParams);
+
+
  // cudaCheck(cudaFree(CPUverticesObject));
  // cudaCheck(cudaFree(CPUtracksObject));
   cudaCheck(cudaDeviceSynchronize());
-  ////////////////////////////////////////////////////////////////////
-  ////////////////////// Fitting on GPU //////////////////////////////
-  ////////////////////////////////////////////////////////////////////
+
+
+
   //std::vector<reco::TransientTrack> seltks;
   // std::vector<std::vector<reco::TransientTrack> > clusters;
   
   // std::vector<std::vector<reco::TransientTrack> > clusters;
 
   // vertex fits
+  /*
   for (std::vector<algo>::const_iterator algorithm = algorithms.begin(); algorithm != algorithms.end(); algorithm++) {
     auto result = std::make_unique<reco::VertexCollection>();
     reco::VertexCollection& vColl = (*result);
@@ -452,7 +542,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
     algorithm_for_fitter.minNdof = (*algorithm).minNdof;
 
     pvs = fitterCUDA::wrapper(algorithm_for_fitter, std::move(clusters), beamSpot, beamVertexState, f4D, validBS, weightFit, fVerbose);
-
+  */
     /*
     for (std::vector<std::vector<reco::TransientTrack> >::const_iterator iclus = clusters.begin();
          iclus != clusters.end();
@@ -540,7 +630,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
           (!validBS || (*(algorithm->vertexSelector))(v, beamVertexState)))
         pvs.push_back(v);
     }  // end of cluster loop
-    */
+
 
     if (fVerbose) {
       std::cout << "PrimaryVertexProducerCUDAAlgorithm::vertices  candidates =" << pvs.size() << std::endl;
@@ -553,7 +643,7 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
     if (pvs.empty() && seltks.size() > 5)
       edm::LogWarning("PrimaryVertexProducerCUDA")
           << "no vertex found with " << seltks.size() << " tracks and " << clusters.size() << " vertex-candidates";
-    */
+
     // sort vertices by pt**2  vertex (aka signal vertex tagging)
     if (pvs.size() > 1) {
       sort(pvs.begin(), pvs.end(), VertexHigherPtSquared());
@@ -611,9 +701,10 @@ void PrimaryVertexProducerCUDA::produce(edm::Event& iEvent, const edm::EventSetu
                 << v->zError();
       std::cout << std::endl;
     }
-    */
+
     iEvent.put(std::move(result), algorithm->label);
   }
+  */
 }
 
 void PrimaryVertexProducerCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
diff --git a/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc b/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc
index 21fe48a11af46..64a65f71a9f22 100644
--- a/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc
+++ b/RecoVertex/PrimaryVertexProducer/plugins/fitterCUDA.cc
@@ -22,29 +22,22 @@
 namespace fitterCUDA {
 
 
-/*
+
 __global__ void fitterKernel(
-    TransientVertex cuda_pvs[],
-    algo algorithm,
-    reco::TransientTrack cuda_clusters[][max_size],
-    reco::BeamSpot beamSpot,
-    VertexState beamVertexState,
-    int clusters_size,
-    int max_size,
-    bool f4D,
-    bool validBS,
-    bool weightFit,
-    bool fVerbose
+    unsigned int ntracks,
+    TrackForPV::TrackForPVSoA* GPUtracksObject,
+    TrackForPV::VertexForPVSoA* GPUverticesObject,
+    algo algorithm
 ){
-
+//      return;
+// RECOMMENT IF NEEDED FROM HERE
       int idx = blockIdx.x * blockDim.x + threadIdx.x;
 
-
 //    for (std::vector<std::vector<reco::reco::TransientTrack> >::const_iterator iclus = clusters.begin();
 //         iclus != clusters.end();
 //         iclus++) {
 
-
+/*
       for (int i = 0; i < clusters_size; i++) {
       double sumwt = 0.;
       double sumwt2 = 0.;
@@ -122,92 +115,89 @@ __global__ void fitterKernel(
           (!validBS || (*(algorithm.vertexSelector))(v, beamVertexState)))
         cuda_pvs[idx] = v; //pvs.push_back(v);
     }
-}
 */
+}
+
 
 #ifdef __CUDACC__
-std::vector<TransientVertex> wrapper(
-    algo algorithm,
-    std::vector<std::vector<reco::TransientTrack> >&& clusters,
-    reco::BeamSpot beamSpot,
-    VertexState beamVertexState,
-    bool f4D,
-    bool validBS,
-    bool weightFit,
-    bool fVerbose ){
-
-    std::cout << "\n\n\n\n\ngot to the wrapper\n";
+void wrapper(
+    unsigned int ntracks,
+    TrackForPV::TrackForPVSoA* GPUtracksObject,
+    TrackForPV::VertexForPVSoA* GPUverticesObject,
+    algo algorithm
+){
 
     //defines grid
     unsigned int blockSize = 1;
     unsigned int gridSize  = 1;
     std::cout << "defined grid size\n";
 
+
+    /*
     //create and allocate all host memory (only pvs needed)
     TransientVertex *cpu_pvs;
-    cpu_pvs = (TransientVertex *) malloc(clusters.size() * sizeof(TransientVertex));
-    std::cout << "created and allocated all host memory\n";
+    TrackForPV::TrackForPVSoA* cpu_clusters;
 
+    cpu_pvs = (TransientVertex *) malloc(ntracks * sizeof(TransientVertex));
+    cpu_clusters = (reco::TransientTrack *) malloc(clusters.size() * max_size * sizeof(reco::TransientTrack)); //1D flattened
 
-    //create and allocate all device memory (pvs and clusters)
-    reco::TransientTrack **cuda_clusters;
-    TransientVertex *cuda_pvs;
-    std::cout << "initial defs good\n";
-    cudaCheck(cudaMalloc(&cuda_pvs, clusters.size() * sizeof(TransientVertex)));
-    cudaCheck(cudaMalloc(&cuda_clusters, clusters.size() * sizeof(reco::TransientTrack *)));
-    std::cout << "initial cuda mallocs good\n";
 
-    reco::TransientTrack sample_track;
-    long unsigned int max_size = 0;
-    for (long unsigned int i = 0; i < clusters.size(); i++) if (max_size < clusters[i].size()) max_size = clusters[i].size();
+    /////////////////////////////////////////////////////////
+    ////// MODIFY TYPE / CONVERT TO WHAT'S NEEDED HERE //////
+    /////////////////////////////////////////////////////////
+
+    long unsigned int n_iter[clusters.size()];
+    std::cout << "size of clusters array: " << clusters.size() << "X" << max_size << "\n";
+    //std::cout << "size of cpu_clusters: " << end(cpu_clusters) - begin(cpu_clusters) << "\n";
     for (long unsigned int i = 0; i < clusters.size(); i++) {
-        cudaCheck(cudaMalloc(&(cuda_clusters[i]), (max_size+1) * sizeof(reco::TransientTrack))); //this is segfaulting rn
-        std::cout << "cudamalloc of clusters 2d is good\n";
+        std::cout << "i is " << i << ". current size is " << clusters[i].size() << "\n";
+        n_iter[i] = clusters[i].size(); //use in kernel to keep track of bounds on 2D array
         for (long unsigned int j = 0; j < clusters[i].size(); j++) {
-            std::cout << "everything until population good\n";
-            cuda_clusters[i][j] = clusters[i][j]; //populating rectangular 2D cluster array on valid vals
+            std::cout << "j is " << j << "\t";
+            reco::TransientTrack test = clusters[i][j];
+            std::cout << "read ok\t";
+            cpu_clusters[i * max_size + j] = test; //populating rectangular 2D cluster array on valid vals
+            std::cout << "write ok\n";
         }
-        cuda_clusters[i][clusters[i].size()] = sample_track; //SET FOR FINAL VALUES, COMPARE IN KERNEL
     }
-    std::cout << "created and allocated all device memory\n";
 
-    //host to device memory copy (NONE NEEDED)
-    //cudaCheck(cudaMemcpy(cuda_clusters, cpu_clusters, memSize1, cudaMemcpyHostToDevice));
-    //cudaCheck(cudaMemcpy(cuda_pvs, cpu_pvs, memSize2, cudaMemcpyHostToDevice));
-    //std::cout << "host memory copied to device memory\n";
+    std::cout << "created and allocated all host memory\n";
+
 
+    //create and allocate all device memory (pvs and clusters)
+    reco::TransientTrack *cuda_clusters;
+    TransientVertex *cuda_pvs;
+    cudaCheck(cudaMalloc(&cuda_pvs, clusters.size() * sizeof(TransientVertex)));
+    cudaCheck(cudaMalloc(&cuda_clusters, clusters.size() * max_size * sizeof(reco::TransientTrack)));
+
+    std::cout << "created and allocated all device memory\n";
+
+    //host to device memory copy
+    cudaCheck(cudaMemcpy(cuda_clusters, cpu_clusters, clusters.size() * max_size * sizeof(reco::TransientTrack), cudaMemcpyHostToDevice));
+    std::cout << "host memory copied to device memory\n";
+    */
 
     //action!
-    /*
     fitterKernel<<<gridSize, blockSize>>>(
-      cuda_pvs,
-      algorithm,
-      cuda_clusters,
-      beamSpot,
-      beamVertexState,
-      clusters.size(),
-      max_size,
-      f4D,
-      validBS,
-      weightFit,
-      fVerbose
+    	ntracks,
+        GPUtracksObject,
+    	GPUverticesObject,
+    	algorithm
     );
     std::cout << "main action complete\n";
-    */
 
     //wait for device to complete / error check
     cudaDeviceSynchronize();
     cudaCheck(cudaGetLastError());
     std::cout << "sync / error check complete\n";
 
+
+    /*
     //device to host memory copy
     cudaCheck(cudaMemcpy(cpu_pvs, cuda_pvs, clusters.size() * sizeof(TransientVertex), cudaMemcpyDeviceToHost));
     std::cout << "device memory copied to host memory\n";
 
     //clear device memory
-    for (long unsigned int i = 0; i < clusters.size(); i++) {
-        cudaCheck(cudaFree(cuda_clusters[i]));
-    }
     cudaCheck(cudaFree(cuda_clusters));
     cudaCheck(cudaFree(cuda_pvs));
     std::cout << "cleared device memory\n\n";
@@ -219,6 +209,8 @@ std::vector<TransientVertex> wrapper(
 
     //done
     return pvs;
+
+    */
 }
 #endif
 }
diff --git a/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root b/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root
index d7b3f4ce85684..e81e8c6a294f7 100644
Binary files a/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root and b/RecoVertex/PrimaryVertexProducer/test/test_dqm_gpu.root differ
diff --git a/RecoVertex/PrimaryVertexProducer/test/test_gpu.root b/RecoVertex/PrimaryVertexProducer/test/test_gpu.root
index 46da6af46c08c..95119c4d64b9b 100644
Binary files a/RecoVertex/PrimaryVertexProducer/test/test_gpu.root and b/RecoVertex/PrimaryVertexProducer/test/test_gpu.root differ