diff --git a/RecoTracker/MkFit/plugins/MkFitGeometryESProducer.cc b/RecoTracker/MkFit/plugins/MkFitGeometryESProducer.cc
index b62e9aeb1f7b0..25d5959d4631a 100644
--- a/RecoTracker/MkFit/plugins/MkFitGeometryESProducer.cc
+++ b/RecoTracker/MkFit/plugins/MkFitGeometryESProducer.cc
@@ -569,7 +569,10 @@ std::unique_ptr<MkFitGeometry> MkFitGeometryESProducer::produce(const TrackerRec
     pconf.backward_fit_to_pca = false;
     pconf.finding_requires_propagation_to_hit_pos = true;
     pconf.finding_inter_layer_pflags = PropagationFlags(PF_use_param_b_field | PF_apply_material);
-    pconf.finding_intra_layer_pflags = PropagationFlags(PF_none);
+    if (Config::usePropToPlane)
+      pconf.finding_intra_layer_pflags = PropagationFlags(PF_use_param_b_field | PF_apply_material);
+    else
+      pconf.finding_intra_layer_pflags = PropagationFlags(PF_none);
     pconf.backward_fit_pflags = PropagationFlags(PF_use_param_b_field | PF_apply_material);
     pconf.forward_fit_pflags = PropagationFlags(PF_use_param_b_field | PF_apply_material);
     pconf.seed_fit_pflags = PropagationFlags(PF_none);
diff --git a/RecoTracker/MkFitCore/interface/Config.h b/RecoTracker/MkFitCore/interface/Config.h
index ac4dfe1277b0a..f76c362eec0ab 100644
--- a/RecoTracker/MkFitCore/interface/Config.h
+++ b/RecoTracker/MkFitCore/interface/Config.h
@@ -48,6 +48,8 @@ namespace mkfit {
     // Config for propagation - could/should enter into PropagationFlags?!
     constexpr int Niter = 5;
     constexpr bool useTrigApprox = true;
+    constexpr bool usePropToPlane = false;
+    constexpr bool usePtMultScat = false;
 
     // Config for Bfield. Note: for now the same for CMS-phase1 and CylCowWLids.
     constexpr float Bfield = 3.8112;
diff --git a/RecoTracker/MkFitCore/interface/TrackerInfo.h b/RecoTracker/MkFitCore/interface/TrackerInfo.h
index 965f47976b4f3..0ee221b156e77 100644
--- a/RecoTracker/MkFitCore/interface/TrackerInfo.h
+++ b/RecoTracker/MkFitCore/interface/TrackerInfo.h
@@ -5,6 +5,7 @@
 #include "RecoTracker/MkFitCore/interface/PropagationConfig.h"
 #include "RecoTracker/MkFitCore/interface/Config.h"
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include <unordered_map>
 
diff --git a/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.cc b/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.cc
index eba6d9b367ba5..481ad42150fdc 100644
--- a/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.cc
+++ b/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.cc
@@ -236,6 +236,90 @@ namespace {
     RotateResidualsOnTangentPlane_impl(R00, R01, A, B, 0, NN);
   }
 
+  //==============================================================================
+
+  inline void ProjectResErr(const MPlex2H& A, const MPlexHS& B, MPlex2H& C) {
+    // C = A * B for each matriplex slot n: A is 2x3, B is 3x3 symmetric (packed), C is 2x3 laid out like A.
+
+    /* Flat element indices; element k of slot n is read/written at fArray[k * N + n].
+    A 0 1 2
+      3 4 5
+    B 0 1 3
+      1 2 4
+      3 4 5
+    */
+
+    typedef float T;
+    const idx_t N = NN;
+
+    const T* a = A.fArray;
+    ASSUME_ALIGNED(a, 64);
+    const T* b = B.fArray;
+    ASSUME_ALIGNED(b, 64);
+    T* c = C.fArray;
+    ASSUME_ALIGNED(c, 64);
+
+#pragma omp simd
+    for (int n = 0; n < N; ++n) {
+      c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n];
+      c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[2 * N + n] + a[2 * N + n] * b[4 * N + n];
+      c[2 * N + n] = a[0 * N + n] * b[3 * N + n] + a[1 * N + n] * b[4 * N + n] + a[2 * N + n] * b[5 * N + n];
+      c[3 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n] + a[5 * N + n] * b[3 * N + n];
+      c[4 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
+      c[5 * N + n] = a[3 * N + n] * b[3 * N + n] + a[4 * N + n] * b[4 * N + n] + a[5 * N + n] * b[5 * N + n];
+    }
+  }
+
+  inline void ProjectResErrTransp(const MPlex2H& A, const MPlex2H& B, MPlex2S& C) {
+    // C = B * A^T, C is 2x2 sym (packed: only elements 0, 1, 2 are computed), A is 2x3 (A^T is 3x2), B is 2x3
+
+    /* Flat element indices; the off-diagonal c[1] is (row 0 of B) . (row 1 of A), its mirror is not computed.
+    B   0 1 2
+        3 4 5
+    A^T 0 3
+        1 4
+        2 5
+    */
+
+    typedef float T;
+    const idx_t N = NN;
+
+    const T* a = A.fArray;
+    ASSUME_ALIGNED(a, 64);
+    const T* b = B.fArray;
+    ASSUME_ALIGNED(b, 64);
+    T* c = C.fArray;
+    ASSUME_ALIGNED(c, 64);
+
+#pragma omp simd
+    for (int n = 0; n < N; ++n) {
+      c[0 * N + n] = b[0 * N + n] * a[0 * N + n] + b[1 * N + n] * a[1 * N + n] + b[2 * N + n] * a[2 * N + n];
+      c[1 * N + n] = b[0 * N + n] * a[3 * N + n] + b[1 * N + n] * a[4 * N + n] + b[2 * N + n] * a[5 * N + n];
+      c[2 * N + n] = b[3 * N + n] * a[3 * N + n] + b[4 * N + n] * a[4 * N + n] + b[5 * N + n] * a[5 * N + n];
+    }
+  }
+
+  inline void RotateResidualsOnPlane(const MPlex2H& R,  // prj: 2x3 projection matrix (read-only)
+                                     const MPlexHV& A,  // res_glo: 3-component residual (read-only)
+                                     MPlex2V& B)        // res_loc: 2-component result (written)
+  {
+    // B = R * A for each matriplex slot n, i.e. res_loc (2x1) = prj (2x3) * res_glo (3x1).
+    // Only B is written; R and A are only read.
+
+    // Elements are addressed through operator()(n, row, col) instead of raw fArray
+    // pointers, so no typedef / ASSUME_ALIGNED preamble is declared here.
+    // A raw-pointer preamble like the one in the sibling helpers was only sketched
+    // in comments at this spot; it is not used by this implementation.
+    // The loop below covers all NN slots of the matriplex.
+    // NOTE(review): confirm the accessor-based form vectorizes as well as the raw one.
+
+#pragma omp simd
+    for (int n = 0; n < NN; ++n) {
+      B(n, 0, 0) = R(n, 0, 0) * A(n, 0, 0) + R(n, 0, 1) * A(n, 1, 0) + R(n, 0, 2) * A(n, 2, 0);
+      B(n, 1, 0) = R(n, 1, 0) * A(n, 0, 0) + R(n, 1, 1) * A(n, 1, 0) + R(n, 1, 2) * A(n, 2, 0);
+    }
+  }
+
   inline void KalmanHTG(const MPlexQF& A00, const MPlexQF& A01, const MPlex2S& B, MPlexHH& C) {
     // HTG  = rot * res_loc
     //   C  =  A  *    B
@@ -304,6 +388,93 @@ namespace {
     }
   }
 
+  inline void KalmanHTG(const MPlex2H& A, const MPlex2S& B, MPlexH2& C) {
+    // HTG  = prj^T * G, with G the 2x2 symmetric matrix B (the plane caller passes the inverted resErr_loc)
+    //   C  =  A^T  *   B, C is 3x2, A is 2x3, B is 2x2 sym (packed)
+
+    /* Flat element indices; element k of slot n is at fArray[k * N + n].
+    A^T 0 3
+        1 4
+        2 5
+    B 0 1
+      1 2
+    C 0 1
+      2 3
+      4 5
+    */
+
+    typedef float T;
+    const idx_t N = NN;
+
+    const T* a = A.fArray;
+    ASSUME_ALIGNED(a, 64);
+    const T* b = B.fArray;
+    ASSUME_ALIGNED(b, 64);
+    T* c = C.fArray;
+    ASSUME_ALIGNED(c, 64);
+
+#pragma omp simd
+    for (int n = 0; n < N; ++n) {
+      c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[3 * N + n] * b[1 * N + n];
+      c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[3 * N + n] * b[2 * N + n];
+      c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[4 * N + n] * b[1 * N + n];
+      c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[4 * N + n] * b[2 * N + n];
+      c[4 * N + n] = a[2 * N + n] * b[0 * N + n] + a[5 * N + n] * b[1 * N + n];
+      c[5 * N + n] = a[2 * N + n] * b[1 * N + n] + a[5 * N + n] * b[2 * N + n];
+    }
+  }
+
+  inline void KalmanGain(const MPlexLS& A, const MPlexH2& B, MPlexL2& C) {
+    // C = A * B, C is 6x2, A is 6x6 sym , B is 3x2 (6x2 but half of it is zeros); only columns 0-2 of A are read
+
+    /* Flat element indices (A is packed symmetric); element k of slot n is at fArray[k * N + n].
+      A 0  1  3  6 10 15
+        1  2  4  7 11 16
+        3  4  5  8 12 17
+        6  7  8  9 13 18
+       10 11 12 13 14 19
+       15 16 17 18 19 20
+      B 0  1
+        2  3
+        4  5
+        X  X with X=0, so not even included in B
+        X  X
+        X  X
+      C 0  1
+        2  3
+        4  5
+        6  7
+        8  9
+       10 11
+     */
+
+    typedef float T;
+    const idx_t N = NN;
+
+    const T* a = A.fArray;
+    ASSUME_ALIGNED(a, 64);
+    const T* b = B.fArray;
+    ASSUME_ALIGNED(b, 64);
+    T* c = C.fArray;
+    ASSUME_ALIGNED(c, 64);
+
+#pragma omp simd
+    for (int n = 0; n < N; ++n) {
+      c[0 * N + n] = a[0 * N + n] * b[0 * N + n] + a[1 * N + n] * b[2 * N + n] + a[3 * N + n] * b[4 * N + n];
+      c[1 * N + n] = a[0 * N + n] * b[1 * N + n] + a[1 * N + n] * b[3 * N + n] + a[3 * N + n] * b[5 * N + n];
+      c[2 * N + n] = a[1 * N + n] * b[0 * N + n] + a[2 * N + n] * b[2 * N + n] + a[4 * N + n] * b[4 * N + n];
+      c[3 * N + n] = a[1 * N + n] * b[1 * N + n] + a[2 * N + n] * b[3 * N + n] + a[4 * N + n] * b[5 * N + n];
+      c[4 * N + n] = a[3 * N + n] * b[0 * N + n] + a[4 * N + n] * b[2 * N + n] + a[5 * N + n] * b[4 * N + n];
+      c[5 * N + n] = a[3 * N + n] * b[1 * N + n] + a[4 * N + n] * b[3 * N + n] + a[5 * N + n] * b[5 * N + n];
+      c[6 * N + n] = a[6 * N + n] * b[0 * N + n] + a[7 * N + n] * b[2 * N + n] + a[8 * N + n] * b[4 * N + n];
+      c[7 * N + n] = a[6 * N + n] * b[1 * N + n] + a[7 * N + n] * b[3 * N + n] + a[8 * N + n] * b[5 * N + n];
+      c[8 * N + n] = a[10 * N + n] * b[0 * N + n] + a[11 * N + n] * b[2 * N + n] + a[12 * N + n] * b[4 * N + n];
+      c[9 * N + n] = a[10 * N + n] * b[1 * N + n] + a[11 * N + n] * b[3 * N + n] + a[12 * N + n] * b[5 * N + n];
+      c[10 * N + n] = a[15 * N + n] * b[0 * N + n] + a[16 * N + n] * b[2 * N + n] + a[17 * N + n] * b[4 * N + n];
+      c[11 * N + n] = a[15 * N + n] * b[1 * N + n] + a[16 * N + n] * b[3 * N + n] + a[17 * N + n] * b[5 * N + n];
+    }
+  }
+
   inline void CovXYconstrain(const MPlexQF& R00, const MPlexQF& R01, const MPlexLS& Ci, MPlexLS& Co) {
     // C is transformed to align along y after rotation and rotated back
 
@@ -367,6 +538,77 @@ namespace {
     KHMult_imp(A, B00, B01, C, 0, NN);
   }
 
+  inline void KHMult(const MPlexL2& A, const MPlex2H& B, MPlexLL& C) {
+    // C = A * B, C is 6x6, A is 6x2 , B is 2x3 (2x6 but half of it made of zeros)
+
+    /* Flat element indices of each matrix:
+    A 0  1
+      2  3
+      4  5
+      6  7
+      8  9
+     10 11
+    B  0  1  2  X  X  X with X=0 so not included in B
+       3  4  5  X  X  X
+    C  0  1  2  3  4  5
+       6  7  8  9 10 11
+      12 13 14 15 16 17
+      18 19 20 21 22 23
+      24 25 26 27 28 29
+      30 31 32 33 34 35
+    */
+
+    // A and B are read with true (row, col) indices; C is written as C(n, 0, k) with
+    // k = 0..35 being the flat element index from the table above.
+
+    // NOTE(review): this presumably relies on operator()(n, i, j) resolving to the flat
+    // offset i * 6 + j for the 6x6 C, so that (0, k) reaches element k -- confirm against
+    // the Matriplex accessor. Columns 3-5 of every row of C are set to zero because the
+    // corresponding columns of B are implicit zeros.
+    // A raw-pointer (fArray + ASSUME_ALIGNED) preamble was only sketched here in comments
+    // and is not used by this implementation.
+
+#pragma omp simd
+    for (int n = 0; n < NN; ++n) {
+      C(n, 0, 0) = A(n, 0, 0) * B(n, 0, 0) + A(n, 0, 1) * B(n, 1, 0);
+      C(n, 0, 1) = A(n, 0, 0) * B(n, 0, 1) + A(n, 0, 1) * B(n, 1, 1);
+      C(n, 0, 2) = A(n, 0, 0) * B(n, 0, 2) + A(n, 0, 1) * B(n, 1, 2);
+      C(n, 0, 3) = 0;
+      C(n, 0, 4) = 0;
+      C(n, 0, 5) = 0;
+      C(n, 0, 6) = A(n, 1, 0) * B(n, 0, 0) + A(n, 1, 1) * B(n, 1, 0);
+      C(n, 0, 7) = A(n, 1, 0) * B(n, 0, 1) + A(n, 1, 1) * B(n, 1, 1);
+      C(n, 0, 8) = A(n, 1, 0) * B(n, 0, 2) + A(n, 1, 1) * B(n, 1, 2);
+      C(n, 0, 9) = 0;
+      C(n, 0, 10) = 0;
+      C(n, 0, 11) = 0;
+      C(n, 0, 12) = A(n, 2, 0) * B(n, 0, 0) + A(n, 2, 1) * B(n, 1, 0);
+      C(n, 0, 13) = A(n, 2, 0) * B(n, 0, 1) + A(n, 2, 1) * B(n, 1, 1);
+      C(n, 0, 14) = A(n, 2, 0) * B(n, 0, 2) + A(n, 2, 1) * B(n, 1, 2);
+      C(n, 0, 15) = 0;
+      C(n, 0, 16) = 0;
+      C(n, 0, 17) = 0;
+      C(n, 0, 18) = A(n, 3, 0) * B(n, 0, 0) + A(n, 3, 1) * B(n, 1, 0);
+      C(n, 0, 19) = A(n, 3, 0) * B(n, 0, 1) + A(n, 3, 1) * B(n, 1, 1);
+      C(n, 0, 20) = A(n, 3, 0) * B(n, 0, 2) + A(n, 3, 1) * B(n, 1, 2);
+      C(n, 0, 21) = 0;
+      C(n, 0, 22) = 0;
+      C(n, 0, 23) = 0;
+      C(n, 0, 24) = A(n, 4, 0) * B(n, 0, 0) + A(n, 4, 1) * B(n, 1, 0);
+      C(n, 0, 25) = A(n, 4, 0) * B(n, 0, 1) + A(n, 4, 1) * B(n, 1, 1);
+      C(n, 0, 26) = A(n, 4, 0) * B(n, 0, 2) + A(n, 4, 1) * B(n, 1, 2);
+      C(n, 0, 27) = 0;
+      C(n, 0, 28) = 0;
+      C(n, 0, 29) = 0;
+      C(n, 0, 30) = A(n, 5, 0) * B(n, 0, 0) + A(n, 5, 1) * B(n, 1, 0);
+      C(n, 0, 31) = A(n, 5, 0) * B(n, 0, 1) + A(n, 5, 1) * B(n, 1, 1);
+      C(n, 0, 32) = A(n, 5, 0) * B(n, 0, 2) + A(n, 5, 1) * B(n, 1, 2);
+      C(n, 0, 33) = 0;
+      C(n, 0, 34) = 0;
+      C(n, 0, 35) = 0;
+    }
+  }
+
   inline void KHC(const MPlexLL& A, const MPlexLS& B, MPlexLS& C) {
     // C = A * B, C is 6x6, A is 6x6 , B is 6x6 sym
 
@@ -622,6 +864,30 @@ namespace mkfit {
 #ifdef DEBUG
     {
       dmutex_guard;
+      printf("res_glo:\n");
+      for (int i = 0; i < 3; ++i) {
+        printf("%8f ", res_glo.At(0, i, 0));
+      }
+      printf("\n");
+      printf("resErr_glo:\n");
+      for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j)
+          printf("%8f ", resErr_glo.At(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+      printf("res_loc:\n");
+      for (int i = 0; i < 2; ++i) {
+        printf("%8f ", res_loc.At(0, i, 0));
+      }
+      printf("\n");
+      printf("tempHH:\n");
+      for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j)
+          printf("%8f ", tempHH.At(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
       printf("resErr_loc:\n");
       for (int i = 0; i < 2; ++i) {
         for (int j = 0; j < 2; ++j)
@@ -663,10 +929,10 @@ namespace mkfit {
       KalmanGain(psErrLoc, tempHH, K);
 
       MultResidualsAdd(K, psPar, res_loc, outPar);
-      MPlexLL tempLL;
 
       squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
 
+      MPlexLL tempLL;
       KHMult(K, rotT00, rotT01, tempLL);
       KHC(tempLL, psErrLoc, outErr);
       outErr.subtract(psErrLoc, outErr);
@@ -683,16 +949,330 @@ namespace mkfit {
           }
           printf("\n");
         }
-        printf("res_glo:\n");
+        printf("resErr_loc (Inv):\n");
+        for (int i = 0; i < 2; ++i) {
+          for (int j = 0; j < 2; ++j)
+            printf("%8f ", resErr_loc.At(0, i, j));
+          printf("\n");
+        }
+        printf("\n");
+        printf("tempHH:\n");
         for (int i = 0; i < 3; ++i) {
-          printf("%8f ", res_glo.At(0, i, 0));
+          for (int j = 0; j < 3; ++j)
+            printf("%8f ", tempHH.At(0, i, j));
+          printf("\n");
+        }
+        printf("\n");
+        printf("K:\n");
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 3; ++j)
+            printf("%8f ", K.At(0, i, j));
+          printf("\n");
         }
         printf("\n");
-        printf("res_loc:\n");
+        printf("tempLL:\n");
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            printf("%8f ", tempLL.At(0, i, j));
+          printf("\n");
+        }
+        printf("\n");
+        printf("outPar:\n");
+        for (int i = 0; i < 6; ++i) {
+          printf("%8f  ", outPar.At(0, i, 0));
+        }
+        printf("\n");
+        printf("outErr:\n");
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            printf("%8f ", outErr.At(0, i, j));
+          printf("\n");
+        }
+        printf("\n");
+      }
+#endif
+    }
+  }
+
+  //==============================================================================
+  // Kalman operations - Plane
+  //==============================================================================
+
+  void kalmanUpdatePlane(const MPlexLS& psErr,  // input state covariance
+                         const MPlexLV& psPar,  // input state parameters
+                         const MPlexHS& msErr,  // measurement covariance
+                         const MPlexHV& msPar,  // measurement position
+                         const MPlexHV& plNrm,  // plane normal
+                         const MPlexHV& plDir,  // strip direction vector
+                         MPlexLS& outErr,       // updated covariance (output)
+                         MPlexLV& outPar,       // updated parameters (output)
+                         const int N_proc) {    // update only, no propagation; chi2 slot gets dummy_chi2
+    kalmanOperationPlane(
+        KFO_Update_Params | KFO_Local_Cov, psErr, psPar, msErr, msPar, plNrm, plDir, outErr, outPar, dummy_chi2, N_proc);
+  }
+
+  void kalmanPropagateAndUpdatePlane(const MPlexLS& psErr,  // input state covariance
+                                     const MPlexLV& psPar,  // input state parameters
+                                     MPlexQI& Chg,          // charge; may be sign-flipped at the end
+                                     const MPlexHS& msErr,  // measurement covariance
+                                     const MPlexHV& msPar,  // measurement position
+                                     const MPlexHV& plNrm,  // plane normal
+                                     const MPlexHV& plDir,  // strip direction vector
+                                     MPlexLS& outErr,       // updated covariance (output)
+                                     MPlexLV& outPar,       // updated parameters (output)
+                                     MPlexQI& outFailFlag,  // handed to the propagation; untouched when propToHit is false
+                                     const int N_proc,
+                                     const PropagationFlags& propFlags,
+                                     const bool propToHit) {  // true: propagate before updating
+    if (propToHit) {
+      MPlexLS propErr;
+      MPlexLV propPar;
+      // presumably propagates to the plane through msPar with normal plNrm -- confirm against the propagator
+      propagateHelixToPlaneMPlex(psErr, psPar, Chg, msPar, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
+
+      kalmanOperationPlane(KFO_Update_Params | KFO_Local_Cov,  // update using the propagated state
+                           propErr,
+                           propPar,
+                           msErr,
+                           msPar,
+                           plNrm,
+                           plDir,
+                           outErr,
+                           outPar,
+                           dummy_chi2,
+                           N_proc);
+    } else {
+      kalmanOperationPlane(KFO_Update_Params | KFO_Local_Cov,  // update directly from the input state
+                           psErr,
+                           psPar,
+                           msErr,
+                           msPar,
+                           plNrm,
+                           plDir,
+                           outErr,
+                           outPar,
+                           dummy_chi2,
+                           N_proc);
+    }
+    for (int n = 0; n < NN; ++n) {  // runs over all NN slots, not only the first N_proc
+      if (outPar.At(n, 3, 0) < 0) {  // parameter 3 negative (presumably 1/pT -- confirm): flip it and the charge together
+        Chg.At(n, 0, 0) = -Chg.At(n, 0, 0);
+        outPar.At(n, 3, 0) = -outPar.At(n, 3, 0);
+      }
+    }
+  }
+
+  //------------------------------------------------------------------------------
+
+  void kalmanComputeChi2Plane(const MPlexLS& psErr,  // input state covariance
+                              const MPlexLV& psPar,  // input state parameters
+                              const MPlexQI& inChg,  // not used in this function body
+                              const MPlexHS& msErr,  // measurement covariance
+                              const MPlexHV& msPar,  // measurement position
+                              const MPlexHV& plNrm,  // plane normal
+                              const MPlexHV& plDir,  // strip direction vector
+                              MPlexQF& outChi2,      // chi2 result (output)
+                              const int N_proc) {    // chi2 only, no propagation; dummy_err / dummy_par fill the update slots
+    kalmanOperationPlane(
+        KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
+  }
+
+  void kalmanPropagateAndComputeChi2Plane(const MPlexLS& psErr,  // input state covariance
+                                          const MPlexLV& psPar,  // input state parameters
+                                          const MPlexQI& inChg,  // charge, forwarded to the propagation only
+                                          const MPlexHS& msErr,  // measurement covariance
+                                          const MPlexHV& msPar,  // measurement position
+                                          const MPlexHV& plNrm,  // plane normal
+                                          const MPlexHV& plDir,  // strip direction vector
+                                          MPlexQF& outChi2,      // chi2 result (output)
+                                          MPlexLV& propPar,      // parameters the chi2 was evaluated at (output)
+                                          MPlexQI& outFailFlag,  // handed to the propagation; untouched when propToHit is false
+                                          const int N_proc,
+                                          const PropagationFlags& propFlags,
+                                          const bool propToHit) {  // true: propagate before computing chi2
+    propPar = psPar;  // start from the input state; this is the final value when propToHit is false
+    if (propToHit) {
+      MPlexLS propErr;
+      propagateHelixToPlaneMPlex(psErr, psPar, inChg, msPar, plNrm, propErr, propPar, outFailFlag, N_proc, propFlags);
+
+      kalmanOperationPlane(  // chi2 with respect to the propagated state
+          KFO_Calculate_Chi2, propErr, propPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
+    } else {
+      kalmanOperationPlane(  // chi2 with respect to the input state
+          KFO_Calculate_Chi2, psErr, psPar, msErr, msPar, plNrm, plDir, dummy_err, dummy_par, outChi2, N_proc);
+    }
+  }
+
+  //------------------------------------------------------------------------------
+
+  void kalmanOperationPlane(const int kfOp,
+                            const MPlexLS& psErr,
+                            const MPlexLV& psPar,
+                            const MPlexHS& msErr,
+                            const MPlexHV& msPar,
+                            const MPlexHV& plNrm,
+                            const MPlexHV& plDir,
+                            MPlexLS& outErr,
+                            MPlexLV& outPar,
+                            MPlexQF& outChi2,
+                            const int N_proc) {
+#ifdef DEBUG
+    {
+      dmutex_guard;
+      printf("psPar:\n");
+      for (int i = 0; i < 6; ++i) {
+        printf("%8f ", psPar.constAt(0, 0, i));
+        printf("\n");
+      }
+      printf("\n");
+      printf("psErr:\n");
+      for (int i = 0; i < 6; ++i) {
+        for (int j = 0; j < 6; ++j)
+          printf("%8f ", psErr.constAt(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+      printf("msPar:\n");
+      for (int i = 0; i < 3; ++i) {
+        printf("%8f ", msPar.constAt(0, 0, i));
+        printf("\n");
+      }
+      printf("\n");
+      printf("msErr:\n");
+      for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j)
+          printf("%8f ", msErr.constAt(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+    }
+#endif
+
+    // Project the global residual and its covariance onto the plane given by plNrm and plDir.
+    // Only the two in-plane rows (D and X) of the rotation matrix below are stored in prj (2x3).
+
+    // Rotation matrix
+    //    D0  D1   D2
+    //    X0  X1   X2
+    //    N0  N1   N2
+    // where D is the strip direction vector plDir, N is the normal plNrm, and X is the cross product N x D
+
+    MPlex2H prj;
+    for (int n = 0; n < NN; ++n) {
+      prj(n, 0, 0) = plDir(n, 0, 0);
+      prj(n, 0, 1) = plDir(n, 1, 0);
+      prj(n, 0, 2) = plDir(n, 2, 0);
+      prj(n, 1, 0) = plNrm(n, 1, 0) * plDir(n, 2, 0) - plNrm(n, 2, 0) * plDir(n, 1, 0);
+      prj(n, 1, 1) = plNrm(n, 2, 0) * plDir(n, 0, 0) - plNrm(n, 0, 0) * plDir(n, 2, 0);
+      prj(n, 1, 2) = plNrm(n, 0, 0) * plDir(n, 1, 0) - plNrm(n, 1, 0) * plDir(n, 0, 0);
+    }
+
+    MPlexHV res_glo;  //position residual in global coordinates
+    SubtractFirst3(msPar, psPar, res_glo);
+
+    MPlexHS resErr_glo;  //covariance sum in global position coordinates
+    AddIntoUpperLeft3x3(psErr, msErr, resErr_glo);
+
+    MPlex2V res_loc;  //position residual in local coordinates
+    RotateResidualsOnPlane(prj, res_glo, res_loc);
+    MPlex2S resErr_loc;  //covariance sum in local position coordinates
+    MPlex2H temp2H;
+    ProjectResErr(prj, resErr_glo, temp2H);
+    ProjectResErrTransp(prj, temp2H, resErr_loc);
+
+#ifdef DEBUG
+    {
+      dmutex_guard;
+      printf("prj:\n");
+      for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 3; ++j)
+          printf("%8f ", prj.At(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+      printf("res_glo:\n");
+      for (int i = 0; i < 3; ++i) {
+        printf("%8f ", res_glo.At(0, i, 0));
+      }
+      printf("\n");
+      printf("resErr_glo:\n");
+      for (int i = 0; i < 3; ++i) {
+        for (int j = 0; j < 3; ++j)
+          printf("%8f ", resErr_glo.At(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+      printf("res_loc:\n");
+      for (int i = 0; i < 2; ++i) {
+        printf("%8f ", res_loc.At(0, i, 0));
+      }
+      printf("\n");
+      printf("temp2H:\n");
+      for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 3; ++j)
+          printf("%8f ", temp2H.At(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+      printf("resErr_loc:\n");
+      for (int i = 0; i < 2; ++i) {
+        for (int j = 0; j < 2; ++j)
+          printf("%8f ", resErr_loc.At(0, i, j));
+        printf("\n");
+      }
+      printf("\n");
+    }
+#endif
+
+    //invert the 2x2 matrix
+    Matriplex::invertCramerSym(resErr_loc);
+
+    if (kfOp & KFO_Calculate_Chi2) {
+      Chi2Similarity(res_loc, resErr_loc, outChi2);
+
+#ifdef DEBUG
+      {
+        dmutex_guard;
+        printf("resErr_loc (Inv):\n");
         for (int i = 0; i < 2; ++i) {
-          printf("%8f ", res_loc.At(0, i, 0));
+          for (int j = 0; j < 2; ++j)
+            printf("%8f ", resErr_loc.At(0, i, j));
+          printf("\n");
         }
         printf("\n");
+        printf("chi2: %8f\n", outChi2.At(0, 0, 0));
+      }
+#endif
+    }
+
+    if (kfOp & KFO_Update_Params) {
+      MPlexLS psErrLoc = psErr;
+
+      MPlexH2 tempH2;
+      MPlexL2 K;                           // kalman gain, fixme should be L2
+      KalmanHTG(prj, resErr_loc, tempH2);  // intermediate term to get kalman gain (H^T*G)
+      KalmanGain(psErrLoc, tempH2, K);
+
+      MultResidualsAdd(K, psPar, res_loc, outPar);
+
+      squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
+
+      MPlexLL tempLL;
+      KHMult(K, prj, tempLL);
+      KHC(tempLL, psErrLoc, outErr);
+      outErr.subtract(psErrLoc, outErr);
+
+#ifdef DEBUG
+      {
+        dmutex_guard;
+        if (kfOp & KFO_Local_Cov) {
+          printf("psErrLoc:\n");
+          for (int i = 0; i < 6; ++i) {
+            for (int j = 0; j < 6; ++j)
+              printf("% 8e ", psErrLoc.At(0, i, j));
+            printf("\n");
+          }
+          printf("\n");
+        }
         printf("resErr_loc (Inv):\n");
         for (int i = 0; i < 2; ++i) {
           for (int j = 0; j < 2; ++j)
@@ -700,13 +1280,27 @@ namespace mkfit {
           printf("\n");
         }
         printf("\n");
+        printf("tempH2:\n");
+        for (int i = 0; i < 3; ++i) {
+          for (int j = 0; j < 2; ++j)
+            printf("%8f ", tempH2.At(0, i, j));
+          printf("\n");
+        }
+        printf("\n");
         printf("K:\n");
         for (int i = 0; i < 6; ++i) {
-          for (int j = 0; j < 3; ++j)
+          for (int j = 0; j < 2; ++j)
             printf("%8f ", K.At(0, i, j));
           printf("\n");
         }
         printf("\n");
+        printf("tempLL:\n");
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            printf("%8f ", tempLL.At(0, i, j));
+          printf("\n");
+        }
+        printf("\n");
         printf("outPar:\n");
         for (int i = 0; i < 6; ++i) {
           printf("%8f  ", outPar.At(0, i, 0));
diff --git a/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.h b/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.h
index 67291ebf06ce8..746e1b05a6f9b 100644
--- a/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.h
+++ b/RecoTracker/MkFitCore/src/KalmanUtilsMPlex.h
@@ -114,5 +114,67 @@ namespace mkfit {
                              MPlexQF& outChi2,
                              const int N_proc);
 
+  //------------------------------------------------------------------------------
+
+  void kalmanUpdatePlane(const MPlexLS& psErr,
+                         const MPlexLV& psPar,
+                         const MPlexHS& msErr,
+                         const MPlexHV& msPar,
+                         const MPlexHV& plNrm,
+                         const MPlexHV& plDir,
+                         MPlexLS& outErr,
+                         MPlexLV& outPar,
+                         const int N_proc);
+
+  void kalmanPropagateAndUpdatePlane(const MPlexLS& psErr,
+                                     const MPlexLV& psPar,
+                                     MPlexQI& Chg,
+                                     const MPlexHS& msErr,
+                                     const MPlexHV& msPar,
+                                     const MPlexHV& plNrm,
+                                     const MPlexHV& plDir,
+                                     MPlexLS& outErr,
+                                     MPlexLV& outPar,
+                                     MPlexQI& outFailFlag,
+                                     const int N_proc,
+                                     const PropagationFlags& propFlags,
+                                     const bool propToHit);
+
+  void kalmanComputeChi2Plane(const MPlexLS& psErr,
+                              const MPlexLV& psPar,
+                              const MPlexQI& inChg,
+                              const MPlexHS& msErr,
+                              const MPlexHV& msPar,
+                              const MPlexHV& plNrm,
+                              const MPlexHV& plDir,
+                              MPlexQF& outChi2,
+                              const int N_proc);
+
+  void kalmanPropagateAndComputeChi2Plane(const MPlexLS& psErr,
+                                          const MPlexLV& psPar,
+                                          const MPlexQI& inChg,
+                                          const MPlexHS& msErr,
+                                          const MPlexHV& msPar,
+                                          const MPlexHV& plNrm,
+                                          const MPlexHV& plDir,
+                                          MPlexQF& outChi2,
+                                          MPlexLV& propPar,
+                                          MPlexQI& outFailFlag,
+                                          const int N_proc,
+                                          const PropagationFlags& propFlags,
+                                          const bool propToHit);
+
+  void kalmanOperationPlane(const int kfOp,
+                            const MPlexLS& psErr,
+                            const MPlexLV& psPar,
+                            const MPlexHS& msErr,
+                            const MPlexHV& msPar,
+                            const MPlexHV& plNrm,
+                            const MPlexHV& plDir,
+                            MPlexLS& outErr,
+                            MPlexLV& outPar,
+                            MPlexQF& outChi2,
+                            const int N_proc);
+
 }  // end namespace mkfit
 #endif
diff --git a/RecoTracker/MkFitCore/src/Matrix.h b/RecoTracker/MkFitCore/src/Matrix.h
index 809b438964e5e..ba426f59acdcc 100644
--- a/RecoTracker/MkFitCore/src/Matrix.h
+++ b/RecoTracker/MkFitCore/src/Matrix.h
@@ -53,6 +53,10 @@ namespace mkfit {
   typedef Matriplex::Matriplex<float, HH, 1, NN> MPlexHV;
   typedef Matriplex::MatriplexSym<float, HH, NN> MPlexHS;
 
+  typedef Matriplex::Matriplex<float, 5, 5, NN> MPlex55;
+  typedef Matriplex::Matriplex<float, 5, 6, NN> MPlex56;
+  typedef Matriplex::Matriplex<float, 6, 5, NN> MPlex65;
+
   typedef Matriplex::Matriplex<float, 2, 2, NN> MPlex22;
   typedef Matriplex::Matriplex<float, 2, 1, NN> MPlex2V;
   typedef Matriplex::MatriplexSym<float, 2, NN> MPlex2S;
@@ -61,6 +65,8 @@ namespace mkfit {
   typedef Matriplex::Matriplex<float, HH, LL, NN> MPlexHL;
 
   typedef Matriplex::Matriplex<float, LL, 2, NN> MPlexL2;
+  typedef Matriplex::Matriplex<float, HH, 2, NN> MPlexH2;
+  typedef Matriplex::Matriplex<float, 2, HH, NN> MPlex2H;
 
   typedef Matriplex::Matriplex<float, 1, 1, NN> MPlexQF;
   typedef Matriplex::Matriplex<int, 1, 1, NN> MPlexQI;
diff --git a/RecoTracker/MkFitCore/src/MkBuilder.cc b/RecoTracker/MkFitCore/src/MkBuilder.cc
index 97c0c6dd06f01..2c984991519b8 100644
--- a/RecoTracker/MkFitCore/src/MkBuilder.cc
+++ b/RecoTracker/MkFitCore/src/MkBuilder.cc
@@ -1098,6 +1098,11 @@ namespace mkfit {
         // from intra-layer to inter-layer.
         // mkfndr->copyOutParErr(eoccs.refCandidates_nc(), end - itrack, true);
 
+        // For prop-to-plane propagate from the last hit, not layer center.
+        if (Config::usePropToPlane) {
+          mkfndr->inputTracksAndHitIdx(eoccs.refCandidates(), seed_cand_idx, itrack, end, false);
+        }
+
         dprint("make new candidates");
         cloner.begin_iteration();
 
@@ -1118,7 +1123,7 @@ namespace mkfit {
 
         mkfndr->inputTracksAndHits(eoccs.refCandidates(), layer_of_hits, seed_cand_update_idx, itrack, end, true);
 
-        mkfndr->updateWithLoadedHit(end - itrack, fnd_foos);
+        mkfndr->updateWithLoadedHit(end - itrack, layer_of_hits, fnd_foos);
 
         // copy_out the updated track params, errors only (hit-idcs and chi2 already set)
         mkfndr->copyOutParErr(eoccs.refCandidates_nc(), end - itrack, false);
diff --git a/RecoTracker/MkFitCore/src/MkFinder.cc b/RecoTracker/MkFitCore/src/MkFinder.cc
index ecfc46c6dfdac..ee9726a8eb485 100644
--- a/RecoTracker/MkFitCore/src/MkFinder.cc
+++ b/RecoTracker/MkFitCore/src/MkFinder.cc
@@ -168,6 +168,10 @@ namespace mkfit {
       m_CandIdx(imp, 0, 0) = idxs[i].cand_idx;
       m_SeedOriginIdx[imp] = tracks[idxs[i].seed_idx].seed_origin_index();
 
+      // Reuse selectHitIndices() arrays -- used also in packModuleNormDir()
+      m_XHitArr(imp, 0, 0) = idxs[i].hit_idx;
+      m_XHitSize(imp, 0, 0) = 1;
+
       const Hit &hit = layer_of_hits.refHit(idxs[i].hit_idx);
       m_msErr.copyIn(imp, hit.errArray());
       m_msPar.copyIn(imp, hit.posArray());
@@ -220,6 +224,23 @@ namespace mkfit {
     }
   }
 
+  void MkFinder::packModuleNormDir(  // per slot: module zdir -> norm, module xdir -> dir
+      const LayerOfHits &layer_of_hits, int hit_cnt, MPlexHV &norm, MPlexHV &dir, int N_proc) const {
+    for (int itrack = 0; itrack < N_proc; ++itrack) {  // only the first N_proc slots are visited
+      if (hit_cnt < m_XHitSize[itrack]) {  // NOTE(review): otherwise norm/dir of this slot stay unwritten
+        const auto &hit = layer_of_hits.refHit(m_XHitArr.constAt(itrack, hit_cnt, 0));  // hit_cnt-th stored hit index
+        unsigned int mid = hit.detIDinLayer();
+        const ModuleInfo &mi = layer_of_hits.layer_info()->module_info(mid);  // module looked up by detIDinLayer()
+        norm.At(itrack, 0, 0) = mi.zdir[0];  // norm <- module zdir (3 components)
+        norm.At(itrack, 1, 0) = mi.zdir[1];
+        norm.At(itrack, 2, 0) = mi.zdir[2];
+        dir.At(itrack, 0, 0) = mi.xdir[0];  // dir <- module xdir (3 components)
+        dir.At(itrack, 1, 0) = mi.xdir[1];
+        dir.At(itrack, 2, 0) = mi.xdir[2];
+      }
+    }
+  }
+
   //==============================================================================
   // getHitSelDynamicWindows
   //==============================================================================
@@ -1245,17 +1266,37 @@ namespace mkfit {
       MPlexQF outChi2;
       MPlexLV propPar;
       clearFailFlag();
-      (*fnd_foos.m_compute_chi2_foo)(m_Err[iP],
-                                     m_Par[iP],
-                                     m_Chg,
-                                     m_msErr,
-                                     m_msPar,
-                                     outChi2,
-                                     propPar,
-                                     m_FailFlag,
-                                     N_proc,
-                                     m_prop_config->finding_intra_layer_pflags,
-                                     m_prop_config->finding_requires_propagation_to_hit_pos);
+
+      if (Config::usePropToPlane) {
+        // Maybe could use 2 matriplex packers ... ModuleInfo has 3 * SVector3 and uint
+        MPlexHV norm, dir;
+        packModuleNormDir(layer_of_hits, hit_cnt, norm, dir, N_proc);
+        kalmanPropagateAndComputeChi2Plane(m_Err[iP],
+                                           m_Par[iP],
+                                           m_Chg,
+                                           m_msErr,
+                                           m_msPar,
+                                           norm,
+                                           dir,
+                                           outChi2,
+                                           propPar,
+                                           m_FailFlag,
+                                           N_proc,
+                                           m_prop_config->finding_intra_layer_pflags,
+                                           m_prop_config->finding_requires_propagation_to_hit_pos);
+      } else {
+        (*fnd_foos.m_compute_chi2_foo)(m_Err[iP],
+                                       m_Par[iP],
+                                       m_Chg,
+                                       m_msErr,
+                                       m_msPar,
+                                       outChi2,
+                                       propPar,
+                                       m_FailFlag,
+                                       N_proc,
+                                       m_prop_config->finding_intra_layer_pflags,
+                                       m_prop_config->finding_requires_propagation_to_hit_pos);
+      }
 
       // Now update the track parameters with this hit (note that some
       // calculations are already done when computing chi2, to be optimized).
@@ -1492,17 +1533,37 @@ namespace mkfit {
       MPlexQF outChi2;
       MPlexLV propPar;
       clearFailFlag();
-      (*fnd_foos.m_compute_chi2_foo)(m_Err[iP],
-                                     m_Par[iP],
-                                     m_Chg,
-                                     m_msErr,
-                                     m_msPar,
-                                     outChi2,
-                                     propPar,
-                                     m_FailFlag,
-                                     N_proc,
-                                     m_prop_config->finding_intra_layer_pflags,
-                                     m_prop_config->finding_requires_propagation_to_hit_pos);
+
+      if (Config::usePropToPlane) {
+        // Maybe could use 2 matriplex packers ... ModuleInfo has 3 * SVector3 and uint
+        MPlexHV norm, dir;
+        packModuleNormDir(layer_of_hits, hit_cnt, norm, dir, N_proc);
+        kalmanPropagateAndComputeChi2Plane(m_Err[iP],
+                                           m_Par[iP],
+                                           m_Chg,
+                                           m_msErr,
+                                           m_msPar,
+                                           norm,
+                                           dir,
+                                           outChi2,
+                                           propPar,
+                                           m_FailFlag,
+                                           N_proc,
+                                           m_prop_config->finding_intra_layer_pflags,
+                                           m_prop_config->finding_requires_propagation_to_hit_pos);
+      } else {
+        (*fnd_foos.m_compute_chi2_foo)(m_Err[iP],
+                                       m_Par[iP],
+                                       m_Chg,
+                                       m_msErr,
+                                       m_msPar,
+                                       outChi2,
+                                       propPar,
+                                       m_FailFlag,
+                                       N_proc,
+                                       m_prop_config->finding_intra_layer_pflags,
+                                       m_prop_config->finding_requires_propagation_to_hit_pos);
+      }
 
       //#pragma omp simd  // DOES NOT VECTORIZE AS IT IS NOW
       for (int itrack = 0; itrack < N_proc; ++itrack) {
@@ -1646,21 +1707,39 @@ namespace mkfit {
   // UpdateWithLoadedHit
   //==============================================================================
 
-  void MkFinder::updateWithLoadedHit(int N_proc, const FindingFoos &fnd_foos) {
+  void MkFinder::updateWithLoadedHit(int N_proc, const LayerOfHits &layer_of_hits, const FindingFoos &fnd_foos) {
     // See comment in MkBuilder::find_tracks_in_layer() about intra / inter flags used here
     // for propagation to the hit.
     clearFailFlag();
-    (*fnd_foos.m_update_param_foo)(m_Err[iP],
-                                   m_Par[iP],
-                                   m_Chg,
-                                   m_msErr,
-                                   m_msPar,
-                                   m_Err[iC],
-                                   m_Par[iC],
-                                   m_FailFlag,
-                                   N_proc,
-                                   m_prop_config->finding_inter_layer_pflags,
-                                   m_prop_config->finding_requires_propagation_to_hit_pos);
+    if (Config::usePropToPlane) {
+      MPlexHV norm, dir;
+      packModuleNormDir(layer_of_hits, 0, norm, dir, N_proc);
+      kalmanPropagateAndUpdatePlane(m_Err[iP],
+                                    m_Par[iP],
+                                    m_Chg,
+                                    m_msErr,
+                                    m_msPar,
+                                    norm,
+                                    dir,
+                                    m_Err[iC],
+                                    m_Par[iC],
+                                    m_FailFlag,
+                                    N_proc,
+                                    m_prop_config->finding_inter_layer_pflags,
+                                    m_prop_config->finding_requires_propagation_to_hit_pos);
+    } else {
+      (*fnd_foos.m_update_param_foo)(m_Err[iP],
+                                     m_Par[iP],
+                                     m_Chg,
+                                     m_msErr,
+                                     m_msPar,
+                                     m_Err[iC],
+                                     m_Par[iC],
+                                     m_FailFlag,
+                                     N_proc,
+                                     m_prop_config->finding_inter_layer_pflags,
+                                     m_prop_config->finding_requires_propagation_to_hit_pos);
+    }
 
     // PROP-FAIL-ENABLE The following to be enabled when propagation failure
     // detection is properly implemented in propagate-to-R/Z.
diff --git a/RecoTracker/MkFitCore/src/MkFinder.h b/RecoTracker/MkFitCore/src/MkFinder.h
index 17da11ad8aa9d..7a9e0f3c96500 100644
--- a/RecoTracker/MkFitCore/src/MkFinder.h
+++ b/RecoTracker/MkFitCore/src/MkFinder.h
@@ -113,6 +113,8 @@ namespace mkfit {
 
     HitOnTrack bestHitLastHoT(int itrack) const { return m_HoTArrs[itrack][m_NHits(itrack, 0, 0) - 1]; }
 
+    void packModuleNormDir(const LayerOfHits &layer_of_hits, int hit_cnt, MPlexHV &norm, MPlexHV &dir, int N_proc) const;
+
     //----------------------------------------------------------------------------
 
     void getHitSelDynamicWindows(
@@ -141,7 +143,7 @@ namespace mkfit {
                                    const int N_proc,
                                    const FindingFoos &fnd_foos);
 
-    void updateWithLoadedHit(int N_proc, const FindingFoos &fnd_foos);
+    void updateWithLoadedHit(int N_proc, const LayerOfHits &layer_of_hits, const FindingFoos &fnd_foos);
 
     void copyOutParErr(std::vector<CombCandidate> &seed_cand_vec, int N_proc, bool outputProp) const;
 
diff --git a/RecoTracker/MkFitCore/src/PropagationMPlex.cc b/RecoTracker/MkFitCore/src/PropagationMPlex.cc
index f5bdff96e3180..bc8057ee81129 100644
--- a/RecoTracker/MkFitCore/src/PropagationMPlex.cc
+++ b/RecoTracker/MkFitCore/src/PropagationMPlex.cc
@@ -12,7 +12,7 @@
 // propagateLineToRMPlex
 //==============================================================================
 
-using namespace Matriplex;
+//using namespace Matriplex;
 
 namespace mkfit {
 
@@ -25,7 +25,7 @@ namespace mkfit {
                              const int N_proc) {
     // XXX Regenerate parts below with a script.
 
-    const idx_t N = NN;
+    const Matriplex::idx_t N = NN;
 
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
@@ -97,7 +97,7 @@ namespace {
     // C = A * B
 
     typedef float T;
-    const idx_t N = NN;
+    const Matriplex::idx_t N = NN;
 
     const T* a = A.fArray;
     ASSUME_ALIGNED(a, 64);
@@ -113,7 +113,7 @@ namespace {
     // C = B * AT;
 
     typedef float T;
-    const idx_t N = NN;
+    const Matriplex::idx_t N = NN;
 
     const T* a = A.fArray;
     ASSUME_ALIGNED(a, 64);
@@ -129,7 +129,7 @@ namespace {
     // C = A * B
 
     typedef float T;
-    const idx_t N = NN;
+    const Matriplex::idx_t N = NN;
 
     const T* a = A.fArray;
     ASSUME_ALIGNED(a, 64);
@@ -145,7 +145,7 @@ namespace {
     // C = B * AT;
 
     typedef float T;
-    const idx_t N = NN;
+    const Matriplex::idx_t N = NN;
 
     const T* a = A.fArray;
     ASSUME_ALIGNED(a, 64);
@@ -161,7 +161,7 @@ namespace {
     // C = A * B
 
     typedef float T;
-    const idx_t N = NN;
+    const Matriplex::idx_t N = NN;
 
     const T* a = A.fArray;
     ASSUME_ALIGNED(a, 64);
@@ -226,7 +226,6 @@ namespace {
     c[35 * N + n] = a[32 * N + n] * b[17 * N + n] + a[35 * N + n];
   }
 
-#ifdef UNUSED
   // this version does not assume to know which elements are 0 or 1, so it does the full multiplication
   void MultHelixPropFull(const MPlexLL& A, const MPlexLS& B, MPlexLL& C) {
 #pragma omp simd
@@ -241,29 +240,30 @@ namespace {
     }
   }
 
-  // this version does not assume to know which elements are 0 or 1, so it does the full multiplication
-  void MultHelixPropFull(const MPlexLL& A, const MPlexLL& B, MPlexLL& C) {
+  // this version does not assume to know which elements are 0 or 1, so it does the full multiplication
+  void MultHelixPropTranspFull(const MPlexLL& A, const MPlexLL& B, MPlexLS& C) {
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
       for (int i = 0; i < 6; ++i) {
         for (int j = 0; j < 6; ++j) {
           C(n, i, j) = 0.;
           for (int k = 0; k < 6; ++k)
-            C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, k, j);
+            C(n, i, j) += B.constAt(n, i, k) * A.constAt(n, j, k);
         }
       }
     }
   }
 
-  // this version does not assume to know which elements are 0 or 1, so it does the full mupltiplication
-  void MultHelixPropTranspFull(const MPlexLL& A, const MPlexLL& B, MPlexLS& C) {
+#ifdef UNUSED
+  // this version does not assume to know which elements are 0 or 1, so it does the full multiplication
+  void MultHelixPropFull(const MPlexLL& A, const MPlexLL& B, MPlexLL& C) {
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
       for (int i = 0; i < 6; ++i) {
         for (int j = 0; j < 6; ++j) {
           C(n, i, j) = 0.;
           for (int k = 0; k < 6; ++k)
-            C(n, i, j) += B.constAt(n, i, k) * A.constAt(n, j, k);
+            C(n, i, j) += A.constAt(n, i, k) * B.constAt(n, k, j);
         }
       }
     }
@@ -488,7 +488,19 @@ namespace mkfit {
     errorProp.setVal(0.f);
     outFailFlag.setVal(0.f);
 
+    //helixAtRFromIterativeCCS_impl_new(inPar, inChg, msRad, outPar, errorProp, outFailFlag, 0, NN, N_proc, pflags);
+    helixAtRFromIterativeCCS_impl(inPar, inChg, msRad, outPar, errorProp, outFailFlag, 0, NN, N_proc, pflags);
+    /*
+    //float nv = errorProp(0,0,0);
+
+    outPar = inPar;
+    errorProp.setVal(0.f);
+    outFailFlag.setVal(0.f);
+
     helixAtRFromIterativeCCS_impl(inPar, inChg, msRad, outPar, errorProp, outFailFlag, 0, NN, N_proc, pflags);
+    //float ov = errorProp(0,0,0);
+    assert(0);
+    */
   }
 
   void propagateHelixToRMPlex(const MPlexLS& inErr,
@@ -557,18 +569,27 @@ namespace mkfit {
         const float r = msRad(n, 0, 0);
         propSign(n, 0, 0) = (r > r0 ? 1. : -1.);
       }
-      applyMaterialEffects(hitsRl, hitsXi, propSign, outErr, outPar, N_proc, true);
+      MPlexHV plNrm;
+#pragma omp simd
+      for (int n = 0; n < NN; ++n) {
+        plNrm(n, 0, 0) = std::cos(outPar.constAt(n, 4, 0));
+        plNrm(n, 1, 0) = std::sin(outPar.constAt(n, 4, 0));
+        plNrm(n, 2, 0) = 0.f;
+      }
+      applyMaterialEffects(hitsRl, hitsXi, propSign, plNrm, outErr, outPar, N_proc);
     }
 
     squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
 
-    // Matriplex version of:
-    // result.errors = ROOT::Math::Similarity(errorProp, outErr);
-
     // MultHelixProp can be optimized for CCS coordinates, see GenMPlexOps.pl
     MPlexLL temp;
     MultHelixProp(errorProp, outErr, temp);
     MultHelixPropTransp(errorProp, temp, outErr);
+    // MultHelixPropFull(errorProp, outErr, temp);
+    // MultHelixPropTranspFull(errorProp, temp, outErr);
+
+    // Matriplex version of:
+    // result.errors = ROOT::Math::Similarity(errorProp, outErr);
 
     /*
      // To be used with: MPT_DIM = 1
@@ -613,11 +634,18 @@ namespace mkfit {
 
     MPlexLL errorProp;
 
+    //helixAtZ_new(inPar, inChg, msZ, outPar, errorProp, outFailFlag, N_proc, pflags);
     helixAtZ(inPar, inChg, msZ, outPar, errorProp, outFailFlag, N_proc, pflags);
 
 #ifdef DEBUG
     if (debug && g_debug) {
       for (int kk = 0; kk < N_proc; ++kk) {
+        dprintf("inPar %d\n", kk);
+        for (int i = 0; i < 6; ++i) {
+          dprintf("%8f ", inPar.constAt(kk, i, 0));
+        }
+        dprintf("\n");
+
         dprintf("inErr %d\n", kk);
         for (int i = 0; i < 6; ++i) {
           for (int j = 0; j < 6; ++j)
@@ -637,6 +665,20 @@ namespace mkfit {
     }
 #endif
 
+#ifdef DEBUG
+    if (debug && g_debug) {
+      for (int kk = 0; kk < N_proc; ++kk) {
+        dprintf("outErr %d\n", kk);
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            dprintf("%8f ", outErr.constAt(kk, i, j));
+          dprintf("\n");
+        }
+        dprintf("\n");
+      }
+    }
+#endif
+
     if (pflags.apply_material) {
       MPlexQF hitsRl;
       MPlexQF hitsXi;
@@ -657,9 +699,39 @@ namespace mkfit {
         }
         const float zout = msZ.constAt(n, 0, 0);
         const float zin = inPar.constAt(n, 2, 0);
-        propSign(n, 0, 0) = (std::abs(zout) > std::abs(zin) ? 1. : -1.);
+        propSign(n, 0, 0) = (std::abs(zout) > std::abs(zin) ? 1.f : -1.f);
+      }
+      MPlexHV plNrm;
+#pragma omp simd
+      for (int n = 0; n < NN; ++n) {
+        plNrm(n, 0, 0) = 0.f;
+        plNrm(n, 1, 0) = 0.f;
+        plNrm(n, 2, 0) = 1.f;
+      }
+      applyMaterialEffects(hitsRl, hitsXi, propSign, plNrm, outErr, outPar, N_proc);
+#ifdef DEBUG
+      if (debug && g_debug) {
+        for (int kk = 0; kk < N_proc; ++kk) {
+          dprintf("propSign %d\n", kk);
+          for (int i = 0; i < 1; ++i) {
+            dprintf("%8f ", propSign.constAt(kk, i, 0));
+          }
+          dprintf("\n");
+          dprintf("plNrm %d\n", kk);
+          for (int i = 0; i < 3; ++i) {
+            dprintf("%8f ", plNrm.constAt(kk, i, 0));
+          }
+          dprintf("\n");
+          dprintf("outErr(after material) %d\n", kk);
+          for (int i = 0; i < 6; ++i) {
+            for (int j = 0; j < 6; ++j)
+              dprintf("%8f ", outErr.constAt(kk, i, j));
+            dprintf("\n");
+          }
+          dprintf("\n");
+        }
       }
-      applyMaterialEffects(hitsRl, hitsXi, propSign, outErr, outPar, N_proc, false);
+#endif
     }
 
     squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
@@ -669,6 +741,8 @@ namespace mkfit {
     MPlexLL temp;
     MultHelixPropEndcap(errorProp, outErr, temp);
     MultHelixPropTranspEndcap(errorProp, temp, outErr);
+    // MultHelixPropFull(errorProp, outErr, temp);
+    // MultHelixPropTranspFull(errorProp, temp, outErr);
 
     // PROP-FAIL-ENABLE To keep physics changes minimal, we always restore the
     // state to input when propagation fails -- as was the default before.
@@ -680,33 +754,6 @@ namespace mkfit {
       }
     }
     // }
-
-    // This dump is now out of its place as similarity is done with matriplex ops.
-    /*
-#ifdef DEBUG
-   {
-     dmutex_guard;
-     for (int kk = 0; kk < N_proc; ++kk)
-     {
-       dprintf("outErr %d\n", kk);
-       for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j)
-           dprintf("%8f ", outErr.At(kk,i,j)); printf("\n");
-       } dprintf("\n");
-
-       dprintf("outPar %d\n", kk);
-       for (int i = 0; i < 6; ++i) {
-           dprintf("%8f ", outPar.At(kk,i,0)); printf("\n");
-       } dprintf("\n");
-       if (std::abs(outPar.At(kk,2,0) - msZ.constAt(kk, 0, 0)) > 0.0001) {
-         float pt = 1.0f / inPar.constAt(kk,3,0);
-	 dprint_np(kk, "DID NOT GET TO Z, dZ=" << std::abs(outPar.At(kk,2,0) - msZ.constAt(kk, 0, 0))
-		   << " z=" << msZ.constAt(kk, 0, 0) << " zin=" << inPar.constAt(kk,2,0) << " zout=" << outPar.At(kk,2,0) << std::endl
-		   << "pt=" << pt << " pz=" << pt/std::tan(inPar.constAt(kk,5,0)));
-       }
-     }
-   }
-#endif
-   */
   }
 
   void helixAtZ(const MPlexLV& inPar,
@@ -718,7 +765,9 @@ namespace mkfit {
                 const int N_proc,
                 const PropagationFlags& pflags) {
     errorProp.setVal(0.f);
+    outFailFlag.setVal(0.f);
 
+    // debug = true;
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
       //initialize erroProp to identity matrix, except element 2,2 which is zero
@@ -773,7 +822,23 @@ namespace mkfit {
                     << " inPar.constAt(n, 2, 0)=" << std::setprecision(9) << inPar.constAt(n, 2, 0)
                     << " inPar.constAt(n, 3, 0)=" << std::setprecision(9) << inPar.constAt(n, 3, 0)
                     << " inPar.constAt(n, 4, 0)=" << std::setprecision(9) << inPar.constAt(n, 4, 0)
-                    << " inPar.constAt(n, 5, 0)=" << std::setprecision(9) << inPar.constAt(n, 5, 0));
+                    << " inPar.constAt(n, 5, 0)=" << std::setprecision(9) << inPar.constAt(n, 5, 0)
+                    << " inChg.constAt(n, 0, 0)=" << std::setprecision(9) << inChg.constAt(n, 0, 0));
+    }
+#pragma omp simd
+    for (int n = 0; n < NN; ++n) {
+      dprint_np(n,
+                "propagation start, dump parameters"
+                    << std::endl
+                    << "pos = " << inPar.constAt(n, 0, 0) << " " << inPar.constAt(n, 1, 0) << " "
+                    << inPar.constAt(n, 2, 0) << std::endl
+                    << "mom (cart) = " << std::cos(inPar.constAt(n, 4, 0)) / inPar.constAt(n, 3, 0) << " "
+                    << std::sin(inPar.constAt(n, 4, 0)) / inPar.constAt(n, 3, 0) << " "
+                    << 1. / (inPar.constAt(n, 3, 0) * tan(inPar.constAt(n, 5, 0))) << " r="
+                    << std::sqrt(inPar.constAt(n, 0, 0) * inPar.constAt(n, 0, 0) +
+                                 inPar.constAt(n, 1, 0) * inPar.constAt(n, 1, 0))
+                    << " pT=" << 1. / std::abs(inPar.constAt(n, 3, 0)) << " q=" << inChg.constAt(n, 0, 0)
+                    << " targetZ=" << msZ.constAt(n, 0, 0) << std::endl);
     }
 
     float pt[NN];
@@ -818,15 +883,7 @@ namespace mkfit {
       pxin[n] = cosP[n] * pt[n];
       pyin[n] = sinP[n] * pt[n];
     }
-#pragma omp simd
-    for (int n = 0; n < NN; ++n) {
-      //fixme, make this printout useful for propagation to z
-      dprint_np(n,
-                std::endl
-                    << "k=" << std::setprecision(9) << k[n] << " pxin=" << std::setprecision(9) << pxin[n]
-                    << " pyin=" << std::setprecision(9) << pyin[n] << " cosP=" << std::setprecision(9) << cosP[n]
-                    << " sinP=" << std::setprecision(9) << sinP[n] << " pt=" << std::setprecision(9) << pt[n]);
-    }
+
     float deltaZ[NN];
     float alpha[NN];
 #pragma omp simd
@@ -870,6 +927,7 @@ namespace mkfit {
       cosa[n] = 1.f - 2.f * sinah[n] * sinah[n];
       sina[n] = 2.f * sinah[n] * cosah[n];
     }
+
 //update parameters
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
@@ -882,9 +940,14 @@ namespace mkfit {
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
       dprint_np(n,
-                std::endl
-                    << "outPar.At(n, 0, 0)=" << outPar.At(n, 0, 0) << " outPar.At(n, 1, 0)=" << outPar.At(n, 1, 0)
-                    << " pxin=" << pxin[n] << " pyin=" << pyin[n]);
+                "propagation to Z end (OLD), dump parameters\n"
+                    << "   pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << "\t\t r="
+                    << std::sqrt(outPar(n, 0, 0) * outPar(n, 0, 0) + outPar(n, 1, 0) * outPar(n, 1, 0)) << std::endl
+                    << "   mom = " << outPar(n, 3, 0) << " " << outPar(n, 4, 0) << " " << outPar(n, 5, 0) << std::endl
+                    << " cart= " << std::cos(outPar(n, 4, 0)) / outPar(n, 3, 0) << " "
+                    << std::sin(outPar(n, 4, 0)) / outPar(n, 3, 0) << " "
+                    << 1. / (outPar(n, 3, 0) * tan(outPar(n, 5, 0))) << "\t\tpT=" << 1. / std::abs(outPar(n, 3, 0))
+                    << std::endl);
     }
 
     float pxcaMpysa[NN];
@@ -933,7 +996,7 @@ namespace mkfit {
           "propagation end, dump parameters"
               << std::endl
               << "pos = " << outPar.At(n, 0, 0) << " " << outPar.At(n, 1, 0) << " " << outPar.At(n, 2, 0) << std::endl
-              << "mom = " << std::cos(outPar.At(n, 4, 0)) / outPar.At(n, 3, 0) << " "
+              << "mom (cart) = " << std::cos(outPar.At(n, 4, 0)) / outPar.At(n, 3, 0) << " "
               << std::sin(outPar.At(n, 4, 0)) / outPar.At(n, 3, 0) << " "
               << 1. / (outPar.At(n, 3, 0) * tan(outPar.At(n, 5, 0)))
               << " r=" << std::sqrt(outPar.At(n, 0, 0) * outPar.At(n, 0, 0) + outPar.At(n, 1, 0) * outPar.At(n, 1, 0))
@@ -1015,30 +1078,234 @@ namespace mkfit {
 #endif
   }
 
+  void helixAtPlane(const MPlexLV& inPar,
+                    const MPlexQI& inChg,
+                    const MPlexHV& plPnt,
+                    const MPlexHV& plNrm,
+                    MPlexQF& pathL,
+                    MPlexLV& outPar,
+                    MPlexLL& errorProp,
+                    MPlexQI& outFailFlag,
+                    const int N_proc,
+                    const PropagationFlags& pflags) {
+    errorProp.setVal(0.f);    // start from an all-zero propagation matrix
+    outFailFlag.setVal(0.f);  // clear the per-slot failure flags
+    // Thin wrapper: all work is done by helixAtPlane_impl (0 and NN presumably select the slot range - confirm).
+    helixAtPlane_impl(inPar, inChg, plPnt, plNrm, pathL, outPar, errorProp, outFailFlag, 0, NN, N_proc, pflags);
+  }
+
+  // Propagates the track state (parameters and covariance) held in each Matriplex
+  // slot to a plane and, if requested through pflags, applies material effects.
+  //
+  // Sequence: helix propagation via helixAtPlane(), similarity transform of the
+  // covariance with the resulting errorProp, optional material effects, phi
+  // wrapping, and finally restoring the input state of slots flagged as failed.
+  //
+  //   inErr, inPar, inChg - input covariance, parameters and charge.
+  //   plPnt, plNrm        - plane description handed to helixAtPlane(); plNrm is
+  //                         also passed on to applyMaterialEffects().
+  //   outErr, outPar      - resulting state; slots with outFailFlag set get the
+  //                         input state copied back.
+  //   outFailFlag         - per-slot flag, passed to helixAtPlane() for filling.
+  //   N_proc              - number of slots in use.
+  //   pflags              - apply_material switches the material correction on,
+  //                         tracker_info supplies the material lookup.
+  //   noMatEffPtr         - may be null; a non-zero entry switches the material
+  //                         effects off for that slot.
+  void propagateHelixToPlaneMPlex(const MPlexLS& inErr,
+                                  const MPlexLV& inPar,
+                                  const MPlexQI& inChg,
+                                  const MPlexHV& plPnt,
+                                  const MPlexHV& plNrm,
+                                  MPlexLS& outErr,
+                                  MPlexLV& outPar,
+                                  MPlexQI& outFailFlag,
+                                  const int N_proc,
+                                  const PropagationFlags& pflags,
+                                  const MPlexQI* noMatEffPtr) {
+    // debug = true;
+
+    // Start from the input state; helixAtPlane() receives outPar as its output.
+    outErr = inErr;
+    outPar = inPar;
+
+    // Both are outputs of helixAtPlane(): path length and error propagation matrix.
+    MPlexQF pathL;
+    MPlexLL errorProp;
+
+    helixAtPlane(inPar, inChg, plPnt, plNrm, pathL, outPar, errorProp, outFailFlag, N_proc, pflags);
+
+    // Dump the propagated position and momentum of every slot.
+    for (int n = 0; n < NN; ++n) {
+      dprint_np(
+          n,
+          "propagation to plane end, dump parameters\n"
+              //<< "   D = " << s[n] << " alpha = " << s[n] * std::sin(inPar(n, 5, 0)) * inPar(n, 3, 0) * kinv[n] << " kinv = " << kinv[n] << std::endl
+              << "   pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << "\t\t r="
+              << std::sqrt(outPar(n, 0, 0) * outPar(n, 0, 0) + outPar(n, 1, 0) * outPar(n, 1, 0)) << std::endl
+              << "   mom = " << outPar(n, 3, 0) << " " << outPar(n, 4, 0) << " " << outPar(n, 5, 0) << std::endl
+              << " cart= " << std::cos(outPar(n, 4, 0)) / outPar(n, 3, 0) << " "
+              << std::sin(outPar(n, 4, 0)) / outPar(n, 3, 0) << " " << 1. / (outPar(n, 3, 0) * tan(outPar(n, 5, 0)))
+              << "\t\tpT=" << 1. / std::abs(outPar(n, 3, 0)) << std::endl);
+    }
+
+    // Dump inputs, plane normal, path length and errorProp of every slot in use.
+#ifdef DEBUG
+    if (debug && g_debug) {
+      for (int kk = 0; kk < N_proc; ++kk) {
+        dprintf("inPar %d\n", kk);
+        for (int i = 0; i < 6; ++i) {
+          dprintf("%8f ", inPar.constAt(kk, i, 0));
+        }
+        dprintf("\n");
+        dprintf("inErr %d\n", kk);
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            dprintf("%8f ", inErr.constAt(kk, i, j));
+          dprintf("\n");
+        }
+        dprintf("\n");
+
+        // Plane normal of this slot, indexed (slot, component, 0) as in the rest of this function.
+        dprintf("plNrm %d\n", kk);
+        for (int j = 0; j < 3; ++j) {
+          dprintf("%8f ", plNrm.constAt(kk, j, 0));
+        }
+        dprintf("\n");
+
+        // Path length of this slot; its sign is used as propSign further down.
+        dprintf("pathL %d\n", kk);
+        dprintf("%8f ", pathL.constAt(kk, 0, 0));
+        dprintf("\n");
+
+        dprintf("errorProp %d\n", kk);
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            dprintf("%8f ", errorProp.At(kk, i, j));
+          dprintf("\n");
+        }
+        dprintf("\n");
+      }
+    }
+#endif
+
+    // Matriplex version of:
+    // result.errors = ROOT::Math::Similarity(errorProp, outErr);
+    // No structure of errorProp is assumed here, hence the "Full" multiplications.
+    MPlexLL temp;
+    MultHelixPropFull(errorProp, outErr, temp);
+    MultHelixPropTranspFull(errorProp, temp, outErr);
+
+    // Dump the covariance after the similarity transform.
+#ifdef DEBUG
+    if (debug && g_debug) {
+      for (int kk = 0; kk < N_proc; ++kk) {
+        dprintf("outErr %d\n", kk);
+        for (int i = 0; i < 6; ++i) {
+          for (int j = 0; j < 6; ++j)
+            dprintf("%8f ", outErr.constAt(kk, i, j));
+          dprintf("\n");
+        }
+        dprintf("\n");
+      }
+    }
+#endif
+
+    if (pflags.apply_material) {
+      MPlexQF hitsRl;
+      MPlexQF hitsXi;
+      MPlexQF propSign;
+
+      const TrackerInfo& tinfo = *pflags.tracker_info;
+
+#pragma omp simd
+      for (int n = 0; n < NN; ++n) {
+        if (n >= N_proc || (noMatEffPtr && noMatEffPtr->constAt(n, 0, 0))) {
+          // Slot not in use, or material switched off for it: no material.
+          hitsRl(n, 0, 0) = 0.f;
+          hitsXi(n, 0, 0) = 0.f;
+        } else {
+          // Look the material up at (|z|, r) of the propagated position.
+          const float hypo = std::hypot(outPar(n, 0, 0), outPar(n, 1, 0));
+          auto mat = tinfo.material_checked(std::abs(outPar(n, 2, 0)), hypo);
+          hitsRl(n, 0, 0) = mat.radl;
+          hitsXi(n, 0, 0) = mat.bbxi;
+        }
+        // The sign of the path length gives the direction of propagation.
+        propSign(n, 0, 0) = (pathL(n, 0, 0) > 0.f ? 1.f : -1.f);
+      }
+      // plNrm enters the crossing-angle correction of the radiation length.
+      applyMaterialEffects(hitsRl, hitsXi, propSign, plNrm, outErr, outPar, N_proc);
+      // Dump propagation sign, plane normal and covariance after material effects.
+#ifdef DEBUG
+      if (debug && g_debug) {
+        for (int kk = 0; kk < N_proc; ++kk) {
+          dprintf("propSign %d\n", kk);
+          for (int i = 0; i < 1; ++i) {
+            dprintf("%8f ", propSign.constAt(kk, i, 0));
+          }
+          dprintf("\n");
+          dprintf("plNrm %d\n", kk);
+          for (int i = 0; i < 3; ++i) {
+            dprintf("%8f ", plNrm.constAt(kk, i, 0));
+          }
+          dprintf("\n");
+          dprintf("outErr(after material) %d\n", kk);
+          for (int i = 0; i < 6; ++i) {
+            for (int j = 0; j < 6; ++j)
+              dprintf("%8f ", outErr.constAt(kk, i, j));
+            dprintf("\n");
+          }
+          dprintf("\n");
+        }
+      }
+#endif
+    }
+
+    squashPhiMPlex(outPar, N_proc);  // ensure phi is between |pi|
+
+    // PROP-FAIL-ENABLE To keep physics changes minimal, we always restore the
+    // state to input when propagation fails -- as was the default before.
+    // if (pflags.copy_input_state_on_fail) {
+    for (int i = 0; i < N_proc; ++i) {
+      if (outFailFlag(i, 0, 0)) {
+        outPar.copySlot(i, inPar);
+        outErr.copySlot(i, inErr);
+      }
+    }
+    // }
+  }
+
   //==============================================================================
 
   void applyMaterialEffects(const MPlexQF& hitsRl,
                             const MPlexQF& hitsXi,
                             const MPlexQF& propSign,
+                            const MPlexHV& plNrm,
                             MPlexLS& outErr,
                             MPlexLV& outPar,
-                            const int N_proc,
-                            const bool isBarrel) {
+                            const int N_proc) {
 #pragma omp simd
     for (int n = 0; n < NN; ++n) {
       float radL = hitsRl.constAt(n, 0, 0);
       if (radL < 1e-13f)
         continue;  //ugly, please fixme
       const float theta = outPar.constAt(n, 5, 0);
-      const float pt = 1.f / outPar.constAt(n, 3, 0);  //fixme, make sure it is positive?
+      // const float pt = 1.f / outPar.constAt(n, 3, 0);  //fixme, make sure it is positive?
+      const float ipt = outPar.constAt(n, 3, 0);
+      const float pt = 1.f / ipt;  //fixme, make sure it is positive?
+      const float ipt2 = ipt * ipt;
       const float p = pt / std::sin(theta);
+      const float pz = p * std::cos(theta);
       const float p2 = p * p;
       constexpr float mpi = 0.140;       // m=140 MeV, pion
       constexpr float mpi2 = mpi * mpi;  // m=140 MeV, pion
       const float beta2 = p2 / (p2 + mpi2);
       const float beta = std::sqrt(beta2);
       //radiation lenght, corrected for the crossing angle (cos alpha from dot product of radius vector and momentum)
-      const float invCos = (isBarrel ? p / pt : 1.f / std::abs(std::cos(theta)));
+      const float invCos =
+          p / std::abs(pt * std::cos(outPar.constAt(n, 4, 0)) * plNrm.constAt(n, 0, 0) +
+                       pt * std::sin(outPar.constAt(n, 4, 0)) * plNrm.constAt(n, 1, 0) + pz * plNrm.constAt(n, 2, 0));
       radL = radL * invCos;  //fixme works only for barrel geom
       // multiple scattering
       //vary independently phi and theta by the rms of the planar multiple scattering angle
@@ -1049,9 +1316,15 @@ namespace mkfit {
       // const float thetaMSC2 = thetaMSC*thetaMSC;
       const float thetaMSC = 0.0136f * (1.f + 0.038f * std::log(radL)) / (beta * p);  // eq 32.15
       const float thetaMSC2 = thetaMSC * thetaMSC * radL;
-      outErr.At(n, 4, 4) += thetaMSC2;
-      // outErr.At(n, 4, 5) += thetaMSC2;
-      outErr.At(n, 5, 5) += thetaMSC2;
+      if (Config::usePtMultScat) {
+        outErr.At(n, 3, 3) += thetaMSC2 * pz * pz * ipt2 * ipt2;
+        outErr.At(n, 3, 5) -= thetaMSC2 * pz * ipt2;
+        outErr.At(n, 4, 4) += thetaMSC2 * p2 * ipt2;
+        outErr.At(n, 5, 5) += thetaMSC2;
+      } else {
+        outErr.At(n, 4, 4) += thetaMSC2;
+        outErr.At(n, 5, 5) += thetaMSC2;
+      }
       //std::cout << "beta=" << beta << " p=" << p << std::endl;
       //std::cout << "multiple scattering thetaMSC=" << thetaMSC << " thetaMSC2=" << thetaMSC2 << " radL=" << radL << std::endl;
       // energy loss
diff --git a/RecoTracker/MkFitCore/src/PropagationMPlex.h b/RecoTracker/MkFitCore/src/PropagationMPlex.h
index 5d1b0034ad400..3522365538b13 100644
--- a/RecoTracker/MkFitCore/src/PropagationMPlex.h
+++ b/RecoTracker/MkFitCore/src/PropagationMPlex.h
@@ -80,13 +80,35 @@ namespace mkfit {
                 const int N_proc,
                 const PropagationFlags& pflags);
 
+  void helixAtPlane(const MPlexLV& inPar,
+                    const MPlexQI& inChg,
+                    const MPlexHV& plPnt,
+                    const MPlexHV& plNrm,
+                    MPlexQF& pathL,
+                    MPlexLV& outPar,
+                    MPlexLL& errorProp,
+                    MPlexQI& outFailFlag,
+                    const int N_proc,
+                    const PropagationFlags& pflags);
+
+  void propagateHelixToPlaneMPlex(const MPlexLS& inErr,
+                                  const MPlexLV& inPar,
+                                  const MPlexQI& inChg,
+                                  const MPlexHV& plPnt,
+                                  const MPlexHV& plNrm,
+                                  MPlexLS& outErr,
+                                  MPlexLV& outPar,
+                                  MPlexQI& outFailFlag,
+                                  const int N_proc,
+                                  const PropagationFlags& pflags,
+                                  const MPlexQI* noMatEffPtr = nullptr);
+
   void applyMaterialEffects(const MPlexQF& hitsRl,
                             const MPlexQF& hitsXi,
                             const MPlexQF& propSign,
+                            const MPlexHV& plNrm,
                             MPlexLS& outErr,
                             MPlexLV& outPar,
-                            const int N_proc,
-                            const bool isBarrel);
-
+                            const int N_proc);
 }  // end namespace mkfit
 #endif
diff --git a/RecoTracker/MkFitCore/src/PropagationMPlex.icc b/RecoTracker/MkFitCore/src/PropagationMPlex.icc
index 8cb581401ad88..bb13d0e75ad91 100644
--- a/RecoTracker/MkFitCore/src/PropagationMPlex.icc
+++ b/RecoTracker/MkFitCore/src/PropagationMPlex.icc
@@ -2,6 +2,753 @@
 /// helixAtRFromIterativeCCS_impl
 ///////////////////////////////////////////////////////////////////////////////
 
+//#define DEBUG
+//#include "Debug.h"
+// Step the track parameters in inPar along the helix by path length s (slots nmin..nmax-1); result goes to outPar.
+template <typename Tf, typename TfLL1, typename Tf1>
+static inline void parsFromPathL_impl(const Tf& __restrict__ inPar,
+                                      TfLL1& __restrict__ outPar,
+                                      const float* kinv,  // read at [n - nmin]
+                                      const Tf1& __restrict__ s,  // path length, read at [n - nmin]
+                                      const int nmin,
+                                      const int nmax) {
+  float alpha[nmax - nmin];  // change of parameter 4 over the step (added to inPar(n, 4, 0) below)
+  for (int n = nmin; n < nmax; ++n) {
+    alpha[n - nmin] = s[n - nmin] * std::sin(inPar(n, 5, 0)) * inPar(n, 3, 0) * kinv[n - nmin];
+  }
+
+  float cosah[nmax - nmin];  // cos(alpha / 2)
+  float sinah[nmax - nmin];  // sin(alpha / 2)
+  if constexpr (Config::useTrigApprox) {
+#if !defined(__INTEL_COMPILER)
+#pragma omp simd
+#endif
+    for (int n = nmin; n < nmax; ++n) {
+      sincos4(alpha[n - nmin] * 0.5f, sinah[n - nmin], cosah[n - nmin]);
+    }
+  } else {
+#if !defined(__INTEL_COMPILER)
+#pragma omp simd
+#endif
+    for (int n = nmin; n < nmax; ++n) {
+      cosah[n - nmin] = std::cos(alpha[n - nmin] * 0.5f);
+      sinah[n - nmin] = std::sin(alpha[n - nmin] * 0.5f);
+    }
+  }
+
+  for (int n = nmin; n < nmax; ++n) {
+    outPar(n, 0, 0) =
+        inPar(n, 0, 0) + 2.f * sinah[n - nmin] *
+                             (std::cos(inPar(n, 4, 0)) * cosah[n - nmin] - std::sin(inPar(n, 4, 0)) * sinah[n - nmin]) /
+                             (inPar(n, 3, 0) * kinv[n - nmin]);
+    outPar(n, 1, 0) =
+        inPar(n, 1, 0) + 2.f * sinah[n - nmin] *
+                             (std::sin(inPar(n, 4, 0)) * cosah[n - nmin] + std::cos(inPar(n, 4, 0)) * sinah[n - nmin]) /
+                             (inPar(n, 3, 0) * kinv[n - nmin]);
+    outPar(n, 2, 0) = inPar(n, 2, 0) + alpha[n - nmin] / kinv[n - nmin] * std::cos(inPar(n, 5, 0)) /
+                                           (inPar(n, 3, 0) * std::sin(inPar(n, 5, 0)));
+    outPar(n, 3, 0) = inPar(n, 3, 0);  // parameter 3 is copied unchanged
+    outPar(n, 4, 0) = inPar(n, 4, 0) + alpha[n - nmin];
+    outPar(n, 5, 0) = inPar(n, 5, 0);  // parameter 5 is copied unchanged
+  }
+}
+
+// Propagate inPar by path length s into outPar and fill errorProp (open question: should kinv be templated?)
+template <typename Tf, typename Ti, typename TfLL1, typename TfLLL, typename Tf1>
+static inline void parsAndErrPropFromPathL_impl(const Tf& __restrict__ inPar,
+                                                const Ti& __restrict__ inChg,
+                                                TfLL1& __restrict__ outPar,
+                                                const float* kinv,
+                                                const Tf1& __restrict__ s,
+                                                TfLLL& __restrict__ errorProp,
+                                                const int nmin,
+                                                const int nmax,
+                                                const int N_proc,  // only read by the DEBUG dump below
+                                                const PropagationFlags& pf) {
+  // s is the path length supplied by the caller: first update the parameters, then compute the jacobian
+
+  parsFromPathL_impl(inPar, outPar, kinv, s, nmin, nmax);
+
+  float cosPin[nmax - nmin];
+  float sinPin[nmax - nmin];
+  float cosPout[nmax - nmin];
+  float sinPout[nmax - nmin];
+  float cosT[nmax - nmin];
+  float sinT[nmax - nmin];
+
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    cosPin[n - nmin] = std::cos(inPar(n, 4, 0));
+    sinPin[n - nmin] = std::sin(inPar(n, 4, 0));
+    cosPout[n - nmin] = std::cos(outPar(n, 4, 0));
+    sinPout[n - nmin] = std::sin(outPar(n, 4, 0));
+    cosT[n - nmin] = std::cos(inPar(n, 5, 0));
+    sinT[n - nmin] = std::sin(inPar(n, 5, 0));
+  }
+
+  // use code from AnalyticalCurvilinearJacobian::computeFullJacobian for error propagation in curvilinear coordinates, then convert to CCS
+  // main difference from the above function is that we assume that the magnetic field is purely along z (which also implies that there is no change in pz)
+  // this simplifies significantly the code
+
+  MPlex55 errorPropCurv;
+  for (int n = nmin; n < nmax; ++n) {
+    const float qbp = inChg(n, 0, 0) * sinT[n - nmin] * inPar(n, 3, 0);
+    // calculate transport matrix
+    // Origin: TRPRFN
+    const float t11 = cosPin[n - nmin] * sinT[n - nmin];
+    const float t12 = sinPin[n - nmin] * sinT[n - nmin];
+    const float t21 = cosPout[n - nmin] * sinT[n - nmin];
+    const float t22 = sinPout[n - nmin] * sinT[n - nmin];
+    const float cosl1 = 1.f / sinT[n - nmin];
+    // define average magnetic field and gradient
+    // at initial point - unlike TRPRFN
+    const float bF =
+        (pf.use_param_b_field
+             ? 0.01f * Const::sol * Config::bFieldFromZR(inPar(n, 2, 0), hipo(inPar(n, 0, 0), inPar(n, 1, 0)))
+             : 0.01f * Const::sol * Config::Bfield);
+    const float q = -bF * qbp;
+    const float theta = q * s[n - nmin];
+    //float sint, cost;
+    //vdt::fast_sincos(theta, sint, cost);
+    const float sint = std::sin(theta);
+    const float cost = std::cos(theta);
+    const float dx1 = inPar(n, 0, 0) - outPar(n, 0, 0);
+    const float dx2 = inPar(n, 1, 0) - outPar(n, 1, 0);
+    const float dx3 = inPar(n, 2, 0) - outPar(n, 2, 0);
+    float au = 1.f / sqrt(t11 * t11 + t12 * t12);
+    const float u11 = -au * t12;
+    const float u12 = au * t11;
+    const float v11 = -cosT[n - nmin] * u12;
+    const float v12 = cosT[n - nmin] * u11;
+    const float v13 = t11 * u12 - t12 * u11;
+    au = 1.f / sqrt(t21 * t21 + t22 * t22);
+    const float u21 = -au * t22;
+    const float u22 = au * t21;
+    const float v21 = -cosT[n - nmin] * u22;
+    const float v22 = cosT[n - nmin] * u21;
+    const float v23 = t21 * u22 - t22 * u21;
+    // now prepare the transport matrix
+    const float omcost = 1.f - cost;
+    const float tmsint = theta - sint;
+    //   1/p - doesn't change since |p1| = |p2|
+    errorPropCurv(n, 0, 0) = 1.f;
+    for (auto i = 1; i < 5; ++i)
+      errorPropCurv(n, 0, i) = 0.f;
+    //   lambda
+    errorPropCurv(n, 1, 0) = 0.f;
+    errorPropCurv(n, 1, 1) =
+        cost * (v11 * v21 + v12 * v22 + v13 * v23) + sint * (-v12 * v21 + v11 * v22) + omcost * v13 * v23;
+    errorPropCurv(n, 1, 2) = (cost * (u11 * v21 + u12 * v22) + sint * (-u12 * v21 + u11 * v22)) * sinT[n - nmin];
+    errorPropCurv(n, 1, 3) = 0.f;
+    errorPropCurv(n, 1, 4) = 0.f;
+    //   phi
+    errorPropCurv(n, 2, 0) = bF * v23 * (t21 * dx1 + t22 * dx2 + cosT[n - nmin] * dx3) * cosl1;
+    errorPropCurv(n, 2, 1) = (cost * (v11 * u21 + v12 * u22) + sint * (-v12 * u21 + v11 * u22) +
+                              v23 * (-sint * (v11 * t21 + v12 * t22 + v13 * cosT[n - nmin]) +
+                                     omcost * (-v11 * t22 + v12 * t21) - tmsint * cosT[n - nmin] * v13)) *
+                             cosl1;
+    errorPropCurv(n, 2, 2) = (cost * (u11 * u21 + u12 * u22) + sint * (-u12 * u21 + u11 * u22) +
+                              v23 * (-sint * (u11 * t21 + u12 * t22) + omcost * (-u11 * t22 + u12 * t21))) *
+                             cosl1 * sinT[n - nmin];
+    errorPropCurv(n, 2, 3) = -q * v23 * (u11 * t21 + u12 * t22) * cosl1;
+    errorPropCurv(n, 2, 4) = -q * v23 * (v11 * t21 + v12 * t22 + v13 * cosT[n - nmin]) * cosl1;
+    //   yt
+    float cutCriterion = fabs(s[n - nmin] * sinT[n - nmin] * inPar(n, 3, 0));
+    const float limit = 5.f;  // valid for propagations with effectively float precision
+    if (cutCriterion > limit) {
+      const float pp = 1.f / qbp;
+      errorPropCurv(n, 3, 0) = pp * (u21 * dx1 + u22 * dx2);
+      errorPropCurv(n, 4, 0) = pp * (v21 * dx1 + v22 * dx2 + v23 * dx3);
+    } else {
+      const float temp1 = -t12 * u21 + t11 * u22;
+      const float s2 = s[n - nmin] * s[n - nmin];
+      const float secondOrder41 = -0.5f * bF * temp1 * s2;
+      const float temp2 = -t11 * u21 - t12 * u22;
+      const float s3 = s2 * s[n - nmin];
+      const float s4 = s3 * s[n - nmin];
+      const float h2 = bF * bF;
+      const float h3 = h2 * bF;
+      const float qbp2 = qbp * qbp;
+      const float thirdOrder41 = 1.f / 3 * h2 * s3 * qbp * temp2;
+      const float fourthOrder41 = 1.f / 8 * h3 * s4 * qbp2 * temp1;
+      errorPropCurv(n, 3, 0) = secondOrder41 + (thirdOrder41 + fourthOrder41);
+      const float temp3 = -t12 * v21 + t11 * v22;
+      const float secondOrder51 = -0.5f * bF * temp3 * s2;
+      const float temp4 = -t11 * v21 - t12 * v22 - cosT[n - nmin] * v23;
+      const float thirdOrder51 = 1.f / 3 * h2 * s3 * qbp * temp4;
+      const float fourthOrder51 = 1.f / 8 * h3 * s4 * qbp2 * temp3;
+      errorPropCurv(n, 4, 0) = secondOrder51 + (thirdOrder51 + fourthOrder51);
+    }
+    errorPropCurv(n, 3, 1) = (sint * (v11 * u21 + v12 * u22) + omcost * (-v12 * u21 + v11 * u22)) / q;
+    errorPropCurv(n, 3, 2) = (sint * (u11 * u21 + u12 * u22) + omcost * (-u12 * u21 + u11 * u22)) * sinT[n - nmin] / q;
+    errorPropCurv(n, 3, 3) = (u11 * u21 + u12 * u22);
+    errorPropCurv(n, 3, 4) = (v11 * u21 + v12 * u22);
+    //   zt
+    errorPropCurv(n, 4, 1) =
+        (sint * (v11 * v21 + v12 * v22 + v13 * v23) + omcost * (-v12 * v21 + v11 * v22) + tmsint * v23 * v13) / q;
+    errorPropCurv(n, 4, 2) = (sint * (u11 * v21 + u12 * v22) + omcost * (-u12 * v21 + u11 * v22)) * sinT[n - nmin] / q;
+    errorPropCurv(n, 4, 3) = (u11 * v21 + u12 * v22);
+    errorPropCurv(n, 4, 4) = (v11 * v21 + v12 * v22 + v13 * v23);
+
+  }  //end loop over n
+
+//debug = true;
+#ifdef DEBUG
+  for (int n = nmin; n < nmax; ++n) {
+    if (debug && g_debug && n < N_proc) {
+      dmutex_guard;
+      std::cout << n << ": errorPropCurv" << std::endl;
+      printf("%5f %5f %5f %5f %5f\n",
+             errorPropCurv(n, 0, 0),
+             errorPropCurv(n, 0, 1),
+             errorPropCurv(n, 0, 2),
+             errorPropCurv(n, 0, 3),
+             errorPropCurv(n, 0, 4));
+      printf("%5f %5f %5f %5f %5f\n",
+             errorPropCurv(n, 1, 0),
+             errorPropCurv(n, 1, 1),
+             errorPropCurv(n, 1, 2),
+             errorPropCurv(n, 1, 3),
+             errorPropCurv(n, 1, 4));
+      printf("%5f %5f %5f %5f %5f\n",
+             errorPropCurv(n, 2, 0),
+             errorPropCurv(n, 2, 1),
+             errorPropCurv(n, 2, 2),
+             errorPropCurv(n, 2, 3),
+             errorPropCurv(n, 2, 4));
+      printf("%5f %5f %5f %5f %5f\n",
+             errorPropCurv(n, 3, 0),
+             errorPropCurv(n, 3, 1),
+             errorPropCurv(n, 3, 2),
+             errorPropCurv(n, 3, 3),
+             errorPropCurv(n, 3, 4));
+      printf("%5f %5f %5f %5f %5f\n",
+             errorPropCurv(n, 4, 0),
+             errorPropCurv(n, 4, 1),
+             errorPropCurv(n, 4, 2),
+             errorPropCurv(n, 4, 3),
+             errorPropCurv(n, 4, 4));
+      printf("\n");
+    }
+  }
+#endif
+
+  //now we need jacobians to convert to/from curvilinear and CCS
+  // code from TrackState::jacobianCCSToCurvilinear
+  MPlex56 jacCCS2Curv;
+  for (int n = nmin; n < nmax; ++n) {
+    for (int ii = 0; ii < 5; ii++) {
+      for (int jj = 0; jj < 6; jj++) {
+        jacCCS2Curv(n, ii, jj) = 0.f;
+      }
+    }
+    jacCCS2Curv(n, 0, 3) = inChg(n, 0, 0) * sinT[n - nmin];
+    jacCCS2Curv(n, 0, 5) = inChg(n, 0, 0) * cosT[n - nmin] * inPar(n, 3, 0);
+    jacCCS2Curv(n, 1, 5) = -1.f;
+    jacCCS2Curv(n, 2, 4) = 1.f;
+    jacCCS2Curv(n, 3, 0) = -sinPin[n - nmin];
+    jacCCS2Curv(n, 3, 1) = cosPin[n - nmin];
+    jacCCS2Curv(n, 4, 0) = -cosPin[n - nmin] * cosT[n - nmin];
+    jacCCS2Curv(n, 4, 1) = -sinPin[n - nmin] * cosT[n - nmin];
+    jacCCS2Curv(n, 4, 2) = sinT[n - nmin];
+  }
+
+  // code from TrackState::jacobianCurvilinearToCCS
+  MPlex65 jacCurv2CCS;
+  for (int n = nmin; n < nmax; ++n) {
+    for (int ii = 0; ii < 6; ii++) {
+      for (int jj = 0; jj < 5; jj++) {
+        jacCurv2CCS(n, ii, jj) = 0.f;
+      }
+    }
+
+    jacCurv2CCS(n, 0, 3) = -sinPout[n - nmin];
+    jacCurv2CCS(n, 0, 4) = -cosT[n - nmin] * cosPout[n - nmin];
+    jacCurv2CCS(n, 1, 3) = cosPout[n - nmin];
+    jacCurv2CCS(n, 1, 4) = -cosT[n - nmin] * sinPout[n - nmin];
+    jacCurv2CCS(n, 2, 4) = sinT[n - nmin];
+    jacCurv2CCS(n, 3, 0) = inChg(n, 0, 0) / sinT[n - nmin];
+    jacCurv2CCS(n, 3, 1) = outPar(n, 3, 0) * cosT[n - nmin] / sinT[n - nmin];
+    jacCurv2CCS(n, 4, 2) = 1.f;
+    jacCurv2CCS(n, 5, 1) = -1.f;
+  }
+
+  //need to compute errorProp = jacCurv2CCS*errorPropCurv*jacCCS2Curv
+  Matriplex::MPlex<float, 6, 5, NN> tmp;  // holds jacCurv2CCS * errorPropCurv
+  Matriplex::multiplyGeneral(jacCurv2CCS, errorPropCurv, tmp);
+  Matriplex::multiplyGeneral(tmp, jacCCS2Curv, errorProp);
+}
+
+// Root of A + B*s + C*s^2 = 0 with the smaller |s|; from P.Avery's notes (http://www.phys.ufl.edu/~avery/fitting/transport.pdf eq. 5)
+inline float getS(float delta0,
+                  float delta1,
+                  float delta2,
+                  float eta0,
+                  float eta1,
+                  float eta2,
+                  float sinP,
+                  float cosP,
+                  float sinT,
+                  float cosT,
+                  float pt,  // used as a transverse momentum below: p0 = (pt * cosP, pt * sinP, ...)
+                  int q,     // not referenced in the body
+                  float kinv) {
+  float A = delta0 * eta0 + delta1 * eta1 + delta2 * eta2;  // delta . eta
+  float ip = sinT / pt;
+  float p0[3] = {pt * cosP, pt * sinP, cosT / ip};
+  float B = (p0[0] * eta0 + p0[1] * eta1 + p0[2] * eta2) * ip;  // (p0 . eta) * ip
+  float rho = kinv * ip;
+  float C = (eta0 * p0[1] - eta1 * p0[0]) * rho * 0.5f * ip;
+  float sqb2m4ac = std::sqrt(B * B - 4.f * A * C);  // NaN when the discriminant is negative
+  float s1 = (-B + sqb2m4ac) * 0.5f / C;  // divides by C with no zero check
+  float s2 = (-B - sqb2m4ac) * 0.5f / C;
+#ifdef DEBUG
+  if (debug)
+    std::cout << "A=" << A << " B=" << B << " C=" << C << " s1=" << s1 << " s2=" << s2 << std::endl;
+#endif
+  // return the root with the smaller magnitude (s1 when the magnitudes tie or compare unordered)
+  return (std::abs(s1) > std::abs(s2) ? s2 : s1);
+}
+// Find the path length s from inPar to the plane (plPnt, plNrm), then fill outPar and errorProp for that s.
+template <typename Tf, typename Ti, typename TfLL1, typename Tf11, typename TfLLL, typename Tf1>
+static inline void helixAtPlane_impl(const Tf& __restrict__ inPar,
+                                     const Ti& __restrict__ inChg,
+                                     const Tf11& __restrict__ plPnt,
+                                     const Tf11& __restrict__ plNrm,
+                                     Tf1& __restrict__ s,
+                                     TfLL1& __restrict__ outPar,
+                                     TfLLL& __restrict__ errorProp,
+                                     Ti& __restrict__ outFailFlag,  // expected to be initialized to 0; not written here
+                                     const int nmin,
+                                     const int nmax,
+                                     const int N_proc,
+                                     const PropagationFlags& pf) {
+  for (int n = nmin; n < nmax; ++n) {
+    dprint_np(n,
+              "input parameters"
+                  << " inPar(n, 0, 0)=" << std::setprecision(9) << inPar(n, 0, 0) << " inPar(n, 1, 0)="
+                  << std::setprecision(9) << inPar(n, 1, 0) << " inPar(n, 2, 0)=" << std::setprecision(9)
+                  << inPar(n, 2, 0) << " inPar(n, 3, 0)=" << std::setprecision(9) << inPar(n, 3, 0)
+                  << " inPar(n, 4, 0)=" << std::setprecision(9) << inPar(n, 4, 0)
+                  << " inPar(n, 5, 0)=" << std::setprecision(9) << inPar(n, 5, 0));
+  }
+
+  float kinv[nmax - nmin];  // charge * 0.01 * (-sol * B), with B parameterized or constant depending on pf
+  if (pf.use_param_b_field) {
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      kinv[n - nmin] = inChg(n, 0, 0) * 0.01f *
+                       (-Const::sol * Config::bFieldFromZR(inPar(n, 2, 0), hipo(inPar(n, 0, 0), inPar(n, 1, 0))));
+    }
+  } else {
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      kinv[n - nmin] = inChg(n, 0, 0) * 0.01f * (-Const::sol * Config::Bfield);
+    }
+  }
+
+  float delta0[nmax - nmin];  // current position minus plane point, per component
+  float delta1[nmax - nmin];
+  float delta2[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    delta0[n - nmin] = inPar(n, 0, 0) - plPnt(n, 0, 0);
+    delta1[n - nmin] = inPar(n, 1, 0) - plPnt(n, 1, 0);
+    delta2[n - nmin] = inPar(n, 2, 0) - plPnt(n, 2, 0);
+  }
+
+  float sinP[nmax - nmin];
+  float cosP[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    sinP[n - nmin] = std::sin(inPar(n, 4, 0));
+    cosP[n - nmin] = std::cos(inPar(n, 4, 0));
+  }
+
+  // determine solution for straight line
+  float sl[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    // sl = -(plNrm . delta) / (plNrm . direction), with direction = (cosP * sinT, sinP * sinT, cosT)
+    sl[n - nmin] =
+        -(plNrm(n, 0, 0) * delta0[n - nmin] + plNrm(n, 1, 0) * delta1[n - nmin] + plNrm(n, 2, 0) * delta2[n - nmin]) /
+        (plNrm(n, 0, 0) * cosP[n - nmin] * std::sin(inPar(n, 5, 0)) +
+         plNrm(n, 1, 0) * sinP[n - nmin] * std::sin(inPar(n, 5, 0)) + plNrm(n, 2, 0) * std::cos(inPar(n, 5, 0)));
+  }
+
+  // s is provided by the caller (Tf1&) instead of being a local array
+  // first iteration outside the loop: getS solution, or dz / cosT when |plNrm z| is not below 1
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    s[n - nmin] = (std::abs(plNrm(n, 2, 0)) < 1.f
+                       ? getS(delta0[n - nmin],
+                              delta1[n - nmin],
+                              delta2[n - nmin],
+                              plNrm(n, 0, 0),
+                              plNrm(n, 1, 0),
+                              plNrm(n, 2, 0),
+                              sinP[n - nmin],
+                              cosP[n - nmin],
+                              std::sin(inPar(n, 5, 0)),
+                              std::cos(inPar(n, 5, 0)),
+                              1.f / inPar(n, 3, 0),  // getS takes pT; parameter 3 holds 1/pT
+                              inChg(n, 0, 0),
+                              kinv[n - nmin])
+                       : (plPnt.constAt(n, 2, 0) - inPar.constAt(n, 2, 0)) / std::cos(inPar.constAt(n, 5, 0)));
+  }
+
+  MPlexLV outParTmp;  // parameters at the current estimate of s, recomputed every iteration
+
+  CMS_UNROLL_LOOP_COUNT(Config::Niter - 1)
+  for (int i = 0; i < Config::Niter - 1; ++i) {
+    parsFromPathL_impl(inPar, outParTmp, kinv, s, nmin, nmax);
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      delta0[n - nmin] = outParTmp(n, 0, 0) - plPnt(n, 0, 0);
+      delta1[n - nmin] = outParTmp(n, 1, 0) - plPnt(n, 1, 0);
+      delta2[n - nmin] = outParTmp(n, 2, 0) - plPnt(n, 2, 0);
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      sinP[n - nmin] = std::sin(outParTmp(n, 4, 0));
+      cosP[n - nmin] = std::cos(outParTmp(n, 4, 0));
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      s[n - nmin] += (std::abs(plNrm(n, 2, 0)) < 1.f ? getS(delta0[n - nmin],
+                                                            delta1[n - nmin],
+                                                            delta2[n - nmin],
+                                                            plNrm(n, 0, 0),
+                                                            plNrm(n, 1, 0),
+                                                            plNrm(n, 2, 0),
+                                                            sinP[n - nmin],
+                                                            cosP[n - nmin],
+                                                            std::sin(inPar(n, 5, 0)),
+                                                            std::cos(inPar(n, 5, 0)),
+                                                            1.f / inPar(n, 3, 0),  // pT, not 1/pT
+                                                            inChg(n, 0, 0),
+                                                            kinv[n - nmin])
+                                                     : (plPnt.constAt(n, 2, 0) - outParTmp.constAt(n, 2, 0)) /
+                                                           std::cos(outParTmp.constAt(n, 5, 0)));
+    }
+  }  //end Niter-1
+
+  // use the straight-line sl when |sl| > |s| or s is not a normal float (s did not converge, e.g. very high pT)
+  for (int n = nmin; n < nmax; ++n) {
+#ifdef DEBUG
+    if (debug)
+      std::cout << "s[n - nmin]=" << s[n - nmin] << " sl[n - nmin]=" << sl[n - nmin]
+                << " std::isnan(s[n - nmin])=" << std::isnan(s[n - nmin])
+                << " std::isfinite(s[n - nmin])=" << std::isfinite(s[n - nmin])
+                << " std::isnormal(s[n - nmin])=" << std::isnormal(s[n - nmin]) << std::endl;
+#endif
+    if ((std::abs(sl[n - nmin]) > std::abs(s[n - nmin])) || std::isnormal(s[n - nmin]) == false)
+      s[n - nmin] = sl[n - nmin];
+  }
+
+#ifdef DEBUG
+  if (debug)
+    std::cout << "s=" << s[0] << std::endl;
+#endif
+  parsAndErrPropFromPathL_impl(inPar, inChg, outPar, kinv, s, errorProp, nmin, nmax, N_proc, pf);
+}
+
+/*
+// this function just calculates the path length (using the iterative approach as before)
+// and then calls parsAndErrPropFromPathL_impl for error propagation
+template <typename Tf, typename Ti, typename TfLL1, typename Tf11, typename TfLLL>
+static inline void helixAtRFromIterativeCCS_impl_new(const Tf& __restrict__ inPar,
+                                                     const Ti& __restrict__ inChg,
+                                                     const Tf11& __restrict__ msRad,
+                                                     TfLL1& __restrict__ outPar,
+                                                     TfLLL& __restrict__ errorProp,
+                                                     Ti& __restrict__ outFailFlag,  // expected to be initialized to 0
+                                                     const int nmin,
+                                                     const int nmax,
+                                                     const int N_proc,
+                                                     const PropagationFlags& pf) {
+
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    //initialize errorProp to identity matrix
+    errorProp(n, 0, 0) = 1.f;
+    errorProp(n, 1, 1) = 1.f;
+    errorProp(n, 2, 2) = 1.f;
+    errorProp(n, 3, 3) = 1.f;
+    errorProp(n, 4, 4) = 1.f;
+    errorProp(n, 5, 5) = 1.f;
+  }
+  float r0[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    //r0 from hipo() of the input x and y
+    r0[n - nmin] = hipo(inPar(n, 0, 0), inPar(n, 1, 0));
+  }
+  float k[nmax - nmin];
+  if (pf.use_param_b_field) {
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      k[n - nmin] = inChg(n, 0, 0) * 100.f / (-Const::sol * Config::bFieldFromZR(inPar(n, 2, 0), r0[n - nmin]));
+    }
+  } else {
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      k[n - nmin] = inChg(n, 0, 0) * 100.f / (-Const::sol * Config::Bfield);
+    }
+  }
+  float r[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    r[n - nmin] = msRad(n, 0, 0);
+  }
+  float xin[nmax - nmin];
+  float yin[nmax - nmin];
+  float ipt[nmax - nmin];
+  float phiin[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    xin[n - nmin] = inPar(n, 0, 0);
+    yin[n - nmin] = inPar(n, 1, 0);
+    ipt[n - nmin] = inPar(n, 3, 0);
+    phiin[n - nmin] = inPar(n, 4, 0);
+  }
+
+  for (int n = nmin; n < nmax; ++n) {
+    dprint_np(n,
+              "input parameters"
+                  << " inPar(n, 0, 0)=" << std::setprecision(9) << inPar(n, 0, 0) << " inPar(n, 1, 0)="
+                  << std::setprecision(9) << inPar(n, 1, 0) << " inPar(n, 2, 0)=" << std::setprecision(9)
+                  << inPar(n, 2, 0) << " inPar(n, 3, 0)=" << std::setprecision(9) << inPar(n, 3, 0)
+                  << " inPar(n, 4, 0)=" << std::setprecision(9) << inPar(n, 4, 0)
+                  << " inPar(n, 5, 0)=" << std::setprecision(9) << inPar(n, 5, 0));
+  }
+
+  float kinv[nmax - nmin];
+  float pt[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    kinv[n - nmin] = 1.f / k[n - nmin];
+    pt[n - nmin] = 1.f / ipt[n - nmin];
+  }
+  float D[nmax - nmin];
+  float cosa[nmax - nmin];
+  float sina[nmax - nmin];
+  float cosah[nmax - nmin];
+  float sinah[nmax - nmin];
+  float id[nmax - nmin];
+
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    D[n - nmin] = 0.;
+  }
+
+  //no trig approx here, phi can be large
+  float cosPorT[nmax - nmin];
+  float sinPorT[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    cosPorT[n - nmin] = std::cos(phiin[n - nmin]);
+  }
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    sinPorT[n - nmin] = std::sin(phiin[n - nmin]);
+  }
+
+  float pxin[nmax - nmin];
+  float pyin[nmax - nmin];
+#pragma omp simd
+  for (int n = nmin; n < nmax; ++n) {
+    pxin[n - nmin] = cosPorT[n - nmin] * pt[n - nmin];
+    pyin[n - nmin] = sinPorT[n - nmin] * pt[n - nmin];
+  }
+
+  for (int n = nmin; n < nmax; ++n) {
+    dprint_np(n,
+              "k=" << std::setprecision(9) << k[n - nmin] << " pxin=" << std::setprecision(9) << pxin[n - nmin]
+                   << " pyin=" << std::setprecision(9) << pyin[n - nmin] << " cosPorT=" << std::setprecision(9)
+                   << cosPorT[n - nmin] << " sinPorT=" << std::setprecision(9) << sinPorT[n - nmin]
+                   << " pt=" << std::setprecision(9) << pt[n - nmin]);
+  }
+
+  float oodotp[nmax - nmin];
+  float pxinold[nmax - nmin];
+
+  CMS_UNROLL_LOOP_COUNT(Config::Niter)
+  for (int i = 0; i < Config::Niter; ++i) {
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      //compute distance and path for the current iteration
+      r0[n - nmin] = hipo(xin[n - nmin], yin[n - nmin]);
+    }
+
+    // Use one over dot product of transverse momentum and radial
+    // direction to scale the step. Propagation is prevented from reaching
+    // too close to the apex (dotp > 0.2).
+    // - Can / should we come up with a better approximation?
+    // - Can / should take +/- curvature into account?
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      oodotp[n - nmin] =
+          r0[n - nmin] * pt[n - nmin] / (pxin[n - nmin] * xin[n - nmin] + pyin[n - nmin] * yin[n - nmin]);
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      if (oodotp[n - nmin] > 5.0f || oodotp[n - nmin] < 0)  // 0.2 is 78.5 deg
+      {
+        outFailFlag(n, 0, 0) = 1;
+        oodotp[n - nmin] = 0.0f;
+      } else if (r[n - nmin] - r0[n - nmin] < 0.0f && pt[n - nmin] < 1.0f) {
+        // Scale down the correction for low-pT ingoing tracks.
+        oodotp[n - nmin] = 1.0f + (oodotp[n - nmin] - 1.0f) * pt[n - nmin];
+      }
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      // Can we come up with a better approximation?
+      // Should take +/- curvature into account.
+      id[n - nmin] = (r[n - nmin] - r0[n - nmin]) * oodotp[n - nmin];
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      D[n - nmin] += id[n - nmin];
+    }
+
+    if constexpr (Config::useTrigApprox) {
+#if !defined(__INTEL_COMPILER)
+#pragma omp simd
+#endif
+      for (int n = nmin; n < nmax; ++n) {
+        sincos4(id[n - nmin] * ipt[n - nmin] * kinv[n - nmin] * 0.5f, sinah[n - nmin], cosah[n - nmin]);
+      }
+    } else {
+#if !defined(__INTEL_COMPILER)
+#pragma omp simd
+#endif
+      for (int n = nmin; n < nmax; ++n) {
+        cosah[n - nmin] = std::cos(id[n - nmin] * ipt[n - nmin] * kinv[n - nmin] * 0.5f);
+        sinah[n - nmin] = std::sin(id[n - nmin] * ipt[n - nmin] * kinv[n - nmin] * 0.5f);
+      }
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      cosa[n - nmin] = 1.f - 2.f * sinah[n - nmin] * sinah[n - nmin];
+      sina[n - nmin] = 2.f * sinah[n - nmin] * cosah[n - nmin];
+    }
+
+    for (int n = nmin; n < nmax; ++n) {
+      dprint_np(n,
+                "Attempt propagation from r="
+                    << r0[n - nmin] << " to r=" << r[n - nmin] << std::endl
+                    << "   x=" << xin[n - nmin] << " y=" << yin[n - nmin] << " z=" << inPar(n, 2, 0)
+                    << " px=" << pxin[n - nmin] << " py=" << pyin[n - nmin]
+                    << " pz=" << pt[n - nmin] / std::tan(inPar(n, 5, 0)) << " q=" << inChg(n, 0, 0) << std::endl
+                    << "   r=" << std::setprecision(9) << r[n - nmin] << " r0=" << std::setprecision(9) << r0[n - nmin]
+                    << " id=" << std::setprecision(9) << id[n - nmin] << " dr=" << std::setprecision(9)
+                    << r[n - nmin] - r0[n - nmin] << " cosa=" << cosa[n - nmin] << " sina=" << sina[n - nmin]
+                    << " dir_cos(rad,pT)=" << 1.0f / oodotp[n - nmin]);
+    }
+
+#pragma omp simd
+    for (int n = nmin; n < nmax; ++n) {
+      //update parameters
+      xin[n - nmin] = xin[n - nmin] + 2.f * k[n - nmin] * sinah[n - nmin] *
+                                              (pxin[n - nmin] * cosah[n - nmin] - pyin[n - nmin] * sinah[n - nmin]);
+      yin[n - nmin] = yin[n - nmin] + 2.f * k[n - nmin] * sinah[n - nmin] *
+                                              (pyin[n - nmin] * cosah[n - nmin] + pxin[n - nmin] * sinah[n - nmin]);
+      pxinold[n - nmin] = pxin[n - nmin];  //copy before overwriting
+      pxin[n - nmin] = pxin[n - nmin] * cosa[n - nmin] - pyin[n - nmin] * sina[n - nmin];
+      pyin[n - nmin] = pyin[n - nmin] * cosa[n - nmin] + pxinold[n - nmin] * sina[n - nmin];
+    }
+    for (int n = nmin; n < nmax; ++n) {
+      dprint_np(n,
+                "outPar(n, 0, 0)=" << outPar(n, 0, 0) << " outPar(n, 1, 0)=" << outPar(n, 1, 0)
+                                   << " pxin=" << pxin[n - nmin] << " pyin=" << pyin[n - nmin]);
+    }
+  }  // iteration loop
+
+  //float s[nmax - nmin];
+  MPlexQF s;
+  for (int n = nmin; n < nmax; ++n) {
+    //s[n - nmin] = D[n - nmin]/std::sin(inPar(n, 5, 0));
+    s(n, 0, 0) = D[n - nmin]/std::sin(inPar(n, 5, 0));
+  }
+  parsAndErrPropFromPathL_impl(inPar, inChg, outPar, kinv, s, errorProp, nmin, nmax, N_proc, pf);
+
+  for (int n = nmin; n < nmax; ++n) {
+    dprint_np(n,
+              "propagation to R end (NEW), dump parameters\n"
+                  << "   D = " << D[n - nmin] << " alpha = " << D[n - nmin] * inPar(n, 3, 0) * kinv[n - nmin] << " kinv = " << kinv[n - nmin] << std::endl
+                  << "   pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << "\t\t r="
+                  << std::sqrt(outPar(n, 0, 0) * outPar(n, 0, 0) + outPar(n, 1, 0) * outPar(n, 1, 0)) << std::endl
+                  << "   mom = " << outPar(n, 3, 0) << " " << outPar(n, 4, 0) << " " << outPar(n, 5, 0) << std::endl
+		  << " cart= " << std::cos(outPar(n, 4, 0)) / outPar(n, 3, 0) << " "
+                  << std::sin(outPar(n, 4, 0)) / outPar(n, 3, 0) << " " << 1. / (outPar(n, 3, 0) * tan(outPar(n, 5, 0)))
+                  << "\t\tpT=" << 1. / std::abs(outPar(n, 3, 0)) << std::endl);
+  }
+
+#ifdef DEBUG
+  for (int n = nmin; n < nmax; ++n) {
+    if (debug && g_debug && n < N_proc) {
+      dmutex_guard;
+      std::cout << n << ": jacobian" << std::endl;
+      printf("%5f %5f %5f %5f %5f %5f\n",
+             errorProp(n, 0, 0),
+             errorProp(n, 0, 1),
+             errorProp(n, 0, 2),
+             errorProp(n, 0, 3),
+             errorProp(n, 0, 4),
+             errorProp(n, 0, 5));
+      printf("%5f %5f %5f %5f %5f %5f\n",
+             errorProp(n, 1, 0),
+             errorProp(n, 1, 1),
+             errorProp(n, 1, 2),
+             errorProp(n, 1, 3),
+             errorProp(n, 1, 4),
+             errorProp(n, 1, 5));
+      printf("%5f %5f %5f %5f %5f %5f\n",
+             errorProp(n, 2, 0),
+             errorProp(n, 2, 1),
+             errorProp(n, 2, 2),
+             errorProp(n, 2, 3),
+             errorProp(n, 2, 4),
+             errorProp(n, 2, 5));
+      printf("%5f %5f %5f %5f %5f %5f\n",
+             errorProp(n, 3, 0),
+             errorProp(n, 3, 1),
+             errorProp(n, 3, 2),
+             errorProp(n, 3, 3),
+             errorProp(n, 3, 4),
+             errorProp(n, 3, 5));
+      printf("%5f %5f %5f %5f %5f %5f\n",
+             errorProp(n, 4, 0),
+             errorProp(n, 4, 1),
+             errorProp(n, 4, 2),
+             errorProp(n, 4, 3),
+             errorProp(n, 4, 4),
+             errorProp(n, 4, 5));
+      printf("%5f %5f %5f %5f %5f %5f\n",
+             errorProp(n, 5, 0),
+             errorProp(n, 5, 1),
+             errorProp(n, 5, 2),
+             errorProp(n, 5, 3),
+             errorProp(n, 5, 4),
+             errorProp(n, 5, 5));
+      printf("\n");
+    }
+  }
+#endif
+
+}
+*/
+
 template <typename Tf, typename Ti, typename TfLL1, typename Tf11, typename TfLLL>
 static inline void helixAtRFromIterativeCCS_impl(const Tf& __restrict__ inPar,
                                                  const Ti& __restrict__ inChg,
@@ -69,6 +816,7 @@ static inline void helixAtRFromIterativeCCS_impl(const Tf& __restrict__ inPar,
     //dprint(std::endl);
   }
 
+  //debug = true;
   for (int n = nmin; n < nmax; ++n) {
     dprint_np(n,
               "input parameters"
@@ -240,7 +988,7 @@ static inline void helixAtRFromIterativeCCS_impl(const Tf& __restrict__ inPar,
                     << "   r=" << std::setprecision(9) << r[n - nmin] << " r0=" << std::setprecision(9) << r0[n - nmin]
                     << " id=" << std::setprecision(9) << id[n - nmin] << " dr=" << std::setprecision(9)
                     << r[n - nmin] - r0[n - nmin] << " cosa=" << cosa[n - nmin] << " sina=" << sina[n - nmin]
-                    << " dir_cos(rad,pT)=" << 1.0f / oodotp[n]);
+                    << " dir_cos(rad,pT)=" << 1.0f / oodotp[n - nmin]);
     }
 
     //update derivatives on total distance
@@ -457,9 +1205,12 @@ static inline void helixAtRFromIterativeCCS_impl(const Tf& __restrict__ inPar,
   for (int n = nmin; n < nmax; ++n) {
     dprint_np(n,
               "propagation end, dump parameters\n"
+                  << "   D = " << D[n - nmin] << " alpha = " << alpha[n - nmin] << " kinv = " << kinv[n - nmin]
+                  << std::endl
                   << "   pos = " << outPar(n, 0, 0) << " " << outPar(n, 1, 0) << " " << outPar(n, 2, 0) << "\t\t r="
                   << std::sqrt(outPar(n, 0, 0) * outPar(n, 0, 0) + outPar(n, 1, 0) * outPar(n, 1, 0)) << std::endl
-                  << "   mom = " << std::cos(outPar(n, 4, 0)) / outPar(n, 3, 0) << " "
+                  << "   mom = " << outPar(n, 3, 0) << " " << outPar(n, 4, 0) << " " << outPar(n, 5, 0) << std::endl
+                  << "   cart= " << std::cos(outPar(n, 4, 0)) / outPar(n, 3, 0) << " "
                   << std::sin(outPar(n, 4, 0)) / outPar(n, 3, 0) << " " << 1. / (outPar(n, 3, 0) * tan(outPar(n, 5, 0)))
                   << "\t\tpT=" << 1. / std::abs(outPar(n, 3, 0)) << std::endl);
   }