diff --git a/.github/workflows/datamodel-doc.yml b/.github/workflows/datamodel-doc.yml index 0146fbd35fe46..6717a5b08570c 100644 --- a/.github/workflows/datamodel-doc.yml +++ b/.github/workflows/datamodel-doc.yml @@ -70,7 +70,7 @@ jobs: # Send pull request # We need to use "gh" ourselves because alisw/pull-request gets # confused when multiple repos are checked out. - GH_TOKEN=${{ secrets.GITHUB_TOKEN }} gh pr create -B \ + GH_TOKEN="$GITHUB_TOKEN" gh pr create -B \ AliceO2Group:master -H alibuild:auto-datamodel-doc \ --no-maintainer-edit -t 'Automatic data model update' -b "This update \ to the data model documentation was automatically created from \ diff --git a/Common/SimConfig/include/SimConfig/G4Params.h b/Common/SimConfig/include/SimConfig/G4Params.h index c6a5bc0882320..fd36ae046d520 100644 --- a/Common/SimConfig/include/SimConfig/G4Params.h +++ b/Common/SimConfig/include/SimConfig/G4Params.h @@ -22,17 +22,24 @@ namespace conf // enumerating the possible G4 physics settings enum class EG4Physics { - kFTFP_BERT_optical = 0, /* just ordinary */ - kFTFP_BERT_optical_biasing = 1, /* with biasing enabled */ - kFTFP_INCLXX_optical = 2, /* special INCL++ version */ - kFTFP_BERT_HP_optical = 3 /* enable low energy neutron transport */ + kFTFP_BERT_optical = 0, /* just ordinary */ + kFTFP_BERT_optical_biasing = 1, /* with biasing enabled */ + kFTFP_INCLXX_optical = 2, /* special INCL++ version */ + kFTFP_BERT_HP_optical = 3, /* enable low energy neutron transport */ + kFTFP_BERT_EMV_optical = 4, /* just ordinary with faster electromagnetic physics */ + kFTFP_BERT_EMV_optical_biasing = 5, /* with biasing enabled with faster electromagnetic physics */ + kFTFP_INCLXX_EMV_optical = 6, /* special INCL++ version */ + kFTFP_BERT_EMV_HP_optical = 7, /* enable low energy neutron transport */ + kUSER = 8 /* allows to give own string combination */ }; // parameters to influence the G4 engine struct G4Params : public o2::conf::ConfigurableParamHelper { - EG4Physics physicsmode 
= EG4Physics::kFTFP_BERT_optical; // physics mode with which to configure G4 + EG4Physics physicsmode = EG4Physics::kFTFP_BERT_EMV_optical; // default physics mode with which to configure G4 std::string configMacroFile = ""; // a user provided g4Config.in file (otherwise standard one fill be taken) + std::string userPhysicsList = ""; // possibility to directly give physics list as string + std::string const& getPhysicsConfigString() const; O2ParamDef(G4Params, "G4"); diff --git a/Common/SimConfig/src/G4Params.cxx b/Common/SimConfig/src/G4Params.cxx index 48f7a1c240d7b..37625d914a85b 100644 --- a/Common/SimConfig/src/G4Params.cxx +++ b/Common/SimConfig/src/G4Params.cxx @@ -19,12 +19,21 @@ namespace conf namespace { -static const std::string confstrings[4] = {"FTFP_BERT_EMV+optical", "FTFP_BERT_EMV+optical+biasing", "FTFP_INCLXX_EMV+optical", +static const std::string confstrings[8] = {"FTFP_BERT+optical", + "FTFP_BERT+optical+biasing", + "FTFP_INCLXX+optical", + "FTFP_BERT_HP+optical", + "FTFP_BERT_EMV+optical", + "FTFP_BERT_EMV+optical+biasing", + "FTFP_INCLXX_EMV+optical", "FTFP_BERT_HP_EMV+optical"}; } std::string const& G4Params::getPhysicsConfigString() const { + if (physicsmode == o2::conf::EG4Physics::kUSER) { + return userPhysicsList; + } return confstrings[(int)physicsmode]; } diff --git a/Detectors/CTP/workflowScalers/src/ctpCCDBManager.cxx b/Detectors/CTP/workflowScalers/src/ctpCCDBManager.cxx index 0a462726deb01..3484cb97279b5 100644 --- a/Detectors/CTP/workflowScalers/src/ctpCCDBManager.cxx +++ b/Detectors/CTP/workflowScalers/src/ctpCCDBManager.cxx @@ -48,7 +48,7 @@ int ctpCCDBManager::saveRunScalersToCCDB(CTPRunScalers& scalers, long timeStart, if (ret == 0) { LOG(info) << "CTP scalers saved in ccdb:" << mCCDBHost << " run:" << scalers.getRunNumber() << " tmin:" << tmin << " tmax:" << tmax; } else { - LOG(FATAL) << "Problem writing to database ret:" << ret; + LOG(fatal) << "Problem writing to database ret:" << ret; } return ret; } @@ -76,7 +76,7 @@ int 
ctpCCDBManager::saveRunScalersToQCDB(CTPRunScalers& scalers, long timeStart, if (ret == 0) { LOG(info) << "CTP scalers saved in qcdb:" << mQCDBHost << " run:" << scalers.getRunNumber() << " tmin:" << tmin << " tmax:" << tmax; } else { - LOG(FATAL) << "CTP scalers Problem writing to database qcdb ret:" << ret; + LOG(fatal) << "CTP scalers Problem writing to database qcdb ret:" << ret; } return ret; } @@ -103,7 +103,7 @@ int ctpCCDBManager::saveRunConfigToCCDB(CTPConfiguration* cfg, long timeStart) if (ret == 0) { LOG(info) << "CTP config saved in ccdb:" << mCCDBHost << " run:" << cfg->getRunNumber() << " tmin:" << tmin << " tmax:" << tmax; } else { - LOG(FATAL) << "CTPConfig: Problem writing to database ret:" << ret; + LOG(fatal) << "CTPConfig: Problem writing to database ret:" << ret; } return ret; } diff --git a/Detectors/EMCAL/base/src/Geometry.cxx b/Detectors/EMCAL/base/src/Geometry.cxx index 5d4b793525e20..920dc24823e83 100644 --- a/Detectors/EMCAL/base/src/Geometry.cxx +++ b/Detectors/EMCAL/base/src/Geometry.cxx @@ -309,6 +309,9 @@ void Geometry::DefineSamplingFraction(const std::string_view mcname, const std:: } Float_t samplingFactorTranportModel = 1.; + // Note: The sampling factors are chosen so that results from the simulation + // engines correspond well with testbeam data + if (contains(mcname, "Geant3")) { samplingFactorTranportModel = 1.; // 0.988 // Do nothing } else if (contains(mcname, "Fluka")) { @@ -318,6 +321,9 @@ void Geometry::DefineSamplingFraction(const std::string_view mcname, const std:: LOG(info) << "Selected physics list: " << physicslist; // sampling factors for different Geant4 physics list // GEANT4 10.7 -> EMCAL-784 + + // set a default (there may be many physics list strings) + samplingFactorTranportModel = 0.81; if (physicslist == "FTFP_BERT_EMV+optical") { samplingFactorTranportModel = 0.821; } else if (physicslist == "FTFP_BERT_EMV+optical+biasing") { diff --git a/Detectors/TPC/qc/src/PID.cxx b/Detectors/TPC/qc/src/PID.cxx index 
556dcb273b627..76d747165ea32 100644 --- a/Detectors/TPC/qc/src/PID.cxx +++ b/Detectors/TPC/qc/src/PID.cxx @@ -131,8 +131,8 @@ bool PID::processTrack(const o2::tpc::TrackTPC& track, size_t nTracks) { // ===| variables required for cutting and filling |=== const auto& dEdx = track.getdEdx(); - const auto magCharge = track.getAbsCharge(); - const auto pTPC = track.getP() * magCharge; // charge magnitude is divided getP() via getPtInv therefore magCharge is required to be multiplied [fix for He3] + const auto absCharge = track.getAbsCharge(); + const auto pTPC = (absCharge > 0) ? (track.getP() / absCharge) : track.getP(); // charge magnitude is divided getP() via getPtInv for pTPC/Z [fix for He3] const auto tgl = track.getTgl(); const auto snp = track.getSnp(); const auto phi = track.getPhi(); diff --git a/Detectors/TPC/simulation/include/TPCSimulation/Digitizer.h b/Detectors/TPC/simulation/include/TPCSimulation/Digitizer.h index e2c7e9ec7d175..1c76dcaaeedd3 100644 --- a/Detectors/TPC/simulation/include/TPCSimulation/Digitizer.h +++ b/Detectors/TPC/simulation/include/TPCSimulation/Digitizer.h @@ -138,6 +138,9 @@ class Digitizer void setMeanLumiDistortions(float meanLumi); void setMeanLumiDistortionsDerivative(float meanLumi); + /// in case of scaled distortions, the distortions can be recalculated to ensure consistent distortions and corrections + void recalculateDistortions(); + private: DigitContainer mDigitContainer; ///< Container for the Digits std::unique_ptr mSpaceCharge; ///< Handler of full distortions (static + IR dependant) @@ -151,7 +154,8 @@ class Digitizer bool mUseSCDistortions = false; ///< Flag to switch on the use of space-charge distortions int mDistortionScaleType = 0; ///< type=0: no scaling of distortions, type=1 distortions without any scaling, type=2 distortions scaling with lumi float mLumiScaleFactor = 0; ///< value used to scale the derivative map - ClassDefNV(Digitizer, 2); + bool mUseScaledDistortions = false; ///< whether the distortions 
are already scaled + ClassDefNV(Digitizer, 3); }; } // namespace tpc } // namespace o2 diff --git a/Detectors/TPC/simulation/src/Digitizer.cxx b/Detectors/TPC/simulation/src/Digitizer.cxx index 38968b3b38bb9..290cd84df9a6b 100644 --- a/Detectors/TPC/simulation/src/Digitizer.cxx +++ b/Detectors/TPC/simulation/src/Digitizer.cxx @@ -83,7 +83,7 @@ void Digitizer::process(const std::vector& hits, if (mDistortionScaleType == 1) { mSpaceCharge->distortElectron(posEle); } else if (mDistortionScaleType == 2) { - mSpaceCharge->distortElectron(posEle, mSpaceChargeDer.get(), mLumiScaleFactor); + mSpaceCharge->distortElectron(posEle, (mUseScaledDistortions ? nullptr : mSpaceChargeDer.get()), mLumiScaleFactor); } /// Remove electrons that end up more than three sigma of the hit's average diffusion away from the current sector @@ -237,3 +237,33 @@ void Digitizer::setMeanLumiDistortionsDerivative(float meanLumi) { mSpaceChargeDer->setMeanLumi(meanLumi); } + +void Digitizer::recalculateDistortions() +{ + if (!mSpaceCharge || !mSpaceChargeDer) { + LOGP(info, "Average or derivative distortions not set"); + return; + } + + // recalculate distortions only in case the inst lumi differs from the avg lumi + if (mSpaceCharge->getMeanLumi() != CorrMapParam::Instance().lumiInst) { + for (int iside = 0; iside < 2; ++iside) { + const o2::tpc::Side side = (iside == 0) ? 
Side::A : Side::C; + // this needs to be done only once + LOGP(info, "Calculating corrections for average distortions"); + mSpaceCharge->calcGlobalCorrWithGlobalDistIterative(side, nullptr, 0); + + LOGP(info, "Calculating corrections for derivative distortions"); + mSpaceChargeDer->calcGlobalCorrWithGlobalDistIterative(side, nullptr, 0); + + LOGP(info, "Calculating scaled distortions with scaling factor {}", mLumiScaleFactor); + mSpaceCharge->calcGlobalDistWithGlobalCorrIterative(side, mSpaceChargeDer.get(), mLumiScaleFactor); + } + // set new lumi of avg map + mSpaceCharge->setMeanLumi(CorrMapParam::Instance().lumiInst); + } else { + LOGP(info, "Inst. lumi {} is same as mean lumi {}. Skip recalculation of distortions", CorrMapParam::Instance().lumiInst, mSpaceCharge->getMeanLumi()); + } + + mUseScaledDistortions = true; +} diff --git a/Detectors/TPC/spacecharge/include/TPCSpaceCharge/SpaceCharge.h b/Detectors/TPC/spacecharge/include/TPCSpaceCharge/SpaceCharge.h index deed612e846f8..4362d95738692 100644 --- a/Detectors/TPC/spacecharge/include/TPCSpaceCharge/SpaceCharge.h +++ b/Detectors/TPC/spacecharge/include/TPCSpaceCharge/SpaceCharge.h @@ -354,7 +354,20 @@ class SpaceCharge /// \param approachR when the difference between the desired r coordinate and the position of the global correction is deltaR, approach the desired r coordinate by deltaR * \p approachR. /// \param approachPhi when the difference between the desired phi coordinate and the position of the global correction is deltaPhi, approach the desired phi coordinate by deltaPhi * \p approachPhi. 
/// \param diffCorr if the absolute differences from the interpolated values for the global corrections from the last iteration compared to the current iteration is smaller than this value, set converged to true for current global distortion - void calcGlobalDistWithGlobalCorrIterative(const DistCorrInterpolator& globCorr, const int maxIter = 100, const DataT approachZ = 0.5, const DataT approachR = 0.5, const DataT approachPhi = 0.5, const DataT diffCorr = 1e-6); + /// \param type whether to calculate distortions or corrections + void calcGlobalDistWithGlobalCorrIterative(const DistCorrInterpolator& globCorr, const int maxIter = 100, const DataT approachZ = 1, const DataT approachR = 1, const DataT approachPhi = 1, const DataT diffCorr = 50e-6, const SpaceCharge* scSCale = nullptr, float scale = 0); + + /// step 5: calculate global distortions using the global corrections (FAST) + /// \param scSCale possible second sc object + /// \param scale scaling for second sc object + void calcGlobalDistWithGlobalCorrIterative(const Side side, const SpaceCharge* scSCale = nullptr, float scale = 0, const int maxIter = 100, const DataT approachZ = 1, const DataT approachR = 1, const DataT approachPhi = 1, const DataT diffCorr = 50e-6); + void calcGlobalDistWithGlobalCorrIterative(const SpaceCharge* scSCale = nullptr, float scale = 0, const int maxIter = 100, const DataT approachZ = 1, const DataT approachR = 1, const DataT approachPhi = 1, const DataT diffCorr = 50e-6); + + /// calculate global corrections from global distortions + /// \param scSCale possible second sc object + /// \param scale scaling for second sc object + void calcGlobalCorrWithGlobalDistIterative(const Side side, const SpaceCharge* scSCale = nullptr, float scale = 0, const int maxIter = 100, const DataT approachZ = 1, const DataT approachR = 1, const DataT approachPhi = 1, const DataT diffCorr = 50e-6); + void calcGlobalCorrWithGlobalDistIterative(const SpaceCharge* scSCale = nullptr, float scale = 0, 
const int maxIter = 100, const DataT approachZ = 1, const DataT approachR = 1, const DataT approachPhi = 1, const DataT diffCorr = 50e-6); /// \return returns number of vertices in z direction unsigned short getNZVertices() const { return mParamGrid.NZVertices; } @@ -1359,6 +1372,8 @@ class SpaceCharge /// set potentialsdue to ROD misalignment void initRodAlignmentVoltages(const MisalignmentType misalignmentType, const FCType fcType, const int sector, const Side side, const float deltaPot); + void calcGlobalDistCorrIterative(const DistCorrInterpolator& globCorr, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr, const SpaceCharge* scSCale, float scale, const Type type); + ClassDefNV(SpaceCharge, 6); }; diff --git a/Detectors/TPC/spacecharge/src/SpaceCharge.cxx b/Detectors/TPC/spacecharge/src/SpaceCharge.cxx index e51b8de0850f6..6748de7ae9916 100644 --- a/Detectors/TPC/spacecharge/src/SpaceCharge.cxx +++ b/Detectors/TPC/spacecharge/src/SpaceCharge.cxx @@ -697,12 +697,59 @@ void SpaceCharge::calcEField(const Side side) } template -void SpaceCharge::calcGlobalDistWithGlobalCorrIterative(const DistCorrInterpolator& globCorr, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr) +void SpaceCharge::calcGlobalDistWithGlobalCorrIterative(const DistCorrInterpolator& globCorr, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr, const SpaceCharge* scSCale, float scale) +{ + calcGlobalDistCorrIterative(globCorr, maxIter, approachZ, approachR, approachPhi, diffCorr, scSCale, scale, Type::Distortions); +} + +template +void SpaceCharge::calcGlobalDistWithGlobalCorrIterative(const Side side, const SpaceCharge* scSCale, float scale, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr) +{ + 
calcGlobalDistCorrIterative(getGlobalCorrInterpolator(side), maxIter, approachZ, approachR, approachPhi, diffCorr, scSCale, scale, Type::Distortions); +} + +template +void SpaceCharge::calcGlobalDistWithGlobalCorrIterative(const SpaceCharge* scSCale, float scale, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr) +{ +#pragma omp parallel for num_threads(sNThreads) + for (int iside = 0; iside < FNSIDES; ++iside) { + const o2::tpc::Side side = (iside == 0) ? Side::A : Side::C; + calcGlobalDistWithGlobalCorrIterative(side, scSCale, scale, maxIter, approachZ, approachR, approachPhi, diffCorr); + } +} + +template +void SpaceCharge::calcGlobalCorrWithGlobalDistIterative(const Side side, const SpaceCharge* scSCale, float scale, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr) +{ + calcGlobalDistCorrIterative(getGlobalDistInterpolator(side), maxIter, approachZ, approachR, approachPhi, diffCorr, scSCale, scale, Type::Corrections); +} + +template +void SpaceCharge::calcGlobalCorrWithGlobalDistIterative(const SpaceCharge* scSCale, float scale, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr) +{ +#pragma omp parallel for num_threads(sNThreads) + for (int iside = 0; iside < FNSIDES; ++iside) { + const o2::tpc::Side side = (iside == 0) ? 
Side::A : Side::C; + calcGlobalCorrWithGlobalDistIterative(side, scSCale, scale, maxIter, approachZ, approachR, approachPhi, diffCorr); + } +} + +template +void SpaceCharge::calcGlobalDistCorrIterative(const DistCorrInterpolator& globCorr, const int maxIter, const DataT approachZ, const DataT approachR, const DataT approachPhi, const DataT diffCorr, const SpaceCharge* scSCale, float scale, const Type type) { const Side side = globCorr.getSide(); - initContainer(mGlobalDistdR[side], true); - initContainer(mGlobalDistdZ[side], true); - initContainer(mGlobalDistdRPhi[side], true); + if (type == Type::Distortions) { + initContainer(mGlobalDistdR[side], true); + initContainer(mGlobalDistdZ[side], true); + initContainer(mGlobalDistdRPhi[side], true); + } else { + initContainer(mGlobalCorrdR[side], true); + initContainer(mGlobalCorrdZ[side], true); + initContainer(mGlobalCorrdRPhi[side], true); + } + + const auto& scSCaleInterpolator = (type == Type::Distortions) ? scSCale->mInterpolatorGlobalCorr[side] : scSCale->mInterpolatorGlobalDist[side]; + #pragma omp parallel for num_threads(sNThreads) for (unsigned int iPhi = 0; iPhi < mParamGrid.NPhiVertices; ++iPhi) { const DataT phi = getPhiVertex(iPhi, side); @@ -754,13 +801,25 @@ void SpaceCharge::calcGlobalDistWithGlobalCorrIterative(const DistCorrInt // interpolate global correction at new point and calculate position of global correction corrdR = globCorr.evaldR(zCurrPos, rCurrPos, phiCurrPos); + if (scSCale && scale != 0) { + corrdR += scale * scSCaleInterpolator.evaldR(zCurrPos, rCurrPos, phiCurrPos); + } const DataT rNewPos = rCurrPos + corrdR; - const DataT corrPhi = globCorr.evaldRPhi(zCurrPos, rCurrPos, phiCurrPos) / rCurrPos; + DataT corrPhi = 0; + if (scSCale && scale != 0) { + corrPhi = scale * scSCaleInterpolator.evaldRPhi(zCurrPos, rCurrPos, phiCurrPos); + } + corrPhi += globCorr.evaldRPhi(zCurrPos, rCurrPos, phiCurrPos); + corrPhi /= rCurrPos; + corrdRPhi = corrPhi * rNewPos; // normalize to new r coordinate 
const DataT phiNewPos = phiCurrPos + corrPhi; corrdZ = globCorr.evaldZ(zCurrPos, rCurrPos, phiCurrPos); + if (scSCale && scale != 0) { + corrdZ += scale * scSCaleInterpolator.evaldZ(zCurrPos, rCurrPos, phiCurrPos); + } const DataT zNewPos = zCurrPos + corrdZ; // approach desired coordinate @@ -783,15 +842,27 @@ void SpaceCharge::calcGlobalDistWithGlobalCorrIterative(const DistCorrInt lastCorrdRPhi = corrdRPhi; } // set global distortions if algorithm converged or iterations exceed max numbers of iterations - mGlobalDistdR[side](iZ, iR, iPhi) = -corrdR; - mGlobalDistdRPhi[side](iZ, iR, iPhi) = -corrdRPhi; - mGlobalDistdZ[side](iZ, iR, iPhi) = -corrdZ; + if (type == Type::Distortions) { + mGlobalDistdR[side](iZ, iR, iPhi) = -corrdR; + mGlobalDistdRPhi[side](iZ, iR, iPhi) = -corrdRPhi; + mGlobalDistdZ[side](iZ, iR, iPhi) = -corrdZ; + } else { + mGlobalCorrdR[side](iZ, iR, iPhi) = -corrdR; + mGlobalCorrdRPhi[side](iZ, iR, iPhi) = -corrdRPhi; + mGlobalCorrdZ[side](iZ, iR, iPhi) = -corrdZ; + } } } for (unsigned int iR = 0; iR < mParamGrid.NRVertices; ++iR) { - mGlobalDistdR[side](0, iR, iPhi) = 3 * (mGlobalDistdR[side](1, iR, iPhi) - mGlobalDistdR[side](2, iR, iPhi)) + mGlobalDistdR[side](3, iR, iPhi); - mGlobalDistdRPhi[side](0, iR, iPhi) = 3 * (mGlobalDistdRPhi[side](1, iR, iPhi) - mGlobalDistdRPhi[side](2, iR, iPhi)) + mGlobalDistdRPhi[side](3, iR, iPhi); - mGlobalDistdZ[side](0, iR, iPhi) = 3 * (mGlobalDistdZ[side](1, iR, iPhi) - mGlobalDistdZ[side](2, iR, iPhi)) + mGlobalDistdZ[side](3, iR, iPhi); + if (type == Type::Distortions) { + mGlobalDistdR[side](0, iR, iPhi) = 3 * (mGlobalDistdR[side](1, iR, iPhi) - mGlobalDistdR[side](2, iR, iPhi)) + mGlobalDistdR[side](3, iR, iPhi); + mGlobalDistdRPhi[side](0, iR, iPhi) = 3 * (mGlobalDistdRPhi[side](1, iR, iPhi) - mGlobalDistdRPhi[side](2, iR, iPhi)) + mGlobalDistdRPhi[side](3, iR, iPhi); + mGlobalDistdZ[side](0, iR, iPhi) = 3 * (mGlobalDistdZ[side](1, iR, iPhi) - mGlobalDistdZ[side](2, iR, iPhi)) + mGlobalDistdZ[side](3, 
iR, iPhi); + } else { + mGlobalCorrdR[side](0, iR, iPhi) = 3 * (mGlobalCorrdR[side](1, iR, iPhi) - mGlobalCorrdR[side](2, iR, iPhi)) + mGlobalCorrdR[side](3, iR, iPhi); + mGlobalCorrdRPhi[side](0, iR, iPhi) = 3 * (mGlobalCorrdRPhi[side](1, iR, iPhi) - mGlobalCorrdRPhi[side](2, iR, iPhi)) + mGlobalCorrdRPhi[side](3, iR, iPhi); + mGlobalCorrdZ[side](0, iR, iPhi) = 3 * (mGlobalCorrdZ[side](1, iR, iPhi) - mGlobalCorrdZ[side](2, iR, iPhi)) + mGlobalCorrdZ[side](3, iR, iPhi); + } } } } @@ -1535,7 +1606,7 @@ void SpaceCharge::distortElectron(GlobalPosition3D& point, const SpaceCha // scale distortions if requested if (scSCale && scale != 0) { - scSCale->getDistortions(point.X(), point.Y(), point.Z(), side, distXTmp, distYTmp, distZTmp); + scSCale->getDistortions(point.X() + distX, point.Y() + distY, point.Z() + distZ, side, distXTmp, distYTmp, distZTmp); distX += distXTmp * scale; distY += distYTmp * scale; distZ += distZTmp * scale; diff --git a/Framework/Core/src/CommonServices.cxx b/Framework/Core/src/CommonServices.cxx index 39c17599abb9d..0e84333afe115 100644 --- a/Framework/Core/src/CommonServices.cxx +++ b/Framework/Core/src/CommonServices.cxx @@ -783,8 +783,13 @@ O2_DECLARE_DYNAMIC_LOG(monitoring_service); /// This will flush metrics only once every second. auto flushMetrics(ServiceRegistryRef registry, DataProcessingStats& stats) -> void { + // Flushing metrics should only happen on main thread to avoid + // having to have a mutex for the communication with the driver. 
O2_SIGNPOST_ID_GENERATE(sid, monitoring_service); O2_SIGNPOST_START(monitoring_service, sid, "flush", "flushing metrics"); + if (registry.isMainThread() == false) { + LOGP(fatal, "Flushing metrics should only happen on the main thread."); + } auto& monitoring = registry.get(); auto& relayer = registry.get(); @@ -1071,13 +1076,9 @@ o2::framework::ServiceSpec CommonServices::dataProcessingStats() return ServiceHandle{TypeIdHelpers::uniqueId(), stats}; }, .configure = noConfiguration(), - .preProcessing = [](ProcessingContext& context, void* service) { - auto* stats = (DataProcessingStats*)service; - flushMetrics(context.services(), *stats); }, .postProcessing = [](ProcessingContext& context, void* service) { auto* stats = (DataProcessingStats*)service; - stats->updateStats({(short)ProcessingStatsId::PERFORMED_COMPUTATIONS, DataProcessingStats::Op::Add, 1}); - flushMetrics(context.services(), *stats); }, + stats->updateStats({(short)ProcessingStatsId::PERFORMED_COMPUTATIONS, DataProcessingStats::Op::Add, 1}); }, .preDangling = [](DanglingContext& context, void* service) { auto* stats = (DataProcessingStats*)service; sendRelayerMetrics(context.services(), *stats); @@ -1090,9 +1091,6 @@ o2::framework::ServiceSpec CommonServices::dataProcessingStats() auto* stats = (DataProcessingStats*)service; sendRelayerMetrics(context.services(), *stats); flushMetrics(context.services(), *stats); }, - .postDispatching = [](ProcessingContext& context, void* service) { - auto* stats = (DataProcessingStats*)service; - flushMetrics(context.services(), *stats); }, .preLoop = [](ServiceRegistryRef ref, void* service) { auto* stats = (DataProcessingStats*)service; flushMetrics(ref, *stats); }, diff --git a/GPU/Common/CMakeLists.txt b/GPU/Common/CMakeLists.txt index c37ec3f5f5a13..2a31747b3673e 100644 --- a/GPU/Common/CMakeLists.txt +++ b/GPU/Common/CMakeLists.txt @@ -88,4 +88,4 @@ if(ALIGPU_BUILD_TYPE STREQUAL "ALIROOT") install(TARGETS Ali${MODULE} ARCHIVE DESTINATION lib LIBRARY 
DESTINATION lib) install(FILES ${HDRS_INSTALL} DESTINATION include) -endif() +endif() \ No newline at end of file diff --git a/GPU/Common/test/.gitignore b/GPU/Common/test/.gitignore deleted file mode 100644 index 43fd5862f17c1..0000000000000 --- a/GPU/Common/test/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.hip \ No newline at end of file diff --git a/GPU/Common/test/testSMatrixImp.cu b/GPU/Common/test/testSMatrixImp.cu index 08ca0b823488d..1e1ba9b8cf6dc 100644 --- a/GPU/Common/test/testSMatrixImp.cu +++ b/GPU/Common/test/testSMatrixImp.cu @@ -10,9 +10,9 @@ // or submit itself to any jurisdiction. /// \file testGPUSMatrixImp.cu -/// \author Matteo Concas +/// \author Matteo Concas, Maksym KIzitskyi -#define BOOST_TEST_MODULE Test GPUSMatrixImpl +#define BOOST_TEST_MODULE Test GPUSMatrixImplementation #ifdef __HIPCC__ #define GPUPLATFORM "HIP" #include "hip/hip_runtime.h" @@ -21,11 +21,179 @@ #include #endif -#include #include - +#include #include #include +#include + +using MatSym3DGPU = o2::math_utils::SMatrixGPU>; +using MatSym3D = ROOT::Math::SMatrix>; +using Mat3DGPU = o2::math_utils::SMatrixGPU>; +using Mat3D = ROOT::Math::SMatrix>; + +#define GPU_CHECK(call) \ + do { \ + cudaError_t error = call; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "CUDA Error: %s (error code %d)\n", cudaGetErrorString(error), error); \ + return; \ + } \ + } while (0) + +namespace gpu +{ +enum PrintMode { + Decimal, + Binary, + Hexadecimal +}; + +__device__ void floatToBinaryString(float number, char* buffer) +{ + unsigned char* bytePointer = reinterpret_cast(&number); + for (int byteIndex = 3; byteIndex >= 0; --byteIndex) { + unsigned char byte = bytePointer[byteIndex]; + for (int bitIndex = 7; bitIndex >= 0; --bitIndex) { + buffer[(3 - byteIndex) * 8 + (7 - bitIndex)] = (byte & (1 << bitIndex)) ? 
'1' : '0'; + } + } + buffer[32] = '\0'; // Null terminator +} + +template +GPUd() void printMatrix(const MatrixType& matrix, const char* name, const PrintMode mode) +{ + if (mode == PrintMode::Binary) { + char buffer[33]; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + floatToBinaryString(matrix(i, j), buffer); + printf("%s(%d,%d) = %s\n", name, i, j, buffer); + } + } + } + if (mode == PrintMode::Decimal) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + printf("%s(%i,%i) = %f\n", name, i, j, matrix(i, j)); + } + } + } + if (mode == PrintMode::Hexadecimal) { + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + printf("%s(%d,%d) = %x\n", name, i, j, o2::gpu::CAMath::Float2UIntReint(matrix(i, j))); + } + } + } +} + +// Invert test for a single square matrix +template +GPUg() void invertMatrixKernelSingle(T* matrix) +{ + matrix->Invert(); +} + +// Copy test for a single square matrix +template +GPUg() void copyMatrixKernelSingle( + T* srcMatrix, + T* dstMatrix) +{ + *dstMatrix = *srcMatrix; +} + +// Invert test for an array of square matrices +template +GPUg() void invertMatrixKernelArray(T* matrices, + const int numMatrices) +{ + for (auto iMatrix = blockIdx.x * blockDim.x + threadIdx.x; iMatrix < numMatrices; iMatrix += blockDim.x * gridDim.x) { + matrices[iMatrix].Invert(); + } +} + +// Copy test for an array of square matrices +template +GPUg() void copyMatrixKernelArray( + T* srcMatrices, + T* dstMatrices, + const int numMatrices) +{ + for (auto iMatrix = blockIdx.x * blockDim.x + threadIdx.x; iMatrix < numMatrices; iMatrix += blockDim.x * gridDim.x) { + srcMatrices[iMatrix] = dstMatrices[iMatrix]; + } +} +} // namespace gpu + +// Function to compare two matrices element-wise with a specified tolerance +template +void compareMatricesElementWise(const MatrixType& mat1, const MatrixType& mat2, float tolerance) +{ + auto tol = boost::test_tools::tolerance(tolerance); + + for (unsigned int i = 0; i < mat1.kRows; ++i) { 
+ for (unsigned int j = 0; j < mat1.kCols; ++j) { + BOOST_TEST(mat1(i, j) == mat2(i, j), tol); + } + } +} + +// RAII class for CUDA resources +class GPUMemory +{ + public: + GPUMemory(size_t size) + { + GPU_CHECK(cudaMalloc(&device_ptr, size)); + } + ~GPUMemory() + { + GPU_CHECK(cudaFree(device_ptr)); + } + void* get() const { return device_ptr; } + + private: + void* device_ptr; +}; + +class GPUBenchmark +{ + public: + GPUBenchmark(const std::string& testName = "") : title(testName) + { + GPU_CHECK(cudaEventCreate(&startEvent)); + GPU_CHECK(cudaEventCreate(&stopEvent)); + } + ~GPUBenchmark() + { + GPU_CHECK(cudaEventDestroy(startEvent)); + GPU_CHECK(cudaEventDestroy(stopEvent)); + } + void start() + { + GPU_CHECK(cudaEventRecord(startEvent)); + } + void stop() + { + GPU_CHECK(cudaEventRecord(stopEvent)); + GPU_CHECK(cudaEventSynchronize(stopEvent)); + GPU_CHECK(cudaEventElapsedTime(&duration, startEvent, stopEvent)); + } + void setTitle(const std::string& newTitle) { title = newTitle; } + float getDuration() const { return duration; } + void printDuration() const + { + std::cout << "\t - " << title << " kernel execution time: " << duration << " ms" << std::endl; + } + + private: + std::string title = ""; + cudaEvent_t startEvent, stopEvent; + float duration; +}; template void discardResult(const T&) @@ -35,97 +203,247 @@ void discardResult(const T&) void prologue() { int deviceCount; - discardResult(cudaGetDeviceCount(&deviceCount)); - if (!deviceCount) { + cudaError_t error = cudaGetDeviceCount(&deviceCount); + if (error != cudaSuccess || !deviceCount) { std::cerr << "No " << GPUPLATFORM << " devices found" << std::endl; + return; } + for (int iDevice = 0; iDevice < deviceCount; ++iDevice) { cudaDeviceProp deviceProp; discardResult(cudaGetDeviceProperties(&deviceProp, iDevice)); - std::cout << GPUPLATFORM << " Device " << iDevice << ": " << deviceProp.name << std::endl; + printf("Testing on: %s, Device %d: %s\n", GPUPLATFORM, iDevice, deviceProp.name); } } -using 
MatSym3DGPU = o2::math_utils::SMatrixGPU>; -using MatSym3D = ROOT::Math::SMatrix>; - -template -__global__ void invertSymMatrixKernel(o2::math_utils::SMatrixGPU>* matrix) -{ - MatSym3DGPU smat2 = *matrix; +struct GPUSMatrixImplFixtureSolo { + GPUSMatrixImplFixtureSolo() : SMatrixSym_d(sizeof(MatSym3DGPU)), SMatrixSym_h(), SMatrix_d(sizeof(Mat3DGPU)), SMatrix_h() + { + prologue(); + initializeMatrices(); + } - printf("A(0,0) = %f, A(0,1) = %f, A(0,2) = %f\n", (*matrix)(0, 0), (*matrix)(0, 1), (*matrix)(0, 2)); - printf("A(1,0) = %f, A(1,1) = %f, A(1,2) = %f\n", (*matrix)(1, 0), (*matrix)(1, 1), (*matrix)(1, 2)); - printf("A(2,0) = %f, A(2,1) = %f, A(2,2) = %f\n", (*matrix)(2, 0), (*matrix)(2, 1), (*matrix)(2, 2)); + ~GPUSMatrixImplFixtureSolo() = default; + void initializeMatrices() + { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0, 10.0); - printf("B(0,0) = %f, B(0,1) = %f, B(0,2) = %f\n", smat2(0, 0), smat2(0, 1), smat2(0, 2)); - printf("B(1,0) = %f, B(1,1) = %f, B(1,2) = %f\n", smat2(1, 0), smat2(1, 1), smat2(1, 2)); - printf("B(2,0) = %f, B(2,1) = %f, B(2,2) = %f\n", smat2(2, 0), smat2(2, 1), smat2(2, 2)); + // Initialize host matrices with random values + for (int i = 0; i < 3; ++i) { + for (int j = i; j < 3; ++j) { + SMatrixSym_h(i, j) = dis(gen); + SMatrix_h(i, j) = dis(gen); + } + } + SMatrixSym_original_h = SMatrixSym_h; + SMatrix_original_h = SMatrix_h; - printf("\nInverting A...\n"); - matrix->Invert(); + // Copy host matrices to device + GPU_CHECK(cudaMemcpy(SMatrixSym_d.get(), &SMatrixSym_h, sizeof(MatSym3DGPU), cudaMemcpyHostToDevice)); + GPU_CHECK(cudaMemcpy(SMatrix_d.get(), &SMatrix_h, sizeof(Mat3DGPU), cudaMemcpyHostToDevice)); + } - printf("A(0,0) = %f, A(0,1) = %f, A(0,2) = %f\n", (*matrix)(0, 0), (*matrix)(0, 1), (*matrix)(0, 2)); - printf("A(1,0) = %f, A(1,1) = %f, A(1,2) = %f\n", (*matrix)(1, 0), (*matrix)(1, 1), (*matrix)(1, 2)); - printf("A(2,0) = %f, A(2,1) = %f, A(2,2) = %f\n", (*matrix)(2, 0), 
(*matrix)(2, 1), (*matrix)(2, 2)); + GPUMemory SMatrixSym_d; + MatSym3D SMatrixSym_h; + MatSym3D SMatrixSym_original_h; + GPUMemory SMatrix_d; + Mat3D SMatrix_h; + Mat3D SMatrix_original_h; +}; - printf("\nC = (A^-1) * B...\n"); - auto smat3 = (*matrix) * smat2; +BOOST_FIXTURE_TEST_CASE(MatrixInversion, GPUSMatrixImplFixtureSolo) +{ + float tolerance = 0.00001f; + const int nBlocks{1}, nThreads{1}; + GPUBenchmark benchmark("Single symmetric matrix inversion (" + std::to_string(nBlocks) + " blocks, " + std::to_string(nThreads) + " threads)"); + benchmark.start(); + gpu::invertMatrixKernelSingle<<>>(static_cast(SMatrixSym_d.get())); + benchmark.stop(); + benchmark.printDuration(); + discardResult(cudaDeviceSynchronize()); + GPU_CHECK(cudaGetLastError()); + GPU_CHECK(cudaMemcpy(&SMatrixSym_h, SMatrixSym_d.get(), sizeof(MatSym3DGPU), cudaMemcpyDeviceToHost)); - printf("C(0,0) = %f, C(0,1) = %f, C(0,2) = %f\n", smat3(0, 0), smat3(0, 1), smat3(0, 2)); - printf("C(1,0) = %f, C(1,1) = %f, C(1,2) = %f\n", smat3(1, 0), smat3(1, 1), smat3(1, 2)); - printf("C(2,0) = %f, C(2,1) = %f, C(2,2) = %f\n", smat3(2, 0), smat3(2, 1), smat3(2, 2)); + MatSym3D identitySym; + identitySym(0, 0) = 1; + identitySym(1, 1) = 1; + identitySym(2, 2) = 1; + auto operationSym = SMatrixSym_h * SMatrixSym_original_h; + MatSym3D resultSym; + ROOT::Math::AssignSym::Evaluate(resultSym, operationSym); + compareMatricesElementWise(resultSym, identitySym, tolerance); - printf("\nEvaluating...\n"); - MatSym3DGPU tmp; - o2::math_utils::AssignSym::Evaluate(tmp, smat3); + benchmark.setTitle("Single general matrix inversion (" + std::to_string(nBlocks) + " blocks, " + std::to_string(nThreads) + " threads)"); + benchmark.start(); + gpu::invertMatrixKernelSingle<<>>(static_cast(SMatrix_d.get())); + benchmark.stop(); + benchmark.printDuration(); + discardResult(cudaDeviceSynchronize()); + GPU_CHECK(cudaGetLastError()); + GPU_CHECK(cudaMemcpy(&SMatrix_h, SMatrix_d.get(), sizeof(Mat3DGPU), cudaMemcpyDeviceToHost)); 
- printf("A(0,0) = %f, A(0,1) = %f, A(0,2) = %f\n", tmp(0, 0), tmp(0, 1), tmp(0, 2)); - printf("A(1,0) = %f, A(1,1) = %f, A(1,2) = %f\n", tmp(1, 0), tmp(1, 1), tmp(1, 2)); - printf("A(2,0) = %f, A(2,1) = %f, A(2,2) = %f\n", tmp(2, 0), tmp(2, 1), tmp(2, 2)); - (*matrix) = tmp; + Mat3D identity; + identity(0, 0) = 1; + identity(1, 1) = 1; + identity(2, 2) = 1; + auto operation = SMatrix_h * SMatrix_original_h; + Mat3D result; + ROOT::Math::Assign, ROOT::Math::MatRepStd>::Evaluate(result, operation); + compareMatricesElementWise(result, identity, tolerance); } -struct GPUSMatrixImplFixture { - GPUSMatrixImplFixture() : SMatrix3D_d(nullptr) +struct GPUSMatrixImplFixtureDuo { + GPUSMatrixImplFixtureDuo() : SMatrixSym_d_A(sizeof(MatSym3DGPU)), SMatrixSym_h_A(), SMatrix_d_A(sizeof(Mat3DGPU)), SMatrix_h_A(), SMatrixSym_d_B(sizeof(MatSym3DGPU)), SMatrixSym_h_B(), SMatrix_d_B(sizeof(Mat3DGPU)), SMatrix_h_B() { prologue(); + initializeMatrices(); + } + + ~GPUSMatrixImplFixtureDuo() = default; + + void initializeMatrices() + { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0, 10.0); - SMatrix3D_h(0, 0) = 1; - SMatrix3D_h(1, 1) = 2; - SMatrix3D_h(2, 2) = 3; - SMatrix3D_h(0, 1) = 4; - SMatrix3D_h(0, 2) = 5; - SMatrix3D_h(1, 2) = 6; + // Initialize host matrices with random values + for (int i = 0; i < 3; ++i) { + for (int j = i; j < 3; ++j) { + SMatrixSym_h_A(i, j) = dis(gen); + SMatrix_h_A(i, j) = dis(gen); - discardResult(cudaMalloc(&SMatrix3D_d, sizeof(MatSym3DGPU))); - discardResult(cudaMemcpy(SMatrix3D_d, &SMatrix3D_h, sizeof(MatSym3DGPU), cudaMemcpyHostToDevice)); + SMatrixSym_h_B(i, j) = dis(gen); + SMatrix_h_B(i, j) = dis(gen); + } + } - std::cout << "sizeof(MatSym3DGPU) = " << sizeof(MatSym3DGPU) << std::endl; - std::cout << "sizeof(MatSym3D) = " << sizeof(MatSym3D) << std::endl; - i = 3; + // Copy host matrices to device + GPU_CHECK(cudaMemcpy(SMatrixSym_d_A.get(), &SMatrixSym_h_A, sizeof(MatSym3DGPU), 
cudaMemcpyHostToDevice)); + GPU_CHECK(cudaMemcpy(SMatrix_d_A.get(), &SMatrix_h_A, sizeof(Mat3DGPU), cudaMemcpyHostToDevice)); + + GPU_CHECK(cudaMemcpy(SMatrixSym_d_B.get(), &SMatrixSym_h_B, sizeof(MatSym3DGPU), cudaMemcpyHostToDevice)); + GPU_CHECK(cudaMemcpy(SMatrix_d_B.get(), &SMatrix_h_B, sizeof(Mat3DGPU), cudaMemcpyHostToDevice)); } - ~GPUSMatrixImplFixture() + GPUMemory SMatrixSym_d_A; + MatSym3D SMatrixSym_h_A; + + GPUMemory SMatrixSym_d_B; + MatSym3D SMatrixSym_h_B; + + GPUMemory SMatrix_d_A; + Mat3D SMatrix_h_A; + + GPUMemory SMatrix_d_B; + Mat3D SMatrix_h_B; +}; + +BOOST_FIXTURE_TEST_CASE(TestMatrixCopyingAndComparison, GPUSMatrixImplFixtureDuo) +{ + const int nBlocks{1}, nThreads{1}; + GPUBenchmark benchmark("Single symmetric matrix copy (" + std::to_string(nBlocks) + " blocks, " + std::to_string(nThreads) + " threads)"); + benchmark.start(); + gpu::copyMatrixKernelSingle<<>>(static_cast(SMatrixSym_d_A.get()), static_cast(SMatrixSym_d_B.get())); + benchmark.stop(); + benchmark.printDuration(); + discardResult(cudaDeviceSynchronize()); + GPU_CHECK(cudaGetLastError()); + GPU_CHECK(cudaMemcpy(&SMatrixSym_h_B, SMatrixSym_d_B.get(), sizeof(MatSym3DGPU), cudaMemcpyDeviceToHost)); + + compareMatricesElementWise(SMatrixSym_h_A, SMatrixSym_h_B, 0.0); + + benchmark.setTitle("Single general matrix copy (" + std::to_string(nBlocks) + " blocks, " + std::to_string(nThreads) + " threads)"); + benchmark.start(); + gpu::copyMatrixKernelSingle<<>>(static_cast(SMatrix_d_A.get()), static_cast(SMatrix_d_B.get())); + benchmark.stop(); + benchmark.printDuration(); + discardResult(cudaDeviceSynchronize()); + GPU_CHECK(cudaGetLastError()); + + GPU_CHECK(cudaMemcpy(&SMatrix_h_B, SMatrix_d_B.get(), sizeof(Mat3DGPU), cudaMemcpyDeviceToHost)); + + compareMatricesElementWise(SMatrix_h_A, SMatrix_h_B, 0.0); +} +template +struct GPUSmatrixImplFixtureSoloArray { + GPUSmatrixImplFixtureSoloArray() : SMatrixSymArray_d(D * sizeof(MatSym3DGPU)), SMatrixArray_d(D * sizeof(Mat3DGPU)) { - 
discardResult(cudaFree(SMatrix3D_d)); + SMatrixSymVector_h.resize(D); + SMatrixVector_h.resize(D); + SMatrixSym_original_h.resize(D); + SMatrix_original_h.resize(D); + prologue(); + initializeMatrices(); } - int i; - MatSym3DGPU* SMatrix3D_d; // device ptr - MatSym3D SMatrix3D_h; + ~GPUSmatrixImplFixtureSoloArray() = default; + void initializeMatrices() + { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(1.0, 10.0); + + // Initialize host matrices with random values + for (size_t iMatrix{0}; iMatrix < D; ++iMatrix) { + for (int i = 0; i < 3; ++i) { + for (int j = i; j < 3; ++j) { + SMatrixSymVector_h[iMatrix](i, j) = dis(gen); + SMatrixVector_h[iMatrix](i, j) = dis(gen); + } + } + + SMatrixSym_original_h[iMatrix] = SMatrixSymVector_h[iMatrix]; + SMatrix_original_h[iMatrix] = SMatrixVector_h[iMatrix]; + } + + // Copy host matrices to device + GPU_CHECK(cudaMemcpy(SMatrixSymArray_d.get(), SMatrixSymVector_h.data(), D * sizeof(MatSym3DGPU), cudaMemcpyHostToDevice)); + GPU_CHECK(cudaMemcpy(SMatrixArray_d.get(), SMatrixVector_h.data(), D * sizeof(Mat3DGPU), cudaMemcpyHostToDevice)); + } + + GPUMemory SMatrixSymArray_d; + std::vector SMatrixSymVector_h; + std::vector SMatrixSym_original_h; + GPUMemory SMatrixArray_d; + std::vector SMatrixVector_h; + std::vector SMatrix_original_h; }; -BOOST_FIXTURE_TEST_CASE(DummyFixtureUsage, GPUSMatrixImplFixture) +BOOST_FIXTURE_TEST_CASE(MatrixInversionArray, GPUSmatrixImplFixtureSoloArray<1'000'000>) { - invertSymMatrixKernel<<<1, 1>>>(SMatrix3D_d); + float tolerance = 0.00001f; + const int nBlocks{20}, nThreads{512}; + + GPUBenchmark benchmark("Array of 1'000'000 symmetric matrices inversion (" + std::to_string(nBlocks) + " blocks, " + std::to_string(nThreads) + " threads)"); + benchmark.start(); + gpu::invertMatrixKernelArray<<>>(static_cast(SMatrixSymArray_d.get()), 1'000'000); + benchmark.stop(); + benchmark.printDuration(); discardResult(cudaDeviceSynchronize()); + 
GPU_CHECK(cudaGetLastError()); + GPU_CHECK(cudaMemcpy(SMatrixSymVector_h.data(), SMatrixSymArray_d.get(), 1'000'000 * sizeof(MatSym3DGPU), cudaMemcpyDeviceToHost)); - discardResult(cudaMemcpy(&SMatrix3D_h, SMatrix3D_d, sizeof(MatSym3DGPU), cudaMemcpyDeviceToHost)); + for (size_t iMatrix{0}; iMatrix < 1'000'000; ++iMatrix) { + // Cross-check with the CPU implementation + SMatrixSym_original_h[iMatrix].Invert(); + compareMatricesElementWise(SMatrixSymVector_h[iMatrix], SMatrixSym_original_h[iMatrix], tolerance); + } - MatSym3D identity; - identity(0, 0) = 1; - identity(1, 1) = 1; - identity(2, 2) = 1; - BOOST_TEST(SMatrix3D_h == identity); + benchmark.setTitle("Array of 1'000'000 general matrices inversion (" + std::to_string(nBlocks) + " blocks, " + std::to_string(nThreads) + " threads)"); + benchmark.start(); + gpu::invertMatrixKernelArray<<>>(static_cast(SMatrixArray_d.get()), 1'000'000); + benchmark.stop(); + benchmark.printDuration(); + discardResult(cudaDeviceSynchronize()); + GPU_CHECK(cudaGetLastError()); + GPU_CHECK(cudaMemcpy(SMatrixVector_h.data(), SMatrixArray_d.get(), 1'000'000 * sizeof(Mat3DGPU), cudaMemcpyDeviceToHost)); + + for (size_t iMatrix{0}; iMatrix < 1'000'000; ++iMatrix) { + // Cross-check with the CPU implementation + SMatrix_original_h[iMatrix].Invert(); + compareMatricesElementWise(SMatrixVector_h[iMatrix], SMatrix_original_h[iMatrix], tolerance); + } } \ No newline at end of file diff --git a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx index af603035bd017..3c06bb7a3a3c5 100644 --- a/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx +++ b/GPU/GPUTracking/Global/GPUChainTrackingCompression.cxx @@ -115,13 +115,12 @@ int GPUChainTracking::RunTPCCompression() getKernelTimer(RecoStep::TPCCompression, 0, outputSize); break; case 4: - static_assert((nBlocksMulti & 1) && nBlocksMulti >= 3); runKernel(GetGridBlkStep(nBlocksMulti, outputStream, RecoStep::TPCCompression)); 
getKernelTimer(RecoStep::TPCCompression, 0, outputSize); break; default: - GPUError("Invalid compression kernel selected."); + GPUError("Invalid compression kernel %d selected.", (int)ProcessingSettings().tpcCompressionGatherModeKernel); return 1; } if (ProcessingSettings().tpcCompressionGatherMode == 3) { diff --git a/GPU/TPCFastTransformation/NDPiecewisePolynomials.h b/GPU/TPCFastTransformation/NDPiecewisePolynomials.h index 13ce49a142470..6df5ac8c99cab 100644 --- a/GPU/TPCFastTransformation/NDPiecewisePolynomials.h +++ b/GPU/TPCFastTransformation/NDPiecewisePolynomials.h @@ -71,7 +71,7 @@ struct NDPiecewisePolynomialContainer { /// /// For usage see: testMultivarPolynomials.cxx /// -/// TODO: add possibillity to perform the fits on scattered data points (+add weighting of points) +/// TODO: add possibility to perform the fits with weighting of points /// /// \tparam Dim number of dimensions /// \tparam Degree degree of the polynomials @@ -184,6 +184,11 @@ class NDPiecewisePolynomials : public FlatObject /// \param nAuxiliaryPoints number of points which will be used for the fits (should be at least 2) void performFits(const std::function& func, const unsigned int nAuxiliaryPoints[/* Dim */]); + /// perform the polynomial fits on scatter points + /// \param x scatter points used to make the fits of size 'y.size() * Dim' as in TLinearFitter + /// \param y response values + void performFits(const std::vector& x, const std::vector& y); + /// load parameters from input file (which were written using the writeToFile method) /// \param inpf input file /// \param name name of the object in the file @@ -537,7 +542,7 @@ void NDPiecewisePolynomials::performFits(const std const bool debug = !(++counter % printDebugForNFits); if (debug) { #ifndef GPUCA_ALIROOT_LIB - LOGP(info, "Peforming fit {} out of {}", counter, nTotalFits); + LOGP(info, "Performing fit {} out of {}", counter, nTotalFits); #endif } @@ -554,6 +559,95 @@ void NDPiecewisePolynomials::performFits(const std } 
} +template +void NDPiecewisePolynomials::performFits(const std::vector& x, const std::vector& y) +{ + const int nTotalFits = getNPolynomials(); +#ifndef GPUCA_ALIROOT_LIB + LOGP(info, "Perform fitting of {}D-Polynomials of degree {} for a total of {} fits.", Dim, Degree, nTotalFits); +#endif + + // approximate number of points + unsigned int nPoints = 2 * y.size() / nTotalFits; + + // polynomial index -> indices to datapoints + std::unordered_map> dataPointsIndices; + for (int i = 0; i < nTotalFits; ++i) { + dataPointsIndices[i].reserve(nPoints); + } + + // check for each data point which polynomial to use + for (size_t i = 0; i < y.size(); ++i) { + std::array index; + float xVal[Dim]; + std::copy(x.begin() + i * Dim, x.begin() + i * Dim + Dim, xVal); + setIndex(xVal, index.data()); + + std::array indexClamped{index}; + clamp(xVal, indexClamped.data()); + + // check if data points are in the grid + if (index == indexClamped) { + // index of the polynomial + const unsigned int idx = getDataIndex(index.data()) / MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly); + + // store index to data point + dataPointsIndices[idx].emplace_back(i); + } + } + + // for fitting + MultivariatePolynomialHelper<0, 0, false> pol(Dim, Degree, InteractionOnly); + TLinearFitter fitter = pol.getTLinearFitter(); + + unsigned int counter = 0; + const int printDebugForNFits = int(nTotalFits / 20) + 1; + + // temp storage for x and y values for fitting + std::vector xCords; + std::vector response; + + for (int i = 0; i < nTotalFits; ++i) { + const bool debug = !(++counter % printDebugForNFits); + if (debug) { +#ifndef GPUCA_ALIROOT_LIB + LOGP(info, "Performing fit {} out of {}", counter, nTotalFits); +#endif + } + + // store values for fitting + if (dataPointsIndices[i].empty()) { +#ifndef GPUCA_ALIROOT_LIB + LOGP(info, "No data points to fit"); +#endif + continue; + } + + const auto nP = dataPointsIndices[i].size(); + xCords.reserve(Dim * nP); + 
response.reserve(nP); + xCords.clear(); + response.clear(); + + // add datapoints to fit + for (size_t j = 0; j < nP; ++j) { + const size_t idxOrig = dataPointsIndices[i][j]; + + // insert x values at the end of xCords + const int idxXStart = idxOrig * Dim; + xCords.insert(xCords.end(), x.begin() + idxXStart, x.begin() + idxXStart + Dim); + response.emplace_back(y[idxOrig]); + } + + // perform the fit on the points TODO make errors configurable + std::vector error; + const auto params = MultivariatePolynomialHelper<0, 0, false>::fit(fitter, xCords, response, error, true); + + // store parameters + std::copy(params.begin(), params.end(), &mParams[i * MultivariatePolynomialParametersHelper::getNParameters(Degree, Dim, InteractionOnly)]); + } +} + template void NDPiecewisePolynomials::fitInnerGrid(const std::function& func, const unsigned int nAuxiliaryPoints[/* Dim */], const int currentIndex[/* Dim */], TLinearFitter& fitter, std::vector& xCords, std::vector& response) { diff --git a/Steer/DigitizerWorkflow/src/TPCDigitizerSpec.cxx b/Steer/DigitizerWorkflow/src/TPCDigitizerSpec.cxx index 3f24bb1a9ee12..e1f366080fd9f 100644 --- a/Steer/DigitizerWorkflow/src/TPCDigitizerSpec.cxx +++ b/Steer/DigitizerWorkflow/src/TPCDigitizerSpec.cxx @@ -120,6 +120,9 @@ class TPCDPLDigitizerTask : public BaseDPLDigitizer mWithMCTruth = o2::conf::DigiParams::Instance().mctruth; auto triggeredMode = ic.options().get("TPCtriggered"); + mRecalcDistortions = !(ic.options().get("do-not-recalculate-distortions")); + const int nthreadsDist = ic.options().get("n-threads-distortions"); + SC::setNThreads(nthreadsDist); mUseCalibrationsFromCCDB = ic.options().get("TPCuseCCDB"); mMeanLumiDistortions = ic.options().get("meanLumiDistortions"); mMeanLumiDistortionsDerivative = ic.options().get("meanLumiDistortionsDerivative"); @@ -220,6 +223,9 @@ class TPCDPLDigitizerTask : public BaseDPLDigitizer if (mDistortionType == 2) { pc.inputs().get("tpcdistortionsderiv"); mDigitizer.setLumiScaleFactor(); + if 
(mRecalcDistortions) { + mDigitizer.recalculateDistortions(); + } } } @@ -475,6 +481,7 @@ class TPCDPLDigitizerTask : public BaseDPLDigitizer int mDistortionType = 0; float mMeanLumiDistortions = -1; float mMeanLumiDistortionsDerivative = -1; + bool mRecalcDistortions = false; }; o2::framework::DataProcessorSpec getTPCDigitizerSpec(int channel, bool writeGRP, bool mctruth, bool internalwriter, int distortionType) @@ -513,6 +520,8 @@ o2::framework::DataProcessorSpec getTPCDigitizerSpec(int channel, bool writeGRP, {"TPCuseCCDB", VariantType::Bool, false, {"true: load calibrations from CCDB; false: use random calibratoins"}}, {"meanLumiDistortions", VariantType::Float, -1.f, {"override lumi of distortion object if >=0"}}, {"meanLumiDistortionsDerivative", VariantType::Float, -1.f, {"override lumi of derivative distortion object if >=0"}}, + {"do-not-recalculate-distortions", VariantType::Bool, false, {"Do not recalculate the distortions"}}, + {"n-threads-distortions", VariantType::Int, 4, {"Number of threads used for the calculation of the distortions"}}, }}; } diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index af7da049e0ff6..1e8e72609d16c 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -111,10 +111,13 @@ if(ENABLE_CUDA) endif() endif() if(CMAKE_CUDA_COMPILER) - set(CMAKE_CUDA_FLAGS "-Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda --allow-unsupported-compiler -Xptxas -v -Xcompiler -Wno-attributes") + set(CMAKE_CUDA_FLAGS "-Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" ${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --extended-lambda -Xptxas -v -Xcompiler -Wno-attributes") if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.3") string(APPEND CMAKE_CUDA_FLAGS " -Xcudafe --diag_suppress=20257") # TODO: Cleanup endif() + if (NOT ENABLE_CUDA STREQUAL "AUTO") + string(APPEND CMAKE_CUDA_FLAGS " --allow-unsupported-compiler") + endif() 
set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "-Xcompiler \"${CMAKE_CXX_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}\" ${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}}") if(CMAKE_BUILD_TYPE_UPPER STREQUAL "DEBUG") set(CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER} "${CMAKE_CUDA_FLAGS_${CMAKE_BUILD_TYPE_UPPER}} -lineinfo -Xptxas -O0 -Xcompiler -O0") @@ -267,8 +270,8 @@ if(ENABLE_HIP) set(HIP_ENABLED ON) set_target_properties(roc::rocthrust PROPERTIES IMPORTED_GLOBAL TRUE) message(STATUS "HIP Found (${hip_HIPCC_EXECUTABLE} version ${hip_VERSION})") - set(O2_HIP_CMAKE_CXX_FLAGS "-fgpu-defer-diag -mllvm -amdgpu-enable-lower-module-lds=false -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -Wno-pass-failed") - if(hip_VERSION VERSION_GREATER_EQUAL "6.0") + set(O2_HIP_CMAKE_CXX_FLAGS "-fgpu-defer-diag -mllvm -amdgpu-enable-lower-module-lds=false -mllvm -amdgpu-function-calls=true -Wno-invalid-command-line-argument -Wno-unused-command-line-argument -Wno-invalid-constexpr -Wno-ignored-optimization-argument -Wno-unused-private-field -Wno-pass-failed") + if(hip_VERSION VERSION_GREATER_EQUAL "6.0" AND NOT hip_VERSION VERSION_GREATER_EQUAL "6.2") set(O2_HIP_CMAKE_CXX_FLAGS "${O2_HIP_CMAKE_CXX_FLAGS} -mllvm -amdgpu-legacy-sgpr-spill-lowering=true") # TODO: Cleanup endif() set(O2_HIP_CMAKE_LINK_FLAGS "-Wno-pass-failed")