From 31ef2bb9827ccc63ae5535919716332f1bde5537 Mon Sep 17 00:00:00 2001
From: "g.kozlov" <g.kozlov@gsi.de>
Date: Sun, 16 Feb 2025 17:24:02 +0100
Subject: [PATCH 1/6] CA: Add experimental GPU tracking

---
 algo/ca/core/CMakeLists.txt                   |   29 +
 algo/ca/core/data/CaGrid.h                    |    9 +
 algo/ca/core/data/CaHit.h                     |   28 +-
 algo/ca/core/data/CaTriplet.h                 |    7 +-
 algo/ca/core/experimental/CaDeviceImage.cxx   |    6 +
 algo/ca/core/experimental/CaDeviceImage.h     |   14 +
 algo/ca/core/experimental/CaGpuField.h        |  308 ++++
 algo/ca/core/experimental/CaGpuGrid.h         |  175 ++
 algo/ca/core/experimental/CaGpuGridArea.h     |  112 ++
 algo/ca/core/experimental/CaGpuMaterialMap.h  |   74 +
 algo/ca/core/experimental/CaGpuParameters.h   |  111 ++
 algo/ca/core/experimental/CaGpuStation.h      |   72 +
 algo/ca/core/experimental/CaGpuTimeMonitor.h  |  150 ++
 .../experimental/CaGpuTrackFinderSetup.cxx    |  422 +++++
 .../core/experimental/CaGpuTrackFinderSetup.h |   85 +
 .../experimental/CaGpuTripletConstructor.cxx  | 1449 +++++++++++++++++
 .../experimental/CaGpuTripletConstructor.h    |  284 ++++
 algo/ca/core/experimental/CaMeasurementU.cxx  |   40 +
 algo/ca/core/experimental/CaMeasurementU.h    |  126 ++
 algo/ca/core/experimental/CaMeasurementXy.cxx |   42 +
 algo/ca/core/experimental/CaMeasurementXy.h   |  205 +++
 algo/ca/core/tracking/CaTrackFinderWindow.cxx |  127 +-
 algo/ca/core/tracking/CaTrackFinderWindow.h   |   11 +
 algo/kf/core/CMakeLists.txt                   |    2 +
 algo/kf/core/data/KfTrackParam.h              |  192 +--
 algo/kf/core/geo/KfFieldSlice.h               |    9 +
 algo/kf/core/geo/KfMaterialMap.h              |    3 +
 27 files changed, 3982 insertions(+), 110 deletions(-)
 create mode 100644 algo/ca/core/experimental/CaDeviceImage.cxx
 create mode 100644 algo/ca/core/experimental/CaDeviceImage.h
 create mode 100644 algo/ca/core/experimental/CaGpuField.h
 create mode 100644 algo/ca/core/experimental/CaGpuGrid.h
 create mode 100644 algo/ca/core/experimental/CaGpuGridArea.h
 create mode 100644 algo/ca/core/experimental/CaGpuMaterialMap.h
 create mode 100644 algo/ca/core/experimental/CaGpuParameters.h
 create mode 100644 algo/ca/core/experimental/CaGpuStation.h
 create mode 100644 algo/ca/core/experimental/CaGpuTimeMonitor.h
 create mode 100644 algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
 create mode 100644 algo/ca/core/experimental/CaGpuTrackFinderSetup.h
 create mode 100644 algo/ca/core/experimental/CaGpuTripletConstructor.cxx
 create mode 100644 algo/ca/core/experimental/CaGpuTripletConstructor.h
 create mode 100644 algo/ca/core/experimental/CaMeasurementU.cxx
 create mode 100644 algo/ca/core/experimental/CaMeasurementU.h
 create mode 100644 algo/ca/core/experimental/CaMeasurementXy.cxx
 create mode 100644 algo/ca/core/experimental/CaMeasurementXy.h

diff --git a/algo/ca/core/CMakeLists.txt b/algo/ca/core/CMakeLists.txt
index 2be9249974..f0d11aeb79 100644
--- a/algo/ca/core/CMakeLists.txt
+++ b/algo/ca/core/CMakeLists.txt
@@ -5,6 +5,7 @@ set(INCLUDE_DIRECTORIES
   ${CMAKE_CURRENT_SOURCE_DIR}/qa
   ${CMAKE_CURRENT_SOURCE_DIR}/data
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental
 )
 
 set(SRCS
@@ -31,6 +32,10 @@ set(SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTrackFinderWindow.cxx
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTrackFitter.cxx
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTripletConstructor.cxx
+
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaGpuTrackFinderSetup.cxx
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaGpuTripletConstructor.cxx
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaDeviceImage.cxx
 )
 
 SET_SOURCE_FILES_PROPERTIES(${SRCS} PROPERTIES COMPILE_FLAGS "-O3")
@@ -52,6 +57,12 @@ target_include_directories(CaCore
          ${CMAKE_CURRENT_SOURCE_DIR}/pars
          ${CMAKE_CURRENT_SOURCE_DIR}/qa
          ${CMAKE_CURRENT_SOURCE_DIR}/tracking
+         ${CMAKE_CURRENT_SOURCE_DIR}/experimental
+)
+
+set(DEVICE_SRCS
+  experimental/CaDeviceImage.cxx
+  experimental/CaGpuTripletConstructor.cxx
 )
 
 target_compile_definitions(CaCore PUBLIC NO_ROOT)
@@ -62,7 +73,9 @@ target_link_libraries(CaCore
                       external::fles_logging # needed for the logger
                       external::fles_ipc     # needed for the logger
                       external::yaml-cpp
+                      xpu
                      )
+xpu_attach(CaCore ${DEVICE_SRCS})
 
 ##### Offline version without the NO_ROOT in order to get standard logger! #############################################
 if (NOT CBM_ONLINE_STANDALONE)
@@ -74,6 +87,7 @@ if (NOT CBM_ONLINE_STANDALONE)
            ${CMAKE_CURRENT_SOURCE_DIR}/pars
            ${CMAKE_CURRENT_SOURCE_DIR}/qa
            ${CMAKE_CURRENT_SOURCE_DIR}/tracking
+           ${CMAKE_CURRENT_SOURCE_DIR}/experimental
            ${CMAKE_CURRENT_SOURCE_DIR}
   )
 
@@ -81,7 +95,9 @@ if (NOT CBM_ONLINE_STANDALONE)
                  PUBLIC KfCoreOffline
                         Boost::serialization
                         external::yaml-cpp
+                        xpu
                        )
+  xpu_attach(CaCoreOffline ${DEVICE_SRCS})
   install(TARGETS CaCoreOffline DESTINATION lib)
 endif()
 ########################################################################################################################
@@ -91,6 +107,7 @@ install(DIRECTORY utils TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 install(DIRECTORY data TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 install(DIRECTORY tracking TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 install(DIRECTORY pars TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY experimental TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 
 install(
   FILES
@@ -134,6 +151,18 @@ install(
     tracking/CaTrackFinderWindow.h
     tracking/CaTrackFitter.h
     tracking/CaTripletConstructor.h
+    
+    experimental/CaDeviceImage.h
+    experimental/CaGpuGrid.h
+    experimental/CaGpuHitArea.h
+    experimental/CaGpuParameters.h
+    experimental/CaGpuStation.h
+    experimental/CaGpuMaterialMap.h
+    experimental/CaGpuField.h
+    experimental/CaMeasurementXy.h
+    experimental/CaGpuTrackFinderSetup.h
+    experimental/CaGpuTripletConstructor.h
+    experimentsl/CaDeviceImage.h
 
     DESTINATION
     include/
diff --git a/algo/ca/core/data/CaGrid.h b/algo/ca/core/data/CaGrid.h
index 8bfc127154..cc81ec8b4e 100644
--- a/algo/ca/core/data/CaGrid.h
+++ b/algo/ca/core/data/CaGrid.h
@@ -126,6 +126,15 @@ namespace cbm::algo::ca
     /// Get maximal entry range in T
     fscal GetMaxRangeT() const { return fMaxRangeT; }
 
+    /// Get bin width in X inversed
+    fscal GetBinWidthXinv() const { return fBinWidthXinv; }
+
+    /// Get bin width in Y inversed
+    fscal GetBinWidthYinv() const { return fBinWidthYinv; }
+
+    /// Get the link to the first bin entry index
+    const ca::Vector<ca::HitIndex_t>& GetFirstBinEntryIndex() const { return fFirstBinEntryIndex; }
+
    private:
     /// --- Data members ---
 
diff --git a/algo/ca/core/data/CaHit.h b/algo/ca/core/data/CaHit.h
index fb0551ee78..ae678be940 100644
--- a/algo/ca/core/data/CaHit.h
+++ b/algo/ca/core/data/CaHit.h
@@ -14,6 +14,8 @@
 
 #include <string>
 
+#include <xpu/device.h>
+
 namespace cbm::algo::ca
 {
   struct CaHitTimeInfo {
@@ -99,43 +101,43 @@ namespace cbm::algo::ca
     HitKeyIndex_t BackKey() const { return fBackKey; }
 
     /// Get the X coordinate
-    fscal X() const { return fX; }
+    XPU_D fscal X() const { return fX; }
 
     /// Get the Y coordinate
-    fscal Y() const { return fY; }
+    XPU_D fscal Y() const { return fY; }
 
     /// Get the Z coordinate
-    fscal Z() const { return fZ; }
+    XPU_D fscal Z() const { return fZ; }
 
     /// Get the time
-    fscal T() const { return fT; }
+    XPU_D fscal T() const { return fT; }
 
     /// Get the uncertainty of X coordinate
-    fscal dX2() const { return fDx2; }
+    XPU_D fscal dX2() const { return fDx2; }
 
     /// Get the uncertainty of Y coordinate
-    fscal dY2() const { return fDy2; }
+    XPU_D fscal dY2() const { return fDy2; }
 
     /// Get the X/Y covariance
-    fscal dXY() const { return fDxy; }
+    XPU_D fscal dXY() const { return fDxy; }
 
     /// Get the uncertainty of time
-    fscal dT2() const { return fDt2; }
+    XPU_D fscal dT2() const { return fDt2; }
 
     /// Get the +/- range of uncertainty of X coordinate
-    fscal RangeX() const { return fRangeX; }
+    XPU_D fscal RangeX() const { return fRangeX; }
 
     /// Get the +/- range of uncertainty of Y coordinate
-    fscal RangeY() const { return fRangeY; }
+    XPU_D fscal RangeY() const { return fRangeY; }
 
     /// Get the +/- range of uncertainty of time
-    fscal RangeT() const { return fRangeT; }
+    XPU_D fscal RangeT() const { return fRangeT; }
 
     /// Get the hit id
-    HitIndex_t Id() const { return fId; }
+    XPU_D HitIndex_t Id() const { return fId; }
 
     /// Get the station index
-    int Station() const { return fStation; }
+    XPU_D int Station() const { return fStation; }
 
     /// \brief Simple string representation of the hit class
     std::string ToString() const;
diff --git a/algo/ca/core/data/CaTriplet.h b/algo/ca/core/data/CaTriplet.h
index 4a9848ea1a..ed6d3a4ff0 100644
--- a/algo/ca/core/data/CaTriplet.h
+++ b/algo/ca/core/data/CaTriplet.h
@@ -25,9 +25,10 @@ namespace cbm::algo::ca
     Triplet() = default;
 
     /// Constructor
-    Triplet(ca::HitIndex_t iHitL, ca::HitIndex_t iHitM, ca::HitIndex_t iHitR, unsigned int iStaL, unsigned int iStaM,
-            unsigned int iStaR, unsigned char Level, unsigned int firstNeighbour, char nNeighbours, fscal Chi2,
-            fscal Qp, fscal Cqp, fscal tx, fscal Ctx, fscal ty, fscal Cty, bool isMomentumFitted)
+    XPU_D Triplet(ca::HitIndex_t iHitL, ca::HitIndex_t iHitM, ca::HitIndex_t iHitR, unsigned int iStaL,
+                  unsigned int iStaM, unsigned int iStaR, unsigned char Level, unsigned int firstNeighbour,
+                  char nNeighbours, fscal Chi2, fscal Qp, fscal Cqp, fscal tx, fscal Ctx, fscal ty, fscal Cty,
+                  bool isMomentumFitted)
       : fChi2(Chi2)
       , fQp(Qp)
       , fCqp(Cqp)
diff --git a/algo/ca/core/experimental/CaDeviceImage.cxx b/algo/ca/core/experimental/CaDeviceImage.cxx
new file mode 100644
index 0000000000..88932879aa
--- /dev/null
+++ b/algo/ca/core/experimental/CaDeviceImage.cxx
@@ -0,0 +1,6 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+#include "CaDeviceImage.h"
+
+XPU_IMAGE(cbm::algo::ca::GPUReco);
diff --git a/algo/ca/core/experimental/CaDeviceImage.h b/algo/ca/core/experimental/CaDeviceImage.h
new file mode 100644
index 0000000000..49eef6a4b7
--- /dev/null
+++ b/algo/ca/core/experimental/CaDeviceImage.h
@@ -0,0 +1,14 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+#ifndef CBM_ALGO_CA_GPU_DEVICEIMAGE_H
+#define CBM_ALGO_CA_GPU_DEVICEIMAGE_H
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  struct GPUReco : xpu::device_image {
+  };
+}  // namespace cbm::algo::ca
+
+#endif
diff --git a/algo/ca/core/experimental/CaGpuField.h b/algo/ca/core/experimental/CaGpuField.h
new file mode 100644
index 0000000000..7d6358e678
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuField.h
@@ -0,0 +1,308 @@
+/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov [committer], Igor Kulakov, Maksym Zyzak, Sergei Zharko */
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaUtils.h"
+#include "KfField.h"
+#include "KfTrackParam.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  class GpuFieldValue {
+   public:
+    float x;  //< x-component of the field
+    float y;  //< y-component of the field
+    float z;  //< z-component of the field
+
+    GpuFieldValue() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldValue(const kf::FieldValue<DataIn>& other)
+      : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx()))
+      , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy()))
+      , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz()))
+    {
+    }
+
+    /// Combines the magnetic field with another field value object using weight
+    /// \param B  other field value to combine with
+    /// \param w  weight from 0 to 1
+
+    XPU_D void Combine(const GpuFieldValue& B, const bool w)
+    {
+      if (w) {
+        x = B.x;
+        y = B.y;
+        z = B.z;
+      }
+    }
+
+    /// Is the field value zero?
+    XPU_D bool IsZeroField() const
+    {
+      //     constexpr auto e = constants::misc::NegligibleFieldkG;
+      //     float e = 1.e-4;
+      return (x * x + y * y + z * z <= 1.e-8);
+    }
+  };
+
+  class GpuFieldSlice {
+   public:
+    /// Default constructor
+    GpuFieldSlice() = default;
+
+    static constexpr int kPolDegree{5};                                         ///< Approximation polynomial degree
+    static constexpr int kNofCoeff{((kPolDegree + 1) * (kPolDegree + 2)) / 2};  ///< Number of coefficients
+
+    /// \brief Array of the approximation coefficients [<monomial>]
+    using CoeffArray_t = std::array<float, kNofCoeff>;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldSlice(const kf::FieldSlice<DataIn>& other) : z(kf::utils::simd::Cast<DataIn, float>(other.GetZref()))
+    {
+      for (size_t i = 0; i < kMaxNFieldApproxCoefficients; i++) {
+        cx[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBx()[i]);
+        cy[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBy()[i]);
+        cz[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBz()[i]);
+      }
+    }
+
+    /// Gets field value from (x, y) fvec point
+    /// \param x  x-coordinate of input
+    /// \param y  y-coordinate of input
+    /// \return B  the FieldValue output
+    XPU_D GpuFieldValue GetFieldValue_old(float x, float y) const;
+
+    //
+    XPU_D GpuFieldValue GetFieldValue(float x, float y) const
+    {
+      using cbm::algo::kf::math::Horner;
+      GpuFieldValue B;
+      B.x = Horner<kPolDegree>(cx.cbegin(), x, y);
+      B.y = Horner<kPolDegree>(cy.cbegin(), x, y);
+      B.z = Horner<kPolDegree>(cz.cbegin(), x, y);
+      return B;
+    }
+    //
+
+    /// Gets field value for the intersection with a straight track
+    /// \param t  straight track
+    /// \return B  the FieldValue output
+    XPU_D GpuFieldValue GetFieldValueForLine(const kf::TrackParamBase<float>& t) const;
+
+   public:
+    // NOTE: We don't use an initialization of arrays here because we cannot be sure
+    //       if the underlying type (fvec) has a default constructor, but
+    //       we are sure, that it can be initialized with a float. (S.Zharko)
+    static constexpr auto kMaxNFieldApproxCoefficients = 21;  //constants::size::MaxNFieldApproxCoefficients;
+
+    CoeffArray_t cx;  //fBx;  ///< Approximation coefficients for the x-component of the field
+    CoeffArray_t cy;  //fBy;  ///< Approximation coefficients for the y-component of the field
+    CoeffArray_t cz;  //fBz;  ///< Approximation coefficients for the z-component of the field
+
+    float z;  ///< z coordinate of the slice
+  };
+
+  XPU_D inline GpuFieldValue GpuFieldSlice::GetFieldValue_old(float x, float y) const
+  {
+    float x2 = x * x;
+    float y2 = y * y;
+    float xy = x * y;
+
+    float x3  = x2 * x;
+    float y3  = y2 * y;
+    float xy2 = x * y2;
+    float x2y = x2 * y;
+
+    float x4   = x3 * x;
+    float y4   = y3 * y;
+    float xy3  = x * y3;
+    float x2y2 = x2 * y2;
+    float x3y  = x3 * y;
+
+    float x5   = x4 * x;
+    float y5   = y4 * y;
+    float xy4  = x * y4;
+    float x2y3 = x2 * y3;
+    float x3y2 = x3 * y2;
+    float x4y  = x4 * y;
+
+    GpuFieldValue B;
+
+    B.x = cx[0] + cx[1] * x + cx[2] * y + cx[3] * x2 + cx[4] * xy + cx[5] * y2 + cx[6] * x3 + cx[7] * x2y + cx[8] * xy2
+          + cx[9] * y3 + cx[10] * x4 + cx[11] * x3y + cx[12] * x2y2 + cx[13] * xy3 + cx[14] * y4 + cx[15] * x5
+          + cx[16] * x4y + cx[17] * x3y2 + cx[18] * x2y3 + cx[19] * xy4 + cx[20] * y5;
+
+    B.y = cy[0] + cy[1] * x + cy[2] * y + cy[3] * x2 + cy[4] * xy + cy[5] * y2 + cy[6] * x3 + cy[7] * x2y + cy[8] * xy2
+          + cy[9] * y3 + cy[10] * x4 + cy[11] * x3y + cy[12] * x2y2 + cy[13] * xy3 + cy[14] * y4 + cy[15] * x5
+          + cy[16] * x4y + cy[17] * x3y2 + cy[18] * x2y3 + cy[19] * xy4 + cy[20] * y5;
+
+    B.z = cz[0] + cz[1] * x + cz[2] * y + cz[3] * x2 + cz[4] * xy + cz[5] * y2 + cz[6] * x3 + cz[7] * x2y + cz[8] * xy2
+          + cz[9] * y3 + cz[10] * x4 + cz[11] * x3y + cz[12] * x2y2 + cz[13] * xy3 + cz[14] * y4 + cz[15] * x5
+          + cz[16] * x4y + cz[17] * x3y2 + cz[18] * x2y3 + cz[19] * xy4 + cz[20] * y5;
+    return B;
+  }
+
+  XPU_D inline GpuFieldValue GpuFieldSlice::GetFieldValueForLine(const kf::TrackParamBase<float>& t) const
+  {
+    float dz = z - t.GetZ();
+    return GetFieldValue(t.GetX() + t.GetTx() * dz, t.GetY() + t.GetTy() * dz);
+  }
+
+  class GpuFieldRegion {
+   public:
+    GpuFieldRegion() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldRegion(const kf::FieldRegion<DataIn>& other)
+      : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0))
+      , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1))
+      , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2))
+      , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0))
+      , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1))
+      , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2))
+      , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0))
+      , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1))
+      , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2))
+      , z0(kf::utils::simd::Cast<DataIn, float>(other.z0))
+      //     , fUseOriginalField(other.fUseOriginalField)
+      , fIsZeroField(other.fIsZeroField)
+    {
+    }
+
+    /// Gets the field vector and writes it into B pointer
+    /// \param x  x-coordinate of the point to calculate the field
+    /// \param y  y-coordinate of the point to calculate the field
+    /// \param z  z-coordinate of the point to calculate the field
+    /// \return    the magnetic field value
+    XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const
+    {
+      GpuFieldValue B;
+      float dz  = z - z0;
+      float dz2 = dz * dz;
+      B.x       = cx0 + cx1 * dz + cx2 * dz2;
+      B.y       = cy0 + cy1 * dz + cy2 * dz2;
+      B.z       = cz0 + cz1 * dz + cz2 * dz2;
+      return B;
+    }
+
+    /// Is the field region empty?
+    XPU_D bool IsZeroField() const { return fIsZeroField; }
+
+    /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object
+    /// The result is a quadratic interpolation of the field as a function of z
+    /// \param b0   field value in the first nodal point
+    /// \param b0z  z-coordinate of the first nodal point
+    /// \param b1   field value in the second nodal point
+    /// \param b1z  z-coordinate of the second nodal point
+    /// \param b2   field value in the third nodal point
+    /// \param b2z  z-coordinate of the third nodal point
+    /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko)
+    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2,
+                   float b2z);
+
+    /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object.
+    /// The result is a linear interpolation of the field as a function of z
+    /// \param b0   field value in the first nodal point
+    /// \param b0z  z-coordinate of the first nodal point
+    /// \param b1   field value in the second nodal point
+    /// \param b1z  z-coordinate of the second nodal point
+    /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko)
+    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z)
+    {
+      fIsZeroField = (b0.IsZeroField() && b1.IsZeroField());
+      z0           = b0z;
+      float dzi    = 1. / (b1z - b0z);  //TODO
+      cx0          = b0.x;
+      cy0          = b0.y;
+      cz0          = b0.z;
+      cx1          = (b1.x - b0.x) * dzi;
+      cy1          = (b1.y - b0.y) * dzi;
+      cz1          = (b1.z - b0.z) * dzi;
+      cx2          = 0.f;
+      cy2          = 0.f;
+      cz2          = 0.f;
+    }
+
+    /// Shifts the coefficients to new central point
+    /// \param z  z-coordinate of the new central point
+    XPU_D void Shift(float z)
+    {
+      float dz    = z - z0;
+      float cx2dz = cx2 * dz;
+      float cy2dz = cy2 * dz;
+      float cz2dz = cz2 * dz;
+      z0          = z;
+      cx0 += (cx1 + cx2dz) * dz;
+      cy0 += (cy1 + cy2dz) * dz;
+      cz0 += (cz1 + cz2dz) * dz;
+      cx1 += cx2dz + cx2dz;
+      cy1 += cy2dz + cy2dz;
+      cz1 += cz2dz + cz2dz;
+    }
+
+
+   public:
+    // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko)
+    // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2
+    float cx0;
+    float cx1;
+    float cx2;
+
+    // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2
+    float cy0;
+    float cy1;
+    float cy2;
+
+    // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2
+    float cz0;
+    float cz1;
+    float cz2;
+
+    float z0;  ///< z-coordinate of the field region central point
+
+    bool fIsZeroField;  //{false};  ///< Is the field region empty?
+  };
+
+  XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z,
+                                        const GpuFieldValue& b2, float b2z)
+  {
+    fIsZeroField = (b0.IsZeroField() && b1.IsZeroField() && b2.IsZeroField());
+
+    z0        = b0z;
+    float dz1 = b1z - b0z;
+    float dz2 = b2z - b0z;
+    float det = 1. / (dz1 * dz2 * (b2z - b1z));  //TODO
+
+    float w21 = -dz2 * det;
+    float w22 = dz1 * det;
+    float w11 = -dz2 * w21;
+    float w12 = -dz1 * w22;
+
+    float db1 = b1.x - b0.x;
+    float db2 = b2.x - b0.x;
+    cx0       = b0.x;
+    cx1       = db1 * w11 + db2 * w12;
+    cx2       = db1 * w21 + db2 * w22;
+
+    db1 = b1.y - b0.y;
+    db2 = b2.y - b0.y;
+    cy0 = b0.y;
+    cy1 = db1 * w11 + db2 * w12;
+    cy2 = db1 * w21 + db2 * w22;
+
+    db1 = b1.z - b0.z;
+    db2 = b2.z - b0.z;
+    cz0 = b0.z;
+    cz1 = db1 * w11 + db2 * w12;
+    cz2 = db1 * w21 + db2 * w22;
+  }
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuGrid.h b/algo/ca/core/experimental/CaGpuGrid.h
new file mode 100644
index 0000000000..a61f4d72cc
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuGrid.h
@@ -0,0 +1,175 @@
+/* Copyright (C) 2010-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Igor Kulakov [committer], Sergey Gorbunov */
+
+/// \file CaGpuGrid.h
+/// \brief A class to store hit information in a backet-sorted way on 2D grid
+///
+/// This is a temporary solution to store the grid on GPU
+/// The original code is based on the code of the ALICE HLT Project
+///
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGrid.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+
+  namespace
+  {
+    using namespace cbm::algo;
+  }
+
+  /// \class Grid
+  /// \brief Class for storing 2d objects in a grid
+
+  /// It creates 2-dimensional backet-sorted grid of pointers to 2-dimensional objects.
+  ///
+  /// The class provides an access to the objects in selected 2D bin without touching the rest of the data.
+  /// To loop over the objects in arbitrary XY-area one can use the ca::GridArea class.
+  ///
+  /// The class is used by CaTracker to speed-up the hit search operations
+  /// The grid axis are named X and Y
+  ///
+  class GpuGrid {
+   public:
+    /// \brief  Default constructor
+    GpuGrid() = default;
+
+    /// \brief Destructor
+    ~GpuGrid() = default;
+
+    /// \brief Constructor
+    GpuGrid(const Grid& grid, int bin_start, int entries_start)
+      : fN(grid.GetNofBins())
+      , fNx(grid.GetNofBinsX())
+      , fNy(grid.GetNofBinsY())
+      , fMinX(grid.GetMinX())
+      , fMinY(grid.GetMinY())
+      , fBinWidthX(grid.GetBinWidthX())
+      , fBinWidthY(grid.GetBinWidthY())
+      , fBinWidthXinv(grid.GetBinWidthXinv())
+      , fBinWidthYinv(grid.GetBinWidthYinv())
+      , fMaxRangeX(grid.GetMaxRangeX())
+      , fMaxRangeY(grid.GetMaxRangeY())
+      , fMaxRangeT(grid.GetMaxRangeT())
+      , fBinEntryIndex_start(bin_start)
+      , fBinEntryIndex_amount(grid.GetFirstBinEntryIndex().size())
+      , fEntries_start(entries_start)
+      , fEntries_amount(grid.GetEntries().size())
+    {
+    }
+
+    /// \brief Get bin index for (X,Y) point with boundary check
+    /// \param X - point x coordinate
+    /// \param Y - point y coordinate
+    /// \return bin index in the range [0, fN-1]
+    XPU_D int GetBin(float X, float Y) const;
+
+    /// \brief Get bin X index with boundary check
+    /// \param X - x coordinate
+    /// \return binX index
+    XPU_D int GetBinX(float X) const;
+
+    /// \brief Get bin Y index with boundary check
+    /// \param Y - y coordinate
+    /// \return binY index
+    XPU_D int GetBinY(float Y) const;
+
+    /// Get number of bins
+    XPU_D int GetNofBins() const { return fN; }
+
+    /// Get number of bins along X
+    XPU_D int GetNofBinsX() const { return fNx; }
+
+    /// Get number of bins along Y
+    XPU_D int GetNofBinsY() const { return fNy; }
+
+    /// Get minimal X value
+    XPU_D float GetMinX() const { return fMinX; }
+
+    /// Get minimal Y value
+    XPU_D float GetMinY() const { return fMinY; }
+
+    /// Get bin width in X
+    XPU_D float GetBinWidthX() const { return fBinWidthX; }
+
+    /// Get bin width in Y
+    XPU_D float GetBinWidthY() const { return fBinWidthY; }
+
+    /// Get maximal entry range in X
+    XPU_D float GetMaxRangeX() const { return fMaxRangeX; }
+
+    /// Get maximal entry range in Y
+    XPU_D float GetMaxRangeY() const { return fMaxRangeY; }
+
+    /// Get maximal entry range in T
+    XPU_D float GetMaxRangeT() const { return fMaxRangeT; }
+
+    /// Get bin width in X inversed
+    XPU_D float GetBinWidthXinv() const { return fBinWidthXinv; }
+
+    /// Get bin width in Y inversed
+    XPU_D float GetBinWidthYinv() const { return fBinWidthYinv; }
+
+    /// Get the index of the first bin entry in the index array
+    XPU_D int GetBinEntryIndexStart() const { return fBinEntryIndex_start; }
+
+    /// Get the number of entries in the index array
+    XPU_D int GetBinEntryIndexAmount() const { return fBinEntryIndex_amount; }
+
+    /// Get the index of the first entry in the entry array
+    XPU_D int GetEntriesStart() const { return fEntries_start; }
+
+    /// Get the number of entries in the entry array
+    XPU_D int GetEntriesAmount() const { return fEntries_amount; }
+
+   private:
+    /// --- Data members ---
+
+    int fN{0};   ///< total N bins
+    int fNx{0};  ///< N bins in X
+    int fNy{0};  ///< N bins in Y
+
+    float fMinX{0.};          ///< minimal X value
+    float fMinY{0.};          ///< minimal Y value
+    float fBinWidthX{0.};     ///< bin width in X
+    float fBinWidthY{0.};     ///< bin width in Y
+    float fBinWidthXinv{0.};  ///< inverse bin width in X
+    float fBinWidthYinv{0.};  ///< inverse bin width in Y
+
+    float fMaxRangeX{0.};  ///< maximal entry range in X
+    float fMaxRangeY{0.};  ///< maximal entry range in Y
+    float fMaxRangeT{0.};  ///< maximal entry range in T
+
+    int fBinEntryIndex_start{0};   ///< index of the first bin entry in the index array
+    int fBinEntryIndex_amount{0};  ///< number of entries in the index array
+
+    int fEntries_start{0};   ///< index of the first entry in the entry array
+    int fEntries_amount{0};  ///< number of entries in the entry array
+  };
+
+  /// --- Inline methods ---
+
+  XPU_D inline int GpuGrid::GetBin(float X, float Y) const
+  {
+    //
+    return GetBinY(Y) * fNx + GetBinX(X);
+  }
+
+  XPU_D inline int GpuGrid::GetBinX(float X) const
+  {
+    int binX = static_cast<int>((X - fMinX) * fBinWidthXinv);
+    return xpu::max(0, xpu::min(fNx - 1, binX));
+  }
+
+  XPU_D inline int GpuGrid::GetBinY(float Y) const
+  {
+    int binY = static_cast<int>((Y - fMinY) * fBinWidthYinv);
+    return xpu::max(0, xpu::min(fNy - 1, binY));
+  }
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuGridArea.h b/algo/ca/core/experimental/CaGpuGridArea.h
new file mode 100644
index 0000000000..bb7a96d64a
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuGridArea.h
@@ -0,0 +1,112 @@
+/* Copyright (C) 2012-2020 Frankfurt Institute for Advanced Studies, Goethe-Universität Frankfurt, Frankfurt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Maksym Zyzak, Igor Kulakov [committer] */
+
+/// \file CaGrid.h
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGpuGrid.h"
+#include "CaHit.h"
+
+namespace cbm::algo::ca
+{
+
+  namespace
+  {
+    using namespace cbm::algo;  // to keep 'ca::' prefices in the code
+  }
+
+  /// \brief Class for accessing objects in the 2D area that are stored in ca::Grid
+  ///
+  class GpuGridArea {
+   public:
+    /// \brief Constructor
+    /// \param grid - the grid to work with
+    /// \param x - X coordinate of the center of the area
+    /// \param y - Y coordinate of the center of the area
+    /// \param dx - half-width of the area in X
+    /// \param dy - half-width of the area in Y
+    //TODO: try to provide first element for the current grid instead of first element of the original arrays
+    XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x,
+                      float y, float dx, float dy)
+      : fGrid(grid)
+      , fBinEntryIndex(BinEntryIndex)
+      , fEntries(Entries)
+      , fGridNbinsX(fGrid.GetNofBinsX())
+    {
+      int binXmin = fGrid.GetBinX(x - dx);
+      int binXmax = fGrid.GetBinX(x + dx);
+      int binYmin = fGrid.GetBinY(y - dy);
+      int binYmax = fGrid.GetBinY(y + dy);
+
+      fAreaLastBinY = binYmax;
+      fAreaNbinsX   = (binXmax - binXmin + 1);  // bin index span in x direction
+
+      fAreaFirstBin = (binYmin * fGridNbinsX + binXmin);
+
+      fAreaCurrentBinY = binYmin;
+      fCurentEntry     = fBinEntryIndex[fAreaFirstBin];
+      fEntriesXend     = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX];
+    }
+
+
+    /// \brief look up the next grid entry in the requested area
+    /// \return ind - the entry index in the grid.GetEntries() vector
+    /// \return true if the entry is found, false if there are no more entries in the area
+    XPU_D bool GetNextGridEntry(unsigned int& ind);
+
+    /// \brief look up the next object id in the requested area
+    /// \return objectId - the id of the object
+    /// \return true if the object is found, false if there are no more pbjects in the area
+    XPU_D bool GetNextObjectId(unsigned int& objectId);
+
+    /// \brief debug mode: loop over the entire GetEntries() vector ignoring the search area
+    //    void DoLoopOverEntireGrid();
+
+   private:
+    const ca::GpuGrid& fGrid;
+    const unsigned int* fBinEntryIndex;
+    const unsigned int* fEntries;
+
+    int fAreaLastBinY{0};            // last Y bin of the area
+    int fAreaNbinsX{0};              // number of area bins in X
+    int fAreaFirstBin{0};            // first bin of the area (left-down corner of the area)
+    int fAreaCurrentBinY{0};         // current Y bin (incremented while iterating)
+    ca::HitIndex_t fCurentEntry{0};  // index of the current entry in fGrid.GetEntries()
+    ca::HitIndex_t fEntriesXend{0};  // end of the hit indices in current y-row
+    int fGridNbinsX{0};              // Number of grid bins in X (copy of fGrid::GetNofBinsX())
+  };
+
+  XPU_D inline bool GpuGridArea::GetNextGridEntry(unsigned int& ind)
+  {
+    bool xIndexOutOfRange = (fCurentEntry >= fEntriesXend);  // current entry is not in the area
+
+    // jump to the next y row if fCurentEntry is outside of the X area
+    while (xIndexOutOfRange) {
+      if (fAreaCurrentBinY >= fAreaLastBinY) {
+        return false;
+      }
+      fAreaCurrentBinY++;                                // get new y-line
+      fAreaFirstBin += fGridNbinsX;                      // move the left-down corner of the area to the next y-line
+      fCurentEntry     = fBinEntryIndex[fAreaFirstBin];  // get first hit in cell, if y-line is new
+      fEntriesXend     = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX];
+      xIndexOutOfRange = (fCurentEntry >= fEntriesXend);
+    }
+    ind = fCurentEntry;  // return value
+    fCurentEntry++;      // go to next
+    return true;
+  }
+
+  XPU_D inline bool GpuGridArea::GetNextObjectId(unsigned int& objectId)
+  {
+    unsigned int entryIndex = 0;
+
+    bool ok = GetNextGridEntry(entryIndex);
+    if (ok) {
+      objectId = fEntries[entryIndex];
+    }
+    return ok;
+  }
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuMaterialMap.h b/algo/ca/core/experimental/CaGpuMaterialMap.h
new file mode 100644
index 0000000000..20a282bc6d
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuMaterialMap.h
@@ -0,0 +1,74 @@
+/* Copyright (C) 2007-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Igor Kulakov, Sergey Gorbunov [committer], Andrey Lebedev, Sergei Zharko */
+
+#pragma once  // include this header only once per compilation unit
+
+//#include "CaSimd.h"
+//#include "CaUtils.h"
+#include "KfMaterialMap.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  /// \class MaterialMap
+  /// \brief A map of station thickness in units of radiation length (X0) to the specific point in XY plane
+  class GpuMaterialMap {
+   public:
+    /// \brief Default constructor
+    GpuMaterialMap() = default;
+
+    /// \brief Copy constructor
+    GpuMaterialMap(const kf::MaterialMap& other, int binStart)
+      : fNbins(other.GetNbins())
+      , fBinStart(binStart)
+      , fXYmax(other.GetXYmax())
+      , fZref(other.GetZref())
+      , fZmin(other.GetZmin())
+      , fZmax(other.GetZmax())
+    {
+      fFactor = 0.5 * fNbins / fXYmax;
+    }
+
+    /// \brief Destructor
+    ~GpuMaterialMap() noexcept = default;
+
+    /// \brief Gets number of bins (rows or columns) of the material table
+    XPU_D int GetNbins() const { return fNbins; }
+
+    /// \brief Gets radius in cm of the material table
+    XPU_D float GetXYmax() const { return fXYmax; }
+
+    /// \brief Gets reference Z of the material in cm
+    XPU_D float GetZref() const { return fZref; }
+
+    /// \brief Gets minimal Z of the collected material in cm
+    XPU_D float GetZmin() const { return fZmin; }
+
+    /// \brief Gets maximal Z of the collected material in cm
+    XPU_D float GetZmax() const { return fZmax; }
+
+    /// \brief Get bin index for (x,y). Returns -1 when outside of the map
+    XPU_D int GetBin(float x, float y) const
+    {
+      int i = static_cast<int>((x + fXYmax) * fFactor);
+      int j = static_cast<int>((y + fXYmax) * fFactor);
+      if (i < 0 || j < 0 || i >= fNbins || j >= fNbins) {
+        return -1;
+      }
+      //TODO
+      return fBinStart + i + j * fNbins;
+    }
+
+   private:
+    int fNbins;     ///< Number of rows (== N columns) in the material budget table
+    int fBinStart;  ///< Start index of the material budget table
+    float fXYmax;   ///< Size of the station in x and y dimensions [cm]
+    float fFactor;  ///< Util. var. for the conversion of point coordinates to row/column id
+    float fZref;    ///< Reference Z of the collected material [cm]
+    float fZmin;    ///< Minimal Z of the collected material [cm]
+    float fZmax;    ///< Minimal Z of the collected material [cm]
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuParameters.h b/algo/ca/core/experimental/CaGpuParameters.h
new file mode 100644
index 0000000000..fbbee5538d
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuParameters.h
@@ -0,0 +1,111 @@
+/* Copyright (C) 2021-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov, Sergei Zharko [committer] */
+
+/// @file L1Parameters.h
+/// @brief Parameter container for the L1Algo library
+/// @since 19.12.2021
+/// @author S.Zharko <s.zharko@gsi.de>
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGpuField.h"
+#include "CaGpuMaterialMap.h"
+#include "CaGpuStation.h"
+#include "CaMeasurementXy.h"
+#include "CaParameters.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+
+  class GpuParameters {
+   public:
+    GpuParameters() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuParameters(const Parameters<DataIn>& other)
+      : fNstationsActiveTotal(other.GetNstationsActive())
+      , fMaxDoubletsPerSinglet(other.GetMaxDoubletsPerSinglet())
+      , fMaxTripletPerDoublets(other.GetMaxTripletPerDoublets())
+      , maxDoubletsFromHit(150)
+      , maxTripletsFromDoublet(15)
+      , maxExtrapolationStep(50.)
+      , SpeedOfLightInv(0.03335641)
+      , maxSlopePV(0.1)
+      , maxQp(0.1)
+      , maxDZ(0.1)
+      , particleMass(0.13957)
+      , doubletChi2Cut(5)
+      , tripletChi2Cut(5)
+      , tripletFinalChi2Cut(5)
+      , fNStations(0)
+      , primaryFlag(true)
+      , isTargetField(false)
+    {
+      fTargetPos[0] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionX());
+      fTargetPos[1] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionY());
+      fTargetPos[2] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionZ());
+      for (size_t i = 0; i < fNstationsActiveTotal; i++) {
+        fStations[i] = other.GetStation(i);
+      }
+    }
+    ~GpuParameters() = default;
+    //
+   public:
+    ca::GpuFieldValue targB;                     ///< Magnetic field in the target region
+    ca::MeasurementXy<float> targetMeasurement;  ///< Measurement XY
+
+    int GetNstationsActive() const { return fNstationsActiveTotal; }
+    void SetNstationsActive(int nsta) { fNstationsActiveTotal = nsta; }
+
+    /// \brief Gets upper-bound cut on max number of doublets per one singlet
+    XPU_D unsigned int GetMaxDoubletsPerSinglet() const { return fMaxDoubletsPerSinglet; }
+
+    /// \brief Gets upper-bound cut on max number of triplets per one doublet
+    XPU_D unsigned int GetMaxTripletPerDoublets() const { return fMaxTripletPerDoublets; }
+
+    XPU_D const ca::GpuStation& GetStation(int iStation) const { return fStations[iStation]; }
+    void SetStation(int iStation, const ca::GpuStation& station) { fStations[iStation] = station; }
+
+    /// \brief Gets X component of target position
+    XPU_D float GetTargetPositionX() const { return fTargetPos[0]; }
+
+    /// \brief Gets Y component of target position
+    XPU_D float GetTargetPositionY() const { return fTargetPos[1]; }
+
+    /// \brief Gets Z component of target position
+    XPU_D float GetTargetPositionZ() const { return fTargetPos[2]; }
+
+    std::array<ca::GpuStation, 15>
+      fStations;  ///< Array of stations	//TODO: 15 is temporary, think how to do it properly on gpu
+
+    std::array<float, 3> fTargetPos;  ///< Target position
+
+    size_t fNstationsActiveTotal;  ///< total number of active tracking stations
+
+    unsigned int fMaxDoubletsPerSinglet;  ///< Upper-bound cut on max number of doublets per one singlet
+    unsigned int fMaxTripletPerDoublets;  ///< Upper-bound cut on max number of triplets per one doublet
+
+    int maxDoubletsFromHit;      ///< Max doublets from hit
+    int maxTripletsFromDoublet;  ///< Max triplets from doublet
+
+    float maxExtrapolationStep;  ///< Max extrapolation step
+
+    float SpeedOfLightInv;      ///< Speed of light inverse
+    float maxSlopePV;           ///< Max slope to primary vertex
+    float maxQp;                ///< Max Q/p
+    float maxDZ;                ///< Max DZ
+    float particleMass;         ///< Particle mass
+    float doubletChi2Cut;       ///< Doublet chi2 cut
+    float tripletChi2Cut;       ///< Triplet chi2 cut
+    float tripletFinalChi2Cut;  ///< Triplet final chi2 cut
+
+    size_t fNStations;  ///< Number of stations
+
+    bool primaryFlag;    ///< Primary flag
+    bool isTargetField;  ///< Is target field
+  };
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuStation.h b/algo/ca/core/experimental/CaGpuStation.h
new file mode 100644
index 0000000000..2a816d665d
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuStation.h
@@ -0,0 +1,72 @@
+/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov [committer], Igor Kulakov, Sergei Zharko */
+
+#pragma once  // include this header only once per compilation unit
+
+//#include "CaConstants.h"
+#include "CaGpuField.h"
+#include "CaStation.h"
+
+//#include <string>
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  /// Structure Station
+  /// Contains a set of geometry parameters for a particular station
+  ///
+  class GpuStation {
+   public:
+    int type;      //     = constants::Undef<int>;  //  TODO: replace with L1DetectorID
+    int timeInfo;  // = constants::Undef<int>;  ///< flag: if time information can be used
+    int
+      fieldStatus;  // = constants::Undef<int>;               ///< flag: 1 - station is INSIDE the field, 0 - station is OUTSIDE the field
+    float fZ;    //   = constants::Undef<DataT>;  ///< z position of station     [cm]
+    float Xmax;  // = constants::Undef<DataT>;  ///< min radius of the station [cm]
+    float Ymax;  // = constants::Undef<DataT>;  ///< max radius of the station [cm]
+
+    GpuFieldSlice fieldSlice;  ///< Magnetic field near the station
+
+    GpuStation() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuStation(const Station<DataIn>& other)
+      : type(other.GetType())
+      , timeInfo(other.GetTimeStatus())
+      , fieldStatus(other.GetFieldStatus())
+      , fZ(other.template GetZ<float>())
+      , Xmax(other.template GetXmax<float>())
+      , Ymax(other.template GetYmax<float>())
+      , fieldSlice(other.fieldSlice)
+    {
+    }
+
+    /// \brief Gets type of the station
+    int GetType() const { return type; }
+
+    /// \brief Gets time-measurement flag
+    int GetTimeStatus() const { return timeInfo; }
+
+    /// \brief Gets field status flag
+    int GetFieldStatus() const { return fieldStatus; }
+
+    /// \brief Gets z-position of the station
+    XPU_D float GetZ() const { return fZ; }
+
+    /// \brief Gets limit of the station size in x-axis direction
+    XPU_D float GetXmax() const { return Xmax; }
+
+    /// \brief Gets limit of the station size in x-axis direction
+    XPU_D float GetXmin() const { return -Xmax; }
+
+    /// \brief Gets limit of the station size in y-axis direction
+    XPU_D float GetYmax() const { return Ymax; }
+
+    /// \brief Gets limit of the station size in y-axis direction
+    XPU_D float GetYmin() const { return -Ymax; }
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTimeMonitor.h b/algo/ca/core/experimental/CaGpuTimeMonitor.h
new file mode 100644
index 0000000000..77f05241b4
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTimeMonitor.h
@@ -0,0 +1,150 @@
+#pragma once  // include this header only once per compilation unit
+
+#include <xpu/host.h>
+
+namespace cbm::algo::ca
+{
+
+  namespace
+  {
+    using namespace cbm::algo;
+  }
+
+  struct XpuTimings {
+    xpu::timings PrepareData_time[4];
+    xpu::timings MakeSinglets_time[4];
+    xpu::timings MakeDoublets_time[4];
+    xpu::timings CompressDoublets_time[4];
+    xpu::timings ResetDoublets_time[4];
+    xpu::timings FitDoublets_time[4];
+    xpu::timings MakeTriplets_time[4];
+    xpu::timings CompressTriplets_time[4];
+    xpu::timings ResetTriplets_time[4];
+    xpu::timings FitTriplets_time[4];
+    xpu::timings SortTriplets_time[4];
+    xpu::timings Total_time[4];
+    int nIterations;
+
+    void PrintTimings(int iteration) const
+    {
+      if (iteration < 0 || iteration >= 4) {
+        std::cerr << "Error: wrong iteration number: " << iteration << std::endl;
+        return;
+      }
+
+      auto print_timing = [](const std::string& name, const xpu::timings& t) {
+        std::cout << std::left << std::setw(25) << name << "Kernel time: " << std::setw(10) << t.kernel_time()
+                  << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d) << " D2H copy: " << std::setw(10)
+                  << t.copy(xpu::d2h) << " Memset: " << std::setw(10) << t.memset() << " Total: " << std::setw(10)
+                  << t.wall() << std::endl;
+      };
+
+      print_timing("PrepareData_time", PrepareData_time[iteration]);
+      print_timing("MakeSinglets_time", MakeSinglets_time[iteration]);
+      print_timing("MakeDoublets_time", MakeDoublets_time[iteration]);
+      print_timing("CompressDoublets_time", CompressDoublets_time[iteration]);
+      print_timing("ResetDoublets_time", ResetDoublets_time[iteration]);
+      print_timing("FitDoublets_time", FitDoublets_time[iteration]);
+      print_timing("MakeTriplets_time", MakeTriplets_time[iteration]);
+      print_timing("CompressTriplets_time", CompressTriplets_time[iteration]);
+      print_timing("ResetTriplets_time", ResetTriplets_time[iteration]);
+      print_timing("FitTriplets_time", FitTriplets_time[iteration]);
+      print_timing("SortTriplets_time", SortTriplets_time[iteration]);
+      print_timing("Total_time", Total_time[iteration]);
+    }
+  };
+
+  ///
+  class GpuTimeMonitor {
+   public:
+    /// \brief  Default constructor
+    GpuTimeMonitor() : nEvents(0)
+    //     , nIterations(4)
+    {
+      //      for( int i = 0; i < 4; i++ ) {
+      //	MakeSinglets_time[i] = 0;
+      //	MakeDoublets_time[i] = 0;
+      //	CompressDoublets_time[i] = 0;
+      //	FitDoublets_time[i] = 0;
+      //	MakeTriplets_time[i] = 0;
+      //	CompressTriplets_time[i] = 0;
+      //      }
+    }
+
+    /// \brief Destructor
+    ~GpuTimeMonitor() = default;
+
+    //    void
+    //    AddEventInfo (GpuTimeMonitor &event)
+    //    {
+    //      for( int i = 0; i < 4; i++ ) {
+    ////	MakeSinglets_time[i] += event.GetMakeSingletsTime(i);
+    ////	MakeDoublets_time[i] += event.GetMakeDoubletsTime(i);
+    ////	CompressDoublets_time[i] += event.GetCompressDoubletsTime(i);
+    ////	FitDoublets_time[i] += event.GetFitDoubletsTime(i);
+    ////	MakeTriplets_time[i] += event.GetMakeTripletsTime(i);
+    ////	CompressTriplets_time[i] += event.GetCompressTripletsTime(i);
+    ////	FitTriplets_time[i] += event.GetFitTripletsTime(i);
+    //      }
+    //     nEvents++;
+    //    }
+
+    void AddEventInfo(XpuTimings& event)
+    {
+      xpuTimings.push_back(event);
+      nEvents++;
+    }
+
+    void SetNEvents(int nEv) { this->nEvents = nEv; }
+
+    int GetNEvents() { return nEvents; }
+
+    //    void SetNIterations(int nIt) { this->nIterations = nIt; }
+
+    //    int GetNIterations() { return nIterations; }
+
+    //    void SetMakeSingletsTime(int i, xpu::timings time) { MakeSinglets_time[i] = time; }
+
+    //    void SetMakeDoubletsTime(int i, xpu::timings time) { MakeDoublets_time[i] = time; }
+
+    //    void SetCompressDoubletsTime(int i, xpu::timings time) { CompressDoublets_time[i] = time; }
+
+    //    void SetFitDoubletsTime(int i, xpu::timings time) { FitDoublets_time[i] = time; }
+
+    //    void SetMakeTripletsTime(int i, xpu::timings time) { MakeTriplets_time[i] = time; }
+
+    //    void SetCompressTripletsTime(int i, xpu::timings time) { CompressTriplets_time[i] = time; }
+
+    //    void SetFitTripletsTime(int i, xpu::timings time) { FitTriplets_time[i] = time; }
+
+    xpu::timings GetMakeSingletsTime(int ev, int it) { return xpuTimings[ev].MakeSinglets_time[it]; }
+
+    xpu::timings GetMakeDoubletsTime(int ev, int it) { return xpuTimings[ev].MakeDoublets_time[it]; }
+
+    xpu::timings GetCompressDoubletsTime(int ev, int it) { return xpuTimings[ev].CompressDoublets_time[it]; }
+
+    xpu::timings GetFitDoubletsTime(int ev, int it) { return xpuTimings[ev].FitDoublets_time[it]; }
+
+    xpu::timings GetMakeTripletsTime(int ev, int it) { return xpuTimings[ev].MakeTriplets_time[it]; }
+
+    xpu::timings GetCompressTripletsTime(int ev, int it) { return xpuTimings[ev].CompressTriplets_time[it]; }
+
+    xpu::timings GetFitTripletsTime(int ev, int it) { return xpuTimings[ev].FitTriplets_time[it]; }
+
+    xpu::timings GetTotalTime(int ev, int it) { return xpuTimings[ev].Total_time[it]; }
+
+   private:
+    //     xpu::timings MakeSinglets_time[4];
+    //     xpu::timings MakeDoublets_time[4];
+    //     xpu::timings CompressDoublets_time[4];
+    //     xpu::timings FitDoublets_time[4];
+    //     xpu::timings MakeTriplets_time[4];
+    //     xpu::timings CompressTriplets_time[4];
+    //     xpu::timings FitTriplets_time[4];
+    std::vector<XpuTimings> xpuTimings;
+
+    int nEvents;
+    //     int nIterations;
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
new file mode 100644
index 0000000000..33aca92757
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
@@ -0,0 +1,422 @@
+#include "CaGpuTrackFinderSetup.h"
+
+using namespace cbm::algo::ca;
+
+GpuTrackFinderSetup::GpuTrackFinderSetup(WindowData& wData, const ca::Parameters<fvec>& pars)
+  : fParameters(pars)
+  , fSetup(pars.GetGeometrySetup())
+  , frWData(wData)
+  , fIteration(0)
+{
+}
+
+void GpuTrackFinderSetup::SetupParameters()
+{
+  int nStations   = fParameters.GetNstationsActive();
+  int nIterations = fParameters.GetCAIterations().size();
+  fTripletConstructor.fParams.reset(nIterations, xpu::buf_io);
+  xpu::h_view vfParams{fTripletConstructor.fParams};
+
+  for (int i = 0; i < nIterations; i++) {
+    GpuParameters gpuParams              = fParameters;
+    fTripletConstructor.fParams_const[i] = fParameters;
+    const auto& caIteration              = fParameters.GetCAIterations()[i];
+
+    gpuParams.fNStations                                 = nStations;
+    gpuParams.SpeedOfLightInv                            = constants::phys::SpeedOfLightInv;
+    gpuParams.maxSlopePV                                 = caIteration.GetMaxSlopePV();
+    gpuParams.maxQp                                      = caIteration.GetMaxQp();
+    gpuParams.maxDZ                                      = caIteration.GetMaxDZ();
+    fTripletConstructor.fParams_const[i].fNStations      = nStations;
+    fTripletConstructor.fParams_const[i].SpeedOfLightInv = constants::phys::SpeedOfLightInv;
+    fTripletConstructor.fParams_const[i].maxSlopePV      = caIteration.GetMaxSlopePV();
+    fTripletConstructor.fParams_const[i].maxQp           = caIteration.GetMaxQp();
+    fTripletConstructor.fParams_const[i].maxDZ           = caIteration.GetMaxDZ();
+
+    if (caIteration.GetElectronFlag()) {
+      gpuParams.particleMass                            = constants::phys::ElectronMass;
+      fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass;
+    }
+    else {
+      gpuParams.particleMass                            = constants::phys::MuonMass;
+      fTripletConstructor.fParams_const[i].particleMass = constants::phys::MuonMass;
+    }
+
+    gpuParams.primaryFlag                            = caIteration.GetPrimaryFlag();
+    fTripletConstructor.fParams_const[i].primaryFlag = caIteration.GetPrimaryFlag();
+
+    if (caIteration.GetPrimaryFlag()) {
+      gpuParams.targB                            = fParameters.GetVertexFieldValue();
+      fTripletConstructor.fParams_const[i].targB = fParameters.GetVertexFieldValue();
+    }
+    else {
+      gpuParams.targB                            = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
+      fTripletConstructor.fParams_const[i].targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
+    }
+
+    gpuParams.doubletChi2Cut                                    = caIteration.GetDoubletChi2Cut();
+    gpuParams.tripletChi2Cut                                    = caIteration.GetTripletChi2Cut();
+    gpuParams.tripletFinalChi2Cut                               = caIteration.GetTripletFinalChi2Cut();
+    gpuParams.maxDoubletsFromHit                                = maxDoubletsFromHit;
+    gpuParams.maxTripletsFromDoublet                            = maxTripletsFromDoublet;
+    gpuParams.maxExtrapolationStep                              = 50.f;
+    gpuParams.isTargetField                                     = (fabs(gpuParams.targB.y) > 0.001);
+    fTripletConstructor.fParams_const[i].doubletChi2Cut         = caIteration.GetDoubletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletChi2Cut         = caIteration.GetTripletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletFinalChi2Cut    = caIteration.GetTripletFinalChi2Cut();
+    fTripletConstructor.fParams_const[i].maxDoubletsFromHit     = maxDoubletsFromHit;
+    fTripletConstructor.fParams_const[i].maxTripletsFromDoublet = maxTripletsFromDoublet;
+    fTripletConstructor.fParams_const[i].maxExtrapolationStep   = 50.f;
+
+    MeasurementXy<float> targMeas(fParameters.GetTargetPositionX()[0], fParameters.GetTargetPositionY()[0],
+                                  caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(),
+                                  caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(), 0, 1, 1);
+    gpuParams.targetMeasurement                            = targMeas;
+    fTripletConstructor.fParams_const[i].targetMeasurement = targMeas;
+
+    vfParams[i] = gpuParams;
+  }
+
+  fQueue.copy(fTripletConstructor.fParams, xpu::h2d);
+
+  for (int ista = 0; ista < nStations; ista++) {
+    fTripletConstructor.fStations_const[ista] = fParameters.GetStation(ista);
+  }
+
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+}
+
+void GpuTrackFinderSetup::SetupGrid()
+{
+  //  std::cout<<"Setting up grid"<<std::endl;
+  unsigned int nStations     = fParameters.GetNstationsActive();
+  unsigned int bin_start     = 0;
+  unsigned int entries_start = 0;
+
+  fTripletConstructor.fvGpuGrid.reset(nStations, xpu::buf_io);
+  xpu::h_view vfvGpuGrid{fTripletConstructor.fvGpuGrid};
+
+  for (unsigned int ista = 0; ista < nStations; ista++) {
+    vfvGpuGrid[ista] = GpuGrid(frWData.Grid(ista), bin_start, entries_start);
+    bin_start += frWData.Grid(ista).GetFirstBinEntryIndex().size();
+    entries_start += frWData.Grid(ista).GetEntries().size();
+  }
+
+  fTripletConstructor.fgridFirstBinEntryIndex.reset(bin_start, xpu::buf_io);
+  fTripletConstructor.fgridEntries.reset(entries_start, xpu::buf_io);
+
+  xpu::h_view vfgridFirstBinEntryIndex{fTripletConstructor.fgridFirstBinEntryIndex};
+  xpu::h_view vfgridEntries{fTripletConstructor.fgridEntries};
+
+  bin_start = entries_start = 0;
+
+  for (unsigned int ista = 0; ista < nStations; ista++) {
+    std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(),
+                &vfgridFirstBinEntryIndex[bin_start]);
+    for (unsigned int i = 0; i < frWData.Grid(ista).GetEntries().size(); i++) {
+      vfgridEntries[entries_start + i] = frWData.Grid(ista).GetEntries()[i].GetObjectId();
+    }
+    bin_start += frWData.Grid(ista).GetFirstBinEntryIndex().size();
+    entries_start += frWData.Grid(ista).GetEntries().size();
+  }
+  fNHits = entries_start;
+
+  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::SetupMaterialMap()
+{
+  ///Set up material map
+  int bin_start  = 0;
+  int nStations  = fParameters.GetNstationsActive();
+  int fstStation = 0;
+  if (nStations == 8) fstStation = 4;
+  fTripletConstructor.fMaterialMap.reset(nStations, xpu::buf_io);
+  xpu::h_view vfMaterialMap{fTripletConstructor.fMaterialMap};
+
+  for (int ista = 0; ista < nStations; ista++) {
+    vfMaterialMap[ista] = GpuMaterialMap(fSetup.GetMaterial(fstStation + ista), bin_start);
+    bin_start += fSetup.GetMaterial(fstStation + ista).GetNbins() * fSetup.GetMaterial(fstStation + ista).GetNbins();
+  }
+
+  fTripletConstructor.fMaterialMapTables.reset(bin_start, xpu::buf_io);
+  xpu::h_view vfMaterialMapTables{fTripletConstructor.fMaterialMapTables};
+
+  bin_start = 0;
+
+  for (int ista = 0; ista < nStations; ista++) {
+    std::copy_n(fSetup.GetMaterial(fstStation + ista).GetTable().begin(),
+                fSetup.GetMaterial(fstStation + ista).GetTable().size(), &vfMaterialMapTables[bin_start]);
+    bin_start += fSetup.GetMaterial(fstStation + ista).GetNbins() * fSetup.GetMaterial(fstStation + ista).GetNbins();
+  }
+
+  fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
+  fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::SetInputData()
+{
+  fTripletConstructor.fvHits.reset(frWData.Hits().size(), xpu::buf_io);
+  xpu::h_view vfvHits{fTripletConstructor.fvHits};
+
+  std::copy_n(frWData.Hits().begin(), frWData.Hits().size(), &vfvHits[0]);
+
+  fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::SetupIterationData(int iter)
+{
+  //  xpu::scoped_timer t_("SetupIterationData");
+  fIteration = iter;
+
+  fTripletConstructor.fIterationData.reset(1, xpu::buf_io);
+  xpu::h_view vfIterationData{fTripletConstructor.fIterationData};
+  vfIterationData[0].fNHits             = fNHits;
+  vfIterationData[0].fIteration         = iter;
+  vfIterationData[0].fNDoublets         = 0;
+  vfIterationData[0].fNDoublets_counter = 0;
+  vfIterationData[0].fNTriplets         = 0;
+  vfIterationData[0].fNTriplets_counter = 0;
+
+  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
+}
+
+#define TIME_MONITORING_FULL
+
+void GpuTrackFinderSetup::RunGpuTrackingSetup()
+{
+  fNTriplets = 0;
+
+  xpu::device_prop prop{xpu::device::active()};
+
+  LOG(info) << "Running GPU tracking chain on device " << prop.name() << ", fIteration: " << fIteration;
+
+  bool isCpu  = xpu::device::active().backend() == xpu::cpu;
+  int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits) / float(kSingletConstructorBlockSize));
+  //nBlocks = 1;
+
+  int nBlocksCD =
+    nBlocks
+    * maxDoubletsFromHit;  //std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize));
+
+  //***
+  fTripletConstructor.fvTrackParams.reset(frWData.Hits().size(), xpu::buf_device);
+  fTripletConstructor.fHitDoublets.reset(frWData.Hits().size() * maxDoubletsFromHit, xpu::buf_device);
+  //***
+  fTripletConstructor.fIteration = fIteration;
+
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+
+#ifdef TIME_MONITORING_FULL
+  fEventTimeMonitor.nIterations = fIteration;
+  xpu::push_timer("Full_time");
+#endif
+
+#ifdef TIME_MONITORING_FULL
+  xpu::push_timer("PrepareData_time");
+#endif
+
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
+
+  //  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
+
+  //  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
+
+  fQueue.copy(fTripletConstructor.fgridFirstBinEntryIndex, xpu::h2d);
+
+  fQueue.copy(fTripletConstructor.fgridEntries, xpu::h2d);
+
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
+
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
+
+  fQueue.memset(fTripletConstructor.fHitDoublets, 0);
+
+#ifdef TIME_MONITORING_FULL
+  xpu::timings pd_time                           = xpu::pop_timer();
+  fEventTimeMonitor.PrepareData_time[fIteration] = pd_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+  xpu::push_timer("MakeSinglets_time");
+#endif
+
+  fQueue.launch<MakeSinglets>(xpu::n_blocks(nBlocks));
+
+#ifdef TIME_MONITORING_FULL
+  xpu::timings sind_time                          = xpu::pop_timer();
+  fEventTimeMonitor.MakeSinglets_time[fIteration] = sind_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+  xpu::push_timer("CompressDoublets_time");
+#endif
+
+  fQueue.launch<MakeDoublets>(xpu::n_blocks(nBlocks));
+
+#ifdef TIME_MONITORING_FULL
+  xpu::timings dind_time                          = xpu::pop_timer();
+  fEventTimeMonitor.MakeDoublets_time[fIteration] = dind_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+  xpu::push_timer("ResetDoublets_time");
+#endif
+
+  fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
+  xpu::h_view vfIterationData1{fTripletConstructor.fIterationData};
+
+  LOG(info) << "GPU tracking :: Doublets found: " << vfIterationData1[0].fNDoublets;
+
+  if (vfIterationData1[0].fNDoublets > 0) {
+    fTripletConstructor.fHitDoubletsCompressed.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
+    fTripletConstructor.fvTrackParamsDoublets.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
+    fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * maxTripletsFromDoublet, xpu::buf_device);
+
+    fQueue.memset(fTripletConstructor.fHitTriplets, 0);
+    xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+
+#ifdef TIME_MONITORING_FULL
+    xpu::timings rd_time                             = xpu::pop_timer();
+    fEventTimeMonitor.ResetDoublets_time[fIteration] = rd_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+    xpu::push_timer("CompressDoublets_time");
+#endif
+
+    fQueue.launch<CompressDoublets>(xpu::n_blocks(nBlocksCD));
+#ifdef TIME_MONITORING_FULL
+
+    xpu::timings cd_time                                = xpu::pop_timer();
+    fEventTimeMonitor.CompressDoublets_time[fIteration] = cd_time;
+#endif
+
+    int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets
+                         : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize));
+
+#ifdef TIME_MONITORING_FULL
+    xpu::push_timer("FitDoublets_time");
+#endif
+
+    fQueue.launch<FitDoublets>(xpu::n_blocks(nBlocksD));
+
+#ifdef TIME_MONITORING_FULL
+    xpu::timings fd_time                           = xpu::pop_timer();
+    fEventTimeMonitor.FitDoublets_time[fIteration] = fd_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+    xpu::push_timer("MakeTriplets_time");
+#endif
+    fQueue.launch<MakeTriplets>(xpu::n_blocks(nBlocksD));
+#ifdef TIME_MONITORING_FULL
+    xpu::timings mt_time                            = xpu::pop_timer();
+    fEventTimeMonitor.MakeTriplets_time[fIteration] = mt_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+    xpu::push_timer("ResetTriplets_time");
+#endif
+
+    fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device);  // TODO: check if we need it for the next iteration
+
+    fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
+    xpu::h_view vfIterationData2{fTripletConstructor.fIterationData};
+
+    if (vfIterationData2[0].fNTriplets > 0) {
+      fTripletConstructor.fHitTripletsCompressed.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
+      //      fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
+      fTripletConstructor.fvTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+
+      int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * maxTripletsFromDoublet
+                            : std::ceil(float(vfIterationData1[0].fNDoublets * maxTripletsFromDoublet)
+                                        / float(kSingletConstructorBlockSize));
+
+#ifdef SORT_TRIPLETS_GPU
+      fTripletConstructor.fTripletSortHelper.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+      fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+//      fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io);
+#endif
+
+      xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+
+#ifdef TIME_MONITORING_FULL
+      xpu::timings rt_time                             = xpu::pop_timer();
+      fEventTimeMonitor.ResetTriplets_time[fIteration] = rt_time;
+#endif
+
+      LOG(info) << "GPU tracking :: Triplets found: " << vfIterationData2[0].fNTriplets;
+
+#ifdef TIME_MONITORING_FULL
+      xpu::push_timer("CompressTriplets_time");
+#endif
+
+      fQueue.launch<CompressTriplets>(xpu::n_blocks(nBlocksCT));
+
+#ifdef TIME_MONITORING_FULL
+      xpu::timings ct_time                                = xpu::pop_timer();
+      fEventTimeMonitor.CompressTriplets_time[fIteration] = ct_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+      xpu::push_timer("FitTriplets_time");
+#endif
+
+      fQueue.launch<FitTriplets>(
+        xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/ vfIterationData2[0].fNTriplets));
+
+#ifdef TIME_MONITORING_FULL
+      xpu::timings ft_time                           = xpu::pop_timer();
+      fEventTimeMonitor.FitTriplets_time[fIteration] = ft_time;
+#endif
+
+#ifdef TIME_MONITORING_FULL
+      xpu::push_timer("SortTriplets_time");
+#endif
+
+#ifdef SORT_TRIPLETS_GPU
+      fQueue.launch<SortTriplets>(xpu::n_blocks(vfIterationData2[0].fNTriplets));
+#endif
+
+#ifdef TIME_MONITORING_FULL
+      xpu::timings srt_time                           = xpu::pop_timer();
+      fEventTimeMonitor.SortTriplets_time[fIteration] = srt_time;
+#endif
+
+      fQueue.copy(fTripletConstructor.fvTriplets, xpu::d2h);
+      fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
+#ifdef SORT_TRIPLETS_GPU
+      fQueue.copy(fTripletConstructor.fTripletSortHelper, xpu::d2h);
+      std::cout << " --- Sorted triplets --- " << vfIterationData2[0].fNTriplets << " --- \n";
+      for (int i = 0; i < vfIterationData2[0].fNTriplets; i += 100) {
+        //          std::cout<<" --- fTripletSortHelper["<<i<<"]: "<<fTripletConstructor.fTripletSortHelper[i]<<" --- \n";
+        printf("   --->>> triplet: %d; lsta: %d; lhit: %d\n", i, fTripletConstructor.fTripletSortHelper[i].lsta,
+               fTripletConstructor.fTripletSortHelper[i].lhit);
+      }
+#endif
+
+      xpu::h_view vfIterationData3{fTripletConstructor.fIterationData};
+      fNTriplets = vfIterationData3[0].fNTriplets;
+    }
+    else {
+      std::cout << "No triplets found" << std::endl;
+    }
+  }
+  else {
+    std::cout << "No doublets found" << std::endl;
+  }
+
+  fTripletConstructor.fIterationData.reset(0, xpu::buf_device);
+  fTripletConstructor.fvGpuGrid.reset(0, xpu::buf_io);
+  fTripletConstructor.fgridFirstBinEntryIndex.reset(0, xpu::buf_io);
+  fTripletConstructor.fgridEntries.reset(0, xpu::buf_io);
+
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);  //TODO: check if we need to reset all the buffers here
+
+#ifdef TIME_MONITORING_FULL
+  xpu::timings t                           = xpu::pop_timer();
+  fEventTimeMonitor.Total_time[fIteration] = t;
+  fEventTimeMonitor.PrintTimings(fIteration);
+#endif
+}
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
new file mode 100644
index 0000000000..a2472bee5a
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "CaBranch.h"
+#include "CaGpuParameters.h"
+#include "CaGpuTimeMonitor.h"
+#include "CaGpuTripletConstructor.h"
+#include "CaHit.h"
+#include "CaVector.h"
+#include "CaWindowData.h"
+#include "KfTrackParam.h"
+
+#include <xpu/host.h>
+
+namespace cbm::algo::ca
+{
+  class GpuTrackFinderSetup {
+   public:
+    ///                             ------  Constructors and destructor ------
+
+    /// Constructor
+    /// \param nThreads  Number of threads for multi-threaded mode
+    GpuTrackFinderSetup(ca::WindowData& wData, const ca::Parameters<fvec>& pars);
+
+    /// Copy constructor
+    GpuTrackFinderSetup(const GpuTrackFinderSetup&) = delete;
+
+    /// Move constructor
+    GpuTrackFinderSetup(GpuTrackFinderSetup&&) = delete;
+
+    /// Copy assignment operator
+    GpuTrackFinderSetup& operator=(const GpuTrackFinderSetup&) = delete;
+
+    /// Move assignment operator
+    GpuTrackFinderSetup& operator=(GpuTrackFinderSetup&&) = delete;
+
+    /// Destructor
+    ~GpuTrackFinderSetup() = default;
+
+    ///                             ------  Public member functions ------
+
+    ///Set the GPU tracking parameters
+    void SetupParameters();
+
+    void SetupGrid();
+
+    void SetupMaterialMap();
+
+    /// Set the input data for the GPU tracking
+    void SetInputData();
+
+    /// Set the iteration data
+    void SetupIterationData(int iter);
+
+    /// Run the track finding algorithm chain
+    void RunGpuTrackingSetup();
+
+    /// Get the number of triplets
+    unsigned int GetNofTriplets() const { return fNTriplets; }  //fTripletConstructor.fTempData[0].fNTriplets; }
+
+    ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; }  //TODO: switch back
+    //    ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
+
+    /// Get timings
+    XpuTimings& GetTimings() { return fEventTimeMonitor; }
+
+    /// Get triplets
+    //    const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; }
+
+   private:
+    const Parameters<fvec>& fParameters;       ///< Object of Framework parameters class
+    const cbm::algo::kf::Setup<fvec>& fSetup;  ///< Setup instance
+    WindowData& frWData;                       ///< Reference to the window data
+    xpu::queue fQueue;                         ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid?
+    ca::GpuTripletConstructor fTripletConstructor;  ///< GPU triplet constructor
+    unsigned int fIteration;                        ///< Iteration number
+    int fNHits;
+
+    int fNTriplets;
+
+    int maxDoubletsFromHit     = 150.;
+    int maxTripletsFromDoublet = 15.;
+
+    XpuTimings fEventTimeMonitor;
+  };
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
new file mode 100644
index 0000000000..71d43827b4
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
@@ -0,0 +1,1449 @@
+#include "CaGpuTripletConstructor.h"
+
+using namespace cbm::algo::ca;
+
+// Export Constants
+XPU_EXPORT(strGpuTripletConstructor);
+
+// Kernel Definitions
+//XPU_EXPORT(TestFunc);
+//XPU_D void TestFunc::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().TestFunc(ctx); }
+
+XPU_EXPORT(MakeSinglets);
+XPU_D void MakeSinglets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeSinglets(ctx); }
+
+XPU_EXPORT(MakeDoublets);
+XPU_D void MakeDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeDoublets(ctx); }
+
+XPU_EXPORT(CompressDoublets);
+XPU_D void CompressDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().CompressDoublets(ctx); }
+
+XPU_EXPORT(FitDoublets);
+XPU_D void FitDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitDoublets(ctx); }
+
+XPU_EXPORT(MakeTriplets);
+XPU_D void MakeTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeTriplets(ctx); }
+
+XPU_EXPORT(CompressTriplets);
+XPU_D void CompressTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().CompressTriplets(ctx); }
+
+XPU_EXPORT(FitTriplets);
+XPU_D void FitTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitTriplets(ctx); }
+#ifdef SORT_TRIPLETS_GPU
+XPU_EXPORT(SortTriplets);
+XPU_D void SortTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().SortTripletsFunc(ctx); }
+#endif
+
+//XPU_D void GpuTripletConstructor::TestFunc(TestFunc::context& ctx) const
+//{
+//  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+//}
+
+XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  if (iGThread >= fIterationData[0].fNHits) return;
+
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
+  int ista             = fvHits[ihitl].Station();
+
+  //  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if (ista < 0 || ista > (int) fParams_const[fIteration].fNStations - 3) return;
+
+  //  const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista);
+  //  const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+  const ca::GpuStation* sta_l = &fStations_const[ista];
+  const ca::GpuStation* sta_m = &fStations_const[ista + 1];
+
+  float stal_l_timeInfo = sta_l->timeInfo;
+  float sta_m_z         = sta_m->fZ;
+
+  const auto& hitl = fvHits[ihitl];
+
+  //  const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ());
+  //  const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli;
+  //  const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli;
+  const float dzli = 1.f / (hitl.Z() - fParams_const[fIteration].GetTargetPositionZ());
+  const float tx   = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli;
+  const float ty   = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli;
+
+  kf::TrackParamBase<float>& fit_params = fvTrackParams[ihitl];
+
+  fit_params.X()    = hitl.X();
+  fit_params.Y()    = hitl.Y();
+  fit_params.Z()    = hitl.Z();
+  fit_params.Tx()   = tx;
+  fit_params.Ty()   = ty;
+  fit_params.Qp()   = 0.;
+  fit_params.Time() = hitl.T();
+  //  fit_params.Vi()   = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv;
+  fit_params.Vi() = fParams_const[fIteration].SpeedOfLightInv;
+
+  //  float txErr2           = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.;
+  //  float qpErr2           = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.;
+  float txErr2 = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.;
+  float qpErr2 = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.;
+
+  fit_params.ResetErrors(1., 1., txErr2, txErr2, qpErr2, (stal_l_timeInfo ? hitl.dT2() : 1.e6), 1.e10);
+
+  fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
+
+  fit_params.C(0, 0) = hitl.dX2();
+  fit_params.C(1, 0) = hitl.dXY();
+  fit_params.C(1, 1) = hitl.dY2();
+
+  //  fit_params.Ndf()     = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.;
+  fit_params.Ndf()     = /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].primaryFlag ? 2. : 0.;
+  fit_params.NdfTime() = (stal_l_timeInfo ? 0 : -1);
+
+  //  ca::GpuFieldRegion fld0;
+  {
+    int sta1 = ista;
+    if (sta1 == 0) sta1 = 1;
+    int sta0 = sta1 / 2;  // station between ista and the target
+    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params);
+    //fld0.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B0, fParams[fIterationData[0].fIteration].GetStation(sta0).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(sta1).fZ);
+    ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB,
+                                                   fParams_const[fIteration].GetTargetPositionZ(), B0,
+                                                   fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ);
+  }
+  //  ca::GpuFieldRegion fld1;
+  {  // field, made by the left hit, the middle station and the right station
+    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params);
+    //    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+    ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2,
+                                                   fStations_const[ista + 2].fZ);
+  }
+
+  FilterWithTargetAtLine(&fit_params, &/*fld0*/ ctx.smem().fld0_shared[ctx.thread_idx_x()]);
+
+  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())],
+                     /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
+
+  // extrapolate to the middle hit
+  ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ ctx.smem().fld1_shared[ctx.thread_idx_x()]);
+
+  //  ExtrapolateLine(tr_par, sta_m->fZ, F1);	//TODO: check if ExtrapolateStep is enough
+}
+
+XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNHits) return;
+
+#if 0
+  if (ctx.thread_idx_x() == 0) {
+      ctx.smem().counter = 0;
+  }
+#endif
+
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
+  int ista             = fvHits[ihitl].Station();
+
+  if (ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+
+  int collectedHits = 0;
+  int maxNHits      = fParams[fIterationData[0].fIteration].maxDoubletsFromHit;
+
+  CollectHits(iGThread, ista + 1, ihitl, maxNHits, &collectedHits);
+
+  xpu::atomic_add(&(fIterationData[0].fNDoublets), collectedHits);
+
+#if 0
+  xpu::atomic_add(&(ctx.smem().counter), collectedHits);
+
+  xpu::barrier(ctx.pos());
+  if(ctx.thread_idx_x() == 0) {
+    xpu::atomic_add(&(fIterationData[0].fNDoublets), ctx.smem().counter);
+  }
+#endif
+}
+
+XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  if (iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit) return;
+
+  if (fHitDoublets[iGThread].hit2 != 0) {
+    int index                          = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1);
+    fHitDoubletsCompressed[index].hit1 = fHitDoublets[iGThread].hit1;
+    fHitDoubletsCompressed[index].hit2 = fHitDoublets[iGThread].hit2;
+  }
+}
+
+XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets) return;
+
+  int hit_l = fHitDoubletsCompressed[iGThread].hit1;
+  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
+
+  if (hit_l < 0 || hit_m < 0) return;
+
+  int ista = fvHits[hit_l].Station();
+
+  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+
+  const ca::GpuStation& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
+  const ca::GpuStation& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+
+  auto fit_params = fvTrackParams[hit_l];
+
+  bool isMomentumFitted =
+    (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0));
+  float maxQp = fParams[fIterationData[0].fIteration].maxQp;
+
+  // fit doublet
+  const ca::Hit& hitm = fvHits[hit_m];
+
+  float z_2 = hitm.Z();
+  ca::MeasurementXy<float> m_2(hitm.X(), hitm.Y(), hitm.dX2(), hitm.dY2(), hitm.dXY(), 1.f, 1.f);
+  float t_2   = hitm.T();
+  float dt2_2 = hitm.dT2();
+
+  // add the middle hit
+  ExtrapolateLineNoField(&fit_params, z_2);
+
+  FilterXY(&fit_params, m_2);
+
+  FilterTime(&fit_params, t_2, dt2_2, sta_m.timeInfo);
+
+  //      fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp);	//TODO: check why it is required
+  const float qp0 = isMomentumFitted ? fit_params.GetQp() : maxQp;
+
+  MultipleScattering(&fit_params,
+                     fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0);
+
+  fvTrackParamsDoublets[iGThread] = fit_params;
+  //      fFit.SetQp0(fFit.Tr().Qp());	//TODO: check why it is required and how it is used later, have to be stored different way
+}
+
+XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets) return;
+
+  int hit_l = fHitDoubletsCompressed[iGThread].hit1;
+  //  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
+
+#if 0
+  if (ctx.thread_idx_x() == 0) {
+      ctx.smem().counter = 0;
+  }
+#endif
+
+  int ista = fvHits[hit_l].Station();
+
+  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+
+  //
+  ista = fvHits[hit_l].Station();
+  //
+
+  //  const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
+  //  const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+  const ca::GpuStation& sta_r = fParams[fIterationData[0].fIteration].GetStation(ista + 2);
+
+  auto& fit_params = fvTrackParamsDoublets[iGThread];
+
+  ca::GpuFieldRegion
+    fld1;  //TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified
+  {        // field, made by the left hit, the middle station and the right station
+    ca::GpuFieldValue B0 =
+      fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B1 =
+      fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B2 =
+      fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1,
+             fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2,
+             fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+  }
+
+  ExtrapolateStep(&fit_params, sta_r.fZ, 0.f, &fld1);
+
+  //TODO: add isDoubletGood check if needed
+
+  int collectedHits = 0;
+  int maxNHits      = fParams[fIterationData[0].fIteration].maxTripletsFromDoublet;
+
+  CollectHitsTriplets(iGThread, ista + 2, maxNHits, &collectedHits);
+
+  xpu::atomic_add(&(fIterationData[0].fNTriplets), collectedHits);
+
+#if 0
+  xpu::atomic_add(&(ctx.smem().counter), collectedHits);
+
+  xpu::barrier(ctx.pos());
+
+  if (ctx.thread_idx_x () == 0) {
+    fIterationData[0].fNTriplets += ctx.smem().counter;
+  }
+#endif
+}
+
+XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet) return;
+
+  if (fHitTriplets[iGThread].hit3 != 0) {
+    int index                          = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1);
+    fHitTripletsCompressed[index].hit1 = fHitTriplets[iGThread].hit1;
+    fHitTripletsCompressed[index].hit2 = fHitTriplets[iGThread].hit2;
+    fHitTripletsCompressed[index].hit3 = fHitTriplets[iGThread].hit3;
+  }
+}
+
+XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  if (iGThread >= fIterationData[0].fNTriplets) return;
+
+  int hit_l = fHitTripletsCompressed[iGThread].hit1;
+
+  int ista[3] = {-1, -1, -1};
+
+  ista[0] = fvHits[hit_l].Station();
+  ista[1] = ista[0] + 1;
+  ista[2] = ista[1] + 1;
+
+  //  ca::GpuStation sta[3];
+  const ca::GpuStation* sta[3];
+  for (int is = 0; is < 3; ++is) {
+    //    sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]);
+    sta[is] = &fStations_const[ista[is]];
+  };
+
+  bool isMomentumFitted = ((sta[0]->fieldStatus != 0) || (sta[1]->fieldStatus != 0) || (sta[2]->fieldStatus != 0));
+
+  float ndfTrackModel = 4;                    // straight line
+  ndfTrackModel += isMomentumFitted ? 1 : 0;  // track with momentum
+
+  const float maxQp = fParams[fIterationData[0].fIteration].maxQp;
+
+  ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2,
+                            fHitTripletsCompressed[iGThread].hit3};
+
+  float x[3], y[3], z[3], t[3];
+  float dt2[3];
+  ca::MeasurementXy<float> mxy[3];
+
+  for (int ih = 0; ih < 3; ++ih) {
+    const ca::Hit& hit = fvHits[ihit[ih]];
+    mxy[ih]            = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+
+    x[ih]   = hit.X();
+    y[ih]   = hit.Y();
+    z[ih]   = hit.Z();
+    t[ih]   = hit.T();
+    dt2[ih] = hit.dT2();
+  };
+
+  // find the field along the track
+
+  ca::GpuFieldValue B[3];
+  ca::GpuFieldRegion fld;
+  ca::GpuFieldRegion fldTarget;
+
+  float tx[3] = {(x[1] - x[0]) / (z[1] - z[0]), (x[2] - x[0]) / (z[2] - z[0]), (x[2] - x[1]) / (z[2] - z[1])};
+  float ty[3] = {(y[1] - y[0]) / (z[1] - z[0]), (y[2] - y[0]) / (z[2] - z[0]), (y[2] - y[1]) / (z[2] - z[1])};
+  for (int ih = 0; ih < 3; ++ih) {
+    float dz = (sta[ih]->fZ - z[ih]);
+    B[ih]    = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz);
+  };
+
+  fld.Set(B[0], sta[0]->fZ, B[1], sta[1]->fZ, B[2], sta[2]->fZ);
+  fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(),
+                B[0], sta[0]->fZ, B[1], sta[1]->fZ);
+
+  kf::TrackParamBase<float>
+    fit_params;  // = fvTrackParamsTriplets[iGThread];	//TODO: set all to 0 and recheck - done, no difference
+  //  auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()];	//TODO: shared memory cannot be used for classes with preinitialized data
+
+  fit_params.ResetCovMatrix();
+
+  // initial parameters
+  {
+    float dz01      = 1.f / (z[1] - z[0]);
+    fit_params.Tx() = (x[1] - x[0]) * dz01;
+    fit_params.Ty() = (y[1] - y[0]) * dz01;
+    fit_params.Qp() = 0.f;
+    fit_params.Vi() = 0.f;
+  }
+
+  bool not_fitted = false;
+  float qp0       = 0;
+  // repeat several times in order to improve the precision
+  for (int iiter = 0; iiter < 2 /*nIterations*/; ++iiter) {
+    // fit forward
+    {
+      qp0 = fit_params.GetQp();
+      if (qp0 > maxQp) qp0 = maxQp;
+      if (qp0 < -maxQp) qp0 = -maxQp;
+
+      fit_params.Qp()   = 0.f;
+      fit_params.Vi()   = 0.f;
+      fit_params.X()    = x[0];
+      fit_params.Y()    = y[0];
+      fit_params.Z()    = z[0];
+      fit_params.Time() = t[0];
+      fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[0]->timeInfo ? dt2[0] : 1.e6f), 1.e2f);
+      fit_params.C(0, 0) = mxy[0].Dx2();
+      fit_params.C(1, 0) = mxy[0].Dxy();
+      fit_params.C(1, 1) = mxy[0].Dy2();
+
+      fit_params.Ndf()     = -ndfTrackModel + 2;
+      fit_params.NdfTime() = sta[0]->timeInfo ? 0 : -1;
+      //  add the target constraint
+      FilterWithTargetAtLine(&fit_params, &fldTarget);
+      for (int ih = 1; ih < 3; ++ih) {
+        ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
+
+        //	Extrapolate(&fit_params, z[ih], 0.f, &fld);	//TODO: add qp0 instead of 0.f
+
+        int bin       = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY());
+        auto radThick = fMaterialMapTables[bin];
+
+        MultipleScattering(&fit_params, radThick, qp0);  //TODO: why it was commented in original gpu code?
+
+        float Tx    = fit_params.Tx();
+        float Ty    = fit_params.Ty();
+        float txtx  = Tx * Tx;
+        float tyty  = Ty * Ty;
+        float txtx1 = txtx + 1.f;
+        float h     = txtx + tyty;
+        float tt    = xpu::sqrt(txtx1 + tyty);
+        float h2    = h * h;
+        float qp0t  = qp0 * tt;
+        float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
+
+        float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
+        float a  = ((tt
+                    + fParams[fIterationData[0].fIteration].particleMass
+                        * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t)
+                   * radThick * s0 * s0);
+        //        fit_params.C(2, 2) += 0.001;//txtx1 * a;	//TODO: check if it is needed. Switching on leads to the strong ghosts increase
+
+        fit_params.C(3, 2) += Tx * Ty * a;
+        fit_params.C(3, 3) += (1. + tyty) * a;
+
+        EnergyLossCorrection(&fit_params, radThick, -1.f);
+        FilterXY(&fit_params, mxy[ih]);
+        FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
+      }
+    }
+
+    if (iiter == 1) break;
+
+    // fit backward
+    {
+      qp0 = fit_params.GetQp();
+      if (qp0 > maxQp) qp0 = maxQp;
+      if (qp0 < -maxQp) qp0 = -maxQp;
+
+      fit_params.X()    = x[2];
+      fit_params.Y()    = y[2];
+      fit_params.Z()    = z[2];
+      fit_params.Time() = t[2];
+      fit_params.Qp()   = 0.f;
+      fit_params.Vi()   = 0.f;
+
+      fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[2]->timeInfo ? dt2[2] : 1.e6f), 1.e2f);
+
+      fit_params.Ndf()     = -ndfTrackModel + 2;
+      fit_params.NdfTime() = sta[2]->timeInfo ? 0 : -1;
+
+      for (int ih = 1; ih >= 0; --ih) {
+        ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
+        auto radThick = fMaterialMapTables[fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY())];
+        MultipleScattering(&fit_params, radThick, qp0);
+        EnergyLossCorrection(&fit_params, radThick, 1.f);
+        FilterXY(&fit_params, mxy[ih]);
+        FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
+      }
+    }
+  }  // for iiter
+
+  float chi2 = fit_params.GetChiSq() + fit_params.GetChiSqTime();
+
+  if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite(chi2) || chi2 < 0.f
+      || not_fitted) {
+    chi2 = -1.;
+  }
+
+  const float qp  = fit_params.GetQp();
+  const float Cqp = fit_params.C(4, 4) + 0.001;
+  fvTriplets[iGThread] =
+    ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0, chi2, qp, Cqp, fit_params.Tx(),
+                fit_params.C(2, 2), fit_params.Ty(), fit_params.C(3, 3), isMomentumFitted);
+//  delete fit_params;
+#ifdef SORT_TRIPLETS_GPU
+  fTripletSortHelper[iGThread].tr_id = iGThread;
+  fTripletSortHelper[iGThread].lsta  = ista[0];
+  fTripletSortHelper[iGThread].lhit  = ihit[0];
+#endif
+}
+
+#ifdef SORT_TRIPLETS_GPU
+XPU_D void GpuTripletConstructor::SortTripletsFunc(SortTriplets::context& ctx) const
+{
+  //TODO: implement sorting of triplets on GPU or make tracking order independent
+}
+#endif
+
+XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const
+{
+  // Collect hits in the station iSta and make doublets
+  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  kf::TrackParamBase<float> fit_params = fvTrackParams[iHit];
+
+  fit_params.ChiSq() = 0.;
+
+  const float Pick_m22 = fParams[fIterationData[0].fIteration].doubletChi2Cut - fit_params.GetChiSq();
+
+  const float timeError2 = fit_params.C(5, 5);
+  const float time       = fit_params.Time();
+
+  const auto& gpuGrid = fvGpuGrid[iSta];
+
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
+                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
+                          pick_range_x, pick_range_y);
+  unsigned int ih_gpu = 0;
+
+  while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
+                                                                            //    if (fvbHitSuppressed[ih]) {
+                                                                            //      continue;
+                                                                            //    }
+    const ca::Hit& hit = fvHits[ih_gpu];
+
+    // check time-boundaries
+    if ((sta.timeInfo) && (fit_params.NdfTime() >= 0)) {
+      if (xpu::abs(time - hit.T()) > 1.4 * (3.5 * xpu::sqrt(timeError2) + hit.RangeT())) {
+        continue;
+      }
+      // if (fabs(time - hit.T()) > 30) continue;
+    }
+
+    // - check whether hit belong to the window ( track position +\- errors ) -
+
+    // check y-boundaries
+    float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
+    float C11 =
+      fit_params.C(1, 1)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+
+    float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
+
+    const float dY = hit.Y() - y;
+
+    if (xpu::abs(dY) > dy_est) {
+      continue;
+    }
+
+    // check x-boundaries
+    float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
+    float C00 =
+      fit_params.C(0, 0)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+
+    float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
+
+    const float dX = hit.X() - x;
+
+    if (xpu::abs(dX) > dx_est) {
+      continue;
+    }
+
+    // check chi2
+    float C10 = fit_params.C(1, 0)
+                + (hit.Z() - fit_params.Z())
+                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+
+    ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+
+    float chi2x = 0.;
+    {  // filter X measurement
+      const float zeta = x - mxy.X();
+
+      const float F0 = C00;
+      const float F1 = C10;
+
+      const float HCH = F0;
+
+      const float wi     = 1.f / (mxy.Dx2() + HCH);
+      const float zetawi = zeta * wi;
+      chi2x              = mxy.NdfX() * zeta * zetawi;
+
+      const float K1 = F1 * wi;
+
+      x -= F0 * zetawi;
+      y -= F1 * zetawi;
+
+      C00 -= F0 * F0 * wi;
+      C10 -= K1 * F0;
+      C11 -= K1 * F1;
+    }
+
+    float chi2u = 0.;
+    {  // filter U measurement, we need only chi2 here
+      const float cosPhi = -mxy.Dxy() / mxy.Dx2();
+      const float u      = cosPhi * mxy.X() + mxy.Y();
+      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
+
+      const float zeta = cosPhi * x + y - u;
+
+      const float F0 = cosPhi * C00 + C10;
+      const float F1 = cosPhi * C10 + C11;
+
+      const float HCH = (F0 * cosPhi + F1);
+
+      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
+    }
+
+    if (chi2x > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
+      continue;
+    }
+    if (chi2x + chi2u > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
+      continue;
+    }
+    int id                = iThread * maxNHits + *collectedHits;
+    fHitDoublets[id].hit1 = iHit;
+    fHitDoublets[id].hit2 = ih_gpu;
+    *collectedHits += 1;
+    //TODO: add clone hit suppression -> should be done after doublet fitting actually
+  }  // loop over the hits in the area
+}
+
+XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const
+{
+  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  kf::TrackParamBase<float> fit_params = fvTrackParamsDoublets[iThread];
+
+  fit_params.ChiSq() = 0.;
+
+  const float Pick_m22 = fParams[fIterationData[0].fIteration].tripletChi2Cut - fit_params.GetChiSq();
+
+  const float timeError2 = fit_params.C(5, 5);
+  const float time       = fit_params.Time();
+
+  const auto& gpuGrid = fvGpuGrid[iSta];
+
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
+                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
+                          pick_range_x, pick_range_y);
+  unsigned int ih_gpu = 0;
+
+  while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
+                                                                            //    if (fvbHitSuppressed[ih]) {
+                                                                            //      continue;
+                                                                            //    }
+    const ca::Hit& hit = fvHits[ih_gpu];
+
+    // check time-boundaries
+
+    //TODO: move hardcoded cuts to parameters
+    if ((sta.timeInfo) && (fit_params.NdfTime() >= 0)) {
+      if (xpu::abs(time - hit.T()) > 1.4 * (3.5 * xpu::sqrt(timeError2) + hit.RangeT())) {
+        continue;
+      }
+      // if (fabs(time - hit.T()) > 30) continue;
+    }
+
+    // - check whether hit belong to the window ( track position +\- errors ) -
+
+    // check y-boundaries
+    float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
+    float C11 =
+      fit_params.C(1, 1)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+
+    float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
+
+    const float dY = hit.Y() - y;
+
+    if (xpu::abs(dY) > dy_est) {
+      continue;
+    }
+
+    // check x-boundaries
+    float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
+    float C00 =
+      fit_params.C(0, 0)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+
+    float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
+
+    const float dX = hit.X() - x;
+
+    if (fabs(dX) > dx_est) {
+      continue;
+    }
+
+    // check chi2
+    float C10 = fit_params.C(1, 0)
+                + (hit.Z() - fit_params.Z())
+                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+
+    ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+
+    float chi2x = 0.;
+    {  // filter X measurement
+      const float zeta = x - mxy.X();
+
+      const float F0 = C00;
+      const float F1 = C10;
+
+      const float HCH = F0;
+
+      const float wi     = 1.f / (mxy.Dx2() + HCH);
+      const float zetawi = zeta * wi;
+      chi2x              = mxy.NdfX() * zeta * zetawi;
+
+      const float K1 = F1 * wi;
+
+      x -= F0 * zetawi;
+      y -= F1 * zetawi;
+
+      C00 -= F0 * F0 * wi;
+      C10 -= K1 * F0;
+      C11 -= K1 * F1;
+    }
+
+    float chi2u = 0.;
+    {  // filter U measurement, we need only chi2 here
+      const float cosPhi = -mxy.Dxy() / mxy.Dx2();
+      const float u      = cosPhi * mxy.X() + mxy.Y();
+      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
+
+      const float zeta = cosPhi * x + y - u;
+
+      const float F0 = cosPhi * C00 + C10;
+      const float F1 = cosPhi * C10 + C11;
+
+      const float HCH = (F0 * cosPhi + F1);
+
+      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
+    }
+
+    if (chi2x > fParams[fIterationData[0].fIteration].tripletChi2Cut) {
+      continue;
+    }
+    if (chi2x + chi2u > fParams[fIterationData[0].fIteration].tripletChi2Cut) {
+      continue;
+    }
+
+    int id                = iThread * maxNHits + *collectedHits;
+    fHitTriplets[id].hit1 = fHitDoubletsCompressed[iThread].hit1;
+    fHitTriplets[id].hit2 = fHitDoubletsCompressed[iThread].hit2;
+    fHitTriplets[id].hit3 = ih_gpu;
+    *collectedHits += 1;
+  }  // loop over the hits in the area
+  //  if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits);
+}
+
+XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par,
+                                                         const ca::GpuFieldRegion* F0) const
+{
+  // Add the target constraint to a straight line track
+
+  float eXY[2];
+  float Jx[7], Jy[7];
+
+  GetExtrapolatedXYline(tr_par,
+                        /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].GetTargetPositionZ(), F0,
+                        eXY /*, &eY*/, Jx, Jy);
+
+  FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].targetMeasurement),
+                       eXY /*, eY*/, Jx, Jy);
+}
+
+XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(
+  kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,  // float* extrY,
+  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+  float* Jx, float* Jy) const
+{
+  // extrapolate track assuming it is straight (qp==0)
+  // return the extrapolated X, Y and the derivatives of the extrapolated X and Y
+
+  const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(1. / 6.), c12i(1. / 12.);
+
+  const float tx = tr_par->GetTx();
+  const float ty = tr_par->GetTy();
+  const float dz = z - tr_par->GetZ();
+
+  const float dz2     = dz * dz;
+  const float dzc6i   = dz * c6i;
+  const float dz2c12i = dz2 * c12i;
+
+  const float xx = tx * tx;
+  const float yy = ty * ty;
+  const float xy = tx * ty;
+
+  const float Ay = -xx - c1;
+  const float Bx = yy + c1;
+
+  const float ctdz2 = c_light * xpu::sqrt(c1 + xx + yy) * dz2;
+
+  const float Sx = F->cx0 * c2i + F->cx1 * dzc6i + F->cx2 * dz2c12i;
+  const float Sy = F->cy0 * c2i + F->cy1 * dzc6i + F->cy2 * dz2c12i;
+  const float Sz = F->cz0 * c2i + F->cz1 * dzc6i + F->cz2 * dz2c12i;
+
+  extrXY[0] = tr_par->GetX() + tx * dz;
+  extrXY[1] = tr_par->GetY() + ty * dz;
+
+  //  Jx->fill(0.);  // for a case
+  //  Jy->fill(0.);
+
+  Jx[0] = 1.;
+  Jx[1] = 0.;
+  Jx[2] = dz;
+  Jx[3] = 0.;
+  Jx[4] = ctdz2 * (Sx * xy + Sy * Ay + Sz * ty);
+  Jx[5] = 0.;
+  Jx[6] = 0.;
+
+  Jy[0] = 0.;
+  Jy[1] = 1.;
+  Jy[2] = 0.;
+  Jy[3] = dz;
+  Jy[4] = ctdz2 * (Sx * Bx - Sy * xy - Sz * tx);
+  Jy[5] = 0.;
+  //  Jx[6] = 0.;	//TODO: is it a bug???
+  Jy[6] = 0.;
+}
+
+XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
+  kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
+  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+  float* Jx, float* Jy) const
+{
+  // add a 2-D measurenent (x,y) at some z, that differs from fTr.GetZ()
+  // extrX, extrY are extrapolated track parameters at z, Jx, Jy are derivatives of the extrapolation
+
+  // ! it is assumed that in the track covariance matrix all non-diagonal covariances are 0
+  // ! except of C10
+
+  //  TrackParamV& T = fTr;
+
+  //zeta0 = T.x + Jx[2]*T.Tx() + Jx[3]*T.Ty() + Jx[4]*T.qp - x;
+  //zeta1 = T.y + Jy[2]*T.Tx() + Jy[3]*T.Ty() + Jy[4]*T.qp - y;
+
+  const float zeta0 = extrXY[0] - m->X();
+  const float zeta1 = extrXY[1] - m->Y();
+
+  // H = 1 0 Jx[2] Jx[3] Jx[4] 0
+  //     0 1 Jy[2] Jy[3] Jy[4] 0
+
+  // F = CH'
+  const float F00 = tr_par->C(0, 0);
+  const float F01 = tr_par->C(1, 0);  //TODO: is it a bug???
+  const float F10 = F01;              //tr_par->C10();
+  const float F11 = tr_par->C(1, 1);
+  const float F20 = Jx[2] * tr_par->C(2, 2);
+  const float F21 = Jy[2] * tr_par->C(2, 2);
+  const float F30 = Jx[3] * tr_par->C(3, 3);
+  const float F31 = Jy[3] * tr_par->C(3, 3);
+  const float F40 = Jx[4] * tr_par->C(4, 4);
+  const float F41 = Jy[4] * tr_par->C(4, 4);
+
+  // Jx[5,6] and Jy[5,6] are 0.
+
+  float S00 = m->Dx2() + F00 + Jx[2] * F20 + Jx[3] * F30 + Jx[4] * F40;
+  float S10 = m->Dxy() + F10 + Jy[2] * F20 + Jy[3] * F30 + Jy[4] * F40;
+  float S11 = m->Dy2() + F11 + Jy[2] * F21 + Jy[3] * F31 + Jy[4] * F41;
+
+  const float si     = 1. / (S00 * S11 - S10 * S10);
+  const float S00tmp = S00;
+  S00                = si * S11;
+  S10                = -si * S10;
+  S11                = si * S00tmp;
+
+  tr_par->ChiSq() += zeta0 * zeta0 * S00 + 2. * zeta0 * zeta1 * S10 + zeta1 * zeta1 * S11;
+  tr_par->Ndf() += m->NdfX() + m->NdfY();
+
+  const float K00 = F00 * S00 + F01 * S10;
+  const float K01 = F00 * S10 + F01 * S11;
+  const float K10 = F10 * S00 + F11 * S10;
+  const float K11 = F10 * S10 + F11 * S11;
+  const float K20 = F20 * S00 + F21 * S10;
+  const float K21 = F20 * S10 + F21 * S11;
+  const float K30 = F30 * S00 + F31 * S10;
+  const float K31 = F30 * S10 + F31 * S11;
+  const float K40 = F40 * S00 + F41 * S10;
+  const float K41 = F40 * S10 + F41 * S11;
+
+  tr_par->X() -= K00 * zeta0 + K01 * zeta1;
+  tr_par->Y() -= K10 * zeta0 + K11 * zeta1;
+  tr_par->Tx() -= K20 * zeta0 + K21 * zeta1;
+  tr_par->Ty() -= K30 * zeta0 + K31 * zeta1;
+  tr_par->Qp() -= K40 * zeta0 + K41 * zeta1;
+
+  tr_par->C(0, 0) -= (K00 * F00 + K01 * F01);
+  tr_par->C(1, 0) -= (K10 * F00 + K11 * F01);
+  tr_par->C(1, 1) -= (K10 * F10 + K11 * F11);
+  tr_par->C(2, 0) = -(K20 * F00 + K21 * F01);
+  tr_par->C(2, 1) = -(K20 * F10 + K21 * F11);
+  tr_par->C(2, 2) -= (K20 * F20 + K21 * F21);
+  tr_par->C(3, 0) = -(K30 * F00 + K31 * F01);
+  tr_par->C(3, 1) = -(K30 * F10 + K31 * F11);
+  tr_par->C(3, 2) = -(K30 * F20 + K31 * F21);
+  tr_par->C(3, 3) -= (K30 * F30 + K31 * F31);
+  tr_par->C(4, 0) = -(K40 * F00 + K41 * F01);
+  tr_par->C(4, 1) = -(K40 * F10 + K41 * F11);
+  tr_par->C(4, 2) = -(K40 * F20 + K41 * F21);
+  tr_par->C(4, 3) = -(K40 * F30 + K41 * F31);
+  tr_par->C(4, 4) -= (K40 * F40 + K41 * F41);
+}
+
+XPU_D void GpuTripletConstructor::MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const
+{
+  float tx    = tr_par->Tx();
+  float ty    = tr_par->Ty();
+  float txtx  = tx * tx;
+  float tyty  = ty * ty;
+  float txtx1 = txtx + 1.f;
+  float h     = txtx + tyty;
+  float t     = xpu::sqrt(txtx1 + tyty);
+  float h2    = h * h;
+  float qp0t  = qp0 * t;
+
+  float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
+
+  float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
+  //  float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
+  float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t)
+             * radThick * s0 * s0);
+  tr_par->C(2, 2) += txtx1 * a;
+  tr_par->C(3, 2) += tx * ty * a;
+  tr_par->C(3, 3) += (1. + tyty) * a;
+}
+
+XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
+                                                  const ca::GpuFieldRegion* Field) const
+{
+  const float c_light = 0.000299792458;
+
+  //----------------------------------------------------------------
+  const float h = z_out - tr_par->GetZ();
+
+  const float stepDz[5] = {0., 0., h * 0.5f, h * 0.5f, h};
+
+  float f[5][7]    = {{0.}};    // ( d*/dz  ) [step]
+  float F[5][7][7] = {{{0.}}};  // ( d *new [step] / d *old  )
+
+  //   Runge-Kutta steps
+  //
+  const float r0[7] = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()};
+  float R0[7][7]    = {{0.}};
+  for (int i = 0; i < 7; ++i) {
+    R0[i][i] = 1.;
+  }
+
+  for (int step = 1; step <= 4; ++step) {
+
+    float rstep[7] = {0.};
+    for (int i = 0; i < 7; ++i) {
+      rstep[i] = r0[i] + stepDz[step] * f[step - 1][i];
+    }
+    const float z       = tr_par->GetZ() + stepDz[step];
+    ca::GpuFieldValue B = Field->Get(rstep[0], rstep[1], z);
+    const float tx2     = rstep[2] * rstep[2];
+    const float ty2     = rstep[3] * rstep[3];
+    const float txty    = rstep[2] * rstep[3];
+    const float L2      = 1.f + tx2 + ty2;
+    const float L2i     = 1.f / L2;
+    const float L       = xpu::sqrt(L2);
+    const float cL      = c_light * L;
+    const float cLqp0   = cL * qp0;
+
+    f[step][0]    = rstep[2];
+    F[step][0][2] = 1.;
+
+    f[step][1]    = rstep[3];
+    F[step][1][3] = 1.;
+
+    const float f2tmp = txty * B.x - (1. + tx2) * B.y + rstep[3] * B.z;
+    f[step][2]        = cLqp0 * f2tmp;
+
+    F[step][2][2] = cLqp0 * (rstep[2] * f2tmp * L2i + rstep[3] * B.x - 2. * rstep[2] * B.y);
+    F[step][2][3] = cLqp0 * (rstep[3] * f2tmp * L2i + rstep[2] * B.x + B.z);
+    F[step][2][4] = cL * f2tmp;
+
+    const float f3tmp = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x;
+    f[step][3]        = cLqp0 * f3tmp;
+    F[step][3][2]     = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z);
+    F[step][3][3]     = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y);
+    F[step][3][4]     = cL * f3tmp;
+
+    f[step][4] = 0.;
+
+    //    if (fDoFitVelocity) {	//TODO: check and switch it on back if needed (for triplets, probably, not needed)
+    ////      fvec vi       = rstep[6];
+    f[step][5]     = rstep[6] * L;
+    const float Li = 1.f / L;
+    F[step][5][2]  = rstep[6] * rstep[2] * Li;  // / L;
+    F[step][5][3]  = rstep[6] * rstep[3] * Li;  // / L;
+    F[step][5][4]  = 0.;
+    F[step][5][5]  = 0.;
+    F[step][5][6]  = L;
+    //    }
+    //    else {
+    //      const float vi       = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+    //      f[step][5]    = rstep[6] * L;
+    //      F[step][5][2] = rstep[6] * rstep[2] / L;
+    //      F[step][5][3] = rstep[6] * rstep[3] / L;
+    //      F[step][5][4] =
+    //	  fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L /
+    //	  xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+    //      F[step][5][5] = 0.;
+    //      F[step][5][6] = 0.;
+    //    }
+
+    f[step][6] = 0.;
+  }  // end of Runge-Kutta step
+
+  float r[7]    = {0.};    // extrapolated parameters
+  float R[7][7] = {{0.}};  // Jacobian of the extrapolation
+
+  float stepW[5];  // = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.};
+  stepW[0] = 0.;
+  stepW[1] = h / 6.f;
+  stepW[2] = h / 3.f;
+  stepW[3] = h / 3.f;
+  stepW[4] = h / 6.f;
+  //  const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f};
+  //  const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f};	//TODO: test constants: leads to a drop in efficiency
+
+  float k[5][7][7] = {{0.}};
+  for (int step = 1; step <= 4; ++step) {
+    for (int i = 0; i < 7; i++) {
+      for (int j = 0; j < 7; j++) {
+        k[step][i][j] = F[step][i][j];
+        for (int m = 0; m < 7; m++) {
+          k[step][i][j] += stepDz[step] * F[step][i][m] * k[step - 1][m][j];
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      R[i][j] = R0[i][j];
+      for (int step = 1; step <= 4; step++) {
+        R[i][j] += stepW[step] * k[step][i][j];
+      }
+    }
+  }
+
+  const float dqp = tr_par->Qp() - qp0;
+
+  for (int i = 0; i < 7; i++) {
+    r[i] = r0[i];
+    for (int step = 1; step <= 4; step++) {
+      r[i] += stepW[step] * f[step][i];
+    }
+    // take into account linearisation at fQp0
+    r[i] += R[i][4] * dqp;
+  }
+
+  // update parameters
+
+  tr_par->X()    = r[0];
+  tr_par->Y()    = r[1];
+  tr_par->Tx()   = r[2];
+  tr_par->Ty()   = r[3];
+  tr_par->Qp()   = r[4];
+  tr_par->Time() = r[5];
+  tr_par->Vi()   = r[6];
+
+  //fTr.Vi()( fTr.Vi() < fvec(TrackParamV::kClightNsInv) ) = fvec(TrackParamV::kClightNsInv);
+  tr_par->Z() = z_out;  //zMasked;
+
+  //          covariance matrix transport
+  float C[7][7];
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      C[i][j] = tr_par->C(i, j);
+    }
+  }
+  float RC[7][7];
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      RC[i][j] = 0.;
+      for (int m = 0; m < 7; m++) {
+        RC[i][j] += R[i][m] * C[m][j];
+      }
+    }
+  }
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      float Cij = 0.f;
+      for (int m = 0; m < 7; m++) {
+        Cij += RC[i][m] * R[j][m];
+      }
+      tr_par->C(i, j) = Cij;
+    }
+  }
+}
+
+XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const
+{
+  // extrapolate the track assuming no field
+
+  const float dz = zOut - tr_par->GetZ();
+
+  const float tx = tr_par->GetTx();
+  const float ty = tr_par->GetTy();
+  const float vi = tr_par->GetVi();
+
+  const float L = xpu::sqrt(1.f + tx * tx + ty * ty);
+
+  const float j52 = dz * tx * vi / L;
+  const float j53 = dz * ty * vi / L;
+  const float j56 = dz * L;
+
+  // transport parameters
+
+  tr_par->X() += tx * dz;
+  tr_par->Y() += ty * dz;
+  tr_par->Time() += L * vi * dz;
+  tr_par->Z() = zOut;
+
+  // transport covariance matrix (see details in CaTrackFit)
+
+  const float jc00 = tr_par->C(0, 0) + dz * tr_par->C(2, 0);
+  const float jc02 = tr_par->C(0, 2) + dz * tr_par->C(2, 2);
+
+  const float jc10 = tr_par->C(1, 0) + dz * tr_par->C(3, 0);
+  const float jc11 = tr_par->C(1, 1) + dz * tr_par->C(3, 1);
+  const float jc12 = tr_par->C(1, 2) + dz * tr_par->C(3, 2);
+  const float jc13 = tr_par->C(1, 3) + dz * tr_par->C(3, 3);
+
+  const float jc50 = tr_par->C(5, 0) + j52 * tr_par->C(2, 0) + j53 * tr_par->C(3, 0) + j56 * tr_par->C(6, 0);
+  const float jc51 = tr_par->C(5, 1) + j52 * tr_par->C(2, 1) + j53 * tr_par->C(3, 1) + j56 * tr_par->C(6, 1);
+  const float jc52 = tr_par->C(5, 2) + j52 * tr_par->C(2, 2) + j53 * tr_par->C(3, 2) + j56 * tr_par->C(6, 2);
+  const float jc53 = tr_par->C(5, 3) + j52 * tr_par->C(2, 3) + j53 * tr_par->C(3, 3) + j56 * tr_par->C(6, 3);
+  const float jc54 = tr_par->C(5, 4) + j52 * tr_par->C(2, 4) + j53 * tr_par->C(3, 4) + j56 * tr_par->C(6, 4);
+  const float jc55 = tr_par->C(5, 5) + j52 * tr_par->C(2, 5) + j53 * tr_par->C(3, 5) + j56 * tr_par->C(6, 5);
+  const float jc56 = tr_par->C(5, 6) + j52 * tr_par->C(2, 6) + j53 * tr_par->C(3, 6) + j56 * tr_par->C(6, 6);
+
+  tr_par->C(0, 0) = jc00 + jc02 * dz;
+  tr_par->C(1, 0) = jc10 + jc12 * dz;
+  //  tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz;
+  //  tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz;
+  //  tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz;
+  tr_par->C(2, 0) += tr_par->C(2, 2) * dz;
+  tr_par->C(3, 0) += tr_par->C(3, 2) * dz;
+  tr_par->C(4, 0) += tr_par->C(4, 2) * dz;
+  tr_par->C(5, 0) = jc50 + jc52 * dz;
+  //  tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz;
+  tr_par->C(6, 0) += tr_par->C(6, 2) * dz;
+
+  tr_par->C(1, 1) = jc11 + jc13 * dz;
+  //  tr_par->C21() = tr_par->C21() + tr_par->C23() * dz;
+  //  tr_par->C31() = tr_par->C31() + tr_par->C33() * dz;
+  //  tr_par->C41() = tr_par->C41() + tr_par->C43() * dz;
+  tr_par->C(2, 1) += tr_par->C(2, 3) * dz;
+  tr_par->C(3, 1) += tr_par->C(3, 3) * dz;
+  tr_par->C(4, 1) += tr_par->C(4, 3) * dz;
+  tr_par->C(5, 1) = jc51 + jc53 * dz;
+  //  tr_par->C61() = tr_par->C61() + tr_par->C63() * dz;
+  tr_par->C(6, 1) += tr_par->C(6, 3) * dz;
+
+  tr_par->C(5, 2) = jc52;
+  tr_par->C(5, 3) = jc53;
+  tr_par->C(5, 4) = jc54;
+  tr_par->C(5, 5) = jc55 + jc52 * j52 + jc53 * j53 + jc56 * j56;
+  //  tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56;
+  tr_par->C(6, 5) += tr_par->C(6, 2) * j52 + tr_par->C(6, 3) * j53 + tr_par->C(6, 6) * j56;
+}
+
+XPU_D void GpuTripletConstructor::FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const
+{
+  {
+    ca::MeasurementU<float> mx;
+    mx.SetCosPhi(1.f);
+    mx.SetSinPhi(0.f);
+    mx.SetU(mxy.X());
+    mx.SetDu2(mxy.Dx2());
+    mx.SetNdf(mxy.NdfX());
+
+    ca::MeasurementU<float> mu;
+    mu.SetCosPhi(-mxy.Dxy() / mxy.Dx2());
+    mu.SetSinPhi(1.f);
+    mu.SetU(mu.CosPhi() * mxy.X() + mxy.Y());
+    mu.SetDu2(mxy.Dy2() - mxy.Dxy() * mxy.Dxy() / mxy.Dx2());
+    mu.SetNdf(mxy.NdfY());
+
+    Filter1d(tr_par, mx);
+    Filter1d(tr_par, mu);
+    return;
+  }
+}
+
+XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, const ca::MeasurementU<float>& m) const
+{
+  float zeta, HCH;
+  float F0, F1, F2, F3, F4, F5, F6;
+  float K1, K2, K3, K4, K5, K6;
+
+  zeta = m.CosPhi() * tr_par->X() + m.SinPhi() * tr_par->Y() - m.U();
+
+  // F = CH'
+  F0 = m.CosPhi() * tr_par->C(0, 0) + m.SinPhi() * tr_par->C(1, 0);
+  F1 = m.CosPhi() * tr_par->C(1, 0) + m.SinPhi() * tr_par->C(1, 1);
+
+  HCH = F0 * m.CosPhi() + F1 * m.SinPhi();
+
+  F2 = m.CosPhi() * tr_par->C(2, 0) + m.SinPhi() * tr_par->C(2, 1);
+  F3 = m.CosPhi() * tr_par->C(3, 0) + m.SinPhi() * tr_par->C(3, 1);
+  F4 = m.CosPhi() * tr_par->C(4, 0) + m.SinPhi() * tr_par->C(4, 1);
+  F5 = m.CosPhi() * tr_par->C(5, 0) + m.SinPhi() * tr_par->C(5, 1);
+  F6 = m.CosPhi() * tr_par->C(6, 0) + m.SinPhi() * tr_par->C(6, 1);
+
+  const bool maskDoFilter = (HCH < m.Du2() * 16.f);
+
+  // correction to HCH is needed for the case when sigma2 is so small
+  // with respect to HCH that it disappears due to the roundoff error
+  //    fvec wi     = fMaskF / (m.Du2() + fvec(1.0000001) * HCH);
+  //    fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH);
+
+  //TODO: (wi, zetawi): GPU results are different from CPU ones and depends on the order of the operations
+  //TODO: need to compare effects made on the tracking efficiency and parameters quality
+  float wi           = 1.f / (m.Du2() + 1.0000001f * HCH);
+  const float zetawi = zeta / (maskDoFilter ? m.Du2() : 0.f + HCH);
+
+  wi = m.Du2() > 0.f ? wi : 0.f;
+
+  tr_par->ChiSq() += m.Ndf() * zeta * zeta * wi;
+  tr_par->Ndf() += m.Ndf();
+
+  K1 = F1 * wi;
+  K2 = F2 * wi;
+  K3 = F3 * wi;
+  K4 = F4 * wi;
+  K5 = F5 * wi;
+  K6 = F6 * wi;
+
+  tr_par->X() -= F0 * zetawi;
+  tr_par->Y() -= F1 * zetawi;
+  tr_par->Tx() -= F2 * zetawi;
+  tr_par->Ty() -= F3 * zetawi;
+  tr_par->Qp() -= F4 * zetawi;
+  tr_par->Time() -= F5 * zetawi;
+  tr_par->Vi() -= F6 * zetawi;
+
+  tr_par->C(0, 0) -= F0 * F0 * wi;
+  tr_par->C(1, 0) -= K1 * F0;
+  tr_par->C(1, 1) -= K1 * F1;
+
+  tr_par->C(2, 0) -= K2 * F0;
+  tr_par->C(2, 1) -= K2 * F1;
+  tr_par->C(2, 2) -= K2 * F2;
+
+  tr_par->C(3, 0) -= K3 * F0;
+  tr_par->C(3, 1) -= K3 * F1;
+  tr_par->C(3, 2) -= K3 * F2;
+  tr_par->C(3, 3) -= K3 * F3;
+
+  tr_par->C(4, 0) -= K4 * F0;
+  tr_par->C(4, 1) -= K4 * F1;
+  tr_par->C(4, 2) -= K4 * F2;
+  tr_par->C(4, 3) -= K4 * F3;
+  tr_par->C(4, 4) -= K4 * F4;
+
+  tr_par->C(5, 0) -= K5 * F0;
+  tr_par->C(5, 1) -= K5 * F1;
+  tr_par->C(5, 2) -= K5 * F2;
+  tr_par->C(5, 3) -= K5 * F3;
+  tr_par->C(5, 4) -= K5 * F4;
+  tr_par->C(5, 5) -= K5 * F5;
+
+  tr_par->C(6, 0) -= K6 * F0;
+  tr_par->C(6, 1) -= K6 * F1;
+  tr_par->C(6, 2) -= K6 * F2;
+  tr_par->C(6, 3) -= K6 * F3;
+  tr_par->C(6, 4) -= K6 * F4;
+  tr_par->C(6, 5) -= K6 * F5;
+  tr_par->C(6, 6) -= K6 * F6;
+}
+
+//TODO: update this function
+XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2,
+                                             float timeInfo) const
+{
+  // filter track with a time measurement
+
+  // F = CH'
+  const float F0 = tr_par->C(5, 0);
+  const float F1 = tr_par->C(5, 1);
+  const float F2 = tr_par->C(5, 2);
+  const float F3 = tr_par->C(5, 3);
+  const float F4 = tr_par->C(5, 4);
+  const float F5 = tr_par->C(5, 5);
+  const float F6 = tr_par->C(6, 5);
+
+  const float HCH = F5;  //tr_par->C55();
+
+  // when dt0 is much smaller than current time error,
+  // set track time exactly to the measurement value without filtering
+  // it helps to keep the initial time errors reasonably small
+  // the calculations in the covariance matrix are not affected
+
+  const bool maskDoFilter = (HCH < dt2 * 16.f) && timeInfo;
+
+  float w = 1.f;
+
+  if (timeInfo <= 0.f) w = 0.f;
+
+  //  float wi = 1.f / (dt2 + 1.0000001f * HCH);
+  float hch_d = 1.0000001f * HCH;
+  float den   = dt2 + hch_d;
+  float wi    = w / den;
+
+  const float zeta   = tr_par->Time() - t;
+  const float zetawi = w * zeta / (maskDoFilter ? dt2 : 0.f + HCH);
+
+  tr_par->ChiSqTime() += maskDoFilter ? (zeta * zeta * wi) : 0.f;
+  tr_par->NdfTime() += w;
+
+  const float K1 = F1 * wi;
+  const float K2 = F2 * wi;
+  const float K3 = F3 * wi;
+  const float K4 = F4 * wi;
+  const float K5 = F5 * wi;
+  const float K6 = F6 * wi;
+
+  tr_par->X() -= F0 * zetawi;
+  tr_par->Y() -= F1 * zetawi;
+  tr_par->Tx() -= F2 * zetawi;
+  tr_par->Ty() -= F3 * zetawi;
+  tr_par->Qp() -= F4 * zetawi;
+  tr_par->Time() -= F5 * zetawi;
+  tr_par->Vi() -= F6 * zetawi;
+
+  tr_par->C(0, 0) -= F0 * F0 * wi;
+
+  tr_par->C(1, 0) -= K1 * F0;
+  tr_par->C(1, 1) -= K1 * F1;
+
+  tr_par->C(2, 0) -= K2 * F0;
+  tr_par->C(2, 1) -= K2 * F1;
+  tr_par->C(2, 2) -= K2 * F2;
+
+  tr_par->C(3, 0) -= K3 * F0;
+  tr_par->C(3, 1) -= K3 * F1;
+  tr_par->C(3, 2) -= K3 * F2;
+  tr_par->C(3, 3) -= K3 * F3;
+
+  tr_par->C(4, 0) -= K4 * F0;
+  tr_par->C(4, 1) -= K4 * F1;
+  tr_par->C(4, 2) -= K4 * F2;
+  tr_par->C(4, 3) -= K4 * F3;
+  tr_par->C(4, 4) -= K4 * F4;
+
+  tr_par->C(5, 0) -= K5 * F0;
+  tr_par->C(5, 1) -= K5 * F1;
+  tr_par->C(5, 2) -= K5 * F2;
+  tr_par->C(5, 3) -= K5 * F3;
+  tr_par->C(5, 4) -= K5 * F4;
+  tr_par->C(5, 5) -= K5 * F5;
+
+  tr_par->C(6, 0) -= K6 * F0;
+  tr_par->C(6, 1) -= K6 * F1;
+  tr_par->C(6, 2) -= K6 * F2;
+  tr_par->C(6, 3) -= K6 * F3;
+  tr_par->C(6, 4) -= K6 * F4;
+  tr_par->C(6, 5) -= K6 * F5;
+  tr_par->C(6, 6) -= K6 * F6;
+}
+
+XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick,
+                                                       float upstreamDirection) const
+{
+  const float qp02 = xpu::max(tr_par->Qp() * tr_par->Qp(), 0.01f);
+  const float p2   = 1.f / qp02;
+  //  const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass;	//TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result
+  const float mass2   = fParams_const[0].particleMass * fParams_const[0].particleMass;
+  const float r_mass2 = 1.f / mass2;
+  const float E2      = mass2 + p2;
+
+  const float bg2 = p2 * r_mass2;
+  //ApproximateBetheBloch
+  const float kp0 = 2.33f;
+  const float kp1 = 0.20f;
+  const float kp2 = 3.00f;
+  const float kp3 = 173e-9f;
+  const float kp4 = 0.49848f;
+
+  const float mK   = 0.307075e-3f;
+  const float _2me = 1.022e-3f;
+  const float rho  = kp0;
+  const float x0   = kp1 * 2.303f;
+  const float x1   = kp2 * 2.303f;
+  const float mI   = kp3;
+  const float mZA  = kp4;
+  const float maxT = _2me * bg2;
+
+  //*** Density effect
+  float d2         = 0.f;
+  const float x    = 0.5f * xpu::log(bg2);
+  const float lhwI = xpu::log(28.816f * 1e-9f * xpu::sqrt(rho * mZA) / mI);
+
+  if (x > x1) {
+    d2 = lhwI + x - 0.5f;
+  }
+  if ((x > x0) && (x1 > x)) {
+    const float r = (x1 - x) / (x1 - x0);
+    d2            = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r;
+  }
+
+  const float bethe =
+    mK * mZA * (1.f + bg2) / bg2 * (0.5f * xpu::log(_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2);
+
+  const float tr = xpu::sqrt(1.f + tr_par->Tx() * tr_par->Tx() + tr_par->Ty() * tr_par->Ty());
+
+  const float dE = bethe * radThick * tr * 2.33f * 9.34961f;
+
+  const float ECorrected  = xpu::sqrt(E2) + upstreamDirection * dE;
+  const float E2Corrected = ECorrected * ECorrected;
+
+  float corr = xpu::sqrt(p2 / (E2Corrected - mass2));
+  if (xpu::isnan(corr) || xpu::isinf(corr)) {
+    corr = 1.f;
+  }
+
+  tr_par->Qp() *= corr;
+  tr_par->C(4, 0) *= corr;
+  tr_par->C(4, 1) *= corr;
+  tr_par->C(4, 2) *= corr;
+  tr_par->C(4, 3) *= corr;
+  tr_par->C(4, 4) *= corr * corr;
+  tr_par->C(5, 4) *= corr;
+}
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.h b/algo/ca/core/experimental/CaGpuTripletConstructor.h
new file mode 100644
index 0000000000..722d520c9c
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.h
@@ -0,0 +1,284 @@
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaDeviceImage.h"
+#include "CaGpuGrid.h"
+#include "CaGpuGridArea.h"
+#include "CaGpuMaterialMap.h"
+#include "CaGpuParameters.h"
+#include "CaHit.h"
+#include "CaMeasurementU.h"
+#include "CaMeasurementXy.h"
+#include "CaTriplet.h"
+
+#include <xpu/device.h>
+#include <xpu/host.h>
+
+namespace cbm::algo
+{
+  static constexpr int kMaxNofStations = 24;
+  // Block sizes / other compile time constants that need tuning
+  enum CaGpuConstants
+  {
+#if XPU_IS_CUDA
+    kSingletConstructorBlockSize      = 512,
+    kSingletConstructorItemsPerThread = 8,
+#else  // HIP, values ignored on CPU
+    kSingletConstructorBlockSize      = 64,  //1024,	//128 is optimal (preliminary tests)
+    kSingletConstructorItemsPerThread = 6,
+    kSortTripletsBlockSize            = 64,
+    kSortTripletsItemsPerThread       = 6,
+#endif
+  };
+}  // namespace cbm::algo
+
+namespace cbm::algo::ca
+{
+  //  class Framework;
+  class GpuTripletConstructor;
+
+  struct TrackingParams {
+  };
+
+  struct Params : xpu::constant<ca::GPUReco, TrackingParams> {
+  };
+
+  // Declare Constant Memory
+  struct strGpuTripletConstructor : xpu::constant<GPUReco, GpuTripletConstructor> {
+  };
+
+  struct hit_doublets {
+    ca::HitIndex_t hit1;
+    ca::HitIndex_t hit2;
+  };
+
+  struct hit_triplets {
+    ca::HitIndex_t hit1;
+    ca::HitIndex_t hit2;
+    ca::HitIndex_t hit3;
+  };
+
+  struct sort_triplets {
+    unsigned int tr_id;
+    unsigned int lsta;
+    unsigned int lhit;
+  };
+
+  struct shared_counters {
+    int counter;
+  };
+
+  struct shared_data {
+    //    ca::GpuStation fStations_shared[kMaxNofStations];
+    ca::GpuFieldRegion fld0_shared[kSingletConstructorBlockSize];
+    ca::GpuFieldRegion fld1_shared[kSingletConstructorBlockSize];
+  };
+
+  struct shared_data_fit_triplets {
+    //    ca::GpuStation fStations_shared[kMaxNofStations];
+    kf::TrackParamBase<float> fit_params_shared[kSingletConstructorBlockSize];
+  };
+
+  //  struct TestFunc : xpu::kernel<GPUReco> {
+  //    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+  //    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
+  //    using context       = xpu::kernel_context<shared_memory, constants>;
+  //    XPU_D void operator()(context&);
+  //  };
+
+  // Declare Kernels
+  struct MakeSinglets : xpu::kernel<GPUReco> {
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+    using shared_memory = shared_data;
+    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct MakeDoublets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_counters;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct CompressDoublets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct FitDoublets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct MakeTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_counters;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct CompressTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct FitTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_data_fit_triplets;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+//#define SORT_TRIPLETS_GPU
+#ifdef SORT_TRIPLETS_GPU
+  struct SortTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSortTripletsBlockSize>;
+    using sort_t =
+      xpu::block_sort<unsigned int, /*ca::Triplet*/ sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>;
+    using shared_memory = sort_t::storage_t;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+#endif
+
+  struct IterationData {
+    int fNHits;              ///< Number of hits in grid
+    int fIteration;          ///< Iteration number
+    int fNDoublets;          ///< Number of doublets
+    int fNDoublets_counter;  ///< Number of doublets counter
+    int fNTriplets;          ///< Number of triplets
+    int fNTriplets_counter;  ///< Number of triplets counter
+  };
+
+  class GpuTripletConstructor {
+   public:
+    ///                             ------  FUNCTIONAL PART ------
+
+    //    XPU_D void TestFunc(TestFunc::context&) const;
+
+    XPU_D void MakeSinglets(MakeSinglets::context&) const;
+
+    XPU_D void MakeDoublets(MakeDoublets::context&) const;
+
+    XPU_D void CompressDoublets(CompressDoublets::context&) const;
+
+    XPU_D void FitDoublets(FitDoublets::context&) const;
+
+    XPU_D void MakeTriplets(MakeTriplets::context&) const;
+
+    XPU_D void CompressTriplets(CompressTriplets::context&) const;
+
+    XPU_D void FitTriplets(FitTriplets::context&) const;
+#ifdef SORT_TRIPLETS_GPU
+    XPU_D void SortTripletsFunc(SortTriplets::context&) const;
+#endif
+
+   private:
+    XPU_D void CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const;
+
+    XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits,
+                                   int* collectedHits) const;  //TODO: merge with CollectHits
+
+
+    ///                             ------  FITTING FINCTIONALITY ------ //TODO: check and switch to the standard CaTrackFit if possible
+    XPU_D void FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const;
+
+    /// extrapolate track as a line, return the extrapolated X, Y and the Jacobians
+    XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F,
+                                     float* extrXY, float* Jx, float* Jy) const;
+
+    XPU_D void MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const;
+
+    /// extrapolate the track to the given Z using the field F
+    /// it does extrapolation in one step
+    XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
+                               const ca::GpuFieldRegion* Field) const;
+
+    /// filter the track with the XY measurement placed at different Z
+    /// \param m - measurement
+    /// \param extrX - extrapolated X of the track
+    /// \param extrY - extrapolated Y of the track
+    /// \param Jx - Jacobian of the extrapolated X
+    /// \param Jy - Jacobian of the extrapolated Y
+    XPU_D void FilterExtrapolatedXY(
+      kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
+      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const;
+      float* Jx, float* Jy) const;
+
+    XPU_D void ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const;
+
+    XPU_D void FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const;
+
+    XPU_D void Filter1d(kf::TrackParamBase<float>* tr_par, const ca::MeasurementU<float>& m) const;
+
+    XPU_D void FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, float timeInfo) const;
+
+    XPU_D void EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick, float upstreamDirection) const;
+
+    ///                          ------  DATA MEMBERS ------
+   public:
+    xpu::buffer<ca::GpuGrid> fvGpuGrid;  ///< Grid
+
+    xpu::buffer<unsigned int> fgridFirstBinEntryIndex;  ///< Index of the first entry in the grid
+
+    xpu::buffer<unsigned int> fgridEntries;  ///< Entries in the grid	//TODO: check if it should be ca::HitIndex_t
+
+    ///Material map
+    xpu::buffer<ca::GpuMaterialMap> fMaterialMap;  ///< Material map base objects
+
+    xpu::buffer<float> fMaterialMapTables;  ///< Material map tables
+
+    xpu::buffer<IterationData> fIterationData;  ///< Temporary data
+
+    //    xpu::buffer<unsigned char> fvbHitSuppressed;  ///< Hit suppression flags
+
+    xpu::buffer<hit_doublets> fHitDoublets;  ///< Hit doublets
+
+    xpu::buffer<hit_doublets> fHitDoubletsCompressed;  ///< Hit doublets compressed
+
+    xpu::buffer<hit_triplets> fHitTriplets;  ///< Hit triplets
+
+    xpu::buffer<hit_triplets> fHitTripletsCompressed;  ///< Hit triplets compressed
+
+    /// \brief Hits of the current time window
+    ///
+    /// It is a portion of fInputData to process in the current time window
+    /// hit.Id is replaced by the hit index in fInputData
+    xpu::buffer<ca::Hit> fvHits;
+
+    xpu::buffer<kf::TrackParamBase<float>> fvTrackParams;          ///< Track parameters
+    xpu::buffer<kf::TrackParamBase<float>> fvTrackParamsDoublets;  ///< Track parameters for doublets
+    //    xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets;  ///< Track parameters for triplets
+
+    xpu::buffer<ca::Triplet> fvTriplets;  ///< Triplets
+
+#ifdef SORT_TRIPLETS_GPU
+    xpu::buffer<sort_triplets> fTripletSortHelper;  ///< Helper for sorting triplets
+    xpu::buffer<sort_triplets> fTripletSortHelperTmp;
+    xpu::buffer<sort_triplets*> dst;
+#endif
+
+    //    GpuParameters fParams;
+    xpu::buffer<GpuParameters> fParams;
+
+    int fIteration;
+
+    GpuParameters fParams_const[4];
+
+    ca::GpuStation fStations_const[kMaxNofStations];
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementU.cxx b/algo/ca/core/experimental/CaMeasurementU.cxx
new file mode 100644
index 0000000000..05e72e07be
--- /dev/null
+++ b/algo/ca/core/experimental/CaMeasurementU.cxx
@@ -0,0 +1,40 @@
+/* Copyright (C) 2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov, Sergei Zharko [committer] */
+
+/// \file CaMeasurementU.cxx
+/// \brief Implementation of the CaMeasurementU class
+
+#include "CaMeasurementU.h"
+
+#include <iomanip>
+#include <sstream>
+
+namespace cbm::algo::ca
+{
+
+  //----------------------------------------------------------------------------------------------------------------------
+  //
+  template<typename DataT>
+  std::string MeasurementU<DataT>::ToString(int indentLevel) const
+  {
+    std::stringstream aStream{};
+    // TODO: possibly it is better to place the indentChar into ca::Parameters (S.Zharko)
+    constexpr char indentChar = '\t';
+    std::string indent(indentLevel, indentChar);
+    aStream << indent << "cos(phi):    " << std::setw(12) << std::setfill(' ') << CosPhi() << '\n';
+    aStream << indent << "sin(phi):    " << std::setw(12) << std::setfill(' ') << SinPhi() << '\n';
+    aStream << indent << "u:           " << std::setw(12) << std::setfill(' ') << U() << '\n';
+    aStream << indent << "du2:         " << std::setw(12) << std::setfill(' ') << Du2() << '\n';
+    aStream << indent << "ndf:         " << std::setw(12) << std::setfill(' ') << Ndf() << '\n';
+    return aStream.str();
+  }
+
+  ///----------------------------------------------------------------------------------------------------------------------
+  /// All necessary instantiations of the template class
+
+  template class MeasurementU<fvec>;
+  template class MeasurementU<float>;
+  template class MeasurementU<double>;
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementU.h b/algo/ca/core/experimental/CaMeasurementU.h
new file mode 100644
index 0000000000..3fb46f6a98
--- /dev/null
+++ b/algo/ca/core/experimental/CaMeasurementU.h
@@ -0,0 +1,126 @@
+/* Copyright (C) 2007-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov [committer], Igor Kulakov, Sergei Zharko */
+
+/// \file CaMeasurementU
+/// \brief Definition of the CaMeasurementU class
+
+#pragma once  // include this header only once per compilation unit
+
+
+#include "CaDefs.h"
+#include "CaSimd.h"
+#include "KfUtils.h"
+
+#include <boost/serialization/access.hpp>
+
+#include <string>
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  /// \brief The class describes a 1D - measurement U in XY coordinate system
+  ///
+  /// The measurement is defined as
+  ///
+  /// u = x * cos(phi) + y * sin(phi)
+  ///
+  /// where phi is the azimuthal angle of the measurement direction
+  /// and x, y are the coordinates of the measurement
+  /// The measurement may be scaled, i.e. cos^2 + sin^2 must not necessarily be equal to 1
+  /// The measurement has a finite resolution, i.e. the measurement is not a point, but a distribution
+  /// with a certain rms.
+  /// The measurement may be used in the chi2 calculation or not
+  /// The measurement may be a SIMD vector of values, when DataT is fvec type
+  ///
+  template<typename DataT>
+  class MeasurementU {
+
+   public:
+    friend class boost::serialization::access;
+
+    /// default constructor
+    XPU_D MeasurementU() = default;
+
+    /// constructor
+    /// \param cosPhi   cos(phi)
+    /// \param sinPhi   sin(phi)
+    /// \param u        measurement, u = x * cos(phi) + y * sin(phi)
+    /// \param du2      rms^2 of the measurement
+    /// \param ndf      number of degrees of freedom (used for chi2 calculation)
+    ///                 if ndf == 1, the measurement is used in the chi2 calculation
+    ///                 if ndf == 0, the measurement is not used in the chi2 calculation
+    MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf)
+      : fCosPhi(cosPhi)
+      , fSinPhi(sinPhi)
+      , fU(u)
+      , fDu2(du2)
+      , fNdf(ndf)
+    {
+    }
+
+    ///------------------------------
+    /// Setters and getters
+
+    XPU_D void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; }
+    XPU_D void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; }
+    XPU_D void SetU(DataT u) { fU = u; }
+    XPU_D void SetDu2(DataT du2) { fDu2 = du2; }
+    XPU_D void SetNdf(DataT ndf) { fNdf = ndf; }
+
+    XPU_D DataT CosPhi() const { return fCosPhi; }
+    XPU_D DataT SinPhi() const { return fSinPhi; }
+    XPU_D DataT U() const { return fU; }
+    XPU_D DataT Du2() const { return fDu2; }
+    XPU_D DataT Ndf() const { return fNdf; }
+
+    ///------------------------------
+    /// Methods for debugging
+
+    /// String representation of class contents
+    /// \param indentLevel      number of indent characters in the output
+    std::string ToString(int indentLevel = 0) const;
+
+    /// Checks, if all fields are finite
+    bool IsFinite() const
+    {
+      return (kf::utils::IsFinite(CosPhi()) && kf::utils::IsFinite(SinPhi()) && kf::utils::IsFinite(U())
+              && kf::utils::IsFinite(Du2()) && kf::utils::IsFinite(Ndf()));
+    }
+
+    /// Checks, if some fields are undefined
+    bool IsUndefined() const
+    {
+      return (kf::utils::IsUndefined(CosPhi()) || kf::utils::IsUndefined(SinPhi()) || kf::utils::IsUndefined(U())
+              || kf::utils::IsUndefined(Du2()) || kf::utils::IsUndefined(Ndf()));
+    }
+
+    /// Serialization function
+    friend class boost::serialization::access;
+    template<class Archive>
+    void serialize(Archive& ar, const unsigned int)
+    {
+      ar& fCosPhi;
+      ar& fSinPhi;
+      ar& fU;
+      ar& fDu2;
+      ar& fNdf;
+    }
+
+   private:
+    ///------------------------------
+    /// Data members
+
+    DataT fCosPhi = constants::Undef<DataT>;  ///< cos(phi)
+    DataT fSinPhi = constants::Undef<DataT>;  ///< sin(phi)
+    DataT fU      = constants::Undef<DataT>;  ///< measurement, u = x * cos(phi) + y * sin(phi)
+    DataT fDu2    = constants::Undef<DataT>;  ///< rms^2 of the measurement
+
+    /// \brief number of degrees of freedom (used for chi2 calculation)
+    /// if ndf == 1, the measurement is used in the chi2 calculation
+    /// if ndf == 0, the measurement is not used in the chi2 calculation
+    DataT fNdf = constants::Undef<DataT>;
+  } _fvecalignment;
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementXy.cxx b/algo/ca/core/experimental/CaMeasurementXy.cxx
new file mode 100644
index 0000000000..ebd861fd38
--- /dev/null
+++ b/algo/ca/core/experimental/CaMeasurementXy.cxx
@@ -0,0 +1,42 @@
+/* Copyright (C) 2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov, Sergei Zharko [committer] */
+
+/// \file CaMeasurementXy.cxx
+/// \brief Implementation of the CaMeasurementXy class
+
+#include "CaMeasurementXy.h"
+
+#include <iomanip>
+#include <sstream>  // for stringstream
+
+namespace cbm::algo::ca
+{
+
+  //----------------------------------------------------------------------------------------------------------------------
+  //
+  template<typename DataT>
+  std::string MeasurementXy<DataT>::ToString(int indentLevel) const
+  {
+    std::stringstream aStream{};
+    // TODO: possibly it is better to place the indentChar into ca::Parameters (S.Zharko)
+    constexpr char indentChar = '\t';
+    std::string indent(indentLevel, indentChar);
+    aStream << indent << "x:    " << std::setw(12) << std::setfill(' ') << X() << '\n';
+    aStream << indent << "y:    " << std::setw(12) << std::setfill(' ') << Y() << '\n';
+    aStream << indent << "dx2:  " << std::setw(12) << std::setfill(' ') << Dx2() << '\n';
+    aStream << indent << "dy2:  " << std::setw(12) << std::setfill(' ') << Dy2() << '\n';
+    aStream << indent << "dxy:  " << std::setw(12) << std::setfill(' ') << Dxy() << '\n';
+    aStream << indent << "ndfX: " << std::setw(12) << std::setfill(' ') << NdfX() << '\n';
+    aStream << indent << "ndfY: " << std::setw(12) << std::setfill(' ') << NdfY() << '\n';
+    return aStream.str();
+  }
+
+  ///----------------------------------------------------------------------------------------------------------------------
+  /// All necessary instantiations of the template class
+
+  template class MeasurementXy<fvec>;
+  template class MeasurementXy<float>;
+  template class MeasurementXy<double>;
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementXy.h b/algo/ca/core/experimental/CaMeasurementXy.h
new file mode 100644
index 0000000000..7841ec5f2c
--- /dev/null
+++ b/algo/ca/core/experimental/CaMeasurementXy.h
@@ -0,0 +1,205 @@
+/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov [committer], Sergei Zharko */
+
+/// \file CaMeasurementXy.h
+/// \brief Definition of the CaMeasurementXy class
+
+#pragma once  // include this header only once per compilation unit
+
+
+#include "CaDefs.h"
+#include "CaSimd.h"
+#include "KfUtils.h"
+
+#include <string>
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+
+  /// \brief The class describes a 2D - measurement (x, y) in XY coordinate system
+  ///
+  /// The measurement has a finite resolution, i.e. the measurement is not a point, but a distribution
+  /// with a certain rms.
+  /// The measurement components may be used in the chi2 calculation or not
+  /// The measurement may be a SIMD vector of values, when DataT is fvec type
+  ///
+  template<typename DataT>
+  class MeasurementXy {
+
+   public:
+    friend class boost::serialization::access;
+
+    /// default constructor
+    MeasurementXy() = default;
+
+    /// constructor
+    /// \param x        x coordinate of the measurement
+    /// \param y        y coordinate of the measurement
+    /// \param dx2      rms^2 of the x coordinate measurement
+    /// \param dy2      rms^2 of the y coordinate measurement
+    /// \param dxy      covariance of the x and y coordinate measurements
+    /// \param ndfX     number of degrees of freedom for the x coordinate measurement
+    ///                 if ndfX == 1, the measurement is used in fit and in the chi2 calculation
+    ///                 if ndfX == 0, the measurement is used in fit, but not used in the chi2 calculation
+    /// \param ndfY     number of degrees of freedom for the y coordinate measurement
+    ///                 if ndfY == 1, the measurement is used in fit and in the chi2 calculation
+    ///                 if ndfY == 0, the measurement is used in fit, but not used in the chi2 calculation
+    XPU_D MeasurementXy(DataT x, DataT y, DataT dx2, DataT dy2, DataT dxy, DataT ndfX, DataT ndfY)
+      : fX(x)
+      , fY(y)
+      , fDx2(dx2)
+      , fDy2(dy2)
+      , fDxy(dxy)
+      , fNdfX(ndfX)
+      , fNdfY(ndfY)
+    {
+    }
+
+
+    /// Set all SIMD entries from all SIMD entries of the other class
+    /// It works for scalar and fvec types,
+    /// except of the case when DataT is scalar and TdataB is fvec.
+    template<typename DataTb>
+    void Set(const MeasurementXy<DataTb>& m)
+    {
+      CopyBase<DataTb, true, true>(0, m, 0);
+    }
+
+    /// Set all SIMD entries from one SIMD entry of the other class
+    /// It also works when DataT is scalar
+    void Set(const MeasurementXy<fvec>& m, const int im) { CopyBase<fvec, true, false>(0, m, im); }
+
+    /// Set one SIMD entry from one SIMD entry of the other class
+    /// It only works when DataT is fvec, TdataB is scalar
+    template<typename DataTb>
+    void SetOneEntry(const int i, const MeasurementXy<DataTb>& m)
+    {
+      CopyBase<DataTb, false, true>(i, m, 0);
+    }
+
+    /// Set one SIMD entry from one SIMD entry of the other class
+    /// It only works when DataT is fvec, TdataB is fvec
+    void SetOneEntry(const int i, const MeasurementXy<fvec>& m, const int im)
+    {
+      CopyBase<fvec, false, false>(i, m, im);
+    }
+
+
+    ///------------------------------
+    /// Setters and getters
+
+    void SetX(DataT x) { fX = x; }
+    void SetY(DataT y) { fY = y; }
+    void SetDx2(DataT dx2) { fDx2 = dx2; }
+    void SetDy2(DataT dy2) { fDy2 = dy2; }
+    void SetDxy(DataT dxy) { fDxy = dxy; }
+    void SetNdfX(DataT ndfX) { fNdfX = ndfX; }
+    void SetNdfY(DataT ndfY) { fNdfY = ndfY; }
+    void SetCov(DataT dx2, DataT dxy, DataT dy2)
+    {
+      fDx2 = dx2;
+      fDxy = dxy;
+      fDy2 = dy2;
+    }
+
+    XPU_D DataT X() const { return fX; }
+    XPU_D DataT Y() const { return fY; }
+    XPU_D DataT Dx2() const { return fDx2; }
+    XPU_D DataT Dy2() const { return fDy2; }
+    XPU_D DataT Dxy() const { return fDxy; }
+    XPU_D DataT NdfX() const { return fNdfX; }
+    XPU_D DataT NdfY() const { return fNdfY; }
+
+    ///------------------------------
+    /// references, to ease assignment to SIMD vector components when DataT has fvec type
+
+    DataT& X() { return fX; }
+    DataT& Y() { return fY; }
+    DataT& Dx2() { return fDx2; }
+    DataT& Dy2() { return fDy2; }
+    DataT& Dxy() { return fDxy; }
+    DataT& NdfX() { return fNdfX; }
+    DataT& NdfY() { return fNdfY; }
+
+    ///------------------------------
+    /// Methods for debugging
+
+    /// String representation of class contents
+    /// \param indentLevel      number of indent characters in the output
+    std::string ToString(int indentLevel = 0) const;
+
+    /// Checks, if all fields are finite
+    bool IsFinite() const
+    {
+      return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2())
+              && kf::utils::IsFinite(Dy2()) && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX())
+              && kf::utils::IsFinite(NdfY()));
+    }
+
+    /// Checks, if some fields are undefined
+    bool IsUndefined() const
+    {
+      return (kf::utils::IsUndefined(X()) || kf::utils::IsUndefined(Y()) || kf::utils::IsUndefined(Dx2())
+              || kf::utils::IsUndefined(Dy2()) || kf::utils::IsUndefined(Dxy()) || kf::utils::IsUndefined(NdfX())
+              || kf::utils::IsUndefined(NdfY()));
+    }
+
+   private:
+    /// \brief Copies all/one entries from the other class
+    /// \tparam TdataB  Type of the other class
+    /// \tparam TDoAllA  If true, all entries of the current class must be set
+    /// \tparam TDoAllB  If true, all entries of the other class must be used
+    /// \param ia  Index of SIMD vector element of the current class
+    /// \param Tb  Other class
+    /// \param ib  Index of SIMD vector element of the other class
+    template<typename TdataB, bool TDoAllA, bool TDoAllB>
+    void CopyBase(const int ia, const MeasurementXy<TdataB>& Tb, const int ib);
+
+   private:
+    ///------------------------------
+    /// Data members
+
+    // Initializing parameters with NANs spoils the track fit where
+    // the masked-out SIMD entries are suppressed by a multication by 0.
+    // Therefore, we initialize the data members here with finite numbers.
+    // For the numerical safety, with some reasonable numbers.
+
+    DataT fX;    //{0.};    ///< x coordinate of the measurement
+    DataT fY;    //{0.};    ///< y coordinate of the measurement
+    DataT fDx2;  //{1.};  ///< rms^2 of the x coordinate measurement
+    DataT fDy2;  //{1.};  ///< rms^2 of the y coordinate measurement
+    DataT fDxy;  //{0.};  ///< covariance of the x and y coordinate measurements
+
+    /// number of degrees of freedom (used for chi2 calculation)
+    /// if ndf == 1, the measurement is used in the chi2 calculation
+    /// if ndf == 0, the measurement is not used in the chi2 calculation
+
+    DataT fNdfX;  // = 0.;  ///< ndf for the x coordinate measurement
+    DataT fNdfY;  // = 0.;  ///< ndf for the y coordinate measurement
+
+  } _fvecalignment;
+
+
+  // ---------------------------------------------------------------------------------------------------------------------
+  //
+  template<typename TdataA>
+  template<typename TdataB, bool TDoAllA, bool TDoAllB>
+  inline void MeasurementXy<TdataA>::CopyBase(const int ia, const MeasurementXy<TdataB>& Tb, const int ib)
+  {
+    auto copy = [&](TdataA& a, const TdataB& b) {
+      kf::utils::VecCopy<TdataA, TdataB, TDoAllA, TDoAllB>::CopyEntries(a, ia, b, ib);
+    };
+
+    copy(fX, Tb.X());
+    copy(fY, Tb.Y());
+    copy(fDx2, Tb.Dx2());
+    copy(fDy2, Tb.Dy2());
+    copy(fDxy, Tb.Dxy());
+    copy(fNdfX, Tb.NdfX());
+    copy(fNdfY, Tb.NdfY());
+  }  // CopyBase
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.cxx b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
index cce227803a..4f0696514e 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.cxx
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
@@ -127,20 +127,48 @@ namespace cbm::algo::ca
     PrepareGrid(input.GetHits(), wData);
     frMonitorData.StopTimer(ETimer::PrepareGrid);
 
+#ifdef GPU_TRACKING
+    // XPU initialization
+    // - temporary solution for tests only
+    // - should not be initialized here
+    setenv("XPU_PROFILE", "1", 1);
+    xpu::settings settings;
+    //    settings.device = "cpu0";
+    settings.device = "hip0";
+    //    settings.verbose = true;
+    xpu::initialize(settings);
+
+    // Set up environment for GPU tracking
+    xpu::push_timer("gpuTFinit");
+    ca::GpuTrackFinderSetup gpuTrackFinderSetup(wData, fParameters);
+    xpu::timings gpuTFinit = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: Initialization: " << gpuTFinit.wall() << " ms";
+
+    SetupGpuTrackFinder(gpuTrackFinderSetup);
+    size_t iter_num = 0;
+#endif
+
     // Run CA iterations
     frMonitorData.StartTimer(ETimer::FindTracks);
     auto& caIterations = fParameters.GetCAIterations();
     for (auto iter = caIterations.begin(); iter != caIterations.end(); ++iter) {
-
       // ----- Prepare iteration
       frMonitorData.StartTimer(ETimer::PrepareIteration);
       PrepareCAIteration(*iter, wData, iter == caIterations.begin());
       frMonitorData.StopTimer(ETimer::PrepareIteration);
 
+#ifdef GPU_TRACKING
+      // ----- Triplets construction on GPU -----
+      frMonitorData.StartTimer(ETimer::ConstructTriplets);
+      ConstructTripletsGPU(wData, gpuTrackFinderSetup, iter_num);
+      iter_num++;
+      frMonitorData.StopTimer(ETimer::ConstructTriplets);
+#else
       // ----- Triplets construction -----
       frMonitorData.StartTimer(ETimer::ConstructTriplets);
       ConstructTriplets(wData);
       frMonitorData.StopTimer(ETimer::ConstructTriplets);
+#endif
 
       // ----- Search for neighbouring triplets -----
       frMonitorData.StartTimer(ETimer::SearchNeighbours);
@@ -332,7 +360,6 @@ namespace cbm::algo::ca
         staPattern.push_back(std::make_pair(1 + i, 2 + gap));
       }
     }
-
     for (int istal = fParameters.GetNstationsActive() - 2; istal >= wData.CurrentIteration()->GetFirstStationIndex();
          istal--) {
       // start with downstream chambers
@@ -760,4 +787,100 @@ namespace cbm::algo::ca
     }      // level = 0
   }
 
+#ifdef GPU_TRACKING
+  void TrackFinderWindow::SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup)
+  {
+    xpu::push_timer("SetupParametersTime");
+    gpuTrackFinderSetup.SetupParameters();
+    xpu::timings SetupParametersTime = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: SetupParameters: " << SetupParametersTime.wall() << " ms";
+
+    xpu::push_timer("SetInputDataTime");
+    gpuTrackFinderSetup.SetInputData();
+    xpu::timings SetInputDataTime = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: SetInputData: " << SetInputDataTime.wall() << " ms";
+
+    xpu::push_timer("SetupMaterialMapTime");
+    gpuTrackFinderSetup.SetupMaterialMap();
+    xpu::timings SetupMaterialMapTime = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: SetupMaterialMap: " << SetupMaterialMapTime.wall() << " ms";
+  }
+
+  void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup,
+                                               int iteration)
+  {
+    xpu::push_timer("SetupGridTime");
+    gpuTrackFinderSetup.SetupGrid();
+    xpu::timings SetupGridTime = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: SetupGrid: " << SetupGridTime.wall() << " ms";
+
+    xpu::push_timer("SetupIterationDataTime");
+    gpuTrackFinderSetup.SetupIterationData(iteration);
+    xpu::timings SetupIterationDataTime = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: SetupIterationData: " << SetupIterationDataTime.wall() << " ms";
+
+    xpu::push_timer("RunGpuTrackingSetup");
+    gpuTrackFinderSetup.RunGpuTrackingSetup();
+    xpu::timings RunGpuTrackingSetup = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: RunGpuTracking: " << RunGpuTrackingSetup.wall() << " ms";
+
+#define SORT_TRIPLETS_CPU
+#ifdef SORT_TRIPLETS_CPU
+    xpu::push_timer("SortTriplets");
+    std::vector<std::array<unsigned int, 2>> triplet_sort;
+    triplet_sort.reserve(gpuTrackFinderSetup.GetNofTriplets());
+
+    for (unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
+      if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0.) continue;
+      const auto& triplet = gpuTrackFinderSetup.GetTriplet(itr);
+      triplet_sort.push_back({triplet.GetLHit(), itr});
+    }
+    std::sort(triplet_sort.begin(), triplet_sort.end(),
+              [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) { return (a[0] < b[0]); });
+    xpu::timings SortTriplets = xpu::pop_timer();
+    LOG(info) << "GPU tracking :: SortTripletsCPU: " << SortTriplets.wall() << " ms";
+#endif
+
+    int ihitl_0 = -1;
+
+    auto& [vHitFirstTriplet, vHitNofTriplets, vTriplets] = fTripletData;
+    vHitFirstTriplet.reset(wData.Hits().size(), 0);  /// link hit -> first triplet { hit, *, *}
+    vHitNofTriplets.reset(wData.Hits().size(), 0);   /// link hit ->n triplets { hit, *, *}
+
+    // prepare triplet storage
+    for (int j = 0; j < fParameters.GetNstationsActive(); j++) {
+      const size_t nHitsStation = wData.TsHitIndices(j).size();
+      vTriplets[j].clear();
+      vTriplets[j].reserve(2 * nHitsStation);
+    }
+
+#ifdef SORT_TRIPLETS_CPU
+    for (size_t itr = 0; itr < triplet_sort.size(); itr++) {
+      int iSta  = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort[itr])).GetLSta();
+      int ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort[itr])).GetLHit();
+#else
+    for (int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
+      int iSta  = gpuTrackFinderSetup.GetTriplet(itr).GetLSta();
+      int ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit();
+#endif
+      const size_t oldSize = vTriplets[iSta].size();
+#ifdef SORT_TRIPLETS_CPU
+      vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort[itr])));
+      if (ihitl == ihitl_0) {
+        vHitNofTriplets[ihitl]++;
+      }
+      else {
+        vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
+        vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
+      }
+      ihitl_0 = ihitl;
+#else
+      if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0.)
+        vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(itr));
+      vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
+      vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
+#endif
+    }
+  }
+#endif
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.h b/algo/ca/core/tracking/CaTrackFinderWindow.h
index c08ea5570a..fdae1253ef 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.h
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.h
@@ -20,6 +20,11 @@
 #include "CaVector.h"
 #include "CaWindowData.h"
 
+//#define GPU_TRACKING
+#ifdef GPU_TRACKING
+#include "CaGpuTrackFinderSetup.h"
+#endif
+
 namespace cbm::algo::ca
 {
 
@@ -90,6 +95,12 @@ namespace cbm::algo::ca
 
     bool checkTripletMatch(const ca::Triplet& l, const ca::Triplet& r, fscal& dchi2, WindowData& wData) const;
 
+#ifdef GPU_TRACKING
+    void SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup);
+
+    void ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, int iteration);
+#endif
+
     // ** Functions, which pack and unpack indexes of station and triplet **
 
     // TODO: move to ca::Triplet class (S.Zharko)
diff --git a/algo/kf/core/CMakeLists.txt b/algo/kf/core/CMakeLists.txt
index 0c2168e867..cbd05a33d1 100644
--- a/algo/kf/core/CMakeLists.txt
+++ b/algo/kf/core/CMakeLists.txt
@@ -56,6 +56,7 @@ target_link_libraries(KfCore
                       OnlineDataLog          # needed for the logger
                       external::fles_logging # needed for the logger
                       external::fles_ipc     # needed for the logger
+                      xpu
              PRIVATE  Boost::serialization
                       fmt::fmt
                       external::yaml-cpp
@@ -78,6 +79,7 @@ if (NOT CBM_ONLINE_STANDALONE)
                 PUBLIC  Vc::Vc
                         OnlineDataLog          # needed for the logger?
                         FairLogger::FairLogger
+                        xpu
                PRIVATE  Boost::serialization
                         fmt::fmt
                         external::yaml-cpp
diff --git a/algo/kf/core/data/KfTrackParam.h b/algo/kf/core/data/KfTrackParam.h
index d439bbd8c8..6d2183e9fd 100644
--- a/algo/kf/core/data/KfTrackParam.h
+++ b/algo/kf/core/data/KfTrackParam.h
@@ -19,6 +19,8 @@
 
 #include <string>
 
+#include <xpu/device.h>
+
 namespace cbm::algo::kf
 {
   /// \class cbm::algo::kf::TrackParamBase
@@ -201,49 +203,49 @@ namespace cbm::algo::kf
     ///  Some of them involve calculations
 
     /// \brief Gets z position [cm]
-    T GetZ() const { return fZ; }
+    XPU_D T GetZ() const { return fZ; }
 
     /// \brief Gets x position [cm]
-    T GetX() const { return fX; }
+    XPU_D T GetX() const { return fX; }
 
     /// \brief Gets x position error [cm]
-    T GetXError() const { return sqrt(C00()); }
+    XPU_D T GetXError() const { return sqrt(C00()); }
 
     /// \brief Gets y position [cm]
-    T GetY() const { return fY; }
+    XPU_D T GetY() const { return fY; }
 
     /// \brief Gets y position error [cm]
-    T GetYError() const { return sqrt(C11()); }
+    XPU_D T GetYError() const { return sqrt(C11()); }
 
     /// \brief Gets slope along x-axis
-    T GetTx() const { return fTx; }
+    XPU_D T GetTx() const { return fTx; }
 
     /// \brief Gets error of slope along x-axis
-    T GetTxError() const { return sqrt(C22()); }
+    XPU_D T GetTxError() const { return sqrt(C22()); }
 
     /// \brief Gets slope along y-axis
-    T GetTy() const { return fTy; }
+    XPU_D T GetTy() const { return fTy; }
 
     /// \brief Gets error of slope along y-axis
-    T GetTyError() const { return sqrt(C33()); }
+    XPU_D T GetTyError() const { return sqrt(C33()); }
 
     /// \brief Gets charge over momentum [ec/GeV]
-    T GetQp() const { return fQp; }
+    XPU_D T GetQp() const { return fQp; }
 
     /// \brief Gets error of charge over momentum [ec/GeV]
-    T GetQpError() const { return sqrt(C44()); }
+    XPU_D T GetQpError() const { return sqrt(C44()); }
 
     /// \brief Gets time [ns]
-    T GetTime() const { return fT; }
+    XPU_D T GetTime() const { return fT; }
 
     /// \brief Gets time error [ns]
-    T GetTimeError() const { return sqrt(C55()); }
+    XPU_D T GetTimeError() const { return sqrt(C55()); }
 
     /// \brief Gets inverse velocity [ns/cm] in downstream direction
-    T GetVi() const { return fVi; }
+    XPU_D T GetVi() const { return fVi; }
 
     /// \brief Gets inverse velocity error [ns/cm]
-    T GetViError() const { return sqrt(C66()); }
+    XPU_D T GetViError() const { return sqrt(C66()); }
 
     /// \brief Gets covariance matrix
     const CovMatrix_t& GetCovMatrix() const { return fCovMatrix; }
@@ -252,49 +254,49 @@ namespace cbm::algo::kf
     /// \param i row
     /// \param j column
     /// \return covariance matrix element
-    T GetCovariance(int i, int j) const { return C(i, j); }
+    XPU_D T GetCovariance(int i, int j) const { return C(i, j); }
 
     /// Gets Chi-square of track fit model
-    T GetChiSq() const { return fChiSq; }
+    XPU_D T GetChiSq() const { return fChiSq; }
 
     /// Gets NDF of track fit model
-    T GetNdf() const { return fNdf; }
+    XPU_D T GetNdf() const { return fNdf; }
 
     /// Gets Chi-square of time measurements
-    T GetChiSqTime() const { return fChiSqTime; }
+    XPU_D T GetChiSqTime() const { return fChiSqTime; }
 
     /// Gets NDF of time measurements
-    T GetNdfTime() const { return fNdfTime; }
+    XPU_D T GetNdfTime() const { return fNdfTime; }
 
     /// Gets charge
-    T GetCharge() const { return utils::iif(GetQp() > T(0.), T(1.), T(-1.)); }
+    XPU_D T GetCharge() const { return utils::iif(GetQp() > T(0.), T(1.), T(-1.)); }
 
     /// \brief Gets azimuthal angle [rad]
-    T GetPhi() const { return atan2(GetTy(), GetTx()); }
+    XPU_D T GetPhi() const { return atan2(GetTy(), GetTx()); }
 
     /// \brief Gets azimuthal angle error [rad]
-    T GetPhiError() const;
+    XPU_D T GetPhiError() const;
 
     /// \brief Gets polar angle [rad]
-    T GetTheta() const { return atan(sqrt(GetTx() * GetTx() + GetTy() * GetTy())); }
+    XPU_D T GetTheta() const { return atan(sqrt(GetTx() * GetTx() + GetTy() * GetTy())); }
 
     /// \brief Gets polar angle error [rad]
-    T GetThetaError() const;
+    XPU_D T GetThetaError() const;
 
     /// Gets momentum [GeV/ec]. For the straight tracks returns 1.e4 [GeV/ec]
-    T GetP() const { return utils::iif(utils::fabs(GetQp()) > T(1.e-4), T(1.) / utils::fabs(GetQp()), T(1.e4)); }
+    XPU_D T GetP() const { return utils::iif(utils::fabs(GetQp()) > T(1.e-4), T(1.) / utils::fabs(GetQp()), T(1.e4)); }
 
     /// Gets z-component of the momentum [GeV/ec]
-    T GetPz() const { return GetP() / sqrt(T(1.) + GetTx() * GetTx() + GetTy() * GetTy()); }
+    XPU_D T GetPz() const { return GetP() / sqrt(T(1.) + GetTx() * GetTx() + GetTy() * GetTy()); }
 
     /// Gets x-component of the momentum [GeV/ec]
-    T GetPx() const { return GetPz() * GetTx(); }
+    XPU_D T GetPx() const { return GetPz() * GetTx(); }
 
     /// Gets y-component of the momentum [GeV/ec]
-    T GetPy() const { return GetPz() * GetTy(); }
+    XPU_D T GetPy() const { return GetPz() * GetTy(); }
 
     /// Gets transverse momentum
-    T GetPt() const
+    XPU_D T GetPt() const
     {
       T t2 = GetTx() * GetTx() + GetTy() * GetTy();
       return GetP() * sqrt(t2 / (T(1.) + t2));
@@ -305,43 +307,46 @@ namespace cbm::algo::kf
     ///
 
     /// \brief Sets z position [cm]
-    void SetZ(T v) { fZ = v; }
+    XPU_D void SetZ(T v) { fZ = v; }
 
     /// \brief Sets x position [cm]
-    void SetX(T v) { fX = v; }
+    XPU_D void SetX(T v) { fX = v; }
 
     /// \brief Sets y position [cm]
-    void SetY(T v) { fY = v; }
+    XPU_D void SetY(T v) { fY = v; }
 
     /// \brief Sets slope along x-axis
-    void SetTx(T v) { fTx = v; }
+    XPU_D void SetTx(T v) { fTx = v; }
 
     /// \brief Sets slope along y-axis
-    void SetTy(T v) { fTy = v; }
+    XPU_D void SetTy(T v) { fTy = v; }
 
     /// \brief Sets charge over momentum [ec/GeV]
-    void SetQp(T v) { fQp = v; }
+    XPU_D void SetQp(T v) { fQp = v; }
 
     /// \brief Sets time [ns]
-    void SetTime(T v) { fT = v; }
+    XPU_D void SetTime(T v) { fT = v; }
 
     /// \brief Sets inverse velocity [ns/cm]
-    void SetVi(T v) { fVi = v; }
+    XPU_D void SetVi(T v) { fVi = v; }
 
     /// \brief Sets Chi-square of track fit model
-    void SetChiSq(T v) { fChiSq = v; }
+    XPU_D void SetChiSq(T v) { fChiSq = v; }
 
     /// \brief Sets NDF of track fit model
-    void SetNdf(T v) { fNdf = v; }
+    XPU_D void SetNdf(T v) { fNdf = v; }
 
     /// \brief Sets Chi-square of time measurements
-    void SetChiSqTime(T v) { fChiSqTime = v; }
+    XPU_D void SetChiSqTime(T v) { fChiSqTime = v; }
 
     /// \brief Sets NDF of time measurements
-    void SetNdfTime(T v) { fNdfTime = v; }
+    XPU_D void SetNdfTime(T v) { fNdfTime = v; }
 
     /// \brief Sets covariance matrix
-    void SetCovMatrix(const CovMatrix_t& val) { fCovMatrix = val; }
+    XPU_D void SetCovMatrix(const CovMatrix_t& val) { fCovMatrix = val; }
+
+    /// \brief Sets all elements of the covariance matrix to zero
+    XPU_D void ResetCovMatrix() { memset(fCovMatrix.begin(), 0., kNcovParam * sizeof(T)); }
 
     /// \brief Get covariance matrix element
     /// \param i row
@@ -358,7 +363,7 @@ namespace cbm::algo::kf
     /// \param j column
     /// \param val matrix element
     template<int i, int j>
-    void SetCovariance(T val)
+    XPU_D void SetCovariance(T val)
     {
       constexpr int ind = (j <= i) ? i * (1 + i) / 2 + j : j * (1 + j) / 2 + i;
       fCovMatrix[ind]   = val;
@@ -366,34 +371,34 @@ namespace cbm::algo::kf
 
     /// \brief Individual setters for covariance matrix elements
     ///
-    void SetC00(T val) { SetCovariance<0, 0>(val); }
-    void SetC10(T val) { SetCovariance<1, 0>(val); }
-    void SetC11(T val) { SetCovariance<1, 1>(val); }
-    void SetC20(T val) { SetCovariance<2, 0>(val); }
-    void SetC21(T val) { SetCovariance<2, 1>(val); }
-    void SetC22(T val) { SetCovariance<2, 2>(val); }
-    void SetC30(T val) { SetCovariance<3, 0>(val); }
-    void SetC31(T val) { SetCovariance<3, 1>(val); }
-    void SetC32(T val) { SetCovariance<3, 2>(val); }
-    void SetC33(T val) { SetCovariance<3, 3>(val); }
-    void SetC40(T val) { SetCovariance<4, 0>(val); }
-    void SetC41(T val) { SetCovariance<4, 1>(val); }
-    void SetC42(T val) { SetCovariance<4, 2>(val); }
-    void SetC43(T val) { SetCovariance<4, 3>(val); }
-    void SetC44(T val) { SetCovariance<4, 4>(val); }
-    void SetC50(T val) { SetCovariance<5, 0>(val); }
-    void SetC51(T val) { SetCovariance<5, 1>(val); }
-    void SetC52(T val) { SetCovariance<5, 2>(val); }
-    void SetC53(T val) { SetCovariance<5, 3>(val); }
-    void SetC54(T val) { SetCovariance<5, 4>(val); }
-    void SetC55(T val) { SetCovariance<5, 5>(val); }
-    void SetC60(T val) { SetCovariance<6, 0>(val); }
-    void SetC61(T val) { SetCovariance<6, 1>(val); }
-    void SetC62(T val) { SetCovariance<6, 2>(val); }
-    void SetC63(T val) { SetCovariance<6, 3>(val); }
-    void SetC64(T val) { SetCovariance<6, 4>(val); }
-    void SetC65(T val) { SetCovariance<6, 5>(val); }
-    void SetC66(T val) { SetCovariance<6, 6>(val); }
+    XPU_D void SetC00(T val) { SetCovariance<0, 0>(val); }
+    XPU_D void SetC10(T val) { SetCovariance<1, 0>(val); }
+    XPU_D void SetC11(T val) { SetCovariance<1, 1>(val); }
+    XPU_D void SetC20(T val) { SetCovariance<2, 0>(val); }
+    XPU_D void SetC21(T val) { SetCovariance<2, 1>(val); }
+    XPU_D void SetC22(T val) { SetCovariance<2, 2>(val); }
+    XPU_D void SetC30(T val) { SetCovariance<3, 0>(val); }
+    XPU_D void SetC31(T val) { SetCovariance<3, 1>(val); }
+    XPU_D void SetC32(T val) { SetCovariance<3, 2>(val); }
+    XPU_D void SetC33(T val) { SetCovariance<3, 3>(val); }
+    XPU_D void SetC40(T val) { SetCovariance<4, 0>(val); }
+    XPU_D void SetC41(T val) { SetCovariance<4, 1>(val); }
+    XPU_D void SetC42(T val) { SetCovariance<4, 2>(val); }
+    XPU_D void SetC43(T val) { SetCovariance<4, 3>(val); }
+    XPU_D void SetC44(T val) { SetCovariance<4, 4>(val); }
+    XPU_D void SetC50(T val) { SetCovariance<5, 0>(val); }
+    XPU_D void SetC51(T val) { SetCovariance<5, 1>(val); }
+    XPU_D void SetC52(T val) { SetCovariance<5, 2>(val); }
+    XPU_D void SetC53(T val) { SetCovariance<5, 3>(val); }
+    XPU_D void SetC54(T val) { SetCovariance<5, 4>(val); }
+    XPU_D void SetC55(T val) { SetCovariance<5, 5>(val); }
+    XPU_D void SetC60(T val) { SetCovariance<6, 0>(val); }
+    XPU_D void SetC61(T val) { SetCovariance<6, 1>(val); }
+    XPU_D void SetC62(T val) { SetCovariance<6, 2>(val); }
+    XPU_D void SetC63(T val) { SetCovariance<6, 3>(val); }
+    XPU_D void SetC64(T val) { SetCovariance<6, 4>(val); }
+    XPU_D void SetC65(T val) { SetCovariance<6, 5>(val); }
+    XPU_D void SetC66(T val) { SetCovariance<6, 6>(val); }
 
     ///---------------------------------------------------------------------------------------------------------------------
     /// References to parameters to ease the math formulae
@@ -401,49 +406,49 @@ namespace cbm::algo::kf
     ///
 
     /// \brief Reference to z position [cm]
-    T& Z() { return fZ; }
+    XPU_D T& Z() { return fZ; }
 
     /// \brief Reference to x position [cm]
-    T& X() { return fX; }
+    XPU_D T& X() { return fX; }
 
     /// \brief Reference to y position [cm]
-    T& Y() { return fY; }
+    XPU_D T& Y() { return fY; }
 
     /// \brief Reference to slope along x-axis
-    T& Tx() { return fTx; }
+    XPU_D T& Tx() { return fTx; }
 
     /// \brief Reference to slope along y-axis
-    T& Ty() { return fTy; }
+    XPU_D T& Ty() { return fTy; }
 
     /// \brief Reference to charge over momentum [ec/GeV]
-    T& Qp() { return fQp; }
+    XPU_D T& Qp() { return fQp; }
 
     /// \brief Reference to time [ns]
-    T& Time() { return fT; }
+    XPU_D T& Time() { return fT; }
 
     /// \brief Reference to inverse velocity [ns/cm]
-    T& Vi() { return fVi; }
+    XPU_D T& Vi() { return fVi; }
 
     /// \brief Reference to covariance matrix
     CovMatrix_t& CovMatrix() { return fCovMatrix; }
 
     /// \brief Reference to Chi-square of track fit model
-    T& ChiSq() { return fChiSq; }
+    XPU_D T& ChiSq() { return fChiSq; }
 
     /// \brief Reference to NDF of track fit model
-    T& Ndf() { return fNdf; }
+    XPU_D T& Ndf() { return fNdf; }
 
     /// \brief Reference to Chi-square of time measurements
-    T& ChiSqTime() { return fChiSqTime; }
+    XPU_D T& ChiSqTime() { return fChiSqTime; }
 
     /// \brief Reference to NDF of time measurements
-    T& NdfTime() { return fNdfTime; }
+    XPU_D T& NdfTime() { return fNdfTime; }
 
     /// \brief Get a reference to the covariance matrix element
     /// \param i row
     /// \param j column
     /// \return matrix element
-    T& C(int i, int j)
+    XPU_D T& C(int i, int j)
     {
       int ind = (j <= i) ? i * (1 + i) / 2 + j : j * (1 + j) / 2 + i;
       return fCovMatrix[ind];
@@ -530,7 +535,7 @@ namespace cbm::algo::kf
     /// \param c44  Variance of charge over momentum [(ec/GeV)2]
     /// \param c55  Variance of time [ns2]
     /// \param c66  Variance of inverse velocity [1/c2]
-    void ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66);
+    XPU_D void ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66);
 
     /// \brief Prints parameters to a string
     /// \param i  Index of SIMD vector element (when i== -1, the entire vector is printed)
@@ -550,7 +555,7 @@ namespace cbm::algo::kf
     bool IsConsistent(bool printWhenWrong, int nFilled) const;
 
     /// \brief Initializes inverse velocity range
-    void InitVelocityRange(fscal minP);
+    XPU_D void InitVelocityRange(fscal minP);
 
 
     ///---------------------------------------------------------------------------------------------------------------------
@@ -668,7 +673,7 @@ namespace cbm::algo::kf
   /// ---------------------------------------------------------------------------------------------------------------------
   ///
   template<typename T>
-  inline T TrackParamBase<T>::GetPhiError() const
+  XPU_D inline T TrackParamBase<T>::GetPhiError() const
   {
     // phi = atan( tx / ty ); }
 
@@ -688,7 +693,7 @@ namespace cbm::algo::kf
   /// ---------------------------------------------------------------------------------------------------------------------
   ///
   template<typename T>
-  inline T TrackParamBase<T>::GetThetaError() const
+  XPU_D inline T TrackParamBase<T>::GetThetaError() const
   {
     // theta = atan(sqrt( tx * tx + ty * ty) )
 
@@ -741,9 +746,12 @@ namespace cbm::algo::kf
   // ---------------------------------------------------------------------------------------------------------------------
   //
   template<typename T>
-  inline void TrackParamBase<T>::ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66)
+  XPU_D inline void TrackParamBase<T>::ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66)
   {
-    fCovMatrix.fill(0.);
+    //    fCovMatrix.fill(0.);
+    for (auto& element : fCovMatrix) {
+      element = 0.;
+    }
 
     SetC00(c00);
     SetC11(c11);
@@ -762,7 +770,7 @@ namespace cbm::algo::kf
   // ---------------------------------------------------------------------------------------------------------------------
   //
   template<typename T>
-  inline void TrackParamBase<T>::InitVelocityRange(fscal minP)
+  XPU_D inline void TrackParamBase<T>::InitVelocityRange(fscal minP)
   {
     // initialise the velocity range with respect to the minimal momentum minP {GeV/c}
     using defs::ProtonMass;
diff --git a/algo/kf/core/geo/KfFieldSlice.h b/algo/kf/core/geo/KfFieldSlice.h
index a626ca4a24..50654a1c8e 100644
--- a/algo/kf/core/geo/KfFieldSlice.h
+++ b/algo/kf/core/geo/KfFieldSlice.h
@@ -87,6 +87,15 @@ namespace cbm::algo::kf
       return GetFieldValue(trkPar.GetX() + trkPar.GetTx() * dz, trkPar.GetY() + trkPar.GetTy() * dz);
     }
 
+    /// \brief Gets approximation coefficients for the x-component of the field
+    const CoeffArray_t& GetBx() const { return fBx; }
+
+    /// \brief Gets approximation coefficients for the y-component of the field
+    const CoeffArray_t& GetBy() const { return fBy; }
+
+    /// \brief Gets approximation coefficients for the z-component of the field
+    const CoeffArray_t& GetBz() const { return fBz; }
+
     /// \brief Gets reference z-coordinate of the slice [cm]
     const T& GetZref() const { return fZref; }
 
diff --git a/algo/kf/core/geo/KfMaterialMap.h b/algo/kf/core/geo/KfMaterialMap.h
index e453f6bd9a..2ba56e81e6 100644
--- a/algo/kf/core/geo/KfMaterialMap.h
+++ b/algo/kf/core/geo/KfMaterialMap.h
@@ -128,6 +128,9 @@ namespace cbm::algo::kf
       return GetBinThicknessX0<I>(iX + iY * fNbins);
     }
 
+    /// \brief Gets material thickness table
+    const std::vector<float>& GetTable() const { return fTable; }
+
     /// \brief Function to test the instance for NaN
     bool IsUndefined() const
     {
-- 
GitLab


From d142bba534dc6ef5452a16b064e7586b05c38457 Mon Sep 17 00:00:00 2001
From: "g.kozlov" <g.kozlov@gsi.de>
Date: Thu, 6 Mar 2025 16:12:21 +0100
Subject: [PATCH 2/6] CA: multiple fixes in the experimental GPU code

---
 algo/ca/core/experimental/CaGpuField.h        | 325 ++++-----
 algo/ca/core/experimental/CaGpuGrid.h         |  12 +-
 algo/ca/core/experimental/CaGpuGridArea.h     |  14 +-
 algo/ca/core/experimental/CaGpuMaterialMap.h  |  19 +-
 algo/ca/core/experimental/CaGpuParameters.h   |  78 +--
 algo/ca/core/experimental/CaGpuStation.h      |  51 +-
 algo/ca/core/experimental/CaGpuTimeMonitor.h  |  90 +--
 .../experimental/CaGpuTrackFinderSetup.cxx    | 375 +++++------
 .../core/experimental/CaGpuTrackFinderSetup.h |  47 +-
 .../experimental/CaGpuTripletConstructor.cxx  | 629 ++++++++----------
 .../experimental/CaGpuTripletConstructor.h    | 189 +++---
 algo/ca/core/experimental/CaMeasurementU.cxx  |  40 --
 algo/ca/core/experimental/CaMeasurementU.h    | 126 ----
 algo/ca/core/experimental/CaMeasurementXy.cxx |  42 --
 algo/ca/core/experimental/CaMeasurementXy.h   |  59 +-
 algo/ca/core/pars/CaDefs.h                    |  12 +
 algo/ca/core/tracking/CaTrackFinderWindow.cxx | 173 ++---
 algo/ca/core/tracking/CaTrackFinderWindow.h   |   6 -
 algo/kf/core/data/KfMeasurementU.h            |  28 +-
 19 files changed, 1012 insertions(+), 1303 deletions(-)
 delete mode 100644 algo/ca/core/experimental/CaMeasurementU.cxx
 delete mode 100644 algo/ca/core/experimental/CaMeasurementU.h
 delete mode 100644 algo/ca/core/experimental/CaMeasurementXy.cxx

diff --git a/algo/ca/core/experimental/CaGpuField.h b/algo/ca/core/experimental/CaGpuField.h
index 7d6358e678..8346640d41 100644
--- a/algo/ca/core/experimental/CaGpuField.h
+++ b/algo/ca/core/experimental/CaGpuField.h
@@ -2,54 +2,60 @@
    SPDX-License-Identifier: GPL-3.0-only
    Authors: Sergey Gorbunov [committer], Igor Kulakov, Maksym Zyzak, Sergei Zharko */
 
+/// \file CaGpuField.h
+/// \brief Magnetic field classes
+///
+/// This is a temporary solution made to ensure compatibility with GPU
+///
+
 #pragma once  // include this header only once per compilation unit
 
-#include "CaUtils.h"
-#include "KfField.h"
 #include "KfTrackParam.h"
+#include "KfField.h"
+#include "CaUtils.h"
 
 #include <xpu/device.h>
 
 namespace cbm::algo::ca
 {
   class GpuFieldValue {
-   public:
-    float x;  //< x-component of the field
-    float y;  //< y-component of the field
-    float z;  //< z-component of the field
-
-    GpuFieldValue() = default;
-
-    /// \brief Copy constructor with type conversion
-    template<typename DataIn>
-    GpuFieldValue(const kf::FieldValue<DataIn>& other)
-      : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx()))
-      , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy()))
-      , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz()))
-    {
-    }
-
-    /// Combines the magnetic field with another field value object using weight
-    /// \param B  other field value to combine with
-    /// \param w  weight from 0 to 1
-
-    XPU_D void Combine(const GpuFieldValue& B, const bool w)
-    {
-      if (w) {
-        x = B.x;
-        y = B.y;
-        z = B.z;
-      }
-    }
-
-    /// Is the field value zero?
-    XPU_D bool IsZeroField() const
-    {
-      //     constexpr auto e = constants::misc::NegligibleFieldkG;
-      //     float e = 1.e-4;
-      return (x * x + y * y + z * z <= 1.e-8);
-    }
-  };
+  public:
+   float x;  //< x-component of the field
+   float y;  //< y-component of the field
+   float z;  //< z-component of the field
+
+   GpuFieldValue() = default;
+
+   /// \brief Copy constructor with type conversion
+   template<typename DataIn>
+   GpuFieldValue(const kf::FieldValue<DataIn>& other)
+     : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx()))
+     , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy()))
+     , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz()))
+   {
+   }
+
+   /// Combines the magnetic field with another field value object using weight
+   /// \param B  other field value to combine with
+   /// \param w  weight from 0 to 1
+
+   XPU_D void Combine(const GpuFieldValue& B, const bool w)
+   {
+     if (w) {
+       x = B.x;
+       y = B.y;
+       z = B.z;
+     }
+   }
+
+   /// Is the field value zero?
+   XPU_D bool IsZeroField() const
+   {
+//     constexpr auto e = constants::misc::NegligibleFieldkG;
+//     float e = 1.e-4;
+     return (x * x + y * y + z * z <= 1.e-8);
+   }
+ };
 
   class GpuFieldSlice {
    public:
@@ -100,11 +106,11 @@ namespace cbm::algo::ca
     // NOTE: We don't use an initialization of arrays here because we cannot be sure
     //       if the underlying type (fvec) has a default constructor, but
     //       we are sure, that it can be initialized with a float. (S.Zharko)
-    static constexpr auto kMaxNFieldApproxCoefficients = 21;  //constants::size::MaxNFieldApproxCoefficients;
+    static constexpr auto kMaxNFieldApproxCoefficients = 21;//constants::size::MaxNFieldApproxCoefficients;
 
-    CoeffArray_t cx;  //fBx;  ///< Approximation coefficients for the x-component of the field
-    CoeffArray_t cy;  //fBy;  ///< Approximation coefficients for the y-component of the field
-    CoeffArray_t cz;  //fBz;  ///< Approximation coefficients for the z-component of the field
+    CoeffArray_t cx;//fBx;  ///< Approximation coefficients for the x-component of the field
+    CoeffArray_t cy;//fBy;  ///< Approximation coefficients for the y-component of the field
+    CoeffArray_t cz;//fBz;  ///< Approximation coefficients for the z-component of the field
 
     float z;  ///< z coordinate of the slice
   };
@@ -156,130 +162,127 @@ namespace cbm::algo::ca
   }
 
   class GpuFieldRegion {
-   public:
-    GpuFieldRegion() = default;
-
-    /// \brief Copy constructor with type conversion
-    template<typename DataIn>
-    GpuFieldRegion(const kf::FieldRegion<DataIn>& other)
-      : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0))
-      , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1))
-      , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2))
-      , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0))
-      , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1))
-      , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2))
-      , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0))
-      , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1))
-      , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2))
-      , z0(kf::utils::simd::Cast<DataIn, float>(other.z0))
-      //     , fUseOriginalField(other.fUseOriginalField)
-      , fIsZeroField(other.fIsZeroField)
-    {
-    }
-
-    /// Gets the field vector and writes it into B pointer
-    /// \param x  x-coordinate of the point to calculate the field
-    /// \param y  y-coordinate of the point to calculate the field
-    /// \param z  z-coordinate of the point to calculate the field
-    /// \return    the magnetic field value
-    XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const
-    {
-      GpuFieldValue B;
-      float dz  = z - z0;
-      float dz2 = dz * dz;
-      B.x       = cx0 + cx1 * dz + cx2 * dz2;
-      B.y       = cy0 + cy1 * dz + cy2 * dz2;
-      B.z       = cz0 + cz1 * dz + cz2 * dz2;
-      return B;
-    }
-
-    /// Is the field region empty?
-    XPU_D bool IsZeroField() const { return fIsZeroField; }
-
-    /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object
-    /// The result is a quadratic interpolation of the field as a function of z
-    /// \param b0   field value in the first nodal point
-    /// \param b0z  z-coordinate of the first nodal point
-    /// \param b1   field value in the second nodal point
-    /// \param b1z  z-coordinate of the second nodal point
-    /// \param b2   field value in the third nodal point
-    /// \param b2z  z-coordinate of the third nodal point
-    /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko)
-    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2,
-                   float b2z);
-
-    /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object.
-    /// The result is a linear interpolation of the field as a function of z
-    /// \param b0   field value in the first nodal point
-    /// \param b0z  z-coordinate of the first nodal point
-    /// \param b1   field value in the second nodal point
-    /// \param b1z  z-coordinate of the second nodal point
-    /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko)
-    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z)
-    {
-      fIsZeroField = (b0.IsZeroField() && b1.IsZeroField());
-      z0           = b0z;
-      float dzi    = 1. / (b1z - b0z);  //TODO
-      cx0          = b0.x;
-      cy0          = b0.y;
-      cz0          = b0.z;
-      cx1          = (b1.x - b0.x) * dzi;
-      cy1          = (b1.y - b0.y) * dzi;
-      cz1          = (b1.z - b0.z) * dzi;
-      cx2          = 0.f;
-      cy2          = 0.f;
-      cz2          = 0.f;
-    }
-
-    /// Shifts the coefficients to new central point
-    /// \param z  z-coordinate of the new central point
-    XPU_D void Shift(float z)
-    {
-      float dz    = z - z0;
-      float cx2dz = cx2 * dz;
-      float cy2dz = cy2 * dz;
-      float cz2dz = cz2 * dz;
-      z0          = z;
-      cx0 += (cx1 + cx2dz) * dz;
-      cy0 += (cy1 + cy2dz) * dz;
-      cz0 += (cz1 + cz2dz) * dz;
-      cx1 += cx2dz + cx2dz;
-      cy1 += cy2dz + cy2dz;
-      cz1 += cz2dz + cz2dz;
-    }
-
-
-   public:
-    // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko)
-    // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2
-    float cx0;
-    float cx1;
-    float cx2;
-
-    // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2
-    float cy0;
-    float cy1;
-    float cy2;
-
-    // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2
-    float cz0;
-    float cz1;
-    float cz2;
-
-    float z0;  ///< z-coordinate of the field region central point
-
-    bool fIsZeroField;  //{false};  ///< Is the field region empty?
-  };
-
-  XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z,
-                                        const GpuFieldValue& b2, float b2z)
+  public:
+   GpuFieldRegion() = default;
+
+   /// \brief Copy constructor with type conversion
+   template<typename DataIn>
+   GpuFieldRegion(const kf::FieldRegion<DataIn>& other)
+     : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0))
+     , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1))
+     , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2))
+     , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0))
+     , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1))
+     , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2))
+     , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0))
+     , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1))
+     , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2))
+     , z0(kf::utils::simd::Cast<DataIn, float>(other.z0))
+//     , fUseOriginalField(other.fUseOriginalField)
+     , fIsZeroField(other.fIsZeroField)
+   {
+   }
+
+   /// Gets the field vector and writes it into B pointer
+   /// \param x  x-coordinate of the point to calculate the field
+   /// \param y  y-coordinate of the point to calculate the field
+   /// \param z  z-coordinate of the point to calculate the field
+   /// \return    the magnetic field value
+   XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const
+   {
+     GpuFieldValue B;
+     float dz  = z - z0;
+     float dz2 = dz * dz;
+     B.x       = cx0 + cx1 * dz + cx2 * dz2;
+     B.y       = cy0 + cy1 * dz + cy2 * dz2;
+     B.z       = cz0 + cz1 * dz + cz2 * dz2;
+     return B;
+   }
+
+   /// Is the field region empty?
+   XPU_D bool IsZeroField() const { return fIsZeroField; }
+
+   /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object
+   /// The result is a quadratic interpolation of the field as a function of z
+   /// \param b0   field value in the first nodal point
+   /// \param b0z  z-coordinate of the first nodal point
+   /// \param b1   field value in the second nodal point
+   /// \param b1z  z-coordinate of the second nodal point
+   /// \param b2   field value in the third nodal point
+   /// \param b2z  z-coordinate of the third nodal point
+   /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko)
+   XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2, float b2z);
+
+   /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object.
+   /// The result is a linear interpolation of the field as a function of z
+   /// \param b0   field value in the first nodal point
+   /// \param b0z  z-coordinate of the first nodal point
+   /// \param b1   field value in the second nodal point
+   /// \param b1z  z-coordinate of the second nodal point
+   /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko)
+   XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z) {
+     fIsZeroField = (b0.IsZeroField() && b1.IsZeroField());
+     z0        = b0z;
+     float dzi = 1. / (b1z - b0z);	//TODO
+     cx0       = b0.x;
+     cy0       = b0.y;
+     cz0       = b0.z;
+     cx1       = (b1.x - b0.x) * dzi;
+     cy1       = (b1.y - b0.y) * dzi;
+     cz1       = (b1.z - b0.z) * dzi;
+     cx2       = 0.f;
+     cy2       = 0.f;
+     cz2       = 0.f;
+   }
+
+   /// Shifts the coefficients to new central point
+   /// \param z  z-coordinate of the new central point
+   XPU_D void Shift(float z) {
+     float dz    = z - z0;
+     float cx2dz = cx2 * dz;
+     float cy2dz = cy2 * dz;
+     float cz2dz = cz2 * dz;
+     z0          = z;
+     cx0 += (cx1 + cx2dz) * dz;
+     cy0 += (cy1 + cy2dz) * dz;
+     cz0 += (cz1 + cz2dz) * dz;
+     cx1 += cx2dz + cx2dz;
+     cy1 += cy2dz + cy2dz;
+     cz1 += cz2dz + cz2dz;
+   }
+
+
+  public:
+
+   // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko)
+   // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2
+   float cx0;
+   float cx1;
+   float cx2;
+
+   // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2
+   float cy0;
+   float cy1;
+   float cy2;
+
+   // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2
+   float cz0;
+   float cz1;
+   float cz2;
+
+   float z0;  ///< z-coordinate of the field region central point
+
+   bool fIsZeroField;//{false};  ///< Is the field region empty?
+ };
+
+  XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2, float b2z)
   {
     fIsZeroField = (b0.IsZeroField() && b1.IsZeroField() && b2.IsZeroField());
 
     z0        = b0z;
     float dz1 = b1z - b0z;
     float dz2 = b2z - b0z;
-    float det = 1. / (dz1 * dz2 * (b2z - b1z));  //TODO
+    float det = 1. / (dz1 * dz2 * (b2z - b1z));	//TODO
 
     float w21 = -dz2 * det;
     float w22 = dz1 * det;
diff --git a/algo/ca/core/experimental/CaGpuGrid.h b/algo/ca/core/experimental/CaGpuGrid.h
index a61f4d72cc..cccd0cb458 100644
--- a/algo/ca/core/experimental/CaGpuGrid.h
+++ b/algo/ca/core/experimental/CaGpuGrid.h
@@ -1,18 +1,18 @@
 /* Copyright (C) 2010-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
    SPDX-License-Identifier: GPL-3.0-only
-   Authors: Igor Kulakov [committer], Sergey Gorbunov */
+   Authors: Igor Kulakov, Sergey Gorbunov, Grigory Kozlov [committer] */
 
 /// \file CaGpuGrid.h
 /// \brief A class to store hit information in a backet-sorted way on 2D grid
 ///
 /// This is a temporary solution to store the grid on GPU
+/// The code has been simplified and modified relative to the CaGrid class for use on GPU
 /// The original code is based on the code of the ALICE HLT Project
 ///
 
 #pragma once  // include this header only once per compilation unit
 
 #include "CaGrid.h"
-
 #include <xpu/device.h>
 
 namespace cbm::algo::ca
@@ -145,11 +145,11 @@ namespace cbm::algo::ca
     float fMaxRangeY{0.};  ///< maximal entry range in Y
     float fMaxRangeT{0.};  ///< maximal entry range in T
 
-    int fBinEntryIndex_start{0};   ///< index of the first bin entry in the index array
-    int fBinEntryIndex_amount{0};  ///< number of entries in the index array
+    int fBinEntryIndex_start{0};	///< index of the first bin entry in the index array
+    int fBinEntryIndex_amount{0};	///< number of entries in the index array
 
-    int fEntries_start{0};   ///< index of the first entry in the entry array
-    int fEntries_amount{0};  ///< number of entries in the entry array
+    int fEntries_start{0};		///< index of the first entry in the entry array
+    int fEntries_amount{0};		///< number of entries in the entry array
   };
 
   /// --- Inline methods ---
diff --git a/algo/ca/core/experimental/CaGpuGridArea.h b/algo/ca/core/experimental/CaGpuGridArea.h
index bb7a96d64a..f18851e722 100644
--- a/algo/ca/core/experimental/CaGpuGridArea.h
+++ b/algo/ca/core/experimental/CaGpuGridArea.h
@@ -2,7 +2,10 @@
    SPDX-License-Identifier: GPL-3.0-only
    Authors: Maksym Zyzak, Igor Kulakov [committer] */
 
-/// \file CaGrid.h
+/// \file CaGpuGridArea.h
+///
+/// \brief A temporary implementation for CaGridArea, created for compatibility with GPU code
+///
 
 #pragma once  // include this header only once per compilation unit
 
@@ -28,8 +31,7 @@ namespace cbm::algo::ca
     /// \param dx - half-width of the area in X
     /// \param dy - half-width of the area in Y
     //TODO: try to provide first element for the current grid instead of first element of the original arrays
-    XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x,
-                      float y, float dx, float dy)
+    XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x, float y, float dx, float dy)
       : fGrid(grid)
       , fBinEntryIndex(BinEntryIndex)
       , fEntries(Entries)
@@ -62,7 +64,7 @@ namespace cbm::algo::ca
     XPU_D bool GetNextObjectId(unsigned int& objectId);
 
     /// \brief debug mode: loop over the entire GetEntries() vector ignoring the search area
-    //    void DoLoopOverEntireGrid();
+//    void DoLoopOverEntireGrid();
 
    private:
     const ca::GpuGrid& fGrid;
@@ -87,8 +89,8 @@ namespace cbm::algo::ca
       if (fAreaCurrentBinY >= fAreaLastBinY) {
         return false;
       }
-      fAreaCurrentBinY++;                                // get new y-line
-      fAreaFirstBin += fGridNbinsX;                      // move the left-down corner of the area to the next y-line
+      fAreaCurrentBinY++;            // get new y-line
+      fAreaFirstBin += fGridNbinsX;  // move the left-down corner of the area to the next y-line
       fCurentEntry     = fBinEntryIndex[fAreaFirstBin];  // get first hit in cell, if y-line is new
       fEntriesXend     = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX];
       xIndexOutOfRange = (fCurentEntry >= fEntriesXend);
diff --git a/algo/ca/core/experimental/CaGpuMaterialMap.h b/algo/ca/core/experimental/CaGpuMaterialMap.h
index 20a282bc6d..1699216736 100644
--- a/algo/ca/core/experimental/CaGpuMaterialMap.h
+++ b/algo/ca/core/experimental/CaGpuMaterialMap.h
@@ -1,11 +1,14 @@
 /* Copyright (C) 2007-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
    SPDX-License-Identifier: GPL-3.0-only
-   Authors: Igor Kulakov, Sergey Gorbunov [committer], Andrey Lebedev, Sergei Zharko */
+   Authors: Igor Kulakov, Sergey Gorbunov, Andrey Lebedev, Sergei Zharko, Grigory Kozlov [committer] */
+
+/// \file GpuMaterialMap.h
+/// \brief A map of station thickness in units of radiation length (X0) to the specific point in XY plane
+///
+/// A simplified version of the KfMaterialMap class, intended for compatibility with GPU code
 
 #pragma once  // include this header only once per compilation unit
 
-//#include "CaSimd.h"
-//#include "CaUtils.h"
 #include "KfMaterialMap.h"
 
 #include <xpu/device.h>
@@ -62,13 +65,13 @@ namespace cbm::algo::ca
     }
 
    private:
-    int fNbins;     ///< Number of rows (== N columns) in the material budget table
+    int fNbins;    ///< Number of rows (== N columns) in the material budget table
     int fBinStart;  ///< Start index of the material budget table
-    float fXYmax;   ///< Size of the station in x and y dimensions [cm]
+    float fXYmax;  ///< Size of the station in x and y dimensions [cm]
     float fFactor;  ///< Util. var. for the conversion of point coordinates to row/column id
-    float fZref;    ///< Reference Z of the collected material [cm]
-    float fZmin;    ///< Minimal Z of the collected material [cm]
-    float fZmax;    ///< Minimal Z of the collected material [cm]
+    float fZref;  ///< Reference Z of the collected material [cm]
+    float fZmin;  ///< Minimal Z of the collected material [cm]
+    float fZmax;  ///< Minimal Z of the collected material [cm]
   };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuParameters.h b/algo/ca/core/experimental/CaGpuParameters.h
index fbbee5538d..231e98359f 100644
--- a/algo/ca/core/experimental/CaGpuParameters.h
+++ b/algo/ca/core/experimental/CaGpuParameters.h
@@ -1,18 +1,16 @@
 /* Copyright (C) 2021-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
    SPDX-License-Identifier: GPL-3.0-only
-   Authors: Sergey Gorbunov, Sergei Zharko [committer] */
+   Authors: Grigory Kozlov [committer] */
 
-/// @file L1Parameters.h
-/// @brief Parameter container for the L1Algo library
-/// @since 19.12.2021
-/// @author S.Zharko <s.zharko@gsi.de>
+/// \file CaGpuParameters.h
+/// \brief Parameter container for the GPU CA library
 
 #pragma once  // include this header only once per compilation unit
 
 #include "CaGpuField.h"
+#include "CaMeasurementXy.h"
 #include "CaGpuMaterialMap.h"
 #include "CaGpuStation.h"
-#include "CaMeasurementXy.h"
 #include "CaParameters.h"
 
 #include <xpu/device.h>
@@ -22,19 +20,14 @@ namespace cbm::algo::ca
 
   class GpuParameters {
    public:
+
+    /// \brief Default constructor
     GpuParameters() = default;
 
     /// \brief Copy constructor with type conversion
     template<typename DataIn>
     GpuParameters(const Parameters<DataIn>& other)
-      : fNstationsActiveTotal(other.GetNstationsActive())
-      , fMaxDoubletsPerSinglet(other.GetMaxDoubletsPerSinglet())
-      , fMaxTripletPerDoublets(other.GetMaxTripletPerDoublets())
-      , maxDoubletsFromHit(150)
-      , maxTripletsFromDoublet(15)
-      , maxExtrapolationStep(50.)
-      , SpeedOfLightInv(0.03335641)
-      , maxSlopePV(0.1)
+      : maxSlopePV(0.1)
       , maxQp(0.1)
       , maxDZ(0.1)
       , particleMass(0.13957)
@@ -48,26 +41,23 @@ namespace cbm::algo::ca
       fTargetPos[0] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionX());
       fTargetPos[1] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionY());
       fTargetPos[2] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionZ());
-      for (size_t i = 0; i < fNstationsActiveTotal; i++) {
+      for (size_t i = 0; i < (size_t)other.GetNstationsActive(); i++) {
         fStations[i] = other.GetStation(i);
       }
     }
-    ~GpuParameters() = default;
-    //
-   public:
-    ca::GpuFieldValue targB;                     ///< Magnetic field in the target region
-    ca::MeasurementXy<float> targetMeasurement;  ///< Measurement XY
 
-    int GetNstationsActive() const { return fNstationsActiveTotal; }
-    void SetNstationsActive(int nsta) { fNstationsActiveTotal = nsta; }
+    /// \brief Destructor
+    ~GpuParameters() = default;
 
-    /// \brief Gets upper-bound cut on max number of doublets per one singlet
-    XPU_D unsigned int GetMaxDoubletsPerSinglet() const { return fMaxDoubletsPerSinglet; }
+   public:
 
-    /// \brief Gets upper-bound cut on max number of triplets per one doublet
-    XPU_D unsigned int GetMaxTripletPerDoublets() const { return fMaxTripletPerDoublets; }
+    ca::GpuFieldValue targB;             ///< Magnetic field in the target region
+    ca::MeasurementXy<float> targetMeasurement;          ///< Measurement XY in the target region
 
+    /// \brief Gets Station
     XPU_D const ca::GpuStation& GetStation(int iStation) const { return fStations[iStation]; }
+
+    /// \brief Sets Station
     void SetStation(int iStation, const ca::GpuStation& station) { fStations[iStation] = station; }
 
     /// \brief Gets X component of target position
@@ -79,33 +69,21 @@ namespace cbm::algo::ca
     /// \brief Gets Z component of target position
     XPU_D float GetTargetPositionZ() const { return fTargetPos[2]; }
 
-    std::array<ca::GpuStation, 15>
-      fStations;  ///< Array of stations	//TODO: 15 is temporary, think how to do it properly on gpu
-
-    std::array<float, 3> fTargetPos;  ///< Target position
-
-    size_t fNstationsActiveTotal;  ///< total number of active tracking stations
-
-    unsigned int fMaxDoubletsPerSinglet;  ///< Upper-bound cut on max number of doublets per one singlet
-    unsigned int fMaxTripletPerDoublets;  ///< Upper-bound cut on max number of triplets per one doublet
-
-    int maxDoubletsFromHit;      ///< Max doublets from hit
-    int maxTripletsFromDoublet;  ///< Max triplets from doublet
+    std::array<ca::GpuStation, constants::gpu::MaxNofStations> fStations;  ///< Array of stations
 
-    float maxExtrapolationStep;  ///< Max extrapolation step
+    std::array<float, 3> fTargetPos;	///< Target position
 
-    float SpeedOfLightInv;      ///< Speed of light inverse
-    float maxSlopePV;           ///< Max slope to primary vertex
-    float maxQp;                ///< Max Q/p
-    float maxDZ;                ///< Max DZ
-    float particleMass;         ///< Particle mass
-    float doubletChi2Cut;       ///< Doublet chi2 cut
-    float tripletChi2Cut;       ///< Triplet chi2 cut
-    float tripletFinalChi2Cut;  ///< Triplet final chi2 cut
+    float maxSlopePV;                         ///< Max slope to primary vertex
+    float maxQp;                              ///< Max Q/p
+    float maxDZ;                              ///< Max DZ
+    float particleMass;                       ///< Particle mass
+    float doubletChi2Cut;                     ///< Doublet chi2 cut
+    float tripletChi2Cut;                     ///< Triplet chi2 cut
+    float tripletFinalChi2Cut;                ///< Triplet final chi2 cut
 
-    size_t fNStations;  ///< Number of stations
+    size_t fNStations;                        ///< Number of stations
 
-    bool primaryFlag;    ///< Primary flag
-    bool isTargetField;  ///< Is target field
+    bool primaryFlag;                         ///< Primary flag
+    bool isTargetField;                       ///< Is target field
   };
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuStation.h b/algo/ca/core/experimental/CaGpuStation.h
index 2a816d665d..a2433e45ae 100644
--- a/algo/ca/core/experimental/CaGpuStation.h
+++ b/algo/ca/core/experimental/CaGpuStation.h
@@ -1,15 +1,18 @@
 /* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
    SPDX-License-Identifier: GPL-3.0-only
-   Authors: Sergey Gorbunov [committer], Igor Kulakov, Sergei Zharko */
+   Authors: Sergey Gorbunov, Igor Kulakov, Sergei Zharko, Grigory Kozlov [committer] */
+
+/// \file GpuStation.h
+///
+/// \brief Contains a set of geometry parameters for a particular station
+///
+/// A temporary simplified version of the CaStation class, intended for compatibility with GPU code
 
 #pragma once  // include this header only once per compilation unit
 
-//#include "CaConstants.h"
 #include "CaGpuField.h"
 #include "CaStation.h"
 
-//#include <string>
-
 #include <xpu/device.h>
 
 namespace cbm::algo::ca
@@ -19,13 +22,13 @@ namespace cbm::algo::ca
   ///
   class GpuStation {
    public:
-    int type;      //     = constants::Undef<int>;  //  TODO: replace with L1DetectorID
-    int timeInfo;  // = constants::Undef<int>;  ///< flag: if time information can be used
-    int
-      fieldStatus;  // = constants::Undef<int>;               ///< flag: 1 - station is INSIDE the field, 0 - station is OUTSIDE the field
-    float fZ;    //   = constants::Undef<DataT>;  ///< z position of station     [cm]
-    float Xmax;  // = constants::Undef<DataT>;  ///< min radius of the station [cm]
-    float Ymax;  // = constants::Undef<DataT>;  ///< max radius of the station [cm]
+
+    int type;  //  TODO: replace with L1DetectorID
+    int timeInfo;  ///< flag: if time information can be used
+    int fieldStatus;  ///< flag: 1 - station is INSIDE the field, 0 - station is OUTSIDE the field
+    float fZ;  ///< z position of station     [cm]
+    float Xmax;  ///< min radius of the station [cm]
+    float Ymax;  ///< max radius of the station [cm]
 
     GpuFieldSlice fieldSlice;  ///< Magnetic field near the station
 
@@ -54,19 +57,35 @@ namespace cbm::algo::ca
     int GetFieldStatus() const { return fieldStatus; }
 
     /// \brief Gets z-position of the station
-    XPU_D float GetZ() const { return fZ; }
+    XPU_D float GetZ() const
+    {
+      return fZ;
+    }
 
     /// \brief Gets limit of the station size in x-axis direction
-    XPU_D float GetXmax() const { return Xmax; }
+    XPU_D float GetXmax() const
+    {
+      return Xmax;
+    }
 
     /// \brief Gets limit of the station size in x-axis direction
-    XPU_D float GetXmin() const { return -Xmax; }
+    XPU_D float GetXmin() const
+    {
+      return -Xmax;
+    }
 
     /// \brief Gets limit of the station size in y-axis direction
-    XPU_D float GetYmax() const { return Ymax; }
+    XPU_D float GetYmax() const
+    {
+      return Ymax;
+    }
 
     /// \brief Gets limit of the station size in y-axis direction
-    XPU_D float GetYmin() const { return -Ymax; }
+    XPU_D float GetYmin() const
+    {
+      return -Ymax;
+    }
+
   };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTimeMonitor.h b/algo/ca/core/experimental/CaGpuTimeMonitor.h
index 77f05241b4..43f213d628 100644
--- a/algo/ca/core/experimental/CaGpuTimeMonitor.h
+++ b/algo/ca/core/experimental/CaGpuTimeMonitor.h
@@ -1,3 +1,11 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file GpuTimeMonitor.h
+///
+/// \brief Time monitor for GPU kernels within the CA algorithm
+
 #pragma once  // include this header only once per compilation unit
 
 #include <xpu/host.h>
@@ -25,18 +33,19 @@ namespace cbm::algo::ca
     xpu::timings Total_time[4];
     int nIterations;
 
-    void PrintTimings(int iteration) const
-    {
+    void PrintTimings(int iteration) const {
       if (iteration < 0 || iteration >= 4) {
-        std::cerr << "Error: wrong iteration number: " << iteration << std::endl;
-        return;
+	std::cerr << "Error: wrong iteration number: " << iteration << std::endl;
+	return;
       }
 
       auto print_timing = [](const std::string& name, const xpu::timings& t) {
-        std::cout << std::left << std::setw(25) << name << "Kernel time: " << std::setw(10) << t.kernel_time()
-                  << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d) << " D2H copy: " << std::setw(10)
-                  << t.copy(xpu::d2h) << " Memset: " << std::setw(10) << t.memset() << " Total: " << std::setw(10)
-                  << t.wall() << std::endl;
+	std::cout << std::left << std::setw(25) << name
+	    << "Kernel time: " << std::setw(10) << t.kernel_time()
+	    << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d)
+	    << " D2H copy: " << std::setw(10) << t.copy(xpu::d2h)
+	    << " Memset: " << std::setw(10) << t.memset()
+	    << " Total: " << std::setw(10) << t.wall() << std::endl;
       };
 
       print_timing("PrepareData_time", PrepareData_time[iteration]);
@@ -58,39 +67,15 @@ namespace cbm::algo::ca
   class GpuTimeMonitor {
    public:
     /// \brief  Default constructor
-    GpuTimeMonitor() : nEvents(0)
-    //     , nIterations(4)
+    GpuTimeMonitor()
+     : nEvents(0)
     {
-      //      for( int i = 0; i < 4; i++ ) {
-      //	MakeSinglets_time[i] = 0;
-      //	MakeDoublets_time[i] = 0;
-      //	CompressDoublets_time[i] = 0;
-      //	FitDoublets_time[i] = 0;
-      //	MakeTriplets_time[i] = 0;
-      //	CompressTriplets_time[i] = 0;
-      //      }
     }
 
     /// \brief Destructor
     ~GpuTimeMonitor() = default;
 
-    //    void
-    //    AddEventInfo (GpuTimeMonitor &event)
-    //    {
-    //      for( int i = 0; i < 4; i++ ) {
-    ////	MakeSinglets_time[i] += event.GetMakeSingletsTime(i);
-    ////	MakeDoublets_time[i] += event.GetMakeDoubletsTime(i);
-    ////	CompressDoublets_time[i] += event.GetCompressDoubletsTime(i);
-    ////	FitDoublets_time[i] += event.GetFitDoubletsTime(i);
-    ////	MakeTriplets_time[i] += event.GetMakeTripletsTime(i);
-    ////	CompressTriplets_time[i] += event.GetCompressTripletsTime(i);
-    ////	FitTriplets_time[i] += event.GetFitTripletsTime(i);
-    //      }
-    //     nEvents++;
-    //    }
-
-    void AddEventInfo(XpuTimings& event)
-    {
+    void AddEventInfo(XpuTimings &event) {
       xpuTimings.push_back(event);
       nEvents++;
     }
@@ -99,24 +84,6 @@ namespace cbm::algo::ca
 
     int GetNEvents() { return nEvents; }
 
-    //    void SetNIterations(int nIt) { this->nIterations = nIt; }
-
-    //    int GetNIterations() { return nIterations; }
-
-    //    void SetMakeSingletsTime(int i, xpu::timings time) { MakeSinglets_time[i] = time; }
-
-    //    void SetMakeDoubletsTime(int i, xpu::timings time) { MakeDoublets_time[i] = time; }
-
-    //    void SetCompressDoubletsTime(int i, xpu::timings time) { CompressDoublets_time[i] = time; }
-
-    //    void SetFitDoubletsTime(int i, xpu::timings time) { FitDoublets_time[i] = time; }
-
-    //    void SetMakeTripletsTime(int i, xpu::timings time) { MakeTriplets_time[i] = time; }
-
-    //    void SetCompressTripletsTime(int i, xpu::timings time) { CompressTriplets_time[i] = time; }
-
-    //    void SetFitTripletsTime(int i, xpu::timings time) { FitTriplets_time[i] = time; }
-
     xpu::timings GetMakeSingletsTime(int ev, int it) { return xpuTimings[ev].MakeSinglets_time[it]; }
 
     xpu::timings GetMakeDoubletsTime(int ev, int it) { return xpuTimings[ev].MakeDoublets_time[it]; }
@@ -133,18 +100,9 @@ namespace cbm::algo::ca
 
     xpu::timings GetTotalTime(int ev, int it) { return xpuTimings[ev].Total_time[it]; }
 
-   private:
-    //     xpu::timings MakeSinglets_time[4];
-    //     xpu::timings MakeDoublets_time[4];
-    //     xpu::timings CompressDoublets_time[4];
-    //     xpu::timings FitDoublets_time[4];
-    //     xpu::timings MakeTriplets_time[4];
-    //     xpu::timings CompressTriplets_time[4];
-    //     xpu::timings FitTriplets_time[4];
-    std::vector<XpuTimings> xpuTimings;
-
-    int nEvents;
-    //     int nIterations;
-  };
+    private:
+     std::vector<XpuTimings> xpuTimings;
+     int nEvents;
+    };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
index 33aca92757..fb11f74006 100644
--- a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
@@ -1,77 +1,72 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTrackFinderSetup.cxx
+/// \brief The class is responsible for setting up the environment and the order in which kernels are launched to perform tracking using XPU
+
 #include "CaGpuTrackFinderSetup.h"
 
 using namespace cbm::algo::ca;
 
 GpuTrackFinderSetup::GpuTrackFinderSetup(WindowData& wData, const ca::Parameters<fvec>& pars)
   : fParameters(pars)
-  , fSetup(pars.GetGeometrySetup())
   , frWData(wData)
-  , fIteration(0)
-{
-}
+  , fIteration(0) {}
 
 void GpuTrackFinderSetup::SetupParameters()
 {
-  int nStations   = fParameters.GetNstationsActive();
+  int nStations = fParameters.GetNstationsActive();
   int nIterations = fParameters.GetCAIterations().size();
   fTripletConstructor.fParams.reset(nIterations, xpu::buf_io);
   xpu::h_view vfParams{fTripletConstructor.fParams};
 
-  for (int i = 0; i < nIterations; i++) {
-    GpuParameters gpuParams              = fParameters;
+  for( int i = 0; i < nIterations; i++ ) {
+    GpuParameters gpuParams = fParameters;
     fTripletConstructor.fParams_const[i] = fParameters;
-    const auto& caIteration              = fParameters.GetCAIterations()[i];
-
-    gpuParams.fNStations                                 = nStations;
-    gpuParams.SpeedOfLightInv                            = constants::phys::SpeedOfLightInv;
-    gpuParams.maxSlopePV                                 = caIteration.GetMaxSlopePV();
-    gpuParams.maxQp                                      = caIteration.GetMaxQp();
-    gpuParams.maxDZ                                      = caIteration.GetMaxDZ();
-    fTripletConstructor.fParams_const[i].fNStations      = nStations;
-    fTripletConstructor.fParams_const[i].SpeedOfLightInv = constants::phys::SpeedOfLightInv;
-    fTripletConstructor.fParams_const[i].maxSlopePV      = caIteration.GetMaxSlopePV();
-    fTripletConstructor.fParams_const[i].maxQp           = caIteration.GetMaxQp();
-    fTripletConstructor.fParams_const[i].maxDZ           = caIteration.GetMaxDZ();
-
-    if (caIteration.GetElectronFlag()) {
-      gpuParams.particleMass                            = constants::phys::ElectronMass;
-      fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass;
-    }
-    else {
-      gpuParams.particleMass                            = constants::phys::MuonMass;
+    const auto& caIteration   = fParameters.GetCAIterations()[i];
+
+    gpuParams.fNStations = nStations;
+    gpuParams.maxSlopePV = caIteration.GetMaxSlopePV();
+    gpuParams.maxQp = caIteration.GetMaxQp();
+    gpuParams.maxDZ = caIteration.GetMaxDZ();
+    fTripletConstructor.fParams_const[i].fNStations = nStations;
+    fTripletConstructor.fParams_const[i].maxSlopePV = caIteration.GetMaxSlopePV();
+    fTripletConstructor.fParams_const[i].maxQp = caIteration.GetMaxQp();
+    fTripletConstructor.fParams_const[i].maxDZ = caIteration.GetMaxDZ();
+
+    if(caIteration.GetElectronFlag()){
+        gpuParams.particleMass = constants::phys::ElectronMass;
+        fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass;
+    } else {
+      gpuParams.particleMass = constants::phys::MuonMass;
       fTripletConstructor.fParams_const[i].particleMass = constants::phys::MuonMass;
     }
 
-    gpuParams.primaryFlag                            = caIteration.GetPrimaryFlag();
+    gpuParams.primaryFlag = caIteration.GetPrimaryFlag();
     fTripletConstructor.fParams_const[i].primaryFlag = caIteration.GetPrimaryFlag();
 
-    if (caIteration.GetPrimaryFlag()) {
-      gpuParams.targB                            = fParameters.GetVertexFieldValue();
+    if(caIteration.GetPrimaryFlag()) {
+      gpuParams.targB = fParameters.GetVertexFieldValue();
       fTripletConstructor.fParams_const[i].targB = fParameters.GetVertexFieldValue();
-    }
-    else {
-      gpuParams.targB                            = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
+    } else {
+      gpuParams.targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
       fTripletConstructor.fParams_const[i].targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
     }
 
-    gpuParams.doubletChi2Cut                                    = caIteration.GetDoubletChi2Cut();
-    gpuParams.tripletChi2Cut                                    = caIteration.GetTripletChi2Cut();
-    gpuParams.tripletFinalChi2Cut                               = caIteration.GetTripletFinalChi2Cut();
-    gpuParams.maxDoubletsFromHit                                = maxDoubletsFromHit;
-    gpuParams.maxTripletsFromDoublet                            = maxTripletsFromDoublet;
-    gpuParams.maxExtrapolationStep                              = 50.f;
-    gpuParams.isTargetField                                     = (fabs(gpuParams.targB.y) > 0.001);
-    fTripletConstructor.fParams_const[i].doubletChi2Cut         = caIteration.GetDoubletChi2Cut();
-    fTripletConstructor.fParams_const[i].tripletChi2Cut         = caIteration.GetTripletChi2Cut();
-    fTripletConstructor.fParams_const[i].tripletFinalChi2Cut    = caIteration.GetTripletFinalChi2Cut();
-    fTripletConstructor.fParams_const[i].maxDoubletsFromHit     = maxDoubletsFromHit;
-    fTripletConstructor.fParams_const[i].maxTripletsFromDoublet = maxTripletsFromDoublet;
-    fTripletConstructor.fParams_const[i].maxExtrapolationStep   = 50.f;
-
-    MeasurementXy<float> targMeas(fParameters.GetTargetPositionX()[0], fParameters.GetTargetPositionY()[0],
-                                  caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(),
-                                  caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(), 0, 1, 1);
-    gpuParams.targetMeasurement                            = targMeas;
+    gpuParams.doubletChi2Cut = caIteration.GetDoubletChi2Cut();
+    gpuParams.tripletChi2Cut = caIteration.GetTripletChi2Cut();
+    gpuParams.tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut();
+    gpuParams.isTargetField = (fabs(gpuParams.targB.y) > 0.001);
+    fTripletConstructor.fParams_const[i].doubletChi2Cut = caIteration.GetDoubletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletChi2Cut = caIteration.GetTripletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut();
+
+    ca::MeasurementXy<float> targMeas(fParameters.GetTargetPositionX()[0], fParameters.GetTargetPositionY()[0],
+				  caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(),
+				  caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(),
+				  0, 1, 1);
+    gpuParams.targetMeasurement = targMeas;
     fTripletConstructor.fParams_const[i].targetMeasurement = targMeas;
 
     vfParams[i] = gpuParams;
@@ -79,7 +74,7 @@ void GpuTrackFinderSetup::SetupParameters()
 
   fQueue.copy(fTripletConstructor.fParams, xpu::h2d);
 
-  for (int ista = 0; ista < nStations; ista++) {
+  for( int ista = 0; ista < nStations; ista++ ) {
     fTripletConstructor.fStations_const[ista] = fParameters.GetStation(ista);
   }
 
@@ -88,9 +83,8 @@ void GpuTrackFinderSetup::SetupParameters()
 
 void GpuTrackFinderSetup::SetupGrid()
 {
-  //  std::cout<<"Setting up grid"<<std::endl;
-  unsigned int nStations     = fParameters.GetNstationsActive();
-  unsigned int bin_start     = 0;
+  unsigned int nStations = fParameters.GetNstationsActive();
+  unsigned int bin_start = 0;
   unsigned int entries_start = 0;
 
   fTripletConstructor.fvGpuGrid.reset(nStations, xpu::buf_io);
@@ -111,8 +105,7 @@ void GpuTrackFinderSetup::SetupGrid()
   bin_start = entries_start = 0;
 
   for (unsigned int ista = 0; ista < nStations; ista++) {
-    std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(),
-                &vfgridFirstBinEntryIndex[bin_start]);
+    std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(), &vfgridFirstBinEntryIndex[bin_start]);
     for (unsigned int i = 0; i < frWData.Grid(ista).GetEntries().size(); i++) {
       vfgridEntries[entries_start + i] = frWData.Grid(ista).GetEntries()[i].GetObjectId();
     }
@@ -127,16 +120,17 @@ void GpuTrackFinderSetup::SetupGrid()
 void GpuTrackFinderSetup::SetupMaterialMap()
 {
   ///Set up material map
-  int bin_start  = 0;
-  int nStations  = fParameters.GetNstationsActive();
+  int bin_start = 0;
+  int nStations = fParameters.GetNstationsActive();
   int fstStation = 0;
-  if (nStations == 8) fstStation = 4;
+  if( nStations == 8 ) fstStation = 4;
   fTripletConstructor.fMaterialMap.reset(nStations, xpu::buf_io);
   xpu::h_view vfMaterialMap{fTripletConstructor.fMaterialMap};
 
   for (int ista = 0; ista < nStations; ista++) {
-    vfMaterialMap[ista] = GpuMaterialMap(fSetup.GetMaterial(fstStation + ista), bin_start);
-    bin_start += fSetup.GetMaterial(fstStation + ista).GetNbins() * fSetup.GetMaterial(fstStation + ista).GetNbins();
+      vfMaterialMap[ista] = GpuMaterialMap(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista), bin_start);
+      bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins() *
+	  fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
   }
 
   fTripletConstructor.fMaterialMapTables.reset(bin_start, xpu::buf_io);
@@ -145,9 +139,12 @@ void GpuTrackFinderSetup::SetupMaterialMap()
   bin_start = 0;
 
   for (int ista = 0; ista < nStations; ista++) {
-    std::copy_n(fSetup.GetMaterial(fstStation + ista).GetTable().begin(),
-                fSetup.GetMaterial(fstStation + ista).GetTable().size(), &vfMaterialMapTables[bin_start]);
-    bin_start += fSetup.GetMaterial(fstStation + ista).GetNbins() * fSetup.GetMaterial(fstStation + ista).GetNbins();
+    std::copy_n (
+	fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().begin(),
+	fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().size(),
+	&vfMaterialMapTables[bin_start]);
+    bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins() *
+	fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
   }
 
   fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
@@ -166,245 +163,211 @@ void GpuTrackFinderSetup::SetInputData()
 
 void GpuTrackFinderSetup::SetupIterationData(int iter)
 {
-  //  xpu::scoped_timer t_("SetupIterationData");
+//  xpu::scoped_timer t_("SetupIterationData");
   fIteration = iter;
 
   fTripletConstructor.fIterationData.reset(1, xpu::buf_io);
   xpu::h_view vfIterationData{fTripletConstructor.fIterationData};
-  vfIterationData[0].fNHits             = fNHits;
-  vfIterationData[0].fIteration         = iter;
-  vfIterationData[0].fNDoublets         = 0;
+  vfIterationData[0].fNHits = fNHits;
+  vfIterationData[0].fIteration = iter;
+  vfIterationData[0].fNDoublets = 0;
   vfIterationData[0].fNDoublets_counter = 0;
-  vfIterationData[0].fNTriplets         = 0;
+  vfIterationData[0].fNTriplets = 0;
   vfIterationData[0].fNTriplets_counter = 0;
 
   fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
 }
 
-#define TIME_MONITORING_FULL
-
 void GpuTrackFinderSetup::RunGpuTrackingSetup()
 {
   fNTriplets = 0;
 
   xpu::device_prop prop{xpu::device::active()};
 
-  LOG(info) << "Running GPU tracking chain on device " << prop.name() << ", fIteration: " << fIteration;
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    LOG(info) << "Running GPU tracking chain on device " << prop.name() << ", fIteration: " << fIteration;
+  }
 
   bool isCpu  = xpu::device::active().backend() == xpu::cpu;
-  int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits) / float(kSingletConstructorBlockSize));
-  //nBlocks = 1;
+  int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits)/ float(kSingletConstructorBlockSize));
+//nBlocks = 1;
 
-  int nBlocksCD =
-    nBlocks
-    * maxDoubletsFromHit;  //std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize));
+  int nBlocksCD = nBlocks * constants::gpu::MaxDoubletsFromHit;//std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize));
 
   //***
   fTripletConstructor.fvTrackParams.reset(frWData.Hits().size(), xpu::buf_device);
-  fTripletConstructor.fHitDoublets.reset(frWData.Hits().size() * maxDoubletsFromHit, xpu::buf_device);
+  fTripletConstructor.fHitDoublets.reset(frWData.Hits().size() * constants::gpu::MaxDoubletsFromHit, xpu::buf_device);
   //***
   fTripletConstructor.fIteration = fIteration;
 
   xpu::set<strGpuTripletConstructor>(fTripletConstructor);
 
-#ifdef TIME_MONITORING_FULL
-  fEventTimeMonitor.nIterations = fIteration;
-  xpu::push_timer("Full_time");
-#endif
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    fEventTimeMonitor.nIterations = fIteration;
+    xpu::push_timer("Full_time");
 
-#ifdef TIME_MONITORING_FULL
-  xpu::push_timer("PrepareData_time");
-#endif
+    xpu::push_timer("PrepareData_time");
+  }
 
-  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
+//  if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
 
-  //  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
+//  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
 
-  //  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
+//  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
 
   fQueue.copy(fTripletConstructor.fgridFirstBinEntryIndex, xpu::h2d);
 
   fQueue.copy(fTripletConstructor.fgridEntries, xpu::h2d);
 
-  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
+//  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
 
-  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
+//  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
 
   fQueue.memset(fTripletConstructor.fHitDoublets, 0);
 
-#ifdef TIME_MONITORING_FULL
-  xpu::timings pd_time                           = xpu::pop_timer();
-  fEventTimeMonitor.PrepareData_time[fIteration] = pd_time;
-#endif
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings pd_time = xpu::pop_timer();
+    fEventTimeMonitor.PrepareData_time[fIteration] = pd_time;
 
-#ifdef TIME_MONITORING_FULL
-  xpu::push_timer("MakeSinglets_time");
-#endif
+    xpu::push_timer("MakeSinglets_time");
+  }
 
   fQueue.launch<MakeSinglets>(xpu::n_blocks(nBlocks));
 
-#ifdef TIME_MONITORING_FULL
-  xpu::timings sind_time                          = xpu::pop_timer();
-  fEventTimeMonitor.MakeSinglets_time[fIteration] = sind_time;
-#endif
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings sind_time = xpu::pop_timer();
+    fEventTimeMonitor.MakeSinglets_time[fIteration] = sind_time;
 
-#ifdef TIME_MONITORING_FULL
-  xpu::push_timer("CompressDoublets_time");
-#endif
+    xpu::push_timer("CompressDoublets_time");
+  }
 
   fQueue.launch<MakeDoublets>(xpu::n_blocks(nBlocks));
 
-#ifdef TIME_MONITORING_FULL
-  xpu::timings dind_time                          = xpu::pop_timer();
-  fEventTimeMonitor.MakeDoublets_time[fIteration] = dind_time;
-#endif
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings dind_time = xpu::pop_timer();
+    fEventTimeMonitor.MakeDoublets_time[fIteration] = dind_time;
 
-#ifdef TIME_MONITORING_FULL
-  xpu::push_timer("ResetDoublets_time");
-#endif
+    xpu::push_timer("ResetDoublets_time");
+  }
 
   fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
   xpu::h_view vfIterationData1{fTripletConstructor.fIterationData};
 
-  LOG(info) << "GPU tracking :: Doublets found: " << vfIterationData1[0].fNDoublets;
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    LOG(info) << "GPU tracking :: Doublets found: " << vfIterationData1[0].fNDoublets;
+  }
 
-  if (vfIterationData1[0].fNDoublets > 0) {
+  if(vfIterationData1[0].fNDoublets > 0) {
     fTripletConstructor.fHitDoubletsCompressed.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
     fTripletConstructor.fvTrackParamsDoublets.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
-    fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * maxTripletsFromDoublet, xpu::buf_device);
+    fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet, xpu::buf_device);
 
     fQueue.memset(fTripletConstructor.fHitTriplets, 0);
     xpu::set<strGpuTripletConstructor>(fTripletConstructor);
 
-#ifdef TIME_MONITORING_FULL
-    xpu::timings rd_time                             = xpu::pop_timer();
-    fEventTimeMonitor.ResetDoublets_time[fIteration] = rd_time;
-#endif
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings rd_time = xpu::pop_timer();
+      fEventTimeMonitor.ResetDoublets_time[fIteration] = rd_time;
 
-#ifdef TIME_MONITORING_FULL
-    xpu::push_timer("CompressDoublets_time");
-#endif
+      xpu::push_timer("CompressDoublets_time");
+    }
 
     fQueue.launch<CompressDoublets>(xpu::n_blocks(nBlocksCD));
-#ifdef TIME_MONITORING_FULL
 
-    xpu::timings cd_time                                = xpu::pop_timer();
-    fEventTimeMonitor.CompressDoublets_time[fIteration] = cd_time;
-#endif
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings cd_time = xpu::pop_timer();
+      fEventTimeMonitor.CompressDoublets_time[fIteration] = cd_time;
 
-    int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets
-                         : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize));
+      xpu::push_timer("FitDoublets_time");
+    }
 
-#ifdef TIME_MONITORING_FULL
-    xpu::push_timer("FitDoublets_time");
-#endif
+    int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize));
 
     fQueue.launch<FitDoublets>(xpu::n_blocks(nBlocksD));
 
-#ifdef TIME_MONITORING_FULL
-    xpu::timings fd_time                           = xpu::pop_timer();
-    fEventTimeMonitor.FitDoublets_time[fIteration] = fd_time;
-#endif
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings fd_time = xpu::pop_timer();
+      fEventTimeMonitor.FitDoublets_time[fIteration] = fd_time;
+
+      xpu::push_timer("MakeTriplets_time");
+    }
 
-#ifdef TIME_MONITORING_FULL
-    xpu::push_timer("MakeTriplets_time");
-#endif
     fQueue.launch<MakeTriplets>(xpu::n_blocks(nBlocksD));
-#ifdef TIME_MONITORING_FULL
-    xpu::timings mt_time                            = xpu::pop_timer();
-    fEventTimeMonitor.MakeTriplets_time[fIteration] = mt_time;
-#endif
 
-#ifdef TIME_MONITORING_FULL
-    xpu::push_timer("ResetTriplets_time");
-#endif
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings mt_time = xpu::pop_timer();
+      fEventTimeMonitor.MakeTriplets_time[fIteration] = mt_time;
 
-    fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device);  // TODO: check if we need it for the next iteration
+      xpu::push_timer("ResetTriplets_time");
+    }
+
+    fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device);		// TODO: check if we need it for the next iteration
 
     fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
     xpu::h_view vfIterationData2{fTripletConstructor.fIterationData};
 
-    if (vfIterationData2[0].fNTriplets > 0) {
+    if(vfIterationData2[0].fNTriplets > 0) {
       fTripletConstructor.fHitTripletsCompressed.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
-      //      fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
+//      fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
       fTripletConstructor.fvTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
 
-      int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * maxTripletsFromDoublet
-                            : std::ceil(float(vfIterationData1[0].fNDoublets * maxTripletsFromDoublet)
-                                        / float(kSingletConstructorBlockSize));
+      int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet :
+	  std::ceil(float(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet) / float(kSingletConstructorBlockSize));
 
-#ifdef SORT_TRIPLETS_GPU
-      fTripletConstructor.fTripletSortHelper.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
-      fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
-//      fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io);
-#endif
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        fTripletConstructor.fTripletSortHelper.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+//        fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+//        fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io);
+      }
 
       xpu::set<strGpuTripletConstructor>(fTripletConstructor);
 
-#ifdef TIME_MONITORING_FULL
-      xpu::timings rt_time                             = xpu::pop_timer();
-      fEventTimeMonitor.ResetTriplets_time[fIteration] = rt_time;
-#endif
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings rt_time = xpu::pop_timer();
+        fEventTimeMonitor.ResetTriplets_time[fIteration] = rt_time;
 
-      LOG(info) << "GPU tracking :: Triplets found: " << vfIterationData2[0].fNTriplets;
+        LOG(info) << "GPU tracking :: Triplets found: " << vfIterationData2[0].fNTriplets;
 
-#ifdef TIME_MONITORING_FULL
-      xpu::push_timer("CompressTriplets_time");
-#endif
+        xpu::push_timer("CompressTriplets_time");
+      }
 
       fQueue.launch<CompressTriplets>(xpu::n_blocks(nBlocksCT));
 
-#ifdef TIME_MONITORING_FULL
-      xpu::timings ct_time                                = xpu::pop_timer();
-      fEventTimeMonitor.CompressTriplets_time[fIteration] = ct_time;
-#endif
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings ct_time = xpu::pop_timer ();
+        fEventTimeMonitor.CompressTriplets_time[fIteration] = ct_time;
 
-#ifdef TIME_MONITORING_FULL
-      xpu::push_timer("FitTriplets_time");
-#endif
+        xpu::push_timer("FitTriplets_time");
+      }
 
-      fQueue.launch<FitTriplets>(
-        xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/ vfIterationData2[0].fNTriplets));
+      fQueue.launch<FitTriplets>(xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/vfIterationData2[0].fNTriplets));
 
-#ifdef TIME_MONITORING_FULL
-      xpu::timings ft_time                           = xpu::pop_timer();
-      fEventTimeMonitor.FitTriplets_time[fIteration] = ft_time;
-#endif
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings ft_time = xpu::pop_timer ();
+        fEventTimeMonitor.FitTriplets_time[fIteration] = ft_time;
 
-#ifdef TIME_MONITORING_FULL
-      xpu::push_timer("SortTriplets_time");
-#endif
+        xpu::push_timer("SortTriplets_time");
+      }
 
-#ifdef SORT_TRIPLETS_GPU
-      fQueue.launch<SortTriplets>(xpu::n_blocks(vfIterationData2[0].fNTriplets));
-#endif
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        fQueue.launch<SortTriplets>(xpu::n_blocks(vfIterationData2[0].fNTriplets));
+      }
 
-#ifdef TIME_MONITORING_FULL
-      xpu::timings srt_time                           = xpu::pop_timer();
-      fEventTimeMonitor.SortTriplets_time[fIteration] = srt_time;
-#endif
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings srt_time = xpu::pop_timer ();
+        fEventTimeMonitor.SortTriplets_time[fIteration] = srt_time;
+      }
 
       fQueue.copy(fTripletConstructor.fvTriplets, xpu::d2h);
       fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
-#ifdef SORT_TRIPLETS_GPU
-      fQueue.copy(fTripletConstructor.fTripletSortHelper, xpu::d2h);
-      std::cout << " --- Sorted triplets --- " << vfIterationData2[0].fNTriplets << " --- \n";
-      for (int i = 0; i < vfIterationData2[0].fNTriplets; i += 100) {
-        //          std::cout<<" --- fTripletSortHelper["<<i<<"]: "<<fTripletConstructor.fTripletSortHelper[i]<<" --- \n";
-        printf("   --->>> triplet: %d; lsta: %d; lhit: %d\n", i, fTripletConstructor.fTripletSortHelper[i].lsta,
-               fTripletConstructor.fTripletSortHelper[i].lhit);
+
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        fQueue.copy(fTripletConstructor.fTripletSortHelper, xpu::d2h);
       }
-#endif
 
       xpu::h_view vfIterationData3{fTripletConstructor.fIterationData};
       fNTriplets = vfIterationData3[0].fNTriplets;
     }
-    else {
-      std::cout << "No triplets found" << std::endl;
-    }
-  }
-  else {
-    std::cout << "No doublets found" << std::endl;
   }
 
   fTripletConstructor.fIterationData.reset(0, xpu::buf_device);
@@ -412,11 +375,11 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
   fTripletConstructor.fgridFirstBinEntryIndex.reset(0, xpu::buf_io);
   fTripletConstructor.fgridEntries.reset(0, xpu::buf_io);
 
-  xpu::set<strGpuTripletConstructor>(fTripletConstructor);  //TODO: check if we need to reset all the buffers here
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);	//TODO: check if we need to reset all the buffers here
 
-#ifdef TIME_MONITORING_FULL
-  xpu::timings t                           = xpu::pop_timer();
-  fEventTimeMonitor.Total_time[fIteration] = t;
-  fEventTimeMonitor.PrintTimings(fIteration);
-#endif
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings t = xpu::pop_timer();
+    fEventTimeMonitor.Total_time[fIteration] = t;
+    fEventTimeMonitor.PrintTimings(fIteration);
+  }
 }
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
index a2472bee5a..3f5132641b 100644
--- a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
@@ -1,13 +1,20 @@
-#pragma once
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTrackFinderSetup.h
+/// \brief The class is responsible for setting up the environment and the order in which kernels are launched to perform tracking using XPU
+
+#pragma once  // include this header only once per compilation unit
 
 #include "CaBranch.h"
-#include "CaGpuParameters.h"
-#include "CaGpuTimeMonitor.h"
-#include "CaGpuTripletConstructor.h"
 #include "CaHit.h"
+#include "KfTrackParam.h"
 #include "CaVector.h"
+#include "CaGpuParameters.h"
 #include "CaWindowData.h"
-#include "KfTrackParam.h"
+#include "CaGpuTripletConstructor.h"
+#include "CaGpuTimeMonitor.h"
 
 #include <xpu/host.h>
 
@@ -41,8 +48,10 @@ namespace cbm::algo::ca
     ///Set the GPU tracking parameters
     void SetupParameters();
 
+    /// Set the input grid data for the GPU tracking
     void SetupGrid();
 
+    /// Set the material map for the GPU tracking
     void SetupMaterialMap();
 
     /// Set the input data for the GPU tracking
@@ -55,31 +64,29 @@ namespace cbm::algo::ca
     void RunGpuTrackingSetup();
 
     /// Get the number of triplets
-    unsigned int GetNofTriplets() const { return fNTriplets; }  //fTripletConstructor.fTempData[0].fNTriplets; }
+    unsigned int GetNofTriplets() const { return fNTriplets; }//fTripletConstructor.fTempData[0].fNTriplets; }
 
-    ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; }  //TODO: switch back
-    //    ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
+    ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
+//    ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
 
     /// Get timings
     XpuTimings& GetTimings() { return fEventTimeMonitor; }
 
     /// Get triplets
-    //    const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; }
+//    const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; }
+
+//    static constexpr bool fGPU_time_monitoring = true;  // print debug info
 
    private:
-    const Parameters<fvec>& fParameters;       ///< Object of Framework parameters class
-    const cbm::algo::kf::Setup<fvec>& fSetup;  ///< Setup instance
-    WindowData& frWData;                       ///< Reference to the window data
-    xpu::queue fQueue;                         ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid?
+    const Parameters<fvec>& fParameters;            ///< Object of Framework parameters class
+    WindowData& frWData;    ///< Reference to the window data
+    xpu::queue fQueue;  ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid?
     ca::GpuTripletConstructor fTripletConstructor;  ///< GPU triplet constructor
-    unsigned int fIteration;                        ///< Iteration number
-    int fNHits;
-
-    int fNTriplets;
+    unsigned int fIteration;  ///< Iteration number
+    int fNHits;	///< Number of hits
 
-    int maxDoubletsFromHit     = 150.;
-    int maxTripletsFromDoublet = 15.;
+    int fNTriplets;  ///< Number of triplets
 
     XpuTimings fEventTimeMonitor;
   };
-}  // namespace cbm::algo::ca
+}
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
index 71d43827b4..adeb92d0da 100644
--- a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
@@ -1,3 +1,11 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTripletConstructor.cxx
+/// \brief The class contains data and kernels for running CA tracking on CPU and GPU using XPU libraries
+
+
 #include "CaGpuTripletConstructor.h"
 
 using namespace cbm::algo::ca;
@@ -29,10 +37,9 @@ XPU_D void CompressTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletCo
 
 XPU_EXPORT(FitTriplets);
 XPU_D void FitTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitTriplets(ctx); }
-#ifdef SORT_TRIPLETS_GPU
+
 XPU_EXPORT(SortTriplets);
 XPU_D void SortTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().SortTripletsFunc(ctx); }
-#endif
 
 //XPU_D void GpuTripletConstructor::TestFunc(TestFunc::context& ctx) const
 //{
@@ -41,32 +48,32 @@ XPU_D void SortTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstr
 
 XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
-  if (iGThread >= fIterationData[0].fNHits) return;
+  if(iGThread >= fIterationData[0].fNHits) return;
 
-  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
-  int ista             = fvHits[ihitl].Station();
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];//iGThread;
+  int ista = fvHits[ihitl].Station();
 
-  //  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
-  if (ista < 0 || ista > (int) fParams_const[fIteration].fNStations - 3) return;
+//  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if(ista < 0 || ista > (int)fParams_const[fIteration].fNStations - 3) return;
 
-  //  const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista);
-  //  const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+//  const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista);
+//  const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1);
   const ca::GpuStation* sta_l = &fStations_const[ista];
   const ca::GpuStation* sta_m = &fStations_const[ista + 1];
 
   float stal_l_timeInfo = sta_l->timeInfo;
-  float sta_m_z         = sta_m->fZ;
+  float sta_m_z = sta_m->fZ;
 
   const auto& hitl = fvHits[ihitl];
 
-  //  const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ());
-  //  const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli;
-  //  const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli;
+//  const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ());
+//  const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli;
+//  const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli;
   const float dzli = 1.f / (hitl.Z() - fParams_const[fIteration].GetTargetPositionZ());
-  const float tx   = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli;
-  const float ty   = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli;
+  const float tx = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli;
+  const float ty = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli;
 
   kf::TrackParamBase<float>& fit_params = fvTrackParams[ihitl];
 
@@ -77,68 +84,64 @@ XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const
   fit_params.Ty()   = ty;
   fit_params.Qp()   = 0.;
   fit_params.Time() = hitl.T();
-  //  fit_params.Vi()   = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv;
-  fit_params.Vi() = fParams_const[fIteration].SpeedOfLightInv;
+//  fit_params.Vi()   = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv;
+  fit_params.Vi()   = constants::phys::SpeedOfLightInv;//fParams_const[fIteration].SpeedOfLightInv;
 
-  //  float txErr2           = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.;
-  //  float qpErr2           = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.;
-  float txErr2 = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.;
-  float qpErr2 = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.;
+//  float txErr2           = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.;
+//  float qpErr2           = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.;
+  float txErr2           = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.;
+  float qpErr2           = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.;
 
   fit_params.ResetErrors(1., 1., txErr2, txErr2, qpErr2, (stal_l_timeInfo ? hitl.dT2() : 1.e6), 1.e10);
 
-  fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
+  fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].maxQp);
 
   fit_params.C(0, 0) = hitl.dX2();
   fit_params.C(1, 0) = hitl.dXY();
   fit_params.C(1, 1) = hitl.dY2();
 
-  //  fit_params.Ndf()     = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.;
-  fit_params.Ndf()     = /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].primaryFlag ? 2. : 0.;
+//  fit_params.Ndf()     = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.;
+  fit_params.Ndf()     = fParams_const[fIteration].primaryFlag ? 2. : 0.;
   fit_params.NdfTime() = (stal_l_timeInfo ? 0 : -1);
 
-  //  ca::GpuFieldRegion fld0;
+//  ca::GpuFieldRegion fld0;
   {
     int sta1 = ista;
-    if (sta1 == 0) sta1 = 1;
+    if(sta1 == 0) sta1 = 1;
     int sta0 = sta1 / 2;  // station between ista and the target
-    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params);
-    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue& B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue& B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params);
+//    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params);
+//    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue &B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue &B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params);
     //fld0.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B0, fParams[fIterationData[0].fIteration].GetStation(sta0).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(sta1).fZ);
-    ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB,
-                                                   fParams_const[fIteration].GetTargetPositionZ(), B0,
-                                                   fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ);
+    ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB, fParams_const[fIteration].GetTargetPositionZ(), B0, fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ);
   }
-  //  ca::GpuFieldRegion fld1;
+//  ca::GpuFieldRegion fld1;
   {  // field, made by the left hit, the middle station and the right station
-    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params);
-    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
-    //    const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue& B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue& B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue& B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params);
-    //    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
-    ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2,
-                                                   fStations_const[ista + 2].fZ);
+//    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params);
+//    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+//    const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue &B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue &B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue &B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params);
+//    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+    ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2, fStations_const[ista + 2].fZ);
   }
 
-  FilterWithTargetAtLine(&fit_params, &/*fld0*/ ctx.smem().fld0_shared[ctx.thread_idx_x()]);
+  FilterWithTargetAtLine(&fit_params, &/*fld0*/ctx.smem().fld0_shared[ctx.thread_idx_x()]);
 
-  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())],
-                     /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
+  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())], /*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].maxQp);
 
   // extrapolate to the middle hit
-  ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ ctx.smem().fld1_shared[ctx.thread_idx_x()]);
+  ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ctx.smem().fld1_shared[ctx.thread_idx_x()]);
 
-  //  ExtrapolateLine(tr_par, sta_m->fZ, F1);	//TODO: check if ExtrapolateStep is enough
+//  ExtrapolateLine(tr_par, sta_m->fZ, F1);	//TODO: check if ExtrapolateStep is enough
 }
 
 XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if (iGThread >= fIterationData[0].fNHits) return;
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if(iGThread >= fIterationData[0].fNHits) return;
 
 #if 0
   if (ctx.thread_idx_x() == 0) {
@@ -146,15 +149,15 @@ XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
   }
 #endif
 
-  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
-  int ista             = fvHits[ihitl].Station();
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];//iGThread;
+  int ista = fvHits[ihitl].Station();
 
-  if (ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if(ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
 
   int collectedHits = 0;
-  int maxNHits      = fParams[fIterationData[0].fIteration].maxDoubletsFromHit;
+  int maxNHits = constants::gpu::MaxDoubletsFromHit;//fParams[fIterationData[0].fIteration].maxDoubletsFromHit;
 
-  CollectHits(iGThread, ista + 1, ihitl, maxNHits, &collectedHits);
+  CollectHits(iGThread, ista+1, ihitl, maxNHits, &collectedHits);
 
   xpu::atomic_add(&(fIterationData[0].fNDoublets), collectedHits);
 
@@ -170,12 +173,13 @@ XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
 
 XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
-  if (iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit) return;
+//  if( iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit ) return;
+  if( iGThread >= fIterationData[0].fNHits * constants::gpu::MaxDoubletsFromHit ) return;
 
-  if (fHitDoublets[iGThread].hit2 != 0) {
-    int index                          = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1);
+  if( fHitDoublets[iGThread].hit2 != 0 ) {
+    int index = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1);
     fHitDoubletsCompressed[index].hit1 = fHitDoublets[iGThread].hit1;
     fHitDoubletsCompressed[index].hit2 = fHitDoublets[iGThread].hit2;
   }
@@ -183,25 +187,24 @@ XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ct
 
 XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if (iGThread >= fIterationData[0].fNDoublets) return;
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if(iGThread >= fIterationData[0].fNDoublets) return;
 
   int hit_l = fHitDoubletsCompressed[iGThread].hit1;
   int hit_m = fHitDoubletsCompressed[iGThread].hit2;
 
-  if (hit_l < 0 || hit_m < 0) return;
+  if(hit_l < 0 || hit_m < 0) return;
 
   int ista = fvHits[hit_l].Station();
 
-  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
 
   const ca::GpuStation& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
   const ca::GpuStation& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
 
   auto fit_params = fvTrackParams[hit_l];
 
-  bool isMomentumFitted =
-    (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0));
+  bool isMomentumFitted = (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0));
   float maxQp = fParams[fIterationData[0].fIteration].maxQp;
 
   // fit doublet
@@ -219,23 +222,22 @@ XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const
 
   FilterTime(&fit_params, t_2, dt2_2, sta_m.timeInfo);
 
-  //      fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp);	//TODO: check why it is required
+//      fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp);	//TODO: check why it is required
   const float qp0 = isMomentumFitted ? fit_params.GetQp() : maxQp;
 
-  MultipleScattering(&fit_params,
-                     fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0);
+  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0);
 
   fvTrackParamsDoublets[iGThread] = fit_params;
-  //      fFit.SetQp0(fFit.Tr().Qp());	//TODO: check why it is required and how it is used later, have to be stored different way
+//      fFit.SetQp0(fFit.Tr().Qp());	//TODO: check why it is required and how it is used later, have to be stored different way
 }
 
 XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if (iGThread >= fIterationData[0].fNDoublets) return;
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if(iGThread >= fIterationData[0].fNDoublets) return;
 
   int hit_l = fHitDoubletsCompressed[iGThread].hit1;
-  //  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
+//  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
 
 #if 0
   if (ctx.thread_idx_x() == 0) {
@@ -245,30 +247,24 @@ XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
 
   int ista = fvHits[hit_l].Station();
 
-  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
 
   //
   ista = fvHits[hit_l].Station();
   //
 
-  //  const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
-  //  const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+//  const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
+//  const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
   const ca::GpuStation& sta_r = fParams[fIterationData[0].fIteration].GetStation(ista + 2);
 
   auto& fit_params = fvTrackParamsDoublets[iGThread];
 
-  ca::GpuFieldRegion
-    fld1;  //TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified
-  {        // field, made by the left hit, the middle station and the right station
-    ca::GpuFieldValue B0 =
-      fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params);
-    ca::GpuFieldValue B1 =
-      fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
-    ca::GpuFieldValue B2 =
-      fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
-    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1,
-             fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2,
-             fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+  ca::GpuFieldRegion fld1;	//TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified
+  {  // field, made by the left hit, the middle station and the right station
+    ca::GpuFieldValue B0 = fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    fld1.Set (B0, fParams[fIterationData[0].fIteration].GetStation (ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
   }
 
   ExtrapolateStep(&fit_params, sta_r.fZ, 0.f, &fld1);
@@ -276,9 +272,9 @@ XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
   //TODO: add isDoubletGood check if needed
 
   int collectedHits = 0;
-  int maxNHits      = fParams[fIterationData[0].fIteration].maxTripletsFromDoublet;
+  int maxNHits = constants::gpu::MaxTripletsFromDoublet;//fParams[fIterationData[0].fIteration].maxTripletsFromDoublet;
 
-  CollectHitsTriplets(iGThread, ista + 2, maxNHits, &collectedHits);
+  CollectHitsTriplets(iGThread, ista+2, maxNHits, &collectedHits);
 
   xpu::atomic_add(&(fIterationData[0].fNTriplets), collectedHits);
 
@@ -295,11 +291,12 @@ XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
 
 XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if (iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet) return;
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+//  if( iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet ) return;
+  if( iGThread >= fIterationData[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet ) return;
 
-  if (fHitTriplets[iGThread].hit3 != 0) {
-    int index                          = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1);
+  if( fHitTriplets[iGThread].hit3 != 0 ) {
+    int index = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1);
     fHitTripletsCompressed[index].hit1 = fHitTriplets[iGThread].hit1;
     fHitTripletsCompressed[index].hit2 = fHitTriplets[iGThread].hit2;
     fHitTripletsCompressed[index].hit3 = fHitTriplets[iGThread].hit3;
@@ -308,9 +305,9 @@ XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ct
 
 XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 {
-  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
-  if (iGThread >= fIterationData[0].fNTriplets) return;
+  if(iGThread >= fIterationData[0].fNTriplets) return;
 
   int hit_l = fHitTripletsCompressed[iGThread].hit1;
 
@@ -320,22 +317,21 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
   ista[1] = ista[0] + 1;
   ista[2] = ista[1] + 1;
 
-  //  ca::GpuStation sta[3];
-  const ca::GpuStation* sta[3];
+//  ca::GpuStation sta[3];
+  const ca::GpuStation *sta[3];
   for (int is = 0; is < 3; ++is) {
-    //    sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]);
+//    sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]);
     sta[is] = &fStations_const[ista[is]];
   };
 
   bool isMomentumFitted = ((sta[0]->fieldStatus != 0) || (sta[1]->fieldStatus != 0) || (sta[2]->fieldStatus != 0));
 
-  float ndfTrackModel = 4;                    // straight line
+  float ndfTrackModel = 4;                     // straight line
   ndfTrackModel += isMomentumFitted ? 1 : 0;  // track with momentum
 
   const float maxQp = fParams[fIterationData[0].fIteration].maxQp;
 
-  ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2,
-                            fHitTripletsCompressed[iGThread].hit3};
+  ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2, fHitTripletsCompressed[iGThread].hit3};
 
   float x[3], y[3], z[3], t[3];
   float dt2[3];
@@ -343,7 +339,7 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
   for (int ih = 0; ih < 3; ++ih) {
     const ca::Hit& hit = fvHits[ihit[ih]];
-    mxy[ih]            = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+    mxy[ih] = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
 
     x[ih]   = hit.X();
     y[ih]   = hit.Y();
@@ -362,33 +358,31 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
   float ty[3] = {(y[1] - y[0]) / (z[1] - z[0]), (y[2] - y[0]) / (z[2] - z[0]), (y[2] - y[1]) / (z[2] - z[1])};
   for (int ih = 0; ih < 3; ++ih) {
     float dz = (sta[ih]->fZ - z[ih]);
-    B[ih]    = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz);
+    B[ih]   = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz);
   };
 
   fld.Set(B[0], sta[0]->fZ, B[1], sta[1]->fZ, B[2], sta[2]->fZ);
-  fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(),
-                B[0], sta[0]->fZ, B[1], sta[1]->fZ);
+  fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B[0], sta[0]->fZ, B[1], sta[1]->fZ);
 
-  kf::TrackParamBase<float>
-    fit_params;  // = fvTrackParamsTriplets[iGThread];	//TODO: set all to 0 and recheck - done, no difference
-  //  auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()];	//TODO: shared memory cannot be used for classes with preinitialized data
+  kf::TrackParamBase<float> fit_params;// = fvTrackParamsTriplets[iGThread];	//TODO: set all to 0 and recheck - done, no difference
+//  auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()];	//TODO: shared memory cannot be used for classes with preinitialized data
 
   fit_params.ResetCovMatrix();
 
   // initial parameters
   {
-    float dz01      = 1.f / (z[1] - z[0]);
-    fit_params.Tx() = (x[1] - x[0]) * dz01;
-    fit_params.Ty() = (y[1] - y[0]) * dz01;
-    fit_params.Qp() = 0.f;
-    fit_params.Vi() = 0.f;
+    float dz01 = 1.f / (z[1] - z[0]);
+    fit_params.Tx()    = (x[1] - x[0]) * dz01;
+    fit_params.Ty()    = (y[1] - y[0]) * dz01;
+    fit_params.Qp()    = 0.f;
+    fit_params.Vi()    = 0.f;
   }
 
   bool not_fitted = false;
-  float qp0       = 0;
+  float qp0 = 0;
   // repeat several times in order to improve the precision
-  for (int iiter = 0; iiter < 2 /*nIterations*/; ++iiter) {
-    // fit forward
+  for (int iiter = 0; iiter < 2/*nIterations*/; ++iiter) {
+  // fit forward
     {
       qp0 = fit_params.GetQp();
       if (qp0 > maxQp) qp0 = maxQp;
@@ -407,17 +401,17 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
       fit_params.Ndf()     = -ndfTrackModel + 2;
       fit_params.NdfTime() = sta[0]->timeInfo ? 0 : -1;
-      //  add the target constraint
+    //  add the target constraint
       FilterWithTargetAtLine(&fit_params, &fldTarget);
       for (int ih = 1; ih < 3; ++ih) {
         ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
 
-        //	Extrapolate(&fit_params, z[ih], 0.f, &fld);	//TODO: add qp0 instead of 0.f
+//	Extrapolate(&fit_params, z[ih], 0.f, &fld);	//TODO: add qp0 instead of 0.f
 
-        int bin       = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY());
+        int bin = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY());
         auto radThick = fMaterialMapTables[bin];
 
-        MultipleScattering(&fit_params, radThick, qp0);  //TODO: why it was commented in original gpu code?
+        MultipleScattering(&fit_params, radThick, qp0);	//TODO: why it was commented in original gpu code?
 
         float Tx    = fit_params.Tx();
         float Ty    = fit_params.Ty();
@@ -428,29 +422,26 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
         float tt    = xpu::sqrt(txtx1 + tyty);
         float h2    = h * h;
         float qp0t  = qp0 * tt;
-        float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
+        float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 *  0.333333f, c6 = -c3 * 0.25f;
 
         float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
-        float a  = ((tt
-                    + fParams[fIterationData[0].fIteration].particleMass
-                        * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t)
-                   * radThick * s0 * s0);
-        //        fit_params.C(2, 2) += 0.001;//txtx1 * a;	//TODO: check if it is needed. Switching on leads to the strong ghosts increase
+        float a = ((tt + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
+//        fit_params.C(2, 2) += 0.001;//txtx1 * a;	//TODO: check if it is needed. Switching on leads to the strong ghosts increase
 
         fit_params.C(3, 2) += Tx * Ty * a;
         fit_params.C(3, 3) += (1. + tyty) * a;
 
         EnergyLossCorrection(&fit_params, radThick, -1.f);
-        FilterXY(&fit_params, mxy[ih]);
-        FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
+        FilterXY (&fit_params, mxy[ih]);
+        FilterTime (&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
       }
     }
 
-    if (iiter == 1) break;
+    if(iiter == 1) break;
 
     // fit backward
     {
-      qp0 = fit_params.GetQp();
+      qp0 = fit_params.GetQp ();
       if (qp0 > maxQp) qp0 = maxQp;
       if (qp0 < -maxQp) qp0 = -maxQp;
 
@@ -463,8 +454,8 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
       fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[2]->timeInfo ? dt2[2] : 1.e6f), 1.e2f);
 
-      fit_params.Ndf()     = -ndfTrackModel + 2;
-      fit_params.NdfTime() = sta[2]->timeInfo ? 0 : -1;
+      fit_params.Ndf () = -ndfTrackModel + 2;
+      fit_params.NdfTime () = sta[2]->timeInfo ? 0 : -1;
 
       for (int ih = 1; ih >= 0; --ih) {
         ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
@@ -479,35 +470,32 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
   float chi2 = fit_params.GetChiSq() + fit_params.GetChiSqTime();
 
-  if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite(chi2) || chi2 < 0.f
-      || not_fitted) {
-    chi2 = -1.;
+  if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite (chi2) || chi2 < 0.f || not_fitted){
+      chi2 = -1.;
   }
 
-  const float qp  = fit_params.GetQp();
+  const float qp = fit_params.GetQp();
   const float Cqp = fit_params.C(4, 4) + 0.001;
-  fvTriplets[iGThread] =
-    ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0, chi2, qp, Cqp, fit_params.Tx(),
-                fit_params.C(2, 2), fit_params.Ty(), fit_params.C(3, 3), isMomentumFitted);
-//  delete fit_params;
-#ifdef SORT_TRIPLETS_GPU
-  fTripletSortHelper[iGThread].tr_id = iGThread;
-  fTripletSortHelper[iGThread].lsta  = ista[0];
-  fTripletSortHelper[iGThread].lhit  = ihit[0];
-#endif
+  fvTriplets[iGThread] = ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0,
+				     chi2, qp, Cqp, fit_params.Tx(), fit_params.C(2, 2), fit_params.Ty(),
+				     fit_params.C(3, 3), isMomentumFitted);
+
+  if constexpr (constants::gpu::GpuSortTriplets) {
+    fTripletSortHelper[iGThread].tr_id = iGThread;
+//    fTripletSortHelper[iGThread].lsta  = ista[0];
+    fTripletSortHelper[iGThread].lhit  = ihit[0];
+  }
 }
 
-#ifdef SORT_TRIPLETS_GPU
 XPU_D void GpuTripletConstructor::SortTripletsFunc(SortTriplets::context& ctx) const
 {
   //TODO: implement sorting of triplets on GPU or make tracking order independent
 }
-#endif
 
 XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const
 {
   // Collect hits in the station iSta and make doublets
-  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  const ca::GpuStation& sta = fParams[fIterationData[0].fIteration].GetStation(iSta);
   kf::TrackParamBase<float> fit_params = fvTrackParams[iHit];
 
   fit_params.ChiSq() = 0.;
@@ -519,20 +507,16 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
   const auto& gpuGrid = fvGpuGrid[iSta];
 
-  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
-                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
-  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
-                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
 
-  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
-                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
-                          pick_range_x, pick_range_y);
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]), &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(), pick_range_x, pick_range_y);
   unsigned int ih_gpu = 0;
 
   while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
-                                                                            //    if (fvbHitSuppressed[ih]) {
-                                                                            //      continue;
-                                                                            //    }
+//    if (fvbHitSuppressed[ih]) {
+//      continue;
+//    }
     const ca::Hit& hit = fvHits[ih_gpu];
 
     // check time-boundaries
@@ -547,9 +531,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
     // check y-boundaries
     float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
-    float C11 =
-      fit_params.C(1, 1)
-      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+    float C11 = fit_params.C(1, 1) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
 
     float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
 
@@ -561,9 +543,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
     // check x-boundaries
     float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
-    float C00 =
-      fit_params.C(0, 0)
-      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+    float C00 = fit_params.C(0, 0) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
 
     float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
 
@@ -574,9 +554,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
     }
 
     // check chi2
-    float C10 = fit_params.C(1, 0)
-                + (hit.Z() - fit_params.Z())
-                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+    float C10 = fit_params.C(1, 0) + (hit.Z() - fit_params.Z()) * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
 
     ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
 
@@ -589,9 +567,9 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
       const float HCH = F0;
 
-      const float wi     = 1.f / (mxy.Dx2() + HCH);
+      const float wi = 1.f / (mxy.Dx2() + HCH);
       const float zetawi = zeta * wi;
-      chi2x              = mxy.NdfX() * zeta * zetawi;
+      chi2x = mxy.NdfX() * zeta * zetawi;
 
       const float K1 = F1 * wi;
 
@@ -606,8 +584,8 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
     float chi2u = 0.;
     {  // filter U measurement, we need only chi2 here
       const float cosPhi = -mxy.Dxy() / mxy.Dx2();
-      const float u      = cosPhi * mxy.X() + mxy.Y();
-      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
+      const float u = cosPhi * mxy.X() + mxy.Y();
+      const float du2 = mxy.Dy2() + cosPhi * mxy.Dxy();
 
       const float zeta = cosPhi * x + y - u;
 
@@ -616,7 +594,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
       const float HCH = (F0 * cosPhi + F1);
 
-      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
+      chi2u += mxy.NdfY () * zeta * zeta / (du2 + HCH);
     }
 
     if (chi2x > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
@@ -625,7 +603,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
     if (chi2x + chi2u > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
       continue;
     }
-    int id                = iThread * maxNHits + *collectedHits;
+    int id = iThread * maxNHits + *collectedHits;
     fHitDoublets[id].hit1 = iHit;
     fHitDoublets[id].hit2 = ih_gpu;
     *collectedHits += 1;
@@ -635,7 +613,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
 XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const
 {
-  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  const ca::GpuStation& sta = fParams[fIterationData[0].fIteration].GetStation(iSta);
   kf::TrackParamBase<float> fit_params = fvTrackParamsDoublets[iThread];
 
   fit_params.ChiSq() = 0.;
@@ -647,20 +625,16 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
   const auto& gpuGrid = fvGpuGrid[iSta];
 
-  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
-                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
-  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
-                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
 
-  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
-                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
-                          pick_range_x, pick_range_y);
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]), &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(), pick_range_x, pick_range_y);
   unsigned int ih_gpu = 0;
 
   while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
-                                                                            //    if (fvbHitSuppressed[ih]) {
-                                                                            //      continue;
-                                                                            //    }
+//    if (fvbHitSuppressed[ih]) {
+//      continue;
+//    }
     const ca::Hit& hit = fvHits[ih_gpu];
 
     // check time-boundaries
@@ -677,9 +651,7 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
     // check y-boundaries
     float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
-    float C11 =
-      fit_params.C(1, 1)
-      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+    float C11 = fit_params.C(1, 1) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
 
     float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
 
@@ -691,9 +663,7 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
     // check x-boundaries
     float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
-    float C00 =
-      fit_params.C(0, 0)
-      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+    float C00 = fit_params.C(0, 0) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
 
     float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
 
@@ -704,24 +674,22 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
     }
 
     // check chi2
-    float C10 = fit_params.C(1, 0)
-                + (hit.Z() - fit_params.Z())
-                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+    float C10 = fit_params.C(1, 0) + (hit.Z() - fit_params.Z()) * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
 
     ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
 
     float chi2x = 0.;
     {  // filter X measurement
-      const float zeta = x - mxy.X();
+      const float zeta = x - mxy.X ();
 
       const float F0 = C00;
       const float F1 = C10;
 
       const float HCH = F0;
 
-      const float wi     = 1.f / (mxy.Dx2() + HCH);
+      const float wi = 1.f / (mxy.Dx2 () + HCH);
       const float zetawi = zeta * wi;
-      chi2x              = mxy.NdfX() * zeta * zetawi;
+      chi2x = mxy.NdfX () * zeta * zetawi;
 
       const float K1 = F1 * wi;
 
@@ -735,9 +703,9 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
     float chi2u = 0.;
     {  // filter U measurement, we need only chi2 here
-      const float cosPhi = -mxy.Dxy() / mxy.Dx2();
-      const float u      = cosPhi * mxy.X() + mxy.Y();
-      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
+      const float cosPhi = -mxy.Dxy () / mxy.Dx2 ();
+      const float u = cosPhi * mxy.X () + mxy.Y ();
+      const float du2 = mxy.Dy2 () + cosPhi * mxy.Dxy ();
 
       const float zeta = cosPhi * x + y - u;
 
@@ -746,7 +714,7 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
       const float HCH = (F0 * cosPhi + F1);
 
-      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
+      chi2u += mxy.NdfY () * zeta * zeta / (du2 + HCH);
     }
 
     if (chi2x > fParams[fIterationData[0].fIteration].tripletChi2Cut) {
@@ -756,41 +724,37 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
       continue;
     }
 
-    int id                = iThread * maxNHits + *collectedHits;
+    int id = iThread * maxNHits + *collectedHits;
     fHitTriplets[id].hit1 = fHitDoubletsCompressed[iThread].hit1;
     fHitTriplets[id].hit2 = fHitDoubletsCompressed[iThread].hit2;
     fHitTriplets[id].hit3 = ih_gpu;
     *collectedHits += 1;
   }  // loop over the hits in the area
-  //  if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits);
+//  if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits);
 }
 
-XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par,
-                                                         const ca::GpuFieldRegion* F0) const
+XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const
 {
   // Add the target constraint to a straight line track
 
   float eXY[2];
   float Jx[7], Jy[7];
 
-  GetExtrapolatedXYline(tr_par,
-                        /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].GetTargetPositionZ(), F0,
-                        eXY /*, &eY*/, Jx, Jy);
+  GetExtrapolatedXYline(tr_par, /*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].GetTargetPositionZ(), F0, eXY/*, &eY*/, Jx, Jy);
 
-  FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].targetMeasurement),
-                       eXY /*, eY*/, Jx, Jy);
+  FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].targetMeasurement), eXY/*, eY*/, Jx, Jy);
 }
 
-XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(
-  kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,  // float* extrY,
-  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
-  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
-  float* Jx, float* Jy) const
+XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,// float* extrY,
+//                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+//                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+				     float* Jx,
+				     float* Jy) const
 {
   // extrapolate track assuming it is straight (qp==0)
   // return the extrapolated X, Y and the derivatives of the extrapolated X and Y
 
-  const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(1. / 6.), c12i(1. / 12.);
+  const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(/*1. / 6.*/0.166667), c12i(/*1. / 12.*/0.083333);	//TODO: test constants
 
   const float tx = tr_par->GetTx();
   const float ty = tr_par->GetTy();
@@ -816,8 +780,8 @@ XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(
   extrXY[0] = tr_par->GetX() + tx * dz;
   extrXY[1] = tr_par->GetY() + ty * dz;
 
-  //  Jx->fill(0.);  // for a case
-  //  Jy->fill(0.);
+//  Jx->fill(0.);  // for a case
+//  Jy->fill(0.);
 
   Jx[0] = 1.;
   Jx[1] = 0.;
@@ -837,11 +801,10 @@ XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(
   Jy[6] = 0.;
 }
 
-XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
-  kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
-  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
-  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
-  float* Jx, float* Jy) const
+XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,// fvec extrY,
+//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+						       float *Jx, float *Jy) const
 {
   // add a 2-D measurenent (x,y) at some z, that differs from fTr.GetZ()
   // extrX, extrY are extrapolated track parameters at z, Jx, Jy are derivatives of the extrapolation
@@ -849,7 +812,7 @@ XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
   // ! it is assumed that in the track covariance matrix all non-diagonal covariances are 0
   // ! except of C10
 
-  //  TrackParamV& T = fTr;
+//  TrackParamV& T = fTr;
 
   //zeta0 = T.x + Jx[2]*T.Tx() + Jx[3]*T.Ty() + Jx[4]*T.qp - x;
   //zeta1 = T.y + Jy[2]*T.Tx() + Jy[3]*T.Ty() + Jy[4]*T.qp - y;
@@ -862,8 +825,8 @@ XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
 
   // F = CH'
   const float F00 = tr_par->C(0, 0);
-  const float F01 = tr_par->C(1, 0);  //TODO: is it a bug???
-  const float F10 = F01;              //tr_par->C10();
+  const float F01 = tr_par->C(1, 0);	//TODO: is it a bug???
+  const float F10 = F01;//tr_par->C10();
   const float F11 = tr_par->C(1, 1);
   const float F20 = Jx[2] * tr_par->C(2, 2);
   const float F21 = Jy[2] * tr_par->C(2, 2);
@@ -878,11 +841,11 @@ XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
   float S10 = m->Dxy() + F10 + Jy[2] * F20 + Jy[3] * F30 + Jy[4] * F40;
   float S11 = m->Dy2() + F11 + Jy[2] * F21 + Jy[3] * F31 + Jy[4] * F41;
 
-  const float si     = 1. / (S00 * S11 - S10 * S10);
+  const float si = 1. / (S00 * S11 - S10 * S10);
   const float S00tmp = S00;
-  S00                = si * S11;
-  S10                = -si * S10;
-  S11                = si * S00tmp;
+  S00         = si * S11;
+  S10         = -si * S10;
+  S11         = si * S00tmp;
 
   tr_par->ChiSq() += zeta0 * zeta0 * S00 + 2. * zeta0 * zeta1 * S10 + zeta1 * zeta1 * S11;
   tr_par->Ndf() += m->NdfX() + m->NdfY();
@@ -933,19 +896,17 @@ XPU_D void GpuTripletConstructor::MultipleScattering(kf::TrackParamBase<float>*
   float h2    = h * h;
   float qp0t  = qp0 * t;
 
-  float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
+  float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 *  0.333333f, c6 = -c3 * 0.25f;
 
   float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
-  //  float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
-  float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t)
-             * radThick * s0 * s0);
+//  float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
+  float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
   tr_par->C(2, 2) += txtx1 * a;
   tr_par->C(3, 2) += tx * ty * a;
   tr_par->C(3, 3) += (1. + tyty) * a;
 }
 
-XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
-                                                  const ca::GpuFieldRegion* Field) const
+XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0, const ca::GpuFieldRegion* Field) const
 {
   const float c_light = 0.000299792458;
 
@@ -959,8 +920,8 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
 
   //   Runge-Kutta steps
   //
-  const float r0[7] = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()};
-  float R0[7][7]    = {{0.}};
+  const float r0[7]    = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()};
+  float R0[7][7] = {{0.}};
   for (int i = 0; i < 7; ++i) {
     R0[i][i] = 1.;
   }
@@ -971,16 +932,16 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
     for (int i = 0; i < 7; ++i) {
       rstep[i] = r0[i] + stepDz[step] * f[step - 1][i];
     }
-    const float z       = tr_par->GetZ() + stepDz[step];
+    const float z = tr_par->GetZ() + stepDz[step];
     ca::GpuFieldValue B = Field->Get(rstep[0], rstep[1], z);
-    const float tx2     = rstep[2] * rstep[2];
-    const float ty2     = rstep[3] * rstep[3];
-    const float txty    = rstep[2] * rstep[3];
-    const float L2      = 1.f + tx2 + ty2;
-    const float L2i     = 1.f / L2;
-    const float L       = xpu::sqrt(L2);
-    const float cL      = c_light * L;
-    const float cLqp0   = cL * qp0;
+    const float tx2   = rstep[2] * rstep[2];
+    const float ty2   = rstep[3] * rstep[3];
+    const float txty  = rstep[2] * rstep[3];
+    const float L2    = 1.f + tx2 + ty2;
+    const float L2i   = 1.f / L2;
+    const float L     = xpu::sqrt(L2);
+    const float cL    = c_light * L;
+    const float cLqp0 = cL * qp0;
 
     f[step][0]    = rstep[2];
     F[step][0][2] = 1.;
@@ -989,56 +950,56 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
     F[step][1][3] = 1.;
 
     const float f2tmp = txty * B.x - (1. + tx2) * B.y + rstep[3] * B.z;
-    f[step][2]        = cLqp0 * f2tmp;
+    f[step][2] = cLqp0 * f2tmp;
 
     F[step][2][2] = cLqp0 * (rstep[2] * f2tmp * L2i + rstep[3] * B.x - 2. * rstep[2] * B.y);
     F[step][2][3] = cLqp0 * (rstep[3] * f2tmp * L2i + rstep[2] * B.x + B.z);
     F[step][2][4] = cL * f2tmp;
 
-    const float f3tmp = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x;
-    f[step][3]        = cLqp0 * f3tmp;
-    F[step][3][2]     = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z);
-    F[step][3][3]     = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y);
-    F[step][3][4]     = cL * f3tmp;
+    const float f3tmp    = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x;
+    f[step][3]    = cLqp0 * f3tmp;
+    F[step][3][2] = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z);
+    F[step][3][3] = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y);
+    F[step][3][4] = cL * f3tmp;
 
     f[step][4] = 0.;
 
-    //    if (fDoFitVelocity) {	//TODO: check and switch it on back if needed (for triplets, probably, not needed)
-    ////      fvec vi       = rstep[6];
-    f[step][5]     = rstep[6] * L;
-    const float Li = 1.f / L;
-    F[step][5][2]  = rstep[6] * rstep[2] * Li;  // / L;
-    F[step][5][3]  = rstep[6] * rstep[3] * Li;  // / L;
-    F[step][5][4]  = 0.;
-    F[step][5][5]  = 0.;
-    F[step][5][6]  = L;
-    //    }
-    //    else {
-    //      const float vi       = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
-    //      f[step][5]    = rstep[6] * L;
-    //      F[step][5][2] = rstep[6] * rstep[2] / L;
-    //      F[step][5][3] = rstep[6] * rstep[3] / L;
-    //      F[step][5][4] =
-    //	  fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L /
-    //	  xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
-    //      F[step][5][5] = 0.;
-    //      F[step][5][6] = 0.;
-    //    }
+//    if (fDoFitVelocity) {	//TODO: check and switch it on back if needed (for triplets, probably, not needed)
+////      fvec vi       = rstep[6];
+      f[step][5]    = rstep[6] * L;
+      const float Li = 1.f / L;
+      F[step][5][2] = rstep[6] * rstep[2] * Li;// / L;
+      F[step][5][3] = rstep[6] * rstep[3] * Li;// / L;
+      F[step][5][4] = 0.;
+      F[step][5][5] = 0.;
+      F[step][5][6] = L;
+//    }
+//    else {
+//      const float vi       = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+//      f[step][5]    = rstep[6] * L;
+//      F[step][5][2] = rstep[6] * rstep[2] / L;
+//      F[step][5][3] = rstep[6] * rstep[3] / L;
+//      F[step][5][4] =
+//	  fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L /
+//	  xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+//      F[step][5][5] = 0.;
+//      F[step][5][6] = 0.;
+//    }
 
     f[step][6] = 0.;
   }  // end of Runge-Kutta step
 
-  float r[7]    = {0.};    // extrapolated parameters
+  float r[7]    = {0.};  // extrapolated parameters
   float R[7][7] = {{0.}};  // Jacobian of the extrapolation
 
-  float stepW[5];  // = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.};
+  float stepW[5];// = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.};
   stepW[0] = 0.;
   stepW[1] = h / 6.f;
   stepW[2] = h / 3.f;
   stepW[3] = h / 3.f;
   stepW[4] = h / 6.f;
-  //  const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f};
-  //  const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f};	//TODO: test constants: leads to a drop in efficiency
+//  const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f};
+//  const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f};	//TODO: test constants: leads to a drop in efficiency
 
   float k[5][7][7] = {{0.}};
   for (int step = 1; step <= 4; ++step) {
@@ -1083,7 +1044,7 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
   tr_par->Vi()   = r[6];
 
   //fTr.Vi()( fTr.Vi() < fvec(TrackParamV::kClightNsInv) ) = fvec(TrackParamV::kClightNsInv);
-  tr_par->Z() = z_out;  //zMasked;
+  tr_par->Z() = z_out;//zMasked;
 
   //          covariance matrix transport
   float C[7][7];
@@ -1114,7 +1075,7 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
 
 XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const
 {
-  // extrapolate the track assuming no field
+    // extrapolate the track assuming no field
 
   const float dz = zOut - tr_par->GetZ();
 
@@ -1128,14 +1089,14 @@ XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<floa
   const float j53 = dz * ty * vi / L;
   const float j56 = dz * L;
 
-  // transport parameters
+    // transport parameters
 
   tr_par->X() += tx * dz;
   tr_par->Y() += ty * dz;
   tr_par->Time() += L * vi * dz;
   tr_par->Z() = zOut;
 
-  // transport covariance matrix (see details in CaTrackFit)
+    // transport covariance matrix (see details in CaTrackFit)
 
   const float jc00 = tr_par->C(0, 0) + dz * tr_par->C(2, 0);
   const float jc02 = tr_par->C(0, 2) + dz * tr_par->C(2, 2);
@@ -1155,46 +1116,46 @@ XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<floa
 
   tr_par->C(0, 0) = jc00 + jc02 * dz;
   tr_par->C(1, 0) = jc10 + jc12 * dz;
-  //  tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz;
-  //  tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz;
-  //  tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz;
+//  tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz;
+//  tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz;
+//  tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz;
   tr_par->C(2, 0) += tr_par->C(2, 2) * dz;
   tr_par->C(3, 0) += tr_par->C(3, 2) * dz;
   tr_par->C(4, 0) += tr_par->C(4, 2) * dz;
   tr_par->C(5, 0) = jc50 + jc52 * dz;
-  //  tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz;
+//  tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz;
   tr_par->C(6, 0) += tr_par->C(6, 2) * dz;
 
   tr_par->C(1, 1) = jc11 + jc13 * dz;
-  //  tr_par->C21() = tr_par->C21() + tr_par->C23() * dz;
-  //  tr_par->C31() = tr_par->C31() + tr_par->C33() * dz;
-  //  tr_par->C41() = tr_par->C41() + tr_par->C43() * dz;
+//  tr_par->C21() = tr_par->C21() + tr_par->C23() * dz;
+//  tr_par->C31() = tr_par->C31() + tr_par->C33() * dz;
+//  tr_par->C41() = tr_par->C41() + tr_par->C43() * dz;
   tr_par->C(2, 1) += tr_par->C(2, 3) * dz;
   tr_par->C(3, 1) += tr_par->C(3, 3) * dz;
   tr_par->C(4, 1) += tr_par->C(4, 3) * dz;
   tr_par->C(5, 1) = jc51 + jc53 * dz;
-  //  tr_par->C61() = tr_par->C61() + tr_par->C63() * dz;
+//  tr_par->C61() = tr_par->C61() + tr_par->C63() * dz;
   tr_par->C(6, 1) += tr_par->C(6, 3) * dz;
 
   tr_par->C(5, 2) = jc52;
   tr_par->C(5, 3) = jc53;
   tr_par->C(5, 4) = jc54;
   tr_par->C(5, 5) = jc55 + jc52 * j52 + jc53 * j53 + jc56 * j56;
-  //  tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56;
+//  tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56;
   tr_par->C(6, 5) += tr_par->C(6, 2) * j52 + tr_par->C(6, 3) * j53 + tr_par->C(6, 6) * j56;
 }
 
 XPU_D void GpuTripletConstructor::FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const
 {
   {
-    ca::MeasurementU<float> mx;
+    kf::MeasurementU<float> mx;
     mx.SetCosPhi(1.f);
     mx.SetSinPhi(0.f);
     mx.SetU(mxy.X());
     mx.SetDu2(mxy.Dx2());
     mx.SetNdf(mxy.NdfX());
 
-    ca::MeasurementU<float> mu;
+    kf::MeasurementU<float> mu;
     mu.SetCosPhi(-mxy.Dxy() / mxy.Dx2());
     mu.SetSinPhi(1.f);
     mu.SetU(mu.CosPhi() * mxy.X() + mxy.Y());
@@ -1207,7 +1168,7 @@ XPU_D void GpuTripletConstructor::FilterXY(kf::TrackParamBase<float>* tr_par, co
   }
 }
 
-XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, const ca::MeasurementU<float>& m) const
+XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, const kf::MeasurementU<float>& m) const
 {
   float zeta, HCH;
   float F0, F1, F2, F3, F4, F5, F6;
@@ -1231,12 +1192,12 @@ XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, co
 
   // correction to HCH is needed for the case when sigma2 is so small
   // with respect to HCH that it disappears due to the roundoff error
-  //    fvec wi     = fMaskF / (m.Du2() + fvec(1.0000001) * HCH);
-  //    fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH);
+//    fvec wi     = fMaskF / (m.Du2() + fvec(1.0000001) * HCH);
+//    fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH);
 
   //TODO: (wi, zetawi): GPU results are different from CPU ones and depends on the order of the operations
   //TODO: need to compare effects made on the tracking efficiency and parameters quality
-  float wi           = 1.f / (m.Du2() + 1.0000001f * HCH);
+  float wi = 1.f / (m.Du2() + 1.0000001f * HCH);
   const float zetawi = zeta / (maskDoFilter ? m.Du2() : 0.f + HCH);
 
   wi = m.Du2() > 0.f ? wi : 0.f;
@@ -1259,8 +1220,8 @@ XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, co
   tr_par->Time() -= F5 * zetawi;
   tr_par->Vi() -= F6 * zetawi;
 
-  tr_par->C(0, 0) -= F0 * F0 * wi;
-  tr_par->C(1, 0) -= K1 * F0;
+  tr_par->C(0 ,0) -= F0 * F0 * wi;
+  tr_par->C(1 ,0) -= K1 * F0;
   tr_par->C(1, 1) -= K1 * F1;
 
   tr_par->C(2, 0) -= K2 * F0;
@@ -1295,8 +1256,7 @@ XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, co
 }
 
 //TODO: update this function
-XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2,
-                                             float timeInfo) const
+XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, float timeInfo) const
 {
   // filter track with a time measurement
 
@@ -1309,25 +1269,25 @@ XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par,
   const float F5 = tr_par->C(5, 5);
   const float F6 = tr_par->C(6, 5);
 
-  const float HCH = F5;  //tr_par->C55();
+  const float HCH = F5;//tr_par->C55();
 
-  // when dt0 is much smaller than current time error,
-  // set track time exactly to the measurement value without filtering
-  // it helps to keep the initial time errors reasonably small
-  // the calculations in the covariance matrix are not affected
+    // when dt0 is much smaller than current time error,
+    // set track time exactly to the measurement value without filtering
+    // it helps to keep the initial time errors reasonably small
+    // the calculations in the covariance matrix are not affected
 
   const bool maskDoFilter = (HCH < dt2 * 16.f) && timeInfo;
 
   float w = 1.f;
 
-  if (timeInfo <= 0.f) w = 0.f;
+  if( timeInfo <= 0.f ) w = 0.f;
 
-  //  float wi = 1.f / (dt2 + 1.0000001f * HCH);
+//  float wi = 1.f / (dt2 + 1.0000001f * HCH);
   float hch_d = 1.0000001f * HCH;
-  float den   = dt2 + hch_d;
-  float wi    = w / den;
+  float den = dt2 + hch_d;
+  float wi = w / den;
 
-  const float zeta   = tr_par->Time() - t;
+  const float zeta = tr_par->Time() - t;
   const float zetawi = w * zeta / (maskDoFilter ? dt2 : 0.f + HCH);
 
   tr_par->ChiSqTime() += maskDoFilter ? (zeta * zeta * wi) : 0.f;
@@ -1384,58 +1344,57 @@ XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par,
   tr_par->C(6, 6) -= K6 * F6;
 }
 
-XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick,
-                                                       float upstreamDirection) const
+XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick, float upstreamDirection) const
 {
   const float qp02 = xpu::max(tr_par->Qp() * tr_par->Qp(), 0.01f);
-  const float p2   = 1.f / qp02;
-  //  const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass;	//TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result
-  const float mass2   = fParams_const[0].particleMass * fParams_const[0].particleMass;
+  const float p2 = 1.f / qp02;
+//  const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass;	//TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result
+  const float mass2 = fParams_const[0].particleMass * fParams_const[0].particleMass;
   const float r_mass2 = 1.f / mass2;
-  const float E2      = mass2 + p2;
+  const float E2 = mass2 + p2;
 
   const float bg2 = p2 * r_mass2;
-  //ApproximateBetheBloch
+    //ApproximateBetheBloch
   const float kp0 = 2.33f;
   const float kp1 = 0.20f;
   const float kp2 = 3.00f;
   const float kp3 = 173e-9f;
   const float kp4 = 0.49848f;
 
-  const float mK   = 0.307075e-3f;
+  const float mK = 0.307075e-3f;
   const float _2me = 1.022e-3f;
-  const float rho  = kp0;
-  const float x0   = kp1 * 2.303f;
-  const float x1   = kp2 * 2.303f;
-  const float mI   = kp3;
-  const float mZA  = kp4;
+  const float rho = kp0;
+  const float x0 = kp1 * 2.303f;
+  const float x1 = kp2 * 2.303f;
+  const float mI = kp3;
+  const float mZA = kp4;
   const float maxT = _2me * bg2;
 
-  //*** Density effect
-  float d2         = 0.f;
-  const float x    = 0.5f * xpu::log(bg2);
+      //*** Density effect
+  float d2 = 0.f;
+  const float x = 0.5f * xpu::log(bg2);
   const float lhwI = xpu::log(28.816f * 1e-9f * xpu::sqrt(rho * mZA) / mI);
 
-  if (x > x1) {
+  if (x > x1){
     d2 = lhwI + x - 0.5f;
   }
-  if ((x > x0) && (x1 > x)) {
+  if ((x > x0) && (x1 > x)){
     const float r = (x1 - x) / (x1 - x0);
-    d2            = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r;
+    d2 = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r;
   }
 
-  const float bethe =
-    mK * mZA * (1.f + bg2) / bg2 * (0.5f * xpu::log(_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2);
+  const float bethe = mK * mZA * (1.f + bg2) / bg2
+	  * (0.5f * xpu::log (_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2);
 
   const float tr = xpu::sqrt(1.f + tr_par->Tx() * tr_par->Tx() + tr_par->Ty() * tr_par->Ty());
 
   const float dE = bethe * radThick * tr * 2.33f * 9.34961f;
 
-  const float ECorrected  = xpu::sqrt(E2) + upstreamDirection * dE;
+  const float ECorrected = xpu::sqrt (E2) + upstreamDirection * dE;
   const float E2Corrected = ECorrected * ECorrected;
 
-  float corr = xpu::sqrt(p2 / (E2Corrected - mass2));
-  if (xpu::isnan(corr) || xpu::isinf(corr)) {
+  float corr = xpu::sqrt (p2 / (E2Corrected - mass2));
+  if (xpu::isnan (corr) || xpu::isinf (corr)){
     corr = 1.f;
   }
 
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.h b/algo/ca/core/experimental/CaGpuTripletConstructor.h
index 722d520c9c..859bc796c8 100644
--- a/algo/ca/core/experimental/CaGpuTripletConstructor.h
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.h
@@ -1,40 +1,48 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTripletConstructor.h
+/// \brief The class contains data and kernels for running CA tracking on CPU and GPU using XPU libraries
+
 
 #pragma once  // include this header only once per compilation unit
 
-#include "CaDeviceImage.h"
+#include "CaMeasurementXy.h"
+#include "KfMeasurementU.h"
+#include "CaGpuParameters.h"
+#include "CaTriplet.h"
+#include "CaHit.h"
 #include "CaGpuGrid.h"
 #include "CaGpuGridArea.h"
 #include "CaGpuMaterialMap.h"
-#include "CaGpuParameters.h"
-#include "CaHit.h"
-#include "CaMeasurementU.h"
-#include "CaMeasurementXy.h"
-#include "CaTriplet.h"
+
+#include "CaDeviceImage.h"
 
 #include <xpu/device.h>
 #include <xpu/host.h>
 
 namespace cbm::algo
 {
-  static constexpr int kMaxNofStations = 24;
+//  static constexpr int kMaxNofStations = 24;
   // Block sizes / other compile time constants that need tuning
   enum CaGpuConstants
   {
 #if XPU_IS_CUDA
-    kSingletConstructorBlockSize      = 512,
-    kSingletConstructorItemsPerThread = 8,
+    kSingletConstructorBlockSize         = 512,
+    kSingletConstructorItemsPerThread    = 8,
 #else  // HIP, values ignored on CPU
-    kSingletConstructorBlockSize      = 64,  //1024,	//128 is optimal (preliminary tests)
-    kSingletConstructorItemsPerThread = 6,
-    kSortTripletsBlockSize            = 64,
-    kSortTripletsItemsPerThread       = 6,
+    kSingletConstructorBlockSize         = 64,//1024,	//128 is optimal (preliminary tests)
+    kSingletConstructorItemsPerThread    = 6,
+    kSortTripletsBlockSize               = 64,
+    kSortTripletsItemsPerThread 	 = 6,
 #endif
   };
 }  // namespace cbm::algo
 
 namespace cbm::algo::ca
 {
-  //  class Framework;
+//  class Framework;
   class GpuTripletConstructor;
 
   struct TrackingParams {
@@ -47,126 +55,128 @@ namespace cbm::algo::ca
   struct strGpuTripletConstructor : xpu::constant<GPUReco, GpuTripletConstructor> {
   };
 
-  struct hit_doublets {
+  struct hit_doublets
+  {
     ca::HitIndex_t hit1;
     ca::HitIndex_t hit2;
   };
 
-  struct hit_triplets {
+  struct hit_triplets
+  {
     ca::HitIndex_t hit1;
     ca::HitIndex_t hit2;
     ca::HitIndex_t hit3;
   };
 
-  struct sort_triplets {
+  struct sort_triplets
+  {
     unsigned int tr_id;
-    unsigned int lsta;
+//    unsigned int lsta;
     unsigned int lhit;
   };
 
-  struct shared_counters {
+  struct shared_counters
+  {
     int counter;
   };
 
-  struct shared_data {
-    //    ca::GpuStation fStations_shared[kMaxNofStations];
+  struct shared_data
+  {
+//    ca::GpuStation fStations_shared[kMaxNofStations];
     ca::GpuFieldRegion fld0_shared[kSingletConstructorBlockSize];
     ca::GpuFieldRegion fld1_shared[kSingletConstructorBlockSize];
   };
 
-  struct shared_data_fit_triplets {
-    //    ca::GpuStation fStations_shared[kMaxNofStations];
+  struct shared_data_fit_triplets
+  {
+//    ca::GpuStation fStations_shared[kMaxNofStations];
     kf::TrackParamBase<float> fit_params_shared[kSingletConstructorBlockSize];
   };
 
-  //  struct TestFunc : xpu::kernel<GPUReco> {
-  //    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-  //    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
-  //    using context       = xpu::kernel_context<shared_memory, constants>;
-  //    XPU_D void operator()(context&);
-  //  };
+//  struct TestFunc : xpu::kernel<GPUReco> {
+//    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+//    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
+//    using context       = xpu::kernel_context<shared_memory, constants>;
+//    XPU_D void operator()(context&);
+//  };
 
   // Declare Kernels
   struct MakeSinglets : xpu::kernel<GPUReco> {
     using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-    using shared_memory = shared_data;
+    using shared_memory     = shared_data;
     using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
     using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct MakeDoublets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
-    //    using shared_memory = shared_counters;
-    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
-    using context   = xpu::kernel_context<shared_memory, constants>;
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+//    using shared_memory = shared_counters;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct CompressDoublets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
-    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
-    using context    = xpu::kernel_context<shared_memory, constants>;
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct FitDoublets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
-    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
-    using context    = xpu::kernel_context<shared_memory, constants>;
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct MakeTriplets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
-    //    using shared_memory = shared_counters;
-    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
-    using context   = xpu::kernel_context<shared_memory, constants>;
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+//    using shared_memory = shared_counters;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct CompressTriplets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
-    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
-    using context    = xpu::kernel_context<shared_memory, constants>;
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct FitTriplets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
-    //    using shared_memory = shared_data_fit_triplets;
-    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
-    using context   = xpu::kernel_context<shared_memory, constants>;
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+//    using shared_memory = shared_data_fit_triplets;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
-//#define SORT_TRIPLETS_GPU
-#ifdef SORT_TRIPLETS_GPU
   struct SortTriplets : xpu::kernel<GPUReco> {
-    using block_size = xpu::block_size<kSortTripletsBlockSize>;
-    using sort_t =
-      xpu::block_sort<unsigned int, /*ca::Triplet*/ sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>;
+    using block_size    = xpu::block_size<kSortTripletsBlockSize>;
+    using sort_t        = xpu::block_sort<unsigned int, sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>;
     using shared_memory = sort_t::storage_t;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
     using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
-#endif
 
   struct IterationData {
-    int fNHits;              ///< Number of hits in grid
-    int fIteration;          ///< Iteration number
-    int fNDoublets;          ///< Number of doublets
-    int fNDoublets_counter;  ///< Number of doublets counter
-    int fNTriplets;          ///< Number of triplets
-    int fNTriplets_counter;  ///< Number of triplets counter
+    int fNHits;  		///< Number of hits in grid
+    int fIteration;  		///< Iteration number
+    int fNDoublets;  		///< Number of doublets
+    int fNDoublets_counter;  	///< Number of doublets counter
+    int fNTriplets;  		///< Number of triplets
+    int fNTriplets_counter;  	///< Number of triplets counter
   };
 
   class GpuTripletConstructor {
    public:
     ///                             ------  FUNCTIONAL PART ------
 
-    //    XPU_D void TestFunc(TestFunc::context&) const;
+//    XPU_D void TestFunc(TestFunc::context&) const;
 
     XPU_D void MakeSinglets(MakeSinglets::context&) const;
 
@@ -181,30 +191,28 @@ namespace cbm::algo::ca
     XPU_D void CompressTriplets(CompressTriplets::context&) const;
 
     XPU_D void FitTriplets(FitTriplets::context&) const;
-#ifdef SORT_TRIPLETS_GPU
+
     XPU_D void SortTripletsFunc(SortTriplets::context&) const;
-#endif
 
    private:
+
     XPU_D void CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const;
 
-    XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits,
-                                   int* collectedHits) const;  //TODO: merge with CollectHits
+    XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const;	//TODO: merge with CollectHits
 
 
     ///                             ------  FITTING FINCTIONALITY ------ //TODO: check and switch to the standard CaTrackFit if possible
     XPU_D void FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const;
 
     /// extrapolate track as a line, return the extrapolated X, Y and the Jacobians
-    XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F,
-                                     float* extrXY, float* Jx, float* Jy) const;
+    XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,
+	float *Jx, float *Jy) const;
 
     XPU_D void MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const;
 
     /// extrapolate the track to the given Z using the field F
     /// it does extrapolation in one step
-    XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
-                               const ca::GpuFieldRegion* Field) const;
+    XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0, const ca::GpuFieldRegion* Field) const;
 
     /// filter the track with the XY measurement placed at different Z
     /// \param m - measurement
@@ -212,17 +220,17 @@ namespace cbm::algo::ca
     /// \param extrY - extrapolated Y of the track
     /// \param Jx - Jacobian of the extrapolated X
     /// \param Jy - Jacobian of the extrapolated Y
-    XPU_D void FilterExtrapolatedXY(
-      kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
-      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
-      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const;
-      float* Jx, float* Jy) const;
+    XPU_D void FilterExtrapolatedXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,// fvec extrY,
+//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const;
+			      float* Jx,
+			      float* Jy) const;
 
     XPU_D void ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const;
 
     XPU_D void FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const;
 
-    XPU_D void Filter1d(kf::TrackParamBase<float>* tr_par, const ca::MeasurementU<float>& m) const;
+    XPU_D void Filter1d(kf::TrackParamBase<float>* tr_par, const kf::MeasurementU<float>& m) const;
 
     XPU_D void FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, float timeInfo) const;
 
@@ -230,20 +238,21 @@ namespace cbm::algo::ca
 
     ///                          ------  DATA MEMBERS ------
    public:
-    xpu::buffer<ca::GpuGrid> fvGpuGrid;  ///< Grid
+
+    xpu::buffer<ca::GpuGrid> fvGpuGrid;	///< Grid
 
     xpu::buffer<unsigned int> fgridFirstBinEntryIndex;  ///< Index of the first entry in the grid
 
     xpu::buffer<unsigned int> fgridEntries;  ///< Entries in the grid	//TODO: check if it should be ca::HitIndex_t
 
     ///Material map
-    xpu::buffer<ca::GpuMaterialMap> fMaterialMap;  ///< Material map base objects
+    xpu::buffer<ca::GpuMaterialMap> fMaterialMap;	///< Material map base objects
 
-    xpu::buffer<float> fMaterialMapTables;  ///< Material map tables
+    xpu::buffer<float> fMaterialMapTables;	///< Material map tables
 
     xpu::buffer<IterationData> fIterationData;  ///< Temporary data
 
-    //    xpu::buffer<unsigned char> fvbHitSuppressed;  ///< Hit suppression flags
+//    xpu::buffer<unsigned char> fvbHitSuppressed;  ///< Hit suppression flags
 
     xpu::buffer<hit_doublets> fHitDoublets;  ///< Hit doublets
 
@@ -259,26 +268,24 @@ namespace cbm::algo::ca
     /// hit.Id is replaced by the hit index in fInputData
     xpu::buffer<ca::Hit> fvHits;
 
-    xpu::buffer<kf::TrackParamBase<float>> fvTrackParams;          ///< Track parameters
+    xpu::buffer<kf::TrackParamBase<float>> fvTrackParams;  ///< Track parameters
     xpu::buffer<kf::TrackParamBase<float>> fvTrackParamsDoublets;  ///< Track parameters for doublets
-    //    xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets;  ///< Track parameters for triplets
+//    xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets;  ///< Track parameters for triplets
 
     xpu::buffer<ca::Triplet> fvTriplets;  ///< Triplets
 
-#ifdef SORT_TRIPLETS_GPU
     xpu::buffer<sort_triplets> fTripletSortHelper;  ///< Helper for sorting triplets
-    xpu::buffer<sort_triplets> fTripletSortHelperTmp;
-    xpu::buffer<sort_triplets*> dst;
-#endif
+//    xpu::buffer<sort_triplets> fTripletSortHelperTmp;
+//    xpu::buffer<sort_triplets*> dst;
 
-    //    GpuParameters fParams;
+//    GpuParameters fParams;
     xpu::buffer<GpuParameters> fParams;
 
     int fIteration;
 
     GpuParameters fParams_const[4];
 
-    ca::GpuStation fStations_const[kMaxNofStations];
+    ca::GpuStation fStations_const[constants::gpu::MaxNofStations];
   };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementU.cxx b/algo/ca/core/experimental/CaMeasurementU.cxx
deleted file mode 100644
index 05e72e07be..0000000000
--- a/algo/ca/core/experimental/CaMeasurementU.cxx
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Copyright (C) 2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Sergey Gorbunov, Sergei Zharko [committer] */
-
-/// \file CaMeasurementU.cxx
-/// \brief Implementation of the CaMeasurementU class
-
-#include "CaMeasurementU.h"
-
-#include <iomanip>
-#include <sstream>
-
-namespace cbm::algo::ca
-{
-
-  //----------------------------------------------------------------------------------------------------------------------
-  //
-  template<typename DataT>
-  std::string MeasurementU<DataT>::ToString(int indentLevel) const
-  {
-    std::stringstream aStream{};
-    // TODO: possibly it is better to place the indentChar into ca::Parameters (S.Zharko)
-    constexpr char indentChar = '\t';
-    std::string indent(indentLevel, indentChar);
-    aStream << indent << "cos(phi):    " << std::setw(12) << std::setfill(' ') << CosPhi() << '\n';
-    aStream << indent << "sin(phi):    " << std::setw(12) << std::setfill(' ') << SinPhi() << '\n';
-    aStream << indent << "u:           " << std::setw(12) << std::setfill(' ') << U() << '\n';
-    aStream << indent << "du2:         " << std::setw(12) << std::setfill(' ') << Du2() << '\n';
-    aStream << indent << "ndf:         " << std::setw(12) << std::setfill(' ') << Ndf() << '\n';
-    return aStream.str();
-  }
-
-  ///----------------------------------------------------------------------------------------------------------------------
-  /// All necessary instantiations of the template class
-
-  template class MeasurementU<fvec>;
-  template class MeasurementU<float>;
-  template class MeasurementU<double>;
-
-}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementU.h b/algo/ca/core/experimental/CaMeasurementU.h
deleted file mode 100644
index 3fb46f6a98..0000000000
--- a/algo/ca/core/experimental/CaMeasurementU.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/* Copyright (C) 2007-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Sergey Gorbunov [committer], Igor Kulakov, Sergei Zharko */
-
-/// \file CaMeasurementU
-/// \brief Definition of the CaMeasurementU class
-
-#pragma once  // include this header only once per compilation unit
-
-
-#include "CaDefs.h"
-#include "CaSimd.h"
-#include "KfUtils.h"
-
-#include <boost/serialization/access.hpp>
-
-#include <string>
-
-#include <xpu/device.h>
-
-namespace cbm::algo::ca
-{
-  /// \brief The class describes a 1D - measurement U in XY coordinate system
-  ///
-  /// The measurement is defined as
-  ///
-  /// u = x * cos(phi) + y * sin(phi)
-  ///
-  /// where phi is the azimuthal angle of the measurement direction
-  /// and x, y are the coordinates of the measurement
-  /// The measurement may be scaled, i.e. cos^2 + sin^2 must not necessarily be equal to 1
-  /// The measurement has a finite resolution, i.e. the measurement is not a point, but a distribution
-  /// with a certain rms.
-  /// The measurement may be used in the chi2 calculation or not
-  /// The measurement may be a SIMD vector of values, when DataT is fvec type
-  ///
-  template<typename DataT>
-  class MeasurementU {
-
-   public:
-    friend class boost::serialization::access;
-
-    /// default constructor
-    XPU_D MeasurementU() = default;
-
-    /// constructor
-    /// \param cosPhi   cos(phi)
-    /// \param sinPhi   sin(phi)
-    /// \param u        measurement, u = x * cos(phi) + y * sin(phi)
-    /// \param du2      rms^2 of the measurement
-    /// \param ndf      number of degrees of freedom (used for chi2 calculation)
-    ///                 if ndf == 1, the measurement is used in the chi2 calculation
-    ///                 if ndf == 0, the measurement is not used in the chi2 calculation
-    MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf)
-      : fCosPhi(cosPhi)
-      , fSinPhi(sinPhi)
-      , fU(u)
-      , fDu2(du2)
-      , fNdf(ndf)
-    {
-    }
-
-    ///------------------------------
-    /// Setters and getters
-
-    XPU_D void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; }
-    XPU_D void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; }
-    XPU_D void SetU(DataT u) { fU = u; }
-    XPU_D void SetDu2(DataT du2) { fDu2 = du2; }
-    XPU_D void SetNdf(DataT ndf) { fNdf = ndf; }
-
-    XPU_D DataT CosPhi() const { return fCosPhi; }
-    XPU_D DataT SinPhi() const { return fSinPhi; }
-    XPU_D DataT U() const { return fU; }
-    XPU_D DataT Du2() const { return fDu2; }
-    XPU_D DataT Ndf() const { return fNdf; }
-
-    ///------------------------------
-    /// Methods for debugging
-
-    /// String representation of class contents
-    /// \param indentLevel      number of indent characters in the output
-    std::string ToString(int indentLevel = 0) const;
-
-    /// Checks, if all fields are finite
-    bool IsFinite() const
-    {
-      return (kf::utils::IsFinite(CosPhi()) && kf::utils::IsFinite(SinPhi()) && kf::utils::IsFinite(U())
-              && kf::utils::IsFinite(Du2()) && kf::utils::IsFinite(Ndf()));
-    }
-
-    /// Checks, if some fields are undefined
-    bool IsUndefined() const
-    {
-      return (kf::utils::IsUndefined(CosPhi()) || kf::utils::IsUndefined(SinPhi()) || kf::utils::IsUndefined(U())
-              || kf::utils::IsUndefined(Du2()) || kf::utils::IsUndefined(Ndf()));
-    }
-
-    /// Serialization function
-    friend class boost::serialization::access;
-    template<class Archive>
-    void serialize(Archive& ar, const unsigned int)
-    {
-      ar& fCosPhi;
-      ar& fSinPhi;
-      ar& fU;
-      ar& fDu2;
-      ar& fNdf;
-    }
-
-   private:
-    ///------------------------------
-    /// Data members
-
-    DataT fCosPhi = constants::Undef<DataT>;  ///< cos(phi)
-    DataT fSinPhi = constants::Undef<DataT>;  ///< sin(phi)
-    DataT fU      = constants::Undef<DataT>;  ///< measurement, u = x * cos(phi) + y * sin(phi)
-    DataT fDu2    = constants::Undef<DataT>;  ///< rms^2 of the measurement
-
-    /// \brief number of degrees of freedom (used for chi2 calculation)
-    /// if ndf == 1, the measurement is used in the chi2 calculation
-    /// if ndf == 0, the measurement is not used in the chi2 calculation
-    DataT fNdf = constants::Undef<DataT>;
-  } _fvecalignment;
-
-}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementXy.cxx b/algo/ca/core/experimental/CaMeasurementXy.cxx
deleted file mode 100644
index ebd861fd38..0000000000
--- a/algo/ca/core/experimental/CaMeasurementXy.cxx
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright (C) 2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Sergey Gorbunov, Sergei Zharko [committer] */
-
-/// \file CaMeasurementXy.cxx
-/// \brief Implementation of the CaMeasurementXy class
-
-#include "CaMeasurementXy.h"
-
-#include <iomanip>
-#include <sstream>  // for stringstream
-
-namespace cbm::algo::ca
-{
-
-  //----------------------------------------------------------------------------------------------------------------------
-  //
-  template<typename DataT>
-  std::string MeasurementXy<DataT>::ToString(int indentLevel) const
-  {
-    std::stringstream aStream{};
-    // TODO: possibly it is better to place the indentChar into ca::Parameters (S.Zharko)
-    constexpr char indentChar = '\t';
-    std::string indent(indentLevel, indentChar);
-    aStream << indent << "x:    " << std::setw(12) << std::setfill(' ') << X() << '\n';
-    aStream << indent << "y:    " << std::setw(12) << std::setfill(' ') << Y() << '\n';
-    aStream << indent << "dx2:  " << std::setw(12) << std::setfill(' ') << Dx2() << '\n';
-    aStream << indent << "dy2:  " << std::setw(12) << std::setfill(' ') << Dy2() << '\n';
-    aStream << indent << "dxy:  " << std::setw(12) << std::setfill(' ') << Dxy() << '\n';
-    aStream << indent << "ndfX: " << std::setw(12) << std::setfill(' ') << NdfX() << '\n';
-    aStream << indent << "ndfY: " << std::setw(12) << std::setfill(' ') << NdfY() << '\n';
-    return aStream.str();
-  }
-
-  ///----------------------------------------------------------------------------------------------------------------------
-  /// All necessary instantiations of the template class
-
-  template class MeasurementXy<fvec>;
-  template class MeasurementXy<float>;
-  template class MeasurementXy<double>;
-
-}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementXy.h b/algo/ca/core/experimental/CaMeasurementXy.h
index 7841ec5f2c..fa95a074b7 100644
--- a/algo/ca/core/experimental/CaMeasurementXy.h
+++ b/algo/ca/core/experimental/CaMeasurementXy.h
@@ -4,6 +4,8 @@
 
 /// \file CaMeasurementXy.h
 /// \brief Definition of the CaMeasurementXy class
+/// Temporary class for compatibility with GPU code
+/// Should be replaced by KfMeasurementXy
 
 #pragma once  // include this header only once per compilation unit
 
@@ -33,7 +35,7 @@ namespace cbm::algo::ca
     friend class boost::serialization::access;
 
     /// default constructor
-    MeasurementXy() = default;
+    XPU_D MeasurementXy() = default;
 
     /// constructor
     /// \param x        x coordinate of the measurement
@@ -91,14 +93,14 @@ namespace cbm::algo::ca
     ///------------------------------
     /// Setters and getters
 
-    void SetX(DataT x) { fX = x; }
-    void SetY(DataT y) { fY = y; }
-    void SetDx2(DataT dx2) { fDx2 = dx2; }
-    void SetDy2(DataT dy2) { fDy2 = dy2; }
-    void SetDxy(DataT dxy) { fDxy = dxy; }
-    void SetNdfX(DataT ndfX) { fNdfX = ndfX; }
-    void SetNdfY(DataT ndfY) { fNdfY = ndfY; }
-    void SetCov(DataT dx2, DataT dxy, DataT dy2)
+    XPU_D void SetX(DataT x) { fX = x; }
+    XPU_D void SetY(DataT y) { fY = y; }
+    XPU_D void SetDx2(DataT dx2) { fDx2 = dx2; }
+    XPU_D void SetDy2(DataT dy2) { fDy2 = dy2; }
+    XPU_D void SetDxy(DataT dxy) { fDxy = dxy; }
+    XPU_D void SetNdfX(DataT ndfX) { fNdfX = ndfX; }
+    XPU_D void SetNdfY(DataT ndfY) { fNdfY = ndfY; }
+    XPU_D void SetCov(DataT dx2, DataT dxy, DataT dy2)
     {
       fDx2 = dx2;
       fDxy = dxy;
@@ -116,27 +118,22 @@ namespace cbm::algo::ca
     ///------------------------------
     /// references, to ease assignment to SIMD vector components when DataT has fvec type
 
-    DataT& X() { return fX; }
-    DataT& Y() { return fY; }
-    DataT& Dx2() { return fDx2; }
-    DataT& Dy2() { return fDy2; }
-    DataT& Dxy() { return fDxy; }
-    DataT& NdfX() { return fNdfX; }
-    DataT& NdfY() { return fNdfY; }
+    XPU_D DataT& X() { return fX; }
+    XPU_D DataT& Y() { return fY; }
+    XPU_D DataT& Dx2() { return fDx2; }
+    XPU_D DataT& Dy2() { return fDy2; }
+    XPU_D DataT& Dxy() { return fDxy; }
+    XPU_D DataT& NdfX() { return fNdfX; }
+    XPU_D DataT& NdfY() { return fNdfY; }
 
     ///------------------------------
     /// Methods for debugging
 
-    /// String representation of class contents
-    /// \param indentLevel      number of indent characters in the output
-    std::string ToString(int indentLevel = 0) const;
-
     /// Checks, if all fields are finite
     bool IsFinite() const
     {
-      return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2())
-              && kf::utils::IsFinite(Dy2()) && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX())
-              && kf::utils::IsFinite(NdfY()));
+      return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2()) && kf::utils::IsFinite(Dy2())
+              && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX()) && kf::utils::IsFinite(NdfY()));
     }
 
     /// Checks, if some fields are undefined
@@ -167,20 +164,20 @@ namespace cbm::algo::ca
     // Therefore, we initialize the data members here with finite numbers.
     // For the numerical safety, with some reasonable numbers.
 
-    DataT fX;    //{0.};    ///< x coordinate of the measurement
-    DataT fY;    //{0.};    ///< y coordinate of the measurement
-    DataT fDx2;  //{1.};  ///< rms^2 of the x coordinate measurement
-    DataT fDy2;  //{1.};  ///< rms^2 of the y coordinate measurement
-    DataT fDxy;  //{0.};  ///< covariance of the x and y coordinate measurements
+    DataT fX;    ///< x coordinate of the measurement
+    DataT fY;    ///< y coordinate of the measurement
+    DataT fDx2;  ///< rms^2 of the x coordinate measurement
+    DataT fDy2;  ///< rms^2 of the y coordinate measurement
+    DataT fDxy;  ///< covariance of the x and y coordinate measurements
 
     /// number of degrees of freedom (used for chi2 calculation)
     /// if ndf == 1, the measurement is used in the chi2 calculation
     /// if ndf == 0, the measurement is not used in the chi2 calculation
 
-    DataT fNdfX;  // = 0.;  ///< ndf for the x coordinate measurement
-    DataT fNdfY;  // = 0.;  ///< ndf for the y coordinate measurement
+    DataT fNdfX;  ///< ndf for the x coordinate measurement
+    DataT fNdfY;  ///< ndf for the y coordinate measurement
 
-  } _fvecalignment;
+  };
 
 
   // ---------------------------------------------------------------------------------------------------------------------
diff --git a/algo/ca/core/pars/CaDefs.h b/algo/ca/core/pars/CaDefs.h
index 8b4a2280d5..b30f7a10a8 100644
--- a/algo/ca/core/pars/CaDefs.h
+++ b/algo/ca/core/pars/CaDefs.h
@@ -105,6 +105,18 @@ namespace cbm::algo::ca::constants
 
   }  // namespace misc
 
+  /// GPU constants
+  namespace gpu
+  {
+    constexpr int MaxDoubletsFromHit = 150;	///< Maximum number of doublets from a hit
+    constexpr int MaxTripletsFromDoublet = 15;	///< Maximum number of triplets from a doublet
+    constexpr int MaxNofStations = 20;	///< Maximum number of stations //TODO: temporary solution
+    constexpr bool GpuTracking = false;	///< Flag: use GPU for tracking
+    constexpr bool GpuTimeMonitoring = false;	///< Flag: use GPU for time monitoring
+    constexpr bool GpuSortTriplets = false;	///< Flag: use GPU for sorting triplets
+    constexpr bool CpuSortTriplets = true;	///< Flag: use CPU for sorting triplets
+  }  // namespace gpu
+
   /// \brief Undefined values
   template<typename T1, typename T2 = T1>
   constexpr T2 Undef;
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.cxx b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
index 4f0696514e..41b9b6c4a9 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.cxx
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
@@ -127,26 +127,29 @@ namespace cbm::algo::ca
     PrepareGrid(input.GetHits(), wData);
     frMonitorData.StopTimer(ETimer::PrepareGrid);
 
-#ifdef GPU_TRACKING
-    // XPU initialization
-    // - temporary solution for tests only
-    // - should not be initialized here
-    setenv("XPU_PROFILE", "1", 1);
-    xpu::settings settings;
-    //    settings.device = "cpu0";
-    settings.device = "hip0";
-    //    settings.verbose = true;
-    xpu::initialize(settings);
-
-    // Set up environment for GPU tracking
-    xpu::push_timer("gpuTFinit");
-    ca::GpuTrackFinderSetup gpuTrackFinderSetup(wData, fParameters);
-    xpu::timings gpuTFinit = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: Initialization: " << gpuTFinit.wall() << " ms";
-
-    SetupGpuTrackFinder(gpuTrackFinderSetup);
+    std::optional<ca::GpuTrackFinderSetup> gpuTrackFinderSetup;
     size_t iter_num = 0;
-#endif
+    if constexpr (constants::gpu::GpuTracking) {
+      // XPU initialization
+      // - temporary solution for tests only
+      // - should not be initialized here
+      setenv("XPU_PROFILE", "1", 1);
+      xpu::settings settings;
+//      settings.device = "cpu0";
+      settings.device = "hip0";
+//      settings.verbose = true;
+      xpu::initialize(settings);
+
+      // Set up environment for GPU tracking
+      xpu::push_timer("gpuTFinit");
+      gpuTrackFinderSetup.emplace(wData, fParameters);
+      xpu::timings gpuTFinit = xpu::pop_timer();
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        LOG(info) << "GPU tracking :: Initialization: " << gpuTFinit.wall() << " ms";
+      }
+
+      SetupGpuTrackFinder(gpuTrackFinderSetup.value());
+    }
 
     // Run CA iterations
     frMonitorData.StartTimer(ETimer::FindTracks);
@@ -157,18 +160,18 @@ namespace cbm::algo::ca
       PrepareCAIteration(*iter, wData, iter == caIterations.begin());
       frMonitorData.StopTimer(ETimer::PrepareIteration);
 
-#ifdef GPU_TRACKING
-      // ----- Triplets construction on GPU -----
-      frMonitorData.StartTimer(ETimer::ConstructTriplets);
-      ConstructTripletsGPU(wData, gpuTrackFinderSetup, iter_num);
-      iter_num++;
-      frMonitorData.StopTimer(ETimer::ConstructTriplets);
-#else
-      // ----- Triplets construction -----
-      frMonitorData.StartTimer(ETimer::ConstructTriplets);
-      ConstructTriplets(wData);
-      frMonitorData.StopTimer(ETimer::ConstructTriplets);
-#endif
+      if constexpr (constants::gpu::GpuTracking) {
+        // ----- Triplets construction on GPU -----
+        frMonitorData.StartTimer(ETimer::ConstructTriplets);
+        ConstructTripletsGPU(wData, gpuTrackFinderSetup.value(), iter_num);
+        iter_num++;
+        frMonitorData.StopTimer(ETimer::ConstructTriplets);
+      } else {
+        // ----- Triplets construction -----
+        frMonitorData.StartTimer(ETimer::ConstructTriplets);
+        ConstructTriplets(wData);
+        frMonitorData.StopTimer(ETimer::ConstructTriplets);
+      }
 
       // ----- Search for neighbouring triplets -----
       frMonitorData.StartTimer(ETimer::SearchNeighbours);
@@ -787,59 +790,66 @@ namespace cbm::algo::ca
     }      // level = 0
   }
 
-#ifdef GPU_TRACKING
   void TrackFinderWindow::SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup)
   {
     xpu::push_timer("SetupParametersTime");
     gpuTrackFinderSetup.SetupParameters();
     xpu::timings SetupParametersTime = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: SetupParameters: " << SetupParametersTime.wall() << " ms";
 
     xpu::push_timer("SetInputDataTime");
     gpuTrackFinderSetup.SetInputData();
     xpu::timings SetInputDataTime = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: SetInputData: " << SetInputDataTime.wall() << " ms";
 
     xpu::push_timer("SetupMaterialMapTime");
     gpuTrackFinderSetup.SetupMaterialMap();
     xpu::timings SetupMaterialMapTime = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: SetupMaterialMap: " << SetupMaterialMapTime.wall() << " ms";
+
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      LOG(info) << "GPU tracking :: SetupParameters: " << SetupParametersTime.wall() << " ms";
+      LOG(info) << "GPU tracking :: SetInputData: " << SetInputDataTime.wall() << " ms";
+      LOG(info) << "GPU tracking :: SetupMaterialMap: " << SetupMaterialMapTime.wall() << " ms";
+    }
   }
 
-  void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup,
-                                               int iteration)
+  void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, int iteration)
   {
     xpu::push_timer("SetupGridTime");
     gpuTrackFinderSetup.SetupGrid();
     xpu::timings SetupGridTime = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: SetupGrid: " << SetupGridTime.wall() << " ms";
 
     xpu::push_timer("SetupIterationDataTime");
     gpuTrackFinderSetup.SetupIterationData(iteration);
     xpu::timings SetupIterationDataTime = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: SetupIterationData: " << SetupIterationDataTime.wall() << " ms";
 
     xpu::push_timer("RunGpuTrackingSetup");
     gpuTrackFinderSetup.RunGpuTrackingSetup();
     xpu::timings RunGpuTrackingSetup = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: RunGpuTracking: " << RunGpuTrackingSetup.wall() << " ms";
-
-#define SORT_TRIPLETS_CPU
-#ifdef SORT_TRIPLETS_CPU
-    xpu::push_timer("SortTriplets");
-    std::vector<std::array<unsigned int, 2>> triplet_sort;
-    triplet_sort.reserve(gpuTrackFinderSetup.GetNofTriplets());
-
-    for (unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
-      if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0.) continue;
-      const auto& triplet = gpuTrackFinderSetup.GetTriplet(itr);
-      triplet_sort.push_back({triplet.GetLHit(), itr});
+
+    std::optional<std::vector<std::array<unsigned int, 2>>> triplet_sort;
+    if constexpr (constants::gpu::CpuSortTriplets) {
+      xpu::push_timer("SortTriplets");
+      triplet_sort.emplace();
+      triplet_sort.value().reserve(gpuTrackFinderSetup.GetNofTriplets());
+
+      for(unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
+        if( gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0. ) continue;
+        const auto& triplet = gpuTrackFinderSetup.GetTriplet(itr);
+        triplet_sort.value().push_back({triplet.GetLHit(), itr});
+      }
+      std::sort(triplet_sort.value().begin(), triplet_sort.value().end(),
+        [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) {
+            return (a[0] < b[0]);
+        }
+      );
+      xpu::timings SortTriplets = xpu::pop_timer();
+
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        LOG(info) << "GPU tracking :: SetupGrid: " << SetupGridTime.wall() << " ms";
+        LOG(info) << "GPU tracking :: SetupIterationData: " << SetupIterationDataTime.wall() << " ms";
+        LOG(info) << "GPU tracking :: RunGpuTracking: " << RunGpuTrackingSetup.wall() << " ms";
+        LOG(info) << "GPU tracking :: SortTripletsCPU: " << SortTriplets.wall() << " ms";
+      }
     }
-    std::sort(triplet_sort.begin(), triplet_sort.end(),
-              [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) { return (a[0] < b[0]); });
-    xpu::timings SortTriplets = xpu::pop_timer();
-    LOG(info) << "GPU tracking :: SortTripletsCPU: " << SortTriplets.wall() << " ms";
-#endif
 
     int ihitl_0 = -1;
 
@@ -854,33 +864,36 @@ namespace cbm::algo::ca
       vTriplets[j].reserve(2 * nHitsStation);
     }
 
-#ifdef SORT_TRIPLETS_CPU
-    for (size_t itr = 0; itr < triplet_sort.size(); itr++) {
-      int iSta  = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort[itr])).GetLSta();
-      int ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort[itr])).GetLHit();
-#else
-    for (int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
-      int iSta  = gpuTrackFinderSetup.GetTriplet(itr).GetLSta();
-      int ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit();
-#endif
-      const size_t oldSize = vTriplets[iSta].size();
-#ifdef SORT_TRIPLETS_CPU
-      vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort[itr])));
-      if (ihitl == ihitl_0) {
-        vHitNofTriplets[ihitl]++;
+    int nTriplets = gpuTrackFinderSetup.GetNofTriplets();
+    if constexpr (constants::gpu::CpuSortTriplets) {
+      nTriplets = triplet_sort.value().size();
+    }
+    for( size_t itr = 0; itr < nTriplets; itr++ ) {
+      int iSta, ihitl;
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        iSta = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLSta ();
+        ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLHit ();
+      } else {
+        iSta = gpuTrackFinderSetup.GetTriplet(itr).GetLSta ();
+        ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit ();
       }
-      else {
+      const size_t oldSize = vTriplets[iSta].size();
+
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])));
+        if( ihitl == ihitl_0 ) {
+          vHitNofTriplets[ihitl]++;
+        } else {
+          vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
+          vHitNofTriplets[ihitl] = vTriplets[iSta].size() - oldSize;
+        }
+        ihitl_0 = ihitl;
+      } else {
+        if( gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0. )
+          vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(itr));
         vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
-        vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
+        vHitNofTriplets[ihitl] = vTriplets[iSta].size() - oldSize;
       }
-      ihitl_0 = ihitl;
-#else
-      if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0.)
-        vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(itr));
-      vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
-      vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
-#endif
     }
   }
-#endif
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.h b/algo/ca/core/tracking/CaTrackFinderWindow.h
index fdae1253ef..2c56150838 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.h
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.h
@@ -19,11 +19,7 @@
 #include "CaTrackingMonitor.h"
 #include "CaVector.h"
 #include "CaWindowData.h"
-
-//#define GPU_TRACKING
-#ifdef GPU_TRACKING
 #include "CaGpuTrackFinderSetup.h"
-#endif
 
 namespace cbm::algo::ca
 {
@@ -95,11 +91,9 @@ namespace cbm::algo::ca
 
     bool checkTripletMatch(const ca::Triplet& l, const ca::Triplet& r, fscal& dchi2, WindowData& wData) const;
 
-#ifdef GPU_TRACKING
     void SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup);
 
     void ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, int iteration);
-#endif
 
     // ** Functions, which pack and unpack indexes of station and triplet **
 
diff --git a/algo/kf/core/data/KfMeasurementU.h b/algo/kf/core/data/KfMeasurementU.h
index 711f237ec2..e4e6a08dcf 100644
--- a/algo/kf/core/data/KfMeasurementU.h
+++ b/algo/kf/core/data/KfMeasurementU.h
@@ -15,6 +15,8 @@
 
 #include <string>
 
+#include <xpu/device.h>
+
 namespace cbm::algo::kf
 {
   /// \brief The class describes a 1D - measurement U in XY coordinate system
@@ -38,7 +40,7 @@ namespace cbm::algo::kf
     friend class boost::serialization::access;
 
     /// default constructor
-    MeasurementU() = default;
+    XPU_D MeasurementU() = default;
 
     /// constructor
     /// \param cosPhi   cos(phi)
@@ -48,7 +50,7 @@ namespace cbm::algo::kf
     /// \param ndf      number of degrees of freedom (used for chi2 calculation)
     ///                 if ndf == 1, the measurement is used in the chi2 calculation
     ///                 if ndf == 0, the measurement is not used in the chi2 calculation
-    MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf)
+    XPU_D MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf)
       : fCosPhi(cosPhi)
       , fSinPhi(sinPhi)
       , fU(u)
@@ -60,17 +62,17 @@ namespace cbm::algo::kf
     ///------------------------------
     /// Setters and getters
 
-    void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; }
-    void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; }
-    void SetU(DataT u) { fU = u; }
-    void SetDu2(DataT du2) { fDu2 = du2; }
-    void SetNdf(DataT ndf) { fNdf = ndf; }
-
-    DataT CosPhi() const { return fCosPhi; }
-    DataT SinPhi() const { return fSinPhi; }
-    DataT U() const { return fU; }
-    DataT Du2() const { return fDu2; }
-    DataT Ndf() const { return fNdf; }
+    XPU_D void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; }
+    XPU_D void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; }
+    XPU_D void SetU(DataT u) { fU = u; }
+    XPU_D void SetDu2(DataT du2) { fDu2 = du2; }
+    XPU_D void SetNdf(DataT ndf) { fNdf = ndf; }
+
+    XPU_D DataT CosPhi() const { return fCosPhi; }
+    XPU_D DataT SinPhi() const { return fSinPhi; }
+    XPU_D DataT U() const { return fU; }
+    XPU_D DataT Du2() const { return fDu2; }
+    XPU_D DataT Ndf() const { return fNdf; }
 
     ///------------------------------
     /// Methods for debugging
-- 
GitLab


From 1bc472aad324bffdf33d07f267911bc863557981 Mon Sep 17 00:00:00 2001
From: "g.kozlov" <grxxl@yandex.ru>
Date: Fri, 7 Mar 2025 15:37:01 +0100
Subject: [PATCH 3/6] clang format

---
 algo/ca/core/experimental/CaGpuField.h        | 319 ++++-----
 algo/ca/core/experimental/CaGpuGrid.h         |   9 +-
 algo/ca/core/experimental/CaGpuGridArea.h     |   9 +-
 algo/ca/core/experimental/CaGpuMaterialMap.h  |  10 +-
 algo/ca/core/experimental/CaGpuParameters.h   |  32 +-
 algo/ca/core/experimental/CaGpuStation.h      |  37 +-
 algo/ca/core/experimental/CaGpuTimeMonitor.h  |  33 +-
 .../experimental/CaGpuTrackFinderSetup.cxx    | 180 +++---
 .../core/experimental/CaGpuTrackFinderSetup.h |  30 +-
 .../experimental/CaGpuTripletConstructor.cxx  | 604 ++++++++++--------
 .../experimental/CaGpuTripletConstructor.h    | 169 +++--
 algo/ca/core/experimental/CaMeasurementXy.h   |   6 +-
 algo/ca/core/pars/CaDefs.h                    |  16 +-
 algo/ca/core/tracking/CaTrackFinderWindow.cxx |  49 +-
 algo/ca/core/tracking/CaTrackFinderWindow.h   |   2 +-
 15 files changed, 772 insertions(+), 733 deletions(-)

diff --git a/algo/ca/core/experimental/CaGpuField.h b/algo/ca/core/experimental/CaGpuField.h
index 8346640d41..8fd42b4746 100644
--- a/algo/ca/core/experimental/CaGpuField.h
+++ b/algo/ca/core/experimental/CaGpuField.h
@@ -10,52 +10,52 @@
 
 #pragma once  // include this header only once per compilation unit
 
-#include "KfTrackParam.h"
-#include "KfField.h"
 #include "CaUtils.h"
+#include "KfField.h"
+#include "KfTrackParam.h"
 
 #include <xpu/device.h>
 
 namespace cbm::algo::ca
 {
   class GpuFieldValue {
-  public:
-   float x;  //< x-component of the field
-   float y;  //< y-component of the field
-   float z;  //< z-component of the field
-
-   GpuFieldValue() = default;
-
-   /// \brief Copy constructor with type conversion
-   template<typename DataIn>
-   GpuFieldValue(const kf::FieldValue<DataIn>& other)
-     : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx()))
-     , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy()))
-     , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz()))
-   {
-   }
-
-   /// Combines the magnetic field with another field value object using weight
-   /// \param B  other field value to combine with
-   /// \param w  weight from 0 to 1
-
-   XPU_D void Combine(const GpuFieldValue& B, const bool w)
-   {
-     if (w) {
-       x = B.x;
-       y = B.y;
-       z = B.z;
-     }
-   }
-
-   /// Is the field value zero?
-   XPU_D bool IsZeroField() const
-   {
-//     constexpr auto e = constants::misc::NegligibleFieldkG;
-//     float e = 1.e-4;
-     return (x * x + y * y + z * z <= 1.e-8);
-   }
- };
+   public:
+    float x;  //< x-component of the field
+    float y;  //< y-component of the field
+    float z;  //< z-component of the field
+
+    GpuFieldValue() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldValue(const kf::FieldValue<DataIn>& other)
+      : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx()))
+      , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy()))
+      , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz()))
+    {
+    }
+
+    /// Combines the magnetic field with another field value object using weight
+    /// \param B  other field value to combine with
+    /// \param w  weight from 0 to 1
+
+    XPU_D void Combine(const GpuFieldValue& B, const bool w)
+    {
+      if (w) {
+        x = B.x;
+        y = B.y;
+        z = B.z;
+      }
+    }
+
+    /// Is the field value zero?
+    XPU_D bool IsZeroField() const
+    {
+      //     constexpr auto e = constants::misc::NegligibleFieldkG;
+      //     float e = 1.e-4;
+      return (x * x + y * y + z * z <= 1.e-8);
+    }
+  };
 
   class GpuFieldSlice {
    public:
@@ -106,11 +106,11 @@ namespace cbm::algo::ca
     // NOTE: We don't use an initialization of arrays here because we cannot be sure
     //       if the underlying type (fvec) has a default constructor, but
     //       we are sure, that it can be initialized with a float. (S.Zharko)
-    static constexpr auto kMaxNFieldApproxCoefficients = 21;//constants::size::MaxNFieldApproxCoefficients;
+    static constexpr auto kMaxNFieldApproxCoefficients = 21;  //constants::size::MaxNFieldApproxCoefficients;
 
-    CoeffArray_t cx;//fBx;  ///< Approximation coefficients for the x-component of the field
-    CoeffArray_t cy;//fBy;  ///< Approximation coefficients for the y-component of the field
-    CoeffArray_t cz;//fBz;  ///< Approximation coefficients for the z-component of the field
+    CoeffArray_t cx;  //fBx;  ///< Approximation coefficients for the x-component of the field
+    CoeffArray_t cy;  //fBy;  ///< Approximation coefficients for the y-component of the field
+    CoeffArray_t cz;  //fBz;  ///< Approximation coefficients for the z-component of the field
 
     float z;  ///< z coordinate of the slice
   };
@@ -162,127 +162,130 @@ namespace cbm::algo::ca
   }
 
   class GpuFieldRegion {
-  public:
-   GpuFieldRegion() = default;
-
-   /// \brief Copy constructor with type conversion
-   template<typename DataIn>
-   GpuFieldRegion(const kf::FieldRegion<DataIn>& other)
-     : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0))
-     , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1))
-     , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2))
-     , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0))
-     , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1))
-     , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2))
-     , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0))
-     , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1))
-     , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2))
-     , z0(kf::utils::simd::Cast<DataIn, float>(other.z0))
-//     , fUseOriginalField(other.fUseOriginalField)
-     , fIsZeroField(other.fIsZeroField)
-   {
-   }
-
-   /// Gets the field vector and writes it into B pointer
-   /// \param x  x-coordinate of the point to calculate the field
-   /// \param y  y-coordinate of the point to calculate the field
-   /// \param z  z-coordinate of the point to calculate the field
-   /// \return    the magnetic field value
-   XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const
-   {
-     GpuFieldValue B;
-     float dz  = z - z0;
-     float dz2 = dz * dz;
-     B.x       = cx0 + cx1 * dz + cx2 * dz2;
-     B.y       = cy0 + cy1 * dz + cy2 * dz2;
-     B.z       = cz0 + cz1 * dz + cz2 * dz2;
-     return B;
-   }
-
-   /// Is the field region empty?
-   XPU_D bool IsZeroField() const { return fIsZeroField; }
-
-   /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object
-   /// The result is a quadratic interpolation of the field as a function of z
-   /// \param b0   field value in the first nodal point
-   /// \param b0z  z-coordinate of the first nodal point
-   /// \param b1   field value in the second nodal point
-   /// \param b1z  z-coordinate of the second nodal point
-   /// \param b2   field value in the third nodal point
-   /// \param b2z  z-coordinate of the third nodal point
-   /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko)
-   XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2, float b2z);
-
-   /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object.
-   /// The result is a linear interpolation of the field as a function of z
-   /// \param b0   field value in the first nodal point
-   /// \param b0z  z-coordinate of the first nodal point
-   /// \param b1   field value in the second nodal point
-   /// \param b1z  z-coordinate of the second nodal point
-   /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko)
-   XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z) {
-     fIsZeroField = (b0.IsZeroField() && b1.IsZeroField());
-     z0        = b0z;
-     float dzi = 1. / (b1z - b0z);	//TODO
-     cx0       = b0.x;
-     cy0       = b0.y;
-     cz0       = b0.z;
-     cx1       = (b1.x - b0.x) * dzi;
-     cy1       = (b1.y - b0.y) * dzi;
-     cz1       = (b1.z - b0.z) * dzi;
-     cx2       = 0.f;
-     cy2       = 0.f;
-     cz2       = 0.f;
-   }
-
-   /// Shifts the coefficients to new central point
-   /// \param z  z-coordinate of the new central point
-   XPU_D void Shift(float z) {
-     float dz    = z - z0;
-     float cx2dz = cx2 * dz;
-     float cy2dz = cy2 * dz;
-     float cz2dz = cz2 * dz;
-     z0          = z;
-     cx0 += (cx1 + cx2dz) * dz;
-     cy0 += (cy1 + cy2dz) * dz;
-     cz0 += (cz1 + cz2dz) * dz;
-     cx1 += cx2dz + cx2dz;
-     cy1 += cy2dz + cy2dz;
-     cz1 += cz2dz + cz2dz;
-   }
-
-
-  public:
-
-   // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko)
-   // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2
-   float cx0;
-   float cx1;
-   float cx2;
-
-   // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2
-   float cy0;
-   float cy1;
-   float cy2;
-
-   // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2
-   float cz0;
-   float cz1;
-   float cz2;
-
-   float z0;  ///< z-coordinate of the field region central point
-
-   bool fIsZeroField;//{false};  ///< Is the field region empty?
- };
-
-  XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2, float b2z)
+   public:
+    GpuFieldRegion() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldRegion(const kf::FieldRegion<DataIn>& other)
+      : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0))
+      , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1))
+      , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2))
+      , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0))
+      , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1))
+      , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2))
+      , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0))
+      , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1))
+      , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2))
+      , z0(kf::utils::simd::Cast<DataIn, float>(other.z0))
+      //     , fUseOriginalField(other.fUseOriginalField)
+      , fIsZeroField(other.fIsZeroField)
+    {
+    }
+
+    /// Gets the field vector and writes it into B pointer
+    /// \param x  x-coordinate of the point to calculate the field
+    /// \param y  y-coordinate of the point to calculate the field
+    /// \param z  z-coordinate of the point to calculate the field
+    /// \return    the magnetic field value
+    XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const
+    {
+      GpuFieldValue B;
+      float dz  = z - z0;
+      float dz2 = dz * dz;
+      B.x       = cx0 + cx1 * dz + cx2 * dz2;
+      B.y       = cy0 + cy1 * dz + cy2 * dz2;
+      B.z       = cz0 + cz1 * dz + cz2 * dz2;
+      return B;
+    }
+
+    /// Is the field region empty?
+    XPU_D bool IsZeroField() const { return fIsZeroField; }
+
+    /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object
+    /// The result is a quadratic interpolation of the field as a function of z
+    /// \param b0   field value in the first nodal point
+    /// \param b0z  z-coordinate of the first nodal point
+    /// \param b1   field value in the second nodal point
+    /// \param b1z  z-coordinate of the second nodal point
+    /// \param b2   field value in the third nodal point
+    /// \param b2z  z-coordinate of the third nodal point
+    /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko)
+    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2,
+                   float b2z);
+
+    /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object.
+    /// The result is a linear interpolation of the field as a function of z
+    /// \param b0   field value in the first nodal point
+    /// \param b0z  z-coordinate of the first nodal point
+    /// \param b1   field value in the second nodal point
+    /// \param b1z  z-coordinate of the second nodal point
+    /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko)
+    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z)
+    {
+      fIsZeroField = (b0.IsZeroField() && b1.IsZeroField());
+      z0           = b0z;
+      float dzi    = 1. / (b1z - b0z);  //TODO
+      cx0          = b0.x;
+      cy0          = b0.y;
+      cz0          = b0.z;
+      cx1          = (b1.x - b0.x) * dzi;
+      cy1          = (b1.y - b0.y) * dzi;
+      cz1          = (b1.z - b0.z) * dzi;
+      cx2          = 0.f;
+      cy2          = 0.f;
+      cz2          = 0.f;
+    }
+
+    /// Shifts the coefficients to new central point
+    /// \param z  z-coordinate of the new central point
+    XPU_D void Shift(float z)
+    {
+      float dz    = z - z0;
+      float cx2dz = cx2 * dz;
+      float cy2dz = cy2 * dz;
+      float cz2dz = cz2 * dz;
+      z0          = z;
+      cx0 += (cx1 + cx2dz) * dz;
+      cy0 += (cy1 + cy2dz) * dz;
+      cz0 += (cz1 + cz2dz) * dz;
+      cx1 += cx2dz + cx2dz;
+      cy1 += cy2dz + cy2dz;
+      cz1 += cz2dz + cz2dz;
+    }
+
+
+   public:
+    // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko)
+    // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2
+    float cx0;
+    float cx1;
+    float cx2;
+
+    // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2
+    float cy0;
+    float cy1;
+    float cy2;
+
+    // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2
+    float cz0;
+    float cz1;
+    float cz2;
+
+    float z0;  ///< z-coordinate of the field region central point
+
+    bool fIsZeroField;  //{false};  ///< Is the field region empty?
+  };
+
+  XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z,
+                                        const GpuFieldValue& b2, float b2z)
   {
     fIsZeroField = (b0.IsZeroField() && b1.IsZeroField() && b2.IsZeroField());
 
     z0        = b0z;
     float dz1 = b1z - b0z;
     float dz2 = b2z - b0z;
-    float det = 1. / (dz1 * dz2 * (b2z - b1z));	//TODO
+    float det = 1. / (dz1 * dz2 * (b2z - b1z));  //TODO
 
     float w21 = -dz2 * det;
     float w22 = dz1 * det;
diff --git a/algo/ca/core/experimental/CaGpuGrid.h b/algo/ca/core/experimental/CaGpuGrid.h
index cccd0cb458..8040148772 100644
--- a/algo/ca/core/experimental/CaGpuGrid.h
+++ b/algo/ca/core/experimental/CaGpuGrid.h
@@ -13,6 +13,7 @@
 #pragma once  // include this header only once per compilation unit
 
 #include "CaGrid.h"
+
 #include <xpu/device.h>
 
 namespace cbm::algo::ca
@@ -145,11 +146,11 @@ namespace cbm::algo::ca
     float fMaxRangeY{0.};  ///< maximal entry range in Y
     float fMaxRangeT{0.};  ///< maximal entry range in T
 
-    int fBinEntryIndex_start{0};	///< index of the first bin entry in the index array
-    int fBinEntryIndex_amount{0};	///< number of entries in the index array
+    int fBinEntryIndex_start{0};   ///< index of the first bin entry in the index array
+    int fBinEntryIndex_amount{0};  ///< number of entries in the index array
 
-    int fEntries_start{0};		///< index of the first entry in the entry array
-    int fEntries_amount{0};		///< number of entries in the entry array
+    int fEntries_start{0};   ///< index of the first entry in the entry array
+    int fEntries_amount{0};  ///< number of entries in the entry array
   };
 
   /// --- Inline methods ---
diff --git a/algo/ca/core/experimental/CaGpuGridArea.h b/algo/ca/core/experimental/CaGpuGridArea.h
index f18851e722..265b24e073 100644
--- a/algo/ca/core/experimental/CaGpuGridArea.h
+++ b/algo/ca/core/experimental/CaGpuGridArea.h
@@ -31,7 +31,8 @@ namespace cbm::algo::ca
     /// \param dx - half-width of the area in X
     /// \param dy - half-width of the area in Y
     //TODO: try to provide first element for the current grid instead of first element of the original arrays
-    XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x, float y, float dx, float dy)
+    XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x,
+                      float y, float dx, float dy)
       : fGrid(grid)
       , fBinEntryIndex(BinEntryIndex)
       , fEntries(Entries)
@@ -64,7 +65,7 @@ namespace cbm::algo::ca
     XPU_D bool GetNextObjectId(unsigned int& objectId);
 
     /// \brief debug mode: loop over the entire GetEntries() vector ignoring the search area
-//    void DoLoopOverEntireGrid();
+    //    void DoLoopOverEntireGrid();
 
    private:
     const ca::GpuGrid& fGrid;
@@ -89,8 +90,8 @@ namespace cbm::algo::ca
       if (fAreaCurrentBinY >= fAreaLastBinY) {
         return false;
       }
-      fAreaCurrentBinY++;            // get new y-line
-      fAreaFirstBin += fGridNbinsX;  // move the left-down corner of the area to the next y-line
+      fAreaCurrentBinY++;                                // get new y-line
+      fAreaFirstBin += fGridNbinsX;                      // move the left-down corner of the area to the next y-line
       fCurentEntry     = fBinEntryIndex[fAreaFirstBin];  // get first hit in cell, if y-line is new
       fEntriesXend     = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX];
       xIndexOutOfRange = (fCurentEntry >= fEntriesXend);
diff --git a/algo/ca/core/experimental/CaGpuMaterialMap.h b/algo/ca/core/experimental/CaGpuMaterialMap.h
index 1699216736..d28babce04 100644
--- a/algo/ca/core/experimental/CaGpuMaterialMap.h
+++ b/algo/ca/core/experimental/CaGpuMaterialMap.h
@@ -65,13 +65,13 @@ namespace cbm::algo::ca
     }
 
    private:
-    int fNbins;    ///< Number of rows (== N columns) in the material budget table
+    int fNbins;     ///< Number of rows (== N columns) in the material budget table
     int fBinStart;  ///< Start index of the material budget table
-    float fXYmax;  ///< Size of the station in x and y dimensions [cm]
+    float fXYmax;   ///< Size of the station in x and y dimensions [cm]
     float fFactor;  ///< Util. var. for the conversion of point coordinates to row/column id
-    float fZref;  ///< Reference Z of the collected material [cm]
-    float fZmin;  ///< Minimal Z of the collected material [cm]
-    float fZmax;  ///< Minimal Z of the collected material [cm]
+    float fZref;    ///< Reference Z of the collected material [cm]
+    float fZmin;    ///< Minimal Z of the collected material [cm]
+    float fZmax;    ///< Minimal Z of the collected material [cm]
   };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuParameters.h b/algo/ca/core/experimental/CaGpuParameters.h
index 231e98359f..f558dd575b 100644
--- a/algo/ca/core/experimental/CaGpuParameters.h
+++ b/algo/ca/core/experimental/CaGpuParameters.h
@@ -8,9 +8,9 @@
 #pragma once  // include this header only once per compilation unit
 
 #include "CaGpuField.h"
-#include "CaMeasurementXy.h"
 #include "CaGpuMaterialMap.h"
 #include "CaGpuStation.h"
+#include "CaMeasurementXy.h"
 #include "CaParameters.h"
 
 #include <xpu/device.h>
@@ -20,7 +20,6 @@ namespace cbm::algo::ca
 
   class GpuParameters {
    public:
-
     /// \brief Default constructor
     GpuParameters() = default;
 
@@ -41,7 +40,7 @@ namespace cbm::algo::ca
       fTargetPos[0] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionX());
       fTargetPos[1] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionY());
       fTargetPos[2] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionZ());
-      for (size_t i = 0; i < (size_t)other.GetNstationsActive(); i++) {
+      for (size_t i = 0; i < (size_t) other.GetNstationsActive(); i++) {
         fStations[i] = other.GetStation(i);
       }
     }
@@ -50,9 +49,8 @@ namespace cbm::algo::ca
     ~GpuParameters() = default;
 
    public:
-
-    ca::GpuFieldValue targB;             ///< Magnetic field in the target region
-    ca::MeasurementXy<float> targetMeasurement;          ///< Measurement XY in the target region
+    ca::GpuFieldValue targB;                     ///< Magnetic field in the target region
+    ca::MeasurementXy<float> targetMeasurement;  ///< Measurement XY in the target region
 
     /// \brief Gets Station
     XPU_D const ca::GpuStation& GetStation(int iStation) const { return fStations[iStation]; }
@@ -71,19 +69,19 @@ namespace cbm::algo::ca
 
     std::array<ca::GpuStation, constants::gpu::MaxNofStations> fStations;  ///< Array of stations
 
-    std::array<float, 3> fTargetPos;	///< Target position
+    std::array<float, 3> fTargetPos;  ///< Target position
 
-    float maxSlopePV;                         ///< Max slope to primary vertex
-    float maxQp;                              ///< Max Q/p
-    float maxDZ;                              ///< Max DZ
-    float particleMass;                       ///< Particle mass
-    float doubletChi2Cut;                     ///< Doublet chi2 cut
-    float tripletChi2Cut;                     ///< Triplet chi2 cut
-    float tripletFinalChi2Cut;                ///< Triplet final chi2 cut
+    float maxSlopePV;           ///< Max slope to primary vertex
+    float maxQp;                ///< Max Q/p
+    float maxDZ;                ///< Max DZ
+    float particleMass;         ///< Particle mass
+    float doubletChi2Cut;       ///< Doublet chi2 cut
+    float tripletChi2Cut;       ///< Triplet chi2 cut
+    float tripletFinalChi2Cut;  ///< Triplet final chi2 cut
 
-    size_t fNStations;                        ///< Number of stations
+    size_t fNStations;  ///< Number of stations
 
-    bool primaryFlag;                         ///< Primary flag
-    bool isTargetField;                       ///< Is target field
+    bool primaryFlag;    ///< Primary flag
+    bool isTargetField;  ///< Is target field
   };
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuStation.h b/algo/ca/core/experimental/CaGpuStation.h
index a2433e45ae..0ee6d43086 100644
--- a/algo/ca/core/experimental/CaGpuStation.h
+++ b/algo/ca/core/experimental/CaGpuStation.h
@@ -22,13 +22,12 @@ namespace cbm::algo::ca
   ///
   class GpuStation {
    public:
-
-    int type;  //  TODO: replace with L1DetectorID
-    int timeInfo;  ///< flag: if time information can be used
+    int type;         //  TODO: replace with L1DetectorID
+    int timeInfo;     ///< flag: if time information can be used
     int fieldStatus;  ///< flag: 1 - station is INSIDE the field, 0 - station is OUTSIDE the field
-    float fZ;  ///< z position of station     [cm]
-    float Xmax;  ///< min radius of the station [cm]
-    float Ymax;  ///< max radius of the station [cm]
+    float fZ;         ///< z position of station     [cm]
+    float Xmax;       ///< min radius of the station [cm]
+    float Ymax;       ///< max radius of the station [cm]
 
     GpuFieldSlice fieldSlice;  ///< Magnetic field near the station
 
@@ -57,35 +56,19 @@ namespace cbm::algo::ca
     int GetFieldStatus() const { return fieldStatus; }
 
     /// \brief Gets z-position of the station
-    XPU_D float GetZ() const
-    {
-      return fZ;
-    }
+    XPU_D float GetZ() const { return fZ; }
 
     /// \brief Gets limit of the station size in x-axis direction
-    XPU_D float GetXmax() const
-    {
-      return Xmax;
-    }
+    XPU_D float GetXmax() const { return Xmax; }
 
     /// \brief Gets limit of the station size in x-axis direction
-    XPU_D float GetXmin() const
-    {
-      return -Xmax;
-    }
+    XPU_D float GetXmin() const { return -Xmax; }
 
     /// \brief Gets limit of the station size in y-axis direction
-    XPU_D float GetYmax() const
-    {
-      return Ymax;
-    }
+    XPU_D float GetYmax() const { return Ymax; }
 
     /// \brief Gets limit of the station size in y-axis direction
-    XPU_D float GetYmin() const
-    {
-      return -Ymax;
-    }
-
+    XPU_D float GetYmin() const { return -Ymax; }
   };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTimeMonitor.h b/algo/ca/core/experimental/CaGpuTimeMonitor.h
index 43f213d628..f8537dd3ab 100644
--- a/algo/ca/core/experimental/CaGpuTimeMonitor.h
+++ b/algo/ca/core/experimental/CaGpuTimeMonitor.h
@@ -33,19 +33,18 @@ namespace cbm::algo::ca
     xpu::timings Total_time[4];
     int nIterations;
 
-    void PrintTimings(int iteration) const {
+    void PrintTimings(int iteration) const
+    {
       if (iteration < 0 || iteration >= 4) {
-	std::cerr << "Error: wrong iteration number: " << iteration << std::endl;
-	return;
+        std::cerr << "Error: wrong iteration number: " << iteration << std::endl;
+        return;
       }
 
       auto print_timing = [](const std::string& name, const xpu::timings& t) {
-	std::cout << std::left << std::setw(25) << name
-	    << "Kernel time: " << std::setw(10) << t.kernel_time()
-	    << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d)
-	    << " D2H copy: " << std::setw(10) << t.copy(xpu::d2h)
-	    << " Memset: " << std::setw(10) << t.memset()
-	    << " Total: " << std::setw(10) << t.wall() << std::endl;
+        std::cout << std::left << std::setw(25) << name << "Kernel time: " << std::setw(10) << t.kernel_time()
+                  << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d) << " D2H copy: " << std::setw(10)
+                  << t.copy(xpu::d2h) << " Memset: " << std::setw(10) << t.memset() << " Total: " << std::setw(10)
+                  << t.wall() << std::endl;
       };
 
       print_timing("PrepareData_time", PrepareData_time[iteration]);
@@ -67,15 +66,13 @@ namespace cbm::algo::ca
   class GpuTimeMonitor {
    public:
     /// \brief  Default constructor
-    GpuTimeMonitor()
-     : nEvents(0)
-    {
-    }
+    GpuTimeMonitor() : nEvents(0) {}
 
     /// \brief Destructor
     ~GpuTimeMonitor() = default;
 
-    void AddEventInfo(XpuTimings &event) {
+    void AddEventInfo(XpuTimings& event)
+    {
       xpuTimings.push_back(event);
       nEvents++;
     }
@@ -100,9 +97,9 @@ namespace cbm::algo::ca
 
     xpu::timings GetTotalTime(int ev, int it) { return xpuTimings[ev].Total_time[it]; }
 
-    private:
-     std::vector<XpuTimings> xpuTimings;
-     int nEvents;
-    };
+   private:
+    std::vector<XpuTimings> xpuTimings;
+    int nEvents;
+  };
 
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
index fb11f74006..5d40dd2219 100644
--- a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
@@ -12,61 +12,64 @@ using namespace cbm::algo::ca;
 GpuTrackFinderSetup::GpuTrackFinderSetup(WindowData& wData, const ca::Parameters<fvec>& pars)
   : fParameters(pars)
   , frWData(wData)
-  , fIteration(0) {}
+  , fIteration(0)
+{
+}
 
 void GpuTrackFinderSetup::SetupParameters()
 {
-  int nStations = fParameters.GetNstationsActive();
+  int nStations   = fParameters.GetNstationsActive();
   int nIterations = fParameters.GetCAIterations().size();
   fTripletConstructor.fParams.reset(nIterations, xpu::buf_io);
   xpu::h_view vfParams{fTripletConstructor.fParams};
 
-  for( int i = 0; i < nIterations; i++ ) {
-    GpuParameters gpuParams = fParameters;
+  for (int i = 0; i < nIterations; i++) {
+    GpuParameters gpuParams              = fParameters;
     fTripletConstructor.fParams_const[i] = fParameters;
-    const auto& caIteration   = fParameters.GetCAIterations()[i];
+    const auto& caIteration              = fParameters.GetCAIterations()[i];
 
-    gpuParams.fNStations = nStations;
-    gpuParams.maxSlopePV = caIteration.GetMaxSlopePV();
-    gpuParams.maxQp = caIteration.GetMaxQp();
-    gpuParams.maxDZ = caIteration.GetMaxDZ();
+    gpuParams.fNStations                            = nStations;
+    gpuParams.maxSlopePV                            = caIteration.GetMaxSlopePV();
+    gpuParams.maxQp                                 = caIteration.GetMaxQp();
+    gpuParams.maxDZ                                 = caIteration.GetMaxDZ();
     fTripletConstructor.fParams_const[i].fNStations = nStations;
     fTripletConstructor.fParams_const[i].maxSlopePV = caIteration.GetMaxSlopePV();
-    fTripletConstructor.fParams_const[i].maxQp = caIteration.GetMaxQp();
-    fTripletConstructor.fParams_const[i].maxDZ = caIteration.GetMaxDZ();
-
-    if(caIteration.GetElectronFlag()){
-        gpuParams.particleMass = constants::phys::ElectronMass;
-        fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass;
-    } else {
-      gpuParams.particleMass = constants::phys::MuonMass;
+    fTripletConstructor.fParams_const[i].maxQp      = caIteration.GetMaxQp();
+    fTripletConstructor.fParams_const[i].maxDZ      = caIteration.GetMaxDZ();
+
+    if (caIteration.GetElectronFlag()) {
+      gpuParams.particleMass                            = constants::phys::ElectronMass;
+      fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass;
+    }
+    else {
+      gpuParams.particleMass                            = constants::phys::MuonMass;
       fTripletConstructor.fParams_const[i].particleMass = constants::phys::MuonMass;
     }
 
-    gpuParams.primaryFlag = caIteration.GetPrimaryFlag();
+    gpuParams.primaryFlag                            = caIteration.GetPrimaryFlag();
     fTripletConstructor.fParams_const[i].primaryFlag = caIteration.GetPrimaryFlag();
 
-    if(caIteration.GetPrimaryFlag()) {
-      gpuParams.targB = fParameters.GetVertexFieldValue();
+    if (caIteration.GetPrimaryFlag()) {
+      gpuParams.targB                            = fParameters.GetVertexFieldValue();
       fTripletConstructor.fParams_const[i].targB = fParameters.GetVertexFieldValue();
-    } else {
-      gpuParams.targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
+    }
+    else {
+      gpuParams.targB                            = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
       fTripletConstructor.fParams_const[i].targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
     }
 
-    gpuParams.doubletChi2Cut = caIteration.GetDoubletChi2Cut();
-    gpuParams.tripletChi2Cut = caIteration.GetTripletChi2Cut();
-    gpuParams.tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut();
-    gpuParams.isTargetField = (fabs(gpuParams.targB.y) > 0.001);
-    fTripletConstructor.fParams_const[i].doubletChi2Cut = caIteration.GetDoubletChi2Cut();
-    fTripletConstructor.fParams_const[i].tripletChi2Cut = caIteration.GetTripletChi2Cut();
+    gpuParams.doubletChi2Cut                                 = caIteration.GetDoubletChi2Cut();
+    gpuParams.tripletChi2Cut                                 = caIteration.GetTripletChi2Cut();
+    gpuParams.tripletFinalChi2Cut                            = caIteration.GetTripletFinalChi2Cut();
+    gpuParams.isTargetField                                  = (fabs(gpuParams.targB.y) > 0.001);
+    fTripletConstructor.fParams_const[i].doubletChi2Cut      = caIteration.GetDoubletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletChi2Cut      = caIteration.GetTripletChi2Cut();
     fTripletConstructor.fParams_const[i].tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut();
 
     ca::MeasurementXy<float> targMeas(fParameters.GetTargetPositionX()[0], fParameters.GetTargetPositionY()[0],
-				  caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(),
-				  caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(),
-				  0, 1, 1);
-    gpuParams.targetMeasurement = targMeas;
+                                      caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(),
+                                      caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(), 0, 1, 1);
+    gpuParams.targetMeasurement                            = targMeas;
     fTripletConstructor.fParams_const[i].targetMeasurement = targMeas;
 
     vfParams[i] = gpuParams;
@@ -74,7 +77,7 @@ void GpuTrackFinderSetup::SetupParameters()
 
   fQueue.copy(fTripletConstructor.fParams, xpu::h2d);
 
-  for( int ista = 0; ista < nStations; ista++ ) {
+  for (int ista = 0; ista < nStations; ista++) {
     fTripletConstructor.fStations_const[ista] = fParameters.GetStation(ista);
   }
 
@@ -83,8 +86,8 @@ void GpuTrackFinderSetup::SetupParameters()
 
 void GpuTrackFinderSetup::SetupGrid()
 {
-  unsigned int nStations = fParameters.GetNstationsActive();
-  unsigned int bin_start = 0;
+  unsigned int nStations     = fParameters.GetNstationsActive();
+  unsigned int bin_start     = 0;
   unsigned int entries_start = 0;
 
   fTripletConstructor.fvGpuGrid.reset(nStations, xpu::buf_io);
@@ -105,7 +108,8 @@ void GpuTrackFinderSetup::SetupGrid()
   bin_start = entries_start = 0;
 
   for (unsigned int ista = 0; ista < nStations; ista++) {
-    std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(), &vfgridFirstBinEntryIndex[bin_start]);
+    std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(),
+                &vfgridFirstBinEntryIndex[bin_start]);
     for (unsigned int i = 0; i < frWData.Grid(ista).GetEntries().size(); i++) {
       vfgridEntries[entries_start + i] = frWData.Grid(ista).GetEntries()[i].GetObjectId();
     }
@@ -120,17 +124,17 @@ void GpuTrackFinderSetup::SetupGrid()
 void GpuTrackFinderSetup::SetupMaterialMap()
 {
   ///Set up material map
-  int bin_start = 0;
-  int nStations = fParameters.GetNstationsActive();
+  int bin_start  = 0;
+  int nStations  = fParameters.GetNstationsActive();
   int fstStation = 0;
-  if( nStations == 8 ) fstStation = 4;
+  if (nStations == 8) fstStation = 4;
   fTripletConstructor.fMaterialMap.reset(nStations, xpu::buf_io);
   xpu::h_view vfMaterialMap{fTripletConstructor.fMaterialMap};
 
   for (int ista = 0; ista < nStations; ista++) {
-      vfMaterialMap[ista] = GpuMaterialMap(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista), bin_start);
-      bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins() *
-	  fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
+    vfMaterialMap[ista] = GpuMaterialMap(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista), bin_start);
+    bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins()
+                 * fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
   }
 
   fTripletConstructor.fMaterialMapTables.reset(bin_start, xpu::buf_io);
@@ -139,12 +143,11 @@ void GpuTrackFinderSetup::SetupMaterialMap()
   bin_start = 0;
 
   for (int ista = 0; ista < nStations; ista++) {
-    std::copy_n (
-	fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().begin(),
-	fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().size(),
-	&vfMaterialMapTables[bin_start]);
-    bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins() *
-	fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
+    std::copy_n(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().begin(),
+                fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().size(),
+                &vfMaterialMapTables[bin_start]);
+    bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins()
+                 * fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
   }
 
   fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
@@ -163,16 +166,16 @@ void GpuTrackFinderSetup::SetInputData()
 
 void GpuTrackFinderSetup::SetupIterationData(int iter)
 {
-//  xpu::scoped_timer t_("SetupIterationData");
+  //  xpu::scoped_timer t_("SetupIterationData");
   fIteration = iter;
 
   fTripletConstructor.fIterationData.reset(1, xpu::buf_io);
   xpu::h_view vfIterationData{fTripletConstructor.fIterationData};
-  vfIterationData[0].fNHits = fNHits;
-  vfIterationData[0].fIteration = iter;
-  vfIterationData[0].fNDoublets = 0;
+  vfIterationData[0].fNHits             = fNHits;
+  vfIterationData[0].fIteration         = iter;
+  vfIterationData[0].fNDoublets         = 0;
   vfIterationData[0].fNDoublets_counter = 0;
-  vfIterationData[0].fNTriplets = 0;
+  vfIterationData[0].fNTriplets         = 0;
   vfIterationData[0].fNTriplets_counter = 0;
 
   fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
@@ -189,10 +192,13 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
   }
 
   bool isCpu  = xpu::device::active().backend() == xpu::cpu;
-  int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits)/ float(kSingletConstructorBlockSize));
-//nBlocks = 1;
+  int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits) / float(kSingletConstructorBlockSize));
+  //nBlocks = 1;
 
-  int nBlocksCD = nBlocks * constants::gpu::MaxDoubletsFromHit;//std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize));
+  int nBlocksCD =
+    nBlocks
+    * constants::gpu::
+      MaxDoubletsFromHit;  //std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize));
 
   //***
   fTripletConstructor.fvTrackParams.reset(frWData.Hits().size(), xpu::buf_device);
@@ -209,24 +215,24 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
     xpu::push_timer("PrepareData_time");
   }
 
-//  if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
 
-//  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
+  //  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
 
-//  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
+  //  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
 
   fQueue.copy(fTripletConstructor.fgridFirstBinEntryIndex, xpu::h2d);
 
   fQueue.copy(fTripletConstructor.fgridEntries, xpu::h2d);
 
-//  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
 
-//  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
 
   fQueue.memset(fTripletConstructor.fHitDoublets, 0);
 
   if constexpr (constants::gpu::GpuTimeMonitoring) {
-    xpu::timings pd_time = xpu::pop_timer();
+    xpu::timings pd_time                           = xpu::pop_timer();
     fEventTimeMonitor.PrepareData_time[fIteration] = pd_time;
 
     xpu::push_timer("MakeSinglets_time");
@@ -235,7 +241,7 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
   fQueue.launch<MakeSinglets>(xpu::n_blocks(nBlocks));
 
   if constexpr (constants::gpu::GpuTimeMonitoring) {
-    xpu::timings sind_time = xpu::pop_timer();
+    xpu::timings sind_time                          = xpu::pop_timer();
     fEventTimeMonitor.MakeSinglets_time[fIteration] = sind_time;
 
     xpu::push_timer("CompressDoublets_time");
@@ -244,7 +250,7 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
   fQueue.launch<MakeDoublets>(xpu::n_blocks(nBlocks));
 
   if constexpr (constants::gpu::GpuTimeMonitoring) {
-    xpu::timings dind_time = xpu::pop_timer();
+    xpu::timings dind_time                          = xpu::pop_timer();
     fEventTimeMonitor.MakeDoublets_time[fIteration] = dind_time;
 
     xpu::push_timer("ResetDoublets_time");
@@ -257,16 +263,17 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
     LOG(info) << "GPU tracking :: Doublets found: " << vfIterationData1[0].fNDoublets;
   }
 
-  if(vfIterationData1[0].fNDoublets > 0) {
+  if (vfIterationData1[0].fNDoublets > 0) {
     fTripletConstructor.fHitDoubletsCompressed.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
     fTripletConstructor.fvTrackParamsDoublets.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
-    fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet, xpu::buf_device);
+    fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet,
+                                           xpu::buf_device);
 
     fQueue.memset(fTripletConstructor.fHitTriplets, 0);
     xpu::set<strGpuTripletConstructor>(fTripletConstructor);
 
     if constexpr (constants::gpu::GpuTimeMonitoring) {
-      xpu::timings rd_time = xpu::pop_timer();
+      xpu::timings rd_time                             = xpu::pop_timer();
       fEventTimeMonitor.ResetDoublets_time[fIteration] = rd_time;
 
       xpu::push_timer("CompressDoublets_time");
@@ -275,18 +282,19 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
     fQueue.launch<CompressDoublets>(xpu::n_blocks(nBlocksCD));
 
     if constexpr (constants::gpu::GpuTimeMonitoring) {
-      xpu::timings cd_time = xpu::pop_timer();
+      xpu::timings cd_time                                = xpu::pop_timer();
       fEventTimeMonitor.CompressDoublets_time[fIteration] = cd_time;
 
       xpu::push_timer("FitDoublets_time");
     }
 
-    int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize));
+    int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets
+                         : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize));
 
     fQueue.launch<FitDoublets>(xpu::n_blocks(nBlocksD));
 
     if constexpr (constants::gpu::GpuTimeMonitoring) {
-      xpu::timings fd_time = xpu::pop_timer();
+      xpu::timings fd_time                           = xpu::pop_timer();
       fEventTimeMonitor.FitDoublets_time[fIteration] = fd_time;
 
       xpu::push_timer("MakeTriplets_time");
@@ -295,35 +303,36 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
     fQueue.launch<MakeTriplets>(xpu::n_blocks(nBlocksD));
 
     if constexpr (constants::gpu::GpuTimeMonitoring) {
-      xpu::timings mt_time = xpu::pop_timer();
+      xpu::timings mt_time                            = xpu::pop_timer();
       fEventTimeMonitor.MakeTriplets_time[fIteration] = mt_time;
 
       xpu::push_timer("ResetTriplets_time");
     }
 
-    fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device);		// TODO: check if we need it for the next iteration
+    fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device);  // TODO: check if we need it for the next iteration
 
     fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
     xpu::h_view vfIterationData2{fTripletConstructor.fIterationData};
 
-    if(vfIterationData2[0].fNTriplets > 0) {
+    if (vfIterationData2[0].fNTriplets > 0) {
       fTripletConstructor.fHitTripletsCompressed.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
-//      fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
+      //      fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
       fTripletConstructor.fvTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
 
-      int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet :
-	  std::ceil(float(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet) / float(kSingletConstructorBlockSize));
+      int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet
+                            : std::ceil(float(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet)
+                                        / float(kSingletConstructorBlockSize));
 
       if constexpr (constants::gpu::CpuSortTriplets) {
         fTripletConstructor.fTripletSortHelper.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
-//        fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
-//        fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io);
+        //        fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+        //        fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io);
       }
 
       xpu::set<strGpuTripletConstructor>(fTripletConstructor);
 
       if constexpr (constants::gpu::GpuTimeMonitoring) {
-        xpu::timings rt_time = xpu::pop_timer();
+        xpu::timings rt_time                             = xpu::pop_timer();
         fEventTimeMonitor.ResetTriplets_time[fIteration] = rt_time;
 
         LOG(info) << "GPU tracking :: Triplets found: " << vfIterationData2[0].fNTriplets;
@@ -334,16 +343,17 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
       fQueue.launch<CompressTriplets>(xpu::n_blocks(nBlocksCT));
 
       if constexpr (constants::gpu::GpuTimeMonitoring) {
-        xpu::timings ct_time = xpu::pop_timer ();
+        xpu::timings ct_time                                = xpu::pop_timer();
         fEventTimeMonitor.CompressTriplets_time[fIteration] = ct_time;
 
         xpu::push_timer("FitTriplets_time");
       }
 
-      fQueue.launch<FitTriplets>(xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/vfIterationData2[0].fNTriplets));
+      fQueue.launch<FitTriplets>(
+        xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/ vfIterationData2[0].fNTriplets));
 
       if constexpr (constants::gpu::GpuTimeMonitoring) {
-        xpu::timings ft_time = xpu::pop_timer ();
+        xpu::timings ft_time                           = xpu::pop_timer();
         fEventTimeMonitor.FitTriplets_time[fIteration] = ft_time;
 
         xpu::push_timer("SortTriplets_time");
@@ -354,7 +364,7 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
       }
 
       if constexpr (constants::gpu::GpuTimeMonitoring) {
-        xpu::timings srt_time = xpu::pop_timer ();
+        xpu::timings srt_time                           = xpu::pop_timer();
         fEventTimeMonitor.SortTriplets_time[fIteration] = srt_time;
       }
 
@@ -375,10 +385,10 @@ void GpuTrackFinderSetup::RunGpuTrackingSetup()
   fTripletConstructor.fgridFirstBinEntryIndex.reset(0, xpu::buf_io);
   fTripletConstructor.fgridEntries.reset(0, xpu::buf_io);
 
-  xpu::set<strGpuTripletConstructor>(fTripletConstructor);	//TODO: check if we need to reset all the buffers here
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);  //TODO: check if we need to reset all the buffers here
 
   if constexpr (constants::gpu::GpuTimeMonitoring) {
-    xpu::timings t = xpu::pop_timer();
+    xpu::timings t                           = xpu::pop_timer();
     fEventTimeMonitor.Total_time[fIteration] = t;
     fEventTimeMonitor.PrintTimings(fIteration);
   }
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
index 3f5132641b..288eaa39ff 100644
--- a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
@@ -8,13 +8,13 @@
 #pragma once  // include this header only once per compilation unit
 
 #include "CaBranch.h"
+#include "CaGpuParameters.h"
+#include "CaGpuTimeMonitor.h"
+#include "CaGpuTripletConstructor.h"
 #include "CaHit.h"
-#include "KfTrackParam.h"
 #include "CaVector.h"
-#include "CaGpuParameters.h"
 #include "CaWindowData.h"
-#include "CaGpuTripletConstructor.h"
-#include "CaGpuTimeMonitor.h"
+#include "KfTrackParam.h"
 
 #include <xpu/host.h>
 
@@ -64,29 +64,29 @@ namespace cbm::algo::ca
     void RunGpuTrackingSetup();
 
     /// Get the number of triplets
-    unsigned int GetNofTriplets() const { return fNTriplets; }//fTripletConstructor.fTempData[0].fNTriplets; }
+    unsigned int GetNofTriplets() const { return fNTriplets; }  //fTripletConstructor.fTempData[0].fNTriplets; }
 
-    ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
-//    ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
+    ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; }  //TODO: switch back
+    //    ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
 
     /// Get timings
     XpuTimings& GetTimings() { return fEventTimeMonitor; }
 
     /// Get triplets
-//    const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; }
+    //    const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; }
 
-//    static constexpr bool fGPU_time_monitoring = true;  // print debug info
+    //    static constexpr bool fGPU_time_monitoring = true;  // print debug info
 
    private:
-    const Parameters<fvec>& fParameters;            ///< Object of Framework parameters class
-    WindowData& frWData;    ///< Reference to the window data
-    xpu::queue fQueue;  ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid?
+    const Parameters<fvec>& fParameters;  ///< Object of Framework parameters class
+    WindowData& frWData;                  ///< Reference to the window data
+    xpu::queue fQueue;                    ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid?
     ca::GpuTripletConstructor fTripletConstructor;  ///< GPU triplet constructor
-    unsigned int fIteration;  ///< Iteration number
-    int fNHits;	///< Number of hits
+    unsigned int fIteration;                        ///< Iteration number
+    int fNHits;                                     ///< Number of hits
 
     int fNTriplets;  ///< Number of triplets
 
     XpuTimings fEventTimeMonitor;
   };
-}
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
index adeb92d0da..bd21eeb5bc 100644
--- a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
@@ -48,32 +48,32 @@ XPU_D void SortTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstr
 
 XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
-  if(iGThread >= fIterationData[0].fNHits) return;
+  if (iGThread >= fIterationData[0].fNHits) return;
 
-  ca::HitIndex_t ihitl = fgridEntries[iGThread];//iGThread;
-  int ista = fvHits[ihitl].Station();
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
+  int ista             = fvHits[ihitl].Station();
 
-//  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
-  if(ista < 0 || ista > (int)fParams_const[fIteration].fNStations - 3) return;
+  //  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if (ista < 0 || ista > (int) fParams_const[fIteration].fNStations - 3) return;
 
-//  const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista);
-//  const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+  //  const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista);
+  //  const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1);
   const ca::GpuStation* sta_l = &fStations_const[ista];
   const ca::GpuStation* sta_m = &fStations_const[ista + 1];
 
   float stal_l_timeInfo = sta_l->timeInfo;
-  float sta_m_z = sta_m->fZ;
+  float sta_m_z         = sta_m->fZ;
 
   const auto& hitl = fvHits[ihitl];
 
-//  const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ());
-//  const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli;
-//  const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli;
+  //  const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ());
+  //  const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli;
+  //  const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli;
   const float dzli = 1.f / (hitl.Z() - fParams_const[fIteration].GetTargetPositionZ());
-  const float tx = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli;
-  const float ty = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli;
+  const float tx   = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli;
+  const float ty   = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli;
 
   kf::TrackParamBase<float>& fit_params = fvTrackParams[ihitl];
 
@@ -84,64 +84,68 @@ XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const
   fit_params.Ty()   = ty;
   fit_params.Qp()   = 0.;
   fit_params.Time() = hitl.T();
-//  fit_params.Vi()   = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv;
-  fit_params.Vi()   = constants::phys::SpeedOfLightInv;//fParams_const[fIteration].SpeedOfLightInv;
+  //  fit_params.Vi()   = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv;
+  fit_params.Vi() = constants::phys::SpeedOfLightInv;  //fParams_const[fIteration].SpeedOfLightInv;
 
-//  float txErr2           = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.;
-//  float qpErr2           = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.;
-  float txErr2           = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.;
-  float qpErr2           = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.;
+  //  float txErr2           = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.;
+  //  float qpErr2           = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.;
+  float txErr2 = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.;
+  float qpErr2 = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.;
 
   fit_params.ResetErrors(1., 1., txErr2, txErr2, qpErr2, (stal_l_timeInfo ? hitl.dT2() : 1.e6), 1.e10);
 
-  fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].maxQp);
+  fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
 
   fit_params.C(0, 0) = hitl.dX2();
   fit_params.C(1, 0) = hitl.dXY();
   fit_params.C(1, 1) = hitl.dY2();
 
-//  fit_params.Ndf()     = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.;
+  //  fit_params.Ndf()     = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.;
   fit_params.Ndf()     = fParams_const[fIteration].primaryFlag ? 2. : 0.;
   fit_params.NdfTime() = (stal_l_timeInfo ? 0 : -1);
 
-//  ca::GpuFieldRegion fld0;
+  //  ca::GpuFieldRegion fld0;
   {
     int sta1 = ista;
-    if(sta1 == 0) sta1 = 1;
+    if (sta1 == 0) sta1 = 1;
     int sta0 = sta1 / 2;  // station between ista and the target
-//    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params);
-//    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue &B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue &B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params);
     //fld0.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B0, fParams[fIterationData[0].fIteration].GetStation(sta0).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(sta1).fZ);
-    ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB, fParams_const[fIteration].GetTargetPositionZ(), B0, fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ);
+    ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB,
+                                                   fParams_const[fIteration].GetTargetPositionZ(), B0,
+                                                   fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ);
   }
-//  ca::GpuFieldRegion fld1;
+  //  ca::GpuFieldRegion fld1;
   {  // field, made by the left hit, the middle station and the right station
-//    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params);
-//    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
-//    const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue &B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue &B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params);
-    const ca::GpuFieldValue &B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params);
-//    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
-    ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2, fStations_const[ista + 2].fZ);
+    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params);
+    //    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+    ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2,
+                                                   fStations_const[ista + 2].fZ);
   }
 
-  FilterWithTargetAtLine(&fit_params, &/*fld0*/ctx.smem().fld0_shared[ctx.thread_idx_x()]);
+  FilterWithTargetAtLine(&fit_params, &/*fld0*/ ctx.smem().fld0_shared[ctx.thread_idx_x()]);
 
-  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())], /*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].maxQp);
+  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())],
+                     /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
 
   // extrapolate to the middle hit
-  ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ctx.smem().fld1_shared[ctx.thread_idx_x()]);
+  ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ ctx.smem().fld1_shared[ctx.thread_idx_x()]);
 
-//  ExtrapolateLine(tr_par, sta_m->fZ, F1);	//TODO: check if ExtrapolateStep is enough
+  //  ExtrapolateLine(tr_par, sta_m->fZ, F1);	//TODO: check if ExtrapolateStep is enough
 }
 
 XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if(iGThread >= fIterationData[0].fNHits) return;
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNHits) return;
 
 #if 0
   if (ctx.thread_idx_x() == 0) {
@@ -149,15 +153,15 @@ XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
   }
 #endif
 
-  ca::HitIndex_t ihitl = fgridEntries[iGThread];//iGThread;
-  int ista = fvHits[ihitl].Station();
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
+  int ista             = fvHits[ihitl].Station();
 
-  if(ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if (ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
 
   int collectedHits = 0;
-  int maxNHits = constants::gpu::MaxDoubletsFromHit;//fParams[fIterationData[0].fIteration].maxDoubletsFromHit;
+  int maxNHits      = constants::gpu::MaxDoubletsFromHit;  //fParams[fIterationData[0].fIteration].maxDoubletsFromHit;
 
-  CollectHits(iGThread, ista+1, ihitl, maxNHits, &collectedHits);
+  CollectHits(iGThread, ista + 1, ihitl, maxNHits, &collectedHits);
 
   xpu::atomic_add(&(fIterationData[0].fNDoublets), collectedHits);
 
@@ -173,13 +177,13 @@ XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
 
 XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
-//  if( iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit ) return;
-  if( iGThread >= fIterationData[0].fNHits * constants::gpu::MaxDoubletsFromHit ) return;
+  //  if( iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit ) return;
+  if (iGThread >= fIterationData[0].fNHits * constants::gpu::MaxDoubletsFromHit) return;
 
-  if( fHitDoublets[iGThread].hit2 != 0 ) {
-    int index = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1);
+  if (fHitDoublets[iGThread].hit2 != 0) {
+    int index                          = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1);
     fHitDoubletsCompressed[index].hit1 = fHitDoublets[iGThread].hit1;
     fHitDoubletsCompressed[index].hit2 = fHitDoublets[iGThread].hit2;
   }
@@ -187,24 +191,25 @@ XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ct
 
 XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if(iGThread >= fIterationData[0].fNDoublets) return;
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets) return;
 
   int hit_l = fHitDoubletsCompressed[iGThread].hit1;
   int hit_m = fHitDoubletsCompressed[iGThread].hit2;
 
-  if(hit_l < 0 || hit_m < 0) return;
+  if (hit_l < 0 || hit_m < 0) return;
 
   int ista = fvHits[hit_l].Station();
 
-  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
 
   const ca::GpuStation& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
   const ca::GpuStation& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
 
   auto fit_params = fvTrackParams[hit_l];
 
-  bool isMomentumFitted = (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0));
+  bool isMomentumFitted =
+    (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0));
   float maxQp = fParams[fIterationData[0].fIteration].maxQp;
 
   // fit doublet
@@ -222,22 +227,23 @@ XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const
 
   FilterTime(&fit_params, t_2, dt2_2, sta_m.timeInfo);
 
-//      fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp);	//TODO: check why it is required
+  //      fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp);	//TODO: check why it is required
   const float qp0 = isMomentumFitted ? fit_params.GetQp() : maxQp;
 
-  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0);
+  MultipleScattering(&fit_params,
+                     fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0);
 
   fvTrackParamsDoublets[iGThread] = fit_params;
-//      fFit.SetQp0(fFit.Tr().Qp());	//TODO: check why it is required and how it is used later, have to be stored different way
+  //      fFit.SetQp0(fFit.Tr().Qp());	//TODO: check why it is required and how it is used later, have to be stored different way
 }
 
 XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-  if(iGThread >= fIterationData[0].fNDoublets) return;
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets) return;
 
   int hit_l = fHitDoubletsCompressed[iGThread].hit1;
-//  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
+  //  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
 
 #if 0
   if (ctx.thread_idx_x() == 0) {
@@ -247,24 +253,30 @@ XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
 
   int ista = fvHits[hit_l].Station();
 
-  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
 
   //
   ista = fvHits[hit_l].Station();
   //
 
-//  const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
-//  const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+  //  const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
+  //  const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
   const ca::GpuStation& sta_r = fParams[fIterationData[0].fIteration].GetStation(ista + 2);
 
   auto& fit_params = fvTrackParamsDoublets[iGThread];
 
-  ca::GpuFieldRegion fld1;	//TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified
-  {  // field, made by the left hit, the middle station and the right station
-    ca::GpuFieldValue B0 = fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params);
-    ca::GpuFieldValue B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
-    ca::GpuFieldValue B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
-    fld1.Set (B0, fParams[fIterationData[0].fIteration].GetStation (ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+  ca::GpuFieldRegion
+    fld1;  //TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified
+  {        // field, made by the left hit, the middle station and the right station
+    ca::GpuFieldValue B0 =
+      fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B1 =
+      fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B2 =
+      fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1,
+             fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2,
+             fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
   }
 
   ExtrapolateStep(&fit_params, sta_r.fZ, 0.f, &fld1);
@@ -272,9 +284,10 @@ XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
   //TODO: add isDoubletGood check if needed
 
   int collectedHits = 0;
-  int maxNHits = constants::gpu::MaxTripletsFromDoublet;//fParams[fIterationData[0].fIteration].maxTripletsFromDoublet;
+  int maxNHits =
+    constants::gpu::MaxTripletsFromDoublet;  //fParams[fIterationData[0].fIteration].maxTripletsFromDoublet;
 
-  CollectHitsTriplets(iGThread, ista+2, maxNHits, &collectedHits);
+  CollectHitsTriplets(iGThread, ista + 2, maxNHits, &collectedHits);
 
   xpu::atomic_add(&(fIterationData[0].fNTriplets), collectedHits);
 
@@ -291,12 +304,12 @@ XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
 
 XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
-//  if( iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet ) return;
-  if( iGThread >= fIterationData[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet ) return;
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  //  if( iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet ) return;
+  if (iGThread >= fIterationData[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet) return;
 
-  if( fHitTriplets[iGThread].hit3 != 0 ) {
-    int index = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1);
+  if (fHitTriplets[iGThread].hit3 != 0) {
+    int index                          = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1);
     fHitTripletsCompressed[index].hit1 = fHitTriplets[iGThread].hit1;
     fHitTripletsCompressed[index].hit2 = fHitTriplets[iGThread].hit2;
     fHitTripletsCompressed[index].hit3 = fHitTriplets[iGThread].hit3;
@@ -305,9 +318,9 @@ XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ct
 
 XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 {
-  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
-  if(iGThread >= fIterationData[0].fNTriplets) return;
+  if (iGThread >= fIterationData[0].fNTriplets) return;
 
   int hit_l = fHitTripletsCompressed[iGThread].hit1;
 
@@ -317,21 +330,22 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
   ista[1] = ista[0] + 1;
   ista[2] = ista[1] + 1;
 
-//  ca::GpuStation sta[3];
-  const ca::GpuStation *sta[3];
+  //  ca::GpuStation sta[3];
+  const ca::GpuStation* sta[3];
   for (int is = 0; is < 3; ++is) {
-//    sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]);
+    //    sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]);
     sta[is] = &fStations_const[ista[is]];
   };
 
   bool isMomentumFitted = ((sta[0]->fieldStatus != 0) || (sta[1]->fieldStatus != 0) || (sta[2]->fieldStatus != 0));
 
-  float ndfTrackModel = 4;                     // straight line
+  float ndfTrackModel = 4;                    // straight line
   ndfTrackModel += isMomentumFitted ? 1 : 0;  // track with momentum
 
   const float maxQp = fParams[fIterationData[0].fIteration].maxQp;
 
-  ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2, fHitTripletsCompressed[iGThread].hit3};
+  ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2,
+                            fHitTripletsCompressed[iGThread].hit3};
 
   float x[3], y[3], z[3], t[3];
   float dt2[3];
@@ -339,7 +353,7 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
   for (int ih = 0; ih < 3; ++ih) {
     const ca::Hit& hit = fvHits[ihit[ih]];
-    mxy[ih] = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+    mxy[ih]            = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
 
     x[ih]   = hit.X();
     y[ih]   = hit.Y();
@@ -358,31 +372,33 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
   float ty[3] = {(y[1] - y[0]) / (z[1] - z[0]), (y[2] - y[0]) / (z[2] - z[0]), (y[2] - y[1]) / (z[2] - z[1])};
   for (int ih = 0; ih < 3; ++ih) {
     float dz = (sta[ih]->fZ - z[ih]);
-    B[ih]   = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz);
+    B[ih]    = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz);
   };
 
   fld.Set(B[0], sta[0]->fZ, B[1], sta[1]->fZ, B[2], sta[2]->fZ);
-  fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B[0], sta[0]->fZ, B[1], sta[1]->fZ);
+  fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(),
+                B[0], sta[0]->fZ, B[1], sta[1]->fZ);
 
-  kf::TrackParamBase<float> fit_params;// = fvTrackParamsTriplets[iGThread];	//TODO: set all to 0 and recheck - done, no difference
-//  auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()];	//TODO: shared memory cannot be used for classes with preinitialized data
+  kf::TrackParamBase<float>
+    fit_params;  // = fvTrackParamsTriplets[iGThread];	//TODO: set all to 0 and recheck - done, no difference
+  //  auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()];	//TODO: shared memory cannot be used for classes with preinitialized data
 
   fit_params.ResetCovMatrix();
 
   // initial parameters
   {
-    float dz01 = 1.f / (z[1] - z[0]);
-    fit_params.Tx()    = (x[1] - x[0]) * dz01;
-    fit_params.Ty()    = (y[1] - y[0]) * dz01;
-    fit_params.Qp()    = 0.f;
-    fit_params.Vi()    = 0.f;
+    float dz01      = 1.f / (z[1] - z[0]);
+    fit_params.Tx() = (x[1] - x[0]) * dz01;
+    fit_params.Ty() = (y[1] - y[0]) * dz01;
+    fit_params.Qp() = 0.f;
+    fit_params.Vi() = 0.f;
   }
 
   bool not_fitted = false;
-  float qp0 = 0;
+  float qp0       = 0;
   // repeat several times in order to improve the precision
-  for (int iiter = 0; iiter < 2/*nIterations*/; ++iiter) {
-  // fit forward
+  for (int iiter = 0; iiter < 2 /*nIterations*/; ++iiter) {
+    // fit forward
     {
       qp0 = fit_params.GetQp();
       if (qp0 > maxQp) qp0 = maxQp;
@@ -401,17 +417,17 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
       fit_params.Ndf()     = -ndfTrackModel + 2;
       fit_params.NdfTime() = sta[0]->timeInfo ? 0 : -1;
-    //  add the target constraint
+      //  add the target constraint
       FilterWithTargetAtLine(&fit_params, &fldTarget);
       for (int ih = 1; ih < 3; ++ih) {
         ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
 
-//	Extrapolate(&fit_params, z[ih], 0.f, &fld);	//TODO: add qp0 instead of 0.f
+        //	Extrapolate(&fit_params, z[ih], 0.f, &fld);	//TODO: add qp0 instead of 0.f
 
-        int bin = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY());
+        int bin       = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY());
         auto radThick = fMaterialMapTables[bin];
 
-        MultipleScattering(&fit_params, radThick, qp0);	//TODO: why it was commented in original gpu code?
+        MultipleScattering(&fit_params, radThick, qp0);  //TODO: why it was commented in original gpu code?
 
         float Tx    = fit_params.Tx();
         float Ty    = fit_params.Ty();
@@ -422,26 +438,29 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
         float tt    = xpu::sqrt(txtx1 + tyty);
         float h2    = h * h;
         float qp0t  = qp0 * tt;
-        float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 *  0.333333f, c6 = -c3 * 0.25f;
+        float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
 
         float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
-        float a = ((tt + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
-//        fit_params.C(2, 2) += 0.001;//txtx1 * a;	//TODO: check if it is needed. Switching on leads to the strong ghosts increase
+        float a  = ((tt
+                    + fParams[fIterationData[0].fIteration].particleMass
+                        * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t)
+                   * radThick * s0 * s0);
+        //        fit_params.C(2, 2) += 0.001;//txtx1 * a;	//TODO: check if it is needed. Switching on leads to the strong ghosts increase
 
         fit_params.C(3, 2) += Tx * Ty * a;
         fit_params.C(3, 3) += (1. + tyty) * a;
 
         EnergyLossCorrection(&fit_params, radThick, -1.f);
-        FilterXY (&fit_params, mxy[ih]);
-        FilterTime (&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
+        FilterXY(&fit_params, mxy[ih]);
+        FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
       }
     }
 
-    if(iiter == 1) break;
+    if (iiter == 1) break;
 
     // fit backward
     {
-      qp0 = fit_params.GetQp ();
+      qp0 = fit_params.GetQp();
       if (qp0 > maxQp) qp0 = maxQp;
       if (qp0 < -maxQp) qp0 = -maxQp;
 
@@ -454,8 +473,8 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
       fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[2]->timeInfo ? dt2[2] : 1.e6f), 1.e2f);
 
-      fit_params.Ndf () = -ndfTrackModel + 2;
-      fit_params.NdfTime () = sta[2]->timeInfo ? 0 : -1;
+      fit_params.Ndf()     = -ndfTrackModel + 2;
+      fit_params.NdfTime() = sta[2]->timeInfo ? 0 : -1;
 
       for (int ih = 1; ih >= 0; --ih) {
         ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
@@ -470,20 +489,21 @@ XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
 
   float chi2 = fit_params.GetChiSq() + fit_params.GetChiSqTime();
 
-  if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite (chi2) || chi2 < 0.f || not_fitted){
-      chi2 = -1.;
+  if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite(chi2) || chi2 < 0.f
+      || not_fitted) {
+    chi2 = -1.;
   }
 
-  const float qp = fit_params.GetQp();
+  const float qp  = fit_params.GetQp();
   const float Cqp = fit_params.C(4, 4) + 0.001;
-  fvTriplets[iGThread] = ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0,
-				     chi2, qp, Cqp, fit_params.Tx(), fit_params.C(2, 2), fit_params.Ty(),
-				     fit_params.C(3, 3), isMomentumFitted);
+  fvTriplets[iGThread] =
+    ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0, chi2, qp, Cqp, fit_params.Tx(),
+                fit_params.C(2, 2), fit_params.Ty(), fit_params.C(3, 3), isMomentumFitted);
 
   if constexpr (constants::gpu::GpuSortTriplets) {
     fTripletSortHelper[iGThread].tr_id = iGThread;
-//    fTripletSortHelper[iGThread].lsta  = ista[0];
-    fTripletSortHelper[iGThread].lhit  = ihit[0];
+    //    fTripletSortHelper[iGThread].lsta  = ista[0];
+    fTripletSortHelper[iGThread].lhit = ihit[0];
   }
 }
 
@@ -495,7 +515,7 @@ XPU_D void GpuTripletConstructor::SortTripletsFunc(SortTriplets::context& ctx) c
 XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const
 {
   // Collect hits in the station iSta and make doublets
-  const ca::GpuStation& sta = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
   kf::TrackParamBase<float> fit_params = fvTrackParams[iHit];
 
   fit_params.ChiSq() = 0.;
@@ -507,16 +527,20 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
   const auto& gpuGrid = fvGpuGrid[iSta];
 
-  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
-  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
 
-  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]), &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(), pick_range_x, pick_range_y);
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
+                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
+                          pick_range_x, pick_range_y);
   unsigned int ih_gpu = 0;
 
   while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
-//    if (fvbHitSuppressed[ih]) {
-//      continue;
-//    }
+                                                                            //    if (fvbHitSuppressed[ih]) {
+                                                                            //      continue;
+                                                                            //    }
     const ca::Hit& hit = fvHits[ih_gpu];
 
     // check time-boundaries
@@ -531,7 +555,9 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
     // check y-boundaries
     float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
-    float C11 = fit_params.C(1, 1) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+    float C11 =
+      fit_params.C(1, 1)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
 
     float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
 
@@ -543,7 +569,9 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
     // check x-boundaries
     float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
-    float C00 = fit_params.C(0, 0) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+    float C00 =
+      fit_params.C(0, 0)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
 
     float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
 
@@ -554,7 +582,9 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
     }
 
     // check chi2
-    float C10 = fit_params.C(1, 0) + (hit.Z() - fit_params.Z()) * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+    float C10 = fit_params.C(1, 0)
+                + (hit.Z() - fit_params.Z())
+                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
 
     ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
 
@@ -567,9 +597,9 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
       const float HCH = F0;
 
-      const float wi = 1.f / (mxy.Dx2() + HCH);
+      const float wi     = 1.f / (mxy.Dx2() + HCH);
       const float zetawi = zeta * wi;
-      chi2x = mxy.NdfX() * zeta * zetawi;
+      chi2x              = mxy.NdfX() * zeta * zetawi;
 
       const float K1 = F1 * wi;
 
@@ -584,8 +614,8 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
     float chi2u = 0.;
     {  // filter U measurement, we need only chi2 here
       const float cosPhi = -mxy.Dxy() / mxy.Dx2();
-      const float u = cosPhi * mxy.X() + mxy.Y();
-      const float du2 = mxy.Dy2() + cosPhi * mxy.Dxy();
+      const float u      = cosPhi * mxy.X() + mxy.Y();
+      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
 
       const float zeta = cosPhi * x + y - u;
 
@@ -594,7 +624,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
       const float HCH = (F0 * cosPhi + F1);
 
-      chi2u += mxy.NdfY () * zeta * zeta / (du2 + HCH);
+      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
     }
 
     if (chi2x > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
@@ -603,7 +633,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
     if (chi2x + chi2u > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
       continue;
     }
-    int id = iThread * maxNHits + *collectedHits;
+    int id                = iThread * maxNHits + *collectedHits;
     fHitDoublets[id].hit1 = iHit;
     fHitDoublets[id].hit2 = ih_gpu;
     *collectedHits += 1;
@@ -613,7 +643,7 @@ XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, i
 
 XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const
 {
-  const ca::GpuStation& sta = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
   kf::TrackParamBase<float> fit_params = fvTrackParamsDoublets[iThread];
 
   fit_params.ChiSq() = 0.;
@@ -625,16 +655,20 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
   const auto& gpuGrid = fvGpuGrid[iSta];
 
-  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
-  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY() + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
 
-  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]), &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(), pick_range_x, pick_range_y);
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
+                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
+                          pick_range_x, pick_range_y);
   unsigned int ih_gpu = 0;
 
   while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
-//    if (fvbHitSuppressed[ih]) {
-//      continue;
-//    }
+                                                                            //    if (fvbHitSuppressed[ih]) {
+                                                                            //      continue;
+                                                                            //    }
     const ca::Hit& hit = fvHits[ih_gpu];
 
     // check time-boundaries
@@ -651,7 +685,9 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
     // check y-boundaries
     float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
-    float C11 = fit_params.C(1, 1) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+    float C11 =
+      fit_params.C(1, 1)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
 
     float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
 
@@ -663,7 +699,9 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
     // check x-boundaries
     float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
-    float C00 = fit_params.C(0, 0) + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+    float C00 =
+      fit_params.C(0, 0)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
 
     float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
 
@@ -674,22 +712,24 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
     }
 
     // check chi2
-    float C10 = fit_params.C(1, 0) + (hit.Z() - fit_params.Z()) * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+    float C10 = fit_params.C(1, 0)
+                + (hit.Z() - fit_params.Z())
+                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
 
     ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
 
     float chi2x = 0.;
     {  // filter X measurement
-      const float zeta = x - mxy.X ();
+      const float zeta = x - mxy.X();
 
       const float F0 = C00;
       const float F1 = C10;
 
       const float HCH = F0;
 
-      const float wi = 1.f / (mxy.Dx2 () + HCH);
+      const float wi     = 1.f / (mxy.Dx2() + HCH);
       const float zetawi = zeta * wi;
-      chi2x = mxy.NdfX () * zeta * zetawi;
+      chi2x              = mxy.NdfX() * zeta * zetawi;
 
       const float K1 = F1 * wi;
 
@@ -703,9 +743,9 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
     float chi2u = 0.;
     {  // filter U measurement, we need only chi2 here
-      const float cosPhi = -mxy.Dxy () / mxy.Dx2 ();
-      const float u = cosPhi * mxy.X () + mxy.Y ();
-      const float du2 = mxy.Dy2 () + cosPhi * mxy.Dxy ();
+      const float cosPhi = -mxy.Dxy() / mxy.Dx2();
+      const float u      = cosPhi * mxy.X() + mxy.Y();
+      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
 
       const float zeta = cosPhi * x + y - u;
 
@@ -714,7 +754,7 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
 
       const float HCH = (F0 * cosPhi + F1);
 
-      chi2u += mxy.NdfY () * zeta * zeta / (du2 + HCH);
+      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
     }
 
     if (chi2x > fParams[fIterationData[0].fIteration].tripletChi2Cut) {
@@ -724,37 +764,42 @@ XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int
       continue;
     }
 
-    int id = iThread * maxNHits + *collectedHits;
+    int id                = iThread * maxNHits + *collectedHits;
     fHitTriplets[id].hit1 = fHitDoubletsCompressed[iThread].hit1;
     fHitTriplets[id].hit2 = fHitDoubletsCompressed[iThread].hit2;
     fHitTriplets[id].hit3 = ih_gpu;
     *collectedHits += 1;
   }  // loop over the hits in the area
-//  if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits);
+  //  if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits);
 }
 
-XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const
+XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par,
+                                                         const ca::GpuFieldRegion* F0) const
 {
   // Add the target constraint to a straight line track
 
   float eXY[2];
   float Jx[7], Jy[7];
 
-  GetExtrapolatedXYline(tr_par, /*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].GetTargetPositionZ(), F0, eXY/*, &eY*/, Jx, Jy);
+  GetExtrapolatedXYline(tr_par,
+                        /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].GetTargetPositionZ(), F0,
+                        eXY /*, &eY*/, Jx, Jy);
 
-  FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/fParams_const[fIteration].targetMeasurement), eXY/*, eY*/, Jx, Jy);
+  FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].targetMeasurement),
+                       eXY /*, eY*/, Jx, Jy);
 }
 
-XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,// float* extrY,
-//                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
-//                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
-				     float* Jx,
-				     float* Jy) const
+XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(
+  kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,  // float* extrY,
+  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+  float* Jx, float* Jy) const
 {
   // extrapolate track assuming it is straight (qp==0)
   // return the extrapolated X, Y and the derivatives of the extrapolated X and Y
 
-  const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(/*1. / 6.*/0.166667), c12i(/*1. / 12.*/0.083333);	//TODO: test constants
+  const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(/*1. / 6.*/ 0.166667),
+    c12i(/*1. / 12.*/ 0.083333);  //TODO: test constants
 
   const float tx = tr_par->GetTx();
   const float ty = tr_par->GetTy();
@@ -780,8 +825,8 @@ XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(kf::TrackParamBase<float
   extrXY[0] = tr_par->GetX() + tx * dz;
   extrXY[1] = tr_par->GetY() + ty * dz;
 
-//  Jx->fill(0.);  // for a case
-//  Jy->fill(0.);
+  //  Jx->fill(0.);  // for a case
+  //  Jy->fill(0.);
 
   Jx[0] = 1.;
   Jx[1] = 0.;
@@ -801,10 +846,11 @@ XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(kf::TrackParamBase<float
   Jy[6] = 0.;
 }
 
-XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,// fvec extrY,
-//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
-//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
-						       float *Jx, float *Jy) const
+XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
+  kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
+  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+  float* Jx, float* Jy) const
 {
   // add a 2-D measurenent (x,y) at some z, that differs from fTr.GetZ()
   // extrX, extrY are extrapolated track parameters at z, Jx, Jy are derivatives of the extrapolation
@@ -812,7 +858,7 @@ XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(kf::TrackParamBase<float>
   // ! it is assumed that in the track covariance matrix all non-diagonal covariances are 0
   // ! except of C10
 
-//  TrackParamV& T = fTr;
+  //  TrackParamV& T = fTr;
 
   //zeta0 = T.x + Jx[2]*T.Tx() + Jx[3]*T.Ty() + Jx[4]*T.qp - x;
   //zeta1 = T.y + Jy[2]*T.Tx() + Jy[3]*T.Ty() + Jy[4]*T.qp - y;
@@ -825,8 +871,8 @@ XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(kf::TrackParamBase<float>
 
   // F = CH'
   const float F00 = tr_par->C(0, 0);
-  const float F01 = tr_par->C(1, 0);	//TODO: is it a bug???
-  const float F10 = F01;//tr_par->C10();
+  const float F01 = tr_par->C(1, 0);  //TODO: is it a bug???
+  const float F10 = F01;              //tr_par->C10();
   const float F11 = tr_par->C(1, 1);
   const float F20 = Jx[2] * tr_par->C(2, 2);
   const float F21 = Jy[2] * tr_par->C(2, 2);
@@ -841,11 +887,11 @@ XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(kf::TrackParamBase<float>
   float S10 = m->Dxy() + F10 + Jy[2] * F20 + Jy[3] * F30 + Jy[4] * F40;
   float S11 = m->Dy2() + F11 + Jy[2] * F21 + Jy[3] * F31 + Jy[4] * F41;
 
-  const float si = 1. / (S00 * S11 - S10 * S10);
+  const float si     = 1. / (S00 * S11 - S10 * S10);
   const float S00tmp = S00;
-  S00         = si * S11;
-  S10         = -si * S10;
-  S11         = si * S00tmp;
+  S00                = si * S11;
+  S10                = -si * S10;
+  S11                = si * S00tmp;
 
   tr_par->ChiSq() += zeta0 * zeta0 * S00 + 2. * zeta0 * zeta1 * S10 + zeta1 * zeta1 * S11;
   tr_par->Ndf() += m->NdfX() + m->NdfY();
@@ -896,17 +942,19 @@ XPU_D void GpuTripletConstructor::MultipleScattering(kf::TrackParamBase<float>*
   float h2    = h * h;
   float qp0t  = qp0 * t;
 
-  float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 *  0.333333f, c6 = -c3 * 0.25f;
+  float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
 
   float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
-//  float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
-  float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
+  //  float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
+  float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t)
+             * radThick * s0 * s0);
   tr_par->C(2, 2) += txtx1 * a;
   tr_par->C(3, 2) += tx * ty * a;
   tr_par->C(3, 3) += (1. + tyty) * a;
 }
 
-XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0, const ca::GpuFieldRegion* Field) const
+XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
+                                                  const ca::GpuFieldRegion* Field) const
 {
   const float c_light = 0.000299792458;
 
@@ -920,8 +968,8 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
 
   //   Runge-Kutta steps
   //
-  const float r0[7]    = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()};
-  float R0[7][7] = {{0.}};
+  const float r0[7] = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()};
+  float R0[7][7]    = {{0.}};
   for (int i = 0; i < 7; ++i) {
     R0[i][i] = 1.;
   }
@@ -932,16 +980,16 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
     for (int i = 0; i < 7; ++i) {
       rstep[i] = r0[i] + stepDz[step] * f[step - 1][i];
     }
-    const float z = tr_par->GetZ() + stepDz[step];
+    const float z       = tr_par->GetZ() + stepDz[step];
     ca::GpuFieldValue B = Field->Get(rstep[0], rstep[1], z);
-    const float tx2   = rstep[2] * rstep[2];
-    const float ty2   = rstep[3] * rstep[3];
-    const float txty  = rstep[2] * rstep[3];
-    const float L2    = 1.f + tx2 + ty2;
-    const float L2i   = 1.f / L2;
-    const float L     = xpu::sqrt(L2);
-    const float cL    = c_light * L;
-    const float cLqp0 = cL * qp0;
+    const float tx2     = rstep[2] * rstep[2];
+    const float ty2     = rstep[3] * rstep[3];
+    const float txty    = rstep[2] * rstep[3];
+    const float L2      = 1.f + tx2 + ty2;
+    const float L2i     = 1.f / L2;
+    const float L       = xpu::sqrt(L2);
+    const float cL      = c_light * L;
+    const float cLqp0   = cL * qp0;
 
     f[step][0]    = rstep[2];
     F[step][0][2] = 1.;
@@ -950,56 +998,56 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
     F[step][1][3] = 1.;
 
     const float f2tmp = txty * B.x - (1. + tx2) * B.y + rstep[3] * B.z;
-    f[step][2] = cLqp0 * f2tmp;
+    f[step][2]        = cLqp0 * f2tmp;
 
     F[step][2][2] = cLqp0 * (rstep[2] * f2tmp * L2i + rstep[3] * B.x - 2. * rstep[2] * B.y);
     F[step][2][3] = cLqp0 * (rstep[3] * f2tmp * L2i + rstep[2] * B.x + B.z);
     F[step][2][4] = cL * f2tmp;
 
-    const float f3tmp    = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x;
-    f[step][3]    = cLqp0 * f3tmp;
-    F[step][3][2] = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z);
-    F[step][3][3] = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y);
-    F[step][3][4] = cL * f3tmp;
+    const float f3tmp = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x;
+    f[step][3]        = cLqp0 * f3tmp;
+    F[step][3][2]     = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z);
+    F[step][3][3]     = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y);
+    F[step][3][4]     = cL * f3tmp;
 
     f[step][4] = 0.;
 
-//    if (fDoFitVelocity) {	//TODO: check and switch it on back if needed (for triplets, probably, not needed)
-////      fvec vi       = rstep[6];
-      f[step][5]    = rstep[6] * L;
-      const float Li = 1.f / L;
-      F[step][5][2] = rstep[6] * rstep[2] * Li;// / L;
-      F[step][5][3] = rstep[6] * rstep[3] * Li;// / L;
-      F[step][5][4] = 0.;
-      F[step][5][5] = 0.;
-      F[step][5][6] = L;
-//    }
-//    else {
-//      const float vi       = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
-//      f[step][5]    = rstep[6] * L;
-//      F[step][5][2] = rstep[6] * rstep[2] / L;
-//      F[step][5][3] = rstep[6] * rstep[3] / L;
-//      F[step][5][4] =
-//	  fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L /
-//	  xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
-//      F[step][5][5] = 0.;
-//      F[step][5][6] = 0.;
-//    }
+    //    if (fDoFitVelocity) {	//TODO: check and switch it on back if needed (for triplets, probably, not needed)
+    ////      fvec vi       = rstep[6];
+    f[step][5]     = rstep[6] * L;
+    const float Li = 1.f / L;
+    F[step][5][2]  = rstep[6] * rstep[2] * Li;  // / L;
+    F[step][5][3]  = rstep[6] * rstep[3] * Li;  // / L;
+    F[step][5][4]  = 0.;
+    F[step][5][5]  = 0.;
+    F[step][5][6]  = L;
+    //    }
+    //    else {
+    //      const float vi       = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+    //      f[step][5]    = rstep[6] * L;
+    //      F[step][5][2] = rstep[6] * rstep[2] / L;
+    //      F[step][5][3] = rstep[6] * rstep[3] / L;
+    //      F[step][5][4] =
+    //	  fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L /
+    //	  xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+    //      F[step][5][5] = 0.;
+    //      F[step][5][6] = 0.;
+    //    }
 
     f[step][6] = 0.;
   }  // end of Runge-Kutta step
 
-  float r[7]    = {0.};  // extrapolated parameters
+  float r[7]    = {0.};    // extrapolated parameters
   float R[7][7] = {{0.}};  // Jacobian of the extrapolation
 
-  float stepW[5];// = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.};
+  float stepW[5];  // = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.};
   stepW[0] = 0.;
   stepW[1] = h / 6.f;
   stepW[2] = h / 3.f;
   stepW[3] = h / 3.f;
   stepW[4] = h / 6.f;
-//  const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f};
-//  const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f};	//TODO: test constants: leads to a drop in efficiency
+  //  const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f};
+  //  const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f};	//TODO: test constants: leads to a drop in efficiency
 
   float k[5][7][7] = {{0.}};
   for (int step = 1; step <= 4; ++step) {
@@ -1044,7 +1092,7 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
   tr_par->Vi()   = r[6];
 
   //fTr.Vi()( fTr.Vi() < fvec(TrackParamV::kClightNsInv) ) = fvec(TrackParamV::kClightNsInv);
-  tr_par->Z() = z_out;//zMasked;
+  tr_par->Z() = z_out;  //zMasked;
 
   //          covariance matrix transport
   float C[7][7];
@@ -1075,7 +1123,7 @@ XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_
 
 XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const
 {
-    // extrapolate the track assuming no field
+  // extrapolate the track assuming no field
 
   const float dz = zOut - tr_par->GetZ();
 
@@ -1089,14 +1137,14 @@ XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<floa
   const float j53 = dz * ty * vi / L;
   const float j56 = dz * L;
 
-    // transport parameters
+  // transport parameters
 
   tr_par->X() += tx * dz;
   tr_par->Y() += ty * dz;
   tr_par->Time() += L * vi * dz;
   tr_par->Z() = zOut;
 
-    // transport covariance matrix (see details in CaTrackFit)
+  // transport covariance matrix (see details in CaTrackFit)
 
   const float jc00 = tr_par->C(0, 0) + dz * tr_par->C(2, 0);
   const float jc02 = tr_par->C(0, 2) + dz * tr_par->C(2, 2);
@@ -1116,32 +1164,32 @@ XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<floa
 
   tr_par->C(0, 0) = jc00 + jc02 * dz;
   tr_par->C(1, 0) = jc10 + jc12 * dz;
-//  tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz;
-//  tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz;
-//  tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz;
+  //  tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz;
+  //  tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz;
+  //  tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz;
   tr_par->C(2, 0) += tr_par->C(2, 2) * dz;
   tr_par->C(3, 0) += tr_par->C(3, 2) * dz;
   tr_par->C(4, 0) += tr_par->C(4, 2) * dz;
   tr_par->C(5, 0) = jc50 + jc52 * dz;
-//  tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz;
+  //  tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz;
   tr_par->C(6, 0) += tr_par->C(6, 2) * dz;
 
   tr_par->C(1, 1) = jc11 + jc13 * dz;
-//  tr_par->C21() = tr_par->C21() + tr_par->C23() * dz;
-//  tr_par->C31() = tr_par->C31() + tr_par->C33() * dz;
-//  tr_par->C41() = tr_par->C41() + tr_par->C43() * dz;
+  //  tr_par->C21() = tr_par->C21() + tr_par->C23() * dz;
+  //  tr_par->C31() = tr_par->C31() + tr_par->C33() * dz;
+  //  tr_par->C41() = tr_par->C41() + tr_par->C43() * dz;
   tr_par->C(2, 1) += tr_par->C(2, 3) * dz;
   tr_par->C(3, 1) += tr_par->C(3, 3) * dz;
   tr_par->C(4, 1) += tr_par->C(4, 3) * dz;
   tr_par->C(5, 1) = jc51 + jc53 * dz;
-//  tr_par->C61() = tr_par->C61() + tr_par->C63() * dz;
+  //  tr_par->C61() = tr_par->C61() + tr_par->C63() * dz;
   tr_par->C(6, 1) += tr_par->C(6, 3) * dz;
 
   tr_par->C(5, 2) = jc52;
   tr_par->C(5, 3) = jc53;
   tr_par->C(5, 4) = jc54;
   tr_par->C(5, 5) = jc55 + jc52 * j52 + jc53 * j53 + jc56 * j56;
-//  tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56;
+  //  tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56;
   tr_par->C(6, 5) += tr_par->C(6, 2) * j52 + tr_par->C(6, 3) * j53 + tr_par->C(6, 6) * j56;
 }
 
@@ -1192,12 +1240,12 @@ XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, co
 
   // correction to HCH is needed for the case when sigma2 is so small
   // with respect to HCH that it disappears due to the roundoff error
-//    fvec wi     = fMaskF / (m.Du2() + fvec(1.0000001) * HCH);
-//    fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH);
+  //    fvec wi     = fMaskF / (m.Du2() + fvec(1.0000001) * HCH);
+  //    fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH);
 
   //TODO: (wi, zetawi): GPU results are different from CPU ones and depends on the order of the operations
   //TODO: need to compare effects made on the tracking efficiency and parameters quality
-  float wi = 1.f / (m.Du2() + 1.0000001f * HCH);
+  float wi           = 1.f / (m.Du2() + 1.0000001f * HCH);
   const float zetawi = zeta / (maskDoFilter ? m.Du2() : 0.f + HCH);
 
   wi = m.Du2() > 0.f ? wi : 0.f;
@@ -1220,8 +1268,8 @@ XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, co
   tr_par->Time() -= F5 * zetawi;
   tr_par->Vi() -= F6 * zetawi;
 
-  tr_par->C(0 ,0) -= F0 * F0 * wi;
-  tr_par->C(1 ,0) -= K1 * F0;
+  tr_par->C(0, 0) -= F0 * F0 * wi;
+  tr_par->C(1, 0) -= K1 * F0;
   tr_par->C(1, 1) -= K1 * F1;
 
   tr_par->C(2, 0) -= K2 * F0;
@@ -1256,7 +1304,8 @@ XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, co
 }
 
 //TODO: update this function
-XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, float timeInfo) const
+XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2,
+                                             float timeInfo) const
 {
   // filter track with a time measurement
 
@@ -1269,25 +1318,25 @@ XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par,
   const float F5 = tr_par->C(5, 5);
   const float F6 = tr_par->C(6, 5);
 
-  const float HCH = F5;//tr_par->C55();
+  const float HCH = F5;  //tr_par->C55();
 
-    // when dt0 is much smaller than current time error,
-    // set track time exactly to the measurement value without filtering
-    // it helps to keep the initial time errors reasonably small
-    // the calculations in the covariance matrix are not affected
+  // when dt0 is much smaller than current time error,
+  // set track time exactly to the measurement value without filtering
+  // it helps to keep the initial time errors reasonably small
+  // the calculations in the covariance matrix are not affected
 
   const bool maskDoFilter = (HCH < dt2 * 16.f) && timeInfo;
 
   float w = 1.f;
 
-  if( timeInfo <= 0.f ) w = 0.f;
+  if (timeInfo <= 0.f) w = 0.f;
 
-//  float wi = 1.f / (dt2 + 1.0000001f * HCH);
+  //  float wi = 1.f / (dt2 + 1.0000001f * HCH);
   float hch_d = 1.0000001f * HCH;
-  float den = dt2 + hch_d;
-  float wi = w / den;
+  float den   = dt2 + hch_d;
+  float wi    = w / den;
 
-  const float zeta = tr_par->Time() - t;
+  const float zeta   = tr_par->Time() - t;
   const float zetawi = w * zeta / (maskDoFilter ? dt2 : 0.f + HCH);
 
   tr_par->ChiSqTime() += maskDoFilter ? (zeta * zeta * wi) : 0.f;
@@ -1344,57 +1393,58 @@ XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par,
   tr_par->C(6, 6) -= K6 * F6;
 }
 
-XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick, float upstreamDirection) const
+XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick,
+                                                       float upstreamDirection) const
 {
   const float qp02 = xpu::max(tr_par->Qp() * tr_par->Qp(), 0.01f);
-  const float p2 = 1.f / qp02;
-//  const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass;	//TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result
-  const float mass2 = fParams_const[0].particleMass * fParams_const[0].particleMass;
+  const float p2   = 1.f / qp02;
+  //  const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass;	//TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result
+  const float mass2   = fParams_const[0].particleMass * fParams_const[0].particleMass;
   const float r_mass2 = 1.f / mass2;
-  const float E2 = mass2 + p2;
+  const float E2      = mass2 + p2;
 
   const float bg2 = p2 * r_mass2;
-    //ApproximateBetheBloch
+  //ApproximateBetheBloch
   const float kp0 = 2.33f;
   const float kp1 = 0.20f;
   const float kp2 = 3.00f;
   const float kp3 = 173e-9f;
   const float kp4 = 0.49848f;
 
-  const float mK = 0.307075e-3f;
+  const float mK   = 0.307075e-3f;
   const float _2me = 1.022e-3f;
-  const float rho = kp0;
-  const float x0 = kp1 * 2.303f;
-  const float x1 = kp2 * 2.303f;
-  const float mI = kp3;
-  const float mZA = kp4;
+  const float rho  = kp0;
+  const float x0   = kp1 * 2.303f;
+  const float x1   = kp2 * 2.303f;
+  const float mI   = kp3;
+  const float mZA  = kp4;
   const float maxT = _2me * bg2;
 
-      //*** Density effect
-  float d2 = 0.f;
-  const float x = 0.5f * xpu::log(bg2);
+  //*** Density effect
+  float d2         = 0.f;
+  const float x    = 0.5f * xpu::log(bg2);
   const float lhwI = xpu::log(28.816f * 1e-9f * xpu::sqrt(rho * mZA) / mI);
 
-  if (x > x1){
+  if (x > x1) {
     d2 = lhwI + x - 0.5f;
   }
-  if ((x > x0) && (x1 > x)){
+  if ((x > x0) && (x1 > x)) {
     const float r = (x1 - x) / (x1 - x0);
-    d2 = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r;
+    d2            = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r;
   }
 
-  const float bethe = mK * mZA * (1.f + bg2) / bg2
-	  * (0.5f * xpu::log (_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2);
+  const float bethe =
+    mK * mZA * (1.f + bg2) / bg2 * (0.5f * xpu::log(_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2);
 
   const float tr = xpu::sqrt(1.f + tr_par->Tx() * tr_par->Tx() + tr_par->Ty() * tr_par->Ty());
 
   const float dE = bethe * radThick * tr * 2.33f * 9.34961f;
 
-  const float ECorrected = xpu::sqrt (E2) + upstreamDirection * dE;
+  const float ECorrected  = xpu::sqrt(E2) + upstreamDirection * dE;
   const float E2Corrected = ECorrected * ECorrected;
 
-  float corr = xpu::sqrt (p2 / (E2Corrected - mass2));
-  if (xpu::isnan (corr) || xpu::isinf (corr)){
+  float corr = xpu::sqrt(p2 / (E2Corrected - mass2));
+  if (xpu::isnan(corr) || xpu::isinf(corr)) {
     corr = 1.f;
   }
 
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.h b/algo/ca/core/experimental/CaGpuTripletConstructor.h
index 859bc796c8..1a438d4e65 100644
--- a/algo/ca/core/experimental/CaGpuTripletConstructor.h
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.h
@@ -8,41 +8,40 @@
 
 #pragma once  // include this header only once per compilation unit
 
-#include "CaMeasurementXy.h"
-#include "KfMeasurementU.h"
-#include "CaGpuParameters.h"
-#include "CaTriplet.h"
-#include "CaHit.h"
+#include "CaDeviceImage.h"
 #include "CaGpuGrid.h"
 #include "CaGpuGridArea.h"
 #include "CaGpuMaterialMap.h"
-
-#include "CaDeviceImage.h"
+#include "CaGpuParameters.h"
+#include "CaHit.h"
+#include "CaMeasurementXy.h"
+#include "CaTriplet.h"
+#include "KfMeasurementU.h"
 
 #include <xpu/device.h>
 #include <xpu/host.h>
 
 namespace cbm::algo
 {
-//  static constexpr int kMaxNofStations = 24;
+  //  static constexpr int kMaxNofStations = 24;
   // Block sizes / other compile time constants that need tuning
   enum CaGpuConstants
   {
 #if XPU_IS_CUDA
-    kSingletConstructorBlockSize         = 512,
-    kSingletConstructorItemsPerThread    = 8,
+    kSingletConstructorBlockSize      = 512,
+    kSingletConstructorItemsPerThread = 8,
 #else  // HIP, values ignored on CPU
-    kSingletConstructorBlockSize         = 64,//1024,	//128 is optimal (preliminary tests)
-    kSingletConstructorItemsPerThread    = 6,
-    kSortTripletsBlockSize               = 64,
-    kSortTripletsItemsPerThread 	 = 6,
+    kSingletConstructorBlockSize      = 64,  //1024,	//128 is optimal (preliminary tests)
+    kSingletConstructorItemsPerThread = 6,
+    kSortTripletsBlockSize            = 64,
+    kSortTripletsItemsPerThread       = 6,
 #endif
   };
 }  // namespace cbm::algo
 
 namespace cbm::algo::ca
 {
-//  class Framework;
+  //  class Framework;
   class GpuTripletConstructor;
 
   struct TrackingParams {
@@ -55,128 +54,122 @@ namespace cbm::algo::ca
   struct strGpuTripletConstructor : xpu::constant<GPUReco, GpuTripletConstructor> {
   };
 
-  struct hit_doublets
-  {
+  struct hit_doublets {
     ca::HitIndex_t hit1;
     ca::HitIndex_t hit2;
   };
 
-  struct hit_triplets
-  {
+  struct hit_triplets {
     ca::HitIndex_t hit1;
     ca::HitIndex_t hit2;
     ca::HitIndex_t hit3;
   };
 
-  struct sort_triplets
-  {
+  struct sort_triplets {
     unsigned int tr_id;
-//    unsigned int lsta;
+    //    unsigned int lsta;
     unsigned int lhit;
   };
 
-  struct shared_counters
-  {
+  struct shared_counters {
     int counter;
   };
 
-  struct shared_data
-  {
-//    ca::GpuStation fStations_shared[kMaxNofStations];
+  struct shared_data {
+    //    ca::GpuStation fStations_shared[kMaxNofStations];
     ca::GpuFieldRegion fld0_shared[kSingletConstructorBlockSize];
     ca::GpuFieldRegion fld1_shared[kSingletConstructorBlockSize];
   };
 
-  struct shared_data_fit_triplets
-  {
-//    ca::GpuStation fStations_shared[kMaxNofStations];
+  struct shared_data_fit_triplets {
+    //    ca::GpuStation fStations_shared[kMaxNofStations];
     kf::TrackParamBase<float> fit_params_shared[kSingletConstructorBlockSize];
   };
 
-//  struct TestFunc : xpu::kernel<GPUReco> {
-//    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-//    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
-//    using context       = xpu::kernel_context<shared_memory, constants>;
-//    XPU_D void operator()(context&);
-//  };
+  //  struct TestFunc : xpu::kernel<GPUReco> {
+  //    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+  //    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
+  //    using context       = xpu::kernel_context<shared_memory, constants>;
+  //    XPU_D void operator()(context&);
+  //  };
 
   // Declare Kernels
   struct MakeSinglets : xpu::kernel<GPUReco> {
     using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-    using shared_memory     = shared_data;
+    using shared_memory = shared_data;
     using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
     using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct MakeDoublets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-//    using shared_memory = shared_counters;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
-    using context       = xpu::kernel_context<shared_memory, constants>;
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_counters;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct CompressDoublets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
-    using context       = xpu::kernel_context<shared_memory, constants>;
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct FitDoublets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
-    using context       = xpu::kernel_context<shared_memory, constants>;
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct MakeTriplets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-//    using shared_memory = shared_counters;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
-    using context       = xpu::kernel_context<shared_memory, constants>;
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_counters;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct CompressTriplets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
-    using context       = xpu::kernel_context<shared_memory, constants>;
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct FitTriplets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
-//    using shared_memory = shared_data_fit_triplets;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
-    using context       = xpu::kernel_context<shared_memory, constants>;
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_data_fit_triplets;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct SortTriplets : xpu::kernel<GPUReco> {
-    using block_size    = xpu::block_size<kSortTripletsBlockSize>;
-    using sort_t        = xpu::block_sort<unsigned int, sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>;
+    using block_size = xpu::block_size<kSortTripletsBlockSize>;
+    using sort_t = xpu::block_sort<unsigned int, sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>;
     using shared_memory = sort_t::storage_t;
-    using constants     = xpu::cmem<strGpuTripletConstructor>;//, Params>;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
     using context       = xpu::kernel_context<shared_memory, constants>;
     XPU_D void operator()(context&);
   };
 
   struct IterationData {
-    int fNHits;  		///< Number of hits in grid
-    int fIteration;  		///< Iteration number
-    int fNDoublets;  		///< Number of doublets
-    int fNDoublets_counter;  	///< Number of doublets counter
-    int fNTriplets;  		///< Number of triplets
-    int fNTriplets_counter;  	///< Number of triplets counter
+    int fNHits;              ///< Number of hits in grid
+    int fIteration;          ///< Iteration number
+    int fNDoublets;          ///< Number of doublets
+    int fNDoublets_counter;  ///< Number of doublets counter
+    int fNTriplets;          ///< Number of triplets
+    int fNTriplets_counter;  ///< Number of triplets counter
   };
 
   class GpuTripletConstructor {
    public:
     ///                             ------  FUNCTIONAL PART ------
 
-//    XPU_D void TestFunc(TestFunc::context&) const;
+    //    XPU_D void TestFunc(TestFunc::context&) const;
 
     XPU_D void MakeSinglets(MakeSinglets::context&) const;
 
@@ -195,24 +188,25 @@ namespace cbm::algo::ca
     XPU_D void SortTripletsFunc(SortTriplets::context&) const;
 
    private:
-
     XPU_D void CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const;
 
-    XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const;	//TODO: merge with CollectHits
+    XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits,
+                                   int* collectedHits) const;  //TODO: merge with CollectHits
 
 
     ///                             ------  FITTING FINCTIONALITY ------ //TODO: check and switch to the standard CaTrackFit if possible
     XPU_D void FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const;
 
     /// extrapolate track as a line, return the extrapolated X, Y and the Jacobians
-    XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,
-	float *Jx, float *Jy) const;
+    XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F,
+                                     float* extrXY, float* Jx, float* Jy) const;
 
     XPU_D void MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const;
 
     /// extrapolate the track to the given Z using the field F
     /// it does extrapolation in one step
-    XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0, const ca::GpuFieldRegion* Field) const;
+    XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
+                               const ca::GpuFieldRegion* Field) const;
 
     /// filter the track with the XY measurement placed at different Z
     /// \param m - measurement
@@ -220,11 +214,11 @@ namespace cbm::algo::ca
     /// \param extrY - extrapolated Y of the track
     /// \param Jx - Jacobian of the extrapolated X
     /// \param Jy - Jacobian of the extrapolated Y
-    XPU_D void FilterExtrapolatedXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,// fvec extrY,
-//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
-//			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const;
-			      float* Jx,
-			      float* Jy) const;
+    XPU_D void FilterExtrapolatedXY(
+      kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
+      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const;
+      float* Jx, float* Jy) const;
 
     XPU_D void ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const;
 
@@ -238,21 +232,20 @@ namespace cbm::algo::ca
 
     ///                          ------  DATA MEMBERS ------
    public:
-
-    xpu::buffer<ca::GpuGrid> fvGpuGrid;	///< Grid
+    xpu::buffer<ca::GpuGrid> fvGpuGrid;  ///< Grid
 
     xpu::buffer<unsigned int> fgridFirstBinEntryIndex;  ///< Index of the first entry in the grid
 
     xpu::buffer<unsigned int> fgridEntries;  ///< Entries in the grid	//TODO: check if it should be ca::HitIndex_t
 
     ///Material map
-    xpu::buffer<ca::GpuMaterialMap> fMaterialMap;	///< Material map base objects
+    xpu::buffer<ca::GpuMaterialMap> fMaterialMap;  ///< Material map base objects
 
-    xpu::buffer<float> fMaterialMapTables;	///< Material map tables
+    xpu::buffer<float> fMaterialMapTables;  ///< Material map tables
 
     xpu::buffer<IterationData> fIterationData;  ///< Temporary data
 
-//    xpu::buffer<unsigned char> fvbHitSuppressed;  ///< Hit suppression flags
+    //    xpu::buffer<unsigned char> fvbHitSuppressed;  ///< Hit suppression flags
 
     xpu::buffer<hit_doublets> fHitDoublets;  ///< Hit doublets
 
@@ -268,17 +261,17 @@ namespace cbm::algo::ca
     /// hit.Id is replaced by the hit index in fInputData
     xpu::buffer<ca::Hit> fvHits;
 
-    xpu::buffer<kf::TrackParamBase<float>> fvTrackParams;  ///< Track parameters
+    xpu::buffer<kf::TrackParamBase<float>> fvTrackParams;          ///< Track parameters
     xpu::buffer<kf::TrackParamBase<float>> fvTrackParamsDoublets;  ///< Track parameters for doublets
-//    xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets;  ///< Track parameters for triplets
+    //    xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets;  ///< Track parameters for triplets
 
     xpu::buffer<ca::Triplet> fvTriplets;  ///< Triplets
 
     xpu::buffer<sort_triplets> fTripletSortHelper;  ///< Helper for sorting triplets
-//    xpu::buffer<sort_triplets> fTripletSortHelperTmp;
-//    xpu::buffer<sort_triplets*> dst;
+                                                    //    xpu::buffer<sort_triplets> fTripletSortHelperTmp;
+                                                    //    xpu::buffer<sort_triplets*> dst;
 
-//    GpuParameters fParams;
+    //    GpuParameters fParams;
     xpu::buffer<GpuParameters> fParams;
 
     int fIteration;
diff --git a/algo/ca/core/experimental/CaMeasurementXy.h b/algo/ca/core/experimental/CaMeasurementXy.h
index fa95a074b7..ae4f5232e4 100644
--- a/algo/ca/core/experimental/CaMeasurementXy.h
+++ b/algo/ca/core/experimental/CaMeasurementXy.h
@@ -132,8 +132,9 @@ namespace cbm::algo::ca
     /// Checks, if all fields are finite
     bool IsFinite() const
     {
-      return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2()) && kf::utils::IsFinite(Dy2())
-              && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX()) && kf::utils::IsFinite(NdfY()));
+      return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2())
+              && kf::utils::IsFinite(Dy2()) && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX())
+              && kf::utils::IsFinite(NdfY()));
     }
 
     /// Checks, if some fields are undefined
@@ -176,7 +177,6 @@ namespace cbm::algo::ca
 
     DataT fNdfX;  ///< ndf for the x coordinate measurement
     DataT fNdfY;  ///< ndf for the y coordinate measurement
-
   };
 
 
diff --git a/algo/ca/core/pars/CaDefs.h b/algo/ca/core/pars/CaDefs.h
index b30f7a10a8..89a8c19ee9 100644
--- a/algo/ca/core/pars/CaDefs.h
+++ b/algo/ca/core/pars/CaDefs.h
@@ -108,14 +108,14 @@ namespace cbm::algo::ca::constants
   /// GPU constants
   namespace gpu
   {
-    constexpr int MaxDoubletsFromHit = 150;	///< Maximum number of doublets from a hit
-    constexpr int MaxTripletsFromDoublet = 15;	///< Maximum number of triplets from a doublet
-    constexpr int MaxNofStations = 20;	///< Maximum number of stations //TODO: temporary solution
-    constexpr bool GpuTracking = false;	///< Flag: use GPU for tracking
-    constexpr bool GpuTimeMonitoring = false;	///< Flag: use GPU for time monitoring
-    constexpr bool GpuSortTriplets = false;	///< Flag: use GPU for sorting triplets
-    constexpr bool CpuSortTriplets = true;	///< Flag: use CPU for sorting triplets
-  }  // namespace gpu
+    constexpr int MaxDoubletsFromHit     = 150;    ///< Maximum number of doublets from a hit
+    constexpr int MaxTripletsFromDoublet = 15;     ///< Maximum number of triplets from a doublet
+    constexpr int MaxNofStations         = 20;     ///< Maximum number of stations //TODO: temporary solution
+    constexpr bool GpuTracking           = false;  ///< Flag: use GPU for tracking
+    constexpr bool GpuTimeMonitoring     = false;  ///< Flag: use GPU for time monitoring
+    constexpr bool GpuSortTriplets       = false;  ///< Flag: use GPU for sorting triplets
+    constexpr bool CpuSortTriplets       = true;   ///< Flag: use CPU for sorting triplets
+  }                                                // namespace gpu
 
   /// \brief Undefined values
   template<typename T1, typename T2 = T1>
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.cxx b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
index 41b9b6c4a9..e3bcc301f5 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.cxx
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
@@ -135,9 +135,9 @@ namespace cbm::algo::ca
       // - should not be initialized here
       setenv("XPU_PROFILE", "1", 1);
       xpu::settings settings;
-//      settings.device = "cpu0";
+      //      settings.device = "cpu0";
       settings.device = "hip0";
-//      settings.verbose = true;
+      //      settings.verbose = true;
       xpu::initialize(settings);
 
       // Set up environment for GPU tracking
@@ -166,7 +166,8 @@ namespace cbm::algo::ca
         ConstructTripletsGPU(wData, gpuTrackFinderSetup.value(), iter_num);
         iter_num++;
         frMonitorData.StopTimer(ETimer::ConstructTriplets);
-      } else {
+      }
+      else {
         // ----- Triplets construction -----
         frMonitorData.StartTimer(ETimer::ConstructTriplets);
         ConstructTriplets(wData);
@@ -811,7 +812,8 @@ namespace cbm::algo::ca
     }
   }
 
-  void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, int iteration)
+  void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup,
+                                               int iteration)
   {
     xpu::push_timer("SetupGridTime");
     gpuTrackFinderSetup.SetupGrid();
@@ -831,16 +833,14 @@ namespace cbm::algo::ca
       triplet_sort.emplace();
       triplet_sort.value().reserve(gpuTrackFinderSetup.GetNofTriplets());
 
-      for(unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
-        if( gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0. ) continue;
+      for (unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
+        if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0.) continue;
         const auto& triplet = gpuTrackFinderSetup.GetTriplet(itr);
         triplet_sort.value().push_back({triplet.GetLHit(), itr});
       }
-      std::sort(triplet_sort.value().begin(), triplet_sort.value().end(),
-        [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) {
-            return (a[0] < b[0]);
-        }
-      );
+      std::sort(
+        triplet_sort.value().begin(), triplet_sort.value().end(),
+        [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) { return (a[0] < b[0]); });
       xpu::timings SortTriplets = xpu::pop_timer();
 
       if constexpr (constants::gpu::GpuTimeMonitoring) {
@@ -868,31 +868,34 @@ namespace cbm::algo::ca
     if constexpr (constants::gpu::CpuSortTriplets) {
       nTriplets = triplet_sort.value().size();
     }
-    for( size_t itr = 0; itr < nTriplets; itr++ ) {
+    for (size_t itr = 0; itr < nTriplets; itr++) {
       int iSta, ihitl;
       if constexpr (constants::gpu::CpuSortTriplets) {
-        iSta = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLSta ();
-        ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLHit ();
-      } else {
-        iSta = gpuTrackFinderSetup.GetTriplet(itr).GetLSta ();
-        ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit ();
+        iSta  = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLSta();
+        ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLHit();
+      }
+      else {
+        iSta  = gpuTrackFinderSetup.GetTriplet(itr).GetLSta();
+        ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit();
       }
       const size_t oldSize = vTriplets[iSta].size();
 
       if constexpr (constants::gpu::CpuSortTriplets) {
         vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])));
-        if( ihitl == ihitl_0 ) {
+        if (ihitl == ihitl_0) {
           vHitNofTriplets[ihitl]++;
-        } else {
+        }
+        else {
           vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
-          vHitNofTriplets[ihitl] = vTriplets[iSta].size() - oldSize;
+          vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
         }
         ihitl_0 = ihitl;
-      } else {
-        if( gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0. )
+      }
+      else {
+        if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0.)
           vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(itr));
         vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
-        vHitNofTriplets[ihitl] = vTriplets[iSta].size() - oldSize;
+        vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
       }
     }
   }
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.h b/algo/ca/core/tracking/CaTrackFinderWindow.h
index 2c56150838..691ac0da86 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.h
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.h
@@ -11,6 +11,7 @@
 
 #include "CaBranch.h"
 #include "CaCloneMerger.h"
+#include "CaGpuTrackFinderSetup.h"
 #include "CaGrid.h"
 #include "CaHit.h"
 #include "CaSimd.h"
@@ -19,7 +20,6 @@
 #include "CaTrackingMonitor.h"
 #include "CaVector.h"
 #include "CaWindowData.h"
-#include "CaGpuTrackFinderSetup.h"
 
 namespace cbm::algo::ca
 {
-- 
GitLab


From 4e0cee044e1c877d54f3ab11bd313e6df027750c Mon Sep 17 00:00:00 2001
From: "g.kozlov" <grxxl@yandex.ru>
Date: Fri, 7 Mar 2025 16:03:25 +0100
Subject: [PATCH 4/6] fix CMakeList

---
 algo/ca/core/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/algo/ca/core/CMakeLists.txt b/algo/ca/core/CMakeLists.txt
index f0d11aeb79..14c0a72788 100644
--- a/algo/ca/core/CMakeLists.txt
+++ b/algo/ca/core/CMakeLists.txt
@@ -151,10 +151,10 @@ install(
     tracking/CaTrackFinderWindow.h
     tracking/CaTrackFitter.h
     tracking/CaTripletConstructor.h
-    
+
     experimental/CaDeviceImage.h
     experimental/CaGpuGrid.h
-    experimental/CaGpuHitArea.h
+    experimental/CaGpuGridArea.h
     experimental/CaGpuParameters.h
     experimental/CaGpuStation.h
     experimental/CaGpuMaterialMap.h
-- 
GitLab


From 5ecd38c54aaae21a1a82d15acd23dd2e1458986c Mon Sep 17 00:00:00 2001
From: "g.kozlov" <grxxl@yandex.ru>
Date: Mon, 10 Mar 2025 12:01:10 +0100
Subject: [PATCH 5/6] fix CMakeList

---
 algo/ca/core/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/algo/ca/core/CMakeLists.txt b/algo/ca/core/CMakeLists.txt
index 14c0a72788..c9ea28bd0f 100644
--- a/algo/ca/core/CMakeLists.txt
+++ b/algo/ca/core/CMakeLists.txt
@@ -162,7 +162,7 @@ install(
     experimental/CaMeasurementXy.h
     experimental/CaGpuTrackFinderSetup.h
     experimental/CaGpuTripletConstructor.h
-    experimentsl/CaDeviceImage.h
+    experimental/CaDeviceImage.h
 
     DESTINATION
     include/
-- 
GitLab


From ae5d50e11939de4e440dc4b0c9b66565734ba8c2 Mon Sep 17 00:00:00 2001
From: "g.kozlov" <grxxl@yandex.ru>
Date: Mon, 10 Mar 2025 14:35:39 +0100
Subject: [PATCH 6/6] add CaGpuTimeMonitor into CMakeList

---
 algo/ca/core/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/algo/ca/core/CMakeLists.txt b/algo/ca/core/CMakeLists.txt
index c9ea28bd0f..11376b0ddc 100644
--- a/algo/ca/core/CMakeLists.txt
+++ b/algo/ca/core/CMakeLists.txt
@@ -163,6 +163,7 @@ install(
     experimental/CaGpuTrackFinderSetup.h
     experimental/CaGpuTripletConstructor.h
     experimental/CaDeviceImage.h
+    experimental/CaGpuTimeMonitor.h
 
     DESTINATION
     include/
-- 
GitLab