diff --git a/algo/ca/core/CMakeLists.txt b/algo/ca/core/CMakeLists.txt
index 2be924997474f03ae9311ec060bd857d21cf725a..11376b0ddcb554d29b2914066be711f34c48dae0 100644
--- a/algo/ca/core/CMakeLists.txt
+++ b/algo/ca/core/CMakeLists.txt
@@ -5,6 +5,7 @@ set(INCLUDE_DIRECTORIES
   ${CMAKE_CURRENT_SOURCE_DIR}/qa
   ${CMAKE_CURRENT_SOURCE_DIR}/data
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental
 )
 
 set(SRCS
@@ -31,6 +32,10 @@ set(SRCS
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTrackFinderWindow.cxx
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTrackFitter.cxx
   ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTripletConstructor.cxx
+
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaGpuTrackFinderSetup.cxx
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaGpuTripletConstructor.cxx
+  ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaDeviceImage.cxx
 )
 
 SET_SOURCE_FILES_PROPERTIES(${SRCS} PROPERTIES COMPILE_FLAGS "-O3")
@@ -52,6 +57,12 @@ target_include_directories(CaCore
          ${CMAKE_CURRENT_SOURCE_DIR}/pars
          ${CMAKE_CURRENT_SOURCE_DIR}/qa
          ${CMAKE_CURRENT_SOURCE_DIR}/tracking
+         ${CMAKE_CURRENT_SOURCE_DIR}/experimental
+)
+
+set(DEVICE_SRCS
+  experimental/CaDeviceImage.cxx
+  experimental/CaGpuTripletConstructor.cxx
 )
 
 target_compile_definitions(CaCore PUBLIC NO_ROOT)
@@ -62,7 +73,9 @@ target_link_libraries(CaCore
                       external::fles_logging # needed for the logger
                       external::fles_ipc     # needed for the logger
                       external::yaml-cpp
+                      xpu
                      )
+xpu_attach(CaCore ${DEVICE_SRCS})
 
 ##### Offline version without the NO_ROOT in order to get standard logger! #############################################
 if (NOT CBM_ONLINE_STANDALONE)
@@ -74,6 +87,7 @@ if (NOT CBM_ONLINE_STANDALONE)
            ${CMAKE_CURRENT_SOURCE_DIR}/pars
            ${CMAKE_CURRENT_SOURCE_DIR}/qa
            ${CMAKE_CURRENT_SOURCE_DIR}/tracking
+           ${CMAKE_CURRENT_SOURCE_DIR}/experimental
            ${CMAKE_CURRENT_SOURCE_DIR}
   )
 
@@ -81,7 +95,9 @@ if (NOT CBM_ONLINE_STANDALONE)
                  PUBLIC KfCoreOffline
                         Boost::serialization
                         external::yaml-cpp
+                        xpu
                        )
+  xpu_attach(CaCoreOffline ${DEVICE_SRCS})
   install(TARGETS CaCoreOffline DESTINATION lib)
 endif()
 ########################################################################################################################
@@ -91,6 +107,7 @@ install(DIRECTORY utils TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 install(DIRECTORY data TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 install(DIRECTORY tracking TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 install(DIRECTORY pars TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
+install(DIRECTORY experimental TYPE INCLUDE FILES_MATCHING PATTERN "*.h")
 
 install(
   FILES
@@ -135,6 +152,19 @@ install(
     tracking/CaTrackFitter.h
     tracking/CaTripletConstructor.h
 
+    experimental/CaDeviceImage.h
+    experimental/CaGpuGrid.h
+    experimental/CaGpuGridArea.h
+    experimental/CaGpuParameters.h
+    experimental/CaGpuStation.h
+    experimental/CaGpuMaterialMap.h
+    experimental/CaGpuField.h
+    experimental/CaMeasurementXy.h
+    experimental/CaGpuTrackFinderSetup.h
+    experimental/CaGpuTripletConstructor.h
+    experimental/CaDeviceImage.h
+    experimental/CaGpuTimeMonitor.h
+
     DESTINATION
     include/
 )
diff --git a/algo/ca/core/data/CaGrid.h b/algo/ca/core/data/CaGrid.h
index 8bfc127154b6bce51d8187664420e32068ece503..cc81ec8b4e4d6844c12cd37d8958768de8a04f7a 100644
--- a/algo/ca/core/data/CaGrid.h
+++ b/algo/ca/core/data/CaGrid.h
@@ -126,6 +126,15 @@ namespace cbm::algo::ca
     /// Get maximal entry range in T
     fscal GetMaxRangeT() const { return fMaxRangeT; }
 
+    /// Get bin width in X inversed
+    fscal GetBinWidthXinv() const { return fBinWidthXinv; }
+
+    /// Get bin width in Y inversed
+    fscal GetBinWidthYinv() const { return fBinWidthYinv; }
+
+    /// Get the link to the first bin entry index
+    const ca::Vector<ca::HitIndex_t>& GetFirstBinEntryIndex() const { return fFirstBinEntryIndex; }
+
    private:
     /// --- Data members ---
 
diff --git a/algo/ca/core/data/CaHit.h b/algo/ca/core/data/CaHit.h
index fb0551ee784616df3f315d783dd3fd857b05314e..ae678be940d295613001b928c056989b8bc44739 100644
--- a/algo/ca/core/data/CaHit.h
+++ b/algo/ca/core/data/CaHit.h
@@ -14,6 +14,8 @@
 
 #include <string>
 
+#include <xpu/device.h>
+
 namespace cbm::algo::ca
 {
   struct CaHitTimeInfo {
@@ -99,43 +101,43 @@ namespace cbm::algo::ca
     HitKeyIndex_t BackKey() const { return fBackKey; }
 
     /// Get the X coordinate
-    fscal X() const { return fX; }
+    XPU_D fscal X() const { return fX; }
 
     /// Get the Y coordinate
-    fscal Y() const { return fY; }
+    XPU_D fscal Y() const { return fY; }
 
     /// Get the Z coordinate
-    fscal Z() const { return fZ; }
+    XPU_D fscal Z() const { return fZ; }
 
     /// Get the time
-    fscal T() const { return fT; }
+    XPU_D fscal T() const { return fT; }
 
     /// Get the uncertainty of X coordinate
-    fscal dX2() const { return fDx2; }
+    XPU_D fscal dX2() const { return fDx2; }
 
     /// Get the uncertainty of Y coordinate
-    fscal dY2() const { return fDy2; }
+    XPU_D fscal dY2() const { return fDy2; }
 
     /// Get the X/Y covariance
-    fscal dXY() const { return fDxy; }
+    XPU_D fscal dXY() const { return fDxy; }
 
     /// Get the uncertainty of time
-    fscal dT2() const { return fDt2; }
+    XPU_D fscal dT2() const { return fDt2; }
 
     /// Get the +/- range of uncertainty of X coordinate
-    fscal RangeX() const { return fRangeX; }
+    XPU_D fscal RangeX() const { return fRangeX; }
 
     /// Get the +/- range of uncertainty of Y coordinate
-    fscal RangeY() const { return fRangeY; }
+    XPU_D fscal RangeY() const { return fRangeY; }
 
     /// Get the +/- range of uncertainty of time
-    fscal RangeT() const { return fRangeT; }
+    XPU_D fscal RangeT() const { return fRangeT; }
 
     /// Get the hit id
-    HitIndex_t Id() const { return fId; }
+    XPU_D HitIndex_t Id() const { return fId; }
 
     /// Get the station index
-    int Station() const { return fStation; }
+    XPU_D int Station() const { return fStation; }
 
     /// \brief Simple string representation of the hit class
     std::string ToString() const;
diff --git a/algo/ca/core/data/CaTriplet.h b/algo/ca/core/data/CaTriplet.h
index 4a9848ea1aaa8ff2b9d7da14b38a4c7d422dd339..ed6d3a4ff0d89171b7267c19efd1ca4e8ff3da7b 100644
--- a/algo/ca/core/data/CaTriplet.h
+++ b/algo/ca/core/data/CaTriplet.h
@@ -25,9 +25,10 @@ namespace cbm::algo::ca
     Triplet() = default;
 
     /// Constructor
-    Triplet(ca::HitIndex_t iHitL, ca::HitIndex_t iHitM, ca::HitIndex_t iHitR, unsigned int iStaL, unsigned int iStaM,
-            unsigned int iStaR, unsigned char Level, unsigned int firstNeighbour, char nNeighbours, fscal Chi2,
-            fscal Qp, fscal Cqp, fscal tx, fscal Ctx, fscal ty, fscal Cty, bool isMomentumFitted)
+    XPU_D Triplet(ca::HitIndex_t iHitL, ca::HitIndex_t iHitM, ca::HitIndex_t iHitR, unsigned int iStaL,
+                  unsigned int iStaM, unsigned int iStaR, unsigned char Level, unsigned int firstNeighbour,
+                  char nNeighbours, fscal Chi2, fscal Qp, fscal Cqp, fscal tx, fscal Ctx, fscal ty, fscal Cty,
+                  bool isMomentumFitted)
       : fChi2(Chi2)
       , fQp(Qp)
       , fCqp(Cqp)
diff --git a/algo/ca/core/experimental/CaDeviceImage.cxx b/algo/ca/core/experimental/CaDeviceImage.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..88932879aa754b841bbd97713daff47ab61f22cd
--- /dev/null
+++ b/algo/ca/core/experimental/CaDeviceImage.cxx
@@ -0,0 +1,6 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+#include "CaDeviceImage.h"
+
+XPU_IMAGE(cbm::algo::ca::GPUReco);
diff --git a/algo/ca/core/experimental/CaDeviceImage.h b/algo/ca/core/experimental/CaDeviceImage.h
new file mode 100644
index 0000000000000000000000000000000000000000..49eef6a4b7ec691af0dea1f117fbdd0ee2bd41f3
--- /dev/null
+++ b/algo/ca/core/experimental/CaDeviceImage.h
@@ -0,0 +1,14 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+#ifndef CBM_ALGO_CA_GPU_DEVICEIMAGE_H
+#define CBM_ALGO_CA_GPU_DEVICEIMAGE_H
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  struct GPUReco : xpu::device_image {
+  };
+}  // namespace cbm::algo::ca
+
+#endif
diff --git a/algo/ca/core/experimental/CaGpuField.h b/algo/ca/core/experimental/CaGpuField.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fd42b4746449753e0cc9ae2c485bef637b1ba70
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuField.h
@@ -0,0 +1,314 @@
+/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov [committer], Igor Kulakov, Maksym Zyzak, Sergei Zharko */
+
+/// \file CaGpuField.h
+/// \brief Magnetic field classes
+///
+/// This is a temporary solution made to ensure compatibility with GPU
+///
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaUtils.h"
+#include "KfField.h"
+#include "KfTrackParam.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  class GpuFieldValue {
+   public:
+    float x;  //< x-component of the field
+    float y;  //< y-component of the field
+    float z;  //< z-component of the field
+
+    GpuFieldValue() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldValue(const kf::FieldValue<DataIn>& other)
+      : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx()))
+      , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy()))
+      , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz()))
+    {
+    }
+
+    /// Combines the magnetic field with another field value object using weight
+    /// \param B  other field value to combine with
+    /// \param w  weight from 0 to 1
+
+    XPU_D void Combine(const GpuFieldValue& B, const bool w)
+    {
+      if (w) {
+        x = B.x;
+        y = B.y;
+        z = B.z;
+      }
+    }
+
+    /// Is the field value zero?
+    XPU_D bool IsZeroField() const
+    {
+      //     constexpr auto e = constants::misc::NegligibleFieldkG;
+      //     float e = 1.e-4;
+      return (x * x + y * y + z * z <= 1.e-8);
+    }
+  };
+
+  class GpuFieldSlice {
+   public:
+    /// Default constructor
+    GpuFieldSlice() = default;
+
+    static constexpr int kPolDegree{5};                                         ///< Approximation polynomial degree
+    static constexpr int kNofCoeff{((kPolDegree + 1) * (kPolDegree + 2)) / 2};  ///< Number of coefficients
+
+    /// \brief Array of the approximation coefficients [<monomial>]
+    using CoeffArray_t = std::array<float, kNofCoeff>;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldSlice(const kf::FieldSlice<DataIn>& other) : z(kf::utils::simd::Cast<DataIn, float>(other.GetZref()))
+    {
+      for (size_t i = 0; i < kMaxNFieldApproxCoefficients; i++) {
+        cx[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBx()[i]);
+        cy[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBy()[i]);
+        cz[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBz()[i]);
+      }
+    }
+
+    /// Gets field value from (x, y) fvec point
+    /// \param x  x-coordinate of input
+    /// \param y  y-coordinate of input
+    /// \return B  the FieldValue output
+    XPU_D GpuFieldValue GetFieldValue_old(float x, float y) const;
+
+    //
+    XPU_D GpuFieldValue GetFieldValue(float x, float y) const
+    {
+      using cbm::algo::kf::math::Horner;
+      GpuFieldValue B;
+      B.x = Horner<kPolDegree>(cx.cbegin(), x, y);
+      B.y = Horner<kPolDegree>(cy.cbegin(), x, y);
+      B.z = Horner<kPolDegree>(cz.cbegin(), x, y);
+      return B;
+    }
+    //
+
+    /// Gets field value for the intersection with a straight track
+    /// \param t  straight track
+    /// \return B  the FieldValue output
+    XPU_D GpuFieldValue GetFieldValueForLine(const kf::TrackParamBase<float>& t) const;
+
+   public:
+    // NOTE: We don't use an initialization of arrays here because we cannot be sure
+    //       if the underlying type (fvec) has a default constructor, but
+    //       we are sure, that it can be initialized with a float. (S.Zharko)
+    static constexpr auto kMaxNFieldApproxCoefficients = 21;  //constants::size::MaxNFieldApproxCoefficients;
+
+    CoeffArray_t cx;  //fBx;  ///< Approximation coefficients for the x-component of the field
+    CoeffArray_t cy;  //fBy;  ///< Approximation coefficients for the y-component of the field
+    CoeffArray_t cz;  //fBz;  ///< Approximation coefficients for the z-component of the field
+
+    float z;  ///< z coordinate of the slice
+  };
+
+  XPU_D inline GpuFieldValue GpuFieldSlice::GetFieldValue_old(float x, float y) const
+  {
+    float x2 = x * x;
+    float y2 = y * y;
+    float xy = x * y;
+
+    float x3  = x2 * x;
+    float y3  = y2 * y;
+    float xy2 = x * y2;
+    float x2y = x2 * y;
+
+    float x4   = x3 * x;
+    float y4   = y3 * y;
+    float xy3  = x * y3;
+    float x2y2 = x2 * y2;
+    float x3y  = x3 * y;
+
+    float x5   = x4 * x;
+    float y5   = y4 * y;
+    float xy4  = x * y4;
+    float x2y3 = x2 * y3;
+    float x3y2 = x3 * y2;
+    float x4y  = x4 * y;
+
+    GpuFieldValue B;
+
+    B.x = cx[0] + cx[1] * x + cx[2] * y + cx[3] * x2 + cx[4] * xy + cx[5] * y2 + cx[6] * x3 + cx[7] * x2y + cx[8] * xy2
+          + cx[9] * y3 + cx[10] * x4 + cx[11] * x3y + cx[12] * x2y2 + cx[13] * xy3 + cx[14] * y4 + cx[15] * x5
+          + cx[16] * x4y + cx[17] * x3y2 + cx[18] * x2y3 + cx[19] * xy4 + cx[20] * y5;
+
+    B.y = cy[0] + cy[1] * x + cy[2] * y + cy[3] * x2 + cy[4] * xy + cy[5] * y2 + cy[6] * x3 + cy[7] * x2y + cy[8] * xy2
+          + cy[9] * y3 + cy[10] * x4 + cy[11] * x3y + cy[12] * x2y2 + cy[13] * xy3 + cy[14] * y4 + cy[15] * x5
+          + cy[16] * x4y + cy[17] * x3y2 + cy[18] * x2y3 + cy[19] * xy4 + cy[20] * y5;
+
+    B.z = cz[0] + cz[1] * x + cz[2] * y + cz[3] * x2 + cz[4] * xy + cz[5] * y2 + cz[6] * x3 + cz[7] * x2y + cz[8] * xy2
+          + cz[9] * y3 + cz[10] * x4 + cz[11] * x3y + cz[12] * x2y2 + cz[13] * xy3 + cz[14] * y4 + cz[15] * x5
+          + cz[16] * x4y + cz[17] * x3y2 + cz[18] * x2y3 + cz[19] * xy4 + cz[20] * y5;
+    return B;
+  }
+
+  XPU_D inline GpuFieldValue GpuFieldSlice::GetFieldValueForLine(const kf::TrackParamBase<float>& t) const
+  {
+    float dz = z - t.GetZ();
+    return GetFieldValue(t.GetX() + t.GetTx() * dz, t.GetY() + t.GetTy() * dz);
+  }
+
+  class GpuFieldRegion {
+   public:
+    GpuFieldRegion() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuFieldRegion(const kf::FieldRegion<DataIn>& other)
+      : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0))
+      , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1))
+      , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2))
+      , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0))
+      , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1))
+      , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2))
+      , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0))
+      , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1))
+      , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2))
+      , z0(kf::utils::simd::Cast<DataIn, float>(other.z0))
+      //     , fUseOriginalField(other.fUseOriginalField)
+      , fIsZeroField(other.fIsZeroField)
+    {
+    }
+
+    /// Gets the field vector and writes it into B pointer
+    /// \param x  x-coordinate of the point to calculate the field
+    /// \param y  y-coordinate of the point to calculate the field
+    /// \param z  z-coordinate of the point to calculate the field
+    /// \return    the magnetic field value
+    XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const
+    {
+      GpuFieldValue B;
+      float dz  = z - z0;
+      float dz2 = dz * dz;
+      B.x       = cx0 + cx1 * dz + cx2 * dz2;
+      B.y       = cy0 + cy1 * dz + cy2 * dz2;
+      B.z       = cz0 + cz1 * dz + cz2 * dz2;
+      return B;
+    }
+
+    /// Is the field region empty?
+    XPU_D bool IsZeroField() const { return fIsZeroField; }
+
+    /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object
+    /// The result is a quadratic interpolation of the field as a function of z
+    /// \param b0   field value in the first nodal point
+    /// \param b0z  z-coordinate of the first nodal point
+    /// \param b1   field value in the second nodal point
+    /// \param b1z  z-coordinate of the second nodal point
+    /// \param b2   field value in the third nodal point
+    /// \param b2z  z-coordinate of the third nodal point
+    /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko)
+    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2,
+                   float b2z);
+
+    /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object.
+    /// The result is a linear interpolation of the field as a function of z
+    /// \param b0   field value in the first nodal point
+    /// \param b0z  z-coordinate of the first nodal point
+    /// \param b1   field value in the second nodal point
+    /// \param b1z  z-coordinate of the second nodal point
+    /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko)
+    XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z)
+    {
+      fIsZeroField = (b0.IsZeroField() && b1.IsZeroField());
+      z0           = b0z;
+      float dzi    = 1. / (b1z - b0z);  //TODO
+      cx0          = b0.x;
+      cy0          = b0.y;
+      cz0          = b0.z;
+      cx1          = (b1.x - b0.x) * dzi;
+      cy1          = (b1.y - b0.y) * dzi;
+      cz1          = (b1.z - b0.z) * dzi;
+      cx2          = 0.f;
+      cy2          = 0.f;
+      cz2          = 0.f;
+    }
+
+    /// Shifts the coefficients to new central point
+    /// \param z  z-coordinate of the new central point
+    XPU_D void Shift(float z)
+    {
+      float dz    = z - z0;
+      float cx2dz = cx2 * dz;
+      float cy2dz = cy2 * dz;
+      float cz2dz = cz2 * dz;
+      z0          = z;
+      cx0 += (cx1 + cx2dz) * dz;
+      cy0 += (cy1 + cy2dz) * dz;
+      cz0 += (cz1 + cz2dz) * dz;
+      cx1 += cx2dz + cx2dz;
+      cy1 += cy2dz + cy2dz;
+      cz1 += cz2dz + cz2dz;
+    }
+
+
+   public:
+    // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko)
+    // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2
+    float cx0;
+    float cx1;
+    float cx2;
+
+    // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2
+    float cy0;
+    float cy1;
+    float cy2;
+
+    // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2
+    float cz0;
+    float cz1;
+    float cz2;
+
+    float z0;  ///< z-coordinate of the field region central point
+
+    bool fIsZeroField;  //{false};  ///< Is the field region empty?
+  };
+
+  XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z,
+                                        const GpuFieldValue& b2, float b2z)
+  {
+    fIsZeroField = (b0.IsZeroField() && b1.IsZeroField() && b2.IsZeroField());
+
+    z0        = b0z;
+    float dz1 = b1z - b0z;
+    float dz2 = b2z - b0z;
+    float det = 1. / (dz1 * dz2 * (b2z - b1z));  //TODO
+
+    float w21 = -dz2 * det;
+    float w22 = dz1 * det;
+    float w11 = -dz2 * w21;
+    float w12 = -dz1 * w22;
+
+    float db1 = b1.x - b0.x;
+    float db2 = b2.x - b0.x;
+    cx0       = b0.x;
+    cx1       = db1 * w11 + db2 * w12;
+    cx2       = db1 * w21 + db2 * w22;
+
+    db1 = b1.y - b0.y;
+    db2 = b2.y - b0.y;
+    cy0 = b0.y;
+    cy1 = db1 * w11 + db2 * w12;
+    cy2 = db1 * w21 + db2 * w22;
+
+    db1 = b1.z - b0.z;
+    db2 = b2.z - b0.z;
+    cz0 = b0.z;
+    cz1 = db1 * w11 + db2 * w12;
+    cz2 = db1 * w21 + db2 * w22;
+  }
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuGrid.h b/algo/ca/core/experimental/CaGpuGrid.h
new file mode 100644
index 0000000000000000000000000000000000000000..8040148772e30db5c28838fa7d50efbab6467de0
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuGrid.h
@@ -0,0 +1,176 @@
+/* Copyright (C) 2010-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Igor Kulakov, Sergey Gorbunov, Grigory Kozlov [committer] */
+
+/// \file CaGpuGrid.h
+/// \brief A class to store hit information in a backet-sorted way on 2D grid
+///
+/// This is a temporary solution to store the grid on GPU
+/// The code has been simplified and modified relative to the CaGrid class for use on GPU
+/// The original code is based on the code of the ALICE HLT Project
+///
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGrid.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+
+  namespace
+  {
+    using namespace cbm::algo;
+  }
+
+  /// \class Grid
+  /// \brief Class for storing 2d objects in a grid
+
+  /// It creates 2-dimensional backet-sorted grid of pointers to 2-dimensional objects.
+  ///
+  /// The class provides an access to the objects in selected 2D bin without touching the rest of the data.
+  /// To loop over the objects in arbitrary XY-area one can use the ca::GridArea class.
+  ///
+  /// The class is used by CaTracker to speed-up the hit search operations
+  /// The grid axis are named X and Y
+  ///
+  class GpuGrid {
+   public:
+    /// \brief  Default constructor
+    GpuGrid() = default;
+
+    /// \brief Destructor
+    ~GpuGrid() = default;
+
+    /// \brief Constructor
+    GpuGrid(const Grid& grid, int bin_start, int entries_start)
+      : fN(grid.GetNofBins())
+      , fNx(grid.GetNofBinsX())
+      , fNy(grid.GetNofBinsY())
+      , fMinX(grid.GetMinX())
+      , fMinY(grid.GetMinY())
+      , fBinWidthX(grid.GetBinWidthX())
+      , fBinWidthY(grid.GetBinWidthY())
+      , fBinWidthXinv(grid.GetBinWidthXinv())
+      , fBinWidthYinv(grid.GetBinWidthYinv())
+      , fMaxRangeX(grid.GetMaxRangeX())
+      , fMaxRangeY(grid.GetMaxRangeY())
+      , fMaxRangeT(grid.GetMaxRangeT())
+      , fBinEntryIndex_start(bin_start)
+      , fBinEntryIndex_amount(grid.GetFirstBinEntryIndex().size())
+      , fEntries_start(entries_start)
+      , fEntries_amount(grid.GetEntries().size())
+    {
+    }
+
+    /// \brief Get bin index for (X,Y) point with boundary check
+    /// \param X - point x coordinate
+    /// \param Y - point y coordinate
+    /// \return bin index in the range [0, fN-1]
+    XPU_D int GetBin(float X, float Y) const;
+
+    /// \brief Get bin X index with boundary check
+    /// \param X - x coordinate
+    /// \return binX index
+    XPU_D int GetBinX(float X) const;
+
+    /// \brief Get bin Y index with boundary check
+    /// \param Y - y coordinate
+    /// \return binY index
+    XPU_D int GetBinY(float Y) const;
+
+    /// Get number of bins
+    XPU_D int GetNofBins() const { return fN; }
+
+    /// Get number of bins along X
+    XPU_D int GetNofBinsX() const { return fNx; }
+
+    /// Get number of bins along Y
+    XPU_D int GetNofBinsY() const { return fNy; }
+
+    /// Get minimal X value
+    XPU_D float GetMinX() const { return fMinX; }
+
+    /// Get minimal Y value
+    XPU_D float GetMinY() const { return fMinY; }
+
+    /// Get bin width in X
+    XPU_D float GetBinWidthX() const { return fBinWidthX; }
+
+    /// Get bin width in Y
+    XPU_D float GetBinWidthY() const { return fBinWidthY; }
+
+    /// Get maximal entry range in X
+    XPU_D float GetMaxRangeX() const { return fMaxRangeX; }
+
+    /// Get maximal entry range in Y
+    XPU_D float GetMaxRangeY() const { return fMaxRangeY; }
+
+    /// Get maximal entry range in T
+    XPU_D float GetMaxRangeT() const { return fMaxRangeT; }
+
+    /// Get bin width in X inversed
+    XPU_D float GetBinWidthXinv() const { return fBinWidthXinv; }
+
+    /// Get bin width in Y inversed
+    XPU_D float GetBinWidthYinv() const { return fBinWidthYinv; }
+
+    /// Get the index of the first bin entry in the index array
+    XPU_D int GetBinEntryIndexStart() const { return fBinEntryIndex_start; }
+
+    /// Get the number of entries in the index array
+    XPU_D int GetBinEntryIndexAmount() const { return fBinEntryIndex_amount; }
+
+    /// Get the index of the first entry in the entry array
+    XPU_D int GetEntriesStart() const { return fEntries_start; }
+
+    /// Get the number of entries in the entry array
+    XPU_D int GetEntriesAmount() const { return fEntries_amount; }
+
+   private:
+    /// --- Data members ---
+
+    int fN{0};   ///< total N bins
+    int fNx{0};  ///< N bins in X
+    int fNy{0};  ///< N bins in Y
+
+    float fMinX{0.};          ///< minimal X value
+    float fMinY{0.};          ///< minimal Y value
+    float fBinWidthX{0.};     ///< bin width in X
+    float fBinWidthY{0.};     ///< bin width in Y
+    float fBinWidthXinv{0.};  ///< inverse bin width in X
+    float fBinWidthYinv{0.};  ///< inverse bin width in Y
+
+    float fMaxRangeX{0.};  ///< maximal entry range in X
+    float fMaxRangeY{0.};  ///< maximal entry range in Y
+    float fMaxRangeT{0.};  ///< maximal entry range in T
+
+    int fBinEntryIndex_start{0};   ///< index of the first bin entry in the index array
+    int fBinEntryIndex_amount{0};  ///< number of entries in the index array
+
+    int fEntries_start{0};   ///< index of the first entry in the entry array
+    int fEntries_amount{0};  ///< number of entries in the entry array
+  };
+
+  /// --- Inline methods ---
+
+  XPU_D inline int GpuGrid::GetBin(float X, float Y) const
+  {
+    //
+    return GetBinY(Y) * fNx + GetBinX(X);
+  }
+
+  XPU_D inline int GpuGrid::GetBinX(float X) const
+  {
+    int binX = static_cast<int>((X - fMinX) * fBinWidthXinv);
+    return xpu::max(0, xpu::min(fNx - 1, binX));
+  }
+
+  XPU_D inline int GpuGrid::GetBinY(float Y) const
+  {
+    int binY = static_cast<int>((Y - fMinY) * fBinWidthYinv);
+    return xpu::max(0, xpu::min(fNy - 1, binY));
+  }
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuGridArea.h b/algo/ca/core/experimental/CaGpuGridArea.h
new file mode 100644
index 0000000000000000000000000000000000000000..265b24e073b8108b66d663c3a7116e19494153c3
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuGridArea.h
@@ -0,0 +1,115 @@
+/* Copyright (C) 2012-2020 Frankfurt Institute for Advanced Studies, Goethe-Universität Frankfurt, Frankfurt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Maksym Zyzak, Igor Kulakov [committer] */
+
+/// \file CaGpuGridArea.h
+///
+/// \brief A temporary implementation for CaGridArea, created for compatibility with GPU code
+///
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGpuGrid.h"
+#include "CaHit.h"
+
+namespace cbm::algo::ca
+{
+
+  namespace
+  {
+    using namespace cbm::algo;  // to keep 'ca::' prefices in the code
+  }
+
+  /// \brief Class for accessing objects in the 2D area that are stored in ca::Grid
+  ///
+  class GpuGridArea {
+   public:
+    /// \brief Constructor
+    /// \param grid - the grid to work with
+    /// \param x - X coordinate of the center of the area
+    /// \param y - Y coordinate of the center of the area
+    /// \param dx - half-width of the area in X
+    /// \param dy - half-width of the area in Y
+    //TODO: try to provide first element for the current grid instead of first element of the original arrays
+    XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x,
+                      float y, float dx, float dy)
+      : fGrid(grid)
+      , fBinEntryIndex(BinEntryIndex)
+      , fEntries(Entries)
+      , fGridNbinsX(fGrid.GetNofBinsX())
+    {
+      int binXmin = fGrid.GetBinX(x - dx);
+      int binXmax = fGrid.GetBinX(x + dx);
+      int binYmin = fGrid.GetBinY(y - dy);
+      int binYmax = fGrid.GetBinY(y + dy);
+
+      fAreaLastBinY = binYmax;
+      fAreaNbinsX   = (binXmax - binXmin + 1);  // bin index span in x direction
+
+      fAreaFirstBin = (binYmin * fGridNbinsX + binXmin);
+
+      fAreaCurrentBinY = binYmin;
+      fCurentEntry     = fBinEntryIndex[fAreaFirstBin];
+      fEntriesXend     = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX];
+    }
+
+
+    /// \brief look up the next grid entry in the requested area
+    /// \return ind - the entry index in the grid.GetEntries() vector
+    /// \return true if the entry is found, false if there are no more entries in the area
+    XPU_D bool GetNextGridEntry(unsigned int& ind);
+
+    /// \brief look up the next object id in the requested area
+    /// \return objectId - the id of the object
+    /// \return true if the object is found, false if there are no more pbjects in the area
+    XPU_D bool GetNextObjectId(unsigned int& objectId);
+
+    /// \brief debug mode: loop over the entire GetEntries() vector ignoring the search area
+    //    void DoLoopOverEntireGrid();
+
+   private:
+    const ca::GpuGrid& fGrid;
+    const unsigned int* fBinEntryIndex;
+    const unsigned int* fEntries;
+
+    int fAreaLastBinY{0};            // last Y bin of the area
+    int fAreaNbinsX{0};              // number of area bins in X
+    int fAreaFirstBin{0};            // first bin of the area (left-down corner of the area)
+    int fAreaCurrentBinY{0};         // current Y bin (incremented while iterating)
+    ca::HitIndex_t fCurentEntry{0};  // index of the current entry in fGrid.GetEntries()
+    ca::HitIndex_t fEntriesXend{0};  // end of the hit indices in current y-row
+    int fGridNbinsX{0};              // Number of grid bins in X (copy of fGrid::GetNofBinsX())
+  };
+
+  XPU_D inline bool GpuGridArea::GetNextGridEntry(unsigned int& ind)
+  {
+    bool xIndexOutOfRange = (fCurentEntry >= fEntriesXend);  // current entry is not in the area
+
+    // jump to the next y row if fCurentEntry is outside of the X area
+    while (xIndexOutOfRange) {
+      if (fAreaCurrentBinY >= fAreaLastBinY) {
+        return false;
+      }
+      fAreaCurrentBinY++;                                // get new y-line
+      fAreaFirstBin += fGridNbinsX;                      // move the left-down corner of the area to the next y-line
+      fCurentEntry     = fBinEntryIndex[fAreaFirstBin];  // get first hit in cell, if y-line is new
+      fEntriesXend     = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX];
+      xIndexOutOfRange = (fCurentEntry >= fEntriesXend);
+    }
+    ind = fCurentEntry;  // return value
+    fCurentEntry++;      // go to next
+    return true;
+  }
+
+  XPU_D inline bool GpuGridArea::GetNextObjectId(unsigned int& objectId)
+  {
+    unsigned int entryIndex = 0;
+
+    bool ok = GetNextGridEntry(entryIndex);
+    if (ok) {
+      objectId = fEntries[entryIndex];
+    }
+    return ok;
+  }
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuMaterialMap.h b/algo/ca/core/experimental/CaGpuMaterialMap.h
new file mode 100644
index 0000000000000000000000000000000000000000..d28babce0427ca7988baea0bf66b48b1e8457360
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuMaterialMap.h
@@ -0,0 +1,77 @@
+/* Copyright (C) 2007-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Igor Kulakov, Sergey Gorbunov, Andrey Lebedev, Sergei Zharko, Grigory Kozlov [committer] */
+
+/// \file GpuMaterialMap.h
+/// \brief A map of station thickness in units of radiation length (X0) to the specific point in XY plane
+///
+/// A simplified version of the KfMaterialMap class, intended for compatibility with GPU code
+
+#pragma once  // include this header only once per compilation unit
+
+#include "KfMaterialMap.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  /// \class MaterialMap
+  /// \brief A map of station thickness in units of radiation length (X0) to the specific point in XY plane
+  class GpuMaterialMap {
+   public:
+    /// \brief Default constructor
+    GpuMaterialMap() = default;
+
+    /// \brief Copy constructor
+    GpuMaterialMap(const kf::MaterialMap& other, int binStart)
+      : fNbins(other.GetNbins())
+      , fBinStart(binStart)
+      , fXYmax(other.GetXYmax())
+      , fZref(other.GetZref())
+      , fZmin(other.GetZmin())
+      , fZmax(other.GetZmax())
+    {
+      fFactor = 0.5 * fNbins / fXYmax;
+    }
+
+    /// \brief Destructor
+    ~GpuMaterialMap() noexcept = default;
+
+    /// \brief Gets number of bins (rows or columns) of the material table
+    XPU_D int GetNbins() const { return fNbins; }
+
+    /// \brief Gets radius in cm of the material table
+    XPU_D float GetXYmax() const { return fXYmax; }
+
+    /// \brief Gets reference Z of the material in cm
+    XPU_D float GetZref() const { return fZref; }
+
+    /// \brief Gets minimal Z of the collected material in cm
+    XPU_D float GetZmin() const { return fZmin; }
+
+    /// \brief Gets maximal Z of the collected material in cm
+    XPU_D float GetZmax() const { return fZmax; }
+
+    /// \brief Get bin index for (x,y). Returns -1 when outside of the map
+    XPU_D int GetBin(float x, float y) const
+    {
+      int i = static_cast<int>((x + fXYmax) * fFactor);
+      int j = static_cast<int>((y + fXYmax) * fFactor);
+      if (i < 0 || j < 0 || i >= fNbins || j >= fNbins) {
+        return -1;
+      }
+      //TODO
+      return fBinStart + i + j * fNbins;
+    }
+
+   private:
+    int fNbins;     ///< Number of rows (== N columns) in the material budget table
+    int fBinStart;  ///< Start index of the material budget table
+    float fXYmax;   ///< Size of the station in x and y dimensions [cm]
+    float fFactor;  ///< Util. var. for the conversion of point coordinates to row/column id
+    float fZref;    ///< Reference Z of the collected material [cm]
+    float fZmin;    ///< Minimal Z of the collected material [cm]
+    float fZmax;    ///< Minimal Z of the collected material [cm]
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuParameters.h b/algo/ca/core/experimental/CaGpuParameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..f558dd575b4395dd71533335e9f4c7eecef33827
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuParameters.h
@@ -0,0 +1,87 @@
+/* Copyright (C) 2021-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuParameters.h
+/// \brief Parameter container for the GPU CA library
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGpuField.h"
+#include "CaGpuMaterialMap.h"
+#include "CaGpuStation.h"
+#include "CaMeasurementXy.h"
+#include "CaParameters.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+
+  class GpuParameters {
+   public:
+    /// \brief Default constructor
+    GpuParameters() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuParameters(const Parameters<DataIn>& other)
+      : maxSlopePV(0.1)
+      , maxQp(0.1)
+      , maxDZ(0.1)
+      , particleMass(0.13957)
+      , doubletChi2Cut(5)
+      , tripletChi2Cut(5)
+      , tripletFinalChi2Cut(5)
+      , fNStations(0)
+      , primaryFlag(true)
+      , isTargetField(false)
+    {
+      fTargetPos[0] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionX());
+      fTargetPos[1] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionY());
+      fTargetPos[2] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionZ());
+      for (size_t i = 0; i < (size_t) other.GetNstationsActive(); i++) {
+        fStations[i] = other.GetStation(i);
+      }
+    }
+
+    /// \brief Destructor
+    ~GpuParameters() = default;
+
+   public:
+    ca::GpuFieldValue targB;                     ///< Magnetic field in the target region
+    ca::MeasurementXy<float> targetMeasurement;  ///< Measurement XY in the target region
+
+    /// \brief Gets Station
+    XPU_D const ca::GpuStation& GetStation(int iStation) const { return fStations[iStation]; }
+
+    /// \brief Sets Station
+    void SetStation(int iStation, const ca::GpuStation& station) { fStations[iStation] = station; }
+
+    /// \brief Gets X component of target position
+    XPU_D float GetTargetPositionX() const { return fTargetPos[0]; }
+
+    /// \brief Gets Y component of target position
+    XPU_D float GetTargetPositionY() const { return fTargetPos[1]; }
+
+    /// \brief Gets Z component of target position
+    XPU_D float GetTargetPositionZ() const { return fTargetPos[2]; }
+
+    std::array<ca::GpuStation, constants::gpu::MaxNofStations> fStations;  ///< Array of stations
+
+    std::array<float, 3> fTargetPos;  ///< Target position
+
+    float maxSlopePV;           ///< Max slope to primary vertex
+    float maxQp;                ///< Max Q/p
+    float maxDZ;                ///< Max DZ
+    float particleMass;         ///< Particle mass
+    float doubletChi2Cut;       ///< Doublet chi2 cut
+    float tripletChi2Cut;       ///< Triplet chi2 cut
+    float tripletFinalChi2Cut;  ///< Triplet final chi2 cut
+
+    size_t fNStations;  ///< Number of stations
+
+    bool primaryFlag;    ///< Primary flag
+    bool isTargetField;  ///< Is target field
+  };
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuStation.h b/algo/ca/core/experimental/CaGpuStation.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ee6d43086a9f48823819493f52d9000c63142d2
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuStation.h
@@ -0,0 +1,74 @@
+/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov, Igor Kulakov, Sergei Zharko, Grigory Kozlov [committer] */
+
+/// \file GpuStation.h
+///
+/// \brief Contains a set of geometry parameters for a particular station
+///
+/// A temporary simplified version of the CaStation class, intended for compatibility with GPU code
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaGpuField.h"
+#include "CaStation.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+  /// Structure Station
+  /// Contains a set of geometry parameters for a particular station
+  ///
+  class GpuStation {
+   public:
+    int type;         //  TODO: replace with L1DetectorID
+    int timeInfo;     ///< flag: if time information can be used
+    int fieldStatus;  ///< flag: 1 - station is INSIDE the field, 0 - station is OUTSIDE the field
+    float fZ;         ///< z position of station     [cm]
+    float Xmax;       ///< min radius of the station [cm]
+    float Ymax;       ///< max radius of the station [cm]
+
+    GpuFieldSlice fieldSlice;  ///< Magnetic field near the station
+
+    GpuStation() = default;
+
+    /// \brief Copy constructor with type conversion
+    template<typename DataIn>
+    GpuStation(const Station<DataIn>& other)
+      : type(other.GetType())
+      , timeInfo(other.GetTimeStatus())
+      , fieldStatus(other.GetFieldStatus())
+      , fZ(other.template GetZ<float>())
+      , Xmax(other.template GetXmax<float>())
+      , Ymax(other.template GetYmax<float>())
+      , fieldSlice(other.fieldSlice)
+    {
+    }
+
+    /// \brief Gets type of the station
+    int GetType() const { return type; }
+
+    /// \brief Gets time-measurement flag
+    int GetTimeStatus() const { return timeInfo; }
+
+    /// \brief Gets field status flag
+    int GetFieldStatus() const { return fieldStatus; }
+
+    /// \brief Gets z-position of the station
+    XPU_D float GetZ() const { return fZ; }
+
+    /// \brief Gets limit of the station size in x-axis direction
+    XPU_D float GetXmax() const { return Xmax; }
+
+    /// \brief Gets limit of the station size in x-axis direction
+    XPU_D float GetXmin() const { return -Xmax; }
+
+    /// \brief Gets limit of the station size in y-axis direction
+    XPU_D float GetYmax() const { return Ymax; }
+
+    /// \brief Gets limit of the station size in y-axis direction
+    XPU_D float GetYmin() const { return -Ymax; }
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTimeMonitor.h b/algo/ca/core/experimental/CaGpuTimeMonitor.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8537dd3ab7318c1d63be047f1797f5486f4d51b
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTimeMonitor.h
@@ -0,0 +1,105 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file GpuTimeMonitor.h
+///
+/// \brief Time monitor for GPU kernels within the CA algorithm
+
+#pragma once  // include this header only once per compilation unit
+
+#include <xpu/host.h>
+
+namespace cbm::algo::ca
+{
+
+  namespace
+  {
+    using namespace cbm::algo;
+  }
+
+  struct XpuTimings {
+    xpu::timings PrepareData_time[4];
+    xpu::timings MakeSinglets_time[4];
+    xpu::timings MakeDoublets_time[4];
+    xpu::timings CompressDoublets_time[4];
+    xpu::timings ResetDoublets_time[4];
+    xpu::timings FitDoublets_time[4];
+    xpu::timings MakeTriplets_time[4];
+    xpu::timings CompressTriplets_time[4];
+    xpu::timings ResetTriplets_time[4];
+    xpu::timings FitTriplets_time[4];
+    xpu::timings SortTriplets_time[4];
+    xpu::timings Total_time[4];
+    int nIterations;
+
+    void PrintTimings(int iteration) const
+    {
+      if (iteration < 0 || iteration >= 4) {
+        std::cerr << "Error: wrong iteration number: " << iteration << std::endl;
+        return;
+      }
+
+      auto print_timing = [](const std::string& name, const xpu::timings& t) {
+        std::cout << std::left << std::setw(25) << name << "Kernel time: " << std::setw(10) << t.kernel_time()
+                  << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d) << " D2H copy: " << std::setw(10)
+                  << t.copy(xpu::d2h) << " Memset: " << std::setw(10) << t.memset() << " Total: " << std::setw(10)
+                  << t.wall() << std::endl;
+      };
+
+      print_timing("PrepareData_time", PrepareData_time[iteration]);
+      print_timing("MakeSinglets_time", MakeSinglets_time[iteration]);
+      print_timing("MakeDoublets_time", MakeDoublets_time[iteration]);
+      print_timing("CompressDoublets_time", CompressDoublets_time[iteration]);
+      print_timing("ResetDoublets_time", ResetDoublets_time[iteration]);
+      print_timing("FitDoublets_time", FitDoublets_time[iteration]);
+      print_timing("MakeTriplets_time", MakeTriplets_time[iteration]);
+      print_timing("CompressTriplets_time", CompressTriplets_time[iteration]);
+      print_timing("ResetTriplets_time", ResetTriplets_time[iteration]);
+      print_timing("FitTriplets_time", FitTriplets_time[iteration]);
+      print_timing("SortTriplets_time", SortTriplets_time[iteration]);
+      print_timing("Total_time", Total_time[iteration]);
+    }
+  };
+
+  ///
+  class GpuTimeMonitor {
+   public:
+    /// \brief  Default constructor
+    GpuTimeMonitor() : nEvents(0) {}
+
+    /// \brief Destructor
+    ~GpuTimeMonitor() = default;
+
+    void AddEventInfo(XpuTimings& event)
+    {
+      xpuTimings.push_back(event);
+      nEvents++;
+    }
+
+    void SetNEvents(int nEv) { this->nEvents = nEv; }
+
+    int GetNEvents() { return nEvents; }
+
+    xpu::timings GetMakeSingletsTime(int ev, int it) { return xpuTimings[ev].MakeSinglets_time[it]; }
+
+    xpu::timings GetMakeDoubletsTime(int ev, int it) { return xpuTimings[ev].MakeDoublets_time[it]; }
+
+    xpu::timings GetCompressDoubletsTime(int ev, int it) { return xpuTimings[ev].CompressDoublets_time[it]; }
+
+    xpu::timings GetFitDoubletsTime(int ev, int it) { return xpuTimings[ev].FitDoublets_time[it]; }
+
+    xpu::timings GetMakeTripletsTime(int ev, int it) { return xpuTimings[ev].MakeTriplets_time[it]; }
+
+    xpu::timings GetCompressTripletsTime(int ev, int it) { return xpuTimings[ev].CompressTriplets_time[it]; }
+
+    xpu::timings GetFitTripletsTime(int ev, int it) { return xpuTimings[ev].FitTriplets_time[it]; }
+
+    xpu::timings GetTotalTime(int ev, int it) { return xpuTimings[ev].Total_time[it]; }
+
+   private:
+    std::vector<XpuTimings> xpuTimings;
+    int nEvents;
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..5d40dd22191dfc65b72c905581d9b6b8c2eb8c84
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx
@@ -0,0 +1,395 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTrackFinderSetup.cxx
+/// \brief The class is responsible for setting up the environment and the order in which kernels are launched to perform tracking using XPU
+
+#include "CaGpuTrackFinderSetup.h"
+
+using namespace cbm::algo::ca;
+
+GpuTrackFinderSetup::GpuTrackFinderSetup(WindowData& wData, const ca::Parameters<fvec>& pars)
+  : fParameters(pars)
+  , frWData(wData)
+  , fIteration(0)
+{
+}
+
+void GpuTrackFinderSetup::SetupParameters()
+{
+  int nStations   = fParameters.GetNstationsActive();
+  int nIterations = fParameters.GetCAIterations().size();
+  fTripletConstructor.fParams.reset(nIterations, xpu::buf_io);
+  xpu::h_view vfParams{fTripletConstructor.fParams};
+
+  for (int i = 0; i < nIterations; i++) {
+    GpuParameters gpuParams              = fParameters;
+    fTripletConstructor.fParams_const[i] = fParameters;
+    const auto& caIteration              = fParameters.GetCAIterations()[i];
+
+    gpuParams.fNStations                            = nStations;
+    gpuParams.maxSlopePV                            = caIteration.GetMaxSlopePV();
+    gpuParams.maxQp                                 = caIteration.GetMaxQp();
+    gpuParams.maxDZ                                 = caIteration.GetMaxDZ();
+    fTripletConstructor.fParams_const[i].fNStations = nStations;
+    fTripletConstructor.fParams_const[i].maxSlopePV = caIteration.GetMaxSlopePV();
+    fTripletConstructor.fParams_const[i].maxQp      = caIteration.GetMaxQp();
+    fTripletConstructor.fParams_const[i].maxDZ      = caIteration.GetMaxDZ();
+
+    if (caIteration.GetElectronFlag()) {
+      gpuParams.particleMass                            = constants::phys::ElectronMass;
+      fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass;
+    }
+    else {
+      gpuParams.particleMass                            = constants::phys::MuonMass;
+      fTripletConstructor.fParams_const[i].particleMass = constants::phys::MuonMass;
+    }
+
+    gpuParams.primaryFlag                            = caIteration.GetPrimaryFlag();
+    fTripletConstructor.fParams_const[i].primaryFlag = caIteration.GetPrimaryFlag();
+
+    if (caIteration.GetPrimaryFlag()) {
+      gpuParams.targB                            = fParameters.GetVertexFieldValue();
+      fTripletConstructor.fParams_const[i].targB = fParameters.GetVertexFieldValue();
+    }
+    else {
+      gpuParams.targB                            = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
+      fTripletConstructor.fParams_const[i].targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0);
+    }
+
+    gpuParams.doubletChi2Cut                                 = caIteration.GetDoubletChi2Cut();
+    gpuParams.tripletChi2Cut                                 = caIteration.GetTripletChi2Cut();
+    gpuParams.tripletFinalChi2Cut                            = caIteration.GetTripletFinalChi2Cut();
+    gpuParams.isTargetField                                  = (fabs(gpuParams.targB.y) > 0.001);
+    fTripletConstructor.fParams_const[i].doubletChi2Cut      = caIteration.GetDoubletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletChi2Cut      = caIteration.GetTripletChi2Cut();
+    fTripletConstructor.fParams_const[i].tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut();
+
+    ca::MeasurementXy<float> targMeas(fParameters.GetTargetPositionX()[0], fParameters.GetTargetPositionY()[0],
+                                      caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(),
+                                      caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(), 0, 1, 1);
+    gpuParams.targetMeasurement                            = targMeas;
+    fTripletConstructor.fParams_const[i].targetMeasurement = targMeas;
+
+    vfParams[i] = gpuParams;
+  }
+
+  fQueue.copy(fTripletConstructor.fParams, xpu::h2d);
+
+  for (int ista = 0; ista < nStations; ista++) {
+    fTripletConstructor.fStations_const[ista] = fParameters.GetStation(ista);
+  }
+
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+}
+
+void GpuTrackFinderSetup::SetupGrid()
+{
+  unsigned int nStations     = fParameters.GetNstationsActive();
+  unsigned int bin_start     = 0;
+  unsigned int entries_start = 0;
+
+  fTripletConstructor.fvGpuGrid.reset(nStations, xpu::buf_io);
+  xpu::h_view vfvGpuGrid{fTripletConstructor.fvGpuGrid};
+
+  for (unsigned int ista = 0; ista < nStations; ista++) {
+    vfvGpuGrid[ista] = GpuGrid(frWData.Grid(ista), bin_start, entries_start);
+    bin_start += frWData.Grid(ista).GetFirstBinEntryIndex().size();
+    entries_start += frWData.Grid(ista).GetEntries().size();
+  }
+
+  fTripletConstructor.fgridFirstBinEntryIndex.reset(bin_start, xpu::buf_io);
+  fTripletConstructor.fgridEntries.reset(entries_start, xpu::buf_io);
+
+  xpu::h_view vfgridFirstBinEntryIndex{fTripletConstructor.fgridFirstBinEntryIndex};
+  xpu::h_view vfgridEntries{fTripletConstructor.fgridEntries};
+
+  bin_start = entries_start = 0;
+
+  for (unsigned int ista = 0; ista < nStations; ista++) {
+    std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(),
+                &vfgridFirstBinEntryIndex[bin_start]);
+    for (unsigned int i = 0; i < frWData.Grid(ista).GetEntries().size(); i++) {
+      vfgridEntries[entries_start + i] = frWData.Grid(ista).GetEntries()[i].GetObjectId();
+    }
+    bin_start += frWData.Grid(ista).GetFirstBinEntryIndex().size();
+    entries_start += frWData.Grid(ista).GetEntries().size();
+  }
+  fNHits = entries_start;
+
+  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::SetupMaterialMap()
+{
+  ///Set up material map
+  int bin_start  = 0;
+  int nStations  = fParameters.GetNstationsActive();
+  int fstStation = 0;
+  if (nStations == 8) fstStation = 4;
+  fTripletConstructor.fMaterialMap.reset(nStations, xpu::buf_io);
+  xpu::h_view vfMaterialMap{fTripletConstructor.fMaterialMap};
+
+  for (int ista = 0; ista < nStations; ista++) {
+    vfMaterialMap[ista] = GpuMaterialMap(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista), bin_start);
+    bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins()
+                 * fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
+  }
+
+  fTripletConstructor.fMaterialMapTables.reset(bin_start, xpu::buf_io);
+  xpu::h_view vfMaterialMapTables{fTripletConstructor.fMaterialMapTables};
+
+  bin_start = 0;
+
+  for (int ista = 0; ista < nStations; ista++) {
+    std::copy_n(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().begin(),
+                fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().size(),
+                &vfMaterialMapTables[bin_start]);
+    bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins()
+                 * fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins();
+  }
+
+  fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
+  fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::SetInputData()
+{
+  fTripletConstructor.fvHits.reset(frWData.Hits().size(), xpu::buf_io);
+  xpu::h_view vfvHits{fTripletConstructor.fvHits};
+
+  std::copy_n(frWData.Hits().begin(), frWData.Hits().size(), &vfvHits[0]);
+
+  fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::SetupIterationData(int iter)
+{
+  //  xpu::scoped_timer t_("SetupIterationData");
+  fIteration = iter;
+
+  fTripletConstructor.fIterationData.reset(1, xpu::buf_io);
+  xpu::h_view vfIterationData{fTripletConstructor.fIterationData};
+  vfIterationData[0].fNHits             = fNHits;
+  vfIterationData[0].fIteration         = iter;
+  vfIterationData[0].fNDoublets         = 0;
+  vfIterationData[0].fNDoublets_counter = 0;
+  vfIterationData[0].fNTriplets         = 0;
+  vfIterationData[0].fNTriplets_counter = 0;
+
+  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
+}
+
+void GpuTrackFinderSetup::RunGpuTrackingSetup()
+{
+  fNTriplets = 0;
+
+  xpu::device_prop prop{xpu::device::active()};
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    LOG(info) << "Running GPU tracking chain on device " << prop.name() << ", fIteration: " << fIteration;
+  }
+
+  bool isCpu  = xpu::device::active().backend() == xpu::cpu;
+  int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits) / float(kSingletConstructorBlockSize));
+  //nBlocks = 1;
+
+  int nBlocksCD =
+    nBlocks
+    * constants::gpu::
+      MaxDoubletsFromHit;  //std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize));
+
+  //***
+  fTripletConstructor.fvTrackParams.reset(frWData.Hits().size(), xpu::buf_device);
+  fTripletConstructor.fHitDoublets.reset(frWData.Hits().size() * constants::gpu::MaxDoubletsFromHit, xpu::buf_device);
+  //***
+  fTripletConstructor.fIteration = fIteration;
+
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    fEventTimeMonitor.nIterations = fIteration;
+    xpu::push_timer("Full_time");
+
+    xpu::push_timer("PrepareData_time");
+  }
+
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d);
+
+  //  fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d);
+
+  //  fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d);
+
+  fQueue.copy(fTripletConstructor.fgridFirstBinEntryIndex, xpu::h2d);
+
+  fQueue.copy(fTripletConstructor.fgridEntries, xpu::h2d);
+
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d);
+
+  //  if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d);
+
+  fQueue.memset(fTripletConstructor.fHitDoublets, 0);
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings pd_time                           = xpu::pop_timer();
+    fEventTimeMonitor.PrepareData_time[fIteration] = pd_time;
+
+    xpu::push_timer("MakeSinglets_time");
+  }
+
+  fQueue.launch<MakeSinglets>(xpu::n_blocks(nBlocks));
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings sind_time                          = xpu::pop_timer();
+    fEventTimeMonitor.MakeSinglets_time[fIteration] = sind_time;
+
+    xpu::push_timer("CompressDoublets_time");
+  }
+
+  fQueue.launch<MakeDoublets>(xpu::n_blocks(nBlocks));
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings dind_time                          = xpu::pop_timer();
+    fEventTimeMonitor.MakeDoublets_time[fIteration] = dind_time;
+
+    xpu::push_timer("ResetDoublets_time");
+  }
+
+  fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
+  xpu::h_view vfIterationData1{fTripletConstructor.fIterationData};
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    LOG(info) << "GPU tracking :: Doublets found: " << vfIterationData1[0].fNDoublets;
+  }
+
+  if (vfIterationData1[0].fNDoublets > 0) {
+    fTripletConstructor.fHitDoubletsCompressed.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
+    fTripletConstructor.fvTrackParamsDoublets.reset(vfIterationData1[0].fNDoublets, xpu::buf_device);
+    fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet,
+                                           xpu::buf_device);
+
+    fQueue.memset(fTripletConstructor.fHitTriplets, 0);
+    xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings rd_time                             = xpu::pop_timer();
+      fEventTimeMonitor.ResetDoublets_time[fIteration] = rd_time;
+
+      xpu::push_timer("CompressDoublets_time");
+    }
+
+    fQueue.launch<CompressDoublets>(xpu::n_blocks(nBlocksCD));
+
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings cd_time                                = xpu::pop_timer();
+      fEventTimeMonitor.CompressDoublets_time[fIteration] = cd_time;
+
+      xpu::push_timer("FitDoublets_time");
+    }
+
+    int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets
+                         : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize));
+
+    fQueue.launch<FitDoublets>(xpu::n_blocks(nBlocksD));
+
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings fd_time                           = xpu::pop_timer();
+      fEventTimeMonitor.FitDoublets_time[fIteration] = fd_time;
+
+      xpu::push_timer("MakeTriplets_time");
+    }
+
+    fQueue.launch<MakeTriplets>(xpu::n_blocks(nBlocksD));
+
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      xpu::timings mt_time                            = xpu::pop_timer();
+      fEventTimeMonitor.MakeTriplets_time[fIteration] = mt_time;
+
+      xpu::push_timer("ResetTriplets_time");
+    }
+
+    fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device);  // TODO: check if we need it for the next iteration
+
+    fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
+    xpu::h_view vfIterationData2{fTripletConstructor.fIterationData};
+
+    if (vfIterationData2[0].fNTriplets > 0) {
+      fTripletConstructor.fHitTripletsCompressed.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
+      //      fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device);
+      fTripletConstructor.fvTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+
+      int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet
+                            : std::ceil(float(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet)
+                                        / float(kSingletConstructorBlockSize));
+
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        fTripletConstructor.fTripletSortHelper.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+        //        fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io);
+        //        fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io);
+      }
+
+      xpu::set<strGpuTripletConstructor>(fTripletConstructor);
+
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings rt_time                             = xpu::pop_timer();
+        fEventTimeMonitor.ResetTriplets_time[fIteration] = rt_time;
+
+        LOG(info) << "GPU tracking :: Triplets found: " << vfIterationData2[0].fNTriplets;
+
+        xpu::push_timer("CompressTriplets_time");
+      }
+
+      fQueue.launch<CompressTriplets>(xpu::n_blocks(nBlocksCT));
+
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings ct_time                                = xpu::pop_timer();
+        fEventTimeMonitor.CompressTriplets_time[fIteration] = ct_time;
+
+        xpu::push_timer("FitTriplets_time");
+      }
+
+      fQueue.launch<FitTriplets>(
+        xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/ vfIterationData2[0].fNTriplets));
+
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings ft_time                           = xpu::pop_timer();
+        fEventTimeMonitor.FitTriplets_time[fIteration] = ft_time;
+
+        xpu::push_timer("SortTriplets_time");
+      }
+
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        fQueue.launch<SortTriplets>(xpu::n_blocks(vfIterationData2[0].fNTriplets));
+      }
+
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        xpu::timings srt_time                           = xpu::pop_timer();
+        fEventTimeMonitor.SortTriplets_time[fIteration] = srt_time;
+      }
+
+      fQueue.copy(fTripletConstructor.fvTriplets, xpu::d2h);
+      fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h);
+
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        fQueue.copy(fTripletConstructor.fTripletSortHelper, xpu::d2h);
+      }
+
+      xpu::h_view vfIterationData3{fTripletConstructor.fIterationData};
+      fNTriplets = vfIterationData3[0].fNTriplets;
+    }
+  }
+
+  fTripletConstructor.fIterationData.reset(0, xpu::buf_device);
+  fTripletConstructor.fvGpuGrid.reset(0, xpu::buf_io);
+  fTripletConstructor.fgridFirstBinEntryIndex.reset(0, xpu::buf_io);
+  fTripletConstructor.fgridEntries.reset(0, xpu::buf_io);
+
+  xpu::set<strGpuTripletConstructor>(fTripletConstructor);  //TODO: check if we need to reset all the buffers here
+
+  if constexpr (constants::gpu::GpuTimeMonitoring) {
+    xpu::timings t                           = xpu::pop_timer();
+    fEventTimeMonitor.Total_time[fIteration] = t;
+    fEventTimeMonitor.PrintTimings(fIteration);
+  }
+}
diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
new file mode 100644
index 0000000000000000000000000000000000000000..288eaa39ff00f9e37871e0218352af1ea8690f8c
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h
@@ -0,0 +1,92 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTrackFinderSetup.h
+/// \brief The class is responsible for setting up the environment and the order in which kernels are launched to perform tracking using XPU
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaBranch.h"
+#include "CaGpuParameters.h"
+#include "CaGpuTimeMonitor.h"
+#include "CaGpuTripletConstructor.h"
+#include "CaHit.h"
+#include "CaVector.h"
+#include "CaWindowData.h"
+#include "KfTrackParam.h"
+
+#include <xpu/host.h>
+
+namespace cbm::algo::ca
+{
+  class GpuTrackFinderSetup {
+   public:
+    ///                             ------  Constructors and destructor ------
+
+    /// Constructor
+    /// \param nThreads  Number of threads for multi-threaded mode
+    GpuTrackFinderSetup(ca::WindowData& wData, const ca::Parameters<fvec>& pars);
+
+    /// Copy constructor
+    GpuTrackFinderSetup(const GpuTrackFinderSetup&) = delete;
+
+    /// Move constructor
+    GpuTrackFinderSetup(GpuTrackFinderSetup&&) = delete;
+
+    /// Copy assignment operator
+    GpuTrackFinderSetup& operator=(const GpuTrackFinderSetup&) = delete;
+
+    /// Move assignment operator
+    GpuTrackFinderSetup& operator=(GpuTrackFinderSetup&&) = delete;
+
+    /// Destructor
+    ~GpuTrackFinderSetup() = default;
+
+    ///                             ------  Public member functions ------
+
+    ///Set the GPU tracking parameters
+    void SetupParameters();
+
+    /// Set the input grid data for the GPU tracking
+    void SetupGrid();
+
+    /// Set the material map for the GPU tracking
+    void SetupMaterialMap();
+
+    /// Set the input data for the GPU tracking
+    void SetInputData();
+
+    /// Set the iteration data
+    void SetupIterationData(int iter);
+
+    /// Run the track finding algorithm chain
+    void RunGpuTrackingSetup();
+
+    /// Get the number of triplets
+    unsigned int GetNofTriplets() const { return fNTriplets; }  //fTripletConstructor.fTempData[0].fNTriplets; }
+
+    ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; }  //TODO: switch back
+    //    ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; }	//TODO: switch back
+
+    /// Get timings
+    XpuTimings& GetTimings() { return fEventTimeMonitor; }
+
+    /// Get triplets
+    //    const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; }
+
+    //    static constexpr bool fGPU_time_monitoring = true;  // print debug info
+
+   private:
+    const Parameters<fvec>& fParameters;  ///< Object of Framework parameters class
+    WindowData& frWData;                  ///< Reference to the window data
+    xpu::queue fQueue;                    ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid?
+    ca::GpuTripletConstructor fTripletConstructor;  ///< GPU triplet constructor
+    unsigned int fIteration;                        ///< Iteration number
+    int fNHits;                                     ///< Number of hits
+
+    int fNTriplets;  ///< Number of triplets
+
+    XpuTimings fEventTimeMonitor;
+  };
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..bd21eeb5bc4f0e814de5a4481322cadfb1c6c613
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx
@@ -0,0 +1,1458 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTripletConstructor.cxx
+/// \brief The class contains data and kernels for running CA tracking on CPU and GPU using XPU libraries
+
+
+#include "CaGpuTripletConstructor.h"
+
+using namespace cbm::algo::ca;
+
+// Export Constants
+XPU_EXPORT(strGpuTripletConstructor);
+
+// Kernel Definitions
+//XPU_EXPORT(TestFunc);
+//XPU_D void TestFunc::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().TestFunc(ctx); }
+
+XPU_EXPORT(MakeSinglets);
+XPU_D void MakeSinglets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeSinglets(ctx); }
+
+XPU_EXPORT(MakeDoublets);
+XPU_D void MakeDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeDoublets(ctx); }
+
+XPU_EXPORT(CompressDoublets);
+XPU_D void CompressDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().CompressDoublets(ctx); }
+
+XPU_EXPORT(FitDoublets);
+XPU_D void FitDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitDoublets(ctx); }
+
+XPU_EXPORT(MakeTriplets);
+XPU_D void MakeTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeTriplets(ctx); }
+
+XPU_EXPORT(CompressTriplets);
+XPU_D void CompressTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().CompressTriplets(ctx); }
+
+XPU_EXPORT(FitTriplets);
+XPU_D void FitTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitTriplets(ctx); }
+
+XPU_EXPORT(SortTriplets);
+XPU_D void SortTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().SortTripletsFunc(ctx); }
+
+//XPU_D void GpuTripletConstructor::TestFunc(TestFunc::context& ctx) const
+//{
+//  const int iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+//}
+
+XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  if (iGThread >= fIterationData[0].fNHits) return;
+
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
+  int ista             = fvHits[ihitl].Station();
+
+  //  if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return;
+  if (ista < 0 || ista > (int) fParams_const[fIteration].fNStations - 3) return;
+
+  //  const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista);
+  //  const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+  const ca::GpuStation* sta_l = &fStations_const[ista];
+  const ca::GpuStation* sta_m = &fStations_const[ista + 1];
+
+  float stal_l_timeInfo = sta_l->timeInfo;
+  float sta_m_z         = sta_m->fZ;
+
+  const auto& hitl = fvHits[ihitl];
+
+  //  const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ());
+  //  const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli;
+  //  const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli;
+  const float dzli = 1.f / (hitl.Z() - fParams_const[fIteration].GetTargetPositionZ());
+  const float tx   = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli;
+  const float ty   = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli;
+
+  kf::TrackParamBase<float>& fit_params = fvTrackParams[ihitl];
+
+  fit_params.X()    = hitl.X();
+  fit_params.Y()    = hitl.Y();
+  fit_params.Z()    = hitl.Z();
+  fit_params.Tx()   = tx;
+  fit_params.Ty()   = ty;
+  fit_params.Qp()   = 0.;
+  fit_params.Time() = hitl.T();
+  //  fit_params.Vi()   = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv;
+  fit_params.Vi() = constants::phys::SpeedOfLightInv;  //fParams_const[fIteration].SpeedOfLightInv;
+
+  //  float txErr2           = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.;
+  //  float qpErr2           = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.;
+  float txErr2 = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.;
+  float qpErr2 = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.;
+
+  fit_params.ResetErrors(1., 1., txErr2, txErr2, qpErr2, (stal_l_timeInfo ? hitl.dT2() : 1.e6), 1.e10);
+
+  fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
+
+  fit_params.C(0, 0) = hitl.dX2();
+  fit_params.C(1, 0) = hitl.dXY();
+  fit_params.C(1, 1) = hitl.dY2();
+
+  //  fit_params.Ndf()     = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.;
+  fit_params.Ndf()     = fParams_const[fIteration].primaryFlag ? 2. : 0.;
+  fit_params.NdfTime() = (stal_l_timeInfo ? 0 : -1);
+
+  //  ca::GpuFieldRegion fld0;
+  {
+    int sta1 = ista;
+    if (sta1 == 0) sta1 = 1;
+    int sta0 = sta1 / 2;  // station between ista and the target
+    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params);
+    //fld0.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B0, fParams[fIterationData[0].fIteration].GetStation(sta0).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(sta1).fZ);
+    ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB,
+                                                   fParams_const[fIteration].GetTargetPositionZ(), B0,
+                                                   fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ);
+  }
+  //  ca::GpuFieldRegion fld1;
+  {  // field, made by the left hit, the middle station and the right station
+    //    const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    //    const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params);
+    const ca::GpuFieldValue& B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params);
+    //    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+    ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2,
+                                                   fStations_const[ista + 2].fZ);
+  }
+
+  FilterWithTargetAtLine(&fit_params, &/*fld0*/ ctx.smem().fld0_shared[ctx.thread_idx_x()]);
+
+  MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())],
+                     /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp);
+
+  // extrapolate to the middle hit
+  ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ ctx.smem().fld1_shared[ctx.thread_idx_x()]);
+
+  //  ExtrapolateLine(tr_par, sta_m->fZ, F1);	//TODO: check if ExtrapolateStep is enough
+}
+
+XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNHits) return;
+
+#if 0
+  if (ctx.thread_idx_x() == 0) {
+      ctx.smem().counter = 0;
+  }
+#endif
+
+  ca::HitIndex_t ihitl = fgridEntries[iGThread];  //iGThread;
+  int ista             = fvHits[ihitl].Station();
+
+  if (ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+
+  int collectedHits = 0;
+  int maxNHits      = constants::gpu::MaxDoubletsFromHit;  //fParams[fIterationData[0].fIteration].maxDoubletsFromHit;
+
+  CollectHits(iGThread, ista + 1, ihitl, maxNHits, &collectedHits);
+
+  xpu::atomic_add(&(fIterationData[0].fNDoublets), collectedHits);
+
+#if 0
+  xpu::atomic_add(&(ctx.smem().counter), collectedHits);
+
+  xpu::barrier(ctx.pos());
+  if(ctx.thread_idx_x() == 0) {
+    xpu::atomic_add(&(fIterationData[0].fNDoublets), ctx.smem().counter);
+  }
+#endif
+}
+
+XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  //  if( iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit ) return;
+  if (iGThread >= fIterationData[0].fNHits * constants::gpu::MaxDoubletsFromHit) return;
+
+  if (fHitDoublets[iGThread].hit2 != 0) {
+    int index                          = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1);
+    fHitDoubletsCompressed[index].hit1 = fHitDoublets[iGThread].hit1;
+    fHitDoubletsCompressed[index].hit2 = fHitDoublets[iGThread].hit2;
+  }
+}
+
+XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets) return;
+
+  int hit_l = fHitDoubletsCompressed[iGThread].hit1;
+  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
+
+  if (hit_l < 0 || hit_m < 0) return;
+
+  int ista = fvHits[hit_l].Station();
+
+  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+
+  const ca::GpuStation& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
+  const ca::GpuStation& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+
+  auto fit_params = fvTrackParams[hit_l];
+
+  bool isMomentumFitted =
+    (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0));
+  float maxQp = fParams[fIterationData[0].fIteration].maxQp;
+
+  // fit doublet
+  const ca::Hit& hitm = fvHits[hit_m];
+
+  float z_2 = hitm.Z();
+  ca::MeasurementXy<float> m_2(hitm.X(), hitm.Y(), hitm.dX2(), hitm.dY2(), hitm.dXY(), 1.f, 1.f);
+  float t_2   = hitm.T();
+  float dt2_2 = hitm.dT2();
+
+  // add the middle hit
+  ExtrapolateLineNoField(&fit_params, z_2);
+
+  FilterXY(&fit_params, m_2);
+
+  FilterTime(&fit_params, t_2, dt2_2, sta_m.timeInfo);
+
+  //      fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp);	//TODO: check why it is required
+  const float qp0 = isMomentumFitted ? fit_params.GetQp() : maxQp;
+
+  MultipleScattering(&fit_params,
+                     fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0);
+
+  fvTrackParamsDoublets[iGThread] = fit_params;
+  //      fFit.SetQp0(fFit.Tr().Qp());	//TODO: check why it is required and how it is used later, have to be stored different way
+}
+
+XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  if (iGThread >= fIterationData[0].fNDoublets) return;
+
+  int hit_l = fHitDoubletsCompressed[iGThread].hit1;
+  //  int hit_m = fHitDoubletsCompressed[iGThread].hit2;
+
+#if 0
+  if (ctx.thread_idx_x() == 0) {
+      ctx.smem().counter = 0;
+  }
+#endif
+
+  int ista = fvHits[hit_l].Station();
+
+  if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return;
+
+  //
+  ista = fvHits[hit_l].Station();
+  //
+
+  //  const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista);
+  //  const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1);
+  const ca::GpuStation& sta_r = fParams[fIterationData[0].fIteration].GetStation(ista + 2);
+
+  auto& fit_params = fvTrackParamsDoublets[iGThread];
+
+  ca::GpuFieldRegion
+    fld1;  //TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified
+  {        // field, made by the left hit, the middle station and the right station
+    ca::GpuFieldValue B0 =
+      fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B1 =
+      fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params);
+    ca::GpuFieldValue B2 =
+      fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params);
+    fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1,
+             fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2,
+             fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ);
+  }
+
+  ExtrapolateStep(&fit_params, sta_r.fZ, 0.f, &fld1);
+
+  //TODO: add isDoubletGood check if needed
+
+  int collectedHits = 0;
+  int maxNHits =
+    constants::gpu::MaxTripletsFromDoublet;  //fParams[fIterationData[0].fIteration].maxTripletsFromDoublet;
+
+  CollectHitsTriplets(iGThread, ista + 2, maxNHits, &collectedHits);
+
+  xpu::atomic_add(&(fIterationData[0].fNTriplets), collectedHits);
+
+#if 0
+  xpu::atomic_add(&(ctx.smem().counter), collectedHits);
+
+  xpu::barrier(ctx.pos());
+
+  if (ctx.thread_idx_x () == 0) {
+    fIterationData[0].fNTriplets += ctx.smem().counter;
+  }
+#endif
+}
+
+XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  //  if( iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet ) return;
+  if (iGThread >= fIterationData[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet) return;
+
+  if (fHitTriplets[iGThread].hit3 != 0) {
+    int index                          = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1);
+    fHitTripletsCompressed[index].hit1 = fHitTriplets[iGThread].hit1;
+    fHitTripletsCompressed[index].hit2 = fHitTriplets[iGThread].hit2;
+    fHitTripletsCompressed[index].hit3 = fHitTriplets[iGThread].hit3;
+  }
+}
+
+XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const
+{
+  const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  if (iGThread >= fIterationData[0].fNTriplets) return;
+
+  int hit_l = fHitTripletsCompressed[iGThread].hit1;
+
+  int ista[3] = {-1, -1, -1};
+
+  ista[0] = fvHits[hit_l].Station();
+  ista[1] = ista[0] + 1;
+  ista[2] = ista[1] + 1;
+
+  //  ca::GpuStation sta[3];
+  const ca::GpuStation* sta[3];
+  for (int is = 0; is < 3; ++is) {
+    //    sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]);
+    sta[is] = &fStations_const[ista[is]];
+  };
+
+  bool isMomentumFitted = ((sta[0]->fieldStatus != 0) || (sta[1]->fieldStatus != 0) || (sta[2]->fieldStatus != 0));
+
+  float ndfTrackModel = 4;                    // straight line
+  ndfTrackModel += isMomentumFitted ? 1 : 0;  // track with momentum
+
+  const float maxQp = fParams[fIterationData[0].fIteration].maxQp;
+
+  ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2,
+                            fHitTripletsCompressed[iGThread].hit3};
+
+  float x[3], y[3], z[3], t[3];
+  float dt2[3];
+  ca::MeasurementXy<float> mxy[3];
+
+  for (int ih = 0; ih < 3; ++ih) {
+    const ca::Hit& hit = fvHits[ihit[ih]];
+    mxy[ih]            = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+
+    x[ih]   = hit.X();
+    y[ih]   = hit.Y();
+    z[ih]   = hit.Z();
+    t[ih]   = hit.T();
+    dt2[ih] = hit.dT2();
+  };
+
+  // find the field along the track
+
+  ca::GpuFieldValue B[3];
+  ca::GpuFieldRegion fld;
+  ca::GpuFieldRegion fldTarget;
+
+  float tx[3] = {(x[1] - x[0]) / (z[1] - z[0]), (x[2] - x[0]) / (z[2] - z[0]), (x[2] - x[1]) / (z[2] - z[1])};
+  float ty[3] = {(y[1] - y[0]) / (z[1] - z[0]), (y[2] - y[0]) / (z[2] - z[0]), (y[2] - y[1]) / (z[2] - z[1])};
+  for (int ih = 0; ih < 3; ++ih) {
+    float dz = (sta[ih]->fZ - z[ih]);
+    B[ih]    = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz);
+  };
+
+  fld.Set(B[0], sta[0]->fZ, B[1], sta[1]->fZ, B[2], sta[2]->fZ);
+  fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(),
+                B[0], sta[0]->fZ, B[1], sta[1]->fZ);
+
+  kf::TrackParamBase<float>
+    fit_params;  // = fvTrackParamsTriplets[iGThread];	//TODO: set all to 0 and recheck - done, no difference
+  //  auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()];	//TODO: shared memory cannot be used for classes with preinitialized data
+
+  fit_params.ResetCovMatrix();
+
+  // initial parameters
+  {
+    float dz01      = 1.f / (z[1] - z[0]);
+    fit_params.Tx() = (x[1] - x[0]) * dz01;
+    fit_params.Ty() = (y[1] - y[0]) * dz01;
+    fit_params.Qp() = 0.f;
+    fit_params.Vi() = 0.f;
+  }
+
+  bool not_fitted = false;
+  float qp0       = 0;
+  // repeat several times in order to improve the precision
+  for (int iiter = 0; iiter < 2 /*nIterations*/; ++iiter) {
+    // fit forward
+    {
+      qp0 = fit_params.GetQp();
+      if (qp0 > maxQp) qp0 = maxQp;
+      if (qp0 < -maxQp) qp0 = -maxQp;
+
+      fit_params.Qp()   = 0.f;
+      fit_params.Vi()   = 0.f;
+      fit_params.X()    = x[0];
+      fit_params.Y()    = y[0];
+      fit_params.Z()    = z[0];
+      fit_params.Time() = t[0];
+      fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[0]->timeInfo ? dt2[0] : 1.e6f), 1.e2f);
+      fit_params.C(0, 0) = mxy[0].Dx2();
+      fit_params.C(1, 0) = mxy[0].Dxy();
+      fit_params.C(1, 1) = mxy[0].Dy2();
+
+      fit_params.Ndf()     = -ndfTrackModel + 2;
+      fit_params.NdfTime() = sta[0]->timeInfo ? 0 : -1;
+      //  add the target constraint
+      FilterWithTargetAtLine(&fit_params, &fldTarget);
+      for (int ih = 1; ih < 3; ++ih) {
+        ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
+
+        //	Extrapolate(&fit_params, z[ih], 0.f, &fld);	//TODO: add qp0 instead of 0.f
+
+        int bin       = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY());
+        auto radThick = fMaterialMapTables[bin];
+
+        MultipleScattering(&fit_params, radThick, qp0);  //TODO: why it was commented in original gpu code?
+
+        float Tx    = fit_params.Tx();
+        float Ty    = fit_params.Ty();
+        float txtx  = Tx * Tx;
+        float tyty  = Ty * Ty;
+        float txtx1 = txtx + 1.f;
+        float h     = txtx + tyty;
+        float tt    = xpu::sqrt(txtx1 + tyty);
+        float h2    = h * h;
+        float qp0t  = qp0 * tt;
+        float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
+
+        float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
+        float a  = ((tt
+                    + fParams[fIterationData[0].fIteration].particleMass
+                        * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t)
+                   * radThick * s0 * s0);
+        //        fit_params.C(2, 2) += 0.001;//txtx1 * a;	//TODO: check if it is needed. Switching on leads to the strong ghosts increase
+
+        fit_params.C(3, 2) += Tx * Ty * a;
+        fit_params.C(3, 3) += (1. + tyty) * a;
+
+        EnergyLossCorrection(&fit_params, radThick, -1.f);
+        FilterXY(&fit_params, mxy[ih]);
+        FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
+      }
+    }
+
+    if (iiter == 1) break;
+
+    // fit backward
+    {
+      qp0 = fit_params.GetQp();
+      if (qp0 > maxQp) qp0 = maxQp;
+      if (qp0 < -maxQp) qp0 = -maxQp;
+
+      fit_params.X()    = x[2];
+      fit_params.Y()    = y[2];
+      fit_params.Z()    = z[2];
+      fit_params.Time() = t[2];
+      fit_params.Qp()   = 0.f;
+      fit_params.Vi()   = 0.f;
+
+      fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[2]->timeInfo ? dt2[2] : 1.e6f), 1.e2f);
+
+      fit_params.Ndf()     = -ndfTrackModel + 2;
+      fit_params.NdfTime() = sta[2]->timeInfo ? 0 : -1;
+
+      for (int ih = 1; ih >= 0; --ih) {
+        ExtrapolateStep(&fit_params, z[ih], qp0, &fld);
+        auto radThick = fMaterialMapTables[fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY())];
+        MultipleScattering(&fit_params, radThick, qp0);
+        EnergyLossCorrection(&fit_params, radThick, 1.f);
+        FilterXY(&fit_params, mxy[ih]);
+        FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo);
+      }
+    }
+  }  // for iiter
+
+  float chi2 = fit_params.GetChiSq() + fit_params.GetChiSqTime();
+
+  if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite(chi2) || chi2 < 0.f
+      || not_fitted) {
+    chi2 = -1.;
+  }
+
+  const float qp  = fit_params.GetQp();
+  const float Cqp = fit_params.C(4, 4) + 0.001;
+  fvTriplets[iGThread] =
+    ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0, chi2, qp, Cqp, fit_params.Tx(),
+                fit_params.C(2, 2), fit_params.Ty(), fit_params.C(3, 3), isMomentumFitted);
+
+  if constexpr (constants::gpu::GpuSortTriplets) {
+    fTripletSortHelper[iGThread].tr_id = iGThread;
+    //    fTripletSortHelper[iGThread].lsta  = ista[0];
+    fTripletSortHelper[iGThread].lhit = ihit[0];
+  }
+}
+
+XPU_D void GpuTripletConstructor::SortTripletsFunc(SortTriplets::context& ctx) const
+{
+  //TODO: implement sorting of triplets on GPU or make tracking order independent
+}
+
+XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const
+{
+  // Collect hits in the station iSta and make doublets
+  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  kf::TrackParamBase<float> fit_params = fvTrackParams[iHit];
+
+  fit_params.ChiSq() = 0.;
+
+  const float Pick_m22 = fParams[fIterationData[0].fIteration].doubletChi2Cut - fit_params.GetChiSq();
+
+  const float timeError2 = fit_params.C(5, 5);
+  const float time       = fit_params.Time();
+
+  const auto& gpuGrid = fvGpuGrid[iSta];
+
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
+                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
+                          pick_range_x, pick_range_y);
+  unsigned int ih_gpu = 0;
+
+  while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
+                                                                            //    if (fvbHitSuppressed[ih]) {
+                                                                            //      continue;
+                                                                            //    }
+    const ca::Hit& hit = fvHits[ih_gpu];
+
+    // check time-boundaries
+    if ((sta.timeInfo) && (fit_params.NdfTime() >= 0)) {
+      if (xpu::abs(time - hit.T()) > 1.4 * (3.5 * xpu::sqrt(timeError2) + hit.RangeT())) {
+        continue;
+      }
+      // if (fabs(time - hit.T()) > 30) continue;
+    }
+
+    // - check whether hit belong to the window ( track position +\- errors ) -
+
+    // check y-boundaries
+    float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
+    float C11 =
+      fit_params.C(1, 1)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+
+    float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
+
+    const float dY = hit.Y() - y;
+
+    if (xpu::abs(dY) > dy_est) {
+      continue;
+    }
+
+    // check x-boundaries
+    float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
+    float C00 =
+      fit_params.C(0, 0)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+
+    float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
+
+    const float dX = hit.X() - x;
+
+    if (xpu::abs(dX) > dx_est) {
+      continue;
+    }
+
+    // check chi2
+    float C10 = fit_params.C(1, 0)
+                + (hit.Z() - fit_params.Z())
+                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+
+    ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+
+    float chi2x = 0.;
+    {  // filter X measurement
+      const float zeta = x - mxy.X();
+
+      const float F0 = C00;
+      const float F1 = C10;
+
+      const float HCH = F0;
+
+      const float wi     = 1.f / (mxy.Dx2() + HCH);
+      const float zetawi = zeta * wi;
+      chi2x              = mxy.NdfX() * zeta * zetawi;
+
+      const float K1 = F1 * wi;
+
+      x -= F0 * zetawi;
+      y -= F1 * zetawi;
+
+      C00 -= F0 * F0 * wi;
+      C10 -= K1 * F0;
+      C11 -= K1 * F1;
+    }
+
+    float chi2u = 0.;
+    {  // filter U measurement, we need only chi2 here
+      const float cosPhi = -mxy.Dxy() / mxy.Dx2();
+      const float u      = cosPhi * mxy.X() + mxy.Y();
+      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
+
+      const float zeta = cosPhi * x + y - u;
+
+      const float F0 = cosPhi * C00 + C10;
+      const float F1 = cosPhi * C10 + C11;
+
+      const float HCH = (F0 * cosPhi + F1);
+
+      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
+    }
+
+    if (chi2x > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
+      continue;
+    }
+    if (chi2x + chi2u > fParams[fIterationData[0].fIteration].doubletChi2Cut) {
+      continue;
+    }
+    int id                = iThread * maxNHits + *collectedHits;
+    fHitDoublets[id].hit1 = iHit;
+    fHitDoublets[id].hit2 = ih_gpu;
+    *collectedHits += 1;
+    //TODO: add clone hit suppression -> should be done after doublet fitting actually
+  }  // loop over the hits in the area
+}
+
+XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const
+{
+  const ca::GpuStation& sta            = fParams[fIterationData[0].fIteration].GetStation(iSta);
+  kf::TrackParamBase<float> fit_params = fvTrackParamsDoublets[iThread];
+
+  fit_params.ChiSq() = 0.;
+
+  const float Pick_m22 = fParams[fIterationData[0].fIteration].tripletChi2Cut - fit_params.GetChiSq();
+
+  const float timeError2 = fit_params.C(5, 5);
+  const float time       = fit_params.Time();
+
+  const auto& gpuGrid = fvGpuGrid[iSta];
+
+  const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx());
+  const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY()
+                             + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty());
+
+  ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]),
+                          &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(),
+                          pick_range_x, pick_range_y);
+  unsigned int ih_gpu = 0;
+
+  while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) {  // loop over station hits
+                                                                            //    if (fvbHitSuppressed[ih]) {
+                                                                            //      continue;
+                                                                            //    }
+    const ca::Hit& hit = fvHits[ih_gpu];
+
+    // check time-boundaries
+
+    //TODO: move hardcoded cuts to parameters
+    if ((sta.timeInfo) && (fit_params.NdfTime() >= 0)) {
+      if (xpu::abs(time - hit.T()) > 1.4 * (3.5 * xpu::sqrt(timeError2) + hit.RangeT())) {
+        continue;
+      }
+      // if (fabs(time - hit.T()) > 30) continue;
+    }
+
+    // - check whether hit belong to the window ( track position +\- errors ) -
+
+    // check y-boundaries
+    float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z());
+    float C11 =
+      fit_params.C(1, 1)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3));
+
+    float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY();
+
+    const float dY = hit.Y() - y;
+
+    if (xpu::abs(dY) > dy_est) {
+      continue;
+    }
+
+    // check x-boundaries
+    float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z());
+    float C00 =
+      fit_params.C(0, 0)
+      + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2));
+
+    float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX();
+
+    const float dX = hit.X() - x;
+
+    if (fabs(dX) > dx_est) {
+      continue;
+    }
+
+    // check chi2
+    float C10 = fit_params.C(1, 0)
+                + (hit.Z() - fit_params.Z())
+                    * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2));
+
+    ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f);
+
+    float chi2x = 0.;
+    {  // filter X measurement
+      const float zeta = x - mxy.X();
+
+      const float F0 = C00;
+      const float F1 = C10;
+
+      const float HCH = F0;
+
+      const float wi     = 1.f / (mxy.Dx2() + HCH);
+      const float zetawi = zeta * wi;
+      chi2x              = mxy.NdfX() * zeta * zetawi;
+
+      const float K1 = F1 * wi;
+
+      x -= F0 * zetawi;
+      y -= F1 * zetawi;
+
+      C00 -= F0 * F0 * wi;
+      C10 -= K1 * F0;
+      C11 -= K1 * F1;
+    }
+
+    float chi2u = 0.;
+    {  // filter U measurement, we need only chi2 here
+      const float cosPhi = -mxy.Dxy() / mxy.Dx2();
+      const float u      = cosPhi * mxy.X() + mxy.Y();
+      const float du2    = mxy.Dy2() + cosPhi * mxy.Dxy();
+
+      const float zeta = cosPhi * x + y - u;
+
+      const float F0 = cosPhi * C00 + C10;
+      const float F1 = cosPhi * C10 + C11;
+
+      const float HCH = (F0 * cosPhi + F1);
+
+      chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH);
+    }
+
+    if (chi2x > fParams[fIterationData[0].fIteration].tripletChi2Cut) {
+      continue;
+    }
+    if (chi2x + chi2u > fParams[fIterationData[0].fIteration].tripletChi2Cut) {
+      continue;
+    }
+
+    int id                = iThread * maxNHits + *collectedHits;
+    fHitTriplets[id].hit1 = fHitDoubletsCompressed[iThread].hit1;
+    fHitTriplets[id].hit2 = fHitDoubletsCompressed[iThread].hit2;
+    fHitTriplets[id].hit3 = ih_gpu;
+    *collectedHits += 1;
+  }  // loop over the hits in the area
+  //  if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits);
+}
+
+XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par,
+                                                         const ca::GpuFieldRegion* F0) const
+{
+  // Add the target constraint to a straight line track
+
+  float eXY[2];
+  float Jx[7], Jy[7];
+
+  GetExtrapolatedXYline(tr_par,
+                        /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].GetTargetPositionZ(), F0,
+                        eXY /*, &eY*/, Jx, Jy);
+
+  FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].targetMeasurement),
+                       eXY /*, eY*/, Jx, Jy);
+}
+
+XPU_D void GpuTripletConstructor::GetExtrapolatedXYline(
+  kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY,  // float* extrY,
+  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+  //                                   std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+  float* Jx, float* Jy) const
+{
+  // extrapolate track assuming it is straight (qp==0)
+  // return the extrapolated X, Y and the derivatives of the extrapolated X and Y
+
+  const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(/*1. / 6.*/ 0.166667),
+    c12i(/*1. / 12.*/ 0.083333);  //TODO: test constants
+
+  const float tx = tr_par->GetTx();
+  const float ty = tr_par->GetTy();
+  const float dz = z - tr_par->GetZ();
+
+  const float dz2     = dz * dz;
+  const float dzc6i   = dz * c6i;
+  const float dz2c12i = dz2 * c12i;
+
+  const float xx = tx * tx;
+  const float yy = ty * ty;
+  const float xy = tx * ty;
+
+  const float Ay = -xx - c1;
+  const float Bx = yy + c1;
+
+  const float ctdz2 = c_light * xpu::sqrt(c1 + xx + yy) * dz2;
+
+  const float Sx = F->cx0 * c2i + F->cx1 * dzc6i + F->cx2 * dz2c12i;
+  const float Sy = F->cy0 * c2i + F->cy1 * dzc6i + F->cy2 * dz2c12i;
+  const float Sz = F->cz0 * c2i + F->cz1 * dzc6i + F->cz2 * dz2c12i;
+
+  extrXY[0] = tr_par->GetX() + tx * dz;
+  extrXY[1] = tr_par->GetY() + ty * dz;
+
+  //  Jx->fill(0.);  // for a case
+  //  Jy->fill(0.);
+
+  Jx[0] = 1.;
+  Jx[1] = 0.;
+  Jx[2] = dz;
+  Jx[3] = 0.;
+  Jx[4] = ctdz2 * (Sx * xy + Sy * Ay + Sz * ty);
+  Jx[5] = 0.;
+  Jx[6] = 0.;
+
+  Jy[0] = 0.;
+  Jy[1] = 1.;
+  Jy[2] = 0.;
+  Jy[3] = dz;
+  Jy[4] = ctdz2 * (Sx * Bx - Sy * xy - Sz * tx);
+  Jy[5] = 0.;
+  //  Jx[6] = 0.;	//TODO: is it a bug???
+  Jy[6] = 0.;
+}
+
+XPU_D void GpuTripletConstructor::FilterExtrapolatedXY(
+  kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
+  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+  //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const
+  float* Jx, float* Jy) const
+{
+  // add a 2-D measurenent (x,y) at some z, that differs from fTr.GetZ()
+  // extrX, extrY are extrapolated track parameters at z, Jx, Jy are derivatives of the extrapolation
+
+  // ! it is assumed that in the track covariance matrix all non-diagonal covariances are 0
+  // ! except of C10
+
+  //  TrackParamV& T = fTr;
+
+  //zeta0 = T.x + Jx[2]*T.Tx() + Jx[3]*T.Ty() + Jx[4]*T.qp - x;
+  //zeta1 = T.y + Jy[2]*T.Tx() + Jy[3]*T.Ty() + Jy[4]*T.qp - y;
+
+  const float zeta0 = extrXY[0] - m->X();
+  const float zeta1 = extrXY[1] - m->Y();
+
+  // H = 1 0 Jx[2] Jx[3] Jx[4] 0
+  //     0 1 Jy[2] Jy[3] Jy[4] 0
+
+  // F = CH'
+  const float F00 = tr_par->C(0, 0);
+  const float F01 = tr_par->C(1, 0);  //TODO: is it a bug???
+  const float F10 = F01;              //tr_par->C10();
+  const float F11 = tr_par->C(1, 1);
+  const float F20 = Jx[2] * tr_par->C(2, 2);
+  const float F21 = Jy[2] * tr_par->C(2, 2);
+  const float F30 = Jx[3] * tr_par->C(3, 3);
+  const float F31 = Jy[3] * tr_par->C(3, 3);
+  const float F40 = Jx[4] * tr_par->C(4, 4);
+  const float F41 = Jy[4] * tr_par->C(4, 4);
+
+  // Jx[5,6] and Jy[5,6] are 0.
+
+  float S00 = m->Dx2() + F00 + Jx[2] * F20 + Jx[3] * F30 + Jx[4] * F40;
+  float S10 = m->Dxy() + F10 + Jy[2] * F20 + Jy[3] * F30 + Jy[4] * F40;
+  float S11 = m->Dy2() + F11 + Jy[2] * F21 + Jy[3] * F31 + Jy[4] * F41;
+
+  const float si     = 1. / (S00 * S11 - S10 * S10);
+  const float S00tmp = S00;
+  S00                = si * S11;
+  S10                = -si * S10;
+  S11                = si * S00tmp;
+
+  tr_par->ChiSq() += zeta0 * zeta0 * S00 + 2. * zeta0 * zeta1 * S10 + zeta1 * zeta1 * S11;
+  tr_par->Ndf() += m->NdfX() + m->NdfY();
+
+  const float K00 = F00 * S00 + F01 * S10;
+  const float K01 = F00 * S10 + F01 * S11;
+  const float K10 = F10 * S00 + F11 * S10;
+  const float K11 = F10 * S10 + F11 * S11;
+  const float K20 = F20 * S00 + F21 * S10;
+  const float K21 = F20 * S10 + F21 * S11;
+  const float K30 = F30 * S00 + F31 * S10;
+  const float K31 = F30 * S10 + F31 * S11;
+  const float K40 = F40 * S00 + F41 * S10;
+  const float K41 = F40 * S10 + F41 * S11;
+
+  tr_par->X() -= K00 * zeta0 + K01 * zeta1;
+  tr_par->Y() -= K10 * zeta0 + K11 * zeta1;
+  tr_par->Tx() -= K20 * zeta0 + K21 * zeta1;
+  tr_par->Ty() -= K30 * zeta0 + K31 * zeta1;
+  tr_par->Qp() -= K40 * zeta0 + K41 * zeta1;
+
+  tr_par->C(0, 0) -= (K00 * F00 + K01 * F01);
+  tr_par->C(1, 0) -= (K10 * F00 + K11 * F01);
+  tr_par->C(1, 1) -= (K10 * F10 + K11 * F11);
+  tr_par->C(2, 0) = -(K20 * F00 + K21 * F01);
+  tr_par->C(2, 1) = -(K20 * F10 + K21 * F11);
+  tr_par->C(2, 2) -= (K20 * F20 + K21 * F21);
+  tr_par->C(3, 0) = -(K30 * F00 + K31 * F01);
+  tr_par->C(3, 1) = -(K30 * F10 + K31 * F11);
+  tr_par->C(3, 2) = -(K30 * F20 + K31 * F21);
+  tr_par->C(3, 3) -= (K30 * F30 + K31 * F31);
+  tr_par->C(4, 0) = -(K40 * F00 + K41 * F01);
+  tr_par->C(4, 1) = -(K40 * F10 + K41 * F11);
+  tr_par->C(4, 2) = -(K40 * F20 + K41 * F21);
+  tr_par->C(4, 3) = -(K40 * F30 + K41 * F31);
+  tr_par->C(4, 4) -= (K40 * F40 + K41 * F41);
+}
+
+XPU_D void GpuTripletConstructor::MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const
+{
+  float tx    = tr_par->Tx();
+  float ty    = tr_par->Ty();
+  float txtx  = tx * tx;
+  float tyty  = ty * ty;
+  float txtx1 = txtx + 1.f;
+  float h     = txtx + tyty;
+  float t     = xpu::sqrt(txtx1 + tyty);
+  float h2    = h * h;
+  float qp0t  = qp0 * t;
+
+  float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f;
+
+  float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t;
+  //  float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0);
+  float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t)
+             * radThick * s0 * s0);
+  tr_par->C(2, 2) += txtx1 * a;
+  tr_par->C(3, 2) += tx * ty * a;
+  tr_par->C(3, 3) += (1. + tyty) * a;
+}
+
+XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
+                                                  const ca::GpuFieldRegion* Field) const
+{
+  const float c_light = 0.000299792458;
+
+  //----------------------------------------------------------------
+  const float h = z_out - tr_par->GetZ();
+
+  const float stepDz[5] = {0., 0., h * 0.5f, h * 0.5f, h};
+
+  float f[5][7]    = {{0.}};    // ( d*/dz  ) [step]
+  float F[5][7][7] = {{{0.}}};  // ( d *new [step] / d *old  )
+
+  //   Runge-Kutta steps
+  //
+  const float r0[7] = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()};
+  float R0[7][7]    = {{0.}};
+  for (int i = 0; i < 7; ++i) {
+    R0[i][i] = 1.;
+  }
+
+  for (int step = 1; step <= 4; ++step) {
+
+    float rstep[7] = {0.};
+    for (int i = 0; i < 7; ++i) {
+      rstep[i] = r0[i] + stepDz[step] * f[step - 1][i];
+    }
+    const float z       = tr_par->GetZ() + stepDz[step];
+    ca::GpuFieldValue B = Field->Get(rstep[0], rstep[1], z);
+    const float tx2     = rstep[2] * rstep[2];
+    const float ty2     = rstep[3] * rstep[3];
+    const float txty    = rstep[2] * rstep[3];
+    const float L2      = 1.f + tx2 + ty2;
+    const float L2i     = 1.f / L2;
+    const float L       = xpu::sqrt(L2);
+    const float cL      = c_light * L;
+    const float cLqp0   = cL * qp0;
+
+    f[step][0]    = rstep[2];
+    F[step][0][2] = 1.;
+
+    f[step][1]    = rstep[3];
+    F[step][1][3] = 1.;
+
+    const float f2tmp = txty * B.x - (1. + tx2) * B.y + rstep[3] * B.z;
+    f[step][2]        = cLqp0 * f2tmp;
+
+    F[step][2][2] = cLqp0 * (rstep[2] * f2tmp * L2i + rstep[3] * B.x - 2. * rstep[2] * B.y);
+    F[step][2][3] = cLqp0 * (rstep[3] * f2tmp * L2i + rstep[2] * B.x + B.z);
+    F[step][2][4] = cL * f2tmp;
+
+    const float f3tmp = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x;
+    f[step][3]        = cLqp0 * f3tmp;
+    F[step][3][2]     = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z);
+    F[step][3][3]     = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y);
+    F[step][3][4]     = cL * f3tmp;
+
+    f[step][4] = 0.;
+
+    //    if (fDoFitVelocity) {	//TODO: check and switch it on back if needed (for triplets, probably, not needed)
+    ////      fvec vi       = rstep[6];
+    f[step][5]     = rstep[6] * L;
+    const float Li = 1.f / L;
+    F[step][5][2]  = rstep[6] * rstep[2] * Li;  // / L;
+    F[step][5][3]  = rstep[6] * rstep[3] * Li;  // / L;
+    F[step][5][4]  = 0.;
+    F[step][5][5]  = 0.;
+    F[step][5][6]  = L;
+    //    }
+    //    else {
+    //      const float vi       = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+    //      f[step][5]    = rstep[6] * L;
+    //      F[step][5][2] = rstep[6] * rstep[2] / L;
+    //      F[step][5][3] = rstep[6] * rstep[3] / L;
+    //      F[step][5][4] =
+    //	  fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L /
+    //	  xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv;
+    //      F[step][5][5] = 0.;
+    //      F[step][5][6] = 0.;
+    //    }
+
+    f[step][6] = 0.;
+  }  // end of Runge-Kutta step
+
+  float r[7]    = {0.};    // extrapolated parameters
+  float R[7][7] = {{0.}};  // Jacobian of the extrapolation
+
+  float stepW[5];  // = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.};
+  stepW[0] = 0.;
+  stepW[1] = h / 6.f;
+  stepW[2] = h / 3.f;
+  stepW[3] = h / 3.f;
+  stepW[4] = h / 6.f;
+  //  const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f};
+  //  const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f};	//TODO: test constants: leads to a drop in efficiency
+
+  float k[5][7][7] = {{0.}};
+  for (int step = 1; step <= 4; ++step) {
+    for (int i = 0; i < 7; i++) {
+      for (int j = 0; j < 7; j++) {
+        k[step][i][j] = F[step][i][j];
+        for (int m = 0; m < 7; m++) {
+          k[step][i][j] += stepDz[step] * F[step][i][m] * k[step - 1][m][j];
+        }
+      }
+    }
+  }
+
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      R[i][j] = R0[i][j];
+      for (int step = 1; step <= 4; step++) {
+        R[i][j] += stepW[step] * k[step][i][j];
+      }
+    }
+  }
+
+  const float dqp = tr_par->Qp() - qp0;
+
+  for (int i = 0; i < 7; i++) {
+    r[i] = r0[i];
+    for (int step = 1; step <= 4; step++) {
+      r[i] += stepW[step] * f[step][i];
+    }
+    // take into account linearisation at fQp0
+    r[i] += R[i][4] * dqp;
+  }
+
+  // update parameters
+
+  tr_par->X()    = r[0];
+  tr_par->Y()    = r[1];
+  tr_par->Tx()   = r[2];
+  tr_par->Ty()   = r[3];
+  tr_par->Qp()   = r[4];
+  tr_par->Time() = r[5];
+  tr_par->Vi()   = r[6];
+
+  //fTr.Vi()( fTr.Vi() < fvec(TrackParamV::kClightNsInv) ) = fvec(TrackParamV::kClightNsInv);
+  tr_par->Z() = z_out;  //zMasked;
+
+  //          covariance matrix transport
+  float C[7][7];
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      C[i][j] = tr_par->C(i, j);
+    }
+  }
+  float RC[7][7];
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      RC[i][j] = 0.;
+      for (int m = 0; m < 7; m++) {
+        RC[i][j] += R[i][m] * C[m][j];
+      }
+    }
+  }
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 7; j++) {
+      float Cij = 0.f;
+      for (int m = 0; m < 7; m++) {
+        Cij += RC[i][m] * R[j][m];
+      }
+      tr_par->C(i, j) = Cij;
+    }
+  }
+}
+
+XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const
+{
+  // extrapolate the track assuming no field
+
+  const float dz = zOut - tr_par->GetZ();
+
+  const float tx = tr_par->GetTx();
+  const float ty = tr_par->GetTy();
+  const float vi = tr_par->GetVi();
+
+  const float L = xpu::sqrt(1.f + tx * tx + ty * ty);
+
+  const float j52 = dz * tx * vi / L;
+  const float j53 = dz * ty * vi / L;
+  const float j56 = dz * L;
+
+  // transport parameters
+
+  tr_par->X() += tx * dz;
+  tr_par->Y() += ty * dz;
+  tr_par->Time() += L * vi * dz;
+  tr_par->Z() = zOut;
+
+  // transport covariance matrix (see details in CaTrackFit)
+
+  const float jc00 = tr_par->C(0, 0) + dz * tr_par->C(2, 0);
+  const float jc02 = tr_par->C(0, 2) + dz * tr_par->C(2, 2);
+
+  const float jc10 = tr_par->C(1, 0) + dz * tr_par->C(3, 0);
+  const float jc11 = tr_par->C(1, 1) + dz * tr_par->C(3, 1);
+  const float jc12 = tr_par->C(1, 2) + dz * tr_par->C(3, 2);
+  const float jc13 = tr_par->C(1, 3) + dz * tr_par->C(3, 3);
+
+  const float jc50 = tr_par->C(5, 0) + j52 * tr_par->C(2, 0) + j53 * tr_par->C(3, 0) + j56 * tr_par->C(6, 0);
+  const float jc51 = tr_par->C(5, 1) + j52 * tr_par->C(2, 1) + j53 * tr_par->C(3, 1) + j56 * tr_par->C(6, 1);
+  const float jc52 = tr_par->C(5, 2) + j52 * tr_par->C(2, 2) + j53 * tr_par->C(3, 2) + j56 * tr_par->C(6, 2);
+  const float jc53 = tr_par->C(5, 3) + j52 * tr_par->C(2, 3) + j53 * tr_par->C(3, 3) + j56 * tr_par->C(6, 3);
+  const float jc54 = tr_par->C(5, 4) + j52 * tr_par->C(2, 4) + j53 * tr_par->C(3, 4) + j56 * tr_par->C(6, 4);
+  const float jc55 = tr_par->C(5, 5) + j52 * tr_par->C(2, 5) + j53 * tr_par->C(3, 5) + j56 * tr_par->C(6, 5);
+  const float jc56 = tr_par->C(5, 6) + j52 * tr_par->C(2, 6) + j53 * tr_par->C(3, 6) + j56 * tr_par->C(6, 6);
+
+  tr_par->C(0, 0) = jc00 + jc02 * dz;
+  tr_par->C(1, 0) = jc10 + jc12 * dz;
+  //  tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz;
+  //  tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz;
+  //  tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz;
+  tr_par->C(2, 0) += tr_par->C(2, 2) * dz;
+  tr_par->C(3, 0) += tr_par->C(3, 2) * dz;
+  tr_par->C(4, 0) += tr_par->C(4, 2) * dz;
+  tr_par->C(5, 0) = jc50 + jc52 * dz;
+  //  tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz;
+  tr_par->C(6, 0) += tr_par->C(6, 2) * dz;
+
+  tr_par->C(1, 1) = jc11 + jc13 * dz;
+  //  tr_par->C21() = tr_par->C21() + tr_par->C23() * dz;
+  //  tr_par->C31() = tr_par->C31() + tr_par->C33() * dz;
+  //  tr_par->C41() = tr_par->C41() + tr_par->C43() * dz;
+  tr_par->C(2, 1) += tr_par->C(2, 3) * dz;
+  tr_par->C(3, 1) += tr_par->C(3, 3) * dz;
+  tr_par->C(4, 1) += tr_par->C(4, 3) * dz;
+  tr_par->C(5, 1) = jc51 + jc53 * dz;
+  //  tr_par->C61() = tr_par->C61() + tr_par->C63() * dz;
+  tr_par->C(6, 1) += tr_par->C(6, 3) * dz;
+
+  tr_par->C(5, 2) = jc52;
+  tr_par->C(5, 3) = jc53;
+  tr_par->C(5, 4) = jc54;
+  tr_par->C(5, 5) = jc55 + jc52 * j52 + jc53 * j53 + jc56 * j56;
+  //  tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56;
+  tr_par->C(6, 5) += tr_par->C(6, 2) * j52 + tr_par->C(6, 3) * j53 + tr_par->C(6, 6) * j56;
+}
+
+XPU_D void GpuTripletConstructor::FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const
+{
+  {
+    kf::MeasurementU<float> mx;
+    mx.SetCosPhi(1.f);
+    mx.SetSinPhi(0.f);
+    mx.SetU(mxy.X());
+    mx.SetDu2(mxy.Dx2());
+    mx.SetNdf(mxy.NdfX());
+
+    kf::MeasurementU<float> mu;
+    mu.SetCosPhi(-mxy.Dxy() / mxy.Dx2());
+    mu.SetSinPhi(1.f);
+    mu.SetU(mu.CosPhi() * mxy.X() + mxy.Y());
+    mu.SetDu2(mxy.Dy2() - mxy.Dxy() * mxy.Dxy() / mxy.Dx2());
+    mu.SetNdf(mxy.NdfY());
+
+    Filter1d(tr_par, mx);
+    Filter1d(tr_par, mu);
+    return;
+  }
+}
+
+XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, const kf::MeasurementU<float>& m) const
+{
+  float zeta, HCH;
+  float F0, F1, F2, F3, F4, F5, F6;
+  float K1, K2, K3, K4, K5, K6;
+
+  zeta = m.CosPhi() * tr_par->X() + m.SinPhi() * tr_par->Y() - m.U();
+
+  // F = CH'
+  F0 = m.CosPhi() * tr_par->C(0, 0) + m.SinPhi() * tr_par->C(1, 0);
+  F1 = m.CosPhi() * tr_par->C(1, 0) + m.SinPhi() * tr_par->C(1, 1);
+
+  HCH = F0 * m.CosPhi() + F1 * m.SinPhi();
+
+  F2 = m.CosPhi() * tr_par->C(2, 0) + m.SinPhi() * tr_par->C(2, 1);
+  F3 = m.CosPhi() * tr_par->C(3, 0) + m.SinPhi() * tr_par->C(3, 1);
+  F4 = m.CosPhi() * tr_par->C(4, 0) + m.SinPhi() * tr_par->C(4, 1);
+  F5 = m.CosPhi() * tr_par->C(5, 0) + m.SinPhi() * tr_par->C(5, 1);
+  F6 = m.CosPhi() * tr_par->C(6, 0) + m.SinPhi() * tr_par->C(6, 1);
+
+  const bool maskDoFilter = (HCH < m.Du2() * 16.f);
+
+  // correction to HCH is needed for the case when sigma2 is so small
+  // with respect to HCH that it disappears due to the roundoff error
+  //    fvec wi     = fMaskF / (m.Du2() + fvec(1.0000001) * HCH);
+  //    fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH);
+
+  //TODO: (wi, zetawi): GPU results are different from CPU ones and depends on the order of the operations
+  //TODO: need to compare effects made on the tracking efficiency and parameters quality
+  float wi           = 1.f / (m.Du2() + 1.0000001f * HCH);
+  const float zetawi = zeta / (maskDoFilter ? m.Du2() : 0.f + HCH);
+
+  wi = m.Du2() > 0.f ? wi : 0.f;
+
+  tr_par->ChiSq() += m.Ndf() * zeta * zeta * wi;
+  tr_par->Ndf() += m.Ndf();
+
+  K1 = F1 * wi;
+  K2 = F2 * wi;
+  K3 = F3 * wi;
+  K4 = F4 * wi;
+  K5 = F5 * wi;
+  K6 = F6 * wi;
+
+  tr_par->X() -= F0 * zetawi;
+  tr_par->Y() -= F1 * zetawi;
+  tr_par->Tx() -= F2 * zetawi;
+  tr_par->Ty() -= F3 * zetawi;
+  tr_par->Qp() -= F4 * zetawi;
+  tr_par->Time() -= F5 * zetawi;
+  tr_par->Vi() -= F6 * zetawi;
+
+  tr_par->C(0, 0) -= F0 * F0 * wi;
+  tr_par->C(1, 0) -= K1 * F0;
+  tr_par->C(1, 1) -= K1 * F1;
+
+  tr_par->C(2, 0) -= K2 * F0;
+  tr_par->C(2, 1) -= K2 * F1;
+  tr_par->C(2, 2) -= K2 * F2;
+
+  tr_par->C(3, 0) -= K3 * F0;
+  tr_par->C(3, 1) -= K3 * F1;
+  tr_par->C(3, 2) -= K3 * F2;
+  tr_par->C(3, 3) -= K3 * F3;
+
+  tr_par->C(4, 0) -= K4 * F0;
+  tr_par->C(4, 1) -= K4 * F1;
+  tr_par->C(4, 2) -= K4 * F2;
+  tr_par->C(4, 3) -= K4 * F3;
+  tr_par->C(4, 4) -= K4 * F4;
+
+  tr_par->C(5, 0) -= K5 * F0;
+  tr_par->C(5, 1) -= K5 * F1;
+  tr_par->C(5, 2) -= K5 * F2;
+  tr_par->C(5, 3) -= K5 * F3;
+  tr_par->C(5, 4) -= K5 * F4;
+  tr_par->C(5, 5) -= K5 * F5;
+
+  tr_par->C(6, 0) -= K6 * F0;
+  tr_par->C(6, 1) -= K6 * F1;
+  tr_par->C(6, 2) -= K6 * F2;
+  tr_par->C(6, 3) -= K6 * F3;
+  tr_par->C(6, 4) -= K6 * F4;
+  tr_par->C(6, 5) -= K6 * F5;
+  tr_par->C(6, 6) -= K6 * F6;
+}
+
+//TODO: update this function
+XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2,
+                                             float timeInfo) const
+{
+  // filter track with a time measurement
+
+  // F = CH'
+  const float F0 = tr_par->C(5, 0);
+  const float F1 = tr_par->C(5, 1);
+  const float F2 = tr_par->C(5, 2);
+  const float F3 = tr_par->C(5, 3);
+  const float F4 = tr_par->C(5, 4);
+  const float F5 = tr_par->C(5, 5);
+  const float F6 = tr_par->C(6, 5);
+
+  const float HCH = F5;  //tr_par->C55();
+
+  // when dt0 is much smaller than current time error,
+  // set track time exactly to the measurement value without filtering
+  // it helps to keep the initial time errors reasonably small
+  // the calculations in the covariance matrix are not affected
+
+  const bool maskDoFilter = (HCH < dt2 * 16.f) && timeInfo;
+
+  float w = 1.f;
+
+  if (timeInfo <= 0.f) w = 0.f;
+
+  //  float wi = 1.f / (dt2 + 1.0000001f * HCH);
+  float hch_d = 1.0000001f * HCH;
+  float den   = dt2 + hch_d;
+  float wi    = w / den;
+
+  const float zeta   = tr_par->Time() - t;
+  const float zetawi = w * zeta / (maskDoFilter ? dt2 : 0.f + HCH);
+
+  tr_par->ChiSqTime() += maskDoFilter ? (zeta * zeta * wi) : 0.f;
+  tr_par->NdfTime() += w;
+
+  const float K1 = F1 * wi;
+  const float K2 = F2 * wi;
+  const float K3 = F3 * wi;
+  const float K4 = F4 * wi;
+  const float K5 = F5 * wi;
+  const float K6 = F6 * wi;
+
+  tr_par->X() -= F0 * zetawi;
+  tr_par->Y() -= F1 * zetawi;
+  tr_par->Tx() -= F2 * zetawi;
+  tr_par->Ty() -= F3 * zetawi;
+  tr_par->Qp() -= F4 * zetawi;
+  tr_par->Time() -= F5 * zetawi;
+  tr_par->Vi() -= F6 * zetawi;
+
+  tr_par->C(0, 0) -= F0 * F0 * wi;
+
+  tr_par->C(1, 0) -= K1 * F0;
+  tr_par->C(1, 1) -= K1 * F1;
+
+  tr_par->C(2, 0) -= K2 * F0;
+  tr_par->C(2, 1) -= K2 * F1;
+  tr_par->C(2, 2) -= K2 * F2;
+
+  tr_par->C(3, 0) -= K3 * F0;
+  tr_par->C(3, 1) -= K3 * F1;
+  tr_par->C(3, 2) -= K3 * F2;
+  tr_par->C(3, 3) -= K3 * F3;
+
+  tr_par->C(4, 0) -= K4 * F0;
+  tr_par->C(4, 1) -= K4 * F1;
+  tr_par->C(4, 2) -= K4 * F2;
+  tr_par->C(4, 3) -= K4 * F3;
+  tr_par->C(4, 4) -= K4 * F4;
+
+  tr_par->C(5, 0) -= K5 * F0;
+  tr_par->C(5, 1) -= K5 * F1;
+  tr_par->C(5, 2) -= K5 * F2;
+  tr_par->C(5, 3) -= K5 * F3;
+  tr_par->C(5, 4) -= K5 * F4;
+  tr_par->C(5, 5) -= K5 * F5;
+
+  tr_par->C(6, 0) -= K6 * F0;
+  tr_par->C(6, 1) -= K6 * F1;
+  tr_par->C(6, 2) -= K6 * F2;
+  tr_par->C(6, 3) -= K6 * F3;
+  tr_par->C(6, 4) -= K6 * F4;
+  tr_par->C(6, 5) -= K6 * F5;
+  tr_par->C(6, 6) -= K6 * F6;
+}
+
+XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick,
+                                                       float upstreamDirection) const
+{
+  const float qp02 = xpu::max(tr_par->Qp() * tr_par->Qp(), 0.01f);
+  const float p2   = 1.f / qp02;
+  //  const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass;	//TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result
+  const float mass2   = fParams_const[0].particleMass * fParams_const[0].particleMass;
+  const float r_mass2 = 1.f / mass2;
+  const float E2      = mass2 + p2;
+
+  const float bg2 = p2 * r_mass2;
+  //ApproximateBetheBloch
+  const float kp0 = 2.33f;
+  const float kp1 = 0.20f;
+  const float kp2 = 3.00f;
+  const float kp3 = 173e-9f;
+  const float kp4 = 0.49848f;
+
+  const float mK   = 0.307075e-3f;
+  const float _2me = 1.022e-3f;
+  const float rho  = kp0;
+  const float x0   = kp1 * 2.303f;
+  const float x1   = kp2 * 2.303f;
+  const float mI   = kp3;
+  const float mZA  = kp4;
+  const float maxT = _2me * bg2;
+
+  //*** Density effect
+  float d2         = 0.f;
+  const float x    = 0.5f * xpu::log(bg2);
+  const float lhwI = xpu::log(28.816f * 1e-9f * xpu::sqrt(rho * mZA) / mI);
+
+  if (x > x1) {
+    d2 = lhwI + x - 0.5f;
+  }
+  if ((x > x0) && (x1 > x)) {
+    const float r = (x1 - x) / (x1 - x0);
+    d2            = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r;
+  }
+
+  const float bethe =
+    mK * mZA * (1.f + bg2) / bg2 * (0.5f * xpu::log(_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2);
+
+  const float tr = xpu::sqrt(1.f + tr_par->Tx() * tr_par->Tx() + tr_par->Ty() * tr_par->Ty());
+
+  const float dE = bethe * radThick * tr * 2.33f * 9.34961f;
+
+  const float ECorrected  = xpu::sqrt(E2) + upstreamDirection * dE;
+  const float E2Corrected = ECorrected * ECorrected;
+
+  float corr = xpu::sqrt(p2 / (E2Corrected - mass2));
+  if (xpu::isnan(corr) || xpu::isinf(corr)) {
+    corr = 1.f;
+  }
+
+  tr_par->Qp() *= corr;
+  tr_par->C(4, 0) *= corr;
+  tr_par->C(4, 1) *= corr;
+  tr_par->C(4, 2) *= corr;
+  tr_par->C(4, 3) *= corr;
+  tr_par->C(4, 4) *= corr * corr;
+  tr_par->C(5, 4) *= corr;
+}
diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.h b/algo/ca/core/experimental/CaGpuTripletConstructor.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a438d4e65930032c5ecaf715d65eb6393801bff
--- /dev/null
+++ b/algo/ca/core/experimental/CaGpuTripletConstructor.h
@@ -0,0 +1,284 @@
+/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Grigory Kozlov [committer] */
+
+/// \file CaGpuTripletConstructor.h
+/// \brief The class contains data and kernels for running CA tracking on CPU and GPU using XPU libraries
+
+
+#pragma once  // include this header only once per compilation unit
+
+#include "CaDeviceImage.h"
+#include "CaGpuGrid.h"
+#include "CaGpuGridArea.h"
+#include "CaGpuMaterialMap.h"
+#include "CaGpuParameters.h"
+#include "CaHit.h"
+#include "CaMeasurementXy.h"
+#include "CaTriplet.h"
+#include "KfMeasurementU.h"
+
+#include <xpu/device.h>
+#include <xpu/host.h>
+
+namespace cbm::algo
+{
+  //  static constexpr int kMaxNofStations = 24;
+  // Block sizes / other compile time constants that need tuning
+  enum CaGpuConstants
+  {
+#if XPU_IS_CUDA
+    kSingletConstructorBlockSize      = 512,
+    kSingletConstructorItemsPerThread = 8,
+#else  // HIP, values ignored on CPU
+    kSingletConstructorBlockSize      = 64,  //1024,	//128 is optimal (preliminary tests)
+    kSingletConstructorItemsPerThread = 6,
+    kSortTripletsBlockSize            = 64,
+    kSortTripletsItemsPerThread       = 6,
+#endif
+  };
+}  // namespace cbm::algo
+
+namespace cbm::algo::ca
+{
+  //  class Framework;
+  class GpuTripletConstructor;
+
+  struct TrackingParams {
+  };
+
+  struct Params : xpu::constant<ca::GPUReco, TrackingParams> {
+  };
+
+  // Declare Constant Memory
+  struct strGpuTripletConstructor : xpu::constant<GPUReco, GpuTripletConstructor> {
+  };
+
+  struct hit_doublets {
+    ca::HitIndex_t hit1;
+    ca::HitIndex_t hit2;
+  };
+
+  struct hit_triplets {
+    ca::HitIndex_t hit1;
+    ca::HitIndex_t hit2;
+    ca::HitIndex_t hit3;
+  };
+
+  struct sort_triplets {
+    unsigned int tr_id;
+    //    unsigned int lsta;
+    unsigned int lhit;
+  };
+
+  struct shared_counters {
+    int counter;
+  };
+
+  struct shared_data {
+    //    ca::GpuStation fStations_shared[kMaxNofStations];
+    ca::GpuFieldRegion fld0_shared[kSingletConstructorBlockSize];
+    ca::GpuFieldRegion fld1_shared[kSingletConstructorBlockSize];
+  };
+
+  struct shared_data_fit_triplets {
+    //    ca::GpuStation fStations_shared[kMaxNofStations];
+    kf::TrackParamBase<float> fit_params_shared[kSingletConstructorBlockSize];
+  };
+
+  //  struct TestFunc : xpu::kernel<GPUReco> {
+  //    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+  //    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
+  //    using context       = xpu::kernel_context<shared_memory, constants>;
+  //    XPU_D void operator()(context&);
+  //  };
+
+  // Declare Kernels
+  struct MakeSinglets : xpu::kernel<GPUReco> {
+    using block_size    = xpu::block_size<kSingletConstructorBlockSize>;
+    using shared_memory = shared_data;
+    using constants     = xpu::cmem<strGpuTripletConstructor, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct MakeDoublets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_counters;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct CompressDoublets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct FitDoublets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct MakeTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_counters;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct CompressTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    using constants  = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct FitTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSingletConstructorBlockSize>;
+    //    using shared_memory = shared_data_fit_triplets;
+    using constants = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context   = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct SortTriplets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kSortTripletsBlockSize>;
+    using sort_t = xpu::block_sort<unsigned int, sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>;
+    using shared_memory = sort_t::storage_t;
+    using constants     = xpu::cmem<strGpuTripletConstructor>;  //, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct IterationData {
+    int fNHits;              ///< Number of hits in grid
+    int fIteration;          ///< Iteration number
+    int fNDoublets;          ///< Number of doublets
+    int fNDoublets_counter;  ///< Number of doublets counter
+    int fNTriplets;          ///< Number of triplets
+    int fNTriplets_counter;  ///< Number of triplets counter
+  };
+
+  class GpuTripletConstructor {
+   public:
+    ///                             ------  FUNCTIONAL PART ------
+
+    //    XPU_D void TestFunc(TestFunc::context&) const;
+
+    XPU_D void MakeSinglets(MakeSinglets::context&) const;
+
+    XPU_D void MakeDoublets(MakeDoublets::context&) const;
+
+    XPU_D void CompressDoublets(CompressDoublets::context&) const;
+
+    XPU_D void FitDoublets(FitDoublets::context&) const;
+
+    XPU_D void MakeTriplets(MakeTriplets::context&) const;
+
+    XPU_D void CompressTriplets(CompressTriplets::context&) const;
+
+    XPU_D void FitTriplets(FitTriplets::context&) const;
+
+    XPU_D void SortTripletsFunc(SortTriplets::context&) const;
+
+   private:
+    XPU_D void CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const;
+
+    XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits,
+                                   int* collectedHits) const;  //TODO: merge with CollectHits
+
+
+    ///                             ------  FITTING FINCTIONALITY ------ //TODO: check and switch to the standard CaTrackFit if possible
+    XPU_D void FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const;
+
+    /// extrapolate track as a line, return the extrapolated X, Y and the Jacobians
+    XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F,
+                                     float* extrXY, float* Jx, float* Jy) const;
+
+    XPU_D void MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const;
+
+    /// extrapolate the track to the given Z using the field F
+    /// it does extrapolation in one step
+    XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0,
+                               const ca::GpuFieldRegion* Field) const;
+
+    /// filter the track with the XY measurement placed at different Z
+    /// \param m - measurement
+    /// \param extrX - extrapolated X of the track
+    /// \param extrY - extrapolated Y of the track
+    /// \param Jx - Jacobian of the extrapolated X
+    /// \param Jy - Jacobian of the extrapolated Y
+    XPU_D void FilterExtrapolatedXY(
+      kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY,  // fvec extrY,
+      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx,
+      //			      std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const;
+      float* Jx, float* Jy) const;
+
+    XPU_D void ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const;
+
+    XPU_D void FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const;
+
+    XPU_D void Filter1d(kf::TrackParamBase<float>* tr_par, const kf::MeasurementU<float>& m) const;
+
+    XPU_D void FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, float timeInfo) const;
+
+    XPU_D void EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick, float upstreamDirection) const;
+
+    ///                          ------  DATA MEMBERS ------
+   public:
+    xpu::buffer<ca::GpuGrid> fvGpuGrid;  ///< Grid
+
+    xpu::buffer<unsigned int> fgridFirstBinEntryIndex;  ///< Index of the first entry in the grid
+
+    xpu::buffer<unsigned int> fgridEntries;  ///< Entries in the grid	//TODO: check if it should be ca::HitIndex_t
+
+    ///Material map
+    xpu::buffer<ca::GpuMaterialMap> fMaterialMap;  ///< Material map base objects
+
+    xpu::buffer<float> fMaterialMapTables;  ///< Material map tables
+
+    xpu::buffer<IterationData> fIterationData;  ///< Temporary data
+
+    //    xpu::buffer<unsigned char> fvbHitSuppressed;  ///< Hit suppression flags
+
+    xpu::buffer<hit_doublets> fHitDoublets;  ///< Hit doublets
+
+    xpu::buffer<hit_doublets> fHitDoubletsCompressed;  ///< Hit doublets compressed
+
+    xpu::buffer<hit_triplets> fHitTriplets;  ///< Hit triplets
+
+    xpu::buffer<hit_triplets> fHitTripletsCompressed;  ///< Hit triplets compressed
+
+    /// \brief Hits of the current time window
+    ///
+    /// It is a portion of fInputData to process in the current time window
+    /// hit.Id is replaced by the hit index in fInputData
+    xpu::buffer<ca::Hit> fvHits;
+
+    xpu::buffer<kf::TrackParamBase<float>> fvTrackParams;          ///< Track parameters
+    xpu::buffer<kf::TrackParamBase<float>> fvTrackParamsDoublets;  ///< Track parameters for doublets
+    //    xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets;  ///< Track parameters for triplets
+
+    xpu::buffer<ca::Triplet> fvTriplets;  ///< Triplets
+
+    xpu::buffer<sort_triplets> fTripletSortHelper;  ///< Helper for sorting triplets
+                                                    //    xpu::buffer<sort_triplets> fTripletSortHelperTmp;
+                                                    //    xpu::buffer<sort_triplets*> dst;
+
+    //    GpuParameters fParams;
+    xpu::buffer<GpuParameters> fParams;
+
+    int fIteration;
+
+    GpuParameters fParams_const[4];
+
+    ca::GpuStation fStations_const[constants::gpu::MaxNofStations];
+  };
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/experimental/CaMeasurementXy.h b/algo/ca/core/experimental/CaMeasurementXy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae4f5232e46aee0f25ee0da27366529f4ed3cb43
--- /dev/null
+++ b/algo/ca/core/experimental/CaMeasurementXy.h
@@ -0,0 +1,202 @@
+/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Sergey Gorbunov [committer], Sergei Zharko */
+
+/// \file CaMeasurementXy.h
+/// \brief Definition of the CaMeasurementXy class
+/// Temporary class for compatibility with GPU code
+/// Should be replaced by KfMeasurementXy
+
+#pragma once  // include this header only once per compilation unit
+
+
+#include "CaDefs.h"
+#include "CaSimd.h"
+#include "KfUtils.h"
+
+#include <string>
+
+#include <xpu/device.h>
+
+namespace cbm::algo::ca
+{
+
+  /// \brief The class describes a 2D - measurement (x, y) in XY coordinate system
+  ///
+  /// The measurement has a finite resolution, i.e. the measurement is not a point, but a distribution
+  /// with a certain rms.
+  /// The measurement components may be used in the chi2 calculation or not
+  /// The measurement may be a SIMD vector of values, when DataT is fvec type
+  ///
+  template<typename DataT>
+  class MeasurementXy {
+
+   public:
+    friend class boost::serialization::access;
+
+    /// default constructor
+    XPU_D MeasurementXy() = default;
+
+    /// constructor
+    /// \param x        x coordinate of the measurement
+    /// \param y        y coordinate of the measurement
+    /// \param dx2      rms^2 of the x coordinate measurement
+    /// \param dy2      rms^2 of the y coordinate measurement
+    /// \param dxy      covariance of the x and y coordinate measurements
+    /// \param ndfX     number of degrees of freedom for the x coordinate measurement
+    ///                 if ndfX == 1, the measurement is used in fit and in the chi2 calculation
+    ///                 if ndfX == 0, the measurement is used in fit, but not used in the chi2 calculation
+    /// \param ndfY     number of degrees of freedom for the y coordinate measurement
+    ///                 if ndfY == 1, the measurement is used in fit and in the chi2 calculation
+    ///                 if ndfY == 0, the measurement is used in fit, but not used in the chi2 calculation
+    XPU_D MeasurementXy(DataT x, DataT y, DataT dx2, DataT dy2, DataT dxy, DataT ndfX, DataT ndfY)
+      : fX(x)
+      , fY(y)
+      , fDx2(dx2)
+      , fDy2(dy2)
+      , fDxy(dxy)
+      , fNdfX(ndfX)
+      , fNdfY(ndfY)
+    {
+    }
+
+
+    /// Set all SIMD entries from all SIMD entries of the other class
+    /// It works for scalar and fvec types,
+    /// except of the case when DataT is scalar and TdataB is fvec.
+    template<typename DataTb>
+    void Set(const MeasurementXy<DataTb>& m)
+    {
+      CopyBase<DataTb, true, true>(0, m, 0);
+    }
+
+    /// Set all SIMD entries from one SIMD entry of the other class
+    /// It also works when DataT is scalar
+    void Set(const MeasurementXy<fvec>& m, const int im) { CopyBase<fvec, true, false>(0, m, im); }
+
+    /// Set one SIMD entry from one SIMD entry of the other class
+    /// It only works when DataT is fvec, TdataB is scalar
+    template<typename DataTb>
+    void SetOneEntry(const int i, const MeasurementXy<DataTb>& m)
+    {
+      CopyBase<DataTb, false, true>(i, m, 0);
+    }
+
+    /// Set one SIMD entry from one SIMD entry of the other class
+    /// It only works when DataT is fvec, TdataB is fvec
+    void SetOneEntry(const int i, const MeasurementXy<fvec>& m, const int im)
+    {
+      CopyBase<fvec, false, false>(i, m, im);
+    }
+
+
+    ///------------------------------
+    /// Setters and getters
+
+    XPU_D void SetX(DataT x) { fX = x; }
+    XPU_D void SetY(DataT y) { fY = y; }
+    XPU_D void SetDx2(DataT dx2) { fDx2 = dx2; }
+    XPU_D void SetDy2(DataT dy2) { fDy2 = dy2; }
+    XPU_D void SetDxy(DataT dxy) { fDxy = dxy; }
+    XPU_D void SetNdfX(DataT ndfX) { fNdfX = ndfX; }
+    XPU_D void SetNdfY(DataT ndfY) { fNdfY = ndfY; }
+    XPU_D void SetCov(DataT dx2, DataT dxy, DataT dy2)
+    {
+      fDx2 = dx2;
+      fDxy = dxy;
+      fDy2 = dy2;
+    }
+
+    XPU_D DataT X() const { return fX; }
+    XPU_D DataT Y() const { return fY; }
+    XPU_D DataT Dx2() const { return fDx2; }
+    XPU_D DataT Dy2() const { return fDy2; }
+    XPU_D DataT Dxy() const { return fDxy; }
+    XPU_D DataT NdfX() const { return fNdfX; }
+    XPU_D DataT NdfY() const { return fNdfY; }
+
+    ///------------------------------
+    /// references, to ease assignment to SIMD vector components when DataT has fvec type
+
+    XPU_D DataT& X() { return fX; }
+    XPU_D DataT& Y() { return fY; }
+    XPU_D DataT& Dx2() { return fDx2; }
+    XPU_D DataT& Dy2() { return fDy2; }
+    XPU_D DataT& Dxy() { return fDxy; }
+    XPU_D DataT& NdfX() { return fNdfX; }
+    XPU_D DataT& NdfY() { return fNdfY; }
+
+    ///------------------------------
+    /// Methods for debugging
+
+    /// Checks, if all fields are finite
+    bool IsFinite() const
+    {
+      return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2())
+              && kf::utils::IsFinite(Dy2()) && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX())
+              && kf::utils::IsFinite(NdfY()));
+    }
+
+    /// Checks, if some fields are undefined
+    bool IsUndefined() const
+    {
+      return (kf::utils::IsUndefined(X()) || kf::utils::IsUndefined(Y()) || kf::utils::IsUndefined(Dx2())
+              || kf::utils::IsUndefined(Dy2()) || kf::utils::IsUndefined(Dxy()) || kf::utils::IsUndefined(NdfX())
+              || kf::utils::IsUndefined(NdfY()));
+    }
+
+   private:
+    /// \brief Copies all/one entries from the other class
+    /// \tparam TdataB  Type of the other class
+    /// \tparam TDoAllA  If true, all entries of the current class must be set
+    /// \tparam TDoAllB  If true, all entries of the other class must be used
+    /// \param ia  Index of SIMD vector element of the current class
+    /// \param Tb  Other class
+    /// \param ib  Index of SIMD vector element of the other class
+    template<typename TdataB, bool TDoAllA, bool TDoAllB>
+    void CopyBase(const int ia, const MeasurementXy<TdataB>& Tb, const int ib);
+
+   private:
+    ///------------------------------
+    /// Data members
+
+    // Initializing parameters with NANs spoils the track fit where
+    // the masked-out SIMD entries are suppressed by a multication by 0.
+    // Therefore, we initialize the data members here with finite numbers.
+    // For the numerical safety, with some reasonable numbers.
+
+    DataT fX;    ///< x coordinate of the measurement
+    DataT fY;    ///< y coordinate of the measurement
+    DataT fDx2;  ///< rms^2 of the x coordinate measurement
+    DataT fDy2;  ///< rms^2 of the y coordinate measurement
+    DataT fDxy;  ///< covariance of the x and y coordinate measurements
+
+    /// number of degrees of freedom (used for chi2 calculation)
+    /// if ndf == 1, the measurement is used in the chi2 calculation
+    /// if ndf == 0, the measurement is not used in the chi2 calculation
+
+    DataT fNdfX;  ///< ndf for the x coordinate measurement
+    DataT fNdfY;  ///< ndf for the y coordinate measurement
+  };
+
+
+  // ---------------------------------------------------------------------------------------------------------------------
+  //
+  template<typename TdataA>
+  template<typename TdataB, bool TDoAllA, bool TDoAllB>
+  inline void MeasurementXy<TdataA>::CopyBase(const int ia, const MeasurementXy<TdataB>& Tb, const int ib)
+  {
+    auto copy = [&](TdataA& a, const TdataB& b) {
+      kf::utils::VecCopy<TdataA, TdataB, TDoAllA, TDoAllB>::CopyEntries(a, ia, b, ib);
+    };
+
+    copy(fX, Tb.X());
+    copy(fY, Tb.Y());
+    copy(fDx2, Tb.Dx2());
+    copy(fDy2, Tb.Dy2());
+    copy(fDxy, Tb.Dxy());
+    copy(fNdfX, Tb.NdfX());
+    copy(fNdfY, Tb.NdfY());
+  }  // CopyBase
+
+}  // namespace cbm::algo::ca
diff --git a/algo/ca/core/pars/CaDefs.h b/algo/ca/core/pars/CaDefs.h
index 8b4a2280d57d45267329ca11edf13613faaa6df0..89a8c19ee9b7fbe112f7b15e9e437ed43d469a1b 100644
--- a/algo/ca/core/pars/CaDefs.h
+++ b/algo/ca/core/pars/CaDefs.h
@@ -105,6 +105,18 @@ namespace cbm::algo::ca::constants
 
   }  // namespace misc
 
+  /// GPU constants
+  namespace gpu
+  {
+    constexpr int MaxDoubletsFromHit     = 150;    ///< Maximum number of doublets from a hit
+    constexpr int MaxTripletsFromDoublet = 15;     ///< Maximum number of triplets from a doublet
+    constexpr int MaxNofStations         = 20;     ///< Maximum number of stations //TODO: temporary solution
+    constexpr bool GpuTracking           = false;  ///< Flag: use GPU for tracking
+    constexpr bool GpuTimeMonitoring     = false;  ///< Flag: use GPU for time monitoring
+    constexpr bool GpuSortTriplets       = false;  ///< Flag: use GPU for sorting triplets
+    constexpr bool CpuSortTriplets       = true;   ///< Flag: use CPU for sorting triplets
+  }                                                // namespace gpu
+
   /// \brief Undefined values
   template<typename T1, typename T2 = T1>
   constexpr T2 Undef;
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.cxx b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
index cce227803ac3fbfe80b4e5b3163853eca30b7fee..e3bcc301f5eff5c69380b57d922f03e9cccda9d3 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.cxx
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.cxx
@@ -127,20 +127,52 @@ namespace cbm::algo::ca
     PrepareGrid(input.GetHits(), wData);
     frMonitorData.StopTimer(ETimer::PrepareGrid);
 
+    std::optional<ca::GpuTrackFinderSetup> gpuTrackFinderSetup;
+    size_t iter_num = 0;
+    if constexpr (constants::gpu::GpuTracking) {
+      // XPU initialization
+      // - temporary solution for tests only
+      // - should not be initialized here
+      setenv("XPU_PROFILE", "1", 1);
+      xpu::settings settings;
+      //      settings.device = "cpu0";
+      settings.device = "hip0";
+      //      settings.verbose = true;
+      xpu::initialize(settings);
+
+      // Set up environment for GPU tracking
+      xpu::push_timer("gpuTFinit");
+      gpuTrackFinderSetup.emplace(wData, fParameters);
+      xpu::timings gpuTFinit = xpu::pop_timer();
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        LOG(info) << "GPU tracking :: Initialization: " << gpuTFinit.wall() << " ms";
+      }
+
+      SetupGpuTrackFinder(gpuTrackFinderSetup.value());
+    }
+
     // Run CA iterations
     frMonitorData.StartTimer(ETimer::FindTracks);
     auto& caIterations = fParameters.GetCAIterations();
     for (auto iter = caIterations.begin(); iter != caIterations.end(); ++iter) {
-
       // ----- Prepare iteration
       frMonitorData.StartTimer(ETimer::PrepareIteration);
       PrepareCAIteration(*iter, wData, iter == caIterations.begin());
       frMonitorData.StopTimer(ETimer::PrepareIteration);
 
-      // ----- Triplets construction -----
-      frMonitorData.StartTimer(ETimer::ConstructTriplets);
-      ConstructTriplets(wData);
-      frMonitorData.StopTimer(ETimer::ConstructTriplets);
+      if constexpr (constants::gpu::GpuTracking) {
+        // ----- Triplets construction on GPU -----
+        frMonitorData.StartTimer(ETimer::ConstructTriplets);
+        ConstructTripletsGPU(wData, gpuTrackFinderSetup.value(), iter_num);
+        iter_num++;
+        frMonitorData.StopTimer(ETimer::ConstructTriplets);
+      }
+      else {
+        // ----- Triplets construction -----
+        frMonitorData.StartTimer(ETimer::ConstructTriplets);
+        ConstructTriplets(wData);
+        frMonitorData.StopTimer(ETimer::ConstructTriplets);
+      }
 
       // ----- Search for neighbouring triplets -----
       frMonitorData.StartTimer(ETimer::SearchNeighbours);
@@ -332,7 +364,6 @@ namespace cbm::algo::ca
         staPattern.push_back(std::make_pair(1 + i, 2 + gap));
       }
     }
-
     for (int istal = fParameters.GetNstationsActive() - 2; istal >= wData.CurrentIteration()->GetFirstStationIndex();
          istal--) {
       // start with downstream chambers
@@ -760,4 +791,112 @@ namespace cbm::algo::ca
     }      // level = 0
   }
 
+  void TrackFinderWindow::SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup)
+  {
+    xpu::push_timer("SetupParametersTime");
+    gpuTrackFinderSetup.SetupParameters();
+    xpu::timings SetupParametersTime = xpu::pop_timer();
+
+    xpu::push_timer("SetInputDataTime");
+    gpuTrackFinderSetup.SetInputData();
+    xpu::timings SetInputDataTime = xpu::pop_timer();
+
+    xpu::push_timer("SetupMaterialMapTime");
+    gpuTrackFinderSetup.SetupMaterialMap();
+    xpu::timings SetupMaterialMapTime = xpu::pop_timer();
+
+    if constexpr (constants::gpu::GpuTimeMonitoring) {
+      LOG(info) << "GPU tracking :: SetupParameters: " << SetupParametersTime.wall() << " ms";
+      LOG(info) << "GPU tracking :: SetInputData: " << SetInputDataTime.wall() << " ms";
+      LOG(info) << "GPU tracking :: SetupMaterialMap: " << SetupMaterialMapTime.wall() << " ms";
+    }
+  }
+
+  void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup,
+                                               int iteration)
+  {
+    xpu::push_timer("SetupGridTime");
+    gpuTrackFinderSetup.SetupGrid();
+    xpu::timings SetupGridTime = xpu::pop_timer();
+
+    xpu::push_timer("SetupIterationDataTime");
+    gpuTrackFinderSetup.SetupIterationData(iteration);
+    xpu::timings SetupIterationDataTime = xpu::pop_timer();
+
+    xpu::push_timer("RunGpuTrackingSetup");
+    gpuTrackFinderSetup.RunGpuTrackingSetup();
+    xpu::timings RunGpuTrackingSetup = xpu::pop_timer();
+
+    std::optional<std::vector<std::array<unsigned int, 2>>> triplet_sort;
+    if constexpr (constants::gpu::CpuSortTriplets) {
+      xpu::push_timer("SortTriplets");
+      triplet_sort.emplace();
+      triplet_sort.value().reserve(gpuTrackFinderSetup.GetNofTriplets());
+
+      for (unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) {
+        if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0.) continue;
+        const auto& triplet = gpuTrackFinderSetup.GetTriplet(itr);
+        triplet_sort.value().push_back({triplet.GetLHit(), itr});
+      }
+      std::sort(
+        triplet_sort.value().begin(), triplet_sort.value().end(),
+        [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) { return (a[0] < b[0]); });
+      xpu::timings SortTriplets = xpu::pop_timer();
+
+      if constexpr (constants::gpu::GpuTimeMonitoring) {
+        LOG(info) << "GPU tracking :: SetupGrid: " << SetupGridTime.wall() << " ms";
+        LOG(info) << "GPU tracking :: SetupIterationData: " << SetupIterationDataTime.wall() << " ms";
+        LOG(info) << "GPU tracking :: RunGpuTracking: " << RunGpuTrackingSetup.wall() << " ms";
+        LOG(info) << "GPU tracking :: SortTripletsCPU: " << SortTriplets.wall() << " ms";
+      }
+    }
+
+    int ihitl_0 = -1;
+
+    auto& [vHitFirstTriplet, vHitNofTriplets, vTriplets] = fTripletData;
+    vHitFirstTriplet.reset(wData.Hits().size(), 0);  /// link hit -> first triplet { hit, *, *}
+    vHitNofTriplets.reset(wData.Hits().size(), 0);   /// link hit ->n triplets { hit, *, *}
+
+    // prepare triplet storage
+    for (int j = 0; j < fParameters.GetNstationsActive(); j++) {
+      const size_t nHitsStation = wData.TsHitIndices(j).size();
+      vTriplets[j].clear();
+      vTriplets[j].reserve(2 * nHitsStation);
+    }
+
+    int nTriplets = gpuTrackFinderSetup.GetNofTriplets();
+    if constexpr (constants::gpu::CpuSortTriplets) {
+      nTriplets = triplet_sort.value().size();
+    }
+    for (size_t itr = 0; itr < nTriplets; itr++) {
+      int iSta, ihitl;
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        iSta  = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLSta();
+        ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLHit();
+      }
+      else {
+        iSta  = gpuTrackFinderSetup.GetTriplet(itr).GetLSta();
+        ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit();
+      }
+      const size_t oldSize = vTriplets[iSta].size();
+
+      if constexpr (constants::gpu::CpuSortTriplets) {
+        vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])));
+        if (ihitl == ihitl_0) {
+          vHitNofTriplets[ihitl]++;
+        }
+        else {
+          vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
+          vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
+        }
+        ihitl_0 = ihitl;
+      }
+      else {
+        if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0.)
+          vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(itr));
+        vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize);
+        vHitNofTriplets[ihitl]  = vTriplets[iSta].size() - oldSize;
+      }
+    }
+  }
 }  // namespace cbm::algo::ca
diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.h b/algo/ca/core/tracking/CaTrackFinderWindow.h
index c08ea5570af54c2c4b817681ea339f57032623ca..691ac0da86aa0b78a343acc2f173fca5a7cef473 100644
--- a/algo/ca/core/tracking/CaTrackFinderWindow.h
+++ b/algo/ca/core/tracking/CaTrackFinderWindow.h
@@ -11,6 +11,7 @@
 
 #include "CaBranch.h"
 #include "CaCloneMerger.h"
+#include "CaGpuTrackFinderSetup.h"
 #include "CaGrid.h"
 #include "CaHit.h"
 #include "CaSimd.h"
@@ -90,6 +91,10 @@ namespace cbm::algo::ca
 
     bool checkTripletMatch(const ca::Triplet& l, const ca::Triplet& r, fscal& dchi2, WindowData& wData) const;
 
+    void SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup);
+
+    void ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, int iteration);
+
     // ** Functions, which pack and unpack indexes of station and triplet **
 
     // TODO: move to ca::Triplet class (S.Zharko)
diff --git a/algo/kf/core/CMakeLists.txt b/algo/kf/core/CMakeLists.txt
index 0c2168e867a71397ae6f1e2a18c408a03375d262..cbd05a33d1cbd9def9da8dd1e6c919089fe89ed4 100644
--- a/algo/kf/core/CMakeLists.txt
+++ b/algo/kf/core/CMakeLists.txt
@@ -56,6 +56,7 @@ target_link_libraries(KfCore
                       OnlineDataLog          # needed for the logger
                       external::fles_logging # needed for the logger
                       external::fles_ipc     # needed for the logger
+                      xpu
              PRIVATE  Boost::serialization
                       fmt::fmt
                       external::yaml-cpp
@@ -78,6 +79,7 @@ if (NOT CBM_ONLINE_STANDALONE)
                 PUBLIC  Vc::Vc
                         OnlineDataLog          # needed for the logger?
                         FairLogger::FairLogger
+                        xpu
                PRIVATE  Boost::serialization
                         fmt::fmt
                         external::yaml-cpp
diff --git a/algo/kf/core/data/KfMeasurementU.h b/algo/kf/core/data/KfMeasurementU.h
index 711f237ec237b063ded4afa03ebc96b5348b6ac0..e4e6a08dcf6c53fa4849379eca4070b91ac05ae3 100644
--- a/algo/kf/core/data/KfMeasurementU.h
+++ b/algo/kf/core/data/KfMeasurementU.h
@@ -15,6 +15,8 @@
 
 #include <string>
 
+#include <xpu/device.h>
+
 namespace cbm::algo::kf
 {
   /// \brief The class describes a 1D - measurement U in XY coordinate system
@@ -38,7 +40,7 @@ namespace cbm::algo::kf
     friend class boost::serialization::access;
 
     /// default constructor
-    MeasurementU() = default;
+    XPU_D MeasurementU() = default;
 
     /// constructor
     /// \param cosPhi   cos(phi)
@@ -48,7 +50,7 @@ namespace cbm::algo::kf
     /// \param ndf      number of degrees of freedom (used for chi2 calculation)
     ///                 if ndf == 1, the measurement is used in the chi2 calculation
     ///                 if ndf == 0, the measurement is not used in the chi2 calculation
-    MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf)
+    XPU_D MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf)
       : fCosPhi(cosPhi)
       , fSinPhi(sinPhi)
       , fU(u)
@@ -60,17 +62,17 @@ namespace cbm::algo::kf
     ///------------------------------
     /// Setters and getters
 
-    void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; }
-    void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; }
-    void SetU(DataT u) { fU = u; }
-    void SetDu2(DataT du2) { fDu2 = du2; }
-    void SetNdf(DataT ndf) { fNdf = ndf; }
-
-    DataT CosPhi() const { return fCosPhi; }
-    DataT SinPhi() const { return fSinPhi; }
-    DataT U() const { return fU; }
-    DataT Du2() const { return fDu2; }
-    DataT Ndf() const { return fNdf; }
+    XPU_D void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; }
+    XPU_D void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; }
+    XPU_D void SetU(DataT u) { fU = u; }
+    XPU_D void SetDu2(DataT du2) { fDu2 = du2; }
+    XPU_D void SetNdf(DataT ndf) { fNdf = ndf; }
+
+    XPU_D DataT CosPhi() const { return fCosPhi; }
+    XPU_D DataT SinPhi() const { return fSinPhi; }
+    XPU_D DataT U() const { return fU; }
+    XPU_D DataT Du2() const { return fDu2; }
+    XPU_D DataT Ndf() const { return fNdf; }
 
     ///------------------------------
     /// Methods for debugging
diff --git a/algo/kf/core/data/KfTrackParam.h b/algo/kf/core/data/KfTrackParam.h
index d439bbd8c8a6424ed0765317c4cf003c63876604..6d2183e9fd10122117e30bffc3a4ecfb3da3af19 100644
--- a/algo/kf/core/data/KfTrackParam.h
+++ b/algo/kf/core/data/KfTrackParam.h
@@ -19,6 +19,8 @@
 
 #include <string>
 
+#include <xpu/device.h>
+
 namespace cbm::algo::kf
 {
   /// \class cbm::algo::kf::TrackParamBase
@@ -201,49 +203,49 @@ namespace cbm::algo::kf
     ///  Some of them involve calculations
 
     /// \brief Gets z position [cm]
-    T GetZ() const { return fZ; }
+    XPU_D T GetZ() const { return fZ; }
 
     /// \brief Gets x position [cm]
-    T GetX() const { return fX; }
+    XPU_D T GetX() const { return fX; }
 
     /// \brief Gets x position error [cm]
-    T GetXError() const { return sqrt(C00()); }
+    XPU_D T GetXError() const { return sqrt(C00()); }
 
     /// \brief Gets y position [cm]
-    T GetY() const { return fY; }
+    XPU_D T GetY() const { return fY; }
 
     /// \brief Gets y position error [cm]
-    T GetYError() const { return sqrt(C11()); }
+    XPU_D T GetYError() const { return sqrt(C11()); }
 
     /// \brief Gets slope along x-axis
-    T GetTx() const { return fTx; }
+    XPU_D T GetTx() const { return fTx; }
 
     /// \brief Gets error of slope along x-axis
-    T GetTxError() const { return sqrt(C22()); }
+    XPU_D T GetTxError() const { return sqrt(C22()); }
 
     /// \brief Gets slope along y-axis
-    T GetTy() const { return fTy; }
+    XPU_D T GetTy() const { return fTy; }
 
     /// \brief Gets error of slope along y-axis
-    T GetTyError() const { return sqrt(C33()); }
+    XPU_D T GetTyError() const { return sqrt(C33()); }
 
     /// \brief Gets charge over momentum [ec/GeV]
-    T GetQp() const { return fQp; }
+    XPU_D T GetQp() const { return fQp; }
 
     /// \brief Gets error of charge over momentum [ec/GeV]
-    T GetQpError() const { return sqrt(C44()); }
+    XPU_D T GetQpError() const { return sqrt(C44()); }
 
     /// \brief Gets time [ns]
-    T GetTime() const { return fT; }
+    XPU_D T GetTime() const { return fT; }
 
     /// \brief Gets time error [ns]
-    T GetTimeError() const { return sqrt(C55()); }
+    XPU_D T GetTimeError() const { return sqrt(C55()); }
 
     /// \brief Gets inverse velocity [ns/cm] in downstream direction
-    T GetVi() const { return fVi; }
+    XPU_D T GetVi() const { return fVi; }
 
     /// \brief Gets inverse velocity error [ns/cm]
-    T GetViError() const { return sqrt(C66()); }
+    XPU_D T GetViError() const { return sqrt(C66()); }
 
     /// \brief Gets covariance matrix
     const CovMatrix_t& GetCovMatrix() const { return fCovMatrix; }
@@ -252,49 +254,49 @@ namespace cbm::algo::kf
     /// \param i row
     /// \param j column
     /// \return covariance matrix element
-    T GetCovariance(int i, int j) const { return C(i, j); }
+    XPU_D T GetCovariance(int i, int j) const { return C(i, j); }
 
     /// Gets Chi-square of track fit model
-    T GetChiSq() const { return fChiSq; }
+    XPU_D T GetChiSq() const { return fChiSq; }
 
     /// Gets NDF of track fit model
-    T GetNdf() const { return fNdf; }
+    XPU_D T GetNdf() const { return fNdf; }
 
     /// Gets Chi-square of time measurements
-    T GetChiSqTime() const { return fChiSqTime; }
+    XPU_D T GetChiSqTime() const { return fChiSqTime; }
 
     /// Gets NDF of time measurements
-    T GetNdfTime() const { return fNdfTime; }
+    XPU_D T GetNdfTime() const { return fNdfTime; }
 
     /// Gets charge
-    T GetCharge() const { return utils::iif(GetQp() > T(0.), T(1.), T(-1.)); }
+    XPU_D T GetCharge() const { return utils::iif(GetQp() > T(0.), T(1.), T(-1.)); }
 
     /// \brief Gets azimuthal angle [rad]
-    T GetPhi() const { return atan2(GetTy(), GetTx()); }
+    XPU_D T GetPhi() const { return atan2(GetTy(), GetTx()); }
 
     /// \brief Gets azimuthal angle error [rad]
-    T GetPhiError() const;
+    XPU_D T GetPhiError() const;
 
     /// \brief Gets polar angle [rad]
-    T GetTheta() const { return atan(sqrt(GetTx() * GetTx() + GetTy() * GetTy())); }
+    XPU_D T GetTheta() const { return atan(sqrt(GetTx() * GetTx() + GetTy() * GetTy())); }
 
     /// \brief Gets polar angle error [rad]
-    T GetThetaError() const;
+    XPU_D T GetThetaError() const;
 
     /// Gets momentum [GeV/ec]. For the straight tracks returns 1.e4 [GeV/ec]
-    T GetP() const { return utils::iif(utils::fabs(GetQp()) > T(1.e-4), T(1.) / utils::fabs(GetQp()), T(1.e4)); }
+    XPU_D T GetP() const { return utils::iif(utils::fabs(GetQp()) > T(1.e-4), T(1.) / utils::fabs(GetQp()), T(1.e4)); }
 
     /// Gets z-component of the momentum [GeV/ec]
-    T GetPz() const { return GetP() / sqrt(T(1.) + GetTx() * GetTx() + GetTy() * GetTy()); }
+    XPU_D T GetPz() const { return GetP() / sqrt(T(1.) + GetTx() * GetTx() + GetTy() * GetTy()); }
 
     /// Gets x-component of the momentum [GeV/ec]
-    T GetPx() const { return GetPz() * GetTx(); }
+    XPU_D T GetPx() const { return GetPz() * GetTx(); }
 
     /// Gets y-component of the momentum [GeV/ec]
-    T GetPy() const { return GetPz() * GetTy(); }
+    XPU_D T GetPy() const { return GetPz() * GetTy(); }
 
     /// Gets transverse momentum
-    T GetPt() const
+    XPU_D T GetPt() const
     {
       T t2 = GetTx() * GetTx() + GetTy() * GetTy();
       return GetP() * sqrt(t2 / (T(1.) + t2));
@@ -305,43 +307,46 @@ namespace cbm::algo::kf
     ///
 
     /// \brief Sets z position [cm]
-    void SetZ(T v) { fZ = v; }
+    XPU_D void SetZ(T v) { fZ = v; }
 
     /// \brief Sets x position [cm]
-    void SetX(T v) { fX = v; }
+    XPU_D void SetX(T v) { fX = v; }
 
     /// \brief Sets y position [cm]
-    void SetY(T v) { fY = v; }
+    XPU_D void SetY(T v) { fY = v; }
 
     /// \brief Sets slope along x-axis
-    void SetTx(T v) { fTx = v; }
+    XPU_D void SetTx(T v) { fTx = v; }
 
     /// \brief Sets slope along y-axis
-    void SetTy(T v) { fTy = v; }
+    XPU_D void SetTy(T v) { fTy = v; }
 
     /// \brief Sets charge over momentum [ec/GeV]
-    void SetQp(T v) { fQp = v; }
+    XPU_D void SetQp(T v) { fQp = v; }
 
     /// \brief Sets time [ns]
-    void SetTime(T v) { fT = v; }
+    XPU_D void SetTime(T v) { fT = v; }
 
     /// \brief Sets inverse velocity [ns/cm]
-    void SetVi(T v) { fVi = v; }
+    XPU_D void SetVi(T v) { fVi = v; }
 
     /// \brief Sets Chi-square of track fit model
-    void SetChiSq(T v) { fChiSq = v; }
+    XPU_D void SetChiSq(T v) { fChiSq = v; }
 
     /// \brief Sets NDF of track fit model
-    void SetNdf(T v) { fNdf = v; }
+    XPU_D void SetNdf(T v) { fNdf = v; }
 
     /// \brief Sets Chi-square of time measurements
-    void SetChiSqTime(T v) { fChiSqTime = v; }
+    XPU_D void SetChiSqTime(T v) { fChiSqTime = v; }
 
     /// \brief Sets NDF of time measurements
-    void SetNdfTime(T v) { fNdfTime = v; }
+    XPU_D void SetNdfTime(T v) { fNdfTime = v; }
 
     /// \brief Sets covariance matrix
-    void SetCovMatrix(const CovMatrix_t& val) { fCovMatrix = val; }
+    XPU_D void SetCovMatrix(const CovMatrix_t& val) { fCovMatrix = val; }
+
+    /// \brief Sets all elements of the covariance matrix to zero
+    XPU_D void ResetCovMatrix() { memset(fCovMatrix.begin(), 0., kNcovParam * sizeof(T)); }
 
     /// \brief Get covariance matrix element
     /// \param i row
@@ -358,7 +363,7 @@ namespace cbm::algo::kf
     /// \param j column
     /// \param val matrix element
     template<int i, int j>
-    void SetCovariance(T val)
+    XPU_D void SetCovariance(T val)
     {
       constexpr int ind = (j <= i) ? i * (1 + i) / 2 + j : j * (1 + j) / 2 + i;
       fCovMatrix[ind]   = val;
@@ -366,34 +371,34 @@ namespace cbm::algo::kf
 
     /// \brief Individual setters for covariance matrix elements
     ///
-    void SetC00(T val) { SetCovariance<0, 0>(val); }
-    void SetC10(T val) { SetCovariance<1, 0>(val); }
-    void SetC11(T val) { SetCovariance<1, 1>(val); }
-    void SetC20(T val) { SetCovariance<2, 0>(val); }
-    void SetC21(T val) { SetCovariance<2, 1>(val); }
-    void SetC22(T val) { SetCovariance<2, 2>(val); }
-    void SetC30(T val) { SetCovariance<3, 0>(val); }
-    void SetC31(T val) { SetCovariance<3, 1>(val); }
-    void SetC32(T val) { SetCovariance<3, 2>(val); }
-    void SetC33(T val) { SetCovariance<3, 3>(val); }
-    void SetC40(T val) { SetCovariance<4, 0>(val); }
-    void SetC41(T val) { SetCovariance<4, 1>(val); }
-    void SetC42(T val) { SetCovariance<4, 2>(val); }
-    void SetC43(T val) { SetCovariance<4, 3>(val); }
-    void SetC44(T val) { SetCovariance<4, 4>(val); }
-    void SetC50(T val) { SetCovariance<5, 0>(val); }
-    void SetC51(T val) { SetCovariance<5, 1>(val); }
-    void SetC52(T val) { SetCovariance<5, 2>(val); }
-    void SetC53(T val) { SetCovariance<5, 3>(val); }
-    void SetC54(T val) { SetCovariance<5, 4>(val); }
-    void SetC55(T val) { SetCovariance<5, 5>(val); }
-    void SetC60(T val) { SetCovariance<6, 0>(val); }
-    void SetC61(T val) { SetCovariance<6, 1>(val); }
-    void SetC62(T val) { SetCovariance<6, 2>(val); }
-    void SetC63(T val) { SetCovariance<6, 3>(val); }
-    void SetC64(T val) { SetCovariance<6, 4>(val); }
-    void SetC65(T val) { SetCovariance<6, 5>(val); }
-    void SetC66(T val) { SetCovariance<6, 6>(val); }
+    XPU_D void SetC00(T val) { SetCovariance<0, 0>(val); }
+    XPU_D void SetC10(T val) { SetCovariance<1, 0>(val); }
+    XPU_D void SetC11(T val) { SetCovariance<1, 1>(val); }
+    XPU_D void SetC20(T val) { SetCovariance<2, 0>(val); }
+    XPU_D void SetC21(T val) { SetCovariance<2, 1>(val); }
+    XPU_D void SetC22(T val) { SetCovariance<2, 2>(val); }
+    XPU_D void SetC30(T val) { SetCovariance<3, 0>(val); }
+    XPU_D void SetC31(T val) { SetCovariance<3, 1>(val); }
+    XPU_D void SetC32(T val) { SetCovariance<3, 2>(val); }
+    XPU_D void SetC33(T val) { SetCovariance<3, 3>(val); }
+    XPU_D void SetC40(T val) { SetCovariance<4, 0>(val); }
+    XPU_D void SetC41(T val) { SetCovariance<4, 1>(val); }
+    XPU_D void SetC42(T val) { SetCovariance<4, 2>(val); }
+    XPU_D void SetC43(T val) { SetCovariance<4, 3>(val); }
+    XPU_D void SetC44(T val) { SetCovariance<4, 4>(val); }
+    XPU_D void SetC50(T val) { SetCovariance<5, 0>(val); }
+    XPU_D void SetC51(T val) { SetCovariance<5, 1>(val); }
+    XPU_D void SetC52(T val) { SetCovariance<5, 2>(val); }
+    XPU_D void SetC53(T val) { SetCovariance<5, 3>(val); }
+    XPU_D void SetC54(T val) { SetCovariance<5, 4>(val); }
+    XPU_D void SetC55(T val) { SetCovariance<5, 5>(val); }
+    XPU_D void SetC60(T val) { SetCovariance<6, 0>(val); }
+    XPU_D void SetC61(T val) { SetCovariance<6, 1>(val); }
+    XPU_D void SetC62(T val) { SetCovariance<6, 2>(val); }
+    XPU_D void SetC63(T val) { SetCovariance<6, 3>(val); }
+    XPU_D void SetC64(T val) { SetCovariance<6, 4>(val); }
+    XPU_D void SetC65(T val) { SetCovariance<6, 5>(val); }
+    XPU_D void SetC66(T val) { SetCovariance<6, 6>(val); }
 
     ///---------------------------------------------------------------------------------------------------------------------
     /// References to parameters to ease the math formulae
@@ -401,49 +406,49 @@ namespace cbm::algo::kf
     ///
 
     /// \brief Reference to z position [cm]
-    T& Z() { return fZ; }
+    XPU_D T& Z() { return fZ; }
 
     /// \brief Reference to x position [cm]
-    T& X() { return fX; }
+    XPU_D T& X() { return fX; }
 
     /// \brief Reference to y position [cm]
-    T& Y() { return fY; }
+    XPU_D T& Y() { return fY; }
 
     /// \brief Reference to slope along x-axis
-    T& Tx() { return fTx; }
+    XPU_D T& Tx() { return fTx; }
 
     /// \brief Reference to slope along y-axis
-    T& Ty() { return fTy; }
+    XPU_D T& Ty() { return fTy; }
 
     /// \brief Reference to charge over momentum [ec/GeV]
-    T& Qp() { return fQp; }
+    XPU_D T& Qp() { return fQp; }
 
     /// \brief Reference to time [ns]
-    T& Time() { return fT; }
+    XPU_D T& Time() { return fT; }
 
     /// \brief Reference to inverse velocity [ns/cm]
-    T& Vi() { return fVi; }
+    XPU_D T& Vi() { return fVi; }
 
     /// \brief Reference to covariance matrix
     CovMatrix_t& CovMatrix() { return fCovMatrix; }
 
     /// \brief Reference to Chi-square of track fit model
-    T& ChiSq() { return fChiSq; }
+    XPU_D T& ChiSq() { return fChiSq; }
 
     /// \brief Reference to NDF of track fit model
-    T& Ndf() { return fNdf; }
+    XPU_D T& Ndf() { return fNdf; }
 
     /// \brief Reference to Chi-square of time measurements
-    T& ChiSqTime() { return fChiSqTime; }
+    XPU_D T& ChiSqTime() { return fChiSqTime; }
 
     /// \brief Reference to NDF of time measurements
-    T& NdfTime() { return fNdfTime; }
+    XPU_D T& NdfTime() { return fNdfTime; }
 
     /// \brief Get a reference to the covariance matrix element
     /// \param i row
     /// \param j column
     /// \return matrix element
-    T& C(int i, int j)
+    XPU_D T& C(int i, int j)
     {
       int ind = (j <= i) ? i * (1 + i) / 2 + j : j * (1 + j) / 2 + i;
       return fCovMatrix[ind];
@@ -530,7 +535,7 @@ namespace cbm::algo::kf
     /// \param c44  Variance of charge over momentum [(ec/GeV)2]
     /// \param c55  Variance of time [ns2]
     /// \param c66  Variance of inverse velocity [1/c2]
-    void ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66);
+    XPU_D void ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66);
 
     /// \brief Prints parameters to a string
     /// \param i  Index of SIMD vector element (when i== -1, the entire vector is printed)
@@ -550,7 +555,7 @@ namespace cbm::algo::kf
     bool IsConsistent(bool printWhenWrong, int nFilled) const;
 
     /// \brief Initializes inverse velocity range
-    void InitVelocityRange(fscal minP);
+    XPU_D void InitVelocityRange(fscal minP);
 
 
     ///---------------------------------------------------------------------------------------------------------------------
@@ -668,7 +673,7 @@ namespace cbm::algo::kf
   /// ---------------------------------------------------------------------------------------------------------------------
   ///
   template<typename T>
-  inline T TrackParamBase<T>::GetPhiError() const
+  XPU_D inline T TrackParamBase<T>::GetPhiError() const
   {
     // phi = atan( tx / ty ); }
 
@@ -688,7 +693,7 @@ namespace cbm::algo::kf
   /// ---------------------------------------------------------------------------------------------------------------------
   ///
   template<typename T>
-  inline T TrackParamBase<T>::GetThetaError() const
+  XPU_D inline T TrackParamBase<T>::GetThetaError() const
   {
     // theta = atan(sqrt( tx * tx + ty * ty) )
 
@@ -741,9 +746,12 @@ namespace cbm::algo::kf
   // ---------------------------------------------------------------------------------------------------------------------
   //
   template<typename T>
-  inline void TrackParamBase<T>::ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66)
+  XPU_D inline void TrackParamBase<T>::ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66)
   {
-    fCovMatrix.fill(0.);
+    //    fCovMatrix.fill(0.);
+    for (auto& element : fCovMatrix) {
+      element = 0.;
+    }
 
     SetC00(c00);
     SetC11(c11);
@@ -762,7 +770,7 @@ namespace cbm::algo::kf
   // ---------------------------------------------------------------------------------------------------------------------
   //
   template<typename T>
-  inline void TrackParamBase<T>::InitVelocityRange(fscal minP)
+  XPU_D inline void TrackParamBase<T>::InitVelocityRange(fscal minP)
   {
     // initialise the velocity range with respect to the minimal momentum minP {GeV/c}
     using defs::ProtonMass;
diff --git a/algo/kf/core/geo/KfFieldSlice.h b/algo/kf/core/geo/KfFieldSlice.h
index a626ca4a2414b66f745d3193caa8afbe7eb9e601..50654a1c8ec24cc5743cad31e21ca3b47ec263e5 100644
--- a/algo/kf/core/geo/KfFieldSlice.h
+++ b/algo/kf/core/geo/KfFieldSlice.h
@@ -87,6 +87,15 @@ namespace cbm::algo::kf
       return GetFieldValue(trkPar.GetX() + trkPar.GetTx() * dz, trkPar.GetY() + trkPar.GetTy() * dz);
     }
 
+    /// \brief Gets approximation coefficients for the x-component of the field
+    const CoeffArray_t& GetBx() const { return fBx; }
+
+    /// \brief Gets approximation coefficients for the y-component of the field
+    const CoeffArray_t& GetBy() const { return fBy; }
+
+    /// \brief Gets approximation coefficients for the z-component of the field
+    const CoeffArray_t& GetBz() const { return fBz; }
+
     /// \brief Gets reference z-coordinate of the slice [cm]
     const T& GetZref() const { return fZref; }
 
diff --git a/algo/kf/core/geo/KfMaterialMap.h b/algo/kf/core/geo/KfMaterialMap.h
index e453f6bd9ab7b72034c38ffb75663f9e7c1bc18a..2ba56e81e624b6704bda40aaf8d67b90abed523f 100644
--- a/algo/kf/core/geo/KfMaterialMap.h
+++ b/algo/kf/core/geo/KfMaterialMap.h
@@ -128,6 +128,9 @@ namespace cbm::algo::kf
       return GetBinThicknessX0<I>(iX + iY * fNbins);
     }
 
+    /// \brief Gets material thickness table
+    const std::vector<float>& GetTable() const { return fTable; }
+
     /// \brief Function to test the instance for NaN
     bool IsUndefined() const
     {