diff --git a/CMakeLists.txt b/CMakeLists.txt
index be903c4c3f5a391ecdc45ebb4dfaf1b79c9f9a9b..cf471aa6a281a2ebe674d6f46fcda839f050c88d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -210,6 +210,11 @@ endif()
 # Must be the first subdirectory since the defined targets are needed by
 # following targets
 add_subdirectory (external)
+set (BASE_INCLUDE_DIRECTORIES
+${BASE_INCLUDE_DIRECTORIES}
+${XPU_INCLUDE_DIRECTORY} # Required for XPU_D macro in base data types
+)
+
 
 ### Base directories
 add_subdirectory (core)
diff --git a/MQ/mcbm/CMakeLists.txt b/MQ/mcbm/CMakeLists.txt
index a40ade8e9092648244627d8d428eda7a7b52915c..57851cde0984f9476517f1056932ea8b97136158 100644
--- a/MQ/mcbm/CMakeLists.txt
+++ b/MQ/mcbm/CMakeLists.txt
@@ -49,6 +49,7 @@ set(INCLUDE_DIRECTORIES
 
 Set(SYSTEM_INCLUDE_DIRECTORIES
     ${SYSTEM_INCLUDE_DIRECTORIES}
+    ${BASE_INCLUDE_DIRECTORIES}
     ${ZeroMQ_INCLUDE_DIR}
     ${Boost_INCLUDE_DIR}
     ${FAIRROOT_INCLUDE_DIR}
diff --git a/MQ/source/CMakeLists.txt b/MQ/source/CMakeLists.txt
index f25d984625433fc092608489d43de1d445344130..54178de7b641937625fee34961698e6c850e5254 100644
--- a/MQ/source/CMakeLists.txt
+++ b/MQ/source/CMakeLists.txt
@@ -18,6 +18,7 @@ set(INCLUDE_DIRECTORIES
 )
 
 Set(SYSTEM_INCLUDE_DIRECTORIES
+    ${BASE_INCLUDE_DIRECTORIES}
     ${SYSTEM_INCLUDE_DIRECTORIES}
     ${ZeroMQ_INCLUDE_DIR}
     ${Boost_INCLUDE_DIR}
diff --git a/algo/data/CMakeLists.txt b/algo/data/CMakeLists.txt
index 9e3c8b1bbaef0a533c6eca5854e4f576e2b69f6e..fd0ad4d94de6aa141031e83dc8f98dff1f96a867 100644
--- a/algo/data/CMakeLists.txt
+++ b/algo/data/CMakeLists.txt
@@ -46,6 +46,7 @@ target_include_directories(OnlineData
 target_include_directories(OnlineData SYSTEM
   PUBLIC ${Boost_INCLUDE_DIR}
   PUBLIC ${FAIRLOGGER_INCLUDE_DIR}
+  PUBLIC ${XPU_INCLUDE_DIRECTORY}
 )
 
 target_link_directories(OnlineData
diff --git a/core/data/sts/CbmStsDigi.h b/core/data/sts/CbmStsDigi.h
index e2e6a1de5862c1a054b82a2cc5c6643f935fcd55..513234f7ff98b251c8aa400ea366d6251734bc82 100644
--- a/core/data/sts/CbmStsDigi.h
+++ b/core/data/sts/CbmStsDigi.h
@@ -16,10 +16,13 @@
 #include "CbmDefs.h"  // for ECbmModuleId::kSts
 #include "CbmStsAddress.h"
 
+#include <xpu/defines.h>  // for XPU_D
+
 #ifndef NO_ROOT
 #include <Rtypes.h>  // for ClassDef
 #endif
 
+
 #include <boost/serialization/access.hpp>
 #include <boost/serialization/base_object.hpp>
 
@@ -38,9 +41,10 @@ class CbmStsDigi {
 
 public:
   /** Default constructor **/
-  CbmStsDigi() {}
+  CbmStsDigi() = default;
 
 
+#if XPU_IS_CPU
   /** Standard constructor
    ** @param  address  Unique element address
    ** @param  channel  Channel number
@@ -56,10 +60,10 @@ public:
     PackAddressAndTime(address, time);
     PackChannelAndCharge(channel, charge);
   }
-
+#endif
 
   /** Destructor **/
-  ~CbmStsDigi() {};
+  ~CbmStsDigi() = default;
 
   /** Unique detector element address  (see CbmStsAddress)
    ** @value Unique address of readout channel
@@ -76,7 +80,7 @@ public:
   /** @brief Channel number in module
    ** @value Channel number
    **/
-  uint16_t GetChannel() const { return UnpackChannel(); }
+  XPU_D uint16_t GetChannel() const { return UnpackChannel(); }
 
 
   /** @brief Class name (static)
@@ -84,11 +88,14 @@ public:
    **/
   static const char* GetClassName() { return "CbmStsDigi"; }
 
-
+#if XPU_IS_CPU
   /** Charge
    ** @value Charge [ADC units]
    **/
   double GetCharge() const { return static_cast<double>(UnpackCharge()); }
+#endif
+
+  XPU_D uint16_t GetChargeU16() const { return UnpackCharge(); }
 
 
   /** System ID (static)
@@ -96,11 +103,14 @@ public:
   **/
   static ECbmModuleId GetSystem() { return ECbmModuleId::kSts; }
 
-
+#if XPU_IS_CPU
   /** Time of measurement
    ** @value Time [ns]
    **/
   double GetTime() const { return static_cast<double>(UnpackTime()); }
+#endif
+
+  XPU_D uint32_t GetTimeU32() const { return UnpackTime(); }
 
 
   template<class Archive>
@@ -115,7 +125,7 @@ public:
   /** Update Time of measurement
    ** @param New Time [ns]
    **/
-  void SetTime(double dNewTime)
+  XPU_D void SetTime(double dNewTime)
   {
     // StsDigi is not able to store negative timestamps.
     assert(dNewTime >= 0);
@@ -124,18 +134,20 @@ public:
     PackTime(dNewTime);
   }
 
-  void SetChannel(uint16_t channel) { PackChannelAndCharge(channel, UnpackCharge()); }
+  XPU_D void SetChannel(uint16_t channel) { PackChannelAndCharge(channel, UnpackCharge()); }
 
-  void SetCharge(uint16_t charge) { PackChannelAndCharge(UnpackChannel(), charge); }
+  XPU_D void SetCharge(uint16_t charge) { PackChannelAndCharge(UnpackChannel(), charge); }
 
+#if XPU_IS_CPU
   void SetAddress(int32_t address) { PackAddressAndTime(address, UnpackTime()); }
+#endif
 
 
   /** Set new channel and charge.
    **
    ** Slightly more efficient than calling both individual setters.
    **/
-  void SetChannelAndCharge(uint16_t channel, uint16_t charge) { PackChannelAndCharge(channel, charge); }
+  XPU_D void SetChannelAndCharge(uint16_t channel, uint16_t charge) { PackChannelAndCharge(channel, charge); }
 
   /** Set new address and time at once.
    **
@@ -160,23 +172,32 @@ private:
   static constexpr uint32_t kMaxTimestamp       = kTimestampMask;
   static constexpr uint32_t kTimeAddressBitMask = ~kTimestampMask;
 
-  uint32_t fTime             = 0;  ///< Time [ns] in lower 31 bits, highest bit is the 17th address bit.
-  uint16_t fChannelAndCharge = 0;  ///< Channel number (lower 11 bits) and charge [ADC Units] in upper 5 bits.
-  uint16_t fAddress          = 0;  ///< Unique element address (lower 16 bits of 17)
+// CUDA requires types in shared memory to have trivial constructors / destructors
+#if XPU_IS_HIP_CUDA
+#define CPU_ONLY(x)
+#else
+#define CPU_ONLY(x) x
+#endif
 
+  uint32_t fTime CPU_ONLY(= 0);              ///< Time [ns] in lower 31 bits, highest bit is the 17th address bit.
+  uint16_t fChannelAndCharge CPU_ONLY(= 0);  ///< Channel number (lower 11 bits) and charge [ADC Units] in upper 5 bits.
+  uint16_t fAddress CPU_ONLY(= 0);           ///< Unique element address (lower 16 bits of 17)
 
-  void PackTime(uint32_t newTime) { fTime = (fTime & kTimeAddressBitMask) | (newTime & kTimestampMask); }
-  uint32_t UnpackTime() const { return fTime & kTimestampMask; }
+#undef CPU_ONLY
 
 
-  void PackChannelAndCharge(uint16_t channel, uint16_t charge)
+  XPU_D void PackTime(uint32_t newTime) { fTime = (fTime & kTimeAddressBitMask) | (newTime & kTimestampMask); }
+  XPU_D uint32_t UnpackTime() const { return fTime & kTimestampMask; }
+
+
+  XPU_D void PackChannelAndCharge(uint16_t channel, uint16_t charge)
   {
     fChannelAndCharge = (channel << kNumAdcBits) | charge;
   }
-  uint16_t UnpackChannel() const { return fChannelAndCharge >> kNumAdcBits; }
-  uint16_t UnpackCharge() const { return fChannelAndCharge & kAdcMask; }
-
+  XPU_D uint16_t UnpackChannel() const { return fChannelAndCharge >> kNumAdcBits; }
+  XPU_D uint16_t UnpackCharge() const { return fChannelAndCharge & kAdcMask; }
 
+  // Packing / Unpacking address not available on GPU for now...
   void PackAddressAndTime(int32_t address, uint32_t time);
   int32_t UnpackAddress() const;
 
diff --git a/core/detectors/sts/CbmStsPhysics.h b/core/detectors/sts/CbmStsPhysics.h
index 8e727cc5b879a80f41fa9aa06f0d882579fc94f5..5e5068b5a9ff2d14ec661cbe8384118d79ac0940 100644
--- a/core/detectors/sts/CbmStsPhysics.h
+++ b/core/detectors/sts/CbmStsPhysics.h
@@ -88,6 +88,8 @@ public:
      **/
   Double_t LandauWidth(Double_t mostProbableCharge);
 
+  std::map<Double_t, Double_t> GetLandauWidthTable() const { return fLandauWidth; }
+
 
   /** @brief Energy for electron-hole pair creation in silicon
      ** @return Pair creation energy [GeV]
diff --git a/external/.gitignore b/external/.gitignore
index 3ac2b17b841ebfe86a453bfd505850570c4ff4f6..f100b752415cf1171c20fce0f4d54fae7c0743e2 100644
--- a/external/.gitignore
+++ b/external/.gitignore
@@ -4,6 +4,7 @@ KFParticle
 NicaFemto
 Vc
 cppzmq
+xpu
 flib_dpb/flib_dpb
 flesnet/
 jsroot
diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt
index fe9f1da40310132c2bba0ffa1782bee5b4c75e50..70e591d9610a9bbbc681475cd58ec3a6b06357c7 100644
--- a/external/CMakeLists.txt
+++ b/external/CMakeLists.txt
@@ -35,14 +35,15 @@ if(DOWNLOAD_EXTERNALS)
   endif()
 
   download_project_if_needed(PROJECT           xpu
-    GIT_REPOSITORY    "https://git.cbm.gsi.de/fweig/xpu.git"
-    GIT_TAG           "v0.6.4"
-    SOURCE_DIR        ${CMAKE_CURRENT_SOURCE_DIR}/xpu
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND     ""
-    INSTALL_COMMAND   ""
+                             GIT_REPOSITORY    "https://github.com/fweig/xpu"
+                             GIT_TAG           "v0.7.2"
+                             SOURCE_DIR        ${CMAKE_CURRENT_SOURCE_DIR}/xpu
+                             CONFIGURE_COMMAND ""
+                             BUILD_COMMAND     ""
+                             INSTALL_COMMAND   ""
   )
   Add_Subdirectory(xpu)
+  Set(XPU_INCLUDE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/xpu/src PARENT_SCOPE)
 
   Include(InstallKFParticle.cmake)
   Include(InstallNicaFemto.cmake)
@@ -76,7 +77,7 @@ else()
 
   # Define an empty macro such that ctest is happy when no externals are
   # available
-  # This is needed since to speed up the execution of the code format check 
+  # This is needed since to speed up the execution of the code format check
   # no externals are downloaded
   macro(xpu_attach)
   endmacro()
diff --git a/macro/run/run_reco.C b/macro/run/run_reco.C
index 7a242c6c889c20a191c791c4d7ef3df699c6a96f..450e632bb3725ee004775bfbd4ddc2964303db03 100644
--- a/macro/run/run_reco.C
+++ b/macro/run/run_reco.C
@@ -75,7 +75,7 @@
  ** selects the ideal raw event builder, which associates digis to events
  ** based on the MC truth. The option "Real" selects a real raw event builder
  ** (latest version, for older versions use "Real2018" or "Real2019").
- ** 
+ **
  **
  ** The file names must be specified without extensions. The convention is
  ** that the raw (input) file is [input].raw.root. The output file
@@ -106,6 +106,8 @@ void run_reco(TString input = "", Int_t nTimeSlices = -1, Int_t firstTimeSlice =
   TString srcDir = gSystem->Getenv("VMCWORKDIR");  // top source directory
   // ------------------------------------------------------------------------
 
+  // Other settings
+  bool stsRecoUseGpu = false;
 
   // -----   In- and output file names   ------------------------------------
   if (input.IsNull()) input = "test";
@@ -278,6 +280,7 @@ void run_reco(TString input = "", Int_t nTimeSlices = -1, Int_t firstTimeSlice =
   // -----   Local reconstruction in STS   ----------------------------------
   if (useSts) {
     CbmRecoSts* stsReco = new CbmRecoSts(ECbmRecoMode::kCbmRecoTimeslice);
+    stsReco->SetUseGpuReco(stsRecoUseGpu);
     if (eventBased) stsReco->SetMode(ECbmRecoMode::kCbmRecoEvent);
     run->AddTask(stsReco);
     std::cout << "-I- " << myName << ": Added task " << stsReco->GetName() << std::endl;
@@ -465,7 +468,6 @@ void run_reco(TString input = "", Int_t nTimeSlices = -1, Int_t firstTimeSlice =
 
   }  //? time-based reco
 
-
   // -----  Parameter database   --------------------------------------------
   std::cout << std::endl << std::endl;
   std::cout << "-I- " << myName << ": Set runtime DB" << std::endl;
diff --git a/reco/CMakeLists.txt b/reco/CMakeLists.txt
index ec5b6e588c586a47b6e2972cbbcda2d2b8da0719..eaad23123eb51fa8732f69dace08d208894d17cc 100644
--- a/reco/CMakeLists.txt
+++ b/reco/CMakeLists.txt
@@ -1,6 +1,9 @@
 # CMakeList file for libraries CbmRecoXXXXX
 # P.-A. Loizeau, 7 February 2020
 
+# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address")
+# set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=address")
+
 add_subdirectory(base)
 add_subdirectory(calibration)
 add_subdirectory(detectors)
diff --git a/reco/L1/L1Algo/L1CATrackFinder.cxx b/reco/L1/L1Algo/L1CATrackFinder.cxx
index 6c03eb47b1f208ef5b9fcd099dc23841de368ca2..39473695abc4f901fb1b10dd591f57d6ceb66895 100644
--- a/reco/L1/L1Algo/L1CATrackFinder.cxx
+++ b/reco/L1/L1Algo/L1CATrackFinder.cxx
@@ -1543,7 +1543,7 @@ inline void L1Algo::TripletsStaPort(  /// creates triplets:
 
 
 /***********************************************************************************************/ /**
- *                                                                                                * 
+ *                                                                                                *
  *                            ------ CATrackFinder procedure ------                               *
  *                                                                                                *
  **************************************************************************************************/
@@ -1868,21 +1868,21 @@ void L1Algo::CATrackFinder()
          /// possible left hits of triplets are splited in portions of 16 (4 SIMDs) to use memory faster
          fDupletPortionStopIndex[fNstations-1] = 0;
          unsigned int ip = 0;  //index of curent portion
-         
+
          for (int istal = fNstations-2; istal >= fFirstCAstation; istal--)  //start downstream chambers
          {
          int nHits = HitsUnusedStopIndex[istal] - HitsUnusedStartIndex[istal];
-         
+
          int NHits_P = nHits/Portion;
-         
+
          for( int ipp = 0; ipp < NHits_P; ipp++ )
          {
          n_g1[ip] = Portion;
          ip++;
          } // start_lh - portion of left hits
-         
+
          n_g1[ip] = nHits - NHits_P*Portion;
-         
+
          ip++;
          fDupletPortionStopIndex[istal] = ip;
          }// lstations
@@ -2475,7 +2475,7 @@ void L1Algo::CATrackFinder()
 #if 0
         static long int NTimes =0, NHits=0, HitSize =0, NStrips=0, StripSize =0, NStripsB=0, StripSizeB =0,
         NDup=0, DupSize=0, NTrip=0, TripSize=0, NBranches=0, BranchSize=0, NTracks=0, TrackSize=0 ;
-        
+
         NTimes++;
         NHits += vHitsUnused->size();
         HitSize += vHitsUnused->size()*sizeof(L1Hit);
@@ -2487,13 +2487,13 @@ void L1Algo::CATrackFinder()
         DupSize += stat_max_n_dup*sizeof(/*short*/ int);
         NTrip += stat_max_n_trip;
         TripSize += stat_max_trip_size;
-        
+
         NBranches += stat_max_n_branches;
         BranchSize += stat_max_BranchSize;
         NTracks += fTracks.size();
         TrackSize += sizeof(L1Track)*fTracks.size() + sizeof(L1HitIndex_t)*fRecoHits.size();
         int k = 1024*NTimes;
-        
+
         cout<<"L1 Event size: \n"
         <<HitSize/k<<"kB for "<<NHits/NTimes<<" hits, \n"
         <<StripSize/k<<"kB for "<<NStrips/NTimes<<" strips, \n"
diff --git a/reco/detectors/sts/CMakeLists.txt b/reco/detectors/sts/CMakeLists.txt
index 8386c5ad4437c085fc03a4111f2320ce3df687a9..376f5f640c341b90b456656ac04db4dd3c83e82f 100644
--- a/reco/detectors/sts/CMakeLists.txt
+++ b/reco/detectors/sts/CMakeLists.txt
@@ -28,6 +28,16 @@ unpack/CbmStsUnpackMonitor.cxx
 
 qa/CbmStsFindTracksQa.cxx
 )
+
+set(DEVICE_SRCS
+experimental/CbmStsGpuRecoDevice.cxx
+experimental/CbmStsGpuHitFinder.cxx
+)
+
+set(NO_DICT_SRCS
+${DEVICE_SRCS}
+experimental/CbmGpuRecoSts.cxx
+)
 # -----  End of sources   ---------------------------------
 
 
@@ -39,6 +49,10 @@ set(INCLUDE_DIRECTORIES
 ${CBMROOT_SOURCE_DIR}/reco/detectors/sts
 ${CBMROOT_SOURCE_DIR}/reco/detectors/sts/unpack
 ${CBMROOT_SOURCE_DIR}/reco/detectors/sts/qa
+${CBMROOT_SOURCE_DIR}/reco/detectors/sts/experimental
+
+# external
+${CBMROOT_SOURCE_DIR}/external/xpu/src
 
 # Reco
 ${CBMROOT_SOURCE_DIR}/reco/base
@@ -82,7 +96,7 @@ ${Boost_LIBRARY_DIRS}
 
 # -----   Specify library dependences   -------------------
 Set(DEPENDENCIES
-    CbmBase CbmData CbmQaBase CbmMvd CbmRecoBase CbmStsBase
+    CbmBase CbmData CbmQaBase CbmMvd CbmRecoBase CbmStsBase xpu
 )
 # ---------------------------------------------------------
 
@@ -94,10 +108,11 @@ set(LINKDEF ${LIBRARY_NAME}LinkDef.h)
 
 # ---- Enable OpenMP  -------------------------------------
 # Comented since the OpenMP code is also commented in the moment
-#if (OPENMP_FOUND)
+# if (OPENMP_FOUND)
+#  message(STATUS "OPENMP_FOUND = TRUE")
 #  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 #  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-#endif()
+# endif()
 # ---------------------------------------------------------
 
 
@@ -106,4 +121,10 @@ include_directories( ${INCLUDE_DIRECTORIES})
 include_directories(SYSTEM ${SYSTEM_INCLUDE_DIRECTORIES})
 link_directories( ${LINK_DIRECTORIES})
 GENERATE_LIBRARY()
+xpu_attach(CbmRecoSts ${DEVICE_SRCS})
+
+# FIXME: Why is this needed to install headers in subfolder?
+install(DIRECTORY experimental/ TYPE INCLUDE
+    FILES_MATCHING PATTERN "*.h"
+)
 # ---------------------------------------------------------
diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx
index 3d9d0230b8023f21dfa378e3f5812aba55dff60f..49be2bf19dd6cc9b4100a0ea81f9fb0a779a1a49 100644
--- a/reco/detectors/sts/CbmRecoSts.cxx
+++ b/reco/detectors/sts/CbmRecoSts.cxx
@@ -13,11 +13,13 @@
 #include "CbmDigiManager.h"
 #include "CbmEvent.h"
 #include "CbmStsDigi.h"
+#include "CbmStsGpuHitFinder.h"
 #include "CbmStsModule.h"
 #include "CbmStsParSetModule.h"
 #include "CbmStsParSetSensor.h"
 #include "CbmStsParSetSensorCond.h"
 #include "CbmStsParSim.h"
+#include "CbmStsPhysics.h"
 #include "CbmStsRecoModule.h"
 #include "CbmStsSetup.h"
 
@@ -31,6 +33,10 @@
 
 #include <iomanip>
 
+#if __has_include(<omp.h>)
+#include <omp.h>
+#endif
+
 using std::fixed;
 using std::left;
 using std::right;
@@ -40,15 +46,13 @@ using std::stringstream;
 using std::vector;
 
 
-ClassImp(CbmRecoSts)
-
+ClassImp(CbmRecoSts);
 
-  // -----   Constructor   ---------------------------------------------------
-  CbmRecoSts::CbmRecoSts(ECbmRecoMode mode, Bool_t writeClusters, Bool_t runParallel)
+// -----   Constructor   ---------------------------------------------------
+CbmRecoSts::CbmRecoSts(ECbmRecoMode mode, Bool_t writeClusters)
   : FairTask("RecoSts", 1)
   , fMode(mode)
   , fWriteClusters(writeClusters)
-  , fRunParallel(runParallel)
 {
 }
 // -------------------------------------------------------------------------
@@ -64,6 +68,11 @@ UInt_t CbmRecoSts::CreateModules()
 {
 
   assert(fSetup);
+
+  std::vector<CbmStsParModule> gpuModules;  // for gpu reco
+  std::vector<int> moduleAddrs;
+  std::vector<experimental::CbmStsHitFinderConfig> hfCfg;
+
   for (Int_t iModule = 0; iModule < fSetup->GetNofModules(); iModule++) {
 
     // --- Setup module and sensor
@@ -80,6 +89,9 @@ UInt_t CbmRecoSts::CreateModules()
     const CbmStsParSensor& sensPar      = fParSetSensor->GetParSensor(sensorAddress);
     const CbmStsParSensorCond& sensCond = fParSetCond->GetParSensor(sensorAddress);
 
+    gpuModules.push_back(modPar);
+    moduleAddrs.push_back(moduleAddress);
+
     // --- Calculate and set average Lorentz shift
     // --- This will be used in hit finding for correcting the position.
     Double_t lorentzF = 0.;
@@ -120,8 +132,26 @@ UInt_t CbmRecoSts::CreateModules()
     auto result = fModules.insert({moduleAddress, recoModule});
     assert(result.second);
     fModuleIndex.push_back(recoModule);
+    hfCfg.push_back({
+      .dY       = sensPar.GetPar(3),
+      .pitch    = sensPar.GetPar(6),
+      .stereoF  = sensPar.GetPar(8),
+      .stereoB  = sensPar.GetPar(9),
+      .lorentzF = float(lorentzF),
+      .lorentzB = float(lorentzB),
+    });
   }
 
+  // Gpu hitfinder
+  experimental::CbmGpuRecoSts::Config gpuConfig {
+    .parModules   = gpuModules,
+    .recoModules  = fModuleIndex,
+    .moduleAddrs  = moduleAddrs,
+    .hitfinderCfg = hfCfg,
+    .physics      = CbmStsPhysics::Instance(),
+  };
+  fGpuReco.SetConfig(gpuConfig);
+
   return fModules.size();
 }
 // -------------------------------------------------------------------------
@@ -156,9 +186,23 @@ void CbmRecoSts::Exec(Option_t*)
   CbmEvent* event  = nullptr;
 
   // --- Time-slice mode: process entire array
-  if (fMode == kCbmRecoTimeslice) ProcessData(nullptr);
 
-  // --- Event mode: loop over events
+  if (fUseGpuReco) {
+
+    ProcessDataGpu();
+    fNofDigis     = fGpuReco.nDigis;
+    fNofDigisUsed = fGpuReco.nDigisUsed;
+    fNofClusters  = fGpuReco.nCluster;
+    fNofHits      = fGpuReco.nHits;
+
+    // Old reco in time based mode
+  }
+  else if (fMode == kCbmRecoTimeslice) {
+
+    ProcessData(nullptr);
+
+    // --- Event mode: loop over events
+  }
   else {
     assert(fEvents);
     nEvents = fEvents->GetEntriesFast();
@@ -194,6 +238,8 @@ void CbmRecoSts::Exec(Option_t*)
   fTime2Run += fTime2;
   fTime3Run += fTime3;
   fTime4Run += fTime4;
+
+  // allDigis = fNofDigisUsed;
 }
 // -------------------------------------------------------------------------
 
@@ -201,12 +247,20 @@ void CbmRecoSts::Exec(Option_t*)
 // -----   End-of-run action   ---------------------------------------------
 void CbmRecoSts::Finish()
 {
+  int ompThreads = -1;
+#ifdef _OPENMP
+  ompThreads = omp_get_max_threads();
+#endif
 
-  std::cout << std::endl;
   Double_t digiCluster = Double_t(fNofDigisUsed) / Double_t(fNofClusters);
   Double_t clusterHit  = Double_t(fNofClusters) / Double_t(fNofHits);
   LOG(info) << "=====================================";
   LOG(info) << GetName() << ": Run summary";
+  if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction.";
+  else if (ompThreads < 0)
+    LOG(info) << "STS reconstruction ran single threaded (No OpenMP).";
+  else
+    LOG(info) << "STS reconstruction ran multithreaded with OpenMP (nthreads = " << ompThreads << ")";
   LOG(info) << "Time slices            : " << fNofTs;
   if (fMode == kCbmRecoEvent) LOG(info) << "Events                 : " << fNofEvents;
   LOG(info) << "Digis / TSlice         : " << fixed << setprecision(2) << fNofDigisRun / Double_t(fNofTs);
@@ -218,19 +272,54 @@ void CbmRecoSts::Finish()
   LOG(info) << "Clusters per hit       : " << fixed << setprecision(2) << clusterHit;
   LOG(info) << "Time per TSlice        : " << fixed << setprecision(2) << 1000. * fTimeRun / Double_t(fNofTs) << " ms ";
 
-  fTimeRun /= Double_t(fNofEvents);
-  fTime1Run /= Double_t(fNofEvents);
-  fTime2Run /= Double_t(fNofEvents);
-  fTime3Run /= Double_t(fNofEvents);
-  fTime4Run /= Double_t(fNofEvents);
-  LOG(info) << "Time Reset       : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1Run << " ms ("
-            << setprecision(1) << setw(4) << 100. * fTime1Run / fTimeRun << " %)";
-  LOG(info) << "Time Distribute  : " << fixed << setprecision(1) << setw(6) << 1000. * fTime2Run << " ms ("
-            << setprecision(1) << 100. * fTime2Run / fTimeRun << " %)";
-  LOG(info) << "Time Reconstruct : " << fixed << setprecision(1) << setw(6) << 1000. * fTime3Run << " ms ("
-            << setprecision(1) << setw(4) << 100. * fTime3Run / fTimeRun << " %)";
-  LOG(info) << "Time Output      : " << fixed << setprecision(1) << setw(6) << 1000. * fTime4Run << " ms ("
-            << setprecision(1) << setw(4) << 100. * fTime4Run / fTimeRun << " %)";
+
+  // Aggregate times for substeps of reconstruction
+  // Note: These times are meaningless when reconstruction runs with > 1 thread.
+  CbmStsRecoModule::Timings timingsTotal;
+  for (const auto* m : fModuleIndex) {
+    auto moduleTime = m->GetTimings();
+    timingsTotal.timeSortDigi += moduleTime.timeSortDigi;
+    timingsTotal.timeCluster += moduleTime.timeCluster;
+    timingsTotal.timeSortCluster += moduleTime.timeSortCluster;
+    timingsTotal.timeHits += moduleTime.timeHits;
+  }
+
+  // Avoid division by zero if no events present
+  double nEvent = std::max(fNofEvents, 1);
+
+  fTimeTot /= nEvent;
+  fTime1 /= nEvent;
+  fTime2 /= nEvent;
+  fTime3 /= nEvent;
+  fTime4 /= nEvent;
+
+  if (not fUseGpuReco) {
+    LOG(info) << "NofEvents        : " << fNofEvents;
+    LOG(info) << "Time Reset       : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1 << " ms ("
+              << setprecision(1) << setw(4) << 100. * fTime1 / fTimeTot << " %)";
+    LOG(info) << "Time Distribute  : " << fixed << setprecision(1) << setw(6) << 1000. * fTime2 << " ms ("
+              << setprecision(1) << 100. * fTime2 / fTimeTot << " %)";
+    LOG(info) << "Time Reconstruct: " << fixed << setprecision(1) << setw(6) << 1000. * fTime3 << " ms ("
+              << setprecision(1) << setw(4) << 100. * fTime3 / fTimeTot << " %)";
+    LOG(info) << "Time by step:\n"
+              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortDigi
+              << " ms\n"
+              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeCluster
+              << " ms\n"
+              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortCluster
+              << " ms\n"
+              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeHits << " ms\n";
+  }
+  else {
+    double gpuHitfinderTimeTotal =
+      fGpuReco.timeSortDigi + fGpuReco.timeCluster + fGpuReco.timeSortCluster + fGpuReco.timeHits;
+    LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms";
+    LOG(info) << "Time by step:\n"
+              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortDigi << " ms\n"
+              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeCluster << " ms\n"
+              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortCluster << " ms\n"
+              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeHits << " ms";
+  }
   LOG(info) << "=====================================";
 }
 // -------------------------------------------------------------------------
@@ -406,7 +495,9 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
   Int_t nClusters     = 0;
   Int_t nHits         = 0;
 
-  //  #pragma omp parallel for schedule(static) if(fParallelism_enabled)
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
   for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
     fModuleIndex[it]->Reset();
   }
@@ -450,7 +541,9 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
 
   // --- Execute reconstruction in the modules
   fTimer.Start();
-  //  #pragma omp parallel for schedule(static) if(fParallelism_enabled)
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
   for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
     assert(fModuleIndex[it]);
     fModuleIndex[it]->Reconstruct();
@@ -458,7 +551,6 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
   fTimer.Stop();
   Double_t time3 = fTimer.RealTime();  // Time for reconstruction
 
-
   // --- Collect clusters and hits from modules
   // Here, the hits (and optionally clusters) are copied to the
   // TClonesArrays in the ROOT tree. This is surely not the last word
@@ -471,7 +563,8 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
   for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
 
     const vector<CbmStsCluster>& moduleClustersF = fModuleIndex[it]->GetClustersF();
-    offsetClustersF                              = fClusters->GetEntriesFast();
+
+    offsetClustersF = fClusters->GetEntriesFast();
     for (auto& cluster : moduleClustersF) {
       UInt_t index = fClusters->GetEntriesFast();
       new ((*fClusters)[index]) CbmStsCluster(cluster);
@@ -480,7 +573,8 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
     }  //# front-side clusters in module
 
     const vector<CbmStsCluster>& moduleClustersB = fModuleIndex[it]->GetClustersB();
-    offsetClustersB                              = fClusters->GetEntriesFast();
+
+    offsetClustersB = fClusters->GetEntriesFast();
     for (auto& cluster : moduleClustersB) {
       UInt_t index = fClusters->GetEntriesFast();
       new ((*fClusters)[index]) CbmStsCluster(cluster);
@@ -528,6 +622,14 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
 }
 // -------------------------------------------------------------------------
 
+void CbmRecoSts::ProcessDataGpu()
+{
+  if (fMode == kCbmRecoEvent) throw std::runtime_error("STS GPU Reco does not yet support event-by-event mode.");
+
+  fGpuReco.RunHitFinder();
+  fGpuReco.ForwardClustersAndHits(fClusters, fHits);
+}
+
 
 // -----   Connect parameter container   -----------------------------------
 void CbmRecoSts::SetParContainers()
@@ -539,3 +641,36 @@ void CbmRecoSts::SetParContainers()
   fParSetCond       = dynamic_cast<CbmStsParSetSensorCond*>(db->getContainer("CbmStsParSetSensorCond"));
 }
 // -------------------------------------------------------------------------
+
+void CbmRecoSts::DumpNewHits()
+{
+  std::ofstream out {"newHits.csv"};
+
+  out << "module, x, y, z, deltaX, deltaY, deltaZ, deltaXY, time, timeError, deltaU, deltaV" << std::endl;
+
+  for (size_t m = 0; m < fModuleIndex.size(); m++) {
+    auto hits = fGpuReco.GetHits(m);
+
+    for (const auto& h : hits) {
+      out << m << ", " << h.fX << ", " << h.fY << ", " << h.fZ << ", " << h.fDx << ", " << h.fDy << ", " << h.fDz
+          << ", " << h.fDxy << ", " << h.fTime << ", " << h.fTimeError << ", " << h.fDu << ", " << h.fDv << std::endl;
+    }
+  }
+}
+
+void CbmRecoSts::DumpOldHits()
+{
+  std::ofstream out {"oldHits.csv"};
+
+  out << "module, x, y, z, deltaX, deltaY, deltaZ, deltaXY, time, timeError, deltaU, deltaV" << std::endl;
+
+  for (size_t m = 0; m < fModuleIndex.size(); m++) {
+    const auto& hits = fModuleIndex[m]->GetHits();
+
+    for (const auto& h : hits) {
+      out << m << ", " << h.GetX() << ", " << h.GetY() << ", " << h.GetZ() << ", " << h.GetDx() << ", " << h.GetDy()
+          << ", " << h.GetDz() << ", " << h.GetDxy() << ", " << h.GetTime() << ", " << h.GetTimeError() << ", "
+          << h.GetDu() << ", " << h.GetDv() << std::endl;
+    }
+  }
+}
diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h
index 15b5a804cd72927464dbb9f8384677d8b3b51fda..f11af70cdbbccd3423c9c9b0a89e60599cad18c7 100644
--- a/reco/detectors/sts/CbmRecoSts.h
+++ b/reco/detectors/sts/CbmRecoSts.h
@@ -11,12 +11,12 @@
 #ifndef CBMRECOSTS_H
 #define CBMRECOSTS_H 1
 
+#include "CbmGpuRecoSts.h"
 
 #include <FairTask.h>
 
 #include <TStopwatch.h>
 
-
 class CbmDigiManager;
 class CbmEvent;
 class CbmStsElement;
@@ -64,7 +64,7 @@ class CbmRecoSts : public FairTask {
 
 public:
   /** @brief Constructor **/
-  CbmRecoSts(ECbmRecoMode mode = kCbmRecoTimeslice, Bool_t writeClusters = kFALSE, Bool_t runParallel = kFALSE);
+  CbmRecoSts(ECbmRecoMode mode = kCbmRecoTimeslice, Bool_t writeClusters = kFALSE);
 
 
   /** @brief Copy constructor (disabled) **/
@@ -117,6 +117,8 @@ public:
      **/
   void SetMode(ECbmRecoMode mode) { fMode = mode; }
 
+  void SetUseGpuReco(bool useGpu) { fUseGpuReco = useGpu; }
+
 
   /** @brief Define the needed parameter containers **/
   virtual void SetParContainers();
@@ -254,6 +256,12 @@ private:
      **/
   void ProcessData(CbmEvent* event = nullptr);
 
+  void ProcessDataGpu();
+
+  void DumpNewHits();
+
+  void DumpOldHits();
+
 
 private:
   // --- I/O
@@ -287,7 +295,6 @@ private:
   Double_t fTimeCutClustersSig = 4.;             ///< Time cut for hit finding
   Double_t fTimeCutClustersAbs = -1.;            ///< Time cut for hit finding [ns]
   Bool_t fWriteClusters        = kFALSE;         ///< Write clusters to tree
-  Bool_t fRunParallel          = kFALSE;         ///< Use OpenMP multithreading
 
   // --- Timeslice counters
   Long64_t fNofDigis        = 0;   ///< Total number of digis processed
@@ -321,6 +328,8 @@ private:
   std::map<UInt_t, CbmStsRecoModule*> fModules {};  //!
   std::vector<CbmStsRecoModule*> fModuleIndex {};   //!
 
+  bool fUseGpuReco = false;
+  ::experimental::CbmGpuRecoSts fGpuReco;
 
   ClassDef(CbmRecoSts, 1);
 };
diff --git a/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx b/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx
index 5bdd47aae849585d5232344a08fd45b9cd4218ce..5801e08b0689fb13bf992fd450292b1cf57a6d32 100644
--- a/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx
+++ b/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx
@@ -19,7 +19,6 @@
 
 using std::unique_ptr;
 
-
 ClassImp(CbmStsAlgoAnaCluster)
 
 
@@ -40,6 +39,10 @@ void CbmStsAlgoAnaCluster::AnaSize1(CbmStsCluster& cluster, const CbmStsParModul
   Int_t index            = cluster.GetDigi(0);
   const CbmStsDigi* digi = fDigiMan->Get<CbmStsDigi>(index);
   assert(digi);
+  // printf("BEGIN CLUSTER CALC 1\n");
+  // printf("Digi %d: channel=%d, time=%d, charge=%d\n", index, digi->GetChannel(), digi->GetTimeU32(), digi->GetChargeU16());
+  // printf("END CLUSTER CALC\n");
+
   auto& asic         = module->GetParAsic(digi->GetChannel());
   Double_t x         = Double_t(digi->GetChannel());
   Double_t time      = digi->GetTime();
@@ -118,6 +121,7 @@ void CbmStsAlgoAnaCluster::AnaSize2(CbmStsCluster& cluster, const CbmStsParModul
   }
   Double_t xError = TMath::Sqrt(ex0sq + ex1sq + ex2sq);
 
+
   // Cluster charge
   Double_t charge = q1 + q2;
 
@@ -147,10 +151,12 @@ void CbmStsAlgoAnaCluster::AnaSizeN(CbmStsCluster& cluster, const CbmStsParModul
 
     Int_t index            = cluster.GetDigi(iDigi);
     const CbmStsDigi* digi = fDigiMan->Get<CbmStsDigi>(index);
+
     assert(digi);
     Int_t channel = digi->GetChannel();
     auto& asic    = modPar->GetParAsic(channel);
 
+
     // --- Uncertainties of the charge measurements
     Double_t eNoiseSq     = asic.GetNoise() * asic.GetNoise();
     Double_t chargePerAdc = asic.GetDynRange() / Double_t(asic.GetNofAdc());
@@ -183,6 +189,7 @@ void CbmStsAlgoAnaCluster::AnaSizeN(CbmStsCluster& cluster, const CbmStsParModul
 
   }  //# digis in cluster
 
+
   // Periodic channel position for clusters round the edge
   if (chanF > chanL) chanF -= modPar->GetNofChannels() / 2;
 
diff --git a/reco/detectors/sts/CbmStsAlgoFindHits.cxx b/reco/detectors/sts/CbmStsAlgoFindHits.cxx
index 9e1a4d88a27e440d0550da1ae662f3f579173894..5f2634b96fcb41a7cf1b75d0b3c48daddc1d8f9a 100644
--- a/reco/detectors/sts/CbmStsAlgoFindHits.cxx
+++ b/reco/detectors/sts/CbmStsAlgoFindHits.cxx
@@ -23,7 +23,6 @@
 using std::pair;
 using std::vector;
 
-
 // -----   Constructor   ---------------------------------------------------
 CbmStsAlgoFindHits::CbmStsAlgoFindHits() {}
 // -------------------------------------------------------------------------
@@ -124,6 +123,7 @@ Long64_t CbmStsAlgoFindHits::Exec(const vector<CbmStsCluster>& clustersF, const
       const CbmStsCluster& clusterB = clustersB[iClusterB];
 
       Double_t timeDiff = tF - clusterB.GetTime();
+
       if ((timeDiff > 0) && (timeDiff > max_sigma_both)) {
         startB++;
         continue;
diff --git a/reco/detectors/sts/CbmStsFindTracks.cxx b/reco/detectors/sts/CbmStsFindTracks.cxx
index 0e4610fc65f78b44d3b82313ff3887061dd3bed4..638be437e6f8cd32c3b548c937c05aaf372c8432 100644
--- a/reco/detectors/sts/CbmStsFindTracks.cxx
+++ b/reco/detectors/sts/CbmStsFindTracks.cxx
@@ -178,8 +178,8 @@ InitStatus CbmStsFindTracks::Init()
     else if (fVerbose >  2) fDigiScheme->Print(kTRUE);
     cout << "-I- "
 	 << "STS digitisation scheme succesfully initialised" << endl;
-    cout << "    Stations: " << fDigiScheme->GetNStations() 
-	 << ", Sectors: " << fDigiScheme->GetNSectors() << ", Channels: " 
+    cout << "    Stations: " << fDigiScheme->GetNStations()
+	 << ", Sectors: " << fDigiScheme->GetNSectors() << ", Channels: "
 	 << fDigiScheme->GetNChannels() << endl;
   }
 	 */
diff --git a/reco/detectors/sts/CbmStsRecoModule.cxx b/reco/detectors/sts/CbmStsRecoModule.cxx
index a0c2173258e71bb922246d40f6f2cc4d906ac695..fa95b5ea56d6cf12bb04a17b8a1f6384407e386b 100644
--- a/reco/detectors/sts/CbmStsRecoModule.cxx
+++ b/reco/detectors/sts/CbmStsRecoModule.cxx
@@ -25,10 +25,10 @@
 #include <TGeoBBox.h>
 #include <TGeoPhysicalNode.h>
 #include <TMath.h>
+#include <TStopwatch.h>
 
 using std::pair;
 
-
 ClassImp(CbmStsRecoModule)
 
 
@@ -77,6 +77,10 @@ void CbmStsRecoModule::AddDigiToQueue(const CbmStsDigi* digi, Int_t digiIndex)
 void CbmStsRecoModule::Reconstruct()
 {
 
+  // return;
+  TStopwatch timer;
+
+  timer.Start();
   // --- Sort the digi queues by digi time stamp
   std::sort(fDigisF.begin(), fDigisF.end(),
             [](pair<const CbmStsDigi*, Int_t> digi1, pair<const CbmStsDigi*, Int_t> digi2) {
@@ -86,8 +90,11 @@ void CbmStsRecoModule::Reconstruct()
             [](pair<const CbmStsDigi*, Int_t> digi1, pair<const CbmStsDigi*, Int_t> digi2) {
               return digi1.first->GetTime() < digi2.first->GetTime();
             });
+  timer.Stop();
+  fTimings.timeSortDigi = timer.RealTime();
 
   // --- Perform cluster finding
+  timer.Start();
   fClusterFinder->Exec(fDigisF, fClustersF, fSetupModule->GetAddress(), fNofStripsF, 0, fTimeCutDigisSig,
                        fTimeCutDigisAbs, fConnectEdgeFront, fParModule);
   fClusterFinder->Exec(fDigisB, fClustersB, fSetupModule->GetAddress(), fNofStripsB, fNofStripsF, fTimeCutDigisSig,
@@ -99,15 +106,22 @@ void CbmStsRecoModule::Reconstruct()
   for (auto& cluster : fClustersB)
     fClusterAna->Exec(cluster, fParModule);
 
+  timer.Stop();
+  fTimings.timeCluster = timer.RealTime();
+
   // --- Sort clusters by time
+  timer.Start();
   std::sort(fClustersF.begin(), fClustersF.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) {
     return (cluster1.GetTime() < cluster2.GetTime());
   });
   std::sort(fClustersB.begin(), fClustersB.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) {
     return (cluster1.GetTime() < cluster2.GetTime());
   });
+  timer.Stop();
+  fTimings.timeSortCluster = timer.RealTime();
 
   // --- Perform hit finding
+  timer.Start();
   if (fHitFinder)
     fHitFinder->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig,
                      fTimeCutClustersAbs, fDyActive, fNofStripsF, fStripPitchF, fStereoFront, fStereoBack,
@@ -116,6 +130,8 @@ void CbmStsRecoModule::Reconstruct()
     fHitFinderOrtho->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig,
                           fTimeCutClustersAbs, fNofStripsF, fNofStripsB, fStripPitchF, fStripPitchB, fLorentzShiftF,
                           fLorentzShiftB, fMatrix);
+  timer.Stop();
+  fTimings.timeHits = timer.RealTime();
 }
 // -------------------------------------------------------------------------
 
diff --git a/reco/detectors/sts/CbmStsRecoModule.h b/reco/detectors/sts/CbmStsRecoModule.h
index 7a591b329a0cafbee8e326d05874b9a1d3566de2..261ec35c21ccde53f7566bbef7cefeb7d3499992 100644
--- a/reco/detectors/sts/CbmStsRecoModule.h
+++ b/reco/detectors/sts/CbmStsRecoModule.h
@@ -51,6 +51,13 @@ class CbmStsParSensorCond;
 class CbmStsRecoModule : public TNamed {
 
 public:
+  struct Timings {
+    double timeSortDigi    = 0;
+    double timeCluster     = 0;
+    double timeSortCluster = 0;
+    double timeHits        = 0;
+  };
+
   /** @brief Default constructor **/
   CbmStsRecoModule();
 
@@ -105,6 +112,10 @@ public:
      **/
   const std::vector<CbmStsHit>& GetHits() const { return fHits; }
 
+  /** @brief Time measurements
+   **/
+  Timings GetTimings() const { return fTimings; }
+
 
   /** @brief Perform reconstruction **/
   void Reconstruct();
@@ -114,6 +125,9 @@ public:
   void Reset();
 
 
+  TGeoHMatrix* getMatrix() { return fMatrix; }
+
+
   /** @brief Info to string **/
   std::string ToString() const;
 
@@ -201,6 +215,9 @@ private:
   Bool_t fConnectEdgeFront    = kFALSE;  ///< Round-the edge clustering front side
   Bool_t fConnectEdgeBack     = kFALSE;  ///< Round-the edge clustering back side
 
+  // --- Time measurement
+  Timings fTimings;
+
 
   ClassDef(CbmStsRecoModule, 1);
 };
diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx b/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..923cb2ce6d758f237feed2e9b656bd1d9077e420
--- /dev/null
+++ b/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx
@@ -0,0 +1,404 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+#include "CbmGpuRecoSts.h"
+
+#include "CbmAddress.h"
+#include "CbmDigiManager.h"
+#include "CbmStsAddress.h"
+#include "CbmStsCluster.h"
+#include "CbmStsGpuRecoDevice.h"
+#include "CbmStsHit.h"
+#include "CbmStsPhysics.h"
+#include "CbmStsRecoModule.h"
+
+#include <TCanvas.h>
+#include <TClonesArray.h>
+#include <TGeoMatrix.h>
+#include <TH1D.h>
+#include <TH2D.h>
+#include <TStopwatch.h>
+
+#include <algorithm>
+#include <cassert>
+#include <unordered_set>
+#include <vector>
+
+using namespace experimental;
+
+void CbmGpuRecoSts::SetupConstants()
+{
+  assert(fConfig != std::nullopt);
+  assert(fConfig->recoModules.size() == fConfig->parModules.size());
+  assert(fConfig->moduleAddrs.size() == fConfig->parModules.size());
+  assert(fConfig->hitfinderCfg.size() == fConfig->parModules.size());
+
+  size_t nModules = fConfig->parModules.size();
+
+  auto& hfc = fHitfinderCpu;
+  auto& hfg = fHitfinderGpu;
+
+  LOG(info) << "STS GPU RECO: setting constants";
+
+  size_t nChannels     = fConfig->parModules[0].GetNofChannels();
+  ::CbmStsParAsic asic = fConfig->parModules[0].GetParAsic(0);  // TODO values are currently identical for all asics??
+  // for (auto &mod : modules) {
+  //     assert(nChannels == mod.GetNofChannels());
+  //     for (size_t c = 0; c < nChannels; c++) {
+  //         ::CbmStsParAsic otherAsic = mod.GetParAsic(c);
+  //         assert(asic.GetNofAdc() == otherAsic.GetNofAdc());
+  //         assert(asic.GetDynRange() == otherAsic.GetDynRange());
+  //         assert(asic.GetThreshold() == otherAsic.GetThreshold());
+  //         assert(asic.GetTimeResol() == otherAsic.GetTimeResol());
+  //         assert(asic.GetDeadTime() == otherAsic.GetDeadTime());
+  //         assert(asic.GetNoise() == otherAsic.GetNoise());
+  //         assert(asic.GetZeroNoiseRate() == otherAsic.GetZeroNoiseRate());
+  //     }
+  // }
+
+  nChannels /= 2;
+  hfc.nModules = hfg.nModules = nModules;
+  hfc.nChannels = hfg.nChannels = nChannels;
+
+  hfc.timeCutDigiAbs = hfg.timeCutDigiAbs = -1.f;
+  hfc.timeCutDigiSig = hfg.timeCutDigiSig = 3.f;
+  hfc.timeCutClusterAbs = hfg.timeCutClusterAbs = -1.f;
+  hfc.timeCutClusterSig = hfg.timeCutClusterSig = 4.f;
+
+  hfc.asic.nAdc           = asic.GetNofAdc();
+  hfc.asic.dynRange       = asic.GetDynRange();
+  hfc.asic.threshold      = asic.GetThreshold();
+  hfc.asic.timeResolution = asic.GetTimeResol();
+  hfc.asic.deadTime       = asic.GetDeadTime();
+  hfc.asic.noise          = asic.GetNoise();
+  hfc.asic.zeroNoiseRate  = asic.GetZeroNoiseRate();
+  hfg.asic                = hfc.asic;
+}
+
+void CbmGpuRecoSts::Setup(size_t maxDigisPerModule, size_t nDigitsTotal)
+{
+  assert(fConfig != std::nullopt);
+
+  // static constexpr size_t maxDigisPerModule    = 60000;
+  // static constexpr size_t maxClustersPerModule = 30000;
+  // static constexpr size_t maxHitsPerModule     = 120000;
+
+  const size_t maxClustersPerModule = maxDigisPerModule * 0.5;
+  const size_t maxHitsPerModule     = maxClustersPerModule * 4;
+
+  size_t nModules  = fConfig->parModules.size();
+  size_t nChannels = fConfig->parModules[0].GetNofChannels() / 2;
+
+
+  size_t nModulesTotal = nModules * 2;  // Front- and backside
+
+  auto& hfc = fHitfinderCpu;
+  auto& hfg = fHitfinderGpu;
+
+  auto landauTable    = fConfig->physics->GetLandauWidthTable();
+  hfc.landauTableSize = hfg.landauTableSize = landauTable.size();
+  hfc.landauTable                           = xpu::hd_buffer<float> {size_t(hfc.landauTableSize)};
+  hfg.landauTable                           = hfc.landauTable.d();
+  auto landauEntry                          = landauTable.begin();
+  auto prevLandauEntry                      = landauEntry;
+  hfc.landauTable.h()[0]                    = landauEntry->second;
+  landauEntry++;
+  hfc.landauStepSize = hfg.landauStepSize = landauEntry->first - prevLandauEntry->first;
+  for (size_t i = 1; landauEntry != landauTable.end(); i++, landauEntry++) {
+    assert(hfc.landauStepSize == landauEntry->first - prevLandauEntry->first);
+    assert(i < hfc.landauTable.size());
+    hfc.landauTable.h()[i] = landauEntry->second;
+    prevLandauEntry        = landauEntry;
+  }
+
+  LOG(info) << "STS GPU RECO: finished landau";
+
+  hfc.localToGlobalTranslationByModule = xpu::hd_buffer<float> {nModules * 3};
+  hfg.localToGlobalTranslationByModule = hfc.localToGlobalTranslationByModule.d();
+  hfc.localToGlobalRotationByModule    = xpu::hd_buffer<float> {nModules * 9};
+  hfg.localToGlobalRotationByModule    = hfc.localToGlobalRotationByModule.d();
+  for (size_t m = 0; m < fConfig->recoModules.size(); m++) {
+    auto& recoMod       = fConfig->recoModules[m];
+    TGeoHMatrix* matrix = recoMod->getMatrix();
+
+    const double* rot = matrix->GetRotationMatrix();
+    float* tgt        = &hfc.localToGlobalRotationByModule.h()[m * 9];
+    std::copy_n(rot, 9, tgt);
+
+    const double* tr = matrix->GetTranslation();
+    tgt              = &hfc.localToGlobalTranslationByModule.h()[m * 3];
+    std::copy_n(tr, 3, tgt);
+  }
+
+  LOG(info) << "STS GPU RECO: finished transformation matrix";
+
+  hfc.hitfinderConfig = xpu::hd_buffer<CbmStsHitFinderConfig>(nModules);
+  hfg.hitfinderConfig = hfc.hitfinderConfig.d();
+
+  std::copy_n(fConfig->hitfinderCfg.begin(), nModules, hfc.hitfinderConfig.h());
+
+  LOG(info) << "STS GPU RECO: finished setting hitfinder config";
+
+  // FIXME: allocate by total number of digis instead. This array is flat.
+  hfc.digiOffsetPerModule    = xpu::hd_buffer<size_t> {nModulesTotal + 1};
+  hfg.digiOffsetPerModule    = hfc.digiOffsetPerModule.d();
+  hfc.digisPerModule         = xpu::hd_buffer<CbmStsDigi> {nDigitsTotal};
+  hfg.digisPerModule         = hfc.digisPerModule.d();
+  hfc.digisPerModuleTmp      = xpu::d_buffer<CbmStsDigi> {nDigitsTotal};
+  hfg.digisPerModuleTmp      = hfc.digisPerModuleTmp.d();
+  hfc.digisSortedPerModule   = xpu::d_buffer<CbmStsDigi*> {nModulesTotal};
+  hfg.digisSortedPerModule   = hfc.digisSortedPerModule.d();
+  hfc.digiConnectorPerModule = xpu::d_buffer<CbmStsDigiConnector> {nDigitsTotal};
+  hfg.digiConnectorPerModule = hfc.digiConnectorPerModule.d();
+
+  hfc.channelOffsetPerModule = xpu::d_buffer<unsigned int> {nModulesTotal * nChannels};
+  hfg.channelOffsetPerModule = hfc.channelOffsetPerModule.d();
+
+  hfc.maxErrorFront = xpu::d_buffer<float> {1};
+  hfg.maxErrorFront = hfc.maxErrorFront.d();
+  hfc.maxErrorBack  = xpu::d_buffer<float> {1};
+  hfg.maxErrorBack  = hfc.maxErrorBack.d();
+
+  LOG(info) << "STS GPU RECO: finished digis";
+
+  hfc.maxClustersPerModule = hfg.maxClustersPerModule = maxClustersPerModule;
+  hfc.channelStatusPerModule                          = xpu::d_buffer<int> {nChannels * nModulesTotal};
+  hfg.channelStatusPerModule                          = hfc.channelStatusPerModule.d();
+  hfc.clusterIdxPerModule       = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal};
+  hfg.clusterIdxPerModule       = hfc.clusterIdxPerModule.d();
+  hfc.clusterIdxPerModuleTmp    = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal};
+  hfg.clusterIdxPerModuleTmp    = hfc.clusterIdxPerModuleTmp.d();
+  hfc.clusterIdxSortedPerModule = xpu::hd_buffer<CbmStsClusterIdx*> {nModulesTotal};
+  hfg.clusterIdxSortedPerModule = hfc.clusterIdxSortedPerModule.d();
+  hfc.clusterDataPerModule      = xpu::hd_buffer<CbmStsClusterData> {maxClustersPerModule * nModulesTotal};
+  hfg.clusterDataPerModule      = hfc.clusterDataPerModule.d();
+  hfc.nClustersPerModule        = xpu::hd_buffer<int> {nModulesTotal};
+  hfg.nClustersPerModule        = hfc.nClustersPerModule.d();
+
+  LOG(info) << "STS GPU RECO: finished cluster";
+
+  hfc.maxHitsPerModule = hfg.maxHitsPerModule = maxHitsPerModule;
+  hfc.hitsPerModule                           = xpu::hd_buffer<CbmStsHit>(maxHitsPerModule * nModules);
+  hfg.hitsPerModule                           = hfc.hitsPerModule.d();
+  hfc.nHitsPerModule                          = xpu::hd_buffer<int>(nModules);
+  hfg.nHitsPerModule                          = hfc.nHitsPerModule.d();
+
+  LOG(info) << "STS GPU RECO: finished setup";
+  LOG(info) << "- nModules = " << nModules;
+  LOG(info) << "- nChannels = " << nChannels;
+}
+
+void CbmGpuRecoSts::RunHitFinder()
+{
+  setenv("XPU_PROFILE", "1", 1);
+  xpu::initialize();
+
+  auto& hfc = fHitfinderCpu;
+
+  // ROCm bug: Mi100 name is an emtpy string...
+  auto deviceName = (xpu::device_properties().name.empty()) ? "AMD Mi100" : xpu::device_properties().name;
+
+  LOG(info) << "STS GPU RECO: running GPU hit finder on device " << deviceName;
+  LOG(info) << "STS GPU RECO: Allocate and fetch digis";
+
+
+  // FIXME: This is ugly, setup has to be split in two parts, constants required by FetchDigis
+  // And FetchDigis required by Setup...
+  SetupConstants();
+  size_t maxDigisPerModule = 0;
+  size_t nDigisTotal       = 0;
+  FetchDigis(maxDigisPerModule, nDigisTotal);
+  Setup(maxDigisPerModule, nDigisTotal);
+
+  LOG(info) << "STS GPU RECO: finished digis / running clusterizer now";
+  LOG(info) << "STS GPU RECO: largest module has " << maxDigisPerModule << " digis";
+
+  // Not all buffers have to initialized, but useful for debugging
+  // xpu::memset(hfc.digisPerModule, 0);
+  // xpu::memset(hfc.digisPerModuleTmp, 0);
+  // xpu::memset(hfc.digisSortedPerModule, 0);
+  // xpu::memset(hfc.digiOffsetPerModule, 0);
+  xpu::memset(hfc.digiConnectorPerModule, 0);
+  // xpu::memset(hfc.channelOffsetPerModule, 0);
+  xpu::memset(hfc.channelStatusPerModule, -1);
+  xpu::memset(hfc.clusterIdxPerModule, 0);
+  // xpu::memset(hfc.clusterIdxPerModuleTmp, 0);
+  // xpu::memset(hfc.clusterIdxSortedPerModule, 0);
+  // xpu::memset(hfc.clusterDataPerModule, 0);
+  xpu::memset(hfc.nClustersPerModule, 0);
+  // xpu::memset(hfc.hitsPerModule, 0);
+  xpu::memset(hfc.nHitsPerModule, 0);
+  xpu::memset(hfc.maxErrorFront, 0);
+  xpu::memset(hfc.maxErrorBack, 0);
+
+  hfc.digiOffsetPerModule.h()[0] = 0;
+  for (int m = 0; m < hfc.nModules; m++) {
+    size_t offset = hfc.digiOffsetPerModule.h()[m];
+    auto& md      = fDigisByModuleF[fConfig->moduleAddrs[m]];
+    std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]);
+    hfc.digiOffsetPerModule.h()[m + 1] = offset + md.size();
+  }
+  for (int m = 0; m < hfc.nModules; m++) {
+    size_t offset = hfc.digiOffsetPerModule.h()[m + hfc.nModules];
+    auto& md      = fDigisByModuleB[fConfig->moduleAddrs[m]];
+    std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]);
+    hfc.digiOffsetPerModule.h()[m + hfc.nModules + 1] = offset + md.size();
+  }
+  assert(nDigisTotal == hfc.digiOffsetPerModule.h()[2 * hfc.nModules]);
+
+  TStopwatch timer;
+
+  // constants only need to be transferred once. should be excluded from time measurement
+  xpu::copy(hfc.landauTable, xpu::host_to_device);
+  xpu::copy(hfc.hitfinderConfig, xpu::host_to_device);
+  xpu::copy(hfc.localToGlobalRotationByModule, xpu::host_to_device);
+  xpu::copy(hfc.localToGlobalTranslationByModule, xpu::host_to_device);
+
+  xpu::set_constant<HitFinder>(fHitfinderGpu);
+  xpu::copy(hfc.digisPerModule, xpu::host_to_device);
+  xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device);
+
+  xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(hfc.nModules * 2));
+  xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // Run cluster finding steps in indivual kernels, useful for debugging / profiling
+  // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
+  xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
+  xpu::run_kernel<FindHits>(xpu::grid::n_blocks(hfc.nModules));
+
+  // Check if profiling is enabled
+  if (not xpu::get_timing<SortDigis>().empty()) {
+    timeSortDigi    = xpu::get_timing<SortDigis>().back();
+    timeCluster     = xpu::get_timing<FindClustersSingleStep>().back();
+    timeSortCluster = xpu::get_timing<SortClusters>().back();
+    timeHits        = xpu::get_timing<FindHits>().back();
+  }
+
+  xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host);
+  xpu::copy(hfc.nClustersPerModule, xpu::device_to_host);
+  xpu::copy(hfc.hitsPerModule, xpu::device_to_host);
+  xpu::copy(hfc.nHitsPerModule, xpu::device_to_host);
+
+
+  size_t mostCluster = 0, mostHits = 0;
+  for (int m = 0; m < hfc.nModules; m++) {
+    mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m], mostCluster);
+    mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m + hfc.nModules], mostCluster);
+    mostHits    = std::max<size_t>(hfc.nHitsPerModule.h()[m], mostHits);
+
+    nCluster += hfc.nClustersPerModule.h()[m];
+    nCluster += hfc.nClustersPerModule.h()[m + hfc.nModules];
+
+    nHits += hfc.nHitsPerModule.h()[m];
+  }
+
+  LOG(info) << "STS GPU RECO: finished hitfinder";
+}
+
+void CbmGpuRecoSts::ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits)
+{
+  auto& hfc = fHitfinderCpu;
+
+  for (int module = 0; module < hfc.nModules; module++) {
+
+    auto* gpuClusterF    = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule];
+    auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule];
+    int nClustersFGpu    = hfc.nClustersPerModule.h()[module];
+
+    for (int i = 0; i < nClustersFGpu; i++) {
+      auto& cidx       = gpuClusterIdxF[i];
+      auto& clusterGpu = gpuClusterF[cidx.fIdx];
+
+      unsigned int outIdx = clusters->GetEntriesFast();
+      auto* clusterOut    = new ((*clusters)[outIdx])::CbmStsCluster {};
+
+      clusterOut->SetAddress(fConfig->moduleAddrs[module]);
+      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
+                                clusterGpu.fTimeError);
+      clusterOut->SetSize(clusterGpu.fSize);
+    }
+
+    auto* gpuClusterB    = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
+    auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
+    int nClustersBGpu    = hfc.nClustersPerModule.h()[module + hfc.nModules];
+
+    for (int i = 0; i < nClustersBGpu; i++) {
+      auto& cidx          = gpuClusterIdxB[i];
+      auto& clusterGpu    = gpuClusterB[cidx.fIdx];
+      unsigned int outIdx = clusters->GetEntriesFast();
+      auto* clusterOut    = new ((*clusters)[outIdx])::CbmStsCluster {};
+
+      clusterOut->SetAddress(fConfig->moduleAddrs[module]);
+      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
+                                clusterGpu.fTimeError);
+      clusterOut->SetSize(clusterGpu.fSize);
+    }
+
+    auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
+    int nHitsGpu  = hfc.nHitsPerModule.h()[module];
+
+    for (int i = 0; i < nHitsGpu; i++) {
+      auto& hitGpu = gpuHits[i];
+
+      unsigned int outIdx = hits->GetEntriesFast();
+      new ((*hits)[outIdx])::CbmStsHit {fConfig->moduleAddrs[module],
+                                        TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ},
+                                        TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz},
+                                        hitGpu.fDxy,
+                                        0,
+                                        0,
+                                        double(hitGpu.fTime),
+                                        hitGpu.fTimeError,
+                                        hitGpu.fDu,
+                                        hitGpu.fDv};
+    }
+
+  }  // for (int module = 0; module < hfc.nModules; module++)
+}
+
+void CbmGpuRecoSts::FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal)
+{
+
+  // FIXME: This function is a crime against humanity.
+  // Digis should be presorted by module already...
+
+  CbmDigiManager* digis = CbmDigiManager::Instance();
+  auto& hfc             = fHitfinderCpu;
+
+  // FIXME: GPU reco should use regular digi class too
+  nDigis      = digis->GetNofDigis(ECbmModuleId::kSts);
+  nDigisTotal = 0;
+  for (size_t i = 0; i < nDigis; i++) {
+    const ::CbmStsDigi* digi = digis->Get<::CbmStsDigi>(i);
+    Int_t systemId           = CbmAddress::GetSystemId(digi->GetAddress());
+    // TODO: is this check still needed?
+    if (systemId != ToIntegralType(ECbmModuleId::kSts)) { continue; }
+    int moduleAddr = CbmStsAddress::GetMotherAddress(digi->GetAddress(), kStsModule);
+    // int module     = addrToModuleId(moduleAddr);
+    // assert(module < hfc.nModules);
+    bool isFront = digi->GetChannel() < hfc.nChannels;
+
+    if (isFront) { fDigisByModuleF[moduleAddr].push_back(*digi); }
+    else {
+      CbmStsDigi tmpdigi = *digi;
+      tmpdigi.SetChannel(digi->GetChannel() - hfc.nChannels);
+      fDigisByModuleB[moduleAddr].push_back(tmpdigi);
+    }
+
+    nDigisTotal++;
+  }
+
+  nDigisUsed = nDigisTotal;
+
+  maxDigisPerModule = 0;
+  for (const auto& mod : fDigisByModuleF) {
+    maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size());
+  }
+
+  for (const auto& mod : fDigisByModuleB) {
+    maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size());
+  }
+}
diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.h b/reco/detectors/sts/experimental/CbmGpuRecoSts.h
new file mode 100644
index 0000000000000000000000000000000000000000..a26efdbe911d569a09e0856d433010f2828c9c20
--- /dev/null
+++ b/reco/detectors/sts/experimental/CbmGpuRecoSts.h
@@ -0,0 +1,78 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+#ifndef CBMGPURECOSTS_H
+#define CBMGPURECOSTS_H
+
+#include "CbmStsDigi.h"
+#include "CbmStsGpuHitFinder.h"
+#include "CbmStsParModule.h"
+
+#include <map>
+#include <optional>
+#include <vector>
+
+#include <xpu/host.h>
+
+class CbmStsPhysics;
+class CbmStsRecoModule;
+class TClonesArray;
+
+namespace experimental
+{
+
+  class CbmGpuRecoSts {
+
+  public:
+    struct Config {
+      std::vector<CbmStsParModule> parModules;
+      std::vector<CbmStsRecoModule*> recoModules;
+      std::vector<int> moduleAddrs;
+      std::vector<CbmStsHitFinderConfig> hitfinderCfg;
+      CbmStsPhysics* physics;
+    };
+
+    double timeIO          = 0;
+    double timeSortDigi    = 0;
+    double timeCluster     = 0;
+    double timeSortCluster = 0;
+    double timeHits        = 0;
+
+    size_t nDigis     = 0;
+    size_t nDigisUsed = 0;
+    size_t nHits      = 0;
+    size_t nCluster   = 0;
+
+    void SetConfig(const Config& cfg) { fConfig = cfg; }
+
+    void SetupConstants();
+    void Setup(size_t maxDigisPerModule, size_t nDigitsTotal);
+
+    void RunHitFinder();
+
+    void ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits);
+
+    std::vector<CbmStsHit> GetHits(int module)
+    {
+      auto& hfc = fHitfinderCpu;
+      auto* st  = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
+      auto* end = st + hfc.nHitsPerModule.h()[module];
+      return std::vector<CbmStsHit> {st, end};
+    }
+
+  private:
+    std::optional<Config> fConfig;  // Initialized late...
+
+    // std::vector<int> fModuleAddrs;
+    CbmStsHitFinder<xpu::side::host> fHitfinderCpu;
+    CbmStsGpuHitFinder fHitfinderGpu;
+
+    std::map<int, std::vector<CbmStsDigi>> fDigisByModuleF;
+    std::map<int, std::vector<CbmStsDigi>> fDigisByModuleB;
+
+    void FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal);
+  };
+
+}  // namespace experimental
+
+#endif
diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..c02877d377ddaaac4d7b718ef1e58bb34f541ebf
--- /dev/null
+++ b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx
@@ -0,0 +1,957 @@
+/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+#include "CbmStsGpuHitFinder.h"
+
+using namespace experimental;
+
+/**
+  * CbmStsGpuHitFinder::sortDigisKhun
+  * Sorts digis channelwise. Inside a channel all digis are sorted in time.
+  *
+  * @param smem is the shared memory it is worked in
+  * @param iBlock is the Block/Module that is currenty running
+  *
+*/
+XPU_D void CbmStsGpuHitFinder::SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const
+{
+  int iModule          = iBlock;
+  CbmStsDigi* digis    = &digisPerModule[digiOffsetPerModule[iModule]];
+  CbmStsDigi* digisTmp = &digisPerModuleTmp[digiOffsetPerModule[iModule]];
+  int nDigis           = GetNDigis(iModule);
+
+  SortDigisT digiSort(smem.sortBuf);
+
+  digis = digiSort.sort(digis, nDigis, digisTmp, [](const CbmStsDigi a) {
+    return ((unsigned long int) a.GetChannel()) << 32 | (unsigned long int) (a.GetTimeU32());
+  });
+
+  if (xpu::thread_idx::x() == 0) { digisSortedPerModule[iModule] = digis; }
+}
+
+XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const
+{
+  CalculateOffsetsParallel(iBlock);
+  FindClustersParallel(iBlock);
+  CalculateClustersParallel(iBlock);
+}
+
+/**
+  * CbmStsGpuHitFinder::calculateChannelOffsets
+  * Calculates the Offsest of every channel. digis-Array is sorted in Channel and Time.
+  * If a channelChange is detected it is stored in channelOffsetsPerModule.
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param channelOffsets The Array where all offsets are written to
+  * @param nDigis Amount of digis in digis-Array
+  *
+*/
+XPU_D void CbmStsGpuHitFinder::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets,
+                                                       unsigned int const nDigis) const
+{
+  channelOffsets[0] = 0;
+
+  //Parallel
+  for (unsigned int pos = xpu::thread_idx::x(); pos < nDigis - 1; pos += (unsigned int) xpu::block_dim::x()) {
+    auto const currChannel = digis[pos].GetChannel();
+    auto const nextChannel = digis[pos + 1].GetChannel();
+    if (currChannel != nextChannel) {
+      for (int i = currChannel + 1; i <= nextChannel; i++) {
+        channelOffsets[i] = pos + 1;
+      }
+    }
+    XPU_ASSERT(digis[pos].GetChannel()
+               <= digis[pos + 1].GetChannel());  //channel are supposed to be sorted increasingly
+  }
+
+  for (int c = digis[nDigis - 1].GetChannel() + 1; c < nChannels; c++) {
+    channelOffsets[c] = nDigis;
+  }
+}
+
+
+/**
+  * CbmStsGpuHitFinder::calculateOffsetsParallel
+  * This function calculates the channeloffsets in a certain module.
+  * An Offset is the Startingpoint of a Channel in a sorted array of Digis.
+  *
+  * @param iBlock is the Block/Module that is currently worked on
+  *
+*/
+XPU_D void CbmStsGpuHitFinder::CalculateOffsetsParallel(int iBlock) const
+{
+  int const iModule = iBlock;
+  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  auto const nDigis = GetNDigis(iModule);
+
+  if (nDigis == 0) return;
+
+  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  CalculateChannelOffsets(digis, channelOffsets, nDigis);
+}
+
+/**
+  * CbmStsGpuHitFinder::findClustersParallel
+  * This function finds clusters form an sts-Digi Array inserted.
+  * It runs the finding process highly parallel.
+  *
+  * @param iBlock is the Block/Module that is currently worked on
+  *
+*/
+XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const
+{
+  int const iModule           = iBlock;
+  CbmStsDigi* digis           = digisSortedPerModule[iModule];
+  auto const nDigis           = GetNDigis(iModule);
+  unsigned int const threadId = xpu::thread_idx::x();
+
+  if (nDigis == 0) return;
+
+  auto* digiConnector  = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  // findClusterConnectionsChannelWise(digis, digiConnector, channelOffsets, iModule, threadId);
+  FindClusterConnectionsDigiWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
+}
+
+
+/**
+  * CbmStsGpuHitFinder::calculateClustersParallel
+  * This function calculates clusters form an sts-Digi Array inserted.
+  * It runs the calculating process highly parallel.
+  *
+  * @param iBlock is the Block/Module that is currently worked on
+  *
+*/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const
+{
+  int const iModule  = iBlock;
+  CbmStsDigi* digis  = digisSortedPerModule[iModule];
+  auto const nDigis  = GetNDigis(iModule);
+  int const threadId = xpu::thread_idx::x();
+
+  if (nDigis == 0) { return; }
+
+  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
+  CalculateClustersDigiWise(digis, digiConnector, iModule, threadId, nDigis);
+}
+
+
+/**
+  * CbmStsGpuHitFinder::findClusterConnectionsBasic
+  *
+  * Each Thread one Channel
+  *
+  * ChannelId: 00000 11111
+  * DigiIndex: 01234 56789
+  * ThreadId:  00000 00000
+  *
+  * Finds Clusters in a basic way. One thread takes care of the whole calculation.
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
+  * @param channelOffsets The Array where all offsets are written to
+  * @param iModule The Module that the curren Block is working on
+  * @param threadId Id of the thrad currently working
+  *
+**/
+XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const
+{
+  int iModule          = iBlock;
+  int threadId         = xpu::thread_idx::x();
+  auto nDigis          = GetNDigis(iModule);
+  CbmStsDigi* digis    = digisSortedPerModule[iModule];
+  auto* digiConnector  = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  if (threadId != 0) return;
+
+  for (unsigned int currIter = 0; currIter < nDigis; currIter++) {
+    const CbmStsDigi& digi = digis[currIter];
+    uint16_t channel       = digi.GetChannel();
+
+    if (channel == nChannels - 1) break;
+
+    //DeltaCalculation
+    float const tResol = GetTimeResolution(iModule, channel);
+    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+
+    unsigned int nextChannelStart = channelOffsets[channel + 1];
+    unsigned int nextChannelEnd   = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
+
+    // printf("Comparing digis (deltaT=%f, nextChannelStart=%u, nextChannelEnd=%u, nDigis=%u):\n", deltaT,
+    //        nextChannelStart, nextChannelEnd, nDigis);
+    // printf(" channel=%d, time=%d, charge=%d\n", digi.GetChannel(), digi.GetTimeU32(), digi.GetChargeU16());
+
+    //Calculate DigiConnections
+    for (unsigned int nextIter = nextChannelStart; nextIter < nextChannelEnd; nextIter++) {
+
+      const CbmStsDigi& nextDigi = digis[nextIter];
+
+      // printf("> channel=%d, time=%d, charge=%d\n", nextDigi.GetChannel(), nextDigi.GetTimeU32(),
+      //        nextDigi.GetChargeU16());
+
+      if (digi.GetChannel() >= nextDigi.GetChannel()) continue;
+
+      if (float(digis[currIter].GetTimeU32() + deltaT) <= float(digis[nextIter].GetTimeU32())) break;
+      if (float(digis[currIter].GetTimeU32() - deltaT) >= float(digis[nextIter].GetTimeU32())) continue;
+
+      digiConnector[currIter].SetNext(nextIter);
+      digiConnector[nextIter].SetHasPrevious(true);
+      break;
+    }
+  }
+}
+
+
+/**
+  * CbmStsGpuHitFinder::findClusterConnectionsChannelWise
+  *
+  * Each Thread one Channel
+  *
+  * ChannelId: 00000 11111
+  * DigiIndex: 01234 56789
+  * ThreadId:  00000 11111
+  *
+  * Finds Clusters highly parallel in a basic way. Each thread gets the same anmount of Channels to work with.
+  * Every Thread checks all digis in its channel, if there is a pendant to the current one in the channel to its right.
+  * If there is another digi, the digi will be connected inside the digiConnector-Array.
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
+  * @param channelOffsets The Array where all offsets are written to
+  * @param iModule The Module that the curren Block is working on
+  * @param threadId Id of the thrad currently working
+  *
+**/
+XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                                 unsigned int* channelOffsets, int const iModule,
+                                                                 unsigned int const threadId) const
+{
+  // Expl: here we need to be smaller than nChannels because the last channel is the most Right.
+  for (unsigned int channel = threadId; channel < (unsigned int) (nChannels - 1); channel += xpu::block_dim::x()) {
+    unsigned int const currChannelBegin = channelOffsets[channel];      //Begin of current Channel
+    unsigned int const nextChannelBegin = channelOffsets[channel + 1];  //Begin of next (right) channel
+    auto const nextChannelEnd =
+      (channel + 2 < (unsigned int) nChannels) ? channelOffsets[channel + 2] : GetNDigis(iModule);
+    unsigned int nextIter = nextChannelBegin;
+
+    //Check if first Digi of Channel belongs to Channel
+    if (channel != digis[currChannelBegin].GetChannel()) continue;
+
+    //DeltaCalculation
+    float const tResol = GetTimeResolution(iModule, channel);
+    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+
+    //Calculate DigiConnections
+    for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) {
+      while (nextIter < nextChannelEnd
+             && ((digis[currIter].GetTimeU32() + deltaT) >= float(digis[nextIter].GetTimeU32()))) {
+        if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) { continue; }
+        if (digis[currIter].GetTimeU32() + deltaT < float(digis[nextIter].GetTimeU32())) {
+          nextIter++;
+          break;
+        }
+        if (digis[currIter].GetTimeU32() - deltaT > float(digis[nextIter].GetTimeU32())) {
+          nextIter++;
+          continue;
+        }
+
+        digiConnector[currIter].SetNext(nextIter);
+        digiConnector[nextIter].SetHasPrevious(true);
+        nextIter++;
+        break;
+      }
+    }
+  }
+}
+
+/**
+  * CbmStsGpuHitFinder::findClusterConnectionsDigiWise
+  *
+  * Each thread one Digi
+  *
+  * ChannelId: 00000 11111
+  * DigiIndex: 01234 56789
+  * ThreadId:  01234 01234
+  *
+  * Finds Clusters highly parallel in a basic way.
+  * The Threads work on Data thats stored nearby, so it's been taken advantage of the data locality.
+  * Each thread takes one digi and calculates if there is a neighbour.
+  * This is repeated until all digis have been processed
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
+  * @param channelOffsets The Array where all offsets are written to
+  * @param iModule The Module that the curren Block is working on
+  * @param threadId Id of the thrad currently working
+  * @param nDigis Amount of digis in digis-Array
+  *
+**/
+XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                              unsigned int* channelOffsets, int const iModule,
+                                                              unsigned int const threadId,
+                                                              unsigned int const nDigis) const
+{
+
+  for (unsigned int currIter = threadId; currIter < nDigis; currIter += xpu::block_dim::x()) {
+    auto const digi    = digis[currIter];
+    auto const channel = digi.GetChannel();
+
+    if (channel == nChannels - 1) break;
+
+    //DeltaCalculation
+    float const tResol = GetTimeResolution(iModule, channel);
+    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+
+    auto const nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
+    //Calculate DigiConnections
+    for (auto nextIter = channelOffsets[channel + 1]; nextIter < nextChannelEnd; nextIter++) {
+      if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) continue;
+      if (float(digis[currIter].GetTimeU32()) + deltaT < float(digis[nextIter].GetTimeU32())) break;
+      if (float(digis[currIter].GetTimeU32()) - deltaT > float(digis[nextIter].GetTimeU32())) continue;
+
+      digiConnector[currIter].SetNext(nextIter);
+      digiConnector[nextIter].SetHasPrevious(true);
+      break;
+    }
+  }
+}
+
+
+/**
+  * CbmStsGpuHitFinder::calculateClustersBasic
+  *
+  * One Thread all Digis
+  *
+  * ChannelId: 00000 11111
+  * DigiIndex: 01234 56789
+  * ThreadId:  00000 00000
+  *
+  * Calculates the Clustercharges of all found ClusterConnections.
+  * One thread walks through all Digis and looks for connections. If one Digi does not have a previous one
+  * it's a cluster start and may be the start of either a single-element-cluster, double-element-cluster
+  * or multi-element-cluster.
+  * All Other Threads are canceled
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
+  * @param iModule The Module that the curren Block is working on
+  * @param threadId Id of the thrad currently working
+  *
+**/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const
+{
+  int iModule         = iBlock;
+  int threadId        = xpu::thread_idx::x();
+  CbmStsDigi* digis   = digisSortedPerModule[iModule];
+  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+
+  if (threadId != 0) return;
+
+  for (unsigned int i = 0; i < GetNDigis(iModule); i++) {
+    if (digiConnector[i].HasPrevious()) { continue; }
+    if (!digiConnector[i].HasNext()) {
+      //if Cluster has 1 element
+      CreateClusterFromConnectors1(iModule, digis, i);
+    }
+    else if (!digiConnector[digiConnector[i].next()].HasNext()) {
+      //if Cluster has 2 elements
+      CreateClusterFromConnectors2(iModule, digis, digiConnector, i);
+    }
+    else {
+      //if Cluster has N elements
+      CreateClusterFromConnectorsN(iModule, digis, digiConnector, i);
+    }
+  }
+}
+
+
+/**
+  * CbmStsGpuHitFinder::calculateClustersChannelWise
+  *
+  * Each Thread one Channel
+  *
+  * ChannelId: 00000 11111
+  * DigiIndex: 01234 56789
+  * ThreadId:  00000 11111
+  *
+  * Calculates the Clustercharges of all found ClusterConnections.
+  * Each Thread walks through all digis of one channel and looks for connections.
+  * If one Digi does not have a previous one it's a cluster start and may be the
+  * start of either a single-element-cluster, double-element-cluster or
+  * multi-element-cluster.
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
+  * @param iModule The Module that the curren Block is working on
+  * @param threadId Id of the thrad currently working
+  *
+**/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                            unsigned int* channelOffsets, int const iModule,
+                                                            unsigned int const threadId,
+                                                            unsigned int const nDigis) const
+{
+
+  for (unsigned int channel = threadId; channel < (unsigned int) nChannels;
+       channel += (unsigned int) xpu::block_dim::x()) {
+    unsigned int const currChannelBegin = channelOffsets[channel];
+    unsigned int const nextChannelBegin =
+      (channel + 1 < (unsigned int) nChannels) ? channelOffsets[channel + 1] : nDigis;
+
+    //Check if first Digi of Channel belongs to Channel
+    if (channel != digis[currChannelBegin].GetChannel()) { continue; }
+
+    //Calculate DigiConnections
+    for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) {
+      if (digiConnector[currIter].HasPrevious()) { continue; }
+      if (!digiConnector[currIter].HasNext()) {
+        //if Cluster has 1 element
+        CreateClusterFromConnectors1(iModule, digis, currIter);
+      }
+      else if (!digiConnector[digiConnector[currIter].next()].HasNext()) {
+        //if Cluster has 2 elements
+        CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter);
+      }
+      else {
+        //if Cluster has N elements
+        CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter);
+      }
+    }
+  }
+}
+
+
+/**
+  * CbmStsGpuHitFinder::calculateClustersDigiWise
+  *
+  * Each Thread one Channel
+  *
+  * ChannelId: 00000 11111
+  * DigiIndex: 01234 56789
+  * ThreadId:  01234 56789
+  *
+  * Calculates the Clustercharges of all found ClusterConnections.
+  * Each Thread takes on digi and looks for connections. When the thread is done, it takes the next digi.
+  * If one Digi does not have a previous one it's a cluster start and may be the
+  * start of either a single-element-cluster, double-element-cluster or
+  * multi-element-cluster.
+  *
+  * @param digis All Digis that are relevant for this Block
+  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
+  * @param iModule The Module that the curren Block is working on
+  * @param threadId Id of the thrad currently working
+  *
+**/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                         int const iModule, unsigned int const threadId,
+                                                         unsigned int const nDigis) const
+{
+
+  for (unsigned int currIter = threadId; currIter < nDigis; currIter += (unsigned int) xpu::block_dim::x()) {
+
+    if (digiConnector[currIter].HasPrevious()) continue;
+
+    if (!digiConnector[currIter].HasNext()) {
+      //if Cluster has 1 element
+      CreateClusterFromConnectors1(iModule, digis, currIter);
+    }
+    else if (!digiConnector[digiConnector[currIter].next()].HasNext()) {
+      //if Cluster has 2 elements
+      CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter);
+    }
+    else {
+      //if Cluster has N elements
+      CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter);
+    }
+  }
+}
+
+
+XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
+{
+  const CbmStsDigi& digi = digis[digiIndex];
+
+  CbmStsTimeNs time = digi.GetTimeU32();
+
+  const float timeError = asic.timeResolution;
+
+  CbmStsClusterData cluster {
+    .fCharge        = asic.AdcToCharge(digi.GetChargeU16()),
+    .fSize          = 1,
+    .fPosition      = float(digi.GetChannel()) + (IsBackside(iModule) ? nChannels : 0.f),
+    .fPositionError = 1.f / xpu::sqrt(24.f),
+    .fTimeError     = timeError,
+  };
+
+  SaveMaxError(timeError, iModule);
+
+  AddCluster(iModule, time, cluster);
+}
+
+XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
+                                                            CbmStsDigiConnector* digiConnector,
+                                                            int const digiIndex) const
+{
+
+  const CbmStsDigi& digi1 = digis[digiIndex];
+  const CbmStsDigi& digi2 = digis[digiConnector[digiIndex].next()];
+
+  float eNoiseSq = asic.noise * asic.noise;
+
+  float chargePerAdc = asic.dynRange / float(asic.nAdc);
+  float eDigitSq     = chargePerAdc * chargePerAdc / 12.f;
+
+  float x1 = float(digi1.GetChannel());
+  float q1 = asic.AdcToCharge(digi1.GetChargeU16());
+  float q2 = asic.AdcToCharge(digi2.GetChargeU16());
+
+  // Periodic position for clusters round the edge
+  if (digi1.GetChannel() > digi2.GetChannel()) { x1 -= float(nChannels); }
+
+  // Uncertainties of the charge measurements
+  XErrorT width1 = LandauWidth(q1);
+  XErrorT eq1sq  = width1 * width1 + eNoiseSq + eDigitSq;
+  XErrorT width2 = LandauWidth(q2);
+  XErrorT eq2sq  = width2 * width2 + eNoiseSq + eDigitSq;
+
+  // Cluster time
+  float time      = 0.5f * (digi1.GetTimeU32() + digi2.GetTimeU32());
+  float timeError = asic.timeResolution * 0.70710678f;  // 1/sqrt(2)
+
+  // Cluster position
+  float x = x1 + 0.5f + (q2 - q1) / 3.f / xpu::max(q1, q2);
+
+  // Correct negative position for clusters around the edge
+  if (x < -0.5f) { x += float(nChannels); }
+
+  // Uncertainty on cluster position. See software note.
+  XErrorT ex0sq = 0.f;
+  XErrorT ex1sq = 0.f;
+  XErrorT ex2sq = 0.f;
+  if (q1 < q2) {
+    ex0sq = (q2 - q1) * (q2 - q1) / q2 / q2 / 72.f;
+    ex1sq = eq1sq / q2 / q2 / 9.f;
+    ex2sq = eq2sq * q1 * q1 / q2 / q2 / q2 / q2 / 9.f;
+  }
+  else {
+    ex0sq = (q2 - q1) * (q2 - q1) / q1 / q1 / 72.f;
+    ex1sq = eq1sq * q2 * q2 / q1 / q1 / q1 / q1 / 9.f;
+    ex2sq = eq2sq / q1 / q1 / 9.f;
+  }
+  XErrorT xError = xpu::sqrt(ex0sq + ex1sq + ex2sq);
+
+  // Cluster charge
+  float charge = q1 + q2;
+
+  if (IsBackside(iModule)) x += nChannels;
+
+  CbmStsClusterData cls {
+    .fCharge        = charge,
+    .fSize          = 2,
+    .fPosition      = x,
+    .fPositionError = xError,
+    .fTimeError     = timeError,
+  };
+
+  SaveMaxError(timeError, iModule);
+  AddCluster(iModule, time, cls);
+}
+
+
+XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis,
+                                                            CbmStsDigiConnector* digiConnector, int digiIndex) const
+{
+  CbmStsClusterCalculationProperties cProps;
+
+  //This Lambda calculates all charges for a single digi inside of a cluster
+  auto calculateClusterCharges = [this, &cProps, &digis, &digiConnector](int index) {
+    CbmStsDigi digi    = digis[index];
+    float eNoiseSq     = asic.noise * asic.noise;
+    float chargePerAdc = asic.dynRange / float(asic.nAdc);
+    float eDigitSq     = chargePerAdc * chargePerAdc / 12.f;
+    cProps.tResolSum += asic.timeResolution;
+    cProps.tSum += digi.GetTimeU32();
+    float charge    = asic.AdcToCharge(digi.GetChargeU16());
+    float lWidth    = LandauWidth(charge);
+    float eChargeSq = lWidth * lWidth + eNoiseSq + eDigitSq;
+
+    if (!digiConnector[index].HasPrevious()) {
+      //begin of Cluster (most left Element)
+      cProps.chanF = digi.GetChannel();
+      cProps.qF    = charge;
+      cProps.eqFsq = eChargeSq;
+    }
+    else if (!digiConnector[index].HasNext()) {
+      //end of Cluster (most right Element)
+      cProps.chanL = digi.GetChannel();
+      cProps.qL    = charge;
+      cProps.eqLsq = eChargeSq;
+    }
+    else {
+      cProps.qM += charge;
+      cProps.eqMsq += eChargeSq;
+    }
+    cProps.xSum += charge * float(digi.GetChannel());
+  };
+
+  calculateClusterCharges(digiIndex);
+
+  // Calculate ClusterCharges
+  while (digiConnector[digiIndex].HasNext()) {
+    digiIndex = digiConnector[digiIndex].next();
+    calculateClusterCharges(digiIndex);
+  }
+
+  // Periodic channel position for clusters round the edge
+  if (cProps.chanF > cProps.chanL) cProps.chanF -= nChannels;
+
+  // Cluster time and total charge
+  float nDigis    = ChanDist(cProps.chanF, cProps.chanL) + 1;
+  cProps.tSum     = cProps.tSum / nDigis;
+  float timeError = (cProps.tResolSum / nDigis) / xpu::sqrt(nDigis);
+  float qSum      = cProps.qF + cProps.qM + cProps.qL;
+
+  // Average charge in middle strips
+  cProps.qM /= (nDigis - 2.f);
+  cProps.eqMsq /= (nDigis - 2.f);
+
+  // Cluster position
+  float x = 0.5f * (float(cProps.chanF + cProps.chanL) + (cProps.qL - cProps.qF) / cProps.qM);
+
+  // Correct negative cluster position for clusters round the edge
+  if (x < -0.5f) { x += float(nChannels); }
+
+  // Cluster position error
+  float exFsq = cProps.eqFsq / cProps.qM / cProps.qM / 4.f;  // error from first charge
+  float exMsq = cProps.eqMsq * (cProps.qL - cProps.qF) * (cProps.qL - cProps.qF) / cProps.qM / cProps.qM / cProps.qM
+                / cProps.qM / 4.f;
+  float exLsq  = cProps.eqLsq / cProps.qM / cProps.qM / 4.f;
+  float xError = xpu::sqrt(exFsq + exMsq + exLsq);
+
+  // Correction for corrupt clusters
+  if (x < cProps.chanF || x > cProps.chanL) { x = cProps.xSum / qSum; }
+
+  XPU_ASSERT(x >= cProps.chanF && x <= cProps.chanL);
+  XPU_ASSERT(nDigis > 2);
+
+  if (IsBackside(iModule)) { x += nChannels; }
+
+  CbmStsClusterData cls {
+    .fCharge        = qSum,
+    .fSize          = int(nDigis),
+    .fPosition      = x,
+    .fPositionError = xError,
+    .fTimeError     = timeError,
+  };
+
+  SaveMaxError(timeError, iModule);
+  AddCluster(iModule, cProps.tSum, cls);
+}
+
+XPU_D void CbmStsGpuHitFinder::SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const
+{
+  int iModule                     = iBlock;
+  size_t offset                   = iModule * maxClustersPerModule;
+  CbmStsClusterIdx* clusterIdx    = &clusterIdxPerModule[offset];
+  CbmStsClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset];
+  int nClusters                   = nClustersPerModule[iModule];
+
+  SortClustersT clsSort(smem.sortBuf);
+  clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const CbmStsClusterIdx a) { return a.fTime; });
+
+  if (xpu::thread_idx::x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx;
+}
+
+XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const
+{
+  int iModuleF                    = iBlock;
+  int iModuleB                    = iBlock + nModules;
+  size_t offsetF                  = iModuleF * maxClustersPerModule;
+  size_t offsetB                  = iModuleB * maxClustersPerModule;
+  CbmStsClusterIdx* clusterIdxF   = clusterIdxSortedPerModule[iModuleF];
+  CbmStsClusterIdx* clusterIdxB   = clusterIdxSortedPerModule[iModuleB];
+  CbmStsClusterData* clusterDataF = &clusterDataPerModule[offsetF];
+  CbmStsClusterData* clusterDataB = &clusterDataPerModule[offsetB];
+  int nClustersF                  = nClustersPerModule[iModuleF];
+  int nClustersB                  = nClustersPerModule[iModuleB];
+
+
+  CbmStsHitFinderParams pars;
+  {
+    CbmStsHitFinderConfig cfg = hitfinderConfig[iBlock];
+
+    pars.dY         = cfg.dY;
+    pars.pitch      = cfg.pitch;
+    pars.stereoF    = cfg.stereoF;
+    pars.stereoB    = cfg.stereoB;
+    pars.lorentzF   = cfg.lorentzF;
+    pars.lorentzB   = cfg.lorentzB;
+    pars.tanStereoF = xpu::tan(pars.stereoF * xpu::deg_to_rad());
+    pars.tanStereoB = xpu::tan(pars.stereoB * xpu::deg_to_rad());
+    pars.errorFac   = 1.f / (pars.tanStereoB - pars.tanStereoF) / (pars.tanStereoB - pars.tanStereoF);
+    pars.dX         = float(nChannels) * pars.pitch;
+  }
+
+  float maxTerrF = *maxErrorFront;
+  float maxTerrB = *maxErrorBack;
+
+  float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB);
+
+  int startB = 0;
+  for (int iClusterF = xpu::thread_idx::x(); iClusterF < nClustersF; iClusterF += xpu::block_dim::x()) {
+    CbmStsClusterIdx clsIdxF   = clusterIdxF[iClusterF];
+    CbmStsClusterData clsDataF = clusterDataF[clsIdxF.fIdx];
+
+    float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrF * maxTerrF);
+
+    for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) {
+      CbmStsClusterIdx clsIdxB   = clusterIdxB[iClusterB];
+      CbmStsClusterData clsDataB = clusterDataB[clsIdxB.fIdx];
+
+      float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime);
+
+      if (timeDiff > 0 && timeDiff > maxSigmaBoth) {
+        startB++;
+        continue;
+      }
+      else if (timeDiff > 0 && timeDiff > maxSigma) {
+        continue;
+      }
+      else if (timeDiff < 0 && xpu::abs(timeDiff) > maxSigma) {
+        break;
+      }
+
+      float timeCut = -1.f;
+      if (timeCutClusterAbs > 0.f) timeCut = timeCutClusterAbs;
+      else if (timeCutClusterSig > 0.f) {
+        float eF = clsDataF.fTimeError;
+        float eB = clsDataB.fTimeError;
+        timeCut  = timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB);
+      }
+
+      if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) continue;
+
+      IntersectClusters(iBlock, pars, clsIdxF.fTime, clsDataF, clsIdxB.fTime, clsDataB);
+    }
+  }
+}
+
+XPU_D void CbmStsGpuHitFinder::IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF,
+                                                 const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
+                                                 const CbmStsClusterData& clsB) const
+{
+  // --- Calculate cluster centre position on readout edge
+
+  float xF  = GetClusterPosition(pars, clsF.fPosition, true);
+  float exF = clsF.fPositionError * pars.pitch;
+  float du  = exF * xpu::cos(xpu::deg_to_rad() * pars.stereoF);
+  float xB  = GetClusterPosition(pars, clsB.fPosition, false);
+  float exB = clsB.fPositionError * pars.pitch;
+  float dv  = exB * xpu::cos(xpu::deg_to_rad() * pars.stereoB);
+
+  // // --- Should be inside active area
+  if (xF < 0.f || xF > pars.dX || xB < 0.f || xB > pars.dX) { return; }
+
+  // // --- Calculate number of line segments due to horizontal
+  // // --- cross-connection. If x(y=0) does not fall on the bottom edge,
+  // // --- the strip is connected to the one corresponding to the line
+  // // --- with top edge coordinate xF' = xF +/- Dx. For odd combinations
+  // // --- of stereo angle and sensor dimensions, this could even happen
+  // // --- multiple times. For each of these lines, the intersection with
+  // // --- the line on the other side is calculated. If inside the active area,
+  // // --- a hit is created.
+  int nF = (xF + pars.dY * pars.tanStereoF) / pars.dX;
+  int nB = (xB + pars.dY * pars.tanStereoB) / pars.dX;
+
+  // --- If n is positive, all lines from 0 to n must be considered,
+  // --- if it is negative (phi negative), all lines from n to 0.
+  int nF1 = xpu::min(0, nF);
+  int nF2 = xpu::max(0, nF);
+  int nB1 = xpu::min(0, nB);
+  int nB2 = xpu::max(0, nB);
+
+  // --- Double loop over possible lines
+  float xC    = -1.f;
+  float yC    = -1.f;
+  float varX  = 0.f;
+  float varY  = 0.f;
+  float varXY = 0.f;
+  for (int iF = nF1; iF <= nF2; iF++) {
+    float xFi = xF - float(iF) * pars.dX;
+    for (int iB = nB1; iB <= nB2; iB++) {
+      float xBi = xB - float(iB) * pars.dX;
+
+      // --- Intersect the two lines
+      bool found = Intersect(pars, xFi, exF, xBi, exB, xC, yC, varX, varY, varXY);
+      if (not found) continue;
+
+      // --- Transform into sensor system with origin at sensor centre
+
+      xC -= 0.5f * pars.dX;
+      yC -= 0.5f * pars.dY;
+
+      CreateHit(iBlock, xC, yC, varX, varY, varXY, timeF, clsF, timeB, clsB, du, dv);
+    }
+  }
+}
+
+XPU_D float CbmStsGpuHitFinder::GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const
+{
+  // Take integer channel
+
+  int iChannel = int(centre);
+  float xDif   = centre - float(iChannel);
+
+  // Calculate corresponding strip on sensor
+
+  int iStrip = iChannel - (isFront ? 0 : nChannels);
+
+  // Re-add difference to integer channel. Convert channel to
+  // coordinate
+  float xCluster = (float(iStrip) + xDif + 0.5f) * pars.pitch;
+
+  // // Correct for Lorentz-Shift
+  // // Simplification: The correction uses only the y component of the
+  // // magnetic field. The shift is calculated using the mid-plane of the
+  // // sensor, which is not correct for tracks not traversing the entire
+  // // sensor thickness (i.e., are created or stopped somewhere in the sensor).
+  // // However, this is the best one can do in reconstruction.
+  xCluster -= (isFront ? pars.lorentzF : pars.lorentzB);
+
+  return xCluster;
+}
+
+XPU_D bool CbmStsGpuHitFinder::Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB,
+                                         float& x, float& y, float& varX, float& varY, float& varXY) const
+{
+
+  // In the coordinate system with origin at the bottom left corner,
+  // a line along the strips with coordinate x0 at the top edge is
+  // given by the function y(x) = Dy - ( x - x0 ) / tan(phi), if
+  // the stereo angle phi does not vanish. Two lines yF(x), yB(x) with top
+  // edge coordinates xF, xB intersect at
+  // x = ( tan(phiB)*xF - tan(phiF)*xB ) / (tan(phiB) - tan(phiF)
+  // y = Dy + ( xB - xF ) / ( tan(phiB) - tan(phiF) )
+  // For the case that one of the stereo angles vanish (vertical strips),
+  // the calculation of the intersection is straightforward.
+
+  // --- First check whether stereo angles are different. Else there is
+  // --- no intersection.
+  // TODO: if this is true, no hits can be found? So just do this once at the beginning?
+  if (xpu::abs(pars.stereoF - pars.stereoB) < 0.5f) return false;
+
+  // --- Now treat vertical front strips
+  if (xpu::abs(pars.stereoF) < 0.001f) {
+    x     = xF;
+    y     = pars.dY - (xF - xB) / pars.tanStereoB;
+    varX  = exF * exF;
+    varY  = (exF * exF + exB * exB) / pars.tanStereoB / pars.tanStereoB;
+    varXY = -1.f * exF * exF / pars.tanStereoB;
+    return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f);
+  }
+
+  // --- Maybe the back side has vertical strips?
+  if (xpu::abs(pars.stereoB) < 0.001f) {
+    x     = xB;
+    y     = pars.dY - (xB - xF) / pars.tanStereoF;
+    varX  = exB * exB;
+    varY  = (exF * exF + exB * exB) / pars.tanStereoF / pars.tanStereoF;
+    varXY = -1.f * exB * exB / pars.tanStereoF;
+    return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f);
+  }
+
+  // --- OK, both sides have stereo angle
+
+  x = (pars.tanStereoB * xF - pars.tanStereoF * xB) / (pars.tanStereoB - pars.tanStereoF);
+  y = pars.dY + (xB - xF) / (pars.tanStereoB - pars.tanStereoF);
+  varX =
+    pars.errorFac * (exF * exF * pars.tanStereoB * pars.tanStereoB + exB * exB * pars.tanStereoF * pars.tanStereoF);
+  varY  = pars.errorFac * (exF * exF + exB * exB);
+  varXY = -1.f * pars.errorFac * (exF * exF * pars.tanStereoB + exB * exB * pars.tanStereoF);
+
+  return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f);
+}
+
+XPU_D bool CbmStsGpuHitFinder::IsInside(const CbmStsHitFinderParams& pars, float x, float y) const
+{
+  // clang-format off
+  return x >= -pars.dX / 2.f
+      && x <= pars.dX / 2.f
+      && y >= -pars.dY / 2.f
+      && y <= pars.dY / 2.f;
+  // clang-format on
+}
+
+XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY,
+                                         CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
+                                         const CbmStsClusterData& clsB, float du, float dv) const
+{
+  // --- Transform into global coordinate system
+  float globalX, globalY, globalZ;
+  ToGlobal(iModule, xLocal, yLocal, 0.f, globalX, globalY, globalZ);
+
+  // We assume here that the local-to-global transformations is only translation
+  // plus maybe rotation upside down or front-side back. In that case, the
+  // global covariance matrix is the same as the local one.
+  float errX = xpu::sqrt(varX);
+  float errY = xpu::sqrt(varY);
+  float errZ = 0.f;
+
+  // --- Calculate hit time (average of cluster times)
+  float hitTime      = 0.5f * (float(timeF) + float(timeB));
+  float etF          = clsF.fTimeError;
+  float etB          = clsB.fTimeError;
+  float hitTimeError = 0.5f * xpu::sqrt(etF * etF + etB * etB);
+
+  CbmStsHit hit {
+    .fX         = globalX,
+    .fY         = globalY,
+    .fZ         = globalZ,
+    .fTime      = static_cast<CbmStsTimeNs>(hitTime),
+    .fDx        = errX,
+    .fDy        = errY,
+    .fDz        = errZ,
+    .fDxy       = varXY,
+    .fTimeError = hitTimeError,
+    .fDu        = du,
+    .fDv        = dv,
+  };
+
+  int idx = xpu::atomic_add_block(&nHitsPerModule[iModule], 1);
+
+  assert(size_t(idx) < maxHitsPerModule);
+
+  hitsPerModule[iModule * maxHitsPerModule + idx] = hit;
+}
+
+
+XPU_D XErrorT CbmStsGpuHitFinder::LandauWidth(float charge) const
+{
+  if (charge <= landauStepSize) return landauTable[0];
+  if (charge > landauStepSize * (landauTableSize - 1)) return landauTable[landauTableSize - 1];
+
+  int tableIdx = xpu::ceil(charge / landauStepSize);
+  XErrorT e2   = tableIdx * landauStepSize;
+  XErrorT v2   = landauTable[tableIdx];
+  tableIdx--;
+  XErrorT e1 = tableIdx * landauStepSize;
+  XErrorT v1 = landauTable[tableIdx];
+  return v1 + (charge - e1) * (v2 - v1) / (e2 - e1);
+}
+
+XPU_D void CbmStsGpuHitFinder::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy,
+                                        float& gz) const
+{
+  const float* tr  = &localToGlobalTranslationByModule[iModule * 3];
+  const float* rot = &localToGlobalRotationByModule[iModule * 9];
+
+  gx = tr[0] + lx * rot[0] + ly * rot[1] + lz * rot[2];
+  gy = tr[1] + lx * rot[3] + ly * rot[4] + lz * rot[5];
+  gz = tr[2] + lx * rot[6] + ly * rot[7] + lz * rot[8];
+}
diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h
new file mode 100644
index 0000000000000000000000000000000000000000..5708982fab38ce0117271d1d59a2db47c0e8a0a6
--- /dev/null
+++ b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h
@@ -0,0 +1,309 @@
+/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+#ifndef CBMSTSGPUHITFINDER_H
+#define CBMSTSGPUHITFINDER_H
+
+#include "CbmStsDigi.h"
+#include "CbmStsGpuRecoDevice.h"
+
+#include <xpu/device.h>
+
+// experimental namespace to avoid name collisions with original data types
+namespace experimental
+{
+  using CbmStsTimeNs = uint32_t;
+
+  using XErrorT = float;  // FIXME: remove and replace by float
+
+  struct CbmStsClusterIdx {
+    CbmStsTimeNs fTime;  ///< Cluster time (average of digi times) [ns]
+    int fIdx;
+  };
+
+  struct CbmStsClusterData {
+    float fCharge;           ///< Total charge
+    int fSize;               ///< Difference between first and last channel
+    float fPosition;         ///< Cluster centre in channel number units
+    XErrorT fPositionError;  ///< Cluster centre error (r.m.s.) in channel number units
+    float fTimeError;        ///< Error of cluster time [ns]
+  };
+
+  struct CbmStsHit {
+    float fX, fY;        ///< X, Y positions of hit [cm]
+    float fZ;            ///< Z position of hit [cm]
+    CbmStsTimeNs fTime;  ///< Hit time [ns]
+    float fDx, fDy;      ///< X, Y errors [cm]
+    float fDz;           ///< Z position error [cm]
+    float fDxy;          ///< XY correlation
+    float fTimeError;    ///< Error of hit time [ns]
+    float fDu;           ///< Error of coordinate across front-side strips [cm]
+    float fDv;           ///< Error of coordinate across back-side strips [cm]
+  };
+
+  struct CbmStsParAsic {
+    int nAdc;
+    float dynRange;
+    float threshold;
+    float timeResolution;
+    float deadTime;
+    float noise;
+    float zeroNoiseRate;
+
+    XPU_D float AdcToCharge(unsigned short adc) const
+    {
+      return threshold + dynRange / float(nAdc) * (float(adc) + 0.5f);
+    }
+  };
+
+  struct CbmStsHitFinderConfig {
+    float dY;
+    float pitch;
+    float stereoF;
+    float stereoB;
+    float lorentzF;
+    float lorentzB;
+  };
+
+  // cache of various parameters used by the hit finder
+  struct CbmStsHitFinderParams : public CbmStsHitFinderConfig {
+    float tanStereoF;
+    float tanStereoB;
+    float errorFac;
+    float dX;
+  };
+
+  using SortDigisT = xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis> {},
+                                     CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD>;
+
+  struct CbmStsSortDigiSmem {
+    typename SortDigisT::storage_t sortBuf;
+  };
+
+  using SortClustersT = xpu::block_sort<CbmStsTimeNs, CbmStsClusterIdx, xpu::block_size<SortClusters> {},
+                                        CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD>;
+
+  struct CbmStsSortClusterSmem {
+    typename SortClustersT::storage_t sortBuf;
+  };
+
+  struct CbmStsClusterCalculationProperties {
+    float tSum      = 0.;       // sum of digi times
+    int chanF       = 9999999;  // first channel in cluster
+    int chanL       = -1;       // last channel in cluster
+    float qF        = 0.f;      // charge in first channel
+    float qM        = 0.f;      // sum of charges in middle channels
+    float qL        = 0.f;      // charge in last cluster
+    float eqFsq     = 0.f;      // uncertainty of qF
+    float eqMsq     = 0.f;      // uncertainty of qMid
+    float eqLsq     = 0.f;      // uncertainty of qL
+    float tResolSum = 0.f;
+
+    float xSum = 0.f;  // weighted charge sum, used to correct corrupt clusters
+  };
+
+  class CbmStsDigiConnector {
+  private:
+    unsigned int hasPreviousAndNext;
+
+    XPU_D unsigned int UnpackNext(unsigned int val) const { return val & ~(1u << 31); }
+
+  public:
+    XPU_D bool HasPrevious() const { return (hasPreviousAndNext >> 31) & 1u; }
+
+    XPU_D void SetHasPrevious(bool hasPrevious)
+    {
+      unsigned int old          = hasPreviousAndNext;
+      unsigned int hasPreviousI = ((unsigned int) hasPrevious) << 31;
+      unsigned int assumed;
+
+      do {
+        assumed = old;
+        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI);
+      } while (old != assumed);
+    }
+
+    XPU_D unsigned int next() const { return UnpackNext(hasPreviousAndNext); }
+
+    XPU_D void SetNext(unsigned int next)
+    {
+      unsigned int old = hasPreviousAndNext;
+      unsigned int assumed;
+
+      next = xpu::min((1u << 31) - 1, next);
+
+      do {
+        assumed = old;
+        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next);
+      } while (old != assumed);
+    }
+
+    XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; }
+  };
+
+  template<xpu::side S>
+  class CbmStsHitFinder {
+
+  public:
+    // Constants
+    int nModules;
+    int nChannels;
+
+    // calibration data
+    float timeCutDigiAbs;
+    float timeCutDigiSig;
+    float timeCutClusterAbs;
+    float timeCutClusterSig;
+    CbmStsParAsic asic;
+    int landauTableSize;
+    float landauStepSize;
+    xpu::cmem_io_t<float, S> landauTable;
+
+    xpu::cmem_device_t<float, S> maxErrorFront;
+    xpu::cmem_device_t<float, S> maxErrorBack;
+
+    // TODO fill this array
+    xpu::cmem_io_t<CbmStsHitFinderConfig, S> hitfinderConfig;
+
+    // input
+    // Store all digis in a flat array with a header that contains the offset for every module (front and back)
+    xpu::cmem_io_t<size_t, S> digiOffsetPerModule;  // 2 * nModules + 1 entries, last entry contains total digi count
+    xpu::cmem_io_t<CbmStsDigi, S> digisPerModule;   // TODO this is a flat array now, should be renamed
+    // Two digi arrays needed, as sorting can't sort in place right now
+    xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp;
+    xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule;
+
+    xpu::cmem_io_t<float, S> localToGlobalTranslationByModule;
+    xpu::cmem_io_t<float, S> localToGlobalRotationByModule;
+
+    xpu::cmem_device_t<CbmStsDigiConnector, S> digiConnectorPerModule;
+    xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule;
+
+    // intermediate results
+    size_t maxClustersPerModule;
+    xpu::cmem_device_t<int, S> channelStatusPerModule;
+    xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModule;
+    xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModuleTmp;
+    xpu::cmem_io_t<CbmStsClusterIdx*, S> clusterIdxSortedPerModule;
+    xpu::cmem_io_t<CbmStsClusterData, S> clusterDataPerModule;
+    xpu::cmem_io_t<int, S> nClustersPerModule;
+
+    // output
+    size_t maxHitsPerModule;
+    xpu::cmem_io_t<CbmStsHit, S> hitsPerModule;
+    xpu::cmem_io_t<int, S> nHitsPerModule;
+  };
+
+  class CbmStsGpuHitFinder : public CbmStsHitFinder<xpu::side::device> {
+
+  public:
+    XPU_D void SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const;
+    XPU_D void FindClustersSingleStep(int iBlock) const;
+    XPU_D void SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const;
+    XPU_D void FindHits(int iBlock) const;
+
+    // Individual steps of cluster finder, for more granular time measurement
+    XPU_D void CalculateOffsetsParallel(int iBlock) const;
+    XPU_D void FindClustersParallel(int iBlock) const;
+    XPU_D void CalculateClustersParallel(int iBlock) const;
+
+    XPU_D void FindClusterConnectionsBasic(int iBlock) const;
+    XPU_D void CalculateClustersBasic(int iBlock) const;
+
+  private:
+    XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const;
+
+    XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                 unsigned int* channelOffsets, int iModule,
+                                                 unsigned int threadId) const;
+    XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                              unsigned int* channelOffsets, int iModule, unsigned int threadId,
+                                              unsigned int nDigis) const;
+
+    XPU_D void CalculateClustersBasic(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule,
+                                      unsigned int const threadId) const;
+    XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                            unsigned int* channelOffsets, int iModule, unsigned int const threadId,
+                                            unsigned int const nDigis) const;
+    XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule,
+                                         unsigned int const threadId, unsigned int const nDigis) const;
+
+    XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const;
+    XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                            int const digiIndex) const;
+    XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                            int const digiIndex) const;
+
+  private:
+    XPU_D unsigned int GetNDigis(int iModule) const
+    {
+      return digiOffsetPerModule[iModule + 1] - digiOffsetPerModule[iModule];
+    }
+
+    XPU_D float GetTimeResolution(int /* iModule */, int /* channel */) const { return asic.timeResolution; }
+
+    XPU_D bool IsActive(int* channelStatus, int channel) const
+    {
+      // TODO add padding to channels to remove this?
+      if (channel < 0 || channel >= nChannels) { return false; }
+      return channelStatus[channel] > -1;
+    }
+
+    XPU_D int ChanLeft(int channel) const { return channel - 1; }
+
+    XPU_D int ChanRight(int channel) const { return channel + 1; }
+
+    XPU_D int ChanDist(int c1, int c2) const
+    {
+      int diff = c2 - c1;
+      // TODO handle wrap around
+      return diff;
+    }
+
+    XPU_D void AddCluster(int iModule, CbmStsTimeNs time, const CbmStsClusterData& cls) const
+    {
+      CbmStsClusterIdx* tgtIdx   = &clusterIdxPerModule[iModule * maxClustersPerModule];
+      CbmStsClusterData* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule];
+
+      int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1);
+      XPU_ASSERT(size_t(pos) < maxClustersPerModule);
+
+      CbmStsClusterIdx idx {time, pos};
+      tgtIdx[idx.fIdx]  = idx;
+      tgtData[idx.fIdx] = cls;
+    }
+
+    XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; }
+
+    XPU_D XErrorT LandauWidth(float charge) const;
+
+    XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const;
+
+    XPU_D void IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF,
+                                 const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
+                                 const CbmStsClusterData& clsB) const;
+    XPU_D float GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const;
+    XPU_D bool Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB, float& x,
+                         float& y, float& varX, float& varY, float& varXY) const;
+    XPU_D bool IsInside(const CbmStsHitFinderParams& pars, float x, float y) const;
+    XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY,
+                         CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
+                         const CbmStsClusterData& clsB, float du, float dv) const;
+
+    XPU_D void SaveMaxError(float errorValue, int iModule) const
+    {
+
+      float* maxError = IsBackside(iModule) ? maxErrorBack : maxErrorFront;
+      float old {};
+      do {
+        old = *maxError;
+        if (old >= errorValue) { break; }
+      } while (!xpu::atomic_cas_block(maxError, *maxError, xpu::max(errorValue, *maxError)));
+    }
+  };
+
+}  // namespace experimental
+
+XPU_EXPORT_CONSTANT(CbmStsGpuRecoDevice, ::experimental::CbmStsGpuHitFinder, HitFinder);
+
+#endif
diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..c8f1b62a75ed58a81264e79bfc8a6eb83cff7998
--- /dev/null
+++ b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx
@@ -0,0 +1,30 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+#include "CbmStsGpuRecoDevice.h"
+
+#include "CbmStsGpuHitFinder.h"
+
+using namespace experimental;
+
+XPU_IMAGE(CbmStsGpuRecoDevice);
+
+XPU_CONSTANT(HitFinder);
+
+XPU_KERNEL(SortDigis, CbmStsSortDigiSmem) { xpu::cmem<HitFinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x()); }
+
+XPU_KERNEL(FindClustersSingleStep, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersSingleStep(xpu::block_idx::x()); }
+
+XPU_KERNEL(CalculateOffsets, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateOffsetsParallel(xpu::block_idx::x()); }
+
+XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersParallel(xpu::block_idx::x()); }
+
+XPU_KERNEL(FindClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().FindClusterConnectionsBasic(xpu::block_idx::x()); }
+
+XPU_KERNEL(CalculateClusters, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersParallel(xpu::block_idx::x()); }
+
+XPU_KERNEL(CalculateClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersBasic(xpu::block_idx::x()); }
+
+XPU_KERNEL(SortClusters, CbmStsSortClusterSmem) { xpu::cmem<HitFinder>().SortClusters(smem, xpu::block_idx::x()); }
+
+XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<HitFinder>().FindHits(xpu::block_idx::x()); }
diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d5f9020e521538cb87a14f934beb55ab54aaab6
--- /dev/null
+++ b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h
@@ -0,0 +1,55 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+#ifndef CBMSTSGPURECODEVICE_H
+#define CBMSTSGPURECODEVICE_H 1
+
+#include <xpu/device.h>
+
+// TODO: Tune these values for various gpus
+#if XPU_IS_CUDA
+#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 32
+#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 32
+#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 32
+#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 32
+#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 32
+#define CBM_STS_FIND_HITS_BLOCK_SIZE 32
+#else  // HIP, values ignored on CPU
+#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 256
+#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 6
+#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 256
+#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 6
+#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 256
+#define CBM_STS_FIND_HITS_BLOCK_SIZE 256
+#endif
+
+struct CbmStsGpuRecoDevice {
+};
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortDigis);
+XPU_BLOCK_SIZE(SortDigis, CBM_STS_SORT_DIGIS_BLOCK_SIZE);
+
+// Combine substeps for finding clusters into a single kernel
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersSingleStep);
+XPU_BLOCK_SIZE(FindClustersSingleStep, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateOffsets);
+XPU_BLOCK_SIZE(CalculateOffsets, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClusters);
+XPU_BLOCK_SIZE(FindClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersBasic);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClusters);
+XPU_BLOCK_SIZE(CalculateClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClustersBasic);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortClusters);
+XPU_BLOCK_SIZE(SortClusters, CBM_STS_SORT_CLUSTERS_BLOCK_SIZE);
+
+XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindHits);
+XPU_BLOCK_SIZE(FindHits, CBM_STS_FIND_HITS_BLOCK_SIZE);
+
+#endif
diff --git a/reco/mq/CMakeLists.txt b/reco/mq/CMakeLists.txt
index 035786646b130aef40b135f60da6543e48e00a08..1671e330eef7191d7c664ea0c3f6068cf28fca3e 100644
--- a/reco/mq/CMakeLists.txt
+++ b/reco/mq/CMakeLists.txt
@@ -29,15 +29,12 @@ set(INCLUDE_DIRECTORIES
 )
 
 Set(SYSTEM_INCLUDE_DIRECTORIES
-    ${SYSTEM_INCLUDE_DIRECTORIES}
+    ${BASE_INCLUDE_DIRECTORIES}
     ${ZeroMQ_INCLUDE_DIR}
     ${Boost_INCLUDE_DIR}
     ${FAIRROOT_INCLUDE_DIR}
     ${FAIRMQ_INCLUDE_DIR}
     ${FAIRMQ_INCLUDE_DIR}/options
-    ${VMC_INCLUDE_DIRS}
-    ${FAIRLOGGER_INCLUDE_DIR}
-
     ${IPC_INCLUDE_DIRECTORY}
     ${CBMROOT_SOURCE_DIR}/external/cppzmq
 )