diff --git a/algo/CMakeLists.txt b/algo/CMakeLists.txt
index 48571713f093dab8540f17068f2089c9563d6ceb..f4f946b5c994506c32994e72699150c1f04b4110 100644
--- a/algo/CMakeLists.txt
+++ b/algo/CMakeLists.txt
@@ -4,14 +4,15 @@ add_subdirectory(test)
 set(DEVICE_SRCS
   base/gpu/DeviceImage.cxx
   detectors/sts/UnpackStsXpu.cxx
+  detectors/sts/StsHitfinder.cxx
 )
 
-# Create a library libCbmAlgo
 set(SRCS
   ${DEVICE_SRCS}
   evbuild/EventBuilder.cxx
   trigger/TimeClusterTrigger.cxx
   evselector/DigiEventSelector.cxx
+  detectors/sts/StsHitfinderChain.cxx
   detectors/sts/StsReadoutConfig.cxx
   detectors/sts/UnpackSts.cxx
   detectors/much/MuchReadoutConfig.cxx
@@ -40,7 +41,14 @@ target_include_directories(Algo
          ${CMAKE_CURRENT_SOURCE_DIR}/detectors/trd
  )
 
-target_link_libraries(Algo PUBLIC OnlineData ROOT::GenVector INTERFACE FairLogger::FairLogger external::fles_ipc xpu)
+target_link_libraries(Algo
+  PUBLIC    OnlineData
+            ROOT::GenVector
+            GSL
+  INTERFACE FairLogger::FairLogger
+            external::fles_ipc
+            xpu
+)
 target_compile_definitions(Algo PUBLIC NO_ROOT)
 xpu_attach(Algo ${DEVICE_SRCS})
 
@@ -49,8 +57,11 @@ install(DIRECTORY base/gpu TYPE INCLUDE
   FILES_MATCHING PATTERN "*.h"
 )
 
-install(FILES ca/simd/CaSimd.h
-  ca/simd/CaSimdVc.h
-  ca/simd/CaSimdPseudo.h
+install(
+  FILES       ca/simd/CaSimd.h
+              ca/simd/CaSimdVc.h
+              ca/simd/CaSimdPseudo.h
+              detectors/sts/StsHitfinder.h
+              detectors/sts/StsHitfinderChain.h
   DESTINATION include/
 )
diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx b/algo/detectors/sts/StsHitfinder.cxx
similarity index 79%
rename from reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx
rename to algo/detectors/sts/StsHitfinder.cxx
index c02877d377ddaaac4d7b718ef1e58bb34f541ebf..23422af0b8cc15d933d24d38a1bc9523a3ada0a4 100644
--- a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx
+++ b/algo/detectors/sts/StsHitfinder.cxx
@@ -1,19 +1,61 @@
-/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
    SPDX-License-Identifier: GPL-3.0-only
    Authors: Felix Weiglhofer [committer], Kilian Hunold */
-#include "CbmStsGpuHitFinder.h"
 
-using namespace experimental;
+#include "StsHitfinder.h"
+
+using namespace cbm::algo;
+
+
+XPU_IMAGE(GpuOnline);
+
+XPU_CONSTANT(TheStsHitfinder);
+
+XPU_KERNEL(SortDigis, StsSortDigiSmem)
+{
+  xpu::cmem<TheStsHitfinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x());
+}
+
+XPU_KERNEL(FindClustersSingleStep, xpu::no_smem)
+{
+  xpu::cmem<TheStsHitfinder>().FindClustersSingleStep(xpu::block_idx::x());
+}
+
+XPU_KERNEL(CalculateOffsets, xpu::no_smem)
+{
+  xpu::cmem<TheStsHitfinder>().CalculateOffsetsParallel(xpu::block_idx::x());
+}
+
+XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindClustersParallel(xpu::block_idx::x()); }
+
+XPU_KERNEL(FindClustersBasic, xpu::no_smem)
+{
+  xpu::cmem<TheStsHitfinder>().FindClusterConnectionsBasic(xpu::block_idx::x());
+}
+
+XPU_KERNEL(CalculateClusters, xpu::no_smem)
+{
+  xpu::cmem<TheStsHitfinder>().CalculateClustersParallel(xpu::block_idx::x());
+}
+
+XPU_KERNEL(CalculateClustersBasic, xpu::no_smem)
+{
+  xpu::cmem<TheStsHitfinder>().CalculateClustersBasic(xpu::block_idx::x());
+}
+
+XPU_KERNEL(SortClusters, StsSortClusterSmem) { xpu::cmem<TheStsHitfinder>().SortClusters(smem, xpu::block_idx::x()); }
+
+XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindHits(xpu::block_idx::x()); }
 
 /**
-  * CbmStsGpuHitFinder::sortDigisKhun
+  * StsHitfinderGpu::sortDigisKhun
   * Sorts digis channelwise. Inside a channel all digis are sorted in time.
   *
   * @param smem is the shared memory it is worked in
   * @param iBlock is the Block/Module that is currenty running
   *
 */
-XPU_D void CbmStsGpuHitFinder::SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const
+XPU_D void StsHitfinderGpu::SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const
 {
   int iModule          = iBlock;
   CbmStsDigi* digis    = &digisPerModule[digiOffsetPerModule[iModule]];
@@ -29,7 +71,7 @@ XPU_D void CbmStsGpuHitFinder::SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem,
   if (xpu::thread_idx::x() == 0) { digisSortedPerModule[iModule] = digis; }
 }
 
-XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const
+XPU_D void StsHitfinderGpu::FindClustersSingleStep(int iBlock) const
 {
   CalculateOffsetsParallel(iBlock);
   FindClustersParallel(iBlock);
@@ -37,7 +79,7 @@ XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const
 }
 
 /**
-  * CbmStsGpuHitFinder::calculateChannelOffsets
+  * StsHitfinderGpu::calculateChannelOffsets
   * Calculates the Offsest of every channel. digis-Array is sorted in Channel and Time.
   * If a channelChange is detected it is stored in channelOffsetsPerModule.
   *
@@ -46,8 +88,8 @@ XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const
   * @param nDigis Amount of digis in digis-Array
   *
 */
-XPU_D void CbmStsGpuHitFinder::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets,
-                                                       unsigned int const nDigis) const
+XPU_D void StsHitfinderGpu::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets,
+                                                    unsigned int const nDigis) const
 {
   channelOffsets[0] = 0;
 
@@ -71,14 +113,14 @@ XPU_D void CbmStsGpuHitFinder::CalculateChannelOffsets(CbmStsDigi* digis, unsign
 
 
 /**
-  * CbmStsGpuHitFinder::calculateOffsetsParallel
+  * StsHitfinderGpu::calculateOffsetsParallel
   * This function calculates the channeloffsets in a certain module.
   * An Offset is the Startingpoint of a Channel in a sorted array of Digis.
   *
   * @param iBlock is the Block/Module that is currently worked on
   *
 */
-XPU_D void CbmStsGpuHitFinder::CalculateOffsetsParallel(int iBlock) const
+XPU_D void StsHitfinderGpu::CalculateOffsetsParallel(int iBlock) const
 {
   int const iModule = iBlock;
   CbmStsDigi* digis = digisSortedPerModule[iModule];
@@ -92,14 +134,14 @@ XPU_D void CbmStsGpuHitFinder::CalculateOffsetsParallel(int iBlock) const
 }
 
 /**
-  * CbmStsGpuHitFinder::findClustersParallel
+  * StsHitfinderGpu::findClustersParallel
   * This function finds clusters form an sts-Digi Array inserted.
   * It runs the finding process highly parallel.
   *
   * @param iBlock is the Block/Module that is currently worked on
   *
 */
-XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const
+XPU_D void StsHitfinderGpu::FindClustersParallel(int iBlock) const
 {
   int const iModule           = iBlock;
   CbmStsDigi* digis           = digisSortedPerModule[iModule];
@@ -108,7 +150,7 @@ XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const
 
   if (nDigis == 0) return;
 
-  auto* digiConnector  = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* digiConnector  = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
   auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
 
   // findClusterConnectionsChannelWise(digis, digiConnector, channelOffsets, iModule, threadId);
@@ -117,14 +159,14 @@ XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const
 
 
 /**
-  * CbmStsGpuHitFinder::calculateClustersParallel
+  * StsHitfinderGpu::calculateClustersParallel
   * This function calculates clusters form an sts-Digi Array inserted.
   * It runs the calculating process highly parallel.
   *
   * @param iBlock is the Block/Module that is currently worked on
   *
 */
-XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const
+XPU_D void StsHitfinderGpu::CalculateClustersParallel(int iBlock) const
 {
   int const iModule  = iBlock;
   CbmStsDigi* digis  = digisSortedPerModule[iModule];
@@ -133,7 +175,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const
 
   if (nDigis == 0) { return; }
 
-  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
   // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
 
   // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
@@ -142,7 +184,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const
 
 
 /**
-  * CbmStsGpuHitFinder::findClusterConnectionsBasic
+  * StsHitfinderGpu::findClusterConnectionsBasic
   *
   * Each Thread one Channel
   *
@@ -159,13 +201,13 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const
   * @param threadId Id of the thrad currently working
   *
 **/
-XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const
+XPU_D void StsHitfinderGpu::FindClusterConnectionsBasic(int iBlock) const
 {
   int iModule          = iBlock;
   int threadId         = xpu::thread_idx::x();
   auto nDigis          = GetNDigis(iModule);
   CbmStsDigi* digis    = digisSortedPerModule[iModule];
-  auto* digiConnector  = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* digiConnector  = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
   auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
 
   if (threadId != 0) return;
@@ -209,7 +251,7 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const
 
 
 /**
-  * CbmStsGpuHitFinder::findClusterConnectionsChannelWise
+  * StsHitfinderGpu::findClusterConnectionsChannelWise
   *
   * Each Thread one Channel
   *
@@ -228,9 +270,9 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const
   * @param threadId Id of the thrad currently working
   *
 **/
-XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                                                 unsigned int* channelOffsets, int const iModule,
-                                                                 unsigned int const threadId) const
+XPU_D void StsHitfinderGpu::FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                                              unsigned int* channelOffsets, int const iModule,
+                                                              unsigned int const threadId) const
 {
   // Expl: here we need to be smaller than nChannels because the last channel is the most Right.
   for (unsigned int channel = threadId; channel < (unsigned int) (nChannels - 1); channel += xpu::block_dim::x()) {
@@ -271,7 +313,7 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* dig
 }
 
 /**
-  * CbmStsGpuHitFinder::findClusterConnectionsDigiWise
+  * StsHitfinderGpu::findClusterConnectionsDigiWise
   *
   * Each thread one Digi
   *
@@ -292,10 +334,9 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* dig
   * @param nDigis Amount of digis in digis-Array
   *
 **/
-XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                                              unsigned int* channelOffsets, int const iModule,
-                                                              unsigned int const threadId,
-                                                              unsigned int const nDigis) const
+XPU_D void StsHitfinderGpu::FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                                           unsigned int* channelOffsets, int const iModule,
+                                                           unsigned int const threadId, unsigned int const nDigis) const
 {
 
   for (unsigned int currIter = threadId; currIter < nDigis; currIter += xpu::block_dim::x()) {
@@ -324,7 +365,7 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis,
 
 
 /**
-  * CbmStsGpuHitFinder::calculateClustersBasic
+  * StsHitfinderGpu::calculateClustersBasic
   *
   * One Thread all Digis
   *
@@ -344,12 +385,12 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis,
   * @param threadId Id of the thrad currently working
   *
 **/
-XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const
+XPU_D void StsHitfinderGpu::CalculateClustersBasic(int iBlock) const
 {
   int iModule         = iBlock;
   int threadId        = xpu::thread_idx::x();
   CbmStsDigi* digis   = digisSortedPerModule[iModule];
-  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
 
   if (threadId != 0) return;
 
@@ -372,7 +413,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const
 
 
 /**
-  * CbmStsGpuHitFinder::calculateClustersChannelWise
+  * StsHitfinderGpu::calculateClustersChannelWise
   *
   * Each Thread one Channel
   *
@@ -392,10 +433,9 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const
   * @param threadId Id of the thrad currently working
   *
 **/
-XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                                            unsigned int* channelOffsets, int const iModule,
-                                                            unsigned int const threadId,
-                                                            unsigned int const nDigis) const
+XPU_D void StsHitfinderGpu::CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                                         unsigned int* channelOffsets, int const iModule,
+                                                         unsigned int const threadId, unsigned int const nDigis) const
 {
 
   for (unsigned int channel = threadId; channel < (unsigned int) nChannels;
@@ -428,7 +468,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, C
 
 
 /**
-  * CbmStsGpuHitFinder::calculateClustersDigiWise
+  * StsHitfinderGpu::calculateClustersDigiWise
   *
   * Each Thread one Channel
   *
@@ -448,9 +488,9 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, C
   * @param threadId Id of the thrad currently working
   *
 **/
-XPU_D void CbmStsGpuHitFinder::CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                                         int const iModule, unsigned int const threadId,
-                                                         unsigned int const nDigis) const
+XPU_D void StsHitfinderGpu::CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                                      int const iModule, unsigned int const threadId,
+                                                      unsigned int const nDigis) const
 {
 
   for (unsigned int currIter = threadId; currIter < nDigis; currIter += (unsigned int) xpu::block_dim::x()) {
@@ -473,19 +513,20 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersDigiWise(CbmStsDigi* digis, CbmS
 }
 
 
-XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
+XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
 {
   const CbmStsDigi& digi = digis[digiIndex];
 
-  CbmStsTimeNs time = digi.GetTimeU32();
+  uint32_t time = digi.GetTimeU32();
 
   const float timeError = asic.timeResolution;
 
-  CbmStsClusterData cluster {
+  StsGpuCluster cluster {
     .fCharge        = asic.AdcToCharge(digi.GetChargeU16()),
     .fSize          = 1,
     .fPosition      = float(digi.GetChannel()) + (IsBackside(iModule) ? nChannels : 0.f),
     .fPositionError = 1.f / xpu::sqrt(24.f),
+    .fTime          = time,
     .fTimeError     = timeError,
   };
 
@@ -494,9 +535,8 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors1(int const iModule, C
   AddCluster(iModule, time, cluster);
 }
 
-XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
-                                                            CbmStsDigiConnector* digiConnector,
-                                                            int const digiIndex) const
+XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
+                                                         StsDigiConnector* digiConnector, int const digiIndex) const
 {
 
   const CbmStsDigi& digi1 = digis[digiIndex];
@@ -504,7 +544,7 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C
 
   float eNoiseSq = asic.noise * asic.noise;
 
-  float chargePerAdc = asic.dynRange / float(asic.nAdc);
+  float chargePerAdc = asic.dynamicRange / float(asic.nAdc);
   float eDigitSq     = chargePerAdc * chargePerAdc / 12.f;
 
   float x1 = float(digi1.GetChannel());
@@ -515,10 +555,10 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C
   if (digi1.GetChannel() > digi2.GetChannel()) { x1 -= float(nChannels); }
 
   // Uncertainties of the charge measurements
-  XErrorT width1 = LandauWidth(q1);
-  XErrorT eq1sq  = width1 * width1 + eNoiseSq + eDigitSq;
-  XErrorT width2 = LandauWidth(q2);
-  XErrorT eq2sq  = width2 * width2 + eNoiseSq + eDigitSq;
+  float width1 = LandauWidth(q1);
+  float eq1sq  = width1 * width1 + eNoiseSq + eDigitSq;
+  float width2 = LandauWidth(q2);
+  float eq2sq  = width2 * width2 + eNoiseSq + eDigitSq;
 
   // Cluster time
   float time      = 0.5f * (digi1.GetTimeU32() + digi2.GetTimeU32());
@@ -531,9 +571,9 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C
   if (x < -0.5f) { x += float(nChannels); }
 
   // Uncertainty on cluster position. See software note.
-  XErrorT ex0sq = 0.f;
-  XErrorT ex1sq = 0.f;
-  XErrorT ex2sq = 0.f;
+  float ex0sq = 0.f;
+  float ex1sq = 0.f;
+  float ex2sq = 0.f;
   if (q1 < q2) {
     ex0sq = (q2 - q1) * (q2 - q1) / q2 / q2 / 72.f;
     ex1sq = eq1sq / q2 / q2 / 9.f;
@@ -544,18 +584,19 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C
     ex1sq = eq1sq * q2 * q2 / q1 / q1 / q1 / q1 / 9.f;
     ex2sq = eq2sq / q1 / q1 / 9.f;
   }
-  XErrorT xError = xpu::sqrt(ex0sq + ex1sq + ex2sq);
+  float xError = xpu::sqrt(ex0sq + ex1sq + ex2sq);
 
   // Cluster charge
   float charge = q1 + q2;
 
   if (IsBackside(iModule)) x += nChannels;
 
-  CbmStsClusterData cls {
+  StsGpuCluster cls {
     .fCharge        = charge,
     .fSize          = 2,
     .fPosition      = x,
     .fPositionError = xError,
+    .fTime          = uint32_t(time),
     .fTimeError     = timeError,
   };
 
@@ -564,16 +605,16 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C
 }
 
 
-XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis,
-                                                            CbmStsDigiConnector* digiConnector, int digiIndex) const
+XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis,
+                                                         StsDigiConnector* digiConnector, int digiIndex) const
 {
-  CbmStsClusterCalculationProperties cProps;
+  StsClusterCalculationProperties cProps;
 
   //This Lambda calculates all charges for a single digi inside of a cluster
   auto calculateClusterCharges = [this, &cProps, &digis, &digiConnector](int index) {
     CbmStsDigi digi    = digis[index];
     float eNoiseSq     = asic.noise * asic.noise;
-    float chargePerAdc = asic.dynRange / float(asic.nAdc);
+    float chargePerAdc = asic.dynamicRange / float(asic.nAdc);
     float eDigitSq     = chargePerAdc * chargePerAdc / 12.f;
     cProps.tResolSum += asic.timeResolution;
     cProps.tSum += digi.GetTimeU32();
@@ -642,11 +683,12 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsD
 
   if (IsBackside(iModule)) { x += nChannels; }
 
-  CbmStsClusterData cls {
+  StsGpuCluster cls {
     .fCharge        = qSum,
     .fSize          = int(nDigis),
     .fPosition      = x,
     .fPositionError = xError,
+    .fTime          = uint32_t(cProps.tSum),
     .fTimeError     = timeError,
   };
 
@@ -654,37 +696,37 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsD
   AddCluster(iModule, cProps.tSum, cls);
 }
 
-XPU_D void CbmStsGpuHitFinder::SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const
+XPU_D void StsHitfinderGpu::SortClusters(StsSortClusterSmem& smem, int iBlock) const
 {
   int iModule                     = iBlock;
   size_t offset                   = iModule * maxClustersPerModule;
-  CbmStsClusterIdx* clusterIdx    = &clusterIdxPerModule[offset];
-  CbmStsClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset];
+  StsGpuClusterIdx* clusterIdx    = &clusterIdxPerModule[offset];
+  StsGpuClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset];
   int nClusters                   = nClustersPerModule[iModule];
 
   SortClustersT clsSort(smem.sortBuf);
-  clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const CbmStsClusterIdx a) { return a.fTime; });
+  clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const StsGpuClusterIdx a) { return a.fTime; });
 
   if (xpu::thread_idx::x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx;
 }
 
-XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const
+XPU_D void StsHitfinderGpu::FindHits(int iBlock) const
 {
-  int iModuleF                    = iBlock;
-  int iModuleB                    = iBlock + nModules;
-  size_t offsetF                  = iModuleF * maxClustersPerModule;
-  size_t offsetB                  = iModuleB * maxClustersPerModule;
-  CbmStsClusterIdx* clusterIdxF   = clusterIdxSortedPerModule[iModuleF];
-  CbmStsClusterIdx* clusterIdxB   = clusterIdxSortedPerModule[iModuleB];
-  CbmStsClusterData* clusterDataF = &clusterDataPerModule[offsetF];
-  CbmStsClusterData* clusterDataB = &clusterDataPerModule[offsetB];
-  int nClustersF                  = nClustersPerModule[iModuleF];
-  int nClustersB                  = nClustersPerModule[iModuleB];
-
-
-  CbmStsHitFinderParams pars;
+  int iModuleF                  = iBlock;
+  int iModuleB                  = iBlock + nModules;
+  size_t offsetF                = iModuleF * maxClustersPerModule;
+  size_t offsetB                = iModuleB * maxClustersPerModule;
+  StsGpuClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF];
+  StsGpuClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB];
+  StsGpuCluster* clusterDataF   = &clusterDataPerModule[offsetF];
+  StsGpuCluster* clusterDataB   = &clusterDataPerModule[offsetB];
+  int nClustersF                = nClustersPerModule[iModuleF];
+  int nClustersB                = nClustersPerModule[iModuleB];
+
+
+  StsHitfinderCache pars;
   {
-    CbmStsHitFinderConfig cfg = hitfinderConfig[iBlock];
+    StsSensorPar cfg = sensorPars[iBlock];
 
     pars.dY         = cfg.dY;
     pars.pitch      = cfg.pitch;
@@ -705,14 +747,14 @@ XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const
 
   int startB = 0;
   for (int iClusterF = xpu::thread_idx::x(); iClusterF < nClustersF; iClusterF += xpu::block_dim::x()) {
-    CbmStsClusterIdx clsIdxF   = clusterIdxF[iClusterF];
-    CbmStsClusterData clsDataF = clusterDataF[clsIdxF.fIdx];
+    StsGpuClusterIdx clsIdxF = clusterIdxF[iClusterF];
+    StsGpuCluster clsDataF   = clusterDataF[clsIdxF.fIdx];
 
     float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrF * maxTerrF);
 
     for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) {
-      CbmStsClusterIdx clsIdxB   = clusterIdxB[iClusterB];
-      CbmStsClusterData clsDataB = clusterDataB[clsIdxB.fIdx];
+      StsGpuClusterIdx clsIdxB = clusterIdxB[iClusterB];
+      StsGpuCluster clsDataB   = clusterDataB[clsIdxB.fIdx];
 
       float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime);
 
@@ -742,9 +784,9 @@ XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const
   }
 }
 
-XPU_D void CbmStsGpuHitFinder::IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF,
-                                                 const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
-                                                 const CbmStsClusterData& clsB) const
+XPU_D void StsHitfinderGpu::IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF,
+                                              const StsGpuCluster& clsF, uint32_t timeB,
+                                              const StsGpuCluster& clsB) const
 {
   // --- Calculate cluster centre position on readout edge
 
@@ -801,7 +843,7 @@ XPU_D void CbmStsGpuHitFinder::IntersectClusters(int iBlock, const CbmStsHitFind
   }
 }
 
-XPU_D float CbmStsGpuHitFinder::GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const
+XPU_D float StsHitfinderGpu::GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const
 {
   // Take integer channel
 
@@ -827,8 +869,8 @@ XPU_D float CbmStsGpuHitFinder::GetClusterPosition(const CbmStsHitFinderParams&
   return xCluster;
 }
 
-XPU_D bool CbmStsGpuHitFinder::Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB,
-                                         float& x, float& y, float& varX, float& varY, float& varXY) const
+XPU_D bool StsHitfinderGpu::Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x,
+                                      float& y, float& varX, float& varY, float& varXY) const
 {
 
   // In the coordinate system with origin at the bottom left corner,
@@ -878,7 +920,7 @@ XPU_D bool CbmStsGpuHitFinder::Intersect(const CbmStsHitFinderParams& pars, floa
   return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f);
 }
 
-XPU_D bool CbmStsGpuHitFinder::IsInside(const CbmStsHitFinderParams& pars, float x, float y) const
+XPU_D bool StsHitfinderGpu::IsInside(const StsHitfinderCache& pars, float x, float y) const
 {
   // clang-format off
   return x >= -pars.dX / 2.f
@@ -888,9 +930,9 @@ XPU_D bool CbmStsGpuHitFinder::IsInside(const CbmStsHitFinderParams& pars, float
   // clang-format on
 }
 
-XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY,
-                                         CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
-                                         const CbmStsClusterData& clsB, float du, float dv) const
+XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY,
+                                      uint32_t timeF, const StsGpuCluster& clsF, uint32_t timeB,
+                                      const StsGpuCluster& clsB, float du, float dv) const
 {
   // --- Transform into global coordinate system
   float globalX, globalY, globalZ;
@@ -909,11 +951,11 @@ XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal
   float etB          = clsB.fTimeError;
   float hitTimeError = 0.5f * xpu::sqrt(etF * etF + etB * etB);
 
-  CbmStsHit hit {
+  StsGpuHit hit {
     .fX         = globalX,
     .fY         = globalY,
     .fZ         = globalZ,
-    .fTime      = static_cast<CbmStsTimeNs>(hitTime),
+    .fTime      = static_cast<uint32_t>(hitTime),
     .fDx        = errX,
     .fDy        = errY,
     .fDz        = errZ,
@@ -931,22 +973,21 @@ XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal
 }
 
 
-XPU_D XErrorT CbmStsGpuHitFinder::LandauWidth(float charge) const
+XPU_D float StsHitfinderGpu::LandauWidth(float charge) const
 {
   if (charge <= landauStepSize) return landauTable[0];
   if (charge > landauStepSize * (landauTableSize - 1)) return landauTable[landauTableSize - 1];
 
   int tableIdx = xpu::ceil(charge / landauStepSize);
-  XErrorT e2   = tableIdx * landauStepSize;
-  XErrorT v2   = landauTable[tableIdx];
+  float e2     = tableIdx * landauStepSize;
+  float v2     = landauTable[tableIdx];
   tableIdx--;
-  XErrorT e1 = tableIdx * landauStepSize;
-  XErrorT v1 = landauTable[tableIdx];
+  float e1 = tableIdx * landauStepSize;
+  float v1 = landauTable[tableIdx];
   return v1 + (charge - e1) * (v2 - v1) / (e2 - e1);
 }
 
-XPU_D void CbmStsGpuHitFinder::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy,
-                                        float& gz) const
+XPU_D void StsHitfinderGpu::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const
 {
   const float* tr  = &localToGlobalTranslationByModule[iModule * 3];
   const float* rot = &localToGlobalRotationByModule[iModule * 9];
diff --git a/algo/detectors/sts/StsHitfinder.h b/algo/detectors/sts/StsHitfinder.h
new file mode 100644
index 0000000000000000000000000000000000000000..656f12a15db471c3a4770c002cdb8f01b8f721dc
--- /dev/null
+++ b/algo/detectors/sts/StsHitfinder.h
@@ -0,0 +1,420 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+
+#ifndef CBM_ALGO_STS_HITFINDER_H
+#define CBM_ALGO_STS_HITFINDER_H
+
+#include "CbmStsDigi.h"
+
+#include <xpu/device.h>
+
+namespace cbm::algo
+{
+  struct GpuOnline {
+  };  // xpu image identifier, has to be moved when more steps are moved to GPU
+
+  // Block sizes / other compile time constants that need tuning
+  enum GpuConstants
+  {
+#if XPU_IS_CUDA
+    kSortDigisBlockSize         = 32,
+    kSortDigisItemsPerThread    = 32,
+    kSortClustersBlockSize      = 32,
+    kSortClustersItemsPerThread = 32,
+    kFindClusterBlockSize       = 32,
+    kFindHitsBlockSize          = 32,
+#else  // HIP, values ignored on CPU
+    kSortDigisBlockSize         = 256,
+    kSortDigisItemsPerThread    = 6,
+    kSortClustersBlockSize      = 256,
+    kSortClustersItemsPerThread = 6,
+    kFindClusterBlockSize       = 256,
+    kFindHitsBlockSize          = 256,
+#endif
+  };
+
+  // Kernel declarations
+  XPU_EXPORT_KERNEL(GpuOnline, SortDigis);
+  // Combine substeps for finding clusters into a single kernel
+  XPU_EXPORT_KERNEL(GpuOnline, FindClustersSingleStep);
+  XPU_EXPORT_KERNEL(GpuOnline, CalculateOffsets);
+  XPU_EXPORT_KERNEL(GpuOnline, FindClusters);
+  XPU_EXPORT_KERNEL(GpuOnline, FindClustersBasic);
+  XPU_EXPORT_KERNEL(GpuOnline, CalculateClusters);
+  XPU_EXPORT_KERNEL(GpuOnline, CalculateClustersBasic);
+  XPU_EXPORT_KERNEL(GpuOnline, SortClusters);
+  XPU_EXPORT_KERNEL(GpuOnline, FindHits);
+
+}  // namespace cbm::algo
+
+// Set block sizes. Currently this needs to be done in toplevel namespace.
+XPU_BLOCK_SIZE_1D(cbm::algo::SortClusters, cbm::algo::kSortClustersBlockSize);
+XPU_BLOCK_SIZE_1D(cbm::algo::FindClustersSingleStep, cbm::algo::kFindClusterBlockSize);
+XPU_BLOCK_SIZE_1D(cbm::algo::CalculateOffsets, cbm::algo::kFindClusterBlockSize);
+XPU_BLOCK_SIZE_1D(cbm::algo::FindClusters, cbm::algo::kFindClusterBlockSize);
+XPU_BLOCK_SIZE_1D(cbm::algo::CalculateClusters, cbm::algo::kFindClusterBlockSize);
+XPU_BLOCK_SIZE_1D(cbm::algo::SortDigis, cbm::algo::kSortClustersBlockSize);
+XPU_BLOCK_SIZE_1D(cbm::algo::FindHits, cbm::algo::kFindClusterBlockSize);
+
+namespace cbm::algo
+{
+  // GPU Data structures
+  // TODO: Replace with regular StsCluster / StsHit once the changes
+  // land to make them fit for GPU
+  struct StsGpuClusterIdx {
+    uint32_t fTime;  ///< Cluster time (average of digi times) [ns]
+    int fIdx;
+  };
+
+  struct StsGpuCluster {
+    float fCharge;         ///< Total charge
+    int fSize;             ///< Difference between first and last channel
+    float fPosition;       ///< Cluster centre in channel number units
+    float fPositionError;  ///< Cluster centre error (r.m.s.) in channel number units
+    uint32_t fTime;        ///< cluster time [ns]
+    float fTimeError;      ///< Error of cluster time [ns]
+  };
+
+  struct StsGpuHit {
+    float fX, fY;      ///< X, Y positions of hit [cm]
+    float fZ;          ///< Z position of hit [cm]
+    uint32_t fTime;    ///< Hit time [ns]
+    float fDx, fDy;    ///< X, Y errors [cm]
+    float fDz;         ///< Z position error [cm]
+    float fDxy;        ///< XY correlation
+    float fTimeError;  ///< Error of hit time [ns]
+    float fDu;         ///< Error of coordinate across front-side strips [cm]
+    float fDv;         ///< Error of coordinate across back-side strips [cm]
+  };
+
+  // Calibration data structures
+
+  struct StsAsicPar {
+    int nAdc;
+    float dynamicRange;
+    float threshold;
+    float timeResolution;
+    float deadTime;
+    float noise;
+    float zeroNoiseRate;
+
+    XPU_D float AdcToCharge(unsigned short adc) const
+    {
+      return threshold + dynamicRange / float(nAdc) * (float(adc) + 0.5f);
+    }
+  };
+
+  struct StsSensorPar {
+    float dY;
+    float pitch;
+    float stereoF;
+    float stereoB;
+    float lorentzF;
+    float lorentzB;
+  };
+
+  // Cache for various parameters of the hitfindiner
+  // Used internally to avoid computing values multiple times
+  // TODO: is this really needed? Overhead from additional computations should be miniscule.
+  // TODO: Also store in shared memory, not registers. Values identical for all threads.
+  struct StsHitfinderCache : StsSensorPar {
+    float tanStereoF;
+    float tanStereoB;
+    float errorFac;
+    float dX;
+  };
+
+  struct StsClusterCalculationProperties {
+    float tSum      = 0.;       // sum of digi times
+    int chanF       = 9999999;  // first channel in cluster
+    int chanL       = -1;       // last channel in cluster
+    float qF        = 0.f;      // charge in first channel
+    float qM        = 0.f;      // sum of charges in middle channels
+    float qL        = 0.f;      // charge in last cluster
+    float eqFsq     = 0.f;      // uncertainty of qF
+    float eqMsq     = 0.f;      // uncertainty of qMid
+    float eqLsq     = 0.f;      // uncertainty of qL
+    float tResolSum = 0.f;
+
+    float xSum = 0.f;  // weighted charge sum, used to correct corrupt clusters
+  };
+
+  class StsDigiConnector {
+  private:
+    unsigned int hasPreviousAndNext;
+
+    XPU_D unsigned int UnpackNext(unsigned int val) const { return val & ~(1u << 31); }
+
+  public:
+    XPU_D bool HasPrevious() const { return (hasPreviousAndNext >> 31) & 1u; }
+
+    XPU_D void SetHasPrevious(bool hasPrevious)
+    {
+      unsigned int old          = hasPreviousAndNext;
+      unsigned int hasPreviousI = ((unsigned int) hasPrevious) << 31;
+      unsigned int assumed;
+
+      do {
+        assumed = old;
+        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI);
+      } while (old != assumed);
+    }
+
+    XPU_D unsigned int next() const { return UnpackNext(hasPreviousAndNext); }
+
+    XPU_D void SetNext(unsigned int next)
+    {
+      unsigned int old = hasPreviousAndNext;
+      unsigned int assumed;
+
+      next = xpu::min((1u << 31) - 1, next);
+
+      do {
+        assumed = old;
+        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next);
+      } while (old != assumed);
+    }
+
+    XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; }
+  };
+
+  // Shared Memory defintions
+  using SortDigisT =
+    xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis>::value.x, kSortDigisItemsPerThread>;
+
+  struct StsSortDigiSmem {
+    typename SortDigisT::storage_t sortBuf;
+  };
+
+  using SortClustersT =
+    xpu::block_sort<uint32_t, StsGpuClusterIdx, xpu::block_size<SortClusters>::value.x, kSortClustersItemsPerThread>;
+
+  struct StsSortClusterSmem {
+    typename SortClustersT::storage_t sortBuf;
+  };
+
+  // Hitfinder buffers
+  template<xpu::side S>
+  class StsHitfinderBase {
+
+  public:
+    // calibration / configuration data
+    // Only set once
+
+    int nModules;   // Number of modules
+    int nChannels;  // Number of channels per module
+
+    // Time cuts for Digis / Clusters
+    // TODO: currently not used!
+    float timeCutDigiAbs;
+    float timeCutDigiSig;
+    float timeCutClusterAbs;
+    float timeCutClusterSig;
+
+    StsAsicPar asic;
+
+    int landauTableSize;
+    float landauStepSize;
+    // Entries of landauTable. size = landauTableSize
+    xpu::cmem_device_t<float, S> landauTable;
+
+    // transformation matrix to convert local to global coordinate space
+    // size = nModules
+    xpu::cmem_io_t<float, S> localToGlobalTranslationByModule;
+    xpu::cmem_io_t<float, S> localToGlobalRotationByModule;
+
+    // Sensor Parameters
+    // size = nModules
+    xpu::cmem_io_t<StsSensorPar, S> sensorPars;
+
+    // input
+    // Store all digis in a flat array with a header that contains the offset for every module (front and back)
+    xpu::cmem_io_t<size_t, S>
+      digiOffsetPerModule;  // size = 2 * nModules + 1 entries, last entry contains total digi count
+    xpu::cmem_io_t<CbmStsDigi, S> digisPerModule;  // Flat array of input digis. size = nDigisTotal
+
+    // Temporary Digi-Array used for sorting as sorting is not in place. size = nDigisTotal
+    xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp;
+
+    // Pointer to sorted of a module side. Points either to digisPerModule or digisPerModuleTmp
+    // size = 2 * nModules
+    xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule;
+
+    // DigiConnectors used internally by StsClusterizer
+    // Connects digis that belong to the same cluster via linked-list.
+    // size = nDigisTotal
+    xpu::cmem_device_t<StsDigiConnector, S> digiConnectorsPerModule;
+
+    // Digis are sorted by channel + within channel by time
+    // Contains the offset for each channel within each channel
+    // size = 2 * nModules * nChannels
+    xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule;
+
+    // intermediate results
+    size_t maxClustersPerModule;
+
+    // FIXME: Not used anymore?
+    xpu::cmem_device_t<int, S> channelStatusPerModule;
+
+    // Cluster Index by module. Produced by cluster finder.
+    // Stored as buckets with a max size of maxClustersPerModule
+    // Actual number of entries in each bucket is stored in nClustersPerModule
+    // size = 2 * nModules * maxClustersPerModule
+    xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModule;
+
+    // Temporary cluster index array used for sorting as sorting is not in place.
+    // size =  2 * nModules * maxClustersPerModule
+    xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModuleTmp;
+
+    // Pointer to sorted cluster idx for every module side.
+    // Either points to clusterIdxPerModule or clusterIdxPerModuleTmp.
+    // size = 2 * nModules
+    xpu::cmem_io_t<StsGpuClusterIdx*, S> clusterIdxSortedPerModule;
+
+    // Clusters stored by modules. Stored as buckets with a max size of maxClustersPerModule
+    // Actual number of entries in each bucket is stored in nClustersPerModule
+    // size = 2 * nModules * maxClustersPerModule
+    xpu::cmem_io_t<StsGpuCluster, S> clusterDataPerModule;
+
+    // Number of clusters in each module
+    // size = 2 * nModules
+    xpu::cmem_io_t<int, S> nClustersPerModule;
+
+    // Max time error of clusters on front- and backside of a module
+    // size = 1 (???)
+    // FIXME: size should be 2 * nModules? And only one array!
+    xpu::cmem_device_t<float, S> maxErrorFront;
+    xpu::cmem_device_t<float, S> maxErrorBack;
+
+    // output
+
+    // Max number of Hits that can be stored in each module
+    size_t maxHitsPerModule;
+
+    // Hits sorted by module. Stored in buckets of size maxHitsPerModule.
+    // Actual number of hits is stored in nHitsPerModule
+    // size = maxHitsPerModule * nModules
+    xpu::cmem_io_t<StsGpuHit, S> hitsPerModule;
+
+    // Number of hits in each module
+    // size = nModules
+    xpu::cmem_io_t<int, S> nHitsPerModule;
+  };
+
+  // Host-side hitfinder. Only holds memory buffers.
+  using StsHitfinderHost = StsHitfinderBase<xpu::side::host>;
+
+  // Device-side hitfinder. Holds pointers to buffers + algorithm.
+  class StsHitfinderGpu : public StsHitfinderBase<xpu::side::device> {
+
+  public:
+    XPU_D void SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const;
+    XPU_D void FindClustersSingleStep(int iBlock) const;
+    XPU_D void SortClusters(StsSortClusterSmem& smem, int iBlock) const;
+    XPU_D void FindHits(int iBlock) const;
+
+    // Individual steps of cluster finder, for more granular time measurement
+    XPU_D void CalculateOffsetsParallel(int iBlock) const;
+    XPU_D void FindClustersParallel(int iBlock) const;
+    XPU_D void CalculateClustersParallel(int iBlock) const;
+
+    XPU_D void FindClusterConnectionsBasic(int iBlock) const;
+    XPU_D void CalculateClustersBasic(int iBlock) const;
+
+  private:
+    XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const;
+
+    XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                                 unsigned int* channelOffsets, int iModule,
+                                                 unsigned int threadId) const;
+    XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                              unsigned int* channelOffsets, int iModule, unsigned int threadId,
+                                              unsigned int nDigis) const;
+
+    XPU_D void CalculateClustersBasic(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule,
+                                      unsigned int const threadId) const;
+    XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                            unsigned int* channelOffsets, int iModule, unsigned int const threadId,
+                                            unsigned int const nDigis) const;
+    XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule,
+                                         unsigned int const threadId, unsigned int const nDigis) const;
+
+    XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const;
+    XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                            int const digiIndex) const;
+    XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector,
+                                            int const digiIndex) const;
+
+  private:
+    XPU_D unsigned int GetNDigis(int iModule) const
+    {
+      return digiOffsetPerModule[iModule + 1] - digiOffsetPerModule[iModule];
+    }
+
+    XPU_D float GetTimeResolution(int /* iModule */, int /* channel */) const { return asic.timeResolution; }
+
+    XPU_D bool IsActive(int* channelStatus, int channel) const
+    {
+      // TODO add padding to channels to remove this?
+      if (channel < 0 || channel >= nChannels) { return false; }
+      return channelStatus[channel] > -1;
+    }
+
+    XPU_D int ChanLeft(int channel) const { return channel - 1; }
+
+    XPU_D int ChanRight(int channel) const { return channel + 1; }
+
+    XPU_D int ChanDist(int c1, int c2) const
+    {
+      int diff = c2 - c1;
+      // TODO handle wrap around
+      return diff;
+    }
+
+    XPU_D void AddCluster(int iModule, uint32_t time, const StsGpuCluster& cls) const
+    {
+      StsGpuClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule];
+      StsGpuCluster* tgtData   = &clusterDataPerModule[iModule * maxClustersPerModule];
+
+      int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1);
+      XPU_ASSERT(size_t(pos) < maxClustersPerModule);
+
+      StsGpuClusterIdx idx {time, pos};
+      tgtIdx[idx.fIdx]  = idx;
+      tgtData[idx.fIdx] = cls;
+    }
+
+    XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; }
+
+    XPU_D float LandauWidth(float charge) const;
+
+    XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const;
+
+    XPU_D void IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF, const StsGpuCluster& clsF,
+                                 uint32_t timeB, const StsGpuCluster& clsB) const;
+    XPU_D float GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const;
+    XPU_D bool Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, float& y,
+                         float& varX, float& varY, float& varXY) const;
+    XPU_D bool IsInside(const StsHitfinderCache& pars, float x, float y) const;
+    XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY, uint32_t timeF,
+                         const StsGpuCluster& clsF, uint32_t timeB, const StsGpuCluster& clsB, float du,
+                         float dv) const;
+
+    XPU_D void SaveMaxError(float errorValue, int iModule) const
+    {
+      float* maxError = IsBackside(iModule) ? maxErrorBack : maxErrorFront;
+      float old {};
+      do {
+        old = *maxError;
+        if (old >= errorValue) { break; }
+      } while (!xpu::atomic_cas_block(maxError, *maxError, xpu::max(errorValue, *maxError)));
+    }
+  };
+
+  // StsHitfinder lives in constant memory
+  XPU_EXPORT_CONSTANT(GpuOnline, StsHitfinderGpu, TheStsHitfinder);
+
+}  // namespace cbm::algo
+
+#endif  // #ifndef CBM_ALGO_STS_HITFINDER_H
diff --git a/algo/detectors/sts/StsHitfinderChain.cxx b/algo/detectors/sts/StsHitfinderChain.cxx
new file mode 100644
index 0000000000000000000000000000000000000000..9c37e24b5d3d0b58570ea7f2301ad163ee9e9fbf
--- /dev/null
+++ b/algo/detectors/sts/StsHitfinderChain.cxx
@@ -0,0 +1,321 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+
+#include "StsHitfinderChain.h"
+
+#include "Logger.h"
+
+using namespace cbm::algo;
+
+// Shorthands to set data / allocate buffers on GPU
+// FIXME: Eventually we'll need a better way to do this. Preferably supplied by xpu
+#define SET_GPU_CONSTANT(constant, val) fHitfinderHost.constant = fHitfinderGpu.constant = val
+#define GPU_ALLOC(field, nEntries)                                                                                     \
+  fHitfinderHost.field = decltype(fHitfinderHost.field)(nEntries);                                                     \
+  fHitfinderGpu.field  = fHitfinderHost.field.d()
+
+void StsHitfinderChain::SetParameters(const StsHitfinderPar& parameters)
+{
+  fPars.emplace(parameters);
+  AllocateStatic();
+}
+
+void StsHitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
+{
+  EnsureParameters();
+
+  // FIXME: this has to be called only once
+  // and should happen during initialization not in reco loop
+  setenv("XPU_PROFILE", "1", 1);
+  xpu::initialize();
+
+  int nModules       = fPars->modules.size();
+  int nModuleSides   = nModules * 2;
+  size_t nDigisTotal = digis.size();
+
+  // Getting the digis on the GPU requires 3 steps
+  // 1. Sort digis into buckets by module
+  size_t maxNDigisPerModule;
+  DigiMap digiMap = SortDigisIntoModules(digis, maxNDigisPerModule);
+  // 2. Once we know number of digis per module, we can allocate
+  //    the dynamic buffers on the gpu, as the buffer sizes depend on that value
+  AllocateDynamic(maxNDigisPerModule, nDigisTotal);
+  // 3. Copy digis into flat array with offsets per module
+  FlattenDigis(digiMap);
+
+  // Clear buffers
+  // Not all buffers have to initialized, but useful for debugging
+
+  auto& hfc = fHitfinderHost;
+  // xpu::memset(hfc.digisPerModule, 0);
+  // xpu::memset(hfc.digisPerModuleTmp, 0);
+  // xpu::memset(hfc.digisSortedPerModule, 0);
+  // xpu::memset(hfc.digiOffsetPerModule, 0);
+  xpu::memset(hfc.digiConnectorsPerModule, 0);
+  // xpu::memset(hfc.channelOffsetPerModule, 0);
+  xpu::memset(hfc.channelStatusPerModule, -1);
+  xpu::memset(hfc.clusterIdxPerModule, 0);
+  // xpu::memset(hfc.clusterIdxPerModuleTmp, 0);
+  // xpu::memset(hfc.clusterIdxSortedPerModule, 0);
+  // xpu::memset(hfc.clusterDataPerModule, 0);
+  xpu::memset(hfc.nClustersPerModule, 0);
+  // xpu::memset(hfc.hitsPerModule, 0);
+  xpu::memset(hfc.nHitsPerModule, 0);
+  xpu::memset(hfc.maxErrorFront, 0);
+  xpu::memset(hfc.maxErrorBack, 0);
+
+  xpu::set_constant<TheStsHitfinder>(fHitfinderGpu);
+  xpu::copy(hfc.digisPerModule, xpu::host_to_device);
+  xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device);
+
+  xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(nModuleSides));
+  xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(nModuleSides));
+  // Run cluster finding steps in indivual kernels, useful for debugging / profiling
+  // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
+  // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
+  xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(nModuleSides));
+  xpu::run_kernel<FindHits>(xpu::grid::n_blocks(nModules));
+
+  CollectTimingInfo();
+
+  // Transfer Hits / Cluster back to host
+  // TODO: Hits and Clusters are stored in buckets. Currently we transfer the entire
+  // bucket arrays back to the host. This includes a lot of unused bytes.
+  // Better to either flatten both arrays on the GPU and transfer flat array back.
+  // Or do multiple copies to copy contents of individual buckets.
+  xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host);
+  xpu::copy(hfc.nClustersPerModule, xpu::device_to_host);
+  xpu::copy(hfc.hitsPerModule, xpu::device_to_host);
+  xpu::copy(hfc.nHitsPerModule, xpu::device_to_host);
+}
+
+void StsHitfinderChain::EnsureParameters()
+{
+  LOG_IF(fatal, fPars == std::nullopt) << "StsHitfinderChain: Parameters not set. Can't continue!";
+}
+
+void StsHitfinderChain::AllocateStatic()
+{
+  EnsureParameters();
+
+  // Shorthands for common constants
+  const int nChannels = fPars->nChannels / 2;  // Only count channels on one side of the module
+  const int nModules  = fPars->modules.size();
+  // const int nModuleSides = 2 * nModules; // Number of module front + backsides
+
+  // Set GPU constants
+  SET_GPU_CONSTANT(nModules, nModules);
+  SET_GPU_CONSTANT(nChannels, nChannels);
+
+  SET_GPU_CONSTANT(timeCutDigiAbs, -1.f);
+  SET_GPU_CONSTANT(timeCutDigiSig, 3.f);
+  SET_GPU_CONSTANT(timeCutClusterAbs, -1.f);
+  SET_GPU_CONSTANT(timeCutClusterSig, 4.f);
+
+  SET_GPU_CONSTANT(asic, fPars->asic);
+
+  // Copy landau table
+  size_t nLandauTableEntries = fPars->landauTable.values.size();
+  SET_GPU_CONSTANT(landauTableSize, nLandauTableEntries);
+  SET_GPU_CONSTANT(landauStepSize, fPars->landauTable.stepSize);
+  GPU_ALLOC(landauTable, nLandauTableEntries);
+  xpu::copy(fHitfinderGpu.landauTable, fPars->landauTable.values.data(), nLandauTableEntries);
+
+  // Copy transformation matrix
+  GPU_ALLOC(localToGlobalRotationByModule, 9 * nModules);
+  GPU_ALLOC(localToGlobalTranslationByModule, 3 * nModules);
+
+  // - Copy matrix into flat array
+  for (int m = 0; m < nModules; m++) {
+    const auto& module = fPars->modules.at(m);
+    std::copy_n(module.localToGlobal.rotation.data(), 9, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 9]);
+    std::copy_n(module.localToGlobal.translation.data(), 3, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 3]);
+  }
+
+  // - Then copy to GPU
+  xpu::copy(fHitfinderHost.localToGlobalRotationByModule, xpu::host_to_device);
+  xpu::copy(fHitfinderHost.localToGlobalTranslationByModule, xpu::host_to_device);
+
+  // Copy Sensor Parameteres
+  GPU_ALLOC(sensorPars, nModules);
+  for (int m = 0; m < nModules; m++) {
+    const auto& module = fPars->modules.at(m);
+    auto& gpuPars      = fHitfinderHost.sensorPars.h()[m];
+    gpuPars.dY         = module.dY;
+    gpuPars.pitch      = module.pitch;
+    gpuPars.stereoF    = module.stereoF;
+    gpuPars.stereoB    = module.stereoB;
+    gpuPars.lorentzF   = module.lorentzF;
+    gpuPars.lorentzB   = module.lorentzB;
+  }
+  xpu::copy(fHitfinderHost.sensorPars, xpu::host_to_device);
+}
+
+void StsHitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal)
+{
+  EnsureParameters();
+
+  // TODO: some of these buffers have a constant size and can be allocated statically.
+  // Just the data they contain is static.
+
+  // Shorthands for common constants
+  const int nChannels    = fPars->nChannels / 2;  // Only count channels on one side of the module
+  const int nModules     = fPars->modules.size();
+  const int nModuleSides = 2 * nModules;  // Number of module front + backsides
+
+  const size_t maxNClustersPerModule = maxNDigisPerModule * kClustersPerDigiFactor;
+  const size_t maxNHitsPerModule     = maxNClustersPerModule * kHitsPerClustersFactor;
+
+
+  // Allocate Digi Buffers
+  GPU_ALLOC(digiOffsetPerModule, nModuleSides + 1);
+  GPU_ALLOC(digisPerModule, nDigisTotal);
+  GPU_ALLOC(digisPerModuleTmp, nDigisTotal);
+  GPU_ALLOC(digisSortedPerModule, nModuleSides);
+  GPU_ALLOC(digiConnectorsPerModule, nDigisTotal);
+
+  GPU_ALLOC(channelOffsetPerModule, nModuleSides * nChannels);
+
+  GPU_ALLOC(maxErrorFront, 1);
+  GPU_ALLOC(maxErrorBack, 1);
+
+  // Allocate Cluster Buffers
+  SET_GPU_CONSTANT(maxClustersPerModule, maxNClustersPerModule);
+  GPU_ALLOC(channelStatusPerModule, nChannels * nModules);
+  GPU_ALLOC(clusterIdxPerModule, maxNClustersPerModule * nModuleSides);
+  GPU_ALLOC(clusterIdxPerModuleTmp, maxNClustersPerModule * nModuleSides);
+  GPU_ALLOC(clusterIdxSortedPerModule, nModuleSides);
+  GPU_ALLOC(clusterDataPerModule, maxNClustersPerModule * nModuleSides);
+  GPU_ALLOC(nClustersPerModule, nModuleSides);
+
+  // Allocate Hit Buffers
+  SET_GPU_CONSTANT(maxHitsPerModule, maxNHitsPerModule);
+  GPU_ALLOC(hitsPerModule, maxNHitsPerModule * nModules);
+  GPU_ALLOC(nHitsPerModule, nModules);
+}
+
+StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<const CbmStsDigi> digis,
+                                                                   size_t& maxNDigisPerModule)
+{
+  DigiMap digiMap;
+
+  int nChannelsPerSide = fPars->nChannels / 2;
+
+  for (const auto& digi : digis) {
+    int moduleAddr = CbmStsAddress::GetMotherAddress(digi.GetAddress(), kStsModule);
+    bool isFront   = digi.GetChannel() < fPars->nChannels / 2;
+    if (isFront) digiMap.front[moduleAddr].emplace_back(digi);
+    else {
+      CbmStsDigi tmpdigi = digi;
+      tmpdigi.SetChannel(tmpdigi.GetChannel() - nChannelsPerSide);
+      digiMap.back[moduleAddr].emplace_back(tmpdigi);
+    }
+  }
+
+  maxNDigisPerModule = 0;
+  for (const auto& mod : digiMap.front) {
+    maxNDigisPerModule = std::max(maxNDigisPerModule, mod.second.size());
+  }
+
+  for (const auto& mod : digiMap.back) {
+    maxNDigisPerModule = std::max(maxNDigisPerModule, mod.second.size());
+  }
+
+  return digiMap;
+}
+
+void StsHitfinderChain::FlattenDigis(DigiMap& digiMap)
+{
+  FlattenDigisSide(digiMap.front, true);
+  FlattenDigisSide(digiMap.back, false);
+}
+
+void StsHitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront)
+{
+  EnsureParameters();
+
+  int nModules           = fPars->modules.size();
+  size_t* pMdigiOffset   = fHitfinderHost.digiOffsetPerModule.h();
+  CbmStsDigi* pDigisFlat = fHitfinderHost.digisPerModule.h();
+
+  int sideOffset = (isFront ? 0 : nModules);
+
+  if (isFront) pMdigiOffset[0] = 0;
+  for (int m = 0; m < nModules; m++) {
+    size_t moduleOffset     = pMdigiOffset[sideOffset + m];
+    const auto& moduleDigis = digis[fPars->modules[m].address];
+    std::copy(moduleDigis.begin(), moduleDigis.end(), &pDigisFlat[moduleOffset]);
+    pMdigiOffset[sideOffset + m + 1] = moduleOffset + moduleDigis.size();
+  }
+}
+
+void StsHitfinderChain::CollectTimingInfo()
+{
+  // Check if profiling is enabled
+  // TODO: xpu should have a nicer way to query if profiling is enabled
+  if (xpu::get_timing<SortDigis>().empty()) return;
+
+  // TODO: Add time counters
+  fHitfinderTimes.timeSortDigi    = xpu::get_timing<SortDigis>().back();
+  fHitfinderTimes.timeCluster     = xpu::get_timing<FindClustersSingleStep>().back();
+  fHitfinderTimes.timeSortCluster = xpu::get_timing<SortClusters>().back();
+  fHitfinderTimes.timeHits        = xpu::get_timing<FindHits>().back();
+}
+
+void StsHitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>& clusters)
+{
+  EnsureParameters();
+
+  auto& hfc      = fHitfinderHost;
+  int moduleSide = (isFront ? module : 2 * module);
+
+  int nClusters     = hfc.nClustersPerModule.h()[moduleSide];
+  auto* gpuClusters = &hfc.clusterDataPerModule.h()[moduleSide * hfc.maxClustersPerModule];
+
+  clusters.reserve(clusters.size() + nClusters);
+
+  for (int i = 0; i < nClusters; i++) {
+    clusters.emplace_back(gpuClusters[i]);
+
+    //   auto& clusterGpu = gpuClusters[i];
+
+    //   auto& clusterOut = clusters.emplace_back();
+
+    //   clusterOut.SetAddress(fPars->modules[module].address);
+    //   clusterOut.SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, clusterGpu.fTime,
+    //                             clusterGpu.fTimeError);
+    //   clusterOut.SetSize(clusterGpu.fSize);
+  }
+}
+
+void StsHitfinderChain::AppendHitsModule(int module, std::vector<StsGpuHit>& hits)
+{
+  EnsureParameters();
+
+  auto& hfc = fHitfinderHost;
+
+  auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
+  int nHitsGpu  = hfc.nHitsPerModule.h()[module];
+
+  hits.reserve(hits.size() + nHitsGpu);
+
+  for (int i = 0; i < nHitsGpu; i++) {
+    hits.emplace_back(gpuHits[i]);
+    // auto& hitGpu = gpuHits[i];
+
+    // hits.emplace_back(fPars->modules[module].address,
+    //                   TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ},
+    //                   TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz},
+    //                   hitGpu.fDxy,
+    //                   0,
+    //                   0,
+    //                   double(hitGpu.fTime),
+    //                   hitGpu.fTimeError,
+    //                   hitGpu.fDu,
+    //                   hitGpu.fDv);
+  }
+}
diff --git a/algo/detectors/sts/StsHitfinderChain.h b/algo/detectors/sts/StsHitfinderChain.h
new file mode 100644
index 0000000000000000000000000000000000000000..37fcd4277bcad92e57d33a3e11ddaddc8063c336
--- /dev/null
+++ b/algo/detectors/sts/StsHitfinderChain.h
@@ -0,0 +1,147 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer], Kilian Hunold */
+
+#ifndef CBM_ALGO_STS_HITFINDER_CHAIN_H
+#define CBM_ALGO_STS_HITFINDER_CHAIN_H
+
+#include "CbmStsDigi.h"
+
+#include <array>
+#include <cstdint>
+#include <gsl/span>
+#include <map>
+#include <optional>
+#include <vector>
+
+#include <xpu/host.h>
+
+#include "StsHitfinder.h"
+
+namespace cbm::algo
+{
+
+  struct StsModuleTransformationMatrix {
+    // Rotation + translation matrix to transform
+    // local module coordinates into global coordinate system.
+    // No need for fancy math types here. These values are just copied
+    // and moved to the GPU.
+    std::array<float, 9> rotation;  // 3x3 matrix
+    std::array<float, 3> translation;
+  };
+
+  struct StsModulePar {
+    int32_t address;
+    float dY;
+    float pitch;
+    float stereoF;
+    float stereoB;
+    float lorentzF;
+    float lorentzB;
+    StsModuleTransformationMatrix localToGlobal;
+  };
+
+  struct StsLandauTable {
+    std::vector<float> values;
+    float stepSize;
+  };
+
+  struct StsHitfinderPar {
+    StsAsicPar asic;  // Asic definitions. Currently assumes same parameters for all asics.
+    int nChannels;    // Total number of channels per module. Hitfinder assumes nChannels / 2 channels per side.
+    std::vector<StsModulePar> modules;
+    StsLandauTable landauTable;
+  };
+
+  struct StsHitfinderTimes {
+    double timeIO          = 0;
+    double timeSortDigi    = 0;
+    double timeCluster     = 0;
+    double timeSortCluster = 0;
+    double timeHits        = 0;
+  };
+
+  /**
+   * Sts Hitfinder Chain. This class is responsible for all memory allocations
+   * on the GPU required by the hitfinder. It executes the hitfinder kernels and
+   * handles memory transfer for input / output data and conversion to the
+   * regular CBM STS types.
+   */
+  class StsHitfinderChain {
+
+  public:
+    void SetParameters(const StsHitfinderPar& parameters);
+    const StsHitfinderPar& GetParameters() const { return *fPars; }
+
+    void operator()(gsl::span<const CbmStsDigi>);
+
+    StsHitfinderTimes GetHitfinderTimes() const { return fHitfinderTimes; }
+
+    /**
+       * Returns access to (host copies of) the raw buffers used by the gpu hitfinder.
+       * This is currently the only way to access the produced hits and cluster.
+       */
+    const StsHitfinderHost& GetHitfinderBuffers() const { return fHitfinderHost; }
+
+  private:
+    // Shorthands to map module-address onto digis in that module
+    using DigiMapSide = std::map<int, std::vector<CbmStsDigi>>;
+    struct DigiMap {
+      DigiMapSide front;
+      DigiMapSide back;
+    };
+
+    /**
+       *  Factors used to estimate buffer sizes based on number of digis.
+       *  TODO: These values might be wildly off. Look for better estimations.
+       *  TODO: Should be configurable at runtime.
+       */
+    static constexpr float kClustersPerDigiFactor = 0.5f;
+    static constexpr float kHitsPerClustersFactor = 4.f;
+
+    /**
+       * Ensure parameters were set. Raises log(fatal) otherwise.
+       */
+    void EnsureParameters();
+
+    /**
+       * Allocate memory that doesn't depend on input
+       * Data stays on GPU for the entire duration the hitfinder is alive
+       */
+    void AllocateStatic();
+
+    /**
+       * Allocate memory that depends on input
+       * Data stays on GPU until next timeslice is processed
+       */
+    void AllocateDynamic(size_t, size_t);
+
+    /**
+       * Sort Digis into buckets by module.
+       */
+    DigiMap SortDigisIntoModules(gsl::span<const CbmStsDigi> digis, size_t& maxNDigisPerModule);
+
+    /**
+       * Copy Digis into flat array that can be copied to the GPU.
+       */
+    void FlattenDigis(DigiMap& digiMap);
+    void FlattenDigisSide(DigiMapSide& digis, bool isFront);
+
+    void CollectTimingInfo();
+
+    /**
+       * Transfer Hits / Clusters back to host and convert to common CBM types.
+       */
+    void AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>&);
+    void AppendHitsModule(int module, std::vector<StsGpuHit>&);
+
+    std::optional<const StsHitfinderPar> fPars;
+    StsHitfinderTimes fHitfinderTimes;
+
+    StsHitfinderHost fHitfinderHost;
+    StsHitfinderGpu fHitfinderGpu;
+  };
+
+}  // namespace cbm::algo
+
+#endif  // #ifndef CBM_ALGO_STS_HITFINDER_CHAIN_H
diff --git a/core/detectors/sts/CMakeLists.txt b/core/detectors/sts/CMakeLists.txt
index 423efe49eb6177ea0fb627cb9001ec2547a307b2..3e3d803beef86e1904d07621d70c1d3a2dbc26e7 100644
--- a/core/detectors/sts/CMakeLists.txt
+++ b/core/detectors/sts/CMakeLists.txt
@@ -34,23 +34,23 @@ endif()
 
 set(LIBRARY_NAME CbmStsBase)
 set(LINKDEF ${LIBRARY_NAME}LinkDef.h)
-set(PUBLIC_DEPENDENCIES 
+set(PUBLIC_DEPENDENCIES
   FairRoot::Base
   FairRoot::ParBase
   CbmBase
-  ROOT::Core 
-  ROOT::Geom 
-  ROOT::Gpad 
-  ROOT::Hist 
-  ROOT::MathCore 
+  ROOT::Core
+  ROOT::Geom
+  ROOT::Gpad
+  ROOT::Hist
+  ROOT::MathCore
   ROOT::Physics
   )
 
-set(PRIVATE_DEPENDENCIES 
+set(PRIVATE_DEPENDENCIES
   FairLogger::FairLogger
   CbmData
-  ROOT::Graf 
-  ROOT::MLP 
+  ROOT::Graf
+  ROOT::MLP
   ROOT::Tree
   )
 
diff --git a/core/detectors/sts/CbmStsParSensorCond.h b/core/detectors/sts/CbmStsParSensorCond.h
index eafb33a4c55d870be5b0d60205c50aca54dc2d7d..d4f1240a1c401c6332a83cc09826802fea1401f9 100644
--- a/core/detectors/sts/CbmStsParSensorCond.h
+++ b/core/detectors/sts/CbmStsParSensorCond.h
@@ -116,7 +116,6 @@ public:
      */
   void Init();
 
-
   /** @brief Copy assignment operator **/
   CbmStsParSensorCond& operator=(const CbmStsParSensorCond&);
 
diff --git a/reco/detectors/sts/CMakeLists.txt b/reco/detectors/sts/CMakeLists.txt
index 284386c655f212ed4755f9d5ad263c1dc069be42..7944e331201078365e7636381fbbc691ca4fac6a 100644
--- a/reco/detectors/sts/CMakeLists.txt
+++ b/reco/detectors/sts/CMakeLists.txt
@@ -32,20 +32,10 @@ set(SRCS
   qa/CbmStsFindTracksQa.cxx
   )
 
-
-set(DEVICE_SRCS
-  experimental/CbmStsGpuRecoDevice.cxx
-  experimental/CbmStsGpuHitFinder.cxx
-  )
-
-set(NO_DICT_SRCS
-  ${DEVICE_SRCS}
-  experimental/CbmGpuRecoSts.cxx
-  )
-
 set(LIBRARY_NAME CbmRecoSts)
 set(LINKDEF ${LIBRARY_NAME}LinkDef.h)
 set(PUBLIC_DEPENDENCIES
+  Algo
   CbmData
   CbmStsBase
   FairRoot::Base
@@ -83,10 +73,3 @@ set(INTERFACE_DEPENDENCIES
 # ---------------------------------------------------------
 
 generate_cbm_library()
-
-xpu_attach(CbmRecoSts ${DEVICE_SRCS})
-
-# FIXME: Why is this needed to install headers in subfolder?
-install(DIRECTORY experimental/ TYPE INCLUDE
-    FILES_MATCHING PATTERN "*.h"
-)
diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx
index d9df65037620226d4e4ca246a120c856bb2e551c..287a83316448d6b0aa8a6c39252d5bae8baadae4 100644
--- a/reco/detectors/sts/CbmRecoSts.cxx
+++ b/reco/detectors/sts/CbmRecoSts.cxx
@@ -13,7 +13,6 @@
 #include "CbmDigiManager.h"
 #include "CbmEvent.h"
 #include "CbmStsDigi.h"
-#include "CbmStsGpuHitFinder.h"
 #include "CbmStsModule.h"
 #include "CbmStsParSetModule.h"
 #include "CbmStsParSetSensor.h"
@@ -72,9 +71,9 @@ UInt_t CbmRecoSts::CreateModules()
 
   assert(fSetup);
 
-  std::vector<CbmStsParModule> gpuModules;  // for gpu reco
-  std::vector<int> moduleAddrs;
-  std::vector<experimental::CbmStsHitFinderConfig> hfCfg;
+  std::vector<cbm::algo::StsModulePar> gpuModules;  // for gpu reco
+  // std::vector<int> moduleAddrs;
+  // std::vector<experimental::CbmStsHitFinderConfig> hfCfg;
 
   for (Int_t iModule = 0; iModule < fSetup->GetNofModules(); iModule++) {
 
@@ -92,9 +91,6 @@ UInt_t CbmRecoSts::CreateModules()
     const CbmStsParSensor& sensPar      = fParSetSensor->GetParSensor(sensorAddress);
     const CbmStsParSensorCond& sensCond = fParSetCond->GetParSensor(sensorAddress);
 
-    gpuModules.push_back(modPar);
-    moduleAddrs.push_back(moduleAddress);
-
     // --- Calculate and set average Lorentz shift
     // --- This will be used in hit finding for correcting the position.
     Double_t lorentzF = 0.;
@@ -135,25 +131,56 @@ UInt_t CbmRecoSts::CreateModules()
     auto result = fModules.insert({moduleAddress, recoModule});
     assert(result.second);
     fModuleIndex.push_back(recoModule);
-    hfCfg.push_back({
-      .dY       = sensPar.GetPar(3),
-      .pitch    = sensPar.GetPar(6),
-      .stereoF  = sensPar.GetPar(8),
-      .stereoB  = sensPar.GetPar(9),
-      .lorentzF = float(lorentzF),
-      .lorentzB = float(lorentzB),
-    });
+
+    // Get Transformation Matrix
+    cbm::algo::StsModuleTransformationMatrix localToGlobal;
+    TGeoHMatrix* matrix = recoModule->getMatrix();
+    std::copy_n(matrix->GetRotationMatrix(), 9, localToGlobal.rotation.begin());
+    std::copy_n(matrix->GetTranslation(), 3, localToGlobal.translation.begin());
+
+    // Collect GPU parameters
+    cbm::algo::StsModulePar gpuModulePars {
+      .address       = moduleAddress,
+      .dY            = sensPar.GetPar(3),
+      .pitch         = sensPar.GetPar(6),
+      .stereoF       = sensPar.GetPar(8),
+      .stereoB       = sensPar.GetPar(9),
+      .lorentzF      = float(lorentzF),
+      .lorentzB      = float(lorentzB),
+      .localToGlobal = localToGlobal,
+    };
+    gpuModules.emplace_back(gpuModulePars);
   }
 
-  // Gpu hitfinder
-  experimental::CbmGpuRecoSts::Config gpuConfig {
-    .parModules   = gpuModules,
-    .recoModules  = fModuleIndex,
-    .moduleAddrs  = moduleAddrs,
-    .hitfinderCfg = hfCfg,
-    .physics      = CbmStsPhysics::Instance(),
+  const CbmStsParModule& firstModulePars = fParSetModule->GetParModule(gpuModules[0].address);
+
+  CbmStsParAsic asic = firstModulePars.GetParAsic(0);
+  cbm::algo::StsAsicPar algoAsic {
+    .nAdc           = asic.GetNofAdc(),
+    .dynamicRange   = float(asic.GetDynRange()),
+    .threshold      = float(asic.GetThreshold()),
+    .timeResolution = float(asic.GetTimeResol()),
+    .deadTime       = float(asic.GetDeadTime()),
+    .noise          = float(asic.GetNoise()),
+    .zeroNoiseRate  = float(asic.GetZeroNoiseRate()),
+  };
+
+  int nChannels = firstModulePars.GetNofChannels();
+
+  auto [landauValues, landauStepSize] = CbmStsPhysics::Instance()->GetLandauWidthTable();
+  std::vector<float> landauValuesF;
+  std::copy(landauValues.begin(), landauValues.end(), std::back_inserter(landauValuesF));
+  cbm::algo::StsHitfinderPar pars {
+    .asic      = algoAsic,
+    .nChannels = nChannels,
+    .modules   = gpuModules,
+    .landauTable =
+      {
+        .values   = landauValuesF,
+        .stepSize = float(landauStepSize),
+      },
   };
-  fGpuReco.SetConfig(gpuConfig);
+  if (fUseGpuReco) fGpuReco.SetParameters(pars);
 
   return fModules.size();
 }
@@ -193,10 +220,10 @@ void CbmRecoSts::Exec(Option_t*)
   if (fUseGpuReco) {
 
     ProcessDataGpu();
-    fNofDigis     = fGpuReco.nDigis;
-    fNofDigisUsed = fGpuReco.nDigisUsed;
-    fNofClusters  = fGpuReco.nCluster;
-    fNofHits      = fGpuReco.nHits;
+    // fNofDigis     = fGpuReco.nDigis;
+    // fNofDigisUsed = fGpuReco.nDigisUsed;
+    // fNofClusters  = fGpuReco.nCluster;
+    // fNofHits      = fGpuReco.nHits;
 
     // Old reco in time based mode
   }
@@ -314,14 +341,15 @@ void CbmRecoSts::Finish()
               << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeHits << " ms\n";
   }
   else {
-    double gpuHitfinderTimeTotal =
-      fGpuReco.timeSortDigi + fGpuReco.timeCluster + fGpuReco.timeSortCluster + fGpuReco.timeHits;
+    cbm::algo::StsHitfinderTimes times = fGpuReco.GetHitfinderTimes();
+
+    double gpuHitfinderTimeTotal = times.timeSortDigi + times.timeCluster + times.timeSortCluster + times.timeHits;
     LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms";
     LOG(info) << "Time by step:\n"
-              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortDigi << " ms\n"
-              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeCluster << " ms\n"
-              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortCluster << " ms\n"
-              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeHits << " ms";
+              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << times.timeSortDigi << " ms\n"
+              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << times.timeCluster << " ms\n"
+              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << times.timeSortCluster << " ms\n"
+              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << times.timeHits << " ms";
   }
   LOG(info) << "=====================================";
 }
@@ -636,8 +664,86 @@ void CbmRecoSts::ProcessDataGpu()
 {
   if (fMode == kCbmRecoEvent) throw std::runtime_error("STS GPU Reco does not yet support event-by-event mode.");
 
-  fGpuReco.RunHitFinder();
-  fGpuReco.ForwardClustersAndHits(fClusters, fHits);
+  auto digis = fDigiManager->GetArray<CbmStsDigi>();
+  fGpuReco(digis);
+  auto [nClustersForwarded, nHitsForwarded] = ForwardGpuClusterAndHits();
+
+  fNofDigis     = digis.size();
+  fNofDigisUsed = digis.size();
+  fNofClusters  = nClustersForwarded;
+  fNofHits      = nHitsForwarded;
+}
+
+std::pair<size_t, size_t> CbmRecoSts::ForwardGpuClusterAndHits()
+{
+  size_t nClustersForwarded = 0, nHitsForwarded = 0;
+
+  const cbm::algo::StsHitfinderHost& hfc = fGpuReco.GetHitfinderBuffers();
+  const cbm::algo::StsHitfinderPar& pars = fGpuReco.GetParameters();
+
+  for (int module = 0; module < hfc.nModules; module++) {
+
+    auto* gpuClusterF    = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule];
+    auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule];
+    int nClustersFGpu    = hfc.nClustersPerModule.h()[module];
+
+    nClustersForwarded += nClustersFGpu;
+
+    for (int i = 0; i < nClustersFGpu; i++) {
+      auto& cidx       = gpuClusterIdxF[i];
+      auto& clusterGpu = gpuClusterF[cidx.fIdx];
+
+      unsigned int outIdx = fClusters->GetEntriesFast();
+      auto* clusterOut    = new ((*fClusters)[outIdx])::CbmStsCluster {};
+
+      clusterOut->SetAddress(pars.modules[module].address);
+      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
+                                clusterGpu.fTimeError);
+      clusterOut->SetSize(clusterGpu.fSize);
+    }
+
+    auto* gpuClusterB    = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
+    auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
+    int nClustersBGpu    = hfc.nClustersPerModule.h()[module + hfc.nModules];
+
+    nClustersForwarded += nClustersBGpu;
+
+    for (int i = 0; i < nClustersBGpu; i++) {
+      auto& cidx          = gpuClusterIdxB[i];
+      auto& clusterGpu    = gpuClusterB[cidx.fIdx];
+      unsigned int outIdx = fClusters->GetEntriesFast();
+      auto* clusterOut    = new ((*fClusters)[outIdx])::CbmStsCluster {};
+
+      clusterOut->SetAddress(pars.modules[module].address);
+      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
+                                clusterGpu.fTimeError);
+      clusterOut->SetSize(clusterGpu.fSize);
+    }
+
+    auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
+    int nHitsGpu  = hfc.nHitsPerModule.h()[module];
+
+    nHitsForwarded += nHitsGpu;
+
+    for (int i = 0; i < nHitsGpu; i++) {
+      auto& hitGpu = gpuHits[i];
+
+      unsigned int outIdx = fHits->GetEntriesFast();
+      new ((*fHits)[outIdx])::CbmStsHit {pars.modules[module].address,
+                                         TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ},
+                                         TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz},
+                                         hitGpu.fDxy,
+                                         0,
+                                         0,
+                                         double(hitGpu.fTime),
+                                         hitGpu.fTimeError,
+                                         hitGpu.fDu,
+                                         hitGpu.fDv};
+    }
+
+  }  // for (int module = 0; module < hfc.nModules; module++)
+
+  return {nClustersForwarded, nHitsForwarded};
 }
 
 
@@ -655,13 +761,13 @@ void CbmRecoSts::SetParContainers()
 void CbmRecoSts::DumpNewHits()
 {
   std::ofstream out {"newHits.csv"};
-
+  const cbm::algo::StsHitfinderHost& hfc = fGpuReco.GetHitfinderBuffers();
   out << "module, x, y, z, deltaX, deltaY, deltaZ, deltaXY, time, timeError, deltaU, deltaV" << std::endl;
-
   for (size_t m = 0; m < fModuleIndex.size(); m++) {
-    auto hits = fGpuReco.GetHits(m);
-
-    for (const auto& h : hits) {
+    int nHitsGpu  = hfc.nHitsPerModule.h()[m];
+    auto* gpuHits = &hfc.hitsPerModule.h()[m * hfc.maxHitsPerModule];
+    for (int i = 0; i < nHitsGpu; i++) {
+      auto& h = gpuHits[i];
       out << m << ", " << h.fX << ", " << h.fY << ", " << h.fZ << ", " << h.fDx << ", " << h.fDy << ", " << h.fDz
           << ", " << h.fDxy << ", " << h.fTime << ", " << h.fTimeError << ", " << h.fDu << ", " << h.fDv << std::endl;
     }
diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h
index f11af70cdbbccd3423c9c9b0a89e60599cad18c7..c6f4f0755ca3ef0d0ee45abf42bba8e9661d2bea 100644
--- a/reco/detectors/sts/CbmRecoSts.h
+++ b/reco/detectors/sts/CbmRecoSts.h
@@ -11,12 +11,13 @@
 #ifndef CBMRECOSTS_H
 #define CBMRECOSTS_H 1
 
-#include "CbmGpuRecoSts.h"
-
 #include <FairTask.h>
 
+#include <TClonesArray.h>
 #include <TStopwatch.h>
 
+#include "StsHitfinderChain.h"
+
 class CbmDigiManager;
 class CbmEvent;
 class CbmStsElement;
@@ -214,6 +215,8 @@ public:
      */
   void UseSensorParSet(CbmStsParSetSensor* sensorParSet) { fUserParSetSensor = sensorParSet; }
 
+  void DumpNewHits();
+  void DumpOldHits();
 
 private:
   /** @brief Average Lorentz Shift in a sensor
@@ -258,11 +261,6 @@ private:
 
   void ProcessDataGpu();
 
-  void DumpNewHits();
-
-  void DumpOldHits();
-
-
 private:
   // --- I/O
   TClonesArray* fEvents        = nullptr;  //! Input array of events
@@ -329,7 +327,9 @@ private:
   std::vector<CbmStsRecoModule*> fModuleIndex {};   //!
 
   bool fUseGpuReco = false;
-  ::experimental::CbmGpuRecoSts fGpuReco;
+  cbm::algo::StsHitfinderChain fGpuReco;
+
+  std::pair<size_t, size_t> ForwardGpuClusterAndHits();
 
   ClassDef(CbmRecoSts, 1);
 };
diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx b/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx
deleted file mode 100644
index 1001a5ba8b15cf1fe3dde9508adfcf35654c9be0..0000000000000000000000000000000000000000
--- a/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx
+++ /dev/null
@@ -1,405 +0,0 @@
-/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Felix Weiglhofer [committer], Kilian Hunold */
-#include "CbmGpuRecoSts.h"
-
-#include "CbmAddress.h"
-#include "CbmDigiManager.h"
-#include "CbmStsAddress.h"
-#include "CbmStsCluster.h"
-#include "CbmStsGpuRecoDevice.h"
-#include "CbmStsHit.h"
-#include "CbmStsPhysics.h"
-#include "CbmStsRecoModule.h"
-
-#include <TCanvas.h>
-#include <TClonesArray.h>
-#include <TGeoMatrix.h>
-#include <TH1D.h>
-#include <TH2D.h>
-#include <TStopwatch.h>
-
-#include <algorithm>
-#include <cassert>
-#include <unordered_set>
-#include <vector>
-
-using namespace experimental;
-
-void CbmGpuRecoSts::SetupConstants()
-{
-  assert(fConfig != std::nullopt);
-  assert(fConfig->recoModules.size() == fConfig->parModules.size());
-  assert(fConfig->moduleAddrs.size() == fConfig->parModules.size());
-  assert(fConfig->hitfinderCfg.size() == fConfig->parModules.size());
-
-  size_t nModules = fConfig->parModules.size();
-
-  auto& hfc = fHitfinderCpu;
-  auto& hfg = fHitfinderGpu;
-
-  LOG(info) << "STS GPU RECO: setting constants";
-
-  size_t nChannels     = fConfig->parModules[0].GetNofChannels();
-  ::CbmStsParAsic asic = fConfig->parModules[0].GetParAsic(0);  // TODO values are currently identical for all asics??
-  // for (auto &mod : modules) {
-  //     assert(nChannels == mod.GetNofChannels());
-  //     for (size_t c = 0; c < nChannels; c++) {
-  //         ::CbmStsParAsic otherAsic = mod.GetParAsic(c);
-  //         assert(asic.GetNofAdc() == otherAsic.GetNofAdc());
-  //         assert(asic.GetDynRange() == otherAsic.GetDynRange());
-  //         assert(asic.GetThreshold() == otherAsic.GetThreshold());
-  //         assert(asic.GetTimeResol() == otherAsic.GetTimeResol());
-  //         assert(asic.GetDeadTime() == otherAsic.GetDeadTime());
-  //         assert(asic.GetNoise() == otherAsic.GetNoise());
-  //         assert(asic.GetZeroNoiseRate() == otherAsic.GetZeroNoiseRate());
-  //     }
-  // }
-
-  nChannels /= 2;
-  hfc.nModules = hfg.nModules = nModules;
-  hfc.nChannels = hfg.nChannels = nChannels;
-
-  hfc.timeCutDigiAbs = hfg.timeCutDigiAbs = -1.f;
-  hfc.timeCutDigiSig = hfg.timeCutDigiSig = 3.f;
-  hfc.timeCutClusterAbs = hfg.timeCutClusterAbs = -1.f;
-  hfc.timeCutClusterSig = hfg.timeCutClusterSig = 4.f;
-
-  hfc.asic.nAdc           = asic.GetNofAdc();
-  hfc.asic.dynRange       = asic.GetDynRange();
-  hfc.asic.threshold      = asic.GetThreshold();
-  hfc.asic.timeResolution = asic.GetTimeResol();
-  hfc.asic.deadTime       = asic.GetDeadTime();
-  hfc.asic.noise          = asic.GetNoise();
-  hfc.asic.zeroNoiseRate  = asic.GetZeroNoiseRate();
-  hfg.asic                = hfc.asic;
-}
-
-void CbmGpuRecoSts::Setup(size_t maxDigisPerModule, size_t nDigitsTotal)
-{
-  assert(fConfig != std::nullopt);
-
-  // static constexpr size_t maxDigisPerModule    = 60000;
-  // static constexpr size_t maxClustersPerModule = 30000;
-  // static constexpr size_t maxHitsPerModule     = 120000;
-
-  const size_t maxClustersPerModule = maxDigisPerModule * 0.5;
-  const size_t maxHitsPerModule     = maxClustersPerModule * 4;
-
-  size_t nModules  = fConfig->parModules.size();
-  size_t nChannels = fConfig->parModules[0].GetNofChannels() / 2;
-
-
-  size_t nModulesTotal = nModules * 2;  // Front- and backside
-
-  auto& hfc = fHitfinderCpu;
-  auto& hfg = fHitfinderGpu;
-
-  auto landauTable    = fConfig->physics->GetLandauWidthTable();
-  hfc.landauTableSize = hfg.landauTableSize = landauTable.size();
-  hfc.landauTable                           = xpu::hd_buffer<float> {size_t(hfc.landauTableSize)};
-  hfg.landauTable                           = hfc.landauTable.d();
-  auto landauEntry                          = landauTable.begin();
-  auto prevLandauEntry                      = landauEntry;
-  hfc.landauTable.h()[0]                    = landauEntry->second;
-  landauEntry++;
-  hfc.landauStepSize = hfg.landauStepSize = landauEntry->first - prevLandauEntry->first;
-  for (size_t i = 1; landauEntry != landauTable.end(); i++, landauEntry++) {
-    assert(hfc.landauStepSize == landauEntry->first - prevLandauEntry->first);
-    assert(i < hfc.landauTable.size());
-    hfc.landauTable.h()[i] = landauEntry->second;
-    prevLandauEntry        = landauEntry;
-  }
-
-  LOG(info) << "STS GPU RECO: finished landau";
-
-  hfc.localToGlobalTranslationByModule = xpu::hd_buffer<float> {nModules * 3};
-  hfg.localToGlobalTranslationByModule = hfc.localToGlobalTranslationByModule.d();
-  hfc.localToGlobalRotationByModule    = xpu::hd_buffer<float> {nModules * 9};
-  hfg.localToGlobalRotationByModule    = hfc.localToGlobalRotationByModule.d();
-  for (size_t m = 0; m < fConfig->recoModules.size(); m++) {
-    auto& recoMod       = fConfig->recoModules[m];
-    TGeoHMatrix* matrix = recoMod->getMatrix();
-
-    const double* rot = matrix->GetRotationMatrix();
-    float* tgt        = &hfc.localToGlobalRotationByModule.h()[m * 9];
-    std::copy_n(rot, 9, tgt);
-
-    const double* tr = matrix->GetTranslation();
-    tgt              = &hfc.localToGlobalTranslationByModule.h()[m * 3];
-    std::copy_n(tr, 3, tgt);
-  }
-
-  LOG(info) << "STS GPU RECO: finished transformation matrix";
-
-  hfc.hitfinderConfig = xpu::hd_buffer<CbmStsHitFinderConfig>(nModules);
-  hfg.hitfinderConfig = hfc.hitfinderConfig.d();
-
-  std::copy_n(fConfig->hitfinderCfg.begin(), nModules, hfc.hitfinderConfig.h());
-
-  LOG(info) << "STS GPU RECO: finished setting hitfinder config";
-
-  // FIXME: allocate by total number of digis instead. This array is flat.
-  hfc.digiOffsetPerModule    = xpu::hd_buffer<size_t> {nModulesTotal + 1};
-  hfg.digiOffsetPerModule    = hfc.digiOffsetPerModule.d();
-  hfc.digisPerModule         = xpu::hd_buffer<CbmStsDigi> {nDigitsTotal};
-  hfg.digisPerModule         = hfc.digisPerModule.d();
-  hfc.digisPerModuleTmp      = xpu::d_buffer<CbmStsDigi> {nDigitsTotal};
-  hfg.digisPerModuleTmp      = hfc.digisPerModuleTmp.d();
-  hfc.digisSortedPerModule   = xpu::d_buffer<CbmStsDigi*> {nModulesTotal};
-  hfg.digisSortedPerModule   = hfc.digisSortedPerModule.d();
-  hfc.digiConnectorPerModule = xpu::d_buffer<CbmStsDigiConnector> {nDigitsTotal};
-  hfg.digiConnectorPerModule = hfc.digiConnectorPerModule.d();
-
-  hfc.channelOffsetPerModule = xpu::d_buffer<unsigned int> {nModulesTotal * nChannels};
-  hfg.channelOffsetPerModule = hfc.channelOffsetPerModule.d();
-
-  hfc.maxErrorFront = xpu::d_buffer<float> {1};
-  hfg.maxErrorFront = hfc.maxErrorFront.d();
-  hfc.maxErrorBack  = xpu::d_buffer<float> {1};
-  hfg.maxErrorBack  = hfc.maxErrorBack.d();
-
-  LOG(info) << "STS GPU RECO: finished digis";
-
-  hfc.maxClustersPerModule = hfg.maxClustersPerModule = maxClustersPerModule;
-  hfc.channelStatusPerModule                          = xpu::d_buffer<int> {nChannels * nModulesTotal};
-  hfg.channelStatusPerModule                          = hfc.channelStatusPerModule.d();
-  hfc.clusterIdxPerModule       = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal};
-  hfg.clusterIdxPerModule       = hfc.clusterIdxPerModule.d();
-  hfc.clusterIdxPerModuleTmp    = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal};
-  hfg.clusterIdxPerModuleTmp    = hfc.clusterIdxPerModuleTmp.d();
-  hfc.clusterIdxSortedPerModule = xpu::hd_buffer<CbmStsClusterIdx*> {nModulesTotal};
-  hfg.clusterIdxSortedPerModule = hfc.clusterIdxSortedPerModule.d();
-  hfc.clusterDataPerModule      = xpu::hd_buffer<CbmStsClusterData> {maxClustersPerModule * nModulesTotal};
-  hfg.clusterDataPerModule      = hfc.clusterDataPerModule.d();
-  hfc.nClustersPerModule        = xpu::hd_buffer<int> {nModulesTotal};
-  hfg.nClustersPerModule        = hfc.nClustersPerModule.d();
-
-  LOG(info) << "STS GPU RECO: finished cluster";
-
-  hfc.maxHitsPerModule = hfg.maxHitsPerModule = maxHitsPerModule;
-  hfc.hitsPerModule                           = xpu::hd_buffer<CbmStsHit>(maxHitsPerModule * nModules);
-  hfg.hitsPerModule                           = hfc.hitsPerModule.d();
-  hfc.nHitsPerModule                          = xpu::hd_buffer<int>(nModules);
-  hfg.nHitsPerModule                          = hfc.nHitsPerModule.d();
-
-  LOG(info) << "STS GPU RECO: finished setup";
-  LOG(info) << "- nModules = " << nModules;
-  LOG(info) << "- nChannels = " << nChannels;
-}
-
-void CbmGpuRecoSts::RunHitFinder()
-{
-  auto& hfc = fHitfinderCpu;
-
-  // ROCm bug: Mi100 name is an emtpy string...
-  auto deviceName = (xpu::device_properties().name.empty()) ? "AMD Mi100" : xpu::device_properties().name;
-
-  LOG(info) << "STS GPU RECO: running GPU hit finder on device " << deviceName;
-  LOG(info) << "STS GPU RECO: Allocate and fetch digis";
-
-
-  // FIXME: This is ugly, setup has to be split in two parts, constants required by FetchDigis
-  // And FetchDigis required by Setup...
-  SetupConstants();
-  size_t maxDigisPerModule = 0;
-  size_t nDigisTotal       = 0;
-  FetchDigis(maxDigisPerModule, nDigisTotal);
-  Setup(maxDigisPerModule, nDigisTotal);
-
-  LOG(info) << "STS GPU RECO: finished digis / running clusterizer now";
-  LOG(info) << "STS GPU RECO: largest module has " << maxDigisPerModule << " digis";
-
-  // Not all buffers have to initialized, but useful for debugging
-  // xpu::memset(hfc.digisPerModule, 0);
-  // xpu::memset(hfc.digisPerModuleTmp, 0);
-  // xpu::memset(hfc.digisSortedPerModule, 0);
-  // xpu::memset(hfc.digiOffsetPerModule, 0);
-  xpu::memset(hfc.digiConnectorPerModule, 0);
-  // xpu::memset(hfc.channelOffsetPerModule, 0);
-  xpu::memset(hfc.channelStatusPerModule, -1);
-  xpu::memset(hfc.clusterIdxPerModule, 0);
-  // xpu::memset(hfc.clusterIdxPerModuleTmp, 0);
-  // xpu::memset(hfc.clusterIdxSortedPerModule, 0);
-  // xpu::memset(hfc.clusterDataPerModule, 0);
-  xpu::memset(hfc.nClustersPerModule, 0);
-  // xpu::memset(hfc.hitsPerModule, 0);
-  xpu::memset(hfc.nHitsPerModule, 0);
-  xpu::memset(hfc.maxErrorFront, 0);
-  xpu::memset(hfc.maxErrorBack, 0);
-
-  hfc.digiOffsetPerModule.h()[0] = 0;
-  for (int m = 0; m < hfc.nModules; m++) {
-    size_t offset = hfc.digiOffsetPerModule.h()[m];
-    auto& md      = fDigisByModuleF[fConfig->moduleAddrs[m]];
-    std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]);
-    hfc.digiOffsetPerModule.h()[m + 1] = offset + md.size();
-  }
-  for (int m = 0; m < hfc.nModules; m++) {
-    size_t offset = hfc.digiOffsetPerModule.h()[m + hfc.nModules];
-    auto& md      = fDigisByModuleB[fConfig->moduleAddrs[m]];
-    std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]);
-    hfc.digiOffsetPerModule.h()[m + hfc.nModules + 1] = offset + md.size();
-  }
-  assert(nDigisTotal == hfc.digiOffsetPerModule.h()[2 * hfc.nModules]);
-
-  TStopwatch timer;
-
-  // constants only need to be transferred once. should be excluded from time measurement
-  xpu::copy(hfc.landauTable, xpu::host_to_device);
-  xpu::copy(hfc.hitfinderConfig, xpu::host_to_device);
-  xpu::copy(hfc.localToGlobalRotationByModule, xpu::host_to_device);
-  xpu::copy(hfc.localToGlobalTranslationByModule, xpu::host_to_device);
-
-  xpu::set_constant<HitFinder>(fHitfinderGpu);
-  xpu::copy(hfc.digisPerModule, xpu::host_to_device);
-  xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device);
-
-  xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(hfc.nModules * 2));
-  xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // Run cluster finding steps in indivual kernels, useful for debugging / profiling
-  // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
-  xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
-  xpu::run_kernel<FindHits>(xpu::grid::n_blocks(hfc.nModules));
-
-  // Check if profiling is enabled
-  if (not xpu::get_timing<SortDigis>().empty()) {
-    timeSortDigi    = xpu::get_timing<SortDigis>().back();
-    timeCluster     = xpu::get_timing<FindClustersSingleStep>().back();
-    timeSortCluster = xpu::get_timing<SortClusters>().back();
-    timeHits        = xpu::get_timing<FindHits>().back();
-  }
-
-  xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host);
-  xpu::copy(hfc.nClustersPerModule, xpu::device_to_host);
-  xpu::copy(hfc.hitsPerModule, xpu::device_to_host);
-  xpu::copy(hfc.nHitsPerModule, xpu::device_to_host);
-
-
-  size_t mostCluster = 0, mostHits = 0;
-  for (int m = 0; m < hfc.nModules; m++) {
-    mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m], mostCluster);
-    mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m + hfc.nModules], mostCluster);
-    mostHits    = std::max<size_t>(hfc.nHitsPerModule.h()[m], mostHits);
-
-    nCluster += hfc.nClustersPerModule.h()[m];
-    nCluster += hfc.nClustersPerModule.h()[m + hfc.nModules];
-
-    nHits += hfc.nHitsPerModule.h()[m];
-  }
-
-  LOG(info) << "STS GPU RECO: finished hitfinder";
-}
-
-void CbmGpuRecoSts::ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits)
-{
-  auto& hfc = fHitfinderCpu;
-
-  for (int module = 0; module < hfc.nModules; module++) {
-
-    auto* gpuClusterF    = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule];
-    auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule];
-    int nClustersFGpu    = hfc.nClustersPerModule.h()[module];
-
-    for (int i = 0; i < nClustersFGpu; i++) {
-      auto& cidx       = gpuClusterIdxF[i];
-      auto& clusterGpu = gpuClusterF[cidx.fIdx];
-
-      unsigned int outIdx = clusters->GetEntriesFast();
-      auto* clusterOut    = new ((*clusters)[outIdx])::CbmStsCluster {};
-
-      clusterOut->SetAddress(fConfig->moduleAddrs[module]);
-      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
-                                clusterGpu.fTimeError);
-      clusterOut->SetSize(clusterGpu.fSize);
-    }
-
-    auto* gpuClusterB    = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
-    auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
-    int nClustersBGpu    = hfc.nClustersPerModule.h()[module + hfc.nModules];
-
-    for (int i = 0; i < nClustersBGpu; i++) {
-      auto& cidx          = gpuClusterIdxB[i];
-      auto& clusterGpu    = gpuClusterB[cidx.fIdx];
-      unsigned int outIdx = clusters->GetEntriesFast();
-      auto* clusterOut    = new ((*clusters)[outIdx])::CbmStsCluster {};
-
-      clusterOut->SetAddress(fConfig->moduleAddrs[module]);
-      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
-                                clusterGpu.fTimeError);
-      clusterOut->SetSize(clusterGpu.fSize);
-    }
-
-    auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
-    int nHitsGpu  = hfc.nHitsPerModule.h()[module];
-
-    for (int i = 0; i < nHitsGpu; i++) {
-      auto& hitGpu = gpuHits[i];
-
-      unsigned int outIdx = hits->GetEntriesFast();
-      new ((*hits)[outIdx])::CbmStsHit {fConfig->moduleAddrs[module],
-                                        TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ},
-                                        TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz},
-                                        hitGpu.fDxy,
-                                        0,
-                                        0,
-                                        double(hitGpu.fTime),
-                                        hitGpu.fTimeError,
-                                        hitGpu.fDu,
-                                        hitGpu.fDv};
-    }
-
-  }  // for (int module = 0; module < hfc.nModules; module++)
-}
-
-void CbmGpuRecoSts::FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal)
-{
-
-  // FIXME: This function is a crime against humanity.
-  // Digis should be presorted by module already...
-
-  CbmDigiManager* digis = CbmDigiManager::Instance();
-  auto& hfc             = fHitfinderCpu;
-
-  // Remove digis from previous timeslice
-  fDigisByModuleB.clear();
-  fDigisByModuleF.clear();
-
-  // FIXME: GPU reco should use regular digi class too
-  nDigis      = digis->GetNofDigis(ECbmModuleId::kSts);
-  nDigisTotal = 0;
-  for (size_t i = 0; i < nDigis; i++) {
-    const ::CbmStsDigi* digi = digis->Get<::CbmStsDigi>(i);
-    Int_t systemId           = CbmAddress::GetSystemId(digi->GetAddress());
-    // TODO: is this check still needed?
-    if (systemId != ToIntegralType(ECbmModuleId::kSts)) { continue; }
-    int moduleAddr = CbmStsAddress::GetMotherAddress(digi->GetAddress(), kStsModule);
-    // int module     = addrToModuleId(moduleAddr);
-    // assert(module < hfc.nModules);
-    bool isFront = digi->GetChannel() < hfc.nChannels;
-
-    if (isFront) { fDigisByModuleF[moduleAddr].push_back(*digi); }
-    else {
-      CbmStsDigi tmpdigi = *digi;
-      tmpdigi.SetChannel(digi->GetChannel() - hfc.nChannels);
-      fDigisByModuleB[moduleAddr].push_back(tmpdigi);
-    }
-
-    nDigisTotal++;
-  }
-
-  nDigisUsed = nDigisTotal;
-
-  maxDigisPerModule = 0;
-  for (const auto& mod : fDigisByModuleF) {
-    maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size());
-  }
-
-  for (const auto& mod : fDigisByModuleB) {
-    maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size());
-  }
-}
diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.h b/reco/detectors/sts/experimental/CbmGpuRecoSts.h
deleted file mode 100644
index a26efdbe911d569a09e0856d433010f2828c9c20..0000000000000000000000000000000000000000
--- a/reco/detectors/sts/experimental/CbmGpuRecoSts.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Felix Weiglhofer [committer], Kilian Hunold */
-#ifndef CBMGPURECOSTS_H
-#define CBMGPURECOSTS_H
-
-#include "CbmStsDigi.h"
-#include "CbmStsGpuHitFinder.h"
-#include "CbmStsParModule.h"
-
-#include <map>
-#include <optional>
-#include <vector>
-
-#include <xpu/host.h>
-
-class CbmStsPhysics;
-class CbmStsRecoModule;
-class TClonesArray;
-
-namespace experimental
-{
-
-  class CbmGpuRecoSts {
-
-  public:
-    struct Config {
-      std::vector<CbmStsParModule> parModules;
-      std::vector<CbmStsRecoModule*> recoModules;
-      std::vector<int> moduleAddrs;
-      std::vector<CbmStsHitFinderConfig> hitfinderCfg;
-      CbmStsPhysics* physics;
-    };
-
-    double timeIO          = 0;
-    double timeSortDigi    = 0;
-    double timeCluster     = 0;
-    double timeSortCluster = 0;
-    double timeHits        = 0;
-
-    size_t nDigis     = 0;
-    size_t nDigisUsed = 0;
-    size_t nHits      = 0;
-    size_t nCluster   = 0;
-
-    void SetConfig(const Config& cfg) { fConfig = cfg; }
-
-    void SetupConstants();
-    void Setup(size_t maxDigisPerModule, size_t nDigitsTotal);
-
-    void RunHitFinder();
-
-    void ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits);
-
-    std::vector<CbmStsHit> GetHits(int module)
-    {
-      auto& hfc = fHitfinderCpu;
-      auto* st  = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
-      auto* end = st + hfc.nHitsPerModule.h()[module];
-      return std::vector<CbmStsHit> {st, end};
-    }
-
-  private:
-    std::optional<Config> fConfig;  // Initialized late...
-
-    // std::vector<int> fModuleAddrs;
-    CbmStsHitFinder<xpu::side::host> fHitfinderCpu;
-    CbmStsGpuHitFinder fHitfinderGpu;
-
-    std::map<int, std::vector<CbmStsDigi>> fDigisByModuleF;
-    std::map<int, std::vector<CbmStsDigi>> fDigisByModuleB;
-
-    void FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal);
-  };
-
-}  // namespace experimental
-
-#endif
diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h
deleted file mode 100644
index 5d3a5319ef02a1acaf219ca377485254e1e613ce..0000000000000000000000000000000000000000
--- a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Felix Weiglhofer [committer], Kilian Hunold */
-#ifndef CBMSTSGPUHITFINDER_H
-#define CBMSTSGPUHITFINDER_H
-
-#include "CbmStsDigi.h"
-#include "CbmStsGpuRecoDevice.h"
-
-#include <xpu/device.h>
-
-// experimental namespace to avoid name collisions with original data types
-namespace experimental
-{
-  using CbmStsTimeNs = uint32_t;
-
-  using XErrorT = float;  // FIXME: remove and replace by float
-
-  struct CbmStsClusterIdx {
-    CbmStsTimeNs fTime;  ///< Cluster time (average of digi times) [ns]
-    int fIdx;
-  };
-
-  struct CbmStsClusterData {
-    float fCharge;           ///< Total charge
-    int fSize;               ///< Difference between first and last channel
-    float fPosition;         ///< Cluster centre in channel number units
-    XErrorT fPositionError;  ///< Cluster centre error (r.m.s.) in channel number units
-    float fTimeError;        ///< Error of cluster time [ns]
-  };
-
-  struct CbmStsHit {
-    float fX, fY;        ///< X, Y positions of hit [cm]
-    float fZ;            ///< Z position of hit [cm]
-    CbmStsTimeNs fTime;  ///< Hit time [ns]
-    float fDx, fDy;      ///< X, Y errors [cm]
-    float fDz;           ///< Z position error [cm]
-    float fDxy;          ///< XY correlation
-    float fTimeError;    ///< Error of hit time [ns]
-    float fDu;           ///< Error of coordinate across front-side strips [cm]
-    float fDv;           ///< Error of coordinate across back-side strips [cm]
-  };
-
-  struct CbmStsParAsic {
-    int nAdc;
-    float dynRange;
-    float threshold;
-    float timeResolution;
-    float deadTime;
-    float noise;
-    float zeroNoiseRate;
-
-    XPU_D float AdcToCharge(unsigned short adc) const
-    {
-      return threshold + dynRange / float(nAdc) * (float(adc) + 0.5f);
-    }
-  };
-
-  struct CbmStsHitFinderConfig {
-    float dY;
-    float pitch;
-    float stereoF;
-    float stereoB;
-    float lorentzF;
-    float lorentzB;
-  };
-
-  // cache of various parameters used by the hit finder
-  struct CbmStsHitFinderParams : public CbmStsHitFinderConfig {
-    float tanStereoF;
-    float tanStereoB;
-    float errorFac;
-    float dX;
-  };
-
-  using SortDigisT = xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis>::value.x,
-                                     CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD>;
-
-  struct CbmStsSortDigiSmem {
-    typename SortDigisT::storage_t sortBuf;
-  };
-
-  using SortClustersT = xpu::block_sort<CbmStsTimeNs, CbmStsClusterIdx, xpu::block_size<SortClusters>::value.x,
-                                        CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD>;
-
-  struct CbmStsSortClusterSmem {
-    typename SortClustersT::storage_t sortBuf;
-  };
-
-  struct CbmStsClusterCalculationProperties {
-    float tSum      = 0.;       // sum of digi times
-    int chanF       = 9999999;  // first channel in cluster
-    int chanL       = -1;       // last channel in cluster
-    float qF        = 0.f;      // charge in first channel
-    float qM        = 0.f;      // sum of charges in middle channels
-    float qL        = 0.f;      // charge in last cluster
-    float eqFsq     = 0.f;      // uncertainty of qF
-    float eqMsq     = 0.f;      // uncertainty of qMid
-    float eqLsq     = 0.f;      // uncertainty of qL
-    float tResolSum = 0.f;
-
-    float xSum = 0.f;  // weighted charge sum, used to correct corrupt clusters
-  };
-
-  class CbmStsDigiConnector {
-  private:
-    unsigned int hasPreviousAndNext;
-
-    XPU_D unsigned int UnpackNext(unsigned int val) const { return val & ~(1u << 31); }
-
-  public:
-    XPU_D bool HasPrevious() const { return (hasPreviousAndNext >> 31) & 1u; }
-
-    XPU_D void SetHasPrevious(bool hasPrevious)
-    {
-      unsigned int old          = hasPreviousAndNext;
-      unsigned int hasPreviousI = ((unsigned int) hasPrevious) << 31;
-      unsigned int assumed;
-
-      do {
-        assumed = old;
-        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI);
-      } while (old != assumed);
-    }
-
-    XPU_D unsigned int next() const { return UnpackNext(hasPreviousAndNext); }
-
-    XPU_D void SetNext(unsigned int next)
-    {
-      unsigned int old = hasPreviousAndNext;
-      unsigned int assumed;
-
-      next = xpu::min((1u << 31) - 1, next);
-
-      do {
-        assumed = old;
-        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next);
-      } while (old != assumed);
-    }
-
-    XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; }
-  };
-
-  template<xpu::side S>
-  class CbmStsHitFinder {
-
-  public:
-    // Constants
-    int nModules;
-    int nChannels;
-
-    // calibration data
-    float timeCutDigiAbs;
-    float timeCutDigiSig;
-    float timeCutClusterAbs;
-    float timeCutClusterSig;
-    CbmStsParAsic asic;
-    int landauTableSize;
-    float landauStepSize;
-    xpu::cmem_io_t<float, S> landauTable;
-
-    xpu::cmem_device_t<float, S> maxErrorFront;
-    xpu::cmem_device_t<float, S> maxErrorBack;
-
-    // TODO fill this array
-    xpu::cmem_io_t<CbmStsHitFinderConfig, S> hitfinderConfig;
-
-    // input
-    // Store all digis in a flat array with a header that contains the offset for every module (front and back)
-    xpu::cmem_io_t<size_t, S> digiOffsetPerModule;  // 2 * nModules + 1 entries, last entry contains total digi count
-    xpu::cmem_io_t<CbmStsDigi, S> digisPerModule;   // TODO this is a flat array now, should be renamed
-    // Two digi arrays needed, as sorting can't sort in place right now
-    xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp;
-    xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule;
-
-    xpu::cmem_io_t<float, S> localToGlobalTranslationByModule;
-    xpu::cmem_io_t<float, S> localToGlobalRotationByModule;
-
-    xpu::cmem_device_t<CbmStsDigiConnector, S> digiConnectorPerModule;
-    xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule;
-
-    // intermediate results
-    size_t maxClustersPerModule;
-    xpu::cmem_device_t<int, S> channelStatusPerModule;
-    xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModule;
-    xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModuleTmp;
-    xpu::cmem_io_t<CbmStsClusterIdx*, S> clusterIdxSortedPerModule;
-    xpu::cmem_io_t<CbmStsClusterData, S> clusterDataPerModule;
-    xpu::cmem_io_t<int, S> nClustersPerModule;
-
-    // output
-    size_t maxHitsPerModule;
-    xpu::cmem_io_t<CbmStsHit, S> hitsPerModule;
-    xpu::cmem_io_t<int, S> nHitsPerModule;
-  };
-
-  class CbmStsGpuHitFinder : public CbmStsHitFinder<xpu::side::device> {
-
-  public:
-    XPU_D void SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const;
-    XPU_D void FindClustersSingleStep(int iBlock) const;
-    XPU_D void SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const;
-    XPU_D void FindHits(int iBlock) const;
-
-    // Individual steps of cluster finder, for more granular time measurement
-    XPU_D void CalculateOffsetsParallel(int iBlock) const;
-    XPU_D void FindClustersParallel(int iBlock) const;
-    XPU_D void CalculateClustersParallel(int iBlock) const;
-
-    XPU_D void FindClusterConnectionsBasic(int iBlock) const;
-    XPU_D void CalculateClustersBasic(int iBlock) const;
-
-  private:
-    XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const;
-
-    XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                                 unsigned int* channelOffsets, int iModule,
-                                                 unsigned int threadId) const;
-    XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                              unsigned int* channelOffsets, int iModule, unsigned int threadId,
-                                              unsigned int nDigis) const;
-
-    XPU_D void CalculateClustersBasic(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule,
-                                      unsigned int const threadId) const;
-    XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                            unsigned int* channelOffsets, int iModule, unsigned int const threadId,
-                                            unsigned int const nDigis) const;
-    XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule,
-                                         unsigned int const threadId, unsigned int const nDigis) const;
-
-    XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const;
-    XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                            int const digiIndex) const;
-    XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
-                                            int const digiIndex) const;
-
-  private:
-    XPU_D unsigned int GetNDigis(int iModule) const
-    {
-      return digiOffsetPerModule[iModule + 1] - digiOffsetPerModule[iModule];
-    }
-
-    XPU_D float GetTimeResolution(int /* iModule */, int /* channel */) const { return asic.timeResolution; }
-
-    XPU_D bool IsActive(int* channelStatus, int channel) const
-    {
-      // TODO add padding to channels to remove this?
-      if (channel < 0 || channel >= nChannels) { return false; }
-      return channelStatus[channel] > -1;
-    }
-
-    XPU_D int ChanLeft(int channel) const { return channel - 1; }
-
-    XPU_D int ChanRight(int channel) const { return channel + 1; }
-
-    XPU_D int ChanDist(int c1, int c2) const
-    {
-      int diff = c2 - c1;
-      // TODO handle wrap around
-      return diff;
-    }
-
-    XPU_D void AddCluster(int iModule, CbmStsTimeNs time, const CbmStsClusterData& cls) const
-    {
-      CbmStsClusterIdx* tgtIdx   = &clusterIdxPerModule[iModule * maxClustersPerModule];
-      CbmStsClusterData* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule];
-
-      int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1);
-      XPU_ASSERT(size_t(pos) < maxClustersPerModule);
-
-      CbmStsClusterIdx idx {time, pos};
-      tgtIdx[idx.fIdx]  = idx;
-      tgtData[idx.fIdx] = cls;
-    }
-
-    XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; }
-
-    XPU_D XErrorT LandauWidth(float charge) const;
-
-    XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const;
-
-    XPU_D void IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF,
-                                 const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
-                                 const CbmStsClusterData& clsB) const;
-    XPU_D float GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const;
-    XPU_D bool Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB, float& x,
-                         float& y, float& varX, float& varY, float& varXY) const;
-    XPU_D bool IsInside(const CbmStsHitFinderParams& pars, float x, float y) const;
-    XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY,
-                         CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB,
-                         const CbmStsClusterData& clsB, float du, float dv) const;
-
-    XPU_D void SaveMaxError(float errorValue, int iModule) const
-    {
-
-      float* maxError = IsBackside(iModule) ? maxErrorBack : maxErrorFront;
-      float old {};
-      do {
-        old = *maxError;
-        if (old >= errorValue) { break; }
-      } while (!xpu::atomic_cas_block(maxError, *maxError, xpu::max(errorValue, *maxError)));
-    }
-  };
-
-}  // namespace experimental
-
-XPU_EXPORT_CONSTANT(CbmStsGpuRecoDevice, ::experimental::CbmStsGpuHitFinder, HitFinder);
-
-#endif
diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx
deleted file mode 100644
index c8f1b62a75ed58a81264e79bfc8a6eb83cff7998..0000000000000000000000000000000000000000
--- a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Felix Weiglhofer [committer], Kilian Hunold */
-#include "CbmStsGpuRecoDevice.h"
-
-#include "CbmStsGpuHitFinder.h"
-
-using namespace experimental;
-
-XPU_IMAGE(CbmStsGpuRecoDevice);
-
-XPU_CONSTANT(HitFinder);
-
-XPU_KERNEL(SortDigis, CbmStsSortDigiSmem) { xpu::cmem<HitFinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x()); }
-
-XPU_KERNEL(FindClustersSingleStep, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersSingleStep(xpu::block_idx::x()); }
-
-XPU_KERNEL(CalculateOffsets, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateOffsetsParallel(xpu::block_idx::x()); }
-
-XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersParallel(xpu::block_idx::x()); }
-
-XPU_KERNEL(FindClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().FindClusterConnectionsBasic(xpu::block_idx::x()); }
-
-XPU_KERNEL(CalculateClusters, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersParallel(xpu::block_idx::x()); }
-
-XPU_KERNEL(CalculateClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersBasic(xpu::block_idx::x()); }
-
-XPU_KERNEL(SortClusters, CbmStsSortClusterSmem) { xpu::cmem<HitFinder>().SortClusters(smem, xpu::block_idx::x()); }
-
-XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<HitFinder>().FindHits(xpu::block_idx::x()); }
diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h
deleted file mode 100644
index fdb674e973a332d85984ca7c196a7350693255ba..0000000000000000000000000000000000000000
--- a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
-   SPDX-License-Identifier: GPL-3.0-only
-   Authors: Felix Weiglhofer [committer], Kilian Hunold */
-#ifndef CBMSTSGPURECODEVICE_H
-#define CBMSTSGPURECODEVICE_H 1
-
-#include <xpu/device.h>
-
-// TODO: Tune these values for various gpus
-#if XPU_IS_CUDA
-#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 32
-#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 32
-#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 32
-#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 32
-#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 32
-#define CBM_STS_FIND_HITS_BLOCK_SIZE 32
-#else  // HIP, values ignored on CPU
-#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 256
-#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 6
-#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 256
-#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 6
-#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 256
-#define CBM_STS_FIND_HITS_BLOCK_SIZE 256
-#endif
-
-struct CbmStsGpuRecoDevice {
-};
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortDigis);
-XPU_BLOCK_SIZE_1D(SortDigis, CBM_STS_SORT_DIGIS_BLOCK_SIZE);
-
-// Combine substeps for finding clusters into a single kernel
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersSingleStep);
-XPU_BLOCK_SIZE_1D(FindClustersSingleStep, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateOffsets);
-XPU_BLOCK_SIZE_1D(CalculateOffsets, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClusters);
-XPU_BLOCK_SIZE_1D(FindClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersBasic);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClusters);
-XPU_BLOCK_SIZE_1D(CalculateClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClustersBasic);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortClusters);
-XPU_BLOCK_SIZE_1D(SortClusters, CBM_STS_SORT_CLUSTERS_BLOCK_SIZE);
-
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindHits);
-XPU_BLOCK_SIZE_1D(FindHits, CBM_STS_FIND_HITS_BLOCK_SIZE);
-
-#endif
diff --git a/reco/tasks/CMakeLists.txt b/reco/tasks/CMakeLists.txt
index d71ec0eaca2b5d84f41f801ead71f7481db94a7b..135c5a4f9813a4c42011cb484cc66ff981790bbf 100644
--- a/reco/tasks/CMakeLists.txt
+++ b/reco/tasks/CMakeLists.txt
@@ -69,3 +69,4 @@ if (HAS_STD_EXECUTION)
 endif()
 
 generate_cbm_library()
+