diff --git a/algo/base/ChainContext.h b/algo/base/ChainContext.h
index 954e45d6f0bc0f30f5afe47de0c2fe76dae725a9..3277028b2ab0762e1f27ced91ced778d60471746 100644
--- a/algo/base/ChainContext.h
+++ b/algo/base/ChainContext.h
@@ -5,11 +5,13 @@
 #define CBM_ALGO_BASE_CHAINCONTEXT_H
 
 #include "Options.h"
+#include "RecoParams.h"
 
 namespace cbm::algo
 {
   struct ChainContext {
     Options opts;
+    RecoParams recoParams;
   };
 }  // namespace cbm::algo
 
diff --git a/algo/base/SubChain.h b/algo/base/SubChain.h
index 13bfcf63aa47e9f46cb0ccd1b8e98ebb0188a5c0..b787c1b64c2c13860233e84ee7a2162ded31f207 100644
--- a/algo/base/SubChain.h
+++ b/algo/base/SubChain.h
@@ -16,6 +16,7 @@ namespace cbm::algo
     void SetContext(ChainContext* ctx) { fContext = ctx; }
 
     const Options& Opts() const { return gsl::make_not_null(fContext)->opts; }
+    const RecoParams& Params() const { return gsl::make_not_null(fContext)->recoParams; }
 
   private:
     ChainContext* fContext = nullptr;
diff --git a/algo/base/config/Yaml.h b/algo/base/config/Yaml.h
index 0109c5b2344f0732a97c3b6d8a441545fabe5c0e..a7100184a9144145779c7987d37389787adc8e95 100644
--- a/algo/base/config/Yaml.h
+++ b/algo/base/config/Yaml.h
@@ -15,7 +15,7 @@
 #include "Prelude.h"
 #include "Property.h"
 
-namespace CORE_INTERFACE_NS::config
+namespace cbm::algo::config
 {
 
   template<typename T, T... Values, typename Func>
@@ -144,6 +144,6 @@ namespace CORE_INTERFACE_NS::config
     }
   };
 
-}  // namespace CORE_INTERFACE_NS::config
+}  // namespace cbm::algo::config
 
 #endif
diff --git a/algo/data/sts/LandauTable.cxx b/algo/data/sts/LandauTable.cxx
index ba99e8b783701cff6ba4c45af2fa798f424420c3..3dcf38f7533856733717ffd418ca5b95e58932d8 100644
--- a/algo/data/sts/LandauTable.cxx
+++ b/algo/data/sts/LandauTable.cxx
@@ -3,6 +3,8 @@
    Authors: Felix Weiglhofer [committer] */
 #include "LandauTable.h"
 
+#include <fairlogger/Logger.h>
+
 #include <fstream>
 
 using namespace cbm::algo;
@@ -16,10 +18,13 @@ sts::LandauTable sts::LandauTable::FromFile(std::filesystem::path path)
   std::ifstream file(path);
 
   while (!file.eof()) {
+
     f32 q, p;
     file >> q >> p;
     charge.push_back(q);
     prob.push_back(p);
+
+    LOG(trace) << "Reading Landau table " << path << " q=" << q << " p=" << p;
   }
 
   // TODO: check if charge is monotonically increasing, also more than 2 entries
diff --git a/algo/detectors/sts/StsHitfinder.cxx b/algo/detectors/sts/StsHitfinder.cxx
index 620e30488c3056c9df3f5df72f191109f22de2b3..94a5c4173b83015e29318f2e5d54fc9ed170f4e1 100644
--- a/algo/detectors/sts/StsHitfinder.cxx
+++ b/algo/detectors/sts/StsHitfinder.cxx
@@ -6,77 +6,81 @@
 
 using namespace cbm::algo;
 
-XPU_CONSTANT(TheStsHitfinder);
+// Export Constants
+XPU_EXPORT(sts::TheHitfinder);
 
-XPU_KERNEL(SortDigis, StsSortDigiSmem)
-{
-  xpu::cmem<TheStsHitfinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x());
-}
-
-XPU_KERNEL(FindClustersSingleStep, xpu::no_smem)
-{
-  xpu::cmem<TheStsHitfinder>().FindClustersSingleStep(xpu::block_idx::x());
-}
 
-XPU_KERNEL(CalculateOffsets, xpu::no_smem)
-{
-  xpu::cmem<TheStsHitfinder>().CalculateOffsetsParallel(xpu::block_idx::x());
-}
+// Kernel Definitions
+XPU_EXPORT(sts::SortDigis);
+XPU_D void sts::SortDigis::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().SortDigisInSpaceAndTime(ctx); }
 
-XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindClustersParallel(xpu::block_idx::x()); }
+XPU_EXPORT(sts::FindClusters);
+XPU_D void sts::FindClusters::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().FindClustersSingleStep(ctx); }
 
-XPU_KERNEL(FindClustersBasic, xpu::no_smem)
+XPU_EXPORT(sts::ChannelOffsets);
+XPU_D void sts::ChannelOffsets::operator()(context& ctx)
 {
-  xpu::cmem<TheStsHitfinder>().FindClusterConnectionsBasic(xpu::block_idx::x());
+  ctx.cmem<sts::TheHitfinder>().CalculateOffsetsParallel(ctx);
 }
 
-XPU_KERNEL(CalculateClusters, xpu::no_smem)
+XPU_EXPORT(sts::CreateDigiConnections);
+XPU_D void sts::CreateDigiConnections::operator()(context& ctx)
 {
-  xpu::cmem<TheStsHitfinder>().CalculateClustersParallel(xpu::block_idx::x());
+  ctx.cmem<sts::TheHitfinder>().FindClustersParallel(ctx);
 }
 
-XPU_KERNEL(CalculateClustersBasic, xpu::no_smem)
+XPU_EXPORT(sts::CreateClusters);
+XPU_D void sts::CreateClusters::operator()(context& ctx)
 {
-  xpu::cmem<TheStsHitfinder>().CalculateClustersBasic(xpu::block_idx::x());
+  ctx.cmem<sts::TheHitfinder>().CalculateClustersParallel(ctx);
 }
 
-XPU_KERNEL(SortClusters, StsSortClusterSmem) { xpu::cmem<TheStsHitfinder>().SortClusters(smem, xpu::block_idx::x()); }
+XPU_EXPORT(sts::SortClusters);
+XPU_D void sts::SortClusters::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().SortClusters(ctx); }
 
-XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindHits(xpu::block_idx::x()); }
+XPU_EXPORT(sts::FindHits);
+XPU_D void sts::FindHits::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().FindHits(ctx); }
 
 /**
-  * StsHitfinderGpu::sortDigisKhun
+  * sts::Hitfinder::sortDigisKhun
   * Sorts digis channelwise. Inside a channel all digis are sorted in time.
   *
   * @param smem is the shared memory it is worked in
   * @param iBlock is the Block/Module that is currenty running
   *
 */
-XPU_D void StsHitfinderGpu::SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const
+XPU_D void sts::Hitfinder::SortDigisInSpaceAndTime(SortDigis::context& ctx) const
 {
-  int iModule          = iBlock;
+  int iModule          = ctx.block_idx_x();
   CbmStsDigi* digis    = &digisPerModule[digiOffsetPerModule[iModule]];
   CbmStsDigi* digisTmp = &digisPerModuleTmp[digiOffsetPerModule[iModule]];
   int nDigis           = GetNDigis(iModule);
 
-  SortDigisT digiSort(smem.sortBuf);
+  SortDigis::sort_t digiSort(ctx.pos(), ctx.smem());
 
-  digis = digiSort.sort(digis, nDigis, digisTmp, [](const CbmStsDigi a) {
+  CbmStsDigi* digisOut = digiSort.sort(digis, nDigis, digisTmp, [](const CbmStsDigi a) {
     return ((unsigned long int) a.GetChannel()) << 32 | (unsigned long int) (a.GetTimeU32());
   });
 
-  if (xpu::thread_idx::x() == 0) { digisSortedPerModule[iModule] = digis; }
+  if (digisOut == digisTmp) {
+    // Copy back to digis
+    for (int i = ctx.thread_idx_x(); i < nDigis; i += ctx.block_dim_x()) {
+      digis[i] = digisTmp[i];
+    }
+  }
 }
 
-XPU_D void StsHitfinderGpu::FindClustersSingleStep(int iBlock) const
+XPU_D void sts::Hitfinder::FindClustersSingleStep(FindClusters::context& ctx) const
 {
-  CalculateOffsetsParallel(iBlock);
-  FindClustersParallel(iBlock);
-  CalculateClustersParallel(iBlock);
+  CalculateOffsetsParallel(ctx);
+  xpu::barrier(ctx.pos());
+  FindClustersParallel(ctx);
+  xpu::barrier(ctx.pos());
+  CalculateClustersParallel(ctx);
 }
 
 /**
-  * StsHitfinderGpu::calculateChannelOffsets
+  * sts::Hitfinder::calculateChannelOffsets
   * Calculates the Offsest of every channel. digis-Array is sorted in Channel and Time.
   * If a channelChange is detected it is stored in channelOffsetsPerModule.
   *
@@ -85,13 +89,12 @@ XPU_D void StsHitfinderGpu::FindClustersSingleStep(int iBlock) const
   * @param nDigis Amount of digis in digis-Array
   *
 */
-XPU_D void StsHitfinderGpu::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets,
-                                                    unsigned int const nDigis) const
+XPU_D void sts::Hitfinder::CalculateChannelOffsets(FindClusters::context& ctx, CbmStsDigi* digis,
+                                                   unsigned int* channelOffsets, unsigned int const nDigis) const
 {
   channelOffsets[0] = 0;
 
-  //Parallel
-  for (unsigned int pos = xpu::thread_idx::x(); pos < nDigis - 1; pos += (unsigned int) xpu::block_dim::x()) {
+  for (unsigned int pos = ctx.thread_idx_x(); pos < nDigis - 1; pos += (unsigned int) ctx.block_dim_x()) {
     auto const currChannel = digis[pos].GetChannel();
     auto const nextChannel = digis[pos + 1].GetChannel();
     if (currChannel != nextChannel) {
@@ -108,364 +111,122 @@ XPU_D void StsHitfinderGpu::CalculateChannelOffsets(CbmStsDigi* digis, unsigned
   }
 }
 
-
 /**
-  * StsHitfinderGpu::calculateOffsetsParallel
+  * sts::Hitfinder::calculateOffsetsParallel
   * This function calculates the channeloffsets in a certain module.
   * An Offset is the Startingpoint of a Channel in a sorted array of Digis.
   *
   * @param iBlock is the Block/Module that is currently worked on
   *
 */
-XPU_D void StsHitfinderGpu::CalculateOffsetsParallel(int iBlock) const
+XPU_D void sts::Hitfinder::CalculateOffsetsParallel(FindClusters::context& ctx) const
 {
-  int const iModule = iBlock;
-  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  int const iModule = ctx.block_idx_x();
+  CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]];
   auto const nDigis = GetNDigis(iModule);
 
   if (nDigis == 0) return;
 
   auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
 
-  CalculateChannelOffsets(digis, channelOffsets, nDigis);
+  CalculateChannelOffsets(ctx, digis, channelOffsets, nDigis);
 }
 
 /**
-  * StsHitfinderGpu::findClustersParallel
+  * sts::Hitfinder::findClustersParallel
   * This function finds clusters form an sts-Digi Array inserted.
   * It runs the finding process highly parallel.
   *
   * @param iBlock is the Block/Module that is currently worked on
   *
 */
-XPU_D void StsHitfinderGpu::FindClustersParallel(int iBlock) const
-{
-  int const iModule           = iBlock;
-  CbmStsDigi* digis           = digisSortedPerModule[iModule];
-  auto const nDigis           = GetNDigis(iModule);
-  unsigned int const threadId = xpu::thread_idx::x();
-
-  if (nDigis == 0) return;
-
-  auto* digiConnector  = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
-  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
-
-  // findClusterConnectionsChannelWise(digis, digiConnector, channelOffsets, iModule, threadId);
-  FindClusterConnectionsDigiWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
-}
-
-
-/**
-  * StsHitfinderGpu::calculateClustersParallel
-  * This function calculates clusters form an sts-Digi Array inserted.
-  * It runs the calculating process highly parallel.
-  *
-  * @param iBlock is the Block/Module that is currently worked on
-  *
-*/
-XPU_D void StsHitfinderGpu::CalculateClustersParallel(int iBlock) const
-{
-  int const iModule  = iBlock;
-  CbmStsDigi* digis  = digisSortedPerModule[iModule];
-  auto const nDigis  = GetNDigis(iModule);
-  int const threadId = xpu::thread_idx::x();
-
-  if (nDigis == 0) { return; }
-
-  auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
-  // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
-
-  // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
-  CalculateClustersDigiWise(digis, digiConnector, iModule, threadId, nDigis);
-}
-
-
-/**
-  * StsHitfinderGpu::findClusterConnectionsBasic
-  *
-  * Each Thread one Channel
-  *
-  * ChannelId: 00000 11111
-  * DigiIndex: 01234 56789
-  * ThreadId:  00000 00000
-  *
-  * Finds Clusters in a basic way. One thread takes care of the whole calculation.
-  *
-  * @param digis All Digis that are relevant for this Block
-  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
-  * @param channelOffsets The Array where all offsets are written to
-  * @param iModule The Module that the curren Block is working on
-  * @param threadId Id of the thrad currently working
-  *
-**/
-XPU_D void StsHitfinderGpu::FindClusterConnectionsBasic(int iBlock) const
+XPU_D void sts::Hitfinder::FindClustersParallel(FindClusters::context& ctx) const
 {
-  int iModule          = iBlock;
-  int threadId         = xpu::thread_idx::x();
-  auto nDigis          = GetNDigis(iModule);
-  CbmStsDigi* digis    = digisSortedPerModule[iModule];
-  auto* digiConnector  = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
-  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
-
-  if (threadId != 0) return;
-
-  for (unsigned int currIter = 0; currIter < nDigis; currIter++) {
-    const CbmStsDigi& digi = digis[currIter];
-    uint16_t channel       = digi.GetChannel();
+  const i32 iGThread    = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+  const i32 nDigisTotal = digiOffsetPerModule[2 * nModules];
+  if (iGThread >= nDigisTotal) { return; };
 
-    if (channel == nChannels - 1) break;
+  i32 iModule = 0;
+  while (size_t(iGThread) >= digiOffsetPerModule[iModule + 1]) {
+    iModule++;
+  }
 
-    //DeltaCalculation
-    float const tResol = GetTimeResolution(iModule, channel);
-    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+  i32 iLocal = iGThread - digiOffsetPerModule[iModule];
 
-    unsigned int nextChannelStart = channelOffsets[channel + 1];
-    unsigned int nextChannelEnd   = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
+  const CbmStsDigi myDigi = digisPerModule[iGThread];
 
-    // printf("Comparing digis (deltaT=%f, nextChannelStart=%u, nextChannelEnd=%u, nDigis=%u):\n", deltaT,
-    //        nextChannelStart, nextChannelEnd, nDigis);
-    // printf(" channel=%d, time=%d, charge=%d\n", digi.GetChannel(), digi.GetTimeU32(), digi.GetChargeU16());
+  u32 nDigis = GetNDigis(iModule);
+  if (nDigis == 0) return;
 
-    //Calculate DigiConnections
-    for (unsigned int nextIter = nextChannelStart; nextIter < nextChannelEnd; nextIter++) {
+  xpu::view digis(&digisPerModule[digiOffsetPerModule[iModule]], nDigis);
+  xpu::view digiConnector(&digiConnectorsPerModule[digiOffsetPerModule[iModule]], nDigis);
+  xpu::view channelOffsets(&channelOffsetPerModule[iModule * nChannels], nChannels);
 
-      const CbmStsDigi& nextDigi = digis[nextIter];
+  u16 channel = myDigi.GetChannel();
+  if (channel == nChannels - 1) return;
 
-      // printf("> channel=%d, time=%d, charge=%d\n", nextDigi.GetChannel(), nextDigi.GetTimeU32(),
-      //        nextDigi.GetChargeU16());
+  //DeltaCalculation
+  const f32 tResol              = GetTimeResolution(iModule, channel);
+  const RecoParams::STS& params = ctx.cmem<Params>().sts;
+  const f32 deltaT = (params.timeCutDigiAbs > 0.f ? params.timeCutDigiAbs : params.timeCutDigiSig * 1.4142f * tResol);
 
-      if (digi.GetChannel() >= nextDigi.GetChannel()) continue;
+  const u32 nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
 
-      if (float(digis[currIter].GetTimeU32() + deltaT) <= float(digis[nextIter].GetTimeU32())) break;
-      if (float(digis[currIter].GetTimeU32() - deltaT) >= float(digis[nextIter].GetTimeU32())) continue;
+  float firstPossibleTime = float(myDigi.GetTimeU32()) - deltaT;
 
-      digiConnector[currIter].SetNext(nextIter);
-      digiConnector[nextIter].SetHasPrevious(true);
-      break;
+  // Binary search for the first possible digi
+  u32 start = channelOffsets[channel + 1];
+  u32 end   = nextChannelEnd;
+  while (start < end) {
+    u32 mid = (start + end) / 2;
+    if (float(digis[mid].GetTimeU32()) < firstPossibleTime) { start = mid + 1; }
+    else {
+      end = mid;
     }
   }
-}
 
+  for (auto nextIter = start; nextIter < nextChannelEnd; nextIter++) {
 
-/**
-  * StsHitfinderGpu::findClusterConnectionsChannelWise
-  *
-  * Each Thread one Channel
-  *
-  * ChannelId: 00000 11111
-  * DigiIndex: 01234 56789
-  * ThreadId:  00000 11111
-  *
-  * Finds Clusters highly parallel in a basic way. Each thread gets the same anmount of Channels to work with.
-  * Every Thread checks all digis in its channel, if there is a pendant to the current one in the channel to its right.
-  * If there is another digi, the digi will be connected inside the digiConnector-Array.
-  *
-  * @param digis All Digis that are relevant for this Block
-  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
-  * @param channelOffsets The Array where all offsets are written to
-  * @param iModule The Module that the curren Block is working on
-  * @param threadId Id of the thrad currently working
-  *
-**/
-XPU_D void StsHitfinderGpu::FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                                              unsigned int* channelOffsets, int const iModule,
-                                                              unsigned int const threadId) const
-{
-  // Expl: here we need to be smaller than nChannels because the last channel is the most Right.
-  for (unsigned int channel = threadId; channel < (unsigned int) (nChannels - 1); channel += xpu::block_dim::x()) {
-    unsigned int const currChannelBegin = channelOffsets[channel];      //Begin of current Channel
-    unsigned int const nextChannelBegin = channelOffsets[channel + 1];  //Begin of next (right) channel
-    auto const nextChannelEnd =
-      (channel + 2 < (unsigned int) nChannels) ? channelOffsets[channel + 2] : GetNDigis(iModule);
-    unsigned int nextIter = nextChannelBegin;
-
-    //Check if first Digi of Channel belongs to Channel
-    if (channel != digis[currChannelBegin].GetChannel()) continue;
-
-    //DeltaCalculation
-    float const tResol = GetTimeResolution(iModule, channel);
-    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
-
-    //Calculate DigiConnections
-    for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) {
-      while (nextIter < nextChannelEnd
-             && ((digis[currIter].GetTimeU32() + deltaT) >= float(digis[nextIter].GetTimeU32()))) {
-        if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) { continue; }
-        if (digis[currIter].GetTimeU32() + deltaT < float(digis[nextIter].GetTimeU32())) {
-          nextIter++;
-          break;
-        }
-        if (digis[currIter].GetTimeU32() - deltaT > float(digis[nextIter].GetTimeU32())) {
-          nextIter++;
-          continue;
-        }
-
-        digiConnector[currIter].SetNext(nextIter);
-        digiConnector[nextIter].SetHasPrevious(true);
-        nextIter++;
-        break;
-      }
-    }
-  }
-}
+    const CbmStsDigi otherDigi = digis[nextIter];
 
-/**
-  * StsHitfinderGpu::findClusterConnectionsDigiWise
-  *
-  * Each thread one Digi
-  *
-  * ChannelId: 00000 11111
-  * DigiIndex: 01234 56789
-  * ThreadId:  01234 01234
-  *
-  * Finds Clusters highly parallel in a basic way.
-  * The Threads work on Data thats stored nearby, so it's been taken advantage of the data locality.
-  * Each thread takes one digi and calculates if there is a neighbour.
-  * This is repeated until all digis have been processed
-  *
-  * @param digis All Digis that are relevant for this Block
-  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
-  * @param channelOffsets The Array where all offsets are written to
-  * @param iModule The Module that the curren Block is working on
-  * @param threadId Id of the thrad currently working
-  * @param nDigis Amount of digis in digis-Array
-  *
-**/
-XPU_D void StsHitfinderGpu::FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                                           unsigned int* channelOffsets, int const iModule,
-                                                           unsigned int const threadId, unsigned int const nDigis) const
-{
-
-  for (unsigned int currIter = threadId; currIter < nDigis; currIter += xpu::block_dim::x()) {
-    auto const digi    = digis[currIter];
-    auto const channel = digi.GetChannel();
+    // This should never happen?
+    if (myDigi.GetChannel() >= otherDigi.GetChannel()) continue;
 
-    if (channel == nChannels - 1) break;
+    if (float(myDigi.GetTimeU32()) + deltaT < float(otherDigi.GetTimeU32())) { break; }
+    if (float(myDigi.GetTimeU32()) - deltaT > float(otherDigi.GetTimeU32())) { continue; }
 
-    //DeltaCalculation
-    float const tResol = GetTimeResolution(iModule, channel);
-    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
-
-    auto const nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
-    //Calculate DigiConnections
-    for (auto nextIter = channelOffsets[channel + 1]; nextIter < nextChannelEnd; nextIter++) {
-      if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) continue;
-      if (float(digis[currIter].GetTimeU32()) + deltaT < float(digis[nextIter].GetTimeU32())) break;
-      if (float(digis[currIter].GetTimeU32()) - deltaT > float(digis[nextIter].GetTimeU32())) continue;
-
-      digiConnector[currIter].SetNext(nextIter);
-      digiConnector[nextIter].SetHasPrevious(true);
-      break;
-    }
+    digiConnector[iLocal].SetNext(nextIter);
+    digiConnector[nextIter].SetHasPrevious(true);
+    break;
   }
 }
 
-
 /**
-  * StsHitfinderGpu::calculateClustersBasic
-  *
-  * One Thread all Digis
-  *
-  * ChannelId: 00000 11111
-  * DigiIndex: 01234 56789
-  * ThreadId:  00000 00000
-  *
-  * Calculates the Clustercharges of all found ClusterConnections.
-  * One thread walks through all Digis and looks for connections. If one Digi does not have a previous one
-  * it's a cluster start and may be the start of either a single-element-cluster, double-element-cluster
-  * or multi-element-cluster.
-  * All Other Threads are canceled
+  * sts::Hitfinder::calculateClustersParallel
+  * This function calculates clusters form an sts-Digi Array inserted.
+  * It runs the calculating process highly parallel.
   *
-  * @param digis All Digis that are relevant for this Block
-  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
-  * @param iModule The Module that the curren Block is working on
-  * @param threadId Id of the thrad currently working
+  * @param iBlock is the Block/Module that is currently worked on
   *
-**/
-XPU_D void StsHitfinderGpu::CalculateClustersBasic(int iBlock) const
+*/
+XPU_D void sts::Hitfinder::CalculateClustersParallel(FindClusters::context& ctx) const
 {
-  int iModule         = iBlock;
-  int threadId        = xpu::thread_idx::x();
-  CbmStsDigi* digis   = digisSortedPerModule[iModule];
-  auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
-
-  if (threadId != 0) return;
-
-  for (unsigned int i = 0; i < GetNDigis(iModule); i++) {
-    if (digiConnector[i].HasPrevious()) { continue; }
-    if (!digiConnector[i].HasNext()) {
-      //if Cluster has 1 element
-      CreateClusterFromConnectors1(iModule, digis, i);
-    }
-    else if (!digiConnector[digiConnector[i].next()].HasNext()) {
-      //if Cluster has 2 elements
-      CreateClusterFromConnectors2(iModule, digis, digiConnector, i);
-    }
-    else {
-      //if Cluster has N elements
-      CreateClusterFromConnectorsN(iModule, digis, digiConnector, i);
-    }
-  }
-}
+  int const iModule = ctx.block_idx_x();
+  CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]];
+  ;
+  auto const nDigis = GetNDigis(iModule);
 
+  if (nDigis == 0) return;
 
-/**
-  * StsHitfinderGpu::calculateClustersChannelWise
-  *
-  * Each Thread one Channel
-  *
-  * ChannelId: 00000 11111
-  * DigiIndex: 01234 56789
-  * ThreadId:  00000 11111
-  *
-  * Calculates the Clustercharges of all found ClusterConnections.
-  * Each Thread walks through all digis of one channel and looks for connections.
-  * If one Digi does not have a previous one it's a cluster start and may be the
-  * start of either a single-element-cluster, double-element-cluster or
-  * multi-element-cluster.
-  *
-  * @param digis All Digis that are relevant for this Block
-  * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous)
-  * @param iModule The Module that the curren Block is working on
-  * @param threadId Id of the thrad currently working
-  *
-**/
-XPU_D void StsHitfinderGpu::CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                                         unsigned int* channelOffsets, int const iModule,
-                                                         unsigned int const threadId, unsigned int const nDigis) const
-{
+  auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
+  // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
 
-  for (unsigned int channel = threadId; channel < (unsigned int) nChannels;
-       channel += (unsigned int) xpu::block_dim::x()) {
-    unsigned int const currChannelBegin = channelOffsets[channel];
-    unsigned int const nextChannelBegin =
-      (channel + 1 < (unsigned int) nChannels) ? channelOffsets[channel + 1] : nDigis;
-
-    //Check if first Digi of Channel belongs to Channel
-    if (channel != digis[currChannelBegin].GetChannel()) { continue; }
-
-    //Calculate DigiConnections
-    for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) {
-      if (digiConnector[currIter].HasPrevious()) { continue; }
-      if (!digiConnector[currIter].HasNext()) {
-        //if Cluster has 1 element
-        CreateClusterFromConnectors1(iModule, digis, currIter);
-      }
-      else if (!digiConnector[digiConnector[currIter].next()].HasNext()) {
-        //if Cluster has 2 elements
-        CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter);
-      }
-      else {
-        //if Cluster has N elements
-        CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter);
-      }
-    }
-  }
+  // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
+  CalculateClustersDigiWise(ctx, digis, digiConnector, nDigis);
 }
 
-
 /**
-  * StsHitfinderGpu::calculateClustersDigiWise
+  * sts::Hitfinder::calculateClustersDigiWise
   *
   * Each Thread one Channel
   *
@@ -485,12 +246,11 @@ XPU_D void StsHitfinderGpu::CalculateClustersChannelWise(CbmStsDigi* digis, StsD
   * @param threadId Id of the thrad currently working
   *
 **/
-XPU_D void StsHitfinderGpu::CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                                      int const iModule, unsigned int const threadId,
-                                                      unsigned int const nDigis) const
+XPU_D void sts::Hitfinder::CalculateClustersDigiWise(FindClusters::context& ctx, CbmStsDigi* digis,
+                                                     sts::DigiConnector* digiConnector, unsigned int const nDigis) const
 {
-
-  for (unsigned int currIter = threadId; currIter < nDigis; currIter += (unsigned int) xpu::block_dim::x()) {
+  int const iModule = ctx.block_idx_x();
+  for (unsigned int currIter = ctx.thread_idx_x(); currIter < nDigis; currIter += (unsigned int) ctx.block_dim_x()) {
 
     if (digiConnector[currIter].HasPrevious()) continue;
 
@@ -509,8 +269,7 @@ XPU_D void StsHitfinderGpu::CalculateClustersDigiWise(CbmStsDigi* digis, StsDigi
   }
 }
 
-
-XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
+XPU_D void sts::Hitfinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
 {
   const CbmStsDigi& digi = digis[digiIndex];
 
@@ -518,7 +277,7 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmS
 
   const float timeError = asic.timeResolution;
 
-  StsGpuCluster cluster {
+  sts::Cluster cluster {
     .fCharge        = asic.AdcToCharge(digi.GetChargeU16()),
     .fSize          = 1,
     .fPosition      = float(digi.GetChannel()) + (IsBackside(iModule) ? nChannels : 0.f),
@@ -532,8 +291,8 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmS
   AddCluster(iModule, time, cluster);
 }
 
-XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
-                                                         StsDigiConnector* digiConnector, int const digiIndex) const
+XPU_D void sts::Hitfinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
+                                                        sts::DigiConnector* digiConnector, int const digiIndex) const
 {
 
   const CbmStsDigi& digi1 = digis[digiIndex];
@@ -588,7 +347,7 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmS
 
   if (IsBackside(iModule)) x += nChannels;
 
-  StsGpuCluster cls {
+  sts::Cluster cls {
     .fCharge        = charge,
     .fSize          = 2,
     .fPosition      = x,
@@ -601,11 +360,10 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmS
   AddCluster(iModule, time, cls);
 }
 
-
-XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis,
-                                                         StsDigiConnector* digiConnector, int digiIndex) const
+XPU_D void sts::Hitfinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis,
+                                                        sts::DigiConnector* digiConnector, int digiIndex) const
 {
-  StsClusterCalculationProperties cProps;
+  ClusterCalculationProperties cProps;
 
   //This Lambda calculates all charges for a single digi inside of a cluster
   auto calculateClusterCharges = [this, &cProps, &digis, &digiConnector](int index) {
@@ -680,7 +438,7 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi
 
   if (IsBackside(iModule)) { x += nChannels; }
 
-  StsGpuCluster cls {
+  sts::Cluster cls {
     .fCharge        = qSum,
     .fSize          = int(nDigis),
     .fPosition      = x,
@@ -693,37 +451,81 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi
   AddCluster(iModule, cProps.tSum, cls);
 }
 
-XPU_D void StsHitfinderGpu::SortClusters(StsSortClusterSmem& smem, int iBlock) const
+XPU_D void sts::Hitfinder::SortClusters(SortClusters::context& ctx) const
 {
-  int iModule                     = iBlock;
-  size_t offset                   = iModule * maxClustersPerModule;
-  StsGpuClusterIdx* clusterIdx    = &clusterIdxPerModule[offset];
-  StsGpuClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset];
-  int nClusters                   = nClustersPerModule[iModule];
+  // By default sort all modules in parallel,
+  // but in debug mode sort only one module at a time
+  // to narrow down the error source
+  int firstModule = ctx.block_idx_x();
+  int lastModule  = ctx.block_idx_x();
+  if (ctx.grid_dim_x() == 1) {
+    firstModule = 0;
+    lastModule  = 2 * nModules - 1;
+  }
+
+  for (int iModule = firstModule; iModule <= lastModule; iModule++) {
+    size_t offset                = iModule * maxClustersPerModule;
+    GpuClusterIdx* clusterIdx    = &clusterIdxPerModule[offset];
+    GpuClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset];
+    int nClusters                = nClustersPerModule[iModule];
 
-  SortClustersT clsSort(smem.sortBuf);
-  clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const StsGpuClusterIdx a) { return a.fTime; });
+    // if (nClusters == 0 && ctx.thread_idx_x() == 0) {
+    //   printf("Module %d: No clusters found, exit early\n", iModule);
+    // }
+    if (nClusters == 0) return;
 
-  if (xpu::thread_idx::x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx;
+    // if (ctx.thread_idx_x() == 0) {
+    //   printf("Module %d: Start sorting %d clusters\n", iModule, nClusters);
+    // }
+
+    SortClusters::sort_t clsSort(ctx.pos(), ctx.smem());
+    clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const GpuClusterIdx a) { return a.fTime; });
+
+    // if (ctx.thread_idx_x() == 0) {
+    //   printf("Module %d: Finished sorting %d clusters\n", iModule, nClusters);
+    // }
+
+    if (ctx.thread_idx_x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx;
+  }
 }
 
-XPU_D void StsHitfinderGpu::FindHits(int iBlock) const
+XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 {
-  int iModuleF                  = iBlock;
-  int iModuleB                  = iBlock + nModules;
-  size_t offsetF                = iModuleF * maxClustersPerModule;
-  size_t offsetB                = iModuleB * maxClustersPerModule;
-  StsGpuClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF];
-  StsGpuClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB];
-  StsGpuCluster* clusterDataF   = &clusterDataPerModule[offsetF];
-  StsGpuCluster* clusterDataB   = &clusterDataPerModule[offsetB];
-  int nClustersF                = nClustersPerModule[iModuleF];
-  int nClustersB                = nClustersPerModule[iModuleB];
-
-
-  StsHitfinderCache pars;
+
+// On CPU process all modules in parallel (one thread per module).
+// On GPU process all front clusters in parallel instead (one thread per cluster)
+//   to fully utilize the GPU.
+// Currently use option 2 for both as it is faster on CPU as well.
+#if XPU_IS_CPU
+  int iModule = ctx.block_idx_x();
+#else
+  int iModule = 0;
+  int iThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  for (; iModule < nModules; iModule++) {
+    if (iThread < nClustersPerModule[iModule]) break;
+    iThread -= nClustersPerModule[iModule];
+  }
+
+  if (iModule >= nModules) return;
+#endif
+
+  int iModuleF               = iModule;
+  int iModuleB               = iModuleF + nModules;
+  size_t offsetF             = iModuleF * maxClustersPerModule;
+  size_t offsetB             = iModuleB * maxClustersPerModule;
+  GpuClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF];
+  GpuClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB];
+  sts::Cluster* clusterDataF = &clusterDataPerModule[offsetF];
+  sts::Cluster* clusterDataB = &clusterDataPerModule[offsetB];
+  int nClustersF             = nClustersPerModule[iModuleF];
+  int nClustersB             = nClustersPerModule[iModuleB];
+
+  if (nClustersF == 0 || nClustersB == 0) return;
+
+  HitfinderCache pars;
   {
-    StsSensorPar cfg = sensorPars[iBlock];
+    SensorPar cfg = sensorPars[iModule];
 
     pars.dY         = cfg.dY;
     pars.pitch      = cfg.pitch;
@@ -737,21 +539,42 @@ XPU_D void StsHitfinderGpu::FindHits(int iBlock) const
     pars.dX         = float(nChannels) * pars.pitch;
   }
 
-  float maxTerrF = *maxErrorFront;
-  float maxTerrB = *maxErrorBack;
+  float maxTerrF = maxClusterTimeErrorByModuleSide[iModuleF];
+  float maxTerrB = maxClusterTimeErrorByModuleSide[iModuleB];
 
   float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB);
 
   int startB = 0;
-  for (int iClusterF = xpu::thread_idx::x(); iClusterF < nClustersF; iClusterF += xpu::block_dim::x()) {
-    StsGpuClusterIdx clsIdxF = clusterIdxF[iClusterF];
-    StsGpuCluster clsDataF   = clusterDataF[clsIdxF.fIdx];
+#if XPU_IS_CPU
+  for (int iClusterF = ctx.thread_idx_x(); iClusterF < nClustersF; iClusterF += ctx.block_dim_x()) {
+#else
+  int iClusterF = iThread;
+  if (iClusterF < nClustersF) {
+
+    float firstPossibleTime = float(clusterIdxF[iClusterF].fTime) - maxSigmaBoth;
+
+    // Use binary search to find the first cluster on back side that can be matched
+    // with the current cluster on front side
+    int start = startB;
+    int end   = nClustersB;
+    while (end - start > 1) {
+      int mid = (start + end) / 2;
+      if (float(clusterIdxB[mid].fTime) < firstPossibleTime) { start = mid; }
+      else {
+        end = mid;
+      }
+    }
+
+    startB = start;
+#endif
+    GpuClusterIdx clsIdxF = clusterIdxF[iClusterF];
+    sts::Cluster clsDataF = clusterDataF[clsIdxF.fIdx];
 
-    float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrF * maxTerrF);
+    float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrB * maxTerrB);
 
     for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) {
-      StsGpuClusterIdx clsIdxB = clusterIdxB[iClusterB];
-      StsGpuCluster clsDataB   = clusterDataB[clsIdxB.fIdx];
+      GpuClusterIdx clsIdxB = clusterIdxB[iClusterB];
+      sts::Cluster clsDataB = clusterDataB[clsIdxB.fIdx];
 
       float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime);
 
@@ -767,23 +590,23 @@ XPU_D void StsHitfinderGpu::FindHits(int iBlock) const
       }
 
       float timeCut = -1.f;
-      if (timeCutClusterAbs > 0.f) timeCut = timeCutClusterAbs;
-      else if (timeCutClusterSig > 0.f) {
+      const RecoParams::STS& params = ctx.cmem<Params>().sts;
+      if (params.timeCutClusterAbs > 0.f) timeCut = params.timeCutClusterAbs;
+      else if (params.timeCutClusterSig > 0.f) {
         float eF = clsDataF.fTimeError;
         float eB = clsDataB.fTimeError;
-        timeCut  = timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB);
+        timeCut  = params.timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB);
       }
 
       if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) continue;
 
-      IntersectClusters(iBlock, pars, clsIdxF.fTime, clsDataF, clsIdxB.fTime, clsDataB);
+      IntersectClusters(iModule, pars, clsIdxF.fTime, clsDataF, clsIdxB.fTime, clsDataB);
     }
   }
 }
 
-XPU_D void StsHitfinderGpu::IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF,
-                                              const StsGpuCluster& clsF, uint32_t timeB,
-                                              const StsGpuCluster& clsB) const
+XPU_D void sts::Hitfinder::IntersectClusters(int iBlock, const HitfinderCache& pars, uint32_t timeF,
+                                             const sts::Cluster& clsF, uint32_t timeB, const sts::Cluster& clsB) const
 {
   // --- Calculate cluster centre position on readout edge
 
@@ -840,7 +663,7 @@ XPU_D void StsHitfinderGpu::IntersectClusters(int iBlock, const StsHitfinderCach
   }
 }
 
-XPU_D float StsHitfinderGpu::GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const
+XPU_D float sts::Hitfinder::GetClusterPosition(const HitfinderCache& pars, float centre, bool isFront) const
 {
   // Take integer channel
 
@@ -866,8 +689,8 @@ XPU_D float StsHitfinderGpu::GetClusterPosition(const StsHitfinderCache& pars, f
   return xCluster;
 }
 
-XPU_D bool StsHitfinderGpu::Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x,
-                                      float& y, float& varX, float& varY, float& varXY) const
+XPU_D bool sts::Hitfinder::Intersect(const HitfinderCache& pars, float xF, float exF, float xB, float exB, float& x,
+                                     float& y, float& varX, float& varY, float& varXY) const
 {
 
   // In the coordinate system with origin at the bottom left corner,
@@ -917,7 +740,7 @@ XPU_D bool StsHitfinderGpu::Intersect(const StsHitfinderCache& pars, float xF, f
   return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f);
 }
 
-XPU_D bool StsHitfinderGpu::IsInside(const StsHitfinderCache& pars, float x, float y) const
+XPU_D bool sts::Hitfinder::IsInside(const HitfinderCache& pars, float x, float y) const
 {
   // clang-format off
   return x >= -pars.dX / 2.f
@@ -927,9 +750,9 @@ XPU_D bool StsHitfinderGpu::IsInside(const StsHitfinderCache& pars, float x, flo
   // clang-format on
 }
 
-XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY,
-                                      uint32_t timeF, const StsGpuCluster& clsF, uint32_t timeB,
-                                      const StsGpuCluster& clsB, float du, float dv) const
+XPU_D void sts::Hitfinder::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY,
+                                     uint32_t timeF, const sts::Cluster& clsF, uint32_t timeB, const sts::Cluster& clsB,
+                                     float du, float dv) const
 {
   // --- Transform into global coordinate system
   float globalX, globalY, globalZ;
@@ -948,7 +771,7 @@ XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, f
   float etB          = clsB.fTimeError;
   float hitTimeError = 0.5f * xpu::sqrt(etF * etF + etB * etB);
 
-  StsGpuHit hit {
+  sts::Hit hit {
     .fX         = globalX,
     .fY         = globalY,
     .fZ         = globalZ,
@@ -962,15 +785,13 @@ XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, f
     .fDv        = dv,
   };
 
-  int idx = xpu::atomic_add_block(&nHitsPerModule[iModule], 1);
+  int idx = xpu::atomic_add(&nHitsPerModule[iModule], 1);
 
   assert(size_t(idx) < maxHitsPerModule);
-
-  hitsPerModule[iModule * maxHitsPerModule + idx] = hit;
+  if (size_t(idx) < maxHitsPerModule) { hitsPerModule[iModule * maxHitsPerModule + idx] = hit; }
 }
 
-
-XPU_D float StsHitfinderGpu::LandauWidth(float charge) const
+XPU_D float sts::Hitfinder::LandauWidth(float charge) const
 {
   if (charge <= landauStepSize) return landauTable[0];
   if (charge > landauStepSize * (landauTableSize - 1)) return landauTable[landauTableSize - 1];
@@ -984,7 +805,7 @@ XPU_D float StsHitfinderGpu::LandauWidth(float charge) const
   return v1 + (charge - e1) * (v2 - v1) / (e2 - e1);
 }
 
-XPU_D void StsHitfinderGpu::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const
+XPU_D void sts::Hitfinder::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const
 {
   const float* tr  = &localToGlobalTranslationByModule[iModule * 3];
   const float* rot = &localToGlobalRotationByModule[iModule * 9];
diff --git a/algo/detectors/sts/StsHitfinder.h b/algo/detectors/sts/StsHitfinder.h
index 6cba6abd1356ac17bdf518d2ecb654d3bd5ef4a5..570bdc6a40bd86bd607b4a1622a4ef0422b6150f 100644
--- a/algo/detectors/sts/StsHitfinder.h
+++ b/algo/detectors/sts/StsHitfinder.h
@@ -9,102 +9,110 @@
 
 #include <xpu/device.h>
 
+#include "Prelude.h"
 #include "gpu/DeviceImage.h"
+#include "gpu/Params.h"
+#include "sts/Cluster.h"
+#include "sts/Hit.h"
+#include "sts/HitfinderPars.h"
 
 namespace cbm::algo
 {
+
   // Block sizes / other compile time constants that need tuning
   enum GpuConstants
   {
 #if XPU_IS_CUDA
-    kSortDigisBlockSize         = 32,
-    kSortDigisItemsPerThread    = 32,
-    kSortClustersBlockSize      = 32,
-    kSortClustersItemsPerThread = 32,
-    kFindClusterBlockSize       = 32,
-    kFindHitsBlockSize          = 32,
+    kSortDigisBlockSize         = 512,
+    kSortDigisItemsPerThread    = 8,
+    kSortClustersBlockSize      = 512,
+    kSortClustersItemsPerThread = 8,
+    kFindClusterBlockSize       = 1024,
+    kFindHitsBlockSize          = 512,
 #else  // HIP, values ignored on CPU
     kSortDigisBlockSize         = 256,
     kSortDigisItemsPerThread    = 6,
     kSortClustersBlockSize      = 256,
     kSortClustersItemsPerThread = 6,
-    kFindClusterBlockSize       = 256,
-    kFindHitsBlockSize          = 256,
+    kFindClusterBlockSize       = 1024,
+    kFindHitsBlockSize          = 1024,
 #endif
   };
-
-  // Kernel declarations
-  XPU_EXPORT_KERNEL(GPUReco, SortDigis);
-  // Combine substeps for finding clusters into a single kernel
-  XPU_EXPORT_KERNEL(GPUReco, FindClustersSingleStep);
-  XPU_EXPORT_KERNEL(GPUReco, CalculateOffsets);
-  XPU_EXPORT_KERNEL(GPUReco, FindClusters);
-  XPU_EXPORT_KERNEL(GPUReco, FindClustersBasic);
-  XPU_EXPORT_KERNEL(GPUReco, CalculateClusters);
-  XPU_EXPORT_KERNEL(GPUReco, CalculateClustersBasic);
-  XPU_EXPORT_KERNEL(GPUReco, SortClusters);
-  XPU_EXPORT_KERNEL(GPUReco, FindHits);
-
 }  // namespace cbm::algo
 
-// Set block sizes. Currently this needs to be done in toplevel namespace.
-XPU_BLOCK_SIZE_1D(cbm::algo::SortClusters, cbm::algo::kSortClustersBlockSize);
-XPU_BLOCK_SIZE_1D(cbm::algo::FindClustersSingleStep, cbm::algo::kFindClusterBlockSize);
-XPU_BLOCK_SIZE_1D(cbm::algo::CalculateOffsets, cbm::algo::kFindClusterBlockSize);
-XPU_BLOCK_SIZE_1D(cbm::algo::FindClusters, cbm::algo::kFindClusterBlockSize);
-XPU_BLOCK_SIZE_1D(cbm::algo::CalculateClusters, cbm::algo::kFindClusterBlockSize);
-XPU_BLOCK_SIZE_1D(cbm::algo::SortDigis, cbm::algo::kSortClustersBlockSize);
-XPU_BLOCK_SIZE_1D(cbm::algo::FindHits, cbm::algo::kFindClusterBlockSize);
-
-namespace cbm::algo
+namespace cbm::algo::sts
 {
+
+  class Hitfinder;  // forward declaration
+
   // GPU Data structures
   // TODO: Replace with regular StsCluster / StsHit once the changes
   // land to make them fit for GPU
-  struct StsGpuClusterIdx {
+  struct GpuClusterIdx {
     uint32_t fTime;  ///< Cluster time (average of digi times) [ns]
     int fIdx;
   };
 
-  struct StsGpuCluster {
-    float fCharge;         ///< Total charge
-    int fSize;             ///< Difference between first and last channel
-    float fPosition;       ///< Cluster centre in channel number units
-    float fPositionError;  ///< Cluster centre error (r.m.s.) in channel number units
-    uint32_t fTime;        ///< cluster time [ns]
-    float fTimeError;      ///< Error of cluster time [ns]
+  // Declare Constant Memory
+  struct TheHitfinder : xpu::constant<GPUReco, Hitfinder> {
   };
 
-  struct StsGpuHit {
-    float fX, fY;      ///< X, Y positions of hit [cm]
-    float fZ;          ///< Z position of hit [cm]
-    uint32_t fTime;    ///< Hit time [ns]
-    float fDx, fDy;    ///< X, Y errors [cm]
-    float fDz;         ///< Z position error [cm]
-    float fDxy;        ///< XY correlation
-    float fTimeError;  ///< Error of hit time [ns]
-    float fDu;         ///< Error of coordinate across front-side strips [cm]
-    float fDv;         ///< Error of coordinate across back-side strips [cm]
+  // Declare Kernels
+  struct SortDigis : xpu::kernel<GPUReco> {
+    using block_size    = xpu::block_size<kSortDigisBlockSize>;
+    using sort_t        = xpu::block_sort<u64, CbmStsDigi, kSortDigisBlockSize, kSortDigisItemsPerThread>;
+    using shared_memory = sort_t::storage_t;
+    using constants     = xpu::cmem<TheHitfinder, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
   };
 
-  // Calibration data structures
+  struct FindClusters : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kFindClusterBlockSize>;
+    using constants  = xpu::cmem<TheHitfinder, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
 
-  struct StsAsicPar {
-    int nAdc;
-    float dynamicRange;
-    float threshold;
-    float timeResolution;
-    float deadTime;
-    float noise;
-    float zeroNoiseRate;
+  struct ChannelOffsets : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kFindClusterBlockSize>;
+    using constants  = xpu::cmem<TheHitfinder, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
 
-    XPU_D float AdcToCharge(unsigned short adc) const
-    {
-      return threshold + dynamicRange / float(nAdc) * (float(adc) + 0.5f);
-    }
+  struct CreateDigiConnections : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kFindClusterBlockSize>;
+    using constants  = xpu::cmem<TheHitfinder, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct CreateClusters : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kFindClusterBlockSize>;
+    using constants  = xpu::cmem<TheHitfinder, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct SortClusters : xpu::kernel<GPUReco> {
+    using block_size    = xpu::block_size<kSortClustersBlockSize>;
+    using sort_t        = xpu::block_sort<u32, GpuClusterIdx, kSortClustersBlockSize, kSortClustersItemsPerThread>;
+    using shared_memory = sort_t::storage_t;
+    using constants     = xpu::cmem<TheHitfinder, Params>;
+    using context       = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
+  };
+
+  struct FindHits : xpu::kernel<GPUReco> {
+    using block_size = xpu::block_size<kFindHitsBlockSize>;
+    using constants  = xpu::cmem<TheHitfinder, Params>;
+    using context    = xpu::kernel_context<shared_memory, constants>;
+    XPU_D void operator()(context&);
   };
 
-  struct StsSensorPar {
+  // Calibration data structures
+  struct SensorPar {
     float dY;
     float pitch;
     float stereoF;
@@ -117,14 +125,14 @@ namespace cbm::algo
   // Used internally to avoid computing values multiple times
   // TODO: is this really needed? Overhead from additional computations should be miniscule.
   // TODO: Also store in shared memory, not registers. Values identical for all threads.
-  struct StsHitfinderCache : StsSensorPar {
+  struct HitfinderCache : SensorPar {
     float tanStereoF;
     float tanStereoB;
     float errorFac;
     float dX;
   };
 
-  struct StsClusterCalculationProperties {
+  struct ClusterCalculationProperties {
     float tSum      = 0.;       // sum of digi times
     int chanF       = 9999999;  // first channel in cluster
     int chanL       = -1;       // last channel in cluster
@@ -139,7 +147,7 @@ namespace cbm::algo
     float xSum = 0.f;  // weighted charge sum, used to correct corrupt clusters
   };
 
-  class StsDigiConnector {
+  class DigiConnector {
   private:
     unsigned int hasPreviousAndNext;
 
@@ -156,7 +164,7 @@ namespace cbm::algo
 
       do {
         assumed = old;
-        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI);
+        old     = xpu::atomic_cas(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI);
       } while (old != assumed);
     }
 
@@ -171,31 +179,15 @@ namespace cbm::algo
 
       do {
         assumed = old;
-        old     = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next);
+        old     = xpu::atomic_cas(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next);
       } while (old != assumed);
     }
 
     XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; }
   };
 
-  // Shared Memory defintions
-  using SortDigisT =
-    xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis>::value.x, kSortDigisItemsPerThread>;
-
-  struct StsSortDigiSmem {
-    typename SortDigisT::storage_t sortBuf;
-  };
-
-  using SortClustersT =
-    xpu::block_sort<uint32_t, StsGpuClusterIdx, xpu::block_size<SortClusters>::value.x, kSortClustersItemsPerThread>;
-
-  struct StsSortClusterSmem {
-    typename SortClustersT::storage_t sortBuf;
-  };
-
-  // Hitfinder buffers
-  template<xpu::side S>
-  class StsHitfinderBase {
+  // Hitfinder class. Holds pointers to buffers + algorithm.
+  class Hitfinder {
 
   public:
     // calibration / configuration data
@@ -204,87 +196,71 @@ namespace cbm::algo
     int nModules;   // Number of modules
     int nChannels;  // Number of channels per module
 
-    // Time cuts for Digis / Clusters
-    // TODO: currently not used!
-    float timeCutDigiAbs;
-    float timeCutDigiSig;
-    float timeCutClusterAbs;
-    float timeCutClusterSig;
-
-    StsAsicPar asic;
+    sts::HitfinderPars::Asic asic;
 
     int landauTableSize;
     float landauStepSize;
     // Entries of landauTable. size = landauTableSize
-    xpu::cmem_device_t<float, S> landauTable;
+    xpu::buffer<float> landauTable;
 
     // transformation matrix to convert local to global coordinate space
     // size = nModules
-    xpu::cmem_io_t<float, S> localToGlobalTranslationByModule;
-    xpu::cmem_io_t<float, S> localToGlobalRotationByModule;
+    xpu::buffer<float> localToGlobalTranslationByModule;
+    xpu::buffer<float> localToGlobalRotationByModule;
 
     // Sensor Parameters
     // size = nModules
-    xpu::cmem_io_t<StsSensorPar, S> sensorPars;
+    xpu::buffer<SensorPar> sensorPars;
 
     // input
     // Store all digis in a flat array with a header that contains the offset for every module (front and back)
-    xpu::cmem_io_t<size_t, S>
-      digiOffsetPerModule;  // size = 2 * nModules + 1 entries, last entry contains total digi count
-    xpu::cmem_io_t<CbmStsDigi, S> digisPerModule;  // Flat array of input digis. size = nDigisTotal
+    xpu::buffer<size_t> digiOffsetPerModule;  // size = 2 * nModules + 1 entries, last entry contains total digi count
+    xpu::buffer<CbmStsDigi> digisPerModule;   // Flat array of input digis. size = nDigisTotal
 
     // Temporary Digi-Array used for sorting as sorting is not in place. size = nDigisTotal
-    xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp;
-
-    // Pointer to sorted of a module side. Points either to digisPerModule or digisPerModuleTmp
-    // size = 2 * nModules
-    xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule;
+    xpu::buffer<CbmStsDigi> digisPerModuleTmp;
 
     // DigiConnectors used internally by StsClusterizer
     // Connects digis that belong to the same cluster via linked-list.
     // size = nDigisTotal
-    xpu::cmem_device_t<StsDigiConnector, S> digiConnectorsPerModule;
+    xpu::buffer<DigiConnector> digiConnectorsPerModule;
 
     // Digis are sorted by channel + within channel by time
     // Contains the offset for each channel within each channel
     // size = 2 * nModules * nChannels
-    xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule;
+    xpu::buffer<unsigned int> channelOffsetPerModule;
 
     // intermediate results
     size_t maxClustersPerModule;
 
-    // FIXME: Not used anymore?
-    xpu::cmem_device_t<int, S> channelStatusPerModule;
-
     // Cluster Index by module. Produced by cluster finder.
     // Stored as buckets with a max size of maxClustersPerModule
     // Actual number of entries in each bucket is stored in nClustersPerModule
     // size = 2 * nModules * maxClustersPerModule
-    xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModule;
+    xpu::buffer<GpuClusterIdx> clusterIdxPerModule;
 
     // Temporary cluster index array used for sorting as sorting is not in place.
     // size =  2 * nModules * maxClustersPerModule
-    xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModuleTmp;
+    xpu::buffer<GpuClusterIdx> clusterIdxPerModuleTmp;
 
     // Pointer to sorted cluster idx for every module side.
     // Either points to clusterIdxPerModule or clusterIdxPerModuleTmp.
     // size = 2 * nModules
-    xpu::cmem_io_t<StsGpuClusterIdx*, S> clusterIdxSortedPerModule;
+    xpu::buffer<GpuClusterIdx*> clusterIdxSortedPerModule;
 
     // Clusters stored by modules. Stored as buckets with a max size of maxClustersPerModule
     // Actual number of entries in each bucket is stored in nClustersPerModule
     // size = 2 * nModules * maxClustersPerModule
-    xpu::cmem_io_t<StsGpuCluster, S> clusterDataPerModule;
+    xpu::buffer<sts::Cluster> clusterDataPerModule;
 
     // Number of clusters in each module
     // size = 2 * nModules
-    xpu::cmem_io_t<int, S> nClustersPerModule;
+    xpu::buffer<int> nClustersPerModule;
 
     // Max time error of clusters on front- and backside of a module
     // size = 1 (???)
     // FIXME: size should be 2 * nModules? And only one array!
-    xpu::cmem_device_t<float, S> maxErrorFront;
-    xpu::cmem_device_t<float, S> maxErrorBack;
+    xpu::buffer<float> maxClusterTimeErrorByModuleSide;
 
     // output
 
@@ -294,55 +270,38 @@ namespace cbm::algo
     // Hits sorted by module. Stored in buckets of size maxHitsPerModule.
     // Actual number of hits is stored in nHitsPerModule
     // size = maxHitsPerModule * nModules
-    xpu::cmem_io_t<StsGpuHit, S> hitsPerModule;
+    xpu::buffer<sts::Hit> hitsPerModule;
 
     // Number of hits in each module
     // size = nModules
-    xpu::cmem_io_t<int, S> nHitsPerModule;
-  };
-
-  // Host-side hitfinder. Only holds memory buffers.
-  using StsHitfinderHost = StsHitfinderBase<xpu::side::host>;
-
-  // Device-side hitfinder. Holds pointers to buffers + algorithm.
-  class StsHitfinderGpu : public StsHitfinderBase<xpu::side::device> {
+    xpu::buffer<int> nHitsPerModule;
 
   public:
-    XPU_D void SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const;
-    XPU_D void FindClustersSingleStep(int iBlock) const;
-    XPU_D void SortClusters(StsSortClusterSmem& smem, int iBlock) const;
-    XPU_D void FindHits(int iBlock) const;
+    XPU_D void SortDigisInSpaceAndTime(SortDigis::context&) const;
+    XPU_D void FindClustersSingleStep(FindClusters::context&) const;
+    XPU_D void SortClusters(SortClusters::context&) const;
+    XPU_D void FindHits(FindHits::context&) const;
 
     // Individual steps of cluster finder, for more granular time measurement
-    XPU_D void CalculateOffsetsParallel(int iBlock) const;
-    XPU_D void FindClustersParallel(int iBlock) const;
-    XPU_D void CalculateClustersParallel(int iBlock) const;
-
-    XPU_D void FindClusterConnectionsBasic(int iBlock) const;
-    XPU_D void CalculateClustersBasic(int iBlock) const;
+    XPU_D void CalculateOffsetsParallel(FindClusters::context&) const;
+    XPU_D void FindClustersParallel(FindClusters::context&) const;
+    XPU_D void CalculateClustersParallel(FindClusters::context&) const;
 
   private:
-    XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const;
+    XPU_D void CalculateChannelOffsets(FindClusters::context& ctx, CbmStsDigi* digis, unsigned int* channelOffsets,
+                                       unsigned int nDigis) const;
 
-    XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                                 unsigned int* channelOffsets, int iModule,
-                                                 unsigned int threadId) const;
-    XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                              unsigned int* channelOffsets, int iModule, unsigned int threadId,
+    XPU_D void FindClusterConnectionsDigiWise(FindClusters::context& ctx, CbmStsDigi* digis,
+                                              DigiConnector* digiConnector, unsigned int* channelOffsets,
                                               unsigned int nDigis) const;
 
-    XPU_D void CalculateClustersBasic(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule,
-                                      unsigned int const threadId) const;
-    XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector,
-                                            unsigned int* channelOffsets, int iModule, unsigned int const threadId,
-                                            unsigned int const nDigis) const;
-    XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule,
-                                         unsigned int const threadId, unsigned int const nDigis) const;
+    XPU_D void CalculateClustersDigiWise(FindClusters::context& ctx, CbmStsDigi* digis, DigiConnector* digiConnector,
+                                         unsigned int const nDigis) const;
 
     XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const;
-    XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector,
+    XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, DigiConnector* digiConnector,
                                             int const digiIndex) const;
-    XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector,
+    XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, DigiConnector* digiConnector,
                                             int const digiIndex) const;
 
   private:
@@ -371,17 +330,19 @@ namespace cbm::algo
       return diff;
     }
 
-    XPU_D void AddCluster(int iModule, uint32_t time, const StsGpuCluster& cls) const
+    XPU_D void AddCluster(int iModule, uint32_t time, const sts::Cluster& cls) const
     {
-      StsGpuClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule];
-      StsGpuCluster* tgtData   = &clusterDataPerModule[iModule * maxClustersPerModule];
+      GpuClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule];
+      sts::Cluster* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule];
 
       int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1);
       XPU_ASSERT(size_t(pos) < maxClustersPerModule);
 
-      StsGpuClusterIdx idx {time, pos};
-      tgtIdx[idx.fIdx]  = idx;
-      tgtData[idx.fIdx] = cls;
+      if (size_t(pos) < maxClustersPerModule) {
+        GpuClusterIdx idx {time, pos};
+        tgtIdx[idx.fIdx]  = idx;
+        tgtData[idx.fIdx] = cls;
+      }
     }
 
     XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; }
@@ -390,19 +351,18 @@ namespace cbm::algo
 
     XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const;
 
-    XPU_D void IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF, const StsGpuCluster& clsF,
-                                 uint32_t timeB, const StsGpuCluster& clsB) const;
-    XPU_D float GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const;
-    XPU_D bool Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, float& y,
+    XPU_D void IntersectClusters(int iBlock, const HitfinderCache& pars, uint32_t timeF, const sts::Cluster& clsF,
+                                 uint32_t timeB, const sts::Cluster& clsB) const;
+    XPU_D float GetClusterPosition(const HitfinderCache& pars, float centre, bool isFront) const;
+    XPU_D bool Intersect(const HitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, float& y,
                          float& varX, float& varY, float& varXY) const;
-    XPU_D bool IsInside(const StsHitfinderCache& pars, float x, float y) const;
+    XPU_D bool IsInside(const HitfinderCache& pars, float x, float y) const;
     XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY, uint32_t timeF,
-                         const StsGpuCluster& clsF, uint32_t timeB, const StsGpuCluster& clsB, float du,
-                         float dv) const;
+                         const sts::Cluster& clsF, uint32_t timeB, const sts::Cluster& clsB, float du, float dv) const;
 
     XPU_D void SaveMaxError(float errorValue, int iModule) const
     {
-      float* maxError = IsBackside(iModule) ? maxErrorBack : maxErrorFront;
+      float* maxError = &maxClusterTimeErrorByModuleSide[iModule];
       float old {};
       do {
         old = *maxError;
@@ -411,9 +371,6 @@ namespace cbm::algo
     }
   };
 
-  // StsHitfinder lives in constant memory
-  XPU_EXPORT_CONSTANT(GPUReco, StsHitfinderGpu, TheStsHitfinder);
-
-}  // namespace cbm::algo
+}  // namespace cbm::algo::sts
 
-#endif  // #ifndef CBM_ALGO_STS_HITFINDER_H
+#endif  // CBM_ALGO_STS_HITFINDER_H
diff --git a/algo/detectors/sts/StsHitfinderChain.cxx b/algo/detectors/sts/StsHitfinderChain.cxx
index 9c37e24b5d3d0b58570ea7f2301ad163ee9e9fbf..dc1f856d86b21a3b92f25f8aa605942035504f74 100644
--- a/algo/detectors/sts/StsHitfinderChain.cxx
+++ b/algo/detectors/sts/StsHitfinderChain.cxx
@@ -4,35 +4,28 @@
 
 #include "StsHitfinderChain.h"
 
-#include "Logger.h"
+#include <fairlogger/Logger.h>
 
-using namespace cbm::algo;
+#include <numeric>
 
-// Shorthands to set data / allocate buffers on GPU
-// FIXME: Eventually we'll need a better way to do this. Preferably supplied by xpu
-#define SET_GPU_CONSTANT(constant, val) fHitfinderHost.constant = fHitfinderGpu.constant = val
-#define GPU_ALLOC(field, nEntries)                                                                                     \
-  fHitfinderHost.field = decltype(fHitfinderHost.field)(nEntries);                                                     \
-  fHitfinderGpu.field  = fHitfinderHost.field.d()
+using namespace cbm::algo;
 
-void StsHitfinderChain::SetParameters(const StsHitfinderPar& parameters)
+void sts::HitfinderChain::SetParameters(const sts::HitfinderPars& parameters)
 {
   fPars.emplace(parameters);
   AllocateStatic();
 }
 
-void StsHitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
+void sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
 {
   EnsureParameters();
 
-  // FIXME: this has to be called only once
-  // and should happen during initialization not in reco loop
-  setenv("XPU_PROFILE", "1", 1);
-  xpu::initialize();
+  xpu::scoped_timer t_("STS Hitfinder");
 
   int nModules       = fPars->modules.size();
   int nModuleSides   = nModules * 2;
   size_t nDigisTotal = digis.size();
+  xpu::t_add_bytes(nDigisTotal * sizeof(CbmStsDigi));
 
   // Getting the digis on the GPU requires 3 steps
   // 1. Sort digis into buckets by module
@@ -44,107 +37,173 @@ void StsHitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
   // 3. Copy digis into flat array with offsets per module
   FlattenDigis(digiMap);
 
+  if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) { EnsureDigiOffsets(digiMap); }
+
+  xpu::queue queue;
+
   // Clear buffers
   // Not all buffers have to initialized, but useful for debugging
 
-  auto& hfc = fHitfinderHost;
+  LOG(debug) << "STS Hitfinder Chain: Clearing buffers...";
+  auto& hfc = fHitfinder;
   // xpu::memset(hfc.digisPerModule, 0);
   // xpu::memset(hfc.digisPerModuleTmp, 0);
   // xpu::memset(hfc.digisSortedPerModule, 0);
   // xpu::memset(hfc.digiOffsetPerModule, 0);
-  xpu::memset(hfc.digiConnectorsPerModule, 0);
-  // xpu::memset(hfc.channelOffsetPerModule, 0);
-  xpu::memset(hfc.channelStatusPerModule, -1);
-  xpu::memset(hfc.clusterIdxPerModule, 0);
+  queue.memset(hfc.digiConnectorsPerModule, 0);
+  queue.memset(hfc.channelOffsetPerModule, 0);
+  queue.memset(hfc.clusterIdxPerModule, 0);
   // xpu::memset(hfc.clusterIdxPerModuleTmp, 0);
   // xpu::memset(hfc.clusterIdxSortedPerModule, 0);
   // xpu::memset(hfc.clusterDataPerModule, 0);
-  xpu::memset(hfc.nClustersPerModule, 0);
+  queue.memset(hfc.nClustersPerModule, 0);
   // xpu::memset(hfc.hitsPerModule, 0);
-  xpu::memset(hfc.nHitsPerModule, 0);
-  xpu::memset(hfc.maxErrorFront, 0);
-  xpu::memset(hfc.maxErrorBack, 0);
+  queue.memset(hfc.nHitsPerModule, 0);
+  queue.memset(hfc.maxClusterTimeErrorByModuleSide, 0);
+
+  LOG(debug) << "STS Hitfinder Chain: Copy digis buffers...";
+  xpu::set<TheHitfinder>(fHitfinder);
+  queue.copy(hfc.digisPerModule, xpu::h2d);
+  queue.copy(hfc.digiOffsetPerModule, xpu::h2d);
+
+  LOG(debug) << "STS Hitfinder Chain: Sort Digis...";
+  // TODO: skip temporary buffers and sort directly into digisSortedPerModule
+
+  queue.launch<SortDigis>(xpu::n_blocks(nModuleSides));
+  xpu::k_add_bytes<SortDigis>(nDigisTotal * sizeof(CbmStsDigi));
+  if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
+    LOG(trace) << "Ensuring STS digis are sorted...";
+    queue.copy(hfc.digisPerModule, xpu::d2h);
+    queue.copy(hfc.digiOffsetPerModule, xpu::d2h);
+    queue.wait();
+    EnsureDigisSorted();
+  }
 
-  xpu::set_constant<TheStsHitfinder>(fHitfinderGpu);
-  xpu::copy(hfc.digisPerModule, xpu::host_to_device);
-  xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device);
+  LOG(debug) << "STS Hitfinder Chain: Find Clusters...";
+  if (!Params().sts.findClustersMultiKernels) { queue.launch<FindClusters>(xpu::n_blocks(nModuleSides)); }
+  else {
+    queue.launch<ChannelOffsets>(xpu::n_blocks(nModuleSides));
+    xpu::k_add_bytes<ChannelOffsets>(nDigisTotal * sizeof(CbmStsDigi));
+    queue.launch<CreateDigiConnections>(xpu::n_threads(nDigisTotal));
+    xpu::k_add_bytes<CreateDigiConnections>(nDigisTotal * sizeof(CbmStsDigi));
+    queue.launch<CreateClusters>(xpu::n_blocks(nModuleSides));
+    xpu::k_add_bytes<CreateClusters>(nDigisTotal * sizeof(CbmStsDigi));
+  }
+  if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) {
+    LOG(trace) << "Ensuring STS channel offsets correct...";
+    xpu::buffer_prop propsOffset {hfc.channelOffsetPerModule};
+    std::vector<u32> channelOffsetPerModule;
+    channelOffsetPerModule.resize(propsOffset.size());
+    queue.copy(hfc.channelOffsetPerModule.get(), channelOffsetPerModule.data(), propsOffset.size_bytes());
+    queue.wait();
+    EnsureChannelOffsets(channelOffsetPerModule);
+
+    LOG(trace) << "Ensuring STS clusters are ok...";
+    xpu::buffer_prop props {hfc.clusterIdxPerModule};
+
+    std::vector<GpuClusterIdx> clusterIdxPerModule;
+    clusterIdxPerModule.resize(props.size());
+    std::vector<int> nClustersPerModule;
+    nClustersPerModule.resize(fPars->modules.size() * 2);
+
+    queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size_bytes());
+    queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size() * sizeof(int));
+    queue.wait();
+    EnsureClustersSane(clusterIdxPerModule, nClustersPerModule);
+  }
 
-  xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(nModuleSides));
-  xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(nModuleSides));
   // Run cluster finding steps in indivual kernels, useful for debugging / profiling
   // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2));
   // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
   // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
   // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
   // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
-  xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(nModuleSides));
-  xpu::run_kernel<FindHits>(xpu::grid::n_blocks(nModules));
-
-  CollectTimingInfo();
+  LOG(debug) << "STS Hitfinder Chain: Sort Clusters...";
+  queue.launch<SortClusters>(xpu::n_blocks(nModuleSides));  // FIXME n_blocks(nModuleSides) once debugging is done
+
+  LOG(debug) << "STS Hitfinder Chain: Find Hits...";
+  queue.copy(hfc.nClustersPerModule, xpu::d2h);
+  queue.wait();
+  xpu::h_view nClusters {hfc.nClustersPerModule};
+  size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
+  bool isCpu            = xpu::device::active().backend() == xpu::cpu;
+  xpu::grid findHitsG   = isCpu ? xpu::n_blocks(nModules) : xpu::n_threads(nClustersFront);
+  queue.launch<FindHits>(findHitsG);
 
   // Transfer Hits / Cluster back to host
   // TODO: Hits and Clusters are stored in buckets. Currently we transfer the entire
   // bucket arrays back to the host. This includes a lot of unused bytes.
   // Better to either flatten both arrays on the GPU and transfer flat array back.
   // Or do multiple copies to copy contents of individual buckets.
-  xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host);
-  xpu::copy(hfc.nClustersPerModule, xpu::device_to_host);
-  xpu::copy(hfc.hitsPerModule, xpu::device_to_host);
-  xpu::copy(hfc.nHitsPerModule, xpu::device_to_host);
+  queue.copy(hfc.clusterDataPerModule, xpu::d2h);
+  queue.copy(hfc.nClustersPerModule, xpu::d2h);
+  queue.copy(hfc.hitsPerModule, xpu::d2h);
+  queue.copy(hfc.nHitsPerModule, xpu::d2h);
+
+  queue.wait();
+
+  xpu::h_view nHits {hfc.nHitsPerModule};
+  // xpu::h_view nClusters{hfc.nClustersPerModule};
+  size_t nHitsTotal     = std::accumulate(nHits.begin(), nHits.end(), 0);
+  size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
+
+  xpu::k_add_bytes<SortClusters>(nClustersTotal * sizeof(GpuClusterIdx));
+  xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster));
+
+  LOG(info) << "Timeslice contains " << nHitsTotal << " STS hits and " << nClustersTotal << " STS clusters";
 }
 
-void StsHitfinderChain::EnsureParameters()
+void sts::HitfinderChain::EnsureParameters()
 {
-  LOG_IF(fatal, fPars == std::nullopt) << "StsHitfinderChain: Parameters not set. Can't continue!";
+  LOG_IF(fatal, fPars == std::nullopt) << "sts::HitfinderChain: Parameters not set. Can't continue!";
 }
 
-void StsHitfinderChain::AllocateStatic()
+void sts::HitfinderChain::AllocateStatic()
 {
   EnsureParameters();
 
   // Shorthands for common constants
   const int nChannels = fPars->nChannels / 2;  // Only count channels on one side of the module
   const int nModules  = fPars->modules.size();
-  // const int nModuleSides = 2 * nModules; // Number of module front + backsides
-
-  // Set GPU constants
-  SET_GPU_CONSTANT(nModules, nModules);
-  SET_GPU_CONSTANT(nChannels, nChannels);
+  const int nModuleSides = 2 * nModules;  // Number of module front + backsides
 
-  SET_GPU_CONSTANT(timeCutDigiAbs, -1.f);
-  SET_GPU_CONSTANT(timeCutDigiSig, 3.f);
-  SET_GPU_CONSTANT(timeCutClusterAbs, -1.f);
-  SET_GPU_CONSTANT(timeCutClusterSig, 4.f);
+  xpu::queue q;
 
-  SET_GPU_CONSTANT(asic, fPars->asic);
+  // Set GPU constants
+  fHitfinder.nModules  = nModules;
+  fHitfinder.nChannels = nChannels;
+  fHitfinder.asic      = fPars->asic;
 
   // Copy landau table
   size_t nLandauTableEntries = fPars->landauTable.values.size();
-  SET_GPU_CONSTANT(landauTableSize, nLandauTableEntries);
-  SET_GPU_CONSTANT(landauStepSize, fPars->landauTable.stepSize);
-  GPU_ALLOC(landauTable, nLandauTableEntries);
-  xpu::copy(fHitfinderGpu.landauTable, fPars->landauTable.values.data(), nLandauTableEntries);
+  fHitfinder.landauTableSize = nLandauTableEntries;
+  fHitfinder.landauStepSize  = fPars->landauTable.stepSize;
+  fHitfinder.landauTable.reset(nLandauTableEntries, xpu::buf_io);
+  q.copy(fPars->landauTable.values.data(), fHitfinder.landauTable.get(), nLandauTableEntries * sizeof(float));
 
   // Copy transformation matrix
-  GPU_ALLOC(localToGlobalRotationByModule, 9 * nModules);
-  GPU_ALLOC(localToGlobalTranslationByModule, 3 * nModules);
+  fHitfinder.localToGlobalRotationByModule.reset(9 * nModules, xpu::buf_io);
+  fHitfinder.localToGlobalTranslationByModule.reset(3 * nModules, xpu::buf_io);
 
   // - Copy matrix into flat array
+  xpu::h_view hRot {fHitfinder.localToGlobalRotationByModule};
+  xpu::h_view hTrans {fHitfinder.localToGlobalTranslationByModule};
   for (int m = 0; m < nModules; m++) {
     const auto& module = fPars->modules.at(m);
-    std::copy_n(module.localToGlobal.rotation.data(), 9, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 9]);
-    std::copy_n(module.localToGlobal.translation.data(), 3, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 3]);
+    std::copy_n(module.localToGlobal.rotation.data(), 9, &hRot[m * 9]);
+    std::copy_n(module.localToGlobal.translation.data(), 3, &hTrans[m * 3]);
   }
 
   // - Then copy to GPU
-  xpu::copy(fHitfinderHost.localToGlobalRotationByModule, xpu::host_to_device);
-  xpu::copy(fHitfinderHost.localToGlobalTranslationByModule, xpu::host_to_device);
+  q.copy(fHitfinder.localToGlobalRotationByModule, xpu::h2d);
+  q.copy(fHitfinder.localToGlobalTranslationByModule, xpu::h2d);
 
   // Copy Sensor Parameteres
-  GPU_ALLOC(sensorPars, nModules);
+  fHitfinder.sensorPars.reset(nModules, xpu::buf_io);
+  xpu::h_view hSensorPars {fHitfinder.sensorPars};
   for (int m = 0; m < nModules; m++) {
     const auto& module = fPars->modules.at(m);
-    auto& gpuPars      = fHitfinderHost.sensorPars.h()[m];
+    auto& gpuPars      = hSensorPars[m];
     gpuPars.dY         = module.dY;
     gpuPars.pitch      = module.pitch;
     gpuPars.stereoF    = module.stereoF;
@@ -152,13 +211,22 @@ void StsHitfinderChain::AllocateStatic()
     gpuPars.lorentzF   = module.lorentzF;
     gpuPars.lorentzB   = module.lorentzB;
   }
-  xpu::copy(fHitfinderHost.sensorPars, xpu::host_to_device);
+  xpu::copy(fHitfinder.sensorPars, xpu::h2d);
+
+  // Time errors
+  fHitfinder.maxClusterTimeErrorByModuleSide.reset(nModuleSides, xpu::buf_device);
+
+  q.wait();
 }
 
-void StsHitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal)
+void sts::HitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal)
 {
+  LOG(debug) << "STS Hitfinder Chain: Allocating dynamic memory for " << maxNDigisPerModule << " digis per module and "
+             << nDigisTotal << " digis in total";
   EnsureParameters();
 
+  xpu::scoped_timer t_ {"Allocate"};
+
   // TODO: some of these buffers have a constant size and can be allocated statically.
   // Just the data they contain is static.
 
@@ -170,42 +238,56 @@ void StsHitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigis
   const size_t maxNClustersPerModule = maxNDigisPerModule * kClustersPerDigiFactor;
   const size_t maxNHitsPerModule     = maxNClustersPerModule * kHitsPerClustersFactor;
 
-
   // Allocate Digi Buffers
-  GPU_ALLOC(digiOffsetPerModule, nModuleSides + 1);
-  GPU_ALLOC(digisPerModule, nDigisTotal);
-  GPU_ALLOC(digisPerModuleTmp, nDigisTotal);
-  GPU_ALLOC(digisSortedPerModule, nModuleSides);
-  GPU_ALLOC(digiConnectorsPerModule, nDigisTotal);
+  fHitfinder.digiOffsetPerModule.reset(nModuleSides + 1, xpu::buf_io);
+  fHitfinder.digisPerModule.reset(nDigisTotal, xpu::buf_io);
 
-  GPU_ALLOC(channelOffsetPerModule, nModuleSides * nChannels);
+  fHitfinder.digisPerModuleTmp.reset(nDigisTotal, xpu::buf_device);
+  fHitfinder.digiConnectorsPerModule.reset(nDigisTotal, xpu::buf_device);
 
-  GPU_ALLOC(maxErrorFront, 1);
-  GPU_ALLOC(maxErrorBack, 1);
+  fHitfinder.channelOffsetPerModule.reset(nModuleSides * nChannels, xpu::buf_device);
 
   // Allocate Cluster Buffers
-  SET_GPU_CONSTANT(maxClustersPerModule, maxNClustersPerModule);
-  GPU_ALLOC(channelStatusPerModule, nChannels * nModules);
-  GPU_ALLOC(clusterIdxPerModule, maxNClustersPerModule * nModuleSides);
-  GPU_ALLOC(clusterIdxPerModuleTmp, maxNClustersPerModule * nModuleSides);
-  GPU_ALLOC(clusterIdxSortedPerModule, nModuleSides);
-  GPU_ALLOC(clusterDataPerModule, maxNClustersPerModule * nModuleSides);
-  GPU_ALLOC(nClustersPerModule, nModuleSides);
+  fHitfinder.maxClustersPerModule = maxNClustersPerModule;
+
+  fHitfinder.clusterIdxPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device);
+  fHitfinder.clusterIdxPerModuleTmp.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device);
+  fHitfinder.clusterIdxSortedPerModule.reset(nModuleSides, xpu::buf_device);
+
+  fHitfinder.clusterDataPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_io);
+  fHitfinder.nClustersPerModule.reset(nModuleSides, xpu::buf_io);
 
   // Allocate Hit Buffers
-  SET_GPU_CONSTANT(maxHitsPerModule, maxNHitsPerModule);
-  GPU_ALLOC(hitsPerModule, maxNHitsPerModule * nModules);
-  GPU_ALLOC(nHitsPerModule, nModules);
+  fHitfinder.maxHitsPerModule = maxNHitsPerModule;
+  fHitfinder.hitsPerModule.reset(maxNHitsPerModule * nModules, xpu::buf_io);
+  fHitfinder.nHitsPerModule.reset(nModules, xpu::buf_io);
 }
 
-StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<const CbmStsDigi> digis,
-                                                                   size_t& maxNDigisPerModule)
+sts::HitfinderChain::DigiMap sts::HitfinderChain::SortDigisIntoModules(gsl::span<const CbmStsDigi> digis,
+                                                                       size_t& maxNDigisPerModule)
 {
+  LOG(debug) << "STS Hitfinder Chain: Sorting " << digis.size() << " digis into modules";
+  xpu::scoped_timer t_ {"Sort Digis"};
+
   DigiMap digiMap;
 
   int nChannelsPerSide = fPars->nChannels / 2;
 
+  auto isPulser = [&](const CbmStsDigi& digi) {
+    return std::find_if(fPars->modules.begin(), fPars->modules.end(),
+                        [&](const auto& mod) { return mod.address == digi.GetAddress(); })
+           == fPars->modules.end();
+  };
+
+  size_t nPulsers = 0;
+
   for (const auto& digi : digis) {
+
+    if (isPulser(digi)) {
+      nPulsers++;
+      continue;
+    }
+
     int moduleAddr = CbmStsAddress::GetMotherAddress(digi.GetAddress(), kStsModule);
     bool isFront   = digi.GetChannel() < fPars->nChannels / 2;
     if (isFront) digiMap.front[moduleAddr].emplace_back(digi);
@@ -216,6 +298,15 @@ StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<con
     }
   }
 
+  LOG_IF(warn, nPulsers > 0) << "STS Hitfinder: Discarded " << nPulsers << " pulser digis";
+
+  // Print digi counts per module
+  for (const auto& mod : fPars->modules) {
+    i32 moduleAddr = mod.address;
+    LOG(debug1) << "Module " << moduleAddr << " has " << digiMap.front[moduleAddr].size() << " front digis and "
+                << digiMap.back[moduleAddr].size() << " back digis";
+  }
+
   maxNDigisPerModule = 0;
   for (const auto& mod : digiMap.front) {
     maxNDigisPerModule = std::max(maxNDigisPerModule, mod.second.size());
@@ -228,19 +319,21 @@ StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<con
   return digiMap;
 }
 
-void StsHitfinderChain::FlattenDigis(DigiMap& digiMap)
+void sts::HitfinderChain::FlattenDigis(DigiMap& digiMap)
 {
+  LOG(debug) << "STS Hitfinder Chain: Flattening digis";
+  xpu::scoped_timer t_ {"Flatten Digis"};
   FlattenDigisSide(digiMap.front, true);
   FlattenDigisSide(digiMap.back, false);
 }
 
-void StsHitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront)
+void sts::HitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront)
 {
   EnsureParameters();
 
   int nModules           = fPars->modules.size();
-  size_t* pMdigiOffset   = fHitfinderHost.digiOffsetPerModule.h();
-  CbmStsDigi* pDigisFlat = fHitfinderHost.digisPerModule.h();
+  xpu::h_view pMdigiOffset {fHitfinder.digiOffsetPerModule};
+  xpu::h_view pDigisFlat {fHitfinder.digisPerModule};
 
   int sideOffset = (isFront ? 0 : nModules);
 
@@ -253,28 +346,17 @@ void StsHitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront)
   }
 }
 
-void StsHitfinderChain::CollectTimingInfo()
-{
-  // Check if profiling is enabled
-  // TODO: xpu should have a nicer way to query if profiling is enabled
-  if (xpu::get_timing<SortDigis>().empty()) return;
-
-  // TODO: Add time counters
-  fHitfinderTimes.timeSortDigi    = xpu::get_timing<SortDigis>().back();
-  fHitfinderTimes.timeCluster     = xpu::get_timing<FindClustersSingleStep>().back();
-  fHitfinderTimes.timeSortCluster = xpu::get_timing<SortClusters>().back();
-  fHitfinderTimes.timeHits        = xpu::get_timing<FindHits>().back();
-}
-
-void StsHitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>& clusters)
+void sts::HitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<sts::Cluster>& clusters)
 {
   EnsureParameters();
 
-  auto& hfc      = fHitfinderHost;
+  auto& hfc      = fHitfinder;
   int moduleSide = (isFront ? module : 2 * module);
+  xpu::h_view hNClusters {hfc.nClustersPerModule};
+  xpu::h_view hClusters {hfc.clusterDataPerModule};
 
-  int nClusters     = hfc.nClustersPerModule.h()[moduleSide];
-  auto* gpuClusters = &hfc.clusterDataPerModule.h()[moduleSide * hfc.maxClustersPerModule];
+  int nClusters     = hNClusters[moduleSide];
+  auto* gpuClusters = &hClusters[moduleSide * hfc.maxClustersPerModule];
 
   clusters.reserve(clusters.size() + nClusters);
 
@@ -292,14 +374,15 @@ void StsHitfinderChain::AppendClustersModule(int module, bool isFront, std::vect
   }
 }
 
-void StsHitfinderChain::AppendHitsModule(int module, std::vector<StsGpuHit>& hits)
+void sts::HitfinderChain::AppendHitsModule(int module, std::vector<sts::Hit>& hits)
 {
   EnsureParameters();
 
-  auto& hfc = fHitfinderHost;
+  xpu::h_view hHits {fHitfinder.hitsPerModule};
+  xpu::h_view hNHits {fHitfinder.nHitsPerModule};
 
-  auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
-  int nHitsGpu  = hfc.nHitsPerModule.h()[module];
+  auto* gpuHits = &hHits[module * fHitfinder.maxHitsPerModule];
+  int nHitsGpu  = hNHits[module];
 
   hits.reserve(hits.size() + nHitsGpu);
 
@@ -319,3 +402,133 @@ void StsHitfinderChain::AppendHitsModule(int module, std::vector<StsGpuHit>& hit
     //                   hitGpu.fDv);
   }
 }
+
+void sts::HitfinderChain::EnsureDigiOffsets(DigiMap& digi)
+{
+  xpu::h_view digiOffset {fHitfinder.digiOffsetPerModule};
+
+  size_t nModules = fPars->modules.size();
+  size_t offset   = 0;
+  // Front
+  for (size_t m = 0; m < nModules; m++) {
+    const auto& moduleDigis = digi.front[fPars->modules[m].address];
+    LOG_IF(fatal, digiOffset[m] != offset) << "Digi offset mismatch";
+    offset += moduleDigis.size();
+  }
+
+  // Back
+  for (size_t m = 0; m < nModules; m++) {
+    const auto& moduleDigis = digi.back[fPars->modules[m].address];
+    LOG_IF(fatal, digiOffset[nModules + m] != offset)
+      << "Module " << nModules + m << ": Digi offset mismatch: " << digiOffset[nModules + m] << " != " << offset;
+    offset += moduleDigis.size();
+  }
+
+  LOG_IF(fatal, offset != digiOffset[2 * nModules]) << "Digi offset mismatch";
+}
+
+void sts::HitfinderChain::EnsureDigisSorted()
+{
+  xpu::h_view digiOffset {fHitfinder.digiOffsetPerModule};
+  xpu::h_view digis {fHitfinder.digisPerModule};
+
+  bool isSorted = true;
+
+  for (size_t m = 0; m < fPars->modules.size(); m++) {
+    int nDigis = digiOffset[m + 1] - digiOffset[m];
+    if (nDigis == 0) continue;
+
+    auto* digisModule = &digis[digiOffset[m]];
+
+    for (int i = 0; i < nDigis - 1; i++) {
+      const auto &digi1 = digisModule[i], &digi2 = digisModule[i + 1];
+
+      if ((digi1.GetChannel() < digi2.GetChannel())
+          || (digi1.GetChannel() == digi2.GetChannel()
+              && digi1.GetTime() <= digi2.GetTime())  // Unpacker sometimes produces multiple
+                                                      // digis with the same time and channel, FIXME!
+      ) {
+        continue;
+      }
+
+      isSorted = false;
+      LOG(error) << "Module " << m << " not sorted: " << digi1.ToString() << " " << digi2.ToString();
+      break;
+    }
+  }
+
+  LOG_IF(fatal, !isSorted) << "Digis are not sorted";
+}
+
+void sts::HitfinderChain::EnsureChannelOffsets(span<u32> channelOffsetsByModule)
+{
+  xpu::h_view digisPerModule {fHitfinder.digisPerModule};
+  xpu::h_view digiOffsetPerModule {fHitfinder.digiOffsetPerModule};
+  for (size_t m = 0; m < fPars->modules.size() * 2; m++) {
+    int nChannels = fPars->nChannels / 2;  // Consider module sides
+
+    std::vector<u32> expectedChannelOffsets(nChannels, 0);
+
+    int offset = digiOffsetPerModule[m];
+    int nDigis = digiOffsetPerModule[m + 1] - offset;
+    span<CbmStsDigi> digis(&digisPerModule[offset], nDigis);
+    span<u32> channelOffsets = channelOffsetsByModule.subspan(m * nChannels, nChannels);
+
+    if (nDigis == 0) continue;
+
+    LOG_IF(fatal, channelOffsets[0] != 0) << "Module " << m << ": First channel offset is not 0";
+
+    int chan = digis[0].GetChannel();
+    for (int i = 0; i < nDigis; i++) {
+      if (digis[i].GetChannel() != chan) {
+        while (chan < digis[i].GetChannel()) {
+          chan++;
+          expectedChannelOffsets[chan] = i;
+        }
+      }
+    }
+    while (chan < nChannels) {
+      chan++;
+      expectedChannelOffsets[chan] = nDigis;
+    }
+
+    for (int i = 0; i < nChannels; i++) {
+      LOG_IF(fatal, channelOffsets[i] != expectedChannelOffsets[i])
+        << "Module " << m << ": Channel offset for channel " << i << " is " << channelOffsets[i] << " but should be "
+        << expectedChannelOffsets[i];
+    }
+  }
+}
+
+void sts::HitfinderChain::EnsureClustersSane(span<GpuClusterIdx> hClusterIdx, span<int> hNClusters)
+{
+  for (size_t m = 0; m < 2 * fPars->modules.size(); m++) {
+    int nClusters = hNClusters[m];
+
+    LOG(trace) << "Module " << m << " has " << nClusters << " clusters of " << fHitfinder.maxClustersPerModule;
+
+    if (nClusters == 0) continue;
+
+    if (nClusters < 0) { LOG(fatal) << "Module " << m << " has negative number of clusters " << nClusters; }
+    if (size_t(nClusters) > fHitfinder.maxClustersPerModule) {
+      LOG(fatal) << "Module " << m << " has " << nClusters << " clusters, but only " << fHitfinder.maxClustersPerModule
+                 << " are allowed";
+    }
+
+    auto* clusterIdx = &hClusterIdx[m * fHitfinder.maxClustersPerModule];
+
+    for (int i = 0; i < nClusters; i++) {
+      auto& cidx = clusterIdx[i];
+
+      if (cidx.fIdx < 0 || size_t(cidx.fIdx) >= fHitfinder.maxClustersPerModule) {
+        LOG(fatal) << "Cluster " << i << " of module " << m << " has invalid index " << cidx.fIdx;
+      }
+
+      if (cidx.fTime == 0xFFFFFFFF) {
+        LOG(fatal) << "Cluster " << i << " of module " << m << " has invalid time " << cidx.fTime;
+      }
+    }
+  }
+
+  LOG(trace) << "Clusters ok";
+}
diff --git a/algo/detectors/sts/StsHitfinderChain.h b/algo/detectors/sts/StsHitfinderChain.h
index 37fcd4277bcad92e57d33a3e11ddaddc8063c336..bb38e93f66288ad9d09eb68b7990b92db8df99a8 100644
--- a/algo/detectors/sts/StsHitfinderChain.h
+++ b/algo/detectors/sts/StsHitfinderChain.h
@@ -14,46 +14,16 @@
 #include <optional>
 #include <vector>
 
-#include <xpu/host.h>
-
 #include "StsHitfinder.h"
+#include "SubChain.h"
+#include "gpu/xpu_legacy.h"
+#include "sts/HitfinderPars.h"
+#include "sts/LandauTable.h"
 
-namespace cbm::algo
+namespace cbm::algo::sts
 {
 
-  struct StsModuleTransformationMatrix {
-    // Rotation + translation matrix to transform
-    // local module coordinates into global coordinate system.
-    // No need for fancy math types here. These values are just copied
-    // and moved to the GPU.
-    std::array<float, 9> rotation;  // 3x3 matrix
-    std::array<float, 3> translation;
-  };
-
-  struct StsModulePar {
-    int32_t address;
-    float dY;
-    float pitch;
-    float stereoF;
-    float stereoB;
-    float lorentzF;
-    float lorentzB;
-    StsModuleTransformationMatrix localToGlobal;
-  };
-
-  struct StsLandauTable {
-    std::vector<float> values;
-    float stepSize;
-  };
-
-  struct StsHitfinderPar {
-    StsAsicPar asic;  // Asic definitions. Currently assumes same parameters for all asics.
-    int nChannels;    // Total number of channels per module. Hitfinder assumes nChannels / 2 channels per side.
-    std::vector<StsModulePar> modules;
-    StsLandauTable landauTable;
-  };
-
-  struct StsHitfinderTimes {
+  struct HitfinderTimes {
     double timeIO          = 0;
     double timeSortDigi    = 0;
     double timeCluster     = 0;
@@ -67,22 +37,14 @@ namespace cbm::algo
    * handles memory transfer for input / output data and conversion to the
    * regular CBM STS types.
    */
-  class StsHitfinderChain {
+  class HitfinderChain : public SubChain {
 
   public:
-    void SetParameters(const StsHitfinderPar& parameters);
-    const StsHitfinderPar& GetParameters() const { return *fPars; }
+    void SetParameters(const sts::HitfinderPars& parameters);
+    const sts::HitfinderPars& GetParameters() const { return *fPars; }
 
     void operator()(gsl::span<const CbmStsDigi>);
 
-    StsHitfinderTimes GetHitfinderTimes() const { return fHitfinderTimes; }
-
-    /**
-       * Returns access to (host copies of) the raw buffers used by the gpu hitfinder.
-       * This is currently the only way to access the produced hits and cluster.
-       */
-    const StsHitfinderHost& GetHitfinderBuffers() const { return fHitfinderHost; }
-
   private:
     // Shorthands to map module-address onto digis in that module
     using DigiMapSide = std::map<int, std::vector<CbmStsDigi>>;
@@ -96,8 +58,8 @@ namespace cbm::algo
        *  TODO: These values might be wildly off. Look for better estimations.
        *  TODO: Should be configurable at runtime.
        */
-    static constexpr float kClustersPerDigiFactor = 0.5f;
-    static constexpr float kHitsPerClustersFactor = 4.f;
+    static constexpr float kClustersPerDigiFactor = 1.f;
+    static constexpr float kHitsPerClustersFactor = 1.5f;
 
     /**
        * Ensure parameters were set. Raises log(fatal) otherwise.
@@ -127,21 +89,24 @@ namespace cbm::algo
     void FlattenDigis(DigiMap& digiMap);
     void FlattenDigisSide(DigiMapSide& digis, bool isFront);
 
-    void CollectTimingInfo();
-
     /**
        * Transfer Hits / Clusters back to host and convert to common CBM types.
        */
-    void AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>&);
-    void AppendHitsModule(int module, std::vector<StsGpuHit>&);
+    void AppendClustersModule(int module, bool isFront, std::vector<sts::Cluster>&);
+    void AppendHitsModule(int module, std::vector<sts::Hit>&);
+
+    // Debug functions, ensure reco produces sane results
+    void EnsureDigiOffsets(DigiMap&);
+    void EnsureDigisSorted();
+    void EnsureChannelOffsets(span<u32>);
+    void EnsureClustersSane(span<GpuClusterIdx>, span<int>);
+    void EnsureClustersSorted();
 
-    std::optional<const StsHitfinderPar> fPars;
-    StsHitfinderTimes fHitfinderTimes;
+    std::optional<const sts::HitfinderPars> fPars;
 
-    StsHitfinderHost fHitfinderHost;
-    StsHitfinderGpu fHitfinderGpu;
+    Hitfinder fHitfinder;
   };
 
-}  // namespace cbm::algo
+}  // namespace cbm::algo::sts
 
 #endif  // #ifndef CBM_ALGO_STS_HITFINDER_CHAIN_H
diff --git a/algo/params/LandauWidthTable.txt b/algo/params/LandauWidthTable.txt
new file mode 120000
index 0000000000000000000000000000000000000000..66b518d868f39fcf70c482274ee247aefe58489f
--- /dev/null
+++ b/algo/params/LandauWidthTable.txt
@@ -0,0 +1 @@
+../../parameters/sts/LandauWidthTable.txt
\ No newline at end of file
diff --git a/algo/params/StsHitfinder.yaml b/algo/params/StsHitfinder.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..184f9d4e69c3bef05f2904b319a0f96ab369342f
--- /dev/null
+++ b/algo/params/StsHitfinder.yaml
@@ -0,0 +1,122 @@
+---
+asic:
+  nAdc: 32
+  dynamicRange: 75000
+  threshold: 3000
+  timeResolution: 5
+  deadTime: 800
+  noise: 1000
+  zeroNoiseRate: 0.00397890015
+nChannels: 2048
+modules:
+  - address: 0x10008002
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1]
+      translation: [-2.97959995, 2.86999989, 27.2849998]
+  - address: 0x10018002
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1]
+      translation: [-2.97959995, -2.86999989, 27.4850006]
+  - address: 0x10008402
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [2.97959995, 2.86999989, 28.7150002]
+  - address: 0x10018402
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [2.97959995, -2.86999989, 28.5149994]
+  - address: 0x10008012
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [-5.95919991, 5.96999979, 42.9150009]
+  - address: 0x10018012
+    dY: 12.1599998
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [-5.95919991, -2.86999989, 42.7150002]
+  - address: 0x10008412
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1]
+      translation: [7.31114159e-16, 5.96999979, 41.0849991]
+  - address: 0x10018412
+    dY: 12.1599998
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1]
+      translation: [-3.51473622e-16, -2.86999989, 41.2849998]
+  - address: 0x10008812
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [5.95919991, 5.73999977, 42.9150009]
+  - address: 0x10018812
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [5.95919991, 1.33226763e-15, 42.7150002]
+  - address: 0x10028812
+    dY: 5.96000004
+    pitch: 0.00579999993
+    stereoF: 0
+    stereoB: 7.5
+    lorentzF: 0
+    lorentzB: 0
+    localToGlobal:
+      rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1]
+      translation: [5.95919991, -5.73999977, 42.5149994]
+...