diff --git a/algo/detectors/sts/Hitfinder.cxx b/algo/detectors/sts/Hitfinder.cxx
index dd85d750b65eec4df000c02a6d1d37edd95cbf62..bc129e2e645b6459cacfc099e00e6499fde93ad0 100644
--- a/algo/detectors/sts/Hitfinder.cxx
+++ b/algo/detectors/sts/Hitfinder.cxx
@@ -222,18 +222,45 @@ XPU_D void sts::Hitfinder::FindClustersParallel(FindClusters::context& ctx) cons
 */
 XPU_D void sts::Hitfinder::CalculateClustersParallel(FindClusters::context& ctx) const
 {
-  int const iModule = ctx.block_idx_x();
-  CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]];
-  ;
-  auto const nDigis = GetNDigis(iModule);
+  const int nModuleSides = 2 * nModules;
 
-  if (nDigis == 0) return;
+  int iModule = 0;
+  int iThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+
+  for (; iModule < nModuleSides; iModule++) {
+    i32 nDigis = GetNDigis(iModule);
+    if (iThread < nDigis) {
+      break;
+    }
+    iThread -= nDigis;
+  }
+
+  if (iModule >= nModuleSides) {
+    return;
+  }
+
+  const CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]];
+  auto* digiConnector     = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
 
-  auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]];
-  // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+  // Index of this thread's digi, relative to the first digi of its module side
+  const int iDigi = iThread;
 
-  // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
-  CalculateClustersDigiWise(ctx, digis, digiConnector, nDigis);
+  if (digiConnector[iDigi].HasPrevious()) {
+    return;
+  }
+
+  if (!digiConnector[iDigi].HasNext()) {
+    // Cluster has 1 element
+    CreateClusterFromConnectors1(iModule, digis, iDigi);
+  }
+  else if (!digiConnector[digiConnector[iDigi].next()].HasNext()) {
+    // Cluster has 2 elements
+    CreateClusterFromConnectors2(iModule, digis, digiConnector, iDigi);
+  }
+  else {
+    // Cluster has >2 elements
+    CreateClusterFromConnectorsN(iModule, digis, digiConnector, iDigi);
+  }
 }
 
 /**
@@ -264,23 +291,10 @@ XPU_D void sts::Hitfinder::CalculateClustersDigiWise(FindClusters::context& ctx,
   for (unsigned int currIter = ctx.thread_idx_x(); currIter < nDigis; currIter += (unsigned int) ctx.block_dim_x()) {
 
     if (digiConnector[currIter].HasPrevious()) continue;
-
-    if (!digiConnector[currIter].HasNext()) {
-      //if Cluster has 1 element
-      CreateClusterFromConnectors1(iModule, digis, currIter);
-    }
-    else if (!digiConnector[digiConnector[currIter].next()].HasNext()) {
-      //if Cluster has 2 elements
-      CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter);
-    }
-    else {
-      //if Cluster has N elements
-      CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter);
-    }
   }
 }
 
-XPU_D void sts::Hitfinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
+XPU_D void sts::Hitfinder::CreateClusterFromConnectors1(int const iModule, const CbmStsDigi* digis, int digiIndex) const
 {
   const CbmStsDigi& digi = digis[digiIndex];
 
@@ -302,7 +316,7 @@ XPU_D void sts::Hitfinder::CreateClusterFromConnectors1(int const iModule, CbmSt
   AddCluster(iModule, time, cluster);
 }
 
-XPU_D void sts::Hitfinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
+XPU_D void sts::Hitfinder::CreateClusterFromConnectors2(int const iModule, const CbmStsDigi* digis,
                                                         sts::DigiConnector* digiConnector, int const digiIndex) const
 {
 
@@ -375,7 +389,7 @@ XPU_D void sts::Hitfinder::CreateClusterFromConnectors2(int const iModule, CbmSt
   AddCluster(iModule, time, cls);
 }
 
-XPU_D void sts::Hitfinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis,
+XPU_D void sts::Hitfinder::CreateClusterFromConnectorsN(int iModule, const CbmStsDigi* digis,
                                                         sts::DigiConnector* digiConnector, int digiIndex) const
 {
   ClusterCalculationProperties cProps;
@@ -634,7 +648,7 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
       IntersectClusters(iModule, pars, clsIdxF, clsDataF, clsIdxB, clsDataB);
     }
   }
-// clang-format on
+  // clang-format on
 }
 
 XPU_D void sts::Hitfinder::IntersectClusters(int iBlock, const HitfinderCache& pars, const ClusterIdx& idxF,
diff --git a/algo/detectors/sts/Hitfinder.h b/algo/detectors/sts/Hitfinder.h
index af69fab65a2451fd91ee77191962412a207cbc45..383e48cd226534e9627804f42904bade77d7d823 100644
--- a/algo/detectors/sts/Hitfinder.h
+++ b/algo/detectors/sts/Hitfinder.h
@@ -322,10 +322,10 @@ namespace cbm::algo::sts
     XPU_D void CalculateClustersDigiWise(FindClusters::context& ctx, CbmStsDigi* digis, DigiConnector* digiConnector,
                                          unsigned int const nDigis) const;
 
-    XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const;
-    XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, DigiConnector* digiConnector,
+    XPU_D void CreateClusterFromConnectors1(int const iModule, const CbmStsDigi* digis, int const digiIndex) const;
+    XPU_D void CreateClusterFromConnectors2(int const iModule, const CbmStsDigi* digis, DigiConnector* digiConnector,
                                             int const digiIndex) const;
-    XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, DigiConnector* digiConnector,
+    XPU_D void CreateClusterFromConnectorsN(int const iModule, const CbmStsDigi* digis, DigiConnector* digiConnector,
                                             int const digiIndex) const;
 
    private:
@@ -361,7 +361,7 @@ namespace cbm::algo::sts
       ClusterIdx* tgtIdx    = &clusterIdxPerModule[iModule * maxClustersPerModule];
       sts::Cluster* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule];
 
-      u32 pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1);
+      u32 pos = xpu::atomic_add(&nClustersPerModule[iModule], 1);
 
       if (size_t(pos) >= maxClustersPerModule) {
         xpu::atomic_add(&monitor->fNumClusterBucketOverflow, 1);
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index 68d253d2592ce0e1e6f6856d3bcb9c004cdba457..280911c5c2125eff15efe0cec2c4da25fb4acff7 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -141,7 +141,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
     xpu::k_add_bytes<ChannelOffsets>(nDigisTotal * sizeof(CbmStsDigi));
     queue.launch<CreateDigiConnections>(xpu::n_threads(nDigisTotal));
     xpu::k_add_bytes<CreateDigiConnections>(nDigisTotal * sizeof(CbmStsDigi));
-    queue.launch<CreateClusters>(xpu::n_blocks(nModuleSides));
+    queue.launch<CreateClusters>(xpu::n_threads(nDigisTotal));
     xpu::k_add_bytes<CreateClusters>(nDigisTotal * sizeof(CbmStsDigi));
   }
   if (Opts().LogLevel() == trace) {
@@ -167,12 +167,6 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
     EnsureClustersSane(clusterIdxPerModule, nClustersPerModule);
   }
 
-  // Run cluster finding steps in indivual kernels, useful for debugging / profiling
-  // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
-  // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
   L_(debug) << "STS Hitfinder Chain: Sort Clusters...";
   queue.launch<SortClusters>(xpu::n_blocks(nModuleSides));