diff --git a/algo/detectors/sts/Hitfinder.cxx b/algo/detectors/sts/Hitfinder.cxx
index 09ebeb6a8d5d6d7eb72a5f19852691306b7b6a81..b2a9629658782dfd67576a21b6cdd8543841d219 100644
--- a/algo/detectors/sts/Hitfinder.cxx
+++ b/algo/detectors/sts/Hitfinder.cxx
@@ -524,24 +524,26 @@ XPU_D void sts::Hitfinder::SortClusters(SortClusters::context& ctx) const
 XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 {
 
-// On CPU process all modules in parallel (one thread per module).
-// On GPU process all front clusters in parallel instead (one thread per cluster)
-//   to fully utilize the GPU.
-// Currently use option 2 for both as it is faster on CPU as well.
-#if 0
-  int iModule = ctx.block_idx_x();
-#else
   int iModule = 0;
   int iThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
 
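+  // On CPU each thread processes one chunk of front clusters; every module is
+  // split into kFindHitsChunksPerModule chunks. On GPU each thread instead
+  // processes a single front cluster: the per-module cluster counts are walked
+  // to map the global thread index to a (module, cluster) pair.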
+#if XPU_IS_CPU
+  iModule = iThread / kFindHitsChunksPerModule;
+#else
   for (; iModule < nModules; iModule++) {
     if (iThread < nClustersPerModule[iModule]) break;
     iThread -= nClustersPerModule[iModule];
   }
-
-  if (iModule >= nModules) return;
 #endif
 
+  if (iModule >= nModules) {
+    return;
+  }
+
   int iModuleF               = iModule;
   int iModuleB               = iModuleF + nModules;
   size_t offsetF             = iModuleF * maxClustersPerModule;
@@ -554,14 +556,35 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
   int nClustersB             = nClustersPerModule[iModuleB];
   size_t nHitsWritten        = nHitsPerModule[iModule];
 
-  if (nClustersF == 0 || nClustersB == 0) return;
+  if (nClustersF == 0 || nClustersB == 0) {
+    return;
+  }
+
+  // Only used on CPU: the front clusters of this module are split into
+  // kFindHitsChunksPerModule chunks (ceil division; the last chunk may be smaller)
+  [[maybe_unused]] int nClustersPerChunk = (nClustersF + kFindHitsChunksPerModule - 1) / kFindHitsChunksPerModule;
+  [[maybe_unused]] int iChunk            = iThread % kFindHitsChunksPerModule;
+
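+  // Index of the first front cluster processed by this thread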
+  int iClusterF =
+#if XPU_IS_CPU
+    iChunk * nClustersPerChunk;
+#else
+    iThread;
+#endif
+  if (iClusterF >= nClustersF) {
+    return;
+  }
 
-  // Stop processing if memory limits are exceed by a large amount
-  // In general we would still want to process all clusters to get an idea
-  // by how much memory estimates are wrong.
-  // However they are currently chosen very generous, so if they are exceeded by a large amount
-  // something must have gone awry. (e.g. monster events in the mCBM data that explode the hit finding combinatorics)
+  // Stop processing if memory limits are exceeded by a large amount.
+  // In general we would still want to process all clusters to get an idea
+  // of how much the memory estimates are off.
+  // However, the limits are currently chosen very generously, so if they are
+  // exceeded by a large amount, something must have gone awry (e.g. monster
+  // events in the mCBM data that explode the hit-finding combinatorics).
-  if (nHitsWritten > 2 * maxHitsPerModule) return;
+  if (nHitsWritten > 2 * maxHitsPerModule) {
+    return;
+  }
 
   HitfinderCache pars;
   {
@@ -584,41 +607,42 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 
   float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB);
 
-  int startB = 0;
-#if 0
-  for (int iClusterF = ctx.thread_idx_x(); iClusterF < nClustersF; iClusterF += ctx.block_dim_x()) {
-#else
-  int iClusterF = iThread;
-  if (iClusterF < nClustersF) {
-
-    float firstPossibleTime = float(clusterIdxF[iClusterF].fTime) - maxSigmaBoth;
-
-    // Use binary search to find the first cluster on back side that can be matched
-    // with the current cluster on front side
-    int start = startB;
-    int end   = nClustersB;
-    while (end - start > 1) {
-      int mid = (start + end) / 2;
-      if (float(clusterIdxB[mid].fTime) < firstPossibleTime) {
-        start = mid;
-      }
-      else {
-        end = mid;
-      }
+  int startB              = 0;
+  float firstPossibleTime = float(clusterIdxF[iClusterF].fTime) - maxSigmaBoth;
+
+  // Use binary search to find the first back-side cluster that can be matched
+  // with the first front-side cluster handled here. Clusters are sorted by time,
+  // so this lower bound stays valid for every later cluster in the chunk.
+  int start = startB;
+  int end   = nClustersB;
+  while (end - start > 1) {
+    int mid = (start + end) / 2;
+    if (float(clusterIdxB[mid].fTime) < firstPossibleTime) {
+      start = mid;
     }
+    else {
+      end = mid;
+    }
+  }
+  startB = start;
 
-    startB = start;
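+  // One past the last front cluster handled by this thread:
+  // a full chunk on CPU, a single cluster on GPU.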
+  int endClusterF =
+#if XPU_IS_CPU
+    xpu::min(iClusterF + nClustersPerChunk, nClustersF);
+#else
+    iClusterF + 1;
 #endif
-  // Temporarily disable clang-format because it indents this block wrongly for some reason.
-  // TODO: Code above needs cleanup. Then clang-format can be re-enabled too.
-  // clang-format off
-    ClusterIdx clsIdxF = clusterIdxF[iClusterF];
+
+  for (; iClusterF < endClusterF; iClusterF++) {
+    ClusterIdx clsIdxF    = clusterIdxF[iClusterF];
     sts::Cluster clsDataF = clusterDataF[clsIdxF.fIdx];
 
     float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrB * maxTerrB);
 
     for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) {
-      ClusterIdx clsIdxB = clusterIdxB[iClusterB];
+      ClusterIdx clsIdxB    = clusterIdxB[iClusterB];
       sts::Cluster clsDataB = clusterDataB[clsIdxB.fIdx];
 
       float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime);
@@ -636,18 +660,21 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 
       float timeCut                 = -1.f;
       const RecoParams::STS& params = ctx.cmem<Params>().sts;
-      if (params.timeCutClusterAbs > 0.f) timeCut = params.timeCutClusterAbs;
+      if (params.timeCutClusterAbs > 0.f) {
+        timeCut = params.timeCutClusterAbs;
+      }
       else if (params.timeCutClusterSig > 0.f) {
         float eF = clsDataF.fTimeError;
         float eB = clsDataB.fTimeError;
         timeCut  = params.timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB);
       }
 
-      if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) continue;
+      if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) {
+        continue;
+      }
 
       IntersectClusters(iModule, pars, clsIdxF, clsDataF, clsIdxB, clsDataB);
-    }
-  }
+    }  // end loop over clusters on back side
+  }    // end loop over clusters on front side
-  // clang-format on
 }
 
diff --git a/algo/detectors/sts/Hitfinder.h b/algo/detectors/sts/Hitfinder.h
index af0d6ac765473ece86662bc416810a1f8de59abc..c96c5522536edd7325354f2115bbe269bc5640ac 100644
--- a/algo/detectors/sts/Hitfinder.h
+++ b/algo/detectors/sts/Hitfinder.h
@@ -37,6 +37,8 @@ namespace cbm::algo
     kFindClusterBlockSize       = 1024,
     kFindHitsBlockSize          = 64,
 #endif
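+    // Number of chunks the front clusters of a module are split into by FindHits (CPU only)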
+    kFindHitsChunksPerModule    = 16,
   };
 }  // namespace cbm::algo
 
@@ -108,7 +110,9 @@ namespace cbm::algo::sts
     using block_size = xpu::block_size<kFindHitsBlockSize>;
     using constants  = xpu::cmem<TheHitfinder, Params>;
     using context    = xpu::kernel_context<shared_memory, constants>;
-    using openmp     = xpu::openmp_settings<xpu::schedule_dynamic, 128>;
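+    // With fixed-size chunks per module, CPU work items are roughly uniform,
+    // so a plain static schedule replaces the previous dynamic one.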
+    using openmp     = xpu::openmp_settings<xpu::schedule_static, 1>;
     XPU_D void operator()(context&);
   };
 
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index a2dcbbc3d55a1e7ec6283095e499e654044b358e..18c6d0a4b6db6eeaa462a2da9175ef0693616727 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -181,12 +181,12 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
 
   size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
 
-  // FindHits supports to modes of parallelization: One thread per cluster or one block per module
-  // Currently we use method one for CPU and GPU.
-  // See sts::Hitfinder::FindHits() for details.
-  // bool isCpu            = xpu::device::active().backend() == xpu::cpu;
-  bool isCpu          = false;
-  xpu::grid findHitsG = isCpu ? xpu::n_blocks(nModules) : xpu::n_threads(nClustersFront);
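+  // FindHits supports two modes of parallelization: one thread per front
+  // cluster (GPU) or kFindHitsChunksPerModule threads per module (CPU).
+  // See sts::Hitfinder::FindHits() for details.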
+  bool isCpu          = xpu::device::active().backend() == xpu::cpu;
+  int threadsCPU      = kFindHitsChunksPerModule * nModules;
+  xpu::grid findHitsG = xpu::n_threads(isCpu ? threadsCPU : nClustersFront);
   queue.launch<FindHits>(findHitsG);
   xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster));