From 7aab8d24d1250c0594bb87efa219a7cff41e86b5 Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de>
Date: Tue, 11 Jul 2023 12:29:25 +0000
Subject: [PATCH] alg::sts::Hitfinder: Use more efficient parallelization on
 CPU.

---
 algo/detectors/sts/StsHitfinder.cxx      | 4 ++--
 algo/detectors/sts/StsHitfinderChain.cxx | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/algo/detectors/sts/StsHitfinder.cxx b/algo/detectors/sts/StsHitfinder.cxx
index 4bed13ff02..1b5933982c 100644
--- a/algo/detectors/sts/StsHitfinder.cxx
+++ b/algo/detectors/sts/StsHitfinder.cxx
@@ -496,7 +496,7 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 // On GPU process all front clusters in parallel instead (one thread per cluster)
 //   to fully utilize the GPU.
 // Currently use option 2 for both as it is faster on CPU as well.
-#if XPU_IS_CPU
+#if 0
   int iModule = ctx.block_idx_x();
 #else
   int iModule = 0;
@@ -545,7 +545,7 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
   float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB);
 
   int startB = 0;
-#if XPU_IS_CPU
+#if 0
   for (int iClusterF = ctx.thread_idx_x(); iClusterF < nClustersF; iClusterF += ctx.block_dim_x()) {
 #else
   int iClusterF = iThread;
diff --git a/algo/detectors/sts/StsHitfinderChain.cxx b/algo/detectors/sts/StsHitfinderChain.cxx
index d743d65d81..e70bfe9f1c 100644
--- a/algo/detectors/sts/StsHitfinderChain.cxx
+++ b/algo/detectors/sts/StsHitfinderChain.cxx
@@ -134,7 +134,12 @@ void sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
   queue.wait();
   xpu::h_view nClusters {hfc.nClustersPerModule};
   size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
-  bool isCpu            = xpu::device::active().backend() == xpu::cpu;
+
+  // FindHits supports two modes of parallelization: one thread per cluster or one block per module.
+  // Currently we use the first mode (one thread per cluster) for both CPU and GPU.
+  // See sts::Hitfinder::FindHits() for details.
+  // bool isCpu            = xpu::device::active().backend() == xpu::cpu;
+  bool isCpu            = false;
   xpu::grid findHitsG   = isCpu ? xpu::n_blocks(nModules) : xpu::n_threads(nClustersFront);
   queue.launch<FindHits>(findHitsG);
 
-- 
GitLab