From bdf5f24521ccaa479cb1f177d0e828b370f4d0a5 Mon Sep 17 00:00:00 2001
From: Dominik Smith <d.smith@gsi.de>
Date: Thu, 25 Apr 2024 13:53:37 +0200
Subject: [PATCH] Parallelized cluster finding in trd::Hitfind across rows.

---
 algo/detectors/trd/Hitfind.cxx | 103 +++++++++++++++++++++++++--------
 algo/detectors/trd/Hitfind.h   |  11 +++-
 2 files changed, 89 insertions(+), 25 deletions(-)

diff --git a/algo/detectors/trd/Hitfind.cxx b/algo/detectors/trd/Hitfind.cxx
index 4367147765..571fbd8bb8 100644
--- a/algo/detectors/trd/Hitfind.cxx
+++ b/algo/detectors/trd/Hitfind.cxx
@@ -50,10 +50,12 @@ namespace cbm::algo::trd
           padPar.pos               = ROOT::Math::XYZVector(pos_ptr[0], pos_ptr[1], pos_ptr[2]);
           padPar.posErr            = ROOT::Math::XYZVector(posErr_ptr[0], posErr_ptr[1], posErr_ptr[2]);
         }
+        fRowList.emplace_back(module.address, false, row);
       }
       fHitFind[module.address]      = std::make_unique<cbm::algo::trd::HitFinder>(params);
       fClusterBuild[module.address] = std::make_unique<cbm::algo::trd::Clusterizer>(params);
-      fModList.emplace_back(module.address, false);
+      fModId[module.address]        = fModList.size();
+      fModList.emplace_back(module.address, false, module.rowPar.size(), module.rowPar[0].padPar.size());
     }
 
     // Create one algorithm per module for TRD2D and configure it with parameters
@@ -86,10 +88,12 @@ namespace cbm::algo::trd
           padPar.chRMasked         = pad.chRMasked;
           padPar.chTMasked         = pad.chTMasked;
         }
+        fRowList.emplace_back(module.address, true, row);
       }
       fHitFind2d[module.address]      = std::make_unique<cbm::algo::trd::HitFinder2D>(params);
       fClusterBuild2d[module.address] = std::make_unique<cbm::algo::trd::Clusterizer2D>(params);
-      fModList.emplace_back(module.address, true);
+      fModId[module.address]          = fModList.size();
+      fModList.emplace_back(module.address, true, module.rowPar.size(), module.rowPar[0].padPar.size());
     }
 
     L_(info) << "--- Configured hitfinder algorithms for TRD.";
@@ -110,34 +114,78 @@ namespace cbm::algo::trd
 
     xpu::scoped_timer t_{"TRDHitfind", &monitor.timeTotal};
 
-    // Intermediate storage variables (digi, index) per module
-    std::unordered_map<int, std::vector<std::pair<CbmTrdDigi, int32_t>>> digiBuffer;  //[modAddress]
+    // Intermediate digi storage variables (digi, index) per module and row
+    std::unordered_map<int, std::vector<std::vector<std::pair<CbmTrdDigi, int32_t>>>> digiBuffer;  //[modAddress][row]
+
+    // Intermediate 1D clusters per module and row
+    std::unordered_map<int, std::vector<std::vector<Cluster>>> clusterBuffer;  //[modAddress][row]
+
+    // Intermediate 2D clusters per module and row
+    std::unordered_map<int, std::vector<std::vector<Cluster2D>>> clusterBuffer2d;  //[modAddress][row]
+
+    // Initialize storage buffers
+    for (size_t mod = 0; mod < fModList.size(); mod++) {
+      const int address    = std::get<0>(fModList[mod]);
+      const bool is2D      = std::get<1>(fModList[mod]);
+      const size_t numRows = std::get<2>(fModList[mod]);
+      digiBuffer[address].resize(numRows);
+      if (is2D) {
+        clusterBuffer2d[address].resize(numRows);
+      }
+      else {
+        clusterBuffer[address].resize(numRows);
+      }
+    }
 
     // Loop over the digis array and store the digis in separate vectors for
-    // each modules
+    // each module and row
     xpu::push_timer("DigiModuleSort");
     for (size_t idigi = 0; idigi < digiIn.size(); idigi++) {
       const CbmTrdDigi* digi = &digiIn[idigi];
-      auto addrModule        = digi->GetAddressModule();
-
+      const int address      = digi->GetAddressModule();
       if constexpr (DebugCheckInput) {
-        auto modInfo = std::find_if(fModList.begin(), fModList.end(), [&](auto m) { return m.first == addrModule; });
+        auto modInfo =
+          std::find_if(fModList.begin(), fModList.end(), [&](auto m) { return std::get<0>(m) == address; });
         if (modInfo == fModList.end()) {
           L_(error) << "TRD: Unknown module ID";
           continue;
         }
         bool digiIs2D = digi->IsFASP();
-        if (modInfo->second != digiIs2D) {
-          L_(error) << "TRD: Module + Digi type mismatch: " << modInfo->first << ": " << modInfo->second << " "
-                    << digiIs2D;
+        if (std::get<1>(*modInfo) != digiIs2D) {
+          L_(error) << "TRD: Module + Digi type mismatch: " << std::get<0>(*modInfo) << ": " << std::get<1>(*modInfo)
+                    << " " << digiIs2D;
           continue;
         }
       }
-
-      digiBuffer[addrModule].emplace_back(*digi, idigi);
+      const size_t modId   = fModId[address];
+      const size_t numCols = std::get<3>(fModList[modId]);
+      const int row        = digi->GetAddressChannel() / numCols;
+      digiBuffer[address][row].emplace_back(*digi, idigi);
     }
     monitor.sortTime = xpu::pop_timer();
 
+    xpu::push_timer("BuildClusters");
+    xpu::t_add_bytes(digiIn.size_bytes());
+
+    // Cluster building
+    CBM_PARALLEL_FOR(schedule(dynamic))
+    for (size_t row = 0; row < fRowList.size(); row++) {
+
+      const int address     = std::get<0>(fRowList[row]);
+      const bool is2D       = std::get<1>(fRowList[row]);
+      const size_t rowInMod = std::get<2>(fRowList[row]);
+      const auto& digiInput = digiBuffer[address][rowInMod];
+      if (is2D) {
+        clusterBuffer2d[address][rowInMod] =
+          (*fClusterBuild2d[address])(digiInput, 0.);  // Number is TS start time (T0)
+      }
+      else {
+        clusterBuffer[address][rowInMod] = (*fClusterBuild[address])(digiInput);
+      }
+    }
+    monitor.timeClusterize = xpu::pop_timer();
+
+    // Hit finding
     PODVector<Hit> hitsFlat;                          // hit storage
     std::vector<size_t> modSizes(fModList.size());    // nHits per modules
     std::vector<uint> modAddresses(fModList.size());  // address of modules
@@ -146,8 +194,6 @@ namespace cbm::algo::trd
     std::vector<size_t> hitsPrefix;
 
     xpu::push_timer("FindHits");
-    xpu::t_add_bytes(digiIn.size_bytes());
-
     CBM_PARALLEL()
     {
       const int ithread  = openmp::GetThreadNum();
@@ -159,17 +205,28 @@ namespace cbm::algo::trd
 
       CBM_OMP(for schedule(dynamic) nowait)
       for (size_t mod = 0; mod < fModList.size(); mod++) {
-        const int address = fModList[mod].first;
-        const bool is2D   = fModList[mod].second;
+        const int address = std::get<0>(fModList[mod]);
+        const bool is2D   = std::get<1>(fModList[mod]);
+
+        // Lambda expression for vector concatenation
+        auto concatVec = [](auto& acc, const auto& innerVec) {
+          acc.insert(acc.end(), innerVec.begin(), innerVec.end());
+          return std::move(acc);
+        };
 
+        // Flatten the input cluster vector of vectors and build hits
         std::vector<Hit> mod_hits;
-        if (is2D) {                                                              // 2D module
-          auto clusters = (*fClusterBuild2d[address])(digiBuffer[address], 0.);  // Number is TS start time (T0)
-          mod_hits      = (*fHitFind2d[address])(&clusters);
+        if (is2D) {
+          auto& buffer  = clusterBuffer2d[address];
+          auto clusters = std::accumulate(buffer.begin(), buffer.end(), std::vector<Cluster2D>(), concatVec);
+          //sort(clusters.begin(), clusters.end(), [](const auto& a, const auto& b) { return a.GetStartTime() < b.GetStartTime(); });
+          mod_hits = (*fHitFind2d[address])(&clusters);
         }
-        else {  // 1D module
-          auto clusters = (*fClusterBuild[address])(digiBuffer[address]);
-          mod_hits      = (*fHitFind[address])(&clusters);
+        else {
+          auto& buffer  = clusterBuffer[address];
+          auto clusters = std::accumulate(buffer.begin(), buffer.end(), std::vector<Cluster>(), concatVec);
+          //sort(clusters.begin(), clusters.end(), [](const auto& a, const auto& b) { return a.GetStartTime() < b.GetStartTime(); });
+          mod_hits = (*fHitFind[address])(&clusters);
         }
         // Append clusters to output
         local_hits.insert(local_hits.end(), std::make_move_iterator(mod_hits.begin()),
diff --git a/algo/detectors/trd/Hitfind.h b/algo/detectors/trd/Hitfind.h
index 07f64557ba..b3882f94af 100644
--- a/algo/detectors/trd/Hitfind.h
+++ b/algo/detectors/trd/Hitfind.h
@@ -33,6 +33,7 @@ namespace cbm::algo::trd
   struct HitfindMonitorData {
     xpu::timings timeTotal;
     xpu::timings timeHitfind;
+    xpu::timings timeClusterize;
     xpu::timings sortTime;
     size_t numDigis = 0;
     size_t numHits  = 0;
@@ -78,7 +79,13 @@ namespace cbm::algo::trd
     std::unordered_map<int, std::unique_ptr<cbm::algo::trd::HitFinder2D>> fHitFind2d;
     std::unordered_map<int, std::unique_ptr<cbm::algo::trd::HitFinder>> fHitFind;
 
-    /** @brief List of modules (address and type flag; true = 2D) */
-    std::vector<std::pair<int, bool>> fModList;
+    /** @brief List of modules (address, type flag (true = 2D), numRows, numCols) */
+    std::vector<std::tuple<int, bool, size_t, size_t>> fModList;
+
+    /** @brief List of rows (module address, type flag (true = 2D), row in module) */
+    std::vector<std::tuple<int, bool, size_t>> fRowList;
+
+    /** @brief Map from module address to module Id (sequential number) */
+    std::unordered_map<int, size_t> fModId;
   };
 }  // namespace cbm::algo::trd
-- 
GitLab