From f0abc1ace270ce43474997cde508b4696fb0d4ee Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de>
Date: Fri, 27 Jan 2023 13:11:19 +0000
Subject: [PATCH] RecoSts: Improvements to logging / benchmarking.

- Log throughput in addition to runtime for each reconstrution step.
- Log device name, when running GPU reconstruction.
- Seperate steps in legacy reconstruction, to allow measuring the time
  spent in each step, even when running in parallel.
---
 reco/detectors/sts/CbmRecoSts.cxx       | 82 ++++++++++++++++++++-----
 reco/detectors/sts/CbmRecoSts.h         |  4 ++
 reco/detectors/sts/CbmStsRecoModule.cxx | 24 +++++++-
 reco/detectors/sts/CbmStsRecoModule.h   |  8 +++
 4 files changed, 102 insertions(+), 16 deletions(-)

diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx
index 4b716c4660..2a78872031 100644
--- a/reco/detectors/sts/CbmRecoSts.cxx
+++ b/reco/detectors/sts/CbmRecoSts.cxx
@@ -286,7 +286,7 @@ void CbmRecoSts::Finish()
   Double_t clusterHit  = Double_t(fNofClusters) / Double_t(fNofHits);
   LOG(info) << "=====================================";
   LOG(info) << GetName() << ": Run summary";
-  if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction.";
+  if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction. (Device " << xpu::device_properties().name << ")";
   else if (ompThreads < 0)
     LOG(info) << "STS reconstruction ran single threaded (No OpenMP).";
   else
@@ -302,7 +302,6 @@ void CbmRecoSts::Finish()
   LOG(info) << "Clusters per hit       : " << fixed << setprecision(2) << clusterHit;
   LOG(info) << "Time per TSlice        : " << fixed << setprecision(2) << 1000. * fTimeRun / Double_t(fNofTs) << " ms ";
 
-
   // Aggregate times for substeps of reconstruction
   // Note: These times are meaningless when reconstruction runs with > 1 thread.
   CbmStsRecoModule::Timings timingsTotal;
@@ -323,6 +322,8 @@ void CbmRecoSts::Finish()
   fTime3 /= nEvent;
   fTime4 /= nEvent;
 
+  auto throughput = [](auto bytes, auto timeMs) { return bytes * 1000. / timeMs / double(1ull << 30); };
+
   if (not fUseGpuReco) {
     LOG(info) << "NofEvents        : " << fNofEvents;
     LOG(info) << "Time Reset       : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1 << " ms ("
@@ -332,24 +333,35 @@ void CbmRecoSts::Finish()
     LOG(info) << "Time Reconstruct: " << fixed << setprecision(1) << setw(6) << 1000. * fTime3 << " ms ("
               << setprecision(1) << setw(4) << 100. * fTime3 / fTimeTot << " %)";
     LOG(info) << "Time by step:\n"
-              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortDigi
-              << " ms\n"
-              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeCluster
-              << " ms\n"
-              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortCluster
-              << " ms\n"
-              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeHits << " ms\n";
+              << "  Sort Digi   : " << fixed << setprecision(2) << setw(6) << 1000. * fTimeSortDigis << " ms ("
+              << throughput(fNofDigis * 8, 1000. * fTimeSortDigis) << " GB/s)\n"
+              << "  Find Cluster: " << fixed << setprecision(2) << setw(6) << 1000. * fTimeFindClusters << " ms ("
+              << throughput(fNofDigis * sizeof(CbmStsDigi), 1000. * fTimeFindClusters) << " GB/s)\n"
+              << "  Sort Cluster: " << fixed << setprecision(2) << setw(6) << 1000. * fTimeSortClusters << " ms ("
+              << throughput(fNofClusters * sizeof(CbmStsCluster), 1000. * fTimeSortClusters) << " GB/s)\n"
+              << "  Find Hits   : " << fixed << setprecision(2) << setw(6) << 1000. * fTimeFindHits << " ms ("
+              << throughput(fNofClusters * sizeof(CbmStsCluster), 1000. * fTimeFindHits) << " GB/s)";
   }
   else {
     cbm::algo::StsHitfinderTimes times = fGpuReco.GetHitfinderTimes();
 
     double gpuHitfinderTimeTotal = times.timeSortDigi + times.timeCluster + times.timeSortCluster + times.timeHits;
-    LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms";
+
+    double sortDigiThroughput    = throughput(fNofDigis * sizeof(CbmStsDigi), times.timeSortDigi);
+    double findClusterThroughput = throughput(fNofDigis * sizeof(CbmStsDigi), times.timeCluster);
+    double sortClusterThroughput = throughput(fNofClusters * 8, times.timeSortCluster);
+    double findHitThroughput     = throughput(fNofClusters * 24, times.timeHits);
+
+    LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(2) << setw(6) << gpuHitfinderTimeTotal << " ms";
     LOG(info) << "Time by step:\n"
-              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << times.timeSortDigi << " ms\n"
-              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << times.timeCluster << " ms\n"
-              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << times.timeSortCluster << " ms\n"
-              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << times.timeHits << " ms";
+              << "  Sort Digi   : " << fixed << setprecision(2) << setw(6) << times.timeSortDigi << " ms ("
+              << sortDigiThroughput << " GB/s)\n"
+              << "  Find Cluster: " << fixed << setprecision(2) << setw(6) << times.timeCluster << " ms ("
+              << findClusterThroughput << " GB/s)\n"
+              << "  Sort Cluster: " << fixed << setprecision(2) << setw(6) << times.timeSortCluster << " ms ("
+              << sortClusterThroughput << " GB/s)\n"
+              << "  Find Hits   : " << fixed << setprecision(2) << setw(6) << times.timeHits << "ms ("
+              << findHitThroughput << " GB/s)";
   }
   LOG(info) << "=====================================";
 }
@@ -586,14 +598,54 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
 
 
   // --- Execute reconstruction in the modules
+  // Run each step individually. This allows us to meassure the runtime of each step
+  // even when running in parallel
+  TStopwatch timeSubstep;
   fTimer.Start();
+  timeSubstep.Start();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+  for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
+    assert(fModuleIndex[it]);
+    fModuleIndex[it]->SortDigis();
+  }
+  timeSubstep.Stop();
+  fTimeSortDigis = timeSubstep.RealTime();
+
+  timeSubstep.Start();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+  for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
+    assert(fModuleIndex[it]);
+    fModuleIndex[it]->FindClusters();
+  }
+  timeSubstep.Stop();
+  fTimeFindClusters = timeSubstep.RealTime();
+
+  timeSubstep.Start();
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
   for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
     assert(fModuleIndex[it]);
-    fModuleIndex[it]->Reconstruct();
+    fModuleIndex[it]->SortClusters();
   }
+  timeSubstep.Stop();
+  fTimeSortClusters = timeSubstep.RealTime();
+
+  timeSubstep.Start();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+  for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
+    assert(fModuleIndex[it]);
+    fModuleIndex[it]->FindHits();
+  }
+  timeSubstep.Stop();
+  fTimeFindHits = timeSubstep.RealTime();
+
   fTimer.Stop();
   Double_t time3 = fTimer.RealTime();  // Time for reconstruction
 
diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h
index bc4e84e85d..e7fdf84db2 100644
--- a/reco/detectors/sts/CbmRecoSts.h
+++ b/reco/detectors/sts/CbmRecoSts.h
@@ -304,6 +304,10 @@ private:
   Double_t fTime2           = 0.;  ///< Time for distributing data
   Double_t fTime3           = 0.;  ///< Time for reconstruction
   Double_t fTime4           = 0.;  ///< Time for output results
+  double fTimeSortDigis     = 0.;
+  double fTimeFindClusters  = 0.;
+  double fTimeSortClusters  = 0.;
+  double fTimeFindHits      = 0.;
 
   // --- Run counters
   TStopwatch fTimer {};               //! ROOT timer
diff --git a/reco/detectors/sts/CbmStsRecoModule.cxx b/reco/detectors/sts/CbmStsRecoModule.cxx
index fa95b5ea56..d4af3f7703 100644
--- a/reco/detectors/sts/CbmStsRecoModule.cxx
+++ b/reco/detectors/sts/CbmStsRecoModule.cxx
@@ -76,8 +76,14 @@ void CbmStsRecoModule::AddDigiToQueue(const CbmStsDigi* digi, Int_t digiIndex)
 // -----   Reconstruction   ------------------------------------------------
 void CbmStsRecoModule::Reconstruct()
 {
+  SortDigis();
+  FindClusters();
+  SortClusters();
+  FindHits();
+}
 
-  // return;
+void CbmStsRecoModule::SortDigis()
+{
   TStopwatch timer;
 
   timer.Start();
@@ -92,8 +98,13 @@ void CbmStsRecoModule::Reconstruct()
             });
   timer.Stop();
   fTimings.timeSortDigi = timer.RealTime();
+}
 
+void CbmStsRecoModule::FindClusters()
+{
   // --- Perform cluster finding
+  TStopwatch timer;
+
   timer.Start();
   fClusterFinder->Exec(fDigisF, fClustersF, fSetupModule->GetAddress(), fNofStripsF, 0, fTimeCutDigisSig,
                        fTimeCutDigisAbs, fConnectEdgeFront, fParModule);
@@ -108,8 +119,13 @@ void CbmStsRecoModule::Reconstruct()
 
   timer.Stop();
   fTimings.timeCluster = timer.RealTime();
+}
 
+void CbmStsRecoModule::SortClusters()
+{
   // --- Sort clusters by time
+  TStopwatch timer;
+
   timer.Start();
   std::sort(fClustersF.begin(), fClustersF.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) {
     return (cluster1.GetTime() < cluster2.GetTime());
@@ -119,8 +135,13 @@ void CbmStsRecoModule::Reconstruct()
   });
   timer.Stop();
   fTimings.timeSortCluster = timer.RealTime();
+}
 
+void CbmStsRecoModule::FindHits()
+{
   // --- Perform hit finding
+  TStopwatch timer;
+
   timer.Start();
   if (fHitFinder)
     fHitFinder->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig,
@@ -133,6 +154,7 @@ void CbmStsRecoModule::Reconstruct()
   timer.Stop();
   fTimings.timeHits = timer.RealTime();
 }
+
 // -------------------------------------------------------------------------
 
 
diff --git a/reco/detectors/sts/CbmStsRecoModule.h b/reco/detectors/sts/CbmStsRecoModule.h
index 261ec35c21..a6de560278 100644
--- a/reco/detectors/sts/CbmStsRecoModule.h
+++ b/reco/detectors/sts/CbmStsRecoModule.h
@@ -120,6 +120,14 @@ public:
   /** @brief Perform reconstruction **/
   void Reconstruct();
 
+  void SortDigis();
+
+  void FindClusters();
+
+  void SortClusters();
+
+  void FindHits();
+
 
   /** @brief Clear input queue **/
   void Reset();
-- 
GitLab