From f0abc1ace270ce43474997cde508b4696fb0d4ee Mon Sep 17 00:00:00 2001 From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de> Date: Fri, 27 Jan 2023 13:11:19 +0000 Subject: [PATCH] RecoSts: Improvements to logging / benchmarking. - Log throughput in addition to runtime for each reconstrution step. - Log device name, when running GPU reconstruction. - Seperate steps in legacy reconstruction, to allow measuring the time spent in each step, even when running in parallel. --- reco/detectors/sts/CbmRecoSts.cxx | 82 ++++++++++++++++++++----- reco/detectors/sts/CbmRecoSts.h | 4 ++ reco/detectors/sts/CbmStsRecoModule.cxx | 24 +++++++- reco/detectors/sts/CbmStsRecoModule.h | 8 +++ 4 files changed, 102 insertions(+), 16 deletions(-) diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx index 4b716c4660..2a78872031 100644 --- a/reco/detectors/sts/CbmRecoSts.cxx +++ b/reco/detectors/sts/CbmRecoSts.cxx @@ -286,7 +286,7 @@ void CbmRecoSts::Finish() Double_t clusterHit = Double_t(fNofClusters) / Double_t(fNofHits); LOG(info) << "====================================="; LOG(info) << GetName() << ": Run summary"; - if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction."; + if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction. (Device " << xpu::device_properties().name << ")"; else if (ompThreads < 0) LOG(info) << "STS reconstruction ran single threaded (No OpenMP)."; else @@ -302,7 +302,6 @@ void CbmRecoSts::Finish() LOG(info) << "Clusters per hit : " << fixed << setprecision(2) << clusterHit; LOG(info) << "Time per TSlice : " << fixed << setprecision(2) << 1000. * fTimeRun / Double_t(fNofTs) << " ms "; - // Aggregate times for substeps of reconstruction // Note: These times are meaningless when reconstruction runs with > 1 thread. CbmStsRecoModule::Timings timingsTotal; @@ -323,6 +322,8 @@ void CbmRecoSts::Finish() fTime3 /= nEvent; fTime4 /= nEvent; + auto throughput = [](auto bytes, auto timeMs) { return bytes * 1000. / timeMs / double(1ull << 30); }; + if (not fUseGpuReco) { LOG(info) << "NofEvents : " << fNofEvents; LOG(info) << "Time Reset : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1 << " ms (" @@ -332,24 +333,35 @@ void CbmRecoSts::Finish() LOG(info) << "Time Reconstruct: " << fixed << setprecision(1) << setw(6) << 1000. * fTime3 << " ms (" << setprecision(1) << setw(4) << 100. * fTime3 / fTimeTot << " %)"; LOG(info) << "Time by step:\n" - << " Sort Digi : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortDigi - << " ms\n" - << " Find Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeCluster - << " ms\n" - << " Sort Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortCluster - << " ms\n" - << " Find Hits : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeHits << " ms\n"; + << " Sort Digi : " << fixed << setprecision(2) << setw(6) << 1000. * fTimeSortDigis << " ms (" + << throughput(fNofDigis * 8, 1000. * fTimeSortDigis) << " GB/s)\n" + << " Find Cluster: " << fixed << setprecision(2) << setw(6) << 1000. * fTimeFindClusters << " ms (" + << throughput(fNofDigis * sizeof(CbmStsDigi), 1000. * fTimeFindClusters) << " GB/s)\n" + << " Sort Cluster: " << fixed << setprecision(2) << setw(6) << 1000. * fTimeSortClusters << " ms (" + << throughput(fNofClusters * sizeof(CbmStsCluster), 1000. * fTimeSortClusters) << " GB/s)\n" + << " Find Hits : " << fixed << setprecision(2) << setw(6) << 1000. * fTimeFindHits << " ms (" + << throughput(fNofClusters * sizeof(CbmStsCluster), 1000. * fTimeFindHits) << " GB/s)"; } else { cbm::algo::StsHitfinderTimes times = fGpuReco.GetHitfinderTimes(); double gpuHitfinderTimeTotal = times.timeSortDigi + times.timeCluster + times.timeSortCluster + times.timeHits; - LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms"; + + double sortDigiThroughput = throughput(fNofDigis * sizeof(CbmStsDigi), times.timeSortDigi); + double findClusterThroughput = throughput(fNofDigis * sizeof(CbmStsDigi), times.timeCluster); + double sortClusterThroughput = throughput(fNofClusters * 8, times.timeSortCluster); + double findHitThroughput = throughput(fNofClusters * 24, times.timeHits); + + LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(2) << setw(6) << gpuHitfinderTimeTotal << " ms"; LOG(info) << "Time by step:\n" - << " Sort Digi : " << fixed << setprecision(1) << setw(6) << times.timeSortDigi << " ms\n" - << " Find Cluster: " << fixed << setprecision(1) << setw(6) << times.timeCluster << " ms\n" - << " Sort Cluster: " << fixed << setprecision(1) << setw(6) << times.timeSortCluster << " ms\n" - << " Find Hits : " << fixed << setprecision(1) << setw(6) << times.timeHits << " ms"; + << " Sort Digi : " << fixed << setprecision(2) << setw(6) << times.timeSortDigi << " ms (" + << sortDigiThroughput << " GB/s)\n" + << " Find Cluster: " << fixed << setprecision(2) << setw(6) << times.timeCluster << " ms (" + << findClusterThroughput << " GB/s)\n" + << " Sort Cluster: " << fixed << setprecision(2) << setw(6) << times.timeSortCluster << " ms (" + << sortClusterThroughput << " GB/s)\n" + << " Find Hits : " << fixed << setprecision(2) << setw(6) << times.timeHits << "ms (" + << findHitThroughput << " GB/s)"; } LOG(info) << "====================================="; } @@ -586,14 +598,54 @@ void CbmRecoSts::ProcessData(CbmEvent* event) // --- Execute reconstruction in the modules + // Run each step individually. This allows us to meassure the runtime of each step + // even when running in parallel + TStopwatch timeSubstep; fTimer.Start(); + timeSubstep.Start(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (UInt_t it = 0; it < fModuleIndex.size(); it++) { + assert(fModuleIndex[it]); + fModuleIndex[it]->SortDigis(); + } + timeSubstep.Stop(); + fTimeSortDigis = timeSubstep.RealTime(); + + timeSubstep.Start(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (UInt_t it = 0; it < fModuleIndex.size(); it++) { + assert(fModuleIndex[it]); + fModuleIndex[it]->FindClusters(); + } + timeSubstep.Stop(); + fTimeFindClusters = timeSubstep.RealTime(); + + timeSubstep.Start(); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif for (UInt_t it = 0; it < fModuleIndex.size(); it++) { assert(fModuleIndex[it]); - fModuleIndex[it]->Reconstruct(); + fModuleIndex[it]->SortClusters(); } + timeSubstep.Stop(); + fTimeSortClusters = timeSubstep.RealTime(); + + timeSubstep.Start(); +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif + for (UInt_t it = 0; it < fModuleIndex.size(); it++) { + assert(fModuleIndex[it]); + fModuleIndex[it]->FindHits(); + } + timeSubstep.Stop(); + fTimeFindHits = timeSubstep.RealTime(); + fTimer.Stop(); Double_t time3 = fTimer.RealTime(); // Time for reconstruction diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h index bc4e84e85d..e7fdf84db2 100644 --- a/reco/detectors/sts/CbmRecoSts.h +++ b/reco/detectors/sts/CbmRecoSts.h @@ -304,6 +304,10 @@ private: Double_t fTime2 = 0.; ///< Time for distributing data Double_t fTime3 = 0.; ///< Time for reconstruction Double_t fTime4 = 0.; ///< Time for output results + double fTimeSortDigis = 0.; + double fTimeFindClusters = 0.; + double fTimeSortClusters = 0.; + double fTimeFindHits = 0.; // --- Run counters TStopwatch fTimer {}; //! ROOT timer diff --git a/reco/detectors/sts/CbmStsRecoModule.cxx b/reco/detectors/sts/CbmStsRecoModule.cxx index fa95b5ea56..d4af3f7703 100644 --- a/reco/detectors/sts/CbmStsRecoModule.cxx +++ b/reco/detectors/sts/CbmStsRecoModule.cxx @@ -76,8 +76,14 @@ void CbmStsRecoModule::AddDigiToQueue(const CbmStsDigi* digi, Int_t digiIndex) // ----- Reconstruction ------------------------------------------------ void CbmStsRecoModule::Reconstruct() { + SortDigis(); + FindClusters(); + SortClusters(); + FindHits(); +} - // return; +void CbmStsRecoModule::SortDigis() +{ TStopwatch timer; timer.Start(); @@ -92,8 +98,13 @@ void CbmStsRecoModule::Reconstruct() }); timer.Stop(); fTimings.timeSortDigi = timer.RealTime(); +} +void CbmStsRecoModule::FindClusters() +{ // --- Perform cluster finding + TStopwatch timer; + timer.Start(); fClusterFinder->Exec(fDigisF, fClustersF, fSetupModule->GetAddress(), fNofStripsF, 0, fTimeCutDigisSig, fTimeCutDigisAbs, fConnectEdgeFront, fParModule); @@ -108,8 +119,13 @@ void CbmStsRecoModule::Reconstruct() timer.Stop(); fTimings.timeCluster = timer.RealTime(); +} +void CbmStsRecoModule::SortClusters() +{ // --- Sort clusters by time + TStopwatch timer; + timer.Start(); std::sort(fClustersF.begin(), fClustersF.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) { return (cluster1.GetTime() < cluster2.GetTime()); @@ -119,8 +135,13 @@ void CbmStsRecoModule::Reconstruct() }); timer.Stop(); fTimings.timeSortCluster = timer.RealTime(); +} +void CbmStsRecoModule::FindHits() +{ // --- Perform hit finding + TStopwatch timer; + timer.Start(); if (fHitFinder) fHitFinder->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig, @@ -133,6 +154,7 @@ void CbmStsRecoModule::Reconstruct() timer.Stop(); fTimings.timeHits = timer.RealTime(); } + // ------------------------------------------------------------------------- diff --git a/reco/detectors/sts/CbmStsRecoModule.h b/reco/detectors/sts/CbmStsRecoModule.h index 261ec35c21..a6de560278 100644 --- a/reco/detectors/sts/CbmStsRecoModule.h +++ b/reco/detectors/sts/CbmStsRecoModule.h @@ -120,6 +120,14 @@ public: /** @brief Perform reconstruction **/ void Reconstruct(); + void SortDigis(); + + void FindClusters(); + + void SortClusters(); + + void FindHits(); + /** @brief Clear input queue **/ void Reset(); -- GitLab