diff --git a/algo/detectors/tof/Hitfind.cxx b/algo/detectors/tof/Hitfind.cxx index c90838986bf2c57252b488de5009bc574c5da2c5..02f3e3098956898f756ca2094d7e18e2db0230b9 100644 --- a/algo/detectors/tof/Hitfind.cxx +++ b/algo/detectors/tof/Hitfind.cxx @@ -6,6 +6,7 @@ #include <chrono> +#include "compat/OpenMP.h" #include "log.hpp" #include "util/TimingsFormat.h" @@ -57,7 +58,10 @@ namespace cbm::algo::tof par->fChanPar[Ch].cell.sizeX = rpcPar.cell.sizeX; par->fChanPar[Ch].cell.sizeY = rpcPar.cell.sizeY; } - fAlgo[SmType].emplace(std::make_pair(Sm * NbRpc + Rpc, tof::Clusterizer(std::move(*par)))); + fAlgo.emplace_back(std::move(*par)); + + // fill unique rpc pointer vectors for parallelization + fStorDigiPtr.push_back(&fStorDigi[SmType][Sm * NbRpc + Rpc]); } } } @@ -70,18 +74,13 @@ namespace cbm::algo::tof // ----- Execution ------------------------------------------------------- Hitfind::resultType Hitfind::operator()(gsl::span<CbmTofDigi> digiIn) { - xpu::push_timer("TofHitfind"); - xpu::t_add_bytes(digiIn.size_bytes()); - // --- Output data - resultType result = {}; - - auto& clusterTs = std::get<0>(result); - auto& monitor = std::get<1>(result); - auto& digiInd = std::get<2>(result); // digi indices + resultType result = {}; + auto& [clusterTs, monitor, digiInd] = result; // Loop over the digis array and store the Digis in separate vectors for // each RPC modules + xpu::push_timer("TofHitfindChanSort"); for (size_t idigi = 0; idigi < digiIn.size(); idigi++) { CbmTofDigi* pDigi = &(digiIn[idigi]); @@ -96,48 +95,91 @@ namespace cbm::algo::tof } fStorDigi[SmType][Sm * NbRpc + Rpc].emplace_back(*pDigi, idigi); } + monitor.fSortTime = xpu::pop_timer(); - std::vector<Hit> clustersFlat; + std::vector<Hit> clustersFlat; // cluster storage std::vector<size_t> chanSizes; // nClusters per channel std::vector<u32> chanAddresses; // channel addresses - // --- RPC loop - for (uint32_t SmType = 0; SmType < fNbSm.size(); SmType++) { - const uint32_t NbRpc = fNbRpc[SmType]; - const uint32_t NbSm = fNbSm[SmType]; - for (uint32_t Sm = 0; Sm < NbSm; Sm++) { - for (uint32_t Rpc = 0; Rpc < NbRpc; Rpc++) { - - // Get digis - std::vector<std::pair<CbmTofDigi, int32_t>>& digiExp = fStorDigi[SmType][Sm * NbRpc + Rpc]; - - // Build clusters - auto rpcresult = fAlgo[SmType][Sm * NbRpc + Rpc](digiExp); - std::vector<Hit>& clusters = std::get<0>(rpcresult); // Hits - std::vector<size_t>& sizes = std::get<1>(rpcresult); // nClusters per channel - std::vector<u32>& addresses = std::get<2>(rpcresult); // channel addresses - std::vector<i32>& indices = std::get<3>(rpcresult); // digi indices - - // Append clusters to output - clustersFlat.insert(clustersFlat.end(), std::make_move_iterator(clusters.begin()), - std::make_move_iterator(clusters.end())); - - // Store hw address of partition - chanAddresses.insert(chanAddresses.end(), std::make_move_iterator(addresses.begin()), - std::make_move_iterator(addresses.end())); - - // store partition size - chanSizes.insert(chanSizes.end(), std::make_move_iterator(sizes.begin()), - std::make_move_iterator(sizes.end())); - - // store digi indices - digiInd.insert(digiInd.end(), std::make_move_iterator(indices.begin()), - std::make_move_iterator(indices.end())); - - // Clear digi storage - digiExp.clear(); + // Prefix arrays for parallelization + std::vector<size_t> cluPrefix; + std::vector<size_t> sizePrefix; + std::vector<size_t> addrPrefix; + std::vector<size_t> indPrefix; + + xpu::push_timer("TofHitfind"); + xpu::t_add_bytes(digiIn.size_bytes()); +#pragma omp parallel + { + +#ifdef _OPENMP + int ithread = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); +#else + int ithread = 0; + int nthreads = 1; +#endif + +#pragma omp single + { + cluPrefix.resize(nthreads + 1); + sizePrefix.resize(nthreads + 1); + addrPrefix.resize(nthreads + 1); + indPrefix.resize(nthreads + 1); + } + + auto [clusters, sizes, addresses, indices] = Clusterizer::resultType(); + +#pragma omp for schedule(dynamic) nowait + for (uint32_t iRpc = 0; iRpc < fAlgo.size(); iRpc++) { + + // Get digis + std::vector<std::pair<CbmTofDigi, int32_t>>& digiExp = *fStorDigiPtr[iRpc]; + + // Build clusters + auto [rpc_clu, rpc_size, rpc_addr, rpc_ind] = fAlgo[iRpc](digiExp); + + // Append clusters to output + clusters.insert(clusters.end(), std::make_move_iterator(rpc_clu.begin()), + std::make_move_iterator(rpc_clu.end())); + + // store partition size + sizes.insert(sizes.end(), std::make_move_iterator(rpc_size.begin()), std::make_move_iterator(rpc_size.end())); + + // Store hw address of partition + addresses.insert(addresses.end(), std::make_move_iterator(rpc_addr.begin()), + std::make_move_iterator(rpc_addr.end())); + + // store digi indices + indices.insert(indices.end(), std::make_move_iterator(rpc_ind.begin()), std::make_move_iterator(rpc_ind.end())); + + // Clear digi storage + digiExp.clear(); + } + cluPrefix[ithread + 1] = clusters.size(); + sizePrefix[ithread + 1] = sizes.size(); + addrPrefix[ithread + 1] = addresses.size(); + indPrefix[ithread + 1] = indices.size(); +#pragma omp barrier + +#pragma omp single + { + for (int i = 1; i < (nthreads + 1); i++) { + cluPrefix[i] += cluPrefix[i - 1]; + sizePrefix[i] += sizePrefix[i - 1]; + addrPrefix[i] += addrPrefix[i - 1]; + indPrefix[i] += indPrefix[i - 1]; } + + clustersFlat.resize(cluPrefix[nthreads]); + chanSizes.resize(sizePrefix[nthreads]); + chanAddresses.resize(addrPrefix[nthreads]); + digiInd.resize(indPrefix[nthreads]); } + std::move(clusters.begin(), clusters.end(), clustersFlat.begin() + cluPrefix[ithread]); + std::move(sizes.begin(), sizes.end(), chanSizes.begin() + sizePrefix[ithread]); + std::move(addresses.begin(), addresses.end(), chanAddresses.begin() + addrPrefix[ithread]); + std::move(indices.begin(), indices.end(), digiInd.begin() + indPrefix[ithread]); } // Monitoring diff --git a/algo/detectors/tof/Hitfind.h b/algo/detectors/tof/Hitfind.h index 59a3e62de1a966df8422757e17a5b9dbc6f0a76f..23d40eeda5e07f234550ba5b8362a0e6c815d65c 100644 --- a/algo/detectors/tof/Hitfind.h +++ b/algo/detectors/tof/Hitfind.h @@ -30,6 +30,7 @@ namespace cbm::algo::tof struct HitfindMonitorData { //std::vector<tof::ClusterizerMonitorData> fMonitor; //Per RPC monitoring data, to be implemented xpu::timings fTime; + xpu::timings fSortTime; size_t fNumDigis = 0; size_t fNumHits = 0; @@ -37,7 +38,7 @@ namespace cbm::algo::tof { std::stringstream ss; ss << "Hitfind stats: num digis " << fNumDigis << ", time " << fTime.wall() << " ms ( " << fTime.throughput() - << " GB/s ), num hits " << fNumHits << std::endl; + << " GB/s ), sort time " << fSortTime.wall() << " ms, num hits " << fNumHits << std::endl; return ss.str(); } }; @@ -65,8 +66,8 @@ namespace cbm::algo::tof explicit Hitfind(tof::HitfindSetup); private: // members - /** @brief TOF hitfinders **/ - std::map<uint32_t, std::map<uint32_t, tof::Clusterizer>> fAlgo; //[nbType][nbSm*nbRpc] + /** @brief TOF hitfinders (with unique RPC index for OpenMP) **/ + std::vector<tof::Clusterizer> fAlgo; //[rpcUnique] /** @brief Number of SMs per super module type **/ std::vector<int32_t> fNbSm; @@ -76,6 +77,9 @@ namespace cbm::algo::tof /** @brief Intermediate storage variables (digi, index) **/ std::vector<std::vector<std::vector<std::pair<CbmTofDigi, int32_t>>>> fStorDigi; //[nbType][nbSm*nbRpc][nDigis] + + /** @brief Pointer to storage variables with unique RPC index (for OpenMP) **/ + std::vector<std::vector<std::pair<CbmTofDigi, int32_t>>*> fStorDigiPtr; //[rpcUnique][nDigis] }; } // namespace cbm::algo::tof