diff --git a/algo/detectors/sts/Hitfinder.cxx b/algo/detectors/sts/Hitfinder.cxx
index 09ebeb6a8d5d6d7eb72a5f19852691306b7b6a81..b2a9629658782dfd67576a21b6cdd8543841d219 100644
--- a/algo/detectors/sts/Hitfinder.cxx
+++ b/algo/detectors/sts/Hitfinder.cxx
@@ -524,24 +524,22 @@ XPU_D void sts::Hitfinder::SortClusters(SortClusters::context& ctx) const
 
 XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 {
-// On CPU process all modules in parallel (one thread per module).
-// On GPU process all front clusters in parallel instead (one thread per cluster)
-// to fully utilize the GPU.
-// Currently use option 2 for both as it is faster on CPU as well.
-#if 0
-  int iModule = ctx.block_idx_x();
-#else
   int iModule = 0;
   int iThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x();
+#if XPU_IS_CPU
+  iModule = iThread / kFindHitsChunksPerModule;
+#else
   for (; iModule < nModules; iModule++) {
     if (iThread < nClustersPerModule[iModule]) break;
     iThread -= nClustersPerModule[iModule];
   }
-
-  if (iModule >= nModules) return;
 #endif
+  if (iModule >= nModules) {
+    return;
+  }
+
   int iModuleF = iModule;
   int iModuleB = iModuleF + nModules;
   size_t offsetF = iModuleF * maxClustersPerModule;
@@ -554,14 +552,33 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
   int nClustersB = nClustersPerModule[iModuleB];
 
   size_t nHitsWritten = nHitsPerModule[iModule];
-  if (nClustersF == 0 || nClustersB == 0) return;
+  if (nClustersF == 0 || nClustersB == 0) {
+    return;
+  }
+
+  // Only used on CPU
+  [[maybe_unused]] int nClustersPerChunk = (nClustersF + kFindHitsChunksPerModule - 1) / kFindHitsChunksPerModule;
+  [[maybe_unused]] int iChunk = iThread % kFindHitsChunksPerModule;
+
+  int iClusterF =
+#if XPU_IS_CPU
+    iChunk * nClustersPerChunk;
+#else
+    iThread;
+#endif
+  if (iClusterF >= nClustersF) {
+    return;
+  }
+
   // Stop processing if memory limits are exceed by a large amount
   // In general we would still want to process all clusters to get an idea
   // by how much memory estimates are wrong.
   // However they are currently chosen very generous, so if they are exceeded by a large amount
   // something must have gone awry. (e.g. monster events in the mCBM data that explode the hit finding combinatorics)
-  if (nHitsWritten > 2 * maxHitsPerModule) return;
+  if (nHitsWritten > 2 * maxHitsPerModule) {
+    return;
+  }
 
   HitfinderCache pars;
   {
@@ -584,41 +601,39 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 
   float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB);
 
-  int startB = 0;
-#if 0
-  for (int iClusterF = ctx.thread_idx_x(); iClusterF < nClustersF; iClusterF += ctx.block_dim_x()) {
-#else
-  int iClusterF = iThread;
-  if (iClusterF < nClustersF) {
-
-    float firstPossibleTime = float(clusterIdxF[iClusterF].fTime) - maxSigmaBoth;
-
-    // Use binary search to find the first cluster on back side that can be matched
-    // with the current cluster on front side
-    int start = startB;
-    int end = nClustersB;
-    while (end - start > 1) {
-      int mid = (start + end) / 2;
-      if (float(clusterIdxB[mid].fTime) < firstPossibleTime) {
-        start = mid;
-      }
-      else {
-        end = mid;
-      }
+  int startB = 0;
+  float firstPossibleTime = float(clusterIdxF[iClusterF].fTime) - maxSigmaBoth;
+
+  // Use binary search to find the first cluster on back side that can be matched
+  // with the current cluster on front side
+  int start = startB;
+  int end = nClustersB;
+  while (end - start > 1) {
+    int mid = (start + end) / 2;
+    if (float(clusterIdxB[mid].fTime) < firstPossibleTime) {
+      start = mid;
     }
+    else {
+      end = mid;
+    }
+  }
+  startB = start;
 
-    startB = start;
+  int endClusterF =
+#if XPU_IS_CPU
+    xpu::min(iClusterF + nClustersPerChunk, nClustersF);
+#else
+    iClusterF + 1;
 #endif
-    // Temporarily disable clang-format because it indents this block wrongly for some reason.
-    // TODO: Code above needs cleanup. Then clang-format can be re-enabled too.
-    // clang-format off
-    ClusterIdx clsIdxF = clusterIdxF[iClusterF];
+
+  for (; iClusterF < endClusterF; iClusterF++) {
+    ClusterIdx clsIdxF    = clusterIdxF[iClusterF];
     sts::Cluster clsDataF = clusterDataF[clsIdxF.fIdx];
 
     float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrB * maxTerrB);
 
     for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) {
-      ClusterIdx clsIdxB = clusterIdxB[iClusterB];
+      ClusterIdx clsIdxB    = clusterIdxB[iClusterB];
       sts::Cluster clsDataB = clusterDataB[clsIdxB.fIdx];
 
       float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime);
@@ -636,18 +651,21 @@ XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const
 
       float timeCut = -1.f;
      const RecoParams::STS& params = ctx.cmem<Params>().sts;
-      if (params.timeCutClusterAbs > 0.f) timeCut = params.timeCutClusterAbs;
+      if (params.timeCutClusterAbs > 0.f)
+        timeCut = params.timeCutClusterAbs;
       else if (params.timeCutClusterSig > 0.f) {
         float eF = clsDataF.fTimeError;
         float eB = clsDataB.fTimeError;
         timeCut = params.timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB);
       }
 
-      if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) continue;
+      if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) {
+        continue;
+      }
 
       IntersectClusters(iModule, pars, clsIdxF, clsDataF, clsIdxB, clsDataB);
-    }
-  }
+    }  // end loop over clusters on back side
+  }  // end loop over clusters on front side
   // clang-format on
 }
 
diff --git a/algo/detectors/sts/Hitfinder.h b/algo/detectors/sts/Hitfinder.h
index af0d6ac765473ece86662bc416810a1f8de59abc..c96c5522536edd7325354f2115bbe269bc5640ac 100644
--- a/algo/detectors/sts/Hitfinder.h
+++ b/algo/detectors/sts/Hitfinder.h
@@ -37,6 +37,7 @@ namespace cbm::algo
     kFindClusterBlockSize = 1024,
     kFindHitsBlockSize = 64,
 #endif
+    kFindHitsChunksPerModule = 16,
   };
 
 }  // namespace cbm::algo
@@ -108,7 +109,7 @@ namespace cbm::algo::sts
     using block_size = xpu::block_size<kFindHitsBlockSize>;
     using constants = xpu::cmem<TheHitfinder, Params>;
     using context = xpu::kernel_context<shared_memory, constants>;
-    using openmp = xpu::openmp_settings<xpu::schedule_dynamic, 128>;
+    using openmp = xpu::openmp_settings<xpu::schedule_static, 1>;
 
     XPU_D void operator()(context&);
   };
 
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index a2dcbbc3d55a1e7ec6283095e499e654044b358e..18c6d0a4b6db6eeaa462a2da9175ef0693616727 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -181,12 +181,10 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
 
-  // FindHits supports to modes of parallelization: One thread per cluster or one block per module
-  // Currently we use method one for CPU and GPU.
-  // See sts::Hitfinder::FindHits() for details.
-  // bool isCpu = xpu::device::active().backend() == xpu::cpu;
-  bool isCpu = false;
-  xpu::grid findHitsG = isCpu ? xpu::n_blocks(nModules) : xpu::n_threads(nClustersFront);
+  bool isCpu = xpu::device::active().backend() == xpu::cpu;
+  // bool isCpu = false;
+  int threadsCPU = kFindHitsChunksPerModule * nModules;
+  xpu::grid findHitsG = xpu::n_threads(isCpu ? threadsCPU : nClustersFront);
   queue.launch<FindHits>(findHitsG);
   xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster));
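
Note (not part of the patch): on CPU the kernel is now launched with kFindHitsChunksPerModule threads per module, and each thread processes one contiguous chunk of that module's front-side clusters; on GPU it remains one thread per front cluster. The standalone sketch below walks through the CPU index arithmetic as the patch computes it (ceiling-divided chunk size, per-thread range [iClusterF, endClusterF)). The constant and the formulas are taken from the patch; the module and cluster counts are made-up toy values, and std::min stands in for xpu::min.

// sketch_chunking.cxx -- illustrative only; mirrors the chunked CPU indexing
// added to sts::Hitfinder::FindHits().
#include <algorithm>
#include <cstdio>

constexpr int kFindHitsChunksPerModule = 16;  // value added in Hitfinder.h

int main()
{
  const int nModules             = 2;
  const int nClustersPerModule[] = {100, 7};  // toy front-side cluster counts

  // CPU grid size as set up in HitfinderChain.cxx: one thread per (module, chunk).
  const int nThreads = kFindHitsChunksPerModule * nModules;

  for (int iThread = 0; iThread < nThreads; iThread++) {
    // XPU_IS_CPU branch: derive module and chunk from the flat thread index.
    int iModule = iThread / kFindHitsChunksPerModule;
    int iChunk  = iThread % kFindHitsChunksPerModule;

    int nClustersF = nClustersPerModule[iModule];
    // Ceiling division, as in the kernel.
    int nClustersPerChunk = (nClustersF + kFindHitsChunksPerModule - 1) / kFindHitsChunksPerModule;

    int iClusterF = iChunk * nClustersPerChunk;
    if (iClusterF >= nClustersF) continue;  // chunk starts past the last cluster
    int endClusterF = std::min(iClusterF + nClustersPerChunk, nClustersF);

    std::printf("thread %2d -> module %d, front clusters [%d, %d)\n", iThread, iModule, iClusterF, endClusterF);
  }
  return 0;
}

This also suggests why the OpenMP settings switch from <xpu::schedule_dynamic, 128> to <xpu::schedule_static, 1>: the chunking inside the kernel already balances the work per thread index, so a static round-robin assignment presumably suffices and avoids dynamic-scheduling overhead.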