diff --git a/algo/base/ChainContext.h b/algo/base/ChainContext.h index 954e45d6f0bc0f30f5afe47de0c2fe76dae725a9..3277028b2ab0762e1f27ced91ced778d60471746 100644 --- a/algo/base/ChainContext.h +++ b/algo/base/ChainContext.h @@ -5,11 +5,13 @@ #define CBM_ALGO_BASE_CHAINCONTEXT_H #include "Options.h" +#include "RecoParams.h" namespace cbm::algo { struct ChainContext { Options opts; + RecoParams recoParams; }; } // namespace cbm::algo diff --git a/algo/base/SubChain.h b/algo/base/SubChain.h index 13bfcf63aa47e9f46cb0ccd1b8e98ebb0188a5c0..b787c1b64c2c13860233e84ee7a2162ded31f207 100644 --- a/algo/base/SubChain.h +++ b/algo/base/SubChain.h @@ -16,6 +16,7 @@ namespace cbm::algo void SetContext(ChainContext* ctx) { fContext = ctx; } const Options& Opts() const { return gsl::make_not_null(fContext)->opts; } + const RecoParams& Params() const { return gsl::make_not_null(fContext)->recoParams; } private: ChainContext* fContext = nullptr; diff --git a/algo/base/config/Yaml.h b/algo/base/config/Yaml.h index 0109c5b2344f0732a97c3b6d8a441545fabe5c0e..a7100184a9144145779c7987d37389787adc8e95 100644 --- a/algo/base/config/Yaml.h +++ b/algo/base/config/Yaml.h @@ -15,7 +15,7 @@ #include "Prelude.h" #include "Property.h" -namespace CORE_INTERFACE_NS::config +namespace cbm::algo::config { template<typename T, T... Values, typename Func> @@ -144,6 +144,6 @@ namespace CORE_INTERFACE_NS::config } }; -} // namespace CORE_INTERFACE_NS::config +} // namespace cbm::algo::config #endif diff --git a/algo/data/sts/LandauTable.cxx b/algo/data/sts/LandauTable.cxx index ba99e8b783701cff6ba4c45af2fa798f424420c3..3dcf38f7533856733717ffd418ca5b95e58932d8 100644 --- a/algo/data/sts/LandauTable.cxx +++ b/algo/data/sts/LandauTable.cxx @@ -3,6 +3,8 @@ Authors: Felix Weiglhofer [committer] */ #include "LandauTable.h" +#include <fairlogger/Logger.h> + #include <fstream> using namespace cbm::algo; @@ -16,10 +18,13 @@ sts::LandauTable sts::LandauTable::FromFile(std::filesystem::path path) std::ifstream file(path); while (!file.eof()) { + f32 q, p; file >> q >> p; charge.push_back(q); prob.push_back(p); + + LOG(trace) << "Reading Landau table " << path << " q=" << q << " p=" << p; } // TODO: check if charge is monotonically increasing, also more than 2 entries diff --git a/algo/detectors/sts/StsHitfinder.cxx b/algo/detectors/sts/StsHitfinder.cxx index 620e30488c3056c9df3f5df72f191109f22de2b3..94a5c4173b83015e29318f2e5d54fc9ed170f4e1 100644 --- a/algo/detectors/sts/StsHitfinder.cxx +++ b/algo/detectors/sts/StsHitfinder.cxx @@ -6,77 +6,81 @@ using namespace cbm::algo; -XPU_CONSTANT(TheStsHitfinder); +// Export Constants +XPU_EXPORT(sts::TheHitfinder); -XPU_KERNEL(SortDigis, StsSortDigiSmem) -{ - xpu::cmem<TheStsHitfinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x()); -} - -XPU_KERNEL(FindClustersSingleStep, xpu::no_smem) -{ - xpu::cmem<TheStsHitfinder>().FindClustersSingleStep(xpu::block_idx::x()); -} -XPU_KERNEL(CalculateOffsets, xpu::no_smem) -{ - xpu::cmem<TheStsHitfinder>().CalculateOffsetsParallel(xpu::block_idx::x()); -} +// Kernel Definitions +XPU_EXPORT(sts::SortDigis); +XPU_D void sts::SortDigis::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().SortDigisInSpaceAndTime(ctx); } -XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindClustersParallel(xpu::block_idx::x()); } +XPU_EXPORT(sts::FindClusters); +XPU_D void sts::FindClusters::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().FindClustersSingleStep(ctx); } -XPU_KERNEL(FindClustersBasic, xpu::no_smem) +XPU_EXPORT(sts::ChannelOffsets); +XPU_D void sts::ChannelOffsets::operator()(context& ctx) { - xpu::cmem<TheStsHitfinder>().FindClusterConnectionsBasic(xpu::block_idx::x()); + ctx.cmem<sts::TheHitfinder>().CalculateOffsetsParallel(ctx); } -XPU_KERNEL(CalculateClusters, xpu::no_smem) +XPU_EXPORT(sts::CreateDigiConnections); +XPU_D void sts::CreateDigiConnections::operator()(context& ctx) { - xpu::cmem<TheStsHitfinder>().CalculateClustersParallel(xpu::block_idx::x()); + ctx.cmem<sts::TheHitfinder>().FindClustersParallel(ctx); } -XPU_KERNEL(CalculateClustersBasic, xpu::no_smem) +XPU_EXPORT(sts::CreateClusters); +XPU_D void sts::CreateClusters::operator()(context& ctx) { - xpu::cmem<TheStsHitfinder>().CalculateClustersBasic(xpu::block_idx::x()); + ctx.cmem<sts::TheHitfinder>().CalculateClustersParallel(ctx); } -XPU_KERNEL(SortClusters, StsSortClusterSmem) { xpu::cmem<TheStsHitfinder>().SortClusters(smem, xpu::block_idx::x()); } +XPU_EXPORT(sts::SortClusters); +XPU_D void sts::SortClusters::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().SortClusters(ctx); } -XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindHits(xpu::block_idx::x()); } +XPU_EXPORT(sts::FindHits); +XPU_D void sts::FindHits::operator()(context& ctx) { ctx.cmem<sts::TheHitfinder>().FindHits(ctx); } /** - * StsHitfinderGpu::sortDigisKhun + * sts::Hitfinder::sortDigisKhun * Sorts digis channelwise. Inside a channel all digis are sorted in time. * * @param smem is the shared memory it is worked in * @param iBlock is the Block/Module that is currenty running * */ -XPU_D void StsHitfinderGpu::SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const +XPU_D void sts::Hitfinder::SortDigisInSpaceAndTime(SortDigis::context& ctx) const { - int iModule = iBlock; + int iModule = ctx.block_idx_x(); CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]]; CbmStsDigi* digisTmp = &digisPerModuleTmp[digiOffsetPerModule[iModule]]; int nDigis = GetNDigis(iModule); - SortDigisT digiSort(smem.sortBuf); + SortDigis::sort_t digiSort(ctx.pos(), ctx.smem()); - digis = digiSort.sort(digis, nDigis, digisTmp, [](const CbmStsDigi a) { + CbmStsDigi* digisOut = digiSort.sort(digis, nDigis, digisTmp, [](const CbmStsDigi a) { return ((unsigned long int) a.GetChannel()) << 32 | (unsigned long int) (a.GetTimeU32()); }); - if (xpu::thread_idx::x() == 0) { digisSortedPerModule[iModule] = digis; } + if (digisOut == digisTmp) { + // Copy back to digis + for (int i = ctx.thread_idx_x(); i < nDigis; i += ctx.block_dim_x()) { + digis[i] = digisTmp[i]; + } + } } -XPU_D void StsHitfinderGpu::FindClustersSingleStep(int iBlock) const +XPU_D void sts::Hitfinder::FindClustersSingleStep(FindClusters::context& ctx) const { - CalculateOffsetsParallel(iBlock); - FindClustersParallel(iBlock); - CalculateClustersParallel(iBlock); + CalculateOffsetsParallel(ctx); + xpu::barrier(ctx.pos()); + FindClustersParallel(ctx); + xpu::barrier(ctx.pos()); + CalculateClustersParallel(ctx); } /** - * StsHitfinderGpu::calculateChannelOffsets + * sts::Hitfinder::calculateChannelOffsets * Calculates the Offsest of every channel. digis-Array is sorted in Channel and Time. * If a channelChange is detected it is stored in channelOffsetsPerModule. * @@ -85,13 +89,12 @@ XPU_D void StsHitfinderGpu::FindClustersSingleStep(int iBlock) const * @param nDigis Amount of digis in digis-Array * */ -XPU_D void StsHitfinderGpu::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, - unsigned int const nDigis) const +XPU_D void sts::Hitfinder::CalculateChannelOffsets(FindClusters::context& ctx, CbmStsDigi* digis, + unsigned int* channelOffsets, unsigned int const nDigis) const { channelOffsets[0] = 0; - //Parallel - for (unsigned int pos = xpu::thread_idx::x(); pos < nDigis - 1; pos += (unsigned int) xpu::block_dim::x()) { + for (unsigned int pos = ctx.thread_idx_x(); pos < nDigis - 1; pos += (unsigned int) ctx.block_dim_x()) { auto const currChannel = digis[pos].GetChannel(); auto const nextChannel = digis[pos + 1].GetChannel(); if (currChannel != nextChannel) { @@ -108,364 +111,122 @@ XPU_D void StsHitfinderGpu::CalculateChannelOffsets(CbmStsDigi* digis, unsigned } } - /** - * StsHitfinderGpu::calculateOffsetsParallel + * sts::Hitfinder::calculateOffsetsParallel * This function calculates the channeloffsets in a certain module. * An Offset is the Startingpoint of a Channel in a sorted array of Digis. * * @param iBlock is the Block/Module that is currently worked on * */ -XPU_D void StsHitfinderGpu::CalculateOffsetsParallel(int iBlock) const +XPU_D void sts::Hitfinder::CalculateOffsetsParallel(FindClusters::context& ctx) const { - int const iModule = iBlock; - CbmStsDigi* digis = digisSortedPerModule[iModule]; + int const iModule = ctx.block_idx_x(); + CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]]; auto const nDigis = GetNDigis(iModule); if (nDigis == 0) return; auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; - CalculateChannelOffsets(digis, channelOffsets, nDigis); + CalculateChannelOffsets(ctx, digis, channelOffsets, nDigis); } /** - * StsHitfinderGpu::findClustersParallel + * sts::Hitfinder::findClustersParallel * This function finds clusters form an sts-Digi Array inserted. * It runs the finding process highly parallel. * * @param iBlock is the Block/Module that is currently worked on * */ -XPU_D void StsHitfinderGpu::FindClustersParallel(int iBlock) const -{ - int const iModule = iBlock; - CbmStsDigi* digis = digisSortedPerModule[iModule]; - auto const nDigis = GetNDigis(iModule); - unsigned int const threadId = xpu::thread_idx::x(); - - if (nDigis == 0) return; - - auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; - auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; - - // findClusterConnectionsChannelWise(digis, digiConnector, channelOffsets, iModule, threadId); - FindClusterConnectionsDigiWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis); -} - - -/** - * StsHitfinderGpu::calculateClustersParallel - * This function calculates clusters form an sts-Digi Array inserted. - * It runs the calculating process highly parallel. - * - * @param iBlock is the Block/Module that is currently worked on - * -*/ -XPU_D void StsHitfinderGpu::CalculateClustersParallel(int iBlock) const -{ - int const iModule = iBlock; - CbmStsDigi* digis = digisSortedPerModule[iModule]; - auto const nDigis = GetNDigis(iModule); - int const threadId = xpu::thread_idx::x(); - - if (nDigis == 0) { return; } - - auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; - // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; - - // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis); - CalculateClustersDigiWise(digis, digiConnector, iModule, threadId, nDigis); -} - - -/** - * StsHitfinderGpu::findClusterConnectionsBasic - * - * Each Thread one Channel - * - * ChannelId: 00000 11111 - * DigiIndex: 01234 56789 - * ThreadId: 00000 00000 - * - * Finds Clusters in a basic way. One thread takes care of the whole calculation. - * - * @param digis All Digis that are relevant for this Block - * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous) - * @param channelOffsets The Array where all offsets are written to - * @param iModule The Module that the curren Block is working on - * @param threadId Id of the thrad currently working - * -**/ -XPU_D void StsHitfinderGpu::FindClusterConnectionsBasic(int iBlock) const +XPU_D void sts::Hitfinder::FindClustersParallel(FindClusters::context& ctx) const { - int iModule = iBlock; - int threadId = xpu::thread_idx::x(); - auto nDigis = GetNDigis(iModule); - CbmStsDigi* digis = digisSortedPerModule[iModule]; - auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; - auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; - - if (threadId != 0) return; - - for (unsigned int currIter = 0; currIter < nDigis; currIter++) { - const CbmStsDigi& digi = digis[currIter]; - uint16_t channel = digi.GetChannel(); + const i32 iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + const i32 nDigisTotal = digiOffsetPerModule[2 * nModules]; + if (iGThread >= nDigisTotal) { return; }; - if (channel == nChannels - 1) break; + i32 iModule = 0; + while (size_t(iGThread) >= digiOffsetPerModule[iModule + 1]) { + iModule++; + } - //DeltaCalculation - float const tResol = GetTimeResolution(iModule, channel); - float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol); + i32 iLocal = iGThread - digiOffsetPerModule[iModule]; - unsigned int nextChannelStart = channelOffsets[channel + 1]; - unsigned int nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis; + const CbmStsDigi myDigi = digisPerModule[iGThread]; - // printf("Comparing digis (deltaT=%f, nextChannelStart=%u, nextChannelEnd=%u, nDigis=%u):\n", deltaT, - // nextChannelStart, nextChannelEnd, nDigis); - // printf(" channel=%d, time=%d, charge=%d\n", digi.GetChannel(), digi.GetTimeU32(), digi.GetChargeU16()); + u32 nDigis = GetNDigis(iModule); + if (nDigis == 0) return; - //Calculate DigiConnections - for (unsigned int nextIter = nextChannelStart; nextIter < nextChannelEnd; nextIter++) { + xpu::view digis(&digisPerModule[digiOffsetPerModule[iModule]], nDigis); + xpu::view digiConnector(&digiConnectorsPerModule[digiOffsetPerModule[iModule]], nDigis); + xpu::view channelOffsets(&channelOffsetPerModule[iModule * nChannels], nChannels); - const CbmStsDigi& nextDigi = digis[nextIter]; + u16 channel = myDigi.GetChannel(); + if (channel == nChannels - 1) return; - // printf("> channel=%d, time=%d, charge=%d\n", nextDigi.GetChannel(), nextDigi.GetTimeU32(), - // nextDigi.GetChargeU16()); + //DeltaCalculation + const f32 tResol = GetTimeResolution(iModule, channel); + const RecoParams::STS& params = ctx.cmem<Params>().sts; + const f32 deltaT = (params.timeCutDigiAbs > 0.f ? params.timeCutDigiAbs : params.timeCutDigiSig * 1.4142f * tResol); - if (digi.GetChannel() >= nextDigi.GetChannel()) continue; + const u32 nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis; - if (float(digis[currIter].GetTimeU32() + deltaT) <= float(digis[nextIter].GetTimeU32())) break; - if (float(digis[currIter].GetTimeU32() - deltaT) >= float(digis[nextIter].GetTimeU32())) continue; + float firstPossibleTime = float(myDigi.GetTimeU32()) - deltaT; - digiConnector[currIter].SetNext(nextIter); - digiConnector[nextIter].SetHasPrevious(true); - break; + // Binary search for the first possible digi + u32 start = channelOffsets[channel + 1]; + u32 end = nextChannelEnd; + while (start < end) { + u32 mid = (start + end) / 2; + if (float(digis[mid].GetTimeU32()) < firstPossibleTime) { start = mid + 1; } + else { + end = mid; } } -} + for (auto nextIter = start; nextIter < nextChannelEnd; nextIter++) { -/** - * StsHitfinderGpu::findClusterConnectionsChannelWise - * - * Each Thread one Channel - * - * ChannelId: 00000 11111 - * DigiIndex: 01234 56789 - * ThreadId: 00000 11111 - * - * Finds Clusters highly parallel in a basic way. Each thread gets the same anmount of Channels to work with. - * Every Thread checks all digis in its channel, if there is a pendant to the current one in the channel to its right. - * If there is another digi, the digi will be connected inside the digiConnector-Array. - * - * @param digis All Digis that are relevant for this Block - * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous) - * @param channelOffsets The Array where all offsets are written to - * @param iModule The Module that the curren Block is working on - * @param threadId Id of the thrad currently working - * -**/ -XPU_D void StsHitfinderGpu::FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - unsigned int* channelOffsets, int const iModule, - unsigned int const threadId) const -{ - // Expl: here we need to be smaller than nChannels because the last channel is the most Right. - for (unsigned int channel = threadId; channel < (unsigned int) (nChannels - 1); channel += xpu::block_dim::x()) { - unsigned int const currChannelBegin = channelOffsets[channel]; //Begin of current Channel - unsigned int const nextChannelBegin = channelOffsets[channel + 1]; //Begin of next (right) channel - auto const nextChannelEnd = - (channel + 2 < (unsigned int) nChannels) ? channelOffsets[channel + 2] : GetNDigis(iModule); - unsigned int nextIter = nextChannelBegin; - - //Check if first Digi of Channel belongs to Channel - if (channel != digis[currChannelBegin].GetChannel()) continue; - - //DeltaCalculation - float const tResol = GetTimeResolution(iModule, channel); - float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol); - - //Calculate DigiConnections - for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) { - while (nextIter < nextChannelEnd - && ((digis[currIter].GetTimeU32() + deltaT) >= float(digis[nextIter].GetTimeU32()))) { - if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) { continue; } - if (digis[currIter].GetTimeU32() + deltaT < float(digis[nextIter].GetTimeU32())) { - nextIter++; - break; - } - if (digis[currIter].GetTimeU32() - deltaT > float(digis[nextIter].GetTimeU32())) { - nextIter++; - continue; - } - - digiConnector[currIter].SetNext(nextIter); - digiConnector[nextIter].SetHasPrevious(true); - nextIter++; - break; - } - } - } -} + const CbmStsDigi otherDigi = digis[nextIter]; -/** - * StsHitfinderGpu::findClusterConnectionsDigiWise - * - * Each thread one Digi - * - * ChannelId: 00000 11111 - * DigiIndex: 01234 56789 - * ThreadId: 01234 01234 - * - * Finds Clusters highly parallel in a basic way. - * The Threads work on Data thats stored nearby, so it's been taken advantage of the data locality. - * Each thread takes one digi and calculates if there is a neighbour. - * This is repeated until all digis have been processed - * - * @param digis All Digis that are relevant for this Block - * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous) - * @param channelOffsets The Array where all offsets are written to - * @param iModule The Module that the curren Block is working on - * @param threadId Id of the thrad currently working - * @param nDigis Amount of digis in digis-Array - * -**/ -XPU_D void StsHitfinderGpu::FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - unsigned int* channelOffsets, int const iModule, - unsigned int const threadId, unsigned int const nDigis) const -{ - - for (unsigned int currIter = threadId; currIter < nDigis; currIter += xpu::block_dim::x()) { - auto const digi = digis[currIter]; - auto const channel = digi.GetChannel(); + // This should never happen? + if (myDigi.GetChannel() >= otherDigi.GetChannel()) continue; - if (channel == nChannels - 1) break; + if (float(myDigi.GetTimeU32()) + deltaT < float(otherDigi.GetTimeU32())) { break; } + if (float(myDigi.GetTimeU32()) - deltaT > float(otherDigi.GetTimeU32())) { continue; } - //DeltaCalculation - float const tResol = GetTimeResolution(iModule, channel); - float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol); - - auto const nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis; - //Calculate DigiConnections - for (auto nextIter = channelOffsets[channel + 1]; nextIter < nextChannelEnd; nextIter++) { - if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) continue; - if (float(digis[currIter].GetTimeU32()) + deltaT < float(digis[nextIter].GetTimeU32())) break; - if (float(digis[currIter].GetTimeU32()) - deltaT > float(digis[nextIter].GetTimeU32())) continue; - - digiConnector[currIter].SetNext(nextIter); - digiConnector[nextIter].SetHasPrevious(true); - break; - } + digiConnector[iLocal].SetNext(nextIter); + digiConnector[nextIter].SetHasPrevious(true); + break; } } - /** - * StsHitfinderGpu::calculateClustersBasic - * - * One Thread all Digis - * - * ChannelId: 00000 11111 - * DigiIndex: 01234 56789 - * ThreadId: 00000 00000 - * - * Calculates the Clustercharges of all found ClusterConnections. - * One thread walks through all Digis and looks for connections. If one Digi does not have a previous one - * it's a cluster start and may be the start of either a single-element-cluster, double-element-cluster - * or multi-element-cluster. - * All Other Threads are canceled + * sts::Hitfinder::calculateClustersParallel + * This function calculates clusters form an sts-Digi Array inserted. + * It runs the calculating process highly parallel. * - * @param digis All Digis that are relevant for this Block - * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous) - * @param iModule The Module that the curren Block is working on - * @param threadId Id of the thrad currently working + * @param iBlock is the Block/Module that is currently worked on * -**/ -XPU_D void StsHitfinderGpu::CalculateClustersBasic(int iBlock) const +*/ +XPU_D void sts::Hitfinder::CalculateClustersParallel(FindClusters::context& ctx) const { - int iModule = iBlock; - int threadId = xpu::thread_idx::x(); - CbmStsDigi* digis = digisSortedPerModule[iModule]; - auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; - - if (threadId != 0) return; - - for (unsigned int i = 0; i < GetNDigis(iModule); i++) { - if (digiConnector[i].HasPrevious()) { continue; } - if (!digiConnector[i].HasNext()) { - //if Cluster has 1 element - CreateClusterFromConnectors1(iModule, digis, i); - } - else if (!digiConnector[digiConnector[i].next()].HasNext()) { - //if Cluster has 2 elements - CreateClusterFromConnectors2(iModule, digis, digiConnector, i); - } - else { - //if Cluster has N elements - CreateClusterFromConnectorsN(iModule, digis, digiConnector, i); - } - } -} + int const iModule = ctx.block_idx_x(); + CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]]; + ; + auto const nDigis = GetNDigis(iModule); + if (nDigis == 0) return; -/** - * StsHitfinderGpu::calculateClustersChannelWise - * - * Each Thread one Channel - * - * ChannelId: 00000 11111 - * DigiIndex: 01234 56789 - * ThreadId: 00000 11111 - * - * Calculates the Clustercharges of all found ClusterConnections. - * Each Thread walks through all digis of one channel and looks for connections. - * If one Digi does not have a previous one it's a cluster start and may be the - * start of either a single-element-cluster, double-element-cluster or - * multi-element-cluster. - * - * @param digis All Digis that are relevant for this Block - * @param digiConnector Array where the connection between 2 digis is sotred in.(next Digi; has Previous) - * @param iModule The Module that the curren Block is working on - * @param threadId Id of the thrad currently working - * -**/ -XPU_D void StsHitfinderGpu::CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - unsigned int* channelOffsets, int const iModule, - unsigned int const threadId, unsigned int const nDigis) const -{ + auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; + // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; - for (unsigned int channel = threadId; channel < (unsigned int) nChannels; - channel += (unsigned int) xpu::block_dim::x()) { - unsigned int const currChannelBegin = channelOffsets[channel]; - unsigned int const nextChannelBegin = - (channel + 1 < (unsigned int) nChannels) ? channelOffsets[channel + 1] : nDigis; - - //Check if first Digi of Channel belongs to Channel - if (channel != digis[currChannelBegin].GetChannel()) { continue; } - - //Calculate DigiConnections - for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) { - if (digiConnector[currIter].HasPrevious()) { continue; } - if (!digiConnector[currIter].HasNext()) { - //if Cluster has 1 element - CreateClusterFromConnectors1(iModule, digis, currIter); - } - else if (!digiConnector[digiConnector[currIter].next()].HasNext()) { - //if Cluster has 2 elements - CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter); - } - else { - //if Cluster has N elements - CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter); - } - } - } + // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis); + CalculateClustersDigiWise(ctx, digis, digiConnector, nDigis); } - /** - * StsHitfinderGpu::calculateClustersDigiWise + * sts::Hitfinder::calculateClustersDigiWise * * Each Thread one Channel * @@ -485,12 +246,11 @@ XPU_D void StsHitfinderGpu::CalculateClustersChannelWise(CbmStsDigi* digis, StsD * @param threadId Id of the thrad currently working * **/ -XPU_D void StsHitfinderGpu::CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - int const iModule, unsigned int const threadId, - unsigned int const nDigis) const +XPU_D void sts::Hitfinder::CalculateClustersDigiWise(FindClusters::context& ctx, CbmStsDigi* digis, + sts::DigiConnector* digiConnector, unsigned int const nDigis) const { - - for (unsigned int currIter = threadId; currIter < nDigis; currIter += (unsigned int) xpu::block_dim::x()) { + int const iModule = ctx.block_idx_x(); + for (unsigned int currIter = ctx.thread_idx_x(); currIter < nDigis; currIter += (unsigned int) ctx.block_dim_x()) { if (digiConnector[currIter].HasPrevious()) continue; @@ -509,8 +269,7 @@ XPU_D void StsHitfinderGpu::CalculateClustersDigiWise(CbmStsDigi* digis, StsDigi } } - -XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const +XPU_D void sts::Hitfinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const { const CbmStsDigi& digi = digis[digiIndex]; @@ -518,7 +277,7 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmS const float timeError = asic.timeResolution; - StsGpuCluster cluster { + sts::Cluster cluster { .fCharge = asic.AdcToCharge(digi.GetChargeU16()), .fSize = 1, .fPosition = float(digi.GetChannel()) + (IsBackside(iModule) ? nChannels : 0.f), @@ -532,8 +291,8 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmS AddCluster(iModule, time, cluster); } -XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, - StsDigiConnector* digiConnector, int const digiIndex) const +XPU_D void sts::Hitfinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, + sts::DigiConnector* digiConnector, int const digiIndex) const { const CbmStsDigi& digi1 = digis[digiIndex]; @@ -588,7 +347,7 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmS if (IsBackside(iModule)) x += nChannels; - StsGpuCluster cls { + sts::Cluster cls { .fCharge = charge, .fSize = 2, .fPosition = x, @@ -601,11 +360,10 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmS AddCluster(iModule, time, cls); } - -XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis, - StsDigiConnector* digiConnector, int digiIndex) const +XPU_D void sts::Hitfinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis, + sts::DigiConnector* digiConnector, int digiIndex) const { - StsClusterCalculationProperties cProps; + ClusterCalculationProperties cProps; //This Lambda calculates all charges for a single digi inside of a cluster auto calculateClusterCharges = [this, &cProps, &digis, &digiConnector](int index) { @@ -680,7 +438,7 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi if (IsBackside(iModule)) { x += nChannels; } - StsGpuCluster cls { + sts::Cluster cls { .fCharge = qSum, .fSize = int(nDigis), .fPosition = x, @@ -693,37 +451,81 @@ XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi AddCluster(iModule, cProps.tSum, cls); } -XPU_D void StsHitfinderGpu::SortClusters(StsSortClusterSmem& smem, int iBlock) const +XPU_D void sts::Hitfinder::SortClusters(SortClusters::context& ctx) const { - int iModule = iBlock; - size_t offset = iModule * maxClustersPerModule; - StsGpuClusterIdx* clusterIdx = &clusterIdxPerModule[offset]; - StsGpuClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset]; - int nClusters = nClustersPerModule[iModule]; + // By default sort all modules in parallel, + // but in debug mode sort only one module at a time + // to narrow down the error source + int firstModule = ctx.block_idx_x(); + int lastModule = ctx.block_idx_x(); + if (ctx.grid_dim_x() == 1) { + firstModule = 0; + lastModule = 2 * nModules - 1; + } + + for (int iModule = firstModule; iModule <= lastModule; iModule++) { + size_t offset = iModule * maxClustersPerModule; + GpuClusterIdx* clusterIdx = &clusterIdxPerModule[offset]; + GpuClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset]; + int nClusters = nClustersPerModule[iModule]; - SortClustersT clsSort(smem.sortBuf); - clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const StsGpuClusterIdx a) { return a.fTime; }); + // if (nClusters == 0 && ctx.thread_idx_x() == 0) { + // printf("Module %d: No clusters found, exit early\n", iModule); + // } + if (nClusters == 0) return; - if (xpu::thread_idx::x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx; + // if (ctx.thread_idx_x() == 0) { + // printf("Module %d: Start sorting %d clusters\n", iModule, nClusters); + // } + + SortClusters::sort_t clsSort(ctx.pos(), ctx.smem()); + clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const GpuClusterIdx a) { return a.fTime; }); + + // if (ctx.thread_idx_x() == 0) { + // printf("Module %d: Finished sorting %d clusters\n", iModule, nClusters); + // } + + if (ctx.thread_idx_x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx; + } } -XPU_D void StsHitfinderGpu::FindHits(int iBlock) const +XPU_D void sts::Hitfinder::FindHits(FindHits::context& ctx) const { - int iModuleF = iBlock; - int iModuleB = iBlock + nModules; - size_t offsetF = iModuleF * maxClustersPerModule; - size_t offsetB = iModuleB * maxClustersPerModule; - StsGpuClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF]; - StsGpuClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB]; - StsGpuCluster* clusterDataF = &clusterDataPerModule[offsetF]; - StsGpuCluster* clusterDataB = &clusterDataPerModule[offsetB]; - int nClustersF = nClustersPerModule[iModuleF]; - int nClustersB = nClustersPerModule[iModuleB]; - - - StsHitfinderCache pars; + +// On CPU process all modules in parallel (one thread per module). +// On GPU process all front clusters in parallel instead (one thread per cluster) +// to fully utilize the GPU. +// Currently use option 2 for both as it is faster on CPU as well. +#if XPU_IS_CPU + int iModule = ctx.block_idx_x(); +#else + int iModule = 0; + int iThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + + for (; iModule < nModules; iModule++) { + if (iThread < nClustersPerModule[iModule]) break; + iThread -= nClustersPerModule[iModule]; + } + + if (iModule >= nModules) return; +#endif + + int iModuleF = iModule; + int iModuleB = iModuleF + nModules; + size_t offsetF = iModuleF * maxClustersPerModule; + size_t offsetB = iModuleB * maxClustersPerModule; + GpuClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF]; + GpuClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB]; + sts::Cluster* clusterDataF = &clusterDataPerModule[offsetF]; + sts::Cluster* clusterDataB = &clusterDataPerModule[offsetB]; + int nClustersF = nClustersPerModule[iModuleF]; + int nClustersB = nClustersPerModule[iModuleB]; + + if (nClustersF == 0 || nClustersB == 0) return; + + HitfinderCache pars; { - StsSensorPar cfg = sensorPars[iBlock]; + SensorPar cfg = sensorPars[iModule]; pars.dY = cfg.dY; pars.pitch = cfg.pitch; @@ -737,21 +539,42 @@ XPU_D void StsHitfinderGpu::FindHits(int iBlock) const pars.dX = float(nChannels) * pars.pitch; } - float maxTerrF = *maxErrorFront; - float maxTerrB = *maxErrorBack; + float maxTerrF = maxClusterTimeErrorByModuleSide[iModuleF]; + float maxTerrB = maxClusterTimeErrorByModuleSide[iModuleB]; float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB); int startB = 0; - for (int iClusterF = xpu::thread_idx::x(); iClusterF < nClustersF; iClusterF += xpu::block_dim::x()) { - StsGpuClusterIdx clsIdxF = clusterIdxF[iClusterF]; - StsGpuCluster clsDataF = clusterDataF[clsIdxF.fIdx]; +#if XPU_IS_CPU + for (int iClusterF = ctx.thread_idx_x(); iClusterF < nClustersF; iClusterF += ctx.block_dim_x()) { +#else + int iClusterF = iThread; + if (iClusterF < nClustersF) { + + float firstPossibleTime = float(clusterIdxF[iClusterF].fTime) - maxSigmaBoth; + + // Use binary search to find the first cluster on back side that can be matched + // with the current cluster on front side + int start = startB; + int end = nClustersB; + while (end - start > 1) { + int mid = (start + end) / 2; + if (float(clusterIdxB[mid].fTime) < firstPossibleTime) { start = mid; } + else { + end = mid; + } + } + + startB = start; +#endif + GpuClusterIdx clsIdxF = clusterIdxF[iClusterF]; + sts::Cluster clsDataF = clusterDataF[clsIdxF.fIdx]; - float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrF * maxTerrF); + float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrB * maxTerrB); for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) { - StsGpuClusterIdx clsIdxB = clusterIdxB[iClusterB]; - StsGpuCluster clsDataB = clusterDataB[clsIdxB.fIdx]; + GpuClusterIdx clsIdxB = clusterIdxB[iClusterB]; + sts::Cluster clsDataB = clusterDataB[clsIdxB.fIdx]; float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime); @@ -767,23 +590,23 @@ XPU_D void StsHitfinderGpu::FindHits(int iBlock) const } float timeCut = -1.f; - if (timeCutClusterAbs > 0.f) timeCut = timeCutClusterAbs; - else if (timeCutClusterSig > 0.f) { + const RecoParams::STS& params = ctx.cmem<Params>().sts; + if (params.timeCutClusterAbs > 0.f) timeCut = params.timeCutClusterAbs; + else if (params.timeCutClusterSig > 0.f) { float eF = clsDataF.fTimeError; float eB = clsDataB.fTimeError; - timeCut = timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB); + timeCut = params.timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB); } if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) continue; - IntersectClusters(iBlock, pars, clsIdxF.fTime, clsDataF, clsIdxB.fTime, clsDataB); + IntersectClusters(iModule, pars, clsIdxF.fTime, clsDataF, clsIdxB.fTime, clsDataB); } } } -XPU_D void StsHitfinderGpu::IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF, - const StsGpuCluster& clsF, uint32_t timeB, - const StsGpuCluster& clsB) const +XPU_D void sts::Hitfinder::IntersectClusters(int iBlock, const HitfinderCache& pars, uint32_t timeF, + const sts::Cluster& clsF, uint32_t timeB, const sts::Cluster& clsB) const { // --- Calculate cluster centre position on readout edge @@ -840,7 +663,7 @@ XPU_D void StsHitfinderGpu::IntersectClusters(int iBlock, const StsHitfinderCach } } -XPU_D float StsHitfinderGpu::GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const +XPU_D float sts::Hitfinder::GetClusterPosition(const HitfinderCache& pars, float centre, bool isFront) const { // Take integer channel @@ -866,8 +689,8 @@ XPU_D float StsHitfinderGpu::GetClusterPosition(const StsHitfinderCache& pars, f return xCluster; } -XPU_D bool StsHitfinderGpu::Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, - float& y, float& varX, float& varY, float& varXY) const +XPU_D bool sts::Hitfinder::Intersect(const HitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, + float& y, float& varX, float& varY, float& varXY) const { // In the coordinate system with origin at the bottom left corner, @@ -917,7 +740,7 @@ XPU_D bool StsHitfinderGpu::Intersect(const StsHitfinderCache& pars, float xF, f return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f); } -XPU_D bool StsHitfinderGpu::IsInside(const StsHitfinderCache& pars, float x, float y) const +XPU_D bool sts::Hitfinder::IsInside(const HitfinderCache& pars, float x, float y) const { // clang-format off return x >= -pars.dX / 2.f @@ -927,9 +750,9 @@ XPU_D bool StsHitfinderGpu::IsInside(const StsHitfinderCache& pars, float x, flo // clang-format on } -XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY, - uint32_t timeF, const StsGpuCluster& clsF, uint32_t timeB, - const StsGpuCluster& clsB, float du, float dv) const +XPU_D void sts::Hitfinder::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY, + uint32_t timeF, const sts::Cluster& clsF, uint32_t timeB, const sts::Cluster& clsB, + float du, float dv) const { // --- Transform into global coordinate system float globalX, globalY, globalZ; @@ -948,7 +771,7 @@ XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, f float etB = clsB.fTimeError; float hitTimeError = 0.5f * xpu::sqrt(etF * etF + etB * etB); - StsGpuHit hit { + sts::Hit hit { .fX = globalX, .fY = globalY, .fZ = globalZ, @@ -962,15 +785,13 @@ XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, f .fDv = dv, }; - int idx = xpu::atomic_add_block(&nHitsPerModule[iModule], 1); + int idx = xpu::atomic_add(&nHitsPerModule[iModule], 1); assert(size_t(idx) < maxHitsPerModule); - - hitsPerModule[iModule * maxHitsPerModule + idx] = hit; + if (size_t(idx) < maxHitsPerModule) { hitsPerModule[iModule * maxHitsPerModule + idx] = hit; } } - -XPU_D float StsHitfinderGpu::LandauWidth(float charge) const +XPU_D float sts::Hitfinder::LandauWidth(float charge) const { if (charge <= landauStepSize) return landauTable[0]; if (charge > landauStepSize * (landauTableSize - 1)) return landauTable[landauTableSize - 1]; @@ -984,7 +805,7 @@ XPU_D float StsHitfinderGpu::LandauWidth(float charge) const return v1 + (charge - e1) * (v2 - v1) / (e2 - e1); } -XPU_D void StsHitfinderGpu::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const +XPU_D void sts::Hitfinder::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const { const float* tr = &localToGlobalTranslationByModule[iModule * 3]; const float* rot = &localToGlobalRotationByModule[iModule * 9]; diff --git a/algo/detectors/sts/StsHitfinder.h b/algo/detectors/sts/StsHitfinder.h index 6cba6abd1356ac17bdf518d2ecb654d3bd5ef4a5..570bdc6a40bd86bd607b4a1622a4ef0422b6150f 100644 --- a/algo/detectors/sts/StsHitfinder.h +++ b/algo/detectors/sts/StsHitfinder.h @@ -9,102 +9,110 @@ #include <xpu/device.h> +#include "Prelude.h" #include "gpu/DeviceImage.h" +#include "gpu/Params.h" +#include "sts/Cluster.h" +#include "sts/Hit.h" +#include "sts/HitfinderPars.h" namespace cbm::algo { + // Block sizes / other compile time constants that need tuning enum GpuConstants { #if XPU_IS_CUDA - kSortDigisBlockSize = 32, - kSortDigisItemsPerThread = 32, - kSortClustersBlockSize = 32, - kSortClustersItemsPerThread = 32, - kFindClusterBlockSize = 32, - kFindHitsBlockSize = 32, + kSortDigisBlockSize = 512, + kSortDigisItemsPerThread = 8, + kSortClustersBlockSize = 512, + kSortClustersItemsPerThread = 8, + kFindClusterBlockSize = 1024, + kFindHitsBlockSize = 512, #else // HIP, values ignored on CPU kSortDigisBlockSize = 256, kSortDigisItemsPerThread = 6, kSortClustersBlockSize = 256, kSortClustersItemsPerThread = 6, - kFindClusterBlockSize = 256, - kFindHitsBlockSize = 256, + kFindClusterBlockSize = 1024, + kFindHitsBlockSize = 1024, #endif }; - - // Kernel declarations - XPU_EXPORT_KERNEL(GPUReco, SortDigis); - // Combine substeps for finding clusters into a single kernel - XPU_EXPORT_KERNEL(GPUReco, FindClustersSingleStep); - XPU_EXPORT_KERNEL(GPUReco, CalculateOffsets); - XPU_EXPORT_KERNEL(GPUReco, FindClusters); - XPU_EXPORT_KERNEL(GPUReco, FindClustersBasic); - XPU_EXPORT_KERNEL(GPUReco, CalculateClusters); - XPU_EXPORT_KERNEL(GPUReco, CalculateClustersBasic); - XPU_EXPORT_KERNEL(GPUReco, SortClusters); - XPU_EXPORT_KERNEL(GPUReco, FindHits); - } // namespace cbm::algo -// Set block sizes. Currently this needs to be done in toplevel namespace. -XPU_BLOCK_SIZE_1D(cbm::algo::SortClusters, cbm::algo::kSortClustersBlockSize); -XPU_BLOCK_SIZE_1D(cbm::algo::FindClustersSingleStep, cbm::algo::kFindClusterBlockSize); -XPU_BLOCK_SIZE_1D(cbm::algo::CalculateOffsets, cbm::algo::kFindClusterBlockSize); -XPU_BLOCK_SIZE_1D(cbm::algo::FindClusters, cbm::algo::kFindClusterBlockSize); -XPU_BLOCK_SIZE_1D(cbm::algo::CalculateClusters, cbm::algo::kFindClusterBlockSize); -XPU_BLOCK_SIZE_1D(cbm::algo::SortDigis, cbm::algo::kSortClustersBlockSize); -XPU_BLOCK_SIZE_1D(cbm::algo::FindHits, cbm::algo::kFindClusterBlockSize); - -namespace cbm::algo +namespace cbm::algo::sts { + + class Hitfinder; // forward declaration + // GPU Data structures // TODO: Replace with regular StsCluster / StsHit once the changes // land to make them fit for GPU - struct StsGpuClusterIdx { + struct GpuClusterIdx { uint32_t fTime; ///< Cluster time (average of digi times) [ns] int fIdx; }; - struct StsGpuCluster { - float fCharge; ///< Total charge - int fSize; ///< Difference between first and last channel - float fPosition; ///< Cluster centre in channel number units - float fPositionError; ///< Cluster centre error (r.m.s.) in channel number units - uint32_t fTime; ///< cluster time [ns] - float fTimeError; ///< Error of cluster time [ns] + // Declare Constant Memory + struct TheHitfinder : xpu::constant<GPUReco, Hitfinder> { }; - struct StsGpuHit { - float fX, fY; ///< X, Y positions of hit [cm] - float fZ; ///< Z position of hit [cm] - uint32_t fTime; ///< Hit time [ns] - float fDx, fDy; ///< X, Y errors [cm] - float fDz; ///< Z position error [cm] - float fDxy; ///< XY correlation - float fTimeError; ///< Error of hit time [ns] - float fDu; ///< Error of coordinate across front-side strips [cm] - float fDv; ///< Error of coordinate across back-side strips [cm] + // Declare Kernels + struct SortDigis : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSortDigisBlockSize>; + using sort_t = xpu::block_sort<u64, CbmStsDigi, kSortDigisBlockSize, kSortDigisItemsPerThread>; + using shared_memory = sort_t::storage_t; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); }; - // Calibration data structures + struct FindClusters : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kFindClusterBlockSize>; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; - struct StsAsicPar { - int nAdc; - float dynamicRange; - float threshold; - float timeResolution; - float deadTime; - float noise; - float zeroNoiseRate; + struct ChannelOffsets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kFindClusterBlockSize>; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; - XPU_D float AdcToCharge(unsigned short adc) const - { - return threshold + dynamicRange / float(nAdc) * (float(adc) + 0.5f); - } + struct CreateDigiConnections : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kFindClusterBlockSize>; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct CreateClusters : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kFindClusterBlockSize>; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct SortClusters : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSortClustersBlockSize>; + using sort_t = xpu::block_sort<u32, GpuClusterIdx, kSortClustersBlockSize, kSortClustersItemsPerThread>; + using shared_memory = sort_t::storage_t; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct FindHits : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kFindHitsBlockSize>; + using constants = xpu::cmem<TheHitfinder, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); }; - struct StsSensorPar { + // Calibration data structures + struct SensorPar { float dY; float pitch; float stereoF; @@ -117,14 +125,14 @@ namespace cbm::algo // Used internally to avoid computing values multiple times // TODO: is this really needed? Overhead from additional computations should be miniscule. // TODO: Also store in shared memory, not registers. Values identical for all threads. - struct StsHitfinderCache : StsSensorPar { + struct HitfinderCache : SensorPar { float tanStereoF; float tanStereoB; float errorFac; float dX; }; - struct StsClusterCalculationProperties { + struct ClusterCalculationProperties { float tSum = 0.; // sum of digi times int chanF = 9999999; // first channel in cluster int chanL = -1; // last channel in cluster @@ -139,7 +147,7 @@ namespace cbm::algo float xSum = 0.f; // weighted charge sum, used to correct corrupt clusters }; - class StsDigiConnector { + class DigiConnector { private: unsigned int hasPreviousAndNext; @@ -156,7 +164,7 @@ namespace cbm::algo do { assumed = old; - old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI); + old = xpu::atomic_cas(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI); } while (old != assumed); } @@ -171,31 +179,15 @@ namespace cbm::algo do { assumed = old; - old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next); + old = xpu::atomic_cas(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next); } while (old != assumed); } XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; } }; - // Shared Memory defintions - using SortDigisT = - xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis>::value.x, kSortDigisItemsPerThread>; - - struct StsSortDigiSmem { - typename SortDigisT::storage_t sortBuf; - }; - - using SortClustersT = - xpu::block_sort<uint32_t, StsGpuClusterIdx, xpu::block_size<SortClusters>::value.x, kSortClustersItemsPerThread>; - - struct StsSortClusterSmem { - typename SortClustersT::storage_t sortBuf; - }; - - // Hitfinder buffers - template<xpu::side S> - class StsHitfinderBase { + // Hitfinder class. Holds pointers to buffers + algorithm. + class Hitfinder { public: // calibration / configuration data @@ -204,87 +196,71 @@ namespace cbm::algo int nModules; // Number of modules int nChannels; // Number of channels per module - // Time cuts for Digis / Clusters - // TODO: currently not used! - float timeCutDigiAbs; - float timeCutDigiSig; - float timeCutClusterAbs; - float timeCutClusterSig; - - StsAsicPar asic; + sts::HitfinderPars::Asic asic; int landauTableSize; float landauStepSize; // Entries of landauTable. size = landauTableSize - xpu::cmem_device_t<float, S> landauTable; + xpu::buffer<float> landauTable; // transformation matrix to convert local to global coordinate space // size = nModules - xpu::cmem_io_t<float, S> localToGlobalTranslationByModule; - xpu::cmem_io_t<float, S> localToGlobalRotationByModule; + xpu::buffer<float> localToGlobalTranslationByModule; + xpu::buffer<float> localToGlobalRotationByModule; // Sensor Parameters // size = nModules - xpu::cmem_io_t<StsSensorPar, S> sensorPars; + xpu::buffer<SensorPar> sensorPars; // input // Store all digis in a flat array with a header that contains the offset for every module (front and back) - xpu::cmem_io_t<size_t, S> - digiOffsetPerModule; // size = 2 * nModules + 1 entries, last entry contains total digi count - xpu::cmem_io_t<CbmStsDigi, S> digisPerModule; // Flat array of input digis. size = nDigisTotal + xpu::buffer<size_t> digiOffsetPerModule; // size = 2 * nModules + 1 entries, last entry contains total digi count + xpu::buffer<CbmStsDigi> digisPerModule; // Flat array of input digis. size = nDigisTotal // Temporary Digi-Array used for sorting as sorting is not in place. size = nDigisTotal - xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp; - - // Pointer to sorted of a module side. Points either to digisPerModule or digisPerModuleTmp - // size = 2 * nModules - xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule; + xpu::buffer<CbmStsDigi> digisPerModuleTmp; // DigiConnectors used internally by StsClusterizer // Connects digis that belong to the same cluster via linked-list. // size = nDigisTotal - xpu::cmem_device_t<StsDigiConnector, S> digiConnectorsPerModule; + xpu::buffer<DigiConnector> digiConnectorsPerModule; // Digis are sorted by channel + within channel by time // Contains the offset for each channel within each channel // size = 2 * nModules * nChannels - xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule; + xpu::buffer<unsigned int> channelOffsetPerModule; // intermediate results size_t maxClustersPerModule; - // FIXME: Not used anymore? - xpu::cmem_device_t<int, S> channelStatusPerModule; - // Cluster Index by module. Produced by cluster finder. // Stored as buckets with a max size of maxClustersPerModule // Actual number of entries in each bucket is stored in nClustersPerModule // size = 2 * nModules * maxClustersPerModule - xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModule; + xpu::buffer<GpuClusterIdx> clusterIdxPerModule; // Temporary cluster index array used for sorting as sorting is not in place. // size = 2 * nModules * maxClustersPerModule - xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModuleTmp; + xpu::buffer<GpuClusterIdx> clusterIdxPerModuleTmp; // Pointer to sorted cluster idx for every module side. // Either points to clusterIdxPerModule or clusterIdxPerModuleTmp. // size = 2 * nModules - xpu::cmem_io_t<StsGpuClusterIdx*, S> clusterIdxSortedPerModule; + xpu::buffer<GpuClusterIdx*> clusterIdxSortedPerModule; // Clusters stored by modules. Stored as buckets with a max size of maxClustersPerModule // Actual number of entries in each bucket is stored in nClustersPerModule // size = 2 * nModules * maxClustersPerModule - xpu::cmem_io_t<StsGpuCluster, S> clusterDataPerModule; + xpu::buffer<sts::Cluster> clusterDataPerModule; // Number of clusters in each module // size = 2 * nModules - xpu::cmem_io_t<int, S> nClustersPerModule; + xpu::buffer<int> nClustersPerModule; // Max time error of clusters on front- and backside of a module // size = 1 (???) // FIXME: size should be 2 * nModules? And only one array! - xpu::cmem_device_t<float, S> maxErrorFront; - xpu::cmem_device_t<float, S> maxErrorBack; + xpu::buffer<float> maxClusterTimeErrorByModuleSide; // output @@ -294,55 +270,38 @@ namespace cbm::algo // Hits sorted by module. Stored in buckets of size maxHitsPerModule. // Actual number of hits is stored in nHitsPerModule // size = maxHitsPerModule * nModules - xpu::cmem_io_t<StsGpuHit, S> hitsPerModule; + xpu::buffer<sts::Hit> hitsPerModule; // Number of hits in each module // size = nModules - xpu::cmem_io_t<int, S> nHitsPerModule; - }; - - // Host-side hitfinder. Only holds memory buffers. - using StsHitfinderHost = StsHitfinderBase<xpu::side::host>; - - // Device-side hitfinder. Holds pointers to buffers + algorithm. - class StsHitfinderGpu : public StsHitfinderBase<xpu::side::device> { + xpu::buffer<int> nHitsPerModule; public: - XPU_D void SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const; - XPU_D void FindClustersSingleStep(int iBlock) const; - XPU_D void SortClusters(StsSortClusterSmem& smem, int iBlock) const; - XPU_D void FindHits(int iBlock) const; + XPU_D void SortDigisInSpaceAndTime(SortDigis::context&) const; + XPU_D void FindClustersSingleStep(FindClusters::context&) const; + XPU_D void SortClusters(SortClusters::context&) const; + XPU_D void FindHits(FindHits::context&) const; // Individual steps of cluster finder, for more granular time measurement - XPU_D void CalculateOffsetsParallel(int iBlock) const; - XPU_D void FindClustersParallel(int iBlock) const; - XPU_D void CalculateClustersParallel(int iBlock) const; - - XPU_D void FindClusterConnectionsBasic(int iBlock) const; - XPU_D void CalculateClustersBasic(int iBlock) const; + XPU_D void CalculateOffsetsParallel(FindClusters::context&) const; + XPU_D void FindClustersParallel(FindClusters::context&) const; + XPU_D void CalculateClustersParallel(FindClusters::context&) const; private: - XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const; + XPU_D void CalculateChannelOffsets(FindClusters::context& ctx, CbmStsDigi* digis, unsigned int* channelOffsets, + unsigned int nDigis) const; - XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - unsigned int* channelOffsets, int iModule, - unsigned int threadId) const; - XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - unsigned int* channelOffsets, int iModule, unsigned int threadId, + XPU_D void FindClusterConnectionsDigiWise(FindClusters::context& ctx, CbmStsDigi* digis, + DigiConnector* digiConnector, unsigned int* channelOffsets, unsigned int nDigis) const; - XPU_D void CalculateClustersBasic(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule, - unsigned int const threadId) const; - XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, - unsigned int* channelOffsets, int iModule, unsigned int const threadId, - unsigned int const nDigis) const; - XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule, - unsigned int const threadId, unsigned int const nDigis) const; + XPU_D void CalculateClustersDigiWise(FindClusters::context& ctx, CbmStsDigi* digis, DigiConnector* digiConnector, + unsigned int const nDigis) const; XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const; - XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector, + XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, DigiConnector* digiConnector, int const digiIndex) const; - XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector, + XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, DigiConnector* digiConnector, int const digiIndex) const; private: @@ -371,17 +330,19 @@ namespace cbm::algo return diff; } - XPU_D void AddCluster(int iModule, uint32_t time, const StsGpuCluster& cls) const + XPU_D void AddCluster(int iModule, uint32_t time, const sts::Cluster& cls) const { - StsGpuClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule]; - StsGpuCluster* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule]; + GpuClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule]; + sts::Cluster* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule]; int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1); XPU_ASSERT(size_t(pos) < maxClustersPerModule); - StsGpuClusterIdx idx {time, pos}; - tgtIdx[idx.fIdx] = idx; - tgtData[idx.fIdx] = cls; + if (size_t(pos) < maxClustersPerModule) { + GpuClusterIdx idx {time, pos}; + tgtIdx[idx.fIdx] = idx; + tgtData[idx.fIdx] = cls; + } } XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; } @@ -390,19 +351,18 @@ namespace cbm::algo XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const; - XPU_D void IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF, const StsGpuCluster& clsF, - uint32_t timeB, const StsGpuCluster& clsB) const; - XPU_D float GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const; - XPU_D bool Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, float& y, + XPU_D void IntersectClusters(int iBlock, const HitfinderCache& pars, uint32_t timeF, const sts::Cluster& clsF, + uint32_t timeB, const sts::Cluster& clsB) const; + XPU_D float GetClusterPosition(const HitfinderCache& pars, float centre, bool isFront) const; + XPU_D bool Intersect(const HitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, float& y, float& varX, float& varY, float& varXY) const; - XPU_D bool IsInside(const StsHitfinderCache& pars, float x, float y) const; + XPU_D bool IsInside(const HitfinderCache& pars, float x, float y) const; XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY, uint32_t timeF, - const StsGpuCluster& clsF, uint32_t timeB, const StsGpuCluster& clsB, float du, - float dv) const; + const sts::Cluster& clsF, uint32_t timeB, const sts::Cluster& clsB, float du, float dv) const; XPU_D void SaveMaxError(float errorValue, int iModule) const { - float* maxError = IsBackside(iModule) ? maxErrorBack : maxErrorFront; + float* maxError = &maxClusterTimeErrorByModuleSide[iModule]; float old {}; do { old = *maxError; @@ -411,9 +371,6 @@ namespace cbm::algo } }; - // StsHitfinder lives in constant memory - XPU_EXPORT_CONSTANT(GPUReco, StsHitfinderGpu, TheStsHitfinder); - -} // namespace cbm::algo +} // namespace cbm::algo::sts -#endif // #ifndef CBM_ALGO_STS_HITFINDER_H +#endif // CBM_ALGO_STS_HITFINDER_H diff --git a/algo/detectors/sts/StsHitfinderChain.cxx b/algo/detectors/sts/StsHitfinderChain.cxx index 9c37e24b5d3d0b58570ea7f2301ad163ee9e9fbf..dc1f856d86b21a3b92f25f8aa605942035504f74 100644 --- a/algo/detectors/sts/StsHitfinderChain.cxx +++ b/algo/detectors/sts/StsHitfinderChain.cxx @@ -4,35 +4,28 @@ #include "StsHitfinderChain.h" -#include "Logger.h" +#include <fairlogger/Logger.h> -using namespace cbm::algo; +#include <numeric> -// Shorthands to set data / allocate buffers on GPU -// FIXME: Eventually we'll need a better way to do this. Preferably supplied by xpu -#define SET_GPU_CONSTANT(constant, val) fHitfinderHost.constant = fHitfinderGpu.constant = val -#define GPU_ALLOC(field, nEntries) \ - fHitfinderHost.field = decltype(fHitfinderHost.field)(nEntries); \ - fHitfinderGpu.field = fHitfinderHost.field.d() +using namespace cbm::algo; -void StsHitfinderChain::SetParameters(const StsHitfinderPar& parameters) +void sts::HitfinderChain::SetParameters(const sts::HitfinderPars& parameters) { fPars.emplace(parameters); AllocateStatic(); } -void StsHitfinderChain::operator()(gsl::span<const CbmStsDigi> digis) +void sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi> digis) { EnsureParameters(); - // FIXME: this has to be called only once - // and should happen during initialization not in reco loop - setenv("XPU_PROFILE", "1", 1); - xpu::initialize(); + xpu::scoped_timer t_("STS Hitfinder"); int nModules = fPars->modules.size(); int nModuleSides = nModules * 2; size_t nDigisTotal = digis.size(); + xpu::t_add_bytes(nDigisTotal * sizeof(CbmStsDigi)); // Getting the digis on the GPU requires 3 steps // 1. Sort digis into buckets by module @@ -44,107 +37,173 @@ void StsHitfinderChain::operator()(gsl::span<const CbmStsDigi> digis) // 3. Copy digis into flat array with offsets per module FlattenDigis(digiMap); + if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) { EnsureDigiOffsets(digiMap); } + + xpu::queue queue; + // Clear buffers // Not all buffers have to initialized, but useful for debugging - auto& hfc = fHitfinderHost; + LOG(debug) << "STS Hitfinder Chain: Clearing buffers..."; + auto& hfc = fHitfinder; // xpu::memset(hfc.digisPerModule, 0); // xpu::memset(hfc.digisPerModuleTmp, 0); // xpu::memset(hfc.digisSortedPerModule, 0); // xpu::memset(hfc.digiOffsetPerModule, 0); - xpu::memset(hfc.digiConnectorsPerModule, 0); - // xpu::memset(hfc.channelOffsetPerModule, 0); - xpu::memset(hfc.channelStatusPerModule, -1); - xpu::memset(hfc.clusterIdxPerModule, 0); + queue.memset(hfc.digiConnectorsPerModule, 0); + queue.memset(hfc.channelOffsetPerModule, 0); + queue.memset(hfc.clusterIdxPerModule, 0); // xpu::memset(hfc.clusterIdxPerModuleTmp, 0); // xpu::memset(hfc.clusterIdxSortedPerModule, 0); // xpu::memset(hfc.clusterDataPerModule, 0); - xpu::memset(hfc.nClustersPerModule, 0); + queue.memset(hfc.nClustersPerModule, 0); // xpu::memset(hfc.hitsPerModule, 0); - xpu::memset(hfc.nHitsPerModule, 0); - xpu::memset(hfc.maxErrorFront, 0); - xpu::memset(hfc.maxErrorBack, 0); + queue.memset(hfc.nHitsPerModule, 0); + queue.memset(hfc.maxClusterTimeErrorByModuleSide, 0); + + LOG(debug) << "STS Hitfinder Chain: Copy digis buffers..."; + xpu::set<TheHitfinder>(fHitfinder); + queue.copy(hfc.digisPerModule, xpu::h2d); + queue.copy(hfc.digiOffsetPerModule, xpu::h2d); + + LOG(debug) << "STS Hitfinder Chain: Sort Digis..."; + // TODO: skip temporary buffers and sort directly into digisSortedPerModule + + queue.launch<SortDigis>(xpu::n_blocks(nModuleSides)); + xpu::k_add_bytes<SortDigis>(nDigisTotal * sizeof(CbmStsDigi)); + if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) { + LOG(trace) << "Ensuring STS digis are sorted..."; + queue.copy(hfc.digisPerModule, xpu::d2h); + queue.copy(hfc.digiOffsetPerModule, xpu::d2h); + queue.wait(); + EnsureDigisSorted(); + } - xpu::set_constant<TheStsHitfinder>(fHitfinderGpu); - xpu::copy(hfc.digisPerModule, xpu::host_to_device); - xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device); + LOG(debug) << "STS Hitfinder Chain: Find Clusters..."; + if (!Params().sts.findClustersMultiKernels) { queue.launch<FindClusters>(xpu::n_blocks(nModuleSides)); } + else { + queue.launch<ChannelOffsets>(xpu::n_blocks(nModuleSides)); + xpu::k_add_bytes<ChannelOffsets>(nDigisTotal * sizeof(CbmStsDigi)); + queue.launch<CreateDigiConnections>(xpu::n_threads(nDigisTotal)); + xpu::k_add_bytes<CreateDigiConnections>(nDigisTotal * sizeof(CbmStsDigi)); + queue.launch<CreateClusters>(xpu::n_blocks(nModuleSides)); + xpu::k_add_bytes<CreateClusters>(nDigisTotal * sizeof(CbmStsDigi)); + } + if (fair::Logger::GetConsoleSeverity() == fair::Severity::trace) { + LOG(trace) << "Ensuring STS channel offsets correct..."; + xpu::buffer_prop propsOffset {hfc.channelOffsetPerModule}; + std::vector<u32> channelOffsetPerModule; + channelOffsetPerModule.resize(propsOffset.size()); + queue.copy(hfc.channelOffsetPerModule.get(), channelOffsetPerModule.data(), propsOffset.size_bytes()); + queue.wait(); + EnsureChannelOffsets(channelOffsetPerModule); + + LOG(trace) << "Ensuring STS clusters are ok..."; + xpu::buffer_prop props {hfc.clusterIdxPerModule}; + + std::vector<GpuClusterIdx> clusterIdxPerModule; + clusterIdxPerModule.resize(props.size()); + std::vector<int> nClustersPerModule; + nClustersPerModule.resize(fPars->modules.size() * 2); + + queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size_bytes()); + queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size() * sizeof(int)); + queue.wait(); + EnsureClustersSane(clusterIdxPerModule, nClustersPerModule); + } - xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(nModuleSides)); - xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(nModuleSides)); // Run cluster finding steps in indivual kernels, useful for debugging / profiling // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2)); // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2)); // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); - xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(nModuleSides)); - xpu::run_kernel<FindHits>(xpu::grid::n_blocks(nModules)); - - CollectTimingInfo(); + LOG(debug) << "STS Hitfinder Chain: Sort Clusters..."; + queue.launch<SortClusters>(xpu::n_blocks(nModuleSides)); // FIXME n_blocks(nModuleSides) once debugging is done + + LOG(debug) << "STS Hitfinder Chain: Find Hits..."; + queue.copy(hfc.nClustersPerModule, xpu::d2h); + queue.wait(); + xpu::h_view nClusters {hfc.nClustersPerModule}; + size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0); + bool isCpu = xpu::device::active().backend() == xpu::cpu; + xpu::grid findHitsG = isCpu ? xpu::n_blocks(nModules) : xpu::n_threads(nClustersFront); + queue.launch<FindHits>(findHitsG); // Transfer Hits / Cluster back to host // TODO: Hits and Clusters are stored in buckets. Currently we transfer the entire // bucket arrays back to the host. This includes a lot of unused bytes. // Better to either flatten both arrays on the GPU and transfer flat array back. // Or do multiple copies to copy contents of individual buckets. - xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host); - xpu::copy(hfc.nClustersPerModule, xpu::device_to_host); - xpu::copy(hfc.hitsPerModule, xpu::device_to_host); - xpu::copy(hfc.nHitsPerModule, xpu::device_to_host); + queue.copy(hfc.clusterDataPerModule, xpu::d2h); + queue.copy(hfc.nClustersPerModule, xpu::d2h); + queue.copy(hfc.hitsPerModule, xpu::d2h); + queue.copy(hfc.nHitsPerModule, xpu::d2h); + + queue.wait(); + + xpu::h_view nHits {hfc.nHitsPerModule}; + // xpu::h_view nClusters{hfc.nClustersPerModule}; + size_t nHitsTotal = std::accumulate(nHits.begin(), nHits.end(), 0); + size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0); + + xpu::k_add_bytes<SortClusters>(nClustersTotal * sizeof(GpuClusterIdx)); + xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster)); + + LOG(info) << "Timeslice contains " << nHitsTotal << " STS hits and " << nClustersTotal << " STS clusters"; } -void StsHitfinderChain::EnsureParameters() +void sts::HitfinderChain::EnsureParameters() { - LOG_IF(fatal, fPars == std::nullopt) << "StsHitfinderChain: Parameters not set. Can't continue!"; + LOG_IF(fatal, fPars == std::nullopt) << "sts::HitfinderChain: Parameters not set. Can't continue!"; } -void StsHitfinderChain::AllocateStatic() +void sts::HitfinderChain::AllocateStatic() { EnsureParameters(); // Shorthands for common constants const int nChannels = fPars->nChannels / 2; // Only count channels on one side of the module const int nModules = fPars->modules.size(); - // const int nModuleSides = 2 * nModules; // Number of module front + backsides - - // Set GPU constants - SET_GPU_CONSTANT(nModules, nModules); - SET_GPU_CONSTANT(nChannels, nChannels); + const int nModuleSides = 2 * nModules; // Number of module front + backsides - SET_GPU_CONSTANT(timeCutDigiAbs, -1.f); - SET_GPU_CONSTANT(timeCutDigiSig, 3.f); - SET_GPU_CONSTANT(timeCutClusterAbs, -1.f); - SET_GPU_CONSTANT(timeCutClusterSig, 4.f); + xpu::queue q; - SET_GPU_CONSTANT(asic, fPars->asic); + // Set GPU constants + fHitfinder.nModules = nModules; + fHitfinder.nChannels = nChannels; + fHitfinder.asic = fPars->asic; // Copy landau table size_t nLandauTableEntries = fPars->landauTable.values.size(); - SET_GPU_CONSTANT(landauTableSize, nLandauTableEntries); - SET_GPU_CONSTANT(landauStepSize, fPars->landauTable.stepSize); - GPU_ALLOC(landauTable, nLandauTableEntries); - xpu::copy(fHitfinderGpu.landauTable, fPars->landauTable.values.data(), nLandauTableEntries); + fHitfinder.landauTableSize = nLandauTableEntries; + fHitfinder.landauStepSize = fPars->landauTable.stepSize; + fHitfinder.landauTable.reset(nLandauTableEntries, xpu::buf_io); + q.copy(fPars->landauTable.values.data(), fHitfinder.landauTable.get(), nLandauTableEntries * sizeof(float)); // Copy transformation matrix - GPU_ALLOC(localToGlobalRotationByModule, 9 * nModules); - GPU_ALLOC(localToGlobalTranslationByModule, 3 * nModules); + fHitfinder.localToGlobalRotationByModule.reset(9 * nModules, xpu::buf_io); + fHitfinder.localToGlobalTranslationByModule.reset(3 * nModules, xpu::buf_io); // - Copy matrix into flat array + xpu::h_view hRot {fHitfinder.localToGlobalRotationByModule}; + xpu::h_view hTrans {fHitfinder.localToGlobalTranslationByModule}; for (int m = 0; m < nModules; m++) { const auto& module = fPars->modules.at(m); - std::copy_n(module.localToGlobal.rotation.data(), 9, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 9]); - std::copy_n(module.localToGlobal.translation.data(), 3, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 3]); + std::copy_n(module.localToGlobal.rotation.data(), 9, &hRot[m * 9]); + std::copy_n(module.localToGlobal.translation.data(), 3, &hTrans[m * 3]); } // - Then copy to GPU - xpu::copy(fHitfinderHost.localToGlobalRotationByModule, xpu::host_to_device); - xpu::copy(fHitfinderHost.localToGlobalTranslationByModule, xpu::host_to_device); + q.copy(fHitfinder.localToGlobalRotationByModule, xpu::h2d); + q.copy(fHitfinder.localToGlobalTranslationByModule, xpu::h2d); // Copy Sensor Parameteres - GPU_ALLOC(sensorPars, nModules); + fHitfinder.sensorPars.reset(nModules, xpu::buf_io); + xpu::h_view hSensorPars {fHitfinder.sensorPars}; for (int m = 0; m < nModules; m++) { const auto& module = fPars->modules.at(m); - auto& gpuPars = fHitfinderHost.sensorPars.h()[m]; + auto& gpuPars = hSensorPars[m]; gpuPars.dY = module.dY; gpuPars.pitch = module.pitch; gpuPars.stereoF = module.stereoF; @@ -152,13 +211,22 @@ void StsHitfinderChain::AllocateStatic() gpuPars.lorentzF = module.lorentzF; gpuPars.lorentzB = module.lorentzB; } - xpu::copy(fHitfinderHost.sensorPars, xpu::host_to_device); + xpu::copy(fHitfinder.sensorPars, xpu::h2d); + + // Time errors + fHitfinder.maxClusterTimeErrorByModuleSide.reset(nModuleSides, xpu::buf_device); + + q.wait(); } -void StsHitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal) +void sts::HitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal) { + LOG(debug) << "STS Hitfinder Chain: Allocating dynamic memory for " << maxNDigisPerModule << " digis per module and " + << nDigisTotal << " digis in total"; EnsureParameters(); + xpu::scoped_timer t_ {"Allocate"}; + // TODO: some of these buffers have a constant size and can be allocated statically. // Just the data they contain is static. @@ -170,42 +238,56 @@ void StsHitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigis const size_t maxNClustersPerModule = maxNDigisPerModule * kClustersPerDigiFactor; const size_t maxNHitsPerModule = maxNClustersPerModule * kHitsPerClustersFactor; - // Allocate Digi Buffers - GPU_ALLOC(digiOffsetPerModule, nModuleSides + 1); - GPU_ALLOC(digisPerModule, nDigisTotal); - GPU_ALLOC(digisPerModuleTmp, nDigisTotal); - GPU_ALLOC(digisSortedPerModule, nModuleSides); - GPU_ALLOC(digiConnectorsPerModule, nDigisTotal); + fHitfinder.digiOffsetPerModule.reset(nModuleSides + 1, xpu::buf_io); + fHitfinder.digisPerModule.reset(nDigisTotal, xpu::buf_io); - GPU_ALLOC(channelOffsetPerModule, nModuleSides * nChannels); + fHitfinder.digisPerModuleTmp.reset(nDigisTotal, xpu::buf_device); + fHitfinder.digiConnectorsPerModule.reset(nDigisTotal, xpu::buf_device); - GPU_ALLOC(maxErrorFront, 1); - GPU_ALLOC(maxErrorBack, 1); + fHitfinder.channelOffsetPerModule.reset(nModuleSides * nChannels, xpu::buf_device); // Allocate Cluster Buffers - SET_GPU_CONSTANT(maxClustersPerModule, maxNClustersPerModule); - GPU_ALLOC(channelStatusPerModule, nChannels * nModules); - GPU_ALLOC(clusterIdxPerModule, maxNClustersPerModule * nModuleSides); - GPU_ALLOC(clusterIdxPerModuleTmp, maxNClustersPerModule * nModuleSides); - GPU_ALLOC(clusterIdxSortedPerModule, nModuleSides); - GPU_ALLOC(clusterDataPerModule, maxNClustersPerModule * nModuleSides); - GPU_ALLOC(nClustersPerModule, nModuleSides); + fHitfinder.maxClustersPerModule = maxNClustersPerModule; + + fHitfinder.clusterIdxPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device); + fHitfinder.clusterIdxPerModuleTmp.reset(maxNClustersPerModule * nModuleSides, xpu::buf_device); + fHitfinder.clusterIdxSortedPerModule.reset(nModuleSides, xpu::buf_device); + + fHitfinder.clusterDataPerModule.reset(maxNClustersPerModule * nModuleSides, xpu::buf_io); + fHitfinder.nClustersPerModule.reset(nModuleSides, xpu::buf_io); // Allocate Hit Buffers - SET_GPU_CONSTANT(maxHitsPerModule, maxNHitsPerModule); - GPU_ALLOC(hitsPerModule, maxNHitsPerModule * nModules); - GPU_ALLOC(nHitsPerModule, nModules); + fHitfinder.maxHitsPerModule = maxNHitsPerModule; + fHitfinder.hitsPerModule.reset(maxNHitsPerModule * nModules, xpu::buf_io); + fHitfinder.nHitsPerModule.reset(nModules, xpu::buf_io); } -StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<const CbmStsDigi> digis, - size_t& maxNDigisPerModule) +sts::HitfinderChain::DigiMap sts::HitfinderChain::SortDigisIntoModules(gsl::span<const CbmStsDigi> digis, + size_t& maxNDigisPerModule) { + LOG(debug) << "STS Hitfinder Chain: Sorting " << digis.size() << " digis into modules"; + xpu::scoped_timer t_ {"Sort Digis"}; + DigiMap digiMap; int nChannelsPerSide = fPars->nChannels / 2; + auto isPulser = [&](const CbmStsDigi& digi) { + return std::find_if(fPars->modules.begin(), fPars->modules.end(), + [&](const auto& mod) { return mod.address == digi.GetAddress(); }) + == fPars->modules.end(); + }; + + size_t nPulsers = 0; + for (const auto& digi : digis) { + + if (isPulser(digi)) { + nPulsers++; + continue; + } + int moduleAddr = CbmStsAddress::GetMotherAddress(digi.GetAddress(), kStsModule); bool isFront = digi.GetChannel() < fPars->nChannels / 2; if (isFront) digiMap.front[moduleAddr].emplace_back(digi); @@ -216,6 +298,15 @@ StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<con } } + LOG_IF(warn, nPulsers > 0) << "STS Hitfinder: Discarded " << nPulsers << " pulser digis"; + + // Print digi counts per module + for (const auto& mod : fPars->modules) { + i32 moduleAddr = mod.address; + LOG(debug1) << "Module " << moduleAddr << " has " << digiMap.front[moduleAddr].size() << " front digis and " + << digiMap.back[moduleAddr].size() << " back digis"; + } + maxNDigisPerModule = 0; for (const auto& mod : digiMap.front) { maxNDigisPerModule = std::max(maxNDigisPerModule, mod.second.size()); @@ -228,19 +319,21 @@ StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<con return digiMap; } -void StsHitfinderChain::FlattenDigis(DigiMap& digiMap) +void sts::HitfinderChain::FlattenDigis(DigiMap& digiMap) { + LOG(debug) << "STS Hitfinder Chain: Flattening digis"; + xpu::scoped_timer t_ {"Flatten Digis"}; FlattenDigisSide(digiMap.front, true); FlattenDigisSide(digiMap.back, false); } -void StsHitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront) +void sts::HitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront) { EnsureParameters(); int nModules = fPars->modules.size(); - size_t* pMdigiOffset = fHitfinderHost.digiOffsetPerModule.h(); - CbmStsDigi* pDigisFlat = fHitfinderHost.digisPerModule.h(); + xpu::h_view pMdigiOffset {fHitfinder.digiOffsetPerModule}; + xpu::h_view pDigisFlat {fHitfinder.digisPerModule}; int sideOffset = (isFront ? 0 : nModules); @@ -253,28 +346,17 @@ void StsHitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront) } } -void StsHitfinderChain::CollectTimingInfo() -{ - // Check if profiling is enabled - // TODO: xpu should have a nicer way to query if profiling is enabled - if (xpu::get_timing<SortDigis>().empty()) return; - - // TODO: Add time counters - fHitfinderTimes.timeSortDigi = xpu::get_timing<SortDigis>().back(); - fHitfinderTimes.timeCluster = xpu::get_timing<FindClustersSingleStep>().back(); - fHitfinderTimes.timeSortCluster = xpu::get_timing<SortClusters>().back(); - fHitfinderTimes.timeHits = xpu::get_timing<FindHits>().back(); -} - -void StsHitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>& clusters) +void sts::HitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<sts::Cluster>& clusters) { EnsureParameters(); - auto& hfc = fHitfinderHost; + auto& hfc = fHitfinder; int moduleSide = (isFront ? module : 2 * module); + xpu::h_view hNClusters {hfc.nClustersPerModule}; + xpu::h_view hClusters {hfc.clusterDataPerModule}; - int nClusters = hfc.nClustersPerModule.h()[moduleSide]; - auto* gpuClusters = &hfc.clusterDataPerModule.h()[moduleSide * hfc.maxClustersPerModule]; + int nClusters = hNClusters[moduleSide]; + auto* gpuClusters = &hClusters[moduleSide * hfc.maxClustersPerModule]; clusters.reserve(clusters.size() + nClusters); @@ -292,14 +374,15 @@ void StsHitfinderChain::AppendClustersModule(int module, bool isFront, std::vect } } -void StsHitfinderChain::AppendHitsModule(int module, std::vector<StsGpuHit>& hits) +void sts::HitfinderChain::AppendHitsModule(int module, std::vector<sts::Hit>& hits) { EnsureParameters(); - auto& hfc = fHitfinderHost; + xpu::h_view hHits {fHitfinder.hitsPerModule}; + xpu::h_view hNHits {fHitfinder.nHitsPerModule}; - auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; - int nHitsGpu = hfc.nHitsPerModule.h()[module]; + auto* gpuHits = &hHits[module * fHitfinder.maxHitsPerModule]; + int nHitsGpu = hNHits[module]; hits.reserve(hits.size() + nHitsGpu); @@ -319,3 +402,133 @@ void StsHitfinderChain::AppendHitsModule(int module, std::vector<StsGpuHit>& hit // hitGpu.fDv); } } + +void sts::HitfinderChain::EnsureDigiOffsets(DigiMap& digi) +{ + xpu::h_view digiOffset {fHitfinder.digiOffsetPerModule}; + + size_t nModules = fPars->modules.size(); + size_t offset = 0; + // Front + for (size_t m = 0; m < nModules; m++) { + const auto& moduleDigis = digi.front[fPars->modules[m].address]; + LOG_IF(fatal, digiOffset[m] != offset) << "Digi offset mismatch"; + offset += moduleDigis.size(); + } + + // Back + for (size_t m = 0; m < nModules; m++) { + const auto& moduleDigis = digi.back[fPars->modules[m].address]; + LOG_IF(fatal, digiOffset[nModules + m] != offset) + << "Module " << nModules + m << ": Digi offset mismatch: " << digiOffset[nModules + m] << " != " << offset; + offset += moduleDigis.size(); + } + + LOG_IF(fatal, offset != digiOffset[2 * nModules]) << "Digi offset mismatch"; +} + +void sts::HitfinderChain::EnsureDigisSorted() +{ + xpu::h_view digiOffset {fHitfinder.digiOffsetPerModule}; + xpu::h_view digis {fHitfinder.digisPerModule}; + + bool isSorted = true; + + for (size_t m = 0; m < fPars->modules.size(); m++) { + int nDigis = digiOffset[m + 1] - digiOffset[m]; + if (nDigis == 0) continue; + + auto* digisModule = &digis[digiOffset[m]]; + + for (int i = 0; i < nDigis - 1; i++) { + const auto &digi1 = digisModule[i], &digi2 = digisModule[i + 1]; + + if ((digi1.GetChannel() < digi2.GetChannel()) + || (digi1.GetChannel() == digi2.GetChannel() + && digi1.GetTime() <= digi2.GetTime()) // Unpacker sometimes produces multiple + // digis with the same time and channel, FIXME! + ) { + continue; + } + + isSorted = false; + LOG(error) << "Module " << m << " not sorted: " << digi1.ToString() << " " << digi2.ToString(); + break; + } + } + + LOG_IF(fatal, !isSorted) << "Digis are not sorted"; +} + +void sts::HitfinderChain::EnsureChannelOffsets(span<u32> channelOffsetsByModule) +{ + xpu::h_view digisPerModule {fHitfinder.digisPerModule}; + xpu::h_view digiOffsetPerModule {fHitfinder.digiOffsetPerModule}; + for (size_t m = 0; m < fPars->modules.size() * 2; m++) { + int nChannels = fPars->nChannels / 2; // Consider module sides + + std::vector<u32> expectedChannelOffsets(nChannels, 0); + + int offset = digiOffsetPerModule[m]; + int nDigis = digiOffsetPerModule[m + 1] - offset; + span<CbmStsDigi> digis(&digisPerModule[offset], nDigis); + span<u32> channelOffsets = channelOffsetsByModule.subspan(m * nChannels, nChannels); + + if (nDigis == 0) continue; + + LOG_IF(fatal, channelOffsets[0] != 0) << "Module " << m << ": First channel offset is not 0"; + + int chan = digis[0].GetChannel(); + for (int i = 0; i < nDigis; i++) { + if (digis[i].GetChannel() != chan) { + while (chan < digis[i].GetChannel()) { + chan++; + expectedChannelOffsets[chan] = i; + } + } + } + while (chan < nChannels) { + chan++; + expectedChannelOffsets[chan] = nDigis; + } + + for (int i = 0; i < nChannels; i++) { + LOG_IF(fatal, channelOffsets[i] != expectedChannelOffsets[i]) + << "Module " << m << ": Channel offset for channel " << i << " is " << channelOffsets[i] << " but should be " + << expectedChannelOffsets[i]; + } + } +} + +void sts::HitfinderChain::EnsureClustersSane(span<GpuClusterIdx> hClusterIdx, span<int> hNClusters) +{ + for (size_t m = 0; m < 2 * fPars->modules.size(); m++) { + int nClusters = hNClusters[m]; + + LOG(trace) << "Module " << m << " has " << nClusters << " clusters of " << fHitfinder.maxClustersPerModule; + + if (nClusters == 0) continue; + + if (nClusters < 0) { LOG(fatal) << "Module " << m << " has negative number of clusters " << nClusters; } + if (size_t(nClusters) > fHitfinder.maxClustersPerModule) { + LOG(fatal) << "Module " << m << " has " << nClusters << " clusters, but only " << fHitfinder.maxClustersPerModule + << " are allowed"; + } + + auto* clusterIdx = &hClusterIdx[m * fHitfinder.maxClustersPerModule]; + + for (int i = 0; i < nClusters; i++) { + auto& cidx = clusterIdx[i]; + + if (cidx.fIdx < 0 || size_t(cidx.fIdx) >= fHitfinder.maxClustersPerModule) { + LOG(fatal) << "Cluster " << i << " of module " << m << " has invalid index " << cidx.fIdx; + } + + if (cidx.fTime == 0xFFFFFFFF) { + LOG(fatal) << "Cluster " << i << " of module " << m << " has invalid time " << cidx.fTime; + } + } + } + + LOG(trace) << "Clusters ok"; +} diff --git a/algo/detectors/sts/StsHitfinderChain.h b/algo/detectors/sts/StsHitfinderChain.h index 37fcd4277bcad92e57d33a3e11ddaddc8063c336..bb38e93f66288ad9d09eb68b7990b92db8df99a8 100644 --- a/algo/detectors/sts/StsHitfinderChain.h +++ b/algo/detectors/sts/StsHitfinderChain.h @@ -14,46 +14,16 @@ #include <optional> #include <vector> -#include <xpu/host.h> - #include "StsHitfinder.h" +#include "SubChain.h" +#include "gpu/xpu_legacy.h" +#include "sts/HitfinderPars.h" +#include "sts/LandauTable.h" -namespace cbm::algo +namespace cbm::algo::sts { - struct StsModuleTransformationMatrix { - // Rotation + translation matrix to transform - // local module coordinates into global coordinate system. - // No need for fancy math types here. These values are just copied - // and moved to the GPU. - std::array<float, 9> rotation; // 3x3 matrix - std::array<float, 3> translation; - }; - - struct StsModulePar { - int32_t address; - float dY; - float pitch; - float stereoF; - float stereoB; - float lorentzF; - float lorentzB; - StsModuleTransformationMatrix localToGlobal; - }; - - struct StsLandauTable { - std::vector<float> values; - float stepSize; - }; - - struct StsHitfinderPar { - StsAsicPar asic; // Asic definitions. Currently assumes same parameters for all asics. - int nChannels; // Total number of channels per module. Hitfinder assumes nChannels / 2 channels per side. - std::vector<StsModulePar> modules; - StsLandauTable landauTable; - }; - - struct StsHitfinderTimes { + struct HitfinderTimes { double timeIO = 0; double timeSortDigi = 0; double timeCluster = 0; @@ -67,22 +37,14 @@ namespace cbm::algo * handles memory transfer for input / output data and conversion to the * regular CBM STS types. */ - class StsHitfinderChain { + class HitfinderChain : public SubChain { public: - void SetParameters(const StsHitfinderPar& parameters); - const StsHitfinderPar& GetParameters() const { return *fPars; } + void SetParameters(const sts::HitfinderPars& parameters); + const sts::HitfinderPars& GetParameters() const { return *fPars; } void operator()(gsl::span<const CbmStsDigi>); - StsHitfinderTimes GetHitfinderTimes() const { return fHitfinderTimes; } - - /** - * Returns access to (host copies of) the raw buffers used by the gpu hitfinder. - * This is currently the only way to access the produced hits and cluster. - */ - const StsHitfinderHost& GetHitfinderBuffers() const { return fHitfinderHost; } - private: // Shorthands to map module-address onto digis in that module using DigiMapSide = std::map<int, std::vector<CbmStsDigi>>; @@ -96,8 +58,8 @@ namespace cbm::algo * TODO: These values might be wildly off. Look for better estimations. * TODO: Should be configurable at runtime. */ - static constexpr float kClustersPerDigiFactor = 0.5f; - static constexpr float kHitsPerClustersFactor = 4.f; + static constexpr float kClustersPerDigiFactor = 1.f; + static constexpr float kHitsPerClustersFactor = 1.5f; /** * Ensure parameters were set. Raises log(fatal) otherwise. @@ -127,21 +89,24 @@ namespace cbm::algo void FlattenDigis(DigiMap& digiMap); void FlattenDigisSide(DigiMapSide& digis, bool isFront); - void CollectTimingInfo(); - /** * Transfer Hits / Clusters back to host and convert to common CBM types. */ - void AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>&); - void AppendHitsModule(int module, std::vector<StsGpuHit>&); + void AppendClustersModule(int module, bool isFront, std::vector<sts::Cluster>&); + void AppendHitsModule(int module, std::vector<sts::Hit>&); + + // Debug functions, ensure reco produces sane results + void EnsureDigiOffsets(DigiMap&); + void EnsureDigisSorted(); + void EnsureChannelOffsets(span<u32>); + void EnsureClustersSane(span<GpuClusterIdx>, span<int>); + void EnsureClustersSorted(); - std::optional<const StsHitfinderPar> fPars; - StsHitfinderTimes fHitfinderTimes; + std::optional<const sts::HitfinderPars> fPars; - StsHitfinderHost fHitfinderHost; - StsHitfinderGpu fHitfinderGpu; + Hitfinder fHitfinder; }; -} // namespace cbm::algo +} // namespace cbm::algo::sts #endif // #ifndef CBM_ALGO_STS_HITFINDER_CHAIN_H diff --git a/algo/params/LandauWidthTable.txt b/algo/params/LandauWidthTable.txt new file mode 120000 index 0000000000000000000000000000000000000000..66b518d868f39fcf70c482274ee247aefe58489f --- /dev/null +++ b/algo/params/LandauWidthTable.txt @@ -0,0 +1 @@ +../../parameters/sts/LandauWidthTable.txt \ No newline at end of file diff --git a/algo/params/StsHitfinder.yaml b/algo/params/StsHitfinder.yaml new file mode 100644 index 0000000000000000000000000000000000000000..184f9d4e69c3bef05f2904b319a0f96ab369342f --- /dev/null +++ b/algo/params/StsHitfinder.yaml @@ -0,0 +1,122 @@ +--- +asic: + nAdc: 32 + dynamicRange: 75000 + threshold: 3000 + timeResolution: 5 + deadTime: 800 + noise: 1000 + zeroNoiseRate: 0.00397890015 +nChannels: 2048 +modules: + - address: 0x10008002 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1] + translation: [-2.97959995, 2.86999989, 27.2849998] + - address: 0x10018002 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1] + translation: [-2.97959995, -2.86999989, 27.4850006] + - address: 0x10008402 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [2.97959995, 2.86999989, 28.7150002] + - address: 0x10018402 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [2.97959995, -2.86999989, 28.5149994] + - address: 0x10008012 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [-5.95919991, 5.96999979, 42.9150009] + - address: 0x10018012 + dY: 12.1599998 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [-5.95919991, -2.86999989, 42.7150002] + - address: 0x10008412 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1] + translation: [7.31114159e-16, 5.96999979, 41.0849991] + - address: 0x10018412 + dY: 12.1599998 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [-1, -1.22464685e-16, 0, 1.22464685e-16, -1, 0, 0, 0, 1] + translation: [-3.51473622e-16, -2.86999989, 41.2849998] + - address: 0x10008812 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [5.95919991, 5.73999977, 42.9150009] + - address: 0x10018812 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [5.95919991, 1.33226763e-15, 42.7150002] + - address: 0x10028812 + dY: 5.96000004 + pitch: 0.00579999993 + stereoF: 0 + stereoB: 7.5 + lorentzF: 0 + lorentzB: 0 + localToGlobal: + rotation: [1, 1.22464685e-16, 1.22464685e-16, 1.22464685e-16, -1, 0, 1.22464685e-16, 1.49975976e-32, -1] + translation: [5.95919991, -5.73999977, 42.5149994] +...