diff --git a/algo/CMakeLists.txt b/algo/CMakeLists.txt index 48571713f093dab8540f17068f2089c9563d6ceb..f4f946b5c994506c32994e72699150c1f04b4110 100644 --- a/algo/CMakeLists.txt +++ b/algo/CMakeLists.txt @@ -4,14 +4,15 @@ add_subdirectory(test) set(DEVICE_SRCS base/gpu/DeviceImage.cxx detectors/sts/UnpackStsXpu.cxx + detectors/sts/StsHitfinder.cxx ) -# Create a library libCbmAlgo set(SRCS ${DEVICE_SRCS} evbuild/EventBuilder.cxx trigger/TimeClusterTrigger.cxx evselector/DigiEventSelector.cxx + detectors/sts/StsHitfinderChain.cxx detectors/sts/StsReadoutConfig.cxx detectors/sts/UnpackSts.cxx detectors/much/MuchReadoutConfig.cxx @@ -40,7 +41,14 @@ target_include_directories(Algo ${CMAKE_CURRENT_SOURCE_DIR}/detectors/trd ) -target_link_libraries(Algo PUBLIC OnlineData ROOT::GenVector INTERFACE FairLogger::FairLogger external::fles_ipc xpu) +target_link_libraries(Algo + PUBLIC OnlineData + ROOT::GenVector + GSL + INTERFACE FairLogger::FairLogger + external::fles_ipc + xpu +) target_compile_definitions(Algo PUBLIC NO_ROOT) xpu_attach(Algo ${DEVICE_SRCS}) @@ -49,8 +57,11 @@ install(DIRECTORY base/gpu TYPE INCLUDE FILES_MATCHING PATTERN "*.h" ) -install(FILES ca/simd/CaSimd.h - ca/simd/CaSimdVc.h - ca/simd/CaSimdPseudo.h +install( + FILES ca/simd/CaSimd.h + ca/simd/CaSimdVc.h + ca/simd/CaSimdPseudo.h + detectors/sts/StsHitfinder.h + detectors/sts/StsHitfinderChain.h DESTINATION include/ ) diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx b/algo/detectors/sts/StsHitfinder.cxx similarity index 79% rename from reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx rename to algo/detectors/sts/StsHitfinder.cxx index c02877d377ddaaac4d7b718ef1e58bb34f541ebf..23422af0b8cc15d933d24d38a1bc9523a3ada0a4 100644 --- a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx +++ b/algo/detectors/sts/StsHitfinder.cxx @@ -1,19 +1,61 @@ -/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main +/* Copyright (C) 2022 FIAS Frankfurt Institute 
for Advanced Studies, Frankfurt / Main SPDX-License-Identifier: GPL-3.0-only Authors: Felix Weiglhofer [committer], Kilian Hunold */ -#include "CbmStsGpuHitFinder.h" -using namespace experimental; +#include "StsHitfinder.h" + +using namespace cbm::algo; + + +XPU_IMAGE(GpuOnline); + +XPU_CONSTANT(TheStsHitfinder); + +XPU_KERNEL(SortDigis, StsSortDigiSmem) +{ + xpu::cmem<TheStsHitfinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x()); +} + +XPU_KERNEL(FindClustersSingleStep, xpu::no_smem) +{ + xpu::cmem<TheStsHitfinder>().FindClustersSingleStep(xpu::block_idx::x()); +} + +XPU_KERNEL(CalculateOffsets, xpu::no_smem) +{ + xpu::cmem<TheStsHitfinder>().CalculateOffsetsParallel(xpu::block_idx::x()); +} + +XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindClustersParallel(xpu::block_idx::x()); } + +XPU_KERNEL(FindClustersBasic, xpu::no_smem) +{ + xpu::cmem<TheStsHitfinder>().FindClusterConnectionsBasic(xpu::block_idx::x()); +} + +XPU_KERNEL(CalculateClusters, xpu::no_smem) +{ + xpu::cmem<TheStsHitfinder>().CalculateClustersParallel(xpu::block_idx::x()); +} + +XPU_KERNEL(CalculateClustersBasic, xpu::no_smem) +{ + xpu::cmem<TheStsHitfinder>().CalculateClustersBasic(xpu::block_idx::x()); +} + +XPU_KERNEL(SortClusters, StsSortClusterSmem) { xpu::cmem<TheStsHitfinder>().SortClusters(smem, xpu::block_idx::x()); } + +XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<TheStsHitfinder>().FindHits(xpu::block_idx::x()); } /** - * CbmStsGpuHitFinder::sortDigisKhun + * StsHitfinderGpu::sortDigisKhun * Sorts digis channelwise. Inside a channel all digis are sorted in time. 
* * @param smem is the shared memory it is worked in * @param iBlock is the Block/Module that is currenty running * */ -XPU_D void CbmStsGpuHitFinder::SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const +XPU_D void StsHitfinderGpu::SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const { int iModule = iBlock; CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]]; @@ -29,7 +71,7 @@ XPU_D void CbmStsGpuHitFinder::SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, if (xpu::thread_idx::x() == 0) { digisSortedPerModule[iModule] = digis; } } -XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const +XPU_D void StsHitfinderGpu::FindClustersSingleStep(int iBlock) const { CalculateOffsetsParallel(iBlock); FindClustersParallel(iBlock); @@ -37,7 +79,7 @@ XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const } /** - * CbmStsGpuHitFinder::calculateChannelOffsets + * StsHitfinderGpu::calculateChannelOffsets * Calculates the Offsest of every channel. digis-Array is sorted in Channel and Time. * If a channelChange is detected it is stored in channelOffsetsPerModule. * @@ -46,8 +88,8 @@ XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const * @param nDigis Amount of digis in digis-Array * */ -XPU_D void CbmStsGpuHitFinder::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, - unsigned int const nDigis) const +XPU_D void StsHitfinderGpu::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, + unsigned int const nDigis) const { channelOffsets[0] = 0; @@ -71,14 +113,14 @@ XPU_D void CbmStsGpuHitFinder::CalculateChannelOffsets(CbmStsDigi* digis, unsign /** - * CbmStsGpuHitFinder::calculateOffsetsParallel + * StsHitfinderGpu::calculateOffsetsParallel * This function calculates the channeloffsets in a certain module. * An Offset is the Startingpoint of a Channel in a sorted array of Digis. 
* * @param iBlock is the Block/Module that is currently worked on * */ -XPU_D void CbmStsGpuHitFinder::CalculateOffsetsParallel(int iBlock) const +XPU_D void StsHitfinderGpu::CalculateOffsetsParallel(int iBlock) const { int const iModule = iBlock; CbmStsDigi* digis = digisSortedPerModule[iModule]; @@ -92,14 +134,14 @@ XPU_D void CbmStsGpuHitFinder::CalculateOffsetsParallel(int iBlock) const } /** - * CbmStsGpuHitFinder::findClustersParallel + * StsHitfinderGpu::findClustersParallel * This function finds clusters form an sts-Digi Array inserted. * It runs the finding process highly parallel. * * @param iBlock is the Block/Module that is currently worked on * */ -XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const +XPU_D void StsHitfinderGpu::FindClustersParallel(int iBlock) const { int const iModule = iBlock; CbmStsDigi* digis = digisSortedPerModule[iModule]; @@ -108,7 +150,7 @@ XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const if (nDigis == 0) return; - auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]]; + auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; // findClusterConnectionsChannelWise(digis, digiConnector, channelOffsets, iModule, threadId); @@ -117,14 +159,14 @@ XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const /** - * CbmStsGpuHitFinder::calculateClustersParallel + * StsHitfinderGpu::calculateClustersParallel * This function calculates clusters form an sts-Digi Array inserted. * It runs the calculating process highly parallel. 
* * @param iBlock is the Block/Module that is currently worked on * */ -XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const +XPU_D void StsHitfinderGpu::CalculateClustersParallel(int iBlock) const { int const iModule = iBlock; CbmStsDigi* digis = digisSortedPerModule[iModule]; @@ -133,7 +175,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const if (nDigis == 0) { return; } - auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]]; + auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis); @@ -142,7 +184,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const /** - * CbmStsGpuHitFinder::findClusterConnectionsBasic + * StsHitfinderGpu::findClusterConnectionsBasic * * Each Thread one Channel * @@ -159,13 +201,13 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const * @param threadId Id of the thrad currently working * **/ -XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const +XPU_D void StsHitfinderGpu::FindClusterConnectionsBasic(int iBlock) const { int iModule = iBlock; int threadId = xpu::thread_idx::x(); auto nDigis = GetNDigis(iModule); CbmStsDigi* digis = digisSortedPerModule[iModule]; - auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]]; + auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels]; if (threadId != 0) return; @@ -209,7 +251,7 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const /** - * CbmStsGpuHitFinder::findClusterConnectionsChannelWise + * StsHitfinderGpu::findClusterConnectionsChannelWise * * Each Thread one Channel * @@ -228,9 +270,9 @@ XPU_D void 
CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const * @param threadId Id of the thrad currently working * **/ -XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - unsigned int* channelOffsets, int const iModule, - unsigned int const threadId) const +XPU_D void StsHitfinderGpu::FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + unsigned int* channelOffsets, int const iModule, + unsigned int const threadId) const { // Expl: here we need to be smaller than nChannels because the last channel is the most Right. for (unsigned int channel = threadId; channel < (unsigned int) (nChannels - 1); channel += xpu::block_dim::x()) { @@ -271,7 +313,7 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* dig } /** - * CbmStsGpuHitFinder::findClusterConnectionsDigiWise + * StsHitfinderGpu::findClusterConnectionsDigiWise * * Each thread one Digi * @@ -292,10 +334,9 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* dig * @param nDigis Amount of digis in digis-Array * **/ -XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - unsigned int* channelOffsets, int const iModule, - unsigned int const threadId, - unsigned int const nDigis) const +XPU_D void StsHitfinderGpu::FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + unsigned int* channelOffsets, int const iModule, + unsigned int const threadId, unsigned int const nDigis) const { for (unsigned int currIter = threadId; currIter < nDigis; currIter += xpu::block_dim::x()) { @@ -324,7 +365,7 @@ XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis, /** - * CbmStsGpuHitFinder::calculateClustersBasic + * StsHitfinderGpu::calculateClustersBasic * * One Thread all Digis * @@ -344,12 +385,12 @@ XPU_D void 
CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis, * @param threadId Id of the thrad currently working * **/ -XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const +XPU_D void StsHitfinderGpu::CalculateClustersBasic(int iBlock) const { int iModule = iBlock; int threadId = xpu::thread_idx::x(); CbmStsDigi* digis = digisSortedPerModule[iModule]; - auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]]; + auto* digiConnector = &digiConnectorsPerModule[digiOffsetPerModule[iModule]]; if (threadId != 0) return; @@ -372,7 +413,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const /** - * CbmStsGpuHitFinder::calculateClustersChannelWise + * StsHitfinderGpu::calculateClustersChannelWise * * Each Thread one Channel * @@ -392,10 +433,9 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const * @param threadId Id of the thrad currently working * **/ -XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - unsigned int* channelOffsets, int const iModule, - unsigned int const threadId, - unsigned int const nDigis) const +XPU_D void StsHitfinderGpu::CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + unsigned int* channelOffsets, int const iModule, + unsigned int const threadId, unsigned int const nDigis) const { for (unsigned int channel = threadId; channel < (unsigned int) nChannels; @@ -428,7 +468,7 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, C /** - * CbmStsGpuHitFinder::calculateClustersDigiWise + * StsHitfinderGpu::calculateClustersDigiWise * * Each Thread one Channel * @@ -448,9 +488,9 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, C * @param threadId Id of the thrad currently working * **/ -XPU_D void CbmStsGpuHitFinder::CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - int const 
iModule, unsigned int const threadId, - unsigned int const nDigis) const +XPU_D void StsHitfinderGpu::CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + int const iModule, unsigned int const threadId, + unsigned int const nDigis) const { for (unsigned int currIter = threadId; currIter < nDigis; currIter += (unsigned int) xpu::block_dim::x()) { @@ -473,19 +513,20 @@ XPU_D void CbmStsGpuHitFinder::CalculateClustersDigiWise(CbmStsDigi* digis, CbmS } -XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const +XPU_D void StsHitfinderGpu::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const { const CbmStsDigi& digi = digis[digiIndex]; - CbmStsTimeNs time = digi.GetTimeU32(); + uint32_t time = digi.GetTimeU32(); const float timeError = asic.timeResolution; - CbmStsClusterData cluster { + StsGpuCluster cluster { .fCharge = asic.AdcToCharge(digi.GetChargeU16()), .fSize = 1, .fPosition = float(digi.GetChannel()) + (IsBackside(iModule) ? 
nChannels : 0.f), .fPositionError = 1.f / xpu::sqrt(24.f), + .fTime = time, .fTimeError = timeError, }; @@ -494,9 +535,8 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors1(int const iModule, C AddCluster(iModule, time, cluster); } -XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, - CbmStsDigiConnector* digiConnector, - int const digiIndex) const +XPU_D void StsHitfinderGpu::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, + StsDigiConnector* digiConnector, int const digiIndex) const { const CbmStsDigi& digi1 = digis[digiIndex]; @@ -504,7 +544,7 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C float eNoiseSq = asic.noise * asic.noise; - float chargePerAdc = asic.dynRange / float(asic.nAdc); + float chargePerAdc = asic.dynamicRange / float(asic.nAdc); float eDigitSq = chargePerAdc * chargePerAdc / 12.f; float x1 = float(digi1.GetChannel()); @@ -515,10 +555,10 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C if (digi1.GetChannel() > digi2.GetChannel()) { x1 -= float(nChannels); } // Uncertainties of the charge measurements - XErrorT width1 = LandauWidth(q1); - XErrorT eq1sq = width1 * width1 + eNoiseSq + eDigitSq; - XErrorT width2 = LandauWidth(q2); - XErrorT eq2sq = width2 * width2 + eNoiseSq + eDigitSq; + float width1 = LandauWidth(q1); + float eq1sq = width1 * width1 + eNoiseSq + eDigitSq; + float width2 = LandauWidth(q2); + float eq2sq = width2 * width2 + eNoiseSq + eDigitSq; // Cluster time float time = 0.5f * (digi1.GetTimeU32() + digi2.GetTimeU32()); @@ -531,9 +571,9 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C if (x < -0.5f) { x += float(nChannels); } // Uncertainty on cluster position. See software note. 
- XErrorT ex0sq = 0.f; - XErrorT ex1sq = 0.f; - XErrorT ex2sq = 0.f; + float ex0sq = 0.f; + float ex1sq = 0.f; + float ex2sq = 0.f; if (q1 < q2) { ex0sq = (q2 - q1) * (q2 - q1) / q2 / q2 / 72.f; ex1sq = eq1sq / q2 / q2 / 9.f; @@ -544,18 +584,19 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C ex1sq = eq1sq * q2 * q2 / q1 / q1 / q1 / q1 / 9.f; ex2sq = eq2sq / q1 / q1 / 9.f; } - XErrorT xError = xpu::sqrt(ex0sq + ex1sq + ex2sq); + float xError = xpu::sqrt(ex0sq + ex1sq + ex2sq); // Cluster charge float charge = q1 + q2; if (IsBackside(iModule)) x += nChannels; - CbmStsClusterData cls { + StsGpuCluster cls { .fCharge = charge, .fSize = 2, .fPosition = x, .fPositionError = xError, + .fTime = uint32_t(time), .fTimeError = timeError, }; @@ -564,16 +605,16 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, C } -XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis, - CbmStsDigiConnector* digiConnector, int digiIndex) const +XPU_D void StsHitfinderGpu::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis, + StsDigiConnector* digiConnector, int digiIndex) const { - CbmStsClusterCalculationProperties cProps; + StsClusterCalculationProperties cProps; //This Lambda calculates all charges for a single digi inside of a cluster auto calculateClusterCharges = [this, &cProps, &digis, &digiConnector](int index) { CbmStsDigi digi = digis[index]; float eNoiseSq = asic.noise * asic.noise; - float chargePerAdc = asic.dynRange / float(asic.nAdc); + float chargePerAdc = asic.dynamicRange / float(asic.nAdc); float eDigitSq = chargePerAdc * chargePerAdc / 12.f; cProps.tResolSum += asic.timeResolution; cProps.tSum += digi.GetTimeU32(); @@ -642,11 +683,12 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsD if (IsBackside(iModule)) { x += nChannels; } - CbmStsClusterData cls { + StsGpuCluster cls { .fCharge = qSum, .fSize = int(nDigis), .fPosition = x, 
.fPositionError = xError, + .fTime = uint32_t(cProps.tSum), .fTimeError = timeError, }; @@ -654,37 +696,37 @@ XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsD AddCluster(iModule, cProps.tSum, cls); } -XPU_D void CbmStsGpuHitFinder::SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const +XPU_D void StsHitfinderGpu::SortClusters(StsSortClusterSmem& smem, int iBlock) const { int iModule = iBlock; size_t offset = iModule * maxClustersPerModule; - CbmStsClusterIdx* clusterIdx = &clusterIdxPerModule[offset]; - CbmStsClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset]; + StsGpuClusterIdx* clusterIdx = &clusterIdxPerModule[offset]; + StsGpuClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset]; int nClusters = nClustersPerModule[iModule]; SortClustersT clsSort(smem.sortBuf); - clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const CbmStsClusterIdx a) { return a.fTime; }); + clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const StsGpuClusterIdx a) { return a.fTime; }); if (xpu::thread_idx::x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx; } -XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const +XPU_D void StsHitfinderGpu::FindHits(int iBlock) const { - int iModuleF = iBlock; - int iModuleB = iBlock + nModules; - size_t offsetF = iModuleF * maxClustersPerModule; - size_t offsetB = iModuleB * maxClustersPerModule; - CbmStsClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF]; - CbmStsClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB]; - CbmStsClusterData* clusterDataF = &clusterDataPerModule[offsetF]; - CbmStsClusterData* clusterDataB = &clusterDataPerModule[offsetB]; - int nClustersF = nClustersPerModule[iModuleF]; - int nClustersB = nClustersPerModule[iModuleB]; - - - CbmStsHitFinderParams pars; + int iModuleF = iBlock; + int iModuleB = iBlock + nModules; + size_t offsetF = iModuleF * maxClustersPerModule; + size_t offsetB = iModuleB * 
maxClustersPerModule; + StsGpuClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF]; + StsGpuClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB]; + StsGpuCluster* clusterDataF = &clusterDataPerModule[offsetF]; + StsGpuCluster* clusterDataB = &clusterDataPerModule[offsetB]; + int nClustersF = nClustersPerModule[iModuleF]; + int nClustersB = nClustersPerModule[iModuleB]; + + + StsHitfinderCache pars; { - CbmStsHitFinderConfig cfg = hitfinderConfig[iBlock]; + StsSensorPar cfg = sensorPars[iBlock]; pars.dY = cfg.dY; pars.pitch = cfg.pitch; @@ -705,14 +747,14 @@ XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const int startB = 0; for (int iClusterF = xpu::thread_idx::x(); iClusterF < nClustersF; iClusterF += xpu::block_dim::x()) { - CbmStsClusterIdx clsIdxF = clusterIdxF[iClusterF]; - CbmStsClusterData clsDataF = clusterDataF[clsIdxF.fIdx]; + StsGpuClusterIdx clsIdxF = clusterIdxF[iClusterF]; + StsGpuCluster clsDataF = clusterDataF[clsIdxF.fIdx]; float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrF * maxTerrF); for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) { - CbmStsClusterIdx clsIdxB = clusterIdxB[iClusterB]; - CbmStsClusterData clsDataB = clusterDataB[clsIdxB.fIdx]; + StsGpuClusterIdx clsIdxB = clusterIdxB[iClusterB]; + StsGpuCluster clsDataB = clusterDataB[clsIdxB.fIdx]; float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime); @@ -742,9 +784,9 @@ XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const } } -XPU_D void CbmStsGpuHitFinder::IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF, - const CbmStsClusterData& clsF, CbmStsTimeNs timeB, - const CbmStsClusterData& clsB) const +XPU_D void StsHitfinderGpu::IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF, + const StsGpuCluster& clsF, uint32_t timeB, + const StsGpuCluster& clsB) const { // --- Calculate cluster centre position on readout edge @@ -801,7 +843,7 @@ XPU_D void 
CbmStsGpuHitFinder::IntersectClusters(int iBlock, const CbmStsHitFind } } -XPU_D float CbmStsGpuHitFinder::GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const +XPU_D float StsHitfinderGpu::GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const { // Take integer channel @@ -827,8 +869,8 @@ XPU_D float CbmStsGpuHitFinder::GetClusterPosition(const CbmStsHitFinderParams& return xCluster; } -XPU_D bool CbmStsGpuHitFinder::Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB, - float& x, float& y, float& varX, float& varY, float& varXY) const +XPU_D bool StsHitfinderGpu::Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, + float& y, float& varX, float& varY, float& varXY) const { // In the coordinate system with origin at the bottom left corner, @@ -878,7 +920,7 @@ XPU_D bool CbmStsGpuHitFinder::Intersect(const CbmStsHitFinderParams& pars, floa return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f); } -XPU_D bool CbmStsGpuHitFinder::IsInside(const CbmStsHitFinderParams& pars, float x, float y) const +XPU_D bool StsHitfinderGpu::IsInside(const StsHitfinderCache& pars, float x, float y) const { // clang-format off return x >= -pars.dX / 2.f @@ -888,9 +930,9 @@ XPU_D bool CbmStsGpuHitFinder::IsInside(const CbmStsHitFinderParams& pars, float // clang-format on } -XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY, - CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB, - const CbmStsClusterData& clsB, float du, float dv) const +XPU_D void StsHitfinderGpu::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY, + uint32_t timeF, const StsGpuCluster& clsF, uint32_t timeB, + const StsGpuCluster& clsB, float du, float dv) const { // --- Transform into global coordinate system float globalX, globalY, globalZ; @@ -909,11 
+951,11 @@ XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal float etB = clsB.fTimeError; float hitTimeError = 0.5f * xpu::sqrt(etF * etF + etB * etB); - CbmStsHit hit { + StsGpuHit hit { .fX = globalX, .fY = globalY, .fZ = globalZ, - .fTime = static_cast<CbmStsTimeNs>(hitTime), + .fTime = static_cast<uint32_t>(hitTime), .fDx = errX, .fDy = errY, .fDz = errZ, @@ -931,22 +973,21 @@ XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal } -XPU_D XErrorT CbmStsGpuHitFinder::LandauWidth(float charge) const +XPU_D float StsHitfinderGpu::LandauWidth(float charge) const { if (charge <= landauStepSize) return landauTable[0]; if (charge > landauStepSize * (landauTableSize - 1)) return landauTable[landauTableSize - 1]; int tableIdx = xpu::ceil(charge / landauStepSize); - XErrorT e2 = tableIdx * landauStepSize; - XErrorT v2 = landauTable[tableIdx]; + float e2 = tableIdx * landauStepSize; + float v2 = landauTable[tableIdx]; tableIdx--; - XErrorT e1 = tableIdx * landauStepSize; - XErrorT v1 = landauTable[tableIdx]; + float e1 = tableIdx * landauStepSize; + float v1 = landauTable[tableIdx]; return v1 + (charge - e1) * (v2 - v1) / (e2 - e1); } -XPU_D void CbmStsGpuHitFinder::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, - float& gz) const +XPU_D void StsHitfinderGpu::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const { const float* tr = &localToGlobalTranslationByModule[iModule * 3]; const float* rot = &localToGlobalRotationByModule[iModule * 9]; diff --git a/algo/detectors/sts/StsHitfinder.h b/algo/detectors/sts/StsHitfinder.h new file mode 100644 index 0000000000000000000000000000000000000000..656f12a15db471c3a4770c002cdb8f01b8f721dc --- /dev/null +++ b/algo/detectors/sts/StsHitfinder.h @@ -0,0 +1,420 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix 
Weiglhofer [committer], Kilian Hunold */ + +#ifndef CBM_ALGO_STS_HITFINDER_H +#define CBM_ALGO_STS_HITFINDER_H + +#include "CbmStsDigi.h" + +#include <xpu/device.h> + +namespace cbm::algo +{ + struct GpuOnline { + }; // xpu image identifier, has to be moved when more steps are moved to GPU + + // Block sizes / other compile time constants that need tuning + enum GpuConstants + { +#if XPU_IS_CUDA + kSortDigisBlockSize = 32, + kSortDigisItemsPerThread = 32, + kSortClustersBlockSize = 32, + kSortClustersItemsPerThread = 32, + kFindClusterBlockSize = 32, + kFindHitsBlockSize = 32, +#else // HIP, values ignored on CPU + kSortDigisBlockSize = 256, + kSortDigisItemsPerThread = 6, + kSortClustersBlockSize = 256, + kSortClustersItemsPerThread = 6, + kFindClusterBlockSize = 256, + kFindHitsBlockSize = 256, +#endif + }; + + // Kernel declarations + XPU_EXPORT_KERNEL(GpuOnline, SortDigis); + // Combine substeps for finding clusters into a single kernel + XPU_EXPORT_KERNEL(GpuOnline, FindClustersSingleStep); + XPU_EXPORT_KERNEL(GpuOnline, CalculateOffsets); + XPU_EXPORT_KERNEL(GpuOnline, FindClusters); + XPU_EXPORT_KERNEL(GpuOnline, FindClustersBasic); + XPU_EXPORT_KERNEL(GpuOnline, CalculateClusters); + XPU_EXPORT_KERNEL(GpuOnline, CalculateClustersBasic); + XPU_EXPORT_KERNEL(GpuOnline, SortClusters); + XPU_EXPORT_KERNEL(GpuOnline, FindHits); + +} // namespace cbm::algo + +// Set block sizes. Currently this needs to be done in toplevel namespace. 
+XPU_BLOCK_SIZE_1D(cbm::algo::SortClusters, cbm::algo::kSortClustersBlockSize); +XPU_BLOCK_SIZE_1D(cbm::algo::FindClustersSingleStep, cbm::algo::kFindClusterBlockSize); +XPU_BLOCK_SIZE_1D(cbm::algo::CalculateOffsets, cbm::algo::kFindClusterBlockSize); +XPU_BLOCK_SIZE_1D(cbm::algo::FindClusters, cbm::algo::kFindClusterBlockSize); +XPU_BLOCK_SIZE_1D(cbm::algo::CalculateClusters, cbm::algo::kFindClusterBlockSize); +XPU_BLOCK_SIZE_1D(cbm::algo::SortDigis, cbm::algo::kSortClustersBlockSize); +XPU_BLOCK_SIZE_1D(cbm::algo::FindHits, cbm::algo::kFindClusterBlockSize); + +namespace cbm::algo +{ + // GPU Data structures + // TODO: Replace with regular StsCluster / StsHit once the changes + // land to make them fit for GPU + struct StsGpuClusterIdx { + uint32_t fTime; ///< Cluster time (average of digi times) [ns] + int fIdx; + }; + + struct StsGpuCluster { + float fCharge; ///< Total charge + int fSize; ///< Difference between first and last channel + float fPosition; ///< Cluster centre in channel number units + float fPositionError; ///< Cluster centre error (r.m.s.) 
in channel number units + uint32_t fTime; ///< cluster time [ns] + float fTimeError; ///< Error of cluster time [ns] + }; + + struct StsGpuHit { + float fX, fY; ///< X, Y positions of hit [cm] + float fZ; ///< Z position of hit [cm] + uint32_t fTime; ///< Hit time [ns] + float fDx, fDy; ///< X, Y errors [cm] + float fDz; ///< Z position error [cm] + float fDxy; ///< XY correlation + float fTimeError; ///< Error of hit time [ns] + float fDu; ///< Error of coordinate across front-side strips [cm] + float fDv; ///< Error of coordinate across back-side strips [cm] + }; + + // Calibration data structures + + struct StsAsicPar { + int nAdc; + float dynamicRange; + float threshold; + float timeResolution; + float deadTime; + float noise; + float zeroNoiseRate; + + XPU_D float AdcToCharge(unsigned short adc) const + { + return threshold + dynamicRange / float(nAdc) * (float(adc) + 0.5f); + } + }; + + struct StsSensorPar { + float dY; + float pitch; + float stereoF; + float stereoB; + float lorentzF; + float lorentzB; + }; + + // Cache for various parameters of the hitfindiner + // Used internally to avoid computing values multiple times + // TODO: is this really needed? Overhead from additional computations should be miniscule. + // TODO: Also store in shared memory, not registers. Values identical for all threads. 
+ struct StsHitfinderCache : StsSensorPar { + float tanStereoF; + float tanStereoB; + float errorFac; + float dX; + }; + + struct StsClusterCalculationProperties { + float tSum = 0.; // sum of digi times + int chanF = 9999999; // first channel in cluster + int chanL = -1; // last channel in cluster + float qF = 0.f; // charge in first channel + float qM = 0.f; // sum of charges in middle channels + float qL = 0.f; // charge in last cluster + float eqFsq = 0.f; // uncertainty of qF + float eqMsq = 0.f; // uncertainty of qMid + float eqLsq = 0.f; // uncertainty of qL + float tResolSum = 0.f; + + float xSum = 0.f; // weighted charge sum, used to correct corrupt clusters + }; + + class StsDigiConnector { + private: + unsigned int hasPreviousAndNext; + + XPU_D unsigned int UnpackNext(unsigned int val) const { return val & ~(1u << 31); } + + public: + XPU_D bool HasPrevious() const { return (hasPreviousAndNext >> 31) & 1u; } + + XPU_D void SetHasPrevious(bool hasPrevious) + { + unsigned int old = hasPreviousAndNext; + unsigned int hasPreviousI = ((unsigned int) hasPrevious) << 31; + unsigned int assumed; + + do { + assumed = old; + old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI); + } while (old != assumed); + } + + XPU_D unsigned int next() const { return UnpackNext(hasPreviousAndNext); } + + XPU_D void SetNext(unsigned int next) + { + unsigned int old = hasPreviousAndNext; + unsigned int assumed; + + next = xpu::min((1u << 31) - 1, next); + + do { + assumed = old; + old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next); + } while (old != assumed); + } + + XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; } + }; + + // Shared Memory defintions + using SortDigisT = + xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis>::value.x, kSortDigisItemsPerThread>; + + struct StsSortDigiSmem { + typename SortDigisT::storage_t sortBuf; + }; + + using 
SortClustersT = + xpu::block_sort<uint32_t, StsGpuClusterIdx, xpu::block_size<SortClusters>::value.x, kSortClustersItemsPerThread>; + + struct StsSortClusterSmem { + typename SortClustersT::storage_t sortBuf; + }; + + // Hitfinder buffers + template<xpu::side S> + class StsHitfinderBase { + + public: + // calibration / configuration data + // Only set once + + int nModules; // Number of modules + int nChannels; // Number of channels per module + + // Time cuts for Digis / Clusters + // TODO: currently not used! + float timeCutDigiAbs; + float timeCutDigiSig; + float timeCutClusterAbs; + float timeCutClusterSig; + + StsAsicPar asic; + + int landauTableSize; + float landauStepSize; + // Entries of landauTable. size = landauTableSize + xpu::cmem_device_t<float, S> landauTable; + + // transformation matrix to convert local to global coordinate space + // size = nModules + xpu::cmem_io_t<float, S> localToGlobalTranslationByModule; + xpu::cmem_io_t<float, S> localToGlobalRotationByModule; + + // Sensor Parameters + // size = nModules + xpu::cmem_io_t<StsSensorPar, S> sensorPars; + + // input + // Store all digis in a flat array with a header that contains the offset for every module (front and back) + xpu::cmem_io_t<size_t, S> + digiOffsetPerModule; // size = 2 * nModules + 1 entries, last entry contains total digi count + xpu::cmem_io_t<CbmStsDigi, S> digisPerModule; // Flat array of input digis. size = nDigisTotal + + // Temporary Digi-Array used for sorting as sorting is not in place. size = nDigisTotal + xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp; + + // Pointer to sorted of a module side. Points either to digisPerModule or digisPerModuleTmp + // size = 2 * nModules + xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule; + + // DigiConnectors used internally by StsClusterizer + // Connects digis that belong to the same cluster via linked-list. 
+ // size = nDigisTotal + xpu::cmem_device_t<StsDigiConnector, S> digiConnectorsPerModule; + + // Digis are sorted by channel + within channel by time + // Contains the offset for each channel within each channel + // size = 2 * nModules * nChannels + xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule; + + // intermediate results + size_t maxClustersPerModule; + + // FIXME: Not used anymore? + xpu::cmem_device_t<int, S> channelStatusPerModule; + + // Cluster Index by module. Produced by cluster finder. + // Stored as buckets with a max size of maxClustersPerModule + // Actual number of entries in each bucket is stored in nClustersPerModule + // size = 2 * nModules * maxClustersPerModule + xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModule; + + // Temporary cluster index array used for sorting as sorting is not in place. + // size = 2 * nModules * maxClustersPerModule + xpu::cmem_io_t<StsGpuClusterIdx, S> clusterIdxPerModuleTmp; + + // Pointer to sorted cluster idx for every module side. + // Either points to clusterIdxPerModule or clusterIdxPerModuleTmp. + // size = 2 * nModules + xpu::cmem_io_t<StsGpuClusterIdx*, S> clusterIdxSortedPerModule; + + // Clusters stored by modules. Stored as buckets with a max size of maxClustersPerModule + // Actual number of entries in each bucket is stored in nClustersPerModule + // size = 2 * nModules * maxClustersPerModule + xpu::cmem_io_t<StsGpuCluster, S> clusterDataPerModule; + + // Number of clusters in each module + // size = 2 * nModules + xpu::cmem_io_t<int, S> nClustersPerModule; + + // Max time error of clusters on front- and backside of a module + // size = 1 (???) + // FIXME: size should be 2 * nModules? And only one array! + xpu::cmem_device_t<float, S> maxErrorFront; + xpu::cmem_device_t<float, S> maxErrorBack; + + // output + + // Max number of Hits that can be stored in each module + size_t maxHitsPerModule; + + // Hits sorted by module. Stored in buckets of size maxHitsPerModule. 
+ // Actual number of hits is stored in nHitsPerModule + // size = maxHitsPerModule * nModules + xpu::cmem_io_t<StsGpuHit, S> hitsPerModule; + + // Number of hits in each module + // size = nModules + xpu::cmem_io_t<int, S> nHitsPerModule; + }; + + // Host-side hitfinder. Only holds memory buffers. + using StsHitfinderHost = StsHitfinderBase<xpu::side::host>; + + // Device-side hitfinder. Holds pointers to buffers + algorithm. + class StsHitfinderGpu : public StsHitfinderBase<xpu::side::device> { + + public: + XPU_D void SortDigisInSpaceAndTime(StsSortDigiSmem& smem, int iBlock) const; + XPU_D void FindClustersSingleStep(int iBlock) const; + XPU_D void SortClusters(StsSortClusterSmem& smem, int iBlock) const; + XPU_D void FindHits(int iBlock) const; + + // Individual steps of cluster finder, for more granular time measurement + XPU_D void CalculateOffsetsParallel(int iBlock) const; + XPU_D void FindClustersParallel(int iBlock) const; + XPU_D void CalculateClustersParallel(int iBlock) const; + + XPU_D void FindClusterConnectionsBasic(int iBlock) const; + XPU_D void CalculateClustersBasic(int iBlock) const; + + private: + XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const; + + XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + unsigned int* channelOffsets, int iModule, + unsigned int threadId) const; + XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + unsigned int* channelOffsets, int iModule, unsigned int threadId, + unsigned int nDigis) const; + + XPU_D void CalculateClustersBasic(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule, + unsigned int const threadId) const; + XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, + unsigned int* channelOffsets, int iModule, unsigned int const threadId, + unsigned int const nDigis) const; + XPU_D void 
CalculateClustersDigiWise(CbmStsDigi* digis, StsDigiConnector* digiConnector, int iModule, + unsigned int const threadId, unsigned int const nDigis) const; + + XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const; + XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector, + int const digiIndex) const; + XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, StsDigiConnector* digiConnector, + int const digiIndex) const; + + private: + XPU_D unsigned int GetNDigis(int iModule) const + { + return digiOffsetPerModule[iModule + 1] - digiOffsetPerModule[iModule]; + } + + XPU_D float GetTimeResolution(int /* iModule */, int /* channel */) const { return asic.timeResolution; } + + XPU_D bool IsActive(int* channelStatus, int channel) const + { + // TODO add padding to channels to remove this? + if (channel < 0 || channel >= nChannels) { return false; } + return channelStatus[channel] > -1; + } + + XPU_D int ChanLeft(int channel) const { return channel - 1; } + + XPU_D int ChanRight(int channel) const { return channel + 1; } + + XPU_D int ChanDist(int c1, int c2) const + { + int diff = c2 - c1; + // TODO handle wrap around + return diff; + } + + XPU_D void AddCluster(int iModule, uint32_t time, const StsGpuCluster& cls) const + { + StsGpuClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule]; + StsGpuCluster* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule]; + + int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1); + XPU_ASSERT(size_t(pos) < maxClustersPerModule); + + StsGpuClusterIdx idx {time, pos}; + tgtIdx[idx.fIdx] = idx; + tgtData[idx.fIdx] = cls; + } + + XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; } + + XPU_D float LandauWidth(float charge) const; + + XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const; + + XPU_D void 
IntersectClusters(int iBlock, const StsHitfinderCache& pars, uint32_t timeF, const StsGpuCluster& clsF, + uint32_t timeB, const StsGpuCluster& clsB) const; + XPU_D float GetClusterPosition(const StsHitfinderCache& pars, float centre, bool isFront) const; + XPU_D bool Intersect(const StsHitfinderCache& pars, float xF, float exF, float xB, float exB, float& x, float& y, + float& varX, float& varY, float& varXY) const; + XPU_D bool IsInside(const StsHitfinderCache& pars, float x, float y) const; + XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY, uint32_t timeF, + const StsGpuCluster& clsF, uint32_t timeB, const StsGpuCluster& clsB, float du, + float dv) const; + + XPU_D void SaveMaxError(float errorValue, int iModule) const + { + float* maxError = IsBackside(iModule) ? maxErrorBack : maxErrorFront; + float old {}; + do { + old = *maxError; + if (old >= errorValue) { break; } + } while (!xpu::atomic_cas_block(maxError, *maxError, xpu::max(errorValue, *maxError))); + } + }; + + // StsHitfinder lives in constant memory + XPU_EXPORT_CONSTANT(GpuOnline, StsHitfinderGpu, TheStsHitfinder); + +} // namespace cbm::algo + +#endif // #ifndef CBM_ALGO_STS_HITFINDER_H diff --git a/algo/detectors/sts/StsHitfinderChain.cxx b/algo/detectors/sts/StsHitfinderChain.cxx new file mode 100644 index 0000000000000000000000000000000000000000..9c37e24b5d3d0b58570ea7f2301ad163ee9e9fbf --- /dev/null +++ b/algo/detectors/sts/StsHitfinderChain.cxx @@ -0,0 +1,321 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ + +#include "StsHitfinderChain.h" + +#include "Logger.h" + +using namespace cbm::algo; + +// Shorthands to set data / allocate buffers on GPU +// FIXME: Eventually we'll need a better way to do this. 
Preferably supplied by xpu +#define SET_GPU_CONSTANT(constant, val) fHitfinderHost.constant = fHitfinderGpu.constant = val +#define GPU_ALLOC(field, nEntries) \ + fHitfinderHost.field = decltype(fHitfinderHost.field)(nEntries); \ + fHitfinderGpu.field = fHitfinderHost.field.d() + +void StsHitfinderChain::SetParameters(const StsHitfinderPar& parameters) +{ + fPars.emplace(parameters); + AllocateStatic(); +} + +void StsHitfinderChain::operator()(gsl::span<const CbmStsDigi> digis) +{ + EnsureParameters(); + + // FIXME: this has to be called only once + // and should happen during initialization not in reco loop + setenv("XPU_PROFILE", "1", 1); + xpu::initialize(); + + int nModules = fPars->modules.size(); + int nModuleSides = nModules * 2; + size_t nDigisTotal = digis.size(); + + // Getting the digis on the GPU requires 3 steps + // 1. Sort digis into buckets by module + size_t maxNDigisPerModule; + DigiMap digiMap = SortDigisIntoModules(digis, maxNDigisPerModule); + // 2. Once we know number of digis per module, we can allocate + // the dynamic buffers on the gpu, as the buffer sizes depend on that value + AllocateDynamic(maxNDigisPerModule, nDigisTotal); + // 3. 
Copy digis into flat array with offsets per module + FlattenDigis(digiMap); + + // Clear buffers + // Not all buffers have to initialized, but useful for debugging + + auto& hfc = fHitfinderHost; + // xpu::memset(hfc.digisPerModule, 0); + // xpu::memset(hfc.digisPerModuleTmp, 0); + // xpu::memset(hfc.digisSortedPerModule, 0); + // xpu::memset(hfc.digiOffsetPerModule, 0); + xpu::memset(hfc.digiConnectorsPerModule, 0); + // xpu::memset(hfc.channelOffsetPerModule, 0); + xpu::memset(hfc.channelStatusPerModule, -1); + xpu::memset(hfc.clusterIdxPerModule, 0); + // xpu::memset(hfc.clusterIdxPerModuleTmp, 0); + // xpu::memset(hfc.clusterIdxSortedPerModule, 0); + // xpu::memset(hfc.clusterDataPerModule, 0); + xpu::memset(hfc.nClustersPerModule, 0); + // xpu::memset(hfc.hitsPerModule, 0); + xpu::memset(hfc.nHitsPerModule, 0); + xpu::memset(hfc.maxErrorFront, 0); + xpu::memset(hfc.maxErrorBack, 0); + + xpu::set_constant<TheStsHitfinder>(fHitfinderGpu); + xpu::copy(hfc.digisPerModule, xpu::host_to_device); + xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device); + + xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(nModuleSides)); + xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(nModuleSides)); + // Run cluster finding steps in indivual kernels, useful for debugging / profiling + // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); + xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(nModuleSides)); + xpu::run_kernel<FindHits>(xpu::grid::n_blocks(nModules)); + + CollectTimingInfo(); + + // Transfer Hits / Cluster back to host + // TODO: Hits and Clusters are stored in buckets. 
Currently we transfer the entire + // bucket arrays back to the host. This includes a lot of unused bytes. + // Better to either flatten both arrays on the GPU and transfer flat array back. + // Or do multiple copies to copy contents of individual buckets. + xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host); + xpu::copy(hfc.nClustersPerModule, xpu::device_to_host); + xpu::copy(hfc.hitsPerModule, xpu::device_to_host); + xpu::copy(hfc.nHitsPerModule, xpu::device_to_host); +} + +void StsHitfinderChain::EnsureParameters() +{ + LOG_IF(fatal, fPars == std::nullopt) << "StsHitfinderChain: Parameters not set. Can't continue!"; +} + +void StsHitfinderChain::AllocateStatic() +{ + EnsureParameters(); + + // Shorthands for common constants + const int nChannels = fPars->nChannels / 2; // Only count channels on one side of the module + const int nModules = fPars->modules.size(); + // const int nModuleSides = 2 * nModules; // Number of module front + backsides + + // Set GPU constants + SET_GPU_CONSTANT(nModules, nModules); + SET_GPU_CONSTANT(nChannels, nChannels); + + SET_GPU_CONSTANT(timeCutDigiAbs, -1.f); + SET_GPU_CONSTANT(timeCutDigiSig, 3.f); + SET_GPU_CONSTANT(timeCutClusterAbs, -1.f); + SET_GPU_CONSTANT(timeCutClusterSig, 4.f); + + SET_GPU_CONSTANT(asic, fPars->asic); + + // Copy landau table + size_t nLandauTableEntries = fPars->landauTable.values.size(); + SET_GPU_CONSTANT(landauTableSize, nLandauTableEntries); + SET_GPU_CONSTANT(landauStepSize, fPars->landauTable.stepSize); + GPU_ALLOC(landauTable, nLandauTableEntries); + xpu::copy(fHitfinderGpu.landauTable, fPars->landauTable.values.data(), nLandauTableEntries); + + // Copy transformation matrix + GPU_ALLOC(localToGlobalRotationByModule, 9 * nModules); + GPU_ALLOC(localToGlobalTranslationByModule, 3 * nModules); + + // - Copy matrix into flat array + for (int m = 0; m < nModules; m++) { + const auto& module = fPars->modules.at(m); + std::copy_n(module.localToGlobal.rotation.data(), 9, 
&fHitfinderHost.localToGlobalRotationByModule.h()[m * 9]); + std::copy_n(module.localToGlobal.translation.data(), 3, &fHitfinderHost.localToGlobalRotationByModule.h()[m * 3]); + } + + // - Then copy to GPU + xpu::copy(fHitfinderHost.localToGlobalRotationByModule, xpu::host_to_device); + xpu::copy(fHitfinderHost.localToGlobalTranslationByModule, xpu::host_to_device); + + // Copy Sensor Parameteres + GPU_ALLOC(sensorPars, nModules); + for (int m = 0; m < nModules; m++) { + const auto& module = fPars->modules.at(m); + auto& gpuPars = fHitfinderHost.sensorPars.h()[m]; + gpuPars.dY = module.dY; + gpuPars.pitch = module.pitch; + gpuPars.stereoF = module.stereoF; + gpuPars.stereoB = module.stereoB; + gpuPars.lorentzF = module.lorentzF; + gpuPars.lorentzB = module.lorentzB; + } + xpu::copy(fHitfinderHost.sensorPars, xpu::host_to_device); +} + +void StsHitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDigisTotal) +{ + EnsureParameters(); + + // TODO: some of these buffers have a constant size and can be allocated statically. + // Just the data they contain is static. 
+ + // Shorthands for common constants + const int nChannels = fPars->nChannels / 2; // Only count channels on one side of the module + const int nModules = fPars->modules.size(); + const int nModuleSides = 2 * nModules; // Number of module front + backsides + + const size_t maxNClustersPerModule = maxNDigisPerModule * kClustersPerDigiFactor; + const size_t maxNHitsPerModule = maxNClustersPerModule * kHitsPerClustersFactor; + + + // Allocate Digi Buffers + GPU_ALLOC(digiOffsetPerModule, nModuleSides + 1); + GPU_ALLOC(digisPerModule, nDigisTotal); + GPU_ALLOC(digisPerModuleTmp, nDigisTotal); + GPU_ALLOC(digisSortedPerModule, nModuleSides); + GPU_ALLOC(digiConnectorsPerModule, nDigisTotal); + + GPU_ALLOC(channelOffsetPerModule, nModuleSides * nChannels); + + GPU_ALLOC(maxErrorFront, 1); + GPU_ALLOC(maxErrorBack, 1); + + // Allocate Cluster Buffers + SET_GPU_CONSTANT(maxClustersPerModule, maxNClustersPerModule); + GPU_ALLOC(channelStatusPerModule, nChannels * nModules); + GPU_ALLOC(clusterIdxPerModule, maxNClustersPerModule * nModuleSides); + GPU_ALLOC(clusterIdxPerModuleTmp, maxNClustersPerModule * nModuleSides); + GPU_ALLOC(clusterIdxSortedPerModule, nModuleSides); + GPU_ALLOC(clusterDataPerModule, maxNClustersPerModule * nModuleSides); + GPU_ALLOC(nClustersPerModule, nModuleSides); + + // Allocate Hit Buffers + SET_GPU_CONSTANT(maxHitsPerModule, maxNHitsPerModule); + GPU_ALLOC(hitsPerModule, maxNHitsPerModule * nModules); + GPU_ALLOC(nHitsPerModule, nModules); +} + +StsHitfinderChain::DigiMap StsHitfinderChain::SortDigisIntoModules(gsl::span<const CbmStsDigi> digis, + size_t& maxNDigisPerModule) +{ + DigiMap digiMap; + + int nChannelsPerSide = fPars->nChannels / 2; + + for (const auto& digi : digis) { + int moduleAddr = CbmStsAddress::GetMotherAddress(digi.GetAddress(), kStsModule); + bool isFront = digi.GetChannel() < fPars->nChannels / 2; + if (isFront) digiMap.front[moduleAddr].emplace_back(digi); + else { + CbmStsDigi tmpdigi = digi; + 
tmpdigi.SetChannel(tmpdigi.GetChannel() - nChannelsPerSide); + digiMap.back[moduleAddr].emplace_back(tmpdigi); + } + } + + maxNDigisPerModule = 0; + for (const auto& mod : digiMap.front) { + maxNDigisPerModule = std::max(maxNDigisPerModule, mod.second.size()); + } + + for (const auto& mod : digiMap.back) { + maxNDigisPerModule = std::max(maxNDigisPerModule, mod.second.size()); + } + + return digiMap; +} + +void StsHitfinderChain::FlattenDigis(DigiMap& digiMap) +{ + FlattenDigisSide(digiMap.front, true); + FlattenDigisSide(digiMap.back, false); +} + +void StsHitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront) +{ + EnsureParameters(); + + int nModules = fPars->modules.size(); + size_t* pMdigiOffset = fHitfinderHost.digiOffsetPerModule.h(); + CbmStsDigi* pDigisFlat = fHitfinderHost.digisPerModule.h(); + + int sideOffset = (isFront ? 0 : nModules); + + if (isFront) pMdigiOffset[0] = 0; + for (int m = 0; m < nModules; m++) { + size_t moduleOffset = pMdigiOffset[sideOffset + m]; + const auto& moduleDigis = digis[fPars->modules[m].address]; + std::copy(moduleDigis.begin(), moduleDigis.end(), &pDigisFlat[moduleOffset]); + pMdigiOffset[sideOffset + m + 1] = moduleOffset + moduleDigis.size(); + } +} + +void StsHitfinderChain::CollectTimingInfo() +{ + // Check if profiling is enabled + // TODO: xpu should have a nicer way to query if profiling is enabled + if (xpu::get_timing<SortDigis>().empty()) return; + + // TODO: Add time counters + fHitfinderTimes.timeSortDigi = xpu::get_timing<SortDigis>().back(); + fHitfinderTimes.timeCluster = xpu::get_timing<FindClustersSingleStep>().back(); + fHitfinderTimes.timeSortCluster = xpu::get_timing<SortClusters>().back(); + fHitfinderTimes.timeHits = xpu::get_timing<FindHits>().back(); +} + +void StsHitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>& clusters) +{ + EnsureParameters(); + + auto& hfc = fHitfinderHost; + int moduleSide = (isFront ? 
module : 2 * module); + + int nClusters = hfc.nClustersPerModule.h()[moduleSide]; + auto* gpuClusters = &hfc.clusterDataPerModule.h()[moduleSide * hfc.maxClustersPerModule]; + + clusters.reserve(clusters.size() + nClusters); + + for (int i = 0; i < nClusters; i++) { + clusters.emplace_back(gpuClusters[i]); + + // auto& clusterGpu = gpuClusters[i]; + + // auto& clusterOut = clusters.emplace_back(); + + // clusterOut.SetAddress(fPars->modules[module].address); + // clusterOut.SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, clusterGpu.fTime, + // clusterGpu.fTimeError); + // clusterOut.SetSize(clusterGpu.fSize); + } +} + +void StsHitfinderChain::AppendHitsModule(int module, std::vector<StsGpuHit>& hits) +{ + EnsureParameters(); + + auto& hfc = fHitfinderHost; + + auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; + int nHitsGpu = hfc.nHitsPerModule.h()[module]; + + hits.reserve(hits.size() + nHitsGpu); + + for (int i = 0; i < nHitsGpu; i++) { + hits.emplace_back(gpuHits[i]); + // auto& hitGpu = gpuHits[i]; + + // hits.emplace_back(fPars->modules[module].address, + // TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ}, + // TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz}, + // hitGpu.fDxy, + // 0, + // 0, + // double(hitGpu.fTime), + // hitGpu.fTimeError, + // hitGpu.fDu, + // hitGpu.fDv); + } +} diff --git a/algo/detectors/sts/StsHitfinderChain.h b/algo/detectors/sts/StsHitfinderChain.h new file mode 100644 index 0000000000000000000000000000000000000000..37fcd4277bcad92e57d33a3e11ddaddc8063c336 --- /dev/null +++ b/algo/detectors/sts/StsHitfinderChain.h @@ -0,0 +1,147 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ + +#ifndef CBM_ALGO_STS_HITFINDER_CHAIN_H +#define CBM_ALGO_STS_HITFINDER_CHAIN_H + +#include "CbmStsDigi.h" + +#include <array> +#include <cstdint> +#include <gsl/span> 
#include <map>
#include <optional>
#include <vector>

#include <xpu/host.h>

#include "StsHitfinder.h"

namespace cbm::algo
{

  struct StsModuleTransformationMatrix {
    // Rotation + translation matrix to transform
    // local module coordinates into global coordinate system.
    // No need for fancy math types here. These values are just copied
    // and moved to the GPU.
    std::array<float, 9> rotation;  // 3x3 matrix
    std::array<float, 3> translation;
  };

  // Per-module parameters forwarded to the GPU hitfinder.
  struct StsModulePar {
    int32_t address;  // CBM STS module address
    float dY;
    float pitch;
    float stereoF;  // front-side stereo angle
    float stereoB;  // back-side stereo angle
    float lorentzF;  // front-side Lorentz shift correction
    float lorentzB;  // back-side Lorentz shift correction
    StsModuleTransformationMatrix localToGlobal;
  };

  // Sampled Landau distribution; values[i] corresponds to charge i * stepSize.
  struct StsLandauTable {
    std::vector<float> values;
    float stepSize;
  };

  struct StsHitfinderPar {
    StsAsicPar asic;  // Asic definitions. Currently assumes same parameters for all asics.
    int nChannels;    // Total number of channels per module. Hitfinder assumes nChannels / 2 channels per side.
    std::vector<StsModulePar> modules;
    StsLandauTable landauTable;
  };

  // Wall-clock times [ms] of the individual GPU steps, filled when profiling is enabled.
  struct StsHitfinderTimes {
    double timeIO          = 0;
    double timeSortDigi    = 0;
    double timeCluster     = 0;
    double timeSortCluster = 0;
    double timeHits        = 0;
  };

  /**
   * Sts Hitfinder Chain. This class is responsible for all memory allocations
   * on the GPU required by the hitfinder. It executes the hitfinder kernels and
   * handles memory transfer for input / output data and conversion to the
   * regular CBM STS types.
   */
  class StsHitfinderChain {

  public:
    /** Store parameters and allocate the static GPU buffers. Must be called before operator(). */
    void SetParameters(const StsHitfinderPar& parameters);
    const StsHitfinderPar& GetParameters() const { return *fPars; }

    /** Run the full GPU hitfinder on one timeslice of digis. */
    void operator()(gsl::span<const CbmStsDigi>);

    StsHitfinderTimes GetHitfinderTimes() const { return fHitfinderTimes; }

    /**
     * Returns access to (host copies of) the raw buffers used by the gpu hitfinder.
     * This is currently the only way to access the produced hits and cluster.
     */
    const StsHitfinderHost& GetHitfinderBuffers() const { return fHitfinderHost; }

  private:
    // Shorthands to map module-address onto digis in that module
    using DigiMapSide = std::map<int, std::vector<CbmStsDigi>>;
    struct DigiMap {
      DigiMapSide front;
      DigiMapSide back;
    };

    /**
     * Factors used to estimate buffer sizes based on number of digis.
     * TODO: These values might be wildly off. Look for better estimations.
     * TODO: Should be configurable at runtime.
     */
    static constexpr float kClustersPerDigiFactor = 0.5f;
    static constexpr float kHitsPerClustersFactor = 4.f;

    /**
     * Ensure parameters were set. Raises log(fatal) otherwise.
     */
    void EnsureParameters();

    /**
     * Allocate memory that doesn't depend on input
     * Data stays on GPU for the entire duration the hitfinder is alive
     */
    void AllocateStatic();

    /**
     * Allocate memory that depends on input
     * Data stays on GPU until next timeslice is processed
     */
    void AllocateDynamic(size_t, size_t);

    /**
     * Sort Digis into buckets by module.
     */
    DigiMap SortDigisIntoModules(gsl::span<const CbmStsDigi> digis, size_t& maxNDigisPerModule);

    /**
     * Copy Digis into flat array that can be copied to the GPU.
     */
    void FlattenDigis(DigiMap& digiMap);
    void FlattenDigisSide(DigiMapSide& digis, bool isFront);

    void CollectTimingInfo();

    /**
     * Transfer Hits / Clusters back to host and convert to common CBM types.
     */
    void AppendClustersModule(int module, bool isFront, std::vector<StsGpuCluster>&);
    void AppendHitsModule(int module, std::vector<StsGpuHit>&);

    // NOTE(review): optional<const T> cannot be copy-assigned; it is only ever
    // set via emplace() in SetParameters — confirm this is intended.
    std::optional<const StsHitfinderPar> fPars;
    StsHitfinderTimes fHitfinderTimes;

    StsHitfinderHost fHitfinderHost;
    StsHitfinderGpu fHitfinderGpu;
  };

}  // namespace cbm::algo

#endif  // #ifndef CBM_ALGO_STS_HITFINDER_CHAIN_H
-set(NO_DICT_SRCS - ${DEVICE_SRCS} - experimental/CbmGpuRecoSts.cxx - ) - set(LIBRARY_NAME CbmRecoSts) set(LINKDEF ${LIBRARY_NAME}LinkDef.h) set(PUBLIC_DEPENDENCIES + Algo CbmData CbmStsBase FairRoot::Base @@ -83,10 +73,3 @@ set(INTERFACE_DEPENDENCIES # --------------------------------------------------------- generate_cbm_library() - -xpu_attach(CbmRecoSts ${DEVICE_SRCS}) - -# FIXME: Why is this needed to install headers in subfolder? -install(DIRECTORY experimental/ TYPE INCLUDE - FILES_MATCHING PATTERN "*.h" -) diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx index d9df65037620226d4e4ca246a120c856bb2e551c..287a83316448d6b0aa8a6c39252d5bae8baadae4 100644 --- a/reco/detectors/sts/CbmRecoSts.cxx +++ b/reco/detectors/sts/CbmRecoSts.cxx @@ -13,7 +13,6 @@ #include "CbmDigiManager.h" #include "CbmEvent.h" #include "CbmStsDigi.h" -#include "CbmStsGpuHitFinder.h" #include "CbmStsModule.h" #include "CbmStsParSetModule.h" #include "CbmStsParSetSensor.h" @@ -72,9 +71,9 @@ UInt_t CbmRecoSts::CreateModules() assert(fSetup); - std::vector<CbmStsParModule> gpuModules; // for gpu reco - std::vector<int> moduleAddrs; - std::vector<experimental::CbmStsHitFinderConfig> hfCfg; + std::vector<cbm::algo::StsModulePar> gpuModules; // for gpu reco + // std::vector<int> moduleAddrs; + // std::vector<experimental::CbmStsHitFinderConfig> hfCfg; for (Int_t iModule = 0; iModule < fSetup->GetNofModules(); iModule++) { @@ -92,9 +91,6 @@ UInt_t CbmRecoSts::CreateModules() const CbmStsParSensor& sensPar = fParSetSensor->GetParSensor(sensorAddress); const CbmStsParSensorCond& sensCond = fParSetCond->GetParSensor(sensorAddress); - gpuModules.push_back(modPar); - moduleAddrs.push_back(moduleAddress); - // --- Calculate and set average Lorentz shift // --- This will be used in hit finding for correcting the position. 
Double_t lorentzF = 0.; @@ -135,25 +131,56 @@ UInt_t CbmRecoSts::CreateModules() auto result = fModules.insert({moduleAddress, recoModule}); assert(result.second); fModuleIndex.push_back(recoModule); - hfCfg.push_back({ - .dY = sensPar.GetPar(3), - .pitch = sensPar.GetPar(6), - .stereoF = sensPar.GetPar(8), - .stereoB = sensPar.GetPar(9), - .lorentzF = float(lorentzF), - .lorentzB = float(lorentzB), - }); + + // Get Transformation Matrix + cbm::algo::StsModuleTransformationMatrix localToGlobal; + TGeoHMatrix* matrix = recoModule->getMatrix(); + std::copy_n(matrix->GetRotationMatrix(), 9, localToGlobal.rotation.begin()); + std::copy_n(matrix->GetTranslation(), 3, localToGlobal.translation.begin()); + + // Collect GPU parameters + cbm::algo::StsModulePar gpuModulePars { + .address = moduleAddress, + .dY = sensPar.GetPar(3), + .pitch = sensPar.GetPar(6), + .stereoF = sensPar.GetPar(8), + .stereoB = sensPar.GetPar(9), + .lorentzF = float(lorentzF), + .lorentzB = float(lorentzB), + .localToGlobal = localToGlobal, + }; + gpuModules.emplace_back(gpuModulePars); } - // Gpu hitfinder - experimental::CbmGpuRecoSts::Config gpuConfig { - .parModules = gpuModules, - .recoModules = fModuleIndex, - .moduleAddrs = moduleAddrs, - .hitfinderCfg = hfCfg, - .physics = CbmStsPhysics::Instance(), + const CbmStsParModule& firstModulePars = fParSetModule->GetParModule(gpuModules[0].address); + + CbmStsParAsic asic = firstModulePars.GetParAsic(0); + cbm::algo::StsAsicPar algoAsic { + .nAdc = asic.GetNofAdc(), + .dynamicRange = float(asic.GetDynRange()), + .threshold = float(asic.GetThreshold()), + .timeResolution = float(asic.GetTimeResol()), + .deadTime = float(asic.GetDeadTime()), + .noise = float(asic.GetNoise()), + .zeroNoiseRate = float(asic.GetZeroNoiseRate()), + }; + + int nChannels = firstModulePars.GetNofChannels(); + + auto [landauValues, landauStepSize] = CbmStsPhysics::Instance()->GetLandauWidthTable(); + std::vector<float> landauValuesF; + std::copy(landauValues.begin(), 
landauValues.end(), std::back_inserter(landauValuesF)); + cbm::algo::StsHitfinderPar pars { + .asic = algoAsic, + .nChannels = nChannels, + .modules = gpuModules, + .landauTable = + { + .values = landauValuesF, + .stepSize = float(landauStepSize), + }, }; - fGpuReco.SetConfig(gpuConfig); + if (fUseGpuReco) fGpuReco.SetParameters(pars); return fModules.size(); } @@ -193,10 +220,10 @@ void CbmRecoSts::Exec(Option_t*) if (fUseGpuReco) { ProcessDataGpu(); - fNofDigis = fGpuReco.nDigis; - fNofDigisUsed = fGpuReco.nDigisUsed; - fNofClusters = fGpuReco.nCluster; - fNofHits = fGpuReco.nHits; + // fNofDigis = fGpuReco.nDigis; + // fNofDigisUsed = fGpuReco.nDigisUsed; + // fNofClusters = fGpuReco.nCluster; + // fNofHits = fGpuReco.nHits; // Old reco in time based mode } @@ -314,14 +341,15 @@ void CbmRecoSts::Finish() << " Find Hits : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeHits << " ms\n"; } else { - double gpuHitfinderTimeTotal = - fGpuReco.timeSortDigi + fGpuReco.timeCluster + fGpuReco.timeSortCluster + fGpuReco.timeHits; + cbm::algo::StsHitfinderTimes times = fGpuReco.GetHitfinderTimes(); + + double gpuHitfinderTimeTotal = times.timeSortDigi + times.timeCluster + times.timeSortCluster + times.timeHits; LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms"; LOG(info) << "Time by step:\n" - << " Sort Digi : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortDigi << " ms\n" - << " Find Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeCluster << " ms\n" - << " Sort Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortCluster << " ms\n" - << " Find Hits : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeHits << " ms"; + << " Sort Digi : " << fixed << setprecision(1) << setw(6) << times.timeSortDigi << " ms\n" + << " Find Cluster: " << fixed << setprecision(1) << setw(6) << times.timeCluster << " ms\n" + << " Sort Cluster: " << fixed << 
setprecision(1) << setw(6) << times.timeSortCluster << " ms\n" + << " Find Hits : " << fixed << setprecision(1) << setw(6) << times.timeHits << " ms"; } LOG(info) << "====================================="; } @@ -636,8 +664,86 @@ void CbmRecoSts::ProcessDataGpu() { if (fMode == kCbmRecoEvent) throw std::runtime_error("STS GPU Reco does not yet support event-by-event mode."); - fGpuReco.RunHitFinder(); - fGpuReco.ForwardClustersAndHits(fClusters, fHits); + auto digis = fDigiManager->GetArray<CbmStsDigi>(); + fGpuReco(digis); + auto [nClustersForwarded, nHitsForwarded] = ForwardGpuClusterAndHits(); + + fNofDigis = digis.size(); + fNofDigisUsed = digis.size(); + fNofClusters = nClustersForwarded; + fNofHits = nHitsForwarded; +} + +std::pair<size_t, size_t> CbmRecoSts::ForwardGpuClusterAndHits() +{ + size_t nClustersForwarded = 0, nHitsForwarded = 0; + + const cbm::algo::StsHitfinderHost& hfc = fGpuReco.GetHitfinderBuffers(); + const cbm::algo::StsHitfinderPar& pars = fGpuReco.GetParameters(); + + for (int module = 0; module < hfc.nModules; module++) { + + auto* gpuClusterF = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule]; + auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule]; + int nClustersFGpu = hfc.nClustersPerModule.h()[module]; + + nClustersForwarded += nClustersFGpu; + + for (int i = 0; i < nClustersFGpu; i++) { + auto& cidx = gpuClusterIdxF[i]; + auto& clusterGpu = gpuClusterF[cidx.fIdx]; + + unsigned int outIdx = fClusters->GetEntriesFast(); + auto* clusterOut = new ((*fClusters)[outIdx])::CbmStsCluster {}; + + clusterOut->SetAddress(pars.modules[module].address); + clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime, + clusterGpu.fTimeError); + clusterOut->SetSize(clusterGpu.fSize); + } + + auto* gpuClusterB = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule]; + auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + 
hfc.nModules) * hfc.maxClustersPerModule]; + int nClustersBGpu = hfc.nClustersPerModule.h()[module + hfc.nModules]; + + nClustersForwarded += nClustersBGpu; + + for (int i = 0; i < nClustersBGpu; i++) { + auto& cidx = gpuClusterIdxB[i]; + auto& clusterGpu = gpuClusterB[cidx.fIdx]; + unsigned int outIdx = fClusters->GetEntriesFast(); + auto* clusterOut = new ((*fClusters)[outIdx])::CbmStsCluster {}; + + clusterOut->SetAddress(pars.modules[module].address); + clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime, + clusterGpu.fTimeError); + clusterOut->SetSize(clusterGpu.fSize); + } + + auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; + int nHitsGpu = hfc.nHitsPerModule.h()[module]; + + nHitsForwarded += nHitsGpu; + + for (int i = 0; i < nHitsGpu; i++) { + auto& hitGpu = gpuHits[i]; + + unsigned int outIdx = fHits->GetEntriesFast(); + new ((*fHits)[outIdx])::CbmStsHit {pars.modules[module].address, + TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ}, + TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz}, + hitGpu.fDxy, + 0, + 0, + double(hitGpu.fTime), + hitGpu.fTimeError, + hitGpu.fDu, + hitGpu.fDv}; + } + + } // for (int module = 0; module < hfc.nModules; module++) + + return {nClustersForwarded, nHitsForwarded}; } @@ -655,13 +761,13 @@ void CbmRecoSts::SetParContainers() void CbmRecoSts::DumpNewHits() { std::ofstream out {"newHits.csv"}; - + const cbm::algo::StsHitfinderHost& hfc = fGpuReco.GetHitfinderBuffers(); out << "module, x, y, z, deltaX, deltaY, deltaZ, deltaXY, time, timeError, deltaU, deltaV" << std::endl; - for (size_t m = 0; m < fModuleIndex.size(); m++) { - auto hits = fGpuReco.GetHits(m); - - for (const auto& h : hits) { + int nHitsGpu = hfc.nHitsPerModule.h()[m]; + auto* gpuHits = &hfc.hitsPerModule.h()[m * hfc.maxHitsPerModule]; + for (int i = 0; i < nHitsGpu; i++) { + auto& h = gpuHits[i]; out << m << ", " << h.fX << ", " << h.fY << ", " << h.fZ << ", " << h.fDx << ", " << h.fDy << ", " 
<< h.fDz << ", " << h.fDxy << ", " << h.fTime << ", " << h.fTimeError << ", " << h.fDu << ", " << h.fDv << std::endl; } diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h index f11af70cdbbccd3423c9c9b0a89e60599cad18c7..c6f4f0755ca3ef0d0ee45abf42bba8e9661d2bea 100644 --- a/reco/detectors/sts/CbmRecoSts.h +++ b/reco/detectors/sts/CbmRecoSts.h @@ -11,12 +11,13 @@ #ifndef CBMRECOSTS_H #define CBMRECOSTS_H 1 -#include "CbmGpuRecoSts.h" - #include <FairTask.h> +#include <TClonesArray.h> #include <TStopwatch.h> +#include "StsHitfinderChain.h" + class CbmDigiManager; class CbmEvent; class CbmStsElement; @@ -214,6 +215,8 @@ public: */ void UseSensorParSet(CbmStsParSetSensor* sensorParSet) { fUserParSetSensor = sensorParSet; } + void DumpNewHits(); + void DumpOldHits(); private: /** @brief Average Lorentz Shift in a sensor @@ -258,11 +261,6 @@ private: void ProcessDataGpu(); - void DumpNewHits(); - - void DumpOldHits(); - - private: // --- I/O TClonesArray* fEvents = nullptr; //! Input array of events @@ -329,7 +327,9 @@ private: std::vector<CbmStsRecoModule*> fModuleIndex {}; //! 
bool fUseGpuReco = false; - ::experimental::CbmGpuRecoSts fGpuReco; + cbm::algo::StsHitfinderChain fGpuReco; + + std::pair<size_t, size_t> ForwardGpuClusterAndHits(); ClassDef(CbmRecoSts, 1); }; diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx b/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx deleted file mode 100644 index 1001a5ba8b15cf1fe3dde9508adfcf35654c9be0..0000000000000000000000000000000000000000 --- a/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx +++ /dev/null @@ -1,405 +0,0 @@ -/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main - SPDX-License-Identifier: GPL-3.0-only - Authors: Felix Weiglhofer [committer], Kilian Hunold */ -#include "CbmGpuRecoSts.h" - -#include "CbmAddress.h" -#include "CbmDigiManager.h" -#include "CbmStsAddress.h" -#include "CbmStsCluster.h" -#include "CbmStsGpuRecoDevice.h" -#include "CbmStsHit.h" -#include "CbmStsPhysics.h" -#include "CbmStsRecoModule.h" - -#include <TCanvas.h> -#include <TClonesArray.h> -#include <TGeoMatrix.h> -#include <TH1D.h> -#include <TH2D.h> -#include <TStopwatch.h> - -#include <algorithm> -#include <cassert> -#include <unordered_set> -#include <vector> - -using namespace experimental; - -void CbmGpuRecoSts::SetupConstants() -{ - assert(fConfig != std::nullopt); - assert(fConfig->recoModules.size() == fConfig->parModules.size()); - assert(fConfig->moduleAddrs.size() == fConfig->parModules.size()); - assert(fConfig->hitfinderCfg.size() == fConfig->parModules.size()); - - size_t nModules = fConfig->parModules.size(); - - auto& hfc = fHitfinderCpu; - auto& hfg = fHitfinderGpu; - - LOG(info) << "STS GPU RECO: setting constants"; - - size_t nChannels = fConfig->parModules[0].GetNofChannels(); - ::CbmStsParAsic asic = fConfig->parModules[0].GetParAsic(0); // TODO values are currently identical for all asics?? 
- // for (auto &mod : modules) { - // assert(nChannels == mod.GetNofChannels()); - // for (size_t c = 0; c < nChannels; c++) { - // ::CbmStsParAsic otherAsic = mod.GetParAsic(c); - // assert(asic.GetNofAdc() == otherAsic.GetNofAdc()); - // assert(asic.GetDynRange() == otherAsic.GetDynRange()); - // assert(asic.GetThreshold() == otherAsic.GetThreshold()); - // assert(asic.GetTimeResol() == otherAsic.GetTimeResol()); - // assert(asic.GetDeadTime() == otherAsic.GetDeadTime()); - // assert(asic.GetNoise() == otherAsic.GetNoise()); - // assert(asic.GetZeroNoiseRate() == otherAsic.GetZeroNoiseRate()); - // } - // } - - nChannels /= 2; - hfc.nModules = hfg.nModules = nModules; - hfc.nChannels = hfg.nChannels = nChannels; - - hfc.timeCutDigiAbs = hfg.timeCutDigiAbs = -1.f; - hfc.timeCutDigiSig = hfg.timeCutDigiSig = 3.f; - hfc.timeCutClusterAbs = hfg.timeCutClusterAbs = -1.f; - hfc.timeCutClusterSig = hfg.timeCutClusterSig = 4.f; - - hfc.asic.nAdc = asic.GetNofAdc(); - hfc.asic.dynRange = asic.GetDynRange(); - hfc.asic.threshold = asic.GetThreshold(); - hfc.asic.timeResolution = asic.GetTimeResol(); - hfc.asic.deadTime = asic.GetDeadTime(); - hfc.asic.noise = asic.GetNoise(); - hfc.asic.zeroNoiseRate = asic.GetZeroNoiseRate(); - hfg.asic = hfc.asic; -} - -void CbmGpuRecoSts::Setup(size_t maxDigisPerModule, size_t nDigitsTotal) -{ - assert(fConfig != std::nullopt); - - // static constexpr size_t maxDigisPerModule = 60000; - // static constexpr size_t maxClustersPerModule = 30000; - // static constexpr size_t maxHitsPerModule = 120000; - - const size_t maxClustersPerModule = maxDigisPerModule * 0.5; - const size_t maxHitsPerModule = maxClustersPerModule * 4; - - size_t nModules = fConfig->parModules.size(); - size_t nChannels = fConfig->parModules[0].GetNofChannels() / 2; - - - size_t nModulesTotal = nModules * 2; // Front- and backside - - auto& hfc = fHitfinderCpu; - auto& hfg = fHitfinderGpu; - - auto landauTable = fConfig->physics->GetLandauWidthTable(); - 
hfc.landauTableSize = hfg.landauTableSize = landauTable.size(); - hfc.landauTable = xpu::hd_buffer<float> {size_t(hfc.landauTableSize)}; - hfg.landauTable = hfc.landauTable.d(); - auto landauEntry = landauTable.begin(); - auto prevLandauEntry = landauEntry; - hfc.landauTable.h()[0] = landauEntry->second; - landauEntry++; - hfc.landauStepSize = hfg.landauStepSize = landauEntry->first - prevLandauEntry->first; - for (size_t i = 1; landauEntry != landauTable.end(); i++, landauEntry++) { - assert(hfc.landauStepSize == landauEntry->first - prevLandauEntry->first); - assert(i < hfc.landauTable.size()); - hfc.landauTable.h()[i] = landauEntry->second; - prevLandauEntry = landauEntry; - } - - LOG(info) << "STS GPU RECO: finished landau"; - - hfc.localToGlobalTranslationByModule = xpu::hd_buffer<float> {nModules * 3}; - hfg.localToGlobalTranslationByModule = hfc.localToGlobalTranslationByModule.d(); - hfc.localToGlobalRotationByModule = xpu::hd_buffer<float> {nModules * 9}; - hfg.localToGlobalRotationByModule = hfc.localToGlobalRotationByModule.d(); - for (size_t m = 0; m < fConfig->recoModules.size(); m++) { - auto& recoMod = fConfig->recoModules[m]; - TGeoHMatrix* matrix = recoMod->getMatrix(); - - const double* rot = matrix->GetRotationMatrix(); - float* tgt = &hfc.localToGlobalRotationByModule.h()[m * 9]; - std::copy_n(rot, 9, tgt); - - const double* tr = matrix->GetTranslation(); - tgt = &hfc.localToGlobalTranslationByModule.h()[m * 3]; - std::copy_n(tr, 3, tgt); - } - - LOG(info) << "STS GPU RECO: finished transformation matrix"; - - hfc.hitfinderConfig = xpu::hd_buffer<CbmStsHitFinderConfig>(nModules); - hfg.hitfinderConfig = hfc.hitfinderConfig.d(); - - std::copy_n(fConfig->hitfinderCfg.begin(), nModules, hfc.hitfinderConfig.h()); - - LOG(info) << "STS GPU RECO: finished setting hitfinder config"; - - // FIXME: allocate by total number of digis instead. This array is flat. 
- hfc.digiOffsetPerModule = xpu::hd_buffer<size_t> {nModulesTotal + 1}; - hfg.digiOffsetPerModule = hfc.digiOffsetPerModule.d(); - hfc.digisPerModule = xpu::hd_buffer<CbmStsDigi> {nDigitsTotal}; - hfg.digisPerModule = hfc.digisPerModule.d(); - hfc.digisPerModuleTmp = xpu::d_buffer<CbmStsDigi> {nDigitsTotal}; - hfg.digisPerModuleTmp = hfc.digisPerModuleTmp.d(); - hfc.digisSortedPerModule = xpu::d_buffer<CbmStsDigi*> {nModulesTotal}; - hfg.digisSortedPerModule = hfc.digisSortedPerModule.d(); - hfc.digiConnectorPerModule = xpu::d_buffer<CbmStsDigiConnector> {nDigitsTotal}; - hfg.digiConnectorPerModule = hfc.digiConnectorPerModule.d(); - - hfc.channelOffsetPerModule = xpu::d_buffer<unsigned int> {nModulesTotal * nChannels}; - hfg.channelOffsetPerModule = hfc.channelOffsetPerModule.d(); - - hfc.maxErrorFront = xpu::d_buffer<float> {1}; - hfg.maxErrorFront = hfc.maxErrorFront.d(); - hfc.maxErrorBack = xpu::d_buffer<float> {1}; - hfg.maxErrorBack = hfc.maxErrorBack.d(); - - LOG(info) << "STS GPU RECO: finished digis"; - - hfc.maxClustersPerModule = hfg.maxClustersPerModule = maxClustersPerModule; - hfc.channelStatusPerModule = xpu::d_buffer<int> {nChannels * nModulesTotal}; - hfg.channelStatusPerModule = hfc.channelStatusPerModule.d(); - hfc.clusterIdxPerModule = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal}; - hfg.clusterIdxPerModule = hfc.clusterIdxPerModule.d(); - hfc.clusterIdxPerModuleTmp = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal}; - hfg.clusterIdxPerModuleTmp = hfc.clusterIdxPerModuleTmp.d(); - hfc.clusterIdxSortedPerModule = xpu::hd_buffer<CbmStsClusterIdx*> {nModulesTotal}; - hfg.clusterIdxSortedPerModule = hfc.clusterIdxSortedPerModule.d(); - hfc.clusterDataPerModule = xpu::hd_buffer<CbmStsClusterData> {maxClustersPerModule * nModulesTotal}; - hfg.clusterDataPerModule = hfc.clusterDataPerModule.d(); - hfc.nClustersPerModule = xpu::hd_buffer<int> {nModulesTotal}; - hfg.nClustersPerModule = 
hfc.nClustersPerModule.d(); - - LOG(info) << "STS GPU RECO: finished cluster"; - - hfc.maxHitsPerModule = hfg.maxHitsPerModule = maxHitsPerModule; - hfc.hitsPerModule = xpu::hd_buffer<CbmStsHit>(maxHitsPerModule * nModules); - hfg.hitsPerModule = hfc.hitsPerModule.d(); - hfc.nHitsPerModule = xpu::hd_buffer<int>(nModules); - hfg.nHitsPerModule = hfc.nHitsPerModule.d(); - - LOG(info) << "STS GPU RECO: finished setup"; - LOG(info) << "- nModules = " << nModules; - LOG(info) << "- nChannels = " << nChannels; -} - -void CbmGpuRecoSts::RunHitFinder() -{ - auto& hfc = fHitfinderCpu; - - // ROCm bug: Mi100 name is an emtpy string... - auto deviceName = (xpu::device_properties().name.empty()) ? "AMD Mi100" : xpu::device_properties().name; - - LOG(info) << "STS GPU RECO: running GPU hit finder on device " << deviceName; - LOG(info) << "STS GPU RECO: Allocate and fetch digis"; - - - // FIXME: This is ugly, setup has to be split in two parts, constants required by FetchDigis - // And FetchDigis required by Setup... 
- SetupConstants(); - size_t maxDigisPerModule = 0; - size_t nDigisTotal = 0; - FetchDigis(maxDigisPerModule, nDigisTotal); - Setup(maxDigisPerModule, nDigisTotal); - - LOG(info) << "STS GPU RECO: finished digis / running clusterizer now"; - LOG(info) << "STS GPU RECO: largest module has " << maxDigisPerModule << " digis"; - - // Not all buffers have to initialized, but useful for debugging - // xpu::memset(hfc.digisPerModule, 0); - // xpu::memset(hfc.digisPerModuleTmp, 0); - // xpu::memset(hfc.digisSortedPerModule, 0); - // xpu::memset(hfc.digiOffsetPerModule, 0); - xpu::memset(hfc.digiConnectorPerModule, 0); - // xpu::memset(hfc.channelOffsetPerModule, 0); - xpu::memset(hfc.channelStatusPerModule, -1); - xpu::memset(hfc.clusterIdxPerModule, 0); - // xpu::memset(hfc.clusterIdxPerModuleTmp, 0); - // xpu::memset(hfc.clusterIdxSortedPerModule, 0); - // xpu::memset(hfc.clusterDataPerModule, 0); - xpu::memset(hfc.nClustersPerModule, 0); - // xpu::memset(hfc.hitsPerModule, 0); - xpu::memset(hfc.nHitsPerModule, 0); - xpu::memset(hfc.maxErrorFront, 0); - xpu::memset(hfc.maxErrorBack, 0); - - hfc.digiOffsetPerModule.h()[0] = 0; - for (int m = 0; m < hfc.nModules; m++) { - size_t offset = hfc.digiOffsetPerModule.h()[m]; - auto& md = fDigisByModuleF[fConfig->moduleAddrs[m]]; - std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]); - hfc.digiOffsetPerModule.h()[m + 1] = offset + md.size(); - } - for (int m = 0; m < hfc.nModules; m++) { - size_t offset = hfc.digiOffsetPerModule.h()[m + hfc.nModules]; - auto& md = fDigisByModuleB[fConfig->moduleAddrs[m]]; - std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]); - hfc.digiOffsetPerModule.h()[m + hfc.nModules + 1] = offset + md.size(); - } - assert(nDigisTotal == hfc.digiOffsetPerModule.h()[2 * hfc.nModules]); - - TStopwatch timer; - - // constants only need to be transferred once. 
should be excluded from time measurement - xpu::copy(hfc.landauTable, xpu::host_to_device); - xpu::copy(hfc.hitfinderConfig, xpu::host_to_device); - xpu::copy(hfc.localToGlobalRotationByModule, xpu::host_to_device); - xpu::copy(hfc.localToGlobalTranslationByModule, xpu::host_to_device); - - xpu::set_constant<HitFinder>(fHitfinderGpu); - xpu::copy(hfc.digisPerModule, xpu::host_to_device); - xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device); - - xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(hfc.nModules * 2)); - xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(hfc.nModules * 2)); - // Run cluster finding steps in indivual kernels, useful for debugging / profiling - // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2)); - // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); - // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2)); - // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); - // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); - xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(hfc.nModules * 2)); - xpu::run_kernel<FindHits>(xpu::grid::n_blocks(hfc.nModules)); - - // Check if profiling is enabled - if (not xpu::get_timing<SortDigis>().empty()) { - timeSortDigi = xpu::get_timing<SortDigis>().back(); - timeCluster = xpu::get_timing<FindClustersSingleStep>().back(); - timeSortCluster = xpu::get_timing<SortClusters>().back(); - timeHits = xpu::get_timing<FindHits>().back(); - } - - xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host); - xpu::copy(hfc.nClustersPerModule, xpu::device_to_host); - xpu::copy(hfc.hitsPerModule, xpu::device_to_host); - xpu::copy(hfc.nHitsPerModule, xpu::device_to_host); - - - size_t mostCluster = 0, mostHits = 0; - for (int m = 0; m < hfc.nModules; m++) { - mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m], mostCluster); - mostCluster = 
std::max<size_t>(hfc.nClustersPerModule.h()[m + hfc.nModules], mostCluster); - mostHits = std::max<size_t>(hfc.nHitsPerModule.h()[m], mostHits); - - nCluster += hfc.nClustersPerModule.h()[m]; - nCluster += hfc.nClustersPerModule.h()[m + hfc.nModules]; - - nHits += hfc.nHitsPerModule.h()[m]; - } - - LOG(info) << "STS GPU RECO: finished hitfinder"; -} - -void CbmGpuRecoSts::ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits) -{ - auto& hfc = fHitfinderCpu; - - for (int module = 0; module < hfc.nModules; module++) { - - auto* gpuClusterF = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule]; - auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule]; - int nClustersFGpu = hfc.nClustersPerModule.h()[module]; - - for (int i = 0; i < nClustersFGpu; i++) { - auto& cidx = gpuClusterIdxF[i]; - auto& clusterGpu = gpuClusterF[cidx.fIdx]; - - unsigned int outIdx = clusters->GetEntriesFast(); - auto* clusterOut = new ((*clusters)[outIdx])::CbmStsCluster {}; - - clusterOut->SetAddress(fConfig->moduleAddrs[module]); - clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime, - clusterGpu.fTimeError); - clusterOut->SetSize(clusterGpu.fSize); - } - - auto* gpuClusterB = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule]; - auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule]; - int nClustersBGpu = hfc.nClustersPerModule.h()[module + hfc.nModules]; - - for (int i = 0; i < nClustersBGpu; i++) { - auto& cidx = gpuClusterIdxB[i]; - auto& clusterGpu = gpuClusterB[cidx.fIdx]; - unsigned int outIdx = clusters->GetEntriesFast(); - auto* clusterOut = new ((*clusters)[outIdx])::CbmStsCluster {}; - - clusterOut->SetAddress(fConfig->moduleAddrs[module]); - clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime, - clusterGpu.fTimeError); - 
clusterOut->SetSize(clusterGpu.fSize); - } - - auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; - int nHitsGpu = hfc.nHitsPerModule.h()[module]; - - for (int i = 0; i < nHitsGpu; i++) { - auto& hitGpu = gpuHits[i]; - - unsigned int outIdx = hits->GetEntriesFast(); - new ((*hits)[outIdx])::CbmStsHit {fConfig->moduleAddrs[module], - TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ}, - TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz}, - hitGpu.fDxy, - 0, - 0, - double(hitGpu.fTime), - hitGpu.fTimeError, - hitGpu.fDu, - hitGpu.fDv}; - } - - } // for (int module = 0; module < hfc.nModules; module++) -} - -void CbmGpuRecoSts::FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal) -{ - - // FIXME: This function is a crime against humanity. - // Digis should be presorted by module already... - - CbmDigiManager* digis = CbmDigiManager::Instance(); - auto& hfc = fHitfinderCpu; - - // Remove digis from previous timeslice - fDigisByModuleB.clear(); - fDigisByModuleF.clear(); - - // FIXME: GPU reco should use regular digi class too - nDigis = digis->GetNofDigis(ECbmModuleId::kSts); - nDigisTotal = 0; - for (size_t i = 0; i < nDigis; i++) { - const ::CbmStsDigi* digi = digis->Get<::CbmStsDigi>(i); - Int_t systemId = CbmAddress::GetSystemId(digi->GetAddress()); - // TODO: is this check still needed? 
- if (systemId != ToIntegralType(ECbmModuleId::kSts)) { continue; } - int moduleAddr = CbmStsAddress::GetMotherAddress(digi->GetAddress(), kStsModule); - // int module = addrToModuleId(moduleAddr); - // assert(module < hfc.nModules); - bool isFront = digi->GetChannel() < hfc.nChannels; - - if (isFront) { fDigisByModuleF[moduleAddr].push_back(*digi); } - else { - CbmStsDigi tmpdigi = *digi; - tmpdigi.SetChannel(digi->GetChannel() - hfc.nChannels); - fDigisByModuleB[moduleAddr].push_back(tmpdigi); - } - - nDigisTotal++; - } - - nDigisUsed = nDigisTotal; - - maxDigisPerModule = 0; - for (const auto& mod : fDigisByModuleF) { - maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size()); - } - - for (const auto& mod : fDigisByModuleB) { - maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size()); - } -} diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.h b/reco/detectors/sts/experimental/CbmGpuRecoSts.h deleted file mode 100644 index a26efdbe911d569a09e0856d433010f2828c9c20..0000000000000000000000000000000000000000 --- a/reco/detectors/sts/experimental/CbmGpuRecoSts.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main - SPDX-License-Identifier: GPL-3.0-only - Authors: Felix Weiglhofer [committer], Kilian Hunold */ -#ifndef CBMGPURECOSTS_H -#define CBMGPURECOSTS_H - -#include "CbmStsDigi.h" -#include "CbmStsGpuHitFinder.h" -#include "CbmStsParModule.h" - -#include <map> -#include <optional> -#include <vector> - -#include <xpu/host.h> - -class CbmStsPhysics; -class CbmStsRecoModule; -class TClonesArray; - -namespace experimental -{ - - class CbmGpuRecoSts { - - public: - struct Config { - std::vector<CbmStsParModule> parModules; - std::vector<CbmStsRecoModule*> recoModules; - std::vector<int> moduleAddrs; - std::vector<CbmStsHitFinderConfig> hitfinderCfg; - CbmStsPhysics* physics; - }; - - double timeIO = 0; - double timeSortDigi = 0; - double timeCluster = 0; - double 
timeSortCluster = 0; - double timeHits = 0; - - size_t nDigis = 0; - size_t nDigisUsed = 0; - size_t nHits = 0; - size_t nCluster = 0; - - void SetConfig(const Config& cfg) { fConfig = cfg; } - - void SetupConstants(); - void Setup(size_t maxDigisPerModule, size_t nDigitsTotal); - - void RunHitFinder(); - - void ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits); - - std::vector<CbmStsHit> GetHits(int module) - { - auto& hfc = fHitfinderCpu; - auto* st = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; - auto* end = st + hfc.nHitsPerModule.h()[module]; - return std::vector<CbmStsHit> {st, end}; - } - - private: - std::optional<Config> fConfig; // Initialized late... - - // std::vector<int> fModuleAddrs; - CbmStsHitFinder<xpu::side::host> fHitfinderCpu; - CbmStsGpuHitFinder fHitfinderGpu; - - std::map<int, std::vector<CbmStsDigi>> fDigisByModuleF; - std::map<int, std::vector<CbmStsDigi>> fDigisByModuleB; - - void FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal); - }; - -} // namespace experimental - -#endif diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h deleted file mode 100644 index 5d3a5319ef02a1acaf219ca377485254e1e613ce..0000000000000000000000000000000000000000 --- a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h +++ /dev/null @@ -1,309 +0,0 @@ -/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main - SPDX-License-Identifier: GPL-3.0-only - Authors: Felix Weiglhofer [committer], Kilian Hunold */ -#ifndef CBMSTSGPUHITFINDER_H -#define CBMSTSGPUHITFINDER_H - -#include "CbmStsDigi.h" -#include "CbmStsGpuRecoDevice.h" - -#include <xpu/device.h> - -// experimental namespace to avoid name collisions with original data types -namespace experimental -{ - using CbmStsTimeNs = uint32_t; - - using XErrorT = float; // FIXME: remove and replace by float - - struct CbmStsClusterIdx { - CbmStsTimeNs fTime; ///< Cluster time 
(average of digi times) [ns] - int fIdx; - }; - - struct CbmStsClusterData { - float fCharge; ///< Total charge - int fSize; ///< Difference between first and last channel - float fPosition; ///< Cluster centre in channel number units - XErrorT fPositionError; ///< Cluster centre error (r.m.s.) in channel number units - float fTimeError; ///< Error of cluster time [ns] - }; - - struct CbmStsHit { - float fX, fY; ///< X, Y positions of hit [cm] - float fZ; ///< Z position of hit [cm] - CbmStsTimeNs fTime; ///< Hit time [ns] - float fDx, fDy; ///< X, Y errors [cm] - float fDz; ///< Z position error [cm] - float fDxy; ///< XY correlation - float fTimeError; ///< Error of hit time [ns] - float fDu; ///< Error of coordinate across front-side strips [cm] - float fDv; ///< Error of coordinate across back-side strips [cm] - }; - - struct CbmStsParAsic { - int nAdc; - float dynRange; - float threshold; - float timeResolution; - float deadTime; - float noise; - float zeroNoiseRate; - - XPU_D float AdcToCharge(unsigned short adc) const - { - return threshold + dynRange / float(nAdc) * (float(adc) + 0.5f); - } - }; - - struct CbmStsHitFinderConfig { - float dY; - float pitch; - float stereoF; - float stereoB; - float lorentzF; - float lorentzB; - }; - - // cache of various parameters used by the hit finder - struct CbmStsHitFinderParams : public CbmStsHitFinderConfig { - float tanStereoF; - float tanStereoB; - float errorFac; - float dX; - }; - - using SortDigisT = xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis>::value.x, - CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD>; - - struct CbmStsSortDigiSmem { - typename SortDigisT::storage_t sortBuf; - }; - - using SortClustersT = xpu::block_sort<CbmStsTimeNs, CbmStsClusterIdx, xpu::block_size<SortClusters>::value.x, - CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD>; - - struct CbmStsSortClusterSmem { - typename SortClustersT::storage_t sortBuf; - }; - - struct CbmStsClusterCalculationProperties { - float tSum = 0.; // sum 
of digi times - int chanF = 9999999; // first channel in cluster - int chanL = -1; // last channel in cluster - float qF = 0.f; // charge in first channel - float qM = 0.f; // sum of charges in middle channels - float qL = 0.f; // charge in last cluster - float eqFsq = 0.f; // uncertainty of qF - float eqMsq = 0.f; // uncertainty of qMid - float eqLsq = 0.f; // uncertainty of qL - float tResolSum = 0.f; - - float xSum = 0.f; // weighted charge sum, used to correct corrupt clusters - }; - - class CbmStsDigiConnector { - private: - unsigned int hasPreviousAndNext; - - XPU_D unsigned int UnpackNext(unsigned int val) const { return val & ~(1u << 31); } - - public: - XPU_D bool HasPrevious() const { return (hasPreviousAndNext >> 31) & 1u; } - - XPU_D void SetHasPrevious(bool hasPrevious) - { - unsigned int old = hasPreviousAndNext; - unsigned int hasPreviousI = ((unsigned int) hasPrevious) << 31; - unsigned int assumed; - - do { - assumed = old; - old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI); - } while (old != assumed); - } - - XPU_D unsigned int next() const { return UnpackNext(hasPreviousAndNext); } - - XPU_D void SetNext(unsigned int next) - { - unsigned int old = hasPreviousAndNext; - unsigned int assumed; - - next = xpu::min((1u << 31) - 1, next); - - do { - assumed = old; - old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next); - } while (old != assumed); - } - - XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; } - }; - - template<xpu::side S> - class CbmStsHitFinder { - - public: - // Constants - int nModules; - int nChannels; - - // calibration data - float timeCutDigiAbs; - float timeCutDigiSig; - float timeCutClusterAbs; - float timeCutClusterSig; - CbmStsParAsic asic; - int landauTableSize; - float landauStepSize; - xpu::cmem_io_t<float, S> landauTable; - - xpu::cmem_device_t<float, S> maxErrorFront; - xpu::cmem_device_t<float, S> maxErrorBack; - - // 
TODO fill this array - xpu::cmem_io_t<CbmStsHitFinderConfig, S> hitfinderConfig; - - // input - // Store all digis in a flat array with a header that contains the offset for every module (front and back) - xpu::cmem_io_t<size_t, S> digiOffsetPerModule; // 2 * nModules + 1 entries, last entry contains total digi count - xpu::cmem_io_t<CbmStsDigi, S> digisPerModule; // TODO this is a flat array now, should be renamed - // Two digi arrays needed, as sorting can't sort in place right now - xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp; - xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule; - - xpu::cmem_io_t<float, S> localToGlobalTranslationByModule; - xpu::cmem_io_t<float, S> localToGlobalRotationByModule; - - xpu::cmem_device_t<CbmStsDigiConnector, S> digiConnectorPerModule; - xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule; - - // intermediate results - size_t maxClustersPerModule; - xpu::cmem_device_t<int, S> channelStatusPerModule; - xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModule; - xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModuleTmp; - xpu::cmem_io_t<CbmStsClusterIdx*, S> clusterIdxSortedPerModule; - xpu::cmem_io_t<CbmStsClusterData, S> clusterDataPerModule; - xpu::cmem_io_t<int, S> nClustersPerModule; - - // output - size_t maxHitsPerModule; - xpu::cmem_io_t<CbmStsHit, S> hitsPerModule; - xpu::cmem_io_t<int, S> nHitsPerModule; - }; - - class CbmStsGpuHitFinder : public CbmStsHitFinder<xpu::side::device> { - - public: - XPU_D void SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const; - XPU_D void FindClustersSingleStep(int iBlock) const; - XPU_D void SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const; - XPU_D void FindHits(int iBlock) const; - - // Individual steps of cluster finder, for more granular time measurement - XPU_D void CalculateOffsetsParallel(int iBlock) const; - XPU_D void FindClustersParallel(int iBlock) const; - XPU_D void CalculateClustersParallel(int iBlock) const; - - XPU_D void 
FindClusterConnectionsBasic(int iBlock) const; - XPU_D void CalculateClustersBasic(int iBlock) const; - - private: - XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const; - - XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - unsigned int* channelOffsets, int iModule, - unsigned int threadId) const; - XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - unsigned int* channelOffsets, int iModule, unsigned int threadId, - unsigned int nDigis) const; - - XPU_D void CalculateClustersBasic(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule, - unsigned int const threadId) const; - XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - unsigned int* channelOffsets, int iModule, unsigned int const threadId, - unsigned int const nDigis) const; - XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule, - unsigned int const threadId, unsigned int const nDigis) const; - - XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const; - XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - int const digiIndex) const; - XPU_D void CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, - int const digiIndex) const; - - private: - XPU_D unsigned int GetNDigis(int iModule) const - { - return digiOffsetPerModule[iModule + 1] - digiOffsetPerModule[iModule]; - } - - XPU_D float GetTimeResolution(int /* iModule */, int /* channel */) const { return asic.timeResolution; } - - XPU_D bool IsActive(int* channelStatus, int channel) const - { - // TODO add padding to channels to remove this? 
- if (channel < 0 || channel >= nChannels) { return false; } - return channelStatus[channel] > -1; - } - - XPU_D int ChanLeft(int channel) const { return channel - 1; } - - XPU_D int ChanRight(int channel) const { return channel + 1; } - - XPU_D int ChanDist(int c1, int c2) const - { - int diff = c2 - c1; - // TODO handle wrap around - return diff; - } - - XPU_D void AddCluster(int iModule, CbmStsTimeNs time, const CbmStsClusterData& cls) const - { - CbmStsClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule]; - CbmStsClusterData* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule]; - - int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1); - XPU_ASSERT(size_t(pos) < maxClustersPerModule); - - CbmStsClusterIdx idx {time, pos}; - tgtIdx[idx.fIdx] = idx; - tgtData[idx.fIdx] = cls; - } - - XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; } - - XPU_D XErrorT LandauWidth(float charge) const; - - XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const; - - XPU_D void IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF, - const CbmStsClusterData& clsF, CbmStsTimeNs timeB, - const CbmStsClusterData& clsB) const; - XPU_D float GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const; - XPU_D bool Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB, float& x, - float& y, float& varX, float& varY, float& varXY) const; - XPU_D bool IsInside(const CbmStsHitFinderParams& pars, float x, float y) const; - XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY, - CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB, - const CbmStsClusterData& clsB, float du, float dv) const; - - XPU_D void SaveMaxError(float errorValue, int iModule) const - { - - float* maxError = IsBackside(iModule) ? 
maxErrorBack : maxErrorFront; - float old {}; - do { - old = *maxError; - if (old >= errorValue) { break; } - } while (!xpu::atomic_cas_block(maxError, *maxError, xpu::max(errorValue, *maxError))); - } - }; - -} // namespace experimental - -XPU_EXPORT_CONSTANT(CbmStsGpuRecoDevice, ::experimental::CbmStsGpuHitFinder, HitFinder); - -#endif diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx deleted file mode 100644 index c8f1b62a75ed58a81264e79bfc8a6eb83cff7998..0000000000000000000000000000000000000000 --- a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main - SPDX-License-Identifier: GPL-3.0-only - Authors: Felix Weiglhofer [committer], Kilian Hunold */ -#include "CbmStsGpuRecoDevice.h" - -#include "CbmStsGpuHitFinder.h" - -using namespace experimental; - -XPU_IMAGE(CbmStsGpuRecoDevice); - -XPU_CONSTANT(HitFinder); - -XPU_KERNEL(SortDigis, CbmStsSortDigiSmem) { xpu::cmem<HitFinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x()); } - -XPU_KERNEL(FindClustersSingleStep, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersSingleStep(xpu::block_idx::x()); } - -XPU_KERNEL(CalculateOffsets, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateOffsetsParallel(xpu::block_idx::x()); } - -XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersParallel(xpu::block_idx::x()); } - -XPU_KERNEL(FindClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().FindClusterConnectionsBasic(xpu::block_idx::x()); } - -XPU_KERNEL(CalculateClusters, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersParallel(xpu::block_idx::x()); } - -XPU_KERNEL(CalculateClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersBasic(xpu::block_idx::x()); } - -XPU_KERNEL(SortClusters, CbmStsSortClusterSmem) { xpu::cmem<HitFinder>().SortClusters(smem, xpu::block_idx::x()); } - 
-XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<HitFinder>().FindHits(xpu::block_idx::x()); } diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h deleted file mode 100644 index fdb674e973a332d85984ca7c196a7350693255ba..0000000000000000000000000000000000000000 --- a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main - SPDX-License-Identifier: GPL-3.0-only - Authors: Felix Weiglhofer [committer], Kilian Hunold */ -#ifndef CBMSTSGPURECODEVICE_H -#define CBMSTSGPURECODEVICE_H 1 - -#include <xpu/device.h> - -// TODO: Tune these values for various gpus -#if XPU_IS_CUDA -#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 32 -#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 32 -#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 32 -#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 32 -#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 32 -#define CBM_STS_FIND_HITS_BLOCK_SIZE 32 -#else // HIP, values ignored on CPU -#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 256 -#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 6 -#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 256 -#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 6 -#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 256 -#define CBM_STS_FIND_HITS_BLOCK_SIZE 256 -#endif - -struct CbmStsGpuRecoDevice { -}; - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortDigis); -XPU_BLOCK_SIZE_1D(SortDigis, CBM_STS_SORT_DIGIS_BLOCK_SIZE); - -// Combine substeps for finding clusters into a single kernel -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersSingleStep); -XPU_BLOCK_SIZE_1D(FindClustersSingleStep, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateOffsets); -XPU_BLOCK_SIZE_1D(CalculateOffsets, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClusters); -XPU_BLOCK_SIZE_1D(FindClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); - 
-XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersBasic); - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClusters); -XPU_BLOCK_SIZE_1D(CalculateClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClustersBasic); - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortClusters); -XPU_BLOCK_SIZE_1D(SortClusters, CBM_STS_SORT_CLUSTERS_BLOCK_SIZE); - -XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindHits); -XPU_BLOCK_SIZE_1D(FindHits, CBM_STS_FIND_HITS_BLOCK_SIZE); - -#endif diff --git a/reco/tasks/CMakeLists.txt b/reco/tasks/CMakeLists.txt index d71ec0eaca2b5d84f41f801ead71f7481db94a7b..135c5a4f9813a4c42011cb484cc66ff981790bbf 100644 --- a/reco/tasks/CMakeLists.txt +++ b/reco/tasks/CMakeLists.txt @@ -69,3 +69,4 @@ if (HAS_STD_EXECUTION) endif() generate_cbm_library() +