From df1c517c74bc96364c2deb355285dcc6987a8a41 Mon Sep 17 00:00:00 2001 From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de> Date: Mon, 13 Jun 2022 14:27:43 +0000 Subject: [PATCH] StsReco: Add GPU Hitfinder besides existing local reconstruction. --- CMakeLists.txt | 5 + MQ/mcbm/CMakeLists.txt | 1 + MQ/source/CMakeLists.txt | 1 + algo/data/CMakeLists.txt | 1 + core/data/sts/CbmStsDigi.h | 59 +- core/detectors/sts/CbmStsPhysics.h | 2 + external/.gitignore | 1 + external/CMakeLists.txt | 15 +- macro/run/run_reco.C | 6 +- reco/CMakeLists.txt | 3 + reco/L1/L1Algo/L1CATrackFinder.cxx | 18 +- reco/detectors/sts/CMakeLists.txt | 27 +- reco/detectors/sts/CbmRecoSts.cxx | 187 +++- reco/detectors/sts/CbmRecoSts.h | 15 +- reco/detectors/sts/CbmStsAlgoAnaCluster.cxx | 9 +- reco/detectors/sts/CbmStsAlgoFindHits.cxx | 2 +- reco/detectors/sts/CbmStsFindTracks.cxx | 4 +- reco/detectors/sts/CbmStsRecoModule.cxx | 18 +- reco/detectors/sts/CbmStsRecoModule.h | 17 + .../sts/experimental/CbmGpuRecoSts.cxx | 404 ++++++++ .../sts/experimental/CbmGpuRecoSts.h | 78 ++ .../sts/experimental/CbmStsGpuHitFinder.cxx | 957 ++++++++++++++++++ .../sts/experimental/CbmStsGpuHitFinder.h | 309 ++++++ .../sts/experimental/CbmStsGpuRecoDevice.cxx | 30 + .../sts/experimental/CbmStsGpuRecoDevice.h | 55 + reco/mq/CMakeLists.txt | 5 +- 26 files changed, 2151 insertions(+), 78 deletions(-) create mode 100644 reco/detectors/sts/experimental/CbmGpuRecoSts.cxx create mode 100644 reco/detectors/sts/experimental/CbmGpuRecoSts.h create mode 100644 reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx create mode 100644 reco/detectors/sts/experimental/CbmStsGpuHitFinder.h create mode 100644 reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx create mode 100644 reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h diff --git a/CMakeLists.txt b/CMakeLists.txt index be903c4c3f..cf471aa6a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,6 +210,11 @@ endif() # Must be the first subdirectory since the defined targets are needed by # following targets add_subdirectory (external) +set (BASE_INCLUDE_DIRECTORIES +${BASE_INCLUDE_DIRECTORIES} +${XPU_INCLUDE_DIRECTORY} # Required for XPU_D macro in base data types +) + ### Base directories add_subdirectory (core) diff --git a/MQ/mcbm/CMakeLists.txt b/MQ/mcbm/CMakeLists.txt index a40ade8e90..57851cde09 100644 --- a/MQ/mcbm/CMakeLists.txt +++ b/MQ/mcbm/CMakeLists.txt @@ -49,6 +49,7 @@ set(INCLUDE_DIRECTORIES Set(SYSTEM_INCLUDE_DIRECTORIES ${SYSTEM_INCLUDE_DIRECTORIES} + ${BASE_INCLUDE_DIRECTORIES} ${ZeroMQ_INCLUDE_DIR} ${Boost_INCLUDE_DIR} ${FAIRROOT_INCLUDE_DIR} diff --git a/MQ/source/CMakeLists.txt b/MQ/source/CMakeLists.txt index f25d984625..54178de7b6 100644 --- a/MQ/source/CMakeLists.txt +++ b/MQ/source/CMakeLists.txt @@ -18,6 +18,7 @@ set(INCLUDE_DIRECTORIES ) Set(SYSTEM_INCLUDE_DIRECTORIES + ${BASE_INCLUDE_DIRECTORIES} ${SYSTEM_INCLUDE_DIRECTORIES} ${ZeroMQ_INCLUDE_DIR} ${Boost_INCLUDE_DIR} diff --git a/algo/data/CMakeLists.txt b/algo/data/CMakeLists.txt index 9e3c8b1bba..fd0ad4d94d 100644 --- a/algo/data/CMakeLists.txt +++ b/algo/data/CMakeLists.txt @@ -46,6 +46,7 @@ target_include_directories(OnlineData target_include_directories(OnlineData SYSTEM PUBLIC ${Boost_INCLUDE_DIR} PUBLIC ${FAIRLOGGER_INCLUDE_DIR} + PUBLIC ${XPU_INCLUDE_DIRECTORY} ) target_link_directories(OnlineData diff --git a/core/data/sts/CbmStsDigi.h b/core/data/sts/CbmStsDigi.h index e2e6a1de58..513234f7ff 100644 --- a/core/data/sts/CbmStsDigi.h +++ b/core/data/sts/CbmStsDigi.h @@ -16,10 +16,13 @@ #include 
"CbmDefs.h" // for ECbmModuleId::kSts #include "CbmStsAddress.h" +#include <xpu/defines.h> // for XPU_D + #ifndef NO_ROOT #include <Rtypes.h> // for ClassDef #endif + #include <boost/serialization/access.hpp> #include <boost/serialization/base_object.hpp> @@ -38,9 +41,10 @@ class CbmStsDigi { public: /** Default constructor **/ - CbmStsDigi() {} + CbmStsDigi() = default; +#if XPU_IS_CPU /** Standard constructor ** @param address Unique element address ** @param channel Channel number @@ -56,10 +60,10 @@ public: PackAddressAndTime(address, time); PackChannelAndCharge(channel, charge); } - +#endif /** Destructor **/ - ~CbmStsDigi() {}; + ~CbmStsDigi() = default; /** Unique detector element address (see CbmStsAddress) ** @value Unique address of readout channel @@ -76,7 +80,7 @@ public: /** @brief Channel number in module ** @value Channel number **/ - uint16_t GetChannel() const { return UnpackChannel(); } + XPU_D uint16_t GetChannel() const { return UnpackChannel(); } /** @brief Class name (static) @@ -84,11 +88,14 @@ public: **/ static const char* GetClassName() { return "CbmStsDigi"; } - +#if XPU_IS_CPU /** Charge ** @value Charge [ADC units] **/ double GetCharge() const { return static_cast<double>(UnpackCharge()); } +#endif + + XPU_D uint16_t GetChargeU16() const { return UnpackCharge(); } /** System ID (static) @@ -96,11 +103,14 @@ public: **/ static ECbmModuleId GetSystem() { return ECbmModuleId::kSts; } - +#if XPU_IS_CPU /** Time of measurement ** @value Time [ns] **/ double GetTime() const { return static_cast<double>(UnpackTime()); } +#endif + + XPU_D uint32_t GetTimeU32() const { return UnpackTime(); } template<class Archive> @@ -115,7 +125,7 @@ public: /** Update Time of measurement ** @param New Time [ns] **/ - void SetTime(double dNewTime) + XPU_D void SetTime(double dNewTime) { // StsDigi is not able to store negative timestamps. assert(dNewTime >= 0); @@ -124,18 +134,20 @@ public: PackTime(dNewTime); } - void SetChannel(uint16_t channel) { PackChannelAndCharge(channel, UnpackCharge()); } + XPU_D void SetChannel(uint16_t channel) { PackChannelAndCharge(channel, UnpackCharge()); } - void SetCharge(uint16_t charge) { PackChannelAndCharge(UnpackChannel(), charge); } + XPU_D void SetCharge(uint16_t charge) { PackChannelAndCharge(UnpackChannel(), charge); } +#if XPU_IS_CPU void SetAddress(int32_t address) { PackAddressAndTime(address, UnpackTime()); } +#endif /** Set new channel and charge. ** ** Slightly more efficient than calling both individual setters. **/ - void SetChannelAndCharge(uint16_t channel, uint16_t charge) { PackChannelAndCharge(channel, charge); } + XPU_D void SetChannelAndCharge(uint16_t channel, uint16_t charge) { PackChannelAndCharge(channel, charge); } /** Set new address and time at once. ** @@ -160,23 +172,32 @@ private: static constexpr uint32_t kMaxTimestamp = kTimestampMask; static constexpr uint32_t kTimeAddressBitMask = ~kTimestampMask; - uint32_t fTime = 0; ///< Time [ns] in lower 31 bits, highest bit is the 17th address bit. - uint16_t fChannelAndCharge = 0; ///< Channel number (lower 11 bits) and charge [ADC Units] in upper 5 bits. - uint16_t fAddress = 0; ///< Unique element address (lower 16 bits of 17) +// CUDA requires types in shared memory to have trivial constructors / destructors +#if XPU_IS_HIP_CUDA +#define CPU_ONLY(x) +#else +#define CPU_ONLY(x) x +#endif + uint32_t fTime CPU_ONLY(= 0); ///< Time [ns] in lower 31 bits, highest bit is the 17th address bit. 
+ uint16_t fChannelAndCharge CPU_ONLY(= 0); ///< Channel number (lower 11 bits) and charge [ADC Units] in upper 5 bits. + uint16_t fAddress CPU_ONLY(= 0); ///< Unique element address (lower 16 bits of 17) - void PackTime(uint32_t newTime) { fTime = (fTime & kTimeAddressBitMask) | (newTime & kTimestampMask); } - uint32_t UnpackTime() const { return fTime & kTimestampMask; } +#undef CPU_ONLY - void PackChannelAndCharge(uint16_t channel, uint16_t charge) + XPU_D void PackTime(uint32_t newTime) { fTime = (fTime & kTimeAddressBitMask) | (newTime & kTimestampMask); } + XPU_D uint32_t UnpackTime() const { return fTime & kTimestampMask; } + + + XPU_D void PackChannelAndCharge(uint16_t channel, uint16_t charge) { fChannelAndCharge = (channel << kNumAdcBits) | charge; } - uint16_t UnpackChannel() const { return fChannelAndCharge >> kNumAdcBits; } - uint16_t UnpackCharge() const { return fChannelAndCharge & kAdcMask; } - + XPU_D uint16_t UnpackChannel() const { return fChannelAndCharge >> kNumAdcBits; } + XPU_D uint16_t UnpackCharge() const { return fChannelAndCharge & kAdcMask; } + // Packing / Unpacking address not available on GPU for now... void PackAddressAndTime(int32_t address, uint32_t time); int32_t UnpackAddress() const; diff --git a/core/detectors/sts/CbmStsPhysics.h b/core/detectors/sts/CbmStsPhysics.h index 8e727cc5b8..5e5068b5a9 100644 --- a/core/detectors/sts/CbmStsPhysics.h +++ b/core/detectors/sts/CbmStsPhysics.h @@ -88,6 +88,8 @@ public: **/ Double_t LandauWidth(Double_t mostProbableCharge); + std::map<Double_t, Double_t> GetLandauWidthTable() const { return fLandauWidth; } + /** @brief Energy for electron-hole pair creation in silicon ** @return Pair creation energy [GeV] diff --git a/external/.gitignore b/external/.gitignore index 3ac2b17b84..f100b75241 100644 --- a/external/.gitignore +++ b/external/.gitignore @@ -4,6 +4,7 @@ KFParticle NicaFemto Vc cppzmq +xpu flib_dpb/flib_dpb flesnet/ jsroot diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index fe9f1da403..70e591d961 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -35,14 +35,15 @@ if(DOWNLOAD_EXTERNALS) endif() download_project_if_needed(PROJECT xpu - GIT_REPOSITORY "https://git.cbm.gsi.de/fweig/xpu.git" - GIT_TAG "v0.6.4" - SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/xpu - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" + GIT_REPOSITORY "https://github.com/fweig/xpu" + GIT_TAG "v0.7.2" + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/xpu + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" ) Add_Subdirectory(xpu) + Set(XPU_INCLUDE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/xpu/src PARENT_SCOPE) Include(InstallKFParticle.cmake) Include(InstallNicaFemto.cmake) @@ -76,7 +77,7 @@ else() # Define an empty macro such that ctest is happy when no externals are # available - # This is needed since to speed up the execution of the code format check + # This is needed since to speed up the execution of the code format check # no externals are downloaded macro(xpu_attach) endmacro() diff --git a/macro/run/run_reco.C b/macro/run/run_reco.C index 7a242c6c88..450e632bb3 100644 --- a/macro/run/run_reco.C +++ b/macro/run/run_reco.C @@ -75,7 +75,7 @@ ** selects the ideal raw event builder, which associates digis to events ** based on the MC truth. The option "Real" selects a real raw event builder ** (latest version, for older versions use "Real2018" or "Real2019"). - ** + ** ** ** The file names must be specified without extensions. The convention is ** that the raw (input) file is [input].raw.root. 
The output file @@ -106,6 +106,8 @@ void run_reco(TString input = "", Int_t nTimeSlices = -1, Int_t firstTimeSlice = TString srcDir = gSystem->Getenv("VMCWORKDIR"); // top source directory // ------------------------------------------------------------------------ + // Other settings + bool stsRecoUseGpu = false; // ----- In- and output file names ------------------------------------ if (input.IsNull()) input = "test"; @@ -278,6 +280,7 @@ void run_reco(TString input = "", Int_t nTimeSlices = -1, Int_t firstTimeSlice = // ----- Local reconstruction in STS ---------------------------------- if (useSts) { CbmRecoSts* stsReco = new CbmRecoSts(ECbmRecoMode::kCbmRecoTimeslice); + stsReco->SetUseGpuReco(stsRecoUseGpu); if (eventBased) stsReco->SetMode(ECbmRecoMode::kCbmRecoEvent); run->AddTask(stsReco); std::cout << "-I- " << myName << ": Added task " << stsReco->GetName() << std::endl; @@ -465,7 +468,6 @@ void run_reco(TString input = "", Int_t nTimeSlices = -1, Int_t firstTimeSlice = } //? time-based reco - // ----- Parameter database -------------------------------------------- std::cout << std::endl << std::endl; std::cout << "-I- " << myName << ": Set runtime DB" << std::endl; diff --git a/reco/CMakeLists.txt b/reco/CMakeLists.txt index ec5b6e588c..eaad23123e 100644 --- a/reco/CMakeLists.txt +++ b/reco/CMakeLists.txt @@ -1,6 +1,9 @@ # CMakeList file for libraries CbmRecoXXXXX # P.-A. Loizeau, 7 February 2020 +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address") +# set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fsanitize=address") + add_subdirectory(base) add_subdirectory(calibration) add_subdirectory(detectors) diff --git a/reco/L1/L1Algo/L1CATrackFinder.cxx b/reco/L1/L1Algo/L1CATrackFinder.cxx index 6c03eb47b1..39473695ab 100644 --- a/reco/L1/L1Algo/L1CATrackFinder.cxx +++ b/reco/L1/L1Algo/L1CATrackFinder.cxx @@ -1543,7 +1543,7 @@ inline void L1Algo::TripletsStaPort( /// creates triplets: /***********************************************************************************************/ /** - * * + * * * ------ CATrackFinder procedure ------ * * * **************************************************************************************************/ @@ -1868,21 +1868,21 @@ void L1Algo::CATrackFinder() /// possible left hits of triplets are splited in portions of 16 (4 SIMDs) to use memory faster fDupletPortionStopIndex[fNstations-1] = 0; unsigned int ip = 0; //index of curent portion - + for (int istal = fNstations-2; istal >= fFirstCAstation; istal--) //start downstream chambers { int nHits = HitsUnusedStopIndex[istal] - HitsUnusedStartIndex[istal]; - + int NHits_P = nHits/Portion; - + for( int ipp = 0; ipp < NHits_P; ipp++ ) { n_g1[ip] = Portion; ip++; } // start_lh - portion of left hits - + n_g1[ip] = nHits - NHits_P*Portion; - + ip++; fDupletPortionStopIndex[istal] = ip; }// lstations @@ -2475,7 +2475,7 @@ void L1Algo::CATrackFinder() #if 0 static long int NTimes =0, NHits=0, HitSize =0, NStrips=0, StripSize =0, NStripsB=0, StripSizeB =0, NDup=0, DupSize=0, NTrip=0, TripSize=0, NBranches=0, BranchSize=0, NTracks=0, TrackSize=0 ; - + NTimes++; NHits += vHitsUnused->size(); HitSize += vHitsUnused->size()*sizeof(L1Hit); @@ -2487,13 +2487,13 @@ void L1Algo::CATrackFinder() DupSize += stat_max_n_dup*sizeof(/*short*/ int); NTrip += stat_max_n_trip; TripSize += stat_max_trip_size; - + NBranches += stat_max_n_branches; BranchSize += stat_max_BranchSize; NTracks += fTracks.size(); TrackSize += sizeof(L1Track)*fTracks.size() + sizeof(L1HitIndex_t)*fRecoHits.size(); int k = 
1024*NTimes; - + cout<<"L1 Event size: \n" <<HitSize/k<<"kB for "<<NHits/NTimes<<" hits, \n" <<StripSize/k<<"kB for "<<NStrips/NTimes<<" strips, \n" diff --git a/reco/detectors/sts/CMakeLists.txt b/reco/detectors/sts/CMakeLists.txt index 8386c5ad44..376f5f640c 100644 --- a/reco/detectors/sts/CMakeLists.txt +++ b/reco/detectors/sts/CMakeLists.txt @@ -28,6 +28,16 @@ unpack/CbmStsUnpackMonitor.cxx qa/CbmStsFindTracksQa.cxx ) + +set(DEVICE_SRCS +experimental/CbmStsGpuRecoDevice.cxx +experimental/CbmStsGpuHitFinder.cxx +) + +set(NO_DICT_SRCS +${DEVICE_SRCS} +experimental/CbmGpuRecoSts.cxx +) # ----- End of sources --------------------------------- @@ -39,6 +49,10 @@ set(INCLUDE_DIRECTORIES ${CBMROOT_SOURCE_DIR}/reco/detectors/sts ${CBMROOT_SOURCE_DIR}/reco/detectors/sts/unpack ${CBMROOT_SOURCE_DIR}/reco/detectors/sts/qa +${CBMROOT_SOURCE_DIR}/reco/detectors/sts/experimental + +# external +${CBMROOT_SOURCE_DIR}/external/xpu/src # Reco ${CBMROOT_SOURCE_DIR}/reco/base @@ -82,7 +96,7 @@ ${Boost_LIBRARY_DIRS} # ----- Specify library dependences ------------------- Set(DEPENDENCIES - CbmBase CbmData CbmQaBase CbmMvd CbmRecoBase CbmStsBase + CbmBase CbmData CbmQaBase CbmMvd CbmRecoBase CbmStsBase xpu ) # --------------------------------------------------------- @@ -94,10 +108,11 @@ set(LINKDEF ${LIBRARY_NAME}LinkDef.h) # ---- Enable OpenMP ------------------------------------- # Comented since the OpenMP code is also commented in the moment -#if (OPENMP_FOUND) +# if (OPENMP_FOUND) +# message(STATUS "OPENMP_FOUND = TRUE") # set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") # set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") -#endif() +# endif() # --------------------------------------------------------- @@ -106,4 +121,10 @@ include_directories( ${INCLUDE_DIRECTORIES}) include_directories(SYSTEM ${SYSTEM_INCLUDE_DIRECTORIES}) link_directories( ${LINK_DIRECTORIES}) GENERATE_LIBRARY() +xpu_attach(CbmRecoSts ${DEVICE_SRCS}) + +# FIXME: Why is this needed to install headers in subfolder? 
+install(DIRECTORY experimental/ TYPE INCLUDE + FILES_MATCHING PATTERN "*.h" +) # --------------------------------------------------------- diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx index 3d9d0230b8..49be2bf19d 100644 --- a/reco/detectors/sts/CbmRecoSts.cxx +++ b/reco/detectors/sts/CbmRecoSts.cxx @@ -13,11 +13,13 @@ #include "CbmDigiManager.h" #include "CbmEvent.h" #include "CbmStsDigi.h" +#include "CbmStsGpuHitFinder.h" #include "CbmStsModule.h" #include "CbmStsParSetModule.h" #include "CbmStsParSetSensor.h" #include "CbmStsParSetSensorCond.h" #include "CbmStsParSim.h" +#include "CbmStsPhysics.h" #include "CbmStsRecoModule.h" #include "CbmStsSetup.h" @@ -31,6 +33,10 @@ #include <iomanip> +#if __has_include(<omp.h>) +#include <omp.h> +#endif + using std::fixed; using std::left; using std::right; @@ -40,15 +46,13 @@ using std::stringstream; using std::vector; -ClassImp(CbmRecoSts) - +ClassImp(CbmRecoSts); - // ----- Constructor --------------------------------------------------- - CbmRecoSts::CbmRecoSts(ECbmRecoMode mode, Bool_t writeClusters, Bool_t runParallel) +// ----- Constructor --------------------------------------------------- +CbmRecoSts::CbmRecoSts(ECbmRecoMode mode, Bool_t writeClusters) : FairTask("RecoSts", 1) , fMode(mode) , fWriteClusters(writeClusters) - , fRunParallel(runParallel) { } // ------------------------------------------------------------------------- @@ -64,6 +68,11 @@ UInt_t CbmRecoSts::CreateModules() { assert(fSetup); + + std::vector<CbmStsParModule> gpuModules; // for gpu reco + std::vector<int> moduleAddrs; + std::vector<experimental::CbmStsHitFinderConfig> hfCfg; + for (Int_t iModule = 0; iModule < fSetup->GetNofModules(); iModule++) { // --- Setup module and sensor @@ -80,6 +89,9 @@ UInt_t CbmRecoSts::CreateModules() const CbmStsParSensor& sensPar = fParSetSensor->GetParSensor(sensorAddress); const CbmStsParSensorCond& sensCond = fParSetCond->GetParSensor(sensorAddress); + gpuModules.push_back(modPar); + moduleAddrs.push_back(moduleAddress); + // --- Calculate and set average Lorentz shift // --- This will be used in hit finding for correcting the position. 
Double_t lorentzF = 0.; @@ -120,8 +132,26 @@ UInt_t CbmRecoSts::CreateModules() auto result = fModules.insert({moduleAddress, recoModule}); assert(result.second); fModuleIndex.push_back(recoModule); + hfCfg.push_back({ + .dY = sensPar.GetPar(3), + .pitch = sensPar.GetPar(6), + .stereoF = sensPar.GetPar(8), + .stereoB = sensPar.GetPar(9), + .lorentzF = float(lorentzF), + .lorentzB = float(lorentzB), + }); } + // Gpu hitfinder + experimental::CbmGpuRecoSts::Config gpuConfig { + .parModules = gpuModules, + .recoModules = fModuleIndex, + .moduleAddrs = moduleAddrs, + .hitfinderCfg = hfCfg, + .physics = CbmStsPhysics::Instance(), + }; + fGpuReco.SetConfig(gpuConfig); + return fModules.size(); } // ------------------------------------------------------------------------- @@ -156,9 +186,23 @@ void CbmRecoSts::Exec(Option_t*) CbmEvent* event = nullptr; // --- Time-slice mode: process entire array - if (fMode == kCbmRecoTimeslice) ProcessData(nullptr); - // --- Event mode: loop over events + if (fUseGpuReco) { + + ProcessDataGpu(); + fNofDigis = fGpuReco.nDigis; + fNofDigisUsed = fGpuReco.nDigisUsed; + fNofClusters = fGpuReco.nCluster; + fNofHits = fGpuReco.nHits; + + // Old reco in time based mode + } + else if (fMode == kCbmRecoTimeslice) { + + ProcessData(nullptr); + + // --- Event mode: loop over events + } else { assert(fEvents); nEvents = fEvents->GetEntriesFast(); @@ -194,6 +238,8 @@ void CbmRecoSts::Exec(Option_t*) fTime2Run += fTime2; fTime3Run += fTime3; fTime4Run += fTime4; + + // allDigis = fNofDigisUsed; } // ------------------------------------------------------------------------- @@ -201,12 +247,20 @@ void CbmRecoSts::Exec(Option_t*) // ----- End-of-run action --------------------------------------------- void CbmRecoSts::Finish() { + int ompThreads = -1; +#ifdef _OPENMP + ompThreads = omp_get_max_threads(); +#endif - std::cout << std::endl; Double_t digiCluster = Double_t(fNofDigisUsed) / Double_t(fNofClusters); Double_t clusterHit = Double_t(fNofClusters) / Double_t(fNofHits); LOG(info) << "====================================="; LOG(info) << GetName() << ": Run summary"; + if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction."; + else if (ompThreads < 0) + LOG(info) << "STS reconstruction ran single threaded (No OpenMP)."; + else + LOG(info) << "STS reconstruction ran multithreaded with OpenMP (nthreads = " << ompThreads << ")"; LOG(info) << "Time slices : " << fNofTs; if (fMode == kCbmRecoEvent) LOG(info) << "Events : " << fNofEvents; LOG(info) << "Digis / TSlice : " << fixed << setprecision(2) << fNofDigisRun / Double_t(fNofTs); @@ -218,19 +272,54 @@ void CbmRecoSts::Finish() LOG(info) << "Clusters per hit : " << fixed << setprecision(2) << clusterHit; LOG(info) << "Time per TSlice : " << fixed << setprecision(2) << 1000. * fTimeRun / Double_t(fNofTs) << " ms "; - fTimeRun /= Double_t(fNofEvents); - fTime1Run /= Double_t(fNofEvents); - fTime2Run /= Double_t(fNofEvents); - fTime3Run /= Double_t(fNofEvents); - fTime4Run /= Double_t(fNofEvents); - LOG(info) << "Time Reset : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1Run << " ms (" - << setprecision(1) << setw(4) << 100. * fTime1Run / fTimeRun << " %)"; - LOG(info) << "Time Distribute : " << fixed << setprecision(1) << setw(6) << 1000. * fTime2Run << " ms (" - << setprecision(1) << 100. * fTime2Run / fTimeRun << " %)"; - LOG(info) << "Time Reconstruct : " << fixed << setprecision(1) << setw(6) << 1000. * fTime3Run << " ms (" - << setprecision(1) << setw(4) << 100. 
* fTime3Run / fTimeRun << " %)"; - LOG(info) << "Time Output : " << fixed << setprecision(1) << setw(6) << 1000. * fTime4Run << " ms (" - << setprecision(1) << setw(4) << 100. * fTime4Run / fTimeRun << " %)"; + + // Aggregate times for substeps of reconstruction + // Note: These times are meaningless when reconstruction runs with > 1 thread. + CbmStsRecoModule::Timings timingsTotal; + for (const auto* m : fModuleIndex) { + auto moduleTime = m->GetTimings(); + timingsTotal.timeSortDigi += moduleTime.timeSortDigi; + timingsTotal.timeCluster += moduleTime.timeCluster; + timingsTotal.timeSortCluster += moduleTime.timeSortCluster; + timingsTotal.timeHits += moduleTime.timeHits; + } + + // Avoid division by zero if no events present + double nEvent = std::max(fNofEvents, 1); + + fTimeTot /= nEvent; + fTime1 /= nEvent; + fTime2 /= nEvent; + fTime3 /= nEvent; + fTime4 /= nEvent; + + if (not fUseGpuReco) { + LOG(info) << "NofEvents : " << fNofEvents; + LOG(info) << "Time Reset : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1 << " ms (" + << setprecision(1) << setw(4) << 100. * fTime1 / fTimeTot << " %)"; + LOG(info) << "Time Distribute : " << fixed << setprecision(1) << setw(6) << 1000. * fTime2 << " ms (" + << setprecision(1) << 100. * fTime2 / fTimeTot << " %)"; + LOG(info) << "Time Reconstruct: " << fixed << setprecision(1) << setw(6) << 1000. * fTime3 << " ms (" + << setprecision(1) << setw(4) << 100. * fTime3 / fTimeTot << " %)"; + LOG(info) << "Time by step:\n" + << " Sort Digi : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortDigi + << " ms\n" + << " Find Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeCluster + << " ms\n" + << " Sort Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortCluster + << " ms\n" + << " Find Hits : " << fixed << setprecision(1) << setw(6) << 1000. 
* timingsTotal.timeHits << " ms\n"; + } + else { + double gpuHitfinderTimeTotal = + fGpuReco.timeSortDigi + fGpuReco.timeCluster + fGpuReco.timeSortCluster + fGpuReco.timeHits; + LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms"; + LOG(info) << "Time by step:\n" + << " Sort Digi : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortDigi << " ms\n" + << " Find Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeCluster << " ms\n" + << " Sort Cluster: " << fixed << setprecision(1) << setw(6) << fGpuReco.timeSortCluster << " ms\n" + << " Find Hits : " << fixed << setprecision(1) << setw(6) << fGpuReco.timeHits << " ms"; + } LOG(info) << "====================================="; } // ------------------------------------------------------------------------- @@ -406,7 +495,9 @@ void CbmRecoSts::ProcessData(CbmEvent* event) Int_t nClusters = 0; Int_t nHits = 0; - // #pragma omp parallel for schedule(static) if(fParallelism_enabled) +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (UInt_t it = 0; it < fModuleIndex.size(); it++) { fModuleIndex[it]->Reset(); } @@ -450,7 +541,9 @@ void CbmRecoSts::ProcessData(CbmEvent* event) // --- Execute reconstruction in the modules fTimer.Start(); - // #pragma omp parallel for schedule(static) if(fParallelism_enabled) +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (UInt_t it = 0; it < fModuleIndex.size(); it++) { assert(fModuleIndex[it]); fModuleIndex[it]->Reconstruct(); @@ -458,7 +551,6 @@ void CbmRecoSts::ProcessData(CbmEvent* event) fTimer.Stop(); Double_t time3 = fTimer.RealTime(); // Time for reconstruction - // --- Collect clusters and hits from modules // Here, the hits (and optionally clusters) are copied to the // TClonesArrays in the ROOT tree. 
This is surely not the last word @@ -471,7 +563,8 @@ void CbmRecoSts::ProcessData(CbmEvent* event) for (UInt_t it = 0; it < fModuleIndex.size(); it++) { const vector<CbmStsCluster>& moduleClustersF = fModuleIndex[it]->GetClustersF(); - offsetClustersF = fClusters->GetEntriesFast(); + + offsetClustersF = fClusters->GetEntriesFast(); for (auto& cluster : moduleClustersF) { UInt_t index = fClusters->GetEntriesFast(); new ((*fClusters)[index]) CbmStsCluster(cluster); @@ -480,7 +573,8 @@ void CbmRecoSts::ProcessData(CbmEvent* event) } //# front-side clusters in module const vector<CbmStsCluster>& moduleClustersB = fModuleIndex[it]->GetClustersB(); - offsetClustersB = fClusters->GetEntriesFast(); + + offsetClustersB = fClusters->GetEntriesFast(); for (auto& cluster : moduleClustersB) { UInt_t index = fClusters->GetEntriesFast(); new ((*fClusters)[index]) CbmStsCluster(cluster); @@ -528,6 +622,14 @@ void CbmRecoSts::ProcessData(CbmEvent* event) } // ------------------------------------------------------------------------- +void CbmRecoSts::ProcessDataGpu() +{ + if (fMode == kCbmRecoEvent) throw std::runtime_error("STS GPU Reco does not yet support event-by-event mode."); + + fGpuReco.RunHitFinder(); + fGpuReco.ForwardClustersAndHits(fClusters, fHits); +} + // ----- Connect parameter container ----------------------------------- void CbmRecoSts::SetParContainers() @@ -539,3 +641,36 @@ void CbmRecoSts::SetParContainers() fParSetCond = dynamic_cast<CbmStsParSetSensorCond*>(db->getContainer("CbmStsParSetSensorCond")); } // ------------------------------------------------------------------------- + +void CbmRecoSts::DumpNewHits() +{ + std::ofstream out {"newHits.csv"}; + + out << "module, x, y, z, deltaX, deltaY, deltaZ, deltaXY, time, timeError, deltaU, deltaV" << std::endl; + + for (size_t m = 0; m < fModuleIndex.size(); m++) { + auto hits = fGpuReco.GetHits(m); + + for (const auto& h : hits) { + out << m << ", " << h.fX << ", " << h.fY << ", " << h.fZ << ", " << h.fDx << ", " << h.fDy << ", " << h.fDz + << ", " << h.fDxy << ", " << h.fTime << ", " << h.fTimeError << ", " << h.fDu << ", " << h.fDv << std::endl; + } + } +} + +void CbmRecoSts::DumpOldHits() +{ + std::ofstream out {"oldHits.csv"}; + + out << "module, x, y, z, deltaX, deltaY, deltaZ, deltaXY, time, timeError, deltaU, deltaV" << std::endl; + + for (size_t m = 0; m < fModuleIndex.size(); m++) { + const auto& hits = fModuleIndex[m]->GetHits(); + + for (const auto& h : hits) { + out << m << ", " << h.GetX() << ", " << h.GetY() << ", " << h.GetZ() << ", " << h.GetDx() << ", " << h.GetDy() + << ", " << h.GetDz() << ", " << h.GetDxy() << ", " << h.GetTime() << ", " << h.GetTimeError() << ", " + << h.GetDu() << ", " << h.GetDv() << std::endl; + } + } +} diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h index 15b5a804cd..f11af70cdb 100644 --- a/reco/detectors/sts/CbmRecoSts.h +++ b/reco/detectors/sts/CbmRecoSts.h @@ -11,12 +11,12 @@ #ifndef CBMRECOSTS_H #define CBMRECOSTS_H 1 +#include "CbmGpuRecoSts.h" #include <FairTask.h> #include <TStopwatch.h> - class CbmDigiManager; class CbmEvent; class CbmStsElement; @@ -64,7 +64,7 @@ class CbmRecoSts : public FairTask { public: /** @brief Constructor **/ - CbmRecoSts(ECbmRecoMode mode = kCbmRecoTimeslice, Bool_t writeClusters = kFALSE, Bool_t runParallel = kFALSE); + CbmRecoSts(ECbmRecoMode mode = kCbmRecoTimeslice, Bool_t writeClusters = kFALSE); /** @brief Copy constructor (disabled) **/ @@ -117,6 +117,8 @@ public: **/ void SetMode(ECbmRecoMode mode) { fMode = mode; } + 
void SetUseGpuReco(bool useGpu) { fUseGpuReco = useGpu; } + /** @brief Define the needed parameter containers **/ virtual void SetParContainers(); @@ -254,6 +256,12 @@ private: **/ void ProcessData(CbmEvent* event = nullptr); + void ProcessDataGpu(); + + void DumpNewHits(); + + void DumpOldHits(); + private: // --- I/O @@ -287,7 +295,6 @@ private: Double_t fTimeCutClustersSig = 4.; ///< Time cut for hit finding Double_t fTimeCutClustersAbs = -1.; ///< Time cut for hit finding [ns] Bool_t fWriteClusters = kFALSE; ///< Write clusters to tree - Bool_t fRunParallel = kFALSE; ///< Use OpenMP multithreading // --- Timeslice counters Long64_t fNofDigis = 0; ///< Total number of digis processed @@ -321,6 +328,8 @@ private: std::map<UInt_t, CbmStsRecoModule*> fModules {}; //! std::vector<CbmStsRecoModule*> fModuleIndex {}; //! + bool fUseGpuReco = false; + ::experimental::CbmGpuRecoSts fGpuReco; ClassDef(CbmRecoSts, 1); }; diff --git a/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx b/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx index 5bdd47aae8..5801e08b06 100644 --- a/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx +++ b/reco/detectors/sts/CbmStsAlgoAnaCluster.cxx @@ -19,7 +19,6 @@ using std::unique_ptr; - ClassImp(CbmStsAlgoAnaCluster) @@ -40,6 +39,10 @@ void CbmStsAlgoAnaCluster::AnaSize1(CbmStsCluster& cluster, const CbmStsParModul Int_t index = cluster.GetDigi(0); const CbmStsDigi* digi = fDigiMan->Get<CbmStsDigi>(index); assert(digi); + // printf("BEGIN CLUSTER CALC 1\n"); + // printf("Digi %d: channel=%d, time=%d, charge=%d\n", index, digi->GetChannel(), digi->GetTimeU32(), digi->GetChargeU16()); + // printf("END CLUSTER CALC\n"); + auto& asic = module->GetParAsic(digi->GetChannel()); Double_t x = Double_t(digi->GetChannel()); Double_t time = digi->GetTime(); @@ -118,6 +121,7 @@ void CbmStsAlgoAnaCluster::AnaSize2(CbmStsCluster& cluster, const CbmStsParModul } Double_t xError = TMath::Sqrt(ex0sq + ex1sq + ex2sq); + // Cluster charge Double_t charge = q1 + q2; @@ -147,10 +151,12 @@ void CbmStsAlgoAnaCluster::AnaSizeN(CbmStsCluster& cluster, const CbmStsParModul Int_t index = cluster.GetDigi(iDigi); const CbmStsDigi* digi = fDigiMan->Get<CbmStsDigi>(index); + assert(digi); Int_t channel = digi->GetChannel(); auto& asic = modPar->GetParAsic(channel); + // --- Uncertainties of the charge measurements Double_t eNoiseSq = asic.GetNoise() * asic.GetNoise(); Double_t chargePerAdc = asic.GetDynRange() / Double_t(asic.GetNofAdc()); @@ -183,6 +189,7 @@ void CbmStsAlgoAnaCluster::AnaSizeN(CbmStsCluster& cluster, const CbmStsParModul } //# digis in cluster + // Periodic channel position for clusters round the edge if (chanF > chanL) chanF -= modPar->GetNofChannels() / 2; diff --git a/reco/detectors/sts/CbmStsAlgoFindHits.cxx b/reco/detectors/sts/CbmStsAlgoFindHits.cxx index 9e1a4d88a2..5f2634b96f 100644 --- a/reco/detectors/sts/CbmStsAlgoFindHits.cxx +++ b/reco/detectors/sts/CbmStsAlgoFindHits.cxx @@ -23,7 +23,6 @@ using std::pair; using std::vector; - // ----- Constructor --------------------------------------------------- CbmStsAlgoFindHits::CbmStsAlgoFindHits() {} // ------------------------------------------------------------------------- @@ -124,6 +123,7 @@ Long64_t CbmStsAlgoFindHits::Exec(const vector<CbmStsCluster>& clustersF, const const CbmStsCluster& clusterB = clustersB[iClusterB]; Double_t timeDiff = tF - clusterB.GetTime(); + if ((timeDiff > 0) && (timeDiff > max_sigma_both)) { startB++; continue; diff --git a/reco/detectors/sts/CbmStsFindTracks.cxx b/reco/detectors/sts/CbmStsFindTracks.cxx index 
0e4610fc65..638be437e6 100644 --- a/reco/detectors/sts/CbmStsFindTracks.cxx +++ b/reco/detectors/sts/CbmStsFindTracks.cxx @@ -178,8 +178,8 @@ InitStatus CbmStsFindTracks::Init() else if (fVerbose > 2) fDigiScheme->Print(kTRUE); cout << "-I- " << "STS digitisation scheme succesfully initialised" << endl; - cout << " Stations: " << fDigiScheme->GetNStations() - << ", Sectors: " << fDigiScheme->GetNSectors() << ", Channels: " + cout << " Stations: " << fDigiScheme->GetNStations() + << ", Sectors: " << fDigiScheme->GetNSectors() << ", Channels: " << fDigiScheme->GetNChannels() << endl; } */ diff --git a/reco/detectors/sts/CbmStsRecoModule.cxx b/reco/detectors/sts/CbmStsRecoModule.cxx index a0c2173258..fa95b5ea56 100644 --- a/reco/detectors/sts/CbmStsRecoModule.cxx +++ b/reco/detectors/sts/CbmStsRecoModule.cxx @@ -25,10 +25,10 @@ #include <TGeoBBox.h> #include <TGeoPhysicalNode.h> #include <TMath.h> +#include <TStopwatch.h> using std::pair; - ClassImp(CbmStsRecoModule) @@ -77,6 +77,10 @@ void CbmStsRecoModule::AddDigiToQueue(const CbmStsDigi* digi, Int_t digiIndex) void CbmStsRecoModule::Reconstruct() { + // return; + TStopwatch timer; + + timer.Start(); // --- Sort the digi queues by digi time stamp std::sort(fDigisF.begin(), fDigisF.end(), [](pair<const CbmStsDigi*, Int_t> digi1, pair<const CbmStsDigi*, Int_t> digi2) { @@ -86,8 +90,11 @@ void CbmStsRecoModule::Reconstruct() [](pair<const CbmStsDigi*, Int_t> digi1, pair<const CbmStsDigi*, Int_t> digi2) { return digi1.first->GetTime() < digi2.first->GetTime(); }); + timer.Stop(); + fTimings.timeSortDigi = timer.RealTime(); // --- Perform cluster finding + timer.Start(); fClusterFinder->Exec(fDigisF, fClustersF, fSetupModule->GetAddress(), fNofStripsF, 0, fTimeCutDigisSig, fTimeCutDigisAbs, fConnectEdgeFront, fParModule); fClusterFinder->Exec(fDigisB, fClustersB, fSetupModule->GetAddress(), fNofStripsB, fNofStripsF, fTimeCutDigisSig, @@ -99,15 +106,22 @@ void CbmStsRecoModule::Reconstruct() for (auto& cluster : fClustersB) fClusterAna->Exec(cluster, fParModule); + timer.Stop(); + fTimings.timeCluster = timer.RealTime(); + // --- Sort clusters by time + timer.Start(); std::sort(fClustersF.begin(), fClustersF.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) { return (cluster1.GetTime() < cluster2.GetTime()); }); std::sort(fClustersB.begin(), fClustersB.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) { return (cluster1.GetTime() < cluster2.GetTime()); }); + timer.Stop(); + fTimings.timeSortCluster = timer.RealTime(); // --- Perform hit finding + timer.Start(); if (fHitFinder) fHitFinder->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig, fTimeCutClustersAbs, fDyActive, fNofStripsF, fStripPitchF, fStereoFront, fStereoBack, @@ -116,6 +130,8 @@ void CbmStsRecoModule::Reconstruct() fHitFinderOrtho->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig, fTimeCutClustersAbs, fNofStripsF, fNofStripsB, fStripPitchF, fStripPitchB, fLorentzShiftF, fLorentzShiftB, fMatrix); + timer.Stop(); + fTimings.timeHits = timer.RealTime(); } // ------------------------------------------------------------------------- diff --git a/reco/detectors/sts/CbmStsRecoModule.h b/reco/detectors/sts/CbmStsRecoModule.h index 7a591b329a..261ec35c21 100644 --- a/reco/detectors/sts/CbmStsRecoModule.h +++ b/reco/detectors/sts/CbmStsRecoModule.h @@ -51,6 +51,13 @@ class CbmStsParSensorCond; class CbmStsRecoModule : public TNamed { public: + struct Timings { + double timeSortDigi 
= 0; + double timeCluster = 0; + double timeSortCluster = 0; + double timeHits = 0; + }; + /** @brief Default constructor **/ CbmStsRecoModule(); @@ -105,6 +112,10 @@ public: **/ const std::vector<CbmStsHit>& GetHits() const { return fHits; } + /** @brief Time measurements + **/ + Timings GetTimings() const { return fTimings; } + /** @brief Perform reconstruction **/ void Reconstruct(); @@ -114,6 +125,9 @@ public: void Reset(); + TGeoHMatrix* getMatrix() { return fMatrix; } + + /** @brief Info to string **/ std::string ToString() const; @@ -201,6 +215,9 @@ private: Bool_t fConnectEdgeFront = kFALSE; ///< Round-the edge clustering front side Bool_t fConnectEdgeBack = kFALSE; ///< Round-the edge clustering back side + // --- Time measurement + Timings fTimings; + ClassDef(CbmStsRecoModule, 1); }; diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx b/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx new file mode 100644 index 0000000000..923cb2ce6d --- /dev/null +++ b/reco/detectors/sts/experimental/CbmGpuRecoSts.cxx @@ -0,0 +1,404 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ +#include "CbmGpuRecoSts.h" + +#include "CbmAddress.h" +#include "CbmDigiManager.h" +#include "CbmStsAddress.h" +#include "CbmStsCluster.h" +#include "CbmStsGpuRecoDevice.h" +#include "CbmStsHit.h" +#include "CbmStsPhysics.h" +#include "CbmStsRecoModule.h" + +#include <TCanvas.h> +#include <TClonesArray.h> +#include <TGeoMatrix.h> +#include <TH1D.h> +#include <TH2D.h> +#include <TStopwatch.h> + +#include <algorithm> +#include <cassert> +#include <unordered_set> +#include <vector> + +using namespace experimental; + +void CbmGpuRecoSts::SetupConstants() +{ + assert(fConfig != std::nullopt); + assert(fConfig->recoModules.size() == fConfig->parModules.size()); + assert(fConfig->moduleAddrs.size() == fConfig->parModules.size()); + assert(fConfig->hitfinderCfg.size() == fConfig->parModules.size()); + + size_t nModules = fConfig->parModules.size(); + + auto& hfc = fHitfinderCpu; + auto& hfg = fHitfinderGpu; + + LOG(info) << "STS GPU RECO: setting constants"; + + size_t nChannels = fConfig->parModules[0].GetNofChannels(); + ::CbmStsParAsic asic = fConfig->parModules[0].GetParAsic(0); // TODO values are currently identical for all asics?? 
+ // for (auto &mod : modules) { + // assert(nChannels == mod.GetNofChannels()); + // for (size_t c = 0; c < nChannels; c++) { + // ::CbmStsParAsic otherAsic = mod.GetParAsic(c); + // assert(asic.GetNofAdc() == otherAsic.GetNofAdc()); + // assert(asic.GetDynRange() == otherAsic.GetDynRange()); + // assert(asic.GetThreshold() == otherAsic.GetThreshold()); + // assert(asic.GetTimeResol() == otherAsic.GetTimeResol()); + // assert(asic.GetDeadTime() == otherAsic.GetDeadTime()); + // assert(asic.GetNoise() == otherAsic.GetNoise()); + // assert(asic.GetZeroNoiseRate() == otherAsic.GetZeroNoiseRate()); + // } + // } + + nChannels /= 2; + hfc.nModules = hfg.nModules = nModules; + hfc.nChannels = hfg.nChannels = nChannels; + + hfc.timeCutDigiAbs = hfg.timeCutDigiAbs = -1.f; + hfc.timeCutDigiSig = hfg.timeCutDigiSig = 3.f; + hfc.timeCutClusterAbs = hfg.timeCutClusterAbs = -1.f; + hfc.timeCutClusterSig = hfg.timeCutClusterSig = 4.f; + + hfc.asic.nAdc = asic.GetNofAdc(); + hfc.asic.dynRange = asic.GetDynRange(); + hfc.asic.threshold = asic.GetThreshold(); + hfc.asic.timeResolution = asic.GetTimeResol(); + hfc.asic.deadTime = asic.GetDeadTime(); + hfc.asic.noise = asic.GetNoise(); + hfc.asic.zeroNoiseRate = asic.GetZeroNoiseRate(); + hfg.asic = hfc.asic; +} + +void CbmGpuRecoSts::Setup(size_t maxDigisPerModule, size_t nDigitsTotal) +{ + assert(fConfig != std::nullopt); + + // static constexpr size_t maxDigisPerModule = 60000; + // static constexpr size_t maxClustersPerModule = 30000; + // static constexpr size_t maxHitsPerModule = 120000; + + const size_t maxClustersPerModule = maxDigisPerModule * 0.5; + const size_t maxHitsPerModule = maxClustersPerModule * 4; + + size_t nModules = fConfig->parModules.size(); + size_t nChannels = fConfig->parModules[0].GetNofChannels() / 2; + + + size_t nModulesTotal = nModules * 2; // Front- and backside + + auto& hfc = fHitfinderCpu; + auto& hfg = fHitfinderGpu; + + auto landauTable = fConfig->physics->GetLandauWidthTable(); + hfc.landauTableSize = hfg.landauTableSize = landauTable.size(); + hfc.landauTable = xpu::hd_buffer<float> {size_t(hfc.landauTableSize)}; + hfg.landauTable = hfc.landauTable.d(); + auto landauEntry = landauTable.begin(); + auto prevLandauEntry = landauEntry; + hfc.landauTable.h()[0] = landauEntry->second; + landauEntry++; + hfc.landauStepSize = hfg.landauStepSize = landauEntry->first - prevLandauEntry->first; + for (size_t i = 1; landauEntry != landauTable.end(); i++, landauEntry++) { + assert(hfc.landauStepSize == landauEntry->first - prevLandauEntry->first); + assert(i < hfc.landauTable.size()); + hfc.landauTable.h()[i] = landauEntry->second; + prevLandauEntry = landauEntry; + } + + LOG(info) << "STS GPU RECO: finished landau"; + + hfc.localToGlobalTranslationByModule = xpu::hd_buffer<float> {nModules * 3}; + hfg.localToGlobalTranslationByModule = hfc.localToGlobalTranslationByModule.d(); + hfc.localToGlobalRotationByModule = xpu::hd_buffer<float> {nModules * 9}; + hfg.localToGlobalRotationByModule = hfc.localToGlobalRotationByModule.d(); + for (size_t m = 0; m < fConfig->recoModules.size(); m++) { + auto& recoMod = fConfig->recoModules[m]; + TGeoHMatrix* matrix = recoMod->getMatrix(); + + const double* rot = matrix->GetRotationMatrix(); + float* tgt = &hfc.localToGlobalRotationByModule.h()[m * 9]; + std::copy_n(rot, 9, tgt); + + const double* tr = matrix->GetTranslation(); + tgt = &hfc.localToGlobalTranslationByModule.h()[m * 3]; + std::copy_n(tr, 3, tgt); + } + + LOG(info) << "STS GPU RECO: finished transformation matrix"; + + 
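A minimal sketch (not from the patch) of how the flattened per-module transform filled above could be consumed on the device side to take a local hit position to global coordinates, assuming the same row-major layout that TGeoHMatrix::LocalToMaster uses; m, localX/Y/Z and the output array are placeholder names:

  const float* rot = &localToGlobalRotationByModule[m * 9];   // 3x3 rotation, row-major
  const float* tr  = &localToGlobalTranslationByModule[m * 3];
  float local[3]   = {localX, localY, localZ};
  float global[3];
  for (int i = 0; i < 3; i++) {
    global[i] = tr[i] + rot[3 * i + 0] * local[0] + rot[3 * i + 1] * local[1] + rot[3 * i + 2] * local[2];
  }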
hfc.hitfinderConfig = xpu::hd_buffer<CbmStsHitFinderConfig>(nModules); + hfg.hitfinderConfig = hfc.hitfinderConfig.d(); + + std::copy_n(fConfig->hitfinderCfg.begin(), nModules, hfc.hitfinderConfig.h()); + + LOG(info) << "STS GPU RECO: finished setting hitfinder config"; + + // FIXME: allocate by total number of digis instead. This array is flat. + hfc.digiOffsetPerModule = xpu::hd_buffer<size_t> {nModulesTotal + 1}; + hfg.digiOffsetPerModule = hfc.digiOffsetPerModule.d(); + hfc.digisPerModule = xpu::hd_buffer<CbmStsDigi> {nDigitsTotal}; + hfg.digisPerModule = hfc.digisPerModule.d(); + hfc.digisPerModuleTmp = xpu::d_buffer<CbmStsDigi> {nDigitsTotal}; + hfg.digisPerModuleTmp = hfc.digisPerModuleTmp.d(); + hfc.digisSortedPerModule = xpu::d_buffer<CbmStsDigi*> {nModulesTotal}; + hfg.digisSortedPerModule = hfc.digisSortedPerModule.d(); + hfc.digiConnectorPerModule = xpu::d_buffer<CbmStsDigiConnector> {nDigitsTotal}; + hfg.digiConnectorPerModule = hfc.digiConnectorPerModule.d(); + + hfc.channelOffsetPerModule = xpu::d_buffer<unsigned int> {nModulesTotal * nChannels}; + hfg.channelOffsetPerModule = hfc.channelOffsetPerModule.d(); + + hfc.maxErrorFront = xpu::d_buffer<float> {1}; + hfg.maxErrorFront = hfc.maxErrorFront.d(); + hfc.maxErrorBack = xpu::d_buffer<float> {1}; + hfg.maxErrorBack = hfc.maxErrorBack.d(); + + LOG(info) << "STS GPU RECO: finished digis"; + + hfc.maxClustersPerModule = hfg.maxClustersPerModule = maxClustersPerModule; + hfc.channelStatusPerModule = xpu::d_buffer<int> {nChannels * nModulesTotal}; + hfg.channelStatusPerModule = hfc.channelStatusPerModule.d(); + hfc.clusterIdxPerModule = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal}; + hfg.clusterIdxPerModule = hfc.clusterIdxPerModule.d(); + hfc.clusterIdxPerModuleTmp = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal}; + hfg.clusterIdxPerModuleTmp = hfc.clusterIdxPerModuleTmp.d(); + hfc.clusterIdxSortedPerModule = xpu::hd_buffer<CbmStsClusterIdx*> {nModulesTotal}; + hfg.clusterIdxSortedPerModule = hfc.clusterIdxSortedPerModule.d(); + hfc.clusterDataPerModule = xpu::hd_buffer<CbmStsClusterData> {maxClustersPerModule * nModulesTotal}; + hfg.clusterDataPerModule = hfc.clusterDataPerModule.d(); + hfc.nClustersPerModule = xpu::hd_buffer<int> {nModulesTotal}; + hfg.nClustersPerModule = hfc.nClustersPerModule.d(); + + LOG(info) << "STS GPU RECO: finished cluster"; + + hfc.maxHitsPerModule = hfg.maxHitsPerModule = maxHitsPerModule; + hfc.hitsPerModule = xpu::hd_buffer<CbmStsHit>(maxHitsPerModule * nModules); + hfg.hitsPerModule = hfc.hitsPerModule.d(); + hfc.nHitsPerModule = xpu::hd_buffer<int>(nModules); + hfg.nHitsPerModule = hfc.nHitsPerModule.d(); + + LOG(info) << "STS GPU RECO: finished setup"; + LOG(info) << "- nModules = " << nModules; + LOG(info) << "- nChannels = " << nChannels; +} + +void CbmGpuRecoSts::RunHitFinder() +{ + setenv("XPU_PROFILE", "1", 1); + xpu::initialize(); + + auto& hfc = fHitfinderCpu; + + // ROCm bug: Mi100 name is an emtpy string... + auto deviceName = (xpu::device_properties().name.empty()) ? "AMD Mi100" : xpu::device_properties().name; + + LOG(info) << "STS GPU RECO: running GPU hit finder on device " << deviceName; + LOG(info) << "STS GPU RECO: Allocate and fetch digis"; + + + // FIXME: This is ugly, setup has to be split in two parts, constants required by FetchDigis + // And FetchDigis required by Setup... 
+ SetupConstants(); + size_t maxDigisPerModule = 0; + size_t nDigisTotal = 0; + FetchDigis(maxDigisPerModule, nDigisTotal); + Setup(maxDigisPerModule, nDigisTotal); + + LOG(info) << "STS GPU RECO: finished digis / running clusterizer now"; + LOG(info) << "STS GPU RECO: largest module has " << maxDigisPerModule << " digis"; + + // Not all buffers have to initialized, but useful for debugging + // xpu::memset(hfc.digisPerModule, 0); + // xpu::memset(hfc.digisPerModuleTmp, 0); + // xpu::memset(hfc.digisSortedPerModule, 0); + // xpu::memset(hfc.digiOffsetPerModule, 0); + xpu::memset(hfc.digiConnectorPerModule, 0); + // xpu::memset(hfc.channelOffsetPerModule, 0); + xpu::memset(hfc.channelStatusPerModule, -1); + xpu::memset(hfc.clusterIdxPerModule, 0); + // xpu::memset(hfc.clusterIdxPerModuleTmp, 0); + // xpu::memset(hfc.clusterIdxSortedPerModule, 0); + // xpu::memset(hfc.clusterDataPerModule, 0); + xpu::memset(hfc.nClustersPerModule, 0); + // xpu::memset(hfc.hitsPerModule, 0); + xpu::memset(hfc.nHitsPerModule, 0); + xpu::memset(hfc.maxErrorFront, 0); + xpu::memset(hfc.maxErrorBack, 0); + + hfc.digiOffsetPerModule.h()[0] = 0; + for (int m = 0; m < hfc.nModules; m++) { + size_t offset = hfc.digiOffsetPerModule.h()[m]; + auto& md = fDigisByModuleF[fConfig->moduleAddrs[m]]; + std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]); + hfc.digiOffsetPerModule.h()[m + 1] = offset + md.size(); + } + for (int m = 0; m < hfc.nModules; m++) { + size_t offset = hfc.digiOffsetPerModule.h()[m + hfc.nModules]; + auto& md = fDigisByModuleB[fConfig->moduleAddrs[m]]; + std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]); + hfc.digiOffsetPerModule.h()[m + hfc.nModules + 1] = offset + md.size(); + } + assert(nDigisTotal == hfc.digiOffsetPerModule.h()[2 * hfc.nModules]); + + TStopwatch timer; + + // constants only need to be transferred once. 
should be excluded from time measurement + xpu::copy(hfc.landauTable, xpu::host_to_device); + xpu::copy(hfc.hitfinderConfig, xpu::host_to_device); + xpu::copy(hfc.localToGlobalRotationByModule, xpu::host_to_device); + xpu::copy(hfc.localToGlobalTranslationByModule, xpu::host_to_device); + + xpu::set_constant<HitFinder>(fHitfinderGpu); + xpu::copy(hfc.digisPerModule, xpu::host_to_device); + xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device); + + xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(hfc.nModules * 2)); + xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(hfc.nModules * 2)); + // Run cluster finding steps in indivual kernels, useful for debugging / profiling + // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); + // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2)); + xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(hfc.nModules * 2)); + xpu::run_kernel<FindHits>(xpu::grid::n_blocks(hfc.nModules)); + + // Check if profiling is enabled + if (not xpu::get_timing<SortDigis>().empty()) { + timeSortDigi = xpu::get_timing<SortDigis>().back(); + timeCluster = xpu::get_timing<FindClustersSingleStep>().back(); + timeSortCluster = xpu::get_timing<SortClusters>().back(); + timeHits = xpu::get_timing<FindHits>().back(); + } + + xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host); + xpu::copy(hfc.nClustersPerModule, xpu::device_to_host); + xpu::copy(hfc.hitsPerModule, xpu::device_to_host); + xpu::copy(hfc.nHitsPerModule, xpu::device_to_host); + + + size_t mostCluster = 0, mostHits = 0; + for (int m = 0; m < hfc.nModules; m++) { + mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m], mostCluster); + mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m + hfc.nModules], mostCluster); + mostHits = std::max<size_t>(hfc.nHitsPerModule.h()[m], mostHits); + + nCluster += hfc.nClustersPerModule.h()[m]; + nCluster += hfc.nClustersPerModule.h()[m + hfc.nModules]; + + nHits += hfc.nHitsPerModule.h()[m]; + } + + LOG(info) << "STS GPU RECO: finished hitfinder"; +} + +void CbmGpuRecoSts::ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits) +{ + auto& hfc = fHitfinderCpu; + + for (int module = 0; module < hfc.nModules; module++) { + + auto* gpuClusterF = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule]; + auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule]; + int nClustersFGpu = hfc.nClustersPerModule.h()[module]; + + for (int i = 0; i < nClustersFGpu; i++) { + auto& cidx = gpuClusterIdxF[i]; + auto& clusterGpu = gpuClusterF[cidx.fIdx]; + + unsigned int outIdx = clusters->GetEntriesFast(); + auto* clusterOut = new ((*clusters)[outIdx])::CbmStsCluster {}; + + clusterOut->SetAddress(fConfig->moduleAddrs[module]); + clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime, + clusterGpu.fTimeError); + clusterOut->SetSize(clusterGpu.fSize); + } + + auto* gpuClusterB = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule]; + auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule]; + int nClustersBGpu = hfc.nClustersPerModule.h()[module + hfc.nModules]; + + for (int i = 0; i < nClustersBGpu; i++) { + auto& 
cidx = gpuClusterIdxB[i]; + auto& clusterGpu = gpuClusterB[cidx.fIdx]; + unsigned int outIdx = clusters->GetEntriesFast(); + auto* clusterOut = new ((*clusters)[outIdx])::CbmStsCluster {}; + + clusterOut->SetAddress(fConfig->moduleAddrs[module]); + clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime, + clusterGpu.fTimeError); + clusterOut->SetSize(clusterGpu.fSize); + } + + auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; + int nHitsGpu = hfc.nHitsPerModule.h()[module]; + + for (int i = 0; i < nHitsGpu; i++) { + auto& hitGpu = gpuHits[i]; + + unsigned int outIdx = hits->GetEntriesFast(); + new ((*hits)[outIdx])::CbmStsHit {fConfig->moduleAddrs[module], + TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ}, + TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz}, + hitGpu.fDxy, + 0, + 0, + double(hitGpu.fTime), + hitGpu.fTimeError, + hitGpu.fDu, + hitGpu.fDv}; + } + + } // for (int module = 0; module < hfc.nModules; module++) +} + +void CbmGpuRecoSts::FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal) +{ + + // FIXME: This function is a crime against humanity. + // Digis should be presorted by module already... + + CbmDigiManager* digis = CbmDigiManager::Instance(); + auto& hfc = fHitfinderCpu; + + // FIXME: GPU reco should use regular digi class too + nDigis = digis->GetNofDigis(ECbmModuleId::kSts); + nDigisTotal = 0; + for (size_t i = 0; i < nDigis; i++) { + const ::CbmStsDigi* digi = digis->Get<::CbmStsDigi>(i); + Int_t systemId = CbmAddress::GetSystemId(digi->GetAddress()); + // TODO: is this check still needed? + if (systemId != ToIntegralType(ECbmModuleId::kSts)) { continue; } + int moduleAddr = CbmStsAddress::GetMotherAddress(digi->GetAddress(), kStsModule); + // int module = addrToModuleId(moduleAddr); + // assert(module < hfc.nModules); + bool isFront = digi->GetChannel() < hfc.nChannels; + + if (isFront) { fDigisByModuleF[moduleAddr].push_back(*digi); } + else { + CbmStsDigi tmpdigi = *digi; + tmpdigi.SetChannel(digi->GetChannel() - hfc.nChannels); + fDigisByModuleB[moduleAddr].push_back(tmpdigi); + } + + nDigisTotal++; + } + + nDigisUsed = nDigisTotal; + + maxDigisPerModule = 0; + for (const auto& mod : fDigisByModuleF) { + maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size()); + } + + for (const auto& mod : fDigisByModuleB) { + maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size()); + } +} diff --git a/reco/detectors/sts/experimental/CbmGpuRecoSts.h b/reco/detectors/sts/experimental/CbmGpuRecoSts.h new file mode 100644 index 0000000000..a26efdbe91 --- /dev/null +++ b/reco/detectors/sts/experimental/CbmGpuRecoSts.h @@ -0,0 +1,78 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ +#ifndef CBMGPURECOSTS_H +#define CBMGPURECOSTS_H + +#include "CbmStsDigi.h" +#include "CbmStsGpuHitFinder.h" +#include "CbmStsParModule.h" + +#include <map> +#include <optional> +#include <vector> + +#include <xpu/host.h> + +class CbmStsPhysics; +class CbmStsRecoModule; +class TClonesArray; + +namespace experimental +{ + + class CbmGpuRecoSts { + + public: + struct Config { + std::vector<CbmStsParModule> parModules; + std::vector<CbmStsRecoModule*> recoModules; + std::vector<int> moduleAddrs; + std::vector<CbmStsHitFinderConfig> hitfinderCfg; + CbmStsPhysics* physics; + }; + + double timeIO = 0; + double timeSortDigi = 0; + double timeCluster = 0; + double timeSortCluster = 0; + 
double timeHits = 0; + + size_t nDigis = 0; + size_t nDigisUsed = 0; + size_t nHits = 0; + size_t nCluster = 0; + + void SetConfig(const Config& cfg) { fConfig = cfg; } + + void SetupConstants(); + void Setup(size_t maxDigisPerModule, size_t nDigitsTotal); + + void RunHitFinder(); + + void ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits); + + std::vector<CbmStsHit> GetHits(int module) + { + auto& hfc = fHitfinderCpu; + auto* st = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule]; + auto* end = st + hfc.nHitsPerModule.h()[module]; + return std::vector<CbmStsHit> {st, end}; + } + + private: + std::optional<Config> fConfig; // Initialized late... + + // std::vector<int> fModuleAddrs; + CbmStsHitFinder<xpu::side::host> fHitfinderCpu; + CbmStsGpuHitFinder fHitfinderGpu; + + std::map<int, std::vector<CbmStsDigi>> fDigisByModuleF; + std::map<int, std::vector<CbmStsDigi>> fDigisByModuleB; + + void FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal); + }; + +} // namespace experimental + +#endif diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx new file mode 100644 index 0000000000..c02877d377 --- /dev/null +++ b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.cxx @@ -0,0 +1,957 @@ +/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ +#include "CbmStsGpuHitFinder.h" + +using namespace experimental; + +/** + * CbmStsGpuHitFinder::sortDigisKhun + * Sorts digis channelwise. Inside a channel all digis are sorted in time. + * + * @param smem is the shared memory it is worked in + * @param iBlock is the Block/Module that is currenty running + * +*/ +XPU_D void CbmStsGpuHitFinder::SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const +{ + int iModule = iBlock; + CbmStsDigi* digis = &digisPerModule[digiOffsetPerModule[iModule]]; + CbmStsDigi* digisTmp = &digisPerModuleTmp[digiOffsetPerModule[iModule]]; + int nDigis = GetNDigis(iModule); + + SortDigisT digiSort(smem.sortBuf); + + digis = digiSort.sort(digis, nDigis, digisTmp, [](const CbmStsDigi a) { + return ((unsigned long int) a.GetChannel()) << 32 | (unsigned long int) (a.GetTimeU32()); + }); + + if (xpu::thread_idx::x() == 0) { digisSortedPerModule[iModule] = digis; } +} + +XPU_D void CbmStsGpuHitFinder::FindClustersSingleStep(int iBlock) const +{ + CalculateOffsetsParallel(iBlock); + FindClustersParallel(iBlock); + CalculateClustersParallel(iBlock); +} + +/** + * CbmStsGpuHitFinder::calculateChannelOffsets + * Calculates the Offsest of every channel. digis-Array is sorted in Channel and Time. + * If a channelChange is detected it is stored in channelOffsetsPerModule. 
+ *
+ * @param digis All Digis that are relevant for this Block
+ * @param channelOffsets The Array where all offsets are written to
+ * @param nDigis Number of digis in the digis array
+ *
+*/
+XPU_D void CbmStsGpuHitFinder::CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets,
+                                                       unsigned int const nDigis) const
+{
+  channelOffsets[0] = 0;
+
+  //Parallel
+  for (unsigned int pos = xpu::thread_idx::x(); pos < nDigis - 1; pos += (unsigned int) xpu::block_dim::x()) {
+    auto const currChannel = digis[pos].GetChannel();
+    auto const nextChannel = digis[pos + 1].GetChannel();
+    if (currChannel != nextChannel) {
+      for (int i = currChannel + 1; i <= nextChannel; i++) {
+        channelOffsets[i] = pos + 1;
+      }
+    }
+    XPU_ASSERT(digis[pos].GetChannel()
+               <= digis[pos + 1].GetChannel());  // channels are supposed to be sorted in increasing order
+  }
+
+  for (int c = digis[nDigis - 1].GetChannel() + 1; c < nChannels; c++) {
+    channelOffsets[c] = nDigis;
+  }
+}
+
+
+/**
+ * CbmStsGpuHitFinder::CalculateOffsetsParallel
+ * This function calculates the channel offsets in a certain module.
+ * An offset is the starting point of a channel in a sorted array of digis.
+ *
+ * @param iBlock is the Block/Module that is currently worked on
+ *
+*/
+XPU_D void CbmStsGpuHitFinder::CalculateOffsetsParallel(int iBlock) const
+{
+  int const iModule = iBlock;
+  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  auto const nDigis = GetNDigis(iModule);
+
+  if (nDigis == 0) return;
+
+  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  CalculateChannelOffsets(digis, channelOffsets, nDigis);
+}
+
+/**
+ * CbmStsGpuHitFinder::FindClustersParallel
+ * This function finds cluster connections in the sorted STS digi array of a module.
+ * It runs the finding process in parallel.
+ *
+ * @param iBlock is the Block/Module that is currently worked on
+ *
+*/
+XPU_D void CbmStsGpuHitFinder::FindClustersParallel(int iBlock) const
+{
+  int const iModule = iBlock;
+  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  auto const nDigis = GetNDigis(iModule);
+  unsigned int const threadId = xpu::thread_idx::x();
+
+  if (nDigis == 0) return;
+
+  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  // findClusterConnectionsChannelWise(digis, digiConnector, channelOffsets, iModule, threadId);
+  FindClusterConnectionsDigiWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
+}
+
+
+/**
+ * CbmStsGpuHitFinder::CalculateClustersParallel
+ * This function calculates the clusters found in the sorted STS digi array of a module.
+ * It runs the calculation in parallel.
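+ * (One GPU block processes one module; the work is delegated to CalculateClustersDigiWise,
+ * which distributes the digis over the threads of the block.)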
+ *
+ * @param iBlock is the Block/Module that is currently worked on
+ *
+*/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersParallel(int iBlock) const
+{
+  int const iModule = iBlock;
+  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  auto const nDigis = GetNDigis(iModule);
+  int const threadId = xpu::thread_idx::x();
+
+  if (nDigis == 0) { return; }
+
+  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  // auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  // calculateClustersChannelWise(digis, digiConnector, channelOffsets, iModule, threadId, nDigis);
+  CalculateClustersDigiWise(digis, digiConnector, iModule, threadId, nDigis);
+}
+
+
+/**
+ * CbmStsGpuHitFinder::FindClusterConnectionsBasic
+ *
+ * One Thread all Digis
+ *
+ * ChannelId: 00000 11111
+ * DigiIndex: 01234 56789
+ * ThreadId: 00000 00000
+ *
+ * Finds cluster connections in a basic way. One thread takes care of the whole calculation.
+ *
+ * @param iBlock is the Block/Module that is currently worked on
+ *
+**/
+XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsBasic(int iBlock) const
+{
+  int iModule = iBlock;
+  int threadId = xpu::thread_idx::x();
+  auto nDigis = GetNDigis(iModule);
+  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+  auto* channelOffsets = &channelOffsetPerModule[iModule * nChannels];
+
+  if (threadId != 0) return;
+
+  for (unsigned int currIter = 0; currIter < nDigis; currIter++) {
+    const CbmStsDigi& digi = digis[currIter];
+    uint16_t channel = digi.GetChannel();
+
+    if (channel == nChannels - 1) break;
+
+    //DeltaCalculation
+    float const tResol = GetTimeResolution(iModule, channel);
+    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+
+    unsigned int nextChannelStart = channelOffsets[channel + 1];
+    unsigned int nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
+
+    // printf("Comparing digis (deltaT=%f, nextChannelStart=%u, nextChannelEnd=%u, nDigis=%u):\n", deltaT,
+    //        nextChannelStart, nextChannelEnd, nDigis);
+    // printf("  channel=%d, time=%d, charge=%d\n", digi.GetChannel(), digi.GetTimeU32(), digi.GetChargeU16());
+
+    //Calculate DigiConnections
+    for (unsigned int nextIter = nextChannelStart; nextIter < nextChannelEnd; nextIter++) {
+
+      const CbmStsDigi& nextDigi = digis[nextIter];
+
+      // printf("> channel=%d, time=%d, charge=%d\n", nextDigi.GetChannel(), nextDigi.GetTimeU32(),
+      //        nextDigi.GetChargeU16());
+
+      if (digi.GetChannel() >= nextDigi.GetChannel()) continue;
+
+      if (float(digis[currIter].GetTimeU32() + deltaT) <= float(digis[nextIter].GetTimeU32())) break;
+      if (float(digis[currIter].GetTimeU32() - deltaT) >= float(digis[nextIter].GetTimeU32())) continue;
+
+      digiConnector[currIter].SetNext(nextIter);
+      digiConnector[nextIter].SetHasPrevious(true);
+      break;
+    }
+  }
+}
+
+
+/**
+ * CbmStsGpuHitFinder::FindClusterConnectionsChannelWise
+ *
+ * Each Thread one Channel
+ *
+ * ChannelId: 00000 11111
+ * DigiIndex: 01234 56789
+ * ThreadId: 00000 11111
+ *
+ * Finds cluster connections in parallel. Each thread gets the same amount of channels to work with.
+ * Every thread checks, for all digis in its channel, whether there is a counterpart to the current digi
+ * in the channel to its right. If there is such a digi, the two digis are connected inside the digiConnector array.
+ *
+ * @param digis All Digis that are relevant for this Block
+ * @param digiConnector Array where the connection between two digis is stored (next digi; has previous)
+ * @param channelOffsets The Array where all offsets are written to
+ * @param iModule The Module that the current Block is working on
+ * @param threadId Id of the thread currently working
+ *
+**/
+XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                                 unsigned int* channelOffsets, int const iModule,
+                                                                 unsigned int const threadId) const
+{
+  // Note: the loop stops before the last channel, because the last channel has no channel to its right.
+  for (unsigned int channel = threadId; channel < (unsigned int) (nChannels - 1); channel += xpu::block_dim::x()) {
+    unsigned int const currChannelBegin = channelOffsets[channel];      //Begin of current Channel
+    unsigned int const nextChannelBegin = channelOffsets[channel + 1];  //Begin of next (right) channel
+    auto const nextChannelEnd =
+      (channel + 2 < (unsigned int) nChannels) ? channelOffsets[channel + 2] : GetNDigis(iModule);
+    unsigned int nextIter = nextChannelBegin;
+
+    //Check if first Digi of Channel belongs to Channel
+    if (channel != digis[currChannelBegin].GetChannel()) continue;
+
+    //DeltaCalculation
+    float const tResol = GetTimeResolution(iModule, channel);
+    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+
+    //Calculate DigiConnections
+    for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) {
+      while (nextIter < nextChannelEnd
+             && ((digis[currIter].GetTimeU32() + deltaT) >= float(digis[nextIter].GetTimeU32()))) {
+        if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) { continue; }
+        if (digis[currIter].GetTimeU32() + deltaT < float(digis[nextIter].GetTimeU32())) {
+          nextIter++;
+          break;
+        }
+        if (digis[currIter].GetTimeU32() - deltaT > float(digis[nextIter].GetTimeU32())) {
+          nextIter++;
+          continue;
+        }
+
+        digiConnector[currIter].SetNext(nextIter);
+        digiConnector[nextIter].SetHasPrevious(true);
+        nextIter++;
+        break;
+      }
+    }
+  }
+}
+
+/**
+ * CbmStsGpuHitFinder::FindClusterConnectionsDigiWise
+ *
+ * Each thread one Digi
+ *
+ * ChannelId: 00000 11111
+ * DigiIndex: 01234 56789
+ * ThreadId: 01234 01234
+ *
+ * Finds cluster connections in parallel.
+ * The threads work on data that is stored nearby, taking advantage of data locality.
+ * Each thread takes one digi and checks whether there is a neighbour.
+ * This is repeated until all digis have been processed.
+ *
+ * @param digis All Digis that are relevant for this Block
+ * @param digiConnector Array where the connection between two digis is stored (next digi; has previous)
+ * @param channelOffsets The Array where all offsets are written to
+ * @param iModule The Module that the current Block is working on
+ * @param threadId Id of the thread currently working
+ * @param nDigis Number of digis in the digis array
+ *
+**/
+XPU_D void CbmStsGpuHitFinder::FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                              unsigned int* channelOffsets, int const iModule,
+                                                              unsigned int const threadId,
+                                                              unsigned int const nDigis) const
+{
+
+  for (unsigned int currIter = threadId; currIter < nDigis; currIter += xpu::block_dim::x()) {
+    auto const digi = digis[currIter];
+    auto const channel = digi.GetChannel();
+
+    if (channel == nChannels - 1) break;
+
+    //DeltaCalculation
+    float const tResol = GetTimeResolution(iModule, channel);
+    float const deltaT = (timeCutDigiAbs > 0.f ? timeCutDigiAbs : timeCutDigiSig * 1.4142f * tResol);
+
+    auto const nextChannelEnd = (channel + 2 < nChannels) ? channelOffsets[channel + 2] : nDigis;
+    //Calculate DigiConnections
+    for (auto nextIter = channelOffsets[channel + 1]; nextIter < nextChannelEnd; nextIter++) {
+      if (digis[currIter].GetChannel() >= digis[nextIter].GetChannel()) continue;
+      if (float(digis[currIter].GetTimeU32()) + deltaT < float(digis[nextIter].GetTimeU32())) break;
+      if (float(digis[currIter].GetTimeU32()) - deltaT > float(digis[nextIter].GetTimeU32())) continue;
+
+      digiConnector[currIter].SetNext(nextIter);
+      digiConnector[nextIter].SetHasPrevious(true);
+      break;
+    }
+  }
+}
+
+
+/**
+ * CbmStsGpuHitFinder::CalculateClustersBasic
+ *
+ * One Thread all Digis
+ *
+ * ChannelId: 00000 11111
+ * DigiIndex: 01234 56789
+ * ThreadId: 00000 00000
+ *
+ * Calculates the cluster charges of all found cluster connections.
+ * One thread walks through all digis and looks for connections. If a digi does not have a previous one,
+ * it is a cluster start and may be the start of either a single-element cluster, a double-element cluster
+ * or a multi-element cluster.
+ * All other threads return immediately.
+ *
+ * @param iBlock is the Block/Module that is currently worked on
+ *
+**/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersBasic(int iBlock) const
+{
+  int iModule = iBlock;
+  int threadId = xpu::thread_idx::x();
+  CbmStsDigi* digis = digisSortedPerModule[iModule];
+  auto* digiConnector = &digiConnectorPerModule[digiOffsetPerModule[iModule]];
+
+  if (threadId != 0) return;
+
+  for (unsigned int i = 0; i < GetNDigis(iModule); i++) {
+    if (digiConnector[i].HasPrevious()) { continue; }
+    if (!digiConnector[i].HasNext()) {
+      //if Cluster has 1 element
+      CreateClusterFromConnectors1(iModule, digis, i);
+    }
+    else if (!digiConnector[digiConnector[i].next()].HasNext()) {
+      //if Cluster has 2 elements
+      CreateClusterFromConnectors2(iModule, digis, digiConnector, i);
+    }
+    else {
+      //if Cluster has N elements
+      CreateClusterFromConnectorsN(iModule, digis, digiConnector, i);
+    }
+  }
+}
+
+
+/**
+ * CbmStsGpuHitFinder::CalculateClustersChannelWise
+ *
+ * Each Thread one Channel
+ *
+ * ChannelId: 00000 11111
+ * DigiIndex: 01234 56789
+ * ThreadId: 00000 11111
+ *
+ * Calculates the cluster charges of all found cluster connections.
+ * Each thread walks through all digis of one channel and looks for connections.
+ * If a digi does not have a previous one, it is a cluster start and may be the
+ * start of either a single-element cluster, a double-element cluster or
+ * a multi-element cluster.
+ *
+ * @param digis All Digis that are relevant for this Block
+ * @param digiConnector Array where the connection between two digis is stored (next digi; has previous)
+ * @param channelOffsets The Array where all offsets are written to
+ * @param iModule The Module that the current Block is working on
+ * @param threadId Id of the thread currently working
+ * @param nDigis Number of digis in the digis array
+ *
+**/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                            unsigned int* channelOffsets, int const iModule,
+                                                            unsigned int const threadId,
+                                                            unsigned int const nDigis) const
+{
+
+  for (unsigned int channel = threadId; channel < (unsigned int) nChannels;
+       channel += (unsigned int) xpu::block_dim::x()) {
+    unsigned int const currChannelBegin = channelOffsets[channel];
+    unsigned int const nextChannelBegin =
+      (channel + 1 < (unsigned int) nChannels) ? channelOffsets[channel + 1] : nDigis;
+
+    //Check if first Digi of Channel belongs to Channel
+    if (channel != digis[currChannelBegin].GetChannel()) { continue; }
+
+    //Calculate DigiConnections
+    for (auto currIter = currChannelBegin; currIter < nextChannelBegin; currIter++) {
+      if (digiConnector[currIter].HasPrevious()) { continue; }
+      if (!digiConnector[currIter].HasNext()) {
+        //if Cluster has 1 element
+        CreateClusterFromConnectors1(iModule, digis, currIter);
+      }
+      else if (!digiConnector[digiConnector[currIter].next()].HasNext()) {
+        //if Cluster has 2 elements
+        CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter);
+      }
+      else {
+        //if Cluster has N elements
+        CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter);
+      }
+    }
+  }
+}
+
+
+/**
+ * CbmStsGpuHitFinder::CalculateClustersDigiWise
+ *
+ * Each Thread one Digi
+ *
+ * ChannelId: 00000 11111
+ * DigiIndex: 01234 56789
+ * ThreadId: 01234 56789
+ *
+ * Calculates the cluster charges of all found cluster connections.
+ * Each thread takes one digi and looks for connections. When the thread is done, it takes the next digi.
+ * If a digi does not have a previous one, it is a cluster start and may be the
+ * start of either a single-element cluster, a double-element cluster or
+ * a multi-element cluster.
+ *
+ * @param digis All Digis that are relevant for this Block
+ * @param digiConnector Array where the connection between two digis is stored (next digi; has previous)
+ * @param iModule The Module that the current Block is working on
+ * @param threadId Id of the thread currently working
+ *
+**/
+XPU_D void CbmStsGpuHitFinder::CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector,
+                                                         int const iModule, unsigned int const threadId,
+                                                         unsigned int const nDigis) const
+{
+
+  for (unsigned int currIter = threadId; currIter < nDigis; currIter += (unsigned int) xpu::block_dim::x()) {
+
+    if (digiConnector[currIter].HasPrevious()) continue;
+
+    if (!digiConnector[currIter].HasNext()) {
+      //if Cluster has 1 element
+      CreateClusterFromConnectors1(iModule, digis, currIter);
+    }
+    else if (!digiConnector[digiConnector[currIter].next()].HasNext()) {
+      //if Cluster has 2 elements
+      CreateClusterFromConnectors2(iModule, digis, digiConnector, currIter);
+    }
+    else {
+      //if Cluster has N elements
+      CreateClusterFromConnectorsN(iModule, digis, digiConnector, currIter);
+    }
+  }
+}
+
+
+XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int digiIndex) const
+{
+  const CbmStsDigi& digi = digis[digiIndex];
+
+  CbmStsTimeNs time = digi.GetTimeU32();
+
+  const float timeError = asic.timeResolution;
+
+  CbmStsClusterData cluster {
+    .fCharge = asic.AdcToCharge(digi.GetChargeU16()),
+    .fSize = 1,
+    .fPosition = float(digi.GetChannel()) + (IsBackside(iModule) ? nChannels : 0.f),
+    .fPositionError = 1.f / xpu::sqrt(24.f),
+    .fTimeError = timeError,
+  };
+
+  SaveMaxError(timeError, iModule);
+
+  AddCluster(iModule, time, cluster);
+}
+
+XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis,
+                                                            CbmStsDigiConnector* digiConnector,
+                                                            int const digiIndex) const
+{
+
+  const CbmStsDigi& digi1 = digis[digiIndex];
+  const CbmStsDigi& digi2 = digis[digiConnector[digiIndex].next()];
+
+  float eNoiseSq = asic.noise * asic.noise;
+
+  float chargePerAdc = asic.dynRange / float(asic.nAdc);
+  float eDigitSq = chargePerAdc * chargePerAdc / 12.f;
+
+  float x1 = float(digi1.GetChannel());
+  float q1 = asic.AdcToCharge(digi1.GetChargeU16());
+  float q2 = asic.AdcToCharge(digi2.GetChargeU16());
+
+  // Periodic position for clusters round the edge
+  if (digi1.GetChannel() > digi2.GetChannel()) { x1 -= float(nChannels); }
+
+  // Uncertainties of the charge measurements
+  XErrorT width1 = LandauWidth(q1);
+  XErrorT eq1sq = width1 * width1 + eNoiseSq + eDigitSq;
+  XErrorT width2 = LandauWidth(q2);
+  XErrorT eq2sq = width2 * width2 + eNoiseSq + eDigitSq;
+
+  // Cluster time
+  float time = 0.5f * (digi1.GetTimeU32() + digi2.GetTimeU32());
+  float timeError = asic.timeResolution * 0.70710678f;  // 1/sqrt(2)
+
+  // Cluster position
+  float x = x1 + 0.5f + (q2 - q1) / 3.f / xpu::max(q1, q2);
+
+  // Correct negative position for clusters around the edge
+  if (x < -0.5f) { x += float(nChannels); }
+
+  // Uncertainty on cluster position. See software note.
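+  // The three contributions below are the intrinsic spread of the (q2 - q1) correction term and the
+  // charge uncertainties eq1sq and eq2sq propagated through x via the partial derivatives;
+  // the two branches are symmetric under exchanging q1 and q2.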
+ XErrorT ex0sq = 0.f; + XErrorT ex1sq = 0.f; + XErrorT ex2sq = 0.f; + if (q1 < q2) { + ex0sq = (q2 - q1) * (q2 - q1) / q2 / q2 / 72.f; + ex1sq = eq1sq / q2 / q2 / 9.f; + ex2sq = eq2sq * q1 * q1 / q2 / q2 / q2 / q2 / 9.f; + } + else { + ex0sq = (q2 - q1) * (q2 - q1) / q1 / q1 / 72.f; + ex1sq = eq1sq * q2 * q2 / q1 / q1 / q1 / q1 / 9.f; + ex2sq = eq2sq / q1 / q1 / 9.f; + } + XErrorT xError = xpu::sqrt(ex0sq + ex1sq + ex2sq); + + // Cluster charge + float charge = q1 + q2; + + if (IsBackside(iModule)) x += nChannels; + + CbmStsClusterData cls { + .fCharge = charge, + .fSize = 2, + .fPosition = x, + .fPositionError = xError, + .fTimeError = timeError, + }; + + SaveMaxError(timeError, iModule); + AddCluster(iModule, time, cls); +} + + +XPU_D void CbmStsGpuHitFinder::CreateClusterFromConnectorsN(int iModule, CbmStsDigi* digis, + CbmStsDigiConnector* digiConnector, int digiIndex) const +{ + CbmStsClusterCalculationProperties cProps; + + //This Lambda calculates all charges for a single digi inside of a cluster + auto calculateClusterCharges = [this, &cProps, &digis, &digiConnector](int index) { + CbmStsDigi digi = digis[index]; + float eNoiseSq = asic.noise * asic.noise; + float chargePerAdc = asic.dynRange / float(asic.nAdc); + float eDigitSq = chargePerAdc * chargePerAdc / 12.f; + cProps.tResolSum += asic.timeResolution; + cProps.tSum += digi.GetTimeU32(); + float charge = asic.AdcToCharge(digi.GetChargeU16()); + float lWidth = LandauWidth(charge); + float eChargeSq = lWidth * lWidth + eNoiseSq + eDigitSq; + + if (!digiConnector[index].HasPrevious()) { + //begin of Cluster (most left Element) + cProps.chanF = digi.GetChannel(); + cProps.qF = charge; + cProps.eqFsq = eChargeSq; + } + else if (!digiConnector[index].HasNext()) { + //end of Cluster (most right Element) + cProps.chanL = digi.GetChannel(); + cProps.qL = charge; + cProps.eqLsq = eChargeSq; + } + else { + cProps.qM += charge; + cProps.eqMsq += eChargeSq; + } + cProps.xSum += charge * float(digi.GetChannel()); + }; + + calculateClusterCharges(digiIndex); + + // Calculate ClusterCharges + while (digiConnector[digiIndex].HasNext()) { + digiIndex = digiConnector[digiIndex].next(); + calculateClusterCharges(digiIndex); + } + + // Periodic channel position for clusters round the edge + if (cProps.chanF > cProps.chanL) cProps.chanF -= nChannels; + + // Cluster time and total charge + float nDigis = ChanDist(cProps.chanF, cProps.chanL) + 1; + cProps.tSum = cProps.tSum / nDigis; + float timeError = (cProps.tResolSum / nDigis) / xpu::sqrt(nDigis); + float qSum = cProps.qF + cProps.qM + cProps.qL; + + // Average charge in middle strips + cProps.qM /= (nDigis - 2.f); + cProps.eqMsq /= (nDigis - 2.f); + + // Cluster position + float x = 0.5f * (float(cProps.chanF + cProps.chanL) + (cProps.qL - cProps.qF) / cProps.qM); + + // Correct negative cluster position for clusters round the edge + if (x < -0.5f) { x += float(nChannels); } + + // Cluster position error + float exFsq = cProps.eqFsq / cProps.qM / cProps.qM / 4.f; // error from first charge + float exMsq = cProps.eqMsq * (cProps.qL - cProps.qF) * (cProps.qL - cProps.qF) / cProps.qM / cProps.qM / cProps.qM + / cProps.qM / 4.f; + float exLsq = cProps.eqLsq / cProps.qM / cProps.qM / 4.f; + float xError = xpu::sqrt(exFsq + exMsq + exLsq); + + // Correction for corrupt clusters + if (x < cProps.chanF || x > cProps.chanL) { x = cProps.xSum / qSum; } + + XPU_ASSERT(x >= cProps.chanF && x <= cProps.chanL); + XPU_ASSERT(nDigis > 2); + + if (IsBackside(iModule)) { x += nChannels; } + + CbmStsClusterData 
cls { + .fCharge = qSum, + .fSize = int(nDigis), + .fPosition = x, + .fPositionError = xError, + .fTimeError = timeError, + }; + + SaveMaxError(timeError, iModule); + AddCluster(iModule, cProps.tSum, cls); +} + +XPU_D void CbmStsGpuHitFinder::SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const +{ + int iModule = iBlock; + size_t offset = iModule * maxClustersPerModule; + CbmStsClusterIdx* clusterIdx = &clusterIdxPerModule[offset]; + CbmStsClusterIdx* clusterIdxTmp = &clusterIdxPerModuleTmp[offset]; + int nClusters = nClustersPerModule[iModule]; + + SortClustersT clsSort(smem.sortBuf); + clusterIdx = clsSort.sort(clusterIdx, nClusters, clusterIdxTmp, [](const CbmStsClusterIdx a) { return a.fTime; }); + + if (xpu::thread_idx::x() == 0) clusterIdxSortedPerModule[iModule] = clusterIdx; +} + +XPU_D void CbmStsGpuHitFinder::FindHits(int iBlock) const +{ + int iModuleF = iBlock; + int iModuleB = iBlock + nModules; + size_t offsetF = iModuleF * maxClustersPerModule; + size_t offsetB = iModuleB * maxClustersPerModule; + CbmStsClusterIdx* clusterIdxF = clusterIdxSortedPerModule[iModuleF]; + CbmStsClusterIdx* clusterIdxB = clusterIdxSortedPerModule[iModuleB]; + CbmStsClusterData* clusterDataF = &clusterDataPerModule[offsetF]; + CbmStsClusterData* clusterDataB = &clusterDataPerModule[offsetB]; + int nClustersF = nClustersPerModule[iModuleF]; + int nClustersB = nClustersPerModule[iModuleB]; + + + CbmStsHitFinderParams pars; + { + CbmStsHitFinderConfig cfg = hitfinderConfig[iBlock]; + + pars.dY = cfg.dY; + pars.pitch = cfg.pitch; + pars.stereoF = cfg.stereoF; + pars.stereoB = cfg.stereoB; + pars.lorentzF = cfg.lorentzF; + pars.lorentzB = cfg.lorentzB; + pars.tanStereoF = xpu::tan(pars.stereoF * xpu::deg_to_rad()); + pars.tanStereoB = xpu::tan(pars.stereoB * xpu::deg_to_rad()); + pars.errorFac = 1.f / (pars.tanStereoB - pars.tanStereoF) / (pars.tanStereoB - pars.tanStereoF); + pars.dX = float(nChannels) * pars.pitch; + } + + float maxTerrF = *maxErrorFront; + float maxTerrB = *maxErrorBack; + + float maxSigmaBoth = 4.f * xpu::sqrt(maxTerrF * maxTerrF + maxTerrB * maxTerrB); + + int startB = 0; + for (int iClusterF = xpu::thread_idx::x(); iClusterF < nClustersF; iClusterF += xpu::block_dim::x()) { + CbmStsClusterIdx clsIdxF = clusterIdxF[iClusterF]; + CbmStsClusterData clsDataF = clusterDataF[clsIdxF.fIdx]; + + float maxSigma = 4.f * xpu::sqrt(clsDataF.fTimeError * clsDataF.fTimeError + maxTerrF * maxTerrF); + + for (int iClusterB = startB; iClusterB < nClustersB; iClusterB++) { + CbmStsClusterIdx clsIdxB = clusterIdxB[iClusterB]; + CbmStsClusterData clsDataB = clusterDataB[clsIdxB.fIdx]; + + float timeDiff = float(clsIdxF.fTime) - float(clsIdxB.fTime); + + if (timeDiff > 0 && timeDiff > maxSigmaBoth) { + startB++; + continue; + } + else if (timeDiff > 0 && timeDiff > maxSigma) { + continue; + } + else if (timeDiff < 0 && xpu::abs(timeDiff) > maxSigma) { + break; + } + + float timeCut = -1.f; + if (timeCutClusterAbs > 0.f) timeCut = timeCutClusterAbs; + else if (timeCutClusterSig > 0.f) { + float eF = clsDataF.fTimeError; + float eB = clsDataB.fTimeError; + timeCut = timeCutClusterSig * xpu::sqrt(eF * eF + eB * eB); + } + + if (xpu::abs(float(clsIdxF.fTime) - float(clsIdxB.fTime)) > timeCut) continue; + + IntersectClusters(iBlock, pars, clsIdxF.fTime, clsDataF, clsIdxB.fTime, clsDataB); + } + } +} + +XPU_D void CbmStsGpuHitFinder::IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF, + const CbmStsClusterData& clsF, CbmStsTimeNs timeB, + const CbmStsClusterData& 
clsB) const +{ + // --- Calculate cluster centre position on readout edge + + float xF = GetClusterPosition(pars, clsF.fPosition, true); + float exF = clsF.fPositionError * pars.pitch; + float du = exF * xpu::cos(xpu::deg_to_rad() * pars.stereoF); + float xB = GetClusterPosition(pars, clsB.fPosition, false); + float exB = clsB.fPositionError * pars.pitch; + float dv = exB * xpu::cos(xpu::deg_to_rad() * pars.stereoB); + + // // --- Should be inside active area + if (xF < 0.f || xF > pars.dX || xB < 0.f || xB > pars.dX) { return; } + + // // --- Calculate number of line segments due to horizontal + // // --- cross-connection. If x(y=0) does not fall on the bottom edge, + // // --- the strip is connected to the one corresponding to the line + // // --- with top edge coordinate xF' = xF +/- Dx. For odd combinations + // // --- of stereo angle and sensor dimensions, this could even happen + // // --- multiple times. For each of these lines, the intersection with + // // --- the line on the other side is calculated. If inside the active area, + // // --- a hit is created. + int nF = (xF + pars.dY * pars.tanStereoF) / pars.dX; + int nB = (xB + pars.dY * pars.tanStereoB) / pars.dX; + + // --- If n is positive, all lines from 0 to n must be considered, + // --- if it is negative (phi negative), all lines from n to 0. + int nF1 = xpu::min(0, nF); + int nF2 = xpu::max(0, nF); + int nB1 = xpu::min(0, nB); + int nB2 = xpu::max(0, nB); + + // --- Double loop over possible lines + float xC = -1.f; + float yC = -1.f; + float varX = 0.f; + float varY = 0.f; + float varXY = 0.f; + for (int iF = nF1; iF <= nF2; iF++) { + float xFi = xF - float(iF) * pars.dX; + for (int iB = nB1; iB <= nB2; iB++) { + float xBi = xB - float(iB) * pars.dX; + + // --- Intersect the two lines + bool found = Intersect(pars, xFi, exF, xBi, exB, xC, yC, varX, varY, varXY); + if (not found) continue; + + // --- Transform into sensor system with origin at sensor centre + + xC -= 0.5f * pars.dX; + yC -= 0.5f * pars.dY; + + CreateHit(iBlock, xC, yC, varX, varY, varXY, timeF, clsF, timeB, clsB, du, dv); + } + } +} + +XPU_D float CbmStsGpuHitFinder::GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const +{ + // Take integer channel + + int iChannel = int(centre); + float xDif = centre - float(iChannel); + + // Calculate corresponding strip on sensor + + int iStrip = iChannel - (isFront ? 0 : nChannels); + + // Re-add difference to integer channel. Convert channel to + // coordinate + float xCluster = (float(iStrip) + xDif + 0.5f) * pars.pitch; + + // // Correct for Lorentz-Shift + // // Simplification: The correction uses only the y component of the + // // magnetic field. The shift is calculated using the mid-plane of the + // // sensor, which is not correct for tracks not traversing the entire + // // sensor thickness (i.e., are created or stopped somewhere in the sensor). + // // However, this is the best one can do in reconstruction. + xCluster -= (isFront ? pars.lorentzF : pars.lorentzB); + + return xCluster; +} + +XPU_D bool CbmStsGpuHitFinder::Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB, + float& x, float& y, float& varX, float& varY, float& varXY) const +{ + + // In the coordinate system with origin at the bottom left corner, + // a line along the strips with coordinate x0 at the top edge is + // given by the function y(x) = Dy - ( x - x0 ) / tan(phi), if + // the stereo angle phi does not vanish. 
Two lines yF(x), yB(x) with top + // edge coordinates xF, xB intersect at + // x = ( tan(phiB)*xF - tan(phiF)*xB ) / (tan(phiB) - tan(phiF) + // y = Dy + ( xB - xF ) / ( tan(phiB) - tan(phiF) ) + // For the case that one of the stereo angles vanish (vertical strips), + // the calculation of the intersection is straightforward. + + // --- First check whether stereo angles are different. Else there is + // --- no intersection. + // TODO: if this is true, no hits can be found? So just do this once at the beginning? + if (xpu::abs(pars.stereoF - pars.stereoB) < 0.5f) return false; + + // --- Now treat vertical front strips + if (xpu::abs(pars.stereoF) < 0.001f) { + x = xF; + y = pars.dY - (xF - xB) / pars.tanStereoB; + varX = exF * exF; + varY = (exF * exF + exB * exB) / pars.tanStereoB / pars.tanStereoB; + varXY = -1.f * exF * exF / pars.tanStereoB; + return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f); + } + + // --- Maybe the back side has vertical strips? + if (xpu::abs(pars.stereoB) < 0.001f) { + x = xB; + y = pars.dY - (xB - xF) / pars.tanStereoF; + varX = exB * exB; + varY = (exF * exF + exB * exB) / pars.tanStereoF / pars.tanStereoF; + varXY = -1.f * exB * exB / pars.tanStereoF; + return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f); + } + + // --- OK, both sides have stereo angle + + x = (pars.tanStereoB * xF - pars.tanStereoF * xB) / (pars.tanStereoB - pars.tanStereoF); + y = pars.dY + (xB - xF) / (pars.tanStereoB - pars.tanStereoF); + varX = + pars.errorFac * (exF * exF * pars.tanStereoB * pars.tanStereoB + exB * exB * pars.tanStereoF * pars.tanStereoF); + varY = pars.errorFac * (exF * exF + exB * exB); + varXY = -1.f * pars.errorFac * (exF * exF * pars.tanStereoB + exB * exB * pars.tanStereoF); + + return IsInside(pars, x - pars.dX / 2.f, y - pars.dY / 2.f); +} + +XPU_D bool CbmStsGpuHitFinder::IsInside(const CbmStsHitFinderParams& pars, float x, float y) const +{ + // clang-format off + return x >= -pars.dX / 2.f + && x <= pars.dX / 2.f + && y >= -pars.dY / 2.f + && y <= pars.dY / 2.f; + // clang-format on +} + +XPU_D void CbmStsGpuHitFinder::CreateHit(int iModule, float xLocal, float yLocal, float varX, float varY, float varXY, + CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB, + const CbmStsClusterData& clsB, float du, float dv) const +{ + // --- Transform into global coordinate system + float globalX, globalY, globalZ; + ToGlobal(iModule, xLocal, yLocal, 0.f, globalX, globalY, globalZ); + + // We assume here that the local-to-global transformations is only translation + // plus maybe rotation upside down or front-side back. In that case, the + // global covariance matrix is the same as the local one. 
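+  // Note: fDx/fDy/fDz of the hit are standard deviations (square roots of the variances),
+  // while fDxy keeps the raw x-y covariance varXY.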
+ float errX = xpu::sqrt(varX); + float errY = xpu::sqrt(varY); + float errZ = 0.f; + + // --- Calculate hit time (average of cluster times) + float hitTime = 0.5f * (float(timeF) + float(timeB)); + float etF = clsF.fTimeError; + float etB = clsB.fTimeError; + float hitTimeError = 0.5f * xpu::sqrt(etF * etF + etB * etB); + + CbmStsHit hit { + .fX = globalX, + .fY = globalY, + .fZ = globalZ, + .fTime = static_cast<CbmStsTimeNs>(hitTime), + .fDx = errX, + .fDy = errY, + .fDz = errZ, + .fDxy = varXY, + .fTimeError = hitTimeError, + .fDu = du, + .fDv = dv, + }; + + int idx = xpu::atomic_add_block(&nHitsPerModule[iModule], 1); + + assert(size_t(idx) < maxHitsPerModule); + + hitsPerModule[iModule * maxHitsPerModule + idx] = hit; +} + + +XPU_D XErrorT CbmStsGpuHitFinder::LandauWidth(float charge) const +{ + if (charge <= landauStepSize) return landauTable[0]; + if (charge > landauStepSize * (landauTableSize - 1)) return landauTable[landauTableSize - 1]; + + int tableIdx = xpu::ceil(charge / landauStepSize); + XErrorT e2 = tableIdx * landauStepSize; + XErrorT v2 = landauTable[tableIdx]; + tableIdx--; + XErrorT e1 = tableIdx * landauStepSize; + XErrorT v1 = landauTable[tableIdx]; + return v1 + (charge - e1) * (v2 - v1) / (e2 - e1); +} + +XPU_D void CbmStsGpuHitFinder::ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, + float& gz) const +{ + const float* tr = &localToGlobalTranslationByModule[iModule * 3]; + const float* rot = &localToGlobalRotationByModule[iModule * 9]; + + gx = tr[0] + lx * rot[0] + ly * rot[1] + lz * rot[2]; + gy = tr[1] + lx * rot[3] + ly * rot[4] + lz * rot[5]; + gz = tr[2] + lx * rot[6] + ly * rot[7] + lz * rot[8]; +} diff --git a/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h new file mode 100644 index 0000000000..5708982fab --- /dev/null +++ b/reco/detectors/sts/experimental/CbmStsGpuHitFinder.h @@ -0,0 +1,309 @@ +/* Copyright (C) 2021-2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ +#ifndef CBMSTSGPUHITFINDER_H +#define CBMSTSGPUHITFINDER_H + +#include "CbmStsDigi.h" +#include "CbmStsGpuRecoDevice.h" + +#include <xpu/device.h> + +// experimental namespace to avoid name collisions with original data types +namespace experimental +{ + using CbmStsTimeNs = uint32_t; + + using XErrorT = float; // FIXME: remove and replace by float + + struct CbmStsClusterIdx { + CbmStsTimeNs fTime; ///< Cluster time (average of digi times) [ns] + int fIdx; + }; + + struct CbmStsClusterData { + float fCharge; ///< Total charge + int fSize; ///< Difference between first and last channel + float fPosition; ///< Cluster centre in channel number units + XErrorT fPositionError; ///< Cluster centre error (r.m.s.) 
in channel number units + float fTimeError; ///< Error of cluster time [ns] + }; + + struct CbmStsHit { + float fX, fY; ///< X, Y positions of hit [cm] + float fZ; ///< Z position of hit [cm] + CbmStsTimeNs fTime; ///< Hit time [ns] + float fDx, fDy; ///< X, Y errors [cm] + float fDz; ///< Z position error [cm] + float fDxy; ///< XY correlation + float fTimeError; ///< Error of hit time [ns] + float fDu; ///< Error of coordinate across front-side strips [cm] + float fDv; ///< Error of coordinate across back-side strips [cm] + }; + + struct CbmStsParAsic { + int nAdc; + float dynRange; + float threshold; + float timeResolution; + float deadTime; + float noise; + float zeroNoiseRate; + + XPU_D float AdcToCharge(unsigned short adc) const + { + return threshold + dynRange / float(nAdc) * (float(adc) + 0.5f); + } + }; + + struct CbmStsHitFinderConfig { + float dY; + float pitch; + float stereoF; + float stereoB; + float lorentzF; + float lorentzB; + }; + + // cache of various parameters used by the hit finder + struct CbmStsHitFinderParams : public CbmStsHitFinderConfig { + float tanStereoF; + float tanStereoB; + float errorFac; + float dX; + }; + + using SortDigisT = xpu::block_sort<unsigned long int, ::CbmStsDigi, xpu::block_size<SortDigis> {}, + CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD>; + + struct CbmStsSortDigiSmem { + typename SortDigisT::storage_t sortBuf; + }; + + using SortClustersT = xpu::block_sort<CbmStsTimeNs, CbmStsClusterIdx, xpu::block_size<SortClusters> {}, + CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD>; + + struct CbmStsSortClusterSmem { + typename SortClustersT::storage_t sortBuf; + }; + + struct CbmStsClusterCalculationProperties { + float tSum = 0.; // sum of digi times + int chanF = 9999999; // first channel in cluster + int chanL = -1; // last channel in cluster + float qF = 0.f; // charge in first channel + float qM = 0.f; // sum of charges in middle channels + float qL = 0.f; // charge in last cluster + float eqFsq = 0.f; // uncertainty of qF + float eqMsq = 0.f; // uncertainty of qMid + float eqLsq = 0.f; // uncertainty of qL + float tResolSum = 0.f; + + float xSum = 0.f; // weighted charge sum, used to correct corrupt clusters + }; + + class CbmStsDigiConnector { + private: + unsigned int hasPreviousAndNext; + + XPU_D unsigned int UnpackNext(unsigned int val) const { return val & ~(1u << 31); } + + public: + XPU_D bool HasPrevious() const { return (hasPreviousAndNext >> 31) & 1u; } + + XPU_D void SetHasPrevious(bool hasPrevious) + { + unsigned int old = hasPreviousAndNext; + unsigned int hasPreviousI = ((unsigned int) hasPrevious) << 31; + unsigned int assumed; + + do { + assumed = old; + old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, UnpackNext(assumed) | hasPreviousI); + } while (old != assumed); + } + + XPU_D unsigned int next() const { return UnpackNext(hasPreviousAndNext); } + + XPU_D void SetNext(unsigned int next) + { + unsigned int old = hasPreviousAndNext; + unsigned int assumed; + + next = xpu::min((1u << 31) - 1, next); + + do { + assumed = old; + old = xpu::atomic_cas_block(&hasPreviousAndNext, assumed, (assumed & (1u << 31)) | next); + } while (old != assumed); + } + + XPU_D bool HasNext() const { return UnpackNext(hasPreviousAndNext) != 0; } + }; + + template<xpu::side S> + class CbmStsHitFinder { + + public: + // Constants + int nModules; + int nChannels; + + // calibration data + float timeCutDigiAbs; + float timeCutDigiSig; + float timeCutClusterAbs; + float timeCutClusterSig; + CbmStsParAsic asic; + int landauTableSize; + float landauStepSize; + 
xpu::cmem_io_t<float, S> landauTable; + + xpu::cmem_device_t<float, S> maxErrorFront; + xpu::cmem_device_t<float, S> maxErrorBack; + + // TODO fill this array + xpu::cmem_io_t<CbmStsHitFinderConfig, S> hitfinderConfig; + + // input + // Store all digis in a flat array with a header that contains the offset for every module (front and back) + xpu::cmem_io_t<size_t, S> digiOffsetPerModule; // 2 * nModules + 1 entries, last entry contains total digi count + xpu::cmem_io_t<CbmStsDigi, S> digisPerModule; // TODO this is a flat array now, should be renamed + // Two digi arrays needed, as sorting can't sort in place right now + xpu::cmem_device_t<CbmStsDigi, S> digisPerModuleTmp; + xpu::cmem_device_t<CbmStsDigi*, S> digisSortedPerModule; + + xpu::cmem_io_t<float, S> localToGlobalTranslationByModule; + xpu::cmem_io_t<float, S> localToGlobalRotationByModule; + + xpu::cmem_device_t<CbmStsDigiConnector, S> digiConnectorPerModule; + xpu::cmem_device_t<unsigned int, S> channelOffsetPerModule; + + // intermediate results + size_t maxClustersPerModule; + xpu::cmem_device_t<int, S> channelStatusPerModule; + xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModule; + xpu::cmem_io_t<CbmStsClusterIdx, S> clusterIdxPerModuleTmp; + xpu::cmem_io_t<CbmStsClusterIdx*, S> clusterIdxSortedPerModule; + xpu::cmem_io_t<CbmStsClusterData, S> clusterDataPerModule; + xpu::cmem_io_t<int, S> nClustersPerModule; + + // output + size_t maxHitsPerModule; + xpu::cmem_io_t<CbmStsHit, S> hitsPerModule; + xpu::cmem_io_t<int, S> nHitsPerModule; + }; + + class CbmStsGpuHitFinder : public CbmStsHitFinder<xpu::side::device> { + + public: + XPU_D void SortDigisInSpaceAndTime(CbmStsSortDigiSmem& smem, int iBlock) const; + XPU_D void FindClustersSingleStep(int iBlock) const; + XPU_D void SortClusters(CbmStsSortClusterSmem& smem, int iBlock) const; + XPU_D void FindHits(int iBlock) const; + + // Individual steps of cluster finder, for more granular time measurement + XPU_D void CalculateOffsetsParallel(int iBlock) const; + XPU_D void FindClustersParallel(int iBlock) const; + XPU_D void CalculateClustersParallel(int iBlock) const; + + XPU_D void FindClusterConnectionsBasic(int iBlock) const; + XPU_D void CalculateClustersBasic(int iBlock) const; + + private: + XPU_D void CalculateChannelOffsets(CbmStsDigi* digis, unsigned int* channelOffsets, unsigned int nDigis) const; + + XPU_D void FindClusterConnectionsChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, + unsigned int* channelOffsets, int iModule, + unsigned int threadId) const; + XPU_D void FindClusterConnectionsDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, + unsigned int* channelOffsets, int iModule, unsigned int threadId, + unsigned int nDigis) const; + + XPU_D void CalculateClustersBasic(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule, + unsigned int const threadId) const; + XPU_D void CalculateClustersChannelWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, + unsigned int* channelOffsets, int iModule, unsigned int const threadId, + unsigned int const nDigis) const; + XPU_D void CalculateClustersDigiWise(CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, int iModule, + unsigned int const threadId, unsigned int const nDigis) const; + + XPU_D void CreateClusterFromConnectors1(int const iModule, CbmStsDigi* digis, int const digiIndex) const; + XPU_D void CreateClusterFromConnectors2(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, + int const digiIndex) const; + XPU_D void 
CreateClusterFromConnectorsN(int const iModule, CbmStsDigi* digis, CbmStsDigiConnector* digiConnector, + int const digiIndex) const; + + private: + XPU_D unsigned int GetNDigis(int iModule) const + { + return digiOffsetPerModule[iModule + 1] - digiOffsetPerModule[iModule]; + } + + XPU_D float GetTimeResolution(int /* iModule */, int /* channel */) const { return asic.timeResolution; } + + XPU_D bool IsActive(int* channelStatus, int channel) const + { + // TODO add padding to channels to remove this? + if (channel < 0 || channel >= nChannels) { return false; } + return channelStatus[channel] > -1; + } + + XPU_D int ChanLeft(int channel) const { return channel - 1; } + + XPU_D int ChanRight(int channel) const { return channel + 1; } + + XPU_D int ChanDist(int c1, int c2) const + { + int diff = c2 - c1; + // TODO handle wrap around + return diff; + } + + XPU_D void AddCluster(int iModule, CbmStsTimeNs time, const CbmStsClusterData& cls) const + { + CbmStsClusterIdx* tgtIdx = &clusterIdxPerModule[iModule * maxClustersPerModule]; + CbmStsClusterData* tgtData = &clusterDataPerModule[iModule * maxClustersPerModule]; + + int pos = xpu::atomic_add_block(&nClustersPerModule[iModule], 1); + XPU_ASSERT(size_t(pos) < maxClustersPerModule); + + CbmStsClusterIdx idx {time, pos}; + tgtIdx[idx.fIdx] = idx; + tgtData[idx.fIdx] = cls; + } + + XPU_D bool IsBackside(int iModule) const { return iModule >= nModules; } + + XPU_D XErrorT LandauWidth(float charge) const; + + XPU_D void ToGlobal(int iModule, float lx, float ly, float lz, float& gx, float& gy, float& gz) const; + + XPU_D void IntersectClusters(int iBlock, const CbmStsHitFinderParams& pars, CbmStsTimeNs timeF, + const CbmStsClusterData& clsF, CbmStsTimeNs timeB, + const CbmStsClusterData& clsB) const; + XPU_D float GetClusterPosition(const CbmStsHitFinderParams& pars, float centre, bool isFront) const; + XPU_D bool Intersect(const CbmStsHitFinderParams& pars, float xF, float exF, float xB, float exB, float& x, + float& y, float& varX, float& varY, float& varXY) const; + XPU_D bool IsInside(const CbmStsHitFinderParams& pars, float x, float y) const; + XPU_D void CreateHit(int iBlocks, float xLocal, float yLocal, float varX, float varY, float varXY, + CbmStsTimeNs timeF, const CbmStsClusterData& clsF, CbmStsTimeNs timeB, + const CbmStsClusterData& clsB, float du, float dv) const; + + XPU_D void SaveMaxError(float errorValue, int iModule) const + { + + float* maxError = IsBackside(iModule) ? 
maxErrorBack : maxErrorFront; + float old {}; + do { + old = *maxError; + if (old >= errorValue) { break; } + } while (!xpu::atomic_cas_block(maxError, *maxError, xpu::max(errorValue, *maxError))); + } + }; + +} // namespace experimental + +XPU_EXPORT_CONSTANT(CbmStsGpuRecoDevice, ::experimental::CbmStsGpuHitFinder, HitFinder); + +#endif diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx new file mode 100644 index 0000000000..c8f1b62a75 --- /dev/null +++ b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.cxx @@ -0,0 +1,30 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ +#include "CbmStsGpuRecoDevice.h" + +#include "CbmStsGpuHitFinder.h" + +using namespace experimental; + +XPU_IMAGE(CbmStsGpuRecoDevice); + +XPU_CONSTANT(HitFinder); + +XPU_KERNEL(SortDigis, CbmStsSortDigiSmem) { xpu::cmem<HitFinder>().SortDigisInSpaceAndTime(smem, xpu::block_idx::x()); } + +XPU_KERNEL(FindClustersSingleStep, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersSingleStep(xpu::block_idx::x()); } + +XPU_KERNEL(CalculateOffsets, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateOffsetsParallel(xpu::block_idx::x()); } + +XPU_KERNEL(FindClusters, xpu::no_smem) { xpu::cmem<HitFinder>().FindClustersParallel(xpu::block_idx::x()); } + +XPU_KERNEL(FindClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().FindClusterConnectionsBasic(xpu::block_idx::x()); } + +XPU_KERNEL(CalculateClusters, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersParallel(xpu::block_idx::x()); } + +XPU_KERNEL(CalculateClustersBasic, xpu::no_smem) { xpu::cmem<HitFinder>().CalculateClustersBasic(xpu::block_idx::x()); } + +XPU_KERNEL(SortClusters, CbmStsSortClusterSmem) { xpu::cmem<HitFinder>().SortClusters(smem, xpu::block_idx::x()); } + +XPU_KERNEL(FindHits, xpu::no_smem) { xpu::cmem<HitFinder>().FindHits(xpu::block_idx::x()); } diff --git a/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h new file mode 100644 index 0000000000..1d5f9020e5 --- /dev/null +++ b/reco/detectors/sts/experimental/CbmStsGpuRecoDevice.h @@ -0,0 +1,55 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer], Kilian Hunold */ +#ifndef CBMSTSGPURECODEVICE_H +#define CBMSTSGPURECODEVICE_H 1 + +#include <xpu/device.h> + +// TODO: Tune these values for various gpus +#if XPU_IS_CUDA +#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 32 +#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 32 +#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 32 +#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 32 +#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 32 +#define CBM_STS_FIND_HITS_BLOCK_SIZE 32 +#else // HIP, values ignored on CPU +#define CBM_STS_SORT_DIGIS_BLOCK_SIZE 256 +#define CBM_STS_SORT_DIGIS_ITEMS_PER_THREAD 6 +#define CBM_STS_SORT_CLUSTERS_BLOCK_SIZE 256 +#define CBM_STS_SORT_CLUSTERS_ITEMS_PER_THREAD 6 +#define CBM_STS_FIND_CLUSTERS_BLOCK_SIZE 256 +#define CBM_STS_FIND_HITS_BLOCK_SIZE 256 +#endif + +struct CbmStsGpuRecoDevice { +}; + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortDigis); +XPU_BLOCK_SIZE(SortDigis, CBM_STS_SORT_DIGIS_BLOCK_SIZE); + +// Combine substeps for finding clusters into a single kernel +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersSingleStep); +XPU_BLOCK_SIZE(FindClustersSingleStep, 
CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateOffsets); +XPU_BLOCK_SIZE(CalculateOffsets, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClusters); +XPU_BLOCK_SIZE(FindClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindClustersBasic); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClusters); +XPU_BLOCK_SIZE(CalculateClusters, CBM_STS_FIND_CLUSTERS_BLOCK_SIZE); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, CalculateClustersBasic); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, SortClusters); +XPU_BLOCK_SIZE(SortClusters, CBM_STS_SORT_CLUSTERS_BLOCK_SIZE); + +XPU_EXPORT_KERNEL(CbmStsGpuRecoDevice, FindHits); +XPU_BLOCK_SIZE(FindHits, CBM_STS_FIND_HITS_BLOCK_SIZE); + +#endif diff --git a/reco/mq/CMakeLists.txt b/reco/mq/CMakeLists.txt index 035786646b..1671e330ee 100644 --- a/reco/mq/CMakeLists.txt +++ b/reco/mq/CMakeLists.txt @@ -29,15 +29,12 @@ set(INCLUDE_DIRECTORIES ) Set(SYSTEM_INCLUDE_DIRECTORIES - ${SYSTEM_INCLUDE_DIRECTORIES} + ${BASE_INCLUDE_DIRECTORIES} ${ZeroMQ_INCLUDE_DIR} ${Boost_INCLUDE_DIR} ${FAIRROOT_INCLUDE_DIR} ${FAIRMQ_INCLUDE_DIR} ${FAIRMQ_INCLUDE_DIR}/options - ${VMC_INCLUDE_DIRS} - ${FAIRLOGGER_INCLUDE_DIR} - ${IPC_INCLUDE_DIRECTORY} ${CBMROOT_SOURCE_DIR}/external/cppzmq ) -- GitLab