CbmGpuRecoSts.cxx

/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
   SPDX-License-Identifier: GPL-3.0-only
   Authors: Felix Weiglhofer [committer], Kilian Hunold */
#include "CbmGpuRecoSts.h"

#include "CbmAddress.h"
#include "CbmDigiManager.h"
#include "CbmStsAddress.h"
#include "CbmStsCluster.h"
#include "CbmStsGpuRecoDevice.h"
#include "CbmStsHit.h"
#include "CbmStsPhysics.h"
#include "CbmStsRecoModule.h"

#include <TCanvas.h>
#include <TClonesArray.h>
#include <TGeoMatrix.h>
#include <TH1D.h>
#include <TH2D.h>
#include <TStopwatch.h>

#include <algorithm>
#include <cassert>
#include <unordered_set>
#include <vector>

using namespace experimental;

void CbmGpuRecoSts::SetupConstants()
{
  assert(fConfig != std::nullopt);
  assert(fConfig->recoModules.size() == fConfig->parModules.size());
  assert(fConfig->moduleAddrs.size() == fConfig->parModules.size());
  assert(fConfig->hitfinderCfg.size() == fConfig->parModules.size());

  size_t nModules = fConfig->parModules.size();

  auto& hfc = fHitfinderCpu;
  auto& hfg = fHitfinderGpu;

  LOG(info) << "STS GPU RECO: setting constants";

  size_t nChannels     = fConfig->parModules[0].GetNofChannels();
  ::CbmStsParAsic asic = fConfig->parModules[0].GetParAsic(0);  // TODO values are currently identical for all asics??
  // for (auto &mod : modules) {
  //     assert(nChannels == mod.GetNofChannels());
  //     for (size_t c = 0; c < nChannels; c++) {
  //         ::CbmStsParAsic otherAsic = mod.GetParAsic(c);
  //         assert(asic.GetNofAdc() == otherAsic.GetNofAdc());
  //         assert(asic.GetDynRange() == otherAsic.GetDynRange());
  //         assert(asic.GetThreshold() == otherAsic.GetThreshold());
  //         assert(asic.GetTimeResol() == otherAsic.GetTimeResol());
  //         assert(asic.GetDeadTime() == otherAsic.GetDeadTime());
  //         assert(asic.GetNoise() == otherAsic.GetNoise());
  //         assert(asic.GetZeroNoiseRate() == otherAsic.GetZeroNoiseRate());
  //     }
  // }

  nChannels /= 2;
  hfc.nModules = hfg.nModules = nModules;
  hfc.nChannels = hfg.nChannels = nChannels;

  hfc.timeCutDigiAbs = hfg.timeCutDigiAbs = -1.f;
  hfc.timeCutDigiSig = hfg.timeCutDigiSig = 3.f;
  hfc.timeCutClusterAbs = hfg.timeCutClusterAbs = -1.f;
  hfc.timeCutClusterSig = hfg.timeCutClusterSig = 4.f;

  hfc.asic.nAdc           = asic.GetNofAdc();
  hfc.asic.dynRange       = asic.GetDynRange();
  hfc.asic.threshold      = asic.GetThreshold();
  hfc.asic.timeResolution = asic.GetTimeResol();
  hfc.asic.deadTime       = asic.GetDeadTime();
  hfc.asic.noise          = asic.GetNoise();
  hfc.asic.zeroNoiseRate  = asic.GetZeroNoiseRate();
  hfg.asic                = hfc.asic;
}

void CbmGpuRecoSts::Setup(size_t maxDigisPerModule, size_t nDigitsTotal)
{
  assert(fConfig != std::nullopt);

  // static constexpr size_t maxDigisPerModule    = 60000;
  // static constexpr size_t maxClustersPerModule = 30000;
  // static constexpr size_t maxHitsPerModule     = 120000;

  const size_t maxClustersPerModule = maxDigisPerModule * 0.5;
  const size_t maxHitsPerModule     = maxClustersPerModule * 4;

  size_t nModules  = fConfig->parModules.size();
  size_t nChannels = fConfig->parModules[0].GetNofChannels() / 2;


  size_t nModulesTotal = nModules * 2;  // Front- and backside

  auto& hfc = fHitfinderCpu;
  auto& hfg = fHitfinderGpu;

  auto landauTable    = fConfig->physics->GetLandauWidthTable();
  hfc.landauTableSize = hfg.landauTableSize = landauTable.size();
  hfc.landauTable                           = xpu::hd_buffer<float> {size_t(hfc.landauTableSize)};
  hfg.landauTable                           = hfc.landauTable.d();
  auto landauEntry                          = landauTable.begin();
  auto prevLandauEntry                      = landauEntry;
  hfc.landauTable.h()[0]                    = landauEntry->second;
  landauEntry++;
  hfc.landauStepSize = hfg.landauStepSize = landauEntry->first - prevLandauEntry->first;
  for (size_t i = 1; landauEntry != landauTable.end(); i++, landauEntry++) {
    assert(hfc.landauStepSize == landauEntry->first - prevLandauEntry->first);
    assert(i < hfc.landauTable.size());
    hfc.landauTable.h()[i] = landauEntry->second;
    prevLandauEntry        = landauEntry;
  }

  LOG(info) << "STS GPU RECO: finished landau";

  hfc.localToGlobalTranslationByModule = xpu::hd_buffer<float> {nModules * 3};
  hfg.localToGlobalTranslationByModule = hfc.localToGlobalTranslationByModule.d();
  hfc.localToGlobalRotationByModule    = xpu::hd_buffer<float> {nModules * 9};
  hfg.localToGlobalRotationByModule    = hfc.localToGlobalRotationByModule.d();
  for (size_t m = 0; m < fConfig->recoModules.size(); m++) {
    auto& recoMod       = fConfig->recoModules[m];
    TGeoHMatrix* matrix = recoMod->getMatrix();

    const double* rot = matrix->GetRotationMatrix();
    float* tgt        = &hfc.localToGlobalRotationByModule.h()[m * 9];
    std::copy_n(rot, 9, tgt);

    const double* tr = matrix->GetTranslation();
    tgt              = &hfc.localToGlobalTranslationByModule.h()[m * 3];
    std::copy_n(tr, 3, tgt);
  }

  LOG(info) << "STS GPU RECO: finished transformation matrix";

  hfc.hitfinderConfig = xpu::hd_buffer<CbmStsHitFinderConfig>(nModules);
  hfg.hitfinderConfig = hfc.hitfinderConfig.d();

  std::copy_n(fConfig->hitfinderCfg.begin(), nModules, hfc.hitfinderConfig.h());

  LOG(info) << "STS GPU RECO: finished setting hitfinder config";

  // FIXME: allocate by total number of digis instead. This array is flat.
  hfc.digiOffsetPerModule    = xpu::hd_buffer<size_t> {nModulesTotal + 1};
  hfg.digiOffsetPerModule    = hfc.digiOffsetPerModule.d();
  hfc.digisPerModule         = xpu::hd_buffer<CbmStsDigi> {nDigitsTotal};
  hfg.digisPerModule         = hfc.digisPerModule.d();
  hfc.digisPerModuleTmp      = xpu::d_buffer<CbmStsDigi> {nDigitsTotal};
  hfg.digisPerModuleTmp      = hfc.digisPerModuleTmp.d();
  hfc.digisSortedPerModule   = xpu::d_buffer<CbmStsDigi*> {nModulesTotal};
  hfg.digisSortedPerModule   = hfc.digisSortedPerModule.d();
  hfc.digiConnectorPerModule = xpu::d_buffer<CbmStsDigiConnector> {nDigitsTotal};
  hfg.digiConnectorPerModule = hfc.digiConnectorPerModule.d();

  hfc.channelOffsetPerModule = xpu::d_buffer<unsigned int> {nModulesTotal * nChannels};
  hfg.channelOffsetPerModule = hfc.channelOffsetPerModule.d();

  hfc.maxErrorFront = xpu::d_buffer<float> {1};
  hfg.maxErrorFront = hfc.maxErrorFront.d();
  hfc.maxErrorBack  = xpu::d_buffer<float> {1};
  hfg.maxErrorBack  = hfc.maxErrorBack.d();

  LOG(info) << "STS GPU RECO: finished digis";

  hfc.maxClustersPerModule = hfg.maxClustersPerModule = maxClustersPerModule;
  hfc.channelStatusPerModule                          = xpu::d_buffer<int> {nChannels * nModulesTotal};
  hfg.channelStatusPerModule                          = hfc.channelStatusPerModule.d();
  hfc.clusterIdxPerModule       = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal};
  hfg.clusterIdxPerModule       = hfc.clusterIdxPerModule.d();
  hfc.clusterIdxPerModuleTmp    = xpu::hd_buffer<CbmStsClusterIdx> {maxClustersPerModule * nModulesTotal};
  hfg.clusterIdxPerModuleTmp    = hfc.clusterIdxPerModuleTmp.d();
  hfc.clusterIdxSortedPerModule = xpu::hd_buffer<CbmStsClusterIdx*> {nModulesTotal};
  hfg.clusterIdxSortedPerModule = hfc.clusterIdxSortedPerModule.d();
  hfc.clusterDataPerModule      = xpu::hd_buffer<CbmStsClusterData> {maxClustersPerModule * nModulesTotal};
  hfg.clusterDataPerModule      = hfc.clusterDataPerModule.d();
  hfc.nClustersPerModule        = xpu::hd_buffer<int> {nModulesTotal};
  hfg.nClustersPerModule        = hfc.nClustersPerModule.d();

  LOG(info) << "STS GPU RECO: finished cluster";

  hfc.maxHitsPerModule = hfg.maxHitsPerModule = maxHitsPerModule;
  hfc.hitsPerModule                           = xpu::hd_buffer<CbmStsHit>(maxHitsPerModule * nModules);
  hfg.hitsPerModule                           = hfc.hitsPerModule.d();
  hfc.nHitsPerModule                          = xpu::hd_buffer<int>(nModules);
  hfg.nHitsPerModule                          = hfc.nHitsPerModule.d();

  LOG(info) << "STS GPU RECO: finished setup";
  LOG(info) << "- nModules = " << nModules;
  LOG(info) << "- nChannels = " << nChannels;
}

void CbmGpuRecoSts::RunHitFinder()
{
  auto& hfc = fHitfinderCpu;

  // ROCm bug: Mi100 name is an emtpy string...
  auto deviceName = (xpu::device_properties().name.empty()) ? "AMD Mi100" : xpu::device_properties().name;

  LOG(info) << "STS GPU RECO: running GPU hit finder on device " << deviceName;
  LOG(info) << "STS GPU RECO: Allocate and fetch digis";


  // FIXME: This is ugly, setup has to be split in two parts, constants required by FetchDigis
  // And FetchDigis required by Setup...
  SetupConstants();
  size_t maxDigisPerModule = 0;
  size_t nDigisTotal       = 0;
  FetchDigis(maxDigisPerModule, nDigisTotal);
  Setup(maxDigisPerModule, nDigisTotal);

  LOG(info) << "STS GPU RECO: finished digis / running clusterizer now";
  LOG(info) << "STS GPU RECO: largest module has " << maxDigisPerModule << " digis";

  // Not all buffers have to initialized, but useful for debugging
  // xpu::memset(hfc.digisPerModule, 0);
  // xpu::memset(hfc.digisPerModuleTmp, 0);
  // xpu::memset(hfc.digisSortedPerModule, 0);
  // xpu::memset(hfc.digiOffsetPerModule, 0);
  xpu::memset(hfc.digiConnectorPerModule, 0);
  // xpu::memset(hfc.channelOffsetPerModule, 0);
  xpu::memset(hfc.channelStatusPerModule, -1);
  xpu::memset(hfc.clusterIdxPerModule, 0);
  // xpu::memset(hfc.clusterIdxPerModuleTmp, 0);
  // xpu::memset(hfc.clusterIdxSortedPerModule, 0);
  // xpu::memset(hfc.clusterDataPerModule, 0);
  xpu::memset(hfc.nClustersPerModule, 0);
  // xpu::memset(hfc.hitsPerModule, 0);
  xpu::memset(hfc.nHitsPerModule, 0);
  xpu::memset(hfc.maxErrorFront, 0);
  xpu::memset(hfc.maxErrorBack, 0);

  hfc.digiOffsetPerModule.h()[0] = 0;
  for (int m = 0; m < hfc.nModules; m++) {
    size_t offset = hfc.digiOffsetPerModule.h()[m];
    auto& md      = fDigisByModuleF[fConfig->moduleAddrs[m]];
    std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]);
    hfc.digiOffsetPerModule.h()[m + 1] = offset + md.size();
  }
  for (int m = 0; m < hfc.nModules; m++) {
    size_t offset = hfc.digiOffsetPerModule.h()[m + hfc.nModules];
    auto& md      = fDigisByModuleB[fConfig->moduleAddrs[m]];
    std::copy(md.begin(), md.end(), &hfc.digisPerModule.h()[offset]);
    hfc.digiOffsetPerModule.h()[m + hfc.nModules + 1] = offset + md.size();
  }
  assert(nDigisTotal == hfc.digiOffsetPerModule.h()[2 * hfc.nModules]);

  TStopwatch timer;

  // constants only need to be transferred once. should be excluded from time measurement
  xpu::copy(hfc.landauTable, xpu::host_to_device);
  xpu::copy(hfc.hitfinderConfig, xpu::host_to_device);
  xpu::copy(hfc.localToGlobalRotationByModule, xpu::host_to_device);
  xpu::copy(hfc.localToGlobalTranslationByModule, xpu::host_to_device);

  xpu::set_constant<HitFinder>(fHitfinderGpu);
  xpu::copy(hfc.digisPerModule, xpu::host_to_device);
  xpu::copy(hfc.digiOffsetPerModule, xpu::host_to_device);

  xpu::run_kernel<SortDigis>(xpu::grid::n_blocks(hfc.nModules * 2));
  xpu::run_kernel<FindClustersSingleStep>(xpu::grid::n_blocks(hfc.nModules * 2));
  // Run cluster finding steps in indivual kernels, useful for debugging / profiling
  // xpu::run_kernel<CalculateOffsets>(xpu::grid::n_blocks(hfc.nModules * 2));
  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
  // xpu::run_kernel<CalculateClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
  // xpu::run_kernel<FindClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
  // xpu::run_kernel<CalculateClustersBasic>(xpu::grid::n_blocks(hfc.nModules * 2));
  xpu::run_kernel<SortClusters>(xpu::grid::n_blocks(hfc.nModules * 2));
  xpu::run_kernel<FindHits>(xpu::grid::n_blocks(hfc.nModules));

  // Check if profiling is enabled
  if (not xpu::get_timing<SortDigis>().empty()) {
    timeSortDigi    = xpu::get_timing<SortDigis>().back();
    timeCluster     = xpu::get_timing<FindClustersSingleStep>().back();
    timeSortCluster = xpu::get_timing<SortClusters>().back();
    timeHits        = xpu::get_timing<FindHits>().back();
  }

  xpu::copy(hfc.clusterDataPerModule, xpu::device_to_host);
  xpu::copy(hfc.nClustersPerModule, xpu::device_to_host);
  xpu::copy(hfc.hitsPerModule, xpu::device_to_host);
  xpu::copy(hfc.nHitsPerModule, xpu::device_to_host);


  size_t mostCluster = 0, mostHits = 0;
  for (int m = 0; m < hfc.nModules; m++) {
    mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m], mostCluster);
    mostCluster = std::max<size_t>(hfc.nClustersPerModule.h()[m + hfc.nModules], mostCluster);
    mostHits    = std::max<size_t>(hfc.nHitsPerModule.h()[m], mostHits);

    nCluster += hfc.nClustersPerModule.h()[m];
    nCluster += hfc.nClustersPerModule.h()[m + hfc.nModules];

    nHits += hfc.nHitsPerModule.h()[m];
  }

  LOG(info) << "STS GPU RECO: finished hitfinder";
}

void CbmGpuRecoSts::ForwardClustersAndHits(TClonesArray* clusters, TClonesArray* hits)
{
  auto& hfc = fHitfinderCpu;

  for (int module = 0; module < hfc.nModules; module++) {

    auto* gpuClusterF    = &hfc.clusterDataPerModule.h()[module * hfc.maxClustersPerModule];
    auto* gpuClusterIdxF = &hfc.clusterIdxPerModule.h()[module * hfc.maxClustersPerModule];
    int nClustersFGpu    = hfc.nClustersPerModule.h()[module];

    for (int i = 0; i < nClustersFGpu; i++) {
      auto& cidx       = gpuClusterIdxF[i];
      auto& clusterGpu = gpuClusterF[cidx.fIdx];

      unsigned int outIdx = clusters->GetEntriesFast();
      auto* clusterOut    = new ((*clusters)[outIdx])::CbmStsCluster {};

      clusterOut->SetAddress(fConfig->moduleAddrs[module]);
      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
                                clusterGpu.fTimeError);
      clusterOut->SetSize(clusterGpu.fSize);
    }

    auto* gpuClusterB    = &hfc.clusterDataPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
    auto* gpuClusterIdxB = &hfc.clusterIdxPerModule.h()[(module + hfc.nModules) * hfc.maxClustersPerModule];
    int nClustersBGpu    = hfc.nClustersPerModule.h()[module + hfc.nModules];

    for (int i = 0; i < nClustersBGpu; i++) {
      auto& cidx          = gpuClusterIdxB[i];
      auto& clusterGpu    = gpuClusterB[cidx.fIdx];
      unsigned int outIdx = clusters->GetEntriesFast();
      auto* clusterOut    = new ((*clusters)[outIdx])::CbmStsCluster {};

      clusterOut->SetAddress(fConfig->moduleAddrs[module]);
      clusterOut->SetProperties(clusterGpu.fCharge, clusterGpu.fPosition, clusterGpu.fPositionError, cidx.fTime,
                                clusterGpu.fTimeError);
      clusterOut->SetSize(clusterGpu.fSize);
    }

    auto* gpuHits = &hfc.hitsPerModule.h()[module * hfc.maxHitsPerModule];
    int nHitsGpu  = hfc.nHitsPerModule.h()[module];

    for (int i = 0; i < nHitsGpu; i++) {
      auto& hitGpu = gpuHits[i];

      unsigned int outIdx = hits->GetEntriesFast();
      new ((*hits)[outIdx])::CbmStsHit {fConfig->moduleAddrs[module],
                                        TVector3 {hitGpu.fX, hitGpu.fY, hitGpu.fZ},
                                        TVector3 {hitGpu.fDx, hitGpu.fDy, hitGpu.fDz},
                                        hitGpu.fDxy,
                                        0,
                                        0,
                                        double(hitGpu.fTime),
                                        hitGpu.fTimeError,
                                        hitGpu.fDu,
                                        hitGpu.fDv};
    }

  }  // for (int module = 0; module < hfc.nModules; module++)
}

void CbmGpuRecoSts::FetchDigis(size_t& maxDigisPerModule, size_t& nDigisTotal)
{

  // FIXME: This function is a crime against humanity.
  // Digis should be presorted by module already...

  CbmDigiManager* digis = CbmDigiManager::Instance();
  auto& hfc             = fHitfinderCpu;

  // Remove digis from previous timeslice
  fDigisByModuleB.clear();
  fDigisByModuleF.clear();

  // FIXME: GPU reco should use regular digi class too
  nDigis      = digis->GetNofDigis(ECbmModuleId::kSts);
  nDigisTotal = 0;
  for (size_t i = 0; i < nDigis; i++) {
    const ::CbmStsDigi* digi = digis->Get<::CbmStsDigi>(i);
    Int_t systemId           = CbmAddress::GetSystemId(digi->GetAddress());
    // TODO: is this check still needed?
    if (systemId != ToIntegralType(ECbmModuleId::kSts)) { continue; }
    int moduleAddr = CbmStsAddress::GetMotherAddress(digi->GetAddress(), kStsModule);
    // int module     = addrToModuleId(moduleAddr);
    // assert(module < hfc.nModules);
    bool isFront = digi->GetChannel() < hfc.nChannels;

    if (isFront) { fDigisByModuleF[moduleAddr].push_back(*digi); }
    else {
      CbmStsDigi tmpdigi = *digi;
      tmpdigi.SetChannel(digi->GetChannel() - hfc.nChannels);
      fDigisByModuleB[moduleAddr].push_back(tmpdigi);
    }

    nDigisTotal++;
  }

  nDigisUsed = nDigisTotal;

  maxDigisPerModule = 0;
  for (const auto& mod : fDigisByModuleF) {
    maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size());
  }

  for (const auto& mod : fDigisByModuleB) {
    maxDigisPerModule = std::max(maxDigisPerModule, mod.second.size());
  }
}