diff --git a/algo/base/gpu/xpu_legacy.h b/algo/base/gpu/xpu_legacy.h
index ad78c090fc329cc56166676b06d62a426bf161af..71e4f6bd6b158e133ae447f3a06516f666a39be8 100644
--- a/algo/base/gpu/xpu_legacy.h
+++ b/algo/base/gpu/xpu_legacy.h
@@ -52,6 +52,22 @@ namespace xpu
     _Q.wait();
   }

+  template<typename T>
+  void copy(T* dst, const T* src, size_t nelems)
+  {
+    static xpu::queue _Q;
+    _Q.copy(src, dst, nelems);
+    _Q.wait();
+  }
+
+  template<typename Kernel, typename... Args>
+  void run_kernel(xpu::grid params, Args&&... args)
+  {
+    static xpu::queue _Q;
+    _Q.launch<Kernel>(params, std::forward<Args>(args)...);
+    _Q.wait();
+  }
+
   enum class side
   {
     host,
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index 906877f4f07a7e568cc6329132c68fd6e458f79e..03d63a89815c727d70c8a0a56ca3aad9935725f3 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -116,7 +116,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   // TODO add support in xpu, for buffer copies with offset + size
   const CbmStsDigi* digisH = xpu::h_view(hfc.digisPerModule).data();
   CbmStsDigi* digisD = hfc.digisPerModule.get();
-  if (digisH != digisD) queue.copy(digisH, digisD, sizeof(CbmStsDigi) * nDigisTotal);
+  if (digisH != digisD) queue.copy(digisH, digisD, nDigisTotal);
   queue.copy(hfc.digiOffsetPerModule, xpu::h2d);

   L_(debug) << "STS Hitfinder Chain: Sort Digis...";
@@ -149,7 +149,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
     xpu::buffer_prop propsOffset{hfc.channelOffsetPerModule};
     std::vector<u32> channelOffsetPerModule;
     channelOffsetPerModule.resize(propsOffset.size());
-    queue.copy(hfc.channelOffsetPerModule.get(), channelOffsetPerModule.data(), propsOffset.size_bytes());
+    queue.copy(hfc.channelOffsetPerModule.get(), channelOffsetPerModule.data(), propsOffset.size());
     queue.wait();
     EnsureChannelOffsets(channelOffsetPerModule);
   }
@@ -161,8 +161,8 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
     std::vector<int> nClustersPerModule;
     nClustersPerModule.resize(fPars->setup.modules.size() * 2);

-    queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size_bytes());
-    queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size() * sizeof(int));
+    queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size());
+    queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size());
     queue.wait();
     EnsureClustersSane(clusterIdxPerModule, nClustersPerModule);
   }
@@ -381,7 +381,7 @@ void sts::HitfinderChain::AllocateDynamic(size_t maxNDigisPerModule, size_t nDig
   fHitfinder.nHitsPerModule.reset(nModules, xpu::buf_io);

   fHitfinder.hitsFlatCapacity = maxHitsTotal;
-  fHitfinder.hitsFlat.reset(maxHitsTotal, xpu::buf_host);
+  fHitfinder.hitsFlat.reset(maxHitsTotal, xpu::buf_pinned);
 }

 sts::HitfinderChain::DigiMap sts::HitfinderChain::CountDigisPerModules(gsl::span<const CbmStsDigi> digis)
@@ -576,8 +576,7 @@ PartitionedSpan<sts::Hit> sts::HitfinderChain::FlattenHits(xpu::queue queue)
   size_t nHitsCopied = 0;
   for (int m = 0; m < hfc.nModules; m++) {
     size_t numHitsInModule = GetNHits(nHits, m);
-    queue.copy(hfc.hitsPerModule.get() + hfc.hitsAllocatedPerModule * m, hits.data() + nHitsCopied,
-               numHitsInModule * sizeof(sts::Hit));
+    queue.copy(hfc.hitsPerModule.get() + hfc.hitsAllocatedPerModule * m, hits.data() + nHitsCopied, numHitsInModule);
     nHitsCopied += numHitsInModule;
   }
 }
@@ -626,8 +625,7 @@ PartitionedVector<sts::Cluster> sts::HitfinderChain::FlattenClusters(xpu::queue
   size_t offset = 0;
   for (int m = 0; m < nModuleSides; m++) {
     size_t nClustersInModule = nClusters[m];
-    queue.copy(clusters + maxClustersPerModule * m, clustersFlat.data() + offset,
-               nClustersInModule * sizeof(sts::Cluster));
+    queue.copy(clusters + maxClustersPerModule * m, clustersFlat.data() + offset, nClustersInModule);
     offset += nClustersInModule;
   }
   queue.wait();
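
Reviewer note (not part of the patch): the call sites above switch the third argument of queue.copy from a byte count to an element count. A minimal sketch of that convention follows, assuming <xpu/host.h> as the host-side xpu header and that xpu device setup happens elsewhere; the helper name CopyDigisToDevice is hypothetical and only illustrates the updated call style.

#include <xpu/host.h>  // assumed host-side xpu header

#include "CbmStsDigi.h"

// Hypothetical helper illustrating the new convention: queue::copy takes a
// number of elements, so no sizeof(CbmStsDigi) factor is applied.
inline void CopyDigisToDevice(xpu::queue& queue, const CbmStsDigi* digisH, CbmStsDigi* digisD, size_t nDigis)
{
  if (digisH != digisD) queue.copy(digisH, digisD, nDigis);  // was: sizeof(CbmStsDigi) * nDigis
  queue.wait();  // block until the transfer has finished
}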