diff --git a/algo/base/gpu/PaddedValue.h b/algo/base/gpu/PaddedValue.h
new file mode 100644
index 0000000000000000000000000000000000000000..f82242e8068a4db76a5725afbe4f34c1f5664292
--- /dev/null
+++ b/algo/base/gpu/PaddedValue.h
@@ -0,0 +1,84 @@
+/* Copyright (C) 2024 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+
+#pragma once
+
+#include <cstddef>
+#include <utility>  // std::move
+
+#include <xpu/defines.h>
+
+/**
+ * @file PaddedValue.h
+ * @brief This file contains the definition of the PaddedValue class.
+ */
+
+namespace cbm::algo
+{
+
+  /**
+   * @brief A class that represents a value padded to a certain size.
+   * @tparam T The type of the value.
+   * @tparam N Number of bytes the value should be padded to.
+   *
+   * @note This class is useful for aligning values to a certain size, e.g. to ensure that
+   *       atomic counters are spread across different cache lines (prevents false sharing).
+   */
+  template<typename T, size_t N>
+  class PaddedValue {
+    static_assert(N % alignof(T) == 0, "N must be a multiple of alignof(T)");
+    static_assert(sizeof(T) < N, "T must be smaller than N bytes to leave room for padding");
+
+   public:
+    XPU_D PaddedValue() = default;
+    XPU_D PaddedValue(const T& value) : fValue(value) {}
+
+    XPU_D PaddedValue(const PaddedValue& other) : fValue(other.fValue) {}
+    XPU_D PaddedValue& operator=(const PaddedValue& other)
+    {
+      fValue = other.fValue;
+      return *this;
+    }
+
+    XPU_D PaddedValue(PaddedValue&& other) : fValue(std::move(other.fValue)) {}
+    XPU_D PaddedValue& operator=(PaddedValue&& other)
+    {
+      fValue = std::move(other.fValue);
+      return *this;
+    }
+
+    XPU_D T& operator=(const T& value)
+    {
+      fValue = value;
+      return fValue;
+    }
+
+    XPU_D T& Get() { return fValue; }
+    XPU_D const T& Get() const { return fValue; }
+
+    XPU_D T* operator&() { return &fValue; }
+    XPU_D const T* operator&() const { return &fValue; }
+
+    XPU_D T& operator*() { return fValue; }
+    XPU_D const T& operator*() const { return fValue; }
+
+    XPU_D operator T&() { return fValue; }
+    XPU_D operator const T&() const { return fValue; }
+
+    XPU_D operator T*() { return &fValue; }
+    XPU_D operator const T*() const { return &fValue; }
+
+    XPU_D T* operator->() { return &fValue; }
+    XPU_D const T* operator->() const { return &fValue; }
+
+   private:
+    T fValue;
+    unsigned char fPadding[N - sizeof(T)];
+  };
+
+  inline constexpr size_t SizeOfCacheLine = 64;
+
+  template<typename T>
+  using PaddedToCacheLine = PaddedValue<T, SizeOfCacheLine>;
+}  // namespace cbm::algo
diff --git a/algo/detectors/sts/Hitfinder.h b/algo/detectors/sts/Hitfinder.h
index 03ad607c0f5fef73c15af672b2ea236ea89a5fe2..af69fab65a2451fd91ee77191962412a207cbc45 100644
--- a/algo/detectors/sts/Hitfinder.h
+++ b/algo/detectors/sts/Hitfinder.h
@@ -8,6 +8,7 @@
 #include "CbmStsDigi.h"
 #include "Definitions.h"
 #include "gpu/DeviceImage.h"
+#include "gpu/PaddedValue.h"
 #include "gpu/Params.h"
 #include "sts/Cluster.h"
 #include "sts/Hit.h"
@@ -268,12 +269,11 @@ namespace cbm::algo::sts
     // Number of clusters in each module
     // size = 2 * nModules
     // FIXME: Should be size_t!
-    xpu::buffer<int> nClustersPerModule;
+    xpu::buffer<PaddedToCacheLine<int>> nClustersPerModule;
 
     // Max time error of clusters on front- and backside of a module
-    // size = 1 (???)
-    // FIXME: size should be 2 * nModules? And only one array!
-    xpu::buffer<float> maxClusterTimeErrorByModuleSide;
+    // size = 2 * nModules
+    xpu::buffer<PaddedToCacheLine<float>> maxClusterTimeErrorByModuleSide;
 
     // output
 
@@ -294,7 +294,7 @@ namespace cbm::algo::sts
     // Number of hits in each module
     // size = nModules
     // FIXME: Should be size_t!
-    xpu::buffer<int> nHitsPerModule;
+    xpu::buffer<PaddedToCacheLine<int>> nHitsPerModule;
 
     // Flat array of hits. size = nHitsTotal
     size_t hitsFlatCapacity;
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index c96b3f2dca8e11b8c3d098cdff155a09158cadc2..68d253d2592ce0e1e6f6856d3bcb9c004cdba457 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -158,8 +158,8 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   std::vector<ClusterIdx> clusterIdxPerModule;
   clusterIdxPerModule.resize(props.size());
-  std::vector<int> nClustersPerModule;
-  nClustersPerModule.resize(fPars->setup.modules.size() * 2);
+  std::vector<PaddedToCacheLine<int>> nClustersPerModule;
+  nClustersPerModule.resize(fPars->setup.modules.size() * 2, 0);
 
   queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size());
   queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size());
@@ -225,7 +225,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   for (size_t m = 0; m < nModules * 2; m++) {
     if (static_cast<size_t>(nClusters[m]) > hfc.maxClustersPerModule) {
-      L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << nClusters[m]
+      L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << *nClusters[m]
                 << " (of " << hfc.maxClustersPerModule << " max)"
                 << " clusters!";
       nClusters[m] = hfc.maxClustersPerModule;
     }
@@ -240,7 +240,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   for (size_t m = 0; m < nModules; m++) {
     if (static_cast<size_t>(nHits[m]) > hfc.maxHitsPerModule) {
-      L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << nHits[m] << " (of "
+      L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << *nHits[m] << " (of "
                 << hfc.maxHitsPerModule << " max)"
                 << " hits!";
       nHits[m] = hfc.maxHitsPerModule;
     }
@@ -635,7 +635,7 @@ PartitionedVector<sts::Cluster> sts::HitfinderChain::FlattenClusters(xpu::queue
   return out;
 }
 
-size_t sts::HitfinderChain::GetNHits(xpu::h_view<int> nHitsPerModule, int module)
+size_t sts::HitfinderChain::GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module)
 {
   return std::min<size_t>(nHitsPerModule[module], fHitfinder.maxHitsPerModule);
 }
@@ -806,7 +806,8 @@ void sts::HitfinderChain::EnsureChannelOffsets(gsl::span<u32> channelOffsetsByMo
   }
 }
 
-void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx, gsl::span<int> hNClusters)
+void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx,
+                                             gsl::span<PaddedToCacheLine<int>> hNClusters)
 {
   for (size_t m = 0; m < 2 * fPars->setup.modules.size(); m++) {
     int nClusters = hNClusters[m];
diff --git a/algo/detectors/sts/HitfinderChain.h b/algo/detectors/sts/HitfinderChain.h
index 8bd27cf5604438ba2ae93745c9b5d868499f37f6..fa3e695c352bc9b139595c0a0f0e2138a52e903c 100644
--- a/algo/detectors/sts/HitfinderChain.h
+++ b/algo/detectors/sts/HitfinderChain.h
@@ -116,7 +116,7 @@ namespace cbm::algo::sts
      *
      * @note: Wrapper method required as buckets might overflow.
     *        This corrects for that.
     **/
-    size_t GetNHits(xpu::h_view<int> nHitsPerModule, int module);
+    size_t GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module);
 
    /**
     * Divide Hits into streams.
@@ -138,7 +138,7 @@ namespace cbm::algo::sts
    void EnsureDigiOffsets(DigiMap&);
    void EnsureDigisSorted();
    void EnsureChannelOffsets(gsl::span<u32>);
-   void EnsureClustersSane(gsl::span<ClusterIdx>, gsl::span<int>);
+   void EnsureClustersSane(gsl::span<ClusterIdx>, gsl::span<PaddedToCacheLine<int>>);
    void EnsureClustersSorted();
    void EnsureHitsSorted(PartitionedSpan<sts::Hit>);
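
Note (not part of the patch): the point of `PaddedToCacheLine` is that per-module counters updated concurrently no longer share a cache line. Below is a minimal host-side sketch of the same idea using `std::thread` and `std::atomic`; the patch itself targets GPU/xpu buffers, and the names here (`Padded`, `nModules`, the iteration count) are illustrative, not from the repository.

```cpp
#include <atomic>
#include <cstddef>
#include <thread>
#include <vector>

// Illustrative stand-in for cbm::algo::PaddedToCacheLine: trailing bytes pad
// each value out to a full 64-byte cache line, so threads that increment
// different counters never write to the same line (no false sharing).
template<typename T, std::size_t N = 64>
struct Padded {
  static_assert(sizeof(T) < N, "T must leave room for at least one padding byte");
  T value{};  // zero-initialized (guaranteed for std::atomic since C++20)
  unsigned char padding[N - sizeof(T)];
};

int main()
{
  constexpr std::size_t nModules = 8;
  std::vector<Padded<std::atomic<int>>> counters(nModules);

  // One worker per "module", mirroring the hitfinder's per-module cluster/hit
  // counters: each thread only ever touches its own counter's cache line.
  std::vector<std::thread> workers;
  for (std::size_t m = 0; m < nModules; m++) {
    workers.emplace_back([&counters, m] {
      for (int i = 0; i < 1000000; i++) {
        counters[m].value.fetch_add(1, std::memory_order_relaxed);
      }
    });
  }
  for (auto& t : workers) {
    t.join();
  }
  return 0;
}
```

The real `PaddedValue` additionally forwards `*`, `&`, `->`, and conversion operators so padded values mostly behave like a plain `T`; that is why the call sites above only needed type changes, plus the occasional explicit `*` in the log statements (streaming the wrapper directly would be ambiguous between its `T&` and `T*` conversions).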