From 005efdadad6491483902b78a7d9fe90ce793275d Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de>
Date: Mon, 18 Mar 2024 14:19:55 +0000
Subject: [PATCH] STS reco: Add padding between atomic counters to prevent
 false sharing.

---
 algo/base/gpu/PaddedValue.h           | 83 ++++++++++++++++++++++++++++
 algo/detectors/sts/Hitfinder.h        | 10 ++--
 algo/detectors/sts/HitfinderChain.cxx | 13 +++--
 algo/detectors/sts/HitfinderChain.h   |  4 +-
 4 files changed, 97 insertions(+), 13 deletions(-)
 create mode 100644 algo/base/gpu/PaddedValue.h

diff --git a/algo/base/gpu/PaddedValue.h b/algo/base/gpu/PaddedValue.h
new file mode 100644
index 0000000000..f82242e806
--- /dev/null
+++ b/algo/base/gpu/PaddedValue.h
@@ -0,0 +1,83 @@
+/* Copyright (C) 2024 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+
+#pragma once
+
+#include <cstddef>
+#include <utility>
+
+#include <xpu/defines.h>
+
+/**
+ * @file PaddedValue.h
+ * @brief This file contains the definition of the PaddedValue class.
+ */
+
+namespace cbm::algo
+{
+
+  /**
+   * @brief A value padded to a given size of N bytes.
+   * @tparam T The type of the wrapped value.
+   * @tparam N Number of bytes the value should be padded to.
+   *
+   * @note Useful for spacing out values in an array, e.g. to ensure that atomic counters sit on different cache lines and don't cause false sharing.
+   */
+  template<typename T, size_t N>
+  class PaddedValue {
+    static_assert(N % alignof(T) == 0 && sizeof(T) < N, "N must be a multiple of alignof(T) and larger than sizeof(T)");
+
+   public:
+    XPU_D PaddedValue() = default;
+    XPU_D PaddedValue(const T& value) : fValue(value) {}
+
+    XPU_D PaddedValue(const PaddedValue& other) : fValue(other.fValue) {}
+    XPU_D PaddedValue& operator=(const PaddedValue& other)
+    {
+      fValue = other.fValue;
+      return *this;
+    }
+
+    XPU_D PaddedValue(PaddedValue&& other) : fValue(std::move(other.fValue)) {}
+    XPU_D PaddedValue& operator=(PaddedValue&& other)
+    {
+      fValue = std::move(other.fValue);
+      return *this;
+    }
+
+    XPU_D T& operator=(const T& value)
+    {
+      fValue = value;
+      return fValue;
+    }
+
+    XPU_D T& Get() { return fValue; }
+    XPU_D const T& Get() const { return fValue; }
+
+    XPU_D T* operator&() { return &fValue; }
+    XPU_D const T* operator&() const { return &fValue; }
+
+    XPU_D T& operator*() { return fValue; }
+    XPU_D const T& operator*() const { return fValue; }
+
+    XPU_D operator T&() { return fValue; }
+    XPU_D operator const T&() const { return fValue; }
+
+    XPU_D operator T*() { return &fValue; }
+    XPU_D operator const T*() const { return &fValue; }
+
+    XPU_D T* operator->() { return &fValue; }
+    XPU_D const T* operator->() const { return &fValue; }
+
+   private:
+    T fValue;
+    unsigned char fPadding[N - sizeof(T)];
+  };
+
+  inline constexpr size_t SizeOfCacheLine = 64;
+
+  template<typename T>
+  using PaddedToCacheLine = PaddedValue<T, SizeOfCacheLine>;
+
+} // namespace cbm::algo
diff --git a/algo/detectors/sts/Hitfinder.h b/algo/detectors/sts/Hitfinder.h
index 03ad607c0f..af69fab65a 100644
--- a/algo/detectors/sts/Hitfinder.h
+++ b/algo/detectors/sts/Hitfinder.h
@@ -8,6 +8,7 @@
 #include "CbmStsDigi.h"
 #include "Definitions.h"
 #include "gpu/DeviceImage.h"
+#include "gpu/PaddedValue.h"
 #include "gpu/Params.h"
 #include "sts/Cluster.h"
 #include "sts/Hit.h"
@@ -268,12 +269,11 @@ namespace cbm::algo::sts
     // Number of clusters in each module
     // size = 2 * nModules
     // FIXME: Should be size_t!
-    xpu::buffer<int> nClustersPerModule;
+    xpu::buffer<PaddedToCacheLine<int>> nClustersPerModule;
 
     // Max time error of clusters on front- and backside of a module
-    // size = 1 (???)
-    // FIXME: size should be 2 * nModules? And only one array!
-    xpu::buffer<float> maxClusterTimeErrorByModuleSide;
+    // size = 2 * nModules
+    xpu::buffer<PaddedToCacheLine<float>> maxClusterTimeErrorByModuleSide;
 
     // output
 
@@ -294,7 +294,7 @@ namespace cbm::algo::sts
     // Number of hits in each module
     // size = nModules
     // FIXME: Should be size_t!
-    xpu::buffer<int> nHitsPerModule;
+    xpu::buffer<PaddedToCacheLine<int>> nHitsPerModule;
 
     // Flat array of hits. size = nHitsTotal
     size_t hitsFlatCapacity;
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index c96b3f2dca..68d253d259 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -158,8 +158,8 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
 
   std::vector<ClusterIdx> clusterIdxPerModule;
   clusterIdxPerModule.resize(props.size());
-  std::vector<int> nClustersPerModule;
-  nClustersPerModule.resize(fPars->setup.modules.size() * 2);
+  std::vector<PaddedToCacheLine<int>> nClustersPerModule;
+  nClustersPerModule.resize(fPars->setup.modules.size() * 2, 0);
 
   queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size());
   queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size());
@@ -225,7 +225,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   for (size_t m = 0; m < nModules * 2; m++) {
     if (static_cast<size_t>(nClusters[m]) > hfc.maxClustersPerModule) {
-      L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << nClusters[m]
+      L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << *nClusters[m]
                 << " (of " << hfc.maxClustersPerModule << " max)"
                 << " clusters!";
       nClusters[m] = hfc.maxClustersPerModule;
@@ -240,7 +240,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
   for (size_t m = 0; m < nModules; m++) {
     if (static_cast<size_t>(nHits[m]) > hfc.maxHitsPerModule) {
-      L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << nHits[m] << " (of "
+      L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << *nHits[m] << " (of "
                 << hfc.maxHitsPerModule << " max)"
                 << " hits!";
       nHits[m] = hfc.maxHitsPerModule;
@@ -635,7 +635,7 @@ PartitionedVector<sts::Cluster> sts::HitfinderChain::FlattenClusters(xpu::queue
   return out;
 }
 
-size_t sts::HitfinderChain::GetNHits(xpu::h_view<int> nHitsPerModule, int module)
+size_t sts::HitfinderChain::GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module)
 {
   return std::min<size_t>(nHitsPerModule[module], fHitfinder.maxHitsPerModule);
 }
@@ -806,7 +806,8 @@ void sts::HitfinderChain::EnsureChannelOffsets(gsl::span<u32> channelOffsetsByMo
   }
 }
 
-void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx, gsl::span<int> hNClusters)
+void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx,
+                                             gsl::span<PaddedToCacheLine<int>> hNClusters)
 {
   for (size_t m = 0; m < 2 * fPars->setup.modules.size(); m++) {
     int nClusters = hNClusters[m];
diff --git a/algo/detectors/sts/HitfinderChain.h b/algo/detectors/sts/HitfinderChain.h
index 8bd27cf560..fa3e695c35 100644
--- a/algo/detectors/sts/HitfinderChain.h
+++ b/algo/detectors/sts/HitfinderChain.h
@@ -116,7 +116,7 @@ namespace cbm::algo::sts
      *
      * @note: Wrapper method required as buckets might overflow. This corrects for that.
      **/
-    size_t GetNHits(xpu::h_view<int> nHitsPerModule, int module);
+    size_t GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module);
 
     /**
      * Divide Hits into streams.
@@ -138,7 +138,7 @@ namespace cbm::algo::sts
     void EnsureDigiOffsets(DigiMap&);
     void EnsureDigisSorted();
     void EnsureChannelOffsets(gsl::span<u32>);
-    void EnsureClustersSane(gsl::span<ClusterIdx>, gsl::span<int>);
+    void EnsureClustersSane(gsl::span<ClusterIdx>, gsl::span<PaddedToCacheLine<int>>);
     void EnsureClustersSorted();
     void EnsureHitsSorted(PartitionedSpan<sts::Hit>);
 
-- 
GitLab
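
For reference, a minimal standalone sketch of the false-sharing effect this patch addresses. It is not part of the patch: it inlines a trimmed copy of PaddedValue (the XPU_D device qualifiers and the xpu include are dropped so it compiles on its own), the thread count of 4 is an arbitrary choice, and the loop count is only meant to make the timing difference visible. Each worker thread increments its own counter, so the code is race-free either way; with plain ints the counters still share cache lines and the writes keep invalidating each other's lines, while the padded variant gives every counter its own 64-byte line.

#include <chrono>
#include <cstddef>
#include <iostream>
#include <thread>
#include <vector>

// Trimmed-down copy of cbm::algo::PaddedValue, without the XPU_D qualifiers.
template<typename T, std::size_t N>
class PaddedValue {
  static_assert(N % alignof(T) == 0 && sizeof(T) < N, "N must be a multiple of alignof(T) and larger than sizeof(T)");

 public:
  PaddedValue() = default;
  PaddedValue(const T& value) : fValue(value) {}
  operator T&() { return fValue; }

 private:
  T fValue{};
  unsigned char fPadding[N - sizeof(T)];
};

inline constexpr std::size_t SizeOfCacheLine = 64;
template<typename T>
using PaddedToCacheLine = PaddedValue<T, SizeOfCacheLine>;

// Run nThreads workers, each bumping only its own counter; returns wall time in ms.
template<typename Counter>
double TimeCounters(std::size_t nThreads)
{
  std::vector<Counter> counters(nThreads);  // value-initialized to zero
  auto start = std::chrono::steady_clock::now();

  std::vector<std::thread> workers;
  for (std::size_t t = 0; t < nThreads; t++) {
    workers.emplace_back([&counters, t] {
      // volatile keeps the increment as a real load/store on every iteration,
      // so the cache-line traffic is not optimized away.
      volatile int& c = counters[t];
      for (int i = 0; i < 50000000; i++) c = c + 1;
    });
  }
  for (auto& w : workers) w.join();

  std::chrono::duration<double, std::milli> ms = std::chrono::steady_clock::now() - start;
  return ms.count();
}

int main()
{
  std::cout << "int:                    " << TimeCounters<int>(4) << " ms\n";
  std::cout << "PaddedToCacheLine<int>: " << TimeCounters<PaddedToCacheLine<int>>(4) << " ms\n";
}

On typical x86 hardware the padded run is expected to be several times faster; the exact factor depends on core count and cache topology. The same reasoning applies to the nClustersPerModule and nHitsPerModule buffers above, where many GPU/CPU threads increment per-module counters concurrently.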