From 005efdadad6491483902b78a7d9fe90ce793275d Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de>
Date: Mon, 18 Mar 2024 14:19:55 +0000
Subject: [PATCH] STS reco: Add padding between atomic counters to prevent
 false sharing.

---
 algo/base/gpu/PaddedValue.h           | 82 +++++++++++++++++++++++++++
 algo/detectors/sts/Hitfinder.h        | 10 ++--
 algo/detectors/sts/HitfinderChain.cxx | 13 +++--
 algo/detectors/sts/HitfinderChain.h   |  4 +-
 4 files changed, 96 insertions(+), 13 deletions(-)
 create mode 100644 algo/base/gpu/PaddedValue.h

diff --git a/algo/base/gpu/PaddedValue.h b/algo/base/gpu/PaddedValue.h
new file mode 100644
index 0000000000..f82242e806
--- /dev/null
+++ b/algo/base/gpu/PaddedValue.h
@@ -0,0 +1,85 @@
+/* Copyright (C) 2024 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+
+#pragma once
+
+#include <cstddef>
+#include <utility>
+
+#include <xpu/defines.h>
+
+/**
+ * @file PaddedValue.h
+ * @brief This file contains the definition of the PaddedValue class.
+ */
+
+namespace cbm::algo
+{
+
+  /**
+   * @brief A class that represents a value with padding to a certain size.
+   * @tparam T The type of the value.
+   * @tparam N Number of bytes the value should be padded to.
+   *
+   * @note This class is useful for aligning values to a certain size, e.g. to ensure that atomic counters are spread across different cache lines. (Prevent false sharing)
+   */
+  template<typename T, size_t N>
+  class PaddedValue {
+    static_assert(N % alignof(T) == 0, "N must be a multiple of alignof(T)");
+    // Guard the padding array below: size 0 is ill-formed, and N < sizeof(T) would underflow size_t.
+    static_assert(sizeof(T) < N, "sizeof(T) must be smaller than N to leave room for padding");
+
+   public:
+    XPU_D PaddedValue() = default;
+    XPU_D PaddedValue(const T& value) : fValue(value) {}
+
+    XPU_D PaddedValue(const PaddedValue& other) : fValue(other.fValue) {}
+    XPU_D PaddedValue& operator=(const PaddedValue& other)
+    {
+      fValue = other.fValue;
+      return *this;
+    }
+
+    XPU_D PaddedValue(PaddedValue&& other) : fValue(std::move(other.fValue)) {}
+    XPU_D PaddedValue& operator=(PaddedValue&& other)
+    {
+      fValue = std::move(other.fValue);
+      return *this;
+    }
+
+    XPU_D T& operator=(const T& value)
+    {
+      fValue = value;
+      return fValue;
+    }
+
+    XPU_D T& Get() { return fValue; }
+    XPU_D const T& Get() const { return fValue; }
+
+    XPU_D T* operator&() { return &fValue; }
+    XPU_D const T* operator&() const { return &fValue; }
+
+    XPU_D T& operator*() { return fValue; }
+    XPU_D const T& operator*() const { return fValue; }
+
+    XPU_D operator T&() { return fValue; }
+    XPU_D operator const T&() const { return fValue; }
+
+    XPU_D operator T*() { return &fValue; }
+    XPU_D operator const T*() const { return &fValue; }
+
+    XPU_D T* operator->() { return &fValue; }
+    XPU_D const T* operator->() const { return &fValue; }
+
+   private:
+    T fValue;
+    unsigned char fPadding[N - sizeof(T)];
+  };
+
+  inline constexpr size_t SizeOfCacheLine = 64;
+
+  template<typename T>
+  using PaddedToCacheLine = PaddedValue<T, SizeOfCacheLine>;
+
+}  // namespace cbm::algo
diff --git a/algo/detectors/sts/Hitfinder.h b/algo/detectors/sts/Hitfinder.h
index 03ad607c0f..af69fab65a 100644
--- a/algo/detectors/sts/Hitfinder.h
+++ b/algo/detectors/sts/Hitfinder.h
@@ -8,6 +8,7 @@
 #include "CbmStsDigi.h"
 #include "Definitions.h"
 #include "gpu/DeviceImage.h"
+#include "gpu/PaddedValue.h"
 #include "gpu/Params.h"
 #include "sts/Cluster.h"
 #include "sts/Hit.h"
@@ -268,12 +269,11 @@ namespace cbm::algo::sts
     // Number of clusters in each module
     // size = 2 * nModules
     // FIXME: Should be size_t!
-    xpu::buffer<int> nClustersPerModule;
+    xpu::buffer<PaddedToCacheLine<int>> nClustersPerModule;
 
     // Max time error of clusters on front- and backside of a module
-    // size = 1 (???)
-    // FIXME: size should be 2 * nModules? And only one array!
-    xpu::buffer<float> maxClusterTimeErrorByModuleSide;
+    // size = 2 * nModules
+    xpu::buffer<PaddedToCacheLine<float>> maxClusterTimeErrorByModuleSide;
 
     // output
 
@@ -294,7 +294,7 @@ namespace cbm::algo::sts
     // Number of hits in each module
     // size = nModules
     // FIXME: Should be size_t!
-    xpu::buffer<int> nHitsPerModule;
+    xpu::buffer<PaddedToCacheLine<int>> nHitsPerModule;
 
     // Flat array of hits. size = nHitsTotal
     size_t hitsFlatCapacity;
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index c96b3f2dca..68d253d259 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -158,8 +158,8 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
 
     std::vector<ClusterIdx> clusterIdxPerModule;
     clusterIdxPerModule.resize(props.size());
-    std::vector<int> nClustersPerModule;
-    nClustersPerModule.resize(fPars->setup.modules.size() * 2);
+    std::vector<PaddedToCacheLine<int>> nClustersPerModule;
+    nClustersPerModule.resize(fPars->setup.modules.size() * 2, 0);
 
     queue.copy(hfc.clusterIdxPerModule.get(), clusterIdxPerModule.data(), props.size());
     queue.copy(hfc.nClustersPerModule.get(), nClustersPerModule.data(), nClustersPerModule.size());
@@ -225,7 +225,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
 
     for (size_t m = 0; m < nModules * 2; m++) {
       if (static_cast<size_t>(nClusters[m]) > hfc.maxClustersPerModule) {
-        L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << nClusters[m]
+        L_(error) << "STS Hitfinder Chain: Cluster bucket overflow in module " << m << " with " << *nClusters[m]
                   << " (of " << hfc.maxClustersPerModule << " max)"
                   << " clusters!";
         nClusters[m] = hfc.maxClustersPerModule;
@@ -240,7 +240,7 @@ sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmS
 
     for (size_t m = 0; m < nModules; m++) {
       if (static_cast<size_t>(nHits[m]) > hfc.maxHitsPerModule) {
-        L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << nHits[m] << " (of "
+        L_(error) << "STS Hitfinder Chain: Hit bucket overflow in module " << m << " with " << *nHits[m] << " (of "
                   << hfc.maxHitsPerModule << " max)"
                   << " hits!";
         nHits[m] = hfc.maxHitsPerModule;
@@ -635,7 +635,7 @@ PartitionedVector<sts::Cluster> sts::HitfinderChain::FlattenClusters(xpu::queue
   return out;
 }
 
-size_t sts::HitfinderChain::GetNHits(xpu::h_view<int> nHitsPerModule, int module)
+size_t sts::HitfinderChain::GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module)
 {
   return std::min<size_t>(nHitsPerModule[module], fHitfinder.maxHitsPerModule);
 }
@@ -806,7 +806,8 @@ void sts::HitfinderChain::EnsureChannelOffsets(gsl::span<u32> channelOffsetsByMo
   }
 }
 
-void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx, gsl::span<int> hNClusters)
+void sts::HitfinderChain::EnsureClustersSane(gsl::span<ClusterIdx> hClusterIdx,
+                                             gsl::span<PaddedToCacheLine<int>> hNClusters)
 {
   for (size_t m = 0; m < 2 * fPars->setup.modules.size(); m++) {
     int nClusters = hNClusters[m];
diff --git a/algo/detectors/sts/HitfinderChain.h b/algo/detectors/sts/HitfinderChain.h
index 8bd27cf560..fa3e695c35 100644
--- a/algo/detectors/sts/HitfinderChain.h
+++ b/algo/detectors/sts/HitfinderChain.h
@@ -116,7 +116,7 @@ namespace cbm::algo::sts
      *
      * @note: Wrapper method required as buckets might overflow. This corrects for that.
      **/
-    size_t GetNHits(xpu::h_view<int> nHitsPerModule, int module);
+    size_t GetNHits(xpu::h_view<PaddedToCacheLine<int>> nHitsPerModule, int module);
 
     /**
      * Divide Hits into streams.
@@ -138,7 +138,7 @@ namespace cbm::algo::sts
     void EnsureDigiOffsets(DigiMap&);
     void EnsureDigisSorted();
     void EnsureChannelOffsets(gsl::span<u32>);
-    void EnsureClustersSane(gsl::span<ClusterIdx>, gsl::span<int>);
+    void EnsureClustersSane(gsl::span<ClusterIdx>, gsl::span<PaddedToCacheLine<int>>);
     void EnsureClustersSorted();
     void EnsureHitsSorted(PartitionedSpan<sts::Hit>);
 
-- 
GitLab