From c7ae6fc446d323b14680644dc677af0c6d602ccb Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de>
Date: Sun, 22 Oct 2023 09:29:35 +0000
Subject: [PATCH] cbmreco: Enable storage of STS hits.

---
 algo/CMakeLists.txt                   |   1 +
 algo/base/Definitions.h               |   4 +-
 algo/base/PartitionedVector.h         | 208 ++++++++++++++++++++++++++
 algo/data/sts/Hit.h                   |  21 +++
 algo/detectors/sts/HitfinderChain.cxx | 104 ++++++++++---
 algo/detectors/sts/HitfinderChain.h   |  20 ++-
 algo/global/Reco.cxx                  |  24 ++-
 algo/global/Reco.h                    |   1 +
 algo/global/StorableRecoResults.h     |   5 +
 algo/test/CMakeLists.txt              |   6 +
 algo/test/_GTestPartitionedVector.cxx |  98 ++++++++++++
 external/CMakeLists.txt               |   2 +-
 reco/app/cbmreco/main.cxx             |  24 ++-
 13 files changed, 488 insertions(+), 30 deletions(-)
 create mode 100644 algo/base/PartitionedVector.h
 create mode 100644 algo/test/_GTestPartitionedVector.cxx

diff --git a/algo/CMakeLists.txt b/algo/CMakeLists.txt
index 3b601fea6c..7e7b4c6abb 100644
--- a/algo/CMakeLists.txt
+++ b/algo/CMakeLists.txt
@@ -144,6 +144,7 @@ install(
     base/Options.h
     base/RecoParams.h
     base/SubChain.h
+    base/PartitionedVector.h
     global/Reco.h
     global/RecoResultsInputArchive.h
     global/RecoResultsOutputArchive.h
diff --git a/algo/base/Definitions.h b/algo/base/Definitions.h
index 274c2e8e05..ff02c3afac 100644
--- a/algo/base/Definitions.h
+++ b/algo/base/Definitions.h
@@ -40,7 +40,8 @@ namespace cbm::algo
 
   enum class RecoData
   {
-    Digi,
+    Digi,       //< Raw output from unpackers
+    DigiEvent,  //< Digis after event building
     Cluster,
     Hit,
   };
@@ -78,6 +79,7 @@ CBM_ENUM_DICT(cbm::algo::Step,
 
 CBM_ENUM_DICT(cbm::algo::RecoData,
   {"Digi", RecoData::Digi},
+  {"DigiEvent", RecoData::DigiEvent},
   {"Cluster", RecoData::Cluster},
   {"Hit", RecoData::Hit}
 );
diff --git a/algo/base/PartitionedVector.h b/algo/base/PartitionedVector.h
new file mode 100644
index 0000000000..008b65d738
--- /dev/null
+++ b/algo/base/PartitionedVector.h
@@ -0,0 +1,208 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+#ifndef CBM_ALGO_BASE_PARTITIONED_VECTOR_H
+#define CBM_ALGO_BASE_PARTITIONED_VECTOR_H
+
+#include <boost/serialization/access.hpp>
+#include <boost/serialization/vector.hpp>
+
+#include <gsl/span>
+#include <vector>
+
+#include "Definitions.h"
+#include "util/PODAllocator.h"
+
+namespace cbm::algo
+{
+
+  /**
+   * @brief A vector that is partitioned into multiple subvectors.
+   *
+   * @tparam T Type of the elements
+   * @tparam Allocator Allocator for the underlying container
+   *
+   * @note The underlying container is contiguous in memory.
+   */
+  template<typename T, class Allocator = std::allocator<T>>
+  class PartitionedVector {
+
+  public:
+    using Container_t = std::vector<T, Allocator>;  //< Underlying container type
+
+    /**
+     * @brief Default constructor. Creates an empty vector.
+     */
+    PartitionedVector()
+    {
+      fOffsets.push_back(0);
+      EnsureDimensions();
+    }
+
+    /**
+     * @brief Constructor. Creates a vector with n partitions.
+     *
+     * @param data Underlying data. Assusmes that the data is already partitioned and takes ownership of it.
+     * @param sizes Sizes of each partitions
+     * @param addresses Hardware addresses of each partition
+     *
+     * @note Requires sizes.size() == addresses.size()
+     */
+    PartitionedVector(Container_t&& data, gsl::span<const size_t> sizes, gsl::span<const u32> addresses)
+      : fData(std::move(data))
+      , fOffsets()
+      , fAdresses(addresses.begin(), addresses.end())
+    {
+      ComputeOffsets(sizes);
+      EnsureDimensions();
+    }
+
+    /**
+     * @brief Copy constructor. Copy the data from other vector.
+     */
+    template<typename OtherAllocator>
+    PartitionedVector(const PartitionedVector<T, OtherAllocator>& other)
+      : fData(other.Data().begin(), other.Data().end())
+      , fOffsets(other.Offsets())
+      , fAdresses(other.Addresses())
+    {
+      // TODO: this check is overkill? We already know that the dimensions are correct,
+      // since they were already checked in the other vector
+      EnsureDimensions();
+    }
+
+    /**
+     * @brief Access data at partition i.
+     */
+    gsl::span<T> operator[](size_t i)
+    {
+      EnsureBounds(i);
+      return UnsafePartitionSpan(i);
+    }
+
+    /**
+     * @brief Access data at partition i.
+     */
+    gsl::span<const T> operator[](size_t i) const
+    {
+      EnsureBounds(i);
+      return UnsafePartitionSpan(i);
+    }
+
+    /**
+     * @brief Get the hardware address of partition i.
+     */
+    u32 Address(size_t i) const
+    {
+      EnsureBounds(i);
+      return fAdresses[i];
+    }
+
+    /**
+     * @brief Get a pair of the data and the hardware address of partition i.
+     */
+    std::pair<gsl::span<T>, u32> Partition(size_t i)
+    {
+      EnsureBounds(i);
+      return std::pair<gsl::span<T>, u32>(UnsafePartitionSpan(i), fAdresses[i]);
+    }
+
+    /**
+     * @brief Get a pair of the data and the hardware address of partition i.
+     */
+    std::pair<gsl::span<const T>, u32> Partition(size_t i) const
+    {
+      EnsureBounds(i);
+      return std::pair<gsl::span<const T>, u32>(UnsafePartitionSpan(i), fAdresses[i]);
+    }
+
+    /**
+     * @brief Get the number of partitions.
+     */
+    size_t NPartitions() const { return fOffsets.size() - 1; }
+
+    /**
+     * @brief Get the size of partition i.
+     */
+    size_t Size(size_t i) const
+    {
+      EnsureBounds(i);
+      return UnsafeSize(i);
+    }
+
+    /**
+     * @brief Get the total number of elements in the container across all partitions.
+     */
+    size_t NElements() const { return fData.size(); }
+
+    /**
+     * @brief Get the underlying data.
+     */
+    const Container_t& Data() const { return fData; }
+
+    /**
+     * @brief Get the addresses.
+     */
+    const std::vector<u32>& Addresses() const { return fAdresses; }
+
+    /**
+     * @brief Get the underlying offsets.
+     */
+    const std::vector<size_t>& Offsets() const { return fOffsets; }
+
+  private:
+    Container_t fData;             //< Data
+    std::vector<size_t> fOffsets;  // < Offsets of the partitions in fData
+    std::vector<u32> fAdresses;    //< Hardware addresses of the partitions
+
+    void EnsureDimensions() const
+    {
+      if (fOffsets.size() - 1 != fAdresses.size()) {
+        throw std::runtime_error("PartitionedVector: fOffsets.size() != fAdresses.size()");
+      }
+      if (fOffsets.back() != fData.size()) {
+        throw std::runtime_error("PartitionedVector: fOffsets.back() != fData.size()");
+      }
+    }
+
+    void EnsureBounds(size_t i) const
+    {
+      if (i >= fOffsets.size() - 1) throw std::out_of_range("PartitionedVector: index out of bounds");
+    }
+
+    void ComputeOffsets(gsl::span<const size_t> sizes)
+    {
+      fOffsets.reserve(sizes.size() + 1);
+      fOffsets.push_back(0);
+      for (auto n : sizes) {
+        fOffsets.push_back(fOffsets.back() + n);
+      }
+    }
+
+    size_t UnsafeSize(size_t i) const { return fOffsets[i + 1] - fOffsets[i]; }
+
+    gsl::span<T> UnsafePartitionSpan(size_t i) { return gsl::span<T>(fData.data() + fOffsets[i], UnsafeSize(i)); }
+
+    gsl::span<const T> UnsafePartitionSpan(size_t i) const
+    {
+      return gsl::span<const T>(fData.data() + fOffsets[i], UnsafeSize(i));
+    }
+
+  private:  // serialization
+    friend class boost::serialization::access;
+
+    template<class Archive>
+    void serialize(Archive& ar, unsigned int /*version*/)
+    {
+      ar& fData;
+      ar& fOffsets;
+      ar& fAdresses;
+    }
+  };
+
+  template<typename T>
+  using PartitionedPODVector = PartitionedVector<T, PODAllocator<T>>;
+
+}  // namespace cbm::algo
+
+#endif
diff --git a/algo/data/sts/Hit.h b/algo/data/sts/Hit.h
index 8446e6e1fc..f310fff53f 100644
--- a/algo/data/sts/Hit.h
+++ b/algo/data/sts/Hit.h
@@ -5,6 +5,8 @@
 #ifndef CBM_ALGO_DATA_STS_HIT_H
 #define CBM_ALGO_DATA_STS_HIT_H
 
+#include <boost/serialization/access.hpp>
+
 #include "Definitions.h"
 
 namespace cbm::algo::sts
@@ -20,6 +22,25 @@ namespace cbm::algo::sts
     real fTimeError;  ///< Error of hit time [ns]
     real fDu;         ///< Error of coordinate across front-side strips [cm]
     real fDv;         ///< Error of coordinate across back-side strips [cm]
+
+  private:  // serialization
+    friend class boost::serialization::access;
+
+    template<class Archive>
+    void serialize(Archive& ar, unsigned int /*version*/)
+    {
+      ar& fX;
+      ar& fY;
+      ar& fZ;
+      ar& fTime;
+      ar& fDx;
+      ar& fDy;
+      ar& fDz;
+      ar& fDxy;
+      ar& fTimeError;
+      ar& fDu;
+      ar& fDv;
+    }
   };
 
 }  // namespace cbm::algo::sts
diff --git a/algo/detectors/sts/HitfinderChain.cxx b/algo/detectors/sts/HitfinderChain.cxx
index 4daa50c861..be4e75377b 100644
--- a/algo/detectors/sts/HitfinderChain.cxx
+++ b/algo/detectors/sts/HitfinderChain.cxx
@@ -7,6 +7,9 @@
 #include <log.hpp>
 #include <numeric>
 
+#include "PODVector.h"
+#include "compat/OpenMP.h"
+
 using namespace cbm::algo;
 
 void sts::HitfinderChain::SetParameters(const sts::HitfinderPars& parameters)
@@ -23,7 +26,7 @@ void sts::HitfinderChain::Finalize()
   xpu::set<TheHitfinder>(fHitfinder);
 }
 
-sts::HitfinderMonitor sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
+sts::HitfinderChain::Result sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi> digis)
 {
   EnsureParameters();
 
@@ -132,7 +135,11 @@ sts::HitfinderMonitor sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi
   L_(debug) << "STS Hitfinder Chain: Find Hits...";
   queue.copy(hfc.nClustersPerModule, xpu::d2h);
   queue.wait();
-  xpu::h_view nClusters {hfc.nClustersPerModule};
+  xpu::h_view nClusters(hfc.nClustersPerModule);
+  size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
+  xpu::k_add_bytes<SortClusters>(nClustersTotal * sizeof(GpuClusterIdx));
+
+
   size_t nClustersFront = std::accumulate(nClusters.begin(), nClusters.begin() + nModules, 0);
 
   // FindHits supports to modes of parallelization: One thread per cluster or one block per module
@@ -142,23 +149,17 @@ sts::HitfinderMonitor sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi
   bool isCpu          = false;
   xpu::grid findHitsG = isCpu ? xpu::n_blocks(nModules) : xpu::n_threads(nClustersFront);
   queue.launch<FindHits>(findHitsG);
+  xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster));
 
-  // Transfer Hits / Cluster back to host
-  // TODO: Hits and Clusters are stored in buckets. Currently we transfer the entire
-  // bucket arrays back to the host. This includes a lot of unused bytes.
-  // Better to either flatten both arrays on the GPU and transfer flat array back.
-  // Or do multiple copies to copy contents of individual buckets.
-  queue.copy(hfc.clusterDataPerModule, xpu::d2h);
-  queue.copy(hfc.nClustersPerModule, xpu::d2h);
-  queue.copy(hfc.hitsPerModule, xpu::d2h);
   queue.copy(hfc.nHitsPerModule, xpu::d2h);
+  queue.wait();
 
+  auto hits = FlattenHits(queue);
   queue.copy(hfc.monitor, xpu::d2h);
 
   queue.wait();
 
   xpu::h_view monitor(hfc.monitor);
-  xpu::h_view nHits(hfc.nHitsPerModule);
 
   // Note: Checking for cluster bucket overflow is probably paranoid
   // as we allocate enough memory for one cluster per digi.
@@ -176,6 +177,7 @@ sts::HitfinderMonitor sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi
     }
   }
 
+  xpu::h_view nHits(hfc.nHitsPerModule);
   if (monitor[0].fNumHitBucketOverflow > 0) {
     L_(error) << "STS Hitfinder Chain: Hit bucket overflow! " << monitor[0].fNumHitBucketOverflow
               << " hits were discarded!";
@@ -190,12 +192,7 @@ sts::HitfinderMonitor sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi
     }
   }
 
-  size_t nHitsTotal     = std::accumulate(nHits.begin(), nHits.end(), 0);
-  size_t nClustersTotal = std::accumulate(nClusters.begin(), nClusters.end(), 0);
-
-  xpu::k_add_bytes<SortClusters>(nClustersTotal * sizeof(GpuClusterIdx));
-  xpu::k_add_bytes<FindHits>(nClustersTotal * sizeof(sts::Cluster));
-
+  size_t nHitsTotal = std::accumulate(nHits.begin(), nHits.end(), 0);
   L_(info) << "Timeslice contains " << nHitsTotal << " STS hits and " << nClustersTotal << " STS clusters";
 
   auto timings = xpu::pop_timer();
@@ -203,7 +200,11 @@ sts::HitfinderMonitor sts::HitfinderChain::operator()(gsl::span<const CbmStsDigi
   monitor[0].fTimeTotal       = timings.wall();
   monitor[0].fNumClusterTotal = nClustersTotal;
   monitor[0].fNumHitsTotal    = nHitsTotal;
-  return monitor[0];
+
+  Result result;
+  result.hits    = std::move(hits);
+  result.monitor = monitor[0];
+  return result;
 }
 
 void sts::HitfinderChain::EnsureParameters()
@@ -402,6 +403,73 @@ void sts::HitfinderChain::FlattenDigisSide(DigiMapSide& digis, bool isFront)
   }
 }
 
+PartitionedPODVector<sts::Hit> sts::HitfinderChain::FlattenHits(xpu::queue queue)
+{
+  auto& hfc = fHitfinder;
+  xpu::h_view nHits(hfc.nHitsPerModule);
+
+  size_t nHitsTotal = 0;
+  for (size_t m = 0; m < hfc.nModules; m++)
+    nHitsTotal += GetNHits(nHits, m);
+  L_(info) << "STS Hitfinder Chain: Flattening " << nHitsTotal << " hits";
+  PODVector<sts::Hit> hits(nHitsTotal);
+
+  // Transfer Hits / Cluster back to host
+  // Hits and Clusters are stored in buckets. Buckets aren't necessarily full.
+  // So we have to copy each bucket individually.
+  // Alternative: flatten both arrays on the GPU and transfer flat array back.
+  // Latter would require a lot of memory but give us parallel copying when running on the host.
+  //
+  // Also at this point, values in nHits could be wrong if the bucket overflowed.
+  // So take extra care to not access nHits directly.
+  if (xpu::device::active().backend() == xpu::cpu) {
+    // Hits already in host memory
+    // Flatten in parallel
+    // Note: this is still pretty slow for mCBM because there are only 9 modules...
+    xpu::scoped_timer t_("Flatten Hits");
+    xpu::t_add_bytes(nHitsTotal * sizeof(sts::Hit));
+    CBM_PARALLEL_FOR(schedule(dynamic))
+    for (size_t m = 0; m < hfc.nModules; m++) {
+      size_t offset = 0;
+      for (size_t i = 0; i < m; i++) {
+        offset += GetNHits(nHits, i);
+      }
+      std::copy_n(hfc.hitsPerModule.get() + hfc.maxHitsPerModule * m, GetNHits(nHits, m), hits.begin() + offset);
+    }
+  }
+  else {  // GPU
+    // Initialize host memory:
+    // memset(hits.data(), 0, nHitsTotal * sizeof(sts::Hit));
+
+    // Hits not in host memory
+    // FIXME this is very slow (1.7 GBs vs 16 GBs we should get from PCIe 3.0 x16)
+    // I assume because of page faults as the allocated memory is not touched before
+    // But even with a memset on host memory before, throughput only goes up to 3.7 GBs
+    size_t nHitsCopied = 0;
+    for (size_t m = 0; m < hfc.nModules; m++) {
+      size_t numHitsInModule = GetNHits(nHits, m);
+      queue.copy(hfc.hitsPerModule.get() + hfc.maxHitsPerModule * m, hits.data() + nHitsCopied,
+                 numHitsInModule * sizeof(sts::Hit));
+      nHitsCopied += numHitsInModule;
+    }
+  }
+
+  std::vector<size_t> nHitsPerModule;
+  for (size_t m = 0; m < hfc.nModules; m++)
+    nHitsPerModule.push_back(GetNHits(nHits, m));
+  std::vector<u32> addresses;
+  for (auto& m : fPars->modules)
+    addresses.push_back(m.address);
+  PartitionedPODVector<sts::Hit> partitionedHits(std::move(hits), nHitsPerModule, addresses);
+
+  return partitionedHits;
+}
+
+size_t sts::HitfinderChain::GetNHits(xpu::h_view<int> nHitsPerModule, int module)
+{
+  return std::min<size_t>(nHitsPerModule[module], fHitfinder.maxHitsPerModule);
+}
+
 void sts::HitfinderChain::AppendClustersModule(int module, bool isFront, std::vector<sts::Cluster>& clusters)
 {
   EnsureParameters();
diff --git a/algo/detectors/sts/HitfinderChain.h b/algo/detectors/sts/HitfinderChain.h
index dcd0265e9a..008eeddcbf 100644
--- a/algo/detectors/sts/HitfinderChain.h
+++ b/algo/detectors/sts/HitfinderChain.h
@@ -14,6 +14,7 @@
 #include <optional>
 #include <vector>
 
+#include "PartitionedVector.h"
 #include "SubChain.h"
 #include "gpu/xpu_legacy.h"
 #include "sts/Hitfinder.h"
@@ -40,6 +41,11 @@ namespace cbm::algo::sts
   class HitfinderChain : public SubChain {
 
   public:
+    struct Result {
+      PartitionedPODVector<sts::Hit> hits;
+      sts::HitfinderMonitor monitor;
+    };
+
     void SetParameters(const sts::HitfinderPars& parameters);
     const sts::HitfinderPars& GetParameters() const { return *fPars; }
 
@@ -48,7 +54,7 @@ namespace cbm::algo::sts
      */
     void Finalize();
 
-    sts::HitfinderMonitor operator()(gsl::span<const CbmStsDigi>);
+    Result operator()(gsl::span<const CbmStsDigi>);
 
   private:
     // Shorthands to map module-address onto digis in that module
@@ -86,6 +92,18 @@ namespace cbm::algo::sts
     void FlattenDigis(DigiMap& digiMap);
     void FlattenDigisSide(DigiMapSide& digis, bool isFront);
 
+    /**
+     * Copy Hits back to host into a flat array.
+    */
+    PartitionedPODVector<sts::Hit> FlattenHits(xpu::queue);
+
+    /**
+    * @brief Returns the number of hits of a module.
+    *
+    * @note: Wrapper method required as buckets might overflow. This corrects for that.
+   */
+    size_t GetNHits(xpu::h_view<int> nHitsPerModule, int module);
+
     /**
        * Transfer Hits / Clusters back to host and convert to common CBM types.
        */
diff --git a/algo/global/Reco.cxx b/algo/global/Reco.cxx
index 97cacad54a..75ae721b05 100644
--- a/algo/global/Reco.cxx
+++ b/algo/global/Reco.cxx
@@ -27,6 +27,12 @@ void Reco::Validate(const Options& opts)
   if (!fs::exists(opts.ParamsDir())) {
     throw std::runtime_error("ParamsDir does not exist: " + opts.ParamsDir().string());
   }
+
+  if (opts.HasOutput(RecoData::Digi)) { throw std::runtime_error("Storing raw digis via 'Digi' option not supported"); }
+
+  if (opts.HasOutput(RecoData::Cluster)) {
+    throw std::runtime_error("Storing clusters via 'Cluster' option not supported");
+  }
 }
 
 void Reco::Init(const Options& opts)
@@ -121,8 +127,15 @@ RecoResults Reco::Run(const fles::Timeslice& ts)
   }
 
   sts::HitfinderMonitor stsHitfinderMonitor;
-  if (Opts().HasStep(Step::LocalReco) && Opts().HasDetector(fles::Subsystem::STS))
-    stsHitfinderMonitor = fStsHitFinder(unpackResult.first.fSts);
+  PartitionedPODVector<sts::Hit> stsHits;
+  if (Opts().HasStep(Step::LocalReco) && Opts().HasDetector(fles::Subsystem::STS)) {
+    auto stsResults     = fStsHitFinder(unpackResult.first.fSts);
+    stsHits             = std::move(stsResults.hits);
+    stsHitfinderMonitor = stsResults.monitor;
+  }
+
+  // --- Tracking
+  std::vector<ca::Track> tracks = fTracking.Run();
 
   xpu::timings ts_times = xpu::pop_timer();
 
@@ -130,14 +143,11 @@ RecoResults Reco::Run(const fles::Timeslice& ts)
   QueueStsRecoMetrics(stsHitfinderMonitor);
   QueueEvbuildMetrics(evbuildMonitor);
 
-
   PrintTimings(ts_times);
 
   RecoResults results;
-  results.events = std::move(events);
-
-  // --- Tracking
-  std::vector<ca::Track> tracks = fTracking.Run();
+  if (Opts().HasOutput(RecoData::DigiEvent)) results.events = std::move(events);
+  if (Opts().HasOutput(RecoData::Hit)) results.stsHits = std::move(stsHits);
 
   return results;
 }
diff --git a/algo/global/Reco.h b/algo/global/Reco.h
index 17e1e09d77..7ce0bb2a73 100644
--- a/algo/global/Reco.h
+++ b/algo/global/Reco.h
@@ -23,6 +23,7 @@ namespace cbm::algo
 
   struct RecoResults {
     std::vector<DigiEvent> events;
+    PartitionedPODVector<sts::Hit> stsHits;
   };
 
   class Reco : SubChain {
diff --git a/algo/global/StorableRecoResults.h b/algo/global/StorableRecoResults.h
index 0f0369d542..c29d434c8d 100644
--- a/algo/global/StorableRecoResults.h
+++ b/algo/global/StorableRecoResults.h
@@ -37,11 +37,15 @@ namespace cbm::algo
     std::vector<CbmDigiEvent>& DigiEvents() { return fDigiEvents; }
     const std::vector<CbmDigiEvent>& DigiEvents() const { return fDigiEvents; }
 
+    PartitionedVector<sts::Hit>& StsHits() { return fStsHits; }
+    const PartitionedVector<sts::Hit>& StsHits() const { return fStsHits; }
+
   private:
     uint64_t fTsIndex     = UINT64_MAX;
     uint64_t fTsStartTime = UINT64_MAX;
 
     std::vector<CbmDigiEvent> fDigiEvents;
+    PartitionedVector<sts::Hit> fStsHits;
 
     friend class boost::serialization::access;
 
@@ -52,6 +56,7 @@ namespace cbm::algo
       ar& fTsStartTime;
 
       ar& fDigiEvents;
+      ar& fStsHits;
     }
   };
 
diff --git a/algo/test/CMakeLists.txt b/algo/test/CMakeLists.txt
index 11caa16616..6eeb86a8a1 100644
--- a/algo/test/CMakeLists.txt
+++ b/algo/test/CMakeLists.txt
@@ -51,3 +51,9 @@ Set(YamlConfigSources
 )
 CreateGTestExeAndAddTest(_GTestYamlConfig "${INCLUDE_DIRECTORIES}" "${LINK_DIRECTORIES}"
                          "${YamlConfigSources}" "${PUB_DEPS}" "${PVT_DEPS}" "${INT_DEPS}" "")
+
+Set(PartitionedVectorSources
+  _GTestPartitionedVector.cxx
+)
+CreateGTestExeAndAddTest(_GTestPartitionedVector "${INCLUDE_DIRECTORIES}" "${LINK_DIRECTORIES}"
+                         "${PartitionedVectorSources}" "${PUB_DEPS}" "${PVT_DEPS}" "${INT_DEPS}" "")
diff --git a/algo/test/_GTestPartitionedVector.cxx b/algo/test/_GTestPartitionedVector.cxx
new file mode 100644
index 0000000000..41dca1e213
--- /dev/null
+++ b/algo/test/_GTestPartitionedVector.cxx
@@ -0,0 +1,98 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+
+#include "PODVector.h"
+#include "PartitionedVector.h"
+#include "gtest/gtest.h"
+
+using namespace cbm::algo;
+
+template<typename T>
+void EXPECT_CONTAINER_EQ(gsl::span<T> a, std::vector<T> b)
+{
+  EXPECT_EQ(a.size(), b.size());
+  for (size_t i = 0; i < a.size(); ++i) {
+    EXPECT_EQ(a[i], b[i]);
+  }
+}
+
+TEST(PartitionedVector, CanCreateEmpty)
+{
+  PartitionedVector<i32> vec;
+  EXPECT_EQ(vec.NElements(), 0);
+  EXPECT_EQ(vec.NPartitions(), 0);
+}
+
+TEST(PartitionedVector, CanCreateWithPartitions)
+{
+  std::vector<i32> data      = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  std::vector<size_t> sizes  = {2, 4, 3};
+  std::vector<u32> addresses = {0x0, 0x100, 0x200};
+
+  PartitionedVector vec(std::move(data), sizes, addresses);
+
+  EXPECT_EQ(vec.NElements(), 9);
+  EXPECT_EQ(vec.NPartitions(), 3);
+
+  EXPECT_EQ(vec.Size(0), 2);
+  EXPECT_EQ(vec[0].size(), 2);
+  EXPECT_EQ(vec.Address(0), 0x0);
+  EXPECT_CONTAINER_EQ(vec[0], {1, 2});
+
+  auto part = vec.Partition(0);
+  EXPECT_EQ(part.first.size(), vec.Size(0));
+  EXPECT_EQ(part.second, vec.Address(0));
+
+  EXPECT_EQ(vec.Size(1), 4);
+  EXPECT_EQ(vec[1].size(), 4);
+  EXPECT_EQ(vec.Address(1), 0x100);
+  EXPECT_CONTAINER_EQ(vec[1], {3, 4, 5, 6});
+
+  part = vec.Partition(1);
+  EXPECT_EQ(part.first.size(), vec.Size(1));
+  EXPECT_EQ(part.second, vec.Address(1));
+
+  EXPECT_EQ(vec.Size(2), 3);
+  EXPECT_EQ(vec[2].size(), 3);
+  EXPECT_EQ(vec.Address(2), 0x200);
+  EXPECT_CONTAINER_EQ(vec[2], {7, 8, 9});
+
+  part = vec.Partition(2);
+  EXPECT_EQ(part.first.size(), vec.Size(2));
+  EXPECT_EQ(part.second, vec.Address(2));
+}
+
+TEST(PartitionedVector, ThrowsOnAddressSizesMismatch)
+{
+  std::vector<i32> data      = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  std::vector<size_t> sizes  = {2, 4, 3};
+  std::vector<u32> addresses = {0x0, 0x100};
+
+  EXPECT_THROW(PartitionedVector vec(std::move(data), sizes, addresses), std::runtime_error);
+}
+
+TEST(PartitionedVector, ThrowsOnSizesMismatch)
+{
+  std::vector<i32> data      = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  std::vector<size_t> sizes  = {2, 3, 3};
+  std::vector<u32> addresses = {0x0, 0x100, 0x200};
+
+  EXPECT_THROW(PartitionedVector vec(std::move(data), sizes, addresses), std::runtime_error);
+}
+
+TEST(PartitionedVector, ThrowsOnOutOfBounds)
+{
+  std::vector<i32> data      = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  std::vector<size_t> sizes  = {2, 4, 3};
+  std::vector<u32> addresses = {0x0, 0x100, 0x200};
+
+  PartitionedVector vec(std::move(data), sizes, addresses);
+
+  EXPECT_THROW(vec[3], std::out_of_range);
+  EXPECT_THROW(vec.Partition(3), std::out_of_range);
+  EXPECT_THROW(vec.Address(3), std::out_of_range);
+  EXPECT_THROW(vec.Size(3), std::out_of_range);
+}
+
+TEST(PartitionedVector, IsCopyConstructable) {}
diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt
index 5935eef5c8..eea9de219d 100644
--- a/external/CMakeLists.txt
+++ b/external/CMakeLists.txt
@@ -52,7 +52,7 @@ if(DOWNLOAD_EXTERNALS)
   if (NOT ${CBM_XPU_DEV})
     download_project_if_needed(PROJECT           xpu
                               GIT_REPOSITORY    "https://github.com/fweig/xpu.git"
-                              GIT_TAG           "593374ca55880e3d10e1688f9af3a778b409afd8" # v0.9.4
+                              GIT_TAG           "bef33a10894cbd56ea1f9880bf49cb43ae87a407" # 23-10-12
                               SOURCE_DIR        ${CMAKE_CURRENT_SOURCE_DIR}/xpu
                               CONFIGURE_COMMAND ""
                               BUILD_COMMAND     ""
diff --git a/reco/app/cbmreco/main.cxx b/reco/app/cbmreco/main.cxx
index 883679877f..f0f69c7e07 100644
--- a/reco/app/cbmreco/main.cxx
+++ b/reco/app/cbmreco/main.cxx
@@ -22,10 +22,14 @@ using namespace cbm::algo;
 std::shared_ptr<StorableRecoResults> makeStorableRecoResults(const fles::Timeslice& ts, const RecoResults& results)
 {
   auto storable = std::make_shared<StorableRecoResults>(ts.index(), ts.start_time());
+
   storable->DigiEvents().reserve(results.events.size());
   for (const auto& digiEvent : results.events) {
     storable->DigiEvents().emplace_back(digiEvent.ToStorable());
   }
+
+  storable->StsHits() = results.stsHits;
+
   return storable;
 }
 
@@ -33,6 +37,7 @@ bool dumpArchive(const Options& opts)
 {
   // Limit the number of events per timeslice to dump to avoid spamming the log
   constexpr size_t DumpEventsPerTS = 10;
+  constexpr size_t DumpHitsPerSensor = 2;
 
   if (!opts.DumpArchive()) return false;
 
@@ -53,14 +58,29 @@ bool dumpArchive(const Options& opts)
     }
 
     size_t nEvents = recoResults->DigiEvents().size();
-    L_(info) << "TS " << recoResults->TsIndex() << " start: " << recoResults->TsStartTime() << " events: " << nEvents;
+    L_(info) << "TS " << recoResults->TsIndex() << " start: " << recoResults->TsStartTime() << " events: " << nEvents
+             << ", stsHits: " << recoResults->StsHits().NElements();
 
     for (size_t i = 0; i < std::min(DumpEventsPerTS, nEvents); i++) {
       const auto& digiEvent = recoResults->DigiEvents().at(i);
       L_(info) << " - Event " << i << " number: " << digiEvent.fNumber << "; time: " << digiEvent.fTime
-               << "; nStsDigis: " << digiEvent.fData.Size(ECbmModuleId::kSts);
+               << "; nStsDigis: " << digiEvent.fData.Size(ECbmModuleId::kSts)
+               << "; nTofDigis: " << digiEvent.fData.Size(ECbmModuleId::kTof);
+    }
+
+    if (nEvents > DumpEventsPerTS) L_(info) << "...";
+
+    auto& stsHits = recoResults->StsHits();
+    for (size_t m = 0; m < stsHits.NPartitions(); m++) {
+      auto [hits, address] = stsHits.Partition(m);
+      for (size_t i = 0; i < std::min(DumpHitsPerSensor, hits.size()); i++) {
+        const auto& hit = hits[i];
+        L_(info) << " - STS Hit " << i << " sensor: " << address << "; time: " << hit.fTime << ", X: " << hit.fX
+                 << ", Y: " << hit.fY << ", Z: " << hit.fZ;
+      }
     }
 
+
     if (nEvents > DumpEventsPerTS) L_(info) << "...";
   }
 
-- 
GitLab