diff --git a/algo/detectors/trd/Hitfind.cxx b/algo/detectors/trd/Hitfind.cxx
index f1328a68ec4b64a6ffd4d232078dfc9eecc132c9..3a6dfc010141951a3a73665a49733d923f9a03b2 100644
--- a/algo/detectors/trd/Hitfind.cxx
+++ b/algo/detectors/trd/Hitfind.cxx
@@ -16,7 +16,7 @@ using fles::Subsystem;
 namespace cbm::algo::trd
 {
   // -----   Constructor   ------------------------------------------------------
-  Hitfind::Hitfind(trd::HitfindSetup setup, trd::Hitfind2DSetup setup2D)  // : fStorDigi(fNbSm.size())
+  Hitfind::Hitfind(trd::HitfindSetup setup, trd::Hitfind2DSetup setup2D)
   {
 
     // Create one algorithm per module for TRD and configure it with parameters
@@ -53,6 +53,7 @@ namespace cbm::algo::trd
       }
       fHitFind[module.address]      = std::make_unique<cbm::algo::trd::HitFinder>(params);
       fClusterBuild[module.address] = std::make_unique<cbm::algo::trd::Clusterizer>(params);
+      fModList.emplace_back(module.address, false);
     }
 
     // Create one algorithm per module for TRD2D and configure it with parameters
@@ -88,6 +89,7 @@ namespace cbm::algo::trd
       }
       fHitFind2d[module.address]      = std::make_unique<cbm::algo::trd::HitFinder2D>(params);
       fClusterBuild2d[module.address] = std::make_unique<cbm::algo::trd::Clusterizer2D>(params);
+      fModList.emplace_back(module.address, true);
     }
 
     L_(info) << "--- Configured hitfinder algorithms for TRD.";
@@ -100,37 +102,84 @@ namespace cbm::algo::trd
   Hitfind::resultType Hitfind::operator()(gsl::span<CbmTrdDigi> digiIn)
   {
     // --- Output data
-    resultType result         = {};
-    std::vector<Hit>& hitsOut = result.first;
+    resultType result = {};
+    auto& hitsOut     = std::get<0>(result);
+    auto& monitor     = std::get<1>(result);
 
     // Intermediate storage variables (digi, index) per module
     std::unordered_map<int, std::vector<std::pair<CbmTrdDigi, int32_t>>> digiBuffer;  //[modAddress]
 
-    const size_t maxdigis = 100000000001;
-
-    // Sort digis by module address
+    // Loop over the digis array and store the digis in separate vectors for
+    // each modules
+    xpu::push_timer("TrdHitfindModuleSort");
     for (size_t idigi = 0; idigi < digiIn.size(); idigi++) {
       const CbmTrdDigi* digi = &digiIn[idigi];
       digiBuffer[digi->GetAddressModule()].emplace_back(*digi, idigi);
-      if (idigi == maxdigis) {
-        break;
-      }
     }
+    monitor.fSortTime = xpu::pop_timer();
 
-    // Process 1D digis
-    for (auto imod = fHitFind.begin(); imod != fHitFind.end(); imod++) {
-      auto clusters = (*fClusterBuild[imod->first])(digiBuffer[imod->first]);
-      auto hits     = (*fHitFind[imod->first])(&clusters);
-      hitsOut.insert(hitsOut.end(), hits.begin(), hits.end());
-    }
+    PODVector<Hit> hitsFlat;                          // hit storage
+    std::vector<size_t> modSizes(fModList.size());    // nHits per modules
+    std::vector<uint> modAddresses(fModList.size());  // address of modules
+
+    // Prefix array for parallelization
+    std::vector<size_t> hitsPrefix;
+
+    xpu::push_timer("TrdHitfind");
+    xpu::t_add_bytes(digiIn.size_bytes());
+
+    CBM_PARALLEL()
+    {
+      const int ithread  = openmp::GetThreadNum();
+      const int nthreads = openmp::GetNumThreads();
+
+      CBM_OMP(single) { hitsPrefix.resize(nthreads + 1); }
+
+      std::vector<Hit> local_hits;
+
+      CBM_OMP(for schedule(dynamic) nowait)
+      for (size_t mod = 0; mod < fModList.size(); mod++) {
+        const int address = fModList[mod].first;
+        const bool is2D   = fModList[mod].second;
 
-    // Process 2D digis
-    for (auto imod = fHitFind2d.begin(); imod != fHitFind2d.end(); imod++) {
-      auto clusters = (*fClusterBuild2d[imod->first])(digiBuffer[imod->first], 0.);  // Number is TS start time (T0)
-      auto hits     = (*fHitFind2d[imod->first])(&clusters);
-      hitsOut.insert(hitsOut.end(), hits.begin(), hits.end());
+        std::vector<Hit> mod_hits;
+        if (is2D) {                                                              // 2D module
+          auto clusters = (*fClusterBuild2d[address])(digiBuffer[address], 0.);  // Number is TS start time (T0)
+          mod_hits      = (*fHitFind2d[address])(&clusters);
+        }
+        else {  // 1D module
+          auto clusters = (*fClusterBuild[address])(digiBuffer[address]);
+          mod_hits      = (*fHitFind[address])(&clusters);
+        }
+        // Append clusters to output
+        local_hits.insert(local_hits.end(), std::make_move_iterator(mod_hits.begin()),
+                          std::make_move_iterator(mod_hits.end()));
+
+        // store partition size
+        modSizes[mod] = mod_hits.size();
+
+        // store hw address of partition
+        modAddresses[mod] = address;
+      }
+
+      hitsPrefix[ithread + 1] = local_hits.size();
+      CBM_OMP(barrier)
+      CBM_OMP(single)
+      {
+        for (int i = 1; i < (nthreads + 1); i++) {
+          hitsPrefix[i] += hitsPrefix[i - 1];
+        }
+        hitsFlat.resize(hitsPrefix[nthreads]);
+      }
+      std::move(local_hits.begin(), local_hits.end(), hitsFlat.begin() + hitsPrefix[ithread]);
     }
+    // Monitoring
+    monitor.fTime     = xpu::pop_timer();
+    monitor.fNumDigis = digiIn.size();
+    monitor.fNumHits  = hitsFlat.size();
 
+    // Create ouput vector
+    hitsOut = PartitionedVector(std::move(hitsFlat), modSizes, modAddresses);
     return result;
   }
   // ----------------------------------------------------------------------------
diff --git a/algo/detectors/trd/Hitfind.h b/algo/detectors/trd/Hitfind.h
index c3fcfa2745922e293bdf4f702ef8809e167b304a..90ea01cc561c714d96758244f790b00cefaf6d5f 100644
--- a/algo/detectors/trd/Hitfind.h
+++ b/algo/detectors/trd/Hitfind.h
@@ -54,9 +54,7 @@ namespace cbm::algo::trd
   class Hitfind {
 
    public:
-    //typedef std::tuple<PartitionedVector<Hit>, HitfindMonitorData, PODVector<i32>> resultType;
-    typedef std::pair<std::vector<Hit>, HitfindMonitorData> resultType;
-
+    typedef std::tuple<PartitionedVector<Hit>, HitfindMonitorData> resultType;
 
     /** @brief Algorithm execution
      ** @param fles timeslice to hitfind
@@ -78,7 +76,7 @@ namespace cbm::algo::trd
     std::unordered_map<int, std::unique_ptr<cbm::algo::trd::HitFinder2D>> fHitFind2d;
     std::unordered_map<int, std::unique_ptr<cbm::algo::trd::HitFinder>> fHitFind;
 
-    /** @brief Intermediate storage variables (digi, index) **/
-    std::vector<std::vector<std::vector<std::pair<CbmTrdDigi, int32_t>>>> fStorDigi;  //[nbType][nbSm*nbRpc][nDigis]
+    /** @brief List of modules (address and type flag; true = 2D) */
+    std::vector<std::pair<int, bool>> fModList;
   };
 }  // namespace cbm::algo::trd
diff --git a/reco/tasks/CbmTaskTrdHitFinder.cxx b/reco/tasks/CbmTaskTrdHitFinder.cxx
index 9e505e9e3b941d8d617bd804a7470ea06dd3b0a0..cefdaa6c6732df1b7779421e310aa91fc07a4a9e 100644
--- a/reco/tasks/CbmTaskTrdHitFinder.cxx
+++ b/reco/tasks/CbmTaskTrdHitFinder.cxx
@@ -91,7 +91,7 @@ void CbmTaskTrdHitFinder::Exec(Option_t* /*option*/)
   }
 
   auto [hits, monitor] = (*fAlgo)(digiVec);
-  AddHits(&hits);
+  AddHits(hits.Data());
 
   timerTs.Stop();
   fProcessTime += timerTs.RealTime();
@@ -105,9 +105,9 @@ void CbmTaskTrdHitFinder::Exec(Option_t* /*option*/)
 
 
 //_____________________________________________________________________
-void CbmTaskTrdHitFinder::AddHits(std::vector<cbm::algo::trd::Hit>* hits)
+void CbmTaskTrdHitFinder::AddHits(gsl::span<cbm::algo::trd::Hit> hits)
 {
-  for (auto& hit : *hits) {
+  for (auto& hit : hits) {
     TVector3 pos(hit.GetX(), hit.GetY(), hit.GetZ());
     TVector3 dpos(hit.GetDx(), hit.GetDy(), hit.GetDz());
 
diff --git a/reco/tasks/CbmTaskTrdHitFinder.h b/reco/tasks/CbmTaskTrdHitFinder.h
index 15ca0995f42cda8f5becbf0f7b4490078e3fa2a6..204c5189df8eeaf857c2e40e41d9e39650ef6504 100644
--- a/reco/tasks/CbmTaskTrdHitFinder.h
+++ b/reco/tasks/CbmTaskTrdHitFinder.h
@@ -12,6 +12,7 @@
 #include "trd/Hit.h"
 #include "trd/Hitfind.h"
 
+#include <gsl/span>
 #include <map>
 #include <set>
 #include <vector>
@@ -56,7 +57,7 @@ class CbmTaskTrdHitFinder : public FairTask {
 
   template<class TCluster>
   void AddClusters(std::vector<TCluster>* clusters);
-  void AddHits(std::vector<cbm::algo::trd::Hit>* hits);
+  void AddHits(gsl::span<cbm::algo::trd::Hit> hits);
 
   /** @brief Create one algo object for each RPC **/
   bool InitAlgos();