diff --git a/algo/detectors/trd/Hitfind.cxx b/algo/detectors/trd/Hitfind.cxx
index 36670ae31f267c276c053064da5db0af091a1b81..0320962a7a179155ca1ee0234eaa0609a2faec14 100644
--- a/algo/detectors/trd/Hitfind.cxx
+++ b/algo/detectors/trd/Hitfind.cxx
@@ -17,7 +17,14 @@ using fles::Subsystem;
 // by row index. Hit merging is thereby only done between pairs of neighboring rows,
 // which can miss some large digi clusters. Enable flag below to instead parallelize
 // the last step (hit merging) by module index.
-#define MERGE_BY_MODULE
+//#define MERGE_BY_MODULE
+
+// If hit merging by module index is enabled, enable the flag below to include an
+// additional row-wise merge step before the final module-wise merge step is applied.
+// This already catches most of the important merges and hence speeds up the process.
+// In addition, doing two sweeps through the buffers catches some multi-hit merges that are
+// missed even by the module-wise method.
+//#define PREPROCESS_BY_ROW
 
 namespace cbm::algo::trd
 {
@@ -369,10 +376,9 @@ namespace cbm::algo::trd
     xpu::push_timer("BuildClusters");
     xpu::t_add_bytes(digiIn.size_bytes());
 
-    // Cluster building
+    // Cluster building and hit finding
     CBM_PARALLEL_FOR(schedule(dynamic))
     for (size_t row = 0; row < fRowList.size(); row++) {
-
       const int address     = std::get<0>(fRowList[row]);
       const bool is2D       = std::get<1>(fRowList[row]);
       const size_t rowInMod = std::get<2>(fRowList[row]);
@@ -386,9 +392,58 @@ namespace cbm::algo::trd
         hitBuffer[address][rowInMod] = (*fHitFind[address])(&clusters);
       }
     }
+
+#ifdef PREPROCESS_BY_ROW
+    // Row merging, sweep 1: pairs (0,1), (2,3), ... of fRowList; pairs spanning two modules are skipped
+    CBM_PARALLEL_FOR(schedule(dynamic))
+    for (size_t row = 0; row < fRowList.size() / 2; row++) {
+      const size_t row1      = 2 * row;
+      const size_t row2      = 2 * row + 1;
+      const int address      = std::get<0>(fRowList[row1]);
+      const bool is2D        = std::get<1>(fRowList[row1]);
+      const size_t rowInMod1 = std::get<2>(fRowList[row1]);
+      const size_t rowInMod2 = std::get<2>(fRowList[row2]);
+      auto& buffer           = hitBuffer[address];
+
+      if (row2 >= fRowList.size() || std::get<0>(fRowList[row2]) != address) {
+        continue;
+      }
+      if (is2D) {
+        std::tie(buffer[rowInMod1], buffer[rowInMod2]) = (*fHitMerge2d[address])(buffer[rowInMod1], buffer[rowInMod2]);
+      }
+      else {
+        std::tie(buffer[rowInMod1], buffer[rowInMod2]) = (*fHitMerge[address])(buffer[rowInMod1], buffer[rowInMod2]);
+      }
+    }
+
+    // Row merging, sweep 2: pairs (1,2), (3,4), ... of fRowList; pairs spanning two modules are skipped
+    CBM_PARALLEL_FOR(schedule(dynamic))
+    for (size_t row = 0; row < fRowList.size() / 2; row++) {
+      const size_t row1 = 2 * row + 1;
+      const size_t row2 = 2 * row + 2;
+      if (row2 >= fRowList.size()) {
+        continue;
+      }
+      const int address = std::get<0>(fRowList[row1]);
+      const bool is2D   = std::get<1>(fRowList[row1]);
+      if (std::get<0>(fRowList[row2]) != address) {
+        continue;
+      }
+      const size_t rowInMod1 = std::get<2>(fRowList[row1]);
+      const size_t rowInMod2 = std::get<2>(fRowList[row2]);
+      auto& buffer           = hitBuffer[address];
+      if (is2D) {
+        std::tie(buffer[rowInMod1], buffer[rowInMod2]) = (*fHitMerge2d[address])(buffer[rowInMod1], buffer[rowInMod2]);
+      }
+      else {
+        std::tie(buffer[rowInMod1], buffer[rowInMod2]) = (*fHitMerge[address])(buffer[rowInMod1], buffer[rowInMod2]);
+      }
+    }
+#endif
+
     monitor.timeClusterize = xpu::pop_timer();
 
-    // Hit finding
+    // Result storage
     PODVector<Hit> hitsFlat;       // hit storage
     PODVector<size_t> modSizes;    // nHits per modules
     PODVector<uint> modAddresses;  // address of modules
@@ -399,6 +454,9 @@ namespace cbm::algo::trd
     std::vector<size_t> addrPrefix;
 
     xpu::push_timer("FindHits");
+
+    // Combine row buffers into module buffers.
+    // Then run a final module-wise row-merging iteration and arrange results.
     CBM_PARALLEL()
     {
       const int ithread  = openmp::GetThreadNum();
@@ -439,6 +497,8 @@ namespace cbm::algo::trd
         else {
           mod_hitdata = (*fHitMerge[address])(hitData, dummy).first;
         }
+
+        // Remove digi data from hits
         std::vector<Hit> mod_hits;
         std::transform(mod_hitdata.begin(), mod_hitdata.end(), std::back_inserter(mod_hits),
                        [](const auto& p) { return p.first; });