diff --git a/reco/detectors/sts/CbmRecoSts.cxx b/reco/detectors/sts/CbmRecoSts.cxx
index 4b716c4660961b461bf17ba5b6533143f0f2aa45..2a7887203183f225ae79a67d65af3f4dfdc21341 100644
--- a/reco/detectors/sts/CbmRecoSts.cxx
+++ b/reco/detectors/sts/CbmRecoSts.cxx
@@ -286,7 +286,7 @@ void CbmRecoSts::Finish()
   Double_t clusterHit  = Double_t(fNofClusters) / Double_t(fNofHits);
   LOG(info) << "=====================================";
   LOG(info) << GetName() << ": Run summary";
-  if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction.";
+  if (fUseGpuReco) LOG(info) << "Ran new GPU STS reconstruction. (Device " << xpu::device_properties().name << ")";
   else if (ompThreads < 0)
     LOG(info) << "STS reconstruction ran single threaded (No OpenMP).";
   else
@@ -302,7 +302,6 @@ void CbmRecoSts::Finish()
   LOG(info) << "Clusters per hit       : " << fixed << setprecision(2) << clusterHit;
   LOG(info) << "Time per TSlice        : " << fixed << setprecision(2) << 1000. * fTimeRun / Double_t(fNofTs) << " ms ";
 
-
   // Aggregate times for substeps of reconstruction
   // Note: These times are meaningless when reconstruction runs with > 1 thread.
   CbmStsRecoModule::Timings timingsTotal;
@@ -323,6 +322,8 @@ void CbmRecoSts::Finish()
   fTime3 /= nEvent;
   fTime4 /= nEvent;
 
+  auto throughput = [](auto bytes, auto timeMs) { return bytes * 1000. / timeMs / double(1ull << 30); };  // bytes processed in timeMs [ms] -> GiB/s (divides by 2^30)
+
   if (not fUseGpuReco) {
     LOG(info) << "NofEvents        : " << fNofEvents;
     LOG(info) << "Time Reset       : " << fixed << setprecision(1) << setw(6) << 1000. * fTime1 << " ms ("
@@ -332,24 +333,35 @@ void CbmRecoSts::Finish()
     LOG(info) << "Time Reconstruct: " << fixed << setprecision(1) << setw(6) << 1000. * fTime3 << " ms ("
               << setprecision(1) << setw(4) << 100. * fTime3 / fTimeTot << " %)";
     LOG(info) << "Time by step:\n"
-              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortDigi
-              << " ms\n"
-              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeCluster
-              << " ms\n"
-              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeSortCluster
-              << " ms\n"
-              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << 1000. * timingsTotal.timeHits << " ms\n";
+              << "  Sort Digi   : " << fixed << setprecision(2) << setw(6) << 1000. * fTimeSortDigis << " ms ("
+              << throughput(fNofDigis * 8, 1000. * fTimeSortDigis) << " GB/s)\n"
+              << "  Find Cluster: " << fixed << setprecision(2) << setw(6) << 1000. * fTimeFindClusters << " ms ("
+              << throughput(fNofDigis * sizeof(CbmStsDigi), 1000. * fTimeFindClusters) << " GB/s)\n"
+              << "  Sort Cluster: " << fixed << setprecision(2) << setw(6) << 1000. * fTimeSortClusters << " ms ("
+              << throughput(fNofClusters * sizeof(CbmStsCluster), 1000. * fTimeSortClusters) << " GB/s)\n"
+              << "  Find Hits   : " << fixed << setprecision(2) << setw(6) << 1000. * fTimeFindHits << " ms ("
+              << throughput(fNofClusters * sizeof(CbmStsCluster), 1000. * fTimeFindHits) << " GB/s)";
   }
   else {
     cbm::algo::StsHitfinderTimes times = fGpuReco.GetHitfinderTimes();
 
     double gpuHitfinderTimeTotal = times.timeSortDigi + times.timeCluster + times.timeSortCluster + times.timeHits;
-    LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(1) << setw(6) << gpuHitfinderTimeTotal << " ms";
+    // Per-step throughput: input size in bytes over the step time; times.* are passed as milliseconds.
+    double sortDigiThroughput    = throughput(fNofDigis * sizeof(CbmStsDigi), times.timeSortDigi);
+    double findClusterThroughput = throughput(fNofDigis * sizeof(CbmStsDigi), times.timeCluster);
+    double sortClusterThroughput = throughput(fNofClusters * 8, times.timeSortCluster);  // NOTE(review): 8 bytes per cluster assumed - confirm
+    double findHitThroughput     = throughput(fNofClusters * 24, times.timeHits);  // NOTE(review): 24 bytes per cluster assumed - confirm
+
+    LOG(info) << "Time Reconstruct (GPU) : " << fixed << setprecision(2) << setw(6) << gpuHitfinderTimeTotal << " ms";
     LOG(info) << "Time by step:\n"
-              << "  Sort Digi   : " << fixed << setprecision(1) << setw(6) << times.timeSortDigi << " ms\n"
-              << "  Find Cluster: " << fixed << setprecision(1) << setw(6) << times.timeCluster << " ms\n"
-              << "  Sort Cluster: " << fixed << setprecision(1) << setw(6) << times.timeSortCluster << " ms\n"
-              << "  Find Hits   : " << fixed << setprecision(1) << setw(6) << times.timeHits << " ms";
+              << "  Sort Digi   : " << fixed << setprecision(2) << setw(6) << times.timeSortDigi << " ms ("
+              << sortDigiThroughput << " GB/s)\n"
+              << "  Find Cluster: " << fixed << setprecision(2) << setw(6) << times.timeCluster << " ms ("
+              << findClusterThroughput << " GB/s)\n"
+              << "  Sort Cluster: " << fixed << setprecision(2) << setw(6) << times.timeSortCluster << " ms ("
+              << sortClusterThroughput << " GB/s)\n"
+              << "  Find Hits   : " << fixed << setprecision(2) << setw(6) << times.timeHits << " ms ("
+              << findHitThroughput << " GB/s)";
   }
   LOG(info) << "=====================================";
 }
@@ -586,14 +598,54 @@ void CbmRecoSts::ProcessData(CbmEvent* event)
 
 
   // --- Execute reconstruction in the modules
+  // Run each step as its own loop over all modules. This allows us to measure the wall-clock time
+  // of each step even when the modules are processed in parallel.
+  TStopwatch timeSubstep;
   fTimer.Start();
+  timeSubstep.Start();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+  for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
+    assert(fModuleIndex[it]);
+    fModuleIndex[it]->SortDigis();
+  }
+  timeSubstep.Stop();
+  fTimeSortDigis += timeSubstep.RealTime();  // accumulate over all calls: Finish() relates these times to run-total counters
+
+  timeSubstep.Start();  // Start() resets the stopwatch by default, so RealTime() below covers this step only
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+  for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
+    assert(fModuleIndex[it]);
+    fModuleIndex[it]->FindClusters();
+  }
+  timeSubstep.Stop();
+  fTimeFindClusters += timeSubstep.RealTime();
+
+  timeSubstep.Start();
 #ifdef _OPENMP
 #pragma omp parallel for schedule(static)
 #endif
   for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
     assert(fModuleIndex[it]);
-    fModuleIndex[it]->Reconstruct();
+    fModuleIndex[it]->SortClusters();
   }
+  timeSubstep.Stop();
+  fTimeSortClusters += timeSubstep.RealTime();
+
+  timeSubstep.Start();
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static)
+#endif
+  for (UInt_t it = 0; it < fModuleIndex.size(); it++) {
+    assert(fModuleIndex[it]);
+    fModuleIndex[it]->FindHits();
+  }
+  timeSubstep.Stop();
+  fTimeFindHits += timeSubstep.RealTime();
+
   fTimer.Stop();
   Double_t time3 = fTimer.RealTime();  // Time for reconstruction
 
diff --git a/reco/detectors/sts/CbmRecoSts.h b/reco/detectors/sts/CbmRecoSts.h
index bc4e84e85df4418c605fa558b4e6e2b5ddc9177f..e7fdf84db221e76a70dbf993c18f5a2fb8c0090f 100644
--- a/reco/detectors/sts/CbmRecoSts.h
+++ b/reco/detectors/sts/CbmRecoSts.h
@@ -304,6 +304,10 @@ private:
   Double_t fTime2           = 0.;  ///< Time for distributing data
   Double_t fTime3           = 0.;  ///< Time for reconstruction
   Double_t fTime4           = 0.;  ///< Time for output results
+  double fTimeSortDigis     = 0.;  ///< Real time of the digi sorting step, measured in ProcessData [s]
+  double fTimeFindClusters  = 0.;  ///< Real time of the cluster finding step, measured in ProcessData [s]
+  double fTimeSortClusters  = 0.;  ///< Real time of the cluster sorting step, measured in ProcessData [s]
+  double fTimeFindHits      = 0.;  ///< Real time of the hit finding step, measured in ProcessData [s]
 
   // --- Run counters
   TStopwatch fTimer {};               //! ROOT timer
diff --git a/reco/detectors/sts/CbmStsRecoModule.cxx b/reco/detectors/sts/CbmStsRecoModule.cxx
index fa95b5ea56d6cf12bb04a17b8a1f6384407e386b..d4af3f77034fa6624bd4ada4bc6aa34ccef37198 100644
--- a/reco/detectors/sts/CbmStsRecoModule.cxx
+++ b/reco/detectors/sts/CbmStsRecoModule.cxx
@@ -76,8 +76,14 @@ void CbmStsRecoModule::AddDigiToQueue(const CbmStsDigi* digi, Int_t digiIndex)
 // -----   Reconstruction   ------------------------------------------------
 void CbmStsRecoModule::Reconstruct()
 {
+  SortDigis();     // Runs the four reconstruction steps of this module back to back.
+  FindClusters();  // Each step is also a public method, so callers can run and time the steps separately.
+  SortClusters();
+  FindHits();
+}
 
-  // return;
+void CbmStsRecoModule::SortDigis()
+{
   TStopwatch timer;
 
   timer.Start();
@@ -92,8 +98,13 @@ void CbmStsRecoModule::Reconstruct()
             });
   timer.Stop();
   fTimings.timeSortDigi = timer.RealTime();
+}
 
+void CbmStsRecoModule::FindClusters()
+{
   // --- Perform cluster finding
+  TStopwatch timer;
+
   timer.Start();
   fClusterFinder->Exec(fDigisF, fClustersF, fSetupModule->GetAddress(), fNofStripsF, 0, fTimeCutDigisSig,
                        fTimeCutDigisAbs, fConnectEdgeFront, fParModule);
@@ -108,8 +119,13 @@ void CbmStsRecoModule::Reconstruct()
 
   timer.Stop();
   fTimings.timeCluster = timer.RealTime();
+}
 
+void CbmStsRecoModule::SortClusters()
+{
   // --- Sort clusters by time
+  TStopwatch timer;
+
   timer.Start();
   std::sort(fClustersF.begin(), fClustersF.end(), [](const CbmStsCluster& cluster1, const CbmStsCluster& cluster2) {
     return (cluster1.GetTime() < cluster2.GetTime());
@@ -119,8 +135,13 @@ void CbmStsRecoModule::Reconstruct()
   });
   timer.Stop();
   fTimings.timeSortCluster = timer.RealTime();
+}
 
+void CbmStsRecoModule::FindHits()
+{
   // --- Perform hit finding
+  TStopwatch timer;
+
   timer.Start();
   if (fHitFinder)
     fHitFinder->Exec(fClustersF, fClustersB, fHits, fSetupModule->GetAddress(), fTimeCutClustersSig,
@@ -133,6 +154,7 @@ void CbmStsRecoModule::Reconstruct()
   timer.Stop();
   fTimings.timeHits = timer.RealTime();
 }
+
 // -------------------------------------------------------------------------
 
 
diff --git a/reco/detectors/sts/CbmStsRecoModule.h b/reco/detectors/sts/CbmStsRecoModule.h
index 261ec35c21ccde53f7566bbef7cefeb7d3499992..a6de560278adaf0a252606810fd3f78cc57c98bd 100644
--- a/reco/detectors/sts/CbmStsRecoModule.h
+++ b/reco/detectors/sts/CbmStsRecoModule.h
@@ -120,6 +120,14 @@ public:
   /** @brief Perform reconstruction **/
   void Reconstruct();
 
+  void SortDigis();  ///< First step of Reconstruct(): digi sorting; its real time is stored in fTimings.timeSortDigi
+
+  void FindClusters();  ///< Second step of Reconstruct(): runs the cluster finder; real time stored in fTimings.timeCluster
+
+  void SortClusters();  ///< Third step of Reconstruct(): sorts clusters by time; real time stored in fTimings.timeSortCluster
+
+  void FindHits();  ///< Fourth step of Reconstruct(): hit finding; real time stored in fTimings.timeHits
+
 
   /** @brief Clear input queue **/
   void Reset();