From ba6868963187c34af32af833ba23b29a899e224d Mon Sep 17 00:00:00 2001
From: Felix Weiglhofer <weiglhofer@fias.uni-frankfurt.de>
Date: Tue, 6 Jun 2023 14:19:11 +0000
Subject: [PATCH] algo: Add monitoring support in cbmreco.

---
 algo/CMakeLists.txt      |  1 +
 algo/base/ChainContext.h |  4 ++++
 algo/base/Options.cxx    |  2 ++
 algo/base/Options.h      |  2 ++
 algo/base/SubChain.h     |  8 ++++++++
 algo/base/util/TsUtils.h | 19 +++++++++++++++++++
 algo/global/Reco.cxx     | 11 +++++++++++
 7 files changed, 47 insertions(+)
 create mode 100644 algo/base/util/TsUtils.h

diff --git a/algo/CMakeLists.txt b/algo/CMakeLists.txt
index 2f8182f323..d107ef3998 100644
--- a/algo/CMakeLists.txt
+++ b/algo/CMakeLists.txt
@@ -83,6 +83,7 @@ target_link_libraries(Algo
             external::yaml-cpp
             external::fles_logging
             external::fles_ipc
+            external::fles_monitoring
 )
 target_compile_definitions(Algo PUBLIC NO_ROOT)
 xpu_attach(Algo ${DEVICE_SRCS})
diff --git a/algo/base/ChainContext.h b/algo/base/ChainContext.h
index 3277028b2a..0ed47dcc38 100644
--- a/algo/base/ChainContext.h
+++ b/algo/base/ChainContext.h
@@ -4,6 +4,9 @@
 #ifndef CBM_ALGO_BASE_CHAINCONTEXT_H
 #define CBM_ALGO_BASE_CHAINCONTEXT_H
 
+#include <Monitor.hpp>
+#include <optional>
+
 #include "Options.h"
 #include "RecoParams.h"
 
@@ -12,6 +15,7 @@ namespace cbm::algo
   struct ChainContext {
     Options opts;
     RecoParams recoParams;
+    std::optional<cbm::Monitor> monitor;
   };
 }  // namespace cbm::algo
 
diff --git a/algo/base/Options.cxx b/algo/base/Options.cxx
index ff74e0e559..efd141f09a 100644
--- a/algo/base/Options.cxx
+++ b/algo/base/Options.cxx
@@ -52,6 +52,8 @@ Options::Options(int argc, char** argv)
       "select device (cpu, cuda0, cuda1, hip0, ...)")
     ("log-level,l", po::value(&fLogLevel)->default_value(info)->value_name("<level>"),
       "set log level (debug, info, warning, error, fatal)")
+    ("monitor,m", po::value(&fMonitorUri)->value_name("<uri>"),
+      "URI specifying monitor output (e.g. file:tmp/monitor.txt, empty = no monitor)")
     ("num-ts,n", po::value<int>(&fNumTimeslices)->default_value(-1)->value_name("<num>"),
       "Stop after <num> timeslices (-1 = all)")
     ("skip-ts,s", po::value<int>(&fSkipTimeslices)->default_value(0)->value_name("<num>"),
diff --git a/algo/base/Options.h b/algo/base/Options.h
index 25c3f1b99a..61b480a4d2 100644
--- a/algo/base/Options.h
+++ b/algo/base/Options.h
@@ -22,6 +22,7 @@ namespace cbm::algo
     const std::string& InputLocator() const { return fInputLocator; }
     severity_level LogLevel() const { return fLogLevel; }
     const std::string& Device() const { return fDevice; }
+    const std::string& MonitorUri() const { return fMonitorUri; }
     bool CollectKernelTimes() const { return fCollectKernelTimes; }
     int NumTimeslices() const { return fNumTimeslices; }
     int SkipTimeslices() const { return fSkipTimeslices; }
@@ -31,6 +32,7 @@ namespace cbm::algo
     std::string fInputLocator;
     severity_level fLogLevel;
     std::string fDevice;
+    std::string fMonitorUri;
     bool fCollectKernelTimes = false;
     int fNumTimeslices       = -1;
     int fSkipTimeslices      = 0;
diff --git a/algo/base/SubChain.h b/algo/base/SubChain.h
index b787c1b64c..b206247190 100644
--- a/algo/base/SubChain.h
+++ b/algo/base/SubChain.h
@@ -18,6 +18,14 @@ namespace cbm::algo
     const Options& Opts() const { return gsl::make_not_null(fContext)->opts; }
     const RecoParams& Params() const { return gsl::make_not_null(fContext)->recoParams; }
 
+    bool HasMonitor() const { return gsl::make_not_null(fContext)->monitor.has_value(); }
+
+    Monitor& GetMonitor() const
+    {
+      // Need Get-prefix to avoid conflict with Monitor-class name
+      return gsl::make_not_null(fContext)->monitor.value();
+    }
+
   private:
     ChainContext* fContext = nullptr;
   };
diff --git a/algo/base/util/TsUtils.h b/algo/base/util/TsUtils.h
new file mode 100644
index 0000000000..fa45dc060b
--- /dev/null
+++ b/algo/base/util/TsUtils.h
@@ -0,0 +1,19 @@
+/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer] */
+
+#include <Timeslice.hpp>
+
+namespace cbm::algo::ts_utils
+{
+
+  inline size_t SizeBytes(const fles::Timeslice& ts)
+  {
+    size_t size = 0;
+    for (size_t i = 0; i < ts.num_components(); i++) {
+      size += ts.size_component(i);
+    }
+    return size;
+  }
+
+}  // namespace cbm::algo::ts_utils
diff --git a/algo/global/Reco.cxx b/algo/global/Reco.cxx
index cea651468a..9984134c92 100644
--- a/algo/global/Reco.cxx
+++ b/algo/global/Reco.cxx
@@ -3,11 +3,14 @@
    Authors: Felix Weiglhofer [committer] */
 #include "Reco.h"
 
+#include <System.hpp>
+
 #include <xpu/host.h>
 
 #include "config/Yaml.h"
 #include "log.hpp"
 #include "util/TimingsFormat.h"
+#include "util/TsUtils.h"
 
 using namespace cbm::algo;
 
@@ -26,6 +29,11 @@ void Reco::Init(const Options& opts)
   xpu::device_prop props {xpu::device::active()};
   L_(info) << "Running CBM Reco on Device " << props.name();
 
+  if (!opts.MonitorUri().empty()) {
+    fContext.monitor.emplace(opts.MonitorUri());
+    L_(info) << "Monitoring enabled, sending to " << opts.MonitorUri();
+  }
+
   // Reco Params
   fs::path recoParamsPath = opts.ParamsDir() / "RecoParams.yaml";
   YAML::Node yaml         = YAML::LoadFile(recoParamsPath.string());
@@ -76,6 +84,9 @@ void Reco::Run(const fles::Timeslice& ts)
 
   xpu::timings ts_times = xpu::pop_timer();
 
+  GetMonitor().QueueMetric("cbm_reco", {{"hostname", fles::system::current_hostname()}},
+                           {{"tsProcessed", 1}, {"bytesProcessed", ts_utils::SizeBytes(ts)}});
+
   PrintTimings(ts_times);
 }
 
-- 
GitLab