diff --git a/algo/base/gpu/DeviceImage.h b/algo/base/gpu/DeviceImage.h
index 14a84da25e53aa41658544e5b84ce84a9b1cb910..523310d98647b5ba4242d060a2521dfce9cd927c 100644
--- a/algo/base/gpu/DeviceImage.h
+++ b/algo/base/gpu/DeviceImage.h
@@ -7,7 +7,7 @@
 
 namespace cbm::algo
 {
-  struct GPUReco {
+  struct GPUReco : xpu::device_image {  // device image type; passed as the Image argument of XPU_EXPORT_KERNEL
   };
 }  // namespace cbm::algo
 
diff --git a/algo/base/gpu/xpu_legacy.h b/algo/base/gpu/xpu_legacy.h
new file mode 100644
index 0000000000000000000000000000000000000000..861f15f2fe75c08d03d210f914c8ec4a9fd7f694
--- /dev/null
+++ b/algo/base/gpu/xpu_legacy.h
@@ -0,0 +1,103 @@
+/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main
+   SPDX-License-Identifier: GPL-3.0-only
+   Authors: Felix Weiglhofer [committer]*/
+#ifndef CORE_COMPAT_XPU_LEGACY_H
+#define CORE_COMPAT_XPU_LEGACY_H
+
+#include <xpu/host.h>
+
+namespace xpu {
+
+inline constexpr auto host_to_device = xpu::h2d;  // legacy spelling kept as an alias of xpu::h2d
+inline constexpr auto device_to_host = xpu::d2h;  // legacy spelling kept as an alias of xpu::d2h
+
+/// Legacy-style wrapper that holds one xpu::buffer<T> created with the xpu::buf_io tag.
+template<typename T>
+class hd_buffer {
+  xpu::buffer<T> m_storage;  ///< The wrapped buffer; left default-constructed by hd_buffer().
+
+public:
+  hd_buffer() = default;
+  /// Builds the wrapped buffer from (count, xpu::buf_io).
+  hd_buffer(size_t count) : m_storage(count, xpu::buf_io) {}
+  /// begin() of an xpu::h_view over the wrapped buffer (presumably the host-side data; confirm in the xpu docs).
+  T *h() { return xpu::h_view(m_storage).begin(); }
+  /// get() of the wrapped buffer (presumably the device-side pointer; confirm in the xpu docs).
+  T *d() { return m_storage.get(); }
+  /// Mutable access to the wrapped xpu::buffer<T>; xpu::copy() hands this to the queue.
+  xpu::buffer<T> &underlying() { return m_storage; }
+};
+
+/// Legacy-style wrapper that holds one xpu::buffer<T> created with the xpu::buf_device tag.
+template<typename T>
+class d_buffer {
+  xpu::buffer<T> m_storage;  ///< The wrapped buffer; left default-constructed by d_buffer().
+
+public:
+  d_buffer() = default;
+  /// Builds the wrapped buffer from (count, xpu::buf_device).
+  d_buffer(size_t count) : m_storage(count, xpu::buf_device) {}
+
+  /// get() of the wrapped buffer (presumably the device-side pointer; confirm in the xpu docs).
+  T *d() { return m_storage.get(); }
+  /// Mutable access to the wrapped xpu::buffer<T>.
+  xpu::buffer<T> &underlying() { return m_storage; }
+};
+
+/// Copies buf in direction dir: submits the transfer to a queue and calls wait() on it before returning.
+template<typename T>
+void copy(hd_buffer<T> &buf, direction dir) {
+  static xpu::queue copyQueue;  // function-local static: one queue per instantiation (per T), reused across calls
+  copyQueue.copy(buf.underlying(), dir);
+  copyQueue.wait();
+}
+
+/// Compile-time tag consumed by cmem_io / cmem_device to choose the storage type of a member.
+enum class side {
+  host, device
+};
+
+/// Storage-type selector keyed on (element type, side).
+/// Primary template: any side without a specialization (i.e. side::device) maps to a plain Elem*.
+template<typename Elem, side Where>
+struct cmem_io { using type = Elem *; };
+
+/// side::host maps to an hd_buffer<Elem> instead of a raw pointer.
+template<typename Elem>
+struct cmem_io<Elem, side::host> { using type = hd_buffer<Elem>; };
+
+/// Convenience alias for cmem_io<Elem, Where>::type.
+template<typename Elem, side Where>
+using cmem_io_t = typename cmem_io<Elem, Where>::type;
+
+/// Storage-type selector keyed on (element type, side).
+/// Primary template: any side without a specialization (i.e. side::device) maps to a plain Elem*.
+template<typename Elem, side Where>
+struct cmem_device { using type = Elem *; };
+
+/// side::host maps to a d_buffer<Elem> instead of a raw pointer.
+template<typename Elem>
+struct cmem_device<Elem, side::host> { using type = d_buffer<Elem>; };
+
+/// Convenience alias for cmem_device<Elem, Where>::type.
+template<typename Elem, side Where>
+using cmem_device_t = typename cmem_device<Elem, Where>::type;
+
+} // namespace xpu
+
+#define XPU_BLOCK_SIZE_1D(...)  // expands to nothing; the block size is set by XPU_EXPORT_KERNEL instead
+// Declares kernel struct `Kernel` for device image `Image`, fixed to xpu::no_smem and a block size of 64.
+#define XPU_EXPORT_KERNEL(Image, Kernel, ...) XPU_EXPORT_KERNEL_II(Image, Kernel, xpu::no_smem, 64, ##__VA_ARGS__)
+// Declares `struct Kernel : xpu::kernel<Image>` with block_size/context aliases and an XPU_D operator() taking `...`.
+#define XPU_EXPORT_KERNEL_II(Image, Kernel, SMEM, BlockSize, ...) \
+  struct Kernel : xpu::kernel<Image> { \
+    using block_size = xpu::block_size<BlockSize>; \
+    using context = xpu::kernel_context<SMEM>; \
+    XPU_D void operator()(context &ctx, ##__VA_ARGS__); \
+  }
+// Emits XPU_EXPORT(Kernel); then the head of the out-of-class Kernel::operator() definition. smemIgnored is unused.
+#define XPU_KERNEL(Kernel, smemIgnored, ...) \
+  XPU_EXPORT(Kernel); \
+  XPU_D void Kernel::operator()(context &ctx, ##__VA_ARGS__)
+
+#endif
diff --git a/algo/detectors/sts/UnpackStsXpu.cxx b/algo/detectors/sts/UnpackStsXpu.cxx
index 7c9a5556359113b226e4906c21ffc66212403ee0..ab0dfcf45d32a27c15080b2e29f0d12653d9e46b 100644
--- a/algo/detectors/sts/UnpackStsXpu.cxx
+++ b/algo/detectors/sts/UnpackStsXpu.cxx
@@ -15,11 +15,83 @@
 using std::unique_ptr;
 using std::vector;
 
-XPU_BLOCK_SIZE_1D(cbm::algo::UnpackStsXpu::Unpack, 32);
+XPU_KERNEL(cbm::algo::UnpackK, xpu::no_smem, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams,
+             stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime,
+             uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems)
+  {
+    // One thread per microslice (MS): walks the MS's messages and leaves its digi count in msMessCount[id].
+    int id = ctx.block_idx_x() * ctx.block_dim_x() + ctx.thread_idx_x();
+    if (id >= NElems) return;  // exit if out of bounds
+    if (msMessCount[id] < 2) { msMessCount[id] = 0; return; }  // too few messages: host reads this back as 0 digis
+    UnpackStsXpuMonitorData monitor;  //Monitor data, currently not stored. TO DO: Implement!
+    // --- Get message count and offset for this MS
+    const uint32_t numMessages = msMessCount[id];
+    const uint64_t messOffset  = msMessOffset[id];  // keep the full 64-bit offset, no narrowing
+
+    // --- Get starting position of this MS in message buffer
+    stsxyter::Message* message = &content[messOffset];
+
+    // --- Get starting position of this MS in digi buffer
+    CbmStsDigi* digis = &digisOut[messOffset];  // digi region starts at the same offset as the MS's messages
+
+    // --- Get component index and unpack parameters of this MS
+    const uint32_t comp              = msCompIdx[id];
+    const UnpackStsXpuPar& unpackPar = params[comp];
+
+    // --- Get starting position of elink parameters of this MS
+    UnpackStsXpuElinkPar* elinkPar = &elinkParams[unpackPar.fElinkOffset];
+
+    // --- Init counter for produced digis
+    uint64_t numDigis = 0;
+
+    // --- The first message in the MS is expected to be of type EPOCH and can be ignored.
+    if (message[0].GetMessType() != stsxyter::MessType::Epoch) {
+      monitor.fNumErrInvalidFirstMessage++;
+      msMessCount[id] = 0;
+      return;
+    }
+
+    // --- The second message must be of type ts_msb.
+    if (message[1].GetMessType() != stsxyter::MessType::TsMsb) {
+      monitor.fNumErrInvalidFirstMessage++;  // NOTE(review): same counter as the Epoch check above; confirm intent
+      msMessCount[id] = 0;
+      return;
+    }
+
+    // --- Current TS_MSB epoch cycle
+    uint64_t currentCycle = msStartTime[id] / UnpackStsXpu::fkCycleLength;
+
+    // --- Process first message (ts_msb)
+    uint32_t currentEpoch     = 0;  ///< Current epoch number within epoch cycle
+    uint64_t currentEpochTime = 0;  ///< Current epoch time relative to timeslice in clock cycles
+    UnpackStsXpu::ProcessTsmsbMessage(message[1], currentEpoch, currentEpochTime, currentCycle, currentTsTime);
+
+    // --- Message loop
+    for (uint32_t messageNr = 2; messageNr < numMessages; messageNr++) {
+
+      // --- Action depending on message type
+      switch (message[messageNr].GetMessType()) {
+        case stsxyter::MessType::Hit: {
+          UnpackStsXpu::ProcessHitMessage(message[messageNr], digis, numDigis, unpackPar, elinkPar, monitor, currentEpochTime);
+          break;
+        }
+        case stsxyter::MessType::TsMsb: {
+          UnpackStsXpu::ProcessTsmsbMessage(message[messageNr], currentEpoch, currentEpochTime, currentCycle, currentTsTime);
+          break;
+        }
+        default: {
+          monitor.fNumNonHitOrTsbMessage++;
+          break;
+        }
+      }
+    }
+    // --- Store number of digis in buffer
+    msMessCount[id] = numDigis;
+  }
+
 
 namespace cbm::algo
 {
-
   // ----   Algorithm execution   ---------------------------------------------
   UnpackStsXpu::resultType UnpackStsXpu::operator()(const fles::Timeslice* ts, StsReadoutConfig& config)
   {
@@ -60,6 +132,8 @@ namespace cbm::algo
               result.second.fNumErrInvalidMsSize++;
               continue;
             }
+            xpu::t_add_bytes(msDescr.size);
+            xpu::k_add_bytes<UnpackK>(msDescr.size);
             msIdx.push_back(msDescr.idx);
             compIdx.push_back(comp);
             messCount.push_back(numMessages);
@@ -101,7 +175,7 @@ namespace cbm::algo
     const uint64_t currentTsTime   = ts->start_time() / epochLengthInNs;
 
     // --- Do unpacking for each microslice
-    xpu::run_kernel<Unpack>(xpu::grid::n_threads(numMs), fParams.d(), fElinkParams.d(), tsContent.d(), msMessCount.d(),
+    xpu::run_kernel<UnpackK>(xpu::n_threads(numMs), fParams.d(), fElinkParams.d(), tsContent.d(), msMessCount.d(),
                             msMessOffset.d(), msStartTime.d(), msCompIdx.d(), digisOut.d(), currentTsTime, numMs);
 
     // --- Copy results back to host (only two buffers are modified on device)
@@ -109,6 +183,7 @@ namespace cbm::algo
     xpu::copy(digisOut, xpu::device_to_host);
 
     // --- Store digis  TO DO: make Kernel for this, needs a way to sum arrays in XPU first
+    xpu::push_timer("Store digis");
     for (uint64_t i = 0; i < numMs; i++) {
       uint64_t offset   = msMessOffset.h()[i];
       uint64_t numDigis = msMessCount.h()[i];
@@ -116,85 +191,11 @@ namespace cbm::algo
         result.first.push_back(digisOut.h()[offset + j]);
       }
     }
+    xpu::pop_timer();
 
     return result;
   }
 
-  XPU_KERNEL(UnpackStsXpu::Unpack, xpu::no_smem, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams,
-             stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime,
-             uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems)
-  {
-    int id = xpu::block_idx::x() * xpu::block_dim::x() + xpu::thread_idx::x();
-    if (id >= NElems || msMessCount[id] < 2) return;  // exit if out of bounds or too few messages
-
-    UnpackStsXpuMonitorData monitor;  //Monitor data, currently not stored. TO DO: Implement!
-
-    // --- Get message count and offset for this MS
-    const uint32_t numMessages = msMessCount[id];
-    const uint32_t messOffset  = msMessOffset[id];
-
-    // --- Get starting position of this MS in message buffer
-    stsxyter::Message* message = &content[messOffset];
-
-    // --- Get starting position of this MS in digi buffer
-    CbmStsDigi* digis = &digisOut[messOffset];
-
-    // --- Get component index and unpack parameters of this MS
-    const uint32_t comp              = msCompIdx[id];
-    const UnpackStsXpuPar& unpackPar = params[comp];
-
-    // --- Get starting position of elink parameters of this MS
-    UnpackStsXpuElinkPar* elinkPar = &elinkParams[unpackPar.fElinkOffset];
-
-    // --- Init counter for produced digis
-    uint64_t numDigis = 0;
-
-    // --- The first message in the MS is expected to be of type EPOCH and can be ignored.
-    if (message[0].GetMessType() != stsxyter::MessType::Epoch) {
-      monitor.fNumErrInvalidFirstMessage++;
-      msMessCount[id] = 0;
-      return;
-    }
-
-    // --- The second message must be of type ts_msb.
-    if (message[1].GetMessType() != stsxyter::MessType::TsMsb) {
-      monitor.fNumErrInvalidFirstMessage++;
-      msMessCount[id] = 0;
-      return;
-    }
-
-    // --- Current TS_MSB epoch cycle
-    uint64_t currentCycle = msStartTime[id] / fkCycleLength;
-
-    // --- Process first message (ts_msb)
-    uint32_t currentEpoch     = 0;  ///< Current epoch number within epoch cycle
-    uint64_t currentEpochTime = 0;  ///< Current epoch time relative to timeslice in clock cycles
-    ProcessTsmsbMessage(message[1], currentEpoch, currentEpochTime, currentCycle, currentTsTime);
-
-    // --- Message loop
-    for (uint32_t messageNr = 2; messageNr < numMessages; messageNr++) {
-
-      // --- Action depending on message type
-      switch (message[messageNr].GetMessType()) {
-        case stsxyter::MessType::Hit: {
-          ProcessHitMessage(message[messageNr], digis, numDigis, unpackPar, elinkPar, monitor, currentEpochTime);
-          break;
-        }
-        case stsxyter::MessType::TsMsb: {
-          ProcessTsmsbMessage(message[messageNr], currentEpoch, currentEpochTime, currentCycle, currentTsTime);
-          break;
-        }
-        default: {
-          monitor.fNumNonHitOrTsbMessage++;
-          break;
-        }
-      }
-    }
-    // --- Store number of digis in buffer
-    msMessCount[id] = numDigis;
-  }
-
-
   // -----   Process hit message   --------------------------------------------
   XPU_D inline void UnpackStsXpu::ProcessHitMessage(const stsxyter::Message& message, CbmStsDigi* digis,
                                                     uint64_t& numDigis, const UnpackStsXpuPar& unpackPar,
diff --git a/algo/detectors/sts/UnpackStsXpu.h b/algo/detectors/sts/UnpackStsXpu.h
index 1e88b7f5ea08800f15ba89ed5a54991bbdf54e93..f9da48db38e4d02a827c838f80833a62c6dc9a62 100644
--- a/algo/detectors/sts/UnpackStsXpu.h
+++ b/algo/detectors/sts/UnpackStsXpu.h
@@ -5,7 +5,10 @@
 #ifndef CBM_ALGO_UNPACKSTSXPU_H
 #define CBM_ALGO_UNPACKSTSXPU_H 1
 
+
 #include "CbmStsDigi.h"
+#include "gpu/DeviceImage.h"
+#include "gpu/xpu_legacy.h"
 
 #include "MicrosliceDescriptor.hpp"
 #include "Timeslice.hpp"
@@ -21,7 +24,6 @@
 
 #include "StsReadoutConfig.h"
 #include "StsXyterMessage.h"
-#include "gpu/DeviceImage.h"
 
 
 namespace cbm::algo
@@ -73,6 +75,9 @@ namespace cbm::algo
     }
   };
 
+  XPU_EXPORT_KERNEL(GPUReco, UnpackK, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams,
+                  stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime,
+                  uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems);  // declares struct UnpackK
 
   /** @class UnpackStsXpu
    ** @author Pierre-Alain Loizeau <p.-a.loizeau@gsi.de>
@@ -101,16 +106,6 @@ namespace cbm::algo
      **/
     resultType operator()(const fles::Timeslice* ts, StsReadoutConfig& config);
 
-
-    struct StsXpuUnpack {
-    };  // Identifier used by xpu to find where kernels are located
-
-
-    // Run unpacker for each microslice
-    XPU_EXPORT_KERNEL(GPUReco, Unpack, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams,
-                      stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime,
-                      uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems);
-
     //Stores parameter structs for all elinks
     xpu::hd_buffer<UnpackStsXpuElinkPar> fElinkParams;
 
@@ -120,6 +115,8 @@ namespace cbm::algo
 
 
   private:  // methods
+    friend struct UnpackK;
+
     /** @brief Process a hit message
      ** @param message SMX message (32-bit word)
      ** @param digi buffer
diff --git a/external/.gitignore b/external/.gitignore
index 732dd15370edb746477f1fc066877aaa4e0a8f25..74cb20f77bcff3c16dfba60a8bf1e0c0ac4feccd 100644
--- a/external/.gitignore
+++ b/external/.gitignore
@@ -11,5 +11,6 @@ jsroot
 googletest
 yaml-cpp/
 xpu/
+xpu-dev
 GSL
 bba
diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt
index abeed2bf6461baa432cec83c724ce4f90431a22e..0238ffa7b666d50d1c23af2b2655cc32b1983b03 100644
--- a/external/CMakeLists.txt
+++ b/external/CMakeLists.txt
@@ -52,7 +52,7 @@ if(DOWNLOAD_EXTERNALS)
   if (NOT ${CBM_XPU_DEV})
     download_project_if_needed(PROJECT           xpu
                               GIT_REPOSITORY    "https://github.com/fweig/xpu.git"
-                              GIT_TAG           "dd7d1d7e4b4d71079b9cfadbd662a264288308b0" # v0.7.6
+                              GIT_TAG           "d142d9ac7135488925b5b94f36da22ec55d9271c" # v0.9.3
                               SOURCE_DIR        ${CMAKE_CURRENT_SOURCE_DIR}/xpu
                               CONFIGURE_COMMAND ""
                               BUILD_COMMAND     ""