diff --git a/algo/base/gpu/DeviceImage.h b/algo/base/gpu/DeviceImage.h index 14a84da25e53aa41658544e5b84ce84a9b1cb910..523310d98647b5ba4242d060a2521dfce9cd927c 100644 --- a/algo/base/gpu/DeviceImage.h +++ b/algo/base/gpu/DeviceImage.h @@ -7,7 +7,7 @@ namespace cbm::algo { - struct GPUReco { + struct GPUReco : xpu::device_image { }; } // namespace cbm::algo diff --git a/algo/base/gpu/xpu_legacy.h b/algo/base/gpu/xpu_legacy.h new file mode 100644 index 0000000000000000000000000000000000000000..861f15f2fe75c08d03d210f914c8ec4a9fd7f694 --- /dev/null +++ b/algo/base/gpu/xpu_legacy.h @@ -0,0 +1,103 @@ +/* Copyright (C) 2022 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer]*/ +#ifndef CORE_COMPAT_XPU_LEGACY_H +#define CORE_COMPAT_XPU_LEGACY_H + +#include <xpu/host.h> + +namespace xpu { + +inline constexpr auto host_to_device = xpu::h2d; +inline constexpr auto device_to_host = xpu::d2h; + +template<typename T> +class hd_buffer { + +public: + hd_buffer() = default; + hd_buffer(size_t size) : m_buffer(size, xpu::buf_io) {} + + T *h() { return xpu::h_view(m_buffer).begin(); } + T *d() { return m_buffer.get(); } + + xpu::buffer<T> &underlying() { return m_buffer; } + +private: + xpu::buffer<T> m_buffer; + +}; + +template<typename T> +class d_buffer { + +public: + d_buffer() = default; + d_buffer(size_t size) : m_buffer(size, xpu::buf_device) {} + + T *d() { return m_buffer.get(); } + + xpu::buffer<T> &underlying() { return m_buffer; } + +private: + xpu::buffer<T> m_buffer; + +}; + + +template<typename T> +void copy(hd_buffer<T> &buf, direction dir) { + static xpu::queue _Q; + _Q.copy(buf.underlying(), dir); + _Q.wait(); +} + +enum class side { + host, + device +}; + +template<typename T, side S> +struct cmem_io { + using type = T *; +}; + +template<typename T> +struct cmem_io<T, side::host> { + using type = hd_buffer<T>; +}; + +template<typename T, side S> +using cmem_io_t = typename cmem_io<T, S>::type; + +template<typename T, side S> +struct cmem_device { + using type = T *; +}; + +template<typename T> +struct cmem_device<T, side::host> { + using type = d_buffer<T>; +}; + +template<typename T, side S> +using cmem_device_t = typename cmem_device<T, S>::type; + +} // namespace xpu + +#define XPU_BLOCK_SIZE_1D(...) + +#define XPU_EXPORT_KERNEL(Image, Kernel, ...) XPU_EXPORT_KERNEL_II(Image, Kernel, xpu::no_smem, 64, ##__VA_ARGS__) + +#define XPU_EXPORT_KERNEL_II(Image, Kernel, SMEM, BlockSize, ...) \ + struct Kernel : xpu::kernel<Image> { \ + using block_size = xpu::block_size<BlockSize>; \ + using context = xpu::kernel_context<SMEM>; \ + XPU_D void operator()(context &ctx, ##__VA_ARGS__); \ + } + +#define XPU_KERNEL(Kernel, smemIgnored, ...) \ + XPU_EXPORT(Kernel); \ + XPU_D void Kernel::operator()(context &ctx, ##__VA_ARGS__) + +#endif diff --git a/algo/detectors/sts/UnpackStsXpu.cxx b/algo/detectors/sts/UnpackStsXpu.cxx index 7c9a5556359113b226e4906c21ffc66212403ee0..ab0dfcf45d32a27c15080b2e29f0d12653d9e46b 100644 --- a/algo/detectors/sts/UnpackStsXpu.cxx +++ b/algo/detectors/sts/UnpackStsXpu.cxx @@ -15,11 +15,83 @@ using std::unique_ptr; using std::vector; -XPU_BLOCK_SIZE_1D(cbm::algo::UnpackStsXpu::Unpack, 32); +XPU_KERNEL(cbm::algo::UnpackK, xpu::no_smem, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams, + stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime, + uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems) + { + int id = ctx.block_idx_x() * ctx.block_dim_x() + ctx.thread_idx_x(); + if (id >= NElems || msMessCount[id] < 2) return; // exit if out of bounds or too few messages + + UnpackStsXpuMonitorData monitor; //Monitor data, currently not stored. TO DO: Implement! + + // --- Get message count and offset for this MS + const uint32_t numMessages = msMessCount[id]; + const uint32_t messOffset = msMessOffset[id]; + + // --- Get starting position of this MS in message buffer + stsxyter::Message* message = &content[messOffset]; + + // --- Get starting position of this MS in digi buffer + CbmStsDigi* digis = &digisOut[messOffset]; + + // --- Get component index and unpack parameters of this MS + const uint32_t comp = msCompIdx[id]; + const UnpackStsXpuPar& unpackPar = params[comp]; + + // --- Get starting position of elink parameters of this MS + UnpackStsXpuElinkPar* elinkPar = &elinkParams[unpackPar.fElinkOffset]; + + // --- Init counter for produced digis + uint64_t numDigis = 0; + + // --- The first message in the MS is expected to be of type EPOCH and can be ignored. + if (message[0].GetMessType() != stsxyter::MessType::Epoch) { + monitor.fNumErrInvalidFirstMessage++; + msMessCount[id] = 0; + return; + } + + // --- The second message must be of type ts_msb. + if (message[1].GetMessType() != stsxyter::MessType::TsMsb) { + monitor.fNumErrInvalidFirstMessage++; + msMessCount[id] = 0; + return; + } + + // --- Current TS_MSB epoch cycle + uint64_t currentCycle = msStartTime[id] / UnpackStsXpu::fkCycleLength; + + // --- Process first message (ts_msb) + uint32_t currentEpoch = 0; ///< Current epoch number within epoch cycle + uint64_t currentEpochTime = 0; ///< Current epoch time relative to timeslice in clock cycles + UnpackStsXpu::ProcessTsmsbMessage(message[1], currentEpoch, currentEpochTime, currentCycle, currentTsTime); + + // --- Message loop + for (uint32_t messageNr = 2; messageNr < numMessages; messageNr++) { + + // --- Action depending on message type + switch (message[messageNr].GetMessType()) { + case stsxyter::MessType::Hit: { + UnpackStsXpu::ProcessHitMessage(message[messageNr], digis, numDigis, unpackPar, elinkPar, monitor, currentEpochTime); + break; + } + case stsxyter::MessType::TsMsb: { + UnpackStsXpu::ProcessTsmsbMessage(message[messageNr], currentEpoch, currentEpochTime, currentCycle, currentTsTime); + break; + } + default: { + monitor.fNumNonHitOrTsbMessage++; + break; + } + } + } + // --- Store number of digis in buffer + msMessCount[id] = numDigis; + } + namespace cbm::algo { - // ---- Algorithm execution --------------------------------------------- UnpackStsXpu::resultType UnpackStsXpu::operator()(const fles::Timeslice* ts, StsReadoutConfig& config) { @@ -60,6 +132,8 @@ namespace cbm::algo result.second.fNumErrInvalidMsSize++; continue; } + xpu::t_add_bytes(msDescr.size); + xpu::k_add_bytes<UnpackK>(msDescr.size); msIdx.push_back(msDescr.idx); compIdx.push_back(comp); messCount.push_back(numMessages); @@ -101,7 +175,7 @@ namespace cbm::algo const uint64_t currentTsTime = ts->start_time() / epochLengthInNs; // --- Do unpacking for each microslice - xpu::run_kernel<Unpack>(xpu::grid::n_threads(numMs), fParams.d(), fElinkParams.d(), tsContent.d(), msMessCount.d(), + xpu::run_kernel<UnpackK>(xpu::n_threads(numMs), fParams.d(), fElinkParams.d(), tsContent.d(), msMessCount.d(), msMessOffset.d(), msStartTime.d(), msCompIdx.d(), digisOut.d(), currentTsTime, numMs); // --- Copy results back to host (only two buffers are modified on device) @@ -109,6 +183,7 @@ namespace cbm::algo xpu::copy(digisOut, xpu::device_to_host); // --- Store digis TO DO: make Kernel for this, needs a way to sum arrays in XPU first + xpu::push_timer("Store digis"); for (uint64_t i = 0; i < numMs; i++) { uint64_t offset = msMessOffset.h()[i]; uint64_t numDigis = msMessCount.h()[i]; @@ -116,85 +191,11 @@ namespace cbm::algo result.first.push_back(digisOut.h()[offset + j]); } } + xpu::pop_timer(); return result; } - XPU_KERNEL(UnpackStsXpu::Unpack, xpu::no_smem, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams, - stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime, - uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems) - { - int id = xpu::block_idx::x() * xpu::block_dim::x() + xpu::thread_idx::x(); - if (id >= NElems || msMessCount[id] < 2) return; // exit if out of bounds or too few messages - - UnpackStsXpuMonitorData monitor; //Monitor data, currently not stored. TO DO: Implement! - - // --- Get message count and offset for this MS - const uint32_t numMessages = msMessCount[id]; - const uint32_t messOffset = msMessOffset[id]; - - // --- Get starting position of this MS in message buffer - stsxyter::Message* message = &content[messOffset]; - - // --- Get starting position of this MS in digi buffer - CbmStsDigi* digis = &digisOut[messOffset]; - - // --- Get component index and unpack parameters of this MS - const uint32_t comp = msCompIdx[id]; - const UnpackStsXpuPar& unpackPar = params[comp]; - - // --- Get starting position of elink parameters of this MS - UnpackStsXpuElinkPar* elinkPar = &elinkParams[unpackPar.fElinkOffset]; - - // --- Init counter for produced digis - uint64_t numDigis = 0; - - // --- The first message in the MS is expected to be of type EPOCH and can be ignored. - if (message[0].GetMessType() != stsxyter::MessType::Epoch) { - monitor.fNumErrInvalidFirstMessage++; - msMessCount[id] = 0; - return; - } - - // --- The second message must be of type ts_msb. - if (message[1].GetMessType() != stsxyter::MessType::TsMsb) { - monitor.fNumErrInvalidFirstMessage++; - msMessCount[id] = 0; - return; - } - - // --- Current TS_MSB epoch cycle - uint64_t currentCycle = msStartTime[id] / fkCycleLength; - - // --- Process first message (ts_msb) - uint32_t currentEpoch = 0; ///< Current epoch number within epoch cycle - uint64_t currentEpochTime = 0; ///< Current epoch time relative to timeslice in clock cycles - ProcessTsmsbMessage(message[1], currentEpoch, currentEpochTime, currentCycle, currentTsTime); - - // --- Message loop - for (uint32_t messageNr = 2; messageNr < numMessages; messageNr++) { - - // --- Action depending on message type - switch (message[messageNr].GetMessType()) { - case stsxyter::MessType::Hit: { - ProcessHitMessage(message[messageNr], digis, numDigis, unpackPar, elinkPar, monitor, currentEpochTime); - break; - } - case stsxyter::MessType::TsMsb: { - ProcessTsmsbMessage(message[messageNr], currentEpoch, currentEpochTime, currentCycle, currentTsTime); - break; - } - default: { - monitor.fNumNonHitOrTsbMessage++; - break; - } - } - } - // --- Store number of digis in buffer - msMessCount[id] = numDigis; - } - - // ----- Process hit message -------------------------------------------- XPU_D inline void UnpackStsXpu::ProcessHitMessage(const stsxyter::Message& message, CbmStsDigi* digis, uint64_t& numDigis, const UnpackStsXpuPar& unpackPar, diff --git a/algo/detectors/sts/UnpackStsXpu.h b/algo/detectors/sts/UnpackStsXpu.h index 1e88b7f5ea08800f15ba89ed5a54991bbdf54e93..f9da48db38e4d02a827c838f80833a62c6dc9a62 100644 --- a/algo/detectors/sts/UnpackStsXpu.h +++ b/algo/detectors/sts/UnpackStsXpu.h @@ -5,7 +5,10 @@ #ifndef CBM_ALGO_UNPACKSTSXPU_H #define CBM_ALGO_UNPACKSTSXPU_H 1 + #include "CbmStsDigi.h" +#include "gpu/DeviceImage.h" +#include "gpu/xpu_legacy.h" #include "MicrosliceDescriptor.hpp" #include "Timeslice.hpp" @@ -21,7 +24,6 @@ #include "StsReadoutConfig.h" #include "StsXyterMessage.h" -#include "gpu/DeviceImage.h" namespace cbm::algo @@ -73,6 +75,9 @@ namespace cbm::algo } }; + XPU_EXPORT_KERNEL(GPUReco, UnpackK, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams, + stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime, + uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems); /** @class UnpackStsXpu ** @author Pierre-Alain Loizeau <p.-a.loizeau@gsi.de> @@ -101,16 +106,6 @@ namespace cbm::algo **/ resultType operator()(const fles::Timeslice* ts, StsReadoutConfig& config); - - struct StsXpuUnpack { - }; // Identifier used by xpu to find where kernels are located - - - // Run unpacker for each microslice - XPU_EXPORT_KERNEL(GPUReco, Unpack, UnpackStsXpuPar* params, UnpackStsXpuElinkPar* elinkParams, - stsxyter::Message* content, uint64_t* msMessCount, uint64_t* msMessOffset, uint64_t* msStartTime, - uint32_t* msCompIdx, CbmStsDigi* digisOut, const uint64_t currentTsTime, int NElems); - //Stores parameter structs for all elinks xpu::hd_buffer<UnpackStsXpuElinkPar> fElinkParams; @@ -120,6 +115,8 @@ namespace cbm::algo private: // methods + friend struct UnpackK; + /** @brief Process a hit message ** @param message SMX message (32-bit word) ** @param digi buffer diff --git a/external/.gitignore b/external/.gitignore index 732dd15370edb746477f1fc066877aaa4e0a8f25..74cb20f77bcff3c16dfba60a8bf1e0c0ac4feccd 100644 --- a/external/.gitignore +++ b/external/.gitignore @@ -11,5 +11,6 @@ jsroot googletest yaml-cpp/ xpu/ +xpu-dev GSL bba diff --git a/external/CMakeLists.txt b/external/CMakeLists.txt index abeed2bf6461baa432cec83c724ce4f90431a22e..0238ffa7b666d50d1c23af2b2655cc32b1983b03 100644 --- a/external/CMakeLists.txt +++ b/external/CMakeLists.txt @@ -52,7 +52,7 @@ if(DOWNLOAD_EXTERNALS) if (NOT ${CBM_XPU_DEV}) download_project_if_needed(PROJECT xpu GIT_REPOSITORY "https://github.com/fweig/xpu.git" - GIT_TAG "dd7d1d7e4b4d71079b9cfadbd662a264288308b0" # v0.7.6 + GIT_TAG "d142d9ac7135488925b5b94f36da22ec55d9271c" # v0.9.3 SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/xpu CONFIGURE_COMMAND "" BUILD_COMMAND ""