diff --git a/algo/ca/core/CMakeLists.txt b/algo/ca/core/CMakeLists.txt index 2be924997474f03ae9311ec060bd857d21cf725a..11376b0ddcb554d29b2914066be711f34c48dae0 100644 --- a/algo/ca/core/CMakeLists.txt +++ b/algo/ca/core/CMakeLists.txt @@ -5,6 +5,7 @@ set(INCLUDE_DIRECTORIES ${CMAKE_CURRENT_SOURCE_DIR}/qa ${CMAKE_CURRENT_SOURCE_DIR}/data ${CMAKE_CURRENT_SOURCE_DIR}/tracking + ${CMAKE_CURRENT_SOURCE_DIR}/experimental ) set(SRCS @@ -31,6 +32,10 @@ set(SRCS ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTrackFinderWindow.cxx ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTrackFitter.cxx ${CMAKE_CURRENT_SOURCE_DIR}/tracking/CaTripletConstructor.cxx + + ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaGpuTrackFinderSetup.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaGpuTripletConstructor.cxx + ${CMAKE_CURRENT_SOURCE_DIR}/experimental/CaDeviceImage.cxx ) SET_SOURCE_FILES_PROPERTIES(${SRCS} PROPERTIES COMPILE_FLAGS "-O3") @@ -52,6 +57,12 @@ target_include_directories(CaCore ${CMAKE_CURRENT_SOURCE_DIR}/pars ${CMAKE_CURRENT_SOURCE_DIR}/qa ${CMAKE_CURRENT_SOURCE_DIR}/tracking + ${CMAKE_CURRENT_SOURCE_DIR}/experimental +) + +set(DEVICE_SRCS + experimental/CaDeviceImage.cxx + experimental/CaGpuTripletConstructor.cxx ) target_compile_definitions(CaCore PUBLIC NO_ROOT) @@ -62,7 +73,9 @@ target_link_libraries(CaCore external::fles_logging # needed for the logger external::fles_ipc # needed for the logger external::yaml-cpp + xpu ) +xpu_attach(CaCore ${DEVICE_SRCS}) ##### Offline version without the NO_ROOT in order to get standard logger! ############################################# if (NOT CBM_ONLINE_STANDALONE) @@ -74,6 +87,7 @@ if (NOT CBM_ONLINE_STANDALONE) ${CMAKE_CURRENT_SOURCE_DIR}/pars ${CMAKE_CURRENT_SOURCE_DIR}/qa ${CMAKE_CURRENT_SOURCE_DIR}/tracking + ${CMAKE_CURRENT_SOURCE_DIR}/experimental ${CMAKE_CURRENT_SOURCE_DIR} ) @@ -81,7 +95,9 @@ if (NOT CBM_ONLINE_STANDALONE) PUBLIC KfCoreOffline Boost::serialization external::yaml-cpp + xpu ) + xpu_attach(CaCoreOffline ${DEVICE_SRCS}) install(TARGETS CaCoreOffline DESTINATION lib) endif() ######################################################################################################################## @@ -91,6 +107,7 @@ install(DIRECTORY utils TYPE INCLUDE FILES_MATCHING PATTERN "*.h") install(DIRECTORY data TYPE INCLUDE FILES_MATCHING PATTERN "*.h") install(DIRECTORY tracking TYPE INCLUDE FILES_MATCHING PATTERN "*.h") install(DIRECTORY pars TYPE INCLUDE FILES_MATCHING PATTERN "*.h") +install(DIRECTORY experimental TYPE INCLUDE FILES_MATCHING PATTERN "*.h") install( FILES @@ -135,6 +152,19 @@ install( tracking/CaTrackFitter.h tracking/CaTripletConstructor.h + experimental/CaDeviceImage.h + experimental/CaGpuGrid.h + experimental/CaGpuGridArea.h + experimental/CaGpuParameters.h + experimental/CaGpuStation.h + experimental/CaGpuMaterialMap.h + experimental/CaGpuField.h + experimental/CaMeasurementXy.h + experimental/CaGpuTrackFinderSetup.h + experimental/CaGpuTripletConstructor.h + experimental/CaDeviceImage.h + experimental/CaGpuTimeMonitor.h + DESTINATION include/ ) diff --git a/algo/ca/core/data/CaGrid.h b/algo/ca/core/data/CaGrid.h index 8bfc127154b6bce51d8187664420e32068ece503..cc81ec8b4e4d6844c12cd37d8958768de8a04f7a 100644 --- a/algo/ca/core/data/CaGrid.h +++ b/algo/ca/core/data/CaGrid.h @@ -126,6 +126,15 @@ namespace cbm::algo::ca /// Get maximal entry range in T fscal GetMaxRangeT() const { return fMaxRangeT; } + /// Get bin width in X inversed + fscal GetBinWidthXinv() const { return fBinWidthXinv; } + + /// Get bin width in Y inversed + fscal GetBinWidthYinv() const { return fBinWidthYinv; } + + /// Get the link to the first bin entry index + const ca::Vector<ca::HitIndex_t>& GetFirstBinEntryIndex() const { return fFirstBinEntryIndex; } + private: /// --- Data members --- diff --git a/algo/ca/core/data/CaHit.h b/algo/ca/core/data/CaHit.h index fb0551ee784616df3f315d783dd3fd857b05314e..ae678be940d295613001b928c056989b8bc44739 100644 --- a/algo/ca/core/data/CaHit.h +++ b/algo/ca/core/data/CaHit.h @@ -14,6 +14,8 @@ #include <string> +#include <xpu/device.h> + namespace cbm::algo::ca { struct CaHitTimeInfo { @@ -99,43 +101,43 @@ namespace cbm::algo::ca HitKeyIndex_t BackKey() const { return fBackKey; } /// Get the X coordinate - fscal X() const { return fX; } + XPU_D fscal X() const { return fX; } /// Get the Y coordinate - fscal Y() const { return fY; } + XPU_D fscal Y() const { return fY; } /// Get the Z coordinate - fscal Z() const { return fZ; } + XPU_D fscal Z() const { return fZ; } /// Get the time - fscal T() const { return fT; } + XPU_D fscal T() const { return fT; } /// Get the uncertainty of X coordinate - fscal dX2() const { return fDx2; } + XPU_D fscal dX2() const { return fDx2; } /// Get the uncertainty of Y coordinate - fscal dY2() const { return fDy2; } + XPU_D fscal dY2() const { return fDy2; } /// Get the X/Y covariance - fscal dXY() const { return fDxy; } + XPU_D fscal dXY() const { return fDxy; } /// Get the uncertainty of time - fscal dT2() const { return fDt2; } + XPU_D fscal dT2() const { return fDt2; } /// Get the +/- range of uncertainty of X coordinate - fscal RangeX() const { return fRangeX; } + XPU_D fscal RangeX() const { return fRangeX; } /// Get the +/- range of uncertainty of Y coordinate - fscal RangeY() const { return fRangeY; } + XPU_D fscal RangeY() const { return fRangeY; } /// Get the +/- range of uncertainty of time - fscal RangeT() const { return fRangeT; } + XPU_D fscal RangeT() const { return fRangeT; } /// Get the hit id - HitIndex_t Id() const { return fId; } + XPU_D HitIndex_t Id() const { return fId; } /// Get the station index - int Station() const { return fStation; } + XPU_D int Station() const { return fStation; } /// \brief Simple string representation of the hit class std::string ToString() const; diff --git a/algo/ca/core/data/CaTriplet.h b/algo/ca/core/data/CaTriplet.h index 4a9848ea1aaa8ff2b9d7da14b38a4c7d422dd339..ed6d3a4ff0d89171b7267c19efd1ca4e8ff3da7b 100644 --- a/algo/ca/core/data/CaTriplet.h +++ b/algo/ca/core/data/CaTriplet.h @@ -25,9 +25,10 @@ namespace cbm::algo::ca Triplet() = default; /// Constructor - Triplet(ca::HitIndex_t iHitL, ca::HitIndex_t iHitM, ca::HitIndex_t iHitR, unsigned int iStaL, unsigned int iStaM, - unsigned int iStaR, unsigned char Level, unsigned int firstNeighbour, char nNeighbours, fscal Chi2, - fscal Qp, fscal Cqp, fscal tx, fscal Ctx, fscal ty, fscal Cty, bool isMomentumFitted) + XPU_D Triplet(ca::HitIndex_t iHitL, ca::HitIndex_t iHitM, ca::HitIndex_t iHitR, unsigned int iStaL, + unsigned int iStaM, unsigned int iStaR, unsigned char Level, unsigned int firstNeighbour, + char nNeighbours, fscal Chi2, fscal Qp, fscal Cqp, fscal tx, fscal Ctx, fscal ty, fscal Cty, + bool isMomentumFitted) : fChi2(Chi2) , fQp(Qp) , fCqp(Cqp) diff --git a/algo/ca/core/experimental/CaDeviceImage.cxx b/algo/ca/core/experimental/CaDeviceImage.cxx new file mode 100644 index 0000000000000000000000000000000000000000..88932879aa754b841bbd97713daff47ab61f22cd --- /dev/null +++ b/algo/ca/core/experimental/CaDeviceImage.cxx @@ -0,0 +1,6 @@ +/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer] */ +#include "CaDeviceImage.h" + +XPU_IMAGE(cbm::algo::ca::GPUReco); diff --git a/algo/ca/core/experimental/CaDeviceImage.h b/algo/ca/core/experimental/CaDeviceImage.h new file mode 100644 index 0000000000000000000000000000000000000000..49eef6a4b7ec691af0dea1f117fbdd0ee2bd41f3 --- /dev/null +++ b/algo/ca/core/experimental/CaDeviceImage.h @@ -0,0 +1,14 @@ +/* Copyright (C) 2023 FIAS Frankfurt Institute for Advanced Studies, Frankfurt / Main + SPDX-License-Identifier: GPL-3.0-only + Authors: Felix Weiglhofer [committer] */ +#ifndef CBM_ALGO_CA_GPU_DEVICEIMAGE_H +#define CBM_ALGO_CA_GPU_DEVICEIMAGE_H +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + struct GPUReco : xpu::device_image { + }; +} // namespace cbm::algo::ca + +#endif diff --git a/algo/ca/core/experimental/CaGpuField.h b/algo/ca/core/experimental/CaGpuField.h new file mode 100644 index 0000000000000000000000000000000000000000..8fd42b4746449753e0cc9ae2c485bef637b1ba70 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuField.h @@ -0,0 +1,314 @@ +/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Sergey Gorbunov [committer], Igor Kulakov, Maksym Zyzak, Sergei Zharko */ + +/// \file CaGpuField.h +/// \brief Magnetic field classes +/// +/// This is a temporary solution made to ensure compatibility with GPU +/// + +#pragma once // include this header only once per compilation unit + +#include "CaUtils.h" +#include "KfField.h" +#include "KfTrackParam.h" + +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + class GpuFieldValue { + public: + float x; //< x-component of the field + float y; //< y-component of the field + float z; //< z-component of the field + + GpuFieldValue() = default; + + /// \brief Copy constructor with type conversion + template<typename DataIn> + GpuFieldValue(const kf::FieldValue<DataIn>& other) + : x(kf::utils::simd::Cast<DataIn, float>(other.GetBx())) + , y(kf::utils::simd::Cast<DataIn, float>(other.GetBy())) + , z(kf::utils::simd::Cast<DataIn, float>(other.GetBz())) + { + } + + /// Combines the magnetic field with another field value object using weight + /// \param B other field value to combine with + /// \param w weight from 0 to 1 + + XPU_D void Combine(const GpuFieldValue& B, const bool w) + { + if (w) { + x = B.x; + y = B.y; + z = B.z; + } + } + + /// Is the field value zero? + XPU_D bool IsZeroField() const + { + // constexpr auto e = constants::misc::NegligibleFieldkG; + // float e = 1.e-4; + return (x * x + y * y + z * z <= 1.e-8); + } + }; + + class GpuFieldSlice { + public: + /// Default constructor + GpuFieldSlice() = default; + + static constexpr int kPolDegree{5}; ///< Approximation polynomial degree + static constexpr int kNofCoeff{((kPolDegree + 1) * (kPolDegree + 2)) / 2}; ///< Number of coefficients + + /// \brief Array of the approximation coefficients [<monomial>] + using CoeffArray_t = std::array<float, kNofCoeff>; + + /// \brief Copy constructor with type conversion + template<typename DataIn> + GpuFieldSlice(const kf::FieldSlice<DataIn>& other) : z(kf::utils::simd::Cast<DataIn, float>(other.GetZref())) + { + for (size_t i = 0; i < kMaxNFieldApproxCoefficients; i++) { + cx[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBx()[i]); + cy[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBy()[i]); + cz[i] = kf::utils::simd::Cast<DataIn, float>(other.GetBz()[i]); + } + } + + /// Gets field value from (x, y) fvec point + /// \param x x-coordinate of input + /// \param y y-coordinate of input + /// \return B the FieldValue output + XPU_D GpuFieldValue GetFieldValue_old(float x, float y) const; + + // + XPU_D GpuFieldValue GetFieldValue(float x, float y) const + { + using cbm::algo::kf::math::Horner; + GpuFieldValue B; + B.x = Horner<kPolDegree>(cx.cbegin(), x, y); + B.y = Horner<kPolDegree>(cy.cbegin(), x, y); + B.z = Horner<kPolDegree>(cz.cbegin(), x, y); + return B; + } + // + + /// Gets field value for the intersection with a straight track + /// \param t straight track + /// \return B the FieldValue output + XPU_D GpuFieldValue GetFieldValueForLine(const kf::TrackParamBase<float>& t) const; + + public: + // NOTE: We don't use an initialization of arrays here because we cannot be sure + // if the underlying type (fvec) has a default constructor, but + // we are sure, that it can be initialized with a float. (S.Zharko) + static constexpr auto kMaxNFieldApproxCoefficients = 21; //constants::size::MaxNFieldApproxCoefficients; + + CoeffArray_t cx; //fBx; ///< Approximation coefficients for the x-component of the field + CoeffArray_t cy; //fBy; ///< Approximation coefficients for the y-component of the field + CoeffArray_t cz; //fBz; ///< Approximation coefficients for the z-component of the field + + float z; ///< z coordinate of the slice + }; + + XPU_D inline GpuFieldValue GpuFieldSlice::GetFieldValue_old(float x, float y) const + { + float x2 = x * x; + float y2 = y * y; + float xy = x * y; + + float x3 = x2 * x; + float y3 = y2 * y; + float xy2 = x * y2; + float x2y = x2 * y; + + float x4 = x3 * x; + float y4 = y3 * y; + float xy3 = x * y3; + float x2y2 = x2 * y2; + float x3y = x3 * y; + + float x5 = x4 * x; + float y5 = y4 * y; + float xy4 = x * y4; + float x2y3 = x2 * y3; + float x3y2 = x3 * y2; + float x4y = x4 * y; + + GpuFieldValue B; + + B.x = cx[0] + cx[1] * x + cx[2] * y + cx[3] * x2 + cx[4] * xy + cx[5] * y2 + cx[6] * x3 + cx[7] * x2y + cx[8] * xy2 + + cx[9] * y3 + cx[10] * x4 + cx[11] * x3y + cx[12] * x2y2 + cx[13] * xy3 + cx[14] * y4 + cx[15] * x5 + + cx[16] * x4y + cx[17] * x3y2 + cx[18] * x2y3 + cx[19] * xy4 + cx[20] * y5; + + B.y = cy[0] + cy[1] * x + cy[2] * y + cy[3] * x2 + cy[4] * xy + cy[5] * y2 + cy[6] * x3 + cy[7] * x2y + cy[8] * xy2 + + cy[9] * y3 + cy[10] * x4 + cy[11] * x3y + cy[12] * x2y2 + cy[13] * xy3 + cy[14] * y4 + cy[15] * x5 + + cy[16] * x4y + cy[17] * x3y2 + cy[18] * x2y3 + cy[19] * xy4 + cy[20] * y5; + + B.z = cz[0] + cz[1] * x + cz[2] * y + cz[3] * x2 + cz[4] * xy + cz[5] * y2 + cz[6] * x3 + cz[7] * x2y + cz[8] * xy2 + + cz[9] * y3 + cz[10] * x4 + cz[11] * x3y + cz[12] * x2y2 + cz[13] * xy3 + cz[14] * y4 + cz[15] * x5 + + cz[16] * x4y + cz[17] * x3y2 + cz[18] * x2y3 + cz[19] * xy4 + cz[20] * y5; + return B; + } + + XPU_D inline GpuFieldValue GpuFieldSlice::GetFieldValueForLine(const kf::TrackParamBase<float>& t) const + { + float dz = z - t.GetZ(); + return GetFieldValue(t.GetX() + t.GetTx() * dz, t.GetY() + t.GetTy() * dz); + } + + class GpuFieldRegion { + public: + GpuFieldRegion() = default; + + /// \brief Copy constructor with type conversion + template<typename DataIn> + GpuFieldRegion(const kf::FieldRegion<DataIn>& other) + : cx0(kf::utils::simd::Cast<DataIn, float>(other.cx0)) + , cx1(kf::utils::simd::Cast<DataIn, float>(other.cx1)) + , cx2(kf::utils::simd::Cast<DataIn, float>(other.cx2)) + , cy0(kf::utils::simd::Cast<DataIn, float>(other.cy0)) + , cy1(kf::utils::simd::Cast<DataIn, float>(other.cy1)) + , cy2(kf::utils::simd::Cast<DataIn, float>(other.cy2)) + , cz0(kf::utils::simd::Cast<DataIn, float>(other.cz0)) + , cz1(kf::utils::simd::Cast<DataIn, float>(other.cz1)) + , cz2(kf::utils::simd::Cast<DataIn, float>(other.cz2)) + , z0(kf::utils::simd::Cast<DataIn, float>(other.z0)) + // , fUseOriginalField(other.fUseOriginalField) + , fIsZeroField(other.fIsZeroField) + { + } + + /// Gets the field vector and writes it into B pointer + /// \param x x-coordinate of the point to calculate the field + /// \param y y-coordinate of the point to calculate the field + /// \param z z-coordinate of the point to calculate the field + /// \return the magnetic field value + XPU_D GpuFieldValue Get([[maybe_unused]] float x, [[maybe_unused]] float y, float z) const + { + GpuFieldValue B; + float dz = z - z0; + float dz2 = dz * dz; + B.x = cx0 + cx1 * dz + cx2 * dz2; + B.y = cy0 + cy1 * dz + cy2 * dz2; + B.z = cz0 + cz1 * dz + cz2 * dz2; + return B; + } + + /// Is the field region empty? + XPU_D bool IsZeroField() const { return fIsZeroField; } + + /// Interpolates the magnetic field by three nodal points and sets the result to this FieldRegion object + /// The result is a quadratic interpolation of the field as a function of z + /// \param b0 field value in the first nodal point + /// \param b0z z-coordinate of the first nodal point + /// \param b1 field value in the second nodal point + /// \param b1z z-coordinate of the second nodal point + /// \param b2 field value in the third nodal point + /// \param b2z z-coordinate of the third nodal point + /// TODO: does the sequence of b0z, b1z and b2z matter? If yes, probalby we need an assertion (S.Zharko) + XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, const GpuFieldValue& b2, + float b2z); + + /// Interpolates the magnetic field by thwo nodal points and sets the result to this FieldRegion object. + /// The result is a linear interpolation of the field as a function of z + /// \param b0 field value in the first nodal point + /// \param b0z z-coordinate of the first nodal point + /// \param b1 field value in the second nodal point + /// \param b1z z-coordinate of the second nodal point + /// TODO: does the sequence of b0z and b1z matter? If yes, probalby we need an assertion (S.Zharko) + XPU_D void Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z) + { + fIsZeroField = (b0.IsZeroField() && b1.IsZeroField()); + z0 = b0z; + float dzi = 1. / (b1z - b0z); //TODO + cx0 = b0.x; + cy0 = b0.y; + cz0 = b0.z; + cx1 = (b1.x - b0.x) * dzi; + cy1 = (b1.y - b0.y) * dzi; + cz1 = (b1.z - b0.z) * dzi; + cx2 = 0.f; + cy2 = 0.f; + cz2 = 0.f; + } + + /// Shifts the coefficients to new central point + /// \param z z-coordinate of the new central point + XPU_D void Shift(float z) + { + float dz = z - z0; + float cx2dz = cx2 * dz; + float cy2dz = cy2 * dz; + float cz2dz = cz2 * dz; + z0 = z; + cx0 += (cx1 + cx2dz) * dz; + cy0 += (cy1 + cy2dz) * dz; + cz0 += (cz1 + cz2dz) * dz; + cx1 += cx2dz + cx2dz; + cy1 += cy2dz + cy2dz; + cz1 += cz2dz + cz2dz; + } + + + public: + // TODO: Probably it's better to have arrays instead of separate fvec values? (S.Zharko) + // Bx(z) = cx0 + cx1*(z-z0) + cx2*(z-z0)^2 + float cx0; + float cx1; + float cx2; + + // By(z) = cy0 + cy1*(z-z0) + cy2*(z-z0)^2 + float cy0; + float cy1; + float cy2; + + // Bz(z) = cz0 + cz1*(z-z0) + cz2*(z-z0)^2 + float cz0; + float cz1; + float cz2; + + float z0; ///< z-coordinate of the field region central point + + bool fIsZeroField; //{false}; ///< Is the field region empty? + }; + + XPU_D inline void GpuFieldRegion::Set(const GpuFieldValue& b0, float b0z, const GpuFieldValue& b1, float b1z, + const GpuFieldValue& b2, float b2z) + { + fIsZeroField = (b0.IsZeroField() && b1.IsZeroField() && b2.IsZeroField()); + + z0 = b0z; + float dz1 = b1z - b0z; + float dz2 = b2z - b0z; + float det = 1. / (dz1 * dz2 * (b2z - b1z)); //TODO + + float w21 = -dz2 * det; + float w22 = dz1 * det; + float w11 = -dz2 * w21; + float w12 = -dz1 * w22; + + float db1 = b1.x - b0.x; + float db2 = b2.x - b0.x; + cx0 = b0.x; + cx1 = db1 * w11 + db2 * w12; + cx2 = db1 * w21 + db2 * w22; + + db1 = b1.y - b0.y; + db2 = b2.y - b0.y; + cy0 = b0.y; + cy1 = db1 * w11 + db2 * w12; + cy2 = db1 * w21 + db2 * w22; + + db1 = b1.z - b0.z; + db2 = b2.z - b0.z; + cz0 = b0.z; + cz1 = db1 * w11 + db2 * w12; + cz2 = db1 * w21 + db2 * w22; + } + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuGrid.h b/algo/ca/core/experimental/CaGpuGrid.h new file mode 100644 index 0000000000000000000000000000000000000000..8040148772e30db5c28838fa7d50efbab6467de0 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuGrid.h @@ -0,0 +1,176 @@ +/* Copyright (C) 2010-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Igor Kulakov, Sergey Gorbunov, Grigory Kozlov [committer] */ + +/// \file CaGpuGrid.h +/// \brief A class to store hit information in a backet-sorted way on 2D grid +/// +/// This is a temporary solution to store the grid on GPU +/// The code has been simplified and modified relative to the CaGrid class for use on GPU +/// The original code is based on the code of the ALICE HLT Project +/// + +#pragma once // include this header only once per compilation unit + +#include "CaGrid.h" + +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + + namespace + { + using namespace cbm::algo; + } + + /// \class Grid + /// \brief Class for storing 2d objects in a grid + + /// It creates 2-dimensional backet-sorted grid of pointers to 2-dimensional objects. + /// + /// The class provides an access to the objects in selected 2D bin without touching the rest of the data. + /// To loop over the objects in arbitrary XY-area one can use the ca::GridArea class. + /// + /// The class is used by CaTracker to speed-up the hit search operations + /// The grid axis are named X and Y + /// + class GpuGrid { + public: + /// \brief Default constructor + GpuGrid() = default; + + /// \brief Destructor + ~GpuGrid() = default; + + /// \brief Constructor + GpuGrid(const Grid& grid, int bin_start, int entries_start) + : fN(grid.GetNofBins()) + , fNx(grid.GetNofBinsX()) + , fNy(grid.GetNofBinsY()) + , fMinX(grid.GetMinX()) + , fMinY(grid.GetMinY()) + , fBinWidthX(grid.GetBinWidthX()) + , fBinWidthY(grid.GetBinWidthY()) + , fBinWidthXinv(grid.GetBinWidthXinv()) + , fBinWidthYinv(grid.GetBinWidthYinv()) + , fMaxRangeX(grid.GetMaxRangeX()) + , fMaxRangeY(grid.GetMaxRangeY()) + , fMaxRangeT(grid.GetMaxRangeT()) + , fBinEntryIndex_start(bin_start) + , fBinEntryIndex_amount(grid.GetFirstBinEntryIndex().size()) + , fEntries_start(entries_start) + , fEntries_amount(grid.GetEntries().size()) + { + } + + /// \brief Get bin index for (X,Y) point with boundary check + /// \param X - point x coordinate + /// \param Y - point y coordinate + /// \return bin index in the range [0, fN-1] + XPU_D int GetBin(float X, float Y) const; + + /// \brief Get bin X index with boundary check + /// \param X - x coordinate + /// \return binX index + XPU_D int GetBinX(float X) const; + + /// \brief Get bin Y index with boundary check + /// \param Y - y coordinate + /// \return binY index + XPU_D int GetBinY(float Y) const; + + /// Get number of bins + XPU_D int GetNofBins() const { return fN; } + + /// Get number of bins along X + XPU_D int GetNofBinsX() const { return fNx; } + + /// Get number of bins along Y + XPU_D int GetNofBinsY() const { return fNy; } + + /// Get minimal X value + XPU_D float GetMinX() const { return fMinX; } + + /// Get minimal Y value + XPU_D float GetMinY() const { return fMinY; } + + /// Get bin width in X + XPU_D float GetBinWidthX() const { return fBinWidthX; } + + /// Get bin width in Y + XPU_D float GetBinWidthY() const { return fBinWidthY; } + + /// Get maximal entry range in X + XPU_D float GetMaxRangeX() const { return fMaxRangeX; } + + /// Get maximal entry range in Y + XPU_D float GetMaxRangeY() const { return fMaxRangeY; } + + /// Get maximal entry range in T + XPU_D float GetMaxRangeT() const { return fMaxRangeT; } + + /// Get bin width in X inversed + XPU_D float GetBinWidthXinv() const { return fBinWidthXinv; } + + /// Get bin width in Y inversed + XPU_D float GetBinWidthYinv() const { return fBinWidthYinv; } + + /// Get the index of the first bin entry in the index array + XPU_D int GetBinEntryIndexStart() const { return fBinEntryIndex_start; } + + /// Get the number of entries in the index array + XPU_D int GetBinEntryIndexAmount() const { return fBinEntryIndex_amount; } + + /// Get the index of the first entry in the entry array + XPU_D int GetEntriesStart() const { return fEntries_start; } + + /// Get the number of entries in the entry array + XPU_D int GetEntriesAmount() const { return fEntries_amount; } + + private: + /// --- Data members --- + + int fN{0}; ///< total N bins + int fNx{0}; ///< N bins in X + int fNy{0}; ///< N bins in Y + + float fMinX{0.}; ///< minimal X value + float fMinY{0.}; ///< minimal Y value + float fBinWidthX{0.}; ///< bin width in X + float fBinWidthY{0.}; ///< bin width in Y + float fBinWidthXinv{0.}; ///< inverse bin width in X + float fBinWidthYinv{0.}; ///< inverse bin width in Y + + float fMaxRangeX{0.}; ///< maximal entry range in X + float fMaxRangeY{0.}; ///< maximal entry range in Y + float fMaxRangeT{0.}; ///< maximal entry range in T + + int fBinEntryIndex_start{0}; ///< index of the first bin entry in the index array + int fBinEntryIndex_amount{0}; ///< number of entries in the index array + + int fEntries_start{0}; ///< index of the first entry in the entry array + int fEntries_amount{0}; ///< number of entries in the entry array + }; + + /// --- Inline methods --- + + XPU_D inline int GpuGrid::GetBin(float X, float Y) const + { + // + return GetBinY(Y) * fNx + GetBinX(X); + } + + XPU_D inline int GpuGrid::GetBinX(float X) const + { + int binX = static_cast<int>((X - fMinX) * fBinWidthXinv); + return xpu::max(0, xpu::min(fNx - 1, binX)); + } + + XPU_D inline int GpuGrid::GetBinY(float Y) const + { + int binY = static_cast<int>((Y - fMinY) * fBinWidthYinv); + return xpu::max(0, xpu::min(fNy - 1, binY)); + } + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuGridArea.h b/algo/ca/core/experimental/CaGpuGridArea.h new file mode 100644 index 0000000000000000000000000000000000000000..265b24e073b8108b66d663c3a7116e19494153c3 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuGridArea.h @@ -0,0 +1,115 @@ +/* Copyright (C) 2012-2020 Frankfurt Institute for Advanced Studies, Goethe-Universität Frankfurt, Frankfurt + SPDX-License-Identifier: GPL-3.0-only + Authors: Maksym Zyzak, Igor Kulakov [committer] */ + +/// \file CaGpuGridArea.h +/// +/// \brief A temporary implementation for CaGridArea, created for compatibility with GPU code +/// + +#pragma once // include this header only once per compilation unit + +#include "CaGpuGrid.h" +#include "CaHit.h" + +namespace cbm::algo::ca +{ + + namespace + { + using namespace cbm::algo; // to keep 'ca::' prefices in the code + } + + /// \brief Class for accessing objects in the 2D area that are stored in ca::Grid + /// + class GpuGridArea { + public: + /// \brief Constructor + /// \param grid - the grid to work with + /// \param x - X coordinate of the center of the area + /// \param y - Y coordinate of the center of the area + /// \param dx - half-width of the area in X + /// \param dy - half-width of the area in Y + //TODO: try to provide first element for the current grid instead of first element of the original arrays + XPU_D GpuGridArea(const ca::GpuGrid& grid, const unsigned int* BinEntryIndex, const unsigned int* Entries, float x, + float y, float dx, float dy) + : fGrid(grid) + , fBinEntryIndex(BinEntryIndex) + , fEntries(Entries) + , fGridNbinsX(fGrid.GetNofBinsX()) + { + int binXmin = fGrid.GetBinX(x - dx); + int binXmax = fGrid.GetBinX(x + dx); + int binYmin = fGrid.GetBinY(y - dy); + int binYmax = fGrid.GetBinY(y + dy); + + fAreaLastBinY = binYmax; + fAreaNbinsX = (binXmax - binXmin + 1); // bin index span in x direction + + fAreaFirstBin = (binYmin * fGridNbinsX + binXmin); + + fAreaCurrentBinY = binYmin; + fCurentEntry = fBinEntryIndex[fAreaFirstBin]; + fEntriesXend = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX]; + } + + + /// \brief look up the next grid entry in the requested area + /// \return ind - the entry index in the grid.GetEntries() vector + /// \return true if the entry is found, false if there are no more entries in the area + XPU_D bool GetNextGridEntry(unsigned int& ind); + + /// \brief look up the next object id in the requested area + /// \return objectId - the id of the object + /// \return true if the object is found, false if there are no more pbjects in the area + XPU_D bool GetNextObjectId(unsigned int& objectId); + + /// \brief debug mode: loop over the entire GetEntries() vector ignoring the search area + // void DoLoopOverEntireGrid(); + + private: + const ca::GpuGrid& fGrid; + const unsigned int* fBinEntryIndex; + const unsigned int* fEntries; + + int fAreaLastBinY{0}; // last Y bin of the area + int fAreaNbinsX{0}; // number of area bins in X + int fAreaFirstBin{0}; // first bin of the area (left-down corner of the area) + int fAreaCurrentBinY{0}; // current Y bin (incremented while iterating) + ca::HitIndex_t fCurentEntry{0}; // index of the current entry in fGrid.GetEntries() + ca::HitIndex_t fEntriesXend{0}; // end of the hit indices in current y-row + int fGridNbinsX{0}; // Number of grid bins in X (copy of fGrid::GetNofBinsX()) + }; + + XPU_D inline bool GpuGridArea::GetNextGridEntry(unsigned int& ind) + { + bool xIndexOutOfRange = (fCurentEntry >= fEntriesXend); // current entry is not in the area + + // jump to the next y row if fCurentEntry is outside of the X area + while (xIndexOutOfRange) { + if (fAreaCurrentBinY >= fAreaLastBinY) { + return false; + } + fAreaCurrentBinY++; // get new y-line + fAreaFirstBin += fGridNbinsX; // move the left-down corner of the area to the next y-line + fCurentEntry = fBinEntryIndex[fAreaFirstBin]; // get first hit in cell, if y-line is new + fEntriesXend = fBinEntryIndex[fAreaFirstBin + fAreaNbinsX]; + xIndexOutOfRange = (fCurentEntry >= fEntriesXend); + } + ind = fCurentEntry; // return value + fCurentEntry++; // go to next + return true; + } + + XPU_D inline bool GpuGridArea::GetNextObjectId(unsigned int& objectId) + { + unsigned int entryIndex = 0; + + bool ok = GetNextGridEntry(entryIndex); + if (ok) { + objectId = fEntries[entryIndex]; + } + return ok; + } + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuMaterialMap.h b/algo/ca/core/experimental/CaGpuMaterialMap.h new file mode 100644 index 0000000000000000000000000000000000000000..d28babce0427ca7988baea0bf66b48b1e8457360 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuMaterialMap.h @@ -0,0 +1,77 @@ +/* Copyright (C) 2007-2023 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Igor Kulakov, Sergey Gorbunov, Andrey Lebedev, Sergei Zharko, Grigory Kozlov [committer] */ + +/// \file GpuMaterialMap.h +/// \brief A map of station thickness in units of radiation length (X0) to the specific point in XY plane +/// +/// A simplified version of the KfMaterialMap class, intended for compatibility with GPU code + +#pragma once // include this header only once per compilation unit + +#include "KfMaterialMap.h" + +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + /// \class MaterialMap + /// \brief A map of station thickness in units of radiation length (X0) to the specific point in XY plane + class GpuMaterialMap { + public: + /// \brief Default constructor + GpuMaterialMap() = default; + + /// \brief Copy constructor + GpuMaterialMap(const kf::MaterialMap& other, int binStart) + : fNbins(other.GetNbins()) + , fBinStart(binStart) + , fXYmax(other.GetXYmax()) + , fZref(other.GetZref()) + , fZmin(other.GetZmin()) + , fZmax(other.GetZmax()) + { + fFactor = 0.5 * fNbins / fXYmax; + } + + /// \brief Destructor + ~GpuMaterialMap() noexcept = default; + + /// \brief Gets number of bins (rows or columns) of the material table + XPU_D int GetNbins() const { return fNbins; } + + /// \brief Gets radius in cm of the material table + XPU_D float GetXYmax() const { return fXYmax; } + + /// \brief Gets reference Z of the material in cm + XPU_D float GetZref() const { return fZref; } + + /// \brief Gets minimal Z of the collected material in cm + XPU_D float GetZmin() const { return fZmin; } + + /// \brief Gets maximal Z of the collected material in cm + XPU_D float GetZmax() const { return fZmax; } + + /// \brief Get bin index for (x,y). Returns -1 when outside of the map + XPU_D int GetBin(float x, float y) const + { + int i = static_cast<int>((x + fXYmax) * fFactor); + int j = static_cast<int>((y + fXYmax) * fFactor); + if (i < 0 || j < 0 || i >= fNbins || j >= fNbins) { + return -1; + } + //TODO + return fBinStart + i + j * fNbins; + } + + private: + int fNbins; ///< Number of rows (== N columns) in the material budget table + int fBinStart; ///< Start index of the material budget table + float fXYmax; ///< Size of the station in x and y dimensions [cm] + float fFactor; ///< Util. var. for the conversion of point coordinates to row/column id + float fZref; ///< Reference Z of the collected material [cm] + float fZmin; ///< Minimal Z of the collected material [cm] + float fZmax; ///< Minimal Z of the collected material [cm] + }; + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuParameters.h b/algo/ca/core/experimental/CaGpuParameters.h new file mode 100644 index 0000000000000000000000000000000000000000..f558dd575b4395dd71533335e9f4c7eecef33827 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuParameters.h @@ -0,0 +1,87 @@ +/* Copyright (C) 2021-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Grigory Kozlov [committer] */ + +/// \file CaGpuParameters.h +/// \brief Parameter container for the GPU CA library + +#pragma once // include this header only once per compilation unit + +#include "CaGpuField.h" +#include "CaGpuMaterialMap.h" +#include "CaGpuStation.h" +#include "CaMeasurementXy.h" +#include "CaParameters.h" + +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + + class GpuParameters { + public: + /// \brief Default constructor + GpuParameters() = default; + + /// \brief Copy constructor with type conversion + template<typename DataIn> + GpuParameters(const Parameters<DataIn>& other) + : maxSlopePV(0.1) + , maxQp(0.1) + , maxDZ(0.1) + , particleMass(0.13957) + , doubletChi2Cut(5) + , tripletChi2Cut(5) + , tripletFinalChi2Cut(5) + , fNStations(0) + , primaryFlag(true) + , isTargetField(false) + { + fTargetPos[0] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionX()); + fTargetPos[1] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionY()); + fTargetPos[2] = kf::utils::simd::Cast<DataIn, float>(other.GetTargetPositionZ()); + for (size_t i = 0; i < (size_t) other.GetNstationsActive(); i++) { + fStations[i] = other.GetStation(i); + } + } + + /// \brief Destructor + ~GpuParameters() = default; + + public: + ca::GpuFieldValue targB; ///< Magnetic field in the target region + ca::MeasurementXy<float> targetMeasurement; ///< Measurement XY in the target region + + /// \brief Gets Station + XPU_D const ca::GpuStation& GetStation(int iStation) const { return fStations[iStation]; } + + /// \brief Sets Station + void SetStation(int iStation, const ca::GpuStation& station) { fStations[iStation] = station; } + + /// \brief Gets X component of target position + XPU_D float GetTargetPositionX() const { return fTargetPos[0]; } + + /// \brief Gets Y component of target position + XPU_D float GetTargetPositionY() const { return fTargetPos[1]; } + + /// \brief Gets Z component of target position + XPU_D float GetTargetPositionZ() const { return fTargetPos[2]; } + + std::array<ca::GpuStation, constants::gpu::MaxNofStations> fStations; ///< Array of stations + + std::array<float, 3> fTargetPos; ///< Target position + + float maxSlopePV; ///< Max slope to primary vertex + float maxQp; ///< Max Q/p + float maxDZ; ///< Max DZ + float particleMass; ///< Particle mass + float doubletChi2Cut; ///< Doublet chi2 cut + float tripletChi2Cut; ///< Triplet chi2 cut + float tripletFinalChi2Cut; ///< Triplet final chi2 cut + + size_t fNStations; ///< Number of stations + + bool primaryFlag; ///< Primary flag + bool isTargetField; ///< Is target field + }; +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuStation.h b/algo/ca/core/experimental/CaGpuStation.h new file mode 100644 index 0000000000000000000000000000000000000000..0ee6d43086a9f48823819493f52d9000c63142d2 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuStation.h @@ -0,0 +1,74 @@ +/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Sergey Gorbunov, Igor Kulakov, Sergei Zharko, Grigory Kozlov [committer] */ + +/// \file GpuStation.h +/// +/// \brief Contains a set of geometry parameters for a particular station +/// +/// A temporary simplified version of the CaStation class, intended for compatibility with GPU code + +#pragma once // include this header only once per compilation unit + +#include "CaGpuField.h" +#include "CaStation.h" + +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + /// Structure Station + /// Contains a set of geometry parameters for a particular station + /// + class GpuStation { + public: + int type; // TODO: replace with L1DetectorID + int timeInfo; ///< flag: if time information can be used + int fieldStatus; ///< flag: 1 - station is INSIDE the field, 0 - station is OUTSIDE the field + float fZ; ///< z position of station [cm] + float Xmax; ///< min radius of the station [cm] + float Ymax; ///< max radius of the station [cm] + + GpuFieldSlice fieldSlice; ///< Magnetic field near the station + + GpuStation() = default; + + /// \brief Copy constructor with type conversion + template<typename DataIn> + GpuStation(const Station<DataIn>& other) + : type(other.GetType()) + , timeInfo(other.GetTimeStatus()) + , fieldStatus(other.GetFieldStatus()) + , fZ(other.template GetZ<float>()) + , Xmax(other.template GetXmax<float>()) + , Ymax(other.template GetYmax<float>()) + , fieldSlice(other.fieldSlice) + { + } + + /// \brief Gets type of the station + int GetType() const { return type; } + + /// \brief Gets time-measurement flag + int GetTimeStatus() const { return timeInfo; } + + /// \brief Gets field status flag + int GetFieldStatus() const { return fieldStatus; } + + /// \brief Gets z-position of the station + XPU_D float GetZ() const { return fZ; } + + /// \brief Gets limit of the station size in x-axis direction + XPU_D float GetXmax() const { return Xmax; } + + /// \brief Gets limit of the station size in x-axis direction + XPU_D float GetXmin() const { return -Xmax; } + + /// \brief Gets limit of the station size in y-axis direction + XPU_D float GetYmax() const { return Ymax; } + + /// \brief Gets limit of the station size in y-axis direction + XPU_D float GetYmin() const { return -Ymax; } + }; + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuTimeMonitor.h b/algo/ca/core/experimental/CaGpuTimeMonitor.h new file mode 100644 index 0000000000000000000000000000000000000000..f8537dd3ab7318c1d63be047f1797f5486f4d51b --- /dev/null +++ b/algo/ca/core/experimental/CaGpuTimeMonitor.h @@ -0,0 +1,105 @@ +/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Grigory Kozlov [committer] */ + +/// \file GpuTimeMonitor.h +/// +/// \brief Time monitor for GPU kernels within the CA algorithm + +#pragma once // include this header only once per compilation unit + +#include <xpu/host.h> + +namespace cbm::algo::ca +{ + + namespace + { + using namespace cbm::algo; + } + + struct XpuTimings { + xpu::timings PrepareData_time[4]; + xpu::timings MakeSinglets_time[4]; + xpu::timings MakeDoublets_time[4]; + xpu::timings CompressDoublets_time[4]; + xpu::timings ResetDoublets_time[4]; + xpu::timings FitDoublets_time[4]; + xpu::timings MakeTriplets_time[4]; + xpu::timings CompressTriplets_time[4]; + xpu::timings ResetTriplets_time[4]; + xpu::timings FitTriplets_time[4]; + xpu::timings SortTriplets_time[4]; + xpu::timings Total_time[4]; + int nIterations; + + void PrintTimings(int iteration) const + { + if (iteration < 0 || iteration >= 4) { + std::cerr << "Error: wrong iteration number: " << iteration << std::endl; + return; + } + + auto print_timing = [](const std::string& name, const xpu::timings& t) { + std::cout << std::left << std::setw(25) << name << "Kernel time: " << std::setw(10) << t.kernel_time() + << " H2D copy: " << std::setw(10) << t.copy(xpu::h2d) << " D2H copy: " << std::setw(10) + << t.copy(xpu::d2h) << " Memset: " << std::setw(10) << t.memset() << " Total: " << std::setw(10) + << t.wall() << std::endl; + }; + + print_timing("PrepareData_time", PrepareData_time[iteration]); + print_timing("MakeSinglets_time", MakeSinglets_time[iteration]); + print_timing("MakeDoublets_time", MakeDoublets_time[iteration]); + print_timing("CompressDoublets_time", CompressDoublets_time[iteration]); + print_timing("ResetDoublets_time", ResetDoublets_time[iteration]); + print_timing("FitDoublets_time", FitDoublets_time[iteration]); + print_timing("MakeTriplets_time", MakeTriplets_time[iteration]); + print_timing("CompressTriplets_time", CompressTriplets_time[iteration]); + print_timing("ResetTriplets_time", ResetTriplets_time[iteration]); + print_timing("FitTriplets_time", FitTriplets_time[iteration]); + print_timing("SortTriplets_time", SortTriplets_time[iteration]); + print_timing("Total_time", Total_time[iteration]); + } + }; + + /// + class GpuTimeMonitor { + public: + /// \brief Default constructor + GpuTimeMonitor() : nEvents(0) {} + + /// \brief Destructor + ~GpuTimeMonitor() = default; + + void AddEventInfo(XpuTimings& event) + { + xpuTimings.push_back(event); + nEvents++; + } + + void SetNEvents(int nEv) { this->nEvents = nEv; } + + int GetNEvents() { return nEvents; } + + xpu::timings GetMakeSingletsTime(int ev, int it) { return xpuTimings[ev].MakeSinglets_time[it]; } + + xpu::timings GetMakeDoubletsTime(int ev, int it) { return xpuTimings[ev].MakeDoublets_time[it]; } + + xpu::timings GetCompressDoubletsTime(int ev, int it) { return xpuTimings[ev].CompressDoublets_time[it]; } + + xpu::timings GetFitDoubletsTime(int ev, int it) { return xpuTimings[ev].FitDoublets_time[it]; } + + xpu::timings GetMakeTripletsTime(int ev, int it) { return xpuTimings[ev].MakeTriplets_time[it]; } + + xpu::timings GetCompressTripletsTime(int ev, int it) { return xpuTimings[ev].CompressTriplets_time[it]; } + + xpu::timings GetFitTripletsTime(int ev, int it) { return xpuTimings[ev].FitTriplets_time[it]; } + + xpu::timings GetTotalTime(int ev, int it) { return xpuTimings[ev].Total_time[it]; } + + private: + std::vector<XpuTimings> xpuTimings; + int nEvents; + }; + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx new file mode 100644 index 0000000000000000000000000000000000000000..5d40dd22191dfc65b72c905581d9b6b8c2eb8c84 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.cxx @@ -0,0 +1,395 @@ +/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Grigory Kozlov [committer] */ + +/// \file CaGpuTrackFinderSetup.cxx +/// \brief The class is responsible for setting up the environment and the order in which kernels are launched to perform tracking using XPU + +#include "CaGpuTrackFinderSetup.h" + +using namespace cbm::algo::ca; + +GpuTrackFinderSetup::GpuTrackFinderSetup(WindowData& wData, const ca::Parameters<fvec>& pars) + : fParameters(pars) + , frWData(wData) + , fIteration(0) +{ +} + +void GpuTrackFinderSetup::SetupParameters() +{ + int nStations = fParameters.GetNstationsActive(); + int nIterations = fParameters.GetCAIterations().size(); + fTripletConstructor.fParams.reset(nIterations, xpu::buf_io); + xpu::h_view vfParams{fTripletConstructor.fParams}; + + for (int i = 0; i < nIterations; i++) { + GpuParameters gpuParams = fParameters; + fTripletConstructor.fParams_const[i] = fParameters; + const auto& caIteration = fParameters.GetCAIterations()[i]; + + gpuParams.fNStations = nStations; + gpuParams.maxSlopePV = caIteration.GetMaxSlopePV(); + gpuParams.maxQp = caIteration.GetMaxQp(); + gpuParams.maxDZ = caIteration.GetMaxDZ(); + fTripletConstructor.fParams_const[i].fNStations = nStations; + fTripletConstructor.fParams_const[i].maxSlopePV = caIteration.GetMaxSlopePV(); + fTripletConstructor.fParams_const[i].maxQp = caIteration.GetMaxQp(); + fTripletConstructor.fParams_const[i].maxDZ = caIteration.GetMaxDZ(); + + if (caIteration.GetElectronFlag()) { + gpuParams.particleMass = constants::phys::ElectronMass; + fTripletConstructor.fParams_const[i].particleMass = constants::phys::ElectronMass; + } + else { + gpuParams.particleMass = constants::phys::MuonMass; + fTripletConstructor.fParams_const[i].particleMass = constants::phys::MuonMass; + } + + gpuParams.primaryFlag = caIteration.GetPrimaryFlag(); + fTripletConstructor.fParams_const[i].primaryFlag = caIteration.GetPrimaryFlag(); + + if (caIteration.GetPrimaryFlag()) { + gpuParams.targB = fParameters.GetVertexFieldValue(); + fTripletConstructor.fParams_const[i].targB = fParameters.GetVertexFieldValue(); + } + else { + gpuParams.targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0); + fTripletConstructor.fParams_const[i].targB = fParameters.GetStation(0).fieldSlice.GetFieldValue(0, 0); + } + + gpuParams.doubletChi2Cut = caIteration.GetDoubletChi2Cut(); + gpuParams.tripletChi2Cut = caIteration.GetTripletChi2Cut(); + gpuParams.tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut(); + gpuParams.isTargetField = (fabs(gpuParams.targB.y) > 0.001); + fTripletConstructor.fParams_const[i].doubletChi2Cut = caIteration.GetDoubletChi2Cut(); + fTripletConstructor.fParams_const[i].tripletChi2Cut = caIteration.GetTripletChi2Cut(); + fTripletConstructor.fParams_const[i].tripletFinalChi2Cut = caIteration.GetTripletFinalChi2Cut(); + + ca::MeasurementXy<float> targMeas(fParameters.GetTargetPositionX()[0], fParameters.GetTargetPositionY()[0], + caIteration.GetTargetPosSigmaX() * caIteration.GetTargetPosSigmaX(), + caIteration.GetTargetPosSigmaY() * caIteration.GetTargetPosSigmaX(), 0, 1, 1); + gpuParams.targetMeasurement = targMeas; + fTripletConstructor.fParams_const[i].targetMeasurement = targMeas; + + vfParams[i] = gpuParams; + } + + fQueue.copy(fTripletConstructor.fParams, xpu::h2d); + + for (int ista = 0; ista < nStations; ista++) { + fTripletConstructor.fStations_const[ista] = fParameters.GetStation(ista); + } + + xpu::set<strGpuTripletConstructor>(fTripletConstructor); +} + +void GpuTrackFinderSetup::SetupGrid() +{ + unsigned int nStations = fParameters.GetNstationsActive(); + unsigned int bin_start = 0; + unsigned int entries_start = 0; + + fTripletConstructor.fvGpuGrid.reset(nStations, xpu::buf_io); + xpu::h_view vfvGpuGrid{fTripletConstructor.fvGpuGrid}; + + for (unsigned int ista = 0; ista < nStations; ista++) { + vfvGpuGrid[ista] = GpuGrid(frWData.Grid(ista), bin_start, entries_start); + bin_start += frWData.Grid(ista).GetFirstBinEntryIndex().size(); + entries_start += frWData.Grid(ista).GetEntries().size(); + } + + fTripletConstructor.fgridFirstBinEntryIndex.reset(bin_start, xpu::buf_io); + fTripletConstructor.fgridEntries.reset(entries_start, xpu::buf_io); + + xpu::h_view vfgridFirstBinEntryIndex{fTripletConstructor.fgridFirstBinEntryIndex}; + xpu::h_view vfgridEntries{fTripletConstructor.fgridEntries}; + + bin_start = entries_start = 0; + + for (unsigned int ista = 0; ista < nStations; ista++) { + std::copy_n(frWData.Grid(ista).GetFirstBinEntryIndex().begin(), frWData.Grid(ista).GetFirstBinEntryIndex().size(), + &vfgridFirstBinEntryIndex[bin_start]); + for (unsigned int i = 0; i < frWData.Grid(ista).GetEntries().size(); i++) { + vfgridEntries[entries_start + i] = frWData.Grid(ista).GetEntries()[i].GetObjectId(); + } + bin_start += frWData.Grid(ista).GetFirstBinEntryIndex().size(); + entries_start += frWData.Grid(ista).GetEntries().size(); + } + fNHits = entries_start; + + fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d); +} + +void GpuTrackFinderSetup::SetupMaterialMap() +{ + ///Set up material map + int bin_start = 0; + int nStations = fParameters.GetNstationsActive(); + int fstStation = 0; + if (nStations == 8) fstStation = 4; + fTripletConstructor.fMaterialMap.reset(nStations, xpu::buf_io); + xpu::h_view vfMaterialMap{fTripletConstructor.fMaterialMap}; + + for (int ista = 0; ista < nStations; ista++) { + vfMaterialMap[ista] = GpuMaterialMap(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista), bin_start); + bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins() + * fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins(); + } + + fTripletConstructor.fMaterialMapTables.reset(bin_start, xpu::buf_io); + xpu::h_view vfMaterialMapTables{fTripletConstructor.fMaterialMapTables}; + + bin_start = 0; + + for (int ista = 0; ista < nStations; ista++) { + std::copy_n(fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().begin(), + fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetTable().size(), + &vfMaterialMapTables[bin_start]); + bin_start += fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins() + * fParameters.GetGeometrySetup().GetMaterial(fstStation + ista).GetNbins(); + } + + fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d); + fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d); +} + +void GpuTrackFinderSetup::SetInputData() +{ + fTripletConstructor.fvHits.reset(frWData.Hits().size(), xpu::buf_io); + xpu::h_view vfvHits{fTripletConstructor.fvHits}; + + std::copy_n(frWData.Hits().begin(), frWData.Hits().size(), &vfvHits[0]); + + fQueue.copy(fTripletConstructor.fvHits, xpu::h2d); +} + +void GpuTrackFinderSetup::SetupIterationData(int iter) +{ + // xpu::scoped_timer t_("SetupIterationData"); + fIteration = iter; + + fTripletConstructor.fIterationData.reset(1, xpu::buf_io); + xpu::h_view vfIterationData{fTripletConstructor.fIterationData}; + vfIterationData[0].fNHits = fNHits; + vfIterationData[0].fIteration = iter; + vfIterationData[0].fNDoublets = 0; + vfIterationData[0].fNDoublets_counter = 0; + vfIterationData[0].fNTriplets = 0; + vfIterationData[0].fNTriplets_counter = 0; + + fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d); +} + +void GpuTrackFinderSetup::RunGpuTrackingSetup() +{ + fNTriplets = 0; + + xpu::device_prop prop{xpu::device::active()}; + + if constexpr (constants::gpu::GpuTimeMonitoring) { + LOG(info) << "Running GPU tracking chain on device " << prop.name() << ", fIteration: " << fIteration; + } + + bool isCpu = xpu::device::active().backend() == xpu::cpu; + int nBlocks = isCpu ? fNHits : std::ceil(float(fNHits) / float(kSingletConstructorBlockSize)); + //nBlocks = 1; + + int nBlocksCD = + nBlocks + * constants::gpu:: + MaxDoubletsFromHit; //std::ceil(float(nBlocks * maxDoubletsFromHit) / float(kSingletConstructorBlockSize)); + + //*** + fTripletConstructor.fvTrackParams.reset(frWData.Hits().size(), xpu::buf_device); + fTripletConstructor.fHitDoublets.reset(frWData.Hits().size() * constants::gpu::MaxDoubletsFromHit, xpu::buf_device); + //*** + fTripletConstructor.fIteration = fIteration; + + xpu::set<strGpuTripletConstructor>(fTripletConstructor); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + fEventTimeMonitor.nIterations = fIteration; + xpu::push_timer("Full_time"); + + xpu::push_timer("PrepareData_time"); + } + + // if(fIteration == 0) fQueue.copy(fTripletConstructor.fvHits, xpu::h2d); + + // fQueue.copy(fTripletConstructor.fIterationData, xpu::h2d); + + // fQueue.copy(fTripletConstructor.fvGpuGrid, xpu::h2d); + + fQueue.copy(fTripletConstructor.fgridFirstBinEntryIndex, xpu::h2d); + + fQueue.copy(fTripletConstructor.fgridEntries, xpu::h2d); + + // if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMap, xpu::h2d); + + // if(fIteration == 0) fQueue.copy(fTripletConstructor.fMaterialMapTables, xpu::h2d); + + fQueue.memset(fTripletConstructor.fHitDoublets, 0); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings pd_time = xpu::pop_timer(); + fEventTimeMonitor.PrepareData_time[fIteration] = pd_time; + + xpu::push_timer("MakeSinglets_time"); + } + + fQueue.launch<MakeSinglets>(xpu::n_blocks(nBlocks)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings sind_time = xpu::pop_timer(); + fEventTimeMonitor.MakeSinglets_time[fIteration] = sind_time; + + xpu::push_timer("CompressDoublets_time"); + } + + fQueue.launch<MakeDoublets>(xpu::n_blocks(nBlocks)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings dind_time = xpu::pop_timer(); + fEventTimeMonitor.MakeDoublets_time[fIteration] = dind_time; + + xpu::push_timer("ResetDoublets_time"); + } + + fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h); + xpu::h_view vfIterationData1{fTripletConstructor.fIterationData}; + + if constexpr (constants::gpu::GpuTimeMonitoring) { + LOG(info) << "GPU tracking :: Doublets found: " << vfIterationData1[0].fNDoublets; + } + + if (vfIterationData1[0].fNDoublets > 0) { + fTripletConstructor.fHitDoubletsCompressed.reset(vfIterationData1[0].fNDoublets, xpu::buf_device); + fTripletConstructor.fvTrackParamsDoublets.reset(vfIterationData1[0].fNDoublets, xpu::buf_device); + fTripletConstructor.fHitTriplets.reset(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet, + xpu::buf_device); + + fQueue.memset(fTripletConstructor.fHitTriplets, 0); + xpu::set<strGpuTripletConstructor>(fTripletConstructor); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings rd_time = xpu::pop_timer(); + fEventTimeMonitor.ResetDoublets_time[fIteration] = rd_time; + + xpu::push_timer("CompressDoublets_time"); + } + + fQueue.launch<CompressDoublets>(xpu::n_blocks(nBlocksCD)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings cd_time = xpu::pop_timer(); + fEventTimeMonitor.CompressDoublets_time[fIteration] = cd_time; + + xpu::push_timer("FitDoublets_time"); + } + + int nBlocksD = isCpu ? vfIterationData1[0].fNDoublets + : std::ceil(float(vfIterationData1[0].fNDoublets) / float(kSingletConstructorBlockSize)); + + fQueue.launch<FitDoublets>(xpu::n_blocks(nBlocksD)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings fd_time = xpu::pop_timer(); + fEventTimeMonitor.FitDoublets_time[fIteration] = fd_time; + + xpu::push_timer("MakeTriplets_time"); + } + + fQueue.launch<MakeTriplets>(xpu::n_blocks(nBlocksD)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings mt_time = xpu::pop_timer(); + fEventTimeMonitor.MakeTriplets_time[fIteration] = mt_time; + + xpu::push_timer("ResetTriplets_time"); + } + + fTripletConstructor.fHitDoublets.reset(0, xpu::buf_device); // TODO: check if we need it for the next iteration + + fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h); + xpu::h_view vfIterationData2{fTripletConstructor.fIterationData}; + + if (vfIterationData2[0].fNTriplets > 0) { + fTripletConstructor.fHitTripletsCompressed.reset(vfIterationData2[0].fNTriplets, xpu::buf_device); + // fTripletConstructor.fvTrackParamsTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_device); + fTripletConstructor.fvTriplets.reset(vfIterationData2[0].fNTriplets, xpu::buf_io); + + int nBlocksCT = isCpu ? vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet + : std::ceil(float(vfIterationData1[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet) + / float(kSingletConstructorBlockSize)); + + if constexpr (constants::gpu::CpuSortTriplets) { + fTripletConstructor.fTripletSortHelper.reset(vfIterationData2[0].fNTriplets, xpu::buf_io); + // fTripletConstructor.fTripletSortHelperTmp.reset(vfIterationData2[0].fNTriplets, xpu::buf_io); + // fTripletConstructor.dst.reset(nBlocksCT, xpu::buf_io); + } + + xpu::set<strGpuTripletConstructor>(fTripletConstructor); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings rt_time = xpu::pop_timer(); + fEventTimeMonitor.ResetTriplets_time[fIteration] = rt_time; + + LOG(info) << "GPU tracking :: Triplets found: " << vfIterationData2[0].fNTriplets; + + xpu::push_timer("CompressTriplets_time"); + } + + fQueue.launch<CompressTriplets>(xpu::n_blocks(nBlocksCT)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings ct_time = xpu::pop_timer(); + fEventTimeMonitor.CompressTriplets_time[fIteration] = ct_time; + + xpu::push_timer("FitTriplets_time"); + } + + fQueue.launch<FitTriplets>( + xpu::n_blocks(/*fTripletConstructor.fIterationData[0].fNTriplets*/ vfIterationData2[0].fNTriplets)); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings ft_time = xpu::pop_timer(); + fEventTimeMonitor.FitTriplets_time[fIteration] = ft_time; + + xpu::push_timer("SortTriplets_time"); + } + + if constexpr (constants::gpu::CpuSortTriplets) { + fQueue.launch<SortTriplets>(xpu::n_blocks(vfIterationData2[0].fNTriplets)); + } + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings srt_time = xpu::pop_timer(); + fEventTimeMonitor.SortTriplets_time[fIteration] = srt_time; + } + + fQueue.copy(fTripletConstructor.fvTriplets, xpu::d2h); + fQueue.copy(fTripletConstructor.fIterationData, xpu::d2h); + + if constexpr (constants::gpu::CpuSortTriplets) { + fQueue.copy(fTripletConstructor.fTripletSortHelper, xpu::d2h); + } + + xpu::h_view vfIterationData3{fTripletConstructor.fIterationData}; + fNTriplets = vfIterationData3[0].fNTriplets; + } + } + + fTripletConstructor.fIterationData.reset(0, xpu::buf_device); + fTripletConstructor.fvGpuGrid.reset(0, xpu::buf_io); + fTripletConstructor.fgridFirstBinEntryIndex.reset(0, xpu::buf_io); + fTripletConstructor.fgridEntries.reset(0, xpu::buf_io); + + xpu::set<strGpuTripletConstructor>(fTripletConstructor); //TODO: check if we need to reset all the buffers here + + if constexpr (constants::gpu::GpuTimeMonitoring) { + xpu::timings t = xpu::pop_timer(); + fEventTimeMonitor.Total_time[fIteration] = t; + fEventTimeMonitor.PrintTimings(fIteration); + } +} diff --git a/algo/ca/core/experimental/CaGpuTrackFinderSetup.h b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h new file mode 100644 index 0000000000000000000000000000000000000000..288eaa39ff00f9e37871e0218352af1ea8690f8c --- /dev/null +++ b/algo/ca/core/experimental/CaGpuTrackFinderSetup.h @@ -0,0 +1,92 @@ +/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Grigory Kozlov [committer] */ + +/// \file CaGpuTrackFinderSetup.h +/// \brief The class is responsible for setting up the environment and the order in which kernels are launched to perform tracking using XPU + +#pragma once // include this header only once per compilation unit + +#include "CaBranch.h" +#include "CaGpuParameters.h" +#include "CaGpuTimeMonitor.h" +#include "CaGpuTripletConstructor.h" +#include "CaHit.h" +#include "CaVector.h" +#include "CaWindowData.h" +#include "KfTrackParam.h" + +#include <xpu/host.h> + +namespace cbm::algo::ca +{ + class GpuTrackFinderSetup { + public: + /// ------ Constructors and destructor ------ + + /// Constructor + /// \param nThreads Number of threads for multi-threaded mode + GpuTrackFinderSetup(ca::WindowData& wData, const ca::Parameters<fvec>& pars); + + /// Copy constructor + GpuTrackFinderSetup(const GpuTrackFinderSetup&) = delete; + + /// Move constructor + GpuTrackFinderSetup(GpuTrackFinderSetup&&) = delete; + + /// Copy assignment operator + GpuTrackFinderSetup& operator=(const GpuTrackFinderSetup&) = delete; + + /// Move assignment operator + GpuTrackFinderSetup& operator=(GpuTrackFinderSetup&&) = delete; + + /// Destructor + ~GpuTrackFinderSetup() = default; + + /// ------ Public member functions ------ + + ///Set the GPU tracking parameters + void SetupParameters(); + + /// Set the input grid data for the GPU tracking + void SetupGrid(); + + /// Set the material map for the GPU tracking + void SetupMaterialMap(); + + /// Set the input data for the GPU tracking + void SetInputData(); + + /// Set the iteration data + void SetupIterationData(int iter); + + /// Run the track finding algorithm chain + void RunGpuTrackingSetup(); + + /// Get the number of triplets + unsigned int GetNofTriplets() const { return fNTriplets; } //fTripletConstructor.fTempData[0].fNTriplets; } + + ca::Triplet& GetTriplet(int itr) { return fTripletConstructor.fvTriplets[itr]; } //TODO: switch back + // ca::Triplet& GetTriplet(int itr) { return fvTriplets[itr]; }//fTripletConstructor.fvTriplets[itr]; } //TODO: switch back + + /// Get timings + XpuTimings& GetTimings() { return fEventTimeMonitor; } + + /// Get triplets + // const Vector<ca::Triplet>& GetTriplets() const { return fTripletConstructor.fvTriplets; } + + // static constexpr bool fGPU_time_monitoring = true; // print debug info + + private: + const Parameters<fvec>& fParameters; ///< Object of Framework parameters class + WindowData& frWData; ///< Reference to the window data + xpu::queue fQueue; ///< GPU queue TODO: initialization is ~220 ms. Why and how to avoid? + ca::GpuTripletConstructor fTripletConstructor; ///< GPU triplet constructor + unsigned int fIteration; ///< Iteration number + int fNHits; ///< Number of hits + + int fNTriplets; ///< Number of triplets + + XpuTimings fEventTimeMonitor; + }; +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.cxx b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx new file mode 100644 index 0000000000000000000000000000000000000000..bd21eeb5bc4f0e814de5a4481322cadfb1c6c613 --- /dev/null +++ b/algo/ca/core/experimental/CaGpuTripletConstructor.cxx @@ -0,0 +1,1458 @@ +/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Grigory Kozlov [committer] */ + +/// \file CaGpuTripletConstructor.cxx +/// \brief The class contains data and kernels for running CA tracking on CPU and GPU using XPU libraries + + +#include "CaGpuTripletConstructor.h" + +using namespace cbm::algo::ca; + +// Export Constants +XPU_EXPORT(strGpuTripletConstructor); + +// Kernel Definitions +//XPU_EXPORT(TestFunc); +//XPU_D void TestFunc::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().TestFunc(ctx); } + +XPU_EXPORT(MakeSinglets); +XPU_D void MakeSinglets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeSinglets(ctx); } + +XPU_EXPORT(MakeDoublets); +XPU_D void MakeDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeDoublets(ctx); } + +XPU_EXPORT(CompressDoublets); +XPU_D void CompressDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().CompressDoublets(ctx); } + +XPU_EXPORT(FitDoublets); +XPU_D void FitDoublets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitDoublets(ctx); } + +XPU_EXPORT(MakeTriplets); +XPU_D void MakeTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().MakeTriplets(ctx); } + +XPU_EXPORT(CompressTriplets); +XPU_D void CompressTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().CompressTriplets(ctx); } + +XPU_EXPORT(FitTriplets); +XPU_D void FitTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().FitTriplets(ctx); } + +XPU_EXPORT(SortTriplets); +XPU_D void SortTriplets::operator()(context& ctx) { ctx.cmem<strGpuTripletConstructor>().SortTripletsFunc(ctx); } + +//XPU_D void GpuTripletConstructor::TestFunc(TestFunc::context& ctx) const +//{ +// const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); +//} + +XPU_D void GpuTripletConstructor::MakeSinglets(MakeSinglets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + + if (iGThread >= fIterationData[0].fNHits) return; + + ca::HitIndex_t ihitl = fgridEntries[iGThread]; //iGThread; + int ista = fvHits[ihitl].Station(); + + // if(ista < 0 || ista > (int)fParams[fIterationData[0].fIteration].fNStations - 3) return; + if (ista < 0 || ista > (int) fParams_const[fIteration].fNStations - 3) return; + + // const ca::Station<float>* sta_l = &fParams[fIterationData[0].fIteration].GetStation(ista); + // const ca::Station<float>* sta_m = &fParams[fIterationData[0].fIteration].GetStation(ista + 1); + const ca::GpuStation* sta_l = &fStations_const[ista]; + const ca::GpuStation* sta_m = &fStations_const[ista + 1]; + + float stal_l_timeInfo = sta_l->timeInfo; + float sta_m_z = sta_m->fZ; + + const auto& hitl = fvHits[ihitl]; + + // const float dzli = 1.f / (hitl.Z() - fParams[fIterationData[0].fIteration].GetTargetPositionZ()); + // const float tx = (hitl.X() - fParams[fIterationData[0].fIteration].GetTargetPositionX()) * dzli; + // const float ty = (hitl.Y() - fParams[fIterationData[0].fIteration].GetTargetPositionY()) * dzli; + const float dzli = 1.f / (hitl.Z() - fParams_const[fIteration].GetTargetPositionZ()); + const float tx = (hitl.X() - fParams_const[fIteration].GetTargetPositionX()) * dzli; + const float ty = (hitl.Y() - fParams_const[fIteration].GetTargetPositionY()) * dzli; + + kf::TrackParamBase<float>& fit_params = fvTrackParams[ihitl]; + + fit_params.X() = hitl.X(); + fit_params.Y() = hitl.Y(); + fit_params.Z() = hitl.Z(); + fit_params.Tx() = tx; + fit_params.Ty() = ty; + fit_params.Qp() = 0.; + fit_params.Time() = hitl.T(); + // fit_params.Vi() = fParams[fIterationData[0].fIteration].SpeedOfLightInv;//0.03335641;//constants::phys::SpeedOfLightInv; + fit_params.Vi() = constants::phys::SpeedOfLightInv; //fParams_const[fIteration].SpeedOfLightInv; + + // float txErr2 = fParams[fIterationData[0].fIteration].maxSlopePV * fParams[fIterationData[0].fIteration].maxSlopePV / 9.; + // float qpErr2 = fParams[fIterationData[0].fIteration].maxQp * fParams[fIterationData[0].fIteration].maxQp / 9.; + float txErr2 = fParams_const[fIteration].maxSlopePV * fParams_const[fIteration].maxSlopePV / 9.; + float qpErr2 = fParams_const[fIteration].maxQp * fParams_const[fIteration].maxQp / 9.; + + fit_params.ResetErrors(1., 1., txErr2, txErr2, qpErr2, (stal_l_timeInfo ? hitl.dT2() : 1.e6), 1.e10); + + fit_params.InitVelocityRange(1. / /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp); + + fit_params.C(0, 0) = hitl.dX2(); + fit_params.C(1, 0) = hitl.dXY(); + fit_params.C(1, 1) = hitl.dY2(); + + // fit_params.Ndf() = fParams[fIterationData[0].fIteration].primaryFlag ? 2. : 0.; + fit_params.Ndf() = fParams_const[fIteration].primaryFlag ? 2. : 0.; + fit_params.NdfTime() = (stal_l_timeInfo ? 0 : -1); + + // ca::GpuFieldRegion fld0; + { + int sta1 = ista; + if (sta1 == 0) sta1 = 1; + int sta0 = sta1 / 2; // station between ista and the target + // const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(sta0).fieldSlice.GetFieldValueForLine(fit_params); + // const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(sta1).fieldSlice.GetFieldValueForLine(fit_params); + const ca::GpuFieldValue& B0 = fStations_const[sta0].fieldSlice.GetFieldValueForLine(fit_params); + const ca::GpuFieldValue& B1 = fStations_const[sta1].fieldSlice.GetFieldValueForLine(fit_params); + //fld0.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), B0, fParams[fIterationData[0].fIteration].GetStation(sta0).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(sta1).fZ); + ctx.smem().fld0_shared[ctx.thread_idx_x()].Set(fParams_const[fIteration].targB, + fParams_const[fIteration].GetTargetPositionZ(), B0, + fStations_const[sta0].fZ, B1, fStations_const[sta1].fZ); + } + // ca::GpuFieldRegion fld1; + { // field, made by the left hit, the middle station and the right station + // const ca::GpuFieldValue &B0 = fParams[fIterationData[0].fIteration].GetStation(ista)*/fStations_const[ista]/*ctx.smem().fStations_shared[ista]*/.fieldSlice.GetFieldValueForLine(fit_params); + // const ca::GpuFieldValue &B1 = fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params); + // const ca::GpuFieldValue &B2 = fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params); + const ca::GpuFieldValue& B0 = fStations_const[ista].fieldSlice.GetFieldValueForLine(fit_params); + const ca::GpuFieldValue& B1 = fStations_const[ista + 1].fieldSlice.GetFieldValueForLine(fit_params); + const ca::GpuFieldValue& B2 = fStations_const[ista + 2].fieldSlice.GetFieldValueForLine(fit_params); + // fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ); + ctx.smem().fld1_shared[ctx.thread_idx_x()].Set(B0, fStations_const[ista].fZ, B1, fStations_const[ista + 1].fZ, B2, + fStations_const[ista + 2].fZ); + } + + FilterWithTargetAtLine(&fit_params, &/*fld0*/ ctx.smem().fld0_shared[ctx.thread_idx_x()]); + + MultipleScattering(&fit_params, fMaterialMapTables[fMaterialMap[ista].GetBin(fit_params.GetX(), fit_params.GetY())], + /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].maxQp); + + // extrapolate to the middle hit + ExtrapolateStep(&fit_params, sta_m_z, 0.f, &/*fld1*/ ctx.smem().fld1_shared[ctx.thread_idx_x()]); + + // ExtrapolateLine(tr_par, sta_m->fZ, F1); //TODO: check if ExtrapolateStep is enough +} + +XPU_D void GpuTripletConstructor::MakeDoublets(MakeDoublets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + if (iGThread >= fIterationData[0].fNHits) return; + +#if 0 + if (ctx.thread_idx_x() == 0) { + ctx.smem().counter = 0; + } +#endif + + ca::HitIndex_t ihitl = fgridEntries[iGThread]; //iGThread; + int ista = fvHits[ihitl].Station(); + + if (ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return; + + int collectedHits = 0; + int maxNHits = constants::gpu::MaxDoubletsFromHit; //fParams[fIterationData[0].fIteration].maxDoubletsFromHit; + + CollectHits(iGThread, ista + 1, ihitl, maxNHits, &collectedHits); + + xpu::atomic_add(&(fIterationData[0].fNDoublets), collectedHits); + +#if 0 + xpu::atomic_add(&(ctx.smem().counter), collectedHits); + + xpu::barrier(ctx.pos()); + if(ctx.thread_idx_x() == 0) { + xpu::atomic_add(&(fIterationData[0].fNDoublets), ctx.smem().counter); + } +#endif +} + +XPU_D void GpuTripletConstructor::CompressDoublets(CompressDoublets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + + // if( iGThread >= fIterationData[0].fNHits * fParams[fIterationData[0].fIteration].maxDoubletsFromHit ) return; + if (iGThread >= fIterationData[0].fNHits * constants::gpu::MaxDoubletsFromHit) return; + + if (fHitDoublets[iGThread].hit2 != 0) { + int index = xpu::atomic_add(&(fIterationData[0].fNDoublets_counter), 1); + fHitDoubletsCompressed[index].hit1 = fHitDoublets[iGThread].hit1; + fHitDoubletsCompressed[index].hit2 = fHitDoublets[iGThread].hit2; + } +} + +XPU_D void GpuTripletConstructor::FitDoublets(FitDoublets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + if (iGThread >= fIterationData[0].fNDoublets) return; + + int hit_l = fHitDoubletsCompressed[iGThread].hit1; + int hit_m = fHitDoubletsCompressed[iGThread].hit2; + + if (hit_l < 0 || hit_m < 0) return; + + int ista = fvHits[hit_l].Station(); + + if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return; + + const ca::GpuStation& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista); + const ca::GpuStation& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1); + + auto fit_params = fvTrackParams[hit_l]; + + bool isMomentumFitted = + (fParams[fIterationData[0].fIteration].isTargetField || (sta_l.fieldStatus != 0) || (sta_m.fieldStatus != 0)); + float maxQp = fParams[fIterationData[0].fIteration].maxQp; + + // fit doublet + const ca::Hit& hitm = fvHits[hit_m]; + + float z_2 = hitm.Z(); + ca::MeasurementXy<float> m_2(hitm.X(), hitm.Y(), hitm.dX2(), hitm.dY2(), hitm.dXY(), 1.f, 1.f); + float t_2 = hitm.T(); + float dt2_2 = hitm.dT2(); + + // add the middle hit + ExtrapolateLineNoField(&fit_params, z_2); + + FilterXY(&fit_params, m_2); + + FilterTime(&fit_params, t_2, dt2_2, sta_m.timeInfo); + + // fFit.SetQp0(isMomentumFitted ? fFit.Tr().GetQp() : maxQp); //TODO: check why it is required + const float qp0 = isMomentumFitted ? fit_params.GetQp() : maxQp; + + MultipleScattering(&fit_params, + fMaterialMapTables[fMaterialMap[ista + 1].GetBin(fit_params.GetX(), fit_params.GetY())], qp0); + + fvTrackParamsDoublets[iGThread] = fit_params; + // fFit.SetQp0(fFit.Tr().Qp()); //TODO: check why it is required and how it is used later, have to be stored different way +} + +XPU_D void GpuTripletConstructor::MakeTriplets(MakeTriplets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + if (iGThread >= fIterationData[0].fNDoublets) return; + + int hit_l = fHitDoubletsCompressed[iGThread].hit1; + // int hit_m = fHitDoubletsCompressed[iGThread].hit2; + +#if 0 + if (ctx.thread_idx_x() == 0) { + ctx.smem().counter = 0; + } +#endif + + int ista = fvHits[hit_l].Station(); + + if (ista < 0 || ista > (int) fParams[fIterationData[0].fIteration].fNStations - 3) return; + + // + ista = fvHits[hit_l].Station(); + // + + // const ca::Station<float>& sta_l = fParams[fIterationData[0].fIteration].GetStation(ista); + // const ca::Station<float>& sta_m = fParams[fIterationData[0].fIteration].GetStation(ista + 1); + const ca::GpuStation& sta_r = fParams[fIterationData[0].fIteration].GetStation(ista + 2); + + auto& fit_params = fvTrackParamsDoublets[iGThread]; + + ca::GpuFieldRegion + fld1; //TODO: with fit_params(doublets) this is not fFldL from orig, but it should be more precise, need to be checker and clarified + { // field, made by the left hit, the middle station and the right station + ca::GpuFieldValue B0 = + fParams[fIterationData[0].fIteration].GetStation(ista).fieldSlice.GetFieldValueForLine(fit_params); + ca::GpuFieldValue B1 = + fParams[fIterationData[0].fIteration].GetStation(ista + 1).fieldSlice.GetFieldValueForLine(fit_params); + ca::GpuFieldValue B2 = + fParams[fIterationData[0].fIteration].GetStation(ista + 2).fieldSlice.GetFieldValueForLine(fit_params); + fld1.Set(B0, fParams[fIterationData[0].fIteration].GetStation(ista).fZ, B1, + fParams[fIterationData[0].fIteration].GetStation(ista + 1).fZ, B2, + fParams[fIterationData[0].fIteration].GetStation(ista + 2).fZ); + } + + ExtrapolateStep(&fit_params, sta_r.fZ, 0.f, &fld1); + + //TODO: add isDoubletGood check if needed + + int collectedHits = 0; + int maxNHits = + constants::gpu::MaxTripletsFromDoublet; //fParams[fIterationData[0].fIteration].maxTripletsFromDoublet; + + CollectHitsTriplets(iGThread, ista + 2, maxNHits, &collectedHits); + + xpu::atomic_add(&(fIterationData[0].fNTriplets), collectedHits); + +#if 0 + xpu::atomic_add(&(ctx.smem().counter), collectedHits); + + xpu::barrier(ctx.pos()); + + if (ctx.thread_idx_x () == 0) { + fIterationData[0].fNTriplets += ctx.smem().counter; + } +#endif +} + +XPU_D void GpuTripletConstructor::CompressTriplets(CompressTriplets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + // if( iGThread >= fIterationData[0].fNDoublets * fParams[fIterationData[0].fIteration].maxTripletsFromDoublet ) return; + if (iGThread >= fIterationData[0].fNDoublets * constants::gpu::MaxTripletsFromDoublet) return; + + if (fHitTriplets[iGThread].hit3 != 0) { + int index = xpu::atomic_add(&(fIterationData[0].fNTriplets_counter), 1); + fHitTripletsCompressed[index].hit1 = fHitTriplets[iGThread].hit1; + fHitTripletsCompressed[index].hit2 = fHitTriplets[iGThread].hit2; + fHitTripletsCompressed[index].hit3 = fHitTriplets[iGThread].hit3; + } +} + +XPU_D void GpuTripletConstructor::FitTriplets(FitTriplets::context& ctx) const +{ + const int iGThread = ctx.block_dim_x() * ctx.block_idx_x() + ctx.thread_idx_x(); + + if (iGThread >= fIterationData[0].fNTriplets) return; + + int hit_l = fHitTripletsCompressed[iGThread].hit1; + + int ista[3] = {-1, -1, -1}; + + ista[0] = fvHits[hit_l].Station(); + ista[1] = ista[0] + 1; + ista[2] = ista[1] + 1; + + // ca::GpuStation sta[3]; + const ca::GpuStation* sta[3]; + for (int is = 0; is < 3; ++is) { + // sta[is] = fParams[fIterationData[0].fIteration].GetStation(ista[is]); + sta[is] = &fStations_const[ista[is]]; + }; + + bool isMomentumFitted = ((sta[0]->fieldStatus != 0) || (sta[1]->fieldStatus != 0) || (sta[2]->fieldStatus != 0)); + + float ndfTrackModel = 4; // straight line + ndfTrackModel += isMomentumFitted ? 1 : 0; // track with momentum + + const float maxQp = fParams[fIterationData[0].fIteration].maxQp; + + ca::HitIndex_t ihit[3] = {fHitTripletsCompressed[iGThread].hit1, fHitTripletsCompressed[iGThread].hit2, + fHitTripletsCompressed[iGThread].hit3}; + + float x[3], y[3], z[3], t[3]; + float dt2[3]; + ca::MeasurementXy<float> mxy[3]; + + for (int ih = 0; ih < 3; ++ih) { + const ca::Hit& hit = fvHits[ihit[ih]]; + mxy[ih] = ca::MeasurementXy<float>(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f); + + x[ih] = hit.X(); + y[ih] = hit.Y(); + z[ih] = hit.Z(); + t[ih] = hit.T(); + dt2[ih] = hit.dT2(); + }; + + // find the field along the track + + ca::GpuFieldValue B[3]; + ca::GpuFieldRegion fld; + ca::GpuFieldRegion fldTarget; + + float tx[3] = {(x[1] - x[0]) / (z[1] - z[0]), (x[2] - x[0]) / (z[2] - z[0]), (x[2] - x[1]) / (z[2] - z[1])}; + float ty[3] = {(y[1] - y[0]) / (z[1] - z[0]), (y[2] - y[0]) / (z[2] - z[0]), (y[2] - y[1]) / (z[2] - z[1])}; + for (int ih = 0; ih < 3; ++ih) { + float dz = (sta[ih]->fZ - z[ih]); + B[ih] = sta[ih]->fieldSlice.GetFieldValue(x[ih] + tx[ih] * dz, y[ih] + ty[ih] * dz); + }; + + fld.Set(B[0], sta[0]->fZ, B[1], sta[1]->fZ, B[2], sta[2]->fZ); + fldTarget.Set(fParams[fIterationData[0].fIteration].targB, fParams[fIterationData[0].fIteration].GetTargetPositionZ(), + B[0], sta[0]->fZ, B[1], sta[1]->fZ); + + kf::TrackParamBase<float> + fit_params; // = fvTrackParamsTriplets[iGThread]; //TODO: set all to 0 and recheck - done, no difference + // auto *fit_params = &ctx.smem().fit_params_shared[ctx.thread_idx_x()]; //TODO: shared memory cannot be used for classes with preinitialized data + + fit_params.ResetCovMatrix(); + + // initial parameters + { + float dz01 = 1.f / (z[1] - z[0]); + fit_params.Tx() = (x[1] - x[0]) * dz01; + fit_params.Ty() = (y[1] - y[0]) * dz01; + fit_params.Qp() = 0.f; + fit_params.Vi() = 0.f; + } + + bool not_fitted = false; + float qp0 = 0; + // repeat several times in order to improve the precision + for (int iiter = 0; iiter < 2 /*nIterations*/; ++iiter) { + // fit forward + { + qp0 = fit_params.GetQp(); + if (qp0 > maxQp) qp0 = maxQp; + if (qp0 < -maxQp) qp0 = -maxQp; + + fit_params.Qp() = 0.f; + fit_params.Vi() = 0.f; + fit_params.X() = x[0]; + fit_params.Y() = y[0]; + fit_params.Z() = z[0]; + fit_params.Time() = t[0]; + fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[0]->timeInfo ? dt2[0] : 1.e6f), 1.e2f); + fit_params.C(0, 0) = mxy[0].Dx2(); + fit_params.C(1, 0) = mxy[0].Dxy(); + fit_params.C(1, 1) = mxy[0].Dy2(); + + fit_params.Ndf() = -ndfTrackModel + 2; + fit_params.NdfTime() = sta[0]->timeInfo ? 0 : -1; + // add the target constraint + FilterWithTargetAtLine(&fit_params, &fldTarget); + for (int ih = 1; ih < 3; ++ih) { + ExtrapolateStep(&fit_params, z[ih], qp0, &fld); + + // Extrapolate(&fit_params, z[ih], 0.f, &fld); //TODO: add qp0 instead of 0.f + + int bin = fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY()); + auto radThick = fMaterialMapTables[bin]; + + MultipleScattering(&fit_params, radThick, qp0); //TODO: why it was commented in original gpu code? + + float Tx = fit_params.Tx(); + float Ty = fit_params.Ty(); + float txtx = Tx * Tx; + float tyty = Ty * Ty; + float txtx1 = txtx + 1.f; + float h = txtx + tyty; + float tt = xpu::sqrt(txtx1 + tyty); + float h2 = h * h; + float qp0t = qp0 * tt; + float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f; + + float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t; + float a = ((tt + + fParams[fIterationData[0].fIteration].particleMass + * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) + * radThick * s0 * s0); + // fit_params.C(2, 2) += 0.001;//txtx1 * a; //TODO: check if it is needed. Switching on leads to the strong ghosts increase + + fit_params.C(3, 2) += Tx * Ty * a; + fit_params.C(3, 3) += (1. + tyty) * a; + + EnergyLossCorrection(&fit_params, radThick, -1.f); + FilterXY(&fit_params, mxy[ih]); + FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo); + } + } + + if (iiter == 1) break; + + // fit backward + { + qp0 = fit_params.GetQp(); + if (qp0 > maxQp) qp0 = maxQp; + if (qp0 < -maxQp) qp0 = -maxQp; + + fit_params.X() = x[2]; + fit_params.Y() = y[2]; + fit_params.Z() = z[2]; + fit_params.Time() = t[2]; + fit_params.Qp() = 0.f; + fit_params.Vi() = 0.f; + + fit_params.ResetErrors(1.f, 1.f, 1.f, 1.f, 100., (sta[2]->timeInfo ? dt2[2] : 1.e6f), 1.e2f); + + fit_params.Ndf() = -ndfTrackModel + 2; + fit_params.NdfTime() = sta[2]->timeInfo ? 0 : -1; + + for (int ih = 1; ih >= 0; --ih) { + ExtrapolateStep(&fit_params, z[ih], qp0, &fld); + auto radThick = fMaterialMapTables[fMaterialMap[ista[ih]].GetBin(fit_params.GetX(), fit_params.GetY())]; + MultipleScattering(&fit_params, radThick, qp0); + EnergyLossCorrection(&fit_params, radThick, 1.f); + FilterXY(&fit_params, mxy[ih]); + FilterTime(&fit_params, t[ih], dt2[ih], sta[ih]->timeInfo); + } + } + } // for iiter + + float chi2 = fit_params.GetChiSq() + fit_params.GetChiSqTime(); + + if (chi2 > fParams[fIterationData[0].fIteration].tripletFinalChi2Cut || !xpu::isfinite(chi2) || chi2 < 0.f + || not_fitted) { + chi2 = -1.; + } + + const float qp = fit_params.GetQp(); + const float Cqp = fit_params.C(4, 4) + 0.001; + fvTriplets[iGThread] = + ca::Triplet(ihit[0], ihit[1], ihit[2], ista[0], ista[1], ista[2], 0, 0, 0, chi2, qp, Cqp, fit_params.Tx(), + fit_params.C(2, 2), fit_params.Ty(), fit_params.C(3, 3), isMomentumFitted); + + if constexpr (constants::gpu::GpuSortTriplets) { + fTripletSortHelper[iGThread].tr_id = iGThread; + // fTripletSortHelper[iGThread].lsta = ista[0]; + fTripletSortHelper[iGThread].lhit = ihit[0]; + } +} + +XPU_D void GpuTripletConstructor::SortTripletsFunc(SortTriplets::context& ctx) const +{ + //TODO: implement sorting of triplets on GPU or make tracking order independent +} + +XPU_D void GpuTripletConstructor::CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const +{ + // Collect hits in the station iSta and make doublets + const ca::GpuStation& sta = fParams[fIterationData[0].fIteration].GetStation(iSta); + kf::TrackParamBase<float> fit_params = fvTrackParams[iHit]; + + fit_params.ChiSq() = 0.; + + const float Pick_m22 = fParams[fIterationData[0].fIteration].doubletChi2Cut - fit_params.GetChiSq(); + + const float timeError2 = fit_params.C(5, 5); + const float time = fit_params.Time(); + + const auto& gpuGrid = fvGpuGrid[iSta]; + + const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX() + + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx()); + const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY() + + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty()); + + ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]), + &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(), + pick_range_x, pick_range_y); + unsigned int ih_gpu = 0; + + while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) { // loop over station hits + // if (fvbHitSuppressed[ih]) { + // continue; + // } + const ca::Hit& hit = fvHits[ih_gpu]; + + // check time-boundaries + if ((sta.timeInfo) && (fit_params.NdfTime() >= 0)) { + if (xpu::abs(time - hit.T()) > 1.4 * (3.5 * xpu::sqrt(timeError2) + hit.RangeT())) { + continue; + } + // if (fabs(time - hit.T()) > 30) continue; + } + + // - check whether hit belong to the window ( track position +\- errors ) - + + // check y-boundaries + float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z()); + float C11 = + fit_params.C(1, 1) + + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3)); + + float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY(); + + const float dY = hit.Y() - y; + + if (xpu::abs(dY) > dy_est) { + continue; + } + + // check x-boundaries + float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z()); + float C00 = + fit_params.C(0, 0) + + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2)); + + float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX(); + + const float dX = hit.X() - x; + + if (xpu::abs(dX) > dx_est) { + continue; + } + + // check chi2 + float C10 = fit_params.C(1, 0) + + (hit.Z() - fit_params.Z()) + * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2)); + + ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f); + + float chi2x = 0.; + { // filter X measurement + const float zeta = x - mxy.X(); + + const float F0 = C00; + const float F1 = C10; + + const float HCH = F0; + + const float wi = 1.f / (mxy.Dx2() + HCH); + const float zetawi = zeta * wi; + chi2x = mxy.NdfX() * zeta * zetawi; + + const float K1 = F1 * wi; + + x -= F0 * zetawi; + y -= F1 * zetawi; + + C00 -= F0 * F0 * wi; + C10 -= K1 * F0; + C11 -= K1 * F1; + } + + float chi2u = 0.; + { // filter U measurement, we need only chi2 here + const float cosPhi = -mxy.Dxy() / mxy.Dx2(); + const float u = cosPhi * mxy.X() + mxy.Y(); + const float du2 = mxy.Dy2() + cosPhi * mxy.Dxy(); + + const float zeta = cosPhi * x + y - u; + + const float F0 = cosPhi * C00 + C10; + const float F1 = cosPhi * C10 + C11; + + const float HCH = (F0 * cosPhi + F1); + + chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH); + } + + if (chi2x > fParams[fIterationData[0].fIteration].doubletChi2Cut) { + continue; + } + if (chi2x + chi2u > fParams[fIterationData[0].fIteration].doubletChi2Cut) { + continue; + } + int id = iThread * maxNHits + *collectedHits; + fHitDoublets[id].hit1 = iHit; + fHitDoublets[id].hit2 = ih_gpu; + *collectedHits += 1; + //TODO: add clone hit suppression -> should be done after doublet fitting actually + } // loop over the hits in the area +} + +XPU_D void GpuTripletConstructor::CollectHitsTriplets(int iThread, int iSta, int maxNHits, int* collectedHits) const +{ + const ca::GpuStation& sta = fParams[fIterationData[0].fIteration].GetStation(iSta); + kf::TrackParamBase<float> fit_params = fvTrackParamsDoublets[iThread]; + + fit_params.ChiSq() = 0.; + + const float Pick_m22 = fParams[fIterationData[0].fIteration].tripletChi2Cut - fit_params.GetChiSq(); + + const float timeError2 = fit_params.C(5, 5); + const float time = fit_params.Time(); + + const auto& gpuGrid = fvGpuGrid[iSta]; + + const float pick_range_x = xpu::sqrt(Pick_m22 * fit_params.C(0, 0)) + gpuGrid.GetMaxRangeX() + + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Tx()); + const float pick_range_y = xpu::sqrt(Pick_m22 * fit_params.C(1, 1)) + gpuGrid.GetMaxRangeY() + + fParams[fIterationData[0].fIteration].maxDZ * abs(fit_params.Ty()); + + ca::GpuGridArea areaGpu(gpuGrid, &(fgridFirstBinEntryIndex[fvGpuGrid[iSta].GetBinEntryIndexStart()]), + &(fgridEntries[fvGpuGrid[iSta].GetEntriesStart()]), fit_params.X(), fit_params.Y(), + pick_range_x, pick_range_y); + unsigned int ih_gpu = 0; + + while (areaGpu.GetNextObjectId(ih_gpu) && (*collectedHits < maxNHits)) { // loop over station hits + // if (fvbHitSuppressed[ih]) { + // continue; + // } + const ca::Hit& hit = fvHits[ih_gpu]; + + // check time-boundaries + + //TODO: move hardcoded cuts to parameters + if ((sta.timeInfo) && (fit_params.NdfTime() >= 0)) { + if (xpu::abs(time - hit.T()) > 1.4 * (3.5 * xpu::sqrt(timeError2) + hit.RangeT())) { + continue; + } + // if (fabs(time - hit.T()) > 30) continue; + } + + // - check whether hit belong to the window ( track position +\- errors ) - + + // check y-boundaries + float y = fit_params.Y() + fit_params.Ty() * (hit.Z() - fit_params.Z()); + float C11 = + fit_params.C(1, 1) + + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(3, 1) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 3)); + + float dy_est = xpu::sqrt(Pick_m22 * C11) + hit.RangeY(); + + const float dY = hit.Y() - y; + + if (xpu::abs(dY) > dy_est) { + continue; + } + + // check x-boundaries + float x = fit_params.X() + fit_params.Tx() * (hit.Z() - fit_params.Z()); + float C00 = + fit_params.C(0, 0) + + (hit.Z() - fit_params.Z()) * (2 * fit_params.C(2, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(2, 2)); + + float dx_est = xpu::sqrt(Pick_m22 * C00) + hit.RangeX(); + + const float dX = hit.X() - x; + + if (fabs(dX) > dx_est) { + continue; + } + + // check chi2 + float C10 = fit_params.C(1, 0) + + (hit.Z() - fit_params.Z()) + * (fit_params.C(2, 1) + fit_params.C(3, 0) + (hit.Z() - fit_params.Z()) * fit_params.C(3, 2)); + + ca::MeasurementXy<float> mxy(hit.X(), hit.Y(), hit.dX2(), hit.dY2(), hit.dXY(), 1.f, 1.f); + + float chi2x = 0.; + { // filter X measurement + const float zeta = x - mxy.X(); + + const float F0 = C00; + const float F1 = C10; + + const float HCH = F0; + + const float wi = 1.f / (mxy.Dx2() + HCH); + const float zetawi = zeta * wi; + chi2x = mxy.NdfX() * zeta * zetawi; + + const float K1 = F1 * wi; + + x -= F0 * zetawi; + y -= F1 * zetawi; + + C00 -= F0 * F0 * wi; + C10 -= K1 * F0; + C11 -= K1 * F1; + } + + float chi2u = 0.; + { // filter U measurement, we need only chi2 here + const float cosPhi = -mxy.Dxy() / mxy.Dx2(); + const float u = cosPhi * mxy.X() + mxy.Y(); + const float du2 = mxy.Dy2() + cosPhi * mxy.Dxy(); + + const float zeta = cosPhi * x + y - u; + + const float F0 = cosPhi * C00 + C10; + const float F1 = cosPhi * C10 + C11; + + const float HCH = (F0 * cosPhi + F1); + + chi2u += mxy.NdfY() * zeta * zeta / (du2 + HCH); + } + + if (chi2x > fParams[fIterationData[0].fIteration].tripletChi2Cut) { + continue; + } + if (chi2x + chi2u > fParams[fIterationData[0].fIteration].tripletChi2Cut) { + continue; + } + + int id = iThread * maxNHits + *collectedHits; + fHitTriplets[id].hit1 = fHitDoubletsCompressed[iThread].hit1; + fHitTriplets[id].hit2 = fHitDoubletsCompressed[iThread].hit2; + fHitTriplets[id].hit3 = ih_gpu; + *collectedHits += 1; + } // loop over the hits in the area + // if( *collectedHits > 2 ) printf("- iThread: %d; collectedHits: %d\n", iThread, *collectedHits); +} + +XPU_D void GpuTripletConstructor::FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, + const ca::GpuFieldRegion* F0) const +{ + // Add the target constraint to a straight line track + + float eXY[2]; + float Jx[7], Jy[7]; + + GetExtrapolatedXYline(tr_par, + /*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].GetTargetPositionZ(), F0, + eXY /*, &eY*/, Jx, Jy); + + FilterExtrapolatedXY(tr_par, &(/*fParams[fIterationData[0].fIteration]*/ fParams_const[fIteration].targetMeasurement), + eXY /*, eY*/, Jx, Jy); +} + +XPU_D void GpuTripletConstructor::GetExtrapolatedXYline( + kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, float* extrXY, // float* extrY, + // std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx, + // std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const + float* Jx, float* Jy) const +{ + // extrapolate track assuming it is straight (qp==0) + // return the extrapolated X, Y and the derivatives of the extrapolated X and Y + + const float c_light(0.000299792458), c1(1.), c2i(0.5), c6i(/*1. / 6.*/ 0.166667), + c12i(/*1. / 12.*/ 0.083333); //TODO: test constants + + const float tx = tr_par->GetTx(); + const float ty = tr_par->GetTy(); + const float dz = z - tr_par->GetZ(); + + const float dz2 = dz * dz; + const float dzc6i = dz * c6i; + const float dz2c12i = dz2 * c12i; + + const float xx = tx * tx; + const float yy = ty * ty; + const float xy = tx * ty; + + const float Ay = -xx - c1; + const float Bx = yy + c1; + + const float ctdz2 = c_light * xpu::sqrt(c1 + xx + yy) * dz2; + + const float Sx = F->cx0 * c2i + F->cx1 * dzc6i + F->cx2 * dz2c12i; + const float Sy = F->cy0 * c2i + F->cy1 * dzc6i + F->cy2 * dz2c12i; + const float Sz = F->cz0 * c2i + F->cz1 * dzc6i + F->cz2 * dz2c12i; + + extrXY[0] = tr_par->GetX() + tx * dz; + extrXY[1] = tr_par->GetY() + ty * dz; + + // Jx->fill(0.); // for a case + // Jy->fill(0.); + + Jx[0] = 1.; + Jx[1] = 0.; + Jx[2] = dz; + Jx[3] = 0.; + Jx[4] = ctdz2 * (Sx * xy + Sy * Ay + Sz * ty); + Jx[5] = 0.; + Jx[6] = 0.; + + Jy[0] = 0.; + Jy[1] = 1.; + Jy[2] = 0.; + Jy[3] = dz; + Jy[4] = ctdz2 * (Sx * Bx - Sy * xy - Sz * tx); + Jy[5] = 0.; + // Jx[6] = 0.; //TODO: is it a bug??? + Jy[6] = 0.; +} + +XPU_D void GpuTripletConstructor::FilterExtrapolatedXY( + kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY, // fvec extrY, + // std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx, + // std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const + float* Jx, float* Jy) const +{ + // add a 2-D measurenent (x,y) at some z, that differs from fTr.GetZ() + // extrX, extrY are extrapolated track parameters at z, Jx, Jy are derivatives of the extrapolation + + // ! it is assumed that in the track covariance matrix all non-diagonal covariances are 0 + // ! except of C10 + + // TrackParamV& T = fTr; + + //zeta0 = T.x + Jx[2]*T.Tx() + Jx[3]*T.Ty() + Jx[4]*T.qp - x; + //zeta1 = T.y + Jy[2]*T.Tx() + Jy[3]*T.Ty() + Jy[4]*T.qp - y; + + const float zeta0 = extrXY[0] - m->X(); + const float zeta1 = extrXY[1] - m->Y(); + + // H = 1 0 Jx[2] Jx[3] Jx[4] 0 + // 0 1 Jy[2] Jy[3] Jy[4] 0 + + // F = CH' + const float F00 = tr_par->C(0, 0); + const float F01 = tr_par->C(1, 0); //TODO: is it a bug??? + const float F10 = F01; //tr_par->C10(); + const float F11 = tr_par->C(1, 1); + const float F20 = Jx[2] * tr_par->C(2, 2); + const float F21 = Jy[2] * tr_par->C(2, 2); + const float F30 = Jx[3] * tr_par->C(3, 3); + const float F31 = Jy[3] * tr_par->C(3, 3); + const float F40 = Jx[4] * tr_par->C(4, 4); + const float F41 = Jy[4] * tr_par->C(4, 4); + + // Jx[5,6] and Jy[5,6] are 0. + + float S00 = m->Dx2() + F00 + Jx[2] * F20 + Jx[3] * F30 + Jx[4] * F40; + float S10 = m->Dxy() + F10 + Jy[2] * F20 + Jy[3] * F30 + Jy[4] * F40; + float S11 = m->Dy2() + F11 + Jy[2] * F21 + Jy[3] * F31 + Jy[4] * F41; + + const float si = 1. / (S00 * S11 - S10 * S10); + const float S00tmp = S00; + S00 = si * S11; + S10 = -si * S10; + S11 = si * S00tmp; + + tr_par->ChiSq() += zeta0 * zeta0 * S00 + 2. * zeta0 * zeta1 * S10 + zeta1 * zeta1 * S11; + tr_par->Ndf() += m->NdfX() + m->NdfY(); + + const float K00 = F00 * S00 + F01 * S10; + const float K01 = F00 * S10 + F01 * S11; + const float K10 = F10 * S00 + F11 * S10; + const float K11 = F10 * S10 + F11 * S11; + const float K20 = F20 * S00 + F21 * S10; + const float K21 = F20 * S10 + F21 * S11; + const float K30 = F30 * S00 + F31 * S10; + const float K31 = F30 * S10 + F31 * S11; + const float K40 = F40 * S00 + F41 * S10; + const float K41 = F40 * S10 + F41 * S11; + + tr_par->X() -= K00 * zeta0 + K01 * zeta1; + tr_par->Y() -= K10 * zeta0 + K11 * zeta1; + tr_par->Tx() -= K20 * zeta0 + K21 * zeta1; + tr_par->Ty() -= K30 * zeta0 + K31 * zeta1; + tr_par->Qp() -= K40 * zeta0 + K41 * zeta1; + + tr_par->C(0, 0) -= (K00 * F00 + K01 * F01); + tr_par->C(1, 0) -= (K10 * F00 + K11 * F01); + tr_par->C(1, 1) -= (K10 * F10 + K11 * F11); + tr_par->C(2, 0) = -(K20 * F00 + K21 * F01); + tr_par->C(2, 1) = -(K20 * F10 + K21 * F11); + tr_par->C(2, 2) -= (K20 * F20 + K21 * F21); + tr_par->C(3, 0) = -(K30 * F00 + K31 * F01); + tr_par->C(3, 1) = -(K30 * F10 + K31 * F11); + tr_par->C(3, 2) = -(K30 * F20 + K31 * F21); + tr_par->C(3, 3) -= (K30 * F30 + K31 * F31); + tr_par->C(4, 0) = -(K40 * F00 + K41 * F01); + tr_par->C(4, 1) = -(K40 * F10 + K41 * F11); + tr_par->C(4, 2) = -(K40 * F20 + K41 * F21); + tr_par->C(4, 3) = -(K40 * F30 + K41 * F31); + tr_par->C(4, 4) -= (K40 * F40 + K41 * F41); +} + +XPU_D void GpuTripletConstructor::MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const +{ + float tx = tr_par->Tx(); + float ty = tr_par->Ty(); + float txtx = tx * tx; + float tyty = ty * ty; + float txtx1 = txtx + 1.f; + float h = txtx + tyty; + float t = xpu::sqrt(txtx1 + tyty); + float h2 = h * h; + float qp0t = qp0 * t; + + float c1 = 0.0136f, c2 = c1 * 0.038f, c3 = c2 * 0.5f, c4 = -c3 * 0.5f, c5 = c3 * 0.333333f, c6 = -c3 * 0.25f; + + float s0 = (c1 + c2 * xpu::log(radThick) + c3 * h + h2 * (c4 + c5 * h + c6 * h2)) * qp0t; + // float a = ((t + fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass * qp0 * qp0t) * radThick * s0 * s0); + float a = ((t + fParams_const[fIteration].particleMass * fParams_const[fIteration].particleMass * qp0 * qp0t) + * radThick * s0 * s0); + tr_par->C(2, 2) += txtx1 * a; + tr_par->C(3, 2) += tx * ty * a; + tr_par->C(3, 3) += (1. + tyty) * a; +} + +XPU_D void GpuTripletConstructor::ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0, + const ca::GpuFieldRegion* Field) const +{ + const float c_light = 0.000299792458; + + //---------------------------------------------------------------- + const float h = z_out - tr_par->GetZ(); + + const float stepDz[5] = {0., 0., h * 0.5f, h * 0.5f, h}; + + float f[5][7] = {{0.}}; // ( d*/dz ) [step] + float F[5][7][7] = {{{0.}}}; // ( d *new [step] / d *old ) + + // Runge-Kutta steps + // + const float r0[7] = {tr_par->X(), tr_par->Y(), tr_par->Tx(), tr_par->Ty(), qp0, tr_par->Time(), tr_par->Vi()}; + float R0[7][7] = {{0.}}; + for (int i = 0; i < 7; ++i) { + R0[i][i] = 1.; + } + + for (int step = 1; step <= 4; ++step) { + + float rstep[7] = {0.}; + for (int i = 0; i < 7; ++i) { + rstep[i] = r0[i] + stepDz[step] * f[step - 1][i]; + } + const float z = tr_par->GetZ() + stepDz[step]; + ca::GpuFieldValue B = Field->Get(rstep[0], rstep[1], z); + const float tx2 = rstep[2] * rstep[2]; + const float ty2 = rstep[3] * rstep[3]; + const float txty = rstep[2] * rstep[3]; + const float L2 = 1.f + tx2 + ty2; + const float L2i = 1.f / L2; + const float L = xpu::sqrt(L2); + const float cL = c_light * L; + const float cLqp0 = cL * qp0; + + f[step][0] = rstep[2]; + F[step][0][2] = 1.; + + f[step][1] = rstep[3]; + F[step][1][3] = 1.; + + const float f2tmp = txty * B.x - (1. + tx2) * B.y + rstep[3] * B.z; + f[step][2] = cLqp0 * f2tmp; + + F[step][2][2] = cLqp0 * (rstep[2] * f2tmp * L2i + rstep[3] * B.x - 2. * rstep[2] * B.y); + F[step][2][3] = cLqp0 * (rstep[3] * f2tmp * L2i + rstep[2] * B.x + B.z); + F[step][2][4] = cL * f2tmp; + + const float f3tmp = -txty * B.y - rstep[2] * B.z + (1. + ty2) * B.x; + f[step][3] = cLqp0 * f3tmp; + F[step][3][2] = cLqp0 * (rstep[2] * f3tmp * L2i - rstep[3] * B.y - B.z); + F[step][3][3] = cLqp0 * (rstep[3] * f3tmp * L2i + 2. * rstep[3] * B.x - rstep[2] * B.y); + F[step][3][4] = cL * f3tmp; + + f[step][4] = 0.; + + // if (fDoFitVelocity) { //TODO: check and switch it on back if needed (for triplets, probably, not needed) + //// fvec vi = rstep[6]; + f[step][5] = rstep[6] * L; + const float Li = 1.f / L; + F[step][5][2] = rstep[6] * rstep[2] * Li; // / L; + F[step][5][3] = rstep[6] * rstep[3] * Li; // / L; + F[step][5][4] = 0.; + F[step][5][5] = 0.; + F[step][5][6] = L; + // } + // else { + // const float vi = xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv; + // f[step][5] = rstep[6] * L; + // F[step][5][2] = rstep[6] * rstep[2] / L; + // F[step][5][3] = rstep[6] * rstep[3] / L; + // F[step][5][4] = + // fSettings[0].particleMass * fSettings[0].particleMass * qp0 * L / + // xpu::sqrt(1. + fSettings[0].particleMass * fSettings[0].particleMass * qp0 * qp0) * fSettings[0].SpeedOfLightInv; + // F[step][5][5] = 0.; + // F[step][5][6] = 0.; + // } + + f[step][6] = 0.; + } // end of Runge-Kutta step + + float r[7] = {0.}; // extrapolated parameters + float R[7][7] = {{0.}}; // Jacobian of the extrapolation + + float stepW[5]; // = {0., h / 6.f, h / 3.f, h / 3.f, h / 6.}; + stepW[0] = 0.; + stepW[1] = h / 6.f; + stepW[2] = h / 3.f; + stepW[3] = h / 3.f; + stepW[4] = h / 6.f; + // const float stepW[5] = {0., h * 0.16666667f, h * 0.33333334f, h * 0.33333334f, h * 0.16666667f}; + // const float stepW[5] = {0., h * 0.166667f, h * 0.333334f, h * 0.333334f, h * 0.166667f}; //TODO: test constants: leads to a drop in efficiency + + float k[5][7][7] = {{0.}}; + for (int step = 1; step <= 4; ++step) { + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 7; j++) { + k[step][i][j] = F[step][i][j]; + for (int m = 0; m < 7; m++) { + k[step][i][j] += stepDz[step] * F[step][i][m] * k[step - 1][m][j]; + } + } + } + } + + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 7; j++) { + R[i][j] = R0[i][j]; + for (int step = 1; step <= 4; step++) { + R[i][j] += stepW[step] * k[step][i][j]; + } + } + } + + const float dqp = tr_par->Qp() - qp0; + + for (int i = 0; i < 7; i++) { + r[i] = r0[i]; + for (int step = 1; step <= 4; step++) { + r[i] += stepW[step] * f[step][i]; + } + // take into account linearisation at fQp0 + r[i] += R[i][4] * dqp; + } + + // update parameters + + tr_par->X() = r[0]; + tr_par->Y() = r[1]; + tr_par->Tx() = r[2]; + tr_par->Ty() = r[3]; + tr_par->Qp() = r[4]; + tr_par->Time() = r[5]; + tr_par->Vi() = r[6]; + + //fTr.Vi()( fTr.Vi() < fvec(TrackParamV::kClightNsInv) ) = fvec(TrackParamV::kClightNsInv); + tr_par->Z() = z_out; //zMasked; + + // covariance matrix transport + float C[7][7]; + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 7; j++) { + C[i][j] = tr_par->C(i, j); + } + } + float RC[7][7]; + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 7; j++) { + RC[i][j] = 0.; + for (int m = 0; m < 7; m++) { + RC[i][j] += R[i][m] * C[m][j]; + } + } + } + for (int i = 0; i < 7; i++) { + for (int j = 0; j < 7; j++) { + float Cij = 0.f; + for (int m = 0; m < 7; m++) { + Cij += RC[i][m] * R[j][m]; + } + tr_par->C(i, j) = Cij; + } + } +} + +XPU_D void GpuTripletConstructor::ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const +{ + // extrapolate the track assuming no field + + const float dz = zOut - tr_par->GetZ(); + + const float tx = tr_par->GetTx(); + const float ty = tr_par->GetTy(); + const float vi = tr_par->GetVi(); + + const float L = xpu::sqrt(1.f + tx * tx + ty * ty); + + const float j52 = dz * tx * vi / L; + const float j53 = dz * ty * vi / L; + const float j56 = dz * L; + + // transport parameters + + tr_par->X() += tx * dz; + tr_par->Y() += ty * dz; + tr_par->Time() += L * vi * dz; + tr_par->Z() = zOut; + + // transport covariance matrix (see details in CaTrackFit) + + const float jc00 = tr_par->C(0, 0) + dz * tr_par->C(2, 0); + const float jc02 = tr_par->C(0, 2) + dz * tr_par->C(2, 2); + + const float jc10 = tr_par->C(1, 0) + dz * tr_par->C(3, 0); + const float jc11 = tr_par->C(1, 1) + dz * tr_par->C(3, 1); + const float jc12 = tr_par->C(1, 2) + dz * tr_par->C(3, 2); + const float jc13 = tr_par->C(1, 3) + dz * tr_par->C(3, 3); + + const float jc50 = tr_par->C(5, 0) + j52 * tr_par->C(2, 0) + j53 * tr_par->C(3, 0) + j56 * tr_par->C(6, 0); + const float jc51 = tr_par->C(5, 1) + j52 * tr_par->C(2, 1) + j53 * tr_par->C(3, 1) + j56 * tr_par->C(6, 1); + const float jc52 = tr_par->C(5, 2) + j52 * tr_par->C(2, 2) + j53 * tr_par->C(3, 2) + j56 * tr_par->C(6, 2); + const float jc53 = tr_par->C(5, 3) + j52 * tr_par->C(2, 3) + j53 * tr_par->C(3, 3) + j56 * tr_par->C(6, 3); + const float jc54 = tr_par->C(5, 4) + j52 * tr_par->C(2, 4) + j53 * tr_par->C(3, 4) + j56 * tr_par->C(6, 4); + const float jc55 = tr_par->C(5, 5) + j52 * tr_par->C(2, 5) + j53 * tr_par->C(3, 5) + j56 * tr_par->C(6, 5); + const float jc56 = tr_par->C(5, 6) + j52 * tr_par->C(2, 6) + j53 * tr_par->C(3, 6) + j56 * tr_par->C(6, 6); + + tr_par->C(0, 0) = jc00 + jc02 * dz; + tr_par->C(1, 0) = jc10 + jc12 * dz; + // tr_par->C(2, 0) = tr_par->C(2, 0) + tr_par->C(2, 2) * dz; + // tr_par->C(3, 0) = tr_par->C(3, 0) + tr_par->C(3, 2) * dz; + // tr_par->C(4, 0) = tr_par->C(4, 0) + tr_par->C(4, 2) * dz; + tr_par->C(2, 0) += tr_par->C(2, 2) * dz; + tr_par->C(3, 0) += tr_par->C(3, 2) * dz; + tr_par->C(4, 0) += tr_par->C(4, 2) * dz; + tr_par->C(5, 0) = jc50 + jc52 * dz; + // tr_par->C(6, 0) = tr_par->C(6, 0) + tr_par->C(6, 2) * dz; + tr_par->C(6, 0) += tr_par->C(6, 2) * dz; + + tr_par->C(1, 1) = jc11 + jc13 * dz; + // tr_par->C21() = tr_par->C21() + tr_par->C23() * dz; + // tr_par->C31() = tr_par->C31() + tr_par->C33() * dz; + // tr_par->C41() = tr_par->C41() + tr_par->C43() * dz; + tr_par->C(2, 1) += tr_par->C(2, 3) * dz; + tr_par->C(3, 1) += tr_par->C(3, 3) * dz; + tr_par->C(4, 1) += tr_par->C(4, 3) * dz; + tr_par->C(5, 1) = jc51 + jc53 * dz; + // tr_par->C61() = tr_par->C61() + tr_par->C63() * dz; + tr_par->C(6, 1) += tr_par->C(6, 3) * dz; + + tr_par->C(5, 2) = jc52; + tr_par->C(5, 3) = jc53; + tr_par->C(5, 4) = jc54; + tr_par->C(5, 5) = jc55 + jc52 * j52 + jc53 * j53 + jc56 * j56; + // tr_par->C65() = tr_par->C65() + tr_par->C62() * j52 + tr_par->C63() * j53 + tr_par->C66() * j56; + tr_par->C(6, 5) += tr_par->C(6, 2) * j52 + tr_par->C(6, 3) * j53 + tr_par->C(6, 6) * j56; +} + +XPU_D void GpuTripletConstructor::FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const +{ + { + kf::MeasurementU<float> mx; + mx.SetCosPhi(1.f); + mx.SetSinPhi(0.f); + mx.SetU(mxy.X()); + mx.SetDu2(mxy.Dx2()); + mx.SetNdf(mxy.NdfX()); + + kf::MeasurementU<float> mu; + mu.SetCosPhi(-mxy.Dxy() / mxy.Dx2()); + mu.SetSinPhi(1.f); + mu.SetU(mu.CosPhi() * mxy.X() + mxy.Y()); + mu.SetDu2(mxy.Dy2() - mxy.Dxy() * mxy.Dxy() / mxy.Dx2()); + mu.SetNdf(mxy.NdfY()); + + Filter1d(tr_par, mx); + Filter1d(tr_par, mu); + return; + } +} + +XPU_D void GpuTripletConstructor::Filter1d(kf::TrackParamBase<float>* tr_par, const kf::MeasurementU<float>& m) const +{ + float zeta, HCH; + float F0, F1, F2, F3, F4, F5, F6; + float K1, K2, K3, K4, K5, K6; + + zeta = m.CosPhi() * tr_par->X() + m.SinPhi() * tr_par->Y() - m.U(); + + // F = CH' + F0 = m.CosPhi() * tr_par->C(0, 0) + m.SinPhi() * tr_par->C(1, 0); + F1 = m.CosPhi() * tr_par->C(1, 0) + m.SinPhi() * tr_par->C(1, 1); + + HCH = F0 * m.CosPhi() + F1 * m.SinPhi(); + + F2 = m.CosPhi() * tr_par->C(2, 0) + m.SinPhi() * tr_par->C(2, 1); + F3 = m.CosPhi() * tr_par->C(3, 0) + m.SinPhi() * tr_par->C(3, 1); + F4 = m.CosPhi() * tr_par->C(4, 0) + m.SinPhi() * tr_par->C(4, 1); + F5 = m.CosPhi() * tr_par->C(5, 0) + m.SinPhi() * tr_par->C(5, 1); + F6 = m.CosPhi() * tr_par->C(6, 0) + m.SinPhi() * tr_par->C(6, 1); + + const bool maskDoFilter = (HCH < m.Du2() * 16.f); + + // correction to HCH is needed for the case when sigma2 is so small + // with respect to HCH that it disappears due to the roundoff error + // fvec wi = fMaskF / (m.Du2() + fvec(1.0000001) * HCH); + // fvec zetawi = fMaskF * zeta / (iif(maskDoFilter, m.Du2(), fvec::Zero()) + HCH); + + //TODO: (wi, zetawi): GPU results are different from CPU ones and depends on the order of the operations + //TODO: need to compare effects made on the tracking efficiency and parameters quality + float wi = 1.f / (m.Du2() + 1.0000001f * HCH); + const float zetawi = zeta / (maskDoFilter ? m.Du2() : 0.f + HCH); + + wi = m.Du2() > 0.f ? wi : 0.f; + + tr_par->ChiSq() += m.Ndf() * zeta * zeta * wi; + tr_par->Ndf() += m.Ndf(); + + K1 = F1 * wi; + K2 = F2 * wi; + K3 = F3 * wi; + K4 = F4 * wi; + K5 = F5 * wi; + K6 = F6 * wi; + + tr_par->X() -= F0 * zetawi; + tr_par->Y() -= F1 * zetawi; + tr_par->Tx() -= F2 * zetawi; + tr_par->Ty() -= F3 * zetawi; + tr_par->Qp() -= F4 * zetawi; + tr_par->Time() -= F5 * zetawi; + tr_par->Vi() -= F6 * zetawi; + + tr_par->C(0, 0) -= F0 * F0 * wi; + tr_par->C(1, 0) -= K1 * F0; + tr_par->C(1, 1) -= K1 * F1; + + tr_par->C(2, 0) -= K2 * F0; + tr_par->C(2, 1) -= K2 * F1; + tr_par->C(2, 2) -= K2 * F2; + + tr_par->C(3, 0) -= K3 * F0; + tr_par->C(3, 1) -= K3 * F1; + tr_par->C(3, 2) -= K3 * F2; + tr_par->C(3, 3) -= K3 * F3; + + tr_par->C(4, 0) -= K4 * F0; + tr_par->C(4, 1) -= K4 * F1; + tr_par->C(4, 2) -= K4 * F2; + tr_par->C(4, 3) -= K4 * F3; + tr_par->C(4, 4) -= K4 * F4; + + tr_par->C(5, 0) -= K5 * F0; + tr_par->C(5, 1) -= K5 * F1; + tr_par->C(5, 2) -= K5 * F2; + tr_par->C(5, 3) -= K5 * F3; + tr_par->C(5, 4) -= K5 * F4; + tr_par->C(5, 5) -= K5 * F5; + + tr_par->C(6, 0) -= K6 * F0; + tr_par->C(6, 1) -= K6 * F1; + tr_par->C(6, 2) -= K6 * F2; + tr_par->C(6, 3) -= K6 * F3; + tr_par->C(6, 4) -= K6 * F4; + tr_par->C(6, 5) -= K6 * F5; + tr_par->C(6, 6) -= K6 * F6; +} + +//TODO: update this function +XPU_D void GpuTripletConstructor::FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, + float timeInfo) const +{ + // filter track with a time measurement + + // F = CH' + const float F0 = tr_par->C(5, 0); + const float F1 = tr_par->C(5, 1); + const float F2 = tr_par->C(5, 2); + const float F3 = tr_par->C(5, 3); + const float F4 = tr_par->C(5, 4); + const float F5 = tr_par->C(5, 5); + const float F6 = tr_par->C(6, 5); + + const float HCH = F5; //tr_par->C55(); + + // when dt0 is much smaller than current time error, + // set track time exactly to the measurement value without filtering + // it helps to keep the initial time errors reasonably small + // the calculations in the covariance matrix are not affected + + const bool maskDoFilter = (HCH < dt2 * 16.f) && timeInfo; + + float w = 1.f; + + if (timeInfo <= 0.f) w = 0.f; + + // float wi = 1.f / (dt2 + 1.0000001f * HCH); + float hch_d = 1.0000001f * HCH; + float den = dt2 + hch_d; + float wi = w / den; + + const float zeta = tr_par->Time() - t; + const float zetawi = w * zeta / (maskDoFilter ? dt2 : 0.f + HCH); + + tr_par->ChiSqTime() += maskDoFilter ? (zeta * zeta * wi) : 0.f; + tr_par->NdfTime() += w; + + const float K1 = F1 * wi; + const float K2 = F2 * wi; + const float K3 = F3 * wi; + const float K4 = F4 * wi; + const float K5 = F5 * wi; + const float K6 = F6 * wi; + + tr_par->X() -= F0 * zetawi; + tr_par->Y() -= F1 * zetawi; + tr_par->Tx() -= F2 * zetawi; + tr_par->Ty() -= F3 * zetawi; + tr_par->Qp() -= F4 * zetawi; + tr_par->Time() -= F5 * zetawi; + tr_par->Vi() -= F6 * zetawi; + + tr_par->C(0, 0) -= F0 * F0 * wi; + + tr_par->C(1, 0) -= K1 * F0; + tr_par->C(1, 1) -= K1 * F1; + + tr_par->C(2, 0) -= K2 * F0; + tr_par->C(2, 1) -= K2 * F1; + tr_par->C(2, 2) -= K2 * F2; + + tr_par->C(3, 0) -= K3 * F0; + tr_par->C(3, 1) -= K3 * F1; + tr_par->C(3, 2) -= K3 * F2; + tr_par->C(3, 3) -= K3 * F3; + + tr_par->C(4, 0) -= K4 * F0; + tr_par->C(4, 1) -= K4 * F1; + tr_par->C(4, 2) -= K4 * F2; + tr_par->C(4, 3) -= K4 * F3; + tr_par->C(4, 4) -= K4 * F4; + + tr_par->C(5, 0) -= K5 * F0; + tr_par->C(5, 1) -= K5 * F1; + tr_par->C(5, 2) -= K5 * F2; + tr_par->C(5, 3) -= K5 * F3; + tr_par->C(5, 4) -= K5 * F4; + tr_par->C(5, 5) -= K5 * F5; + + tr_par->C(6, 0) -= K6 * F0; + tr_par->C(6, 1) -= K6 * F1; + tr_par->C(6, 2) -= K6 * F2; + tr_par->C(6, 3) -= K6 * F3; + tr_par->C(6, 4) -= K6 * F4; + tr_par->C(6, 5) -= K6 * F5; + tr_par->C(6, 6) -= K6 * F6; +} + +XPU_D void GpuTripletConstructor::EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick, + float upstreamDirection) const +{ + const float qp02 = xpu::max(tr_par->Qp() * tr_par->Qp(), 0.01f); + const float p2 = 1.f / qp02; + // const float mass2 = fParams[fIterationData[0].fIteration].particleMass * fParams[fIterationData[0].fIteration].particleMass; //TODO: fMass2 = 0.01116, fSettings[0].particleMass^2 = 0.011172, might lead to a different result + const float mass2 = fParams_const[0].particleMass * fParams_const[0].particleMass; + const float r_mass2 = 1.f / mass2; + const float E2 = mass2 + p2; + + const float bg2 = p2 * r_mass2; + //ApproximateBetheBloch + const float kp0 = 2.33f; + const float kp1 = 0.20f; + const float kp2 = 3.00f; + const float kp3 = 173e-9f; + const float kp4 = 0.49848f; + + const float mK = 0.307075e-3f; + const float _2me = 1.022e-3f; + const float rho = kp0; + const float x0 = kp1 * 2.303f; + const float x1 = kp2 * 2.303f; + const float mI = kp3; + const float mZA = kp4; + const float maxT = _2me * bg2; + + //*** Density effect + float d2 = 0.f; + const float x = 0.5f * xpu::log(bg2); + const float lhwI = xpu::log(28.816f * 1e-9f * xpu::sqrt(rho * mZA) / mI); + + if (x > x1) { + d2 = lhwI + x - 0.5f; + } + if ((x > x0) && (x1 > x)) { + const float r = (x1 - x) / (x1 - x0); + d2 = lhwI + x - 0.5f + (0.5f - lhwI - x0) * r * r * r; + } + + const float bethe = + mK * mZA * (1.f + bg2) / bg2 * (0.5f * xpu::log(_2me * bg2 * maxT / (mI * mI)) - bg2 / (1.f + bg2) - d2); + + const float tr = xpu::sqrt(1.f + tr_par->Tx() * tr_par->Tx() + tr_par->Ty() * tr_par->Ty()); + + const float dE = bethe * radThick * tr * 2.33f * 9.34961f; + + const float ECorrected = xpu::sqrt(E2) + upstreamDirection * dE; + const float E2Corrected = ECorrected * ECorrected; + + float corr = xpu::sqrt(p2 / (E2Corrected - mass2)); + if (xpu::isnan(corr) || xpu::isinf(corr)) { + corr = 1.f; + } + + tr_par->Qp() *= corr; + tr_par->C(4, 0) *= corr; + tr_par->C(4, 1) *= corr; + tr_par->C(4, 2) *= corr; + tr_par->C(4, 3) *= corr; + tr_par->C(4, 4) *= corr * corr; + tr_par->C(5, 4) *= corr; +} diff --git a/algo/ca/core/experimental/CaGpuTripletConstructor.h b/algo/ca/core/experimental/CaGpuTripletConstructor.h new file mode 100644 index 0000000000000000000000000000000000000000..1a438d4e65930032c5ecaf715d65eb6393801bff --- /dev/null +++ b/algo/ca/core/experimental/CaGpuTripletConstructor.h @@ -0,0 +1,284 @@ +/* Copyright (C) 2024-2025 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Grigory Kozlov [committer] */ + +/// \file CaGpuTripletConstructor.h +/// \brief The class contains data and kernels for running CA tracking on CPU and GPU using XPU libraries + + +#pragma once // include this header only once per compilation unit + +#include "CaDeviceImage.h" +#include "CaGpuGrid.h" +#include "CaGpuGridArea.h" +#include "CaGpuMaterialMap.h" +#include "CaGpuParameters.h" +#include "CaHit.h" +#include "CaMeasurementXy.h" +#include "CaTriplet.h" +#include "KfMeasurementU.h" + +#include <xpu/device.h> +#include <xpu/host.h> + +namespace cbm::algo +{ + // static constexpr int kMaxNofStations = 24; + // Block sizes / other compile time constants that need tuning + enum CaGpuConstants + { +#if XPU_IS_CUDA + kSingletConstructorBlockSize = 512, + kSingletConstructorItemsPerThread = 8, +#else // HIP, values ignored on CPU + kSingletConstructorBlockSize = 64, //1024, //128 is optimal (preliminary tests) + kSingletConstructorItemsPerThread = 6, + kSortTripletsBlockSize = 64, + kSortTripletsItemsPerThread = 6, +#endif + }; +} // namespace cbm::algo + +namespace cbm::algo::ca +{ + // class Framework; + class GpuTripletConstructor; + + struct TrackingParams { + }; + + struct Params : xpu::constant<ca::GPUReco, TrackingParams> { + }; + + // Declare Constant Memory + struct strGpuTripletConstructor : xpu::constant<GPUReco, GpuTripletConstructor> { + }; + + struct hit_doublets { + ca::HitIndex_t hit1; + ca::HitIndex_t hit2; + }; + + struct hit_triplets { + ca::HitIndex_t hit1; + ca::HitIndex_t hit2; + ca::HitIndex_t hit3; + }; + + struct sort_triplets { + unsigned int tr_id; + // unsigned int lsta; + unsigned int lhit; + }; + + struct shared_counters { + int counter; + }; + + struct shared_data { + // ca::GpuStation fStations_shared[kMaxNofStations]; + ca::GpuFieldRegion fld0_shared[kSingletConstructorBlockSize]; + ca::GpuFieldRegion fld1_shared[kSingletConstructorBlockSize]; + }; + + struct shared_data_fit_triplets { + // ca::GpuStation fStations_shared[kMaxNofStations]; + kf::TrackParamBase<float> fit_params_shared[kSingletConstructorBlockSize]; + }; + + // struct TestFunc : xpu::kernel<GPUReco> { + // using block_size = xpu::block_size<kSingletConstructorBlockSize>; + // using constants = xpu::cmem<strGpuTripletConstructor, Params>; + // using context = xpu::kernel_context<shared_memory, constants>; + // XPU_D void operator()(context&); + // }; + + // Declare Kernels + struct MakeSinglets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + using shared_memory = shared_data; + using constants = xpu::cmem<strGpuTripletConstructor, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct MakeDoublets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + // using shared_memory = shared_counters; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct CompressDoublets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct FitDoublets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct MakeTriplets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + // using shared_memory = shared_counters; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct CompressTriplets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct FitTriplets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSingletConstructorBlockSize>; + // using shared_memory = shared_data_fit_triplets; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct SortTriplets : xpu::kernel<GPUReco> { + using block_size = xpu::block_size<kSortTripletsBlockSize>; + using sort_t = xpu::block_sort<unsigned int, sort_triplets, kSortTripletsBlockSize, kSortTripletsItemsPerThread>; + using shared_memory = sort_t::storage_t; + using constants = xpu::cmem<strGpuTripletConstructor>; //, Params>; + using context = xpu::kernel_context<shared_memory, constants>; + XPU_D void operator()(context&); + }; + + struct IterationData { + int fNHits; ///< Number of hits in grid + int fIteration; ///< Iteration number + int fNDoublets; ///< Number of doublets + int fNDoublets_counter; ///< Number of doublets counter + int fNTriplets; ///< Number of triplets + int fNTriplets_counter; ///< Number of triplets counter + }; + + class GpuTripletConstructor { + public: + /// ------ FUNCTIONAL PART ------ + + // XPU_D void TestFunc(TestFunc::context&) const; + + XPU_D void MakeSinglets(MakeSinglets::context&) const; + + XPU_D void MakeDoublets(MakeDoublets::context&) const; + + XPU_D void CompressDoublets(CompressDoublets::context&) const; + + XPU_D void FitDoublets(FitDoublets::context&) const; + + XPU_D void MakeTriplets(MakeTriplets::context&) const; + + XPU_D void CompressTriplets(CompressTriplets::context&) const; + + XPU_D void FitTriplets(FitTriplets::context&) const; + + XPU_D void SortTripletsFunc(SortTriplets::context&) const; + + private: + XPU_D void CollectHits(int iThread, int iSta, int iHit, int maxNHits, int* collectedHits) const; + + XPU_D void CollectHitsTriplets(int iThread, int iSta, int maxNHits, + int* collectedHits) const; //TODO: merge with CollectHits + + + /// ------ FITTING FINCTIONALITY ------ //TODO: check and switch to the standard CaTrackFit if possible + XPU_D void FilterWithTargetAtLine(kf::TrackParamBase<float>* tr_par, const ca::GpuFieldRegion* F0) const; + + /// extrapolate track as a line, return the extrapolated X, Y and the Jacobians + XPU_D void GetExtrapolatedXYline(kf::TrackParamBase<float>* tr_par, float z, const ca::GpuFieldRegion* F, + float* extrXY, float* Jx, float* Jy) const; + + XPU_D void MultipleScattering(kf::TrackParamBase<float>* tr_par, float radThick, float qp0) const; + + /// extrapolate the track to the given Z using the field F + /// it does extrapolation in one step + XPU_D void ExtrapolateStep(kf::TrackParamBase<float>* tr_par, float z_out, float qp0, + const ca::GpuFieldRegion* Field) const; + + /// filter the track with the XY measurement placed at different Z + /// \param m - measurement + /// \param extrX - extrapolated X of the track + /// \param extrY - extrapolated Y of the track + /// \param Jx - Jacobian of the extrapolated X + /// \param Jy - Jacobian of the extrapolated Y + XPU_D void FilterExtrapolatedXY( + kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>* m, float* extrXY, // fvec extrY, + // std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jx, + // std::array<float, ca::TrackParamBase<float>::kNtrackParam>* Jy) const; + float* Jx, float* Jy) const; + + XPU_D void ExtrapolateLineNoField(kf::TrackParamBase<float>* tr_par, float zOut) const; + + XPU_D void FilterXY(kf::TrackParamBase<float>* tr_par, const ca::MeasurementXy<float>& mxy) const; + + XPU_D void Filter1d(kf::TrackParamBase<float>* tr_par, const kf::MeasurementU<float>& m) const; + + XPU_D void FilterTime(kf::TrackParamBase<float>* tr_par, float t, float dt2, float timeInfo) const; + + XPU_D void EnergyLossCorrection(kf::TrackParamBase<float>* tr_par, float radThick, float upstreamDirection) const; + + /// ------ DATA MEMBERS ------ + public: + xpu::buffer<ca::GpuGrid> fvGpuGrid; ///< Grid + + xpu::buffer<unsigned int> fgridFirstBinEntryIndex; ///< Index of the first entry in the grid + + xpu::buffer<unsigned int> fgridEntries; ///< Entries in the grid //TODO: check if it should be ca::HitIndex_t + + ///Material map + xpu::buffer<ca::GpuMaterialMap> fMaterialMap; ///< Material map base objects + + xpu::buffer<float> fMaterialMapTables; ///< Material map tables + + xpu::buffer<IterationData> fIterationData; ///< Temporary data + + // xpu::buffer<unsigned char> fvbHitSuppressed; ///< Hit suppression flags + + xpu::buffer<hit_doublets> fHitDoublets; ///< Hit doublets + + xpu::buffer<hit_doublets> fHitDoubletsCompressed; ///< Hit doublets compressed + + xpu::buffer<hit_triplets> fHitTriplets; ///< Hit triplets + + xpu::buffer<hit_triplets> fHitTripletsCompressed; ///< Hit triplets compressed + + /// \brief Hits of the current time window + /// + /// It is a portion of fInputData to process in the current time window + /// hit.Id is replaced by the hit index in fInputData + xpu::buffer<ca::Hit> fvHits; + + xpu::buffer<kf::TrackParamBase<float>> fvTrackParams; ///< Track parameters + xpu::buffer<kf::TrackParamBase<float>> fvTrackParamsDoublets; ///< Track parameters for doublets + // xpu::buffer<ca::TrackParamBase<float>> fvTrackParamsTriplets; ///< Track parameters for triplets + + xpu::buffer<ca::Triplet> fvTriplets; ///< Triplets + + xpu::buffer<sort_triplets> fTripletSortHelper; ///< Helper for sorting triplets + // xpu::buffer<sort_triplets> fTripletSortHelperTmp; + // xpu::buffer<sort_triplets*> dst; + + // GpuParameters fParams; + xpu::buffer<GpuParameters> fParams; + + int fIteration; + + GpuParameters fParams_const[4]; + + ca::GpuStation fStations_const[constants::gpu::MaxNofStations]; + }; + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/experimental/CaMeasurementXy.h b/algo/ca/core/experimental/CaMeasurementXy.h new file mode 100644 index 0000000000000000000000000000000000000000..ae4f5232e46aee0f25ee0da27366529f4ed3cb43 --- /dev/null +++ b/algo/ca/core/experimental/CaMeasurementXy.h @@ -0,0 +1,202 @@ +/* Copyright (C) 2007-2022 GSI Helmholtzzentrum fuer Schwerionenforschung, Darmstadt + SPDX-License-Identifier: GPL-3.0-only + Authors: Sergey Gorbunov [committer], Sergei Zharko */ + +/// \file CaMeasurementXy.h +/// \brief Definition of the CaMeasurementXy class +/// Temporary class for compatibility with GPU code +/// Should be replaced by KfMeasurementXy + +#pragma once // include this header only once per compilation unit + + +#include "CaDefs.h" +#include "CaSimd.h" +#include "KfUtils.h" + +#include <string> + +#include <xpu/device.h> + +namespace cbm::algo::ca +{ + + /// \brief The class describes a 2D - measurement (x, y) in XY coordinate system + /// + /// The measurement has a finite resolution, i.e. the measurement is not a point, but a distribution + /// with a certain rms. + /// The measurement components may be used in the chi2 calculation or not + /// The measurement may be a SIMD vector of values, when DataT is fvec type + /// + template<typename DataT> + class MeasurementXy { + + public: + friend class boost::serialization::access; + + /// default constructor + XPU_D MeasurementXy() = default; + + /// constructor + /// \param x x coordinate of the measurement + /// \param y y coordinate of the measurement + /// \param dx2 rms^2 of the x coordinate measurement + /// \param dy2 rms^2 of the y coordinate measurement + /// \param dxy covariance of the x and y coordinate measurements + /// \param ndfX number of degrees of freedom for the x coordinate measurement + /// if ndfX == 1, the measurement is used in fit and in the chi2 calculation + /// if ndfX == 0, the measurement is used in fit, but not used in the chi2 calculation + /// \param ndfY number of degrees of freedom for the y coordinate measurement + /// if ndfY == 1, the measurement is used in fit and in the chi2 calculation + /// if ndfY == 0, the measurement is used in fit, but not used in the chi2 calculation + XPU_D MeasurementXy(DataT x, DataT y, DataT dx2, DataT dy2, DataT dxy, DataT ndfX, DataT ndfY) + : fX(x) + , fY(y) + , fDx2(dx2) + , fDy2(dy2) + , fDxy(dxy) + , fNdfX(ndfX) + , fNdfY(ndfY) + { + } + + + /// Set all SIMD entries from all SIMD entries of the other class + /// It works for scalar and fvec types, + /// except of the case when DataT is scalar and TdataB is fvec. + template<typename DataTb> + void Set(const MeasurementXy<DataTb>& m) + { + CopyBase<DataTb, true, true>(0, m, 0); + } + + /// Set all SIMD entries from one SIMD entry of the other class + /// It also works when DataT is scalar + void Set(const MeasurementXy<fvec>& m, const int im) { CopyBase<fvec, true, false>(0, m, im); } + + /// Set one SIMD entry from one SIMD entry of the other class + /// It only works when DataT is fvec, TdataB is scalar + template<typename DataTb> + void SetOneEntry(const int i, const MeasurementXy<DataTb>& m) + { + CopyBase<DataTb, false, true>(i, m, 0); + } + + /// Set one SIMD entry from one SIMD entry of the other class + /// It only works when DataT is fvec, TdataB is fvec + void SetOneEntry(const int i, const MeasurementXy<fvec>& m, const int im) + { + CopyBase<fvec, false, false>(i, m, im); + } + + + ///------------------------------ + /// Setters and getters + + XPU_D void SetX(DataT x) { fX = x; } + XPU_D void SetY(DataT y) { fY = y; } + XPU_D void SetDx2(DataT dx2) { fDx2 = dx2; } + XPU_D void SetDy2(DataT dy2) { fDy2 = dy2; } + XPU_D void SetDxy(DataT dxy) { fDxy = dxy; } + XPU_D void SetNdfX(DataT ndfX) { fNdfX = ndfX; } + XPU_D void SetNdfY(DataT ndfY) { fNdfY = ndfY; } + XPU_D void SetCov(DataT dx2, DataT dxy, DataT dy2) + { + fDx2 = dx2; + fDxy = dxy; + fDy2 = dy2; + } + + XPU_D DataT X() const { return fX; } + XPU_D DataT Y() const { return fY; } + XPU_D DataT Dx2() const { return fDx2; } + XPU_D DataT Dy2() const { return fDy2; } + XPU_D DataT Dxy() const { return fDxy; } + XPU_D DataT NdfX() const { return fNdfX; } + XPU_D DataT NdfY() const { return fNdfY; } + + ///------------------------------ + /// references, to ease assignment to SIMD vector components when DataT has fvec type + + XPU_D DataT& X() { return fX; } + XPU_D DataT& Y() { return fY; } + XPU_D DataT& Dx2() { return fDx2; } + XPU_D DataT& Dy2() { return fDy2; } + XPU_D DataT& Dxy() { return fDxy; } + XPU_D DataT& NdfX() { return fNdfX; } + XPU_D DataT& NdfY() { return fNdfY; } + + ///------------------------------ + /// Methods for debugging + + /// Checks, if all fields are finite + bool IsFinite() const + { + return (kf::utils::IsFinite(X()) && kf::utils::IsFinite(Y()) && kf::utils::IsFinite(Dx2()) + && kf::utils::IsFinite(Dy2()) && kf::utils::IsFinite(Dxy()) && kf::utils::IsFinite(NdfX()) + && kf::utils::IsFinite(NdfY())); + } + + /// Checks, if some fields are undefined + bool IsUndefined() const + { + return (kf::utils::IsUndefined(X()) || kf::utils::IsUndefined(Y()) || kf::utils::IsUndefined(Dx2()) + || kf::utils::IsUndefined(Dy2()) || kf::utils::IsUndefined(Dxy()) || kf::utils::IsUndefined(NdfX()) + || kf::utils::IsUndefined(NdfY())); + } + + private: + /// \brief Copies all/one entries from the other class + /// \tparam TdataB Type of the other class + /// \tparam TDoAllA If true, all entries of the current class must be set + /// \tparam TDoAllB If true, all entries of the other class must be used + /// \param ia Index of SIMD vector element of the current class + /// \param Tb Other class + /// \param ib Index of SIMD vector element of the other class + template<typename TdataB, bool TDoAllA, bool TDoAllB> + void CopyBase(const int ia, const MeasurementXy<TdataB>& Tb, const int ib); + + private: + ///------------------------------ + /// Data members + + // Initializing parameters with NANs spoils the track fit where + // the masked-out SIMD entries are suppressed by a multication by 0. + // Therefore, we initialize the data members here with finite numbers. + // For the numerical safety, with some reasonable numbers. + + DataT fX; ///< x coordinate of the measurement + DataT fY; ///< y coordinate of the measurement + DataT fDx2; ///< rms^2 of the x coordinate measurement + DataT fDy2; ///< rms^2 of the y coordinate measurement + DataT fDxy; ///< covariance of the x and y coordinate measurements + + /// number of degrees of freedom (used for chi2 calculation) + /// if ndf == 1, the measurement is used in the chi2 calculation + /// if ndf == 0, the measurement is not used in the chi2 calculation + + DataT fNdfX; ///< ndf for the x coordinate measurement + DataT fNdfY; ///< ndf for the y coordinate measurement + }; + + + // --------------------------------------------------------------------------------------------------------------------- + // + template<typename TdataA> + template<typename TdataB, bool TDoAllA, bool TDoAllB> + inline void MeasurementXy<TdataA>::CopyBase(const int ia, const MeasurementXy<TdataB>& Tb, const int ib) + { + auto copy = [&](TdataA& a, const TdataB& b) { + kf::utils::VecCopy<TdataA, TdataB, TDoAllA, TDoAllB>::CopyEntries(a, ia, b, ib); + }; + + copy(fX, Tb.X()); + copy(fY, Tb.Y()); + copy(fDx2, Tb.Dx2()); + copy(fDy2, Tb.Dy2()); + copy(fDxy, Tb.Dxy()); + copy(fNdfX, Tb.NdfX()); + copy(fNdfY, Tb.NdfY()); + } // CopyBase + +} // namespace cbm::algo::ca diff --git a/algo/ca/core/pars/CaDefs.h b/algo/ca/core/pars/CaDefs.h index 8b4a2280d57d45267329ca11edf13613faaa6df0..89a8c19ee9b7fbe112f7b15e9e437ed43d469a1b 100644 --- a/algo/ca/core/pars/CaDefs.h +++ b/algo/ca/core/pars/CaDefs.h @@ -105,6 +105,18 @@ namespace cbm::algo::ca::constants } // namespace misc + /// GPU constants + namespace gpu + { + constexpr int MaxDoubletsFromHit = 150; ///< Maximum number of doublets from a hit + constexpr int MaxTripletsFromDoublet = 15; ///< Maximum number of triplets from a doublet + constexpr int MaxNofStations = 20; ///< Maximum number of stations //TODO: temporary solution + constexpr bool GpuTracking = false; ///< Flag: use GPU for tracking + constexpr bool GpuTimeMonitoring = false; ///< Flag: use GPU for time monitoring + constexpr bool GpuSortTriplets = false; ///< Flag: use GPU for sorting triplets + constexpr bool CpuSortTriplets = true; ///< Flag: use CPU for sorting triplets + } // namespace gpu + /// \brief Undefined values template<typename T1, typename T2 = T1> constexpr T2 Undef; diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.cxx b/algo/ca/core/tracking/CaTrackFinderWindow.cxx index cce227803ac3fbfe80b4e5b3163853eca30b7fee..e3bcc301f5eff5c69380b57d922f03e9cccda9d3 100644 --- a/algo/ca/core/tracking/CaTrackFinderWindow.cxx +++ b/algo/ca/core/tracking/CaTrackFinderWindow.cxx @@ -127,20 +127,52 @@ namespace cbm::algo::ca PrepareGrid(input.GetHits(), wData); frMonitorData.StopTimer(ETimer::PrepareGrid); + std::optional<ca::GpuTrackFinderSetup> gpuTrackFinderSetup; + size_t iter_num = 0; + if constexpr (constants::gpu::GpuTracking) { + // XPU initialization + // - temporary solution for tests only + // - should not be initialized here + setenv("XPU_PROFILE", "1", 1); + xpu::settings settings; + // settings.device = "cpu0"; + settings.device = "hip0"; + // settings.verbose = true; + xpu::initialize(settings); + + // Set up environment for GPU tracking + xpu::push_timer("gpuTFinit"); + gpuTrackFinderSetup.emplace(wData, fParameters); + xpu::timings gpuTFinit = xpu::pop_timer(); + if constexpr (constants::gpu::GpuTimeMonitoring) { + LOG(info) << "GPU tracking :: Initialization: " << gpuTFinit.wall() << " ms"; + } + + SetupGpuTrackFinder(gpuTrackFinderSetup.value()); + } + // Run CA iterations frMonitorData.StartTimer(ETimer::FindTracks); auto& caIterations = fParameters.GetCAIterations(); for (auto iter = caIterations.begin(); iter != caIterations.end(); ++iter) { - // ----- Prepare iteration frMonitorData.StartTimer(ETimer::PrepareIteration); PrepareCAIteration(*iter, wData, iter == caIterations.begin()); frMonitorData.StopTimer(ETimer::PrepareIteration); - // ----- Triplets construction ----- - frMonitorData.StartTimer(ETimer::ConstructTriplets); - ConstructTriplets(wData); - frMonitorData.StopTimer(ETimer::ConstructTriplets); + if constexpr (constants::gpu::GpuTracking) { + // ----- Triplets construction on GPU ----- + frMonitorData.StartTimer(ETimer::ConstructTriplets); + ConstructTripletsGPU(wData, gpuTrackFinderSetup.value(), iter_num); + iter_num++; + frMonitorData.StopTimer(ETimer::ConstructTriplets); + } + else { + // ----- Triplets construction ----- + frMonitorData.StartTimer(ETimer::ConstructTriplets); + ConstructTriplets(wData); + frMonitorData.StopTimer(ETimer::ConstructTriplets); + } // ----- Search for neighbouring triplets ----- frMonitorData.StartTimer(ETimer::SearchNeighbours); @@ -332,7 +364,6 @@ namespace cbm::algo::ca staPattern.push_back(std::make_pair(1 + i, 2 + gap)); } } - for (int istal = fParameters.GetNstationsActive() - 2; istal >= wData.CurrentIteration()->GetFirstStationIndex(); istal--) { // start with downstream chambers @@ -760,4 +791,112 @@ namespace cbm::algo::ca } // level = 0 } + void TrackFinderWindow::SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup) + { + xpu::push_timer("SetupParametersTime"); + gpuTrackFinderSetup.SetupParameters(); + xpu::timings SetupParametersTime = xpu::pop_timer(); + + xpu::push_timer("SetInputDataTime"); + gpuTrackFinderSetup.SetInputData(); + xpu::timings SetInputDataTime = xpu::pop_timer(); + + xpu::push_timer("SetupMaterialMapTime"); + gpuTrackFinderSetup.SetupMaterialMap(); + xpu::timings SetupMaterialMapTime = xpu::pop_timer(); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + LOG(info) << "GPU tracking :: SetupParameters: " << SetupParametersTime.wall() << " ms"; + LOG(info) << "GPU tracking :: SetInputData: " << SetInputDataTime.wall() << " ms"; + LOG(info) << "GPU tracking :: SetupMaterialMap: " << SetupMaterialMapTime.wall() << " ms"; + } + } + + void TrackFinderWindow::ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, + int iteration) + { + xpu::push_timer("SetupGridTime"); + gpuTrackFinderSetup.SetupGrid(); + xpu::timings SetupGridTime = xpu::pop_timer(); + + xpu::push_timer("SetupIterationDataTime"); + gpuTrackFinderSetup.SetupIterationData(iteration); + xpu::timings SetupIterationDataTime = xpu::pop_timer(); + + xpu::push_timer("RunGpuTrackingSetup"); + gpuTrackFinderSetup.RunGpuTrackingSetup(); + xpu::timings RunGpuTrackingSetup = xpu::pop_timer(); + + std::optional<std::vector<std::array<unsigned int, 2>>> triplet_sort; + if constexpr (constants::gpu::CpuSortTriplets) { + xpu::push_timer("SortTriplets"); + triplet_sort.emplace(); + triplet_sort.value().reserve(gpuTrackFinderSetup.GetNofTriplets()); + + for (unsigned int itr = 0; itr < gpuTrackFinderSetup.GetNofTriplets(); itr++) { + if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() <= 0.) continue; + const auto& triplet = gpuTrackFinderSetup.GetTriplet(itr); + triplet_sort.value().push_back({triplet.GetLHit(), itr}); + } + std::sort( + triplet_sort.value().begin(), triplet_sort.value().end(), + [](const std::array<unsigned int, 2>& a, const std::array<unsigned int, 2>& b) { return (a[0] < b[0]); }); + xpu::timings SortTriplets = xpu::pop_timer(); + + if constexpr (constants::gpu::GpuTimeMonitoring) { + LOG(info) << "GPU tracking :: SetupGrid: " << SetupGridTime.wall() << " ms"; + LOG(info) << "GPU tracking :: SetupIterationData: " << SetupIterationDataTime.wall() << " ms"; + LOG(info) << "GPU tracking :: RunGpuTracking: " << RunGpuTrackingSetup.wall() << " ms"; + LOG(info) << "GPU tracking :: SortTripletsCPU: " << SortTriplets.wall() << " ms"; + } + } + + int ihitl_0 = -1; + + auto& [vHitFirstTriplet, vHitNofTriplets, vTriplets] = fTripletData; + vHitFirstTriplet.reset(wData.Hits().size(), 0); /// link hit -> first triplet { hit, *, *} + vHitNofTriplets.reset(wData.Hits().size(), 0); /// link hit ->n triplets { hit, *, *} + + // prepare triplet storage + for (int j = 0; j < fParameters.GetNstationsActive(); j++) { + const size_t nHitsStation = wData.TsHitIndices(j).size(); + vTriplets[j].clear(); + vTriplets[j].reserve(2 * nHitsStation); + } + + int nTriplets = gpuTrackFinderSetup.GetNofTriplets(); + if constexpr (constants::gpu::CpuSortTriplets) { + nTriplets = triplet_sort.value().size(); + } + for (size_t itr = 0; itr < nTriplets; itr++) { + int iSta, ihitl; + if constexpr (constants::gpu::CpuSortTriplets) { + iSta = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLSta(); + ihitl = gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr])).GetLHit(); + } + else { + iSta = gpuTrackFinderSetup.GetTriplet(itr).GetLSta(); + ihitl = gpuTrackFinderSetup.GetTriplet(itr).GetLHit(); + } + const size_t oldSize = vTriplets[iSta].size(); + + if constexpr (constants::gpu::CpuSortTriplets) { + vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(std::get<1>(triplet_sort.value()[itr]))); + if (ihitl == ihitl_0) { + vHitNofTriplets[ihitl]++; + } + else { + vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize); + vHitNofTriplets[ihitl] = vTriplets[iSta].size() - oldSize; + } + ihitl_0 = ihitl; + } + else { + if (gpuTrackFinderSetup.GetTriplet(itr).GetChi2() > 0.) + vTriplets[iSta].emplace_back(gpuTrackFinderSetup.GetTriplet(itr)); + vHitFirstTriplet[ihitl] = PackTripletId(iSta, oldSize); + vHitNofTriplets[ihitl] = vTriplets[iSta].size() - oldSize; + } + } + } } // namespace cbm::algo::ca diff --git a/algo/ca/core/tracking/CaTrackFinderWindow.h b/algo/ca/core/tracking/CaTrackFinderWindow.h index c08ea5570af54c2c4b817681ea339f57032623ca..691ac0da86aa0b78a343acc2f173fca5a7cef473 100644 --- a/algo/ca/core/tracking/CaTrackFinderWindow.h +++ b/algo/ca/core/tracking/CaTrackFinderWindow.h @@ -11,6 +11,7 @@ #include "CaBranch.h" #include "CaCloneMerger.h" +#include "CaGpuTrackFinderSetup.h" #include "CaGrid.h" #include "CaHit.h" #include "CaSimd.h" @@ -90,6 +91,10 @@ namespace cbm::algo::ca bool checkTripletMatch(const ca::Triplet& l, const ca::Triplet& r, fscal& dchi2, WindowData& wData) const; + void SetupGpuTrackFinder(GpuTrackFinderSetup& gpuTrackFinderSetup); + + void ConstructTripletsGPU(WindowData& wData, GpuTrackFinderSetup& gpuTrackFinderSetup, int iteration); + // ** Functions, which pack and unpack indexes of station and triplet ** // TODO: move to ca::Triplet class (S.Zharko) diff --git a/algo/kf/core/CMakeLists.txt b/algo/kf/core/CMakeLists.txt index 0c2168e867a71397ae6f1e2a18c408a03375d262..cbd05a33d1cbd9def9da8dd1e6c919089fe89ed4 100644 --- a/algo/kf/core/CMakeLists.txt +++ b/algo/kf/core/CMakeLists.txt @@ -56,6 +56,7 @@ target_link_libraries(KfCore OnlineDataLog # needed for the logger external::fles_logging # needed for the logger external::fles_ipc # needed for the logger + xpu PRIVATE Boost::serialization fmt::fmt external::yaml-cpp @@ -78,6 +79,7 @@ if (NOT CBM_ONLINE_STANDALONE) PUBLIC Vc::Vc OnlineDataLog # needed for the logger? FairLogger::FairLogger + xpu PRIVATE Boost::serialization fmt::fmt external::yaml-cpp diff --git a/algo/kf/core/data/KfMeasurementU.h b/algo/kf/core/data/KfMeasurementU.h index 711f237ec237b063ded4afa03ebc96b5348b6ac0..e4e6a08dcf6c53fa4849379eca4070b91ac05ae3 100644 --- a/algo/kf/core/data/KfMeasurementU.h +++ b/algo/kf/core/data/KfMeasurementU.h @@ -15,6 +15,8 @@ #include <string> +#include <xpu/device.h> + namespace cbm::algo::kf { /// \brief The class describes a 1D - measurement U in XY coordinate system @@ -38,7 +40,7 @@ namespace cbm::algo::kf friend class boost::serialization::access; /// default constructor - MeasurementU() = default; + XPU_D MeasurementU() = default; /// constructor /// \param cosPhi cos(phi) @@ -48,7 +50,7 @@ namespace cbm::algo::kf /// \param ndf number of degrees of freedom (used for chi2 calculation) /// if ndf == 1, the measurement is used in the chi2 calculation /// if ndf == 0, the measurement is not used in the chi2 calculation - MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf) + XPU_D MeasurementU(DataT cosPhi, DataT sinPhi, DataT u, DataT du2, DataT ndf) : fCosPhi(cosPhi) , fSinPhi(sinPhi) , fU(u) @@ -60,17 +62,17 @@ namespace cbm::algo::kf ///------------------------------ /// Setters and getters - void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; } - void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; } - void SetU(DataT u) { fU = u; } - void SetDu2(DataT du2) { fDu2 = du2; } - void SetNdf(DataT ndf) { fNdf = ndf; } - - DataT CosPhi() const { return fCosPhi; } - DataT SinPhi() const { return fSinPhi; } - DataT U() const { return fU; } - DataT Du2() const { return fDu2; } - DataT Ndf() const { return fNdf; } + XPU_D void SetCosPhi(DataT cosPhi) { fCosPhi = cosPhi; } + XPU_D void SetSinPhi(DataT sinPhi) { fSinPhi = sinPhi; } + XPU_D void SetU(DataT u) { fU = u; } + XPU_D void SetDu2(DataT du2) { fDu2 = du2; } + XPU_D void SetNdf(DataT ndf) { fNdf = ndf; } + + XPU_D DataT CosPhi() const { return fCosPhi; } + XPU_D DataT SinPhi() const { return fSinPhi; } + XPU_D DataT U() const { return fU; } + XPU_D DataT Du2() const { return fDu2; } + XPU_D DataT Ndf() const { return fNdf; } ///------------------------------ /// Methods for debugging diff --git a/algo/kf/core/data/KfTrackParam.h b/algo/kf/core/data/KfTrackParam.h index d439bbd8c8a6424ed0765317c4cf003c63876604..6d2183e9fd10122117e30bffc3a4ecfb3da3af19 100644 --- a/algo/kf/core/data/KfTrackParam.h +++ b/algo/kf/core/data/KfTrackParam.h @@ -19,6 +19,8 @@ #include <string> +#include <xpu/device.h> + namespace cbm::algo::kf { /// \class cbm::algo::kf::TrackParamBase @@ -201,49 +203,49 @@ namespace cbm::algo::kf /// Some of them involve calculations /// \brief Gets z position [cm] - T GetZ() const { return fZ; } + XPU_D T GetZ() const { return fZ; } /// \brief Gets x position [cm] - T GetX() const { return fX; } + XPU_D T GetX() const { return fX; } /// \brief Gets x position error [cm] - T GetXError() const { return sqrt(C00()); } + XPU_D T GetXError() const { return sqrt(C00()); } /// \brief Gets y position [cm] - T GetY() const { return fY; } + XPU_D T GetY() const { return fY; } /// \brief Gets y position error [cm] - T GetYError() const { return sqrt(C11()); } + XPU_D T GetYError() const { return sqrt(C11()); } /// \brief Gets slope along x-axis - T GetTx() const { return fTx; } + XPU_D T GetTx() const { return fTx; } /// \brief Gets error of slope along x-axis - T GetTxError() const { return sqrt(C22()); } + XPU_D T GetTxError() const { return sqrt(C22()); } /// \brief Gets slope along y-axis - T GetTy() const { return fTy; } + XPU_D T GetTy() const { return fTy; } /// \brief Gets error of slope along y-axis - T GetTyError() const { return sqrt(C33()); } + XPU_D T GetTyError() const { return sqrt(C33()); } /// \brief Gets charge over momentum [ec/GeV] - T GetQp() const { return fQp; } + XPU_D T GetQp() const { return fQp; } /// \brief Gets error of charge over momentum [ec/GeV] - T GetQpError() const { return sqrt(C44()); } + XPU_D T GetQpError() const { return sqrt(C44()); } /// \brief Gets time [ns] - T GetTime() const { return fT; } + XPU_D T GetTime() const { return fT; } /// \brief Gets time error [ns] - T GetTimeError() const { return sqrt(C55()); } + XPU_D T GetTimeError() const { return sqrt(C55()); } /// \brief Gets inverse velocity [ns/cm] in downstream direction - T GetVi() const { return fVi; } + XPU_D T GetVi() const { return fVi; } /// \brief Gets inverse velocity error [ns/cm] - T GetViError() const { return sqrt(C66()); } + XPU_D T GetViError() const { return sqrt(C66()); } /// \brief Gets covariance matrix const CovMatrix_t& GetCovMatrix() const { return fCovMatrix; } @@ -252,49 +254,49 @@ namespace cbm::algo::kf /// \param i row /// \param j column /// \return covariance matrix element - T GetCovariance(int i, int j) const { return C(i, j); } + XPU_D T GetCovariance(int i, int j) const { return C(i, j); } /// Gets Chi-square of track fit model - T GetChiSq() const { return fChiSq; } + XPU_D T GetChiSq() const { return fChiSq; } /// Gets NDF of track fit model - T GetNdf() const { return fNdf; } + XPU_D T GetNdf() const { return fNdf; } /// Gets Chi-square of time measurements - T GetChiSqTime() const { return fChiSqTime; } + XPU_D T GetChiSqTime() const { return fChiSqTime; } /// Gets NDF of time measurements - T GetNdfTime() const { return fNdfTime; } + XPU_D T GetNdfTime() const { return fNdfTime; } /// Gets charge - T GetCharge() const { return utils::iif(GetQp() > T(0.), T(1.), T(-1.)); } + XPU_D T GetCharge() const { return utils::iif(GetQp() > T(0.), T(1.), T(-1.)); } /// \brief Gets azimuthal angle [rad] - T GetPhi() const { return atan2(GetTy(), GetTx()); } + XPU_D T GetPhi() const { return atan2(GetTy(), GetTx()); } /// \brief Gets azimuthal angle error [rad] - T GetPhiError() const; + XPU_D T GetPhiError() const; /// \brief Gets polar angle [rad] - T GetTheta() const { return atan(sqrt(GetTx() * GetTx() + GetTy() * GetTy())); } + XPU_D T GetTheta() const { return atan(sqrt(GetTx() * GetTx() + GetTy() * GetTy())); } /// \brief Gets polar angle error [rad] - T GetThetaError() const; + XPU_D T GetThetaError() const; /// Gets momentum [GeV/ec]. For the straight tracks returns 1.e4 [GeV/ec] - T GetP() const { return utils::iif(utils::fabs(GetQp()) > T(1.e-4), T(1.) / utils::fabs(GetQp()), T(1.e4)); } + XPU_D T GetP() const { return utils::iif(utils::fabs(GetQp()) > T(1.e-4), T(1.) / utils::fabs(GetQp()), T(1.e4)); } /// Gets z-component of the momentum [GeV/ec] - T GetPz() const { return GetP() / sqrt(T(1.) + GetTx() * GetTx() + GetTy() * GetTy()); } + XPU_D T GetPz() const { return GetP() / sqrt(T(1.) + GetTx() * GetTx() + GetTy() * GetTy()); } /// Gets x-component of the momentum [GeV/ec] - T GetPx() const { return GetPz() * GetTx(); } + XPU_D T GetPx() const { return GetPz() * GetTx(); } /// Gets y-component of the momentum [GeV/ec] - T GetPy() const { return GetPz() * GetTy(); } + XPU_D T GetPy() const { return GetPz() * GetTy(); } /// Gets transverse momentum - T GetPt() const + XPU_D T GetPt() const { T t2 = GetTx() * GetTx() + GetTy() * GetTy(); return GetP() * sqrt(t2 / (T(1.) + t2)); @@ -305,43 +307,46 @@ namespace cbm::algo::kf /// /// \brief Sets z position [cm] - void SetZ(T v) { fZ = v; } + XPU_D void SetZ(T v) { fZ = v; } /// \brief Sets x position [cm] - void SetX(T v) { fX = v; } + XPU_D void SetX(T v) { fX = v; } /// \brief Sets y position [cm] - void SetY(T v) { fY = v; } + XPU_D void SetY(T v) { fY = v; } /// \brief Sets slope along x-axis - void SetTx(T v) { fTx = v; } + XPU_D void SetTx(T v) { fTx = v; } /// \brief Sets slope along y-axis - void SetTy(T v) { fTy = v; } + XPU_D void SetTy(T v) { fTy = v; } /// \brief Sets charge over momentum [ec/GeV] - void SetQp(T v) { fQp = v; } + XPU_D void SetQp(T v) { fQp = v; } /// \brief Sets time [ns] - void SetTime(T v) { fT = v; } + XPU_D void SetTime(T v) { fT = v; } /// \brief Sets inverse velocity [ns/cm] - void SetVi(T v) { fVi = v; } + XPU_D void SetVi(T v) { fVi = v; } /// \brief Sets Chi-square of track fit model - void SetChiSq(T v) { fChiSq = v; } + XPU_D void SetChiSq(T v) { fChiSq = v; } /// \brief Sets NDF of track fit model - void SetNdf(T v) { fNdf = v; } + XPU_D void SetNdf(T v) { fNdf = v; } /// \brief Sets Chi-square of time measurements - void SetChiSqTime(T v) { fChiSqTime = v; } + XPU_D void SetChiSqTime(T v) { fChiSqTime = v; } /// \brief Sets NDF of time measurements - void SetNdfTime(T v) { fNdfTime = v; } + XPU_D void SetNdfTime(T v) { fNdfTime = v; } /// \brief Sets covariance matrix - void SetCovMatrix(const CovMatrix_t& val) { fCovMatrix = val; } + XPU_D void SetCovMatrix(const CovMatrix_t& val) { fCovMatrix = val; } + + /// \brief Sets all elements of the covariance matrix to zero + XPU_D void ResetCovMatrix() { memset(fCovMatrix.begin(), 0., kNcovParam * sizeof(T)); } /// \brief Get covariance matrix element /// \param i row @@ -358,7 +363,7 @@ namespace cbm::algo::kf /// \param j column /// \param val matrix element template<int i, int j> - void SetCovariance(T val) + XPU_D void SetCovariance(T val) { constexpr int ind = (j <= i) ? i * (1 + i) / 2 + j : j * (1 + j) / 2 + i; fCovMatrix[ind] = val; @@ -366,34 +371,34 @@ namespace cbm::algo::kf /// \brief Individual setters for covariance matrix elements /// - void SetC00(T val) { SetCovariance<0, 0>(val); } - void SetC10(T val) { SetCovariance<1, 0>(val); } - void SetC11(T val) { SetCovariance<1, 1>(val); } - void SetC20(T val) { SetCovariance<2, 0>(val); } - void SetC21(T val) { SetCovariance<2, 1>(val); } - void SetC22(T val) { SetCovariance<2, 2>(val); } - void SetC30(T val) { SetCovariance<3, 0>(val); } - void SetC31(T val) { SetCovariance<3, 1>(val); } - void SetC32(T val) { SetCovariance<3, 2>(val); } - void SetC33(T val) { SetCovariance<3, 3>(val); } - void SetC40(T val) { SetCovariance<4, 0>(val); } - void SetC41(T val) { SetCovariance<4, 1>(val); } - void SetC42(T val) { SetCovariance<4, 2>(val); } - void SetC43(T val) { SetCovariance<4, 3>(val); } - void SetC44(T val) { SetCovariance<4, 4>(val); } - void SetC50(T val) { SetCovariance<5, 0>(val); } - void SetC51(T val) { SetCovariance<5, 1>(val); } - void SetC52(T val) { SetCovariance<5, 2>(val); } - void SetC53(T val) { SetCovariance<5, 3>(val); } - void SetC54(T val) { SetCovariance<5, 4>(val); } - void SetC55(T val) { SetCovariance<5, 5>(val); } - void SetC60(T val) { SetCovariance<6, 0>(val); } - void SetC61(T val) { SetCovariance<6, 1>(val); } - void SetC62(T val) { SetCovariance<6, 2>(val); } - void SetC63(T val) { SetCovariance<6, 3>(val); } - void SetC64(T val) { SetCovariance<6, 4>(val); } - void SetC65(T val) { SetCovariance<6, 5>(val); } - void SetC66(T val) { SetCovariance<6, 6>(val); } + XPU_D void SetC00(T val) { SetCovariance<0, 0>(val); } + XPU_D void SetC10(T val) { SetCovariance<1, 0>(val); } + XPU_D void SetC11(T val) { SetCovariance<1, 1>(val); } + XPU_D void SetC20(T val) { SetCovariance<2, 0>(val); } + XPU_D void SetC21(T val) { SetCovariance<2, 1>(val); } + XPU_D void SetC22(T val) { SetCovariance<2, 2>(val); } + XPU_D void SetC30(T val) { SetCovariance<3, 0>(val); } + XPU_D void SetC31(T val) { SetCovariance<3, 1>(val); } + XPU_D void SetC32(T val) { SetCovariance<3, 2>(val); } + XPU_D void SetC33(T val) { SetCovariance<3, 3>(val); } + XPU_D void SetC40(T val) { SetCovariance<4, 0>(val); } + XPU_D void SetC41(T val) { SetCovariance<4, 1>(val); } + XPU_D void SetC42(T val) { SetCovariance<4, 2>(val); } + XPU_D void SetC43(T val) { SetCovariance<4, 3>(val); } + XPU_D void SetC44(T val) { SetCovariance<4, 4>(val); } + XPU_D void SetC50(T val) { SetCovariance<5, 0>(val); } + XPU_D void SetC51(T val) { SetCovariance<5, 1>(val); } + XPU_D void SetC52(T val) { SetCovariance<5, 2>(val); } + XPU_D void SetC53(T val) { SetCovariance<5, 3>(val); } + XPU_D void SetC54(T val) { SetCovariance<5, 4>(val); } + XPU_D void SetC55(T val) { SetCovariance<5, 5>(val); } + XPU_D void SetC60(T val) { SetCovariance<6, 0>(val); } + XPU_D void SetC61(T val) { SetCovariance<6, 1>(val); } + XPU_D void SetC62(T val) { SetCovariance<6, 2>(val); } + XPU_D void SetC63(T val) { SetCovariance<6, 3>(val); } + XPU_D void SetC64(T val) { SetCovariance<6, 4>(val); } + XPU_D void SetC65(T val) { SetCovariance<6, 5>(val); } + XPU_D void SetC66(T val) { SetCovariance<6, 6>(val); } ///--------------------------------------------------------------------------------------------------------------------- /// References to parameters to ease the math formulae @@ -401,49 +406,49 @@ namespace cbm::algo::kf /// /// \brief Reference to z position [cm] - T& Z() { return fZ; } + XPU_D T& Z() { return fZ; } /// \brief Reference to x position [cm] - T& X() { return fX; } + XPU_D T& X() { return fX; } /// \brief Reference to y position [cm] - T& Y() { return fY; } + XPU_D T& Y() { return fY; } /// \brief Reference to slope along x-axis - T& Tx() { return fTx; } + XPU_D T& Tx() { return fTx; } /// \brief Reference to slope along y-axis - T& Ty() { return fTy; } + XPU_D T& Ty() { return fTy; } /// \brief Reference to charge over momentum [ec/GeV] - T& Qp() { return fQp; } + XPU_D T& Qp() { return fQp; } /// \brief Reference to time [ns] - T& Time() { return fT; } + XPU_D T& Time() { return fT; } /// \brief Reference to inverse velocity [ns/cm] - T& Vi() { return fVi; } + XPU_D T& Vi() { return fVi; } /// \brief Reference to covariance matrix CovMatrix_t& CovMatrix() { return fCovMatrix; } /// \brief Reference to Chi-square of track fit model - T& ChiSq() { return fChiSq; } + XPU_D T& ChiSq() { return fChiSq; } /// \brief Reference to NDF of track fit model - T& Ndf() { return fNdf; } + XPU_D T& Ndf() { return fNdf; } /// \brief Reference to Chi-square of time measurements - T& ChiSqTime() { return fChiSqTime; } + XPU_D T& ChiSqTime() { return fChiSqTime; } /// \brief Reference to NDF of time measurements - T& NdfTime() { return fNdfTime; } + XPU_D T& NdfTime() { return fNdfTime; } /// \brief Get a reference to the covariance matrix element /// \param i row /// \param j column /// \return matrix element - T& C(int i, int j) + XPU_D T& C(int i, int j) { int ind = (j <= i) ? i * (1 + i) / 2 + j : j * (1 + j) / 2 + i; return fCovMatrix[ind]; @@ -530,7 +535,7 @@ namespace cbm::algo::kf /// \param c44 Variance of charge over momentum [(ec/GeV)2] /// \param c55 Variance of time [ns2] /// \param c66 Variance of inverse velocity [1/c2] - void ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66); + XPU_D void ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66); /// \brief Prints parameters to a string /// \param i Index of SIMD vector element (when i== -1, the entire vector is printed) @@ -550,7 +555,7 @@ namespace cbm::algo::kf bool IsConsistent(bool printWhenWrong, int nFilled) const; /// \brief Initializes inverse velocity range - void InitVelocityRange(fscal minP); + XPU_D void InitVelocityRange(fscal minP); ///--------------------------------------------------------------------------------------------------------------------- @@ -668,7 +673,7 @@ namespace cbm::algo::kf /// --------------------------------------------------------------------------------------------------------------------- /// template<typename T> - inline T TrackParamBase<T>::GetPhiError() const + XPU_D inline T TrackParamBase<T>::GetPhiError() const { // phi = atan( tx / ty ); } @@ -688,7 +693,7 @@ namespace cbm::algo::kf /// --------------------------------------------------------------------------------------------------------------------- /// template<typename T> - inline T TrackParamBase<T>::GetThetaError() const + XPU_D inline T TrackParamBase<T>::GetThetaError() const { // theta = atan(sqrt( tx * tx + ty * ty) ) @@ -741,9 +746,12 @@ namespace cbm::algo::kf // --------------------------------------------------------------------------------------------------------------------- // template<typename T> - inline void TrackParamBase<T>::ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66) + XPU_D inline void TrackParamBase<T>::ResetErrors(T c00, T c11, T c22, T c33, T c44, T c55, T c66) { - fCovMatrix.fill(0.); + // fCovMatrix.fill(0.); + for (auto& element : fCovMatrix) { + element = 0.; + } SetC00(c00); SetC11(c11); @@ -762,7 +770,7 @@ namespace cbm::algo::kf // --------------------------------------------------------------------------------------------------------------------- // template<typename T> - inline void TrackParamBase<T>::InitVelocityRange(fscal minP) + XPU_D inline void TrackParamBase<T>::InitVelocityRange(fscal minP) { // initialise the velocity range with respect to the minimal momentum minP {GeV/c} using defs::ProtonMass; diff --git a/algo/kf/core/geo/KfFieldSlice.h b/algo/kf/core/geo/KfFieldSlice.h index a626ca4a2414b66f745d3193caa8afbe7eb9e601..50654a1c8ec24cc5743cad31e21ca3b47ec263e5 100644 --- a/algo/kf/core/geo/KfFieldSlice.h +++ b/algo/kf/core/geo/KfFieldSlice.h @@ -87,6 +87,15 @@ namespace cbm::algo::kf return GetFieldValue(trkPar.GetX() + trkPar.GetTx() * dz, trkPar.GetY() + trkPar.GetTy() * dz); } + /// \brief Gets approximation coefficients for the x-component of the field + const CoeffArray_t& GetBx() const { return fBx; } + + /// \brief Gets approximation coefficients for the y-component of the field + const CoeffArray_t& GetBy() const { return fBy; } + + /// \brief Gets approximation coefficients for the z-component of the field + const CoeffArray_t& GetBz() const { return fBz; } + /// \brief Gets reference z-coordinate of the slice [cm] const T& GetZref() const { return fZref; } diff --git a/algo/kf/core/geo/KfMaterialMap.h b/algo/kf/core/geo/KfMaterialMap.h index e453f6bd9ab7b72034c38ffb75663f9e7c1bc18a..2ba56e81e624b6704bda40aaf8d67b90abed523f 100644 --- a/algo/kf/core/geo/KfMaterialMap.h +++ b/algo/kf/core/geo/KfMaterialMap.h @@ -128,6 +128,9 @@ namespace cbm::algo::kf return GetBinThicknessX0<I>(iX + iY * fNbins); } + /// \brief Gets material thickness table + const std::vector<float>& GetTable() const { return fTable; } + /// \brief Function to test the instance for NaN bool IsUndefined() const {