diff --git a/src/xpu/device.h b/src/xpu/device.h
index 0a765ab6b51a5c87aec9455ca80bbb48cd618e1f..98da18a2a00944588cc09291e61fb847315166f7 100644
--- a/src/xpu/device.h
+++ b/src/xpu/device.h
@@ -59,47 +59,53 @@ XPU_D XPU_FORCE_INLINE const typename C::data_t &cmem() { return C::get(); }
XPU_D XPU_FORCE_INLINE constexpr float pi() { return M_PIf32; }
XPU_D XPU_FORCE_INLINE constexpr float deg_to_rad() { return pi() / 180.f; }
-XPU_D XPU_FORCE_INLINE int abs(int x);
+XPU_D XPU_FORCE_INLINE int abs(int x);
XPU_D XPU_FORCE_INLINE float abs(float x);
+
XPU_D XPU_FORCE_INLINE float ceil(float x);
+
XPU_D XPU_FORCE_INLINE float cos(float x);
-XPU_D XPU_FORCE_INLINE int min(int a, int b);
+
+XPU_D XPU_FORCE_INLINE int min(int a, int b);
XPU_D XPU_FORCE_INLINE unsigned long long int min(unsigned long long int a, unsigned long long int b);
-XPU_D XPU_FORCE_INLINE long long int min(long long int a, long long int b);
-XPU_D XPU_FORCE_INLINE float min(float a, float b);
-XPU_D XPU_FORCE_INLINE int max(int a, int b);
+XPU_D XPU_FORCE_INLINE long long int min(long long int a, long long int b);
+XPU_D XPU_FORCE_INLINE float min(float a, float b);
+
+XPU_D XPU_FORCE_INLINE int max(int a, int b);
XPU_D XPU_FORCE_INLINE float max(float a, float b);
+
XPU_D XPU_FORCE_INLINE float sqrt(float x);
+
XPU_D XPU_FORCE_INLINE float tan(float x);
-XPU_D XPU_FORCE_INLINE int atomic_cas(int *addr, int compare, int val);
+XPU_D XPU_FORCE_INLINE int atomic_cas(int *addr, int compare, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_cas(unsigned int *addr, unsigned int compare, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_cas_block(int *addr, int compare, int val);
+XPU_D XPU_FORCE_INLINE int atomic_cas_block(int *addr, int compare, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_cas_block(unsigned int *addr, unsigned int compare, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_add(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_add(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_add(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_add_block(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_add_block(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_add_block(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_sub(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_sub(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_sub(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_sub_block(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_sub_block(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_sub_block(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_and(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_and(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_and(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_and_block(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_and_block(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_and_block(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_or(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_or(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_or(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_or_block(int *addr, int val);
-XPU_D XPU_FORCE_INLINE unsigned atomic_or_block(unsigned int *addr, unsigned int val);
+XPU_D XPU_FORCE_INLINE int atomic_or_block(int *addr, int val);
+XPU_D XPU_FORCE_INLINE unsigned int atomic_or_block(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_xor(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_xor(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_xor(unsigned int *addr, unsigned int val);
-XPU_D XPU_FORCE_INLINE int atomic_xor_block(int *addr, int val);
+XPU_D XPU_FORCE_INLINE int atomic_xor_block(int *addr, int val);
XPU_D XPU_FORCE_INLINE unsigned int atomic_xor_block(unsigned int *addr, unsigned int val);
XPU_D XPU_FORCE_INLINE void barrier();
diff --git a/src/xpu/driver/cpu/device.h b/src/xpu/driver/cpu/device.h
index cd891bf3b182c3800513bfa00c5c82483bbdd87d..aaedf95f9dd15bd5085f05a05d842dd3dbfaea0d 100644
--- a/src/xpu/driver/cpu/device.h
+++ b/src/xpu/driver/cpu/device.h
@@ -1,5 +1,5 @@
-#ifndef XPU_DRIVER_CPU_DEVICE_RUNTIME
-#define XPU_DRIVER_CPU_DEVICE_RUNTIME
+#ifndef XPU_DRIVER_CPU_DEVICE_RUNTIME_H
+#define XPU_DRIVER_CPU_DEVICE_RUNTIME_H
#ifndef XPU_DEVICE_H
#error "This header should not be included directly. Include xpu/device.h instead."
@@ -18,7 +18,7 @@ namespace xpu {
namespace detail {
-// workaround until c++14 / c++17 is available
+// Workaround: stand-in for std::exchange (in <utility> since C++14) until the project can build with C++14 / C++17.
template<class T>
inline T exchange(T &obj, T new_val) {
T old_val = std::move(obj);
diff --git a/src/xpu/driver/hip_cuda/device.h b/src/xpu/driver/hip_cuda/device.h
index 3ae53fd9a0bf4dbe63e36df96e06a5e4cf0d0be4..e43b0d0d96e6f6ac605d8610421cbbbf517c40cf 100644
--- a/src/xpu/driver/hip_cuda/device.h
+++ b/src/xpu/driver/hip_cuda/device.h
@@ -2,8 +2,6 @@
#ifndef XPU_DRIVER_CUDA_DEVICE_RUNTIME_H
#define XPU_DRIVER_CUDA_DEVICE_RUNTIME_H
-
-
#ifndef XPU_DEVICE_H
#error "This header should not be included directly. Include xpu/device.h instead."
#endif