#ifndef __UTIL_ATOMIC_H__
#define __UTIL_ATOMIC_H__

/* Provides wrappers around system-specific atomic primitives, and some
 * extensions (faked-atomic operations over float numbers). */

#ifndef __KERNEL_GPU__

/* Using atomic ops header from Blender. */
#  include "atomic_ops.h"

#  define atomic_add_and_fetch_float(p, x) atomic_add_and_fetch_fl((p), (x))
#  define atomic_compare_and_swap_float(p, old_val, new_val) \
    atomic_cas_float((p), (old_val), (new_val))

#  define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
#  define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_add_uint32((p), -1)

#  define CCL_LOCAL_MEM_FENCE 0
#  define ccl_barrier(flags) ((void)0)
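
/* Example (illustrative only, not part of this header): accumulating into a
 * shared float from several CPU threads. atomic_add_and_fetch_float returns
 * the value *after* the addition, matching the GPU backends below. `total`
 * and `sample_weight` are hypothetical names used for illustration.
 *
 *   float total = 0.0f;
 *   // From any thread:
 *   const float new_total = atomic_add_and_fetch_float(&total, sample_weight);
 */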

#else /* __KERNEL_GPU__ */

#  if defined(__KERNEL_CUDA__) || defined(__KERNEL_HIP__)

/* atomicAdd returns the previous value, so add `x` once more to get
 * "add and fetch" semantics. */
#    define atomic_add_and_fetch_float(p, x) (atomicAdd((float *)(p), (float)(x)) + (float)(x))

#    define atomic_fetch_and_add_uint32(p, x) atomicAdd((unsigned int *)(p), (unsigned int)(x))
#    define atomic_fetch_and_sub_uint32(p, x) atomicSub((unsigned int *)(p), (unsigned int)(x))
#    define atomic_fetch_and_inc_uint32(p) atomic_fetch_and_add_uint32((p), 1)
#    define atomic_fetch_and_dec_uint32(p) atomic_fetch_and_sub_uint32((p), 1)
#    define atomic_fetch_and_or_uint32(p, x) atomicOr((unsigned int *)(p), (unsigned int)(x))

ccl_device_inline float atomic_compare_and_swap_float(ccl_global float *dest,
                                                      const float old_val,
                                                      const float new_val)
{
  /* Type-pun through a union so the integer atomicCAS can compare and store
   * the raw bits of the floats. */
  union {
    unsigned int int_value;
    float float_value;
  } new_value, prev_value, result;
  prev_value.float_value = old_val;
  new_value.float_value = new_val;
  result.int_value = atomicCAS((unsigned int *)dest, prev_value.int_value, new_value.int_value);
  return result.float_value;
}

#    define CCL_LOCAL_MEM_FENCE
#    define ccl_barrier(flags) __syncthreads()
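
/* Example (illustrative sketch, not part of this header): a float atomic
 * maximum built on atomic_compare_and_swap_float, which returns the value
 * that was stored in `dest` *before* the swap. The loop retries until our
 * value is stored or another thread has stored a larger one.
 * `atomic_max_float` is a hypothetical helper name.
 *
 *   ccl_device_inline void atomic_max_float(ccl_global float *dest, const float value)
 *   {
 *     float old_val = *dest;
 *     while (old_val < value) {
 *       const float prev = atomic_compare_and_swap_float(dest, old_val, value);
 *       if (prev == old_val) {
 *         break;  // Swap succeeded; our value is stored.
 *       }
 *       old_val = prev;  // Lost the race; retry against the newer value.
 *     }
 *   }
 */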

#  endif /* __KERNEL_CUDA__ || __KERNEL_HIP__ */

#  ifdef __KERNEL_METAL__

/* Emulate float atomic add with a compare-and-swap loop over the raw bits. */
ccl_device_inline float atomic_add_and_fetch_float(volatile ccl_global float *_source,
                                                   const float operand)
{
  volatile ccl_global atomic_int *source = (volatile ccl_global atomic_int *)_source;
  union {
    int int_value;
    float float_value;
  } new_value, prev_value;
  prev_value.int_value = atomic_load_explicit(source, memory_order_relaxed);
  do {
    new_value.float_value = prev_value.float_value + operand;
  } while (!atomic_compare_exchange_weak_explicit(
      source, &prev_value.int_value, new_value.int_value, memory_order_relaxed, memory_order_relaxed));
  return new_value.float_value;
}

#    define atomic_fetch_and_add_uint32(p, x) \
      atomic_fetch_add_explicit((device atomic_uint *)p, x, memory_order_relaxed)
#    define atomic_fetch_and_sub_uint32(p, x) \
      atomic_fetch_sub_explicit((device atomic_uint *)p, x, memory_order_relaxed)
#    define atomic_fetch_and_inc_uint32(p) \
      atomic_fetch_add_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
#    define atomic_fetch_and_dec_uint32(p) \
      atomic_fetch_sub_explicit((device atomic_uint *)p, 1, memory_order_relaxed)
#    define atomic_fetch_and_or_uint32(p, x) \
      atomic_fetch_or_explicit((device atomic_uint *)p, x, memory_order_relaxed)

ccl_device_inline float atomic_compare_and_swap_float(
    ccl_global float *dest, const float old_val, const float new_val)
{
  int prev_value = __float_as_int(old_val);
  atomic_compare_exchange_weak_explicit((ccl_global atomic_int *)dest, &prev_value,
      __float_as_int(new_val), memory_order_relaxed, memory_order_relaxed);
  return __int_as_float(prev_value);
}

#    define atomic_store(p, x) atomic_store_explicit(p, x, memory_order_relaxed)
#    define atomic_fetch(p) atomic_load_explicit(p, memory_order_relaxed)

#    define CCL_LOCAL_MEM_FENCE mem_flags::mem_threadgroup
#    define ccl_barrier(flags) threadgroup_barrier(flags)
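
/* Example (illustrative only): the portable work-group barrier pattern these
 * macros support. On Metal this is threadgroup_barrier(); on the CPU it is a
 * no-op since there is no work-group to synchronize.
 *
 *   // All threads finish writing local/threadgroup memory ...
 *   ccl_barrier(CCL_LOCAL_MEM_FENCE);
 *   // ... before any thread reads it back.
 */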

#  endif /* __KERNEL_METAL__ */

#  ifdef __KERNEL_ONEAPI__

ccl_device_inline float atomic_add_and_fetch_float(ccl_global float *p, float x)
{
  sycl::atomic_ref<float,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_add(x);
}

ccl_device_inline float atomic_compare_and_swap_float(
    ccl_global float *source, float old_val, float new_val)
{
  sycl::atomic_ref<float,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*source);
  atomic.compare_exchange_weak(old_val, new_val);
  return old_val;
}

ccl_device_inline unsigned int atomic_fetch_and_add_uint32(ccl_global unsigned int *p,
                                                           unsigned int x)
{
  sycl::atomic_ref<unsigned int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_add(x);
}

ccl_device_inline int atomic_fetch_and_add_uint32(ccl_global int *p, int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_add(x);
}

ccl_device_inline unsigned int atomic_fetch_and_sub_uint32(ccl_global unsigned int *p,
                                                           unsigned int x)
{
  sycl::atomic_ref<unsigned int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_sub(x);
}

ccl_device_inline int atomic_fetch_and_sub_uint32(ccl_global int *p, int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_sub(x);
}

ccl_device_inline unsigned int atomic_fetch_and_inc_uint32(ccl_global unsigned int *p)
{
  return atomic_fetch_and_add_uint32(p, 1);
}

ccl_device_inline int atomic_fetch_and_inc_uint32(ccl_global int *p)
{
  return atomic_fetch_and_add_uint32(p, 1);
}

ccl_device_inline unsigned int atomic_fetch_and_dec_uint32(ccl_global unsigned int *p)
{
  return atomic_fetch_and_sub_uint32(p, 1);
}

ccl_device_inline int atomic_fetch_and_dec_uint32(ccl_global int *p)
{
  return atomic_fetch_and_sub_uint32(p, 1);
}

ccl_device_inline unsigned int atomic_fetch_and_or_uint32(ccl_global unsigned int *p,
                                                          unsigned int x)
{
  sycl::atomic_ref<unsigned int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_or(x);
}

ccl_device_inline int atomic_fetch_and_or_uint32(ccl_global int *p, int x)
{
  sycl::atomic_ref<int,
                   sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::ext_intel_global_device_space>
      atomic(*p);
  return atomic.fetch_or(x);
}
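
/* Example (illustrative sketch): the sycl::atomic_ref pattern shared by the
 * oneAPI wrappers above. An atomic_ref is constructed over an existing global
 * memory location just for the duration of one operation; `p` is assumed to
 * point into global device memory, as in the wrappers above.
 *
 *   sycl::atomic_ref<unsigned int,
 *                    sycl::memory_order::relaxed,
 *                    sycl::memory_scope::device,
 *                    sycl::access::address_space::ext_intel_global_device_space>
 *       atomic(*p);
 *   const unsigned int previous = atomic.fetch_add(1u);  // Returns the pre-add value.
 */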

#  endif /* __KERNEL_ONEAPI__ */

#endif /* __KERNEL_GPU__ */

#endif /* __UTIL_ATOMIC_H__ */