#ifdef __HIP__
#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif
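/* Given an array of states, build a compacted array of indices for which the states are
 * active. Each block counts active states per warp with a ballot/scan, converts the
 * per-warp counts into per-warp offsets, reserves its output range with a single atomic
 * add on `num_indices`, and then every active thread writes its state index to `indices`.
 * The shared/local scratch requirement is `sizeof(int) * (number_of_warps + 1)` per block. */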
#ifdef __KERNEL_ONEAPI__
#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
template<typename IsActiveOp>
void cpu_serial_active_index_array_impl(const uint num_states,
                                        ccl_global int *indices,
                                        ccl_global int *num_indices,
                                        IsActiveOp is_active_op)
{
  /* Serial fallback for the SYCL host device: gather active state indices in order. */
  int write_index = 0;
  for (int state_index = 0; state_index < num_states; state_index++) {
    if (is_active_op(state_index)) {
      indices[write_index++] = state_index;
    }
  }
  *num_indices = write_index;
}
#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */
template<typename IsActiveOp>
void gpu_parallel_active_index_array_impl(const uint num_states,
                                          ccl_global int *indices,
                                          ccl_global int *num_indices,
                                          IsActiveOp is_active_op)
{
  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
  const uint blocksize = item_id.get_local_range(0);

  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
                  sycl::access::address_space::local_space>
      ptr = sycl::ext::oneapi::group_local_memory<
          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
  int *warp_offset = *ptr;

  /* NOTE: Query the sub-group indices directly instead of deriving them from the local id;
   * DPC++ does not simplify the division/modulo computation used in the non-SYCL path
   * below, and direct queries are measurably faster. */
  const uint thread_index = item_id.get_local_id(0);
  const uint thread_warp = item_id.get_sub_group().get_local_id();

  const uint warp_index = item_id.get_sub_group().get_group_id();
  const uint num_warps = item_id.get_sub_group().get_group_range()[0];

  const uint state_index = item_id.get_global_id(0);

  /* Test if the state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#else /* !__KERNEL_ONEAPI__ */

#  ifndef __KERNEL_METAL__
template<uint blocksize, typename IsActiveOp>
__device__
#  endif
    void
    gpu_parallel_active_index_array_impl(const uint num_states,
                                         ccl_global int *indices,
                                         ccl_global int *num_indices,
#  ifdef __KERNEL_METAL__
                                         const uint is_active,
                                         const uint blocksize,
                                         const int thread_index,
                                         const uint state_index,
                                         const int ballot_mask,
                                         const int thread_warp,
                                         const int warp_index,
                                         const int num_warps,
                                         threadgroup int *warp_offset)
{
#  else /* !__KERNEL_METAL__ */
                                         IsActiveOp is_active_op)
{
  extern ccl_gpu_shared int warp_offset[];

  const uint thread_index = ccl_gpu_thread_idx_x;
  const uint thread_warp = thread_index % ccl_gpu_warp_size;

  const uint warp_index = thread_index / ccl_gpu_warp_size;
  const uint num_warps = blocksize / ccl_gpu_warp_size;

  const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;

  /* Test if the state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#  endif /* !__KERNEL_METAL__ */
  /* For each thread within a warp compute how many other active states precede it. */
#ifdef __KERNEL_ONEAPI__
  const uint thread_offset = sycl::exclusive_scan_over_group(
      item_id.get_sub_group(), is_active, std::plus<>());
#else
  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                      ccl_gpu_thread_mask(thread_warp));
#endif

  /* Last thread in warp stores number of active states for each warp. */
#ifdef __KERNEL_ONEAPI__
  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
#else
  if (thread_warp == ccl_gpu_warp_size - 1) {
#endif
    warp_offset[warp_index] = thread_offset + is_active;
  }

#ifdef __KERNEL_ONEAPI__
  /* NOTE: Only the local writes to `warp_offset` need to be visible here, so the cheaper
   * local barrier is sufficient. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Last thread in block converts per-warp sizes to offsets, increments global size of
   * index array and gets offset to write to. */
  if (thread_index == blocksize - 1) {
    /* Convert per-warp size into per-warp offset. */
    uint offset = 0;
    for (int i = 0; i < num_warps; i++) {
      int num_active = warp_offset[i];
      warp_offset[i] = offset;
      offset += num_active;
    }

    /* Reserve this block's range in the global index array. */
    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
  }

#ifdef __KERNEL_ONEAPI__
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Write to index array. */
  if (is_active) {
    const uint block_offset = warp_offset[num_warps];
    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
  }
}
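/* Worked example (illustrative numbers, not taken from the code above): with a warp size
 * of 32, suppose warp 2 of a block has active threads at lanes 3 and 7. Lane 7 computes
 * thread_offset = 1 (one active lane precedes it). After the per-block pass,
 * warp_offset[2] holds the number of active states in warps 0 and 1, and
 * warp_offset[num_warps] holds the block's starting position in the global array, so
 * lane 7 writes its state_index to indices[warp_offset[num_warps] + warp_offset[2] + 1]. */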
#ifdef __KERNEL_METAL__

#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                               is_active_op(ccl_gpu_global_id_x()) : \
                               0; \
    gpu_parallel_active_index_array_impl(num_states, \
                                         indices, \
                                         num_indices, \
                                         is_active, \
                                         metal_local_size, \
                                         metal_local_id, \
                                         metal_global_id, \
                                         simdgroup_size, \
                                         simd_lane_index, \
                                         simd_group_index, \
                                         num_simd_groups, \
                                         simdgroup_offset)

#elif defined(__KERNEL_ONEAPI__)
#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
#    define gpu_parallel_active_index_array( \
        blocksize, num_states, indices, num_indices, is_active_op) \
      if (ccl_gpu_global_size_x() == 1) \
        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
      else \
        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
#  else
#    define gpu_parallel_active_index_array( \
        blocksize, num_states, indices, num_indices, is_active_op) \
      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
#  endif
#else

#  define gpu_parallel_active_index_array( \
      blocksize, num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)

#endif
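/* Usage sketch: a GPU kernel invokes the macro with the block size, the state count, the
 * output arrays and a per-state predicate. The predicate wrapper names below are
 * illustrative assumptions (modeled on how the Cycles kernels wrap such predicates), not
 * declarations made by this header:
 *
 *   ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
 *                         int kernel_index);
 *   ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;
 *
 *   gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
 *                                   num_states, indices, num_indices,
 *                                   ccl_gpu_kernel_lambda_pass);
 *
 * On return, indices[0..*num_indices - 1] holds the state indices for which the predicate
 * evaluated to true (grouped by block, not globally sorted). */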