parallel_active_index.h
/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2021-2022 Blender Foundation */

#pragma once

CCL_NAMESPACE_BEGIN

/* Given an array of states, build an array of indices for which the states
 * are active.
 *
 * Shared memory requirement is `sizeof(int) * (number_of_warps + 1)`. */
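/* For example (illustrative, not part of the original file): assuming a 32-wide warp and
 * the default 512-thread block defined below, a block has 16 warps and therefore needs
 * 17 ints (68 bytes) of shared memory. */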

#include "util/atomic.h"

#ifdef __HIP__
#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 1024
#else
#  define GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE 512
#endif

/* TODO: abstract more device differences, define ccl_gpu_local_syncthreads,
 * ccl_gpu_thread_warp, ccl_gpu_warp_index, ccl_gpu_num_warps for all devices
 * and keep device specific code in compat.h */

#ifdef __KERNEL_ONEAPI__
#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
template<typename IsActiveOp>
void cpu_serial_active_index_array_impl(const uint num_states,
                                        ccl_global int *indices,
                                        ccl_global int *num_indices,
                                        IsActiveOp is_active_op)
{
  int write_index = 0;
  for (int state_index = 0; state_index < num_states; state_index++) {
    if (is_active_op(state_index))
      indices[write_index++] = state_index;
  }
  *num_indices = write_index;
  return;
}
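/* Illustration of the contract (not part of the original file): for 6 states whose
 * active flags are {1, 0, 1, 1, 0, 0}, this serial version writes indices = {0, 2, 3}
 * and sets *num_indices = 3. The GPU variants below produce the same set of indices,
 * but the ordering of indices from different blocks depends on the offset each block
 * reserves atomically. */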
#  endif /* WITH_ONEAPI_SYCL_HOST_ENABLED */

template<typename IsActiveOp>
void gpu_parallel_active_index_array_impl(const uint num_states,
                                          ccl_global int *indices,
                                          ccl_global int *num_indices,
                                          IsActiveOp is_active_op)
{
  const sycl::nd_item<1> &item_id = sycl::ext::oneapi::experimental::this_nd_item<1>();
  const uint blocksize = item_id.get_local_range(0);

  sycl::multi_ptr<int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1],
                  sycl::access::address_space::local_space>
      ptr = sycl::ext::oneapi::group_local_memory<
          int[GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE + 1]>(item_id.get_group());
  int *warp_offset = *ptr;

  /* NOTE(@nsirgien): Here we calculate the same values as below, but faster for DPC++:
   * CUDA already seems to convert the "%", "/" and "*" based calculations below into
   * something faster, but DPC++ does not, so it is better to request the needed
   * parameters directly. Switching from this computation to the one below causes a
   * 2.5x performance slowdown. */
  const uint thread_index = item_id.get_local_id(0);
  const uint thread_warp = item_id.get_sub_group().get_local_id();

  const uint warp_index = item_id.get_sub_group().get_group_id();
  const uint num_warps = item_id.get_sub_group().get_group_range()[0];

  const uint state_index = item_id.get_global_id(0);

  /* Test if state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#else /* !__KERNEL_ONEAPI__ */
#  ifndef __KERNEL_METAL__
template<uint blocksize, typename IsActiveOp>
__device__
#  endif
    void
    gpu_parallel_active_index_array_impl(const uint num_states,
                                         ccl_global int *indices,
                                         ccl_global int *num_indices,
#  ifdef __KERNEL_METAL__
                                         const uint is_active,
                                         const uint blocksize,
                                         const int thread_index,
                                         const uint state_index,
                                         const int ccl_gpu_warp_size,
                                         const int thread_warp,
                                         const int warp_index,
                                         const int num_warps,
                                         threadgroup int *warp_offset)
{
#  else
                                         IsActiveOp is_active_op)
{
  extern ccl_gpu_shared int warp_offset[];

  const uint thread_index = ccl_gpu_thread_idx_x;
  const uint thread_warp = thread_index % ccl_gpu_warp_size;

  const uint warp_index = thread_index / ccl_gpu_warp_size;
  const uint num_warps = blocksize / ccl_gpu_warp_size;

  const uint state_index = ccl_gpu_block_idx_x * blocksize + thread_index;

  /* Test if state corresponding to this thread is active. */
  const uint is_active = (state_index < num_states) ? is_active_op(state_index) : 0;
#  endif
#endif /* !__KERNEL_ONEAPI__ */
  /* For each thread within a warp compute how many other active states precede it. */
#ifdef __KERNEL_ONEAPI__
  const uint thread_offset = sycl::exclusive_scan_over_group(
      item_id.get_sub_group(), is_active, std::plus<>());
#else
  const uint thread_offset = popcount(ccl_gpu_ballot(is_active) &
                                      ccl_gpu_thread_mask(thread_warp));
#endif
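  /* Worked example (illustrative, assuming ccl_gpu_thread_mask(n) covers the n lanes
   * below the current one): if only lanes 0, 2 and 5 of a warp are active, the ballot
   * is 0b100101; lane 5 masks this to 0b000101 and popcount gives thread_offset = 2,
   * i.e. two active states precede it in the warp. The SYCL exclusive scan above
   * computes the same per-lane value. */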

  /* Last thread in warp stores number of active states for each warp. */
#ifdef __KERNEL_ONEAPI__
  if (thread_warp == item_id.get_sub_group().get_local_range()[0] - 1) {
#else
  if (thread_warp == ccl_gpu_warp_size - 1) {
#endif
    warp_offset[warp_index] = thread_offset + is_active;
  }

#ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): Here only the local memory write (warp_offset) matters,
   * so the faster local barrier can be used. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Last thread in block converts per-warp sizes to offsets, increments global size of
   * index array and gets offset to write to. */
  if (thread_index == blocksize - 1) {
    /* TODO: parallelize this. */
    int offset = 0;
    for (int i = 0; i < num_warps; i++) {
      int num_active = warp_offset[i];
      warp_offset[i] = offset;
      offset += num_active;
    }

    const uint block_num_active = warp_offset[warp_index] + thread_offset + is_active;
    warp_offset[num_warps] = atomic_fetch_and_add_uint32(num_indices, block_num_active);
  }
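  /* Worked example (illustrative): if the four warps of a block reported 3, 1, 0 and 2
   * active states, warp_offset becomes {0, 3, 4, 4} and the block reserves 6 slots in
   * the global array via the atomic add. warp_offset[num_warps] then holds the block's
   * base offset, and each active thread writes to base + warp base + its in-warp offset
   * in the final step below. */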

#ifdef __KERNEL_ONEAPI__
  /* NOTE(@nsirgien): Here only the local memory write (warp_offset) matters,
   * so the faster local barrier can be used. */
  ccl_gpu_local_syncthreads();
#else
  ccl_gpu_syncthreads();
#endif

  /* Write to index array. */
  if (is_active) {
    const uint block_offset = warp_offset[num_warps];
    indices[block_offset + warp_offset[warp_index] + thread_offset] = state_index;
  }
}

#ifdef __KERNEL_METAL__

#  define gpu_parallel_active_index_array(dummy, num_states, indices, num_indices, is_active_op) \
    const uint is_active = (ccl_gpu_global_id_x() < num_states) ? \
                               is_active_op(ccl_gpu_global_id_x()) : \
                               0; \
    gpu_parallel_active_index_array_impl(num_states, \
                                         indices, \
                                         num_indices, \
                                         is_active, \
                                         metal_local_size, \
                                         metal_local_id, \
                                         metal_global_id, \
                                         simdgroup_size, \
                                         simd_lane_index, \
                                         simd_group_index, \
                                         num_simd_groups, \
                                         simdgroup_offset)
#elif defined(__KERNEL_ONEAPI__)
#  ifdef WITH_ONEAPI_SYCL_HOST_ENABLED
#    define gpu_parallel_active_index_array( \
        blocksize, num_states, indices, num_indices, is_active_op) \
      if (ccl_gpu_global_size_x() == 1) \
        cpu_serial_active_index_array_impl(num_states, indices, num_indices, is_active_op); \
      else \
        gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op);
#  else
#    define gpu_parallel_active_index_array( \
        blocksize, num_states, indices, num_indices, is_active_op) \
      gpu_parallel_active_index_array_impl(num_states, indices, num_indices, is_active_op)
#  endif
#else

#  define gpu_parallel_active_index_array( \
      blocksize, num_states, indices, num_indices, is_active_op) \
    gpu_parallel_active_index_array_impl<blocksize>(num_states, indices, num_indices, is_active_op)

#endif

CCL_NAMESPACE_END
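/* Usage sketch (illustrative, not part of this file): a kernel passes the block size,
 * the number of states, the output buffers and a predicate. `is_state_active()` below is
 * a hypothetical is_active_op; real callers typically pass a lambda over the integrator
 * state.
 *
 *   gpu_parallel_active_index_array(GPU_PARALLEL_ACTIVE_INDEX_DEFAULT_BLOCK_SIZE,
 *                                   num_states,
 *                                   indices,
 *                                   num_indices,
 *                                   [](const uint state) { return is_state_active(state); });
 *
 * On Metal the first argument is unused (`dummy`); on CUDA/HIP it instantiates the
 * blocksize template parameter, which is assumed to match the actual launch block size
 * since num_warps and state_index are derived from it. */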