Blender  V3.3
kernel/device/cuda/config.h
Go to the documentation of this file.
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
4 /* Device data taken from CUDA occupancy calculator.
5  *
6  * Terminology
7  * - CUDA GPUs have multiple streaming multiprocessors
8  * - Each multiprocessor executes multiple thread blocks
9  * - Each thread block contains a number of threads, also known as the block size
10  * - Multiprocessors have a fixed number of registers, and the amount of registers
11  * used by each threads limits the number of threads per block.
12  */
13 
14 /* 3.0 and 3.5 */
15 #if __CUDA_ARCH__ == 300 || __CUDA_ARCH__ == 350
16 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
17 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
18 # define GPU_BLOCK_MAX_THREADS 1024
19 # define GPU_THREAD_MAX_REGISTERS 63
20 
21 /* tunable parameters */
22 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
23 # define GPU_KERNEL_MAX_REGISTERS 63
24 
25 /* 3.2 */
26 #elif __CUDA_ARCH__ == 320
27 # define GPU_MULTIPRESSOR_MAX_REGISTERS 32768
28 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
29 # define GPU_BLOCK_MAX_THREADS 1024
30 # define GPU_THREAD_MAX_REGISTERS 63
31 
32 /* tunable parameters */
33 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
34 # define GPU_KERNEL_MAX_REGISTERS 63
35 
36 /* 3.7 */
37 #elif __CUDA_ARCH__ == 370
38 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
39 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 16
40 # define GPU_BLOCK_MAX_THREADS 1024
41 # define GPU_THREAD_MAX_REGISTERS 255
42 
43 /* tunable parameters */
44 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
45 # define GPU_KERNEL_MAX_REGISTERS 63
46 
47 /* 5.x, 6.x */
48 #elif __CUDA_ARCH__ <= 699
49 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
50 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
51 # define GPU_BLOCK_MAX_THREADS 1024
52 # define GPU_THREAD_MAX_REGISTERS 255
53 
54 /* tunable parameters */
55 # define GPU_KERNEL_BLOCK_NUM_THREADS 256
56 /* CUDA 9.0 seems to cause slowdowns on high-end Pascal cards unless we increase the number of
57  * registers */
58 # if __CUDACC_VER_MAJOR__ >= 9 && __CUDA_ARCH__ >= 600
59 # define GPU_KERNEL_MAX_REGISTERS 64
60 # else
61 # define GPU_KERNEL_MAX_REGISTERS 48
62 # endif
63 
64 /* 7.x, 8.x */
65 #elif __CUDA_ARCH__ <= 899
66 # define GPU_MULTIPRESSOR_MAX_REGISTERS 65536
67 # define GPU_MULTIPROCESSOR_MAX_BLOCKS 32
68 # define GPU_BLOCK_MAX_THREADS 1024
69 # define GPU_THREAD_MAX_REGISTERS 255
70 
71 /* tunable parameters */
72 # define GPU_KERNEL_BLOCK_NUM_THREADS 512
73 # define GPU_KERNEL_MAX_REGISTERS 96
74 
75 /* unknown architecture */
76 #else
77 # error "Unknown or unsupported CUDA architecture, can't determine launch bounds"
78 #endif
79 
80 /* Compute number of threads per block and minimum blocks per multiprocessor
81  * given the maximum number of registers per thread. */
82 #define ccl_gpu_kernel(block_num_threads, thread_num_registers) \
83  extern "C" __global__ void __launch_bounds__(block_num_threads, \
84  GPU_MULTIPRESSOR_MAX_REGISTERS / \
85  (block_num_threads * thread_num_registers))
86 
87 #define ccl_gpu_kernel_threads(block_num_threads) \
88  extern "C" __global__ void __launch_bounds__(block_num_threads)
89 
90 #define ccl_gpu_kernel_signature(name, ...) kernel_gpu_##name(__VA_ARGS__)
91 #define ccl_gpu_kernel_postfix
92 
93 #define ccl_gpu_kernel_call(x) x
94 
95 /* Define a function object where "func" is the lambda body, and additional parameters are used to
96  * specify captured state */
97 #define ccl_gpu_kernel_lambda(func, ...) \
98  struct KernelLambda { \
99  __VA_ARGS__; \
100  __device__ int operator()(const int state) \
101  { \
102  return (func); \
103  } \
104  } ccl_gpu_kernel_lambda_pass
105 
106 /* sanity checks */
107 
108 #if GPU_KERNEL_BLOCK_NUM_THREADS > GPU_BLOCK_MAX_THREADS
109 # error "Maximum number of threads per block exceeded"
110 #endif
111 
112 #if GPU_MULTIPRESSOR_MAX_REGISTERS / (GPU_KERNEL_BLOCK_NUM_THREADS * GPU_KERNEL_MAX_REGISTERS) > \
113  GPU_MULTIPROCESSOR_MAX_BLOCKS
114 # error "Maximum number of blocks per multiprocessor exceeded"
115 #endif
116 
117 #if GPU_KERNEL_MAX_REGISTERS > GPU_THREAD_MAX_REGISTERS
118 # error "Maximum number of registers per thread exceeded"
119 #endif