Blender  V3.3
metal/compat.h
Go to the documentation of this file.
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
4 #pragma once
5 
6 #define __KERNEL_GPU__
7 #define __KERNEL_METAL__
8 #define CCL_NAMESPACE_BEGIN
9 #define CCL_NAMESPACE_END
10 
11 #ifndef ATTR_FALLTHROUGH
12 # define ATTR_FALLTHROUGH
13 #endif
14 
15 #include <metal_atomic>
16 #include <metal_pack>
17 #include <metal_stdlib>
18 #include <simd/simd.h>
19 
20 using namespace metal;
21 
22 #ifdef __METALRT__
23 using namespace metal::raytracing;
24 #endif
25 
26 #pragma clang diagnostic ignored "-Wunused-variable"
27 #pragma clang diagnostic ignored "-Wsign-compare"
28 #pragma clang diagnostic ignored "-Wuninitialized"
29 
30 /* Qualifiers */
31 
32 #define ccl_device
33 #define ccl_device_inline ccl_device __attribute__((always_inline))
34 #define ccl_device_forceinline ccl_device __attribute__((always_inline))
35 #if defined(__KERNEL_METAL_APPLE__)
36 # define ccl_device_noinline ccl_device
37 #else
38 # define ccl_device_noinline ccl_device __attribute__((noinline))
39 #endif
40 
41 #define ccl_device_noinline_cpu ccl_device
42 #define ccl_device_inline_method ccl_device
43 #define ccl_global device
44 #define ccl_inline_constant static constant constexpr
45 #define ccl_device_constant constant
46 #define ccl_constant constant
47 #define ccl_gpu_shared threadgroup
48 #define ccl_private thread
49 #define ccl_may_alias
50 #define ccl_restrict __restrict
51 #define ccl_loop_no_unroll
52 #define ccl_align(n) alignas(n)
53 #define ccl_optional_struct_init
54 
55 /* No assert supported for Metal */
56 
57 #define kernel_assert(cond)
58 
59 #define ccl_gpu_global_id_x() metal_global_id
60 #define ccl_gpu_warp_size simdgroup_size
61 #define ccl_gpu_thread_idx_x simd_group_index
62 #define ccl_gpu_thread_mask(thread_warp) uint64_t((1ull << thread_warp) - 1)
63 
64 #define ccl_gpu_ballot(predicate) ((uint64_t)((simd_vote::vote_t)simd_ballot(predicate)))
65 #define ccl_gpu_syncthreads() threadgroup_barrier(mem_flags::mem_threadgroup);
66 
67 // clang-format off
68 
69 /* kernel.h adapters */
70 
71 #define ccl_gpu_kernel(block_num_threads, thread_num_registers)
72 #define ccl_gpu_kernel_threads(block_num_threads)
73 
74 /* Convert a comma-separated list into a semicolon-separated list
75  * (so that we can generate a struct based on kernel entry-point parameters). */
76 #define FN0()
77 #define FN1(p1) p1;
78 #define FN2(p1, p2) p1; p2;
79 #define FN3(p1, p2, p3) p1; p2; p3;
80 #define FN4(p1, p2, p3, p4) p1; p2; p3; p4;
81 #define FN5(p1, p2, p3, p4, p5) p1; p2; p3; p4; p5;
82 #define FN6(p1, p2, p3, p4, p5, p6) p1; p2; p3; p4; p5; p6;
83 #define FN7(p1, p2, p3, p4, p5, p6, p7) p1; p2; p3; p4; p5; p6; p7;
84 #define FN8(p1, p2, p3, p4, p5, p6, p7, p8) p1; p2; p3; p4; p5; p6; p7; p8;
85 #define FN9(p1, p2, p3, p4, p5, p6, p7, p8, p9) p1; p2; p3; p4; p5; p6; p7; p8; p9;
86 #define FN10(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10;
87 #define FN11(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11;
88 #define FN12(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12;
89 #define FN13(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13;
90 #define FN14(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14;
91 #define FN15(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15;
92 #define FN16(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16;
93 #define FN17(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17;
94 #define FN18(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18;
95 #define FN19(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19;
96 #define FN20(p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20) p1; p2; p3; p4; p5; p6; p7; p8; p9; p10; p11; p12; p13; p14; p15; p16; p17; p18; p19; p20;
97 #define GET_LAST_ARG(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, ...) p20
98 #define PARAMS_MAKER(...) GET_LAST_ARG(__VA_ARGS__, FN20, FN19, FN18, FN17, FN16, FN15, FN14, FN13, FN12, FN11, FN10, FN9, FN8, FN7, FN6, FN5, FN4, FN3, FN2, FN1, FN0)
99 
100 /* Generate a struct containing the entry-point parameters and a "run"
101  * method which can access them implicitly via this-> */
102 #define ccl_gpu_kernel_signature(name, ...) \
103 struct kernel_gpu_##name \
104 { \
105  PARAMS_MAKER(__VA_ARGS__)(__VA_ARGS__) \
106  void run(thread MetalKernelContext& context, \
107  threadgroup int *simdgroup_offset, \
108  const uint metal_global_id, \
109  const ushort metal_local_id, \
110  const ushort metal_local_size, \
111  uint simdgroup_size, \
112  uint simd_lane_index, \
113  uint simd_group_index, \
114  uint num_simd_groups) ccl_global const; \
115 }; \
116 kernel void cycles_metal_##name(device const kernel_gpu_##name *params_struct, \
117  constant KernelParamsMetal &ccl_restrict _launch_params_metal, \
118  constant MetalAncillaries *_metal_ancillaries, \
119  threadgroup int *simdgroup_offset[[ threadgroup(0) ]], \
120  const uint metal_global_id [[thread_position_in_grid]], \
121  const ushort metal_local_id [[thread_position_in_threadgroup]], \
122  const ushort metal_local_size [[threads_per_threadgroup]], \
123  uint simdgroup_size [[threads_per_simdgroup]], \
124  uint simd_lane_index [[thread_index_in_simdgroup]], \
125  uint simd_group_index [[simdgroup_index_in_threadgroup]], \
126  uint num_simd_groups [[simdgroups_per_threadgroup]]) { \
127  MetalKernelContext context(_launch_params_metal, _metal_ancillaries); \
128  params_struct->run(context, simdgroup_offset, metal_global_id, metal_local_id, metal_local_size, simdgroup_size, simd_lane_index, simd_group_index, num_simd_groups); \
129 } \
130 void kernel_gpu_##name::run(thread MetalKernelContext& context, \
131  threadgroup int *simdgroup_offset, \
132  const uint metal_global_id, \
133  const ushort metal_local_id, \
134  const ushort metal_local_size, \
135  uint simdgroup_size, \
136  uint simd_lane_index, \
137  uint simd_group_index, \
138  uint num_simd_groups) ccl_global const
139 
140 #define ccl_gpu_kernel_postfix
141 #define ccl_gpu_kernel_call(x) context.x
142 
143 /* define a function object where "func" is the lambda body, and additional parameters are used to specify captured state */
144 #define ccl_gpu_kernel_lambda(func, ...) \
145  struct KernelLambda \
146  { \
147  KernelLambda(ccl_private MetalKernelContext &_context) : context(_context) {} \
148  ccl_private MetalKernelContext &context; \
149  __VA_ARGS__; \
150  int operator()(const int state) const { return (func); } \
151  } ccl_gpu_kernel_lambda_pass(context)
152 
153 // clang-format on
154 
155 /* volumetric lambda functions - use function objects for lambda-like functionality */
156 #define VOLUME_READ_LAMBDA(function_call) \
157  struct FnObjectRead { \
158  KernelGlobals kg; \
159  ccl_private MetalKernelContext *context; \
160  int state; \
161 \
162  VolumeStack operator()(const int i) const \
163  { \
164  return context->function_call; \
165  } \
166  } volume_read_lambda_pass{kg, this, state};
167 
168 #define VOLUME_WRITE_LAMBDA(function_call) \
169  struct FnObjectWrite { \
170  KernelGlobals kg; \
171  ccl_private MetalKernelContext *context; \
172  int state; \
173 \
174  void operator()(const int i, VolumeStack entry) const \
175  { \
176  context->function_call; \
177  } \
178  } volume_write_lambda_pass{kg, this, state};
179 
180 /* make_type definitions with Metal style element initializers */
181 #ifdef make_float2
182 # undef make_float2
183 #endif
184 #ifdef make_float3
185 # undef make_float3
186 #endif
187 #ifdef make_float4
188 # undef make_float4
189 #endif
190 #ifdef make_int2
191 # undef make_int2
192 #endif
193 #ifdef make_int3
194 # undef make_int3
195 #endif
196 #ifdef make_int4
197 # undef make_int4
198 #endif
199 #ifdef make_uchar4
200 # undef make_uchar4
201 #endif
202 
203 #define make_float2(x, y) float2(x, y)
204 #define make_float3(x, y, z) float3(x, y, z)
205 #define make_float4(x, y, z, w) float4(x, y, z, w)
206 #define make_int2(x, y) int2(x, y)
207 #define make_int3(x, y, z) int3(x, y, z)
208 #define make_int4(x, y, z, w) int4(x, y, z, w)
209 #define make_uchar4(x, y, z, w) uchar4(x, y, z, w)
210 
211 /* Math functions */
212 
213 #define __uint_as_float(x) as_type<float>(x)
214 #define __float_as_uint(x) as_type<uint>(x)
215 #define __int_as_float(x) as_type<float>(x)
216 #define __float_as_int(x) as_type<int>(x)
217 #define __float2half(x) half(x)
218 #define powf(x, y) pow(float(x), float(y))
219 #define fabsf(x) fabs(float(x))
220 #define copysignf(x, y) copysign(float(x), float(y))
221 #define asinf(x) asin(float(x))
222 #define acosf(x) acos(float(x))
223 #define atanf(x) atan(float(x))
224 #define floorf(x) floor(float(x))
225 #define ceilf(x) ceil(float(x))
226 #define hypotf(x, y) hypot(float(x), float(y))
227 #define atan2f(x, y) atan2(float(x), float(y))
228 #define fmaxf(x, y) fmax(float(x), float(y))
229 #define fminf(x, y) fmin(float(x), float(y))
230 #define fmodf(x, y) fmod(float(x), float(y))
231 #define sinhf(x) sinh(float(x))
232 #define coshf(x) cosh(float(x))
233 #define tanhf(x) tanh(float(x))
234 #define saturatef(x) saturate(float(x))
235 
236 /* Use native functions with possibly lower precision for performance,
237  * no issues found so far. */
238 #define trigmode fast
239 #define sinf(x) trigmode::sin(float(x))
240 #define cosf(x) trigmode::cos(float(x))
241 #define tanf(x) trigmode::tan(float(x))
242 #define expf(x) trigmode::exp(float(x))
243 #define sqrtf(x) trigmode::sqrt(float(x))
244 #define logf(x) trigmode::log(float(x))
245 
246 #define NULL 0
247 
248 #define __device__
249 
250 #ifdef __METALRT__
251 
252 # if defined(__METALRT_MOTION__)
253 # define METALRT_TAGS instancing, instance_motion, primitive_motion
254 # else
255 # define METALRT_TAGS instancing
256 # endif /* __METALRT_MOTION__ */
257 
258 typedef acceleration_structure<METALRT_TAGS> metalrt_as_type;
259 typedef intersection_function_table<triangle_data, METALRT_TAGS> metalrt_ift_type;
260 typedef metal::raytracing::intersector<triangle_data, METALRT_TAGS> metalrt_intersector_type;
261 
262 #endif /* __METALRT__ */
263 
264 /* texture bindings and sampler setup */
265 
267  texture2d<float, access::sample> tex;
268 };
270  texture3d<float, access::sample> tex;
271 };
272 
276 
277 #ifdef __METALRT__
278  metalrt_as_type accel_struct;
279  metalrt_ift_type ift_default;
280  metalrt_ift_type ift_shadow;
281  metalrt_ift_type ift_local;
282 #endif
283 };
284 
285 #include "util/half.h"
286 #include "util/types.h"
287 
292 
296 
298 };
299 
300 constant constexpr array<sampler, SamplerCount> metal_samplers = {
301  sampler(address::repeat, filter::nearest),
302  sampler(address::clamp_to_edge, filter::nearest),
303  sampler(address::clamp_to_zero, filter::nearest),
304  sampler(address::repeat, filter::linear),
305  sampler(address::clamp_to_edge, filter::linear),
306  sampler(address::clamp_to_zero, filter::linear),
307 };
depth_tx sampler(1, ImageType::FLOAT_2D, "combined_tx") .sampler(2
constexpr constant array< sampler, SamplerCount > metal_samplers
Definition: metal/compat.h:300
SamplerType
Definition: metal/compat.h:288
@ SamplerFilterNearest_AddressRepeat
Definition: metal/compat.h:289
@ SamplerFilterLinear_AddressClampEdge
Definition: metal/compat.h:294
@ SamplerFilterNearest_AddressClampZero
Definition: metal/compat.h:291
@ SamplerFilterLinear_AddressClampZero
Definition: metal/compat.h:295
@ SamplerFilterNearest_AddressClampEdge
Definition: metal/compat.h:290
@ SamplerFilterLinear_AddressRepeat
Definition: metal/compat.h:293
@ SamplerCount
Definition: metal/compat.h:297
device Texture3DParamsMetal * textures_3d
Definition: metal/compat.h:275
device Texture2DParamsMetal * textures_2d
Definition: metal/compat.h:274
texture2d< float, access::sample > tex
Definition: metal/compat.h:267
texture3d< float, access::sample > tex
Definition: metal/compat.h:270