/* SPDX-License-Identifier: Apache-2.0
 * Copyright 2011-2022 Blender Foundation */

/* Common GPU kernels. */

#include "kernel/sample/lcg.h"

/* Include constant tables before entering Metal's context class scope (context_begin.h) */
#include "kernel/tables.h"

#ifdef __KERNEL_METAL__
#elif defined(__KERNEL_ONEAPI__)
#endif

#include "kernel/bake/bake.h"

#ifdef __KERNEL_METAL__
#elif defined(__KERNEL_ONEAPI__)
#endif

#include "kernel/film/read.h"

/* --------------------------------------------------------------------
 * Integrator.
 */

    ccl_gpu_kernel_signature(integrator_reset, int num_states)
{
  const int state = ccl_gpu_global_id_x();

  if (state < num_states) {
    INTEGRATOR_STATE_WRITE(state, path, queued_kernel) = 0;
    INTEGRATOR_STATE_WRITE(state, shadow_path, queued_kernel) = 0;
  }
}
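
/* Every kernel in this file maps one GPU thread to one element (a path state, pixel or
 * work item) and guards against the tail of the launch, as integrator_reset does above,
 * because the host rounds the thread count up to a whole number of blocks. A minimal
 * sketch of that rounding, assuming a block size such as GPU_KERNEL_BLOCK_NUM_THREADS
 * (illustrative only; the real launch logic lives in the per-device queue code): */
static inline int gpu_num_blocks_sketch(const int num_elements, const int block_size)
{
  /* Round up so that num_blocks * block_size >= num_elements; the surplus threads fail
   * the bounds check (e.g. state < num_states) and exit immediately. */
  return (num_elements + block_size - 1) / block_size;
}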

                             const int num_tiles,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

    return;
  }

    return;
  }

}

                             const int num_tiles,
                             ccl_global float *render_buffer,
                             const int max_tile_work_size)
{
  const int work_index = ccl_gpu_global_id_x();

    return;
  }

  if (tile_work_index >= tile->work_size) {
    return;
  }

  uint x, y, sample;

}

                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
constant int __dummy_constant [[function_constant(Kernel_DummyConstant)]];
#endif

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;

#if defined(__KERNEL_METAL_APPLE__) && defined(__METALRT__)
    KernelGlobals kg = NULL;
    /* Workaround Ambient Occlusion and Bevel nodes not working with Metal.
     * Dummy offset should not affect result, but somehow fixes bug! */
    kg += __dummy_constant;
#else
#endif
  }
}

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

                             ccl_global const int *path_index_array,
                             ccl_global float *render_buffer,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int state = (path_index_array) ? path_index_array[global_index] : global_index;
  }
}

    ccl_gpu_kernel_signature(integrator_queued_paths_array,
                             int num_states,
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

                                  num_states,
                                  indices,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_queued_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == kernel_index,
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

                                  num_states,
                                  indices,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_active_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) != 0);

                                  num_states,
                                  indices,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_terminated_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, path, queued_kernel) == 0);

                                  num_states,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_terminated_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int indices_offset)
{
  ccl_gpu_kernel_lambda(INTEGRATOR_STATE(state, shadow_path, queued_kernel) == 0);

                                  num_states,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_sorted_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int kernel_index)
{
  ccl_gpu_kernel_lambda((INTEGRATOR_STATE(state, path, queued_kernel) == kernel_index) ?
                            INTEGRATOR_STATE(state, path, shader_sort_key) :
                        int kernel_index);
  ccl_gpu_kernel_lambda_pass.kernel_index = kernel_index;

                                  num_states,
                                  indices,
                                  num_indices,
                                  key_counter,
                                  ccl_gpu_kernel_lambda_pass);
}
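
/* gpu_parallel_sorted_index_array groups the queued path states by their shader sort key
 * so that neighbouring threads shade the same material. A serial sketch of that bucketing,
 * assuming key_prefix_sum already holds the exclusive prefix sum of the per-key counters
 * (illustrative only; the function and variable names below are placeholders, not the
 * real helpers): */
static inline void sorted_index_sketch(const int *state_keys, /* key per state, -1 if inactive */
                                       const int num_states,
                                       const int *key_prefix_sum, /* first slot of each bucket */
                                       int *key_fill,             /* scratch, zero-initialized */
                                       int *indices)              /* output: states grouped by key */
{
  for (int state = 0; state < num_states; state++) {
    const int key = state_keys[state];
    if (key < 0) {
      continue; /* Skip states not queued for this kernel. */
    }
    /* Scatter the state into its key's bucket. */
    indices[key_prefix_sum[key] + key_fill[key]++] = state;
  }
}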

    ccl_gpu_kernel_signature(integrator_compact_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
{
                        (INTEGRATOR_STATE(state, path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

                                  num_states,
                                  indices,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_compact_states,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_state_move(NULL, to_state, from_state));
  }
}
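
/* integrator_compact_states pairs still-active states with slots freed by terminated
 * states so that the active set ends up densely packed at the front of the state array.
 * A serial sketch of that pairing, assuming a move callback that copies one state's SoA
 * data (illustrative only): */
static inline void compact_states_sketch(const int *active_terminated_states,
                                         const int active_states_offset,
                                         const int terminated_states_offset,
                                         const int work_size,
                                         void (*state_move)(int to_state, int from_state))
{
  for (int i = 0; i < work_size; i++) {
    /* Index of an active state that should move, and of a free slot to move it into. */
    const int from_state = active_terminated_states[active_states_offset + i];
    const int to_state = active_terminated_states[terminated_states_offset + i];
    state_move(to_state, from_state);
  }
}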

    ccl_gpu_kernel_signature(integrator_compact_shadow_paths_array,
                             int num_states,
                             ccl_global int *indices,
                             ccl_global int *num_indices,
                             int num_active_paths)
{
                        (INTEGRATOR_STATE(state, shadow_path, queued_kernel) != 0),
                        int num_active_paths);
  ccl_gpu_kernel_lambda_pass.num_active_paths = num_active_paths;

                                  num_states,
                                  indices,
                                  num_indices,
                                  ccl_gpu_kernel_lambda_pass);
}

    ccl_gpu_kernel_signature(integrator_compact_shadow_states,
                             const int active_states_offset,
                             const int terminated_states_offset,
                             const int work_size)
{
  const int global_index = ccl_gpu_global_id_x();

  if (global_index < work_size) {
    const int from_state = active_terminated_states[active_states_offset + global_index];
    const int to_state = active_terminated_states[terminated_states_offset + global_index];

    ccl_gpu_kernel_call(integrator_shadow_state_move(NULL, to_state, from_state));
  }
}

{
}
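
/* The path sorting above relies on an exclusive prefix sum over the per-key counters to
 * compute where each key's bucket starts. A serial sketch of what gpu_parallel_prefix_sum
 * computes, assuming the counters are also cleared for the next use (illustrative only): */
static inline void prefix_sum_sketch(int *counter, int *prefix_sum, const int num_values)
{
  int offset = 0;
  for (int i = 0; i < num_values; i++) {
    prefix_sum[i] = offset; /* Start offset of bucket i. */
    offset += counter[i];
    counter[i] = 0; /* Assumed reset so the counter can be reused. */
  }
}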

/* --------------------------------------------------------------------
 * Adaptive sampling.
 */

                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             float threshold,
                             bool reset,
                             int offset,
                             int stride,
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / sw;
  const int x = work_index - y * sw;

  bool converged = true;

  if (x < sw && y < sh) {
        nullptr, render_buffer, sx + x, sy + y, threshold, reset, offset, stride));
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  if (lane_id == 0) {
  }
}
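
/* The per-warp counting in the kernel above follows a common pattern: every lane evaluates
 * the ballot (hence the NOTE), and only lane 0 publishes the number of set bits with a
 * single atomic. A minimal sketch of that pattern in terms of the portable helpers used
 * throughout these kernels (illustrative only, not the kernel's own lines): */
ccl_device_inline void count_active_pixels_sketch(const bool converged,
                                                  ccl_global uint *num_active_pixels)
{
  /* One bit per lane in the warp, set when the lane's pixel has not converged yet. */
  const uint active_mask = ccl_gpu_ballot(!converged);
  const int lane_id = ccl_gpu_thread_idx_x % ccl_gpu_warp_size;
  if (lane_id == 0) {
    /* One atomic per warp instead of one per thread. */
    atomic_fetch_and_add_uint32(num_active_pixels, popcount(active_mask));
  }
}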

                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             int offset,
                             int stride)
{
  const int y = ccl_gpu_global_id_x();

  if (y < sh) {
  }
}

                             ccl_global float *render_buffer,
                             int sx,
                             int sy,
                             int sw,
                             int sh,
                             int offset,
                             int stride)
{
  const int x = ccl_gpu_global_id_x();

  if (x < sw) {
  }
}

/* --------------------------------------------------------------------
 * Cryptomatte.
 */

                             ccl_global float *render_buffer,
{
  const int pixel_index = ccl_gpu_global_id_x();

  if (pixel_index < num_pixels) {
  }
}

/* --------------------------------------------------------------------
 * Film.
 */

ccl_device_inline void kernel_gpu_film_convert_half_write(ccl_global uchar4 *rgba,
                                                          const int rgba_offset,
                                                          const int rgba_stride,
                                                          const int x,
                                                          const int y,
                                                          const half4 half_pixel)
{
  /* Work around HIP issue with half float display, see T92972. */
#ifdef __KERNEL_HIP__
  ccl_global half *out = ((ccl_global half *)rgba) + (rgba_offset + y * rgba_stride + x) * 4;
  out[0] = half_pixel.x;
  out[1] = half_pixel.y;
  out[2] = half_pixel.z;
  out[3] = half_pixel.w;
#else
  ccl_global half4 *out = ((ccl_global half4 *)rgba) + rgba_offset + y * rgba_stride + x;
  *out = half_pixel;
#endif
}

#ifdef __KERNEL_METAL__

/* Fetch into a local variable on Metal - there is minimal overhead. Templating the
 * film_get_pass_pixel_... functions works on MSL, but not on other compilers. */
#  define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    float local_pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, local_pixel); \
    if (input_channel_count >= 1) { \
      pixel[0] = local_pixel[0]; \
    } \
    if (input_channel_count >= 2) { \
      pixel[1] = local_pixel[1]; \
    } \
    if (input_channel_count >= 3) { \
      pixel[2] = local_pixel[2]; \
    } \
    if (input_channel_count >= 4) { \
      pixel[3] = local_pixel[3]; \
    }

#else

#  define FILM_GET_PASS_PIXEL_F32(variant, input_channel_count) \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel);

#endif

#define KERNEL_FILM_CONVERT_VARIANT(variant, input_channel_count) \
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global float *pixels, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    ccl_global float *pixel = pixels + \
                              (render_pixel_index + rgba_offset) * kfilm_convert.pixel_stride; \
\
    FILM_GET_PASS_PIXEL_F32(variant, input_channel_count); \
  } \
  ccl_gpu_kernel_postfix \
\
  ccl_gpu_kernel(GPU_KERNEL_BLOCK_NUM_THREADS, GPU_KERNEL_MAX_REGISTERS) \
      ccl_gpu_kernel_signature(film_convert_##variant##_half_rgba, \
                               const KernelFilmConvert kfilm_convert, \
                               ccl_global uchar4 *rgba, \
                               ccl_global float *render_buffer, \
                               int num_pixels, \
                               int width, \
                               int offset, \
                               int stride, \
                               int rgba_offset, \
                               int rgba_stride) \
  { \
    const int render_pixel_index = ccl_gpu_global_id_x(); \
    if (render_pixel_index >= num_pixels) { \
      return; \
    } \
\
    const int x = render_pixel_index % width; \
    const int y = render_pixel_index / width; \
\
    const uint64_t buffer_pixel_index = x + y * stride; \
    ccl_global const float *buffer = render_buffer + offset + \
                                     buffer_pixel_index * kfilm_convert.pass_stride; \
\
    float pixel[4]; \
    film_get_pass_pixel_##variant(&kfilm_convert, buffer, pixel); \
\
    if (input_channel_count == 1) { \
      pixel[1] = pixel[2] = pixel[0]; \
    } \
    if (input_channel_count <= 3) { \
      pixel[3] = 1.0f; \
    } \
\
    film_apply_pass_pixel_overlays_rgba(&kfilm_convert, buffer, pixel); \
\
    const half4 half_pixel = float4_to_half4_display( \
        make_float4(pixel[0], pixel[1], pixel[2], pixel[3])); \
    kernel_gpu_film_convert_half_write(rgba, rgba_offset, rgba_stride, x, y, half_pixel); \
  } \
  ccl_gpu_kernel_postfix

/* 1 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(sample_count, 1)

/* 3 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(light_path, 3)

/* 4 channel inputs */
KERNEL_FILM_CONVERT_VARIANT(cryptomatte, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher, 4)
KERNEL_FILM_CONVERT_VARIANT(shadow_catcher_matte_with_shadow, 4)
KERNEL_FILM_CONVERT_VARIANT(combined, 4)

#undef KERNEL_FILM_CONVERT_VARIANT
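
/* Both kernels generated by KERNEL_FILM_CONVERT_VARIANT share the same render-buffer
 * addressing: the linear pixel index splits into (x, y) and the input pass is read at
 * offset + (x + y * stride) * pass_stride. The float variant then writes to
 * pixels[(render_pixel_index + rgba_offset) * pixel_stride], while the half variant goes
 * through kernel_gpu_film_convert_half_write(). A minimal sketch of the shared input
 * addressing, assuming the field names of KernelFilmConvert (illustrative only): */
static inline const float *film_convert_input_pixel_sketch(const float *render_buffer,
                                                           const int render_pixel_index,
                                                           const int width,
                                                           const int offset,
                                                           const int stride,
                                                           const int pass_stride)
{
  const int x = render_pixel_index % width;
  const int y = render_pixel_index / width;
  const unsigned long long buffer_pixel_index = x + (unsigned long long)y * stride;
  return render_buffer + offset + buffer_pixel_index * pass_stride;
}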

/* --------------------------------------------------------------------
 * Shader evaluation.
 */

/* Displacement */

                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
  }
}

/* Background */

                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
  }
}

/* Curve Shadow Transparency */

                             ccl_global float *output,
                             const int offset,
                             const int work_size)
{
  int i = ccl_gpu_global_id_x();
  if (i < work_size) {
  }
}

/* --------------------------------------------------------------------
 * Denoising.
 */

    ccl_gpu_kernel_signature(filter_color_preprocess,
                             ccl_global float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int offset,
                             int stride,
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  color_out[0] = clamp(color_out[0], 0.0f, 10000.0f);
  color_out[1] = clamp(color_out[1], 0.0f, 10000.0f);
  color_out[2] = clamp(color_out[2], 0.0f, 10000.0f);
}

    ccl_gpu_kernel_signature(filter_guiding_preprocess,
                             ccl_global const float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  float pixel_scale;
    pixel_scale = 1.0f / num_samples;
  }
  else {
  }

  /* Albedo pass. */

    ccl_global const float *albedo_in = buffer + render_pass_denoising_albedo;

    albedo_out[0] = albedo_in[0] * pixel_scale;
    albedo_out[1] = albedo_in[1] * pixel_scale;
    albedo_out[2] = albedo_in[2] * pixel_scale;
  }

  /* Normal pass. */

    ccl_global const float *normal_in = buffer + render_pass_denoising_normal;
    ccl_global float *normal_out = guiding_pixel + guiding_pass_normal;

    normal_out[0] = normal_in[0] * pixel_scale;
    normal_out[1] = normal_in[1] * pixel_scale;
    normal_out[2] = normal_in[2] * pixel_scale;
  }

  /* Flow pass. */

    ccl_global const float *motion_in = buffer + render_pass_motion;
    ccl_global float *flow_out = guiding_pixel + guiding_pass_flow;

    flow_out[0] = -motion_in[0] * pixel_scale;
    flow_out[1] = -motion_in[1] * pixel_scale;
  }
}

    ccl_gpu_kernel_signature(filter_guiding_set_fake_albedo,
                             ccl_global float *guiding_buffer,
                             int width,
                             int height)
{

  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t guiding_pixel_index = x + y * width;

  albedo_out[0] = 0.5f;
  albedo_out[1] = 0.5f;
  albedo_out[2] = 0.5f;
}

    ccl_gpu_kernel_signature(filter_color_postprocess,
                             ccl_global float *render_buffer,
                             int full_x,
                             int full_y,
                             int width,
                             int height,
                             int offset,
                             int stride,
                             int pass_stride,
                             int num_samples,
                             int pass_denoised,
{
  const int work_index = ccl_gpu_global_id_x();
  const int y = work_index / width;
  const int x = work_index - y * width;

  if (x >= width || y >= height) {
    return;
  }

  const uint64_t render_pixel_index = offset + (x + full_x) + (y + full_y) * stride;

  float pixel_scale;
  }
  else {
  }

  if (num_components == 3) {
    /* Pass without alpha channel. */
  }
  else if (!use_compositing) {
    /* Currently compositing passes are either 3-component (derived by dividing light passes)
     * or do not have transparency (shadow catcher). Rely on this implicitly, as it
     * simplifies the code and avoids extra memory allocation. */
    ccl_global const float *noisy_pixel = buffer + pass_noisy;
    denoised_pixel[3] = noisy_pixel[3];
  }
  else {
    /* Assign zero, since this is the default alpha value for 3-component passes, and it is
     * an opaque pixel for 4-component passes. */
    denoised_pixel[3] = 0;
  }
}

/* --------------------------------------------------------------------
 * Shadow catcher.
 */

    ccl_gpu_kernel_signature(integrator_shadow_catcher_count_possible_splits,
                             int num_states,
{
  const int state = ccl_gpu_global_id_x();

  bool can_split = false;

  if (state < num_states) {
  }

  /* NOTE: All threads specified in the mask must execute the intrinsic. */
  if (lane_id == 0) {
  }
}