/* estimate_single_state_size(): expand the state member listing under macros that sum sizeof() of every member. */
size_t state_size = 0;

#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) state_size += sizeof(type);
#define KERNEL_STRUCT_END(name) \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE 4
/* The integrator state template headers are included here so that the macros above expand over
 * every state member, after which the helpers are undefined again. */
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
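/* Illustrative sketch, not part of the Cycles sources: the KERNEL_STRUCT_* macros above form an
 * X-macro pattern. A single listing of state members is included under different macro
 * definitions; for size estimation the expansion simply sums sizeof() over every member. All
 * names below (DEMO_STATE_TEMPLATE, DEMO_STRUCT_MEMBER, demo_estimate_state_size) are
 * hypothetical. */
#include <cstddef>

/* The member listing, written once and expanded in different ways. */
#define DEMO_STATE_TEMPLATE \
  DEMO_STRUCT_MEMBER(int, sample) \
  DEMO_STRUCT_MEMBER(float, throughput) \
  DEMO_STRUCT_MEMBER(unsigned int, flags)

static std::size_t demo_estimate_state_size()
{
  std::size_t state_size = 0;
/* Expand every listed member into an accumulation of its size. */
#define DEMO_STRUCT_MEMBER(type, name) state_size += sizeof(type);
  DEMO_STATE_TEMPLATE
#undef DEMO_STRUCT_MEMBER
  return state_size; /* sizeof(int) + sizeof(float) + sizeof(unsigned int) */
}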
PathTraceWorkGPU::PathTraceWorkGPU(Device *device,
                                   Film *film,
                                   DeviceScene *device_scene,
                                   bool *cancel_requested_flag)
    : PathTraceWork(device, film, device_scene, cancel_requested_flag),
      queue_(device->gpu_queue_create()),
      integrator_state_soa_kernel_features_(0),
      integrator_queue_counter_(device, "integrator_queue_counter", MEM_READ_WRITE),
      integrator_shader_sort_counter_(device, "integrator_shader_sort_counter", MEM_READ_WRITE),
      integrator_shader_raytrace_sort_counter_(
          device, "integrator_shader_raytrace_sort_counter", MEM_READ_WRITE),
      integrator_shader_mnee_sort_counter_(
          device, "integrator_shader_mnee_sort_counter", MEM_READ_WRITE),
      integrator_shader_sort_prefix_sum_(
          device, "integrator_shader_sort_prefix_sum", MEM_READ_WRITE),
      integrator_next_main_path_index_(device, "integrator_next_main_path_index", MEM_READ_WRITE),
      integrator_next_shadow_path_index_(
          device, "integrator_next_shadow_path_index", MEM_READ_WRITE),
      /* ... remaining device vectors are initialized the same way ... */
      min_num_active_main_paths_(queue_->num_concurrent_busy_states()),
      max_active_main_path_index_(0)
/* alloc_integrator_soa(): remember the largest volume stack size requested so far. */
integrator_state_soa_volume_stack_size_ = max(integrator_state_soa_volume_stack_size_,
                                              requested_volume_stack_size);
/* Allocate one device array per state member (structure of arrays), skipping members whose
 * kernel features are not requested. */
#define KERNEL_STRUCT_BEGIN(name) for (int array_index = 0;; array_index++) {
#define KERNEL_STRUCT_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && (integrator_state_gpu_.parent_struct.name == nullptr)) { \
    device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                   "integrator_state_" #name); \
    array->alloc_to_device(max_num_paths_); \
    integrator_state_soa_.emplace_back(array); \
    integrator_state_gpu_.parent_struct.name = (type *)array->device_pointer; \
  }
#define KERNEL_STRUCT_ARRAY_MEMBER(parent_struct, type, name, feature) \
  if ((kernel_features & (feature)) && \
      (integrator_state_gpu_.parent_struct[array_index].name == nullptr)) { \
    device_only_memory<type> *array = new device_only_memory<type>(device_, \
                                                                   "integrator_state_" #name); \
    array->alloc_to_device(max_num_paths_); \
    integrator_state_soa_.emplace_back(array); \
    integrator_state_gpu_.parent_struct[array_index].name = (type *)array->device_pointer; \
  }
#define KERNEL_STRUCT_END(name) \
  break; \
  }
#define KERNEL_STRUCT_END_ARRAY(name, cpu_array_size, gpu_array_size) \
  if (array_index >= gpu_array_size - 1) { \
    break; \
  } \
  }
#define KERNEL_STRUCT_VOLUME_STACK_SIZE (integrator_state_soa_volume_stack_size_)
/* The state template headers are expanded again here, this time into the feature-gated device
 * allocations above, before the helper macros are undefined. */
#undef KERNEL_STRUCT_BEGIN
#undef KERNEL_STRUCT_MEMBER
#undef KERNEL_STRUCT_ARRAY_MEMBER
#undef KERNEL_STRUCT_END
#undef KERNEL_STRUCT_END_ARRAY
#undef KERNEL_STRUCT_VOLUME_STACK_SIZE
/* Sum the size of all allocated SoA arrays so it can be reported for debugging. */
size_t total_soa_size = 0;
for (const unique_ptr<device_memory> &soa_memory : integrator_state_soa_) {
  total_soa_size += soa_memory->memory_size();
}
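/* Illustrative sketch, not part of the Cycles sources: the expansion above allocates one flat
 * device array per state member (a structure-of-arrays layout) and skips members whose feature
 * flag was not requested, instead of allocating max_num_paths_ copies of one big struct. DemoSoA,
 * DEMO_FEATURE_*, alloc_member() and demo_alloc_state_soa() are hypothetical stand-ins for
 * device_only_memory and the kernel feature flags. */
#include <cstddef>
#include <cstdint>
#include <memory>
#include <vector>

enum : std::uint32_t { DEMO_FEATURE_BASE = 1u << 0, DEMO_FEATURE_VOLUME = 1u << 1 };

struct DemoSoA {
  std::vector<std::unique_ptr<std::uint8_t[]>> arrays; /* owns one buffer per allocated member */
  float *throughput = nullptr;                         /* per-member array pointers seen by kernels */
  int *volume_stack = nullptr;
};

template<typename T> static T *alloc_member(DemoSoA &soa, const std::size_t max_paths)
{
  soa.arrays.emplace_back(new std::uint8_t[sizeof(T) * max_paths]);
  return reinterpret_cast<T *>(soa.arrays.back().get());
}

static void demo_alloc_state_soa(DemoSoA &soa,
                                 const std::uint32_t kernel_features,
                                 const std::size_t max_paths)
{
  /* Feature-gated, lazily allocated member arrays, mirroring KERNEL_STRUCT_MEMBER above. */
  if ((kernel_features & DEMO_FEATURE_BASE) && soa.throughput == nullptr) {
    soa.throughput = alloc_member<float>(soa, max_paths);
  }
  if ((kernel_features & DEMO_FEATURE_VOLUME) && soa.volume_stack == nullptr) {
    soa.volume_stack = alloc_member<int>(soa, max_paths);
  }
}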
/* Number of integrator state elements in one sort partition. */
const int num_elements = queue_->num_sort_partition_elements();
/* Main render loop fragments: the device queue is synchronized and checked for errors. */
int num_iterations = 0;
/* ... */
if (!queue_->synchronize()) {
  /* ... */
}
/* ... */
if (!queue_->synchronize()) {
  /* ... */
}
/* get_most_queued_kernel(): find the integrator kernel with the largest number of queued paths. */
int max_num_queued = 0;
/* ... */
if (queue_counter->num_queued[i] > max_num_queued) {
  max_num_queued = queue_counter->num_queued[i];
  /* ... */
}
/* enqueue_path_iteration(): when the kernel can spawn shadow paths, limit how many paths are
 * shaded so that the shadow state pool cannot overflow. */
int num_paths_limit = INT_MAX;
/* ... */
if (available_shadow_paths < queue_counter->num_queued[kernel]) {
  /* ... */
}
/* ... */
num_paths_limit = available_shadow_paths / 2;
/* Tail of the fatal log raised when a kernel that cannot drive path iteration is requested. */
/* ... */ << " used for path iteration, should never happen.";
void PathTraceWorkGPU::compute_sorted_queued_paths(DeviceKernel kernel,
                                                   DeviceKernel queued_kernel,
                                                   const int num_paths_limit)
{
  int d_queued_kernel = queued_kernel;

  /* Per-key sort counter of the queued kernel and the prefix sum buffer used to build offsets. */
  device_ptr d_counter = (device_ptr)integrator_state_gpu_.sort_key_counter[d_queued_kernel];
  device_ptr d_prefix_sum = integrator_shader_sort_prefix_sum_.device_pointer;
  assert(d_counter != 0 && d_prefix_sum != 0);
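/* Illustrative sketch, not part of the Cycles sources: the sort counter and prefix-sum buffers
 * above implement what is effectively a counting sort over per-path shader sort keys. Counts per
 * key are turned into output offsets by a prefix sum, then each path index is scattered into its
 * key's range, so paths with the same shader are shaded together. On the GPU these steps run as
 * kernels with atomics; the names below are hypothetical. */
#include <vector>

static std::vector<int> demo_sort_paths_by_key(const std::vector<int> &path_sort_key,
                                               const int num_keys)
{
  /* 1. Count queued paths per sort key. */
  std::vector<int> counter(num_keys, 0);
  for (const int key : path_sort_key) {
    counter[key]++;
  }

  /* 2. Exclusive prefix sum turns per-key counts into per-key output offsets. */
  std::vector<int> offset(num_keys, 0);
  for (int key = 1; key < num_keys; key++) {
    offset[key] = offset[key - 1] + counter[key - 1];
  }

  /* 3. Scatter each path index into the next free slot of its key's range. */
  std::vector<int> sorted_path_index(path_sort_key.size());
  for (int path = 0; path < (int)path_sort_key.size(); path++) {
    sorted_path_index[offset[path_sort_key[path]]++] = path;
  }
  return sorted_path_index;
}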
/* compute_queued_paths() converts the queued kernel enum the same way before launching the
 * queued-paths array kernel. */
int d_queued_kernel = queued_kernel;
/* compact_main_paths(): compacting is not worth the overhead for very small path counts. */
const int min_compact_paths = 32;
/* compact_shadow_paths(): only compact when a large enough fraction of the used shadow state
 * range is free, and never for very small path counts. */
const float shadow_compact_ratio = 0.5f;
const int min_compact_paths = 32;
void PathTraceWorkGPU::compact_paths(const int num_active_paths,
                                     const int max_active_path_index,
                                     DeviceKernel terminated_paths_kernel,
                                     DeviceKernel compact_paths_kernel,
                                     DeviceKernel compact_kernel)
{
  /* Gather indices of terminated and active states, then move active states into the holes. */
  /* ... */
  if (num_compact_paths > 0) {
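/* Illustrative sketch, not part of the Cycles sources: one plausible shape for the heuristic the
 * compaction constants above serve. Compaction only pays off when the active paths occupy a small
 * enough fraction of the used index range and there are enough of them to justify the extra
 * kernel launches; the exact condition in Cycles may differ. should_compact_paths() is a
 * hypothetical helper. */
static bool should_compact_paths(const int num_active_paths,
                                 const int max_active_path_index,
                                 const float compact_ratio,
                                 const int min_compact_paths)
{
  return num_active_paths >= min_compact_paths &&
         num_active_paths < max_active_path_index * compact_ratio;
}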
/* enqueue_work_tiles(bool &finished): when the scene has a shadow catcher, predict how many extra
 * paths splitting may create so the scheduler does not overcommit the path state pool. */
int num_predicted_splits = 0;
/* ... */
const int num_new_paths = num_available_paths / 2;
/* ... */
num_predicted_splits += num_scheduled_possible_split + num_new_paths;
while (num_paths < max_num_camera_paths) {
  /* ... get the next work tile from the scheduler ... */
  work_tiles.push_back(work_tile);
  num_paths += work_tile.w * work_tile.h * work_tile.num_samples;
}
/* All work for this big tile is done when nothing new was scheduled and no paths are in flight. */
if (work_tiles.size() == 0 && num_paths == 0) {
  /* ... */
}

if (work_tiles.size() == 0) {
  /* ... */
}

/* ... */
enqueue_work_tiles(/* ... */, num_predicted_splits);
void PathTraceWorkGPU::enqueue_work_tiles(DeviceKernel kernel,
                                          const KernelWorkTile work_tiles[],
                                          const int num_work_tiles,
                                          const int num_active_paths,
                                          const int num_predicted_splits)
{
  for (int i = 0; i < num_work_tiles; i++) {
    work_tile = work_tiles[i];
    /* ... */
    const int tile_work_size = work_tile.w * work_tile.h * work_tile.num_samples;
    /* ... */
    path_index_offset += tile_work_size;
  }
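/* Illustrative sketch, not part of the Cycles sources: each scheduled tile initializes
 * w * h * num_samples new path states, and tiles are packed back to back by advancing a running
 * path_index_offset, mirroring the bookkeeping in the loop above. DemoWorkTile and
 * demo_tile_path_offsets() are hypothetical. */
#include <vector>

struct DemoWorkTile {
  int w, h, num_samples;
};

static std::vector<int> demo_tile_path_offsets(const std::vector<DemoWorkTile> &tiles,
                                               const int first_path_index)
{
  std::vector<int> offsets;
  int path_index_offset = first_path_index;
  for (const DemoWorkTile &tile : tiles) {
    offsets.push_back(path_index_offset); /* first path state used by this tile */
    path_index_offset += tile.w * tile.h * tile.num_samples;
  }
  return offsets;
}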
/* Fragment of a debug check that per-kernel queued path counts are valid. */
/* ... */ << "Invalid number of queued states for kernel " /* ... */;
/* Log which GPU display update path is used (graphics interop vs. naive copy through the host). */
VLOG_INFO << "Using graphics interop GPU display update.";
/* ... */
VLOG_INFO << "Using naive GPU display update.";
/* GPU display update requires the render buffers to be allocated on the device. */
if (!buffers_->buffer.device_pointer) {
  LOG(WARNING) << "Request for GPU display update without allocated render buffers.";
  return;
}
const int final_width = buffers_->params.window_width;
const int final_height = buffers_->params.window_height;
return queue_->synchronize();