Blender V3.3
queue.mm
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2021-2022 Blender Foundation */
3 
4 #ifdef WITH_METAL
5 
6 # include "device/metal/queue.h"
7 
8 # include "device/metal/device_impl.h"
9 # include "device/metal/kernel.h"
10 
11 # include "util/path.h"
12 # include "util/string.h"
13 # include "util/time.h"
14 
15 CCL_NAMESPACE_BEGIN
16 
17 /* MetalDeviceQueue */
18 
19 MetalDeviceQueue::MetalDeviceQueue(MetalDevice *device)
20  : DeviceQueue(device), metal_device_(device), stats_(device->stats)
21 {
22  if (@available(macos 11.0, *)) {
23  command_buffer_desc_ = [[MTLCommandBufferDescriptor alloc] init];
24  command_buffer_desc_.errorOptions = MTLCommandBufferErrorOptionEncoderExecutionStatus;
25  }
26 
27  mtlDevice_ = device->mtlDevice;
28  mtlCommandQueue_ = [mtlDevice_ newCommandQueue];
29 
30  if (@available(macos 10.14, *)) {
31  shared_event_ = [mtlDevice_ newSharedEvent];
32  shared_event_id_ = 1;
33 
34  /* Shareable event listener */
35  event_queue_ = dispatch_queue_create("com.cycles.metal.event_queue", NULL);
36  shared_event_listener_ = [[MTLSharedEventListener alloc] initWithDispatchQueue:event_queue_];
37  }
38 
39  wait_semaphore_ = dispatch_semaphore_create(0);
40 
41  if (@available(macos 10.14, *)) {
42  if (getenv("CYCLES_METAL_PROFILING")) {
43  /* Enable per-kernel timing breakdown (shown at end of render). */
44  timing_shared_event_ = [mtlDevice_ newSharedEvent];
45  label_command_encoders_ = true;
46  }
47  if (getenv("CYCLES_METAL_DEBUG")) {
48  /* Enable very verbose tracing (shows every dispatch). */
49  verbose_tracing_ = true;
50  label_command_encoders_ = true;
51  }
52  timing_shared_event_id_ = 1;
53  }
54 
55  setup_capture();
56 }
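/* Illustrative usage (added commentary, not part of the original source): the
 * two environment variables checked above are read once at queue construction,
 * so they must be set before Cycles starts rendering, e.g. from a shell
 * (command lines are examples only):
 *
 *   CYCLES_METAL_PROFILING=1 ./blender -b scene.blend -f 1   # per-kernel timing table
 *   CYCLES_METAL_DEBUG=1 ./blender -b scene.blend -f 1       # per-dispatch tracing
 */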
57 
58 void MetalDeviceQueue::setup_capture()
59 {
60  capture_kernel_ = DeviceKernel(-1);
61 
62  if (auto capture_kernel_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
63  /* CYCLES_DEBUG_METAL_CAPTURE_KERNEL captures a single dispatch of the specified kernel. */
64  capture_kernel_ = DeviceKernel(atoi(capture_kernel_str));
65  printf("Capture kernel: %d = %s\n", capture_kernel_, device_kernel_as_string(capture_kernel_));
66 
67  capture_dispatch_counter_ = 0;
68  if (auto capture_dispatch_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_DISPATCH")) {
69  capture_dispatch_counter_ = atoi(capture_dispatch_str);
70 
71  printf("Capture dispatch number %d\n", capture_dispatch_counter_);
72  }
73  }
74  else if (auto capture_samples_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_SAMPLES")) {
75  /* CYCLES_DEBUG_METAL_CAPTURE_SAMPLES captures a block of dispatches from reset#(N) to
76  * reset#(N+1). */
77  capture_samples_ = true;
78  capture_reset_counter_ = atoi(capture_samples_str);
79 
80  capture_dispatch_counter_ = INT_MAX;
81  if (auto capture_limit_str = getenv("CYCLES_DEBUG_METAL_CAPTURE_LIMIT")) {
82  /* CYCLES_DEBUG_METAL_CAPTURE_LIMIT sets the maximum number of dispatches to capture. */
83  capture_dispatch_counter_ = atoi(capture_limit_str);
84  }
85 
86  printf("Capturing sample block %d (dispatch limit: %d)\n",
87  capture_reset_counter_,
88  capture_dispatch_counter_);
89  }
90  else {
91  /* No capturing requested. */
92  return;
93  }
94 
95  /* Enable .gputrace capture for the specified DeviceKernel. */
96  MTLCaptureManager *captureManager = [MTLCaptureManager sharedCaptureManager];
97  mtlCaptureScope_ = [captureManager newCaptureScopeWithDevice:mtlDevice_];
98  mtlCaptureScope_.label = [NSString stringWithFormat:@"Cycles kernel dispatch"];
99  [captureManager setDefaultCaptureScope:mtlCaptureScope_];
100 
101  label_command_encoders_ = true;
102 
103  if (auto capture_url = getenv("CYCLES_DEBUG_METAL_CAPTURE_URL")) {
104  if (@available(macos 10.15, *)) {
105  if ([captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]) {
106 
107  MTLCaptureDescriptor *captureDescriptor = [[MTLCaptureDescriptor alloc] init];
108  captureDescriptor.captureObject = mtlCaptureScope_;
109  captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument;
110  captureDescriptor.outputURL = [NSURL fileURLWithPath:@(capture_url)];
111 
112  NSError *error;
113  if (![captureManager startCaptureWithDescriptor:captureDescriptor error:&error]) {
114  NSString *err = [error localizedDescription];
115  printf("Start capture failed: %s\n", [err UTF8String]);
116  }
117  else {
118  printf("Capture started (URL: %s)\n", capture_url);
119  is_capturing_to_disk_ = true;
120  }
121  }
122  else {
123  printf("Capture to file is not supported\n");
124  }
125  }
126  }
127 }
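/* Illustrative usage (added commentary, not part of the original source): a
 * single-kernel .gputrace capture can be requested with the variables handled
 * above; the kernel index, dispatch number and output path below are example
 * values only:
 *
 *   CYCLES_DEBUG_METAL_CAPTURE_KERNEL=0 \
 *   CYCLES_DEBUG_METAL_CAPTURE_DISPATCH=2 \
 *   CYCLES_DEBUG_METAL_CAPTURE_URL=/tmp/cycles.gputrace ./blender -b scene.blend -f 1
 *
 * Alternatively, CYCLES_DEBUG_METAL_CAPTURE_SAMPLES=N (optionally bounded by
 * CYCLES_DEBUG_METAL_CAPTURE_LIMIT) captures the block of dispatches between
 * reset N and reset N+1, as handled in update_capture() below. */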
128 
129 void MetalDeviceQueue::update_capture(DeviceKernel kernel)
130 {
131  /* Handle capture end triggers. */
132  if (is_capturing_) {
133  capture_dispatch_counter_ -= 1;
134  if (capture_dispatch_counter_ <= 0 || kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
135  /* End capture if we've hit the dispatch limit or we hit a "reset". */
136  end_capture();
137  }
138  return;
139  }
140 
141  if (capture_dispatch_counter_ < 0) {
142  /* We finished capturing. */
143  return;
144  }
145 
146  /* Handle single-capture start trigger. */
147  if (kernel == capture_kernel_) {
148  /* Start capturing when we hit the Nth dispatch of the specified kernel. */
149  if (capture_dispatch_counter_ == 0) {
150  begin_capture();
151  }
152  capture_dispatch_counter_ -= 1;
153  return;
154  }
155 
156  /* Handle multi-capture start trigger. */
157  if (capture_samples_) {
158  /* Start capturing when the reset countdown is at 0. */
159  if (capture_reset_counter_ == 0) {
160  begin_capture();
161  }
162 
163  if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
164  capture_reset_counter_ -= 1;
165  }
166  return;
167  }
168 }
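/* Note (added commentary): update_capture() runs once per enqueue(). While a
 * capture is active it counts dispatches down and ends the capture on the
 * limit or on DEVICE_KERNEL_INTEGRATOR_RESET; otherwise it starts a capture
 * either on the Nth dispatch of the requested kernel or, in "samples" mode,
 * once the reset countdown reaches zero. */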
169 
170 void MetalDeviceQueue::begin_capture()
171 {
172  /* Start gputrace capture. */
173  if (mtlCommandBuffer_) {
174  synchronize();
175  }
176  [mtlCaptureScope_ beginScope];
177  printf("[mtlCaptureScope_ beginScope]\n");
178  is_capturing_ = true;
179 }
180 
181 void MetalDeviceQueue::end_capture()
182 {
183  [mtlCaptureScope_ endScope];
184  is_capturing_ = false;
185  printf("[mtlCaptureScope_ endScope]\n");
186 
187  if (is_capturing_to_disk_) {
188  if (@available(macos 10.15, *)) {
189  [[MTLCaptureManager sharedCaptureManager] stopCapture];
190  has_captured_to_disk_ = true;
191  is_capturing_to_disk_ = false;
192  is_capturing_ = false;
193  printf("Capture stopped\n");
194  }
195  }
196 }
197 
198 MetalDeviceQueue::~MetalDeviceQueue()
199 {
200  /* Tidying up here isn't really practical - we should expect and require the work
201  * queue to be empty here. */
202  assert(mtlCommandBuffer_ == nil);
203  assert(command_buffers_submitted_ == command_buffers_completed_);
204 
205  if (@available(macos 10.14, *)) {
206  [shared_event_listener_ release];
207  [shared_event_ release];
208  }
209 
210  if (@available(macos 11.0, *)) {
211  [command_buffer_desc_ release];
212  }
213  if (mtlCommandQueue_) {
214  [mtlCommandQueue_ release];
215  mtlCommandQueue_ = nil;
216  }
217 
218  if (mtlCaptureScope_) {
219  [mtlCaptureScope_ release];
220  }
221 
222  double total_time = 0.0;
223 
224  /* Show per-kernel timings, if gathered (see CYCLES_METAL_PROFILING). */
225  int64_t num_dispatches = 0;
226  for (auto &stat : timing_stats_) {
227  total_time += stat.total_time;
228  num_dispatches += stat.num_dispatches;
229  }
230 
231  if (num_dispatches) {
232  printf("\nMetal dispatch stats:\n\n");
233  auto header = string_printf("%-40s %16s %12s %12s %7s %7s",
234  "Kernel name",
235  "Total threads",
236  "Dispatches",
237  "Avg. T/D",
238  "Time",
239  "Time%");
240  auto divider = string(header.length(), '-');
241  printf("%s\n%s\n%s\n", divider.c_str(), header.c_str(), divider.c_str());
242 
243  for (size_t i = 0; i < DEVICE_KERNEL_NUM; i++) {
244  auto &stat = timing_stats_[i];
245  if (stat.num_dispatches > 0) {
246  printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
247  device_kernel_as_string(DeviceKernel(i)),
248  string_human_readable_number(stat.total_work_size).c_str(),
249  string_human_readable_number(stat.num_dispatches).c_str(),
250  string_human_readable_number(stat.total_work_size / stat.num_dispatches).c_str(),
251  stat.total_time,
252  stat.total_time * 100.0 / total_time);
253  }
254  }
255  printf("%s\n", divider.c_str());
256  printf("%-40s %16s %12s %12s %6.2fs %6.2f%%\n",
257  "",
258  "",
259  string_human_readable_number(num_dispatches).c_str(),
260  "",
261  total_time,
262  100.0);
263  printf("%s\n\n", divider.c_str());
264  }
265 }
266 
267 int MetalDeviceQueue::num_concurrent_states(const size_t /*state_size*/) const
268 {
269  /* METAL_WIP */
270  /* TODO: compute automatically. */
271  /* TODO: must have at least num_threads_per_block. */
272  int result = 1048576;
273  if (metal_device_->device_vendor == METAL_GPU_AMD) {
274  result *= 2;
275  }
276  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
277  result *= 4;
278  }
279  return result;
280 }
281 
282 int MetalDeviceQueue::num_concurrent_busy_states() const
283 {
284  /* METAL_WIP */
285  /* TODO: compute automatically. */
286  int result = 65536;
287  if (metal_device_->device_vendor == METAL_GPU_AMD) {
288  result *= 2;
289  }
290  else if (metal_device_->device_vendor == METAL_GPU_APPLE) {
291  result *= 4;
292  }
293  return result;
294 }
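/* Worked example (added commentary): from the base values above, an Apple GPU
 * is given 1048576 * 4 = 4194304 concurrent integrator states and
 * 65536 * 4 = 262144 concurrent busy states; AMD GPUs get 2x the base values,
 * and other vendors keep them unchanged. */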
295 
296 int MetalDeviceQueue::num_sort_partition_elements() const
297 {
298  return MetalInfo::optimal_sort_partition_elements(metal_device_->mtlDevice);
299 }
300 
301 void MetalDeviceQueue::init_execution()
302 {
303  /* Synchronize all textures and memory copies before executing task. */
304  metal_device_->load_texture_info();
305 
306  synchronize();
307 }
308 
309 bool MetalDeviceQueue::enqueue(DeviceKernel kernel,
310  const int work_size,
311  DeviceKernelArguments const &args)
312 {
313  update_capture(kernel);
314 
315  if (metal_device_->have_error()) {
316  return false;
317  }
318 
319  VLOG_DEVICE_STATS << "Metal queue launch " << device_kernel_as_string(kernel) << ", work_size "
320  << work_size;
321 
322  id<MTLComputeCommandEncoder> mtlComputeCommandEncoder = get_compute_encoder(kernel);
323 
324  if (@available(macos 10.14, *)) {
325  if (timing_shared_event_) {
326  command_encoder_labels_.push_back({kernel, work_size, timing_shared_event_id_});
327  }
328  }
329 
330  /* Determine size requirement for argument buffer. */
331  size_t arg_buffer_length = 0;
332  for (size_t i = 0; i < args.count; i++) {
333  size_t size_in_bytes = args.sizes[i];
334  arg_buffer_length = round_up(arg_buffer_length, size_in_bytes) + size_in_bytes;
335  }
336  /* 256 is the Metal offset alignment for constant address space bindings */
337  arg_buffer_length = round_up(arg_buffer_length, 256);
338 
339  /* Globals placed after "vanilla" arguments. */
340  size_t globals_offsets = arg_buffer_length;
341  arg_buffer_length += sizeof(KernelParamsMetal);
342  arg_buffer_length = round_up(arg_buffer_length, 256);
343 
344  /* Metal ancillary bindless pointers. */
345  size_t metal_offsets = arg_buffer_length;
346  arg_buffer_length += metal_device_->mtlAncillaryArgEncoder.encodedLength;
347  arg_buffer_length = round_up(arg_buffer_length, metal_device_->mtlAncillaryArgEncoder.alignment);
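/* Layout sketch (added commentary, not part of the original source): the
 * argument buffer sized above is laid out as
 *
 *   [ per-enqueue args | pad to 256 | KernelParamsMetal | pad to 256 | ancillary argument data ]
 *
 * where round_up(x, m) rounds x up to the next multiple of m. For example, two
 * scalar arguments of 4 and 8 bytes land at offsets 0 and 8 (16 bytes total),
 * which is then padded to 256 bytes to produce globals_offsets. */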
348 
349  /* Temporary buffer used to prepare arg_buffer */
350  uint8_t *init_arg_buffer = (uint8_t *)alloca(arg_buffer_length);
351  memset(init_arg_buffer, 0, arg_buffer_length);
352 
353  /* Prepare the non-pointer "enqueue" arguments */
354  size_t bytes_written = 0;
355  for (size_t i = 0; i < args.count; i++) {
356  size_t size_in_bytes = args.sizes[i];
357  bytes_written = round_up(bytes_written, size_in_bytes);
358  if (args.types[i] != DeviceKernelArguments::POINTER) {
359  memcpy(init_arg_buffer + bytes_written, args.values[i], size_in_bytes);
360  }
361  bytes_written += size_in_bytes;
362  }
363 
364  /* Prepare any non-pointer (i.e. plain-old-data) KernelParamsMetal data */
365  /* The plain-old-data is contiguous, continuing to the end of KernelParamsMetal */
366  size_t plain_old_launch_data_offset = offsetof(KernelParamsMetal, integrator_state) +
367  offsetof(IntegratorStateGPU, sort_partition_divisor);
368  size_t plain_old_launch_data_size = sizeof(KernelParamsMetal) - plain_old_launch_data_offset;
369  memcpy(init_arg_buffer + globals_offsets + plain_old_launch_data_offset,
370  (uint8_t *)&metal_device_->launch_params + plain_old_launch_data_offset,
371  plain_old_launch_data_size);
372 
373  /* Allocate an argument buffer. */
374  MTLResourceOptions arg_buffer_options = MTLResourceStorageModeManaged;
375  if (@available(macOS 11.0, *)) {
376  if ([mtlDevice_ hasUnifiedMemory]) {
377  arg_buffer_options = MTLResourceStorageModeShared;
378  }
379  }
380 
381  id<MTLBuffer> arg_buffer = temp_buffer_pool_.get_buffer(mtlDevice_,
382  mtlCommandBuffer_,
383  arg_buffer_length,
384  arg_buffer_options,
385  init_arg_buffer,
386  stats_);
387 
388  /* Encode the pointer "enqueue" arguments */
389  bytes_written = 0;
390  for (size_t i = 0; i < args.count; i++) {
391  size_t size_in_bytes = args.sizes[i];
392  bytes_written = round_up(bytes_written, size_in_bytes);
393  if (args.types[i] == DeviceKernelArguments::POINTER) {
394  [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
395  offset:bytes_written];
396  if (MetalDevice::MetalMem *mmem = *(MetalDevice::MetalMem **)args.values[i]) {
397  [mtlComputeCommandEncoder useResource:mmem->mtlBuffer
398  usage:MTLResourceUsageRead | MTLResourceUsageWrite];
399  [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer offset:0 atIndex:0];
400  }
401  else {
402  if (@available(macos 12.0, *)) {
403  [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:0];
404  }
405  }
406  }
407  bytes_written += size_in_bytes;
408  }
409 
410  /* Encode KernelParamsMetal buffers */
411  [metal_device_->mtlBufferKernelParamsEncoder setArgumentBuffer:arg_buffer
412  offset:globals_offsets];
413 
414  if (label_command_encoders_) {
415  /* Add human-readable labels if we're doing any form of debugging / profiling. */
416  mtlComputeCommandEncoder.label = [[NSString alloc]
417  initWithFormat:@"Metal queue launch %s, work_size %d",
418  device_kernel_as_string(kernel),
419  work_size];
420  }
421 
422  /* this relies on IntegratorStateGPU layout being contiguous device_ptrs */
423  const size_t pointer_block_end = offsetof(KernelParamsMetal, integrator_state) +
424  offsetof(IntegratorStateGPU, sort_partition_divisor);
425  for (size_t offset = 0; offset < pointer_block_end; offset += sizeof(device_ptr)) {
426  int pointer_index = int(offset / sizeof(device_ptr));
427  MetalDevice::MetalMem *mmem = *(
428  MetalDevice::MetalMem **)((uint8_t *)&metal_device_->launch_params + offset);
429  if (mmem && mmem->mem && (mmem->mtlBuffer || mmem->mtlTexture)) {
430  [metal_device_->mtlBufferKernelParamsEncoder setBuffer:mmem->mtlBuffer
431  offset:0
432  atIndex:pointer_index];
433  }
434  else {
435  if (@available(macos 12.0, *)) {
436  [metal_device_->mtlBufferKernelParamsEncoder setBuffer:nil offset:0 atIndex:pointer_index];
437  }
438  }
439  }
440  bytes_written = globals_offsets + sizeof(KernelParamsMetal);
441 
442  const MetalKernelPipeline *metal_kernel_pso = MetalDeviceKernels::get_best_pipeline(
443  metal_device_, kernel);
444  if (!metal_kernel_pso) {
445  metal_device_->set_error(
446  string_printf("No MetalKernelPipeline for %s\n", device_kernel_as_string(kernel)));
447  return false;
448  }
449 
450  /* Encode ancillaries */
451  [metal_device_->mtlAncillaryArgEncoder setArgumentBuffer:arg_buffer offset:metal_offsets];
452  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_2d
453  offset:0
454  atIndex:0];
455  [metal_device_->mtlAncillaryArgEncoder setBuffer:metal_device_->texture_bindings_3d
456  offset:0
457  atIndex:1];
458  if (@available(macos 12.0, *)) {
459  if (metal_device_->use_metalrt) {
460  if (metal_device_->bvhMetalRT) {
461  id<MTLAccelerationStructure> accel_struct = metal_device_->bvhMetalRT->accel_struct;
462  [metal_device_->mtlAncillaryArgEncoder setAccelerationStructure:accel_struct atIndex:2];
463  }
464 
465  for (int table = 0; table < METALRT_TABLE_NUM; table++) {
466  if (metal_kernel_pso->intersection_func_table[table]) {
467  [metal_kernel_pso->intersection_func_table[table] setBuffer:arg_buffer
468  offset:globals_offsets
469  atIndex:1];
470  [metal_device_->mtlAncillaryArgEncoder
471  setIntersectionFunctionTable:metal_kernel_pso->intersection_func_table[table]
472  atIndex:3 + table];
473  [mtlComputeCommandEncoder useResource:metal_kernel_pso->intersection_func_table[table]
474  usage:MTLResourceUsageRead];
475  }
476  else {
477  [metal_device_->mtlAncillaryArgEncoder setIntersectionFunctionTable:nil
478  atIndex:3 + table];
479  }
480  }
481  }
482  bytes_written = metal_offsets + metal_device_->mtlAncillaryArgEncoder.encodedLength;
483  }
484 
485  if (arg_buffer.storageMode == MTLStorageModeManaged) {
486  [arg_buffer didModifyRange:NSMakeRange(0, bytes_written)];
487  }
488 
489  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:0 atIndex:0];
490  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:globals_offsets atIndex:1];
491  [mtlComputeCommandEncoder setBuffer:arg_buffer offset:metal_offsets atIndex:2];
492 
493  if (metal_device_->use_metalrt) {
494  if (@available(macos 12.0, *)) {
495 
496  auto bvhMetalRT = metal_device_->bvhMetalRT;
497  switch (kernel) {
498  case DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST:
499  case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW:
500  case DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE:
501  case DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK:
502  case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE:
503  case DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE:
504  break;
505  default:
506  bvhMetalRT = nil;
507  break;
508  }
509 
510  if (bvhMetalRT) {
511  /* Mark all acceleration structure resources as used. */
512  [mtlComputeCommandEncoder useResource:bvhMetalRT->accel_struct usage:MTLResourceUsageRead];
513  [mtlComputeCommandEncoder useResources:bvhMetalRT->blas_array.data()
514  count:bvhMetalRT->blas_array.size()
515  usage:MTLResourceUsageRead];
516  }
517  }
518  }
519 
520  [mtlComputeCommandEncoder setComputePipelineState:metal_kernel_pso->pipeline];
521 
522  /* Compute kernel launch parameters. */
523  const int num_threads_per_block = metal_kernel_pso->num_threads_per_block;
524 
525  int shared_mem_bytes = 0;
526 
527  switch (kernel) {
528  case DEVICE_KERNEL_INTEGRATOR_ACTIVE_PATHS_ARRAY:
529  case DEVICE_KERNEL_INTEGRATOR_QUEUED_PATHS_ARRAY:
530  case DEVICE_KERNEL_INTEGRATOR_QUEUED_SHADOW_PATHS_ARRAY:
531  case DEVICE_KERNEL_INTEGRATOR_TERMINATED_PATHS_ARRAY:
532  case DEVICE_KERNEL_INTEGRATOR_TERMINATED_SHADOW_PATHS_ARRAY:
533  case DEVICE_KERNEL_INTEGRATOR_SORTED_PATHS_ARRAY:
534  case DEVICE_KERNEL_INTEGRATOR_COMPACT_PATHS_ARRAY:
535  case DEVICE_KERNEL_INTEGRATOR_COMPACT_SHADOW_PATHS_ARRAY:
536  /* See parallel_active_index.h for why this amount of shared memory is needed.
537  * Rounded up to 16 bytes for Metal */
538  shared_mem_bytes = (int)round_up((num_threads_per_block + 1) * sizeof(int), 16);
539  [mtlComputeCommandEncoder setThreadgroupMemoryLength:shared_mem_bytes atIndex:0];
540  break;
541 
542  default:
543  break;
544  }
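/* Worked example (added commentary): with num_threads_per_block = 1024 the
 * active-index kernels above request (1024 + 1) * sizeof(int) = 4100 bytes,
 * rounded up to 4112 bytes of threadgroup memory. */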
545 
546  MTLSize size_threadgroups_per_dispatch = MTLSizeMake(
547  divide_up(work_size, num_threads_per_block), 1, 1);
548  MTLSize size_threads_per_threadgroup = MTLSizeMake(num_threads_per_block, 1, 1);
549  [mtlComputeCommandEncoder dispatchThreadgroups:size_threadgroups_per_dispatch
550  threadsPerThreadgroup:size_threads_per_threadgroup];
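/* Worked example (added commentary): for work_size = 1000000 and
 * num_threads_per_block = 1024, divide_up() yields 977 threadgroups of 1024
 * threads each; the excess threads in the final threadgroup are expected to
 * bounds-check against work_size inside the kernel. */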
551 
552  [mtlCommandBuffer_ addCompletedHandler:^(id<MTLCommandBuffer> command_buffer) {
553  NSString *kernel_name = metal_kernel_pso->function.label;
554 
555  /* Enhanced command buffer errors are only available in 11.0+ */
556  if (@available(macos 11.0, *)) {
557  if (command_buffer.status == MTLCommandBufferStatusError && command_buffer.error != nil) {
558  metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]);
559  NSArray<id<MTLCommandBufferEncoderInfo>> *encoderInfos = [command_buffer.error.userInfo
560  valueForKey:MTLCommandBufferEncoderInfoErrorKey];
561  if (encoderInfos != nil) {
562  for (id<MTLCommandBufferEncoderInfo> encoderInfo : encoderInfos) {
563  NSLog(@"%@", encoderInfo);
564  }
565  }
566  id<MTLLogContainer> logs = command_buffer.logs;
567  for (id<MTLFunctionLog> log in logs) {
568  NSLog(@"%@", log);
569  }
570  }
571  else if (command_buffer.error) {
572  metal_device_->set_error(string("CommandBuffer Failed: ") + [kernel_name UTF8String]);
573  }
574  }
575  }];
576 
577  if (verbose_tracing_ || is_capturing_) {
578  /* Force a sync if we've enabled step-by-step verbose tracing or if we're capturing. */
579  synchronize();
580 
581  /* Show queue counters and dispatch timing. */
582  if (verbose_tracing_) {
583  if (kernel == DEVICE_KERNEL_INTEGRATOR_RESET) {
584  printf(
585  "_____________________________________.____________________.______________.___________"
586  "______________________________________\n");
587  }
588 
589  printf("%-40s| %7d threads |%5.2fms | buckets [",
590  device_kernel_as_string(kernel),
591  work_size,
592  last_completion_time_ * 1000.0);
593  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
594  for (auto &it : metal_device_->metal_mem_map) {
595  const string c_integrator_queue_counter = "integrator_queue_counter";
596  if (it.first->name == c_integrator_queue_counter) {
597  /* Workaround "device_copy_from" being protected. */
598  struct MyDeviceMemory : device_memory {
599  void device_copy_from__IntegratorQueueCounter()
600  {
601  device_copy_from(0, data_width, 1, sizeof(IntegratorQueueCounter));
602  }
603  };
604  ((MyDeviceMemory *)it.first)->device_copy_from__IntegratorQueueCounter();
605 
606  if (IntegratorQueueCounter *queue_counter = (IntegratorQueueCounter *)
607  it.first->host_pointer) {
608  for (int i = 0; i < DEVICE_KERNEL_INTEGRATOR_NUM; i++)
609  printf("%s%d", i == 0 ? "" : ",", int(queue_counter->num_queued[i]));
610  }
611  break;
612  }
613  }
614  printf("]\n");
615  }
616  }
617 
618  return !(metal_device_->have_error());
619 }
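/* Illustrative caller sketch (added commentary, not part of this file):
 * integrator code builds a DeviceKernelArguments from pointers to its values
 * and enqueues a kernel roughly like this (variable names are examples only):
 *
 *   device_ptr d_path_index = ...;
 *   device_ptr d_render_buffer = ...;
 *   int work_size = ...;
 *   DeviceKernelArguments args(&d_path_index, &d_render_buffer, &work_size);
 *   queue->enqueue(DEVICE_KERNEL_INTEGRATOR_INIT_FROM_CAMERA, work_size, args);
 *
 * Pointer-typed arguments are encoded indirectly by enqueue() above, while
 * plain values are copied straight into the argument buffer. */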
620 
621 bool MetalDeviceQueue::synchronize()
622 {
623  if (has_captured_to_disk_ || metal_device_->have_error()) {
624  return false;
625  }
626 
627  if (mtlComputeEncoder_) {
628  close_compute_encoder();
629  }
630  close_blit_encoder();
631 
632  if (mtlCommandBuffer_) {
633  scoped_timer timer;
634 
635  if (@available(macos 10.14, *)) {
636  if (timing_shared_event_) {
637  /* For per-kernel timing, add event handlers to measure & accumulate dispatch times. */
638  __block double completion_time = 0;
639  for (uint64_t i = command_buffer_start_timing_id_; i < timing_shared_event_id_; i++) {
640  [timing_shared_event_ notifyListener:shared_event_listener_
641  atValue:i
642  block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
643  completion_time = timer.get_time() - completion_time;
644  last_completion_time_ = completion_time;
645  for (auto label : command_encoder_labels_) {
646  if (label.timing_id == value) {
647  TimingStats &stat = timing_stats_[label.kernel];
648  stat.num_dispatches++;
649  stat.total_time += completion_time;
650  stat.total_work_size += label.work_size;
651  }
652  }
653  }];
654  }
655  }
656  }
657 
658  uint64_t shared_event_id_ = this->shared_event_id_++;
659 
660  if (@available(macos 10.14, *)) {
661  __block dispatch_semaphore_t block_sema = wait_semaphore_;
662  [shared_event_ notifyListener:shared_event_listener_
663  atValue:shared_event_id_
664  block:^(id<MTLSharedEvent> sharedEvent, uint64_t value) {
665  dispatch_semaphore_signal(block_sema);
666  }];
667 
668  [mtlCommandBuffer_ encodeSignalEvent:shared_event_ value:shared_event_id_];
669  [mtlCommandBuffer_ commit];
670  dispatch_semaphore_wait(wait_semaphore_, DISPATCH_TIME_FOREVER);
671  }
672 
673  [mtlCommandBuffer_ release];
674 
675  for (const CopyBack &mmem : copy_back_mem_) {
676  memcpy((uchar *)mmem.host_pointer, (uchar *)mmem.gpu_mem, mmem.size);
677  }
678  copy_back_mem_.clear();
679 
680  temp_buffer_pool_.process_command_buffer_completion(mtlCommandBuffer_);
681  metal_device_->flush_delayed_free_list();
682 
683  mtlCommandBuffer_ = nil;
684  command_encoder_labels_.clear();
685  }
686 
687  return !(metal_device_->have_error());
688 }
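/* Note (added commentary): synchronize() drains the queue in two steps. Any
 * per-kernel timing listeners registered above attribute elapsed time to the
 * kernels recorded in command_encoder_labels_, and a final MTLSharedEvent
 * signal plus semaphore wait blocks the CPU until the command buffer has
 * completed, after which host-side copy-backs and delayed frees are processed. */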
689 
690 void MetalDeviceQueue::zero_to_device(device_memory &mem)
691 {
692  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
693 
694  if (mem.memory_size() == 0) {
695  return;
696  }
697 
698  /* Allocate on demand. */
699  if (mem.device_pointer == 0) {
700  metal_device_->mem_alloc(mem);
701  }
702 
703  /* Zero memory on device. */
704  assert(mem.device_pointer != 0);
705 
706  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
707  MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
708  if (mmem.mtlBuffer) {
709  id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
710  [blitEncoder fillBuffer:mmem.mtlBuffer range:NSMakeRange(mmem.offset, mmem.size) value:0];
711  }
712  else {
713  metal_device_->mem_zero(mem);
714  }
715 }
716 
717 void MetalDeviceQueue::copy_to_device(device_memory &mem)
718 {
719  if (mem.memory_size() == 0) {
720  return;
721  }
722 
723  /* Allocate on demand. */
724  if (mem.device_pointer == 0) {
725  metal_device_->mem_alloc(mem);
726  }
727 
728  assert(mem.device_pointer != 0);
729  assert(mem.host_pointer != nullptr);
730 
731  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
732  auto result = metal_device_->metal_mem_map.find(&mem);
733  if (result != metal_device_->metal_mem_map.end()) {
734  if (mem.host_pointer == mem.shared_pointer) {
735  return;
736  }
737 
738  MetalDevice::MetalMem &mmem = *result->second;
739  id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
740 
741  id<MTLBuffer> buffer = temp_buffer_pool_.get_buffer(mtlDevice_,
742  mtlCommandBuffer_,
743  mmem.size,
744  MTLResourceStorageModeShared,
745  mem.host_pointer,
746  stats_);
747 
748  [blitEncoder copyFromBuffer:buffer
749  sourceOffset:0
750  toBuffer:mmem.mtlBuffer
751  destinationOffset:mmem.offset
752  size:mmem.size];
753  }
754  else {
755  metal_device_->mem_copy_to(mem);
756  }
757 }
758 
759 void MetalDeviceQueue::copy_from_device(device_memory &mem)
760 {
761  assert(mem.type != MEM_GLOBAL && mem.type != MEM_TEXTURE);
762 
763  if (mem.memory_size() == 0) {
764  return;
765  }
766 
767  assert(mem.device_pointer != 0);
768  assert(mem.host_pointer != nullptr);
769 
770  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
771  MetalDevice::MetalMem &mmem = *metal_device_->metal_mem_map.at(&mem);
772  if (mmem.mtlBuffer) {
773  const size_t size = mem.memory_size();
774 
775  if (mem.device_pointer) {
776  if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
777  id<MTLBlitCommandEncoder> blitEncoder = get_blit_encoder();
778  [blitEncoder synchronizeResource:mmem.mtlBuffer];
779  }
780  if (mem.host_pointer != mmem.hostPtr) {
781  if (mtlCommandBuffer_) {
782  copy_back_mem_.push_back({mem.host_pointer, mmem.hostPtr, size});
783  }
784  else {
785  memcpy((uchar *)mem.host_pointer, (uchar *)mmem.hostPtr, size);
786  }
787  }
788  }
789  else {
790  memset((char *)mem.host_pointer, 0, size);
791  }
792  }
793  else {
794  metal_device_->mem_copy_from(mem);
795  }
796 }
797 
798 void MetalDeviceQueue::prepare_resources(DeviceKernel kernel)
799 {
800  std::lock_guard<std::recursive_mutex> lock(metal_device_->metal_mem_map_mutex);
801 
802  /* declare resource usage */
803  for (auto &it : metal_device_->metal_mem_map) {
804  device_memory *mem = it.first;
805 
806  MTLResourceUsage usage = MTLResourceUsageRead;
807  if (mem->type != MEM_GLOBAL && mem->type != MEM_READ_ONLY && mem->type != MEM_TEXTURE) {
808  usage |= MTLResourceUsageWrite;
809  }
810 
811  if (it.second->mtlBuffer) {
812  /* METAL_WIP - use array version (i.e. useResources) */
813  [mtlComputeEncoder_ useResource:it.second->mtlBuffer usage:usage];
814  }
815  else if (it.second->mtlTexture) {
816  /* METAL_WIP - use array version (i.e. useResources) */
817  [mtlComputeEncoder_ useResource:it.second->mtlTexture usage:usage | MTLResourceUsageSample];
818  }
819  }
820 
821  /* ancillaries */
822  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_2d usage:MTLResourceUsageRead];
823  [mtlComputeEncoder_ useResource:metal_device_->texture_bindings_3d usage:MTLResourceUsageRead];
824 }
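/* Note (added commentary): Metal requires resources referenced indirectly via
 * argument buffers to be declared with useResource: before dispatch.
 * prepare_resources() walks every allocation in metal_mem_map, declaring read
 * usage for all of them, write usage for non-read-only memory, and sample
 * usage for textures; conservative, but it keeps argument-buffer accesses valid. */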
825 
826 id<MTLComputeCommandEncoder> MetalDeviceQueue::get_compute_encoder(DeviceKernel kernel)
827 {
828  bool concurrent = (kernel < DEVICE_KERNEL_INTEGRATOR_NUM);
829 
830  if (@available(macos 10.14, *)) {
831  if (timing_shared_event_) {
832  /* Close the current encoder to ensure we're able to capture per-encoder timing data. */
833  if (mtlComputeEncoder_) {
834  close_compute_encoder();
835  }
836  }
837 
838  if (mtlComputeEncoder_) {
839  if (mtlComputeEncoder_.dispatchType == (concurrent ? MTLDispatchTypeConcurrent :
840  MTLDispatchTypeSerial)) {
841  /* declare usage of MTLBuffers etc */
842  prepare_resources(kernel);
843 
844  return mtlComputeEncoder_;
845  }
846  close_compute_encoder();
847  }
848 
849  close_blit_encoder();
850 
851  if (!mtlCommandBuffer_) {
852  mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
853  [mtlCommandBuffer_ retain];
854  }
855 
856  mtlComputeEncoder_ = [mtlCommandBuffer_
857  computeCommandEncoderWithDispatchType:concurrent ? MTLDispatchTypeConcurrent :
858  MTLDispatchTypeSerial];
859 
860  [mtlComputeEncoder_ setLabel:@(device_kernel_as_string(kernel))];
861 
862  /* declare usage of MTLBuffers etc */
863  prepare_resources(kernel);
864  }
865 
866  return mtlComputeEncoder_;
867 }
868 
869 id<MTLBlitCommandEncoder> MetalDeviceQueue::get_blit_encoder()
870 {
871  if (mtlBlitEncoder_) {
872  return mtlBlitEncoder_;
873  }
874 
875  if (mtlComputeEncoder_) {
876  close_compute_encoder();
877  }
878 
879  if (!mtlCommandBuffer_) {
880  mtlCommandBuffer_ = [mtlCommandQueue_ commandBuffer];
881  [mtlCommandBuffer_ retain];
882  command_buffer_start_timing_id_ = timing_shared_event_id_;
883  }
884 
885  mtlBlitEncoder_ = [mtlCommandBuffer_ blitCommandEncoder];
886  return mtlBlitEncoder_;
887 }
888 
889 void MetalDeviceQueue::close_compute_encoder()
890 {
891  [mtlComputeEncoder_ endEncoding];
892  mtlComputeEncoder_ = nil;
893 
894  if (@available(macos 10.14, *)) {
895  if (timing_shared_event_) {
896  [mtlCommandBuffer_ encodeSignalEvent:timing_shared_event_ value:timing_shared_event_id_++];
897  }
898  }
899 }
900 
901 void MetalDeviceQueue::close_blit_encoder()
902 {
903  if (mtlBlitEncoder_) {
904  [mtlBlitEncoder_ endEncoding];
905  mtlBlitEncoder_ = nil;
906  }
907 }
908 
909 CCL_NAMESPACE_END
910 
911 #endif /* WITH_METAL */