Blender V3.3
device_impl.mm
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2021-2022 Blender Foundation */
3 
4 #ifdef WITH_METAL
5 
6 # include "device/metal/device_impl.h"
7 # include "device/metal/device.h"
8 
9 # include "scene/scene.h"
10 
11 # include "util/debug.h"
12 # include "util/md5.h"
13 # include "util/path.h"
14 # include "util/time.h"
15 
16 CCL_NAMESPACE_BEGIN
17 
18 class MetalDevice;
19 
20 BVHLayoutMask MetalDevice::get_bvh_layout_mask() const
21 {
22  return use_metalrt ? BVH_LAYOUT_METAL : BVH_LAYOUT_BVH2;
23 }
24 
25 void MetalDevice::set_error(const string &error)
26 {
27  static std::mutex s_error_mutex;
28  std::lock_guard<std::mutex> lock(s_error_mutex);
29 
30  error_msg = error;
31 
32  if (first_error) {
33  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
34  fprintf(stderr,
35  "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
36  first_error = false;
37  }
38 }
39 
40 MetalDevice::MetalDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
41  : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
42 {
43  mtlDevId = info.num;
44 
45  /* select chosen device */
46  auto usable_devices = MetalInfo::get_usable_devices();
47  assert(mtlDevId < usable_devices.size());
48  mtlDevice = usable_devices[mtlDevId];
49  device_vendor = MetalInfo::get_device_vendor(mtlDevice);
50  assert(device_vendor != METAL_GPU_UNKNOWN);
51  metal_printf("Creating new Cycles device for Metal: %s\n", info.description.c_str());
52 
53  /* determine default storage mode based on whether UMA is supported */
54 
55  default_storage_mode = MTLResourceStorageModeManaged;
56 
57  if (@available(macos 11.0, *)) {
58  if ([mtlDevice hasUnifiedMemory]) {
59  default_storage_mode = MTLResourceStorageModeShared;
60  init_host_memory();
61  }
62  }
63 
64  texture_bindings_2d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
65  texture_bindings_3d = [mtlDevice newBufferWithLength:4096 options:default_storage_mode];
66 
67  stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
68 
69  switch (device_vendor) {
70  default:
71  break;
72  case METAL_GPU_INTEL: {
73  max_threads_per_threadgroup = 64;
74  break;
75  }
76  case METAL_GPU_AMD: {
77  max_threads_per_threadgroup = 128;
78  break;
79  }
80  case METAL_GPU_APPLE: {
81  max_threads_per_threadgroup = 512;
82  use_metalrt = info.use_metalrt;
83 
84  /* Specialize the intersection kernels on Apple GPUs by default as these can be built very
85  * quickly. */
86  kernel_specialization_level = PSO_SPECIALIZED_INTERSECT;
87  break;
88  }
89  }
90 
91  if (auto metalrt = getenv("CYCLES_METALRT")) {
92  use_metalrt = (atoi(metalrt) != 0);
93  }
94 
95  if (getenv("CYCLES_DEBUG_METAL_CAPTURE_KERNEL")) {
96  capture_enabled = true;
97  }
98 
99  if (auto envstr = getenv("CYCLES_METAL_SPECIALIZATION_LEVEL")) {
100  kernel_specialization_level = (MetalPipelineType)atoi(envstr);
101  }
102  metal_printf("kernel_specialization_level = %s\n",
103  kernel_type_as_string(
104  (MetalPipelineType)min((int)kernel_specialization_level, (int)PSO_NUM - 1)));
105 
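  /* Build argument encoders up front: one for the KernelParamsMetal launch buffer
   * (treated as an array of device_ptr-sized fields) and one for individual texture
   * bindings. */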
106  MTLArgumentDescriptor *arg_desc_params = [[MTLArgumentDescriptor alloc] init];
107  arg_desc_params.dataType = MTLDataTypePointer;
108  arg_desc_params.access = MTLArgumentAccessReadOnly;
109  arg_desc_params.arrayLength = sizeof(KernelParamsMetal) / sizeof(device_ptr);
110  mtlBufferKernelParamsEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_params ]];
111 
112  MTLArgumentDescriptor *arg_desc_texture = [[MTLArgumentDescriptor alloc] init];
113  arg_desc_texture.dataType = MTLDataTypeTexture;
114  arg_desc_texture.access = MTLArgumentAccessReadOnly;
115  mtlTextureArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_texture ]];
116 
117  /* command queue for non-tracing work on the GPU */
118  mtlGeneralCommandQueue = [mtlDevice newCommandQueue];
119 
120  /* Acceleration structure arg encoder, if needed */
121  if (@available(macos 12.0, *)) {
122  if (use_metalrt) {
123  MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
124  arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
125  arg_desc_as.access = MTLArgumentAccessReadOnly;
126  mtlASArgEncoder = [mtlDevice newArgumentEncoderWithArguments:@[ arg_desc_as ]];
127  [arg_desc_as release];
128  }
129  }
130 
131  /* Build the arg encoder for the ancillary bindings */
132  {
133  NSMutableArray *ancillary_desc = [[NSMutableArray alloc] init];
134 
135  int index = 0;
136  MTLArgumentDescriptor *arg_desc_tex = [[MTLArgumentDescriptor alloc] init];
137  arg_desc_tex.dataType = MTLDataTypePointer;
138  arg_desc_tex.access = MTLArgumentAccessReadOnly;
139 
140  arg_desc_tex.index = index++;
141  [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_2d */
142  arg_desc_tex.index = index++;
143  [ancillary_desc addObject:[arg_desc_tex copy]]; /* metal_tex_3d */
144 
145  [arg_desc_tex release];
146 
147  if (@available(macos 12.0, *)) {
148  if (use_metalrt) {
149  MTLArgumentDescriptor *arg_desc_as = [[MTLArgumentDescriptor alloc] init];
150  arg_desc_as.dataType = MTLDataTypeInstanceAccelerationStructure;
151  arg_desc_as.access = MTLArgumentAccessReadOnly;
152 
153  MTLArgumentDescriptor *arg_desc_ift = [[MTLArgumentDescriptor alloc] init];
154  arg_desc_ift.dataType = MTLDataTypeIntersectionFunctionTable;
155  arg_desc_ift.access = MTLArgumentAccessReadOnly;
156 
157  arg_desc_as.index = index++;
158  [ancillary_desc addObject:[arg_desc_as copy]]; /* accel_struct */
159  arg_desc_ift.index = index++;
160  [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_default */
161  arg_desc_ift.index = index++;
162  [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_shadow */
163  arg_desc_ift.index = index++;
164  [ancillary_desc addObject:[arg_desc_ift copy]]; /* ift_local */
165 
166  [arg_desc_ift release];
167  [arg_desc_as release];
168  }
169  }
170 
171  mtlAncillaryArgEncoder = [mtlDevice newArgumentEncoderWithArguments:ancillary_desc];
172 
173  for (int i = 0; i < ancillary_desc.count; i++) {
174  [ancillary_desc[i] release];
175  }
176  [ancillary_desc release];
177  }
178  [arg_desc_params release];
179  [arg_desc_texture release];
180 }
181 
182 MetalDevice::~MetalDevice()
183 {
184  for (auto &tex : texture_slot_map) {
185  if (tex) {
186  [tex release];
187  tex = nil;
188  }
189  }
190  flush_delayed_free_list();
191 
192  if (texture_bindings_2d) {
193  stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
194 
195  [texture_bindings_2d release];
196  [texture_bindings_3d release];
197  }
198  [mtlTextureArgEncoder release];
199  [mtlBufferKernelParamsEncoder release];
200  [mtlASArgEncoder release];
201  [mtlAncillaryArgEncoder release];
202  [mtlGeneralCommandQueue release];
203  [mtlDevice release];
204 
205  texture_info.free();
206 }
207 
208 bool MetalDevice::support_device(const uint kernel_features /*requested_features*/)
209 {
210  return true;
211 }
212 
213 bool MetalDevice::check_peer_access(Device *peer_device)
214 {
215  assert(0);
216  /* does peer access make sense? */
217  return false;
218 }
219 
220 bool MetalDevice::use_adaptive_compilation()
221 {
222  return DebugFlags().metal.adaptive_compile;
223 }
224 
225 void MetalDevice::make_source(MetalPipelineType pso_type, const uint kernel_features)
226 {
227  string global_defines;
228  if (use_adaptive_compilation()) {
229  global_defines += "#define __KERNEL_FEATURES__ " + to_string(kernel_features) + "\n";
230  }
231 
232  if (use_metalrt) {
233  global_defines += "#define __METALRT__\n";
234  if (motion_blur) {
235  global_defines += "#define __METALRT_MOTION__\n";
236  }
237  }
238 
239 # ifdef WITH_CYCLES_DEBUG
240  global_defines += "#define __KERNEL_DEBUG__\n";
241 # endif
242 
243  switch (device_vendor) {
244  default:
245  break;
246  case METAL_GPU_INTEL:
247  global_defines += "#define __KERNEL_METAL_INTEL__\n";
248  break;
249  case METAL_GPU_AMD:
250  global_defines += "#define __KERNEL_METAL_AMD__\n";
251  break;
252  case METAL_GPU_APPLE:
253  global_defines += "#define __KERNEL_METAL_APPLE__\n";
254  break;
255  }
256 
257  string &source = this->source[pso_type];
258  source = "\n#include \"kernel/device/metal/kernel.metal\"\n";
259  source = path_source_replace_includes(source, path_get("source"));
260 
261  /* Perform any required specialization on the source.
262  * With Metal function constants we can generate a single variant of the kernel source which can
263  * be repeatedly respecialized.
264  */
265  string baked_constants;
266 
267  /* Replace specific KernelData "dot" dereferences with a Metal function_constant identifier of
268  * the same character length. Build a string of all active constant values which is then hashed
269  * in order to identify the PSO.
270  */
271  if (pso_type != PSO_GENERIC) {
272  const double starttime = time_dt();
273 
274 # define KERNEL_STRUCT_BEGIN(name, parent) \
275  string_replace_same_length(source, "kernel_data." #parent ".", "kernel_data_" #parent "_");
276 
277  /* Add constants to md5 so that 'get_best_pipeline' is able to return a suitable match. */
278 # define KERNEL_STRUCT_MEMBER(parent, _type, name) \
279  baked_constants += string(#parent "." #name "=") + \
280  to_string(_type(launch_params.data.parent.name)) + "\n";
281 
282 # include "kernel/data_template.h"
283 
284  /* Opt in to all of available specializations. This can be made more granular for the
285  * PSO_SPECIALIZED_INTERSECT case in order to minimize the number of specialization requests,
286  * but the overhead should be negligible as these are very quick to (re)build and aren't
287  * serialized to disk via MTLBinaryArchives.
288  */
289  global_defines += "#define __KERNEL_USE_DATA_CONSTANTS__\n";
290 
291  metal_printf("KernelData patching took %.1f ms\n", (time_dt() - starttime) * 1000.0);
292  }
293 
294  source = global_defines + source;
295  metal_printf("================\n%s================\n%s================\n",
296  global_defines.c_str(),
297  baked_constants.c_str());
298 
299  /* Generate an MD5 from the source and include any baked constants. This is used when caching
300  * PSOs. */
301  MD5Hash md5;
302  md5.append(baked_constants);
303  md5.append(source);
304  if (use_metalrt) {
305  md5.append(std::to_string(kernel_features & METALRT_FEATURE_MASK));
306  }
307  source_md5[pso_type] = md5.get_hex();
308 }
309 
310 bool MetalDevice::load_kernels(const uint _kernel_features)
311 {
312  kernel_features = _kernel_features;
313 
314  /* check if GPU is supported */
315  if (!support_device(kernel_features))
316  return false;
317 
318  /* Keep track of whether motion blur is enabled, so to enable/disable motion in BVH builds
319  * This is necessary since objects may be reported to have motion if the Vector pass is
320  * active, but may still need to be rendered without motion blur if that isn't active as well. */
321  motion_blur = kernel_features & KERNEL_FEATURE_OBJECT_MOTION;
322 
323  bool result = compile_and_load(PSO_GENERIC);
324 
325  reserve_local_memory(kernel_features);
326  return result;
327 }
328 
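/* Compile the generated Metal source for the given pipeline type into a MTLLibrary,
 * then hand off to MetalDeviceKernels::load() to build the compute pipeline states. */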
329 bool MetalDevice::compile_and_load(MetalPipelineType pso_type)
330 {
331  make_source(pso_type, kernel_features);
332 
333  if (!MetalDeviceKernels::should_load_kernels(this, pso_type)) {
334  /* We already have a full set of matching pipelines which are cached or queued. */
335  metal_printf("%s kernels already requested\n", kernel_type_as_string(pso_type));
336  return true;
337  }
338 
339  MTLCompileOptions *options = [[MTLCompileOptions alloc] init];
340 
341 # if defined(MAC_OS_VERSION_13_0)
342  if (@available(macos 13.0, *)) {
343  if (device_vendor == METAL_GPU_INTEL) {
344  [options setOptimizationLevel:MTLLibraryOptimizationLevelSize];
345  }
346  }
347 # endif
348 
349  options.fastMathEnabled = YES;
350  if (@available(macOS 12.0, *)) {
351  options.languageVersion = MTLLanguageVersion2_4;
352  }
353 
354  if (getenv("CYCLES_METAL_PROFILING") || getenv("CYCLES_METAL_DEBUG")) {
355  path_write_text(path_cache_get(string_printf("%s.metal", kernel_type_as_string(pso_type))),
356  source[pso_type]);
357  }
358 
359  const double starttime = time_dt();
360 
361  NSError *error = NULL;
362  mtlLibrary[pso_type] = [mtlDevice newLibraryWithSource:@(source[pso_type].c_str())
363  options:options
364  error:&error];
365 
366  if (!mtlLibrary[pso_type]) {
367  NSString *err = [error localizedDescription];
368  set_error(string_printf("Failed to compile library:\n%s", [err UTF8String]));
369  }
370 
371  metal_printf("Front-end compilation finished in %.1f seconds (%s)\n",
372  time_dt() - starttime,
373  kernel_type_as_string(pso_type));
374 
375  [options release];
376 
377  return MetalDeviceKernels::load(this, pso_type);
378 }
379 
380 void MetalDevice::reserve_local_memory(const uint kernel_features)
381 {
382  /* METAL_WIP - implement this */
383 }
384 
385 void MetalDevice::init_host_memory()
386 {
387  /* METAL_WIP - implement this */
388 }
389 
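/* Re-encode the 2D/3D texture argument buffers from texture_slot_map so kernels see
 * the current set of bindless textures. Only runs when need_texture_info is set. */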
390 void MetalDevice::load_texture_info()
391 {
392  if (need_texture_info) {
393  /* Unset flag before copying. */
394  need_texture_info = false;
395  texture_info.copy_to_device();
396 
397  int num_textures = texture_info.size();
398 
399  for (int tex = 0; tex < num_textures; tex++) {
400  uint64_t offset = tex * sizeof(void *);
401 
402  id<MTLTexture> metal_texture = texture_slot_map[tex];
403  if (!metal_texture) {
404  [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
405  [mtlTextureArgEncoder setTexture:nil atIndex:0];
406  [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
407  [mtlTextureArgEncoder setTexture:nil atIndex:0];
408  }
409  else {
410  MTLTextureType type = metal_texture.textureType;
411  [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_2d offset:offset];
412  [mtlTextureArgEncoder setTexture:type == MTLTextureType2D ? metal_texture : nil atIndex:0];
413  [mtlTextureArgEncoder setArgumentBuffer:texture_bindings_3d offset:offset];
414  [mtlTextureArgEncoder setTexture:type == MTLTextureType3D ? metal_texture : nil atIndex:0];
415  }
416  }
417  if (default_storage_mode == MTLResourceStorageModeManaged) {
418  [texture_bindings_2d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
419  [texture_bindings_3d didModifyRange:NSMakeRange(0, num_textures * sizeof(void *))];
420  }
421  }
422 }
423 
424 void MetalDevice::erase_allocation(device_memory &mem)
425 {
426  stats.mem_free(mem.device_size);
427  mem.device_pointer = 0;
428  mem.device_size = 0;
429 
430  auto it = metal_mem_map.find(&mem);
431  if (it != metal_mem_map.end()) {
432  MetalMem *mmem = it->second.get();
433 
434  /* blank out reference to MetalMem* in the launch params (fixes crash T94736) */
435  if (mmem->pointer_index >= 0) {
436  device_ptr *pointers = (device_ptr *)&launch_params;
437  pointers[mmem->pointer_index] = 0;
438  }
439  metal_mem_map.erase(it);
440  }
441 }
442 
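/* Allocate a MTLBuffer for generic device memory. For buffers in shared (unified
 * memory) storage, the buffer contents replace the host allocation so no copy is
 * needed. */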
443 MetalDevice::MetalMem *MetalDevice::generic_alloc(device_memory &mem)
444 {
445  size_t size = mem.memory_size();
446 
447  mem.device_pointer = 0;
448 
449  id<MTLBuffer> metal_buffer = nil;
450  MTLResourceOptions options = default_storage_mode;
451 
452  /* Workaround for "bake" unit tests which fail if RenderBuffers is allocated with
453  * MTLResourceStorageModeShared. */
454  if (strstr(mem.name, "RenderBuffers")) {
455  options = MTLResourceStorageModeManaged;
456  }
457 
458  if (size > 0) {
459  if (mem.type == MEM_DEVICE_ONLY && !capture_enabled) {
460  options = MTLResourceStorageModePrivate;
461  }
462 
463  metal_buffer = [mtlDevice newBufferWithLength:size options:options];
464 
465  if (!metal_buffer) {
466  set_error("System is out of GPU memory");
467  return nullptr;
468  }
469  }
470 
471  if (mem.name) {
472  VLOG_WORK << "Buffer allocate: " << mem.name << ", "
473  << string_human_readable_number(mem.memory_size()) << " bytes. ("
474  << string_human_readable_size(mem.memory_size()) << ")";
475  }
476 
477  mem.device_size = metal_buffer.allocatedSize;
478  stats.mem_alloc(mem.device_size);
479 
480  metal_buffer.label = [[NSString alloc] initWithFormat:@"%s", mem.name];
481 
482  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
483 
484  assert(metal_mem_map.count(&mem) == 0); /* assert against double-alloc */
485  MetalMem *mmem = new MetalMem;
486  metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
487 
488  mmem->mem = &mem;
489  mmem->mtlBuffer = metal_buffer;
490  mmem->offset = 0;
491  mmem->size = size;
492  if (options != MTLResourceStorageModePrivate) {
493  mmem->hostPtr = [metal_buffer contents];
494  }
495  else {
496  mmem->hostPtr = nullptr;
497  }
498 
499  /* encode device_pointer as (MetalMem*) in order to handle resource relocation and device pointer
500  * recalculation */
501  mem.device_pointer = device_ptr(mmem);
502 
503  if (metal_buffer.storageMode == MTLResourceStorageModeShared) {
504  /* Replace host pointer with our host allocation. */
505 
506  if (mem.host_pointer && mem.host_pointer != mmem->hostPtr) {
507  memcpy(mmem->hostPtr, mem.host_pointer, size);
508 
509  mem.host_free();
510  mem.host_pointer = mmem->hostPtr;
511  }
512  mem.shared_pointer = mmem->hostPtr;
513  mem.shared_counter++;
514  mmem->use_UMA = true;
515  }
516  else {
517  mmem->use_UMA = false;
518  }
519 
520  return mmem;
521 }
522 
523 void MetalDevice::generic_copy_to(device_memory &mem)
524 {
525  if (!mem.host_pointer || !mem.device_pointer) {
526  return;
527  }
528 
529  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
530  if (!metal_mem_map.at(&mem)->use_UMA || mem.host_pointer != mem.shared_pointer) {
531  MetalMem &mmem = *metal_mem_map.at(&mem);
532  memcpy(mmem.hostPtr, mem.host_pointer, mem.memory_size());
533  if (mmem.mtlBuffer.storageMode == MTLStorageModeManaged) {
534  [mmem.mtlBuffer didModifyRange:NSMakeRange(0, mem.memory_size())];
535  }
536  }
537 }
538 
539 void MetalDevice::generic_free(device_memory &mem)
540 {
541  if (mem.device_pointer) {
542  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
543  MetalMem &mmem = *metal_mem_map.at(&mem);
544  size_t size = mmem.size;
545 
546  /* If mmem.use_uma is true, reference counting is used
547  * to safely free memory. */
548 
549  bool free_mtlBuffer = false;
550 
551  if (mmem.use_UMA) {
552  assert(mem.shared_pointer);
553  if (mem.shared_pointer) {
554  assert(mem.shared_counter > 0);
555  if (--mem.shared_counter == 0) {
556  free_mtlBuffer = true;
557  }
558  }
559  }
560  else {
561  free_mtlBuffer = true;
562  }
563 
564  if (free_mtlBuffer) {
565  if (mem.host_pointer && mem.host_pointer == mem.shared_pointer) {
566  /* Safely move the device-side data back to the host before it is freed. */
567  mem.host_pointer = mem.host_alloc(size);
568  memcpy(mem.host_pointer, mem.shared_pointer, size);
569  mmem.use_UMA = false;
570  }
571 
572  mem.shared_pointer = 0;
573 
574  /* Free device memory. */
575  delayed_free_list.push_back(mmem.mtlBuffer);
576  mmem.mtlBuffer = nil;
577  }
578 
579  erase_allocation(mem);
580  }
581 }
582 
583 void MetalDevice::mem_alloc(device_memory &mem)
584 {
585  if (mem.type == MEM_TEXTURE) {
586  assert(!"mem_alloc not supported for textures.");
587  }
588  else if (mem.type == MEM_GLOBAL) {
589  generic_alloc(mem);
590  }
591  else {
592  generic_alloc(mem);
593  }
594 }
595 
596 void MetalDevice::mem_copy_to(device_memory &mem)
597 {
598  if (mem.type == MEM_GLOBAL) {
599  global_free(mem);
600  global_alloc(mem);
601  }
602  else if (mem.type == MEM_TEXTURE) {
603  tex_free((device_texture &)mem);
604  tex_alloc((device_texture &)mem);
605  }
606  else {
607  if (!mem.device_pointer) {
608  generic_alloc(mem);
609  }
610  generic_copy_to(mem);
611  }
612 }
613 
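/* Copy data from device back to host. Managed buffers are first synchronized with a
 * blit command so the host-visible copy reflects the latest GPU writes. */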
614 void MetalDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
615 {
616  if (mem.host_pointer) {
617 
618  bool subcopy = (w >= 0 && h >= 0);
619  const size_t size = subcopy ? (elem * w * h) : mem.memory_size();
620  const size_t offset = subcopy ? (elem * y * w) : 0;
621 
622  if (mem.device_pointer) {
623  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
624  MetalMem &mmem = *metal_mem_map.at(&mem);
625 
626  if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
627 
628  id<MTLCommandBuffer> cmdBuffer = [mtlGeneralCommandQueue commandBuffer];
629  id<MTLBlitCommandEncoder> blitEncoder = [cmdBuffer blitCommandEncoder];
630  [blitEncoder synchronizeResource:mmem.mtlBuffer];
631  [blitEncoder endEncoding];
632  [cmdBuffer commit];
633  [cmdBuffer waitUntilCompleted];
634  }
635 
636  if (mem.host_pointer != mmem.hostPtr) {
637  memcpy((uchar *)mem.host_pointer + offset, (uchar *)mmem.hostPtr + offset, size);
638  }
639  }
640  else {
641  memset((char *)mem.host_pointer + offset, 0, size);
642  }
643  }
644 }
645 
646 void MetalDevice::mem_zero(device_memory &mem)
647 {
648  if (!mem.device_pointer) {
649  mem_alloc(mem);
650  }
651  if (!mem.device_pointer) {
652  return;
653  }
654 
655  size_t size = mem.memory_size();
656  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
657  MetalMem &mmem = *metal_mem_map.at(&mem);
658  memset(mmem.hostPtr, 0, size);
659  if ([mmem.mtlBuffer storageMode] == MTLStorageModeManaged) {
660  [mmem.mtlBuffer didModifyRange:NSMakeRange(0, size)];
661  }
662 }
663 
664 void MetalDevice::mem_free(device_memory &mem)
665 {
666  if (mem.type == MEM_GLOBAL) {
667  global_free(mem);
668  }
669  else if (mem.type == MEM_TEXTURE) {
670  tex_free((device_texture &)mem);
671  }
672  else {
673  generic_free(mem);
674  }
675 }
676 
677 device_ptr MetalDevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
678 {
679  /* METAL_WIP - revive if necessary */
680  assert(0);
681  return 0;
682 }
683 
684 void MetalDevice::optimize_for_scene(Scene *scene)
685 {
686  MetalPipelineType specialization_level = kernel_specialization_level;
687 
688  if (specialization_level < PSO_SPECIALIZED_INTERSECT) {
689  return;
690  }
691 
692  /* PSO_SPECIALIZED_INTERSECT kernels are fast to specialize, so we always load them
693  * synchronously. */
694  compile_and_load(PSO_SPECIALIZED_INTERSECT);
695 
696  if (specialization_level < PSO_SPECIALIZED_SHADE) {
697  return;
698  }
699  if (!scene->params.background) {
700  /* Don't load PSO_SPECIALIZED_SHADE kernels during viewport rendering as they are slower to
701  * build. */
702  return;
703  }
704 
705  /* PSO_SPECIALIZED_SHADE kernels are slower to specialize, so we load them asynchronously, and
706  * only if there isn't an existing load in flight.
707  */
708  auto specialize_shade_fn = ^() {
709  compile_and_load(PSO_SPECIALIZED_SHADE);
710  async_compile_and_load = false;
711  };
712 
713  bool async_specialize_shade = true;
714 
715  /* Block if a per-kernel profiling is enabled (ensure steady rendering rate). */
716  if (getenv("CYCLES_METAL_PROFILING") != nullptr) {
717  async_specialize_shade = false;
718  }
719 
720  if (async_specialize_shade) {
721  if (!async_compile_and_load) {
722  async_compile_and_load = true;
723  dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0),
724  specialize_shade_fn);
725  }
726  else {
727  metal_printf(
728  "Async PSO_SPECIALIZED_SHADE load request already in progress - dropping request\n");
729  }
730  }
731  else {
732  specialize_shade_fn();
733  }
734 }
735 
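/* Copy named constant data into launch_params. For pointer-typed entries, record the
 * slot index on the owning MetalMem so erase_allocation() can clear it later. */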
736 void MetalDevice::const_copy_to(const char *name, void *host, size_t size)
737 {
738  if (strcmp(name, "data") == 0) {
739  assert(size == sizeof(KernelData));
740  memcpy((uint8_t *)&launch_params.data, host, sizeof(KernelData));
741  return;
742  }
743 
744  auto update_launch_pointers =
745  [&](size_t offset, void *data, size_t data_size, size_t pointers_size) {
746  memcpy((uint8_t *)&launch_params + offset, data, data_size);
747 
748  MetalMem **mmem = (MetalMem **)data;
749  int pointer_count = pointers_size / sizeof(device_ptr);
750  int pointer_index = offset / sizeof(device_ptr);
751  for (int i = 0; i < pointer_count; i++) {
752  if (mmem[i]) {
753  mmem[i]->pointer_index = pointer_index + i;
754  }
755  }
756  };
757 
758  /* Update data storage pointers in launch parameters. */
759  if (strcmp(name, "integrator_state") == 0) {
760  /* IntegratorStateGPU is contiguous pointers */
761  const size_t pointer_block_size = offsetof(IntegratorStateGPU, sort_partition_divisor);
762  update_launch_pointers(
763  offsetof(KernelParamsMetal, integrator_state), host, size, pointer_block_size);
764  }
765 # define KERNEL_DATA_ARRAY(data_type, tex_name) \
766  else if (strcmp(name, #tex_name) == 0) \
767  { \
768  update_launch_pointers(offsetof(KernelParamsMetal, tex_name), host, size, size); \
769  }
770 # include "kernel/data_arrays.h"
771 # undef KERNEL_DATA_ARRAY
772 }
773 
774 void MetalDevice::global_alloc(device_memory &mem)
775 {
776  if (mem.is_resident(this)) {
777  generic_alloc(mem);
778  generic_copy_to(mem);
779  }
780 
781  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
782 }
783 
784 void MetalDevice::global_free(device_memory &mem)
785 {
786  if (mem.is_resident(this) && mem.device_pointer) {
787  generic_free(mem);
788  }
789 }
790 
791 void MetalDevice::tex_alloc_as_buffer(device_texture &mem)
792 {
793  generic_alloc(mem);
794  generic_copy_to(mem);
795 
796  /* Resize once */
797  const uint slot = mem.slot;
798  if (slot >= texture_info.size()) {
799  /* Allocate some slots in advance, to reduce amount
800  * of re-allocations. */
801  texture_info.resize(round_up(slot + 1, 128));
802  }
803 
804  mem.info.data = (uint64_t)mem.device_pointer;
805 
806  /* Set Mapping and tag that we need to (re-)upload to device */
807  texture_info[slot] = mem.info;
808  need_texture_info = true;
809 }
810 
811 void MetalDevice::tex_alloc(device_texture &mem)
812 {
813  /* Check that dimensions fit within maximum allowable size.
814  * See: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf */
815  if (mem.data_width > 16384 || mem.data_height > 16384) {
816  set_error(string_printf(
817  "Texture exceeds maximum allowed size of 16384 x 16384 (requested: %zu x %zu)",
818  mem.data_width,
819  mem.data_height));
820  return;
821  }
822 
823  MTLStorageMode storage_mode = MTLStorageModeManaged;
824  if (@available(macos 10.15, *)) {
825  if ([mtlDevice hasUnifiedMemory] &&
826  device_vendor !=
827  METAL_GPU_INTEL) { /* Intel GPUs don't support MTLStorageModeShared for MTLTextures */
828  storage_mode = MTLStorageModeShared;
829  }
830  }
831 
832  /* General variables for both architectures */
833  string bind_name = mem.name;
834  size_t dsize = datatype_size(mem.data_type);
835  size_t size = mem.memory_size();
836 
837  /* sampler_index maps into the GPU's constant 'metal_samplers' array */
838  uint64_t sampler_index = mem.info.extension;
839  if (mem.info.interpolation != INTERPOLATION_CLOSEST) {
840  sampler_index += 3;
841  }
842 
843  /* Image Texture Storage */
844  MTLPixelFormat format;
845  switch (mem.data_type) {
846  case TYPE_UCHAR: {
847  MTLPixelFormat formats[] = {MTLPixelFormatR8Unorm,
848  MTLPixelFormatRG8Unorm,
849  MTLPixelFormatInvalid,
850  MTLPixelFormatRGBA8Unorm};
851  format = formats[mem.data_elements - 1];
852  } break;
853  case TYPE_UINT16: {
854  MTLPixelFormat formats[] = {MTLPixelFormatR16Unorm,
855  MTLPixelFormatRG16Unorm,
856  MTLPixelFormatInvalid,
857  MTLPixelFormatRGBA16Unorm};
858  format = formats[mem.data_elements - 1];
859  } break;
860  case TYPE_UINT: {
861  MTLPixelFormat formats[] = {MTLPixelFormatR32Uint,
862  MTLPixelFormatRG32Uint,
863  MTLPixelFormatInvalid,
864  MTLPixelFormatRGBA32Uint};
865  format = formats[mem.data_elements - 1];
866  } break;
867  case TYPE_INT: {
868  MTLPixelFormat formats[] = {MTLPixelFormatR32Sint,
869  MTLPixelFormatRG32Sint,
870  MTLPixelFormatInvalid,
871  MTLPixelFormatRGBA32Sint};
872  format = formats[mem.data_elements - 1];
873  } break;
874  case TYPE_FLOAT: {
875  MTLPixelFormat formats[] = {MTLPixelFormatR32Float,
876  MTLPixelFormatRG32Float,
877  MTLPixelFormatInvalid,
878  MTLPixelFormatRGBA32Float};
879  format = formats[mem.data_elements - 1];
880  } break;
881  case TYPE_HALF: {
882  MTLPixelFormat formats[] = {MTLPixelFormatR16Float,
883  MTLPixelFormatRG16Float,
884  MTLPixelFormatInvalid,
885  MTLPixelFormatRGBA16Float};
886  format = formats[mem.data_elements - 1];
887  } break;
888  default:
889  assert(0);
890  return;
891  }
892 
893  assert(format != MTLPixelFormatInvalid);
894 
895  id<MTLTexture> mtlTexture = nil;
896  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
897 
898  if (mem.data_depth > 1) {
899  /* 3D texture using array */
900  MTLTextureDescriptor *desc;
901 
902  desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
903  width:mem.data_width
904  height:mem.data_height
905  mipmapped:NO];
906 
907  desc.storageMode = storage_mode;
908  desc.usage = MTLTextureUsageShaderRead;
909 
910  desc.textureType = MTLTextureType3D;
911  desc.depth = mem.data_depth;
912 
913  VLOG_WORK << "Texture 3D allocate: " << mem.name << ", "
914  << string_human_readable_number(mem.memory_size()) << " bytes. ("
915  << string_human_readable_size(mem.memory_size()) << ")";
916 
917  mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
918  assert(mtlTexture);
919 
920  if (!mtlTexture) {
921  return;
922  }
923 
924  const size_t imageBytes = src_pitch * mem.data_height;
925  for (size_t d = 0; d < mem.data_depth; d++) {
926  const size_t offset = d * imageBytes;
927  [mtlTexture replaceRegion:MTLRegionMake3D(0, 0, d, mem.data_width, mem.data_height, 1)
928  mipmapLevel:0
929  slice:0
930  withBytes:(uint8_t *)mem.host_pointer + offset
931  bytesPerRow:src_pitch
932  bytesPerImage:0];
933  }
934  }
935  else if (mem.data_height > 0) {
936  /* 2D texture */
937  MTLTextureDescriptor *desc;
938 
939  desc = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:format
940  width:mem.data_width
941  height:mem.data_height
942  mipmapped:NO];
943 
944  desc.storageMode = storage_mode;
945  desc.usage = MTLTextureUsageShaderRead;
946 
947  VLOG_WORK << "Texture 2D allocate: " << mem.name << ", "
948  << string_human_readable_number(mem.memory_size()) << " bytes. ("
949  << string_human_readable_size(mem.memory_size()) << ")";
950 
951  mtlTexture = [mtlDevice newTextureWithDescriptor:desc];
952  assert(mtlTexture);
953 
954  [mtlTexture replaceRegion:MTLRegionMake2D(0, 0, mem.data_width, mem.data_height)
955  mipmapLevel:0
956  withBytes:mem.host_pointer
957  bytesPerRow:src_pitch];
958  }
959  else {
960  assert(0);
961  /* 1D texture, using linear memory. */
962  }
963 
964  mem.device_pointer = (device_ptr)mtlTexture;
965  mem.device_size = size;
966  stats.mem_alloc(size);
967 
968  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
969  MetalMem *mmem = new MetalMem;
970  metal_mem_map[&mem] = std::unique_ptr<MetalMem>(mmem);
971  mmem->mem = &mem;
972  mmem->mtlTexture = mtlTexture;
973 
974  /* Resize once */
975  const uint slot = mem.slot;
976  if (slot >= texture_info.size()) {
977  /* Allocate some slots in advance, to reduce amount
978  * of re-allocations. */
979  texture_info.resize(slot + 128);
980  texture_slot_map.resize(slot + 128);
981 
982  ssize_t min_buffer_length = sizeof(void *) * texture_info.size();
983  if (!texture_bindings_2d || (texture_bindings_2d.length < min_buffer_length)) {
984  if (texture_bindings_2d) {
985  delayed_free_list.push_back(texture_bindings_2d);
986  delayed_free_list.push_back(texture_bindings_3d);
987 
988  stats.mem_free(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
989  }
990  texture_bindings_2d = [mtlDevice newBufferWithLength:min_buffer_length
991  options:default_storage_mode];
992  texture_bindings_3d = [mtlDevice newBufferWithLength:min_buffer_length
993  options:default_storage_mode];
994 
995  stats.mem_alloc(texture_bindings_2d.allocatedSize + texture_bindings_3d.allocatedSize);
996  }
997  }
998 
999  if (@available(macos 10.14, *)) {
1000  /* Optimize the texture for GPU access. */
1001  id<MTLCommandBuffer> commandBuffer = [mtlGeneralCommandQueue commandBuffer];
1002  id<MTLBlitCommandEncoder> blitCommandEncoder = [commandBuffer blitCommandEncoder];
1003  [blitCommandEncoder optimizeContentsForGPUAccess:mtlTexture];
1004  [blitCommandEncoder endEncoding];
1005  [commandBuffer commit];
1006  }
1007 
1008  /* Set Mapping and tag that we need to (re-)upload to device */
1009  texture_slot_map[slot] = mtlTexture;
1010  texture_info[slot] = mem.info;
1011  need_texture_info = true;
1012 
1013  texture_info[slot].data = uint64_t(slot) | (sampler_index << 32);
1014 }
1015 
1016 void MetalDevice::tex_free(device_texture &mem)
1017 {
1018  if (metal_mem_map.count(&mem)) {
1019  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
1020  MetalMem &mmem = *metal_mem_map.at(&mem);
1021 
1022  assert(texture_slot_map[mem.slot] == mmem.mtlTexture);
1023  texture_slot_map[mem.slot] = nil;
1024 
1025  if (mmem.mtlTexture) {
1026  /* Free bindless texture. */
1027  delayed_free_list.push_back(mmem.mtlTexture);
1028  mmem.mtlTexture = nil;
1029  }
1030  erase_allocation(mem);
1031  }
1032 }
1033 
1034 unique_ptr<DeviceQueue> MetalDevice::gpu_queue_create()
1035 {
1036  return make_unique<MetalDeviceQueue>(this);
1037 }
1038 
1039 bool MetalDevice::should_use_graphics_interop()
1040 {
1041  /* METAL_WIP - provide fast interop */
1042  return false;
1043 }
1044 
1045 void MetalDevice::flush_delayed_free_list()
1046 {
1047  /* free any Metal buffers that may have been freed by host while a command
1048  * buffer was being generated. This function should be called after each
1049  * completion of a command buffer */
1050  std::lock_guard<std::recursive_mutex> lock(metal_mem_map_mutex);
1051  for (auto &it : delayed_free_list) {
1052  [it release];
1053  }
1054  delayed_free_list.clear();
1055 }
1056 
1057 void MetalDevice::build_bvh(BVH *bvh, Progress &progress, bool refit)
1058 {
1059  if (bvh->params.bvh_layout == BVH_LAYOUT_BVH2) {
1060  Device::build_bvh(bvh, progress, refit);
1061  return;
1062  }
1063 
1064  BVHMetal *bvh_metal = static_cast<BVHMetal *>(bvh);
1065  bvh_metal->motion_blur = motion_blur;
1066  if (bvh_metal->build(progress, mtlDevice, mtlGeneralCommandQueue, refit)) {
1067 
1068  if (@available(macos 11.0, *)) {
1069  if (bvh->params.top_level) {
1070  bvhMetalRT = bvh_metal;
1071  }
1072  }
1073  }
1074 }
1075 
1076 CCL_NAMESPACE_END
1077 
1078 #endif