Blender V3.3
cuda/device_impl.cpp
1 /* SPDX-License-Identifier: Apache-2.0
2  * Copyright 2011-2022 Blender Foundation */
3 
4 #ifdef WITH_CUDA
5 
6 # include <climits>
7 # include <limits.h>
8 # include <stdio.h>
9 # include <stdlib.h>
10 # include <string.h>
11 
12 # include "device/cuda/device_impl.h"
13 
14 # include "util/debug.h"
15 # include "util/foreach.h"
16 # include "util/log.h"
17 # include "util/map.h"
18 # include "util/md5.h"
19 # include "util/path.h"
20 # include "util/string.h"
21 # include "util/system.h"
22 # include "util/time.h"
23 # include "util/types.h"
24 # include "util/windows.h"
25 
26 # include "kernel/device/cuda/globals.h"
27 
28 CCL_NAMESPACE_BEGIN
29 
30 class CUDADevice;
31 
32 bool CUDADevice::have_precompiled_kernels()
33 {
34  string cubins_path = path_get("lib");
35  return path_exists(cubins_path);
36 }
37 
38 BVHLayoutMask CUDADevice::get_bvh_layout_mask() const
39 {
40  return BVH_LAYOUT_BVH2;
41 }
42 
43 void CUDADevice::set_error(const string &error)
44 {
45  Device::set_error(error);
46 
47  if (first_error) {
48  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
49  fprintf(stderr,
50  "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
51  first_error = false;
52  }
53 }
54 
55 CUDADevice::CUDADevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
56  : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
57 {
58  first_error = true;
59 
60  cuDevId = info.num;
61  cuDevice = 0;
62  cuContext = 0;
63 
64  cuModule = 0;
65 
66  need_texture_info = false;
67 
68  device_texture_headroom = 0;
69  device_working_headroom = 0;
70  move_texture_to_host = false;
71  map_host_limit = 0;
72  map_host_used = 0;
73  can_map_host = 0;
74  pitch_alignment = 0;
75 
76  /* Initialize CUDA. */
77  CUresult result = cuInit(0);
78  if (result != CUDA_SUCCESS) {
79  set_error(string_printf("Failed to initialize CUDA runtime (%s)", cuewErrorString(result)));
80  return;
81  }
82 
83  /* Setup device and context. */
84  result = cuDeviceGet(&cuDevice, cuDevId);
85  if (result != CUDA_SUCCESS) {
86  set_error(string_printf("Failed to get CUDA device handle from ordinal (%s)",
87  cuewErrorString(result)));
88  return;
89  }
90 
91  /* CU_CTX_MAP_HOST for mapping host memory when out of device memory.
92  * CU_CTX_LMEM_RESIZE_TO_MAX for reserving local memory ahead of render,
93  * so we can predict which memory to map to host. */
94  cuda_assert(
95  cuDeviceGetAttribute(&can_map_host, CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, cuDevice));
96 
97  cuda_assert(cuDeviceGetAttribute(
98  &pitch_alignment, CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT, cuDevice));
99 
100  unsigned int ctx_flags = CU_CTX_LMEM_RESIZE_TO_MAX;
101  if (can_map_host) {
102  ctx_flags |= CU_CTX_MAP_HOST;
103  init_host_memory();
104  }
105 
106  /* Create context. */
107  result = cuCtxCreate(&cuContext, ctx_flags, cuDevice);
108 
109  if (result != CUDA_SUCCESS) {
110  set_error(string_printf("Failed to create CUDA context (%s)", cuewErrorString(result)));
111  return;
112  }
113 
114  int major, minor;
115  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
116  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
117  cuDevArchitecture = major * 100 + minor * 10;
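 /* For example, a compute capability 8.6 device yields cuDevArchitecture == 860
  * (8 * 100 + 6 * 10), and a 3.5 device yields 350. */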
118 
119  /* Pop context set by cuCtxCreate. */
120  cuCtxPopCurrent(NULL);
121 }
122 
123 CUDADevice::~CUDADevice()
124 {
125  texture_info.free();
126 
127  cuda_assert(cuCtxDestroy(cuContext));
128 }
129 
130 bool CUDADevice::support_device(const uint /*kernel_features*/)
131 {
132  int major, minor;
133  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
134  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
135 
136  /* We only support sm_30 and above */
137  if (major < 3) {
138  set_error(string_printf(
139  "CUDA backend requires compute capability 3.0 or up, but found %d.%d.", major, minor));
140  return false;
141  }
142 
143  return true;
144 }
145 
146 bool CUDADevice::check_peer_access(Device *peer_device)
147 {
148  if (peer_device == this) {
149  return false;
150  }
151  if (peer_device->info.type != DEVICE_CUDA && peer_device->info.type != DEVICE_OPTIX) {
152  return false;
153  }
154 
155  CUDADevice *const peer_device_cuda = static_cast<CUDADevice *>(peer_device);
156 
157  int can_access = 0;
158  cuda_assert(cuDeviceCanAccessPeer(&can_access, cuDevice, peer_device_cuda->cuDevice));
159  if (can_access == 0) {
160  return false;
161  }
162 
163  // Ensure array access over the link is possible as well (for 3D textures)
164  cuda_assert(cuDeviceGetP2PAttribute(&can_access,
165  CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED,
166  cuDevice,
167  peer_device_cuda->cuDevice));
168  if (can_access == 0) {
169  return false;
170  }
171 
172  // Enable peer access in both directions
173  {
174  const CUDAContextScope scope(this);
175  CUresult result = cuCtxEnablePeerAccess(peer_device_cuda->cuContext, 0);
176  if (result != CUDA_SUCCESS) {
177  set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
178  cuewErrorString(result)));
179  return false;
180  }
181  }
182  {
183  const CUDAContextScope scope(peer_device_cuda);
184  CUresult result = cuCtxEnablePeerAccess(cuContext, 0);
185  if (result != CUDA_SUCCESS) {
186  set_error(string_printf("Failed to enable peer access on CUDA context (%s)",
187  cuewErrorString(result)));
188  return false;
189  }
190  }
191 
192  return true;
193 }
194 
195 bool CUDADevice::use_adaptive_compilation()
196 {
197  return DebugFlags().cuda.adaptive_compile;
198 }
199 
200 /* Common NVCC flags which stay the same regardless of shading model or
201  * kernel source md5, and depend only on compiler or compilation settings.
202  */
203 string CUDADevice::compile_kernel_get_common_cflags(const uint kernel_features)
204 {
205  const int machine = system_cpu_bits();
206  const string source_path = path_get("source");
207  const string include_path = source_path;
208  string cflags = string_printf(
209  "-m%d "
210  "--ptxas-options=\"-v\" "
211  "--use_fast_math "
212  "-DNVCC "
213  "-I\"%s\"",
214  machine,
215  include_path.c_str());
216  if (use_adaptive_compilation()) {
217  cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
218  }
219  const char *extra_cflags = getenv("CYCLES_CUDA_EXTRA_CFLAGS");
220  if (extra_cflags) {
221  cflags += string(" ") + string(extra_cflags);
222  }
223 
224 # ifdef WITH_NANOVDB
225  cflags += " -DWITH_NANOVDB";
226 # endif
227 
228 # ifdef WITH_CYCLES_DEBUG
229  cflags += " -DWITH_CYCLES_DEBUG";
230 # endif
231 
232  return cflags;
233 }
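 /* Illustrative result (paths and feature mask depend on the local setup): on a
  * 64-bit machine this returns roughly
  *   -m64 --ptxas-options="-v" --use_fast_math -DNVCC -I"<source dir>"
  * with -D__KERNEL_FEATURES__=<mask> appended when adaptive compilation is enabled,
  * any CYCLES_CUDA_EXTRA_CFLAGS content appended verbatim, and -DWITH_NANOVDB /
  * -DWITH_CYCLES_DEBUG appended when those build options are set. */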
234 
235 string CUDADevice::compile_kernel(const uint kernel_features,
236  const char *name,
237  const char *base,
238  bool force_ptx)
239 {
240  /* Compute kernel name. */
241  int major, minor;
242  cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevId);
243  cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevId);
244 
245  /* Attempt to use kernel provided with Blender. */
246  if (!use_adaptive_compilation()) {
247  if (!force_ptx) {
248  const string cubin = path_get(string_printf("lib/%s_sm_%d%d.cubin", name, major, minor));
249  VLOG_INFO << "Testing for pre-compiled kernel " << cubin << ".";
250  if (path_exists(cubin)) {
251  VLOG_INFO << "Using precompiled kernel.";
252  return cubin;
253  }
254  }
255 
256  /* The driver can JIT-compile PTX generated for older generations, so find the closest one. */
257  int ptx_major = major, ptx_minor = minor;
258  while (ptx_major >= 3) {
259  const string ptx = path_get(
260  string_printf("lib/%s_compute_%d%d.ptx", name, ptx_major, ptx_minor));
261  VLOG_INFO << "Testing for pre-compiled kernel " << ptx << ".";
262  if (path_exists(ptx)) {
263  VLOG_INFO << "Using precompiled kernel.";
264  return ptx;
265  }
266 
267  if (ptx_minor > 0) {
268  ptx_minor--;
269  }
270  else {
271  ptx_major--;
272  ptx_minor = 9;
273  }
274  }
275  }
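 /* For instance, on a compute capability 8.6 device with name == "kernel", the loop
  * above probes lib/kernel_compute_86.ptx, lib/kernel_compute_85.ptx, ... down to
  * lib/kernel_compute_30.ptx, and returns the first file that exists. */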
276 
277  /* Try to use locally compiled kernel. */
278  string source_path = path_get("source");
279  const string source_md5 = path_files_md5_hash(source_path);
280 
281  /* We include cflags in the md5, so that changing the CUDA toolkit or other
282  * compiler command line arguments causes the cubin to be re-built.
283  */
284  string common_cflags = compile_kernel_get_common_cflags(kernel_features);
285  const string kernel_md5 = util_md5_string(source_md5 + common_cflags);
286 
287  const char *const kernel_ext = force_ptx ? "ptx" : "cubin";
288  const char *const kernel_arch = force_ptx ? "compute" : "sm";
289  const string cubin_file = string_printf(
290  "cycles_%s_%s_%d%d_%s.%s", name, kernel_arch, major, minor, kernel_md5.c_str(), kernel_ext);
291  const string cubin = path_cache_get(path_join("kernels", cubin_file));
292  VLOG_INFO << "Testing for locally compiled kernel " << cubin << ".";
293  if (path_exists(cubin)) {
294  VLOG_INFO << "Using locally compiled kernel.";
295  return cubin;
296  }
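 /* Example cache entry (the md5 part varies with the sources and cflags): for an
  * sm_86 device this looks for something like
  *   <cache dir>/kernels/cycles_kernel_sm_86_<md5>.cubin
  * or, with force_ptx, cycles_kernel_compute_86_<md5>.ptx. */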
297 
298 # ifdef _WIN32
299  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
300  if (major < 3) {
301  set_error(
302  string_printf("CUDA backend requires compute capability 3.0 or up, but found %d.%d. "
303  "Your GPU is not supported.",
304  major,
305  minor));
306  }
307  else {
308  set_error(
309  string_printf("CUDA binary kernel for this graphics card compute "
310  "capability (%d.%d) not found.",
311  major,
312  minor));
313  }
314  return string();
315  }
316 # endif
317 
318  /* Compile. */
319  const char *const nvcc = cuewCompilerPath();
320  if (nvcc == NULL) {
321  set_error(
322  "CUDA nvcc compiler not found. "
323  "Install CUDA toolkit in default location.");
324  return string();
325  }
326 
327  const int nvcc_cuda_version = cuewCompilerVersion();
328  VLOG_INFO << "Found nvcc " << nvcc << ", CUDA version " << nvcc_cuda_version << ".";
329  if (nvcc_cuda_version < 101) {
330  printf(
331  "Unsupported CUDA version %d.%d detected, "
332  "you need CUDA 10.1 or newer.\n",
333  nvcc_cuda_version / 10,
334  nvcc_cuda_version % 10);
335  return string();
336  }
337  else if (!(nvcc_cuda_version == 101 || nvcc_cuda_version == 102 || nvcc_cuda_version == 111 ||
338  nvcc_cuda_version == 112 || nvcc_cuda_version == 113 || nvcc_cuda_version == 114)) {
339  printf(
340  "CUDA version %d.%d detected, build may succeed but only "
341  "CUDA 10.1 to 11.4 are officially supported.\n",
342  nvcc_cuda_version / 10,
343  nvcc_cuda_version % 10);
344  }
345 
346  double starttime = time_dt();
347 
348  path_create_directories(cubin);
349 
350  source_path = path_join(path_join(source_path, "kernel"),
351  path_join("device", path_join(base, string_printf("%s.cu", name))));
352 
353  string command = string_printf(
354  "\"%s\" "
355  "-arch=%s_%d%d "
356  "--%s \"%s\" "
357  "-o \"%s\" "
358  "%s",
359  nvcc,
360  kernel_arch,
361  major,
362  minor,
363  kernel_ext,
364  source_path.c_str(),
365  cubin.c_str(),
366  common_cflags.c_str());
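 /* The assembled command looks roughly like (paths illustrative):
  *   "nvcc" -arch=sm_86 --cubin "<source>/kernel/device/<base>/kernel.cu"
  *       -o "<cache>/kernels/cycles_kernel_sm_86_<md5>.cubin" <common cflags>
  * with compute_XY / --ptx substituted when force_ptx is set. */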
367 
368  printf("Compiling %sCUDA kernel ...\n%s\n",
369  (use_adaptive_compilation()) ? "adaptive " : "",
370  command.c_str());
371 
372 # ifdef _WIN32
373  command = "call " + command;
374 # endif
375  if (system(command.c_str()) != 0) {
376  set_error(
377  "Failed to execute compilation command, "
378  "see console for details.");
379  return string();
380  }
381 
382  /* Verify if compilation succeeded */
383  if (!path_exists(cubin)) {
384  set_error(
385  "CUDA kernel compilation failed, "
386  "see console for details.");
387  return string();
388  }
389 
390  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);
391 
392  return cubin;
393 }
394 
395 bool CUDADevice::load_kernels(const uint kernel_features)
396 {
397  /* TODO(sergey): Support kernels re-load for CUDA devices adaptive compile.
398  *
399  * Currently re-loading kernel will invalidate memory pointers,
400  * causing problems in cuCtxSynchronize.
401  */
402  if (cuModule) {
403  if (use_adaptive_compilation()) {
404  VLOG_INFO
405  << "Skipping CUDA kernel reload for adaptive compilation, not currently supported.";
406  }
407  return true;
408  }
409 
410  /* check if cuda init succeeded */
411  if (cuContext == 0)
412  return false;
413 
414  /* check if GPU is supported */
415  if (!support_device(kernel_features))
416  return false;
417 
418  /* get kernel */
419  const char *kernel_name = "kernel";
420  string cubin = compile_kernel(kernel_features, kernel_name);
421  if (cubin.empty())
422  return false;
423 
424  /* open module */
425  CUDAContextScope scope(this);
426 
427  string cubin_data;
428  CUresult result;
429 
430  if (path_read_text(cubin, cubin_data))
431  result = cuModuleLoadData(&cuModule, cubin_data.c_str());
432  else
433  result = CUDA_ERROR_FILE_NOT_FOUND;
434 
435  if (result != CUDA_SUCCESS)
436  set_error(string_printf(
437  "Failed to load CUDA kernel from '%s' (%s)", cubin.c_str(), cuewErrorString(result)));
438 
439  if (result == CUDA_SUCCESS) {
440  kernels.load(this);
441  reserve_local_memory(kernel_features);
442  }
443 
444  return (result == CUDA_SUCCESS);
445 }
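 /* In the usual flow this amounts to: compile_kernel() resolves a cubin/PTX path
  * (precompiled, cached, or freshly built with nvcc), cuModuleLoadData() loads it into
  * cuModule, kernels.load(this) looks up the individual kernel functions in the module,
  * and reserve_local_memory() pre-sizes local memory before any scene allocation. */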
446 
447 void CUDADevice::reserve_local_memory(const uint kernel_features)
448 {
449  /* Together with CU_CTX_LMEM_RESIZE_TO_MAX, this reserves local memory
450  * needed for kernel launches, so that we can reliably figure out when
451  * to allocate scene data in mapped host memory. */
452  size_t total = 0, free_before = 0, free_after = 0;
453 
454  {
455  CUDAContextScope scope(this);
456  cuMemGetInfo(&free_before, &total);
457  }
458 
459  {
460  /* Use the biggest kernel for estimation. */
461  const DeviceKernel test_kernel = (kernel_features & KERNEL_FEATURE_NODE_RAYTRACE) ?
462  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE :
463  (kernel_features & KERNEL_FEATURE_MNEE) ?
464  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE :
465  DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE;
466 
467  /* Launch the kernel; using just 1 block appears sufficient to reserve memory for all
468  * multiprocessors. It would still be good to do this in parallel for the multi-GPU
469  * case to make it faster. */
470  CUDADeviceQueue queue(this);
471 
472  device_ptr d_path_index = 0;
473  device_ptr d_render_buffer = 0;
474  int d_work_size = 0;
475  DeviceKernelArguments args(&d_path_index, &d_render_buffer, &d_work_size);
476 
477  queue.init_execution();
478  queue.enqueue(test_kernel, 1, args);
479  queue.synchronize();
480  }
481 
482  {
483  CUDAContextScope scope(this);
484  cuMemGetInfo(&free_after, &total);
485  }
486 
487  VLOG_INFO << "Local memory reserved " << string_human_readable_number(free_before - free_after)
488  << " bytes. (" << string_human_readable_size(free_before - free_after) << ")";
489 
490 # if 0
491  /* For testing mapped host memory, fill up device memory. */
492  const size_t keep_mb = 1024;
493 
494  while (free_after > keep_mb * 1024 * 1024LL) {
495  CUdeviceptr tmp;
496  cuda_assert(cuMemAlloc(&tmp, 10 * 1024 * 1024LL));
497  cuMemGetInfo(&free_after, &total);
498  }
499 # endif
500 }
501 
502 void CUDADevice::init_host_memory()
503 {
504  /* Limit amount of host mapped memory, because allocating too much can
505  * cause system instability. Leave at least half or 4 GB of system
506  * memory free, whichever is smaller. */
507  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
508  size_t system_ram = system_physical_ram();
509 
510  if (system_ram > 0) {
511  if (system_ram / 2 > default_limit) {
512  map_host_limit = system_ram - default_limit;
513  }
514  else {
515  map_host_limit = system_ram / 2;
516  }
517  }
518  else {
519  VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
520  map_host_limit = 0;
521  }
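 /* Worked example: with 32 GB of system RAM, half (16 GB) exceeds the 4 GB default,
  * so map_host_limit = 32 GB - 4 GB = 28 GB; with 6 GB of RAM, half (3 GB) is below
  * 4 GB, so map_host_limit = 3 GB. */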
522 
523  /* Amount of device memory to keep free after texture memory
524  * and working memory allocations respectively. We set the working
525  * memory limit headroom lower so that some space is left after all
526  * texture memory allocations. */
527  device_working_headroom = 32 * 1024 * 1024LL; // 32MB
528  device_texture_headroom = 128 * 1024 * 1024LL; // 128MB
529 
530  VLOG_INFO << "Mapped host memory limit set to " << string_human_readable_number(map_host_limit)
531  << " bytes. (" << string_human_readable_size(map_host_limit) << ")";
532 }
533 
534 void CUDADevice::load_texture_info()
535 {
536  if (need_texture_info) {
537  /* Unset flag before copying, so this does not loop indefinitely if the copy below calls
538  * into 'move_textures_to_host' (which calls 'load_texture_info' again). */
539  need_texture_info = false;
540  texture_info.copy_to_device();
541  }
542 }
543 
544 void CUDADevice::move_textures_to_host(size_t size, bool for_texture)
545 {
546  /* Break out of recursive call, which can happen when moving memory on a multi device. */
547  static bool any_device_moving_textures_to_host = false;
548  if (any_device_moving_textures_to_host) {
549  return;
550  }
551 
552  /* Signal to reallocate textures in host memory only. */
553  move_texture_to_host = true;
554 
555  while (size > 0) {
556  /* Find suitable memory allocation to move. */
557  device_memory *max_mem = NULL;
558  size_t max_size = 0;
559  bool max_is_image = false;
560 
561  thread_scoped_lock lock(cuda_mem_map_mutex);
562  foreach (CUDAMemMap::value_type &pair, cuda_mem_map) {
563  device_memory &mem = *pair.first;
564  CUDAMem *cmem = &pair.second;
565 
566  /* Can only move textures allocated on this device (and not those from peer devices).
567  * And need to ignore memory that is already on the host. */
568  if (!mem.is_resident(this) || cmem->use_mapped_host) {
569  continue;
570  }
571 
572  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
573  (&mem != &texture_info);
574  bool is_image = is_texture && (mem.data_height > 1);
575 
576  /* Can't move this type of memory. */
577  if (!is_texture || cmem->array) {
578  continue;
579  }
580 
581  /* For other textures, only move image textures. */
582  if (for_texture && !is_image) {
583  continue;
584  }
585 
586  /* Try to move largest allocation, prefer moving images. */
587  if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
588  max_is_image = is_image;
589  max_size = mem.device_size;
590  max_mem = &mem;
591  }
592  }
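 /* Example of the heuristic above (applies when for_texture is false, so non-image
  * textures are eligible): a 512 MB image texture is picked ahead of a 2 GB 1D lookup
  * table because images are preferred regardless of size; between two images the
  * larger one wins. */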
593  lock.unlock();
594 
595  /* Move to host memory. This part is mutex protected since
596  * multiple CUDA devices could be moving the memory. The
597  * first one will do it, and the rest will adopt the pointer. */
598  if (max_mem) {
599  VLOG_WORK << "Move memory from device to host: " << max_mem->name;
600 
601  static thread_mutex move_mutex;
602  thread_scoped_lock lock(move_mutex);
603 
604  any_device_moving_textures_to_host = true;
605 
606  /* Potentially need to call back into multi device, so pointer mapping
607  * and peer devices are updated. This is also necessary since the device
608  * pointer may just be a key here, so cannot be accessed and freed directly.
609  * Unfortunately it does mean that memory is reallocated on all other
610  * devices as well, which is potentially dangerous when still in use (since
611  * a thread rendering on another device would only be caught in this mutex
612  * if it happens to do an allocation at the same time as well). */
613  max_mem->device_copy_to();
614  size = (max_size >= size) ? 0 : size - max_size;
615 
616  any_device_moving_textures_to_host = false;
617  }
618  else {
619  break;
620  }
621  }
622 
623  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
624  move_texture_to_host = false;
625 
626  /* Update texture info array with new pointers. */
627  load_texture_info();
628 }
629 
630 CUDADevice::CUDAMem *CUDADevice::generic_alloc(device_memory &mem, size_t pitch_padding)
631 {
632  CUDAContextScope scope(this);
633 
634  CUdeviceptr device_pointer = 0;
635  size_t size = mem.memory_size() + pitch_padding;
636 
637  CUresult mem_alloc_result = CUDA_ERROR_OUT_OF_MEMORY;
638  const char *status = "";
639 
640  /* First try allocating in device memory, respecting headroom. We make
641  * an exception for texture info. It is small and frequently accessed,
642  * so treat it as working memory.
643  *
644  * If there is not enough room for working memory, we will try to move
645  * textures to host memory, assuming the performance impact would have
646  * been worse for working memory. */
647  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
648  bool is_image = is_texture && (mem.data_height > 1);
649 
650  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;
651 
652  size_t total = 0, free = 0;
653  cuMemGetInfo(&free, &total);
654 
655  /* Move textures to host memory if needed. */
656  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
657  move_textures_to_host(size + headroom - free, is_texture);
658  cuMemGetInfo(&free, &total);
659  }
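 /* Worked example (assuming host mapping is available and this allocation is not itself
  * an image): with 512 MB free, a 128 MB texture headroom and a 600 MB request,
  * size + headroom = 728 MB >= 512 MB, so roughly 216 MB of existing textures are
  * moved to host before re-querying free memory. */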
660 
661  /* Allocate in device memory. */
662  if (!move_texture_to_host && (size + headroom) < free) {
663  mem_alloc_result = cuMemAlloc(&device_pointer, size);
664  if (mem_alloc_result == CUDA_SUCCESS) {
665  status = " in device memory";
666  }
667  }
668 
669  /* Fall back to mapped host memory if needed and possible. */
670 
671  void *shared_pointer = 0;
672 
673  if (mem_alloc_result != CUDA_SUCCESS && can_map_host && mem.type != MEM_DEVICE_ONLY) {
674  if (mem.shared_pointer) {
675  /* Another device already allocated host memory. */
676  mem_alloc_result = CUDA_SUCCESS;
677  shared_pointer = mem.shared_pointer;
678  }
679  else if (map_host_used + size < map_host_limit) {
680  /* Allocate host memory ourselves. */
681  mem_alloc_result = cuMemHostAlloc(
682  &shared_pointer, size, CU_MEMHOSTALLOC_DEVICEMAP | CU_MEMHOSTALLOC_WRITECOMBINED);
683 
684  assert((mem_alloc_result == CUDA_SUCCESS && shared_pointer != 0) ||
685  (mem_alloc_result != CUDA_SUCCESS && shared_pointer == 0));
686  }
687 
688  if (mem_alloc_result == CUDA_SUCCESS) {
689  cuda_assert(cuMemHostGetDevicePointer_v2(&device_pointer, shared_pointer, 0));
690  map_host_used += size;
691  status = " in host memory";
692  }
693  }
694 
695  if (mem_alloc_result != CUDA_SUCCESS) {
696  if (mem.type == MEM_DEVICE_ONLY) {
697  status = " failed, out of device memory";
698  set_error("System is out of GPU memory");
699  }
700  else {
701  status = " failed, out of device and host memory";
702  set_error("System is out of GPU and shared host memory");
703  }
704  }
705 
706  if (mem.name) {
707  VLOG_WORK << "Buffer allocate: " << mem.name << ", "
708  << string_human_readable_number(mem.memory_size()) << " bytes. ("
709  << string_human_readable_size(mem.memory_size()) << ")" << status;
710  }
711 
712  mem.device_pointer = (device_ptr)device_pointer;
713  mem.device_size = size;
714  stats.mem_alloc(size);
715 
716  if (!mem.device_pointer) {
717  return NULL;
718  }
719 
720  /* Insert into map of allocations. */
721  thread_scoped_lock lock(cuda_mem_map_mutex);
722  CUDAMem *cmem = &cuda_mem_map[&mem];
723  if (shared_pointer != 0) {
724  /* Replace host pointer with our host allocation. Only works if
725  * CUDA memory layout is the same and has no pitch padding. Also
726  * does not work if we move textures to host during a render,
727  * since other devices might be using the memory. */
728 
729  if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
730  mem.host_pointer != shared_pointer) {
731  memcpy(shared_pointer, mem.host_pointer, size);
732 
733  /* A call to device_memory::host_free() should be preceded by
734  * a call to device_memory::device_free() for host memory
735  * allocated by a device to be handled properly. Two exceptions
736  * are here and a call in OptiXDevice::generic_alloc(), where
737  * the current host memory can be assumed to be allocated by
738  * device_memory::host_alloc(), not by a device */
739 
740  mem.host_free();
741  mem.host_pointer = shared_pointer;
742  }
743  mem.shared_pointer = shared_pointer;
744  mem.shared_counter++;
745  cmem->use_mapped_host = true;
746  }
747  else {
748  cmem->use_mapped_host = false;
749  }
750 
751  return cmem;
752 }
753 
754 void CUDADevice::generic_copy_to(device_memory &mem)
755 {
756  if (!mem.host_pointer || !mem.device_pointer) {
757  return;
758  }
759 
760  /* If use_mapped_host of mem is false, the current device only uses device memory allocated by
761  * cuMemAlloc regardless of mem.host_pointer and mem.shared_pointer, and should copy data from
762  * mem.host_pointer. */
763  thread_scoped_lock lock(cuda_mem_map_mutex);
764  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
765  const CUDAContextScope scope(this);
766  cuda_assert(
767  cuMemcpyHtoD((CUdeviceptr)mem.device_pointer, mem.host_pointer, mem.memory_size()));
768  }
769 }
770 
771 void CUDADevice::generic_free(device_memory &mem)
772 {
773  if (mem.device_pointer) {
774  CUDAContextScope scope(this);
775  thread_scoped_lock lock(cuda_mem_map_mutex);
776  DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
777  const CUDAMem &cmem = cuda_mem_map[&mem];
778 
779  /* If cmem.use_mapped_host is true, reference counting is used
780  * to safely free a mapped host memory. */
781 
782  if (cmem.use_mapped_host) {
783  assert(mem.shared_pointer);
784  if (mem.shared_pointer) {
785  assert(mem.shared_counter > 0);
786  if (--mem.shared_counter == 0) {
787  if (mem.host_pointer == mem.shared_pointer) {
788  mem.host_pointer = 0;
789  }
790  cuMemFreeHost(mem.shared_pointer);
791  mem.shared_pointer = 0;
792  }
793  }
794  map_host_used -= mem.device_size;
795  }
796  else {
797  /* Free device memory. */
798  cuda_assert(cuMemFree(mem.device_pointer));
799  }
800 
801  stats.mem_free(mem.device_size);
802  mem.device_pointer = 0;
803  mem.device_size = 0;
804 
805  cuda_mem_map.erase(cuda_mem_map.find(&mem));
806  }
807 }
808 
809 void CUDADevice::mem_alloc(device_memory &mem)
810 {
811  if (mem.type == MEM_TEXTURE) {
812  assert(!"mem_alloc not supported for textures.");
813  }
814  else if (mem.type == MEM_GLOBAL) {
815  assert(!"mem_alloc not supported for global memory.");
816  }
817  else {
818  generic_alloc(mem);
819  }
820 }
821 
822 void CUDADevice::mem_copy_to(device_memory &mem)
823 {
824  if (mem.type == MEM_GLOBAL) {
825  global_free(mem);
826  global_alloc(mem);
827  }
828  else if (mem.type == MEM_TEXTURE) {
829  tex_free((device_texture &)mem);
830  tex_alloc((device_texture &)mem);
831  }
832  else {
833  if (!mem.device_pointer) {
834  generic_alloc(mem);
835  }
836  generic_copy_to(mem);
837  }
838 }
839 
840 void CUDADevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
841 {
842  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
843  assert(!"mem_copy_from not supported for textures.");
844  }
845  else if (mem.host_pointer) {
846  const size_t size = elem * w * h;
847  const size_t offset = elem * y * w;
848 
849  if (mem.device_pointer) {
850  const CUDAContextScope scope(this);
851  cuda_assert(cuMemcpyDtoH(
852  (char *)mem.host_pointer + offset, (CUdeviceptr)mem.device_pointer + offset, size));
853  }
854  else {
855  memset((char *)mem.host_pointer + offset, 0, size);
856  }
857  }
858 }
859 
860 void CUDADevice::mem_zero(device_memory &mem)
861 {
862  if (!mem.device_pointer) {
863  mem_alloc(mem);
864  }
865  if (!mem.device_pointer) {
866  return;
867  }
868 
869  /* If use_mapped_host of mem is false, mem.device_pointer currently refers to device memory
870  * regardless of mem.host_pointer and mem.shared_pointer. */
871  thread_scoped_lock lock(cuda_mem_map_mutex);
872  if (!cuda_mem_map[&mem].use_mapped_host || mem.host_pointer != mem.shared_pointer) {
873  const CUDAContextScope scope(this);
874  cuda_assert(cuMemsetD8((CUdeviceptr)mem.device_pointer, 0, mem.memory_size()));
875  }
876  else if (mem.host_pointer) {
877  memset(mem.host_pointer, 0, mem.memory_size());
878  }
879 }
880 
881 void CUDADevice::mem_free(device_memory &mem)
882 {
883  if (mem.type == MEM_GLOBAL) {
884  global_free(mem);
885  }
886  else if (mem.type == MEM_TEXTURE) {
887  tex_free((device_texture &)mem);
888  }
889  else {
890  generic_free(mem);
891  }
892 }
893 
894 device_ptr CUDADevice::mem_alloc_sub_ptr(device_memory &mem, size_t offset, size_t /*size*/)
895 {
896  return (device_ptr)(((char *)mem.device_pointer) + mem.memory_elements_size(offset));
897 }
898 
899 void CUDADevice::const_copy_to(const char *name, void *host, size_t size)
900 {
901  CUDAContextScope scope(this);
902  CUdeviceptr mem;
903  size_t bytes;
904 
905  cuda_assert(cuModuleGetGlobal(&mem, &bytes, cuModule, "kernel_params"));
906  assert(bytes == sizeof(KernelParamsCUDA));
907 
908  /* Update data storage pointers in launch parameters. */
909 # define KERNEL_DATA_ARRAY(data_type, data_name) \
910  if (strcmp(name, #data_name) == 0) { \
911  cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data_name), host, size)); \
912  return; \
913  }
914  KERNEL_DATA_ARRAY(KernelData, data)
915  KERNEL_DATA_ARRAY(IntegratorStateGPU, integrator_state)
916 # include "kernel/data_arrays.h"
917 # undef KERNEL_DATA_ARRAY
918 }
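 /* For name == "data" the macro above expands to roughly:
  *   if (strcmp(name, "data") == 0) {
  *     cuda_assert(cuMemcpyHtoD(mem + offsetof(KernelParamsCUDA, data), host, size));
  *     return;
  *   }
  * and kernel/data_arrays.h instantiates the same pattern once per kernel array. */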
919 
920 void CUDADevice::global_alloc(device_memory &mem)
921 {
922  if (mem.is_resident(this)) {
923  generic_alloc(mem);
924  generic_copy_to(mem);
925  }
926 
927  const_copy_to(mem.name, &mem.device_pointer, sizeof(mem.device_pointer));
928 }
929 
930 void CUDADevice::global_free(device_memory &mem)
931 {
932  if (mem.is_resident(this) && mem.device_pointer) {
933  generic_free(mem);
934  }
935 }
936 
937 void CUDADevice::tex_alloc(device_texture &mem)
938 {
939  CUDAContextScope scope(this);
940 
941  size_t dsize = datatype_size(mem.data_type);
942  size_t size = mem.memory_size();
943 
944  CUaddress_mode address_mode = CU_TR_ADDRESS_MODE_WRAP;
945  switch (mem.info.extension) {
946  case EXTENSION_REPEAT:
947  address_mode = CU_TR_ADDRESS_MODE_WRAP;
948  break;
949  case EXTENSION_EXTEND:
950  address_mode = CU_TR_ADDRESS_MODE_CLAMP;
951  break;
952  case EXTENSION_CLIP:
953  address_mode = CU_TR_ADDRESS_MODE_BORDER;
954  break;
955  default:
956  assert(0);
957  break;
958  }
959 
960  CUfilter_mode filter_mode;
961  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
962  filter_mode = CU_TR_FILTER_MODE_POINT;
963  }
964  else {
965  filter_mode = CU_TR_FILTER_MODE_LINEAR;
966  }
967 
968  /* Image Texture Storage */
969  CUarray_format_enum format;
970  switch (mem.data_type) {
971  case TYPE_UCHAR:
972  format = CU_AD_FORMAT_UNSIGNED_INT8;
973  break;
974  case TYPE_UINT16:
975  format = CU_AD_FORMAT_UNSIGNED_INT16;
976  break;
977  case TYPE_UINT:
978  format = CU_AD_FORMAT_UNSIGNED_INT32;
979  break;
980  case TYPE_INT:
981  format = CU_AD_FORMAT_SIGNED_INT32;
982  break;
983  case TYPE_FLOAT:
984  format = CU_AD_FORMAT_FLOAT;
985  break;
986  case TYPE_HALF:
987  format = CU_AD_FORMAT_HALF;
988  break;
989  default:
990  assert(0);
991  return;
992  }
993 
994  CUDAMem *cmem = NULL;
995  CUarray array_3d = NULL;
996  size_t src_pitch = mem.data_width * dsize * mem.data_elements;
997  size_t dst_pitch = src_pitch;
998 
999  if (!mem.is_resident(this)) {
1000  thread_scoped_lock lock(cuda_mem_map_mutex);
1001  cmem = &cuda_mem_map[&mem];
1002  cmem->texobject = 0;
1003 
1004  if (mem.data_depth > 1) {
1005  array_3d = (CUarray)mem.device_pointer;
1006  cmem->array = array_3d;
1007  }
1008  else if (mem.data_height > 0) {
1009  dst_pitch = align_up(src_pitch, pitch_alignment);
1010  }
1011  }
1012  else if (mem.data_depth > 1) {
1013  /* 3D texture using array, there is no API for linear memory. */
1014  CUDA_ARRAY3D_DESCRIPTOR desc;
1015 
1016  desc.Width = mem.data_width;
1017  desc.Height = mem.data_height;
1018  desc.Depth = mem.data_depth;
1019  desc.Format = format;
1020  desc.NumChannels = mem.data_elements;
1021  desc.Flags = 0;
1022 
1023  VLOG_WORK << "Array 3D allocate: " << mem.name << ", "
1024  << string_human_readable_number(mem.memory_size()) << " bytes. ("
1025  << string_human_readable_size(mem.memory_size()) << ")";
1026 
1027  cuda_assert(cuArray3DCreate(&array_3d, &desc));
1028 
1029  if (!array_3d) {
1030  return;
1031  }
1032 
1033  CUDA_MEMCPY3D param;
1034  memset(&param, 0, sizeof(param));
1035  param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
1036  param.dstArray = array_3d;
1037  param.srcMemoryType = CU_MEMORYTYPE_HOST;
1038  param.srcHost = mem.host_pointer;
1039  param.srcPitch = src_pitch;
1040  param.WidthInBytes = param.srcPitch;
1041  param.Height = mem.data_height;
1042  param.Depth = mem.data_depth;
1043 
1044  cuda_assert(cuMemcpy3D(&param));
1045 
1046  mem.device_pointer = (device_ptr)array_3d;
1047  mem.device_size = size;
1048  stats.mem_alloc(size);
1049 
1050  thread_scoped_lock lock(cuda_mem_map_mutex);
1051  cmem = &cuda_mem_map[&mem];
1052  cmem->texobject = 0;
1053  cmem->array = array_3d;
1054  }
1055  else if (mem.data_height > 0) {
1056  /* 2D texture, using pitch aligned linear memory. */
1057  dst_pitch = align_up(src_pitch, pitch_alignment);
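 /* Example: a 1001-texel wide RGBA half texture has src_pitch = 1001 * 2 * 4 = 8008
  * bytes; with a typical pitch alignment of 32 bytes (the actual value comes from
  * CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT) this rounds up to dst_pitch = 8032. */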
1058  size_t dst_size = dst_pitch * mem.data_height;
1059 
1060  cmem = generic_alloc(mem, dst_size - mem.memory_size());
1061  if (!cmem) {
1062  return;
1063  }
1064 
1065  CUDA_MEMCPY2D param;
1066  memset(&param, 0, sizeof(param));
1067  param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
1068  param.dstDevice = mem.device_pointer;
1069  param.dstPitch = dst_pitch;
1070  param.srcMemoryType = CU_MEMORYTYPE_HOST;
1071  param.srcHost = mem.host_pointer;
1072  param.srcPitch = src_pitch;
1073  param.WidthInBytes = param.srcPitch;
1074  param.Height = mem.data_height;
1075 
1076  cuda_assert(cuMemcpy2DUnaligned(&param));
1077  }
1078  else {
1079  /* 1D texture, using linear memory. */
1080  cmem = generic_alloc(mem);
1081  if (!cmem) {
1082  return;
1083  }
1084 
1085  cuda_assert(cuMemcpyHtoD(mem.device_pointer, mem.host_pointer, size));
1086  }
1087 
1088  /* Resize once */
1089  const uint slot = mem.slot;
1090  if (slot >= texture_info.size()) {
1091  /* Allocate some slots in advance, to reduce amount
1092  * of re-allocations. */
1093  texture_info.resize(slot + 128);
1094  }
1095 
1096  /* Set Mapping and tag that we need to (re-)upload to device */
1097  texture_info[slot] = mem.info;
1098  need_texture_info = true;
1099 
1099 
1100  if (mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT &&
1101  mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FLOAT3 &&
1102  mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FPN &&
1103  mem.info.data_type != IMAGE_DATA_TYPE_NANOVDB_FP16) {
1104  CUDA_RESOURCE_DESC resDesc;
1105  memset(&resDesc, 0, sizeof(resDesc));
1106 
1107  if (array_3d) {
1108  resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
1109  resDesc.res.array.hArray = array_3d;
1110  resDesc.flags = 0;
1111  }
1112  else if (mem.data_height > 0) {
1113  resDesc.resType = CU_RESOURCE_TYPE_PITCH2D;
1114  resDesc.res.pitch2D.devPtr = mem.device_pointer;
1115  resDesc.res.pitch2D.format = format;
1116  resDesc.res.pitch2D.numChannels = mem.data_elements;
1117  resDesc.res.pitch2D.height = mem.data_height;
1118  resDesc.res.pitch2D.width = mem.data_width;
1119  resDesc.res.pitch2D.pitchInBytes = dst_pitch;
1120  }
1121  else {
1122  resDesc.resType = CU_RESOURCE_TYPE_LINEAR;
1123  resDesc.res.linear.devPtr = mem.device_pointer;
1124  resDesc.res.linear.format = format;
1125  resDesc.res.linear.numChannels = mem.data_elements;
1126  resDesc.res.linear.sizeInBytes = mem.device_size;
1127  }
1128 
1129  CUDA_TEXTURE_DESC texDesc;
1130  memset(&texDesc, 0, sizeof(texDesc));
1131  texDesc.addressMode[0] = address_mode;
1132  texDesc.addressMode[1] = address_mode;
1133  texDesc.addressMode[2] = address_mode;
1134  texDesc.filterMode = filter_mode;
1135  texDesc.flags = CU_TRSF_NORMALIZED_COORDINATES;
1136 
1137  thread_scoped_lock lock(cuda_mem_map_mutex);
1138  cmem = &cuda_mem_map[&mem];
1139 
1140  cuda_assert(cuTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL));
1141 
1142  texture_info[slot].data = (uint64_t)cmem->texobject;
1143  }
1144  else {
1145  texture_info[slot].data = (uint64_t)mem.device_pointer;
1146  }
1147 }
1148 
1149 void CUDADevice::tex_free(device_texture &mem)
1150 {
1151  if (mem.device_pointer) {
1152  CUDAContextScope scope(this);
1153  thread_scoped_lock lock(cuda_mem_map_mutex);
1154  DCHECK(cuda_mem_map.find(&mem) != cuda_mem_map.end());
1155  const CUDAMem &cmem = cuda_mem_map[&mem];
1156 
1157  if (cmem.texobject) {
1158  /* Free bindless texture. */
1159  cuTexObjectDestroy(cmem.texobject);
1160  }
1161 
1162  if (!mem.is_resident(this)) {
1163  /* Do not free memory here, since it was allocated on a different device. */
1164  cuda_mem_map.erase(cuda_mem_map.find(&mem));
1165  }
1166  else if (cmem.array) {
1167  /* Free array. */
1168  cuArrayDestroy(cmem.array);
1169  stats.mem_free(mem.device_size);
1170  mem.device_pointer = 0;
1171  mem.device_size = 0;
1172 
1173  cuda_mem_map.erase(cuda_mem_map.find(&mem));
1174  }
1175  else {
1176  lock.unlock();
1177  generic_free(mem);
1178  }
1179  }
1180 }
1181 
1182 unique_ptr<DeviceQueue> CUDADevice::gpu_queue_create()
1183 {
1184  return make_unique<CUDADeviceQueue>(this);
1185 }
1186 
1187 bool CUDADevice::should_use_graphics_interop()
1188 {
1189  /* Check whether this device is part of OpenGL context.
1190  *
1191  * Using a CUDA device that is not part of the OpenGL context for graphics interoperability
1192  * is possible, but empirical measurements show it can be considerably slower than a naive
1193  * pixel copy. */
1194 
1195  CUDAContextScope scope(this);
1196 
1197  int num_all_devices = 0;
1198  cuda_assert(cuDeviceGetCount(&num_all_devices));
1199 
1200  if (num_all_devices == 0) {
1201  return false;
1202  }
1203 
1204  vector<CUdevice> gl_devices(num_all_devices);
1205  uint num_gl_devices = 0;
1206  cuGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, CU_GL_DEVICE_LIST_ALL);
1207 
1208  for (uint i = 0; i < num_gl_devices; ++i) {
1209  if (gl_devices[i] == cuDevice) {
1210  return true;
1211  }
1212  }
1213 
1214  return false;
1215 }
1216 
1217 int CUDADevice::get_num_multiprocessors()
1218 {
1219  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
1220 }
1221 
1222 int CUDADevice::get_max_num_threads_per_multiprocessor()
1223 {
1224  return get_device_default_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR, 0);
1225 }
1226 
1227 bool CUDADevice::get_device_attribute(CUdevice_attribute attribute, int *value)
1228 {
1229  CUDAContextScope scope(this);
1230 
1231  return cuDeviceGetAttribute(value, attribute, cuDevice) == CUDA_SUCCESS;
1232 }
1233 
1234 int CUDADevice::get_device_default_attribute(CUdevice_attribute attribute, int default_value)
1235 {
1236  int value = 0;
1237  if (!get_device_attribute(attribute, &value)) {
1238  return default_value;
1239  }
1240  return value;
1241 }
1242 
1243 CCL_NAMESPACE_END
1244 
1245 #endif
Definition: util/types.h:43