bool HIPDevice::have_precompiled_kernels()
{
  string fatbins_path = path_get("lib");
  return path_exists(fatbins_path);
}
void HIPDevice::set_error(const string &error)
{
  Device::set_error(error);

  fprintf(stderr, "\nRefer to the Cycles GPU rendering documentation for possible solutions:\n");
  fprintf(stderr,
          "https://docs.blender.org/manual/en/latest/render/cycles/gpu_rendering.html\n\n");
}
HIPDevice::HIPDevice(const DeviceInfo &info, Stats &stats, Profiler &profiler)
    : Device(info, stats, profiler), texture_info(this, "texture_info", MEM_GLOBAL)
{
  need_texture_info = false;

  device_texture_headroom = 0;
  device_working_headroom = 0;
  move_texture_to_host = false;
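  /* Note: the headroom values start at zero here; init_host_memory() assigns the actual
   * working/texture headroom once host memory mapping is known to be usable. */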
  /* Initialize HIP. */
  hipError_t result = hipInit(0);
  if (result != hipSuccess) {
    set_error(string_printf("Failed to initialize HIP runtime (%s)", hipewErrorString(result)));
    return;
  }
  /* Get device handle. */
  result = hipDeviceGet(&hipDevice, hipDevId);
  if (result != hipSuccess) {
    set_error(string_printf("Failed to get HIP device handle from ordinal (%s)",
                            hipewErrorString(result)));
    return;
  }
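  /* The attributes queried below drive the memory strategy: if the device can map host memory,
   * the context is created with hipDeviceMapHost so allocations can spill to pinned host memory
   * when VRAM runs out, and the texture pitch alignment is needed for 2D texture allocation. */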
  hip_assert(hipDeviceGetAttribute(&can_map_host, hipDeviceAttributeCanMapHostMemory, hipDevice));
  hip_assert(
      hipDeviceGetAttribute(&pitch_alignment, hipDeviceAttributeTexturePitchAlignment, hipDevice));

  unsigned int ctx_flags = hipDeviceLmemResizeToMax;
  if (can_map_host) {
    ctx_flags |= hipDeviceMapHost;
  }

  /* Create context. */
  result = hipCtxCreate(&hipContext, ctx_flags, hipDevice);
  if (result != hipSuccess) {
    set_error(string_printf("Failed to create HIP context (%s)", hipewErrorString(result)));
    return;
  }

  int major, minor;
  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
  hipDevArchitecture = major * 100 + minor * 10;

  /* Get the HIP runtime version, needed later to pick memory types. */
  hip_assert(hipRuntimeGetVersion(&hipRuntimeVersion));

  hipCtxPopCurrent(NULL);
}
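/* hipCtxCreate leaves the created context current on the calling thread. The constructor above
 * pops it again so that no thread permanently owns it; HIPContextScope later pushes/pops the
 * context around each HIP API call, from whichever thread performs the work. */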
HIPDevice::~HIPDevice()
{
  hip_assert(hipCtxDestroy(hipContext));
}
bool HIPDevice::support_device(const uint /*kernel_features*/)
{
  if (hipSupportsDevice(hipDevId)) {
    return true;
  }

  /* Only RDNA and newer architectures are supported. */
  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, hipDevId);

  set_error(string_printf("HIP backend requires AMD RDNA graphics card or up, but found %s.",
                          props.name));
  return false;
}
bool HIPDevice::check_peer_access(Device *peer_device)
{
  if (peer_device == this) {
    return false;
  }

  HIPDevice *const peer_device_hip = static_cast<HIPDevice *>(peer_device);

  /* Peer access is only used when it works in both directions. */
  int can_access = 0;
  hip_assert(hipDeviceCanAccessPeer(&can_access, hipDevice, peer_device_hip->hipDevice));
  if (can_access == 0) {
    return false;
  }

  /* Array (texture) access over the peer link must be supported as well. */
  hip_assert(hipDeviceGetP2PAttribute(
      &can_access, hipDevP2PAttrHipArrayAccessSupported, hipDevice, peer_device_hip->hipDevice));
  if (can_access == 0) {
    return false;
  }
  /* Enable peer access in both directions. */
  {
    const HIPContextScope scope(this);
    hipError_t result = hipCtxEnablePeerAccess(peer_device_hip->hipContext, 0);
    if (result != hipSuccess) {
      set_error(string_printf("Failed to enable peer access on HIP context (%s)",
                              hipewErrorString(result)));
      return false;
    }
  }
  {
    const HIPContextScope scope(peer_device_hip);
    hipError_t result = hipCtxEnablePeerAccess(hipContext, 0);
    if (result != hipSuccess) {
      set_error(string_printf("Failed to enable peer access on HIP context (%s)",
                              hipewErrorString(result)));
      return false;
    }
  }

  return true;
}
bool HIPDevice::use_adaptive_compilation()
string HIPDevice::compile_kernel_get_common_cflags(const uint kernel_features)
{
  const string source_path = path_get("source");
  const string include_path = source_path;
  string cflags = string_printf("-I\"%s\"", include_path.c_str());
  if (use_adaptive_compilation()) {
    cflags += " -D__KERNEL_FEATURES__=" + to_string(kernel_features);
  }
  return cflags;
}
string HIPDevice::compile_kernel(const uint kernel_features, const char *name, const char *base)
{
  /* Compute kernel name. */
  int major, minor;
  hipDeviceGetAttribute(&major, hipDeviceAttributeComputeCapabilityMajor, hipDevId);
  hipDeviceGetAttribute(&minor, hipDeviceAttributeComputeCapabilityMinor, hipDevId);
  hipDeviceProp_t props;
  hipGetDeviceProperties(&props, hipDevId);

  /* gcnArchName can contain tokens after the arch name with features, ignore those. */
  char *arch = strtok(props.gcnArchName, ":");
  if (arch == NULL) {
    arch = props.gcnArchName;
  }
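  /* Kernel lookup order below: first a fatbin shipped with the build (lib/), then a fatbin
   * previously compiled into the user cache (named after kernel name, arch and the kernel_md5
   * hash), and only then a fresh hipcc compile. */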
  /* Attempt to use kernel provided with Blender. */
  if (!use_adaptive_compilation()) {
    VLOG_INFO << "Testing for pre-compiled kernel " << fatbin << ".";
    if (path_exists(fatbin)) {
      VLOG_INFO << "Using precompiled kernel.";
      return fatbin;
    }
  }
  /* Try to use locally compiled kernel. */
  string source_path = path_get("source");

  string common_cflags = compile_kernel_get_common_cflags(kernel_features);

  const char *const kernel_ext = "genco";
  std::string options;
# ifdef _WIN32
  options.append("Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -ffast-math");
# else
  options.append("Wno-parentheses-equality -Wno-unused-value --hipcc-func-supp -O3 -ffast-math");
# endif
  options.append(" -save-temps");
  options.append(" --amdgpu-target=").append(arch);

  const string include_path = source_path;
  const string fatbin_file = string_printf("cycles_%s_%s_%s", name, arch, kernel_md5.c_str());
  VLOG_INFO << "Testing for locally compiled kernel " << fatbin << ".";
  if (path_exists(fatbin)) {
    VLOG_INFO << "Using locally compiled kernel.";
    return fatbin;
  }
  /* When precompiled kernels are available but none matched, report an error instead of compiling. */
  if (!use_adaptive_compilation() && have_precompiled_kernels()) {
    if (!hipSupportsDevice(hipDevId)) {
      set_error(
          string_printf("HIP backend requires compute capability 10.1 or up, but found %d.%d. "
                        "Your GPU is not supported.",
                        major, minor));
    }
    else {
      set_error(string_printf("HIP binary kernel for this graphics card compute "
                              "capability (%d.%d) not found.",
                              major, minor));
    }
    return string();
  }

  /* Compile. */
  const char *const hipcc = hipewCompilerPath();
  if (hipcc == NULL) {
    set_error(
        "HIP hipcc compiler not found. "
        "Install HIP toolkit in default location.");
    return string();
  }
  const int hipcc_hip_version = hipewCompilerVersion();
  VLOG_INFO << "Found hipcc " << hipcc << ", HIP version " << hipcc_hip_version << ".";
  if (hipcc_hip_version < 40) {
    printf(
        "Unsupported HIP version %d.%d detected, "
        "you need HIP 4.0 or newer.\n",
        hipcc_hip_version / 10,
        hipcc_hip_version % 10);
    return string();
  }
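  /* hipewCompilerVersion() packs major and minor into one integer as major * 10 + minor, so
   * for example HIP 4.2 is reported as 42; the division and modulo above unpack it again. */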
  string command = string_printf("%s -%s -I %s --%s %s -o \"%s\"",
                                 hipcc,
                                 options.c_str(),
                                 include_path.c_str(),
                                 kernel_ext,
                                 source_path.c_str(),
                                 fatbin.c_str());

  printf("Compiling %sHIP kernel ...\n%s\n",
         (use_adaptive_compilation()) ? "adaptive " : "",
         command.c_str());

# ifdef _WIN32
  command = "call " + command;
# endif
  if (system(command.c_str()) != 0) {
    set_error(
        "Failed to execute compilation command, "
        "see console for details.");
    return string();
  }

  /* Verify that compilation succeeded. */
  if (!path_exists(fatbin)) {
    set_error(
        "HIP kernel compilation failed, "
        "see console for details.");
    return string();
  }

  printf("Kernel compilation finished in %.2lfs.\n", time_dt() - starttime);

  return fatbin;
}
bool HIPDevice::load_kernels(const uint kernel_features)
{
  /* Kernel re-load is not supported, since it would invalidate memory pointers. */
  if (hipModule) {
    if (use_adaptive_compilation()) {
      VLOG_INFO << "Skipping HIP kernel reload for adaptive compilation, not currently supported.";
    }
    return true;
  }

  if (!support_device(kernel_features)) {
    return false;
  }

  /* Get kernel. */
  const char *kernel_name = "kernel";
  string fatbin = compile_kernel(kernel_features, kernel_name);
  if (fatbin.empty()) {
    return false;
  }

  /* Open module. */
  HIPContextScope scope(this);

  string fatbin_data;
  hipError_t result;
  if (path_read_text(fatbin, fatbin_data)) {
    result = hipModuleLoadData(&hipModule, fatbin_data.c_str());
  }
  else {
    result = hipErrorFileNotFound;
  }

  if (result != hipSuccess) {
    set_error(string_printf(
        "Failed to load HIP kernel from '%s' (%s)", fatbin.c_str(), hipewErrorString(result)));
  }

  if (result == hipSuccess) {
    reserve_local_memory(kernel_features);
  }

  return (result == hipSuccess);
}
void HIPDevice::reserve_local_memory(const uint kernel_features)
{
  /* Together with hipDeviceLmemResizeToMax, this reserves local memory needed for kernel
   * launches, so that mapped host memory decisions can be made reliably afterwards. */
  size_t total = 0, free_before = 0, free_after = 0;

  {
    HIPContextScope scope(this);
    hipMemGetInfo(&free_before, &total);
  }

  {
    /* Launch the test kernel once with a single work item to force the driver to allocate
     * its local memory. */
    HIPDeviceQueue queue(this);

    queue.init_execution();
    queue.enqueue(test_kernel, 1, args);
  }

  {
    HIPContextScope scope(this);
    hipMemGetInfo(&free_after, &total);
  }

  /* Optionally fill up device memory to test mapped host memory (debugging aid). */
  const size_t keep_mb = 1024;

  while (free_after > keep_mb * 1024 * 1024LL) {
    hipDeviceptr_t tmp;
    hip_assert(hipMalloc(&tmp, 10 * 1024 * 1024LL));
    hipMemGetInfo(&free_after, &total);
  }
}
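/* The point of the free-memory queries above is to measure how much VRAM a real kernel launch
 * consumes for local (scratch) memory: free_before - free_after is the amount the driver set
 * aside, and subsequent hipMemGetInfo() calls in generic_alloc() then see a realistic picture
 * of what is actually free. */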
void HIPDevice::init_host_memory()
{
  /* Limit the amount of host mapped memory, since allocating too much can cause system
   * instability. Leave at least half of system memory or 4 GB free, whichever is smaller. */
  size_t default_limit = 4 * 1024 * 1024 * 1024LL;
  size_t system_ram = system_physical_ram();

  if (system_ram > 0) {
    if (system_ram / 2 > default_limit) {
      map_host_limit = system_ram - default_limit;
    }
    else {
      map_host_limit = system_ram / 2;
    }
  }
  else {
    VLOG_WARNING << "Mapped host memory disabled, failed to get system RAM";
    map_host_limit = 0;
  }

  /* Device memory to keep free after texture and working memory allocations respectively. */
  device_working_headroom = 32 * 1024 * 1024LL;
  device_texture_headroom = 128 * 1024 * 1024LL;
}
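/* Example with the limits above: on a 32 GB machine, system_ram / 2 = 16 GB exceeds the 4 GB
 * default_limit, so map_host_limit = 32 GB - 4 GB = 28 GB; on an 8 GB machine, half of RAM does
 * not exceed 4 GB, so map_host_limit = 4 GB. */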
void HIPDevice::load_texture_info()
{
  if (need_texture_info) {
    /* Unset the flag before copying, so this does not loop indefinitely if the copy below
     * triggers another call into load_texture_info(). */
    need_texture_info = false;
    texture_info.copy_to_device();
  }
}
void HIPDevice::move_textures_to_host(size_t size, bool for_texture)
{
  /* Break out of recursive call, which can happen when moving memory on a multi device. */
  static bool any_device_moving_textures_to_host = false;
  if (any_device_moving_textures_to_host) {
    return;
  }

  /* Signal to reallocate textures in host memory only. */
  move_texture_to_host = true;

  while (size > 0) {
    /* Find a suitable memory allocation to move. */
    device_memory *max_mem = NULL;
    size_t max_size = 0;
    bool max_is_image = false;

    foreach (HIPMemMap::value_type &pair, hip_mem_map) {
      device_memory &mem = *pair.first;
      HIPMem *cmem = &pair.second;

      /* Can only move textures allocated on this device, and only once. */
      if (!mem.is_resident(this) || cmem->use_mapped_host) {
        continue;
      }

      bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) &&
                        (&mem != &texture_info);
      bool is_image = is_texture && (mem.data_height > 1);

      if (!is_texture || cmem->array) {
        continue;
      }
      if (for_texture && !is_image) {
        continue;
      }

      /* Try to move the largest allocation, prefer moving images. */
      if (is_image > max_is_image || (is_image == max_is_image && mem.device_size > max_size)) {
        max_is_image = is_image;
        max_size = mem.device_size;
        max_mem = &mem;
      }
    }

    if (max_mem) {
      VLOG_WORK << "Move memory from device to host: " << max_mem->name;

      any_device_moving_textures_to_host = true;
      max_mem->device_copy_to();
      size = (max_mem->device_size < size) ? size - max_mem->device_size : 0;
      any_device_moving_textures_to_host = false;
    }
    else {
      break;
    }
  }

  /* Unset flag before texture info is reloaded, since it should stay in device memory. */
  move_texture_to_host = false;

  load_texture_info();
}
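/* Eviction policy in the loop above: candidates are textures that are resident on this device
 * and not already host-mapped; images are preferred over other textures, and among equals the
 * largest allocation goes first, repeating until the requested number of bytes has been freed
 * or nothing movable is left. */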
HIPDevice::HIPMem *HIPDevice::generic_alloc(device_memory &mem, size_t pitch_padding)
{
  HIPContextScope scope(this);

  hipDeviceptr_t device_pointer = 0;
  size_t size = mem.memory_size() + pitch_padding;

  hipError_t mem_alloc_result = hipErrorOutOfMemory;
  const char *status = "";

  /* First try allocating in device memory, respecting headroom. If there is not enough room
   * for working memory, textures are moved to host memory first. */
  bool is_texture = (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) && (&mem != &texture_info);
  bool is_image = is_texture && (mem.data_height > 1);

  size_t headroom = (is_texture) ? device_texture_headroom : device_working_headroom;

  size_t total = 0, free = 0;
  hipMemGetInfo(&free, &total);

  /* Move textures to host memory if needed. */
  if (!move_texture_to_host && !is_image && (size + headroom) >= free && can_map_host) {
    move_textures_to_host(size + headroom - free, is_texture);
    hipMemGetInfo(&free, &total);
  }

  /* Allocate in device memory. */
  if (!move_texture_to_host && (size + headroom) < free) {
    mem_alloc_result = hipMalloc(&device_pointer, size);
    if (mem_alloc_result == hipSuccess) {
      status = " in device memory";
    }
  }
  /* Fall back to mapped host memory if needed and possible. */
  void *shared_pointer = 0;

  if (mem_alloc_result != hipSuccess && can_map_host) {
    if (mem.shared_pointer) {
      /* Another device already allocated host memory. */
      mem_alloc_result = hipSuccess;
      shared_pointer = mem.shared_pointer;
    }
    else if (map_host_used + size < map_host_limit) {
      /* Allocate host memory ourselves. */
      mem_alloc_result = hipHostMalloc(
          &shared_pointer, size, hipHostMallocMapped | hipHostMallocWriteCombined);

      assert((mem_alloc_result == hipSuccess && shared_pointer != 0) ||
             (mem_alloc_result != hipSuccess && shared_pointer == 0));
    }

    if (mem_alloc_result == hipSuccess) {
      hip_assert(hipHostGetDevicePointer(&device_pointer, shared_pointer, 0));
      map_host_used += size;
      status = " in host memory";
    }
  }

  if (mem_alloc_result != hipSuccess) {
    status = " failed, out of device and host memory";
    set_error("System is out of GPU and shared host memory");
  }
  HIPMem *cmem = &hip_mem_map[&mem];
  if (shared_pointer != 0) {
    /* Replace the host pointer with our mapped allocation when the layouts match. */
    if (!move_texture_to_host && pitch_padding == 0 && mem.host_pointer &&
        mem.host_pointer != shared_pointer)
    {
      memcpy(shared_pointer, mem.host_pointer, size);
      mem.host_free();
      mem.host_pointer = shared_pointer;
    }
    mem.shared_pointer = shared_pointer;
    cmem->use_mapped_host = true;
  }
  else {
    cmem->use_mapped_host = false;
  }
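  /* Allocation strategy summarized: device memory first (keeping a headroom of free VRAM), then
   * pinned, write-combined host memory mapped into the device address space and shared between
   * devices via mem.shared_pointer; use_mapped_host records which case applies so the matching
   * free path can be taken later. */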
void HIPDevice::generic_copy_to(device_memory &mem)
{
  const HIPContextScope scope(this);
  hip_assert(
      hipMemcpyHtoD((hipDeviceptr_t)mem.device_pointer, mem.host_pointer, mem.memory_size()));
}

void HIPDevice::generic_free(device_memory &mem)
{
  HIPContextScope scope(this);
  DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
  const HIPMem &cmem = hip_mem_map[&mem];

  if (cmem.use_mapped_host) {
    /* Mapped host memory is released through its shared pointer, not hipFree(). */
  }

  hip_mem_map.erase(hip_mem_map.find(&mem));
}
void HIPDevice::mem_alloc(device_memory &mem)
{
  if (mem.type == MEM_TEXTURE) {
    assert(!"mem_alloc not supported for textures.");
  }
  else if (mem.type == MEM_GLOBAL) {
    assert(!"mem_alloc not supported for global memory.");
  }
  else {
    generic_alloc(mem);
  }
}
  generic_copy_to(mem);
void HIPDevice::mem_copy_from(device_memory &mem, size_t y, size_t w, size_t h, size_t elem)
{
  if (mem.type == MEM_TEXTURE || mem.type == MEM_GLOBAL) {
    assert(!"mem_copy_from not supported for textures.");
  }
  else if (mem.host_pointer) {
    const size_t size = elem * w * h;
    const size_t offset = elem * w * y;

    if (mem.device_pointer) {
      const HIPContextScope scope(this);
      hip_assert(hipMemcpyDtoH(
          (char *)mem.host_pointer + offset, (hipDeviceptr_t)mem.device_pointer + offset, size));
    }
  }
}
  const HIPContextScope scope(this);
void HIPDevice::const_copy_to(const char *name, void *host, size_t size)
{
  HIPContextScope scope(this);
  hipDeviceptr_t mem;
  size_t bytes;

  hip_assert(hipModuleGetGlobal(&mem, &bytes, hipModule, "kernel_params"));

  /* Update data storage pointers in launch parameters. */
# define KERNEL_DATA_ARRAY(data_type, data_name) \
  if (strcmp(name, #data_name) == 0) { \
    hip_assert(hipMemcpyHtoD(mem + offsetof(KernelParamsHIP, data_name), host, size)); \
    return; \
  }
# include "kernel/data_arrays.h"
# undef KERNEL_DATA_ARRAY
}
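/* The macro above expands once for every array listed in kernel/data_arrays.h, producing a
 * chain of strcmp() checks: the entry whose name matches `name` gets its pointer in the
 * module-global KernelParamsHIP struct overwritten with the new device pointer. */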
  generic_copy_to(mem);
void HIPDevice::tex_alloc(device_texture &mem)
{
  HIPContextScope scope(this);

  hipTextureAddressMode address_mode = hipAddressModeWrap;
  switch (mem.info.extension) {
    case EXTENSION_REPEAT: address_mode = hipAddressModeWrap; break;
    case EXTENSION_EXTEND: address_mode = hipAddressModeClamp; break;
    case EXTENSION_CLIP: address_mode = hipAddressModeBorder; break;
    default: break;
  }

  hipTextureFilterMode filter_mode;
  if (mem.info.interpolation == INTERPOLATION_CLOSEST) {
    filter_mode = hipFilterModePoint;
  }
  else {
    filter_mode = hipFilterModeLinear;
  }

  /* Image texture storage format. */
  hipArray_Format format;
  switch (mem.data_type) {
    case TYPE_UCHAR: format = HIP_AD_FORMAT_UNSIGNED_INT8; break;
    case TYPE_UINT16: format = HIP_AD_FORMAT_UNSIGNED_INT16; break;
    case TYPE_UINT: format = HIP_AD_FORMAT_UNSIGNED_INT32; break;
    case TYPE_INT: format = HIP_AD_FORMAT_SIGNED_INT32; break;
    case TYPE_FLOAT: format = HIP_AD_FORMAT_FLOAT; break;
    case TYPE_HALF: format = HIP_AD_FORMAT_HALF; break;
    default: assert(0); return;
  }

  HIPMem *cmem = NULL;
  hArray array_3d = NULL;
  size_t src_pitch = mem.data_width * datatype_size(mem.data_type) * mem.data_elements;
  size_t dst_pitch = src_pitch;
973 HIP_ARRAY3D_DESCRIPTOR desc;
986 hip_assert(hipArray3DCreate((hArray *)&array_3d, &desc));
993 memset(¶m, 0,
sizeof(HIP_MEMCPY3D));
994 param.dstMemoryType = get_memory_type(hipMemoryTypeArray);
995 param.dstArray = array_3d;
996 param.srcMemoryType = get_memory_type(hipMemoryTypeHost);
998 param.srcPitch = src_pitch;
999 param.WidthInBytes = param.srcPitch;
1003 hip_assert(hipDrvMemcpy3D(¶m));
1010 cmem = &hip_mem_map[&mem];
1011 cmem->texobject = 0;
1012 cmem->array = array_3d;
1016 dst_pitch =
align_up(src_pitch, pitch_alignment);
1019 cmem = generic_alloc(mem, dst_size - mem.
memory_size());
1025 memset(¶m, 0,
sizeof(param));
1026 param.dstMemoryType = get_memory_type(hipMemoryTypeDevice);
1028 param.dstPitch = dst_pitch;
1029 param.srcMemoryType = get_memory_type(hipMemoryTypeHost);
1031 param.srcPitch = src_pitch;
1032 param.WidthInBytes = param.srcPitch;
1035 hip_assert(hipDrvMemcpy2DUnaligned(¶m));
1039 cmem = generic_alloc(mem);
  /* Resize once. */
  const uint slot = mem.slot;
  if (slot >= texture_info.size()) {
    /* Allocate some slots in advance, to reduce the number of re-allocations. */
    texture_info.resize(slot + 128);
  }

  texture_info[slot] = mem.info;
  need_texture_info = true;
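  /* Setting need_texture_info instead of copying immediately lets many tex_alloc() calls share
   * one upload: load_texture_info() pushes the whole texture_info array to the device the next
   * time it runs. */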
  /* Bindless texture object. */
  hipResourceDesc resDesc;
  memset(&resDesc, 0, sizeof(resDesc));

  if (array_3d) {
    resDesc.resType = hipResourceTypeArray;
    resDesc.res.array.h_Array = array_3d;
  }
  else if (mem.data_height > 0) {
    resDesc.resType = hipResourceTypePitch2D;
    resDesc.res.pitch2D.format = format;
    resDesc.res.pitch2D.pitchInBytes = dst_pitch;
  }
  else {
    resDesc.resType = hipResourceTypeLinear;
    resDesc.res.linear.format = format;
  }

  hipTextureDesc texDesc;
  memset(&texDesc, 0, sizeof(texDesc));
  texDesc.addressMode[0] = address_mode;
  texDesc.addressMode[1] = address_mode;
  texDesc.addressMode[2] = address_mode;
  texDesc.filterMode = filter_mode;
  texDesc.flags = HIP_TRSF_NORMALIZED_COORDINATES;

  cmem = &hip_mem_map[&mem];

  if (hipTexObjectCreate(&cmem->texobject, &resDesc, &texDesc, NULL) != hipSuccess) {
    set_error(
        "Failed to create texture. Maximum GPU texture size or available GPU memory was likely "
        "exceeded.");
  }

  texture_info[slot].data = (uint64_t)cmem->texobject;
void HIPDevice::tex_free(device_texture &mem)
{
  if (mem.device_pointer) {
    HIPContextScope scope(this);
    DCHECK(hip_mem_map.find(&mem) != hip_mem_map.end());
    const HIPMem &cmem = hip_mem_map[&mem];

    if (cmem.texobject) {
      /* Free bindless texture object. */
      hipTexObjectDestroy(cmem.texobject);
    }

    if (!mem.is_resident(this)) {
      /* Do not free memory here, since it was allocated on a different device. */
      hip_mem_map.erase(hip_mem_map.find(&mem));
    }
    else if (cmem.array) {
      hipArrayDestroy(cmem.array);
      hip_mem_map.erase(hip_mem_map.find(&mem));
    }
    else {
      generic_free(mem);
    }
  }
}
unique_ptr<DeviceQueue> HIPDevice::gpu_queue_create()
{
  return make_unique<HIPDeviceQueue>(this);
}
bool HIPDevice::should_use_graphics_interop()
{
  /* Check whether this device is part of the OpenGL context. */
  HIPContextScope scope(this);

  int num_all_devices = 0;
  hip_assert(hipGetDeviceCount(&num_all_devices));

  if (num_all_devices == 0) {
    return false;
  }

  vector<hipDevice_t> gl_devices(num_all_devices);
  uint num_gl_devices = 0;
  hipGLGetDevices(&num_gl_devices, gl_devices.data(), num_all_devices, hipGLDeviceListAll);

  for (hipDevice_t gl_device : gl_devices) {
    if (gl_device == hipDevice) {
      return true;
    }
  }

  return false;
}
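/* When this returns false, the caller is expected to fall back to copying render results through
 * host memory rather than sharing a graphics buffer directly with the display device. */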
int HIPDevice::get_num_multiprocessors()
{
  return get_device_default_attribute(hipDeviceAttributeMultiprocessorCount, 0);
}

int HIPDevice::get_max_num_threads_per_multiprocessor()
{
  return get_device_default_attribute(hipDeviceAttributeMaxThreadsPerMultiProcessor, 0);
}
bool HIPDevice::get_device_attribute(hipDeviceAttribute_t attribute, int *value)
{
  HIPContextScope scope(this);

  return hipDeviceGetAttribute(value, attribute, hipDevice) == hipSuccess;
}
int HIPDevice::get_device_default_attribute(hipDeviceAttribute_t attribute, int default_value)
{
  int value = 0;
  if (!get_device_attribute(attribute, &value)) {
    return default_value;
  }
  return value;
}
hipMemoryType HIPDevice::get_memory_type(hipMemoryType mem_type)
{
  return get_hip_memory_type(mem_type, hipRuntimeVersion);
}