// Number of worker threads that ShaderCache::load_kernel starts for pipeline compilation.
18 int max_mtlcompiler_threads = 2;
// Returns a printable name for a MetalPipelineType value; used by the log messages in this file.
// NOTE(review): only the two specialized cases are visible in this listing; the remaining
// cases and any fallback return are not shown.
20 const char *kernel_type_as_string(MetalPipelineType pso_type)
25 case PSO_SPECIALIZED_INTERSECT:
26 return "PSO_SPECIALIZED_INTERSECT";
27 case PSO_SPECIALIZED_SHADE:
28 return "PSO_SPECIALIZED_SHADE";
// Members of ShaderCache (the class header is not visible in this listing).
// The cache is tied to one MTLDevice and hands MetalKernelPipeline compile requests
// to worker threads through request_queue.
// Constructor: remembers the Metal device this cache belongs to.
46 ShaderCache(id<MTLDevice> _mtlDevice) : mtlDevice(_mtlDevice)
// Picks the cached pipeline to use for `kernel` on `device` (see the definition).
52 MetalKernelPipeline *get_best_pipeline(
DeviceKernel kernel,
const MetalDevice *device);
// Creates a pipeline for `kernel` / `pso_type` and queues it for compilation.
56 void load_kernel(
DeviceKernel kernel, MetalDevice *device, MetalPipelineType pso_type);
// Tail of a declaration whose first line is not shown (presumably should_load_kernel; confirm).
60 MetalPipelineType pso_type);
// get_shader_cache() is the accessor that creates and returns ShaderCache instances.
65 friend ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice);
// Body executed by each worker thread in compile_threads.
67 void compile_thread_func(
int thread_index);
// Owning list of pipelines; the `pipelines` container indexed by DeviceKernel holds these.
69 using PipelineCollection = std::vector<unique_ptr<MetalKernelPipeline>>;
// One entry of request_queue.
71 struct PipelineRequest {
// Non-owning pointer to the pipeline to compile; null for a default-constructed request.
72 MetalKernelPipeline *pipeline =
nullptr;
// NOTE(review): not invoked in any line visible in this listing.
73 std::function<
void(MetalKernelPipeline *)> completionHandler;
79 id<MTLDevice> mtlDevice;
// Signalled by load_kernel (notify_one) and by the destructor (notify_all).
82 std::condition_variable cond_var;
// Pending requests; pushed at the back by load_kernel, popped from the front by workers.
83 std::deque<PipelineRequest> request_queue;
// Worker threads, created on the first load_kernel call.
84 std::vector<std::thread> compile_threads;
// Incremented per queued request, decremented after compile(); wait_for_all polls it.
85 std::atomic_int incomplete_requests = 0;
// Global map holding one ShaderCache per MTLDevice; entries are added by get_shader_cache().
89 std::map<id<MTLDevice>, unique_ptr<ShaderCache>> g_shaderCache;
// Returns the ShaderCache for `mtlDevice`, creating it and storing it in g_shaderCache
// the first time a device is seen. The returned pointer is owned by g_shaderCache.
// NOTE(review): some lines of this function are not shown; no locking of g_shaderCache
// is visible here. Confirm that access is serialized.
91 ShaderCache *get_shader_cache(id<MTLDevice> mtlDevice)
94 auto it = g_shaderCache.find(mtlDevice);
95 if (it != g_shaderCache.end()) {
96 return it->second.get();
// Not cached yet: create the entry, then look it up again to return the raw pointer.
99 g_shaderCache[mtlDevice] = make_unique<ShaderCache>(mtlDevice);
100 return g_shaderCache[mtlDevice].get();
// Destructor: logs how many requests are still outstanding, wakes every worker waiting
// on cond_var, then iterates over compile_threads.
// NOTE(review): the loop body and the statement before notify_all are not shown;
// presumably `running` is cleared and each thread is joined. Confirm.
103 ShaderCache::~ShaderCache()
105 metal_printf(
"ShaderCache shutting down with incomplete_requests = %d\n",
106 int(incomplete_requests));
109 cond_var.notify_all();
110 for (
auto &
thread : compile_threads) {
// Blocks the calling thread until incomplete_requests is no longer positive.
// This is a polling wait: it re-checks the counter every 100 ms rather than waiting
// on a condition variable.
115 void ShaderCache::wait_for_all()
117 while (incomplete_requests > 0) {
118 std::this_thread::sleep_for(std::chrono::milliseconds(100));
// Worker-thread body: takes requests from request_queue and compiles them.
// `thread_index` is not used in any line visible in this listing.
// NOTE(review): the enclosing loop, lock declaration and exit condition are not shown.
122 void ShaderCache::compile_thread_func(
int thread_index)
127 PipelineRequest request;
// Sleep until either shutdown is requested (`running` false) or a request is queued.
130 cond_var.wait(
lock, [&] {
return !running || !request_queue.empty(); });
// Take the oldest pending request (FIFO order).
135 if (!request_queue.empty()) {
136 request = request_queue.front();
137 request_queue.pop_front();
// A default-constructed PipelineRequest has a null pipeline, in which case nothing is compiled.
142 if (request.pipeline) {
143 request.pipeline->compile();
// Lets wait_for_all() observe that this request has finished.
144 incomplete_requests--;
// Decides whether a pipeline for `device_kernel` with the given `pso_type` should be loaded.
// NOTE(review): one parameter line and all return statements are missing from this listing,
// so the outcome of each check below is not visible. Confirm against the full source.
149 bool ShaderCache::should_load_kernel(
DeviceKernel device_kernel,
151 MetalPipelineType pso_type)
// Extra checks that only apply to specialized (non-generic) PSO types.
165 if (pso_type != PSO_GENERIC) {
// Compares whether the PSO type is the shade specialization with whether the kernel is a
// shade kernel (`is_shade_kernel` is computed on a line not shown).
174 bool is_shade_pso = (pso_type == PSO_SPECIALIZED_SHADE);
175 if (is_shade_pso != is_shade_kernel) {
// Looks for an already-cached pipeline built from the same source MD5 for this PSO type.
183 for (
auto &pipeline : pipelines[device_kernel]) {
184 if (pipeline->source_md5 == device->source_md5[pso_type]) {
// Creates a MetalKernelPipeline for `device_kernel` / `pso_type`, fills it from `device`,
// stores it in pipelines[device_kernel] and queues it for a worker thread to compile.
// NOTE(review): one parameter line, several closing braces and any lock acquisition are
// missing from this listing.
193 void ShaderCache::load_kernel(
DeviceKernel device_kernel,
195 MetalPipelineType pso_type)
// Start the worker threads the first time a kernel is loaded.
200 if (compile_threads.empty()) {
202 for (
int i = 0; i < max_mtlcompiler_threads; i++) {
// NOTE(review): the lambda captures the loop variable `i` by reference ([&]). The new
// thread may read `i` after it has been incremented or after the loop has ended.
// Consider capturing it by value (e.g. [this, i]).
203 compile_threads.push_back(std::thread([&] { compile_thread_func(i); }));
// Body not shown; presumably returns early when no load is needed. Confirm.
208 if (!should_load_kernel(device_kernel, device, pso_type)) {
// Counted as outstanding until a worker finishes compile() for this request.
212 incomplete_requests++;
214 PipelineRequest request;
// Raw allocation; ownership is handed to `collection` as a unique_ptr further down,
// while the queued request keeps only this raw pointer.
215 request.pipeline =
new MetalKernelPipeline;
// Snapshot of the device's launch_params.data; compile() passes it to GetConstantValues
// for non-generic PSO types.
216 memcpy(&request.pipeline->kernel_data_,
217 &device->launch_params.data,
218 sizeof(request.pipeline->kernel_data_));
219 request.pipeline->pso_type = pso_type;
220 request.pipeline->mtlDevice = mtlDevice;
221 request.pipeline->source_md5 = device->source_md5[pso_type];
222 request.pipeline->mtlLibrary = device->mtlLibrary[pso_type];
223 request.pipeline->device_kernel = device_kernel;
224 request.pipeline->threads_per_threadgroup = device->max_threads_per_threadgroup;
// MetalRT feature bits are taken from the device only when MetalRT is in use
// (the other arm of the conditional is not shown).
227 request.pipeline->use_metalrt = device->use_metalrt;
228 request.pipeline->metalrt_features = device->use_metalrt ?
229 (device->kernel_features & METALRT_FEATURE_MASK) :
234 auto &collection = pipelines[device_kernel];
// Walk the collection from newest to oldest, counting entries with the same PSO type;
// the third such entry found is erased.
// NOTE(review): erasing destroys the pipeline through its unique_ptr. If a request for
// that pipeline is still in request_queue, or a caller still holds the raw pointer from
// get_best_pipeline, it would dangle. Confirm this cannot happen.
237 int max_entries_of_same_pso_type = 3;
238 for (
int i = (
int)collection.size() - 1; i >= 0; i--) {
239 if (collection[i]->pso_type == pso_type) {
240 max_entries_of_same_pso_type -= 1;
241 if (max_entries_of_same_pso_type == 0) {
242 metal_printf(
"Purging oldest %s:%s kernel from ShaderCache\n",
243 kernel_type_as_string(pso_type),
245 collection.erase(collection.begin() + i);
// The collection takes ownership; the request is queued for the workers.
251 collection.push_back(unique_ptr<MetalKernelPipeline>(request.pipeline));
252 request_queue.push_back(request);
// Wake one worker to pick up the new request.
254 cond_var.notify_one();
// Chooses which cached pipeline to use for `kernel` given the current state of `device`.
// Pipelines that are not loaded, or whose MetalRT configuration differs from the
// device's, fail the checks below. Among the rest, a specialized pipeline whose source MD5
// matches one of the device's specialized sources overrides any earlier choice; a generic
// pipeline is only taken when nothing has been chosen yet.
// Also increments usage_count on the chosen pipeline.
// NOTE(review): many lines (branch bodies, several flag computations) are missing here.
257 MetalKernelPipeline *ShaderCache::get_best_pipeline(
DeviceKernel kernel,
const MetalDevice *device)
260 auto &collection = pipelines[
kernel];
// Body not shown; presumably returns early when nothing is cached. Confirm.
261 if (collection.empty()) {
// MetalRT feature flags of the device (right-hand sides are not shown).
266 bool use_metalrt = device->use_metalrt;
268 bool device_metalrt_hair_thick = use_metalrt &&
270 bool device_metalrt_pointcloud = use_metalrt &&
272 bool device_metalrt_motion = use_metalrt &&
275 MetalKernelPipeline *best_pipeline =
nullptr;
276 for (
auto &pipeline : collection) {
// Pipelines that have not finished loading are tested here (body not shown).
277 if (!pipeline->loaded) {
285 bool pipeline_metalrt_motion = use_metalrt &&
// Any mismatch in MetalRT usage or features between pipeline and device is detected here.
288 if (pipeline->use_metalrt != use_metalrt || pipeline_metalrt_hair != device_metalrt_hair ||
289 pipeline_metalrt_hair_thick != device_metalrt_hair_thick ||
290 pipeline_metalrt_pointcloud != device_metalrt_pointcloud ||
291 pipeline_metalrt_motion != device_metalrt_motion) {
// Specialized pipeline: usable only if built from the device's current specialized source.
296 if (pipeline->pso_type != PSO_GENERIC) {
297 if (pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_INTERSECT] ||
298 pipeline->source_md5 == device->source_md5[PSO_SPECIALIZED_SHADE]) {
299 best_pipeline = pipeline.get();
// Generic pipeline: fallback when no candidate has been selected so far.
302 else if (!best_pipeline) {
303 best_pipeline = pipeline.get();
// Log the first use of a specialized pipeline.
// NOTE(review): best_pipeline is dereferenced here, and no null check is visible in this
// listing. If no pipeline qualified above it would still be nullptr. Confirm.
307 if (best_pipeline->usage_count == 0 && best_pipeline->pso_type != PSO_GENERIC) {
308 metal_printf(
"Swapping in %s version of %s\n",
309 kernel_type_as_string(best_pipeline->pso_type),
312 best_pipeline->usage_count += 1;
314 return best_pipeline;
// Decides whether compile() should use a Metal binary archive for this pipeline.
// Inputs visible here: macOS 13.0 availability, the CYCLES_METAL_DISABLE_BINARY_ARCHIVES
// environment variable (parsed with atoi and compared against 0), the GPU vendor
// (Intel is singled out) and whether the PSO type is generic.
// NOTE(review): every return statement is missing from this listing, so the result of
// each check is not visible. Confirm against the full source.
317 bool MetalKernelPipeline::should_use_binary_archive()
const
320 if (@available(macOS 13.0, *)) {
321 if (
auto str = getenv(
"CYCLES_METAL_DISABLE_BINARY_ARCHIVES")) {
322 if (atoi(
str) != 0) {
329 MetalGPUVendor gpu_vendor = MetalInfo::get_device_vendor(mtlDevice);
330 if (gpu_vendor == METAL_GPU_INTEL) {
334 if (pso_type == PSO_GENERIC) {
// Builds a MTLFunctionConstantValues object for use as a function descriptor's
// constantValues. `data` defaults to nullptr; compile() passes &kernel_data_ for
// non-generic PSO types and no argument otherwise.
// NOTE(review): how a null `data` is handled is on lines not shown. The object comes from
// `new` and no matching release is visible at the call sites; confirm ownership.
351 static MTLFunctionConstantValues *GetConstantValues(
KernelData const *
data =
nullptr)
353 MTLFunctionConstantValues *constant_values = [MTLFunctionConstantValues
new];
// Local aliases so the macro below can form the Metal data type by pasting the member's
// type name onto "MTLDataType_".
355 MTLDataType MTLDataType_int = MTLDataTypeInt;
356 MTLDataType MTLDataType_float = MTLDataTypeFloat;
357 MTLDataType MTLDataType_float4 = MTLDataTypeFloat4;
// For one KernelData member: registers data->parent.name as a function constant at index
// KernelData_<parent>_<name>. The place where this macro is expanded is not shown.
365 # define KERNEL_STRUCT_MEMBER(parent, _type, name) \
366 [constant_values setConstantValue:&data->parent.name \
367 type:MTLDataType_##_type \
368 atIndex:KernelData_##parent##_##name];
372 return constant_values;
// Builds the Metal compute pipeline state for this kernel.
// Steps visible in this listing:
//   1. look up the kernel entry point in mtlLibrary;
//   2. look up the MetalRT intersection functions and arrange them into per-table arrays;
//   3. configure a MTLComputePipelineDescriptor;
//   4. optionally load or create a binary archive;
//   5. create the pipeline state and store it in this->pipeline / num_threads_per_block;
//   6. fill the intersection function tables;
//   7. log the time taken.
// NOTE(review): a large number of lines (conditions, else-branches, closing braces,
// declarations such as `error`, `function`, `starttime`, `local_md5`) are missing from
// this listing, so the comments below describe only what the visible lines do.
375 void MetalKernelPipeline::compile()
// Entry-point name: the "cycles_metal_" prefix plus a suffix computed on a line not shown.
377 const std::string function_name = std::string(
"cycles_metal_") +
// Local copy of the threadgroup size; replaced by 512 under a condition that is not shown.
380 int threads_per_threadgroup = this->threads_per_threadgroup;
384 threads_per_threadgroup = 512;
387 NSString *entryPoint = [@(function_name.c_str())
copy];
390 if (@available(macOS 11.0, *)) {
391 MTLFunctionDescriptor *func_desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
392 func_desc.name = entryPoint;
// Specialized PSOs get function constants taken from this pipeline's kernel_data_;
// the call without arguments is in a branch whose header is not shown.
394 if (pso_type != PSO_GENERIC) {
395 func_desc.constantValues = GetConstantValues(&kernel_data_);
398 func_desc.constantValues = GetConstantValues();
401 function = [mtlLibrary newFunctionWithDescriptor:func_desc
error:&
error];
// NOTE(review): entryPoint is released here but used again below for function.label.
// Confirm this is not a use-after-release.
404 [entryPoint release];
// Lookup failed: log the error (the rest of this branch is not shown).
406 if (
function == nil) {
407 NSString *
err = [
error localizedDescription];
408 string errors = [
err UTF8String];
409 metal_printf(
"Error getting function \"%s\": %s", function_name.c_str(), errors.c_str());
413 function.label = [entryPoint
copy];
416 if (@available(macOS 11.0, *)) {
// Names of the MetalRT functions looked up below; the assert requires the list length
// to equal METALRT_FUNC_NUM.
418 const char *function_names[] = {
419 "__anyhit__cycles_metalrt_visibility_test_tri",
420 "__anyhit__cycles_metalrt_visibility_test_box",
421 "__anyhit__cycles_metalrt_shadow_all_hit_tri",
422 "__anyhit__cycles_metalrt_shadow_all_hit_box",
423 "__anyhit__cycles_metalrt_local_hit_tri",
424 "__anyhit__cycles_metalrt_local_hit_box",
425 "__intersection__curve_ribbon",
426 "__intersection__curve_ribbon_shadow",
427 "__intersection__curve_all",
428 "__intersection__curve_all_shadow",
429 "__intersection__point",
430 "__intersection__point_shadow",
432 assert(
sizeof(function_names) /
sizeof(function_names[0]) == METALRT_FUNC_NUM);
// Create each intersection function from the library, with the same constant-value
// choice as the entry point, and store it in rt_intersection_function[i].
434 MTLFunctionDescriptor *desc = [MTLIntersectionFunctionDescriptor functionDescriptor];
435 for (
int i = 0; i < METALRT_FUNC_NUM; i++) {
436 const char *function_name = function_names[i];
437 desc.name = [@(function_name)
copy];
439 if (pso_type != PSO_GENERIC) {
440 desc.constantValues = GetConstantValues(&kernel_data_);
443 desc.constantValues = GetConstantValues();
447 rt_intersection_function[i] = [mtlLibrary newFunctionWithDescriptor:desc
error:&
error];
449 if (rt_intersection_function[i] == nil) {
450 NSString *
err = [
error localizedDescription];
451 string errors = [
err UTF8String];
454 "Error getting intersection function \"%s\": %s", function_name, errors.c_str());
458 rt_intersection_function[i].label = [@(function_name)
copy];
463 NSArray *table_functions[METALRT_TABLE_NUM] = {nil};
464 NSArray *linked_functions = nil;
// Optional curve / point intersection functions; nil means "use the box function instead"
// when the tables are assembled below.
471 id<MTLFunction> curve_intersect_default = nil;
472 id<MTLFunction> curve_intersect_shadow = nil;
473 id<MTLFunction> point_intersect_default = nil;
474 id<MTLFunction> point_intersect_shadow = nil;
// Thick hair uses the CURVE_ALL functions; the CURVE_RIBBON assignments are in a branch
// whose header is not shown.
477 if (metalrt_hair_thick) {
480 curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_ALL];
481 curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_ALL_SHADOW];
484 curve_intersect_default = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON];
485 curve_intersect_shadow = rt_intersection_function[METALRT_FUNC_CURVE_RIBBON_SHADOW];
488 if (metalrt_pointcloud) {
489 point_intersect_default = rt_intersection_function[METALRT_FUNC_POINT];
490 point_intersect_shadow = rt_intersection_function[METALRT_FUNC_POINT_SHADOW];
// Each table lists, in order: the triangle function, the curve function (or the box
// function when no curve function is set) and the point function (or the box function).
492 table_functions[METALRT_TABLE_DEFAULT] = [NSArray
493 arrayWithObjects:rt_intersection_function[METALRT_FUNC_DEFAULT_TRI],
494 curve_intersect_default ?
495 curve_intersect_default :
496 rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
497 point_intersect_default ?
498 point_intersect_default :
499 rt_intersection_function[METALRT_FUNC_DEFAULT_BOX],
501 table_functions[METALRT_TABLE_SHADOW] = [NSArray
502 arrayWithObjects:rt_intersection_function[METALRT_FUNC_SHADOW_TRI],
503 curve_intersect_shadow ?
504 curve_intersect_shadow :
505 rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
506 point_intersect_shadow ?
507 point_intersect_shadow :
508 rt_intersection_function[METALRT_FUNC_SHADOW_BOX],
// The local table always uses the box function for its second and third entries.
510 table_functions[METALRT_TABLE_LOCAL] = [NSArray
511 arrayWithObjects:rt_intersection_function[METALRT_FUNC_LOCAL_TRI],
512 rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
513 rt_intersection_function[METALRT_FUNC_LOCAL_BOX],
// Union of all functions referenced by the three tables, without duplicates.
516 NSMutableSet *unique_functions = [NSMutableSet
517 setWithArray:table_functions[METALRT_TABLE_DEFAULT]];
518 [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_SHADOW]];
519 [unique_functions addObjectsFromArray:table_functions[METALRT_TABLE_LOCAL]];
// Only kernels with intersection get linked functions; they are sorted by label.
521 if (kernel_has_intersection(device_kernel)) {
522 linked_functions = [[NSArray arrayWithArray:[unique_functions allObjects]]
523 sortedArrayUsingComparator:^NSComparisonResult(id<MTLFunction> f1, id<MTLFunction> f2) {
524 return [f1.label compare:f2.label];
527 unique_functions = nil;
// Pipeline descriptor: buffers 0-2 are marked immutable, the threadgroup size limit is
// set, and the entry-point function is attached.
530 MTLComputePipelineDescriptor *computePipelineStateDescriptor =
531 [[MTLComputePipelineDescriptor alloc]
init];
533 computePipelineStateDescriptor.buffers[0].mutability = MTLMutabilityImmutable;
534 computePipelineStateDescriptor.buffers[1].mutability = MTLMutabilityImmutable;
535 computePipelineStateDescriptor.buffers[2].mutability = MTLMutabilityImmutable;
537 if (@available(macos 10.14, *)) {
538 computePipelineStateDescriptor.maxTotalThreadsPerThreadgroup = threads_per_threadgroup;
540 computePipelineStateDescriptor.threadGroupSizeIsMultipleOfThreadExecutionWidth =
true;
542 computePipelineStateDescriptor.computeFunction =
function;
544 if (@available(macOS 11.0, *)) {
// Attach the linked intersection functions when there are any. The conditions selecting
// maxCallStackDepth 1 versus 8 are not shown.
546 if (linked_functions) {
547 computePipelineStateDescriptor.linkedFunctions = [[MTLLinkedFunctions alloc]
init];
548 computePipelineStateDescriptor.linkedFunctions.functions = linked_functions;
550 computePipelineStateDescriptor.maxCallStackDepth = 1;
552 computePipelineStateDescriptor.maxCallStackDepth = 8;
556 MTLPipelineOption pipelineOptions = MTLPipelineOptionNone;
558 bool use_binary_archive = should_use_binary_archive();
560 id<MTLBinaryArchive> archive = nil;
561 string metalbin_path;
562 string metalbin_name;
// Binary-archive naming: local_md5 is fed the source MD5, the OS version string, the
// threadgroup size and (for MetalRT kernels with intersection) the MetalRT flags.
// The archive name starts from the device name and has the PSO type string joined on.
563 if (use_binary_archive) {
564 NSProcessInfo *processInfo = [NSProcessInfo processInfo];
565 string osVersion = [[processInfo operatingSystemVersionString] UTF8String];
567 local_md5.
append(source_md5);
568 local_md5.
append(osVersion);
570 sizeof(this->threads_per_threadgroup));
573 if (use_metalrt && kernel_has_intersection(device_kernel)) {
576 metalrt_hair ? 1 : 0,
577 metalrt_hair_thick ? 1 : 0,
578 metalrt_pointcloud ? 1 : 0);
// Characters of the device name outside 0-9, a-z, A-Z are detected here (what they are
// replaced with is not shown).
582 string device_name = [mtlDevice.name UTF8String];
583 for (
char &
c : device_name) {
584 if ((c < '0' || c >
'9') && (c < 'a' || c >
'z') && (c < 'A' || c >
'Z')) {
589 metalbin_name = device_name;
591 metalbin_name =
path_join(metalbin_name, kernel_type_as_string(pso_type));
// Load a previously saved archive when its file exists.
597 if (
path_exists(metalbin_path) && use_binary_archive) {
598 if (@available(macOS 11.0, *)) {
599 MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc]
init];
600 archiveDesc.url = [NSURL fileURLWithPath:@(metalbin_path.c_str())];
601 archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc
error:nil];
602 [archiveDesc release];
// Create an empty archive (url = nil) and remember that it is new; the exact condition
// guarding the creation is not shown.
// NOTE(review): unlike the load path above, no release of this archiveDesc is visible.
607 __block
bool creating_new_archive =
false;
608 if (@available(macOS 11.0, *)) {
609 if (use_binary_archive) {
611 MTLBinaryArchiveDescriptor *archiveDesc = [[MTLBinaryArchiveDescriptor alloc]
init];
612 archiveDesc.url = nil;
613 archive = [mtlDevice newBinaryArchiveWithDescriptor:archiveDesc
error:nil];
614 creating_new_archive =
true;
// Pipeline creation is told to use the archive and to fail on an archive miss.
616 computePipelineStateDescriptor.binaryArchives = [NSArray arrayWithObjects:archive, nil];
617 pipelineOptions = MTLPipelineOptionFailOnBinaryArchiveMiss;
// Handler run with the result of pipeline creation (it is called directly further down).
623 MTLNewComputePipelineStateWithReflectionCompletionHandler completionHandler = ^(
624 id<MTLComputePipelineState> computePipelineState,
625 MTLComputePipelineReflection *reflection,
// Creation from the archive failed: create the pipeline state again and mark the archive
// for re-saving.
627 bool recreate_archive =
false;
628 if (computePipelineState == nil && archive) {
629 NSString *errStr = [
error localizedDescription];
631 "Failed to create compute pipeline state \"%s\" from archive - attempting recreation... "
634 errStr ? [errStr UTF8String] :
"nil");
635 computePipelineState = [mtlDevice
636 newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
640 recreate_archive =
true;
643 double duration =
time_dt() - starttime;
// Still no pipeline state: record the error text in error_str and log the failure.
645 if (computePipelineState == nil) {
646 NSString *errStr = [
error localizedDescription];
647 error_str =
string_printf(
"Failed to create compute pipeline state \"%s\", error: \n",
649 error_str += (errStr ? [errStr UTF8String] :
"nil");
650 metal_printf(
"%16s | %2d | %-55s | %7.2fs | FAILED!\n",
651 kernel_type_as_string(pso_type),
// Success: threads per block is the pipeline's maximum rounded down to a multiple of the
// thread execution width, but never less than one execution width.
658 int num_threads_per_block =
round_down(computePipelineState.maxTotalThreadsPerThreadgroup,
659 computePipelineState.threadExecutionWidth);
660 num_threads_per_block =
std::max(num_threads_per_block,
661 (
int)computePipelineState.threadExecutionWidth);
662 this->pipeline = computePipelineState;
663 this->num_threads_per_block = num_threads_per_block;
// Save the archive to metalbin_path when it was newly created or recreated.
665 if (@available(macOS 11.0, *)) {
666 if (creating_new_archive || recreate_archive) {
667 if (![archive serializeToURL:[NSURL fileURLWithPath:@(metalbin_path.c_str())]
669 metal_printf(
"Failed to save binary archive, error:\n%s\n",
670 [[
error localizedDescription] UTF8String]);
// For a new archive, add this pipeline's functions to it before creating the pipeline.
677 if (creating_new_archive) {
680 if (![archive addComputePipelineFunctionsWithDescriptor:computePipelineStateDescriptor
682 NSString *errStr = [
error localizedDescription];
683 metal_printf(
"Failed to add PSO to archive:\n%s\n", errStr ? [errStr UTF8String] :
"nil");
// Create the pipeline state and pass the result to the handler defined above.
686 id<MTLComputePipelineState> pipeline = [mtlDevice
687 newComputePipelineStateWithDescriptor:computePipelineStateDescriptor
691 completionHandler(pipeline,
nullptr,
error);
694 [computePipelineStateDescriptor release];
695 computePipelineStateDescriptor = nil;
// MetalRT: create one intersection function table per table index and fill it with
// handles for that table's functions, in array order.
697 if (use_metalrt && linked_functions) {
698 for (
int table = 0; table < METALRT_TABLE_NUM; table++) {
699 if (@available(macOS 11.0, *)) {
700 MTLIntersectionFunctionTableDescriptor *ift_desc =
701 [[MTLIntersectionFunctionTableDescriptor alloc]
init];
702 ift_desc.functionCount = table_functions[table].count;
703 intersection_func_table[table] = [this->pipeline
704 newIntersectionFunctionTableWithDescriptor:ift_desc];
707 int size = (int)[table_functions[table]
count];
708 for (
int i = 0; i <
size; i++) {
709 id<MTLFunctionHandle> handle = [pipeline
710 functionHandleWithFunction:table_functions[table][i]];
711 [intersection_func_table[table] setFunction:handle atIndex:i];
// Timing log; when a binary archive is used the line also says whether it was newly
// created or loaded, plus the archive name.
717 double duration =
time_dt() - starttime;
719 if (!use_binary_archive) {
720 metal_printf(
"%16s | %2d | %-55s | %7.2fs\n",
721 kernel_type_as_string(pso_type),
727 metal_printf(
"%16s | %2d | %-55s | %7.2fs | %s: %s\n",
728 kernel_type_as_string(pso_type),
732 creating_new_archive ?
" new" :
"load",
733 metalbin_name.c_str());
// Requests loading of kernels for `device` with the given `pso_type` through the device's
// ShaderCache, waits until all queued compilations have finished, then logs the elapsed time.
// NOTE(review): the loop header around load_kernel and the return statement are not shown.
737 bool MetalDeviceKernels::load(MetalDevice *device, MetalPipelineType pso_type)
739 const double starttime =
time_dt();
740 auto shader_cache = get_shader_cache(device->mtlDevice);
742 shader_cache->load_kernel((
DeviceKernel)i, device, pso_type);
// Blocks until the ShaderCache reports no incomplete requests.
745 shader_cache->wait_for_all();
746 metal_printf(
"Back-end compilation finished in %.1f seconds (%s)\n",
748 kernel_type_as_string(pso_type));
// Asks the device's ShaderCache whether kernels need loading for `pso_type`, by calling
// should_load_kernel for a DeviceKernel index `i`.
// NOTE(review): the loop header and both return statements are not shown; presumably this
// returns true as soon as any kernel needs loading. Confirm.
752 bool MetalDeviceKernels::should_load_kernels(MetalDevice *device, MetalPipelineType pso_type)
754 auto shader_cache = get_shader_cache(device->mtlDevice);
756 if (shader_cache->should_load_kernel((
DeviceKernel)i, device, pso_type)) {
// Forwards to ShaderCache::get_best_pipeline on the cache belonging to device->mtlDevice
// and returns its result as a pointer-to-const.
// NOTE(review): the `kernel` parameter line of the signature is not shown in this listing.
763 const MetalKernelPipeline *MetalDeviceKernels::get_best_pipeline(
const MetalDevice *device,
766 return get_shader_cache(device->mtlDevice)->get_best_pipeline(
kernel, device);
_GL_VOID GLfloat value _GL_VOID_RET _GL_VOID const GLuint GLboolean *residences _GL_BOOL_RET _GL_VOID GLsizei GLfloat GLfloat GLfloat GLfloat const GLubyte *bitmap _GL_VOID_RET _GL_VOID GLenum type
static DBVT_INLINE btScalar size(const btDbvtVolume &a)
void append(const uint8_t *data, int size)
#define CCL_NAMESPACE_END
CCL_NAMESPACE_BEGIN struct Options options
CCL_NAMESPACE_BEGIN const char * device_kernel_as_string(DeviceKernel kernel)
SyclQueue void void size_t num_bytes SyclQueue void const char void *memory_device_pointer KernelContext int kernel
SyclQueue void void size_t num_bytes void
@ KERNEL_FEATURE_OBJECT_MOTION
@ KERNEL_FEATURE_HAIR_THICK
@ KERNEL_FEATURE_POINTCLOUD
@ KERNEL_FEATURE_NODE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_RESET
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SUBSURFACE
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_VOLUME_STACK
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_RAYTRACE
@ DEVICE_KERNEL_INTEGRATOR_SHADE_SURFACE_MNEE
@ DEVICE_KERNEL_INTEGRATOR_MEGAKERNEL
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_SHADOW
@ DEVICE_KERNEL_INTEGRATOR_INTERSECT_CLOSEST
@ DEVICE_KERNEL_INTEGRATOR_SHADE_BACKGROUND
static void error(const char *str)
static void copy(bNodeTree *dest_ntree, bNode *dest_node, const bNode *src_node)
string path_cache_get(const string &sub)
string path_join(const string &dir, const string &file)
bool path_exists(const string &path)
void path_create_directories(const string &filepath)
CCL_NAMESPACE_BEGIN string string_printf(const char *format,...)
std::unique_lock< std::mutex > thread_scoped_lock
CCL_NAMESPACE_BEGIN double time_dt()
ccl_device_inline size_t round_down(size_t x, size_t multiple)