Blender V3.3
mtl_memory.hh
/* SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

#include <atomic>
#include <functional>
#include <map>
#include <mutex>
#include <set>
#include <unordered_map>

#include "mtl_common.hh"

#include <Cocoa/Cocoa.h>
#include <Metal/Metal.h>
#include <QuartzCore/QuartzCore.h>

@class CAMetalLayer;
@class MTLCommandQueue;
@class MTLRenderPipelineState;

/* Metal Memory Manager Overview. */
/*
 * The Metal backend memory manager provides a single interface for all other
 * MTL_* modules where memory allocation is required.
 *
 * Different allocation strategies and data structures are used depending
 * on how the data is used by the backend. These aim to optimally handle
 * system memory and abstract away any complexity from the MTL_* modules
 * themselves.
 *
 * There are two primary allocation modes which can be used:
 *
 * ** MTLScratchBufferManager **
 *
 * Each MTLContext owns a ScratchBufferManager, implemented as a pool of
 * circular buffers designed to handle temporary memory allocations which
 * occur on a per-frame basis. The scratch buffers allow flushing of host
 * memory to the GPU to be batched.
 *
 * Each frame, the next scratch buffer is reset, then later flushed upon
 * command buffer submission.
 *
 * Note: This is allocated per-context because allocations are tied to
 * context-specific workload submissions.
 *
 * Examples of scratch buffer usage are:
 * - Immediate-mode temporary vertex buffers.
 * - Shader uniform data updates.
 * - Staging of data for resource copies, or data reads/writes.
 *
 * Usage:
 *
 * MTLContext::get_scratchbuffer_manager() - to fetch the active manager.
 *
 * MTLTemporaryBuffer scratch_buffer_allocate_range(size)
 * MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(size, align)
 *
 * ---------------------------------------------------------------------------------
 * ** MTLBufferPool **
 *
 * For static and longer-lasting memory allocations, such as those for UBOs,
 * vertex buffers, index buffers, etc., we want an optimal abstraction for
 * fetching an MTLBuffer of the desired size and resource options.
 *
 * Memory allocations can be expensive, so the MTLBufferPool tracks usage of
 * these buffers: once a buffer is no longer in use, it is returned to the
 * buffer pool for re-use by another backend resource.
 *
 * The MTLBufferPool provides functionality for safe tracking of resources:
 * buffers freed on the host side must have their GPU usage tracked, to ensure
 * they are not prematurely re-used before the GPU has finished with them.
 *
 * Note: The MTLBufferPool is a global construct which can be fetched from anywhere.
 *
 * Usage:
 * MTLContext::get_global_memory_manager(); - static routine to fetch the global memory manager.
 *
 * gpu::MTLBuffer *allocate(size, is_cpu_visible)
 * gpu::MTLBuffer *allocate_aligned(size, alignment, is_cpu_visible)
 * gpu::MTLBuffer *allocate_with_data(size, is_cpu_visible, data_ptr)
 * gpu::MTLBuffer *allocate_aligned_with_data(size, alignment, is_cpu_visible, data_ptr)
 */
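/* A minimal usage sketch of the two allocation modes above (not part of the
 * original header). It assumes an active MTLContext `ctx`, assumes
 * `get_scratchbuffer_manager()` and `get_global_memory_manager()` yield usable
 * references, and uses a hypothetical `Uniforms` payload struct. */

#include <cstring>

struct Uniforms {
  float modelview[16];
};

namespace bgpu = blender::gpu;

static void example_upload(bgpu::MTLContext *ctx, const Uniforms &uniforms)
{
  /* Mode 1: per-frame temporary data, allocated from the active scratch buffer. */
  bgpu::MTLScratchBufferManager &scratch = ctx->get_scratchbuffer_manager();
  bgpu::MTLTemporaryBuffer range = scratch.scratch_buffer_allocate_range_aligned(
      sizeof(Uniforms), 256);
  memcpy(range.data, &uniforms, sizeof(Uniforms));
  if (range.requires_flush()) {
    range.flush(); /* Needed for managed memory; shared (UMA) memory needs no flush. */
  }

  /* Mode 2: longer-lived data, fetched from the global buffer pool. */
  bgpu::MTLBufferPool &pool = bgpu::MTLContext::get_global_memory_manager();
  bgpu::MTLBuffer *buffer = pool.allocate_with_data(sizeof(Uniforms), true, &uniforms);
  /* ... bind buffer->get_metal_buffer() in GPU work ... */
  buffer->free(); /* Deferred: the pool re-claims the buffer once GPU work completes. */
}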

/* Debug memory statistics: toggled by compile-time macro rather than a runtime
 * guard, for performance considerations. */
#define MTL_DEBUG_MEMORY_STATISTICS 0

/* Allows a scratch buffer to temporarily grow beyond its maximum size, which allows
 * submission of one-time-use data packets that are too large. */
#define MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION 1

namespace blender::gpu {

/* Forward Declarations. */
class MTLContext;
class MTLCommandBufferManager;
class MTLUniformBuf;

/* -------------------------------------------------------------------- */

/* MTLBuffer allocation wrapper. */
class MTLBuffer {

 private:
  /* Metal resource. */
  id<MTLBuffer> metal_buffer_;

  /* Host-visible mapped-memory pointer. Behavior depends on buffer type:
   * - Shared buffers: the pointer is the base address of the #MTLBuffer, whose
   *   data is accessible by both the CPU and GPU on Unified Memory
   *   Architectures (UMA).
   * - Managed buffers: a host-side mapped buffer region for CPU (host) access.
   *   Managed buffers must be manually flushed to transfer data to the
   *   GPU-resident buffer.
   * - Private buffers: host access is invalid, `data` will be nullptr. */
  void *data_;

  /* Whether this buffer is allocated from an external source. */
  bool is_external_ = false;

  /* Allocation info. */
  MTLResourceOptions options_;
  id<MTLDevice> device_;
  uint64_t alignment_;
  uint64_t size_;

  /* Requested usage size; the allocated size may be larger. */
  uint64_t usage_size_;

  /* Lifetime info: whether the current buffer is actively in use. A buffer
   * should be in use after it has been allocated. De-allocating the buffer and
   * returning it to the free buffer pool sets in_use to false. Using a buffer
   * while it is not in use is not allowed and results in an error. */
  std::atomic<bool> in_use_;

 public:
  MTLBuffer(id<MTLDevice> device, uint64_t size, MTLResourceOptions options, uint alignment = 1);
  MTLBuffer(id<MTLBuffer> external_buffer);
  ~MTLBuffer();

  /* Fetch information about the backing MTLBuffer. */
  id<MTLBuffer> get_metal_buffer() const;
  void *get_host_ptr() const;
  uint64_t get_size_used() const;
  uint64_t get_size() const;

  /* Flush data to the GPU. */
  void flush();
  void flush_range(uint64_t offset, uint64_t length);
  bool requires_flush();

  /* Buffer usage tracking. */
  void flag_in_use(bool used);
  bool get_in_use();
  void set_usage_size(uint64_t size_used);

  /* Debug. */
  void set_label(NSString *str);

  /* Read properties. */
  MTLResourceOptions get_resource_options();

  /* Resource-local free: for buffers allocated via the memory manager, this
   * will call the context `free_buffer` method to return the buffer to the
   * context memory pool.
   *
   * Otherwise, free will release the associated Metal resource.
   * Note that calling the destructor will also destroy the buffer and its
   * associated Metal resource. */
  void free();

  /* Safety check to ensure buffers are not used after free. */
  void debug_ensure_used();
};
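
/* A minimal sketch of driving the wrapper directly, assuming a valid Metal
 * device is available; the size, storage mode, and label are illustrative. */
static void example_managed_write()
{
  id<MTLDevice> device = MTLCreateSystemDefaultDevice();

  MTLBuffer buffer(device, 1024, MTLResourceStorageModeManaged, 256);
  buffer.set_label(@"example-buffer");

  /* Write through the host-visible mapping, then flush the written range so
   * the GPU-resident copy of the managed buffer is updated. */
  float *host = static_cast<float *>(buffer.get_host_ptr());
  host[0] = 42.0f;
  buffer.set_usage_size(sizeof(float));
  if (buffer.requires_flush()) {
    buffer.flush_range(0, sizeof(float));
  }
}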

/* View into part of an MTLBuffer. */
struct MTLTemporaryBuffer {
  id<MTLBuffer> metal_buffer;
  void *data;
  uint64_t buffer_offset;
  uint64_t size;
  MTLResourceOptions options;

  void flush();
  bool requires_flush();
};

/* Circular scratch buffer allocations should be seen as temporary and only used within the
 * lifetime of the frame. */

/* Round-Robin Circular-buffer. */
class MTLCircularBuffer {
  friend class MTLScratchBufferManager;

 private:
  MTLContext &own_context_;

  /* Wrapped MTLBuffer allocation handle. */
  gpu::MTLBuffer *cbuffer_;

  /* Offset at which the next allocation will begin. */
  uint64_t current_offset_;

  /* Whether the circular buffer can grow during re-allocation if
   * its size is exceeded. */
  bool can_resize_;

  /* Usage information. */
  uint64_t used_frame_index_;
  uint64_t last_flush_base_offset_;

 public:
  MTLCircularBuffer(MTLContext &ctx, uint64_t initial_size, bool allow_grow);
  ~MTLCircularBuffer();

  MTLTemporaryBuffer allocate_range(uint64_t alloc_size);
  MTLTemporaryBuffer allocate_range_aligned(uint64_t alloc_size, uint alignment);
  void flush();

  /* Reset the write offset back to the start of the circular buffer. */
  void reset();
};
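
/* The round-robin allocation reduces to an aligned bump of `current_offset_`.
 * Below is a simplified, self-contained model of that arithmetic (not the
 * actual implementation, which can also flush and grow the buffer when
 * `can_resize_` is set, and is cycled per frame). */
#include <cstdint>

struct CircularAllocModel {
  uint64_t buffer_size;        /* Total capacity of the circular buffer. */
  uint64_t current_offset = 0; /* Next write position. */

  /* Returns the offset of the new range; alignment must be a power of two. */
  uint64_t allocate_range_aligned(uint64_t alloc_size, uint64_t alignment)
  {
    /* Round the write offset up to the requested alignment. */
    uint64_t aligned = (current_offset + alignment - 1) & ~(alignment - 1);
    if (aligned + alloc_size > buffer_size) {
      /* Out of space: wrap to the start. The real buffer would instead grow
       * (up to mtl_scratch_buffer_max_size_) or rely on per-frame cycling. */
      aligned = 0;
    }
    current_offset = aligned + alloc_size;
    return aligned;
  }
};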

/* Wrapper struct used by the memory manager to sort and compare gpu::MTLBuffer resources
 * inside the memory pools. */
struct MTLBufferHandle {
  gpu::MTLBuffer *buffer;
  uint64_t buffer_size;

  MTLBufferHandle(gpu::MTLBuffer *buf)
  {
    this->buffer = buf;
    this->buffer_size = this->buffer->get_size();
  }

  /* Buffer-less handle used purely as a size-based search key. */
  MTLBufferHandle(uint64_t compare_size)
  {
    this->buffer = nullptr;
    this->buffer_size = compare_size;
  }
};

struct CompareMTLBuffer {
  bool operator()(const MTLBufferHandle &lhs, const MTLBufferHandle &rhs) const
  {
    return lhs.buffer_size < rhs.buffer_size;
  }
};
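
/* The size-only MTLBufferHandle constructor exists so that a multiset ordered
 * by CompareMTLBuffer can be searched by size alone. A sketch of the best-fit
 * lookup this enables; the threshold check mirrors the
 * `mtl_buffer_size_threshold_factor_` (1.26) described further below. */
using MTLBufferPoolOrderedListSketch = std::multiset<MTLBufferHandle, CompareMTLBuffer>;

static gpu::MTLBuffer *find_best_fit(MTLBufferPoolOrderedListSketch &pool,
                                     uint64_t requested_size)
{
  /* First pooled buffer whose size is >= requested_size. */
  auto it = pool.lower_bound(MTLBufferHandle(requested_size));
  if (it == pool.end()) {
    return nullptr; /* Nothing large enough: a new buffer must be allocated. */
  }
  /* Reject buffers too much larger than the request, to bound memory waste. */
  if (it->buffer_size > requested_size * 1.26) {
    return nullptr;
  }
  gpu::MTLBuffer *buffer = it->buffer;
  pool.erase(it); /* The buffer leaves the pool and returns to active use. */
  return buffer;
}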

/* An MTLSafeFreeList is a temporary list of gpu::MTLBuffers which have
 * been freed by the high-level backend, but are pending GPU work execution before
 * the gpu::MTLBuffers can be returned to the memory manager pools.
 * This list is implemented as a chunked linked-list.
 *
 * Only a single MTLSafeFreeList is active at one time, and it is associated with current
 * command buffer submissions. If an MTLBuffer is freed during the lifetime of a command buffer,
 * it could still be in use, and as such, the MTLSafeFreeList will increment its reference count
 * for each command buffer submitted while the current pool is active.
 *
 * -- Reference count is incremented upon MTLCommandBuffer commit.
 * -- Reference count is decremented in the MTLCommandBuffer completion callback handler.
 *
 * A new MTLSafeFreeList begins each render step (frame). This pooling of buffers, rather than
 * individual buffer resource tracking, reduces performance overhead.
 *
 * * The reference count starts at 1 to ensure it cannot prematurely reach zero
 *   before all command buffers have been submitted. The matching decrement happens
 *   when the next MTLSafeFreeList is created, allowing the existing pool to be released
 *   once the reference count hits zero after submitted command buffers complete.
 *
 * Note: the Metal API independently tracks resources used by command buffers for the purpose of
 * keeping resources alive while in use by the driver and CPU. This differs from the
 * MTLSafeFreeList mechanism in the Metal backend, which exists to allow previously allocated
 * MTLBuffer resources to be re-used, saving the expensive cost of memory allocation.
 */
class MTLSafeFreeList {
  friend class MTLBufferPool;

 private:
  std::atomic<int> reference_count_;
  std::atomic<bool> in_free_queue_;
  std::recursive_mutex lock_;

  /* Linked list to the next MTLSafeFreeList chunk if the current chunk is full. */
  std::atomic<int> has_next_pool_;
  std::atomic<MTLSafeFreeList *> next_;

  /* Lockless list. MAX_NUM_BUFFERS_ within a chunk is chosen based on
   * performance and memory considerations. */
  static const int MAX_NUM_BUFFERS_ = 1024;
  std::atomic<int> current_list_index_;
  gpu::MTLBuffer *safe_free_pool_[MAX_NUM_BUFFERS_];

 public:
  MTLSafeFreeList();

  /* Add a buffer to the safe-free list; can be called from secondary threads.
   * Performs a lockless list insert. */
  void insert_buffer(gpu::MTLBuffer *buffer);

  /* Increments the command buffer reference count. */
  void increment_reference();

  /* Decrement and return of buffers to the pool occur on the MTLCommandBuffer
   * completion callback thread. */
  void decrement_reference();

  void flag_in_queue()
  {
    in_free_queue_ = true;
    if (has_next_pool_) {
      MTLSafeFreeList *next_pool = next_.load();
      BLI_assert(next_pool != nullptr);
      next_pool->flag_in_queue();
    }
  }
};
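
/* A sketch of the reference-count lifecycle described above; the two hook
 * functions are hypothetical stand-ins for the backend's command buffer
 * commit and completion paths. */
static void on_command_buffer_commit(MTLSafeFreeList *list)
{
  /* The submitted command buffer may still read buffers freed into this list,
   * so keep the list alive until the GPU signals completion. */
  list->increment_reference();
}

static void on_command_buffer_completed(MTLSafeFreeList *list)
{
  /* Runs on the MTLCommandBuffer completion callback thread. Together with the
   * extra decrement performed when the next list begins (the count starts at 1),
   * this releases the pool once all submitted work has completed. */
  list->decrement_reference();
}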

/* MTLBuffer pools. */
/* Allocating Metal buffers is expensive, so we cache all allocated buffers,
 * and when requesting a new buffer, find one which fits the required dimensions
 * from the existing pool of buffers.
 *
 * When freeing MTLBuffers, we insert them into the current MTLSafeFreeList, which defers
 * release of the buffer until the associated command buffers have finished executing.
 * This prevents a buffer from being re-used while it is still in use by the GPU.
 *
 * * Once command buffers complete, the MTLSafeFreeLists associated with the current
 *   command buffer submission are added to the `completed_safelist_queue_`.
 *
 * * At a set point in time, all MTLSafeFreeLists in `completed_safelist_queue_` have their
 *   MTLBuffers re-inserted into the memory manager's pools. */
class MTLBufferPool {

 private:
  /* Memory statistics. */
  long long int total_allocation_bytes_ = 0;

#if MTL_DEBUG_MEMORY_STATISTICS == 1
  /* Debug statistics. */
  std::atomic<int> per_frame_allocation_count_;
  std::atomic<long long int> allocations_in_pool_;
  std::atomic<long long int> buffers_in_pool_;
#endif

  /* Metal resources. */
  bool ensure_initialised_ = false;
  id<MTLDevice> device_ = nil;

  /* Buffer selection aims to pick a buffer which meets the minimum size requirements.
   * To do this, we keep an ordered set of all available buffers. If a buffer is larger than the
   * desired allocation size, we check it against `mtl_buffer_size_threshold_factor_`,
   * which defines how much larger than the requested allocation the buffer may be.
   * - A higher value results in greater re-use of previously allocated buffers of similar sizes.
   * - A lower value may result in more dynamic allocations, but minimizes memory usage for a
   *   given scenario.
   * The current value of 1.26 is calibrated for optimal performance and memory utilization. */
  static constexpr float mtl_buffer_size_threshold_factor_ = 1.26;

  /* Buffer pools using MTLResourceOptions as key for allocation type.
   * Aliased as 'uint64_t' for map type compatibility.
   * - A size-ordered list (multiset) of allocated buffers is kept per MTLResourceOptions
   *   permutation. This allows efficient lookup for buffers of a given requested size.
   * - MTLBufferHandle wraps a gpu::MTLBuffer pointer to achieve easy size-based sorting
   *   via CompareMTLBuffer. */
  using MTLBufferPoolOrderedList = std::multiset<MTLBufferHandle, CompareMTLBuffer>;
  using MTLBufferResourceOptions = uint64_t;

  std::unordered_map<MTLBufferResourceOptions, MTLBufferPoolOrderedList *> buffer_pools_;

  /* Maintain a queue of all MTLSafeFreeLists that have been released
   * by the GPU and are ready to have their buffers re-inserted into the
   * memory manager's pools.
   * Access to this queue is made thread-safe through safelist_lock_. */
  std::mutex safelist_lock_;
  blender::Vector<MTLSafeFreeList *> completed_safelist_queue_;

  /* Current free list, associated with the active MTLCommandBuffer submission. */
  /* MTLBuffer::free() can be called from separate threads, due to usage within the
   * animation system and worker threads. */
  std::atomic<MTLSafeFreeList *> current_free_list_;

 public:
  void init(id<MTLDevice> device);
  ~MTLBufferPool();

  gpu::MTLBuffer *allocate(uint64_t size, bool cpu_visible);
  gpu::MTLBuffer *allocate_aligned(uint64_t size, uint alignment, bool cpu_visible);
  gpu::MTLBuffer *allocate_with_data(uint64_t size, bool cpu_visible, const void *data = nullptr);
  gpu::MTLBuffer *allocate_aligned_with_data(uint64_t size,
                                             uint alignment,
                                             bool cpu_visible,
                                             const void *data = nullptr);
  bool free_buffer(gpu::MTLBuffer *buffer);

  /* Flush MTLSafeFreeList buffers, for completed lists in `completed_safelist_queue_`,
   * back to the memory pools. */
  void update_memory_pools();

  /* Access and control over the active MTLSafeFreeList. */
  MTLSafeFreeList *get_current_safe_list();
  void begin_new_safe_list();

  /* Add a completed MTLSafeFreeList to `completed_safelist_queue_`. */
  void push_completed_safe_list(MTLSafeFreeList *list);

 private:
  void ensure_buffer_pool(MTLResourceOptions options);
  void insert_buffer_into_pool(MTLResourceOptions options, gpu::MTLBuffer *buffer);
  void free();
};
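
/* A sketch of the deferred-free flow across one frame, assuming a pool that
 * has been initialized via init(); `frame_step` is a hypothetical driver. */
static void frame_step(MTLBufferPool &pool)
{
  /* Allocate (or re-use from the pool) a CPU-visible buffer. */
  gpu::MTLBuffer *buffer = pool.allocate(4096, /*cpu_visible=*/true);

  /* ... record GPU work that reads the buffer ... */

  /* free_buffer() only inserts the buffer into the current MTLSafeFreeList. */
  pool.free_buffer(buffer);

  /* Start a fresh safe list for the next submission; the previous list can now
   * be released once its command buffers complete. */
  pool.begin_new_safe_list();

  /* Return buffers from completed safe lists back to the size-ordered pools. */
  pool.update_memory_pools();
}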

/* Scratch buffers are circular buffers used for temporary data within the current frame.
 * In order to preserve the integrity of contents when multiple frames are in flight,
 * we cycle through a collection of scratch buffers which are reset upon next use.
 *
 * Below are a series of properties, declared to manage scratch buffers. If a scratch buffer
 * overflows, the original buffer will be flushed and submitted, with references retained
 * by usage within the command buffer, and a new buffer will be created.
 * - The new buffer will grow in size to account for the increased demand for temporary memory.
 */
class MTLScratchBufferManager {

 private:
  /* Maximum number of scratch buffers to allocate. This should be the maximum number of
   * simultaneous frames in flight. */
  static constexpr uint mtl_max_scratch_buffers_ = MTL_NUM_SAFE_FRAMES;

 public:
  /* Maximum size of a single scratch buffer allocation. When re-sizing, this is the maximum
   * size the newly allocated buffers will grow to. Larger allocations are possible if
   * `MTL_SCRATCH_BUFFER_ALLOW_TEMPORARY_EXPANSION` is enabled, but these will instead allocate
   * new buffers from the memory pools on the fly. */
  static constexpr uint mtl_scratch_buffer_max_size_ = 128 * 1024 * 1024;

  /* Initial size of circular scratch buffers prior to growth. */
  static constexpr uint mtl_scratch_buffer_initial_size_ = 16 * 1024 * 1024;

 private:
  /* Parent MTLContext. */
  MTLContext &context_;
  bool initialised_ = false;

  /* Scratch buffer currently in use. */
  uint current_scratch_buffer_ = 0;

  /* Scratch buffer pool. */
  MTLCircularBuffer *scratch_buffers_[mtl_max_scratch_buffers_];

 public:
  MTLScratchBufferManager(MTLContext &context) : context_(context){};
  ~MTLScratchBufferManager();

  /* Explicit initialization and freeing of resources.
   * Initialization must occur after device creation. */
  void init();
  void free();

  /* Allocation functions for creating temporary allocations from the active circular buffer. */
  MTLTemporaryBuffer scratch_buffer_allocate_range(uint64_t alloc_size);
  MTLTemporaryBuffer scratch_buffer_allocate_range_aligned(uint64_t alloc_size, uint alignment);

  /* Ensure a new scratch buffer is started if we move onto a new frame.
   * Called when a new command buffer begins. */
  void ensure_increment_scratch_buffer();

  /* Flush memory for the active scratch buffer to the GPU.
   * This call performs a partial flush of the buffer, from the last offset
   * the data was flushed at up to the current offset. */
  void flush_active_scratch_buffer();
};
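
/* A sketch of how the manager is driven across a frame; the two hook names
 * are hypothetical stand-ins for the backend's command buffer lifecycle. */
static void on_new_command_buffer(MTLScratchBufferManager &manager)
{
  /* Cycle to (and reset) the next circular buffer when a new frame begins. */
  manager.ensure_increment_scratch_buffer();
}

static void on_command_buffer_submit(MTLScratchBufferManager &manager)
{
  /* Partial flush: only the range written since the last flush is flushed. */
  manager.flush_active_scratch_buffer();
}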

}  // namespace blender::gpu