Blender  V3.3
btThreadSupportWin32.cpp
Go to the documentation of this file.
1 /*
2 Bullet Continuous Collision Detection and Physics Library
3 Copyright (c) 2003-2018 Erwin Coumans http://bulletphysics.com
4 
5 This software is provided 'as-is', without any express or implied warranty.
6 In no event will the authors be held liable for any damages arising from the use of this software.
7 Permission is granted to anyone to use this software for any purpose,
8 including commercial applications, and to alter it and redistribute it freely,
9 subject to the following restrictions:
10 
11 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
12 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
13 3. This notice may not be removed or altered from any source distribution.
14 */
15 
16 #if defined(_WIN32) && BT_THREADSAFE
17 
18 #include "LinearMath/btScalar.h"
19 #include "LinearMath/btMinMax.h"
21 #include "LinearMath/btThreads.h"
23 #include <windows.h>
24 #include <stdio.h>
25 
26 struct btProcessorInfo
27 {
28  int numLogicalProcessors;
29  int numCores;
30  int numNumaNodes;
31  int numL1Cache;
32  int numL2Cache;
33  int numL3Cache;
34  int numPhysicalPackages;
35  static const int maxNumTeamMasks = 32;
36  int numTeamMasks;
37  UINT64 processorTeamMasks[maxNumTeamMasks];
38 };
39 
40 UINT64 getProcessorTeamMask(const btProcessorInfo& procInfo, int procId)
41 {
42  UINT64 procMask = UINT64(1) << procId;
43  for (int i = 0; i < procInfo.numTeamMasks; ++i)
44  {
45  if (procMask & procInfo.processorTeamMasks[i])
46  {
47  return procInfo.processorTeamMasks[i];
48  }
49  }
50  return 0;
51 }
52 
53 int getProcessorTeamIndex(const btProcessorInfo& procInfo, int procId)
54 {
55  UINT64 procMask = UINT64(1) << procId;
56  for (int i = 0; i < procInfo.numTeamMasks; ++i)
57  {
58  if (procMask & procInfo.processorTeamMasks[i])
59  {
60  return i;
61  }
62  }
63  return -1;
64 }
65 
66 int countSetBits(ULONG64 bits)
67 {
68  int count = 0;
69  while (bits)
70  {
71  if (bits & 1)
72  {
73  count++;
74  }
75  bits >>= 1;
76  }
77  return count;
78 }
79 
80 typedef BOOL(WINAPI* Pfn_GetLogicalProcessorInformation)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
81 
82 void getProcessorInformation(btProcessorInfo* procInfo)
83 {
84  memset(procInfo, 0, sizeof(*procInfo));
85  Pfn_GetLogicalProcessorInformation getLogicalProcInfo =
86  (Pfn_GetLogicalProcessorInformation)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation");
87  if (getLogicalProcInfo == NULL)
88  {
89  // no info
90  return;
91  }
92  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buf = NULL;
93  DWORD bufSize = 0;
94  while (true)
95  {
96  if (getLogicalProcInfo(buf, &bufSize))
97  {
98  break;
99  }
100  else
101  {
102  if (GetLastError() == ERROR_INSUFFICIENT_BUFFER)
103  {
104  if (buf)
105  {
106  free(buf);
107  }
108  buf = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(bufSize);
109  }
110  }
111  }
112 
113  int len = bufSize / sizeof(*buf);
114  for (int i = 0; i < len; ++i)
115  {
116  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION info = buf + i;
117  switch (info->Relationship)
118  {
119  case RelationNumaNode:
120  procInfo->numNumaNodes++;
121  break;
122 
123  case RelationProcessorCore:
124  procInfo->numCores++;
125  procInfo->numLogicalProcessors += countSetBits(info->ProcessorMask);
126  break;
127 
128  case RelationCache:
129  if (info->Cache.Level == 1)
130  {
131  procInfo->numL1Cache++;
132  }
133  else if (info->Cache.Level == 2)
134  {
135  procInfo->numL2Cache++;
136  }
137  else if (info->Cache.Level == 3)
138  {
139  procInfo->numL3Cache++;
140  // processors that share L3 cache are considered to be on the same team
141  // because they can more easily work together on the same data.
142  // Large performance penalties will occur if 2 or more threads from different
143  // teams attempt to frequently read and modify the same cache lines.
144  //
145  // On the AMD Ryzen 7 CPU for example, the 8 cores on the CPU are split into
146  // 2 CCX units of 4 cores each. Each CCX has a separate L3 cache, so if both
147  // CCXs are operating on the same data, many cycles will be spent keeping the
148  // two caches coherent.
149  if (procInfo->numTeamMasks < btProcessorInfo::maxNumTeamMasks)
150  {
151  procInfo->processorTeamMasks[procInfo->numTeamMasks] = info->ProcessorMask;
152  procInfo->numTeamMasks++;
153  }
154  }
155  break;
156 
157  case RelationProcessorPackage:
158  procInfo->numPhysicalPackages++;
159  break;
160  }
161  }
162  free(buf);
163 }
164 
166 class btThreadSupportWin32 : public btThreadSupportInterface
167 {
168 public:
169  struct btThreadStatus
170  {
171  int m_taskId;
172  int m_commandId;
173  int m_status;
174 
175  ThreadFunc m_userThreadFunc;
176  void* m_userPtr; //for taskDesc etc
177 
178  void* m_threadHandle; //this one is calling 'Win32ThreadFunc'
179 
180  void* m_eventStartHandle;
181  char m_eventStartHandleName[32];
182 
183  void* m_eventCompleteHandle;
184  char m_eventCompleteHandleName[32];
185  };
186 
187 private:
188  btAlignedObjectArray<btThreadStatus> m_activeThreadStatus;
189  btAlignedObjectArray<void*> m_completeHandles;
190  int m_numThreads;
191  DWORD_PTR m_startedThreadMask;
192  btProcessorInfo m_processorInfo;
193 
194  void startThreads(const ConstructionInfo& threadInfo);
195  void stopThreads();
196  int waitForResponse();
197 
198 public:
199  btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo);
200  virtual ~btThreadSupportWin32();
201 
202  virtual int getNumWorkerThreads() const BT_OVERRIDE { return m_numThreads; }
203  virtual int getCacheFriendlyNumThreads() const BT_OVERRIDE { return countSetBits(m_processorInfo.processorTeamMasks[0]); }
204  virtual int getLogicalToPhysicalCoreRatio() const BT_OVERRIDE { return m_processorInfo.numLogicalProcessors / m_processorInfo.numCores; }
205 
206  virtual void runTask(int threadIndex, void* userData) BT_OVERRIDE;
207  virtual void waitForAllTasks() BT_OVERRIDE;
208 
209  virtual btCriticalSection* createCriticalSection() BT_OVERRIDE;
210  virtual void deleteCriticalSection(btCriticalSection* criticalSection) BT_OVERRIDE;
211 };
212 
213 btThreadSupportWin32::btThreadSupportWin32(const ConstructionInfo& threadConstructionInfo)
214 {
215  startThreads(threadConstructionInfo);
216 }
217 
218 btThreadSupportWin32::~btThreadSupportWin32()
219 {
220  stopThreads();
221 }
222 
223 DWORD WINAPI win32threadStartFunc(LPVOID lpParam)
224 {
225  btThreadSupportWin32::btThreadStatus* status = (btThreadSupportWin32::btThreadStatus*)lpParam;
226 
227  while (1)
228  {
229  WaitForSingleObject(status->m_eventStartHandle, INFINITE);
230  void* userPtr = status->m_userPtr;
231 
232  if (userPtr)
233  {
234  btAssert(status->m_status);
235  status->m_userThreadFunc(userPtr);
236  status->m_status = 2;
237  SetEvent(status->m_eventCompleteHandle);
238  }
239  else
240  {
241  //exit Thread
242  status->m_status = 3;
243  printf("Thread with taskId %i with handle %p exiting\n", status->m_taskId, status->m_threadHandle);
244  SetEvent(status->m_eventCompleteHandle);
245  break;
246  }
247  }
248  printf("Thread TERMINATED\n");
249  return 0;
250 }
251 
252 void btThreadSupportWin32::runTask(int threadIndex, void* userData)
253 {
254  btThreadStatus& threadStatus = m_activeThreadStatus[threadIndex];
255  btAssert(threadIndex >= 0);
256  btAssert(int(threadIndex) < m_activeThreadStatus.size());
257 
258  threadStatus.m_commandId = 1;
259  threadStatus.m_status = 1;
260  threadStatus.m_userPtr = userData;
261  m_startedThreadMask |= DWORD_PTR(1) << threadIndex;
262 
264  SetEvent(threadStatus.m_eventStartHandle);
265 }
266 
267 int btThreadSupportWin32::waitForResponse()
268 {
269  btAssert(m_activeThreadStatus.size());
270 
271  int last = -1;
272  DWORD res = WaitForMultipleObjects(m_completeHandles.size(), &m_completeHandles[0], FALSE, INFINITE);
273  btAssert(res != WAIT_FAILED);
274  last = res - WAIT_OBJECT_0;
275 
276  btThreadStatus& threadStatus = m_activeThreadStatus[last];
277  btAssert(threadStatus.m_threadHandle);
278  btAssert(threadStatus.m_eventCompleteHandle);
279 
280  //WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
281  btAssert(threadStatus.m_status > 1);
282  threadStatus.m_status = 0;
283 
285  btAssert(last >= 0);
286  m_startedThreadMask &= ~(DWORD_PTR(1) << last);
287 
288  return last;
289 }
290 
291 void btThreadSupportWin32::waitForAllTasks()
292 {
293  while (m_startedThreadMask)
294  {
295  waitForResponse();
296  }
297 }
298 
299 void btThreadSupportWin32::startThreads(const ConstructionInfo& threadConstructionInfo)
300 {
301  static int uniqueId = 0;
302  uniqueId++;
303  btProcessorInfo& procInfo = m_processorInfo;
304  getProcessorInformation(&procInfo);
305  DWORD_PTR dwProcessAffinityMask = 0;
306  DWORD_PTR dwSystemAffinityMask = 0;
307  if (!GetProcessAffinityMask(GetCurrentProcess(), &dwProcessAffinityMask, &dwSystemAffinityMask))
308  {
309  dwProcessAffinityMask = 0;
310  }
312  m_numThreads = btMin(procInfo.numLogicalProcessors, int(BT_MAX_THREAD_COUNT)) - 1; // cap to max thread count (-1 because main thread already exists)
313 
314  m_activeThreadStatus.resize(m_numThreads);
315  m_completeHandles.resize(m_numThreads);
316  m_startedThreadMask = 0;
317 
318  // set main thread affinity
319  if (DWORD_PTR mask = dwProcessAffinityMask & getProcessorTeamMask(procInfo, 0))
320  {
321  SetThreadAffinityMask(GetCurrentThread(), mask);
322  SetThreadIdealProcessor(GetCurrentThread(), 0);
323  }
324 
325  for (int i = 0; i < m_numThreads; i++)
326  {
327  printf("starting thread %d\n", i);
328 
329  btThreadStatus& threadStatus = m_activeThreadStatus[i];
330 
331  LPSECURITY_ATTRIBUTES lpThreadAttributes = NULL;
332  SIZE_T dwStackSize = threadConstructionInfo.m_threadStackSize;
333  LPTHREAD_START_ROUTINE lpStartAddress = &win32threadStartFunc;
334  LPVOID lpParameter = &threadStatus;
335  DWORD dwCreationFlags = 0;
336  LPDWORD lpThreadId = 0;
337 
338  threadStatus.m_userPtr = 0;
339 
340  sprintf(threadStatus.m_eventStartHandleName, "es%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i);
341  threadStatus.m_eventStartHandle = CreateEventA(0, false, false, threadStatus.m_eventStartHandleName);
342 
343  sprintf(threadStatus.m_eventCompleteHandleName, "ec%.8s%d%d", threadConstructionInfo.m_uniqueName, uniqueId, i);
344  threadStatus.m_eventCompleteHandle = CreateEventA(0, false, false, threadStatus.m_eventCompleteHandleName);
345 
346  m_completeHandles[i] = threadStatus.m_eventCompleteHandle;
347 
348  HANDLE handle = CreateThread(lpThreadAttributes, dwStackSize, lpStartAddress, lpParameter, dwCreationFlags, lpThreadId);
349  //SetThreadPriority( handle, THREAD_PRIORITY_HIGHEST );
350  // highest priority -- can cause erratic performance when numThreads > numCores
351  // we don't want worker threads to be higher priority than the main thread or the main thread could get
352  // totally shut out and unable to tell the workers to stop
353  //SetThreadPriority( handle, THREAD_PRIORITY_BELOW_NORMAL );
354 
355  {
356  int processorId = i + 1; // leave processor 0 for main thread
357  DWORD_PTR teamMask = getProcessorTeamMask(procInfo, processorId);
358  if (teamMask)
359  {
360  // bind each thread to only execute on processors of it's assigned team
361  // - for single-socket Intel x86 CPUs this has no effect (only a single, shared L3 cache so there is only 1 team)
362  // - for multi-socket Intel this will keep threads from migrating from one socket to another
363  // - for AMD Ryzen this will keep threads from migrating from one CCX to another
364  DWORD_PTR mask = teamMask & dwProcessAffinityMask;
365  if (mask)
366  {
367  SetThreadAffinityMask(handle, mask);
368  }
369  }
370  SetThreadIdealProcessor(handle, processorId);
371  }
372 
373  threadStatus.m_taskId = i;
374  threadStatus.m_commandId = 0;
375  threadStatus.m_status = 0;
376  threadStatus.m_threadHandle = handle;
377  threadStatus.m_userThreadFunc = threadConstructionInfo.m_userThreadFunc;
378 
379  printf("started %s thread %d with threadHandle %p\n", threadConstructionInfo.m_uniqueName, i, handle);
380  }
381 }
382 
384 void btThreadSupportWin32::stopThreads()
385 {
386  for (int i = 0; i < m_activeThreadStatus.size(); i++)
387  {
388  btThreadStatus& threadStatus = m_activeThreadStatus[i];
389  if (threadStatus.m_status > 0)
390  {
391  WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
392  }
393 
394  threadStatus.m_userPtr = NULL;
395  SetEvent(threadStatus.m_eventStartHandle);
396  WaitForSingleObject(threadStatus.m_eventCompleteHandle, INFINITE);
397 
398  CloseHandle(threadStatus.m_eventCompleteHandle);
399  CloseHandle(threadStatus.m_eventStartHandle);
400  CloseHandle(threadStatus.m_threadHandle);
401  }
402 
403  m_activeThreadStatus.clear();
404  m_completeHandles.clear();
405 }
406 
407 class btWin32CriticalSection : public btCriticalSection
408 {
409 private:
410  CRITICAL_SECTION mCriticalSection;
411 
412 public:
413  btWin32CriticalSection()
414  {
415  InitializeCriticalSection(&mCriticalSection);
416  }
417 
418  ~btWin32CriticalSection()
419  {
420  DeleteCriticalSection(&mCriticalSection);
421  }
422 
423  void lock()
424  {
425  EnterCriticalSection(&mCriticalSection);
426  }
427 
428  void unlock()
429  {
430  LeaveCriticalSection(&mCriticalSection);
431  }
432 };
433 
434 btCriticalSection* btThreadSupportWin32::createCriticalSection()
435 {
436  unsigned char* mem = (unsigned char*)btAlignedAlloc(sizeof(btWin32CriticalSection), 16);
437  btWin32CriticalSection* cs = new (mem) btWin32CriticalSection();
438  return cs;
439 }
440 
441 void btThreadSupportWin32::deleteCriticalSection(btCriticalSection* criticalSection)
442 {
443  criticalSection->~btCriticalSection();
444  btAlignedFree(criticalSection);
445 }
446 
447 btThreadSupportInterface* btThreadSupportInterface::create(const ConstructionInfo& info)
448 {
449  return new btThreadSupportWin32(info);
450 }
451 
452 #endif //defined(_WIN32) && BT_THREADSAFE
void BLI_kdtree_nd_() free(KDTree *tree)
Definition: kdtree_impl.h:102
#define FALSE
Definition: GHOST_C-Test.c:17
volatile int lock
#define btAlignedFree(ptr)
#define btAlignedAlloc(size, alignment)
SIMD_FORCE_INLINE const T & btMin(const T &a, const T &b)
Definition: btMinMax.h:21
static int uniqueId
Definition: btRigidBody.cpp:27
#define btAssert(x)
Definition: btScalar.h:295
btSequentialImpulseConstraintSolverMt int btPersistentManifold int btTypedConstraint int const btContactSolverInfo btIDebugDraw *debugDrawer BT_OVERRIDE
const unsigned int BT_MAX_THREAD_COUNT
Definition: btThreads.h:31
SIMD_FORCE_INLINE void clear()
clear the array, deallocated memory. Generally it is better to use array.resize(0),...
SIMD_FORCE_INLINE int size() const
return the number of elements in the array
SIMD_FORCE_INLINE void resize(int newsize, const T &fillData=T())
virtual int getCacheFriendlyNumThreads() const =0
virtual int getLogicalToPhysicalCoreRatio() const =0
virtual void waitForAllTasks()=0
virtual void runTask(int threadIndex, void *userData)=0
virtual int getNumWorkerThreads() const =0
static btThreadSupportInterface * create(const ConstructionInfo &info)
int len
Definition: draw_manager.c:108
int count
ccl_device_inline float4 mask(const int4 &mask, const float4 &a)
Definition: math_float4.h:513