LLVM OpenMP* Runtime Library
kmp_affinity.h
/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */
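
// Illustrative sketch (not part of the runtime sources): how the KMPAffinity
// interface above is typically driven. The member names are real; the flow
// and argument values are assumptions for exposition only.
//
//   KMPAffinity *api = new KMPHwlocAffinity(); // concrete backend
//   api->determine_capable("KMP_AFFINITY");    // probe OS/hwloc support
//   KMPAffinity::Mask *m = api->allocate_mask();
//   m->set(3);                                 // add logical CPU 3
//   m->set_system_affinity(TRUE);              // bind the calling thread
//   api->deallocate_mask(m);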

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are carved in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#elif KMP_OS_AIX
#include <sys/dr.h>
#include <sys/rset.h>
#define VMI_MAXRADS 64 // Maximum number of RADs allowed by AIX.
#endif
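
// Illustrative sketch (assumption, for exposition only): on Linux the runtime
// invokes the syscall numbers guarded above directly rather than through a
// libc wrapper, passing a caller-owned bit-mask buffer:
//
//   unsigned long buf[16] = {0}; // room for 1024 CPUs on LP64
//   long r = syscall(__NR_sched_getaffinity, 0 /* calling thread */,
//                    sizeof(buf), buf);
//   // On success r >= 0, and buf[i / 64] & (1UL << (i % 64)) tests CPU i.
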
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
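    // Worked example of the index math above (illustration): with a 64-bit
    // mask_t, set(70) selects word 70 / 64 = 1 and bit 70 % 64 = 6, i.e.
    // mask[1] |= (ONE << 6).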
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
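    // Illustrative sketch (assumption): begin()/next()/end() support forward
    // iteration over the set bits of a native mask m, e.g.
    //
    //   for (int cpu = m.begin(); cpu < m.end(); cpu = m.next(cpu))
    //     use(cpu); // hypothetical per-CPU work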
#if KMP_OS_AIX
    // On AIX, we don't have a way to get CPU(s) a thread is bound to.
    // This routine is only used to get the full mask.
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");

      (void)abort_on_error;

      // Set the mask with all CPUs that are available.
      for (int i = 0; i < __kmp_xproc; ++i)
        KMP_CPU_SET(i, this);
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");

      int location;
      int gtid = __kmp_entry_gtid();
      int tid = thread_self();

      // Unbind the thread if it was bound to any processors before so that
      // we can bind the thread to CPUs specified by the mask, not others.
      int retval = bindprocessor(BINDTHREAD, tid, PROCESSOR_CLASS_ANY);

      // On AIX, we can only bind to one instead of a set of CPUs with the
      // bindprocessor() system call.
      KMP_CPU_SET_ITERATE(location, this) {
        if (KMP_CPU_ISSET(location, this)) {
          retval = bindprocessor(BINDTHREAD, tid, location);
          if (retval == -1 && errno == 1) {
            rsid_t rsid;
            rsethandle_t rsh;
            // Put something in rsh to prevent compiler warning
            // about uninitialized use
            rsh = rs_alloc(RS_EMPTY);
            rsid.at_pid = getpid();
            if (RS_DEFAULT_RSET != ra_getrset(R_PROCESS, rsid, 0, rsh)) {
              retval = ra_detachrset(R_PROCESS, rsid, 0);
              retval = bindprocessor(BINDTHREAD, tid, location);
            }
          }
          if (retval == 0) {
            KA_TRACE(10, ("__kmp_set_system_affinity: Done binding "
                          "T#%d to cpu=%d.\n",
                          gtid, location));
            continue;
          }
          int error = errno;
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "bindprocessor()"),
                        KMP_ERR(error), __kmp_msg_null);
            KA_TRACE(10, ("__kmp_set_system_affinity: Error binding "
                          "T#%d to cpu=%d, errno=%d.\n",
                          gtid, location, error));
            return error;
          }
        }
      }
      return 0;
    }
#else // !KMP_OS_AIX
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif // KMP_OS_AIX
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_AIX */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    bool empty() const override {
      for (size_t i = 0; i < (size_t)__kmp_num_proc_groups; ++i)
        if (mask[i])
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (size_t i = 0; i < (size_t)__kmp_num_proc_groups; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
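    // Worked example (illustration): on a 128-CPU machine with two 64-CPU
    // processor groups, a mask holding only CPU 70 has mask[0] == 0 and
    // mask[1] == ((mask_t)1 << 6), so get_proc_group() returns 1 and the
    // multi-group branch above binds via GROUP_AFFINITY{ Group = 1,
    // Mask = 1 << 6 }.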
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

// Describe an attribute for a level in the machine topology
struct kmp_hw_attr_t {
  int core_type : 8;
  int core_eff : 8;
  unsigned valid : 1;
  unsigned reserved : 15;

  static const int UNKNOWN_CORE_EFF = -1;

  kmp_hw_attr_t()
      : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF),
        valid(0), reserved(0) {}
  void set_core_type(kmp_hw_core_type_t type) {
    valid = 1;
    core_type = type;
  }
  void set_core_eff(int eff) {
    valid = 1;
    core_eff = eff;
  }
  kmp_hw_core_type_t get_core_type() const {
    return (kmp_hw_core_type_t)core_type;
  }
  int get_core_eff() const { return core_eff; }
  bool is_core_type_valid() const {
    return core_type != KMP_HW_CORE_TYPE_UNKNOWN;
  }
  bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; }
  operator bool() const { return valid; }
  void clear() {
    core_type = KMP_HW_CORE_TYPE_UNKNOWN;
    core_eff = UNKNOWN_CORE_EFF;
    valid = 0;
  }
  bool contains(const kmp_hw_attr_t &other) const {
    if (!valid && !other.valid)
      return true;
    if (valid && other.valid) {
      if (other.is_core_type_valid()) {
        if (!is_core_type_valid() || (get_core_type() != other.get_core_type()))
          return false;
      }
      if (other.is_core_eff_valid()) {
        if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff()))
          return false;
      }
      return true;
    }
    return false;
  }
#if KMP_AFFINITY_SUPPORTED
  bool contains(const kmp_affinity_attrs_t &attr) const {
    if (!valid && !attr.valid)
      return true;
    if (valid && attr.valid) {
      if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN)
        return (is_core_type_valid() &&
                (get_core_type() == (kmp_hw_core_type_t)attr.core_type));
      if (attr.core_eff != UNKNOWN_CORE_EFF)
        return (is_core_eff_valid() && (get_core_eff() == attr.core_eff));
      return true;
    }
    return false;
  }
#endif // KMP_AFFINITY_SUPPORTED
  bool operator==(const kmp_hw_attr_t &rhs) const {
    return (rhs.valid == valid && rhs.core_eff == core_eff &&
            rhs.core_type == core_type);
  }
  bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); }
};
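
// Worked example (illustration) of contains() above: if attribute a specifies
// both a core type and core efficiency 1, and attribute b specifies only the
// same core type, then a.contains(b) is true (a pins down everything b
// requires), while b.contains(a) is false, since b leaves the core efficiency
// unspecified.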

#if KMP_AFFINITY_SUPPORTED
KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t));
#endif

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static const int MULTIPLE_ID = -2;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  kmp_hw_attr_t attrs;

  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and are allocated to hold up
  // to KMP_HW_LAST objects, so layers can be added without reallocating any
  // array.

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Quick topology ratios; for non-uniform topologies, each entry holds the
  // max number of itemAs per itemB,
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array; if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check whether the hardware ids are unique. Return true if they are,
  // false otherwise.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

// Functions used after canonicalize() is called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
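  // Illustration (hypothetical call): after set_equivalent_type(KMP_HW_TILE,
  // KMP_HW_L2), get_level(KMP_HW_TILE) resolves through equivalent[] to the
  // L2 layer, and any type previously aliased to KMP_HW_TILE is re-aliased
  // along with it.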
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
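  // Worked example: for the topology in the ratio comment above,
  // [ 4 packages | 6 cores/package | 2 threads/core ], with level 0 the
  // package layer, calculate_ratio(2, 0) = ratio[2] * ratio[1] = 2 * 6 = 12
  // hardware threads per package.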
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parentheses around max to avoid accidental use of the Windows max
  // macro.
  const static int USE_ALL = (std::numeric_limits<int>::max)();

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);
  // Sorting the KMP_HW_SUBSET items to follow topology order
  // All unknown topology types will be at the beginning of the subset
  static int hw_subset_compare(const void *i1, const void *i2) {
    kmp_hw_t type1 = ((const item_t *)i1)->type;
    kmp_hw_t type2 = ((const item_t *)i2)->type;
    int level1 = __kmp_topology->get_level(type1);
    int level2 = __kmp_topology->get_level(type2);
    return level1 - level2;
  }

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) {
    for (int i = 0; i < depth; ++i) {
      // Found an existing item for this layer type
      // Add the num, offset, and attr to this item
      if (items[i].type == type) {
        int idx = items[i].num_attrs++;
        if ((size_t)idx >= MAX_ATTRS)
          return;
        items[i].num[idx] = num;
        items[i].offset[idx] = offset;
        items[i].attr[idx] = attr;
        return;
      }
    }
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num_attrs = 1;
    items[depth].type = type;
    items[depth].num[0] = num;
    items[depth].offset[0] = offset;
    items[depth].attr[0] = attr;
    depth++;
    set |= (1ull << type);
  }
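  // Illustrative sketch (an assumption about the parser's calls, not a
  // verbatim trace): KMP_HW_SUBSET=2s,4c,2t would be recorded roughly as
  //
  //   subset->push_back(2, KMP_HW_SOCKET, /*offset=*/0, kmp_hw_attr_t());
  //   subset->push_back(4, KMP_HW_CORE, /*offset=*/0, kmp_hw_attr_t());
  //   subset->push_back(2, KMP_HW_THREAD, /*offset=*/0, kmp_hw_attr_t());
  //
  // i.e. 2 sockets, 4 cores per socket, 2 threads per core, offset 0 each.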
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  void sort() {
    KMP_DEBUG_ASSERT(__kmp_topology);
    qsort(items, depth, sizeof(item_t), hw_subset_compare);
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type));
      for (int j = 0; j < items[i].num_attrs; ++j) {
        printf("  num: %d, offset: %d, attr: ", items[i].num[j],
               items[i].offset[j]);
        if (!items[i].attr[j]) {
          printf(" (none)\n");
        } else {
          printf(
              " core_type = %s, core_eff = %d\n",
              __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()),
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual
   machine hierarchy, or to our best guess at what the hierarchy might be, for
   the purpose of performing an efficient barrier. In the worst case, when
   there is no machine hierarchy information, it produces a tree suitable for
   a barrier, similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /* Number of levels in the hierarchy: the allocated length of the per-level
     arrays. Levels are added (doubling capacity each time) when the machine
     is oversubscribed. */
  kmp_uint32 maxLevels;

  /* Depth of the machine configuration hierarchy: the number of levels
     actually in use along the longest root-to-leaf path. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /* Level 0 corresponds to leaves. numPerLevel[i] is the branching factor at
     level i (children per parent of a level-i node); skipPerLevel[i] is the
     number of leaf threads covered by one node at level i. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }
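
  // Worked example (illustration): for a machine with 2 threads/core,
  // 6 cores/package and 4 packages, deriveLevels() fills numPerLevel
  // leaves-first as { 2, 6, 4, 1, ... }. init() may then rebalance wide
  // levels toward the branching factor before computing
  // skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1], so
  // skipPerLevel[i] gives the number of leaf threads under one level-i node.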

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage
       of dirty value observed when static library is re-initialized multiple
       times (e.g. when non-OpenMP thread repeatedly launches/joins thread
       that uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H