/*
 * Copyright (c) 2001, 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHARED_TASKQUEUE_HPP
#define SHARE_GC_SHARED_TASKQUEUE_HPP

#include "memory/allocation.hpp"
#include "memory/padded.hpp"
#include "oops/oopsHierarchy.hpp"
#include "utilities/ostream.hpp"
#include "utilities/stack.hpp"

// Simple TaskQueue stats that are collected by default in debug builds.

#if !defined(TASKQUEUE_STATS) && defined(ASSERT)
#define TASKQUEUE_STATS 1
#elif !defined(TASKQUEUE_STATS)
#define TASKQUEUE_STATS 0
#endif

#if TASKQUEUE_STATS
#define TASKQUEUE_STATS_ONLY(code) code
#else
#define TASKQUEUE_STATS_ONLY(code)
#endif // TASKQUEUE_STATS

#if TASKQUEUE_STATS
class TaskQueueStats {
public:
  enum StatId {
    push,             // number of taskqueue pushes
    pop,              // number of taskqueue pops
    pop_slow,         // subset of taskqueue pops that were done slow-path
    steal_attempt,    // number of taskqueue steal attempts
    steal,            // number of taskqueue steals
    overflow,         // number of overflow pushes
    overflow_max_len, // max length of overflow stack
    last_stat_id
  };

public:
  inline TaskQueueStats() { reset(); }

  inline void record_push()          { ++_stats[push]; }
  inline void record_pop()           { ++_stats[pop]; }
  inline void record_pop_slow()      { record_pop(); ++_stats[pop_slow]; }
  inline void record_steal_attempt() { ++_stats[steal_attempt]; }
  inline void record_steal()         { ++_stats[steal]; }
  inline void record_overflow(size_t new_length);

  TaskQueueStats & operator +=(const TaskQueueStats & addend);

  inline size_t get(StatId id) const { return _stats[id]; }
  inline const size_t* get() const   { return _stats; }

  inline void reset();

  // Print the specified line of the header (does not include a line separator).
  static void print_header(unsigned int line, outputStream* const stream = tty,
                           unsigned int width = 10);
  // Print the statistics (does not include a line separator).
  void print(outputStream* const stream = tty, unsigned int width = 10) const;

  DEBUG_ONLY(void verify() const;)

private:
  size_t _stats[last_stat_id];
  static const char * const _names[last_stat_id];
};

void TaskQueueStats::record_overflow(size_t new_len) {
  ++_stats[overflow];
  if (new_len > _stats[overflow_max_len]) _stats[overflow_max_len] = new_len;
}

void TaskQueueStats::reset() {
  memset(_stats, 0, sizeof(_stats));
}
#endif // TASKQUEUE_STATS

// TaskQueueSuper collects functionality common to all GenericTaskQueue instances.

template <unsigned int N, MEMFLAGS F>
class TaskQueueSuper: public CHeapObj<F> {
protected:
  // Internal type for indexing the queue; also used for the tag.
  typedef NOT_LP64(uint16_t) LP64_ONLY(uint32_t) idx_t;

  // The first free element after the last one pushed (mod N).
  volatile uint _bottom;

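  // N must be a power of two so that masking with MOD_N_MASK is equivalent
  // to taking an index modulo N.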
  enum { MOD_N_MASK = N - 1 };

  class Age {
  public:
    Age(size_t data = 0)      { _data = data; }
    Age(const Age& age)       { _data = age._data; }
    Age(idx_t top, idx_t tag) { _fields._top = top; _fields._tag = tag; }

    Age  get()        const volatile { return _data; }
    void set(Age age) volatile       { _data = age._data; }

    idx_t top() const volatile { return _fields._top; }
    idx_t tag() const volatile { return _fields._tag; }

    // Increment top; if it wraps, increment tag also.
    void increment() {
      _fields._top = increment_index(_fields._top);
      if (_fields._top == 0) ++_fields._tag;
    }

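    // Atomically compare-and-exchange the combined (top, tag) word. The tag
    // exists to defeat ABA: a stealer that read (top, tag) before the owner
    // emptied and refilled the queue will fail its cmpxchg, because the tag
    // has advanced even when top has wrapped back to the same value.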
    Age cmpxchg(const Age new_age, const Age old_age) volatile;

    bool operator ==(const Age& other) const { return _data == other._data; }

  private:
    struct fields {
      idx_t _top;
      idx_t _tag;
    };
    union {
      size_t _data;
      fields _fields;
    };
  };

  volatile Age _age;

  // These both operate mod N.
  static uint increment_index(uint ind) {
    return (ind + 1) & MOD_N_MASK;
  }
  static uint decrement_index(uint ind) {
    return (ind - 1) & MOD_N_MASK;
  }

  // Returns a number in the range [0..N). If the result is "N-1", it should be
  // interpreted as 0.
  uint dirty_size(uint bot, uint top) const {
    return (bot - top) & MOD_N_MASK;
  }

  // Returns the size corresponding to the given "bot" and "top".
  uint size(uint bot, uint top) const {
    uint sz = dirty_size(bot, top);
    // Has the queue "wrapped", so that bottom is less than top? There's a
    // complicated special case here. A pair of threads could perform pop_local
    // and pop_global operations concurrently, starting from a state in which
    // _bottom == _top+1. The pop_local could succeed in decrementing _bottom,
    // and the pop_global in incrementing _top (in which case the pop_global
    // will be awarded the contested queue element.) The resulting state must
    // be interpreted as an empty queue. (We only need to worry about one such
    // event: only the queue owner performs pop_local's, and several concurrent
    // threads attempting to perform the pop_global will all perform the same
    // CAS, and only one can succeed.) Any stealing thread that reads after
    // either the increment or decrement will see an empty queue, and will not
    // join the competitors. The "sz == -1 || sz == N-1" state will not be
    // modified by concurrent threads, so the owner can safely reset the state
    // to _bottom == top, after which subsequent pushes proceed normally.
    return (sz == N - 1) ? 0 : sz;
  }
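
  // Worked example (illustrative, with N = 8 and MOD_N_MASK = 7):
  //   bot = 2, top = 6: dirty_size = (2 - 6) & 7 = 4, i.e. the four elements
  //   at indices 6, 7, 0 and 1 (the queue has wrapped around the array).
  //   After the pop_local/pop_global race described above, bot = 5 and
  //   top = 6 can coexist: dirty_size = (5 - 6) & 7 = 7 = N - 1, which
  //   size() therefore reports as 0 (empty).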

public:
  TaskQueueSuper() : _bottom(0), _age() {}

  // Return true if the TaskQueue contains any tasks.
  bool peek() const { return _bottom != _age.top(); }
  // Return true if the TaskQueue contains no tasks.
  bool is_empty() const { return size() == 0; }

  // Return an estimate of the number of elements in the queue.
  // The estimate may be inaccurate in the presence of concurrent
  // pop_local/pop_global races.
  uint size() const {
    return size(_bottom, _age.top());
  }

  uint dirty_size() const {
    return dirty_size(_bottom, _age.top());
  }

  void set_empty() {
    _bottom = 0;
    _age.set(0);
  }

  // Maximum number of elements allowed in the queue. This is two less than
  // the actual queue size: one slot is lost to distinguishing a full queue
  // from an empty one, and a second to the pop_local/pop_global race
  // described in size() above.
  uint max_elems() const { return N - 2; }

  // Total size of queue.
  static uint total_size() { return N; }

  TASKQUEUE_STATS_ONLY(TaskQueueStats stats;)
};

//
// GenericTaskQueue implements an ABP (Arora-Blumofe-Plaxton) double-
// ended queue (deque), intended for use in work stealing. Queue operations
// are non-blocking.
//
// A queue owner thread performs push() and pop_local() operations on one end
// of the queue, while other threads may steal work using the pop_global()
// method.
//
// The main difference from the original algorithm is that this
// implementation allows wrap-around at the end of its allocated
// storage, which is an array.
//
// The original paper is:
//
// Arora, N. S., Blumofe, R. D., and Plaxton, C. G.
// Thread scheduling for multiprogrammed multiprocessors.
// Theory of Computing Systems 34, 2 (2001), 115-144.
//
// The following paper provides a correctness proof and an implementation
// for weakly ordered memory models, including (pseudo-)code containing
// memory barriers, for a Chase-Lev deque. Chase-Lev is similar to ABP,
// with the main difference that it allows resizing of the underlying
// storage:
//
// Le, N. M., Pop, A., Cohen, A., and Nardelli, F. Z.
// Correct and efficient work-stealing for weak memory models.
// Proceedings of the 18th ACM SIGPLAN symposium on Principles and
// practice of parallel programming (PPoPP 2013), 69-80.
//
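//
// Typical single-queue usage (an illustrative sketch, not part of this API's
// contract; "task" and "process" are hypothetical):
//
//   GenericTaskQueue<oop, mtGC>* q = new GenericTaskQueue<oop, mtGC>();
//   q->initialize();
//   // Owner thread only: push and pop at the LIFO end.
//   q->push(task);
//   oop t;
//   while (q->pop_local(t)) process(t);
//   // Any other thread: steal from the FIFO end.
//   if (q->pop_global(t)) process(t);
//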

template <class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class GenericTaskQueue: public TaskQueueSuper<N, F> {
protected:
  typedef typename TaskQueueSuper<N, F>::Age Age;
  typedef typename TaskQueueSuper<N, F>::idx_t idx_t;

  using TaskQueueSuper<N, F>::_bottom;
  using TaskQueueSuper<N, F>::_age;
  using TaskQueueSuper<N, F>::increment_index;
  using TaskQueueSuper<N, F>::decrement_index;
  using TaskQueueSuper<N, F>::dirty_size;

public:
  using TaskQueueSuper<N, F>::max_elems;
  using TaskQueueSuper<N, F>::size;

#if TASKQUEUE_STATS
  using TaskQueueSuper<N, F>::stats;
#endif

private:
  // Slow paths for push, pop_local. (pop_global has no fast path.)
  bool push_slow(E t, uint dirty_n_elems);
  bool pop_local_slow(uint localBot, Age oldAge);

public:
  typedef E element_type;

  // Initializes the queue to empty.
  GenericTaskQueue();

  void initialize();

  // Push the task "t" on the queue. Returns "false" iff the queue is full.
  inline bool push(E t);

  // Attempts to claim a task from the "local" end of the queue (the most
  // recently pushed) as long as the number of entries exceeds the threshold.
  // If successful, returns true and sets t to the task; otherwise, returns
  // false (the queue is empty or the number of elements is at or below the
  // threshold).
  inline bool pop_local(volatile E& t, uint threshold = 0);

  // Like pop_local(), but uses the "global" end of the queue (the least
  // recently pushed).
  bool pop_global(volatile E& t);

  // Delete any resource associated with the queue.
  ~GenericTaskQueue();

  // Apply fn to each element in the task queue. The queue must not
  // be modified while iterating.
  template<typename Fn> void iterate(Fn fn);

private:
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
  // Element array.
  volatile E* _elems;

  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(E*));
  // Queue owner local variables. Not to be accessed by other threads.

  static const uint InvalidQueueId = uint(-1);
  uint _last_stolen_queue_id; // The id of the queue we last stole from

  int _seed; // Current random seed used for selecting a random queue during stealing.

  DEFINE_PAD_MINUS_SIZE(2, DEFAULT_CACHE_LINE_SIZE, sizeof(uint) + sizeof(int));
public:
  int next_random_queue_id();

  void set_last_stolen_queue_id(uint id)     { _last_stolen_queue_id = id; }
  uint last_stolen_queue_id() const          { return _last_stolen_queue_id; }
  bool is_last_stolen_queue_id_valid() const { return _last_stolen_queue_id != InvalidQueueId; }
  void invalidate_last_stolen_queue_id()     { _last_stolen_queue_id = InvalidQueueId; }
};

template<class E, MEMFLAGS F, unsigned int N>
GenericTaskQueue<E, F, N>::GenericTaskQueue() : _last_stolen_queue_id(InvalidQueueId), _seed(17 /* random number */) {
  assert(sizeof(Age) == sizeof(size_t), "Depends on this.");
}

// OverflowTaskQueue is a TaskQueue that also includes an overflow stack for
// elements that do not fit in the TaskQueue.
//
// This class hides two methods from super classes:
//
// push() - push onto the task queue or, if that fails, onto the overflow stack
// is_empty() - return true if both the TaskQueue and overflow stack are empty
//
// Note that size() is not hidden--it returns the number of elements in the
// TaskQueue, and does not include the size of the overflow stack. This
// simplifies replacement of GenericTaskQueues with OverflowTaskQueues.
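//
// A typical drain loop empties the overflow stack before the queue proper
// (an illustrative sketch; "process" is a hypothetical consumer):
//
//   OverflowTaskQueue<StarTask, mtGC>* q = ...;
//   StarTask t;
//   do {
//     while (q->pop_overflow(t)) process(t);
//     while (q->pop_local(t))    process(t);
//   } while (!q->is_empty());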
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class OverflowTaskQueue: public GenericTaskQueue<E, F, N>
{
public:
  typedef Stack<E, F>               overflow_t;
  typedef GenericTaskQueue<E, F, N> taskqueue_t;

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t onto the queue or onto the overflow stack. Return true.
  inline bool push(E t);
  // Try to push task t onto the queue only. Returns true if successful, false otherwise.
  inline bool try_push_to_taskqueue(E t);

  // Attempt to pop from the overflow stack; return true if anything was popped.
  inline bool pop_overflow(E& t);

  inline overflow_t* overflow_stack() { return &_overflow_stack; }

  inline bool taskqueue_empty() const { return taskqueue_t::is_empty(); }
  inline bool overflow_empty()  const { return _overflow_stack.is_empty(); }
  inline bool is_empty()        const {
    return taskqueue_empty() && overflow_empty();
  }

private:
  overflow_t _overflow_stack;
};

class TaskQueueSetSuper {
public:
  // Returns "true" if some TaskQueue in the set contains a task.
  virtual bool peek() = 0;
  // Returns the total number of tasks in all queues of the set.
  virtual uint tasks() const = 0;
};

template <MEMFLAGS F> class TaskQueueSetSuperImpl: public CHeapObj<F>, public TaskQueueSetSuper {
};

template<class T, MEMFLAGS F>
class GenericTaskQueueSet: public TaskQueueSetSuperImpl<F> {
public:
  typedef typename T::element_type E;

private:
  uint _n;
  T** _queues;

  bool steal_best_of_2(uint queue_num, E& t);

public:
  GenericTaskQueueSet(uint n);
  ~GenericTaskQueueSet();

  void register_queue(uint i, T* q);

  T* queue(uint n);

  // Try to steal a task from a queue other than queue_num. May perform
  // several attempts at doing so. Returns true if stealing succeeds, and
  // sets "t" to the stolen task.
  bool steal(uint queue_num, E& t);

  bool peek();
  uint tasks() const;

  uint size() const { return _n; }
};

template<class T, MEMFLAGS F> void
GenericTaskQueueSet<T, F>::register_queue(uint i, T* q) {
  assert(i < _n, "index out of range.");
  _queues[i] = q;
}

template<class T, MEMFLAGS F> T*
GenericTaskQueueSet<T, F>::queue(uint i) {
  return _queues[i];
}

template<class T, MEMFLAGS F>
bool GenericTaskQueueSet<T, F>::peek() {
  // Try all the queues.
  for (uint j = 0; j < _n; j++) {
    if (_queues[j]->peek())
      return true;
  }
  return false;
}

template<class T, MEMFLAGS F>
uint GenericTaskQueueSet<T, F>::tasks() const {
  uint n = 0;
  for (uint j = 0; j < _n; j++) {
    n += _queues[j]->size();
  }
  return n;
}
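
// Typical setup, one queue per worker thread (an illustrative sketch using
// the OopTaskQueue/OopTaskQueueSet typedefs declared near the end of this
// file):
//
//   OopTaskQueueSet* qset = new OopTaskQueueSet(n_workers);
//   for (uint i = 0; i < n_workers; i++) {
//     OopTaskQueue* q = new OopTaskQueue();
//     q->initialize();
//     qset->register_queue(i, q);
//   }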

// When to terminate from the termination protocol.
class TerminatorTerminator: public CHeapObj<mtInternal> {
public:
  virtual bool should_exit_termination() = 0;
};

// A class to aid in the termination of a set of parallel tasks using
// TaskQueueSet's for work stealing.

#undef TRACESPINNING

class ParallelTaskTerminator: public CHeapObj<mtGC> {
protected:
  uint _n_threads;
  TaskQueueSetSuper* _queue_set;

  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, 0);
  volatile uint _offered_termination;
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile uint));

#ifdef TRACESPINNING
  static uint _total_yields;
  static uint _total_spins;
  static uint _total_peeks;
#endif

  bool peek_in_queue_set();
protected:
  virtual void yield();
  void sleep(uint millis);

  // Called when exiting termination is requested.
  // When the request is made, the terminator may already have terminated
  // (e.g. all threads have arrived and offered termination). In this case,
  // it should ignore the request and complete the termination.
  // Return true if termination is completed. Otherwise, return false.
  bool complete_or_exit_termination();
public:

  // "n_threads" is the number of threads to be terminated. "queue_set" is a
  // set of work queues of other threads.
  ParallelTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
  virtual ~ParallelTaskTerminator();

  // The current thread has no work, and is ready to terminate if everyone
  // else is. If this returns "true", all threads are terminated. If it
  // returns "false", available work has been observed in one of the task
  // queues, so the global task is not complete.
  bool offer_termination() {
    return offer_termination(NULL);
  }

  // As above, but it also terminates if the should_exit_termination()
  // method of the terminator parameter returns true. If terminator is
  // NULL, then it is ignored.
  virtual bool offer_termination(TerminatorTerminator* terminator);

  // Reset the terminator, so that it may be reused again.
  // The caller is responsible for ensuring that this is done
  // in an MT-safe manner, once the previous round of use of
  // the terminator is finished.
  void reset_for_reuse();
  // Same as above but the number of parallel threads is set to the
  // given number.
  void reset_for_reuse(uint n_threads);

#ifdef TRACESPINNING
  static uint total_yields() { return _total_yields; }
  static uint total_spins()  { return _total_spins; }
  static uint total_peeks()  { return _total_peeks; }
  static void print_termination_counts();
#endif
};
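
// A typical worker loop combines draining, stealing and termination (an
// illustrative sketch; "process" is a hypothetical consumer, and
// OopTaskQueueSet is the typedef declared near the end of this file):
//
//   void worker(uint id, OopTaskQueueSet* qset, ParallelTaskTerminator* pt) {
//     oop t;
//     do {
//       while (qset->queue(id)->pop_local(t)) process(t);
//       while (qset->steal(id, t))            process(t);
//     } while (!pt->offer_termination());
//   }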

class TaskTerminator : public StackObj {
private:
  ParallelTaskTerminator* _terminator;

  // Noncopyable.
  TaskTerminator(const TaskTerminator&);
  TaskTerminator& operator=(const TaskTerminator&);
public:
  TaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
  ~TaskTerminator();

  ParallelTaskTerminator* terminator() const {
    return _terminator;
  }
};

typedef GenericTaskQueue<oop, mtGC>             OopTaskQueue;
typedef GenericTaskQueueSet<OopTaskQueue, mtGC> OopTaskQueueSet;

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning(disable:4522)
#endif

// This is a container class for either an oop* or a narrowOop*.
// Both can be pushed onto a task queue; the consumer tests is_narrow()
// to determine which should be processed.
class StarTask {
  void* _holder; // either an oop* or a narrowOop* (tagged in the low bit)

  enum { COMPRESSED_OOP_MASK = 1 };

public:
  StarTask(narrowOop* p) {
    assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
    _holder = (void*)((uintptr_t)p | COMPRESSED_OOP_MASK);
  }
  StarTask(oop* p) {
    assert(((uintptr_t)p & COMPRESSED_OOP_MASK) == 0, "Information loss!");
    _holder = (void*)p;
  }
  StarTask() { _holder = NULL; }
  operator oop*()       { return (oop*)_holder; }
  operator narrowOop*() {
    return (narrowOop*)((uintptr_t)_holder & ~COMPRESSED_OOP_MASK);
  }

  StarTask& operator=(const StarTask& t) {
    _holder = t._holder;
    return *this;
  }
  volatile StarTask& operator=(const volatile StarTask& t) volatile {
    _holder = t._holder;
    return *this;
  }

  bool is_narrow() const {
    return (((uintptr_t)_holder & COMPRESSED_OOP_MASK) != 0);
  }
};
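
// Consumers dispatch on the tag bit (an illustrative sketch; "process" is a
// hypothetical consumer and q is an OopStarTaskQueue, typedef'd below):
//
//   StarTask task;
//   if (q->pop_local(task)) {
//     if (task.is_narrow()) {
//       narrowOop* p = task; // the conversion operator strips the tag bit
//       process(p);
//     } else {
//       oop* p = task;
//       process(p);
//     }
//   }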

class ObjArrayTask
{
public:
  ObjArrayTask(oop o = NULL, int idx = 0): _obj(o), _index(idx) { }
  ObjArrayTask(oop o, size_t idx): _obj(o), _index(int(idx)) {
    assert(idx <= size_t(max_jint), "too big");
  }
  ObjArrayTask(const ObjArrayTask& t): _obj(t._obj), _index(t._index) { }

  ObjArrayTask& operator =(const ObjArrayTask& t) {
    _obj = t._obj;
    _index = t._index;
    return *this;
  }
  volatile ObjArrayTask&
  operator =(const volatile ObjArrayTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _index = t._index;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int index() const { return _index; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

private:
  oop _obj;
  int _index;
};

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef OverflowTaskQueue<StarTask, mtGC>           OopStarTaskQueue;
typedef GenericTaskQueueSet<OopStarTaskQueue, mtGC> OopStarTaskQueueSet;

typedef OverflowTaskQueue<size_t, mtGC>             RegionTaskQueue;
typedef GenericTaskQueueSet<RegionTaskQueue, mtGC>  RegionTaskQueueSet;

#endif // SHARE_GC_SHARED_TASKQUEUE_HPP