/*
 * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#include "gc/shared/owstTaskTerminator.hpp"
#include "gc/shared/taskqueue.hpp"
#include "memory/allocation.hpp"
#include "runtime/atomic.hpp"
#include "runtime/mutex.hpp"
#include "runtime/thread.hpp"

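// BufferedOverflowTaskQueue puts a one-element buffer (_elem, flagged by _buf_empty) in
// front of an OverflowTaskQueue: in short, the most recently pushed task can be handed
// straight back to the next pop without touching the underlying queue. The push/pop/clear
// bodies live in the corresponding .inline.hpp file.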
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t onto the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop a task from the queue. Returns true on success.
  inline bool pop(E &t);

  // Clear both the buffer and the underlying queue.
  inline void clear();

  inline bool is_empty() const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty;
  E _elem;
};

// ObjArrayChunkedTask
//
// Encodes both regular oops and array oops with chunking data for parallel array processing.
// The design goal is to make the regular-oop path very fast, because that is the prevailing
// case. At the same time, it should not prevent parallel array processing from efficiently
// dividing up the array work.
//
// The idea is to steal bits from the 64-bit oop to encode the array data, if needed. For
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out
// the most efficient way to do this is to encode the array block as (chunk * 2^pow), assuming
// the block has size 2^pow. This way, pow needs only 5 bits (covering array sizes up to 2^32)
// to encode all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
// This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
// with zero padding. Testing for "chunkedness" is testing for zero with the chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
// the interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, which covers the interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     which covers the interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) )
//
// Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
// ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
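//
// As a concrete check of the arithmetic: chunk <3, 4> covers [ 2*2^4; 3*2^4 ) = [ 32; 48 ).
// Splitting gives <5, 3>, covering [ 4*2^3; 5*2^3 ) = [ 32; 40 ), and <6, 3>, covering
// [ 5*2^3; 6*2^3 ) = [ 40; 48 ), which together reassemble the original [ 32; 48 ).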
//
// c) The divide-and-conquer strategy can even start with chunk <1, round-log2-len(arr)> and split
// down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
// The current bit widths imply these encoding limits:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for pow:   arrays up to 2^32 elements
//    49 bits for oop:   max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for the chunk
// ID limits potential parallelism. Stealing too few bits for pow limits the maximum array size that
// can be handled. In the future, these might be rebalanced to favor one degree of freedom over
// another. For example, if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another
// bit for pow. We could regain some bits if chunks were counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, for when there is not enough space to
// steal bits from the native pointer. It is also useful for debugging the optimized version.
//
#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning( disable:4522 )
#endif

#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_OBJTASK 1
#else
#define SHENANDOAH_OPTIMIZED_OBJTASK 0
#endif

#if SHENANDOAH_OPTIMIZED_OBJTASK
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5,
    oop_bits    = sizeof(uintptr_t)*8 - chunk_bits - pow_bits
  };
  enum {
    oop_shift   = 0,
    pow_shift   = oop_shift + oop_bits,
    chunk_shift = pow_shift + pow_bits
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    assert(oopDesc::equals_raw(decode_oop(encode_oop(o)), o), "oop can be encoded: " PTR_FORMAT, p2i(o));
    _obj = encode_oop(o);
  }
  ObjArrayChunkedTask(oop o, int chunk, int pow) {
    assert(oopDesc::equals_raw(decode_oop(encode_oop(o)), o), "oop can be encoded: " PTR_FORMAT, p2i(o));
    assert(decode_chunk(encode_chunk(chunk)) == chunk, "chunk can be encoded: %d", chunk);
    assert(decode_pow(encode_pow(pow)) == pow, "pow can be encoded: %d", pow);
    _obj = encode_oop(o) | encode_chunk(chunk) | encode_pow(pow);
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    // Cast away the volatility of the assignment result, so that discarding it
    // does not imply a volatile read-back.
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop decode_oop(uintptr_t val) const {
    return (oop) reinterpret_cast<void*>((val >> oop_shift) & right_n_bits(oop_bits));
  }

  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & right_n_bits(chunk_bits));
  }

  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & right_n_bits(pow_bits));
  }

  inline uintptr_t encode_oop(oop obj) const {
    return ((uintptr_t)(void*) obj) << oop_shift;
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

  inline oop obj()   const { return decode_oop(_obj); }
  inline int chunk() const { return decode_chunk(_obj); }
  inline int pow()   const { return decode_pow(_obj); }
  // Zero chunk bits mean this is a regular (non-chunked) oop task.
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj;
};
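
// For instance, with the shifts above, a task for chunk 3 of power 4 holds
// _obj = encode_oop(o) | ((uintptr_t)3 << 54) | ((uintptr_t)4 << 49), and
// decode_chunk()/decode_pow() mask those fields back out.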
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits = 10,
    pow_bits   = 5
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d", chunk);
    assert(0 <= pow && pow < nth_bit(pow_bits), "pow is sane: %d", pow);
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow; }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif // SHENANDOAH_OPTIMIZED_OBJTASK

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile jint));
  volatile jint _claimed_index;
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);

  debug_only(uint _reserved; )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T* claim_next();

  // Reserve the first n queues; reserved queues are excluded from parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
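
// A sketch of the intended use of claim_next() (illustrative names, not code from this
// file): each worker claims the next unclaimed queue until the set is exhausted, e.g.
// when draining leftover work:
//
//   ShenandoahObjToScanQueue* q;
//   while ((q = queue_set->claim_next()) != NULL) {
//     drain_queue(q);  // hypothetical helper that empties the claimed queue
//   }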

class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats() const;
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  // Returning true terminates the termination protocol immediately, even if work is left.
  virtual bool should_exit_termination() { return _heap->cancelled_gc(); }
};

class ShenandoahTaskTerminator : public StackObj {
private:
  OWSTTaskTerminator* const _terminator;
public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set);
  ~ShenandoahTaskTerminator();

  bool offer_termination(ShenandoahTerminatorTerminator* terminator) {
    return _terminator->offer_termination(terminator);
  }

  void reset_for_reuse() { _terminator->reset_for_reuse(); }
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }
};
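
// A sketch of the termination protocol in a worker loop (illustrative names; the real
// loops live in the marking code): each worker drains its own queue, steals from others,
// and only then offers termination. ShenandoahTerminatorTerminator allows a cancelled GC
// to break out of the protocol even while work remains:
//
//   ShenandoahTaskTerminator terminator(nworkers, task_queues);
//   ShenandoahTerminatorTerminator tt(heap);
//   do {
//     ShenandoahMarkTask task;
//     while (queue->pop(task) || task_queues->steal(worker_id, task)) {
//       do_task(task);  // hypothetical per-task work
//     }
//   } while (!terminator.offer_termination(&tt));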

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP