1 | /* |
2 | * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved. |
3 | * |
4 | * This code is free software; you can redistribute it and/or modify it |
5 | * under the terms of the GNU General Public License version 2 only, as |
6 | * published by the Free Software Foundation. |
7 | * |
8 | * This code is distributed in the hope that it will be useful, but WITHOUT |
9 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
10 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
11 | * version 2 for more details (a copy is included in the LICENSE file that |
12 | * accompanied this code). |
13 | * |
14 | * You should have received a copy of the GNU General Public License version |
15 | * 2 along with this work; if not, write to the Free Software Foundation, |
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
17 | * |
18 | * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
19 | * or visit www.oracle.com if you need additional information or have any |
20 | * questions. |
21 | * |
22 | */ |
23 | |
24 | #ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP |
25 | #define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP |
26 | #include "gc/shared/owstTaskTerminator.hpp" |
27 | #include "gc/shared/taskqueue.hpp" |
28 | #include "memory/allocation.hpp" |
29 | #include "runtime/mutex.hpp" |
30 | #include "runtime/thread.hpp" |
31 | |
// OverflowTaskQueue with an additional one-element buffer in front of it:
// _elem holds a single buffered task whenever _buf_empty is false. As the
// is_empty() implementation shows, the queue counts as empty only when both
// the buffer and the backing OverflowTaskQueue are empty.
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {};

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  // Reset the queue (buffer and backing queue) to the empty state.
  inline void clear();

  inline bool is_empty() const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty;  // true when _elem does not hold a buffered task
  E _elem;          // single-task buffer in front of the backing queue
};
58 | |
59 | // ObjArrayChunkedTask |
60 | // |
61 | // Encodes both regular oops, and the array oops plus chunking data for parallel array processing. |
62 | // The design goal is to make the regular oop ops very fast, because that would be the prevailing |
63 | // case. On the other hand, it should not block parallel array processing from efficiently dividing |
64 | // the array work. |
65 | // |
66 | // The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the |
67 | // proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the |
68 | // most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed |
// that the block has the size of 2^pow. This requires pow to have only 5 bits (2^32) to encode
70 | // all possible arrays. |
71 | // |
72 | // |---------oop---------|-pow-|--chunk---| |
73 | // 0 49 54 64 |
74 | // |
75 | // By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1. |
76 | // |
77 | // This encoding gives a few interesting benefits: |
78 | // |
79 | // a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task: |
80 | // |
81 | // |---------oop---------|00000|0000000000| // no chunk data |
82 | // |
83 | // This helps the most ubiquitous path. The initialization amounts to putting the oop into the word |
84 | // with zero padding. Testing for "chunkedness" is testing for zero with chunk mask. |
85 | // |
86 | // b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers |
87 | // interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks: |
88 | // <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) ) |
89 | // <2*C, P-1>, that covers interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) ) |
90 | // |
91 | // Observe that the union of these two intervals is: |
92 | // [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) ) |
93 | // |
94 | // ...which is the original interval: |
95 | // [ (C-1)*2^P; C*2^P ) |
96 | // |
97 | // c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split |
98 | // down in the parallel threads, which alleviates the upfront (serial) splitting costs. |
99 | // |
100 | // Encoding limitations caused by current bitscales mean: |
101 | // 10 bits for chunk: max 1024 blocks per array |
102 | // 5 bits for power: max 2^32 array |
103 | // 49 bits for oop: max 512 TB of addressable space |
104 | // |
105 | // Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits |
106 | // potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled. |
107 | // In future, these might be rebalanced to favor one degree of freedom against another. For example, |
108 | // if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could regain |
109 | // some bits back if chunks are counted in ObjArrayMarkingStride units. |
110 | // |
111 | // There is also a fallback version that uses plain fields, when we don't have enough space to steal the |
112 | // bits from the native pointer. It is useful to debug the optimized version. |
113 | // |
114 | |
115 | #ifdef _MSC_VER |
116 | #pragma warning(push) |
117 | // warning C4522: multiple assignment operators specified |
118 | #pragma warning( disable:4522 ) |
119 | #endif |
120 | |
121 | #ifdef _LP64 |
122 | #define SHENANDOAH_OPTIMIZED_OBJTASK 1 |
123 | #else |
124 | #define SHENANDOAH_OPTIMIZED_OBJTASK 0 |
125 | #endif |
126 | |
127 | #if SHENANDOAH_OPTIMIZED_OBJTASK |
// Optimized flavor: the oop, the chunk id, and the chunk power are packed
// into a single uintptr_t word, per the encoding description above.
class ObjArrayChunkedTask
{
public:
  // Field widths of the packed word; oop gets whatever is left after chunk
  // and pow are carved out (49 bits on 64-bit platforms).
  enum {
    chunk_bits = 10,
    pow_bits = 5,
    oop_bits = sizeof(uintptr_t)*8 - chunk_bits - pow_bits
  };
  // Field positions: oop occupies the low bits, so a plain (non-chunked)
  // oop encodes with all upper bits zero.
  enum {
    oop_shift = 0,
    pow_shift = oop_shift + oop_bits,
    chunk_shift = pow_shift + pow_bits
  };

public:
  // Wrap a plain oop; chunk and pow fields remain zero ("no chunk").
  ObjArrayChunkedTask(oop o = NULL) {
    assert(oopDesc::equals_raw(decode_oop(encode_oop(o)), o), "oop can be encoded: " PTR_FORMAT, p2i(o));
    _obj = encode_oop(o);
  }
  // Wrap an array oop plus its chunk id and chunk-size power. The asserts
  // verify each field round-trips through the bit encoding losslessly.
  ObjArrayChunkedTask(oop o, int chunk, int pow) {
    assert(oopDesc::equals_raw(decode_oop(encode_oop(o)), o), "oop can be encoded: " PTR_FORMAT, p2i(o));
    assert(decode_chunk(encode_chunk(chunk)) == chunk, "chunk can be encoded: %d" , chunk);
    assert(decode_pow(encode_pow(pow)) == pow, "pow can be encoded: %d" , pow);
    _obj = encode_oop(o) | encode_chunk(chunk) | encode_pow(pow);
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  // Volatile flavor of assignment. NOTE(review): presumably required because
  // task queue slots are accessed through volatile references -- confirm
  // against gc/shared/taskqueue.hpp. (Triggers MSC warning C4522, disabled
  // around this class.)
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  // Extract the oop from the low oop_bits of the packed word.
  inline oop decode_oop(uintptr_t val) const {
    return (oop) reinterpret_cast<void*>((val >> oop_shift) & right_n_bits(oop_bits));
  }

  // Extract the chunk id from the top chunk_bits of the packed word.
  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & right_n_bits(chunk_bits));
  }

  // Extract the chunk-size power from its pow_bits field.
  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & right_n_bits(pow_bits));
  }

  inline uintptr_t encode_oop(oop obj) const {
    return ((uintptr_t)(void*) obj) << oop_shift;
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

  inline oop obj() const { return decode_oop(_obj); }
  inline int chunk() const { return decode_chunk(_obj); }
  inline int pow() const { return decode_pow(_obj); }
  // chunk == 0 means "no chunk" (see notes above). The mask clears the oop
  // and pow fields, so only the chunk field is tested for zero.
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  // Largest encodable address: 2^oop_bits (512 TB with 49 oop bits).
  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  // Maximum number of chunks per array: 2^chunk_bits.
  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj;  // packed |oop|pow|chunk| word
};
207 | #else |
// Fallback flavor for builds without _LP64: there are not enough bits to
// steal from the native pointer, so the oop, chunk id, and power live in
// plain fields. Also useful for debugging the optimized version (see above).
class ObjArrayChunkedTask
{
public:
  // Same logical field widths as the optimized flavor, so asserts below can
  // keep chunk/pow within identical ranges on both flavors.
  enum {
    chunk_bits = 10,
    pow_bits = 5,
  };
public:
  // Wrap an oop with optional chunking data; chunk == 0 means "no chunk".
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), "chunk is sane: %d" , chunk);
    assert(0 <= pow && pow < nth_bit(pow_bits), "pow is sane: %d" , pow);
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  // Volatile flavor of assignment. NOTE(review): presumably required because
  // task queue slots are accessed through volatile references -- confirm
  // against gc/shared/taskqueue.hpp.
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj() const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow() const { return _pow; }

  // chunk == 0 means "no chunk", matching the optimized encoding.
  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  // NOTE(review): returns the pointer size, unlike the optimized flavor
  // which returns an address-space limit. No bits are stolen here, so the
  // encoding does not constrain addressability -- confirm intended meaning.
  static size_t max_addressable() {
    return sizeof(oop);
  }

  // Maximum number of chunks per array: 2^chunk_bits.
  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;    // the (array) oop being processed
  int _chunk;  // chunk id, 1-based; 0 means "no chunk"
  int _pow;    // log2 of the chunk size
};
259 | #endif // SHENANDOAH_OPTIMIZED_OBJTASK |
260 | |
261 | #ifdef _MSC_VER |
262 | #pragma warning(pop) |
263 | #endif |
264 | |
// The Shenandoah marking task, and the buffered/padded queue types built on it.
typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
268 | |
// A task queue set whose queues can be claimed one at a time by parallel
// workers, via an atomically bumped claim index.
template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  // Pad the claim counter out to a full cache line, keeping this hot,
  // frequently-updated word away from unrelated data.
  DEFINE_PAD_MINUS_SIZE(0, DEFAULT_CACHE_LINE_SIZE, sizeof(volatile jint));
  volatile jint _claimed_index;  // index of the next queue to hand out
  DEFINE_PAD_MINUS_SIZE(1, DEFAULT_CACHE_LINE_SIZE, 0);

  debug_only(uint _reserved; )   // debug-only record of reserve(n)

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  // Make all queues claimable again.
  void clear_claimed() { _claimed_index = 0; }

  // Atomically claim the next unclaimed queue; NULL when none remain.
  T* claim_next();

  // Reserve the first n queues so they are not handed out by claim_next().
  void reserve(uint n) {
    assert(n <= size(), "Sanity" );
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};
298 | |
299 | template <class T, MEMFLAGS F> |
300 | T* ParallelClaimableQueueSet<T, F>::claim_next() { |
301 | jint size = (jint)GenericTaskQueueSet<T, F>::size(); |
302 | |
303 | if (_claimed_index >= size) { |
304 | return NULL; |
305 | } |
306 | |
307 | jint index = Atomic::add(1, &_claimed_index); |
308 | |
309 | if (index <= size) { |
310 | return GenericTaskQueueSet<T, F>::queue((uint)index - 1); |
311 | } else { |
312 | return NULL; |
313 | } |
314 | } |
315 | |
// The queue set used for Shenandoah marking: parallel claiming, plus
// set-wide emptiness/clear operations and optional task queue statistics
// (out-of-line implementations live in the .cpp file).
class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  // Set-wide emptiness check and clear.
  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats() const;
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};
329 | |
// Termination condition plugged into the terminator protocol: tells workers
// to abandon termination as soon as the GC has been cancelled.
class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  // return true, terminates immediately, even if there's remaining work left
  virtual bool should_exit_termination() { return _heap->cancelled_gc(); }
};
338 | |
339 | class ShenandoahTaskTerminator : public StackObj { |
340 | private: |
341 | OWSTTaskTerminator* const _terminator; |
342 | public: |
343 | ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set); |
344 | ~ShenandoahTaskTerminator(); |
345 | |
346 | bool offer_termination(ShenandoahTerminatorTerminator* terminator) { |
347 | return _terminator->offer_termination(terminator); |
348 | } |
349 | |
350 | void reset_for_reuse() { _terminator->reset_for_reuse(); } |
351 | bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); } |
352 | }; |
353 | |
354 | #endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP |
355 | |