1 /*
2 * Copyright (c) 2016, 2020, Red Hat, Inc. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
26 #define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
27
28 #include "gc/shared/taskTerminator.hpp"
29 #include "gc/shared/taskqueue.hpp"
30 #include "gc/shenandoah/shenandoahPadding.hpp"
31 #include "memory/allocation.hpp"
32 #include "runtime/atomic.hpp"
33 #include "runtime/javaThread.hpp"
34 #include "runtime/mutex.hpp"
35 #include "utilities/debug.hpp"
36
37 class ShenandoahHeap;
38
39 template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
40 class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
41 {
42 public:
43 typedef OverflowTaskQueue<E, F, N> taskqueue_t;
44
45 BufferedOverflowTaskQueue() : _buf_empty(true) {};
46
47 TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
48
49 // Push task t into the queue. Returns true on success.
50 inline bool push(E t);
51
52 // Attempt to pop from the queue. Returns true on success.
53 inline bool pop(E &t);
54
55 inline void clear();
56
57 inline bool is_empty() const {
58 return _buf_empty && taskqueue_t::is_empty();
59 }
60
61 private:
62 bool _buf_empty;
63 E _elem;
64 };
65
66 // ShenandoahMarkTask
67 //
68 // Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
69 // The design goal is to make the regular oop ops very fast, because that would be the prevailing
70 // case. On the other hand, it should not block parallel array processing from efficiently dividing
71 // the array work.
72 //
73 // The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
74 // proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
75 // most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
76 // that the block has the size of 2^pow. This requires for pow to have only 5 bits (2^32) to encode
77 // all possible arrays.
78 //
79 // |xx-------oop---------|-pow-|--chunk---|
80 // 0 49 54 64
81 //
82 // By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
83 //
84 // Lower bits of oop are reserved to handle "skip_live" and "strong" properties. Since this encoding
85 // stores uncompressed oops, those bits are always available. These bits default to zero for "skip_live"
86 // and "weak". This aligns with their frequent values: strong/counted-live references.
87 //
88 // This encoding gives a few interesting benefits:
89 //
90 // a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
91 //
92 // |---------oop---------|00000|0000000000| // no chunk data
93 //
94 // This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
95 // with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
96 //
97 // b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
98 // interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
99 // <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
100 // <2*C, P-1>, that covers interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) )
101 //
102 // Observe that the union of these two intervals is:
103 // [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
104 //
105 // ...which is the original interval:
106 // [ (C-1)*2^P; C*2^P )
107 //
108 // c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
109 // down in the parallel threads, which alleviates the upfront (serial) splitting costs.
110 //
111 // Encoding limitations caused by current bitscales mean:
112 // 10 bits for chunk: max 1024 blocks per array
113 // 5 bits for power: max 2^32 array
114 // 49 bits for oop: max 512 TB of addressable space
115 //
116 // Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits
117 // potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
118 // In future, these might be rebalanced to favor one degree of freedom against another. For example,
119 // if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could regain
120 // some bits back if chunks are counted in ObjArrayMarkingStride units.
121 //
122 // There is also a fallback version that uses plain fields, when we don't have enough space to steal the
123 // bits from the native pointer. It is useful to debug the optimized version.
124 //
125
126 #ifdef _MSC_VER
127 #pragma warning(push)
128 // warning C4522: multiple assignment operators specified
129 #pragma warning( disable:4522 )
130 #endif
131
132 #ifdef _LP64
133 #define SHENANDOAH_OPTIMIZED_MARKTASK 1
134 #else
135 #define SHENANDOAH_OPTIMIZED_MARKTASK 0
136 #endif
137
138 #if SHENANDOAH_OPTIMIZED_MARKTASK
139 class ShenandoahMarkTask
140 {
141 private:
142 // Everything is encoded into this field...
143 uintptr_t _obj;
144
145 // ...with these:
146 static const uint8_t chunk_bits = 10;
147 static const uint8_t pow_bits = 5;
148 static const uint8_t oop_bits = sizeof(uintptr_t)*8 - chunk_bits - pow_bits;
149
150 static const uint8_t oop_shift = 0;
151 static const uint8_t pow_shift = oop_bits;
152 static const uint8_t chunk_shift = oop_bits + pow_bits;
153
154 static const uintptr_t oop_extract_mask = right_n_bits(oop_bits) - 3;
155 static const uintptr_t skip_live_extract_mask = 1 << 0;
156 static const uintptr_t weak_extract_mask = 1 << 1;
157 static const uintptr_t chunk_pow_extract_mask = ~right_n_bits(oop_bits);
158
159 static const int chunk_range_mask = right_n_bits(chunk_bits);
160 static const int pow_range_mask = right_n_bits(pow_bits);
161
162 inline oop decode_oop(uintptr_t val) const {
163 STATIC_ASSERT(oop_shift == 0);
164 return cast_to_oop(val & oop_extract_mask);
165 }
166
167 inline bool decode_not_chunked(uintptr_t val) const {
168 // No need to shift for a comparison to zero
169 return (val & chunk_pow_extract_mask) == 0;
170 }
171
172 inline int decode_chunk(uintptr_t val) const {
173 return (int) ((val >> chunk_shift) & chunk_range_mask);
174 }
175
176 inline int decode_pow(uintptr_t val) const {
177 return (int) ((val >> pow_shift) & pow_range_mask);
178 }
179
180 inline bool decode_weak(uintptr_t val) const {
181 return (val & weak_extract_mask) != 0;
182 }
183
184 inline bool decode_cnt_live(uintptr_t val) const {
185 return (val & skip_live_extract_mask) == 0;
186 }
187
188 inline uintptr_t encode_oop(oop obj, bool skip_live, bool weak) const {
189 STATIC_ASSERT(oop_shift == 0);
190 uintptr_t encoded = cast_from_oop<uintptr_t>(obj);
191 if (skip_live) {
192 encoded |= skip_live_extract_mask;
193 }
194 if (weak) {
195 encoded |= weak_extract_mask;
196 }
197 return encoded;
198 }
199
200 inline uintptr_t encode_chunk(int chunk) const {
201 return ((uintptr_t) chunk) << chunk_shift;
202 }
203
204 inline uintptr_t encode_pow(int pow) const {
205 return ((uintptr_t) pow) << pow_shift;
206 }
207
208 public:
209 ShenandoahMarkTask(oop o = nullptr, bool skip_live = false, bool weak = false) {
210 uintptr_t enc = encode_oop(o, skip_live, weak);
211 assert(decode_oop(enc) == o, "oop encoding should work: " PTR_FORMAT, p2i(o));
212 assert(decode_cnt_live(enc) == !skip_live, "skip_live encoding should work");
213 assert(decode_weak(enc) == weak, "weak encoding should work");
214 assert(decode_not_chunked(enc), "task should not be chunked");
215 _obj = enc;
216 }
217
218 ShenandoahMarkTask(oop o, bool skip_live, bool weak, int chunk, int pow) {
219 uintptr_t enc_oop = encode_oop(o, skip_live, weak);
220 uintptr_t enc_chunk = encode_chunk(chunk);
221 uintptr_t enc_pow = encode_pow(pow);
222 uintptr_t enc = enc_oop | enc_chunk | enc_pow;
223 assert(decode_oop(enc) == o, "oop encoding should work: " PTR_FORMAT, p2i(o));
224 assert(decode_cnt_live(enc) == !skip_live, "skip_live should be true for chunked tasks");
225 assert(decode_weak(enc) == weak, "weak encoding should work");
226 assert(decode_chunk(enc) == chunk, "chunk encoding should work: %d", chunk);
227 assert(decode_pow(enc) == pow, "pow encoding should work: %d", pow);
228 assert(!decode_not_chunked(enc), "task should be chunked");
229 _obj = enc;
230 }
231
232 // Trivially copyable.
233
234 public:
235 inline oop obj() const { return decode_oop(_obj); }
236 inline int chunk() const { return decode_chunk(_obj); }
237 inline int pow() const { return decode_pow(_obj); }
238
239 inline bool is_not_chunked() const { return decode_not_chunked(_obj); }
240 inline bool is_weak() const { return decode_weak(_obj); }
241 inline bool count_liveness() const { return decode_cnt_live(_obj); }
242
243 DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.
244
245 static uintptr_t max_addressable() {
246 return nth_bit(oop_bits);
247 }
248
249 static int chunk_size() {
250 return nth_bit(chunk_bits);
251 }
252 };
253 #else
254 class ShenandoahMarkTask
255 {
256 private:
257 static const uint8_t chunk_bits = 10;
258 static const uint8_t pow_bits = 5;
259
260 static const int chunk_max = nth_bit(chunk_bits) - 1;
261 static const int pow_max = nth_bit(pow_bits) - 1;
262
263 oop _obj;
264 bool _skip_live;
265 bool _weak;
266 int _chunk;
267 int _pow;
268
269 public:
270 ShenandoahMarkTask(oop o = nullptr, bool skip_live = false, bool weak = false, int chunk = 0, int pow = 0):
271 _obj(o), _skip_live(skip_live), _weak(weak), _chunk(chunk), _pow(pow) {
272 assert(0 <= chunk && chunk <= chunk_max, "chunk is in range: %d", chunk);
273 assert(0 <= pow && pow <= pow_max, "pow is in range: %d", pow);
274 }
275
276 // Trivially copyable.
277
278 inline oop obj() const { return _obj; }
279 inline int chunk() const { return _chunk; }
280 inline int pow() const { return _pow; }
281 inline bool is_not_chunked() const { return _chunk == 0; }
282 inline bool is_weak() const { return _weak; }
283 inline bool count_liveness() const { return !_skip_live; }
284
285 DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.
286
287 static size_t max_addressable() {
288 return sizeof(oop);
289 }
290
291 static int chunk_size() {
292 return nth_bit(chunk_bits);
293 }
294 };
295 #endif // SHENANDOAH_OPTIMIZED_MARKTASK
296
297 #ifdef _MSC_VER
298 #pragma warning(pop)
299 #endif
300
301 typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
302 typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
303
304 template <class T, MEMFLAGS F>
305 class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
306 private:
307 shenandoah_padding(0);
308 volatile jint _claimed_index;
309 shenandoah_padding(1);
310
311 debug_only(uint _reserved; )
312
313 public:
314 using GenericTaskQueueSet<T, F>::size;
315
316 public:
317 ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
318 debug_only(_reserved = 0; )
319 }
320
321 void clear_claimed() { _claimed_index = 0; }
322 T* claim_next();
323
324 // reserve queues that not for parallel claiming
325 void reserve(uint n) {
326 assert(n <= size(), "Sanity");
327 _claimed_index = (jint)n;
328 debug_only(_reserved = n;)
329 }
330
331 debug_only(uint get_reserved() const { return (uint)_reserved; })
332 };
333
334 template <class T, MEMFLAGS F>
335 T* ParallelClaimableQueueSet<T, F>::claim_next() {
336 jint size = (jint)GenericTaskQueueSet<T, F>::size();
337
338 if (_claimed_index >= size) {
339 return nullptr;
340 }
341
342 jint index = Atomic::add(&_claimed_index, 1, memory_order_relaxed);
343
344 if (index <= size) {
345 return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
346 } else {
347 return nullptr;
348 }
349 }
350
351 class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
352 public:
353 ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}
354
355 bool is_empty();
356 void clear();
357 };
358
359 class ShenandoahTerminatorTerminator : public TerminatorTerminator {
360 private:
361 ShenandoahHeap* _heap;
362 public:
363 ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
364 virtual bool should_exit_termination();
365 };
366
367 #endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP