1 /*
  2  * Copyright (c) 2016, 2020, Red Hat, Inc. All rights reserved.
  3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  4  *
  5  * This code is free software; you can redistribute it and/or modify it
  6  * under the terms of the GNU General Public License version 2 only, as
  7  * published by the Free Software Foundation.
  8  *
  9  * This code is distributed in the hope that it will be useful, but WITHOUT
 10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 12  * version 2 for more details (a copy is included in the LICENSE file that
 13  * accompanied this code).
 14  *
 15  * You should have received a copy of the GNU General Public License version
 16  * 2 along with this work; if not, write to the Free Software Foundation,
 17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 18  *
 19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 20  * or visit www.oracle.com if you need additional information or have any
 21  * questions.
 22  *
 23  */
 24 
 25 #ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
 26 #define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
 27 
 28 #include "gc/shared/taskTerminator.hpp"
 29 #include "gc/shared/taskqueue.hpp"
 30 #include "gc/shenandoah/shenandoahPadding.hpp"
 31 #include "memory/allocation.hpp"
 32 #include "runtime/atomic.hpp"
 33 #include "runtime/javaThread.hpp"
 34 #include "runtime/mutex.hpp"
 35 #include "utilities/debug.hpp"
 36 
 37 template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
 38 class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
 39 {
 40 public:
 41   typedef OverflowTaskQueue<E, F, N> taskqueue_t;
 42 
 43   BufferedOverflowTaskQueue() : _buf_empty(true) {};
 44 
 45   TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)
 46 
 47   // Push task t into the queue. Returns true on success.
 48   inline bool push(E t);
 49 
 50   // Attempt to pop from the queue. Returns true on success.
 51   inline bool pop(E &t);
 52 
 53   inline void clear();
 54 
 55   inline bool is_empty()        const {
 56     return _buf_empty && taskqueue_t::is_empty();
 57   }
 58 
 59 private:
 60   bool _buf_empty;
 61   E _elem;
 62 };
 63 
 64 // ShenandoahMarkTask
 65 //
 66 // Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
 67 // The design goal is to make the regular oop ops very fast, because that would be the prevailing
 68 // case. On the other hand, it should not block parallel array processing from efficiently dividing
 69 // the array work.
 70 //
 71 // The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
 72 // proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
 73 // most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
 74 // that the block has the size of 2^pow. This requires for pow to have only 5 bits (2^32) to encode
 75 // all possible arrays.
 76 //
 77 //    |xx-------oop---------|-pow-|--chunk---|
 78 //    0                    49     54        64
 79 //
 80 // By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
 81 //
 82 // Lower bits of oop are reserved to handle "skip_live" and "strong" properties. Since this encoding
 83 // stores uncompressed oops, those bits are always available. These bits default to zero for "skip_live"
 84 // and "weak". This aligns with their frequent values: strong/counted-live references.
 85 //
 86 // This encoding gives a few interesting benefits:
 87 //
 88 // a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
 89 //
 90 //    |---------oop---------|00000|0000000000| // no chunk data
 91 //
 92 //    This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
 93 //    with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
 94 //
 95 // b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
 96 // interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
 97 //      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
 98 //      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
 99 //
100 //    Observe that the union of these two intervals is:
101 //      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
102 //
103 //    ...which is the original interval:
104 //      [ (C-1)*2^P; C*2^P )
105 //
106 // c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
107 //    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
108 //
109 // Encoding limitations caused by current bitscales mean:
110 //    10 bits for chunk: max 1024 blocks per array
111 //     5 bits for power: max 2^32 array
112 //    49 bits for   oop: max 512 TB of addressable space
113 //
114 // Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits
115 // potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
116 // In future, these might be rebalanced to favor one degree of freedom against another. For example,
117 // if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could regain
118 // some bits back if chunks are counted in ObjArrayMarkingStride units.
119 //
120 // There is also a fallback version that uses plain fields, when we don't have enough space to steal the
121 // bits from the native pointer. It is useful to debug the optimized version.
122 //
123 
124 #ifdef _MSC_VER
125 #pragma warning(push)
126 // warning C4522: multiple assignment operators specified
127 #pragma warning( disable:4522 )
128 #endif
129 
130 #ifdef _LP64
131 #define SHENANDOAH_OPTIMIZED_MARKTASK 1
132 #else
133 #define SHENANDOAH_OPTIMIZED_MARKTASK 0
134 #endif
135 
136 #if SHENANDOAH_OPTIMIZED_MARKTASK
137 class ShenandoahMarkTask
138 {
139 private:
140   // Everything is encoded into this field...
141   uintptr_t _obj;
142 
143   // ...with these:
144   static const uint8_t chunk_bits  = 10;
145   static const uint8_t pow_bits    = 5;
146   static const uint8_t oop_bits    = sizeof(uintptr_t)*8 - chunk_bits - pow_bits;
147 
148   static const uint8_t oop_shift   = 0;
149   static const uint8_t pow_shift   = oop_bits;
150   static const uint8_t chunk_shift = oop_bits + pow_bits;
151 
152   static const uintptr_t oop_extract_mask       = right_n_bits(oop_bits) - 3;
153   static const uintptr_t skip_live_extract_mask = 1 << 0;
154   static const uintptr_t weak_extract_mask      = 1 << 1;
155   static const uintptr_t chunk_pow_extract_mask = ~right_n_bits(oop_bits);
156 
157   static const int chunk_range_mask = right_n_bits(chunk_bits);
158   static const int pow_range_mask   = right_n_bits(pow_bits);
159 
160   inline oop decode_oop(uintptr_t val) const {
161     STATIC_ASSERT(oop_shift == 0);
162     return cast_to_oop(val & oop_extract_mask);
163   }
164 
165   inline bool decode_not_chunked(uintptr_t val) const {
166     // No need to shift for a comparison to zero
167     return (val & chunk_pow_extract_mask) == 0;
168   }
169 
170   inline int decode_chunk(uintptr_t val) const {
171     return (int) ((val >> chunk_shift) & chunk_range_mask);
172   }
173 
174   inline int decode_pow(uintptr_t val) const {
175     return (int) ((val >> pow_shift) & pow_range_mask);
176   }
177 
178   inline bool decode_weak(uintptr_t val) const {
179     return (val & weak_extract_mask) != 0;
180   }
181 
182   inline bool decode_cnt_live(uintptr_t val) const {
183     return (val & skip_live_extract_mask) == 0;
184   }
185 
186   inline uintptr_t encode_oop(oop obj, bool skip_live, bool weak) const {
187     STATIC_ASSERT(oop_shift == 0);
188     uintptr_t encoded = cast_from_oop<uintptr_t>(obj);
189     if (skip_live) {
190       encoded |= skip_live_extract_mask;
191     }
192     if (weak) {
193       encoded |= weak_extract_mask;
194     }
195     return encoded;
196   }
197 
198   inline uintptr_t encode_chunk(int chunk) const {
199     return ((uintptr_t) chunk) << chunk_shift;
200   }
201 
202   inline uintptr_t encode_pow(int pow) const {
203     return ((uintptr_t) pow) << pow_shift;
204   }
205 
206 public:
207   ShenandoahMarkTask(oop o = nullptr, bool skip_live = false, bool weak = false) {
208     uintptr_t enc = encode_oop(o, skip_live, weak);
209     assert(decode_oop(enc) == o,     "oop encoding should work: " PTR_FORMAT, p2i(o));
210     assert(decode_cnt_live(enc) == !skip_live, "skip_live encoding should work");
211     assert(decode_weak(enc) == weak, "weak encoding should work");
212     assert(decode_not_chunked(enc),  "task should not be chunked");
213     _obj = enc;
214   }
215 
216   ShenandoahMarkTask(oop o, bool skip_live, bool weak, int chunk, int pow) {
217     uintptr_t enc_oop = encode_oop(o, skip_live, weak);
218     uintptr_t enc_chunk = encode_chunk(chunk);
219     uintptr_t enc_pow = encode_pow(pow);
220     uintptr_t enc = enc_oop | enc_chunk | enc_pow;
221     assert(decode_oop(enc) == o,       "oop encoding should work: " PTR_FORMAT, p2i(o));
222     assert(decode_cnt_live(enc) == !skip_live, "skip_live should be true for chunked tasks");
223     assert(decode_weak(enc) == weak,   "weak encoding should work");
224     assert(decode_chunk(enc) == chunk, "chunk encoding should work: %d", chunk);
225     assert(decode_pow(enc) == pow,     "pow encoding should work: %d", pow);
226     assert(!decode_not_chunked(enc),   "task should be chunked");
227     _obj = enc;
228   }
229 
230   // Trivially copyable.
231 
232 public:
233   inline oop  obj()            const { return decode_oop(_obj);   }
234   inline int  chunk()          const { return decode_chunk(_obj); }
235   inline int  pow()            const { return decode_pow(_obj);   }
236 
237   inline bool is_not_chunked() const { return decode_not_chunked(_obj); }
238   inline bool is_weak()        const { return decode_weak(_obj);        }
239   inline bool count_liveness() const { return decode_cnt_live(_obj);    }
240 
241   DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.
242 
243   static uintptr_t max_addressable() {
244     return nth_bit(oop_bits);
245   }
246 
247   static int chunk_size() {
248     return nth_bit(chunk_bits);
249   }
250 };
251 #else
252 class ShenandoahMarkTask
253 {
254 private:
255   static const uint8_t chunk_bits  = 10;
256   static const uint8_t pow_bits    = 5;
257 
258   static const int chunk_max       = nth_bit(chunk_bits) - 1;
259   static const int pow_max         = nth_bit(pow_bits) - 1;
260 
261   oop _obj;
262   bool _skip_live;
263   bool _weak;
264   int _chunk;
265   int _pow;
266 
267 public:
268   ShenandoahMarkTask(oop o = nullptr, bool skip_live = false, bool weak = false, int chunk = 0, int pow = 0):
269     _obj(o), _skip_live(skip_live), _weak(weak), _chunk(chunk), _pow(pow) {
270     assert(0 <= chunk && chunk <= chunk_max, "chunk is in range: %d", chunk);
271     assert(0 <= pow && pow <= pow_max, "pow is in range: %d", pow);
272   }
273 
274   // Trivially copyable.
275 
276   inline oop obj()             const { return _obj; }
277   inline int chunk()           const { return _chunk; }
278   inline int pow()             const { return _pow; }
279   inline bool is_not_chunked() const { return _chunk == 0; }
280   inline bool is_weak()        const { return _weak; }
281   inline bool count_liveness() const { return !_skip_live; }
282 
283   DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.
284 
285   static size_t max_addressable() {
286     return sizeof(oop);
287   }
288 
289   static int chunk_size() {
290     return nth_bit(chunk_bits);
291   }
292 };
293 #endif // SHENANDOAH_OPTIMIZED_MARKTASK
294 
295 #ifdef _MSC_VER
296 #pragma warning(pop)
297 #endif
298 
299 typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
300 typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
301 
302 template <class T, MEMFLAGS F>
303 class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
304 private:
305   shenandoah_padding(0);
306   volatile jint     _claimed_index;
307   shenandoah_padding(1);
308 
309   debug_only(uint   _reserved;  )
310 
311 public:
312   using GenericTaskQueueSet<T, F>::size;
313 
314 public:
315   ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
316     debug_only(_reserved = 0; )
317   }
318 
319   void clear_claimed() { _claimed_index = 0; }
320   T*   claim_next();
321 
322   // reserve queues that not for parallel claiming
323   void reserve(uint n) {
324     assert(n <= size(), "Sanity");
325     _claimed_index = (jint)n;
326     debug_only(_reserved = n;)
327   }
328 
329   debug_only(uint get_reserved() const { return (uint)_reserved; })
330 };
331 
332 template <class T, MEMFLAGS F>
333 T* ParallelClaimableQueueSet<T, F>::claim_next() {
334   jint size = (jint)GenericTaskQueueSet<T, F>::size();
335 
336   if (_claimed_index >= size) {
337     return nullptr;
338   }
339 
340   jint index = Atomic::add(&_claimed_index, 1, memory_order_relaxed);
341 
342   if (index <= size) {
343     return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
344   } else {
345     return nullptr;
346   }
347 }
348 
349 class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
350 public:
351   ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}
352 
353   bool is_empty();
354   void clear();
355 
356 #if TASKQUEUE_STATS
357   static void print_taskqueue_stats_hdr(outputStream* const st);
358   void print_taskqueue_stats() const;
359   void reset_taskqueue_stats();
360 #endif // TASKQUEUE_STATS
361 };
362 
363 class ShenandoahTerminatorTerminator : public TerminatorTerminator {
364 private:
365   ShenandoahHeap* _heap;
366 public:
367   ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
368   virtual bool should_exit_termination();
369 };
370 
371 #endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP