/*
 * Copyright (c) 2016, 2020, Red Hat, Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP

#include "gc/shared/taskTerminator.hpp"
#include "gc/shared/taskqueue.hpp"
#include "gc/shenandoah/shenandoahPadding.hpp"
#include "memory/allocation.hpp"
#include "runtime/atomic.hpp"
#include "runtime/javaThread.hpp"
#include "runtime/mutex.hpp"
#include "utilities/debug.hpp"

// An OverflowTaskQueue with a single-element buffer in front of the shared
// queue. The buffered element is visible to is_empty() via _buf_empty, in
// addition to the underlying queue's own emptiness check. push/pop/clear are
// declared inline here but defined out of line (presumably in the companion
// .inline.hpp — confirm when editing those).
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  // Start with an empty buffer; the base queue initializes itself.
  BufferedOverflowTaskQueue() : _buf_empty(true) {};

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  // Discard all tasks (buffered element and queue contents).
  inline void clear();

  // Empty only when both the one-element buffer and the underlying
  // overflow queue hold no tasks. (Shadows, not overrides, the base
  // class is_empty().)
  inline bool is_empty() const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty;  // true when _elem does not hold a buffered task
  E _elem;          // one-element buffer; valid only when !_buf_empty
};

// ShenandoahMarkTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop ops very fast, because that would be the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. This requires for pow to have only 5 bits (2^32) to encode
// all possible arrays.
//
//    |xx-------oop---------|-pow-|--chunk---|
//    0                     49    54         64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// Lower bits of oop are reserved to handle "skip_live" and "strong" properties. Since this encoding
// stores uncompressed oops, those bits are always available. These bits default to zero for "skip_live"
// and "weak". This aligns with their frequent values: strong/counted-live references.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
//    This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
//    with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible.
//    Suppose we have chunk <C, P> that covers
//    interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) )
//
//    Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
//    ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
//    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
// Encoding limitations caused by current bitscales mean:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32 array
//    49 bits for oop: max 512 TB of addressable space
//
// Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits
// potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
// In future, these might be rebalanced to favor one degree of freedom against another. For example,
// if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could regain
// some bits back if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, when we don't have enough space to steal the
// bits from the native pointer. It is useful to debug the optimized version.
//

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning( disable:4522 )
#endif

// The optimized (bit-stealing) encoding needs spare high bits in the native
// pointer, so it is only enabled on 64-bit builds.
#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_MARKTASK 1
#else
#define SHENANDOAH_OPTIMIZED_MARKTASK 0
#endif

#if SHENANDOAH_OPTIMIZED_MARKTASK
class ShenandoahMarkTask
{
private:
  // Everything is encoded into this field...
  uintptr_t _obj;

  // ...with these:
  static const uint8_t chunk_bits = 10;
  static const uint8_t pow_bits = 5;
  static const uint8_t oop_bits = sizeof(uintptr_t)*8 - chunk_bits - pow_bits;

  static const uint8_t oop_shift = 0;
  static const uint8_t pow_shift = oop_bits;
  static const uint8_t chunk_shift = oop_bits + pow_bits;

  // oop_extract_mask keeps the low oop_bits, minus the two lowest bits
  // (subtracting 3 clears bits 0 and 1), which carry the flags below.
  static const uintptr_t oop_extract_mask = right_n_bits(oop_bits) - 3;
  static const uintptr_t skip_live_extract_mask = 1 << 0;
  static const uintptr_t weak_extract_mask = 1 << 1;
  // All bits above the oop field: pow and chunk together.
  static const uintptr_t chunk_pow_extract_mask = ~right_n_bits(oop_bits);

  static const int chunk_range_mask = right_n_bits(chunk_bits);
  static const int pow_range_mask = right_n_bits(pow_bits);

  // Strip flag and chunk/pow bits, leaving the raw oop.
  inline oop decode_oop(uintptr_t val) const {
    STATIC_ASSERT(oop_shift == 0);
    return cast_to_oop(val & oop_extract_mask);
  }

  // True when no chunk/pow data is encoded (chunk == 0 by convention).
  inline bool decode_not_chunked(uintptr_t val) const {
    // No need to shift for a comparison to zero
    return (val & chunk_pow_extract_mask) == 0;
  }

  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & chunk_range_mask);
  }

  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & pow_range_mask);
  }

  inline bool decode_weak(uintptr_t val) const {
    return (val & weak_extract_mask) != 0;
  }

  // "count liveness" is the inverse of the skip_live flag.
  inline bool decode_cnt_live(uintptr_t val) const {
    return (val & skip_live_extract_mask) == 0;
  }

  // Encode the oop plus the two flag bits; chunk/pow bits are left zero here
  // and OR-ed in separately by the chunked constructor.
  inline uintptr_t encode_oop(oop obj, bool skip_live, bool weak) const {
    STATIC_ASSERT(oop_shift == 0);
    uintptr_t encoded = cast_from_oop<uintptr_t>(obj);
    if (skip_live) {
      encoded |= skip_live_extract_mask;
    }
    if (weak) {
      encoded |= weak_extract_mask;
    }
    return encoded;
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

public:
  // Non-chunked task: a regular oop with optional flags. The asserts verify
  // the encoding round-trips, i.e. the oop had no stray bits.
  ShenandoahMarkTask(oop o = nullptr, bool skip_live = false, bool weak = false) {
    uintptr_t enc = encode_oop(o, skip_live, weak);
    assert(decode_oop(enc) == o, "oop encoding should work: " PTR_FORMAT, p2i(o));
    assert(decode_cnt_live(enc) == !skip_live, "skip_live encoding should work");
    assert(decode_weak(enc) == weak, "weak encoding should work");
    assert(decode_not_chunked(enc), "task should not be chunked");
    _obj = enc;
  }

  // Chunked task: array oop plus <chunk, pow> block coordinates
  // (chunk must be non-zero, enforced by the last assert).
  ShenandoahMarkTask(oop o, bool skip_live, bool weak, int chunk, int pow) {
    uintptr_t enc_oop = encode_oop(o, skip_live, weak);
    uintptr_t enc_chunk = encode_chunk(chunk);
    uintptr_t enc_pow = encode_pow(pow);
    uintptr_t enc = enc_oop | enc_chunk | enc_pow;
    assert(decode_oop(enc) == o, "oop encoding should work: " PTR_FORMAT, p2i(o));
    assert(decode_cnt_live(enc) == !skip_live, "skip_live should be true for chunked tasks");
    assert(decode_weak(enc) == weak, "weak encoding should work");
    assert(decode_chunk(enc) == chunk, "chunk encoding should work: %d", chunk);
    assert(decode_pow(enc) == pow, "pow encoding should work: %d", pow);
    assert(!decode_not_chunked(enc), "task should be chunked");
    _obj = enc;
  }

  // Trivially copyable.

public:
  inline oop obj() const { return decode_oop(_obj); }
  inline int chunk() const { return decode_chunk(_obj); }
  inline int pow() const { return decode_pow(_obj); }

  inline bool is_not_chunked() const { return decode_not_chunked(_obj); }
  inline bool is_weak() const { return decode_weak(_obj); }
  inline bool count_liveness() const { return decode_cnt_live(_obj); }

  DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.

  // Upper bound (exclusive) on addresses representable in the oop field.
  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  // Maximum number of chunks per array (2^chunk_bits).
  static int chunk_size() {
    return nth_bit(chunk_bits);
  }
};
#else
// Fallback version with plain fields, for platforms where we cannot steal
// bits from the native pointer (see the file header comment). Keeps the same
// public interface as the optimized version.
class ShenandoahMarkTask
{
private:
  static const uint8_t chunk_bits = 10;
  static const uint8_t pow_bits = 5;

  static const int chunk_max = nth_bit(chunk_bits) - 1;
  static const int pow_max = nth_bit(pow_bits) - 1;

  oop _obj;
  bool _skip_live;
  bool _weak;
  int _chunk;  // 0 means "not chunked"
  int _pow;

public:
  // One constructor covers both the regular and the chunked case;
  // range checks mirror what the optimized encoding can represent.
  ShenandoahMarkTask(oop o = nullptr, bool skip_live = false, bool weak = false, int chunk = 0, int pow = 0):
    _obj(o), _skip_live(skip_live), _weak(weak), _chunk(chunk), _pow(pow) {
    assert(0 <= chunk && chunk <= chunk_max, "chunk is in range: %d", chunk);
    assert(0 <= pow && pow <= pow_max, "pow is in range: %d", pow);
  }

  // Trivially copyable.

  inline oop obj() const { return _obj; }
  inline int chunk() const { return _chunk; }
  inline int pow() const { return _pow; }
  inline bool is_not_chunked() const { return _chunk == 0; }
  inline bool is_weak() const { return _weak; }
  inline bool count_liveness() const { return !_skip_live; }

  DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.

  // NOTE(review): unlike the optimized version, this returns sizeof(oop),
  // not an address-space bound — confirm callers treat the fallback
  // value accordingly.
  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }
};
#endif // SHENANDOAH_OPTIMIZED_MARKTASK

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;

// A task-queue set whose queues can be claimed one-by-one by worker threads
// via claim_next(). The claim cursor is padded to keep it on its own cache
// line, away from neighboring data.
template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  shenandoah_padding(0);
  volatile jint _claimed_index;  // next queue index to hand out
  shenandoah_padding(1);

  debug_only(uint _reserved; )  // debug-only record of reserved queue count

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  // Reset the claim cursor so all queues can be claimed again.
  void clear_claimed() { _claimed_index = 0; }
  // Atomically claim the next unclaimed queue; nullptr when exhausted.
  T* claim_next();

  // Reserve the first n queues so they are excluded from parallel claiming
  // (the claim cursor starts past them).
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  // Fast-path check: racy read is fine, the atomic add below decides.
  if (_claimed_index >= size) {
    return nullptr;
  }

  // Atomic::add returns the new value, so (index - 1) is the slot this
  // thread claimed. Relaxed ordering suffices: only the claim counter
  // itself needs atomicity.
  jint index = Atomic::add(&_claimed_index, 1, memory_order_relaxed);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return nullptr;
  }
}

// The set of Shenandoah marking queues, one per worker.
class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats() const;
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

// Termination protocol hook: lets the heap veto termination of marking
// workers (see should_exit_termination() definition out of line).
class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  virtual bool should_exit_termination();
};

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP