1 /* 2 * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved. 3 * 4 * This code is free software; you can redistribute it and/or modify it 5 * under the terms of the GNU General Public License version 2 only, as 6 * published by the Free Software Foundation. 7 * 8 * This code is distributed in the hope that it will be useful, but WITHOUT 9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 11 * version 2 for more details (a copy is included in the LICENSE file that 12 * accompanied this code). 13 * 14 * You should have received a copy of the GNU General Public License version 15 * 2 along with this work; if not, write to the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 17 * 18 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 19 * or visit www.oracle.com if you need additional information or have any 20 * questions. 21 * 22 */ 23 24 #ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP 25 #define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP 26 27 #include "gc_implementation/shenandoah/shenandoahPadding.hpp" 28 #include "memory/padded.hpp" 29 #include "utilities/taskqueue.hpp" 30 #include "runtime/mutex.hpp" 31 #include "utilities/debug.hpp" 32 33 class ShenandoahHeap; 34 class Thread; 35 36 template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE> 37 class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N> 38 { 39 public: 40 typedef OverflowTaskQueue<E, F, N> taskqueue_t; 41 42 BufferedOverflowTaskQueue() : _buf_empty(true) {}; 43 44 TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;) 45 46 // Push task t into the queue. Returns true on success. 47 inline bool push(E t); 48 49 // Attempt to pop from the queue. Returns true on success. 50 inline bool pop(E &t); 51 52 inline void clear(); 53 54 inline bool is_empty() const { 55 return _buf_empty && taskqueue_t::is_empty(); 56 } 57 58 private: 59 bool _buf_empty; 60 E _elem; 61 }; 62 63 #ifdef _MSC_VER 64 #pragma warning(push) 65 // warning C4522: multiple assignment operators specified 66 #pragma warning(disable:4522) 67 #endif 68 69 // ShenandoahMarkTask 70 // 71 // Encodes both regular oops, and the array oops plus chunking data for parallel array processing. 72 // The design goal is to make the regular oop ops very fast, because that would be the prevailing 73 // case. On the other hand, it should not block parallel array processing from efficiently dividing 74 // the array work. 75 // 76 // The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the 77 // proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the 78 // most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed 79 // that the block has the size of 2^pow. This requires for pow to have only 5 bits (2^32) to encode 80 // all possible arrays. 81 // 82 // |---------oop---------|-pow-|--chunk---| 83 // 0 49 54 64 84 // 85 // By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1. 86 // 87 // This encoding gives a few interesting benefits: 88 // 89 // a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task: 90 // 91 // |---------oop---------|00000|0000000000| // no chunk data 92 // 93 // This helps the most ubiquitous path. The initialization amounts to putting the oop into the word 94 // with zero padding. Testing for "chunkedness" is testing for zero with chunk mask. 95 // 96 // b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers 97 // interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks: 98 // <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) ) 99 // <2*C, P-1>, that covers interval [ (2*C - 1)*2^(P-1); 2*C*2^(P-1) ) 100 // 101 // Observe that the union of these two intervals is: 102 // [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) ) 103 // 104 // ...which is the original interval: 105 // [ (C-1)*2^P; C*2^P ) 106 // 107 // c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split 108 // down in the parallel threads, which alleviates the upfront (serial) splitting costs. 109 // 110 // Encoding limitations caused by current bitscales mean: 111 // 10 bits for chunk: max 1024 blocks per array 112 // 5 bits for power: max 2^32 array 113 // 49 bits for oop: max 512 TB of addressable space 114 // 115 // Stealing bits from oop trims down the addressable space. Stealing too few bits for chunk ID limits 116 // potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled. 117 // In future, these might be rebalanced to favor one degree of freedom against another. For example, 118 // if/when Arrays 2.0 bring 2^64-sized arrays, we might need to steal another bit for power. We could regain 119 // some bits back if chunks are counted in ObjArrayMarkingStride units. 120 // 121 // There is also a fallback version that uses plain fields, when we don't have enough space to steal the 122 // bits from the native pointer. It is useful to debug the optimized version. 123 // 124 125 #ifdef _MSC_VER 126 #pragma warning(push) 127 // warning C4522: multiple assignment operators specified 128 #pragma warning( disable:4522 ) 129 #endif 130 131 #ifdef _LP64 132 #define SHENANDOAH_OPTIMIZED_MARKTASK 1 133 #else 134 #define SHENANDOAH_OPTIMIZED_MARKTASK 0 135 #endif 136 137 #if SHENANDOAH_OPTIMIZED_MARKTASK 138 class ShenandoahMarkTask 139 { 140 private: 141 // Everything is encoded into this field... 142 uintptr_t _obj; 143 144 // ...with these: 145 static const uint8_t chunk_bits = 10; 146 static const uint8_t pow_bits = 5; 147 static const uint8_t oop_bits = sizeof(uintptr_t)*8 - chunk_bits - pow_bits; 148 149 static const uint8_t oop_shift = 0; 150 static const uint8_t pow_shift = oop_bits; 151 static const uint8_t chunk_shift = oop_bits + pow_bits; 152 153 static const uintptr_t oop_extract_mask = right_n_bits(oop_bits); 154 static const uintptr_t chunk_pow_extract_mask = ~right_n_bits(oop_bits); 155 156 static const int chunk_range_mask = right_n_bits(chunk_bits); 157 static const int pow_range_mask = right_n_bits(pow_bits); 158 159 inline oop decode_oop(uintptr_t val) const { 160 STATIC_ASSERT(oop_shift == 0); 161 return cast_to_oop(val & oop_extract_mask); 162 } 163 164 inline bool decode_not_chunked(uintptr_t val) const { 165 // No need to shift for a comparison to zero 166 return (val & chunk_pow_extract_mask) == 0; 167 } 168 169 inline int decode_chunk(uintptr_t val) const { 170 return (int) ((val >> chunk_shift) & chunk_range_mask); 171 } 172 173 inline int decode_pow(uintptr_t val) const { 174 return (int) ((val >> pow_shift) & pow_range_mask); 175 } 176 177 inline uintptr_t encode_oop(oop obj) const { 178 STATIC_ASSERT(oop_shift == 0); 179 return cast_from_oop<uintptr_t>(obj); 180 } 181 182 inline uintptr_t encode_chunk(int chunk) const { 183 return ((uintptr_t) chunk) << chunk_shift; 184 } 185 186 inline uintptr_t encode_pow(int pow) const { 187 return ((uintptr_t) pow) << pow_shift; 188 } 189 190 public: 191 ShenandoahMarkTask(oop o = NULL) { 192 uintptr_t enc = encode_oop(o); 193 assert(decode_oop(enc) == o, err_msg("oop encoding should work: " PTR_FORMAT, p2i(o))); 194 assert(decode_not_chunked(enc), "task should not be chunked"); 195 _obj = enc; 196 } 197 198 ShenandoahMarkTask(oop o, int chunk, int pow) { 199 uintptr_t enc_oop = encode_oop(o); 200 uintptr_t enc_chunk = encode_chunk(chunk); 201 uintptr_t enc_pow = encode_pow(pow); 202 uintptr_t enc = enc_oop | enc_chunk | enc_pow; 203 assert(decode_oop(enc) == o, err_msg("oop encoding should work: " PTR_FORMAT, p2i(o))); 204 assert(decode_chunk(enc) == chunk, err_msg("chunk encoding should work: %d", chunk)); 205 assert(decode_pow(enc) == pow, err_msg("pow encoding should work: %d", pow)); 206 assert(!decode_not_chunked(enc), "task should be chunked"); 207 _obj = enc; 208 } 209 210 ShenandoahMarkTask(const ShenandoahMarkTask& t): _obj(t._obj) { } 211 212 ShenandoahMarkTask& operator =(const ShenandoahMarkTask& t) { 213 _obj = t._obj; 214 return *this; 215 } 216 217 volatile ShenandoahMarkTask& 218 operator =(const volatile ShenandoahMarkTask& t) volatile { 219 (void) const_cast<uintptr_t &>(_obj = t._obj); 220 return *this; 221 } 222 223 public: 224 inline oop obj() const { return decode_oop(_obj); } 225 inline int chunk() const { return decode_chunk(_obj); } 226 inline int pow() const { return decode_pow(_obj); } 227 228 inline bool is_not_chunked() const { return decode_not_chunked(_obj); } 229 230 DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid. 231 232 static uintptr_t max_addressable() { 233 return nth_bit(oop_bits); 234 } 235 236 static int chunk_size() { 237 return nth_bit(chunk_bits); 238 } 239 }; 240 #else 241 class ShenandoahMarkTask 242 { 243 private: 244 static const uint8_t chunk_bits = 10; 245 static const uint8_t pow_bits = 5; 246 247 static const int chunk_max = nth_bit(chunk_bits) - 1; 248 static const int pow_max = nth_bit(pow_bits) - 1; 249 250 oop _obj; 251 int _chunk; 252 int _pow; 253 254 public: 255 ShenandoahMarkTask(oop o = NULL, int chunk = 0, int pow = 0): 256 _obj(o), _chunk(chunk), _pow(pow) { 257 assert(0 <= chunk && chunk < chunk_max, err_msg("chunk is sane: %d", chunk)); 258 assert(0 <= pow && pow < pow_max, err_msg("pow is sane: %d", pow)); 259 } 260 261 ShenandoahMarkTask(const ShenandoahMarkTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { } 262 263 ShenandoahMarkTask& operator =(const ShenandoahMarkTask& t) { 264 _obj = t._obj; 265 _chunk = t._chunk; 266 _pow = t._pow; 267 return *this; 268 } 269 270 volatile ShenandoahMarkTask& 271 operator =(const volatile ShenandoahMarkTask& t) volatile { 272 (void)const_cast<oop&>(_obj = t._obj); 273 _chunk = t._chunk; 274 _pow = t._pow; 275 return *this; 276 } 277 278 inline oop obj() const { return _obj; } 279 inline int chunk() const { return _chunk; } 280 inline int pow() const { return _pow; } 281 inline bool is_not_chunked() const { return _chunk == 0; } 282 283 DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid. 284 285 static size_t max_addressable() { 286 return sizeof(oop); 287 } 288 289 static int chunk_size() { 290 return nth_bit(chunk_bits); 291 } 292 }; 293 #endif // SHENANDOAH_OPTIMIZED_MARKTASK 294 295 #ifdef _MSC_VER 296 #pragma warning(pop) 297 #endif 298 299 typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue; 300 typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue; 301 302 template <class T, MEMFLAGS F> 303 class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> { 304 private: 305 shenandoah_padding(0); 306 volatile jint _claimed_index; 307 shenandoah_padding(1); 308 309 debug_only(uint _reserved; ) 310 311 public: 312 using GenericTaskQueueSet<T, F>::size; 313 314 public: 315 ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) { 316 debug_only(_reserved = 0; ) 317 } 318 319 void clear_claimed() { _claimed_index = 0; } 320 T* claim_next(); 321 322 // reserve queues that not for parallel claiming 323 void reserve(uint n) { 324 assert(n <= size(), "Sanity"); 325 _claimed_index = (jint)n; 326 debug_only(_reserved = n;) 327 } 328 329 debug_only(uint get_reserved() const { return (uint)_reserved; }) 330 }; 331 332 template <class T, MEMFLAGS F> 333 T* ParallelClaimableQueueSet<T, F>::claim_next() { 334 jint size = (jint)GenericTaskQueueSet<T, F>::size(); 335 336 if (_claimed_index >= size) { 337 return NULL; 338 } 339 340 jint index = Atomic::add(1, &_claimed_index); 341 342 if (index <= size) { 343 return GenericTaskQueueSet<T, F>::queue((uint)index - 1); 344 } else { 345 return NULL; 346 } 347 } 348 349 class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> { 350 public: 351 ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {} 352 353 bool is_empty(); 354 void clear(); 355 356 #if TASKQUEUE_STATS 357 static void print_taskqueue_stats_hdr(outputStream* const st); 358 void print_taskqueue_stats(); 359 void reset_taskqueue_stats(); 360 #endif // TASKQUEUE_STATS 361 }; 362 363 class ShenandoahTerminatorTerminator : public TerminatorTerminator { 364 private: 365 ShenandoahHeap* const _heap; 366 public: 367 ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { } 368 // return true, terminates immediately, even if there's remaining work left 369 virtual bool should_exit_termination(); 370 }; 371 372 /* 373 * This is an enhanced implementation of Google's work stealing 374 * protocol, which is described in the paper: 375 * Understanding and improving JVM GC work stealing at the data center scale 376 * (http://dl.acm.org/citation.cfm?id=2926706) 377 * 378 * Instead of a dedicated spin-master, our implementation will let spin-master to relinquish 379 * the role before it goes to sleep/wait, so allows newly arrived thread to compete for the role. 380 * The intention of above enhancement, is to reduce spin-master's latency on detecting new tasks 381 * for stealing and termination condition. 382 */ 383 384 class ShenandoahTaskTerminator: public ParallelTaskTerminator { 385 private: 386 Monitor* _blocker; 387 Thread* _spin_master; 388 389 public: 390 ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) : 391 ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) { 392 _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false); 393 } 394 395 ~ShenandoahTaskTerminator() { 396 assert(_blocker != NULL, "Can not be NULL"); 397 delete _blocker; 398 } 399 400 bool offer_termination(ShenandoahTerminatorTerminator* terminator); 401 bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); } 402 403 private: 404 bool offer_termination(TerminatorTerminator* terminator) { 405 ShouldNotReachHere(); 406 return false; 407 } 408 409 private: 410 size_t tasks_in_queue_set() { return _queue_set->tasks(); } 411 412 /* 413 * Perform spin-master task. 414 * return true if termination condition is detected 415 * otherwise, return false 416 */ 417 bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator); 418 }; 419 420 #endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP