/*
 * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP

#include "gc_implementation/shenandoah/shenandoahPadding.hpp"
#include "memory/padded.hpp"
#include "runtime/mutex.hpp"
#include "utilities/debug.hpp"
#include "utilities/taskqueue.hpp"

class ShenandoahHeap;
class Thread;

template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t into the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  inline void clear();

  inline bool is_empty()        const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  bool _buf_empty;
  E _elem;
};
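
// Illustration only (comment sketch, not compiled): a typical drain loop over
// this queue, using just the interface declared above. The single-element
// buffer (_elem, guarded by _buf_empty) acts as a fast path in front of the
// underlying OverflowTaskQueue; push()/pop()/clear() are defined in the
// corresponding .inline.hpp file. Task and process_task() are hypothetical.
//
//   BufferedOverflowTaskQueue<Task, mtGC> q;
//   q.push(task);
//   Task t;
//   while (q.pop(t)) {       // false once buffer, queue and overflow stack are empty
//     process_task(t);       // hypothetical consumer
//   }
//   assert(q.is_empty(), "fully drained");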
// ShenandoahMarkTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop case very fast, because that would be the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal the bits from the 64-bit oop to encode array data, if needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out, the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where it is assumed
// that the block has the size of 2^pow. This requires pow to have only 5 bits (2^32) to encode
// all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
//    This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
//    with zero padding. Testing for "chunkedness" is testing for zero with chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have chunk <C, P> that covers
//    interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
//
//    Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
//    ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//
// c) The divide-and-conquer strategy could even start with chunk <1, round-log2-len(arr)>, and split
//    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
//
// The current bitscales mean:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32 array
//    49 bits for   oop: max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for chunk ID limits
// potential parallelism. Stealing too few bits for pow limits the maximum array size that can be handled.
// In future, these might be rebalanced to favor one degree of freedom against another. For example,
// if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another bit for power. We could regain
// some bits back if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, when we don't have enough space to steal the
// bits from the native pointer. It is useful for debugging the optimized version.
//
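// Worked example (illustration only): chunk <C, P> = <3, 10> covers the element
// interval [ (3-1)*2^10; 3*2^10 ) = [ 2048; 3072 ). Splitting it yields:
//      <5, 9>, covering [ 4*2^9; 5*2^9 ) = [ 2048; 2560 )
//      <6, 9>, covering [ 5*2^9; 6*2^9 ) = [ 2560; 3072 )
// whose union is the original [ 2048; 3072 ), as expected.
//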

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning(disable:4522)
#endif

#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_MARKTASK 1
#else
#define SHENANDOAH_OPTIMIZED_MARKTASK 0
#endif

#if SHENANDOAH_OPTIMIZED_MARKTASK
class ShenandoahMarkTask
{
private:
  // Everything is encoded into this field...
  uintptr_t _obj;

  // ...with these:
  static const uint8_t chunk_bits  = 10;
  static const uint8_t pow_bits    = 5;
  static const uint8_t oop_bits    = sizeof(uintptr_t)*8 - chunk_bits - pow_bits;

  static const uint8_t oop_shift   = 0;
  static const uint8_t pow_shift   = oop_bits;
  static const uint8_t chunk_shift = oop_bits + pow_bits;

  static const uintptr_t oop_extract_mask       = right_n_bits(oop_bits);
  static const uintptr_t chunk_pow_extract_mask = ~right_n_bits(oop_bits);

  static const int chunk_range_mask = right_n_bits(chunk_bits);
  static const int pow_range_mask   = right_n_bits(pow_bits);

  inline oop decode_oop(uintptr_t val) const {
    STATIC_ASSERT(oop_shift == 0);
    return cast_to_oop(val & oop_extract_mask);
  }

  inline bool decode_not_chunked(uintptr_t val) const {
    // No need to shift for a comparison to zero
    return (val & chunk_pow_extract_mask) == 0;
  }

  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & chunk_range_mask);
  }

  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & pow_range_mask);
  }

  inline uintptr_t encode_oop(oop obj) const {
    STATIC_ASSERT(oop_shift == 0);
    return cast_from_oop<uintptr_t>(obj);
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

public:
  ShenandoahMarkTask(oop o = NULL) {
    uintptr_t enc = encode_oop(o);
    assert(decode_oop(enc) == o,    err_msg("oop encoding should work: " PTR_FORMAT, p2i(o)));
    assert(decode_not_chunked(enc), "task should not be chunked");
    _obj = enc;
  }

  ShenandoahMarkTask(oop o, int chunk, int pow) {
    uintptr_t enc_oop = encode_oop(o);
    uintptr_t enc_chunk = encode_chunk(chunk);
    uintptr_t enc_pow = encode_pow(pow);
    uintptr_t enc = enc_oop | enc_chunk | enc_pow;
    assert(decode_oop(enc) == o,       err_msg("oop encoding should work: " PTR_FORMAT, p2i(o)));
    assert(decode_chunk(enc) == chunk, err_msg("chunk encoding should work: %d", chunk));
    assert(decode_pow(enc) == pow,     err_msg("pow encoding should work: %d", pow));
    assert(!decode_not_chunked(enc),   "task should be chunked");
    _obj = enc;
  }

  ShenandoahMarkTask(const ShenandoahMarkTask& t): _obj(t._obj) { }

  ShenandoahMarkTask& operator =(const ShenandoahMarkTask& t) {
    _obj = t._obj;
    return *this;
  }

  volatile ShenandoahMarkTask&
  operator =(const volatile ShenandoahMarkTask& t) volatile {
    (void) const_cast<uintptr_t &>(_obj = t._obj);
    return *this;
  }

public:
  inline oop  obj()            const { return decode_oop(_obj);   }
  inline int  chunk()          const { return decode_chunk(_obj); }
  inline int  pow()            const { return decode_pow(_obj);   }

  inline bool is_not_chunked() const { return decode_not_chunked(_obj); }

  DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.

  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }
};
#else
class ShenandoahMarkTask
{
private:
  static const uint8_t chunk_bits  = 10;
  static const uint8_t pow_bits    = 5;

  static const int chunk_max       = nth_bit(chunk_bits) - 1;
  static const int pow_max         = nth_bit(pow_bits) - 1;

  oop _obj;
  int _chunk;
  int _pow;

public:
  ShenandoahMarkTask(oop o = NULL, int chunk = 0, int pow = 0):
    _obj(o), _chunk(chunk), _pow(pow) {
    assert(0 <= chunk && chunk <= chunk_max, err_msg("chunk is sane: %d", chunk));
    assert(0 <= pow && pow <= pow_max, err_msg("pow is sane: %d", pow));
  }

  ShenandoahMarkTask(const ShenandoahMarkTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ShenandoahMarkTask& operator =(const ShenandoahMarkTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  volatile ShenandoahMarkTask&
  operator =(const volatile ShenandoahMarkTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()             const { return _obj; }
  inline int chunk()           const { return _chunk; }
  inline int pow()             const { return _pow; }
  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const;) // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }
};
#endif // SHENANDOAH_OPTIMIZED_MARKTASK

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
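
// Illustration only (comment sketch, not compiled): how a parallel array
// scanner might consume and re-split a chunked task, using only the
// constructors and accessors declared above; scan_array_chunk() and the
// surrounding function are hypothetical.
//
//   void process(ShenandoahObjToScanQueue* q, ShenandoahMarkTask t) {
//     int c = t.chunk(), p = t.pow();
//     if (t.is_not_chunked() || p == 0) {
//       scan_array_chunk(t.obj(), c, p);                      // hypothetical leaf work
//     } else {
//       q->push(ShenandoahMarkTask(t.obj(), 2*c - 1, p - 1)); // lower half
//       q->push(ShenandoahMarkTask(t.obj(), 2*c,     p - 1)); // upper half
//     }
//   }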

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  shenandoah_padding(0);
  volatile jint     _claimed_index;
  shenandoah_padding(1);

  debug_only(uint   _reserved;  )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T*   claim_next();

  // Reserve queues that are not subject to parallel claiming
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
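
// Illustration only (comment sketch, not compiled): typical parallel claiming,
// where each worker thread claims whole queues until the set is exhausted;
// drain_queue() is hypothetical.
//
//   queue_set->clear_claimed();       // done once, before workers start
//   ...
//   // in each worker thread:
//   ShenandoahObjToScanQueue* q;
//   while ((q = queue_set->claim_next()) != NULL) {
//     drain_queue(q);                 // hypothetical consumer
//   }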

class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats();
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* const _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  // Returning true terminates the protocol immediately, even if there is work left
  virtual bool should_exit_termination();
};

/*
 * This is an enhanced implementation of Google's work stealing protocol,
 * which is described in the paper:
 * Understanding and improving JVM GC work stealing at the data center scale
 * (http://dl.acm.org/citation.cfm?id=2926706)
 *
 * Instead of keeping a dedicated spin-master, our implementation lets the spin-master
 * relinquish the role before it goes to sleep/wait, allowing newly arrived threads to
 * compete for the role. The intent of this enhancement is to reduce the spin-master's
 * latency in detecting new tasks for stealing and the termination condition.
 */

class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
  Monitor*    _blocker;
  Thread*     _spin_master;

public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false);
  }

  ~ShenandoahTaskTerminator() {
    assert(_blocker != NULL, "Cannot be NULL");
    delete _blocker;
  }

  bool offer_termination(ShenandoahTerminatorTerminator* terminator);
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }

private:
  bool offer_termination(TerminatorTerminator* terminator) {
    ShouldNotReachHere();
    return false;
  }

private:
  size_t tasks_in_queue_set() { return _queue_set->tasks(); }

  /*
   * Perform the spin-master task.
   * Returns true if the termination condition is detected,
   * otherwise returns false.
   */
  bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator);
};
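
// Illustration only (comment sketch, not compiled): the usual shape of a worker's
// steal-and-terminate loop around offer_termination(); steal_work() is hypothetical.
//
//   ShenandoahTerminatorTerminator tt(heap);
//   do {
//     while (steal_work(queues, worker_id)) {
//       // process stolen tasks
//     }
//   } while (!terminator.offer_termination(&tt));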

#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP