/*
 * Copyright (c) 2016, 2019, Red Hat, Inc. All rights reserved.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP
#define SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP

#include "memory/padded.hpp"
#include "runtime/mutex.hpp"
#include "utilities/taskqueue.hpp"

class ShenandoahHeap;
class Thread;
template<class E, MEMFLAGS F, unsigned int N = TASKQUEUE_SIZE>
class BufferedOverflowTaskQueue: public OverflowTaskQueue<E, F, N>
{
public:
  typedef OverflowTaskQueue<E, F, N> taskqueue_t;

  BufferedOverflowTaskQueue() : _buf_empty(true) {}

  TASKQUEUE_STATS_ONLY(using taskqueue_t::stats;)

  // Push task t onto the queue. Returns true on success.
  inline bool push(E t);

  // Attempt to pop from the queue. Returns true on success.
  inline bool pop(E &t);

  inline void clear();

  inline bool is_empty()        const {
    return _buf_empty && taskqueue_t::is_empty();
  }

private:
  // One-element buffer sitting in front of the underlying overflow queue.
  bool _buf_empty;
  E _elem;
};
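
// A minimal usage sketch (illustrative only, not part of this header): the queue
// keeps the most recent push in its one-element buffer, so a pop may be served
// from the buffer before the underlying overflow queue is consulted.
//
//   BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> q;
//   q.initialize();                // setup inherited from GenericTaskQueue
//   ShenandoahMarkTask t;
//   if (q.push(t)) {               // push goes through the buffer
//     while (q.pop(t)) {           // pop drains the buffer, then the queue
//       // ... process t ...
//     }
//   }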
// ObjArrayChunkedTask
//
// Encodes both regular oops, and the array oops plus chunking data for parallel array processing.
// The design goal is to make the regular oop case very fast, because that is the prevailing
// case. On the other hand, it should not block parallel array processing from efficiently dividing
// the array work.
//
// The idea is to steal bits from the 64-bit oop to encode the array data, when needed. For the
// proper divide-and-conquer strategies, we want to encode the "blocking" data. It turns out the
// most efficient way to do this is to encode the array block as (chunk * 2^pow), where the block
// is assumed to have size 2^pow. Then pow needs only 5 bits (2^32) to cover all possible arrays.
//
//    |---------oop---------|-pow-|--chunk---|
//    0                    49     54        64
//
// By definition, chunk == 0 means "no chunk", i.e. chunking starts from 1.
//
// This encoding gives a few interesting benefits:
//
// a) Encoding/decoding regular oops is very simple, because the upper bits are zero in that task:
//
//    |---------oop---------|00000|0000000000| // no chunk data
//
//    This helps the most ubiquitous path. The initialization amounts to putting the oop into the word
//    with zero padding. Testing for "chunkedness" is testing for zero with the chunk mask.
//
// b) Splitting tasks for divide-and-conquer is possible. Suppose we have a chunk <C, P> that covers
//    the interval [ (C-1)*2^P; C*2^P ). We can then split it into two chunks:
//      <2*C - 1, P-1>, that covers interval [ (2*C - 2)*2^(P-1); (2*C - 1)*2^(P-1) )
//      <2*C, P-1>,     that covers interval [ (2*C - 1)*2^(P-1);       2*C*2^(P-1) )
//
//    Observe that the union of these two intervals is:
//      [ (2*C - 2)*2^(P-1); 2*C*2^(P-1) )
//
//    ...which is the original interval:
//      [ (C-1)*2^P; C*2^P )
//    (a concrete worked example follows after this list)
//
// c) The divide-and-conquer strategy can even start with chunk <1, round-log2-len(arr)> and split
//    down in the parallel threads, which alleviates the upfront (serial) splitting costs.
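//
//    As a worked example (numbers chosen for illustration only): chunk <3, 4>
//    covers [ (3-1)*2^4; 3*2^4 ) = [ 32; 48 ). Splitting it yields <5, 3>,
//    covering [ (5-1)*2^3; 5*2^3 ) = [ 32; 40 ), and <6, 3>, covering
//    [ (6-1)*2^3; 6*2^3 ) = [ 40; 48 ), which together restore the original interval.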
//
// The encoding limits implied by the current bit widths are:
//    10 bits for chunk: max 1024 blocks per array
//     5 bits for power: max 2^32-element arrays
//    49 bits for   oop: max 512 TB of addressable space
//
// Stealing bits from the oop trims down the addressable space. Stealing too few bits for chunk ID
// limits potential parallelism. Stealing too few bits for pow limits the maximum array size that
// can be handled. In the future, these might be rebalanced to favor one degree of freedom against
// another. For example, if/when Arrays 2.0 brings 2^64-sized arrays, we might need to steal another
// bit for power. We could regain some bits back if chunks are counted in ObjArrayMarkingStride units.
//
// There is also a fallback version that uses plain fields, for when there is not enough space to
// steal the bits from the native pointer. It is useful for debugging the optimized version.
//

#ifdef _MSC_VER
#pragma warning(push)
// warning C4522: multiple assignment operators specified
#pragma warning(disable:4522)
#endif

#ifdef _LP64
#define SHENANDOAH_OPTIMIZED_OBJTASK 1
#else
#define SHENANDOAH_OPTIMIZED_OBJTASK 0
#endif

#if SHENANDOAH_OPTIMIZED_OBJTASK
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits   = 10,
    pow_bits     = 5,
    oop_bits     = sizeof(uintptr_t)*8 - chunk_bits - pow_bits
  };
  enum {
    oop_shift    = 0,
    pow_shift    = oop_shift + oop_bits,
    chunk_shift  = pow_shift + pow_bits
  };

public:
  ObjArrayChunkedTask(oop o = NULL) {
    assert(decode_oop(encode_oop(o)) == o, err_msg("oop can be encoded: " PTR_FORMAT, p2i(o)));
    _obj = encode_oop(o);
  }
  ObjArrayChunkedTask(oop o, int chunk, int pow) {
    assert(decode_oop(encode_oop(o)) == o, err_msg("oop can be encoded: " PTR_FORMAT, p2i(o)));
    assert(decode_chunk(encode_chunk(chunk)) == chunk, err_msg("chunk can be encoded: %d", chunk));
    assert(decode_pow(encode_pow(pow)) == pow, err_msg("pow can be encoded: %d", pow));
    _obj = encode_oop(o) | encode_chunk(chunk) | encode_pow(pow);
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<uintptr_t&>(_obj = t._obj);
    return *this;
  }

  inline oop decode_oop(uintptr_t val) const {
    return (oop) reinterpret_cast<void*>((val >> oop_shift) & right_n_bits(oop_bits));
  }

  inline int decode_chunk(uintptr_t val) const {
    return (int) ((val >> chunk_shift) & right_n_bits(chunk_bits));
  }

  inline int decode_pow(uintptr_t val) const {
    return (int) ((val >> pow_shift) & right_n_bits(pow_bits));
  }

  inline uintptr_t encode_oop(oop obj) const {
    return ((uintptr_t)(void*) obj) << oop_shift;
  }

  inline uintptr_t encode_chunk(int chunk) const {
    return ((uintptr_t) chunk) << chunk_shift;
  }

  inline uintptr_t encode_pow(int pow) const {
    return ((uintptr_t) pow) << pow_shift;
  }

  inline oop obj()   const { return decode_oop(_obj);   }
  inline int chunk() const { return decode_chunk(_obj); }
  inline int pow()   const { return decode_pow(_obj);   }

  // True iff no chunk is encoded, i.e. all bits above the oop and pow fields are zero.
  inline bool is_not_chunked() const { return (_obj & ~right_n_bits(oop_bits + pow_bits)) == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static uintptr_t max_addressable() {
    return nth_bit(oop_bits);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  uintptr_t _obj;
};
#else
class ObjArrayChunkedTask
{
public:
  enum {
    chunk_bits  = 10,
    pow_bits    = 5
  };
public:
  ObjArrayChunkedTask(oop o = NULL, int chunk = 0, int pow = 0): _obj(o) {
    assert(0 <= chunk && chunk < nth_bit(chunk_bits), err_msg("chunk is sane: %d", chunk));
    assert(0 <= pow && pow < nth_bit(pow_bits), err_msg("pow is sane: %d", pow));
    _chunk = chunk;
    _pow = pow;
  }
  ObjArrayChunkedTask(const ObjArrayChunkedTask& t): _obj(t._obj), _chunk(t._chunk), _pow(t._pow) { }

  ObjArrayChunkedTask& operator =(const ObjArrayChunkedTask& t) {
    _obj = t._obj;
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }
  volatile ObjArrayChunkedTask&
  operator =(const volatile ObjArrayChunkedTask& t) volatile {
    (void)const_cast<oop&>(_obj = t._obj);
    _chunk = t._chunk;
    _pow = t._pow;
    return *this;
  }

  inline oop obj()   const { return _obj;   }
  inline int chunk() const { return _chunk; }
  inline int pow()   const { return _pow;   }

  inline bool is_not_chunked() const { return _chunk == 0; }

  DEBUG_ONLY(bool is_valid() const); // Tasks to be pushed/popped must be valid.

  static size_t max_addressable() {
    return sizeof(oop);
  }

  static int chunk_size() {
    return nth_bit(chunk_bits);
  }

private:
  oop _obj;
  int _chunk;
  int _pow;
};
#endif // SHENANDOAH_OPTIMIZED_OBJTASK

#ifdef _MSC_VER
#pragma warning(pop)
#endif

typedef ObjArrayChunkedTask ShenandoahMarkTask;
typedef BufferedOverflowTaskQueue<ShenandoahMarkTask, mtGC> ShenandoahBufferedOverflowTaskQueue;
typedef Padded<ShenandoahBufferedOverflowTaskQueue> ShenandoahObjToScanQueue;
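
// A minimal sketch of how a chunked mark task splits during draining
// (illustrative only; `q` is a hypothetical ShenandoahObjToScanQueue*).
// The halves follow the <2*C - 1, P-1> / <2*C, P-1> scheme described above:
//
//   ShenandoahMarkTask t;
//   while (q->pop(t)) {
//     if (t.is_not_chunked()) {
//       // Regular task: process the object; a large array may be re-pushed
//       // as an initial chunked task <1, round-log2-len(arr)>.
//     } else {
//       int c = t.chunk(), p = t.pow();
//       // Split into the two half-chunks and push both back for stealing:
//       q->push(ShenandoahMarkTask(t.obj(), 2*c - 1, p - 1));
//       q->push(ShenandoahMarkTask(t.obj(), 2*c,     p - 1));
//     }
//   }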

template <class T, MEMFLAGS F>
class ParallelClaimableQueueSet: public GenericTaskQueueSet<T, F> {
private:
  char _pad0[DEFAULT_CACHE_LINE_SIZE];
  volatile jint     _claimed_index;
  char _pad1[DEFAULT_CACHE_LINE_SIZE];

  debug_only(uint   _reserved;  )

public:
  using GenericTaskQueueSet<T, F>::size;

public:
  ParallelClaimableQueueSet(int n) : GenericTaskQueueSet<T, F>(n), _claimed_index(0) {
    debug_only(_reserved = 0; )
  }

  void clear_claimed() { _claimed_index = 0; }
  T*   claim_next();

  // Reserve queues that are not up for parallel claiming.
  void reserve(uint n) {
    assert(n <= size(), "Sanity");
    _claimed_index = (jint)n;
    debug_only(_reserved = n;)
  }

  debug_only(uint get_reserved() const { return (uint)_reserved; })
};

template <class T, MEMFLAGS F>
T* ParallelClaimableQueueSet<T, F>::claim_next() {
  jint size = (jint)GenericTaskQueueSet<T, F>::size();

  if (_claimed_index >= size) {
    return NULL;
  }

  jint index = Atomic::add(1, &_claimed_index);

  if (index <= size) {
    return GenericTaskQueueSet<T, F>::queue((uint)index - 1);
  } else {
    return NULL;
  }
}
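
// A minimal sketch of the claiming protocol (illustrative only; `queue_set` is
// a hypothetical ParallelClaimableQueueSet<T, F>*): each worker repeatedly
// claims the next unclaimed queue until the set is exhausted.
//
//   T* q;
//   while ((q = queue_set->claim_next()) != NULL) {
//     // ... process all tasks in q ...
//   }
//   // Before the next parallel phase, one thread resets the claim cursor:
//   queue_set->clear_claimed();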

class ShenandoahObjToScanQueueSet: public ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC> {
public:
  ShenandoahObjToScanQueueSet(int n) : ParallelClaimableQueueSet<ShenandoahObjToScanQueue, mtGC>(n) {}

  bool is_empty();
  void clear();

#if TASKQUEUE_STATS
  static void print_taskqueue_stats_hdr(outputStream* const st);
  void print_taskqueue_stats();
  void reset_taskqueue_stats();
#endif // TASKQUEUE_STATS
};

class ShenandoahTerminatorTerminator : public TerminatorTerminator {
private:
  ShenandoahHeap* const _heap;
public:
  ShenandoahTerminatorTerminator(ShenandoahHeap* const heap) : _heap(heap) { }
  // Returning true terminates the protocol immediately, even if there is work left.
  virtual bool should_exit_termination();
};

/*
 * This is an enhanced implementation of Google's work stealing
 * protocol, which is described in the paper:
 * Understanding and improving JVM GC work stealing at the data center scale
 * (http://dl.acm.org/citation.cfm?id=2926706)
 *
 * Instead of a dedicated spin-master, our implementation lets the spin-master
 * relinquish the role before it goes to sleep/wait, which allows a newly arrived
 * thread to compete for the role. The intent of this enhancement is to reduce the
 * spin-master's latency in detecting new tasks to steal and the termination condition.
 */

class ShenandoahTaskTerminator: public ParallelTaskTerminator {
private:
  Monitor*    _blocker;
  Thread*     _spin_master;

public:
  ShenandoahTaskTerminator(uint n_threads, TaskQueueSetSuper* queue_set) :
    ParallelTaskTerminator(n_threads, queue_set), _spin_master(NULL) {
    _blocker = new Monitor(Mutex::leaf, "ShenandoahTaskTerminator", false);
  }

  ~ShenandoahTaskTerminator() {
    assert(_blocker != NULL, "Can not be NULL");
    delete _blocker;
  }

  bool offer_termination(ShenandoahTerminatorTerminator* terminator);
  bool offer_termination() { return offer_termination((ShenandoahTerminatorTerminator*)NULL); }

private:
  // Hide the base-class version: callers must use the Shenandoah-specific overloads above.
  bool offer_termination(TerminatorTerminator* terminator) {
    ShouldNotReachHere();
    return false;
  }

private:
  size_t tasks_in_queue_set() { return _queue_set->tasks(); }

  /*
   * Perform the spin-master task.
   * Returns true if the termination condition is detected, false otherwise.
   */
  bool do_spin_master_work(ShenandoahTerminatorTerminator* terminator);
};
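
// A minimal sketch of how a worker drives the terminator (illustrative only;
// `terminator` is a hypothetical ShenandoahTaskTerminator, and the draining and
// stealing logic is elided): a worker that runs out of work offers termination,
// and resumes stealing whenever the offer is declined.
//
//   while (true) {
//     // ... drain local queue, then try to steal from other queues ...
//     if (terminator.offer_termination()) {
//       break; // all workers agreed: no work remains anywhere
//     }
//   }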

#endif // SHARE_VM_GC_SHENANDOAH_SHENANDOAHTASKQUEUE_HPP