1 /*
   2  * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #ifndef SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
  26 #define SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
  27 
  28 // Terminology used within this source file:
  29 //
  30 // Card Entry:   This is the information that identifies whether a
  31 //               particular card-table entry is Clean or Dirty.  A clean
  32 //               card entry denotes that the associated memory does not
  33 //               hold references to young-gen memory.
  34 //
  35 // Card Region, aka
// Card Memory:  This is the region of memory that is associated with a
  37 //               particular card entry.
  38 //
  39 // Card Cluster: A card cluster represents 64 card entries.  A card
  40 //               cluster is the minimal amount of work performed at a
  41 //               time by a parallel thread.  Note that the work required
  42 //               to scan a card cluster is somewhat variable in that the
  43 //               required effort depends on how many cards are dirty, how
  44 //               many references are held within the objects that span a
  45 //               DIRTY card's memory, and on the size of the object
//               that spans the end of a DIRTY card's memory (because
//               that object, if it is not an array, may need to be scanned in
//               its entirety when it is imprecisely dirtied; imprecise
//               dirtying means that the card corresponding to the object header
//               is dirtied, rather than the card on which the updated field lives).
  51 //               To better balance work amongst them, parallel worker threads dynamically
  52 //               claim clusters and are flexible in the number of clusters they
  53 //               process.
  54 //
  55 // A cluster represents a "natural" quantum of work to be performed by
  56 // a parallel GC thread's background remembered set scanning efforts.
  57 // The notion of cluster is similar to the notion of stripe in the
  58 // implementation of parallel GC card scanning.  However, a cluster is
  59 // typically smaller than a stripe, enabling finer grain division of
  60 // labor between multiple threads, and potentially better load balancing
  61 // when dirty cards are not uniformly distributed in the heap, as is often
  62 // the case with generational workloads where more recently promoted objects
//               may be dirtied more frequently than older objects.
  64 //
  65 // For illustration, consider the following possible JVM configurations:
  66 //
  67 //   Scenario 1:
  68 //     RegionSize is 128 MB
  69 //     Span of a card entry is 512 B
  70 //     Each card table entry consumes 1 B
//     Assume one long word (8 B) of the card table represents a cluster.
  72 //       This long word holds 8 card table entries, spanning a
  73 //       total of 8*512 B = 4 KB of the heap
  74 //     The number of clusters per region is 128 MB / 4 KB = 32 K
  75 //
  76 //   Scenario 2:
  77 //     RegionSize is 128 MB
  78 //     Span of each card entry is 128 B
  79 //     Each card table entry consumes 1 bit
  80 //     Assume one int word (4 B) of the card table represents a cluster.
  81 //       This int word holds 32 b/1 b = 32 card table entries, spanning a
  82 //       total of 32 * 128 B = 4 KB of the heap
  83 //     The number of clusters per region is 128 MB / 4 KB = 32 K
  84 //
  85 //   Scenario 3:
  86 //     RegionSize is 128 MB
  87 //     Span of each card entry is 512 B
  88 //     Each card table entry consumes 1 bit
  89 //     Assume one long word (8 B) of card table represents a cluster.
  90 //       This long word holds 64 b/ 1 b = 64 card table entries, spanning a
  91 //       total of 64 * 512 B = 32 KB of the heap
  92 //     The number of clusters per region is 128 MB / 32 KB = 4 K
  93 //
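// In general, for the direct card-marking configuration assumed elsewhere in this
// file (512 B cards and 64 cards per cluster, matching Scenario 3), the geometry
// works out as in the following sketch:
//
//   const size_t card_size_bytes     = 512;                                    // CardTable::card_size()
//   const size_t cards_per_cluster   = 64;                                     // ShenandoahCardCluster::CardsPerCluster
//   const size_t cluster_span_bytes  = card_size_bytes * cards_per_cluster;    // 32 KB of heap per cluster
//   const size_t clusters_per_region = region_size_bytes / cluster_span_bytes; // 4 K for a 128 MB region
//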
  94 // At the start of a new young-gen concurrent mark pass, the gang of
  95 // Shenandoah worker threads collaborate in performing the following
  96 // actions:
  97 //
  98 //  Let old_regions = number of ShenandoahHeapRegion comprising
  99 //    old-gen memory
 100 //  Let region_size = ShenandoahHeapRegion::region_size_bytes()
 101 //    represent the number of bytes in each region
//  Let clusters_per_region = region_size / cluster_span, where cluster_span
//    is the number of bytes of heap memory spanned by one cluster
//    (512 B per card * 64 cards per cluster = 32 KB)
 103 //  Let rs represent the relevant RememberedSet implementation
 104 //    (an instance of ShenandoahDirectCardMarkRememberedSet or an instance
 105 //     of a to-be-implemented ShenandoahBufferWithSATBRememberedSet)
 106 //
 107 //  for each ShenandoahHeapRegion old_region in the whole heap
 108 //    determine the cluster number of the first cluster belonging
 109 //      to that region
 110 //    for each cluster contained within that region
 111 //      Assure that exactly one worker thread processes each
 112 //      cluster, each thread making a series of invocations of the
 113 //      following:
 114 //
//        scanner->process_clusters(first_cluster, cluster_count, end_of_range,
//                                  oops, use_write_table, worker_id);
//
//      (where scanner is the ShenandoahScanRemembered instance that wraps rs;
//       see the declaration of process_clusters() later in this file)
 118 //
 119 //  For efficiency, divide up the clusters so that different threads
 120 //  are responsible for processing different clusters.  Processing costs
 121 //  may vary greatly between clusters for the following reasons:
 122 //
 123 //        a) some clusters contain mostly dirty cards and other
 124 //           clusters contain mostly clean cards
 125 //        b) some clusters contain mostly primitive data and other
 126 //           clusters contain mostly reference data
 127 //        c) some clusters are spanned by very large non-array objects that
 128 //           begin in some other cluster.  When a large non-array object
 129 //           beginning in a preceding cluster spans large portions of
 130 //           this cluster, then because of imprecise dirtying, the
 131 //           portion of the object in this cluster may be clean, but
 132 //           will need to be processed by the worker responsible for
 133 //           this cluster, potentially increasing its work.
 134 //        d) in the case that the end of this cluster is spanned by a
 135 //           very large non-array object, the worker for this cluster will
 136 //           be responsible for processing the portion of the object
 137 //           in this cluster.
 138 //
 139 // Though an initial division of labor between marking threads may
 140 // assign equal numbers of clusters to be scanned by each thread, it
 141 // should be expected that some threads will finish their assigned
 142 // work before others.  Therefore, some amount of the full remembered
 143 // set scanning effort should be held back and assigned incrementally
 144 // to the threads that end up with excess capacity.  Consider the
 145 // following strategy for dividing labor:
 146 //
 147 //        1. Assume there are 8 marking threads and 1024 remembered
 148 //           set clusters to be scanned.
 149 //        2. Assign each thread to scan 64 clusters.  This leaves
 150 //           512 (1024 - (8*64)) clusters to still be scanned.
 151 //        3. As the 8 server threads complete previous cluster
 152 //           scanning assignments, issue each of the next 8 scanning
//           assignments as units of 32 additional clusters each.
 154 //           In the case that there is high variance in effort
 155 //           associated with previous cluster scanning assignments,
 156 //           multiples of these next assignments may be serviced by
 157 //           the server threads that were previously assigned lighter
 158 //           workloads.
 159 //        4. Make subsequent scanning assignments as follows:
 160 //             a) 8 assignments of size 16 clusters
 161 //             b) 8 assignments of size 8 clusters
 162 //             c) 16 assignments of size 4 clusters
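//
//           (Checking the arithmetic for this example: 8*64 + 8*32 + 8*16 + 8*8 + 16*4
//            = 512 + 256 + 128 + 64 + 64 = 1024, which accounts for all 1024 clusters.)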
 163 //
 164 //    When there is no more remembered set processing work to be
 165 //    assigned to a newly idled worker thread, that thread can move
 166 //    on to work on other tasks associated with root scanning until such
 167 //    time as all clusters have been examined.
 168 //
 169 // Remembered set scanning is designed to run concurrently with
 170 // mutator threads, with multiple concurrent workers. Furthermore, the
 171 // current implementation of remembered set scanning never clears a
 172 // card once it has been marked.
 173 //
 174 // These limitations will be addressed in future enhancements to the
 175 // existing implementation.
 176 
 177 #include <stdint.h>
 178 #include "gc/shared/workerThread.hpp"
 179 #include "gc/shenandoah/shenandoahCardStats.hpp"
 180 #include "gc/shenandoah/shenandoahCardTable.hpp"
 181 #include "gc/shenandoah/shenandoahNumberSeq.hpp"
 182 #include "gc/shenandoah/shenandoahTaskqueue.hpp"
 183 #include "memory/iterator.hpp"
 184 #include "utilities/globalDefinitions.hpp"
 185 
 186 class ShenandoahReferenceProcessor;
 187 class ShenandoahConcurrentMark;
 188 class ShenandoahHeap;
 189 class ShenandoahHeapRegion;
 190 class ShenandoahRegionIterator;
 191 class ShenandoahMarkingContext;
 192 
 193 class CardTable;
 194 typedef CardTable::CardValue CardValue;
 195 
 196 class ShenandoahDirectCardMarkRememberedSet: public CHeapObj<mtGC> {
 197 
 198 private:
 199 
 200   // Use symbolic constants defined in cardTable.hpp
 201   //  CardTable::card_shift = 9;
 202   //  CardTable::card_size = 512;
 203   //  CardTable::card_size_in_words = 64;
 204   //  CardTable::clean_card_val()
 205   //  CardTable::dirty_card_val()
 206 
 207   ShenandoahHeap *_heap;
 208   ShenandoahCardTable *_card_table;
 209   size_t _card_shift;
 210   size_t _total_card_count;
 211   size_t _cluster_count;
 212   HeapWord *_whole_heap_base;   // Points to first HeapWord of data contained within heap memory
 213   CardValue* _byte_map;         // Points to first entry within the card table
 214   CardValue* _byte_map_base;    // Points to byte_map minus the bias computed from address of heap memory
 215 
 216 public:
 217 
 218   // count is the number of cards represented by the card table.
 219   ShenandoahDirectCardMarkRememberedSet(ShenandoahCardTable *card_table, size_t total_card_count);
 220   ~ShenandoahDirectCardMarkRememberedSet();
 221 
 222   // Card index is zero-based relative to _byte_map.
 223   size_t last_valid_index() const;
 224   size_t total_cards() const;
 225   size_t card_index_for_addr(HeapWord *p) const;
 226   HeapWord *addr_for_card_index(size_t card_index) const;
 227   inline const CardValue* get_card_table_byte_map(bool write_table) const;
 228   inline bool is_card_dirty(size_t card_index) const;
 229   inline bool is_write_card_dirty(size_t card_index) const;
 230   inline void mark_card_as_dirty(size_t card_index);
 231   inline void mark_range_as_dirty(size_t card_index, size_t num_cards);
 232   inline void mark_card_as_clean(size_t card_index);
 233   inline void mark_read_card_as_clean(size_t card_index);
 234   inline void mark_range_as_clean(size_t card_index, size_t num_cards);
 235   inline bool is_card_dirty(HeapWord *p) const;
 236   inline void mark_card_as_dirty(HeapWord *p);
 237   inline void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
 238   inline void mark_card_as_clean(HeapWord *p);
 239   inline void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
 240   inline size_t cluster_count() const;
 241 
 242   // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
 243   // Not currently used because mutator write barrier does not honor changes to the location of card table.
 244   void swap_remset() {  _card_table->swap_card_tables(); }
 245 
 246   void merge_write_table(HeapWord* start, size_t word_count) {
 247     size_t card_index = card_index_for_addr(start);
 248     size_t num_cards = word_count / CardTable::card_size_in_words();
 249     size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardValue));
 250     intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
 251     intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
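    // Each iteration merges sizeof(intptr_t) card entries at once.  Because dirty_card_val()
    // is zero and clean_card_val() is all one bits, AND-ing a write-table word into the
    // read-table word leaves a read-table card dirty iff it was already dirty in the read
    // table or is dirty in the write table.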
 252     for (size_t i = 0; i < iterations; i++) {
 253       intptr_t card_value = *write_table_ptr;
 254       *read_table_ptr++ &= card_value;
 255       write_table_ptr++;
 256     }
 257   }
 258 
 259   // Instead of swap_remset, the current implementation of concurrent remembered set scanning does reset_remset
 260   // in parallel threads, each invocation processing one entire HeapRegion at a time.  Processing of a region
 261   // consists of copying the write table to the read table and cleaning the write table.
 262   void reset_remset(HeapWord* start, size_t word_count) {
 263     size_t card_index = card_index_for_addr(start);
 264     size_t num_cards = word_count / CardTable::card_size_in_words();
 265     size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardValue));
 266     intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
 267     intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
 268     for (size_t i = 0; i < iterations; i++) {
 269       *read_table_ptr++ = *write_table_ptr;
 270       *write_table_ptr++ = CardTable::clean_card_row_val();
 271     }
 272   }
 273 
 274   // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
 275   void clear_old_remset() {  _card_table->clear_read_table(); }
 276 
 277 };
 278 
 279 // A ShenandoahCardCluster represents the minimal unit of work
 280 // performed by independent parallel GC threads during scanning of
 281 // remembered sets.
 282 //
 283 // The GC threads that perform card-table remembered set scanning may
 284 // overwrite card-table entries to mark them as clean in the case that
 285 // the associated memory no longer holds references to young-gen
 286 // memory.  Rather than access the card-table entries directly, all GC
 287 // thread access to card-table information is made by way of the
 288 // ShenandoahCardCluster data abstraction.  This abstraction
 289 // effectively manages access to multiple possible underlying
 290 // remembered set implementations, including a traditional card-table
 291 // approach and a SATB-based approach.
 292 //
 293 // The API services represent a compromise between efficiency and
 294 // convenience.
 295 //
// Multiple GC threads scan the remembered set in parallel.  The
// desire is to divide the complete scanning effort
 298 // into multiple clusters of work that can be independently processed
 299 // by individual threads without need for synchronizing efforts
 300 // between the work performed by each task.  The term "cluster" of
 301 // work is similar to the term "stripe" as used in the implementation
 302 // of Parallel GC.
 303 //
 304 // Complexity arises when an object to be scanned crosses the boundary
 305 // between adjacent cluster regions.  Here is the protocol that we currently
 306 // follow:
 307 //
 308 //  1. The thread responsible for scanning the cards in a cluster modifies
 309 //     the associated card-table entries. Only cards that are dirty are
 310 //     processed, except as described below for the case of objects that
 311 //     straddle more than one card.
//  2. Object Arrays are precisely dirtied, so only the portion of the obj-array
//     that overlaps the range of dirty cards in its cluster is scanned
//     by each worker thread. This holds for portions of obj-arrays that extend
//     over clusters processed by different workers, with each worker responsible
//     for scanning the portion of the obj-array overlapping the dirty cards in
//     its cluster. (A sketch of per-card scanning follows this list.)
//  3. Non-array objects are precisely dirtied by the interpreter and the compilers.
 319 //     For such objects that extend over multiple cards, or even multiple clusters,
 320 //     the entire object is scanned by the worker that processes the (dirty) card on
 321 //     which the object's header lies. (However, GC workers should precisely dirty the
 322 //     cards with inter-regional/inter-generational pointers in the body of this object,
 323 //     thus making subsequent scans potentially less expensive.) Such larger non-array
 324 //     objects are relatively rare.
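//
//  As a rough illustration of items 1 through 3, a worker's handling of a single
//  dirty card might look like the sketch below.  This is only a sketch, assuming
//  the helpers declared later in this file (addr_for_card_index, starts_object,
//  get_first_start, block_start) and a worker-local OopIterateClosure cl; it elides
//  liveness checks and card cleaning.  The authoritative logic is process_clusters()
//  in shenandoahScanRemembered.inline.hpp.
//
//    if (starts_object(card_index)) {
//      HeapWord* card_start = addr_for_card_index(card_index);
//      HeapWord* card_end   = card_start + CardTable::card_size_in_words();
//      HeapWord* p          = card_start + get_first_start(card_index);
//      while (p < card_end) {
//        oop obj = cast_to_oop(p);
//        if (obj->is_objArray()) {
//          // Arrays are precisely dirtied: clip the scan to this card's memory.
//          obj->oop_iterate(cl, MemRegion(card_start, card_end));
//        } else {
//          // A non-array may be imprecisely dirtied: scan the whole object.
//          obj->oop_iterate(cl);
//        }
//        p += obj->size();
//      }
//    }
//    // An object that straddles in from a preceding card (no start in this card) is
//    // found via block_start() and handled according to items 2 and 3 above (elided here).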
 325 //
 326 //  A possible criticism:
 327 //  C. The representation of pointer location descriptive information
 328 //     within Klass representations is not designed for efficient
 329 //     "random access".  An alternative approach to this design would
 330 //     be to scan very large objects multiple times, once for each
 331 //     cluster that is spanned by the object's range.  This reduces
 332 //     unnecessary overscan, but it introduces different sorts of
 333 //     overhead effort:
 334 //       i) For each spanned cluster, we have to look up the start of
 335 //          the crossing object.
 336 //      ii) Each time we scan the very large object, we have to
 337 //          sequentially walk through its pointer location
 338 //          descriptors, skipping over all of the pointers that
 339 //          precede the start of the range of addresses that we
 340 //          consider relevant.
 341 
 342 
 343 // Because old-gen heap memory is not necessarily contiguous, and
 344 // because cards are not necessarily maintained for young-gen memory,
 345 // consecutive card numbers do not necessarily correspond to consecutive
 346 // address ranges.  For the traditional direct-card-marking
 347 // implementation of this interface, consecutive card numbers are
 348 // likely to correspond to contiguous regions of memory, but this
 349 // should not be assumed.  Instead, rely only upon the following:
 350 //
 351 //  1. All card numbers for cards pertaining to the same
 352 //     ShenandoahHeapRegion are consecutively numbered.
 353 //  2. In the case that neighboring ShenandoahHeapRegions both
 354 //     represent old-gen memory, the card regions that span the
 355 //     boundary between these neighboring heap regions will be
 356 //     consecutively numbered.
 357 //  3. (A corollary) In the case that an old-gen object straddles the
 358 //     boundary between two heap regions, the card regions that
 359 //     correspond to the span of this object will be consecutively
 360 //     numbered.
 361 //
 362 // ShenandoahCardCluster abstracts access to the remembered set
 363 // and also keeps track of crossing map information to allow efficient
 364 // resolution of object start addresses.
 365 //
 366 // ShenandoahCardCluster supports all of the services of
 367 // RememberedSet, plus it supports register_object() and lookup_object().
 368 // Note that we only need to register the start addresses of the object that
 369 // overlays the first address of a card; we need to do this for every card.
 370 // In other words, register_object() checks if the object crosses a card boundary,
 371 // and updates the offset value for each card that the object crosses into.
 372 // For objects that don't straddle cards, nothing needs to be done.
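//
// As a rough sketch (assuming the accessors declared below; the authoritative
// implementation is register_object() in shenandoahScanRemembered.inline.hpp),
// registering an object that starts at address updates the crossing map as follows:
//
//   size_t    card_index = _rs->card_index_for_addr(address);
//   HeapWord* card_start = _rs->addr_for_card_index(card_index);
//   uint8_t   offset     = (uint8_t) pointer_delta(address, card_start);  // word offset within the card
//   if (!starts_object(card_index)) {
//     set_first_start(card_index, offset);   // also sets the starts-object bit
//     set_last_start(card_index, offset);
//   } else {
//     if (offset < get_first_start(card_index)) set_first_start(card_index, offset);
//     if (offset > get_last_start(card_index))  set_last_start(card_index, offset);
//   }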
 373 //
 374 // The RememberedSet template parameter is intended to represent either
 375 //     ShenandoahDirectCardMarkRememberedSet, or a to-be-implemented
 376 //     ShenandoahBufferWithSATBRememberedSet.
 377 template<typename RememberedSet>
 378 class ShenandoahCardCluster: public CHeapObj<mtGC> {
 379 
 380 private:
 381   RememberedSet *_rs;
 382 
 383 public:
 384   static const size_t CardsPerCluster = 64;
 385 
 386 private:
 387   typedef struct cross_map { uint8_t first; uint8_t last; } xmap;
 388   typedef union crossing_info { uint16_t short_word; xmap offsets; } crossing_info;
 389 
  // The ObjectStartsInCardRegion bit is set within crossing_info.offsets.first iff at least one object starts within
  // a particular card region.  We pack this bit into the first byte under the assumption that the first byte is accessed
  // less frequently than the last byte.  This is true when the number of clean cards is greater than the number of dirty cards.
 393   static const uint16_t ObjectStartsInCardRegion = 0x80;
 394   static const uint16_t FirstStartBits           = 0x7f;
 395 
 396   // Check that we have enough bits to store the largest possible offset into a card for an object start.
 397   // The value for maximum card size is based on the constraints for GCCardSizeInBytes in gc_globals.hpp.
 398   static const int MaxCardSize = NOT_LP64(512) LP64_ONLY(1024);
 399   STATIC_ASSERT((MaxCardSize / HeapWordSize) - 1 <= FirstStartBits);
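  // For example, on LP64 the maximum permitted card size of 1024 bytes corresponds to
  // 1024 / HeapWordSize = 128 words, so the largest possible in-card word offset is 127,
  // which fits within the seven FirstStartBits (0x7f).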
 400 
 401   crossing_info *object_starts;
 402 
 403 public:
 404   // If we're setting first_start, assume the card has an object.
 405   inline void set_first_start(size_t card_index, uint8_t value) {
 406     object_starts[card_index].offsets.first = ObjectStartsInCardRegion | value;
 407   }
 408 
 409   inline void set_last_start(size_t card_index, uint8_t value) {
 410     object_starts[card_index].offsets.last = value;
 411   }
 412 
 413   inline void set_starts_object_bit(size_t card_index) {
 414     object_starts[card_index].offsets.first |= ObjectStartsInCardRegion;
 415   }
 416 
 417   inline void clear_starts_object_bit(size_t card_index) {
 418     object_starts[card_index].offsets.first &= ~ObjectStartsInCardRegion;
 419   }
 420 
 421   // Returns true iff an object is known to start within the card memory associated with card card_index.
 422   inline bool starts_object(size_t card_index) const {
 423     return (object_starts[card_index].offsets.first & ObjectStartsInCardRegion) != 0;
 424   }
 425 
 426   inline void clear_objects_in_range(HeapWord *addr, size_t num_words) {
 427     size_t card_index = _rs->card_index_for_addr(addr);
 428     size_t last_card_index = _rs->card_index_for_addr(addr + num_words - 1);
    while (card_index <= last_card_index) {
      object_starts[card_index++].short_word = 0;
    }
 431   }
 432 
 433   ShenandoahCardCluster(RememberedSet *rs) {
 434     _rs = rs;
 435     // TODO: We don't really need object_starts entries for every card entry.  We only need these for
 436     // the card entries that correspond to old-gen memory.  But for now, let's be quick and dirty.
 437     object_starts = NEW_C_HEAP_ARRAY(crossing_info, rs->total_cards(), mtGC);
 438     for (size_t i = 0; i < rs->total_cards(); i++) {
 439       object_starts[i].short_word = 0;
 440     }
 441   }
 442 
 443   ~ShenandoahCardCluster() {
 444     FREE_C_HEAP_ARRAY(crossing_info, object_starts);
 445     object_starts = nullptr;
 446   }
 447 
 448   // There is one entry within the object_starts array for each card entry.
 449   //
 450   //  Suppose multiple garbage objects are coalesced during GC sweep
 451   //  into a single larger "free segment".  As each two objects are
 452   //  coalesced together, the start information pertaining to the second
  //  object must be removed from the object_starts array.  If the
  //  second object had been the first object within card memory,
  //  the new first object is the object that follows it, provided that object
  //  starts within the same card memory, or NoObject if the
  //  following object starts within the following cluster.  If the
 458   //  second object had been the last object in the card memory,
 459   //  replace this entry with the newly coalesced object if it starts
 460   //  within the same card memory, or with NoObject if it starts in a
 461   //  preceding card's memory.
 462   //
 463   //  Suppose a large free segment is divided into a smaller free
 464   //  segment and a new object.  The second part of the newly divided
 465   //  memory must be registered as a new object, overwriting at most
 466   //  one first_start and one last_start entry.  Note that one of the
  //  two newly divided objects might be a new GCLAB.
 468   //
 469   //  Suppose postprocessing of a GCLAB finds that the original GCLAB
 470   //  has been divided into N objects.  Each of the N newly allocated
 471   //  objects will be registered, overwriting at most one first_start
  //  and one last_start entry.
 473   //
 474   //  No object registration operations are linear in the length of
 475   //  the registered objects.
 476   //
 477   // Consider further the following observations regarding object
 478   // registration costs:
 479   //
 480   //   1. The cost is paid once for each old-gen object (Except when
 481   //      an object is demoted and repromoted, in which case we would
 482   //      pay the cost again).
 483   //   2. The cost can be deferred so that there is no urgency during
 484   //      mutator copy-on-first-access promotion.  Background GC
 485   //      threads will update the object_starts array by post-
 486   //      processing the contents of retired PLAB buffers.
 487   //   3. The bet is that these costs are paid relatively rarely
 488   //      because:
 489   //      a) Most objects die young and objects that die in young-gen
 490   //         memory never need to be registered with the object_starts
 491   //         array.
 492   //      b) Most objects that are promoted into old-gen memory live
 493   //         there without further relocation for a relatively long
 494   //         time, so we get a lot of benefit from each investment
 495   //         in registering an object.
 496 
 497 public:
 498 
 499   // The starting locations of objects contained within old-gen memory
 500   // are registered as part of the remembered set implementation.  This
 501   // information is required when scanning dirty card regions that are
 502   // spanned by objects beginning within preceding card regions.  It
 503   // is necessary to find the first and last objects that begin within
 504   // this card region.  Starting addresses of objects are required to
 505   // find the object headers, and object headers provide information
 506   // about which fields within the object hold addresses.
 507   //
 508   // The old-gen memory allocator invokes register_object() for any
 509   // object that is allocated within old-gen memory.  This identifies
 510   // the starting addresses of objects that span boundaries between
 511   // card regions.
 512   //
 513   // It is not necessary to invoke register_object at the very instant
 514   // an object is allocated.  It is only necessary to invoke it
 515   // prior to the next start of a garbage collection concurrent mark
 516   // or concurrent update-references phase.  An "ideal" time to register
 517   // objects is during post-processing of a GCLAB after the GCLAB is
 518   // retired due to depletion of its memory.
 519   //
 520   // register_object() does not perform synchronization.  In the case
 521   // that multiple threads are registering objects whose starting
 522   // addresses are within the same cluster, races between these
 523   // threads may result in corruption of the object-start data
 524   // structures.  Parallel GC threads should avoid registering objects
 525   // residing within the same cluster by adhering to the following
 526   // coordination protocols:
 527   //
 528   //  1. Align thread-local GCLAB buffers with some TBD multiple of
 529   //     card clusters.  The card cluster size is 32 KB.  If the
 530   //     desired GCLAB size is 128 KB, align the buffer on a multiple
  //     of 4 card clusters (see the sizing sketch following this list).
 532   //  2. Post-process the contents of GCLAB buffers to register the
 533   //     objects allocated therein.  Allow one GC thread at a
 534   //     time to do the post-processing of each GCLAB.
 535   //  3. Since only one GC thread at a time is registering objects
 536   //     belonging to a particular allocation buffer, no locking
 537   //     is performed when registering these objects.
 538   //  4. Any remnant of unallocated memory within an expended GC
 539   //     allocation buffer is not returned to the old-gen allocation
 540   //     pool until after the GC allocation buffer has been post
 541   //     processed.  Before any remnant memory is returned to the
 542   //     old-gen allocation pool, the GC thread that scanned this GC
 543   //     allocation buffer performs a write-commit memory barrier.
 544   //  5. Background GC threads that perform tenuring of young-gen
 545   //     objects without a GCLAB use a CAS lock before registering
 546   //     each tenured object.  The CAS lock assures both mutual
 547   //     exclusion and memory coherency/visibility.  Note that an
 548   //     object tenured by a background GC thread will not overlap
 549   //     with any of the clusters that are receiving tenured objects
 550   //     by way of GCLAB buffers.  Multiple independent GC threads may
 551   //     attempt to tenure objects into a shared cluster.  This is why
  //     synchronization may be necessary.  Consider the following
 553   //     scenarios:
 554   //
 555   //     a) If two objects are tenured into the same card region, each
 556   //        registration may attempt to modify the first-start or
 557   //        last-start information associated with that card region.
 558   //        Furthermore, because the representations of first-start
 559   //        and last-start information within the object_starts array
  //        entry uses different bits of a shared uint16_t to represent
 561   //        each, it is necessary to lock the entire card entry
 562   //        before modifying either the first-start or last-start
 563   //        information within the entry.
 564   //     b) Suppose GC thread X promotes a tenured object into
 565   //        card region A and this tenured object spans into
 566   //        neighboring card region B.  Suppose GC thread Y (not equal
  //        to X) promotes a tenured object into card region B.  GC thread X
 568   //        will update the object_starts information for card A.  No
 569   //        synchronization is required.
 570   //     c) In summary, when background GC threads register objects
 571   //        newly tenured into old-gen memory, they must acquire a
 572   //        mutual exclusion lock on the card that holds the starting
 573   //        address of the newly tenured object.  This can be achieved
 574   //        by using a CAS instruction to assure that the previous
 575   //        values of first-offset and last-offset have not been
 576   //        changed since the same thread inquired as to their most
 577   //        current values.
 578   //
 579   //     One way to minimize the need for synchronization between
 580   //     background tenuring GC threads is for each tenuring GC thread
 581   //     to promote young-gen objects into distinct dedicated cluster
 582   //     ranges.
 583   //  6. The object_starts information is only required during the
 584   //     starting of concurrent marking and concurrent evacuation
 585   //     phases of GC.  Before we start either of these GC phases, the
 586   //     JVM enters a safe point and all GC threads perform
 587   //     commit-write barriers to assure that access to the
 588   //     object_starts information is coherent.
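  //
  //  A sizing sketch for item 1 above (illustrative only: the 128 KB GCLAB size is a
  //  hypothetical value, and align_up() is the standard alignment helper):
  //
  //    size_t cluster_span_words = CardTable::card_size_in_words() * CardsPerCluster;  // 64 * 64 = 4096 words (32 KB)
  //    size_t desired_lab_words  = (128 * K) / HeapWordSize;                           // hypothetical 128 KB GCLAB
  //    size_t aligned_lab_words  = align_up(desired_lab_words, cluster_span_words);    // a whole number (4) of clusters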
 589 
 590 
 591   // Notes on synchronization of register_object():
 592   //
 593   //  1. For efficiency, there is no locking in the implementation of register_object()
 594   //  2. Thus, it is required that users of this service assure that concurrent/parallel invocations of
  //     register_object() do not pertain to the same card's memory range.  See discussion below to understand
 596   //     the risks.
 597   //  3. When allocating from a TLAB or GCLAB, the mutual exclusion can be guaranteed by assuring that each
 598   //     LAB's start and end are aligned on card memory boundaries.
 599   //  4. Use the same lock that guarantees exclusivity when performing free-list allocation within heap regions.
 600   //
 601   // Register the newly allocated object while we're holding the global lock since there's no synchronization
 602   // built in to the implementation of register_object().  There are potential races when multiple independent
 603   // threads are allocating objects, some of which might span the same card region.  For example, consider
 604   // a card table's memory region within which three objects are being allocated by three different threads:
 605   //
 606   // objects being "concurrently" allocated:
 607   //    [-----a------][-----b-----][--------------c------------------]
 608   //            [---- card table memory range --------------]
 609   //
 610   // Before any objects are allocated, this card's memory range holds no objects.  Note that:
 611   //   allocation of object a wants to set the has-object, first-start, and last-start attributes of the preceding card region.
 612   //   allocation of object b wants to set the has-object, first-start, and last-start attributes of this card region.
 613   //   allocation of object c also wants to set the has-object, first-start, and last-start attributes of this card region.
 614   //
 615   // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
 616   // representing object b while first-start represents object c.  This is why we need to require all register_object()
 617   // invocations associated with objects that are allocated from "free lists" to provide their own mutual exclusion locking
 618   // mechanism.
 619 
 620   // Reset the starts_object() information to false for all cards in the range between from and to.
 621   void reset_object_range(HeapWord *from, HeapWord *to);
 622 
 623   // register_object() requires that the caller hold the heap lock
 624   // before calling it.
 625   void register_object(HeapWord* address);
 626 
 627   // register_object_without_lock() does not require that the caller hold
 628   // the heap lock before calling it, under the assumption that the
  // caller has assured that no other thread will endeavor to concurrently
 630   // register objects that start within the same card's memory region
 631   // as address.
 632   void register_object_without_lock(HeapWord* address);
 633 
 634   // During the reference updates phase of GC, we walk through each old-gen memory region that was
 635   // not part of the collection set and we invalidate all unmarked objects.  As part of this effort,
 636   // we coalesce neighboring dead objects in order to make future remembered set scanning more
 637   // efficient (since future remembered set scanning of any card region containing consecutive
 638   // dead objects can skip over all of them at once by reading only a single dead object header
  // instead of having to read the header of each of the coalesced dead objects).
 640   //
 641   // At some future time, we may implement a further optimization: satisfy future allocation requests
 642   // by carving new objects out of the range of memory that represents the coalesced dead objects.
 643   //
 644   // Suppose we want to combine several dead objects into a single coalesced object.  How does this
 645   // impact our representation of crossing map information?
 646   //  1. If the newly coalesced range is contained entirely within a card range, that card's last
 647   //     start entry either remains the same or it is changed to the start of the coalesced region.
 648   //  2. For the card that holds the start of the coalesced object, it will not impact the first start
 649   //     but it may impact the last start.
 650   //  3. For following cards spanned entirely by the newly coalesced object, it will change starts_object
 651   //     to false (and make first-start and last-start "undefined").
  //  4. For a following card that is spanned partially by the newly coalesced object, it may change the
 653   //     first-start value, but it will not change the last-start value.
 654   //
 655   // The range of addresses represented by the arguments to coalesce_objects() must represent a range
 656   // of memory that was previously occupied exactly by one or more previously registered objects.  For
 657   // convenience, it is legal to invoke coalesce_objects() with arguments that span a single previously
 658   // registered object.
 659   //
 660   // The role of coalesce_objects is to change the crossing map information associated with all of the coalesced
 661   // objects.
 662   void coalesce_objects(HeapWord* address, size_t length_in_words);
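
  // A rough sketch of the crossing-map adjustment described above, assuming the accessors
  // defined earlier in this class; the authoritative logic is coalesce_objects() in
  // shenandoahScanRemembered.inline.hpp:
  //
  //   size_t first_card = _rs->card_index_for_addr(address);
  //   size_t last_card  = _rs->card_index_for_addr(address + length_in_words - 1);
  //   size_t offset     = pointer_delta(address, _rs->addr_for_card_index(first_card));
  //   // Any start previously recorded beyond offset in the first card has been absorbed.
  //   if (get_last_start(first_card) > offset) {
  //     set_last_start(first_card, (uint8_t) offset);
  //   }
  //   // Cards spanned entirely by the coalesced object no longer start any object.
  //   for (size_t c = first_card + 1; c < last_card; c++) {
  //     clear_starts_object_bit(c);
  //   }
  //   // A partially spanned final card may need its first_start adjusted (elided here).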
 663 
 664   // The typical use case is going to look something like this:
 665   //   for each heapregion that comprises old-gen memory
 666   //     for each card number that corresponds to this heap region
 667   //       scan the objects contained therein if the card is dirty
 668   // To avoid excessive lookups in a sparse array, the API queries
 669   // the card number pertaining to a particular address and then uses the
 670   // card number for subsequent information lookups and stores.
 671 
 672   // If starts_object(card_index), this returns the word offset within this card
 673   // memory at which the first object begins.  If !starts_object(card_index), the
 674   // result is a don't care value -- asserts in a debug build.
 675   size_t get_first_start(size_t card_index) const;
 676 
 677   // If starts_object(card_index), this returns the word offset within this card
 678   // memory at which the last object begins.  If !starts_object(card_index), the
 679   // result is a don't care value.
 680   size_t get_last_start(size_t card_index) const;
 681 
 682 
 683   // Given a card_index, return the starting address of the first block in the heap
  // that straddles into the card. If the card is co-initial with an object, then
  // this returns the first address covered by the card (i.e., the start of that object).
 686   // Expects to be called for a card affiliated with the old generation in
 687   // generational mode.
 688   HeapWord* block_start(size_t card_index) const;
 689 };
 690 
 691 // ShenandoahScanRemembered is a concrete class representing the
 692 // ability to scan the old-gen remembered set for references to
 693 // objects residing in young-gen memory.
 694 //
 695 // Scanning normally begins with an invocation of numRegions and ends
 696 // after all clusters of all regions have been scanned.
 697 //
 698 // Throughout the scanning effort, the number of regions does not
 699 // change.
 700 //
 701 // Even though the regions that comprise old-gen memory are not
 702 // necessarily contiguous, the abstraction represented by this class
 703 // identifies each of the old-gen regions with an integer value
 704 // in the range from 0 to (numRegions() - 1) inclusive.
 705 //
 706 
 707 template<typename RememberedSet>
 708 class ShenandoahScanRemembered: public CHeapObj<mtGC> {
 709 
 710 private:
 711   RememberedSet* _rs;
 712   ShenandoahCardCluster<RememberedSet>* _scc;
 713 
 714   // Global card stats (cumulative)
 715   HdrSeq _card_stats_scan_rs[MAX_CARD_STAT_TYPE];
 716   HdrSeq _card_stats_update_refs[MAX_CARD_STAT_TYPE];
 717   // Per worker card stats (multiplexed by phase)
 718   HdrSeq** _card_stats;
 719 
 720   // The types of card metrics that we gather
 721   const char* _card_stats_name[MAX_CARD_STAT_TYPE] = {
 722    "dirty_run", "clean_run",
 723    "dirty_cards", "clean_cards",
 724    "max_dirty_run", "max_clean_run",
 725    "dirty_scan_objs",
 726    "alternations"
 727   };
 728 
 729   // The statistics are collected and logged separately for
 730   // card-scans for initial marking, and for updating refs.
 731   const char* _card_stat_log_type[MAX_CARD_STAT_LOG_TYPE] = {
 732    "Scan Remembered Set", "Update Refs"
 733   };
 734 
 735   int _card_stats_log_counter[2] = {0, 0};
 736 
 737 public:
 738   // How to instantiate this object?
 739   //   ShenandoahDirectCardMarkRememberedSet *rs =
 740   //       new ShenandoahDirectCardMarkRememberedSet();
 741   //   scr = new
  //     ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet>(rs);
 743   //
 744   // or, after the planned implementation of
 745   // ShenandoahBufferWithSATBRememberedSet has been completed:
 746   //
 747   //   ShenandoahBufferWithSATBRememberedSet *rs =
 748   //       new ShenandoahBufferWithSATBRememberedSet();
 749   //   scr = new
  //     ShenandoahScanRemembered<ShenandoahBufferWithSATBRememberedSet>(rs);
 751 
 752 
 753   ShenandoahScanRemembered(RememberedSet *rs) {
 754     _rs = rs;
 755     _scc = new ShenandoahCardCluster<RememberedSet>(rs);
 756 
 757     // We allocate ParallelGCThreads worth even though we usually only
 758     // use up to ConcGCThreads, because degenerate collections may employ
 759     // ParallelGCThreads for remembered set scanning.
 760     if (ShenandoahEnableCardStats) {
 761       _card_stats = NEW_C_HEAP_ARRAY(HdrSeq*, ParallelGCThreads, mtGC);
 762       for (uint i = 0; i < ParallelGCThreads; i++) {
 763         _card_stats[i] = new HdrSeq[MAX_CARD_STAT_TYPE];
 764       }
 765     } else {
 766       _card_stats = nullptr;
 767     }
 768   }
 769 
 770   ~ShenandoahScanRemembered() {
 771     delete _scc;
 772     if (ShenandoahEnableCardStats) {
 773       for (uint i = 0; i < ParallelGCThreads; i++) {
 774         delete _card_stats[i];
 775       }
 776       FREE_C_HEAP_ARRAY(HdrSeq*, _card_stats);
 777       _card_stats = nullptr;
 778     }
 779     assert(_card_stats == nullptr, "Error");
 780   }
 781 
 782   HdrSeq* card_stats(uint worker_id) {
 783     assert(worker_id < ParallelGCThreads, "Error");
 784     assert(ShenandoahEnableCardStats == (_card_stats != nullptr), "Error");
 785     return ShenandoahEnableCardStats ? _card_stats[worker_id] : nullptr;
 786   }
 787 
 788   HdrSeq* card_stats_for_phase(CardStatLogType t) {
 789     switch (t) {
 790       case CARD_STAT_SCAN_RS:
 791         return _card_stats_scan_rs;
 792       case CARD_STAT_UPDATE_REFS:
 793         return _card_stats_update_refs;
 794       default:
 795         guarantee(false, "No such CardStatLogType");
 796     }
 797     return nullptr; // Quiet compiler
 798   }
 799 
 800   // TODO:  We really don't want to share all of these APIs with arbitrary consumers of the ShenandoahScanRemembered abstraction.
 801   // But in the spirit of quick and dirty for the time being, I'm going to go ahead and publish everything for right now.  Some
  // of the existing code already depends on having access to these services (because existing code has not been written to honor
  // full abstraction of remembered set scanning).  In the not too distant future, we want to try to make most, if not all, of
 804   // these services private.  Two problems with publicizing:
 805   //  1. Allowing arbitrary users to reach beneath the hood allows the users to make assumptions about underlying implementation.
 806   //     This will make it more difficult to change underlying implementation at a future time, such as when we eventually experiment
 807   //     with SATB-based implementation of remembered set representation.
 808   //  2. If we carefully control sharing of certain of these services, we can reduce the overhead of synchronization by assuring
 809   //     that all users follow protocols that avoid contention that might require synchronization.  When we publish these APIs, we
 810   //     lose control over who and how the data is accessed.  As a result, we are required to insert more defensive measures into
 811   //     the implementation, including synchronization locks.
 812 
 813 
 814   // Card index is zero-based relative to first spanned card region.
 815   size_t last_valid_index();
 816   size_t total_cards();
 817   size_t card_index_for_addr(HeapWord *p);
 818   HeapWord *addr_for_card_index(size_t card_index);
 819   bool is_card_dirty(size_t card_index);
 820   bool is_write_card_dirty(size_t card_index) { return _rs->is_write_card_dirty(card_index); }
 821   void mark_card_as_dirty(size_t card_index);
 822   void mark_range_as_dirty(size_t card_index, size_t num_cards);
 823   void mark_card_as_clean(size_t card_index);
  void mark_read_card_as_clean(size_t card_index) { _rs->mark_read_card_as_clean(card_index); }
 825   void mark_range_as_clean(size_t card_index, size_t num_cards);
 826   bool is_card_dirty(HeapWord *p);
 827   void mark_card_as_dirty(HeapWord *p);
 828   void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
 829   void mark_card_as_clean(HeapWord *p);
 830   void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
 831   size_t cluster_count();
 832 
 833   // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
 834   void swap_remset() { _rs->swap_remset(); }
 835 
 836   void reset_remset(HeapWord* start, size_t word_count) { _rs->reset_remset(start, word_count); }
 837 
 838   void merge_write_table(HeapWord* start, size_t word_count) { _rs->merge_write_table(start, word_count); }
 839 
 840   // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
 841   void clear_old_remset() { _rs->clear_old_remset(); }
 842 
 843   size_t cluster_for_addr(HeapWord *addr);
 844   HeapWord* addr_for_cluster(size_t cluster_no);
 845 
 846   void reset_object_range(HeapWord *from, HeapWord *to);
 847   void register_object(HeapWord *addr);
 848   void register_object_without_lock(HeapWord *addr);
 849   void coalesce_objects(HeapWord *addr, size_t length_in_words);
 850 
 851   HeapWord* first_object_in_card(size_t card_index) {
 852     if (_scc->starts_object(card_index)) {
 853       return addr_for_card_index(card_index) + _scc->get_first_start(card_index);
 854     } else {
 855       return nullptr;
 856     }
 857   }
 858 
 859   // Return true iff this object is "properly" registered.
 860   bool verify_registration(HeapWord* address, ShenandoahMarkingContext* ctx);
 861 
 862   // clear the cards to clean, and clear the object_starts info to no objects
 863   void mark_range_as_empty(HeapWord *addr, size_t length_in_words);
 864 
 865   // process_clusters() scans a portion of the remembered set
 866   // for references from old gen into young. Several worker threads
 867   // scan different portions of the remembered set by making parallel invocations
 868   // of process_clusters() with each invocation scanning different
 869   // "clusters" of the remembered set.
 870   //
 871   // An invocation of process_clusters() examines all of the
 872   // intergenerational references spanned by `count` clusters starting
 873   // with `first_cluster`.  The `oops` argument is a worker-thread-local
 874   // OopClosure that is applied to all "valid" references in the remembered set.
 875   //
 876   // A side-effect of executing process_clusters() is to update the remembered
 877   // set entries (e.g. marking dirty cards clean if they no longer
 878   // hold references to young-gen memory).
 879   //
 880   // An implementation of process_clusters() may choose to efficiently
 881   // address more typical scenarios in the structure of remembered sets. E.g.
 882   // in the generational setting, one might expect remembered sets to be very sparse
 883   // (low mutation rates in the old generation leading to sparse dirty cards,
 884   // each with very few intergenerational pointers). Specific implementations
 885   // may choose to degrade gracefully as the sparsity assumption fails to hold,
 886   // such as when there are sudden spikes in (premature) promotion or in the
 887   // case of an underprovisioned, poorly-tuned, or poorly-shaped heap.
 888   //
 889   // At the start of a concurrent young generation marking cycle, we invoke process_clusters
 890   // with ClosureType ShenandoahInitMarkRootsClosure.
 891   //
 892   // At the start of a concurrent evacuation phase, we invoke process_clusters with
 893   // ClosureType ShenandoahEvacuateUpdateRootsClosure.
 894 
 895   // All template expansions require methods to be defined in the inline.hpp file, but larger
 896   // such methods need not be declared as inline.
 897   template <typename ClosureType>
 898   void process_clusters(size_t first_cluster, size_t count, HeapWord *end_of_range, ClosureType *oops,
 899                                bool use_write_table, uint worker_id);
 900 
 901   template <typename ClosureType>
 902   inline void process_humongous_clusters(ShenandoahHeapRegion* r, size_t first_cluster, size_t count,
 903                                          HeapWord *end_of_range, ClosureType *oops, bool use_write_table);
 904 
 905   template <typename ClosureType>
 906   inline void process_region_slice(ShenandoahHeapRegion* region, size_t offset, size_t clusters, HeapWord* end_of_range,
 907                                    ClosureType *cl, bool use_write_table, uint worker_id);
 908 
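  // A sketch of how a worker might drive these methods, assuming its work list is a
  // ShenandoahRegionChunkIterator (declared later in this file) and that chunk sizes are
  // whole multiples of the cluster span; the actual driver is ShenandoahScanRememberedTask,
  // declared near the end of this file:
  //
  //   size_t cluster_span_words = CardTable::card_size_in_words() *
  //                               ShenandoahCardCluster<RememberedSet>::CardsPerCluster;
  //   ShenandoahRegionChunk chunk;
  //   while (work_list->next(&chunk)) {
  //     size_t clusters = chunk._chunk_size / cluster_span_words;
  //     process_region_slice(chunk._r, chunk._chunk_offset, clusters, chunk._r->end(),
  //                          cl, use_write_table, worker_id);
  //   }
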
 909   // To Do:
 910   //  Create subclasses of ShenandoahInitMarkRootsClosure and
 911   //  ShenandoahEvacuateUpdateRootsClosure and any other closures
 912   //  that need to participate in remembered set scanning.  Within the
 913   //  subclasses, add a (probably templated) instance variable that
 914   //  refers to the associated ShenandoahCardCluster object.  Use this
 915   //  ShenandoahCardCluster instance to "enhance" the do_oops
 916   //  processing so that we can:
 917   //
 918   //   1. Avoid processing references that correspond to clean card
 919   //      regions, and
 920   //   2. Set card status to CLEAN when the associated card region no
  //      longer holds inter-generational references.
 922   //
 923   //  To enable efficient implementation of these behaviors, we
 924   //  probably also want to add a few fields into the
 925   //  ShenandoahCardCluster object that allow us to precompute and
 926   //  remember the addresses at which card status is going to change
 927   //  from dirty to clean and clean to dirty.  The do_oops
 928   //  implementations will want to update this value each time they
 929   //  cross one of these boundaries.
 930   void roots_do(OopIterateClosure* cl);
 931 
 932   // Log stats related to card/RS stats for given phase t
 933   void log_card_stats(uint nworkers, CardStatLogType t) PRODUCT_RETURN;
 934 private:
  // Log stats for the given worker id and fold them into the given summary card/RS stats
 936   void log_worker_card_stats(uint worker_id, HdrSeq* sum_stats) PRODUCT_RETURN;
 937 
 938   // Log given stats
 939   inline void log_card_stats(HdrSeq* stats) PRODUCT_RETURN;
 940 
  // Merge the stats from worker_id into the given summary stats, and clear the worker_id's stats.
 942   void merge_worker_card_stats_cumulative(HdrSeq* worker_stats, HdrSeq* sum_stats) PRODUCT_RETURN;
 943 };
 944 
 945 
 946 // A ShenandoahRegionChunk represents a contiguous interval of a ShenandoahHeapRegion, typically representing
 947 // work to be done by a worker thread.
 948 struct ShenandoahRegionChunk {
 949   ShenandoahHeapRegion *_r;      // The region of which this represents a chunk
 950   size_t _chunk_offset;          // HeapWordSize offset
 951   size_t _chunk_size;            // HeapWordSize qty
 952 };
 953 
 954 // ShenandoahRegionChunkIterator divides the total remembered set scanning effort into ShenandoahRegionChunks
 955 // that are assigned one at a time to worker threads. (Here, we use the terms `assignments` and `chunks`
 956 // interchangeably.) Note that the effort required to scan a range of memory is not necessarily a linear
 957 // function of the size of the range.  Some memory ranges hold only a small number of live objects.
 958 // Some ranges hold primarily primitive (non-pointer) data.  We start with larger chunk sizes because larger chunks
 959 // reduce coordination overhead.  We expect that the GC worker threads that receive more difficult assignments
// will work longer on those chunks.  Meanwhile, other worker threads will repeatedly accept and complete multiple
 961 // easier chunks.  As the total amount of work remaining to be completed decreases, we decrease the size of chunks
// given to individual threads.  This reduces the likelihood that, near the end of the effort, some
// workers sit idle with no meaningful work remaining while they wait for workers with difficult
// assignments to finish, and thereby reduces the overall duration of the phase.
 965 
 966 class ShenandoahRegionChunkIterator : public StackObj {
 967 private:
 968   // The largest chunk size is 4 MiB, measured in words.  Otherwise, remembered set scanning may become too unbalanced.
 969   // If the largest chunk size is too small, there is too much overhead sifting out assignments to individual worker threads.
 970   static const size_t _maximum_chunk_size_words = (4 * 1024 * 1024) / HeapWordSize;
 971 
 972   static const size_t _clusters_in_smallest_chunk = 4;
 973 
  // smallest_chunk_size is 4 clusters.  Each cluster spans 32 KiB, so the smallest chunk spans 128 KiB.
  // The cluster span is computed from CardTable::card_size_in_words() *
  //      ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster;
 977   static size_t smallest_chunk_size_words() {
 978       return _clusters_in_smallest_chunk * CardTable::card_size_in_words() *
 979              ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster;
 980   }
 981 
 982   // The total remembered set scanning effort is divided into chunks of work that are assigned to individual worker tasks.
 983   // The chunks of assigned work are divided into groups, where the size of the typical group (_regular_group_size) is half the
 984   // total number of regions.  The first group may be larger than
 985   // _regular_group_size in the case that the first group's chunk
 986   // size is less than the region size.  The last group may be larger
 987   // than _regular_group_size because no group is allowed to
 988   // have smaller assignments than _smallest_chunk_size, which is 128 KB.
 989 
  // Under normal circumstances, no configuration needs more than _maximum_groups (currently 6).
  // The first group processes chunks of the largest size, 4 MiB (or smaller for smaller region sizes).
  // The last group processes chunks of size 128 KiB.  There are at most _maximum_groups groups in total.
 993 
 994   // group[0] is 4 MiB chunk size (_maximum_chunk_size_words)
 995   // group[1] is 2 MiB chunk size
 996   // group[2] is 1 MiB chunk size
 997   // group[3] is 512 KiB chunk size
 998   // group[4] is 256 KiB chunk size
  // group[5] is 128 KiB chunk size (_smallest_chunk_size_words = 4 * 64 * 64 words)
1000   static const size_t _maximum_groups = 6;
1001 
1002   const ShenandoahHeap* _heap;
1003 
1004   const size_t _regular_group_size;                        // Number of chunks in each group
1005   const size_t _first_group_chunk_size_b4_rebalance;
1006   const size_t _num_groups;                        // Number of groups in this configuration
1007   const size_t _total_chunks;
1008 
1009   shenandoah_padding(0);
1010   volatile size_t _index;
1011   shenandoah_padding(1);
1012 
1013   size_t _region_index[_maximum_groups];           // The region index for the first region spanned by this group
  size_t _group_offset[_maximum_groups];           // The offset at which the group begins within the first region it spans
1015   size_t _group_chunk_size[_maximum_groups];       // The size of each chunk within this group
1016   size_t _group_entries[_maximum_groups];          // Total chunks spanned by this group and the ones before it.
1017 
1018   // No implicit copying: iterators should be passed by reference to capture the state
1019   NONCOPYABLE(ShenandoahRegionChunkIterator);
1020 
1021   // Makes use of _heap.
1022   size_t calc_regular_group_size();
1023 
1024   // Makes use of _regular_group_size, which must be initialized before call.
1025   size_t calc_first_group_chunk_size_b4_rebalance();
1026 
1027   // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before call.
1028   size_t calc_num_groups();
1029 
1030   // Makes use of _regular_group_size, _first_group_chunk_size_b4_rebalance, which must be initialized before call.
1031   size_t calc_total_chunks();
1032 
1033 public:
1034   ShenandoahRegionChunkIterator(size_t worker_count);
1035   ShenandoahRegionChunkIterator(ShenandoahHeap* heap, size_t worker_count);
1036 
1037   // Reset iterator to default state
1038   void reset();
1039 
1040   // Fills in assignment with next chunk of work and returns true iff there is more work.
1041   // Otherwise, returns false.  This is multi-thread-safe.
1042   inline bool next(struct ShenandoahRegionChunk *assignment);
1043 
1044   // This is *not* MT safe. However, in the absence of multithreaded access, it
1045   // can be used to determine if there is more work to do.
1046   inline bool has_next() const;
1047 };
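
// Typical use of the iterator (a sketch): workers claim chunks from a shared iterator
// until it is exhausted; next() is safe to call from multiple threads concurrently.
//
//   ShenandoahRegionChunkIterator work_list(nworkers);
//   ShenandoahRegionChunk chunk;
//   while (work_list.next(&chunk)) {
//     // scan chunk._r starting at word offset chunk._chunk_offset for chunk._chunk_size words
//   }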
1048 
1049 typedef ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet> RememberedScanner;
1050 
1051 class ShenandoahScanRememberedTask : public WorkerTask {
1052  private:
1053   ShenandoahObjToScanQueueSet* _queue_set;
1054   ShenandoahObjToScanQueueSet* _old_queue_set;
1055   ShenandoahReferenceProcessor* _rp;
1056   ShenandoahRegionChunkIterator* _work_list;
1057   bool _is_concurrent;
1058 
1059  public:
1060   ShenandoahScanRememberedTask(ShenandoahObjToScanQueueSet* queue_set,
1061                                ShenandoahObjToScanQueueSet* old_queue_set,
1062                                ShenandoahReferenceProcessor* rp,
1063                                ShenandoahRegionChunkIterator* work_list,
1064                                bool is_concurrent);
1065 
1066   void work(uint worker_id);
1067   void do_work(uint worker_id);
1068 };
1069 
1070 // Verify that the oop doesn't point into the young generation
1071 class ShenandoahVerifyNoYoungRefsClosure: public BasicOopIterateClosure {
1072   ShenandoahHeap* _heap;
1073   template<class T> void work(T* p);
1074 
1075  public:
1076   ShenandoahVerifyNoYoungRefsClosure();
1077 
1078   virtual void do_oop(narrowOop* p) { work(p); }
1079   virtual void do_oop(oop* p)       { work(p); }
1080 };
1081 
1082 #endif // SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP