/*
 * Copyright (c) 2021, Amazon.com, Inc. or its affiliates.  All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP

// Terminology used within this source file:
//
// Card Entry:   This is the information that identifies whether a
//               particular card-table entry is Clean or Dirty.  A clean
//               card entry denotes that the associated memory does not
//               hold references to young-gen memory.
//
// Card Region, aka
// Card Memory:  This is the region of memory that is associated with a
//               particular card entry.
//
// Card Cluster: A card cluster represents 64 card entries.  A card
//               cluster is the minimal amount of work performed at a
//               time by a parallel thread.  Note that the work required
//               to scan a card cluster is somewhat variable in that the
//               required effort depends on how many cards are dirty, how
//               many references are held within the objects that span a
//               DIRTY card's memory, and on the size of the object
//               that spans the end of a DIRTY card's memory (because
//               that object will be scanned in its entirety). For these
//               reasons, it is advisable for the multiple worker threads
//               to be flexible in the number of clusters to be
//               processed by each thread.
//
// A cluster represents a "natural" quantum of work to be performed by
// a parallel GC thread's background remembered set scanning efforts.
// The notion of cluster is similar to the notion of stripe in the
// implementation of parallel GC card scanning.  However, a cluster is
// typically smaller than a stripe, enabling finer grain division of
// labor between multiple threads.
//
// For illustration, consider the following possible JVM configurations:
//
//   Scenario 1:
//     RegionSize is 128 MB
//     Span of a card entry is 512 B
//     Each card table entry consumes 1 B
//     Assume one long word of card table entries represents a cluster.
//       This long word holds 8 card table entries, spanning a
//       total of 4 KB
//     The number of clusters per region is 128 MB / 4 KB = 32K
//
//   Scenario 2:
//     RegionSize is 128 MB
//     Span of each card entry is 128 B
//     Each card table entry consumes 1 bit
//     Assume one int word of card table entries represents a cluster.
//       This int word holds 32 card table entries, spanning a
//       total of 4 KB
//     The number of clusters per region is 128 MB / 4 KB = 32K
//
//   Scenario 3:
//     RegionSize is 128 MB
//     Span of each card entry is 512 B
//     Each card table entry consumes 1 bit
//     Assume one long word of card table entries represents a cluster.
//       This long word holds 64 card table entries, spanning a
//       total of 32 KB
//     The number of clusters per region is 128 MB / 32 KB = 4K
//
// At the start of a new young-gen concurrent mark pass, the gang of
// Shenandoah worker threads collaborate in performing the following
// actions:
//
//  Let old_regions = number of ShenandoahHeapRegions comprising
//    old-gen memory
//  Let region_size = ShenandoahHeapRegion::region_size_bytes()
//    represent the number of bytes in each region
//  Let clusters_per_region = region_size / (512 * 64), since each
//    cluster spans 64 card regions of 512 B each (32 KB)
//  Let rs represent the relevant RememberedSet implementation
//    (an instance of ShenandoahDirectCardMarkRememberedSet or an instance
//     of a to-be-implemented ShenandoahBufferWithSATBRememberedSet)
//
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread initializes each
//      cluster of overreach memory by invoking:
//
//        rs->initialize_overreach(cluster_no, cluster_count)
//
//      in separate threads.  (Divide up the clusters so that
//      different threads are responsible for initializing different
//      clusters.  Initialization cost is essentially identical for
//      each cluster.)
//
//  Next, we repeat the process for invocations of process_clusters.
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread processes each
//      cluster, each thread making a series of invocations of the
//      following:
//
//        rs->process_clusters(worker_id, ReferenceProcessor *,
//                             ShenandoahConcurrentMark *, cluster_no, cluster_count,
//                             HeapWord *end_of_range, OopClosure *oops);
//
//  For efficiency, divide up the clusters so that different threads
//  are responsible for processing different clusters.  Processing costs
//  may vary greatly between clusters for the following reasons:
//
//        a) some clusters contain mostly dirty cards and other
//           clusters contain mostly clean cards
//        b) some clusters contain mostly primitive data and other
//           clusters contain mostly reference data
//        c) some clusters are spanned by very large objects that
//           begin in some other cluster.  When a large object
//           beginning in a preceding cluster spans large portions of
//           this cluster, the processing of this cluster gets a
//           "free ride" because the thread responsible for processing
//           the cluster that holds the object's header does the
//           processing.
//        d) in the case that the end of this cluster is spanned by a
//           very large object, the processing of this cluster will
//           be responsible for examining the entire object,
//           potentially requiring this thread to process large amounts
//           of memory pertaining to other clusters.
//
// Though an initial division of labor between marking threads may
// assign equal numbers of clusters to be scanned by each thread, it
// should be expected that some threads will finish their assigned
// work before others.  Therefore, some amount of the full remembered
// set scanning effort should be held back and assigned incrementally
// to the threads that end up with excess capacity.  Consider the
// following strategy for dividing labor:
//
//        1. Assume there are 8 marking threads and 1024 remembered
//           set clusters to be scanned.
//        2. Assign each thread to scan 64 clusters.  This leaves
//           512 (1024 - (8 * 64)) clusters still to be scanned.
//        3. As the 8 server threads complete previous cluster
//           scanning assignments, issue each of the next 8 scanning
//           assignments as units of 32 additional clusters each.
//           In the case that there is high variance in effort
//           associated with previous cluster scanning assignments,
//           multiples of these next assignments may be serviced by
//           the server threads that were previously assigned lighter
//           workloads.
//        4. Make subsequent scanning assignments as follows:
//             a) 8 assignments of size 16 clusters
//             b) 8 assignments of size 8 clusters
//             c) 16 assignments of size 4 clusters
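//
//           (Arithmetic check: 8*64 + 8*32 + 8*16 + 8*8 + 16*4
//            = 512 + 256 + 128 + 64 + 64 = 1024 clusters, so this
//            schedule covers all of the work exactly.)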
//
//    When there is no more remembered set processing work to be
//    assigned to a newly idled worker thread, that thread can move
//    on to work on other tasks associated with root scanning until such
//    time as all clusters have been examined.
//
//  Once all clusters have been processed, the gang of GC worker
//  threads collaborate to merge the overreach data.
//
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread merges each
//      cluster of overreach memory by invoking:
//
//        rs->merge_overreach(cluster_no, cluster_count)
//
//      in separate threads.  (Divide up the clusters so that
//      different threads are responsible for merging different
//      clusters.  Merging cost is essentially identical for
//      each cluster.)
//
// Though remembered set scanning is designed to run concurrently with
// mutator threads, the current implementation of remembered set
// scanning runs in parallel during a GC safepoint.  Furthermore, the
// current implementation of remembered set scanning never clears a
// card once it has been marked.  Since the current implementation
// never clears marked cards, the current implementation does not
// invoke initialize_overreach() or merge_overreach().
//
// These limitations will be addressed in future enhancements to the
// existing implementation.
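//
// As a hedged sketch only (not the shipping code path), the worker loop
// that consumes cluster-scanning assignments might look like the
// following.  The helper name scan_remembered_set_work() and the scanner
// variable are illustrative assumptions; ShenandoahRegionChunkIterator,
// ShenandoahRegionChunk, and process_region_slice() are declared later
// in this file.
//
//   void scan_remembered_set_work(RememberedScanner* scanner,
//                                 ShenandoahRegionChunkIterator* work_list,
//                                 OopIterateClosure* cl, bool is_concurrent) {
//     ShenandoahRegionChunk assignment;
//     // Each worker repeatedly claims the next chunk of clusters until
//     // the shared iterator is exhausted.
//     while (work_list->next(&assignment)) {
//       ShenandoahHeapRegion* region = assignment._r;
//       HeapWord* end_of_range =
//           region->bottom() + assignment._chunk_offset + assignment._chunk_size;
//       size_t clusters = assignment._chunk_size /
//           (CardTable::card_size_in_words() *
//            ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster);
//       scanner->process_region_slice(region, assignment._chunk_offset, clusters,
//                                     end_of_range, cl,
//                                     false /* use_write_table */, is_concurrent);
//     }
//   }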

#include <stdint.h>
#include "gc/shared/workerThread.hpp"
#include "gc/shenandoah/shenandoahCardTable.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "gc/shenandoah/shenandoahTaskqueue.hpp"
#include "memory/iterator.hpp"

class ShenandoahReferenceProcessor;
class ShenandoahConcurrentMark;
class ShenandoahHeap;
class ShenandoahRegionIterator;
class ShenandoahMarkingContext;

class CardTable;

class ShenandoahDirectCardMarkRememberedSet: public CHeapObj<mtGC> {

private:

  // Use symbolic constants defined in cardTable.hpp
  //  CardTable::card_shift = 9;
  //  CardTable::card_size = 512;
  //  CardTable::card_size_in_words = 64;

  //  CardTable::clean_card_val()
  //  CardTable::dirty_card_val()

  ShenandoahHeap *_heap;
  ShenandoahCardTable *_card_table;
  size_t _card_shift;
  size_t _total_card_count;
  size_t _cluster_count;
  HeapWord *_whole_heap_base;   // Points to first HeapWord of data contained within heap memory
  uint8_t *_byte_map;           // Points to first entry within the card table
  uint8_t *_byte_map_base;      // Points to byte_map minus the bias computed from address of heap memory

public:
  // total_card_count is the number of cards represented by the card table.
  ShenandoahDirectCardMarkRememberedSet(ShenandoahCardTable *card_table, size_t total_card_count);
  ~ShenandoahDirectCardMarkRememberedSet();

  // Card index is zero-based relative to _byte_map.
  size_t last_valid_index();
  size_t total_cards();
  size_t card_index_for_addr(HeapWord *p);
  HeapWord *addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index);
  void mark_card_as_dirty(size_t card_index);
  void mark_range_as_dirty(size_t card_index, size_t num_cards);
  void mark_card_as_clean(size_t card_index);
  void mark_read_card_as_clean(size_t card_index);
  void mark_range_as_clean(size_t card_index, size_t num_cards);
  bool is_card_dirty(HeapWord *p);
  void mark_card_as_dirty(HeapWord *p);
  void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
  void mark_card_as_clean(HeapWord *p);
  void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
  size_t cluster_count();

  // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
  // Not currently used because the mutator write barrier does not honor changes to the location of the card table.
  void swap_remset() {  _card_table->swap_card_tables(); }

  void merge_write_table(HeapWord* start, size_t word_count) {
    size_t card_index = card_index_for_addr(start);
    size_t num_cards = word_count / CardTable::card_size_in_words();
    size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardTable::CardValue));
    intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
    intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
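    // Assuming the standard CardTable encoding (clean cards are 0xff per
    // byte, dirty cards are 0x00), the bitwise AND below propagates every
    // dirty card from the write table into the read table, one word of
    // card values at a time.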
    for (size_t i = 0; i < iterations; i++) {
      intptr_t card_value = *write_table_ptr;
      *read_table_ptr++ &= card_value;
      write_table_ptr++;
    }
  }

  // Instead of swap_remset, the current implementation of concurrent remembered set scanning does reset_remset
  // in parallel threads, each invocation processing one entire HeapRegion at a time.  Processing of a region
  // consists of copying the write table to the read table and cleaning the write table.
  void reset_remset(HeapWord* start, size_t word_count) {
    size_t card_index = card_index_for_addr(start);
    size_t num_cards = word_count / CardTable::card_size_in_words();
    size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardTable::CardValue));
    intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
    intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
    for (size_t i = 0; i < iterations; i++) {
      *read_table_ptr++ = *write_table_ptr;
      *write_table_ptr++ = CardTable::clean_card_row_val();
    }
  }

  // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
  void clear_old_remset() {  _card_table->clear_read_table(); }

};

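// A minimal usage sketch, assuming the card table and card count have
// been sized elsewhere (card_table, total_card_count, and p are
// hypothetical names):
//
//   ShenandoahDirectCardMarkRememberedSet* rs =
//       new ShenandoahDirectCardMarkRememberedSet(card_table, total_card_count);
//   size_t idx = rs->card_index_for_addr(p);  // p points into old-gen memory
//   if (!rs->is_card_dirty(idx)) {
//     rs->mark_card_as_dirty(idx);            // remember an old-to-young store
//   }
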
// A ShenandoahCardCluster represents the minimal unit of work
// performed by independent parallel GC threads during scanning of
// remembered sets.
//
// The GC threads that perform card-table remembered set scanning may
// overwrite card-table entries to mark them as clean in the case that
// the associated memory no longer holds references to young-gen
// memory.  Rather than access the card-table entries directly, all GC
// thread access to card-table information is made by way of the
// ShenandoahCardCluster data abstraction.  This abstraction
// effectively manages access to multiple possible underlying
// remembered set implementations, including a traditional card-table
// approach and a SATB-based approach.
//
// The API services represent a compromise between efficiency and
// convenience.
//
// In the initial implementation, we assume that scanning of card
// table entries occurs only while the JVM is at a safepoint.  Thus,
// there is no synchronization required between GC threads that are
// scanning card-table entries and marking certain entries that were
// previously dirty as clean, and mutator threads which would possibly
// be marking certain card-table entries as dirty.
//
// There is however a need to implement concurrency control and memory
// coherency between multiple GC threads that scan the remembered set
// in parallel.  The desire is to divide the complete scanning effort
// into multiple clusters of work that can be independently processed
// by individual threads without need for synchronizing efforts
// between the work performed by each task.  The term "cluster" of
// work is similar to the term "stripe" as used in the implementation
// of Parallel GC.
//
// Complexity arises when an object to be scanned crosses the boundary
// between adjacent cluster regions.  Here is the protocol that is
// followed:
//
//  1. We implement a supplemental data structure known as the overreach
//     card table.  The thread that is responsible for scanning each
//     cluster of card-table entries is granted exclusive access to
//     modify the associated card-table entries.  In the case that a
//     thread scans a very large object that reaches into one or more
//     following clusters, that thread has exclusive access to the
//     overreach card table for all of the entries belonging to the
//     following clusters that are spanned by this large object.
//     After all clusters have been scanned, the scanning threads
//     briefly synchronize to merge the contents of the overreach
//     entries with the traditional card table entries using logical-
//     and operations.
//  2. Every object is scanned in its "entirety" by the thread that is
//     responsible for the cluster that holds its starting address.
//     Entirety is in quotes because there are various situations in
//     which some portions of the object will not be scanned by this
//     thread:
//     a) If an object spans multiple card regions, all of which are
//        contained within the same cluster, the scanning thread
//        consults the existing card-table entries and does not scan
//        portions of the object that are not currently dirty.
//     b) For any cluster that is spanned in its entirety by a very
//        large object, the GC thread that scans this object assumes
//        full responsibility for maintenance of the associated
//        card-table entries.
//     c) If a cluster is partially spanned by an object originating
//        in a preceding cluster, the portion of the object that
//        partially spans the following cluster is scanned in its
//        entirety (because the thread that is responsible for
//        scanning the object cannot rely upon the card-table entries
//        associated with the following cluster).  Whenever references
//        to young-gen memory are found within the scanned data, the
//        associated overreach card table entries are marked as dirty
//        by the scanning thread.
//  3. If a cluster is spanned in its entirety by an object that
//     originates within a preceding cluster's memory, the thread
//     assigned to examine this cluster does absolutely nothing.  The
//     thread assigned to scan the cluster that holds the object's
//     starting address takes full responsibility for scanning the
//     entire object and updating the associated card-table entries.
//  4. If a cluster is spanned partially by an object that originates
//     within a preceding cluster's memory, the thread assigned to
//     examine this cluster marks the card-table entry as clean for
//     each card-table entry that is fully spanned by this overreaching
//     object.  If a card-table entry's memory is partially spanned
//     by the overreaching object, the thread sets the card-table
//     entry to clean if it was previously dirty and if the portion
//     of the card-table entry's memory that is not spanned by the
//     overreaching object does not hold pointers to young-gen
//     memory.
//  5. While examining a particular card belonging to a particular
//     cluster, if an object reaches beyond the end of its card
//     memory, the thread "scans" all portions of the object that
//     correspond to DIRTY card entries within the current cluster and
//     all portions of the object that reach into following clusters.
//     After this object is scanned, continue scanning with the memory
//     that follows this object if this memory pertains to the same
//     cluster.  Otherwise, consider this cluster's memory to have
//     been fully examined.
//
// Discussion:
//  Though this design results from careful consideration of multiple
//  design objectives, it is subject to various criticisms.  Some
//  discussion of the design choices is provided here:
//
//  1. Note that remembered sets are a heuristic technique to avoid
//     the need to scan all of old-gen memory with each young-gen
//     collection.  If we sometimes scan a bit more memory than is
//     absolutely necessary, that should be considered a reasonable
//     compromise.  This compromise is already present in the sizing
//     of card table memory areas.  Note that a single dirty pointer
//     within a 512-byte card region forces the "unnecessary" scanning
//     of the 63 other pointers that fit within the region
//     ((512 - 8) / 8 = 63).
//  2. One undesirable aspect of this design is that we sometimes have
//     to scan large amounts of memory belonging to very large
//     objects, even for parts of the very large object that do not
//     correspond to dirty card table entries.  Note that this design
//     limits the amount of non-dirty scanning that might have to
//     be performed for these very large objects.  In particular, only
//     the last part of the very large object that extends into but
//     does not completely span a particular cluster is unnecessarily
//     scanned.  Thus, for each very large object, the maximum
//     over-scan is the size of memory spanned by a single cluster.
//  3. The representation of pointer location descriptive information
//     within Klass representations is not designed for efficient
//     "random access".  An alternative approach to this design would
//     be to scan very large objects multiple times, once for each
//     cluster that is spanned by the object's range.  This reduces
//     unnecessary overscan, but it introduces different sorts of
//     overhead effort:
//       i) For each spanned cluster, we have to look up the start of
//          the crossing object.
//      ii) Each time we scan the very large object, we have to
//          sequentially walk through its pointer location
//          descriptors, skipping over all of the pointers that
//          precede the start of the range of addresses that we
//          consider relevant.


// Because old-gen heap memory is not necessarily contiguous, and
// because cards are not necessarily maintained for young-gen memory,
// consecutive card numbers do not necessarily correspond to consecutive
// address ranges.  For the traditional direct-card-marking
// implementation of this interface, consecutive card numbers are
// likely to correspond to contiguous regions of memory, but this
// should not be assumed.  Instead, rely only upon the following:
//
//  1. All card numbers for cards pertaining to the same
//     ShenandoahHeapRegion are consecutively numbered.
//  2. In the case that neighboring ShenandoahHeapRegions both
//     represent old-gen memory, the card regions that span the
//     boundary between these neighboring heap regions will be
//     consecutively numbered.
//  3. (A corollary) In the case that an old-gen object spans the
//     boundary between two heap regions, the card regions that
//     correspond to the span of this object will be consecutively
//     numbered.


// ShenandoahCardCluster abstracts access to the remembered set
// and also keeps track of crossing map information to allow efficient
// resolution of object start addresses.
//
// ShenandoahCardCluster supports all of the services of
// RememberedSet, plus it supports register_object() and lookup_object().
//
// There are two situations under which we need to know the location
// at which the object spanning the start of a particular card-table
// memory region begins:
//
// 1. When we begin to scan dirty card memory that is not the
//    first card region within a cluster, and the object that
//    crosses into this card memory was not previously scanned,
//    we need to find where that object starts so we can scan it.
//    (Asides: if the object starts within a previous cluster, it
//     has already been scanned.  If the object starts within this
//     cluster and it spans at least one card region that is dirty
//     and precedes this card region within the cluster, then it has
//     already been scanned.)
// 2. When we are otherwise done scanning a complete cluster, if the
//    last object within the cluster reaches into the following
//    cluster, we need to scan this object.  Thus, we need to find
//    its starting location.
//
// The RememberedSet template parameter is intended to represent either
//     ShenandoahDirectCardMarkRememberedSet, or a to-be-implemented
//     ShenandoahBufferWithSATBRememberedSet.
template<typename RememberedSet>
class ShenandoahCardCluster: public CHeapObj<mtGC> {

private:
  RememberedSet *_rs;

public:
  static const size_t CardsPerCluster = 64;

private:
  typedef struct cross_map { uint8_t first; uint8_t last; } xmap;
  typedef union crossing_info { uint16_t short_word; xmap offsets; } crossing_info;

  // The ObjectStartsInCardRegion bit is set within crossing_info.offsets.first iff at least one object starts within
  // a particular card region.  We pack this bit into the first byte under the assumption that the first byte is
  // accessed less frequently than the last byte.  This is true when the number of clean cards is greater than the
  // number of dirty cards.
  static const uint16_t ObjectStartsInCardRegion = 0x80;
  static const uint16_t FirstStartBits           = 0x3f;

  crossing_info *object_starts;

public:
  // If we're setting first_start, assume the card has an object.
  inline void set_first_start(size_t card_index, uint8_t value) {
    object_starts[card_index].offsets.first = ObjectStartsInCardRegion | value;
  }

  inline void set_last_start(size_t card_index, uint8_t value) {
    object_starts[card_index].offsets.last = value;
  }

  inline void set_has_object_bit(size_t card_index) {
    object_starts[card_index].offsets.first |= ObjectStartsInCardRegion;
  }

  inline void clear_has_object_bit(size_t card_index) {
    object_starts[card_index].offsets.first &= ~ObjectStartsInCardRegion;
  }

  // Returns true iff an object is known to start within the card memory associated with card card_index.
  inline bool has_object(size_t card_index) {
    return (object_starts[card_index].offsets.first & ObjectStartsInCardRegion) != 0;
  }

  inline void clear_objects_in_range(HeapWord *addr, size_t num_words) {
    size_t card_index = _rs->card_index_for_addr(addr);
    size_t last_card_index = _rs->card_index_for_addr(addr + num_words - 1);
    while (card_index <= last_card_index) {
      object_starts[card_index++].short_word = 0;
    }
  }

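  // A hedged example of the encoding above (card_index is illustrative):
  // if the first object in a card region begins at word offset 5 and the
  // last at word offset 60, then after
  //
  //   set_first_start(card_index, 5);    // offsets.first = 0x80 | 5
  //   set_last_start(card_index, 60);    // offsets.last  = 60
  //
  // has_object(card_index) is true, and the get_first_start() and
  // get_last_start() queries declared below return 5 and 60.
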
  ShenandoahCardCluster(RememberedSet *rs) {
    _rs = rs;
    // TODO: We don't really need object_starts entries for every card entry.  We only need these for
    // the card entries that correspond to old-gen memory.  But for now, let's be quick and dirty.
    object_starts = NEW_C_HEAP_ARRAY(crossing_info, rs->total_cards(), mtGC);
    for (size_t i = 0; i < rs->total_cards(); i++) {
      object_starts[i].short_word = 0;
    }
  }

  ~ShenandoahCardCluster() {
    FREE_C_HEAP_ARRAY(crossing_info, object_starts);
    object_starts = nullptr;
  }

  // There is one entry within the object_starts array for each card entry.
  //
  // In the most recent implementation of ShenandoahScanRemembered::process_clusters(),
  // there is no need for the get_crossing_object_start() method function, so there is no
  // need to maintain the following information.  The comment is left in place for now in
  // case we find it necessary to add support for this service at a later time.
  //
  // Bits 0x7fff: If no object starts within this card region, the
  //              remaining bits of the object_starts array represent
  //              the absolute word offset within the enclosing
  //              cluster's memory of the starting address for the
  //              object that spans the start of this card region's
  //              memory.  If the spanning object begins in memory
  //              that precedes this card region's cluster, the value
  //              stored in these bits is the special value 0x7fff.
  //              (Note that the maximum value required to represent a
  //              spanning object from within the current cluster is
  //              ((63 * 64) - 8), which equals 0x0fb8.)
  //
  // In the absence of the need to support get_crossing_object_start(),
  // here is discussion of performance:
  //
  //  Suppose multiple garbage objects are coalesced during GC sweep
  //  into a single larger "free segment".  As each two objects are
  //  coalesced together, the start information pertaining to the second
  //  object must be removed from the object_starts array.  If the
  //  second object had been the first object within card memory,
  //  the new first object is the object that follows it, if that
  //  object starts within the same card memory, or NoObject if the
  //  following object starts within the following cluster.  If the
  //  second object had been the last object in the card memory,
  //  replace this entry with the newly coalesced object if it starts
  //  within the same card memory, or with NoObject if it starts in a
  //  preceding card's memory.
  //
  //  Suppose a large free segment is divided into a smaller free
  //  segment and a new object.  The second part of the newly divided
  //  memory must be registered as a new object, overwriting at most
  //  one first_start and one last_start entry.  Note that one of the
  //  newly divided two objects might be a new GCLAB.
  //
  //  Suppose postprocessing of a GCLAB finds that the original GCLAB
  //  has been divided into N objects.  Each of the N newly allocated
  //  objects will be registered, overwriting at most one first_start
  //  and one last_start entry.
  //
  //  No object registration operations are linear in the length of
  //  the registered objects.
  //
  // Consider further the following observations regarding object
  // registration costs:
  //
  //   1. The cost is paid once for each old-gen object (except when
  //      an object is demoted and repromoted, in which case we would
  //      pay the cost again).
  //   2. The cost can be deferred so that there is no urgency during
  //      mutator copy-on-first-access promotion.  Background GC
  //      threads will update the object_starts array by post-
  //      processing the contents of retired PLAB buffers.
  //   3. The bet is that these costs are paid relatively rarely
  //      because:
  //      a) Most objects die young and objects that die in young-gen
  //         memory never need to be registered with the object_starts
  //         array.
  //      b) Most objects that are promoted into old-gen memory live
  //         there without further relocation for a relatively long
  //         time, so we get a lot of benefit from each investment
  //         in registering an object.

public:

  // The starting locations of objects contained within old-gen memory
  // are registered as part of the remembered set implementation.  This
  // information is required when scanning dirty card regions that are
  // spanned by objects beginning within preceding card regions.  It
  // is necessary to find the first and last objects that begin within
  // this card region.  Starting addresses of objects are required to
  // find the object headers, and object headers provide information
  // about which fields within the object hold addresses.
  //
  // The old-gen memory allocator invokes register_object() for any
  // object that is allocated within old-gen memory.  This identifies
  // the starting addresses of objects that span boundaries between
  // card regions.
  //
  // It is not necessary to invoke register_object at the very instant
  // an object is allocated.  It is only necessary to invoke it
  // prior to the next start of a garbage collection concurrent mark
  // or concurrent update-references phase.  An "ideal" time to register
  // objects is during post-processing of a GCLAB after the GCLAB is
  // retired due to depletion of its memory.
  //
  // register_object() does not perform synchronization.  In the case
  // that multiple threads are registering objects whose starting
  // addresses are within the same cluster, races between these
  // threads may result in corruption of the object-start data
  // structures.  Parallel GC threads should avoid registering objects
  // residing within the same cluster by adhering to the following
  // coordination protocols:
  //
  //  1. Align thread-local GCLAB buffers with some TBD multiple of
  //     card clusters.  The card cluster size is 32 KB.  If the
  //     desired GCLAB size is 128 KB, align the buffer on a multiple
  //     of 4 card clusters.
  //  2. Post-process the contents of GCLAB buffers to register the
  //     objects allocated therein.  Allow one GC thread at a
  //     time to do the post-processing of each GCLAB.
  //  3. Since only one GC thread at a time is registering objects
  //     belonging to a particular allocation buffer, no locking
  //     is performed when registering these objects.
  //  4. Any remnant of unallocated memory within an expended GC
  //     allocation buffer is not returned to the old-gen allocation
  //     pool until after the GC allocation buffer has been post
  //     processed.  Before any remnant memory is returned to the
  //     old-gen allocation pool, the GC thread that scanned this GC
  //     allocation buffer performs a write-commit memory barrier.
  //  5. Background GC threads that perform tenuring of young-gen
  //     objects without a GCLAB use a CAS lock before registering
  //     each tenured object.  The CAS lock assures both mutual
  //     exclusion and memory coherency/visibility.  Note that an
  //     object tenured by a background GC thread will not overlap
  //     with any of the clusters that are receiving tenured objects
  //     by way of GCLAB buffers.  Multiple independent GC threads may
  //     attempt to tenure objects into a shared cluster.  This is why
  //     synchronization may be necessary.  Consider the following
  //     scenarios:
  //
  //     a) If two objects are tenured into the same card region, each
  //        registration may attempt to modify the first-start or
  //        last-start information associated with that card region.
  //        Furthermore, because the representations of first-start
  //        and last-start information within the object_starts array
  //        entry use different bits of a shared uint16_t to represent
  //        each, it is necessary to lock the entire card entry
  //        before modifying either the first-start or last-start
  //        information within the entry.
  //     b) Suppose GC thread X promotes a tenured object into
  //        card region A and this tenured object spans into
  //        neighboring card region B.  Suppose GC thread Y (not equal
  //        to X) promotes a tenured object into card region B.  GC thread X
  //        will update the object_starts information for card A.  No
  //        synchronization is required.
  //     c) In summary, when background GC threads register objects
  //        newly tenured into old-gen memory, they must acquire a
  //        mutual exclusion lock on the card that holds the starting
  //        address of the newly tenured object.  This can be achieved
  //        by using a CAS instruction to assure that the previous
  //        values of first-offset and last-offset have not been
  //        changed since the same thread inquired as to their most
  //        current values.
  //
  //     One way to minimize the need for synchronization between
  //     background tenuring GC threads is for each tenuring GC thread
  //     to promote young-gen objects into distinct dedicated cluster
  //     ranges.
  //  6. The object_starts information is only required during the
  //     starting of concurrent marking and concurrent evacuation
  //     phases of GC.  Before we start either of these GC phases, the
  //     JVM enters a safepoint and all GC threads perform
  //     commit-write barriers to assure that access to the
  //     object_starts information is coherent.


  // Notes on synchronization of register_object():
  //
  //  1. For efficiency, there is no locking in the implementation of register_object()
  //  2. Thus, it is required that users of this service assure that concurrent/parallel invocations of
  //     register_object() do not pertain to the same card's memory range.  See the discussion below to understand
  //     the risks.
  //  3. When allocating from a TLAB or GCLAB, the mutual exclusion can be guaranteed by assuring that each
  //     LAB's start and end are aligned on card memory boundaries.
  //  4. Use the same lock that guarantees exclusivity when performing free-list allocation within heap regions.
  //
  // Register the newly allocated object while we're holding the global lock since there's no synchronization
  // built in to the implementation of register_object().  There are potential races when multiple independent
  // threads are allocating objects, some of which might span the same card region.  For example, consider
  // a card table's memory region within which three objects are being allocated by three different threads:
  //
  // objects being "concurrently" allocated:
  //    [-----a------][-----b-----][--------------c------------------]
  //            [---- card table memory range --------------]
  //
  // Before any objects are allocated, this card's memory range holds no objects.  Note that:
  //   allocation of object a wants to set the has-object, first-start, and last-start attributes of the preceding card region.
  //   allocation of object b wants to set the has-object, first-start, and last-start attributes of this card region.
  //   allocation of object c also wants to set the has-object, first-start, and last-start attributes of this card region.
  //
  // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
  // representing object b while first-start represents object c.  This is why we need to require all register_object()
  // invocations associated with objects that are allocated from "free lists" to provide their own mutual exclusion locking
  // mechanism.

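  // As a hedged sketch of note 4 above (ShenandoahHeapLocker is the
  // existing HotSpot locker type; heap, card_cluster, and the
  // allocate_from_free_list() helper are illustrative assumptions):
  //
  //   {
  //     ShenandoahHeapLocker locker(heap->lock());
  //     HeapWord* obj = allocate_from_free_list(word_size);
  //     // Holding the global heap lock serializes every
  //     // register_object() call that might touch the same card's
  //     // object_starts entry.
  //     card_cluster->register_object(obj);
  //   }
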
  // Reset the has_object() information to false for all cards in the range between from and to.
  void reset_object_range(HeapWord *from, HeapWord *to);

  // register_object() requires that the caller hold the heap lock
  // before calling it.
  void register_object(HeapWord* address);

  // register_object_wo_lock() does not require that the caller hold
  // the heap lock before calling it, under the assumption that the
  // caller has assured that no other thread will endeavor to concurrently
  // register objects that start within the same card's memory region
  // as address.
  void register_object_wo_lock(HeapWord* address);

  // During the reference updates phase of GC, we walk through each old-gen memory region that was
  // not part of the collection set and we invalidate all unmarked objects.  As part of this effort,
  // we coalesce neighboring dead objects in order to make future remembered set scanning more
  // efficient (since future remembered set scanning of any card region containing consecutive
  // dead objects can skip over all of them at once by reading only a single dead object header
  // instead of having to read the header of each of the coalesced dead objects).
  //
  // At some future time, we may implement a further optimization: satisfy future allocation requests
  // by carving new objects out of the range of memory that represents the coalesced dead objects.
  //
  // Suppose we want to combine several dead objects into a single coalesced object.  How does this
  // impact our representation of crossing map information?
  //  1. If the newly coalesced range is contained entirely within a card range, that card's last
  //     start entry either remains the same or it is changed to the start of the coalesced region.
  //  2. For the card that holds the start of the coalesced object, it will not impact the first start
  //     but it may impact the last start.
  //  3. For following cards spanned entirely by the newly coalesced object, it will change has_object
  //     to false (and make first-start and last-start "undefined").
  //  4. For a following card that is spanned partially by the newly coalesced object, it may change
  //     the first-start value, but it will not change the last-start value.
  //
  // The range of addresses represented by the arguments to coalesce_objects() must represent a range
  // of memory that was previously occupied exactly by one or more previously registered objects.  For
  // convenience, it is legal to invoke coalesce_objects() with arguments that span a single previously
  // registered object.
  //
  // The role of coalesce_objects is to change the crossing map information associated with all of the coalesced
  // objects.
  void coalesce_objects(HeapWord* address, size_t length_in_words);

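  // A worked example of rules 2 through 4 above, under the assumption
  // that dead objects a, b, and c are coalesced and that cards X, Y, and
  // Z are consecutive:
  //
  //   objects:  [--a--][--------b--------][----c----]
  //   cards:    [---X---][-------Y-------][----Z----]
  //
  // Coalescing a, b, and c into one object that starts in card X leaves
  // X's first-start unchanged but may update its last-start (rule 2),
  // clears has_object for card Y, which is now fully spanned (rule 3),
  // and may update Z's first-start without touching its last-start
  // (rule 4).
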
  // The typical use case is going to look something like this:
  //   for each heap region that comprises old-gen memory
  //     for each card number that corresponds to this heap region
  //       scan the objects contained therein if the card is dirty
  // To avoid excessive lookups in a sparse array, the API queries
  // the card number pertaining to a particular address and then uses the
  // card number for subsequent information lookups and stores.

  // If has_object(card_index), this returns the word offset within this card
  // memory at which the first object begins.  If !has_object(card_index), the
  // result is a don't care value.
  size_t get_first_start(size_t card_index);

  // If has_object(card_index), this returns the word offset within this card
  // memory at which the last object begins.  If !has_object(card_index), the
  // result is a don't care value.
  size_t get_last_start(size_t card_index);

};

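// A hedged sketch of the lookup pattern described above (rs and scc are
// illustrative instances of the remembered set and card cluster types;
// card_index refers to a dirty card being scanned):
//
//   if (scc->has_object(card_index)) {
//     HeapWord* first = rs->addr_for_card_index(card_index)
//                       + scc->get_first_start(card_index);
//     // scan the objects that begin at or after first ...
//   } else {
//     // The object spanning this card starts in an earlier card: walk
//     // back through preceding cards until has_object() is true, then
//     // use get_last_start() to find the object that crosses into this
//     // card's memory.
//   }
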
// ShenandoahScanRemembered is a concrete class representing the
// ability to scan the old-gen remembered set for references to
// objects residing in young-gen memory.
//
// Scanning normally begins with an invocation of numRegions() and ends
// after all clusters of all regions have been scanned.
//
// Throughout the scanning effort, the number of regions does not
// change.
//
// Even though the regions that comprise old-gen memory are not
// necessarily contiguous, the abstraction represented by this class
// identifies each of the old-gen regions with an integer value
// in the range from 0 to (numRegions() - 1) inclusive.
//

template<typename RememberedSet>
class ShenandoahScanRemembered: public CHeapObj<mtGC> {

private:

  RememberedSet* _rs;
  ShenandoahCardCluster<RememberedSet>* _scc;

public:
  // How to instantiate this object?
  //   ShenandoahDirectCardMarkRememberedSet *rs =
  //       new ShenandoahDirectCardMarkRememberedSet(card_table, total_card_count);
  //   scr = new
  //     ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet>(rs);
  //
  // or, after the planned implementation of
  // ShenandoahBufferWithSATBRememberedSet has been completed:
  //
  //   ShenandoahBufferWithSATBRememberedSet *rs =
  //       new ShenandoahBufferWithSATBRememberedSet();
  //   scr = new
  //     ShenandoahScanRemembered<ShenandoahBufferWithSATBRememberedSet>(rs);


  ShenandoahScanRemembered(RememberedSet *rs) {
    _rs = rs;
    _scc = new ShenandoahCardCluster<RememberedSet>(rs);
  }

  ~ShenandoahScanRemembered() {
    delete _scc;
  }

  // TODO:  We really don't want to share all of these APIs with arbitrary consumers of the ShenandoahScanRemembered abstraction.
  // But in the spirit of quick and dirty for the time being, I'm going to go ahead and publish everything for right now.  Some
  // of the existing code already depends on having access to these services (because existing code has not been written to honor
  // full abstraction of remembered set scanning).  In the not too distant future, we want to try to make most, if not all, of
  // these services private.  Two problems with publicizing:
  //  1. Allowing arbitrary users to reach beneath the hood allows the users to make assumptions about underlying implementation.
  //     This will make it more difficult to change underlying implementation at a future time, such as when we eventually experiment
  //     with SATB-based implementation of remembered set representation.
  //  2. If we carefully control sharing of certain of these services, we can reduce the overhead of synchronization by assuring
  //     that all users follow protocols that avoid contention that might require synchronization.  When we publish these APIs, we
  //     lose control over who and how the data is accessed.  As a result, we are required to insert more defensive measures into
  //     the implementation, including synchronization locks.


  // Card index is zero-based relative to first spanned card region.
  size_t last_valid_index();
  size_t total_cards();
  size_t card_index_for_addr(HeapWord *p);
  HeapWord *addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index) { return _rs->is_write_card_dirty(card_index); }
  void mark_card_as_dirty(size_t card_index);
  void mark_range_as_dirty(size_t card_index, size_t num_cards);
  void mark_card_as_clean(size_t card_index);
  void mark_read_card_as_clean(size_t card_index) { _rs->mark_read_card_as_clean(card_index); }
  void mark_range_as_clean(size_t card_index, size_t num_cards);
  bool is_card_dirty(HeapWord *p);
  void mark_card_as_dirty(HeapWord *p);
  void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
  void mark_card_as_clean(HeapWord *p);
  void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
  size_t cluster_count();

  // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
  void swap_remset() { _rs->swap_remset(); }

  void reset_remset(HeapWord* start, size_t word_count) { _rs->reset_remset(start, word_count); }

  void merge_write_table(HeapWord* start, size_t word_count) { _rs->merge_write_table(start, word_count); }

  // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
  void clear_old_remset() { _rs->clear_old_remset(); }

  size_t cluster_for_addr(HeapWord *addr);
  HeapWord* addr_for_cluster(size_t cluster_no);

  void reset_object_range(HeapWord *from, HeapWord *to);
  void register_object(HeapWord *addr);
  void register_object_wo_lock(HeapWord *addr);
  void coalesce_objects(HeapWord *addr, size_t length_in_words);

  HeapWord* first_object_in_card(size_t card_index) {
    if (_scc->has_object(card_index)) {
      return addr_for_card_index(card_index) + _scc->get_first_start(card_index);
    } else {
      return nullptr;
    }
  }

  // Return true iff this object is "properly" registered.
  bool verify_registration(HeapWord* address, ShenandoahMarkingContext* ctx);

  // Clear the cards to clean, and clear the object_starts info to no objects.
  void mark_range_as_empty(HeapWord *addr, size_t length_in_words);

  // process_clusters() scans a portion of the remembered set during a JVM
  // safepoint as part of the root scanning activities that serve to
  // initiate concurrent scanning and concurrent evacuation.  Multiple
  // threads may scan different portions of the remembered set by
  // making parallel invocations of process_clusters() with each
  // invocation scanning different clusters of the remembered set.
  //
  // An invocation of process_clusters() examines all of the
  // intergenerational references spanned by count clusters starting
  // with first_cluster.  The oops argument is assumed to represent a
  // thread-local OopClosure into which addresses of intergenerational
  // pointer values will be accumulated for the purposes of root scanning.
  //
  // A side effect of executing process_clusters() is to update the card
  // table entries, marking dirty cards as clean if they no longer
  // hold references to young-gen memory.  (THIS IS NOT YET IMPLEMENTED.)
  //
  // The implementation of process_clusters() is designed to efficiently
  // minimize work in the large majority of cases for which the
  // associated cluster has very few dirty card-table entries.
  //
  // At initialization of concurrent marking, invoke process_clusters with
  // ClosureType equal to ShenandoahInitMarkRootsClosure.
  //
  // At initialization of concurrent evacuation, invoke process_clusters with
  // ClosureType equal to ShenandoahEvacuateUpdateRootsClosure.

  // This is big enough that it probably shouldn't be in-lined.  On the other hand, there are only a few places this
  // code is called from, so it might as well be in-lined.  The "real" reason I'm inlining at the moment is because
  // the template expansions were making it difficult for the link/loader to resolve references to the template-
  // parameterized implementations of this service.
  template <typename ClosureType>
  inline void process_clusters(size_t first_cluster, size_t count, HeapWord *end_of_range, ClosureType *oops, bool is_concurrent);

  template <typename ClosureType>
  inline void process_clusters(size_t first_cluster, size_t count, HeapWord *end_of_range, ClosureType *oops,
                               bool use_write_table, bool is_concurrent);

  template <typename ClosureType>
  inline void process_humongous_clusters(ShenandoahHeapRegion* r, size_t first_cluster, size_t count,
                                         HeapWord *end_of_range, ClosureType *oops, bool use_write_table, bool is_concurrent);


  template <typename ClosureType>
  inline void process_region(ShenandoahHeapRegion* region, ClosureType *cl, bool is_concurrent);

  template <typename ClosureType>
  inline void process_region(ShenandoahHeapRegion* region, ClosureType *cl, bool use_write_table, bool is_concurrent);

  template <typename ClosureType>
  inline void process_region_slice(ShenandoahHeapRegion* region, size_t offset, size_t clusters, HeapWord* end_of_range,
                                   ClosureType *cl, bool use_write_table, bool is_concurrent);

  // To Do:
  //  Create subclasses of ShenandoahInitMarkRootsClosure and
  //  ShenandoahEvacuateUpdateRootsClosure and any other closures
  //  that need to participate in remembered set scanning.  Within the
  //  subclasses, add a (probably templated) instance variable that
  //  refers to the associated ShenandoahCardCluster object.  Use this
  //  ShenandoahCardCluster instance to "enhance" the do_oops
  //  processing so that we can:
  //
  //   1. Avoid processing references that correspond to clean card
  //      regions, and
  //   2. Set card status to CLEAN when the associated card region no
  //      longer holds inter-generational references.
  //
  //  To enable efficient implementation of these behaviors, we
  //  probably also want to add a few fields into the
  //  ShenandoahCardCluster object that allow us to precompute and
  //  remember the addresses at which card status is going to change
  //  from dirty to clean and clean to dirty.  The do_oops
  //  implementations will want to update this value each time they
  //  cross one of these boundaries.
  void roots_do(OopIterateClosure* cl);
};

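// A hedged sketch of how a worker might drive the scan over old-gen
// regions (the card_scan() accessor, the cl closure, and the is_old()
// region predicate are illustrative assumptions, not a fixed API):
//
//   RememberedScanner* scanner = heap->card_scan();
//   ShenandoahRegionIterator regions;
//   for (ShenandoahHeapRegion* r = regions.next(); r != nullptr;
//        r = regions.next()) {
//     if (r->is_old()) {
//       scanner->process_region(r, cl, true /* is_concurrent */);
//     }
//   }
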
struct ShenandoahRegionChunk {
  ShenandoahHeapRegion *_r;
  size_t _chunk_offset;          // HeapWordSize offset
  size_t _chunk_size;            // HeapWordSize qty
};

class ShenandoahRegionChunkIterator : public StackObj {
private:
  // smallest_chunk_size is 64 words per card *
  // ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster.
  // This is computed from CardTable::card_size_in_words() *
  //      ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster;
  // We can't perform this computation here, because of encapsulation and initialization constraints.  We paste
  // the magic number here, and assert that this number matches the intended computation in the constructor.
  static const size_t _smallest_chunk_size = 64 * ShenandoahCardCluster<ShenandoahDirectCardMarkRememberedSet>::CardsPerCluster;

  // The total remembered set scanning effort is divided into chunks of work that are assigned to individual worker tasks.
  // The chunks of assigned work are divided into groups, where the size of each group (_group_size) is 4 * the number of
  // worker tasks.  All of the assignments within a group represent the same amount of memory to be scanned.  Each of the
  // assignments within the first group are of size _first_group_chunk_size (typically the ShenandoahHeapRegion size, but
  // possibly smaller).  Each of the assignments within each subsequent group are half the size of the assignments in the
  // preceding group.  The last group may be larger than the others because no group is allowed to have assignments
  // smaller than _smallest_chunk_size, which is 32 KB.
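  //
  // As a worked example under the description above (the region size is
  // an assumption for illustration): with 8 workers, _group_size is
  // 8 * 4 = 32 chunks per group.  If the first group uses chunks of one
  // 4 MB region, the second group uses 2 MB chunks, the third 1 MB, and
  // so on, halving until the 32 KB floor imposed by _smallest_chunk_size
  // is reached; 4 MB / 32 KB = 128 = 2^7, so this configuration needs
  // 8 groups in all.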

  // Under normal circumstances, no configuration needs more than _maximum_groups (default value of 16).

  static const size_t _maximum_groups = 16;

  const ShenandoahHeap* _heap;

  const size_t _group_size;                        // Number of chunks in each group, equals worker_count * 4
  const size_t _first_group_chunk_size;
  const size_t _num_groups;                        // Number of groups in this configuration
  const size_t _total_chunks;

  shenandoah_padding(0);
  volatile size_t _index;
  shenandoah_padding(1);

  size_t _region_index[_maximum_groups];
  size_t _group_offset[_maximum_groups];


  // No implicit copying: iterators should be passed by reference to capture the state
  NONCOPYABLE(ShenandoahRegionChunkIterator);

  // Makes use of _heap.
  size_t calc_group_size();

  // Makes use of _group_size, which must be initialized before call.
  size_t calc_first_group_chunk_size();

  // Makes use of _group_size and _first_group_chunk_size, both of which must be initialized before call.
  size_t calc_num_groups();

  // Makes use of _group_size and _first_group_chunk_size, which must be initialized before call.
  size_t calc_total_chunks();

public:
  ShenandoahRegionChunkIterator(size_t worker_count);
  ShenandoahRegionChunkIterator(ShenandoahHeap* heap, size_t worker_count);

  // Reset iterator to default state.
  void reset();

  // Fills in assignment with the next chunk of work and returns true iff there is more work.
  // Otherwise, returns false.  This is multi-thread-safe.
  inline bool next(struct ShenandoahRegionChunk *assignment);

  // This is *not* MT safe.  However, in the absence of multithreaded access, it
  // can be used to determine if there is more work to do.
  inline bool has_next() const;
};

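// Typical use from within a worker task, as a hedged sketch (worker_count
// and the per-chunk processing are illustrative):
//
//   ShenandoahRegionChunkIterator iter(worker_count);  // shared by all workers
//
//   // In each worker:
//   ShenandoahRegionChunk assignment;
//   while (iter.next(&assignment)) {
//     // Process assignment._chunk_size words of region assignment._r,
//     // beginning at word offset assignment._chunk_offset.
//   }
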
typedef ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet> RememberedScanner;

class ShenandoahScanRememberedTask : public WorkerTask {
 private:
  ShenandoahObjToScanQueueSet* _queue_set;
  ShenandoahObjToScanQueueSet* _old_queue_set;
  ShenandoahReferenceProcessor* _rp;
  ShenandoahRegionChunkIterator* _work_list;
  bool _is_concurrent;
 public:
  ShenandoahScanRememberedTask(ShenandoahObjToScanQueueSet* queue_set,
                               ShenandoahObjToScanQueueSet* old_queue_set,
                               ShenandoahReferenceProcessor* rp,
                               ShenandoahRegionChunkIterator* work_list,
                               bool is_concurrent);

  void work(uint worker_id);
  void do_work(uint worker_id);
};
#endif // SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP