/*
 * Copyright Amazon.com Inc. or its affiliates. All Rights Reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP

// Terminology used within this source file:
//
// Card Entry:   This is the information that identifies whether a
//               particular card-table entry is Clean or Dirty.  A clean
//               card entry denotes that the associated memory does not
//               hold references to young-gen memory.
//
// Card Region, aka
// Card Memory:  This is the region of memory that is associated with a
//               particular card entry.
//
// Card Cluster: A card cluster represents 64 card entries.  A card
//               cluster is the minimal amount of work performed at a
//               time by a parallel thread.  Note that the work required
//               to scan a card cluster is somewhat variable in that the
//               required effort depends on how many cards are dirty, how
//               many references are held within the objects that span a
//               DIRTY card's memory, and on the size of the object
//               that spans the end of a DIRTY card's memory (because
//               that object, if it's not an array, may need to be scanned in
//               its entirety when the object is imprecisely dirtied.  Imprecise
//               dirtying is when the card corresponding to the object header
//               is dirtied, rather than the card on which the updated field lives).
//               To better balance work amongst them, parallel worker threads dynamically
//               claim clusters and are flexible in the number of clusters they
//               process.
//
//               A cluster represents a "natural" quantum of work to be performed by
//               a parallel GC thread's background remembered set scanning efforts.
//               The notion of cluster is similar to the notion of stripe in the
//               implementation of parallel GC card scanning.  However, a cluster is
//               typically smaller than a stripe, enabling finer-grain division of
//               labor between multiple threads, and potentially better load balancing
//               when dirty cards are not uniformly distributed in the heap, as is often
//               the case with generational workloads where more recently promoted objects
//               may be dirtied more frequently than older objects.
//
// For illustration, consider the following possible JVM configurations:
//
//   Scenario 1:
//     RegionSize is 128 MB
//     Span of a card entry is 512 B
//     Each card table entry consumes 1 B
//     Assume one long word (8 B) of the card table represents a cluster.
//     This long word holds 8 card table entries, spanning a
//     total of 8 * 512 B = 4 KB of the heap
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 2:
//     RegionSize is 128 MB
//     Span of each card entry is 128 B
//     Each card table entry consumes 1 bit
//     Assume one int word (4 B) of the card table represents a cluster.
//     This int word holds 32 b / 1 b = 32 card table entries, spanning a
//     total of 32 * 128 B = 4 KB of the heap
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 3:
//     RegionSize is 128 MB
//     Span of each card entry is 512 B
//     Each card table entry consumes 1 bit
//     Assume one long word (8 B) of card table represents a cluster.
//     This long word holds 64 b / 1 b = 64 card table entries, spanning a
//     total of 64 * 512 B = 32 KB of the heap
//     The number of clusters per region is 128 MB / 32 KB = 4 K
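//
// The following sketch (illustrative only, not part of the implementation)
// restates the arithmetic of Scenario 3, whose cluster geometry (64 cards of
// 512 B each) matches the 32 KB-per-cluster figure used elsewhere in this file:
//
//   size_t card_span_bytes     = 512;
//   size_t cards_per_cluster   = 64;
//   size_t cluster_span_bytes  = cards_per_cluster * card_span_bytes;     // 32 KB
//   size_t region_size_bytes   = 128 * 1024 * 1024;                       // 128 MB
//   size_t clusters_per_region = region_size_bytes / cluster_span_bytes;  // 4 K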
//
// At the start of a new young-gen concurrent mark pass, the gang of
// Shenandoah worker threads collaborate in performing the following
// actions:
//
//   Let old_regions = number of ShenandoahHeapRegion comprising
//     old-gen memory
//   Let region_size = ShenandoahHeapRegion::region_size_bytes()
//     represent the number of bytes in each region
//   Let clusters_per_region = region_size / 512
//   Let rs represent the ShenandoahDirectCardMarkRememberedSet
//
//   for each ShenandoahHeapRegion old_region in the whole heap
//     determine the cluster number of the first cluster belonging
//       to that region
//     for each cluster contained within that region
//       Assure that exactly one worker thread processes each
//       cluster, each thread making a series of invocations of the
//       following:
//
//         rs->process_clusters(worker_id, ReferenceProcessor *,
//                              ShenandoahConcurrentMark *, cluster_no, cluster_count,
//                              HeapWord *end_of_range, OopClosure *oops);
//
//       For efficiency, divide up the clusters so that different threads
//       are responsible for processing different clusters.  Processing costs
//       may vary greatly between clusters for the following reasons:
//
//         a) some clusters contain mostly dirty cards and other
//            clusters contain mostly clean cards
//         b) some clusters contain mostly primitive data and other
//            clusters contain mostly reference data
//         c) some clusters are spanned by very large non-array objects that
//            begin in some other cluster.  When a large non-array object
//            beginning in a preceding cluster spans large portions of
//            this cluster, then because of imprecise dirtying, the
//            portion of the object in this cluster may be clean, but
//            will need to be processed by the worker responsible for
//            this cluster, potentially increasing its work.
//         d) in the case that the end of this cluster is spanned by a
//            very large non-array object, the worker for this cluster will
//            be responsible for processing the portion of the object
//            in this cluster.
//
// Though an initial division of labor between marking threads may
// assign equal numbers of clusters to be scanned by each thread, it
// should be expected that some threads will finish their assigned
// work before others.  Therefore, some amount of the full remembered
// set scanning effort should be held back and assigned incrementally
// to the threads that end up with excess capacity.  Consider the
// following strategy for dividing labor:
//
//   1. Assume there are 8 marking threads and 1024 remembered
//      set clusters to be scanned.
//   2. Assign each thread to scan 64 clusters.  This leaves
//      512 (1024 - (8*64)) clusters to still be scanned.
//   3. As the 8 server threads complete previous cluster
//      scanning assignments, issue each of the next 8 scanning
//      assignments as units of 32 additional clusters each.
//      In the case that there is high variance in effort
//      associated with previous cluster scanning assignments,
//      multiples of these next assignments may be serviced by
//      the server threads that were previously assigned lighter
//      workloads.
//   4. Make subsequent scanning assignments as follows:
//      a) 8 assignments of size 16 clusters
//      b) 8 assignments of size 8 clusters
//      c) 16 assignments of size 4 clusters
//      (A sketch tallying this progressive subdivision appears
//      immediately following this comment block.)
//
// When there is no more remembered set processing work to be
// assigned to a newly idled worker thread, that thread can move
// on to work on other tasks associated with root scanning until such
// time as all clusters have been examined.
//
// Remembered set scanning is designed to run concurrently with
// mutator threads, with multiple concurrent workers.  Furthermore, the
// current implementation of remembered set scanning never clears a
// card once it has been marked.
//
// These limitations will be addressed in future enhancements to the
// existing implementation.
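
// The following sketch (illustrative only, not part of the implementation)
// simply tallies the progressive subdivision described above for 8 threads
// and 1024 clusters.  The batch size is halved in each round so that late,
// small assignments can even out imbalances left by the earlier, large ones:
//
//   size_t assigned = 0;
//   for (size_t batch = 64; batch >= 8; batch /= 2) {
//     assigned += 8 * batch;          // rounds of 8 x 64, 8 x 32, 8 x 16, 8 x 8
//   }
//   assigned += 16 * 4;               // final round: 16 assignments of 4 clusters
//   assert(assigned == 1024, "all clusters accounted for");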

#include "gc/shared/workerThread.hpp"
#include "gc/shenandoah/shenandoahCardStats.hpp"
#include "gc/shenandoah/shenandoahCardTable.hpp"
#include "gc/shenandoah/shenandoahNumberSeq.hpp"
#include "gc/shenandoah/shenandoahTaskqueue.hpp"
#include "memory/iterator.hpp"
#include "utilities/globalDefinitions.hpp"

class ShenandoahReferenceProcessor;
class ShenandoahConcurrentMark;
class ShenandoahHeap;
class ShenandoahHeapRegion;
class ShenandoahRegionIterator;
class ShenandoahMarkingContext;

class CardTable;
typedef CardTable::CardValue CardValue;

class ShenandoahDirectCardMarkRememberedSet: public CHeapObj<mtGC> {

private:

  // Use symbolic constants defined in cardTable.hpp
  //   CardTable::card_shift = 9;
  //   CardTable::card_size = 512;           (default value 512, a power of 2 >= 128)
  //   CardTable::card_size_in_words = 64;   (default value 64, a power of 2 >= 16)
  //   CardTable::clean_card_val()
  //   CardTable::dirty_card_val()

  // See shenandoahCardTable.hpp
  //   ShenandoahMinCardSizeInBytes 128

  const size_t LogCardValsPerIntPtr;    // log2 of the number of card values (entries) in an intptr_t
  const size_t LogCardSizeInWords;      // log2 of the size of a card in heap word units

  ShenandoahHeap* _heap;
  ShenandoahCardTable* _card_table;
  size_t _card_shift;
  size_t _total_card_count;
  HeapWord* _whole_heap_base;   // Points to first HeapWord of data contained within heap memory
  CardValue* _byte_map;         // Points to first entry within the card table
  CardValue* _byte_map_base;    // Points to byte_map minus the bias computed from address of heap memory

public:

  // total_card_count is the number of cards represented by the card table.
  ShenandoahDirectCardMarkRememberedSet(ShenandoahCardTable* card_table, size_t total_card_count);

  // Card index is zero-based relative to _byte_map.
  size_t last_valid_index() const;
  size_t total_cards() const;
  size_t card_index_for_addr(HeapWord* p) const;
  HeapWord* addr_for_card_index(size_t card_index) const;

  inline const CardValue* get_card_table_byte_map(bool use_write_table) const {
    return use_write_table ? _card_table->write_byte_map() : _card_table->read_byte_map();
  }

  inline bool is_card_dirty(size_t card_index) const;
  inline bool is_write_card_dirty(size_t card_index) const;
  inline void mark_card_as_dirty(size_t card_index);
  inline void mark_range_as_dirty(size_t card_index, size_t num_cards);
  inline void mark_card_as_clean(size_t card_index);
  inline void mark_range_as_clean(size_t card_index, size_t num_cards);
  inline bool is_card_dirty(HeapWord* p) const;
  inline bool is_write_card_dirty(HeapWord* p) const;
  inline void mark_card_as_dirty(HeapWord* p);
  inline void mark_range_as_dirty(HeapWord* p, size_t num_heap_words);
  inline void mark_card_as_clean(HeapWord* p);
  inline void mark_range_as_clean(HeapWord* p, size_t num_heap_words);

  // Merge any dirty values from write table into the read table, while leaving
  // the write table unchanged.
  void merge_write_table(HeapWord* start, size_t word_count);

  // Destructively copy the write table to the read table, and clean the write table.
  void reset_remset(HeapWord* start, size_t word_count);
};
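
// Illustrative sketch (not part of the implementation): typical use of the API
// above, assuming `rs` points to the ShenandoahDirectCardMarkRememberedSet and
// `p` is the address of an old-gen field that was just updated to refer to a
// young-gen object:
//
//   size_t index = rs->card_index_for_addr(p);
//   if (!rs->is_card_dirty(index)) {
//     rs->mark_card_as_dirty(index);
//   }
//
// Subsequent remembered set scanning will then visit the card region containing p.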

// A ShenandoahCardCluster represents the minimal unit of work
// performed by independent parallel GC threads during scanning of
// remembered sets.
//
// The GC threads that perform card-table remembered set scanning may
// overwrite card-table entries to mark them as clean in the case that
// the associated memory no longer holds references to young-gen
// memory.  Rather than access the card-table entries directly, all GC
// thread access to card-table information is made by way of the
// ShenandoahCardCluster data abstraction.  This abstraction
// effectively manages access to multiple possible underlying
// remembered set implementations, including a traditional card-table
// approach and a SATB-based approach.
//
// The API services represent a compromise between efficiency and
// convenience.
//
// Multiple GC threads scan the remembered set in parallel.  The
// desire is to divide the complete scanning effort into multiple
// clusters of work that can be independently processed by individual
// threads without need for synchronizing efforts between the work
// performed by each task.  The term "cluster" of work is similar to
// the term "stripe" as used in the implementation of Parallel GC.
//
// Complexity arises when an object to be scanned crosses the boundary
// between adjacent cluster regions.  Here is the protocol that we currently
// follow:
//
//  1. The thread responsible for scanning the cards in a cluster modifies
//     the associated card-table entries.  Only cards that are dirty are
//     processed, except as described below for the case of objects that
//     straddle more than one card.
//  2. Object arrays are precisely dirtied, so only the portion of the obj-array
//     that overlaps the range of dirty cards in its cluster is scanned
//     by each worker thread.  This holds for portions of obj-arrays that extend
//     over clusters processed by different workers, with each worker responsible
//     for scanning the portion of the obj-array overlapping the dirty cards in
//     its cluster.
//  3. Non-array objects are precisely dirtied by the interpreter and the compilers.
//     For such objects that extend over multiple cards, or even multiple clusters,
//     the entire object is scanned by the worker that processes the (dirty) card on
//     which the object's header lies.  (However, GC workers should precisely dirty the
//     cards with inter-regional/inter-generational pointers in the body of this object,
//     thus making subsequent scans potentially less expensive.)  Such larger non-array
//     objects are relatively rare.
//
// A possible criticism:
//  C. The representation of pointer-location descriptive information
//     within Klass representations is not designed for efficient
//     "random access".  An alternative approach to this design would
//     be to scan very large objects multiple times, once for each
//     cluster that is spanned by the object's range.  This reduces
//     unnecessary overscan, but it introduces different sorts of
//     overhead effort:
//       i) For each spanned cluster, we have to look up the start of
//          the crossing object.
//      ii) Each time we scan the very large object, we have to
//          sequentially walk through its pointer-location
//          descriptors, skipping over all of the pointers that
//          precede the start of the range of addresses that we
//          consider relevant.
//
// Because old-gen heap memory is not necessarily contiguous, and
// because cards are not necessarily maintained for young-gen memory,
// consecutive card numbers do not necessarily correspond to consecutive
// address ranges.  For the traditional direct-card-marking
// implementation of this interface, consecutive card numbers are
// likely to correspond to contiguous regions of memory, but this
// should not be assumed.  Instead, rely only upon the following:
//
//  1. All card numbers for cards pertaining to the same
//     ShenandoahHeapRegion are consecutively numbered.
//  2. In the case that neighboring ShenandoahHeapRegions both
//     represent old-gen memory, the card regions that span the
//     boundary between these neighboring heap regions will be
//     consecutively numbered.
//  3. (A corollary) In the case that an old-gen object straddles the
//     boundary between two heap regions, the card regions that
//     correspond to the span of this object will be consecutively
//     numbered.
//
// ShenandoahCardCluster abstracts access to the remembered set
// and also keeps track of crossing map information to allow efficient
// resolution of object start addresses.
//
// ShenandoahCardCluster supports all of the services of
// ShenandoahDirectCardMarkRememberedSet, plus it supports register_object() and lookup_object().
// Note that we only need to register the start addresses of the objects that
// overlay the first address of a card; we need to do this for every card.
// In other words, register_object() checks whether the object crosses a card boundary,
// and updates the offset value for each card that the object crosses into.
// For objects that don't straddle cards, nothing needs to be done.
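//
// Illustrative sketch (not part of the implementation): the crossing-map state
// after one thread registers three objects a, b, and c that all begin within
// the same 64-word card, at word offsets 0, 10, and 40 respectively (offsets
// chosen purely for illustration; `scc` is the ShenandoahCardCluster):
//
//   scc->register_object_without_lock(a);   // first-start = 0,  last-start = 0
//   scc->register_object_without_lock(b);   // first-start = 0,  last-start = 10
//   scc->register_object_without_lock(c);   // first-start = 0,  last-start = 40
//   // starts_object(card) is now true; get_first_start(card) == 0 and
//   // get_last_start(card) == 40.
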
class ShenandoahCardCluster: public CHeapObj<mtGC> {

private:
  ShenandoahDirectCardMarkRememberedSet* _rs;

public:
  static const size_t CardsPerCluster = 64;

private:
  typedef struct cross_map { uint8_t first; uint8_t last; } xmap;
  typedef union crossing_info { uint16_t short_word; xmap offsets; } crossing_info;

  // The ObjectStartsInCardRegion bit is set within crossing_info.offsets.first iff at least one object starts within
  // a particular card region.  We pack this bit into the first byte under the assumption that the first byte is accessed
  // less frequently than the last byte.  This is true when the number of clean cards is greater than the number of dirty cards.
  static const uint8_t ObjectStartsInCardRegion = 0x80;
  static const uint8_t FirstStartBits           = 0x7f;

  // Check that we have enough bits to store the largest possible offset into a card for an object start.
  // The value for maximum card size is based on the constraints for GCCardSizeInBytes in gc_globals.hpp.
  static const int MaxCardSize = NOT_LP64(512) LP64_ONLY(1024);
  STATIC_ASSERT((MaxCardSize / HeapWordSize) - 1 <= FirstStartBits);

  crossing_info* _object_starts;

public:
  // If we're setting first_start, assume the card has an object.
  inline void set_first_start(size_t card_index, uint8_t value) {
    _object_starts[card_index].offsets.first = ObjectStartsInCardRegion | value;
  }

  inline void set_last_start(size_t card_index, uint8_t value) {
    _object_starts[card_index].offsets.last = value;
  }

  inline void set_starts_object_bit(size_t card_index) {
    _object_starts[card_index].offsets.first |= ObjectStartsInCardRegion;
  }

  inline void clear_starts_object_bit(size_t card_index) {
    _object_starts[card_index].offsets.first &= ~ObjectStartsInCardRegion;
  }

  // Returns true iff an object is known to start within the card memory associated with card card_index.
  inline bool starts_object(size_t card_index) const {
    return (_object_starts[card_index].offsets.first & ObjectStartsInCardRegion) != 0;
  }

  inline void clear_objects_in_range(HeapWord* addr, size_t num_words) {
    size_t card_index = _rs->card_index_for_addr(addr);
    size_t last_card_index = _rs->card_index_for_addr(addr + num_words - 1);
    while (card_index <= last_card_index) {
      _object_starts[card_index++].short_word = 0;
    }
  }

  ShenandoahCardCluster(ShenandoahDirectCardMarkRememberedSet* rs) {
    _rs = rs;
    // The +1 accounts for the card table's guard entry.
    _object_starts = NEW_C_HEAP_ARRAY(crossing_info, rs->total_cards() + 1, mtGC);
    for (size_t i = 0; i < rs->total_cards(); i++) {
      _object_starts[i].short_word = 0;
    }
  }

  ~ShenandoahCardCluster() {
    FREE_C_HEAP_ARRAY(crossing_info, _object_starts);
    _object_starts = nullptr;
  }
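
  // Illustrative sketch (not part of the implementation): how a crossing_info
  // entry encodes its two offsets, using the accessors above.  Assume the first
  // object to start in a card begins at word offset 5 and the last at word
  // offset 60:
  //
  //   set_first_start(card_index, 5);     // also sets ObjectStartsInCardRegion
  //   set_last_start(card_index, 60);
  //   assert(starts_object(card_index), "bit is packed into the first byte");
  //   size_t first = _object_starts[card_index].offsets.first & FirstStartBits;  // 5
  //   size_t last  = _object_starts[card_index].offsets.last;                    // 60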

  // There is one entry within the object_starts array for each card entry.
  //
  // Suppose multiple garbage objects are coalesced during GC sweep
  // into a single larger "free segment".  As each pair of objects is
  // coalesced together, the start information pertaining to the second
  // object must be removed from the object_starts array.  If the
  // second object had been the first object within card memory,
  // the new first object is the object that follows that object if
  // it starts within the same card memory, or NoObject if the
  // following object starts within the following cluster.  If the
  // second object had been the last object in the card memory,
  // replace this entry with the newly coalesced object if it starts
  // within the same card memory, or with NoObject if it starts in a
  // preceding card's memory.
  //
  // Suppose a large free segment is divided into a smaller free
  // segment and a new object.  The second part of the newly divided
  // memory must be registered as a new object, overwriting at most
  // one first_start and one last_start entry.  Note that one of the
  // newly divided two objects might be a new GCLAB.
  //
  // Suppose postprocessing of a GCLAB finds that the original GCLAB
  // has been divided into N objects.  Each of the N newly allocated
  // objects will be registered, overwriting at most one first_start
  // and one last_start entry.
  //
  // Note that no object registration operation is linear in the length of
  // the registered objects.
  //
  // Consider further the following observations regarding object
  // registration costs:
  //
  //   1. The cost is paid once for each old-gen object (except when
  //      an object is demoted and repromoted, in which case we would
  //      pay the cost again).
  //   2. The cost can be deferred so that there is no urgency during
  //      mutator copy-on-first-access promotion.  Background GC
  //      threads will update the object_starts array by post-
  //      processing the contents of retired PLAB buffers.
  //   3. The bet is that these costs are paid relatively rarely
  //      because:
  //      a) Most objects die young and objects that die in young-gen
  //         memory never need to be registered with the object_starts
  //         array.
  //      b) Most objects that are promoted into old-gen memory live
  //         there without further relocation for a relatively long
  //         time, so we get a lot of benefit from each investment
  //         in registering an object.

public:

  // The starting locations of objects contained within old-gen memory
  // are registered as part of the remembered set implementation.  This
  // information is required when scanning dirty card regions that are
  // spanned by objects beginning within preceding card regions.  It
  // is necessary to find the first and last objects that begin within
  // this card region.  Starting addresses of objects are required to
  // find the object headers, and object headers provide information
  // about which fields within the object hold addresses.
  //
  // The old-gen memory allocator invokes register_object() for any
  // object that is allocated within old-gen memory.  This identifies
  // the starting addresses of objects that span boundaries between
  // card regions.
  //
  // It is not necessary to invoke register_object at the very instant
  // an object is allocated.  It is only necessary to invoke it
  // prior to the next start of a garbage collection concurrent mark
  // or concurrent update-references phase.  An "ideal" time to register
  // objects is during post-processing of a GCLAB after the GCLAB is
  // retired due to depletion of its memory.
  //
  // register_object() does not perform synchronization.  In the case
  // that multiple threads are registering objects whose starting
  // addresses are within the same cluster, races between these
  // threads may result in corruption of the object-start data structures.
  // Parallel GC threads should avoid registering objects residing
  // within the same cluster by adhering to the following coordination
  // protocols:
  //
  //  1. Align thread-local GCLAB buffers with some TBD multiple of
  //     card clusters.  The card cluster size is 32 KB.  If the
  //     desired GCLAB size is 128 KB, align the buffer on a multiple
  //     of 4 card clusters.
  //  2. Post-process the contents of GCLAB buffers to register the
  //     objects allocated therein.  Allow one GC thread at a
  //     time to do the post-processing of each GCLAB.
  //  3. Since only one GC thread at a time is registering objects
  //     belonging to a particular allocation buffer, no locking
  //     is performed when registering these objects.
  //  4. Any remnant of unallocated memory within an expended GC
  //     allocation buffer is not returned to the old-gen allocation
  //     pool until after the GC allocation buffer has been post
  //     processed.  Before any remnant memory is returned to the
  //     old-gen allocation pool, the GC thread that scanned this GC
  //     allocation buffer performs a write-commit memory barrier.
  //  5. Background GC threads that perform tenuring of young-gen
  //     objects without a GCLAB use a CAS lock before registering
  //     each tenured object.  The CAS lock assures both mutual
  //     exclusion and memory coherency/visibility.  Note that an
  //     object tenured by a background GC thread will not overlap
  //     with any of the clusters that are receiving tenured objects
  //     by way of GCLAB buffers.  Multiple independent GC threads may
  //     attempt to tenure objects into a shared cluster.  This is why
  //     synchronization may be necessary.  Consider the following
  //     scenarios:
  //
  //     a) If two objects are tenured into the same card region, each
  //        registration may attempt to modify the first-start or
  //        last-start information associated with that card region.
  //        Furthermore, because the representations of first-start
  //        and last-start information within the object_starts array
  //        entry use different bits of a shared uint16_t to represent
  //        each, it is necessary to lock the entire card entry
  //        before modifying either the first-start or last-start
  //        information within the entry.
  //     b) Suppose GC thread X promotes a tenured object into
  //        card region A and this tenured object spans into
  //        neighboring card region B.  Suppose GC thread Y (not equal
  //        to X) promotes a tenured object into card region B.  GC thread X
  //        will update the object_starts information for card A.  No
  //        synchronization with thread Y is required, because the two
  //        threads modify entries for different cards.
  //     c) In summary, when background GC threads register objects
  //        newly tenured into old-gen memory, they must acquire a
  //        mutual exclusion lock on the card that holds the starting
  //        address of the newly tenured object.  This can be achieved
  //        by using a CAS instruction to assure that the previous
  //        values of first-offset and last-offset have not been
  //        changed since the same thread inquired as to their most
  //        current values.
  //
  //     One way to minimize the need for synchronization between
  //     background tenuring GC threads is for each tenuring GC thread
  //     to promote young-gen objects into distinct dedicated cluster
  //     ranges.
  //  6. The object_starts information is only required during the
  //     starting of concurrent marking and concurrent evacuation
  //     phases of GC.
  //     Before we start either of these GC phases, the
  //     JVM enters a safe point and all GC threads perform
  //     commit-write barriers to assure that access to the
  //     object_starts information is coherent.

  // Notes on synchronization of register_object():
  //
  //  1. For efficiency, there is no locking in the implementation of register_object().
  //  2. Thus, it is required that users of this service assure that concurrent/parallel invocations of
  //     register_object() do not pertain to the same card's memory range.  See the discussion below to understand
  //     the risks.
  //  3. When allocating from a TLAB or GCLAB, mutual exclusion can be guaranteed by assuring that each
  //     LAB's start and end are aligned on card memory boundaries.
  //  4. Use the same lock that guarantees exclusivity when performing free-list allocation within heap regions.
  //
  // Register the newly allocated object while we're holding the global lock, since there's no synchronization
  // built in to the implementation of register_object().  There are potential races when multiple independent
  // threads are allocating objects, some of which might span the same card region.  For example, consider
  // a card table's memory region within which three objects are being allocated by three different threads:
  //
  // objects being "concurrently" allocated:
  //    [-----a------][-----b-----][--------------c------------------]
  //          [---- card table memory range --------------]
  //
  // Before any objects are allocated, this card's memory range holds no objects.  Note that:
  //   allocation of object a wants to set the has-object, first-start, and last-start attributes of the preceding card region.
  //   allocation of object b wants to set the has-object, first-start, and last-start attributes of this card region.
  //   allocation of object c also wants to set the has-object, first-start, and last-start attributes of this card region.
  //
  // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
  // representing object b while first-start represents object c.  This is why we require all register_object()
  // invocations associated with objects that are allocated from "free lists" to provide their own mutual exclusion locking
  // mechanism.
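
  // Illustrative sketch (not part of the implementation): registering an object
  // allocated from a shared free list while holding the global heap lock, as
  // register_object() below requires.  ShenandoahHeapLocker is assumed to be the
  // usual heap-lock RAII helper; allocate_from_free_list() is a hypothetical
  // placeholder for the allocation path, and `scc` is the ShenandoahCardCluster:
  //
  //   {
  //     ShenandoahHeapLocker locker(heap->lock());
  //     HeapWord* obj = allocate_from_free_list(size_in_words);
  //     scc->register_object(obj);
  //   }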

  // Reset the starts_object() information to false for all cards in the range between from and to.
  void reset_object_range(HeapWord* from, HeapWord* to);

  // register_object() requires that the caller hold the heap lock
  // before calling it.
  void register_object(HeapWord* address);

  // register_object_without_lock() does not require that the caller hold
  // the heap lock before calling it, under the assumption that the
  // caller has assured no other thread will endeavor to concurrently
  // register objects that start within the same card's memory region
  // as address.
  void register_object_without_lock(HeapWord* address);

  // During the reference-updates phase of GC, we walk through each old-gen memory region that was
  // not part of the collection set and we invalidate all unmarked objects.  As part of this effort,
  // we coalesce neighboring dead objects in order to make future remembered set scanning more
  // efficient (since future remembered set scanning of any card region containing consecutive
  // dead objects can skip over all of them at once by reading only a single dead object header
  // instead of having to read the header of each of the coalesced dead objects).
  //
  // At some future time, we may implement a further optimization: satisfy future allocation requests
  // by carving new objects out of the range of memory that represents the coalesced dead objects.
  //
  // Suppose we want to combine several dead objects into a single coalesced object.  How does this
  // impact our representation of crossing map information?
  //  1. If the newly coalesced range is contained entirely within a card range, that card's last
  //     start entry either remains the same or it is changed to the start of the coalesced region.
  //  2. For the card that holds the start of the coalesced object, it will not impact the first start
  //     but it may impact the last start.
  //  3. For following cards spanned entirely by the newly coalesced object, it will change starts_object
  //     to false (and make first-start and last-start "undefined").
  //  4. For a following card that is spanned partially by the newly coalesced object, it may change the
  //     first-start value, but it will not change the last-start value.
  //
  // The range of addresses represented by the arguments to coalesce_objects() must represent a range
  // of memory that was previously occupied exactly by one or more previously registered objects.  For
  // convenience, it is legal to invoke coalesce_objects() with arguments that span a single previously
  // registered object.
  //
  // The role of coalesce_objects is to change the crossing map information associated with all of the coalesced
  // objects.
  void coalesce_objects(HeapWord* address, size_t length_in_words);

  // The typical use case is going to look something like this:
  //   for each heapregion that comprises old-gen memory
  //     for each card number that corresponds to this heap region
  //       scan the objects contained therein if the card is dirty
  // To avoid excessive lookups in a sparse array, the API queries
  // the card number pertaining to a particular address and then uses the
  // card number for subsequent information lookups and stores.

  // If starts_object(card_index), this returns the word offset within this card
  // memory at which the first object begins.  If !starts_object(card_index), the
  // result is a don't-care value -- asserts in a debug build.
  size_t get_first_start(size_t card_index) const;

  // If starts_object(card_index), this returns the word offset within this card
  // memory at which the last object begins.  If !starts_object(card_index), the
  // result is a don't-care value.
  size_t get_last_start(size_t card_index) const;

  // Given a card_index, return the starting address of the first block in the heap
  // that straddles into the card.  If the card is co-initial with an object, then
  // this returns the first address of the heap memory that this card covers.
  // Expects to be called for a card affiliated with the old generation in
  // generational mode.
  HeapWord* block_start(size_t card_index) const;
};
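
// Illustrative sketch (not part of the implementation): how a scanner might
// find where to begin processing a dirty card using the services above,
// assuming `scc` is the ShenandoahCardCluster, `rs` is the underlying
// remembered set, and card_index refers to a dirty old-gen card:
//
//   HeapWord* card_base = rs->addr_for_card_index(card_index);
//   HeapWord* scan_from = scc->block_start(card_index);  // first block straddling into the card
//   // If an object begins exactly at the card boundary, scan_from == card_base.
//   // Scan objects from scan_from forward, processing those that intersect the
//   // card's memory range.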

// ShenandoahScanRemembered is a concrete class representing the
// ability to scan the old-gen remembered set for references to
// objects residing in young-gen memory.
//
// Scanning normally begins with an invocation of numRegions and ends
// after all clusters of all regions have been scanned.
//
// Throughout the scanning effort, the number of regions does not
// change.
//
// Even though the regions that comprise old-gen memory are not
// necessarily contiguous, the abstraction represented by this class
// identifies each of the old-gen regions with an integer value
// in the range from 0 to (numRegions() - 1) inclusive.

class ShenandoahScanRemembered: public CHeapObj<mtGC> {

private:
  ShenandoahDirectCardMarkRememberedSet* _rs;
  ShenandoahCardCluster* _scc;

  // Global card stats (cumulative)
  HdrSeq _card_stats_scan_rs[MAX_CARD_STAT_TYPE];
  HdrSeq _card_stats_update_refs[MAX_CARD_STAT_TYPE];
  // Per-worker card stats (multiplexed by phase)
  HdrSeq** _card_stats;

  // The types of card metrics that we gather
  const char* _card_stats_name[MAX_CARD_STAT_TYPE] = {
    "dirty_run", "clean_run",
    "dirty_cards", "clean_cards",
    "max_dirty_run", "max_clean_run",
    "dirty_scan_objs",
    "alternations"
  };

  // The statistics are collected and logged separately for
  // card-scans for initial marking, and for updating refs.
  const char* _card_stat_log_type[MAX_CARD_STAT_LOG_TYPE] = {
    "Scan Remembered Set", "Update Refs"
  };

  int _card_stats_log_counter[2] = {0, 0};

public:
  ShenandoahScanRemembered(ShenandoahDirectCardMarkRememberedSet* rs) {
    _rs = rs;
    _scc = new ShenandoahCardCluster(rs);

    // We allocate ParallelGCThreads worth even though we usually only
    // use up to ConcGCThreads, because degenerate collections may employ
    // ParallelGCThreads for remembered set scanning.
    if (ShenandoahEnableCardStats) {
      _card_stats = NEW_C_HEAP_ARRAY(HdrSeq*, ParallelGCThreads, mtGC);
      for (uint i = 0; i < ParallelGCThreads; i++) {
        _card_stats[i] = new HdrSeq[MAX_CARD_STAT_TYPE];
      }
    } else {
      _card_stats = nullptr;
    }
  }

  ~ShenandoahScanRemembered() {
    delete _scc;
    if (ShenandoahEnableCardStats) {
      for (uint i = 0; i < ParallelGCThreads; i++) {
        // Per-worker stats were allocated with new[], so release them with delete[].
        delete[] _card_stats[i];
      }
      FREE_C_HEAP_ARRAY(HdrSeq*, _card_stats);
      _card_stats = nullptr;
    }
    assert(_card_stats == nullptr, "Error");
  }

  HdrSeq* card_stats(uint worker_id) {
    assert(worker_id < ParallelGCThreads, "Error");
    assert(ShenandoahEnableCardStats == (_card_stats != nullptr), "Error");
    return ShenandoahEnableCardStats ? _card_stats[worker_id] : nullptr;
  }

  HdrSeq* card_stats_for_phase(CardStatLogType t) {
    switch (t) {
      case CARD_STAT_SCAN_RS:
        return _card_stats_scan_rs;
      case CARD_STAT_UPDATE_REFS:
        return _card_stats_update_refs;
      default:
        guarantee(false, "No such CardStatLogType");
    }
    return nullptr; // Quiet compiler
  }

  // Card index is zero-based relative to first spanned card region.
  size_t card_index_for_addr(HeapWord* p);
  HeapWord* addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index);
  bool is_card_dirty(HeapWord* p);
  bool is_write_card_dirty(HeapWord* p);
  void mark_card_as_dirty(HeapWord* p);
  void mark_range_as_dirty(HeapWord* p, size_t num_heap_words);
  void mark_card_as_clean(HeapWord* p);
  void mark_range_as_clean(HeapWord* p, size_t num_heap_words);

  void reset_remset(HeapWord* start, size_t word_count) { _rs->reset_remset(start, word_count); }

  void merge_write_table(HeapWord* start, size_t word_count) { _rs->merge_write_table(start, word_count); }

  size_t cluster_for_addr(HeapWord* addr);
  HeapWord* addr_for_cluster(size_t cluster_no);

  void reset_object_range(HeapWord* from, HeapWord* to);
  void register_object(HeapWord* addr);
  void register_object_without_lock(HeapWord* addr);
  void coalesce_objects(HeapWord* addr, size_t length_in_words);

  HeapWord* first_object_in_card(size_t card_index) {
    if (_scc->starts_object(card_index)) {
      return addr_for_card_index(card_index) + _scc->get_first_start(card_index);
    } else {
      return nullptr;
    }
  }

  // Return true iff this object is "properly" registered.
  bool verify_registration(HeapWord* address, ShenandoahMarkingContext* ctx);

  // Clear the cards to clean, and clear the object_starts info to indicate no objects.
  void mark_range_as_empty(HeapWord* addr, size_t length_in_words);

  // process_clusters() scans a portion of the remembered set
  // for references from old gen into young gen.  Several worker threads
  // scan different portions of the remembered set by making parallel invocations
  // of process_clusters(), with each invocation scanning different
  // "clusters" of the remembered set.
  //
  // An invocation of process_clusters() examines all of the
  // intergenerational references spanned by `count` clusters starting
  // with `first_cluster`.  The `oops` argument is a worker-thread-local
  // OopClosure that is applied to all "valid" references in the remembered set.
  //
  // A side effect of executing process_clusters() is to update the remembered
  // set entries (e.g. marking dirty cards clean if they no longer
  // hold references to young-gen memory).
  //
  // An implementation of process_clusters() may choose to efficiently
  // address more typical scenarios in the structure of remembered sets.  E.g.
  // in the generational setting, one might expect remembered sets to be very sparse
  // (low mutation rates in the old generation leading to sparse dirty cards,
  // each with very few intergenerational pointers).  Specific implementations
  // may choose to degrade gracefully as the sparsity assumption fails to hold,
  // such as when there are sudden spikes in (premature) promotion or in the
  // case of an underprovisioned, poorly-tuned, or poorly-shaped heap.
  //
  // At the start of a concurrent young generation marking cycle, we invoke process_clusters
  // with ClosureType ShenandoahInitMarkRootsClosure.
  //
  // At the start of a concurrent evacuation phase, we invoke process_clusters with
  // ClosureType ShenandoahEvacuateUpdateRootsClosure.
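  //
  // Illustrative sketch (not part of the implementation): one worker invoking
  // process_clusters() over a contiguous run of clusters within an old-gen
  // region.  The closure `cl` and the bookkeeping around it are hypothetical;
  // only the process_clusters() declaration below is real:
  //
  //   // `cl` is an OopClosure such as ShenandoahInitMarkRootsClosure (see above),
  //   // constructed by the caller; `worker_id` identifies this worker.
  //   HeapWord* start            = region->bottom();
  //   size_t    first_cluster    = cluster_for_addr(start);
  //   size_t    clusters_claimed = /* however many clusters this worker claimed */ 1;
  //   HeapWord* end_of_range     = region->top();
  //   process_clusters(first_cluster, clusters_claimed, end_of_range, &cl,
  //                    false /* use the read table */, worker_id);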

  template <typename ClosureType>
  void process_clusters(size_t first_cluster, size_t count, HeapWord* end_of_range, ClosureType* oops,
                        bool use_write_table, uint worker_id);

  template <typename ClosureType>
  void process_humongous_clusters(ShenandoahHeapRegion* r, size_t first_cluster, size_t count,
                                  HeapWord* end_of_range, ClosureType* oops, bool use_write_table);

  template <typename ClosureType>
  void process_region_slice(ShenandoahHeapRegion* region, size_t offset, size_t clusters, HeapWord* end_of_range,
                            ClosureType* cl, bool use_write_table, uint worker_id);

  // To Do:
  //  Create subclasses of ShenandoahInitMarkRootsClosure and
  //  ShenandoahEvacuateUpdateRootsClosure and any other closures
  //  that need to participate in remembered set scanning.  Within the
  //  subclasses, add a (probably templated) instance variable that
  //  refers to the associated ShenandoahCardCluster object.  Use this
  //  ShenandoahCardCluster instance to "enhance" the do_oops
  //  processing so that we can:
  //
  //   1. Avoid processing references that correspond to clean card
  //      regions, and
  //   2. Set card status to CLEAN when the associated card region no
  //      longer holds inter-generational references.
  //
  //  To enable efficient implementation of these behaviors, we
  //  probably also want to add a few fields into the
  //  ShenandoahCardCluster object that allow us to precompute and
  //  remember the addresses at which card status is going to change
  //  from dirty to clean and clean to dirty.  The do_oops
  //  implementations will want to update this value each time they
  //  cross one of these boundaries.
  void roots_do(OopIterateClosure* cl);

  // Log card/RS stats for the given phase t
  void log_card_stats(uint nworkers, CardStatLogType t) PRODUCT_RETURN;
private:
  // Log stats for the given worker id and fold them into the given summary card/RS stats
  void log_worker_card_stats(uint worker_id, HdrSeq* sum_stats) PRODUCT_RETURN;

  // Log given stats
  void log_card_stats(HdrSeq* stats) PRODUCT_RETURN;

  // Merge the stats from worker_id into the given summary stats, and clear the worker_id's stats.
  void merge_worker_card_stats_cumulative(HdrSeq* worker_stats, HdrSeq* sum_stats) PRODUCT_RETURN;
};


// A ShenandoahRegionChunk represents a contiguous interval of a ShenandoahHeapRegion, typically representing
// work to be done by a worker thread.
struct ShenandoahRegionChunk {
  ShenandoahHeapRegion* _r;   // The region of which this represents a chunk
  size_t _chunk_offset;       // Offset into the region, in heap words
  size_t _chunk_size;         // Size of the chunk, in heap words
};

// ShenandoahRegionChunkIterator divides the total remembered set scanning effort into ShenandoahRegionChunks
// that are assigned one at a time to worker threads.  (Here, we use the terms `assignments` and `chunks`
// interchangeably.)  Note that the effort required to scan a range of memory is not necessarily a linear
// function of the size of the range.  Some memory ranges hold only a small number of live objects.
// Some ranges hold primarily primitive (non-pointer) data.  We start with larger chunk sizes because larger chunks
// reduce coordination overhead.  We expect that the GC worker threads that receive more difficult assignments
// will work longer on those chunks.  Meanwhile, other worker threads will repeatedly accept and complete multiple
// easier chunks.  As the total amount of work remaining to be completed decreases, we decrease the size of chunks
// given to individual threads.  This reduces the likelihood of significant imbalance between worker thread assignments
// when there is less meaningful work to be performed by the remaining worker threads while they wait for
// worker threads with difficult assignments to finish, reducing the overall duration of the phase.
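//
// Illustrative sketch (not part of the implementation): how worker threads
// might drain the iterator declared below.  `it` is assumed to be a single
// ShenandoahRegionChunkIterator shared by all workers; next() hands out
// chunks atomically:
//
//   ShenandoahRegionChunk chunk;
//   while (it.next(&chunk)) {
//     HeapWord* start = chunk._r->bottom() + chunk._chunk_offset;
//     // scan the remembered set for chunk._chunk_size words starting at start
//   }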

class ShenandoahRegionChunkIterator : public StackObj {
private:
  // The largest chunk size is 4 MiB, measured in words.  Otherwise, remembered set scanning may become too unbalanced.
  // If the largest chunk size is too small, there is too much overhead sifting out assignments to individual worker threads.
  static const size_t _maximum_chunk_size_words = (4 * 1024 * 1024) / HeapWordSize;
  static const size_t _clusters_in_smallest_chunk = 4;

  size_t _largest_chunk_size_words;

  // smallest_chunk_size is 4 clusters.  With the default 512-byte cards, each cluster spans 32 KiB,
  // so the smallest chunk spans 128 KiB.  This is computed from
  // _clusters_in_smallest_chunk * CardTable::card_size_in_words() * ShenandoahCardCluster::CardsPerCluster.
  static size_t smallest_chunk_size_words() {
    return _clusters_in_smallest_chunk * CardTable::card_size_in_words() * ShenandoahCardCluster::CardsPerCluster;
  }

  // The total remembered set scanning effort is divided into chunks of work that are assigned to individual worker tasks.
  // The chunks of assigned work are divided into groups, where the size of the typical group (_regular_group_size) is half the
  // total number of regions.  The first group may be larger than
  // _regular_group_size in the case that the first group's chunk
  // size is less than the region size.  The last group may be larger
  // than _regular_group_size because no group is allowed to
  // have smaller assignments than _smallest_chunk_size, which is 128 KB.

  // Under normal circumstances, no configuration needs more than _maximum_groups.
  // The first group processes the largest chunks (_maximum_chunk_size_words, or smaller for
  // smaller region sizes); the last group processes the smallest chunks, as listed below.
  // group[ 0] is 4 MiB chunk size (_maximum_chunk_size_words)
  // group[ 1] is 2 MiB chunk size
  // group[ 2] is 1 MiB chunk size
  // group[ 3] is 512 KiB chunk size
  // group[ 4] is 256 KiB chunk size
  // group[ 5] is 128 KiB chunk size
  // group[ 6] is 64 KiB chunk size
  // group[ 7] is 32 KiB chunk size
  // group[ 8] is 16 KiB chunk size
  // group[ 9] is 8 KiB chunk size
  // group[10] is 4 KiB chunk size
  // Note: 4 KiB is the smallest possible chunk_size, computed from:
  //   _clusters_in_smallest_chunk * MinimumCardSizeInWords * ShenandoahCardCluster::CardsPerCluster, which is
  //   4 * 16 * 64 = 4096

  // We set aside arrays to represent the maximum number of groups that may be required for any heap configuration
  static const size_t _maximum_groups = 11;

  const ShenandoahHeap* _heap;

  const size_t _regular_group_size;                     // Number of chunks in each group
  const size_t _first_group_chunk_size_b4_rebalance;
  const size_t _num_groups;                             // Number of groups in this configuration
  size_t _adjusted_num_groups;                          // Rebalancing may coalesce groups
  const size_t _total_chunks;

  shenandoah_padding(0);
  volatile size_t _index;
  shenandoah_padding(1);

  size_t _region_index[_maximum_groups];                // The region index for the first region spanned by this group
  size_t _group_offset[_maximum_groups];                // The offset at which the group begins within the first region spanned by this group
  size_t _group_chunk_size[_maximum_groups];            // The size of each chunk within this group
  size_t _group_entries[_maximum_groups];               // Total chunks spanned by this group and the ones before it

  // No implicit copying: iterators should be passed by reference to capture the state
  NONCOPYABLE(ShenandoahRegionChunkIterator);

  // Makes use of _heap.
  size_t calc_regular_group_size();

  // Makes use of _regular_group_size, which must be initialized before call.
  size_t calc_first_group_chunk_size_b4_rebalance();

  // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before call.
  size_t calc_num_groups();

  // Makes use of _regular_group_size and _first_group_chunk_size_b4_rebalance, both of which must be initialized before call.
  size_t calc_total_chunks();

public:
  ShenandoahRegionChunkIterator(size_t worker_count);
  ShenandoahRegionChunkIterator(ShenandoahHeap* heap, size_t worker_count);

  // Reset iterator to default state
  void reset();

  // Fills in assignment with the next chunk of work and returns true iff there is more work.
  // Otherwise, returns false.  This is multi-thread-safe.
  inline bool next(struct ShenandoahRegionChunk* assignment);

  // This is *not* MT safe.  However, in the absence of multithreaded access, it
  // can be used to determine if there is more work to do.
  inline bool has_next() const;
};


class ShenandoahScanRememberedTask : public WorkerTask {
private:
  ShenandoahObjToScanQueueSet* _queue_set;
  ShenandoahObjToScanQueueSet* _old_queue_set;
  ShenandoahReferenceProcessor* _rp;
  ShenandoahRegionChunkIterator* _work_list;
  bool _is_concurrent;

public:
  ShenandoahScanRememberedTask(ShenandoahObjToScanQueueSet* queue_set,
                               ShenandoahObjToScanQueueSet* old_queue_set,
                               ShenandoahReferenceProcessor* rp,
                               ShenandoahRegionChunkIterator* work_list,
                               bool is_concurrent);

  void work(uint worker_id);
  void do_work(uint worker_id);
};

// After Full GC is done, reconstruct the remembered set by iterating over OLD regions,
// registering all objects between bottom() and top(), and dirtying the cards containing
// cross-generational pointers.
class ShenandoahReconstructRememberedSetTask : public WorkerTask {
private:
  ShenandoahRegionIterator* _regions;

public:
  explicit ShenandoahReconstructRememberedSetTask(ShenandoahRegionIterator* regions);

  void work(uint worker_id) override;
};

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP