/*
 * Copyright (c) 2021, Amazon.com, Inc. or its affiliates. All rights reserved.
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP
#define SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP

// Terminology used within this source file:
//
// Card Entry:   This is the information that identifies whether a
//               particular card-table entry is Clean or Dirty.  A clean
//               card entry denotes that the associated memory does not
//               hold references to young-gen memory.
//
// Card Region, aka
// Card Memory:  This is the region of memory that is associated with a
//               particular card entry.
//
// Card Cluster: A card cluster represents 64 card entries.  A card
//               cluster is the minimal amount of work performed at a
//               time by a parallel thread.  Note that the work required
//               to scan a card cluster is somewhat variable in that the
//               required effort depends on how many cards are dirty, how
//               many references are held within the objects that span a
//               DIRTY card's memory, and on the size of the object
//               that spans the end of a DIRTY card's memory (because
//               that object will be scanned in its entirety).  For these
//               reasons, it is advisable for the multiple worker threads
//               to be flexible in the number of clusters to be
//               processed by each thread.
//
// A cluster represents a "natural" quantum of work to be performed by
// a parallel GC thread's background remembered set scanning efforts.
// The notion of cluster is similar to the notion of stripe in the
// implementation of parallel GC card scanning.  However, a cluster is
// typically smaller than a stripe, enabling finer grain division of
// labor between multiple threads.
//
// For illustration, consider the following possible JVM configurations:
//
//   Scenario 1:
//     RegionSize is 128 MB
//     Span of a card entry is 512 B
//     Each card table entry consumes 1 B
//     Assume one long word (8 B) of card table entries represents a cluster.
//       This long word holds 8 card table entries, spanning a
//       total of 8 * 512 B = 4 KB
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 2:
//     RegionSize is 128 MB
//     Span of each card entry is 128 B
//     Each card table entry consumes 1 bit
//     Assume one int word (4 B) of card tables represents a cluster.
//       This int word holds 32 card table entries, spanning a
//       total of 32 * 128 B = 4 KB
//     The number of clusters per region is 128 MB / 4 KB = 32 K
//
//   Scenario 3:
//     RegionSize is 128 MB
//     Span of each card entry is 512 B
//     Each card table entry consumes 1 bit
//     Assume one long word (8 B) of card tables represents a cluster.
//       This long word holds 64 card table entries, spanning a
//       total of 64 * 512 B = 32 KB
//     The number of clusters per region is 128 MB / 32 KB = 4 K
//
// At the start of a new young-gen concurrent mark pass, the gang of
// Shenandoah worker threads collaborate in performing the following
// actions:
//
//  Let old_regions = number of ShenandoahHeapRegion comprising
//    old-gen memory
//  Let region_size = ShenandoahHeapRegion::region_size_bytes()
//    represent the number of bytes in each region
//  Let clusters_per_region = region_size / (512 * 64)
//    (64 cards of 512 B each per cluster)
//  Let rs represent the relevant RememberedSet implementation
//    (an instance of ShenandoahDirectCardMarkRememberedSet or an instance
//     of a to-be-implemented ShenandoahBufferWithSATBRememberedSet)
//
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread initializes each
//      cluster of overreach memory by invoking:
//
//        rs->initialize_overreach(cluster_no, cluster_count)
//
//      in separate threads.  (Divide up the clusters so that
//      different threads are responsible for initializing different
//      clusters.  Initialization cost is essentially identical for
//      each cluster.)
//
// Next, we repeat the process for invocations of process_clusters.
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread processes each
//      cluster, each thread making a series of invocations of the
//      following:
//
//        rs->process_clusters(worker_id, ReferenceProcessor *,
//                             ShenandoahConcurrentMark *, cluster_no, cluster_count,
//                             HeapWord *end_of_range, OopClosure *oops);
//
//      For efficiency, divide up the clusters so that different threads
//      are responsible for processing different clusters.  Processing costs
//      may vary greatly between clusters for the following reasons (a
//      sketch of one possible worker loop follows this list):
//
//        a) some clusters contain mostly dirty cards and other
//           clusters contain mostly clean cards
//        b) some clusters contain mostly primitive data and other
//           clusters contain mostly reference data
//        c) some clusters are spanned by very large objects that
//           begin in some other cluster.  When a large object
//           beginning in a preceding cluster spans large portions of
//           this cluster, the processing of this cluster gets a
//           "free ride" because the thread responsible for processing
//           the cluster that holds the object's header does the
//           processing.
//        d) in the case that the end of this cluster is spanned by a
//           very large object, the processing of this cluster will
//           be responsible for examining the entire object,
//           potentially requiring this thread to process large amounts
//           of memory pertaining to other clusters.
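//
// For illustration, here is the worker-loop sketch promised above.
// Exactly-once processing of each cluster can be assured with a shared
// atomic claim counter rather than a fixed partition.  The names
// _next_cluster, scan_worker, total_clusters, and end_of_range are
// illustrative assumptions, not part of this header; the call matches
// the four-argument process_clusters() declaration that appears near
// the end of this file:
//
//   static volatile size_t _next_cluster = 0;    // shared claim counter
//
//   void scan_worker(RememberedScanner* scanner, size_t total_clusters,
//                    HeapWord* end_of_range, OopClosure* oops) {
//     const size_t chunk = 4;                    // clusters per claim
//     while (true) {
//       // Atomically claim the next chunk of clusters for this worker.
//       size_t first = Atomic::fetch_and_add(&_next_cluster, chunk);
//       if (first >= total_clusters) break;
//       size_t count = MIN2(chunk, total_clusters - first);
//       scanner->process_clusters(first, count, end_of_range, oops);
//     }
//   }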
//
// Though an initial division of labor between marking threads may
// assign equal numbers of clusters to be scanned by each thread, it
// should be expected that some threads will finish their assigned
// work before others.  Therefore, some amount of the full remembered
// set scanning effort should be held back and assigned incrementally
// to the threads that end up with excess capacity.  Consider the
// following strategy for dividing labor (a worked check of this
// arithmetic appears at the end of this comment block):
//
//   1. Assume there are 8 marking threads and 1024 remembered
//      set clusters to be scanned.
//   2. Assign each thread to scan 64 clusters.  This leaves
//      512 (1024 - (8*64)) clusters to still be scanned.
//   3. As the 8 server threads complete previous cluster
//      scanning assignments, issue each of the next 8 scanning
//      assignments as units of 32 additional clusters each.
//      In the case that there is high variance in effort
//      associated with previous cluster scanning assignments,
//      multiples of these next assignments may be serviced by
//      the server threads that were previously assigned lighter
//      workloads.
//   4. Make subsequent scanning assignments as follows:
//        a) 8 assignments of size 16 clusters
//        b) 8 assignments of size 8 clusters
//        c) 16 assignments of size 4 clusters
//
// When there is no more remembered set processing work to be
// assigned to a newly idled worker thread, that thread can move
// on to work on other tasks associated with root scanning until such
// time as all clusters have been examined.
//
// Once all clusters have been processed, the gang of GC worker
// threads collaborate to merge the overreach data.
//
//  for each ShenandoahHeapRegion old_region in the whole heap
//    determine the cluster number of the first cluster belonging
//      to that region
//    for each cluster contained within that region
//      Assure that exactly one worker thread merges each
//      cluster of overreach memory by invoking:
//
//        rs->merge_overreach(cluster_no, cluster_count)
//
//      in separate threads.  (Divide up the clusters so that
//      different threads are responsible for merging different
//      clusters.  Merging cost is essentially identical for
//      each cluster.)
//
// Though remembered set scanning is designed to run concurrently with
// mutator threads, the current implementation of remembered set
// scanning runs in parallel during a GC safepoint.  Furthermore, the
// current implementation of remembered set scanning never clears a
// card once it has been marked.  Since the current implementation
// never clears marked cards, the current implementation does not
// invoke initialize_overreach() or merge_overreach().
//
// These limitations will be addressed in future enhancements to the
// existing implementation.
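//
// As the worked check of the incremental assignment strategy above
// (illustration only): with 8 threads and 1024 clusters, the rounds of
// assignments sum to exactly the full workload:
//
//    8 initial assignments of 64 clusters = 512   (512 remain)
//    8 assignments of 32 clusters         = 256   (256 remain)
//    8 assignments of 16 clusters         = 128   (128 remain)
//    8 assignments of  8 clusters         =  64   ( 64 remain)
//   16 assignments of  4 clusters         =  64   (  0 remain)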

#include <stdint.h>
#include "memory/iterator.hpp"
#include "gc/shared/workerThread.hpp"
#include "gc/shenandoah/shenandoahCardTable.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "gc/shenandoah/shenandoahTaskqueue.hpp"

class ShenandoahReferenceProcessor;
class ShenandoahConcurrentMark;
class ShenandoahHeap;
class ShenandoahRegionIterator;
class ShenandoahMarkingContext;

class CardTable;

class ShenandoahDirectCardMarkRememberedSet: public CHeapObj<mtGC> {

 private:

  // Use symbolic constants defined in cardTable.hpp:
  //   CardTable::card_shift = 9
  //   CardTable::card_size = 512
  //   CardTable::card_size_in_words = 64
  //   CardTable::clean_card_val()
  //   CardTable::dirty_card_val()

  ShenandoahHeap *_heap;
  ShenandoahCardTable *_card_table;
  size_t _card_shift;
  size_t _total_card_count;
  size_t _cluster_count;
  HeapWord *_whole_heap_base;   // Points to first HeapWord of data contained within heap memory
  HeapWord *_whole_heap_end;
  uint8_t *_byte_map;           // Points to first entry within the card table
  uint8_t *_byte_map_base;      // Points to byte_map minus the bias computed from address of heap memory
  uint8_t *_overreach_map;      // Points to first entry within the overreach card table
  uint8_t *_overreach_map_base; // Points to overreach_map minus the bias computed from address of heap memory

  uint64_t _wide_clean_value;

 public:
  // total_card_count is the number of cards represented by the card table.
  ShenandoahDirectCardMarkRememberedSet(ShenandoahCardTable *card_table, size_t total_card_count);
  ~ShenandoahDirectCardMarkRememberedSet();

  // Card index is zero-based relative to _byte_map.
  size_t total_cards();
  size_t card_index_for_addr(HeapWord *p);
  HeapWord *addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index);
  void mark_card_as_dirty(size_t card_index);
  void mark_range_as_dirty(size_t card_index, size_t num_cards);
  void mark_card_as_clean(size_t card_index);
  void mark_read_card_as_clean(size_t card_index);
  void mark_range_as_clean(size_t card_index, size_t num_cards);
  void mark_overreach_card_as_dirty(size_t card_index);
  bool is_card_dirty(HeapWord *p);
  void mark_card_as_dirty(HeapWord *p);
  void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
  void mark_card_as_clean(HeapWord *p);
  void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
  void mark_overreach_card_as_dirty(void *p);
  size_t cluster_count();

  // Called by multiple GC threads at start of concurrent mark and evacuation phases.  Each parallel GC thread typically
  // initializes a different subrange of all overreach entries.
  void initialize_overreach(size_t first_cluster, size_t count);

  // Called by GC thread at end of concurrent mark or evacuation phase.  Each parallel GC thread typically merges a
  // different subrange of all overreach entries.
  void merge_overreach(size_t first_cluster, size_t count);

  // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
  // Not currently used because the mutator write barrier does not honor changes to the location of the card table.
  void swap_remset() { _card_table->swap_card_tables(); }

  void merge_write_table(HeapWord* start, size_t word_count) {
    size_t card_index = card_index_for_addr(start);
    size_t num_cards = word_count / CardTable::card_size_in_words();
    size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardTable::CardValue));
    intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
    intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
    for (size_t i = 0; i < iterations; i++) {
      // Bitwise AND propagates DIRTY (zero) entries from the write table
      // into the read table without losing entries already dirty there.
      intptr_t card_value = *write_table_ptr;
      *read_table_ptr++ &= card_value;
      write_table_ptr++;
    }
  }

  HeapWord* whole_heap_base() { return _whole_heap_base; }
  HeapWord* whole_heap_end() { return _whole_heap_end; }

  // Instead of swap_remset, the current implementation of concurrent remembered set scanning does reset_remset
  // in parallel threads, each invocation processing one entire HeapRegion at a time.  Processing of a region
  // consists of copying the write table to the read table and cleaning the write table.
  void reset_remset(HeapWord* start, size_t word_count) {
    size_t card_index = card_index_for_addr(start);
    size_t num_cards = word_count / CardTable::card_size_in_words();
    size_t iterations = num_cards / (sizeof (intptr_t) / sizeof (CardTable::CardValue));
    intptr_t* read_table_ptr = (intptr_t*) &(_card_table->read_byte_map())[card_index];
    intptr_t* write_table_ptr = (intptr_t*) &(_card_table->write_byte_map())[card_index];
    for (size_t i = 0; i < iterations; i++) {
      *read_table_ptr++ = *write_table_ptr;
      *write_table_ptr++ = CardTable::clean_card_row_val();
    }
  }

  // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
  void clear_old_remset() { _card_table->clear_read_table(); }

};

// A ShenandoahCardCluster represents the minimal unit of work
// performed by independent parallel GC threads during scanning of
// remembered sets.
//
// The GC threads that perform card-table remembered set scanning may
// overwrite card-table entries to mark them as clean in the case that
// the associated memory no longer holds references to young-gen
// memory.  Rather than access the card-table entries directly, all GC
// thread access to card-table information is made by way of the
// ShenandoahCardCluster data abstraction.  This abstraction
// effectively manages access to multiple possible underlying
// remembered set implementations, including a traditional card-table
// approach and a SATB-based approach.
//
// The API services represent a compromise between efficiency and
// convenience.
//
// In the initial implementation, we assume that scanning of card
// table entries occurs only while the JVM is at a safepoint.  Thus,
// there is no synchronization required between GC threads that are
// scanning card-table entries and marking certain entries that were
// previously dirty as clean, and mutator threads which would possibly
// be marking certain card-table entries as dirty.
//
// There is however a need to implement concurrency control and memory
// coherency between multiple GC threads that scan the remembered set
// in parallel.
// The desire is to divide the complete scanning effort
// into multiple clusters of work that can be independently processed
// by individual threads without need for synchronizing efforts
// between the work performed by each task.  The term "cluster" of
// work is similar to the term "stripe" as used in the implementation
// of Parallel GC.
//
// Complexity arises when an object to be scanned crosses the boundary
// between adjacent cluster regions.  Here is the protocol that is
// followed:
//
// 1. We implement a supplemental data structure known as the overreach
//    card table.  The thread that is responsible for scanning each
//    cluster of card-table entries is granted exclusive access to
//    modify the associated card-table entries.  In the case that a
//    thread scans a very large object that reaches into one or more
//    following clusters, that thread has exclusive access to the
//    overreach card table for all of the entries belonging to the
//    following clusters that are spanned by this large object.
//    After all clusters have been scanned, the scanning threads
//    briefly synchronize to merge the contents of the overreach
//    entries with the traditional card table entries using logical-
//    and operations.  (A worked example of this merge appears after
//    the discussion section below.)
// 2. Every object is scanned in its "entirety" by the thread that is
//    responsible for the cluster that holds its starting address.
//    Entirety is in quotes because there are various situations in
//    which some portions of the object will not be scanned by this
//    thread:
//    a) If an object spans multiple card regions, all of which are
//       contained within the same cluster, the scanning thread
//       consults the existing card-table entries and does not scan
//       portions of the object that are not currently dirty.
//    b) For any cluster that is spanned in its entirety by a very
//       large object, the GC thread that scans this object assumes
//       full responsibility for maintenance of the associated
//       card-table entries.
//    c) If a cluster is partially spanned by an object originating
//       in a preceding cluster, the portion of the object that
//       partially spans the following cluster is scanned in its
//       entirety (because the thread that is responsible for
//       scanning the object cannot rely upon the card-table entries
//       associated with the following cluster).  Whenever references
//       to young-gen memory are found within the scanned data, the
//       associated overreach card table entries are marked as dirty
//       by the scanning thread.
// 3. If a cluster is spanned in its entirety by an object that
//    originates within a preceding cluster's memory, the thread
//    assigned to examine this cluster does absolutely nothing.  The
//    thread assigned to scan the cluster that holds the object's
//    starting address takes full responsibility for scanning the
//    entire object and updating the associated card-table entries.
// 4. If a cluster is spanned partially by an object that originates
//    within a preceding cluster's memory, the thread assigned to
//    examine this cluster marks the card-table entry as clean for
//    each card-table entry that is fully spanned by this overreaching
//    object.
//    If a card-table entry's memory is partially spanned
//    by the overreaching object, the thread sets the card-table
//    entry to clean if it was previously dirty and if the portion
//    of the card-table entry's memory that is not spanned by the
//    overreaching object does not hold pointers to young-gen
//    memory.
// 5. While examining a particular card belonging to a particular
//    cluster, if an object reaches beyond the end of its card
//    memory, the thread "scans" all portions of the object that
//    correspond to DIRTY card entries within the current cluster and
//    all portions of the object that reach into following clusters.
//    After this object is scanned, continue scanning with the memory
//    that follows this object if this memory pertains to the same
//    cluster.  Otherwise, consider this cluster's memory to have
//    been fully examined.
//
// Discussion:
//  Though this design results from careful consideration of multiple
//  design objectives, it is subject to various criticisms.  Some
//  discussion of the design choices is provided here:
//
//  1. Note that remembered sets are a heuristic technique to avoid
//     the need to scan all of old-gen memory with each young-gen
//     collection.  If we sometimes scan a bit more memory than is
//     absolutely necessary, that should be considered a reasonable
//     compromise.  This compromise is already present in the sizing
//     of card table memory areas.  Note that a single dirty pointer
//     within a 512-byte card region forces the "unnecessary" scanning
//     of 63 other pointer-sized words ((512 - 8) / 8 = 63).
//  2. One undesirable aspect of this design is that we sometimes have
//     to scan large amounts of memory belonging to very large
//     objects, even for parts of the very large object that do not
//     correspond to dirty card table entries.  Note that this design
//     limits the amount of non-dirty scanning that might have to
//     be performed for these very large objects.  In particular, only
//     the last part of the very large object that extends into but
//     does not completely span a particular cluster is unnecessarily
//     scanned.  Thus, for each very large object, the maximum
//     over-scan is the size of memory spanned by a single cluster.
//  3. The representation of pointer location descriptive information
//     within Klass representations is not designed for efficient
//     "random access".  An alternative approach to this design would
//     be to scan very large objects multiple times, once for each
//     cluster that is spanned by the object's range.  This reduces
//     unnecessary overscan, but it introduces different sorts of
//     overhead effort:
//       i) For each spanned cluster, we have to look up the start of
//          the crossing object.
//      ii) Each time we scan the very large object, we have to
//          sequentially walk through its pointer location
//          descriptors, skipping over all of the pointers that
//          precede the start of the range of addresses that we
//          consider relevant.
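//
// To make the logical-and merge of overreach entries concrete
// (illustration only): cardTable.hpp defines clean_card_val() as 0xff
// and dirty_card_val() as 0, so AND-ing an overreach byte into the
// corresponding card-table byte propagates DIRTY marks and never loses
// one:
//
//   0xff & 0xff == 0xff   // both clean              -> stays clean
//   0xff & 0x00 == 0x00   // overreach marked dirty  -> becomes dirty
//   0x00 & 0xff == 0x00   // card already dirty      -> stays dirty
//
// The same identity is what lets merge_write_table(), defined above,
// fold the write table into the read table one word at a time.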

// Because old-gen heap memory is not necessarily contiguous, and
// because cards are not necessarily maintained for young-gen memory,
// consecutive card numbers do not necessarily correspond to consecutive
// address ranges.  For the traditional direct-card-marking
// implementation of this interface, consecutive card numbers are
// likely to correspond to contiguous regions of memory, but this
// should not be assumed.  Instead, rely only upon the following:
//
//  1. All card numbers for cards pertaining to the same
//     ShenandoahHeapRegion are consecutively numbered.
//  2. In the case that neighboring ShenandoahHeapRegions both
//     represent old-gen memory, the card regions that span the
//     boundary between these neighboring heap regions will be
//     consecutively numbered.
//  3. (A corollary) In the case that an old-gen object spans the
//     boundary between two heap regions, the card regions that
//     correspond to the span of this object will be consecutively
//     numbered.

// ShenandoahCardCluster abstracts access to the remembered set
// and also keeps track of crossing map information to allow efficient
// resolution of object start addresses.
//
// ShenandoahCardCluster supports all of the services of
// RememberedSet, plus it supports register_object() and lookup_object().
//
// There are two situations under which we need to know the location
// at which the object spanning the start of a particular card-table
// memory region begins:
//
// 1. When we begin to scan dirty card memory that is not the
//    first card region within a cluster, and the object that
//    crosses into this card memory was not previously scanned,
//    we need to find where that object starts so we can scan it.
//    (Asides: if the object starts within a previous cluster, it
//    has already been scanned.  If the object starts within this
//    cluster and it spans at least one card region that is dirty
//    and precedes this card region within the cluster, then it has
//    already been scanned.)
// 2. When we are otherwise done scanning a complete cluster, if the
//    last object within the cluster reaches into the following
//    cluster, we need to scan this object.  Thus, we need to find
//    its starting location.
//
// The RememberedSet template parameter is intended to represent either
// ShenandoahDirectCardMarkRememberedSet, or a to-be-implemented
// ShenandoahBufferWithSATBRememberedSet.
template<typename RememberedSet>
class ShenandoahCardCluster: public CHeapObj<mtGC> {

 private:
  RememberedSet *_rs;

 public:
  static const size_t CardsPerCluster = 64;

 private:
  typedef struct cross_map { uint8_t first; uint8_t last; } xmap;
  typedef union crossing_info { uint16_t short_word; xmap offsets; } crossing_info;

  // The ObjectStartsInCardRegion bit is set within crossing_info.offsets.first iff at least one object starts within
  // a particular card region.  We pack this bit into the first-start byte under the assumption that the first-start
  // byte is accessed less frequently than the last-start byte.  This is true when the number of clean cards is greater
  // than the number of dirty cards.
  static const uint16_t ObjectStartsInCardRegion = 0x80;
  static const uint16_t FirstStartBits           = 0x3f;

  crossing_info *object_starts;

 public:
  // If we're setting first_start, assume the card has an object.
  inline void set_first_start(size_t card_index, uint8_t value) {
    object_starts[card_index].offsets.first = ObjectStartsInCardRegion | value;
  }

  inline void set_last_start(size_t card_index, uint8_t value) {
    object_starts[card_index].offsets.last = value;
  }

  inline void set_has_object_bit(size_t card_index) {
    object_starts[card_index].offsets.first |= ObjectStartsInCardRegion;
  }

  inline void clear_has_object_bit(size_t card_index) {
    object_starts[card_index].offsets.first &= ~ObjectStartsInCardRegion;
  }

  // Returns true iff an object is known to start within the card memory associated with card card_index.
  inline bool has_object(size_t card_index) {
    return (object_starts[card_index].offsets.first & ObjectStartsInCardRegion) != 0;
  }

  inline void clear_objects_in_range(HeapWord *addr, size_t num_words) {
    size_t card_index = _rs->card_index_for_addr(addr);
    size_t last_card_index = _rs->card_index_for_addr(addr + num_words - 1);
    while (card_index <= last_card_index)
      object_starts[card_index++].short_word = 0;
  }

  ShenandoahCardCluster(RememberedSet *rs) {
    _rs = rs;
    // TODO: We don't really need object_starts entries for every card entry.  We only need these for
    // the card entries that correspond to old-gen memory.  But for now, let's be quick and dirty.
    object_starts = (crossing_info *) malloc(rs->total_cards() * sizeof(crossing_info));
    if (object_starts == nullptr)
      fatal("Insufficient memory for initializing heap");
    for (size_t i = 0; i < rs->total_cards(); i++)
      object_starts[i].short_word = 0;
  }

  ~ShenandoahCardCluster() {
    if (object_starts != nullptr)
      free(object_starts);
    object_starts = nullptr;
  }

  // There is one entry within the object_starts array for each card entry.
  //
  // In the most recent implementation of ShenandoahScanRemembered::process_clusters(),
  // there is no need for the get_crossing_object_start() method function, so there is no
  // need to maintain the following information.  The comment is left in place for now in
  // case we find it necessary to add support for this service at a later time.
  //
  // Bits 0x7fff: If no object starts within this card region, the
  //              remaining bits of the object_starts array represent
  //              the absolute word offset within the enclosing
  //              cluster's memory of the starting address for the
  //              object that spans the start of this card region's
  //              memory.  If the spanning object begins in memory
  //              that precedes this card region's cluster, the value
  //              stored in these bits is the special value 0x7fff.
  //              (Note that the maximum value required to represent a
  //              spanning object from within the current cluster is
  //              ((63 * 64) - 8), which equals 0x0fbf.)
  //
  // In the absence of the need to support get_crossing_object_start(),
  // here is discussion of performance:
  //
  // Suppose multiple garbage objects are coalesced during GC sweep
  // into a single larger "free segment".  As each two objects are
  // coalesced together, the start information pertaining to the second
  // object must be removed from the object_starts array.  If the
  // second object had been the first object within card memory,
  // the new first object is the object that follows that object if
  // that starts within the same card memory, or NoObject if the
  // following object starts within the following cluster.
  // If the second object had been the last object in the card memory,
  // replace this entry with the newly coalesced object if it starts
  // within the same card memory, or with NoObject if it starts in a
  // preceding card's memory.
  //
  // Suppose a large free segment is divided into a smaller free
  // segment and a new object.  The second part of the newly divided
  // memory must be registered as a new object, overwriting at most
  // one first_start and one last_start entry.  Note that one of the
  // newly divided two objects might be a new GCLAB.
  //
  // Suppose postprocessing of a GCLAB finds that the original GCLAB
  // has been divided into N objects.  Each of the N newly allocated
  // objects will be registered, overwriting at most one first_start
  // and one last_start entry.
  //
  // No object registration operations are linear in the length of
  // the registered objects.
  //
  // Consider further the following observations regarding object
  // registration costs:
  //
  //  1. The cost is paid once for each old-gen object (except when
  //     an object is demoted and repromoted, in which case we would
  //     pay the cost again).
  //  2. The cost can be deferred so that there is no urgency during
  //     mutator copy-on-first-access promotion.  Background GC
  //     threads will update the object_starts array by post-
  //     processing the contents of retired PLAB buffers.
  //  3. The bet is that these costs are paid relatively rarely
  //     because:
  //     a) Most objects die young and objects that die in young-gen
  //        memory never need to be registered with the object_starts
  //        array.
  //     b) Most objects that are promoted into old-gen memory live
  //        there without further relocation for a relatively long
  //        time, so we get a lot of benefit from each investment
  //        in registering an object.

 public:

  // The starting locations of objects contained within old-gen memory
  // are registered as part of the remembered set implementation.  This
  // information is required when scanning dirty card regions that are
  // spanned by objects beginning within preceding card regions.  It
  // is necessary to find the first and last objects that begin within
  // this card region.  Starting addresses of objects are required to
  // find the object headers, and object headers provide information
  // about which fields within the object hold addresses.
  //
  // The old-gen memory allocator invokes register_object() for any
  // object that is allocated within old-gen memory.  This identifies
  // the starting addresses of objects that span boundaries between
  // card regions.
  //
  // It is not necessary to invoke register_object at the very instant
  // an object is allocated.  It is only necessary to invoke it
  // prior to the next start of a garbage collection concurrent mark
  // or concurrent update-references phase.  An "ideal" time to register
  // objects is during post-processing of a GCLAB after the GCLAB is
  // retired due to depletion of its memory.
  //
  // register_object() does not perform synchronization.  In the case
  // that multiple threads are registering objects whose starting
  // addresses are within the same cluster, races between these
  // threads may result in corruption of the object-start data
  // structures.
  // Parallel GC threads should avoid registering objects
  // residing within the same cluster by adhering to the following
  // coordination protocols:
  //
  //  1. Align thread-local GCLAB buffers with some TBD multiple of
  //     card clusters.  The card cluster size is 32 KB.  If the
  //     desired GCLAB size is 128 KB, align the buffer on a multiple
  //     of 4 card clusters.
  //  2. Post-process the contents of GCLAB buffers to register the
  //     objects allocated therein.  Allow one GC thread at a
  //     time to do the post-processing of each GCLAB.
  //  3. Since only one GC thread at a time is registering objects
  //     belonging to a particular allocation buffer, no locking
  //     is performed when registering these objects.
  //  4. Any remnant of unallocated memory within an expended GC
  //     allocation buffer is not returned to the old-gen allocation
  //     pool until after the GC allocation buffer has been post-
  //     processed.  Before any remnant memory is returned to the
  //     old-gen allocation pool, the GC thread that scanned this GC
  //     allocation buffer performs a write-commit memory barrier.
  //  5. Background GC threads that perform tenuring of young-gen
  //     objects without a GCLAB use a CAS lock before registering
  //     each tenured object.  The CAS lock assures both mutual
  //     exclusion and memory coherency/visibility.  Note that an
  //     object tenured by a background GC thread will not overlap
  //     with any of the clusters that are receiving tenured objects
  //     by way of GCLAB buffers.  Multiple independent GC threads may
  //     attempt to tenure objects into a shared cluster.  This is why
  //     synchronization may be necessary.  Consider the following
  //     scenarios:
  //
  //     a) If two objects are tenured into the same card region, each
  //        registration may attempt to modify the first-start or
  //        last-start information associated with that card region.
  //        Furthermore, because the representations of first-start
  //        and last-start information within the object_starts array
  //        entry use different bits of a shared uint16_t to represent
  //        each, it is necessary to lock the entire card entry
  //        before modifying either the first-start or last-start
  //        information within the entry.
  //     b) Suppose GC thread X promotes a tenured object into
  //        card region A and this tenured object spans into
  //        neighboring card region B.  Suppose GC thread Y (not equal
  //        to X) promotes a tenured object into cluster B.  GC thread X
  //        will update the object_starts information for card A.  No
  //        synchronization is required.
  //     c) In summary, when background GC threads register objects
  //        newly tenured into old-gen memory, they must acquire a
  //        mutual exclusion lock on the card that holds the starting
  //        address of the newly tenured object.  This can be achieved
  //        by using a CAS instruction to assure that the previous
  //        values of first-offset and last-offset have not been
  //        changed since the same thread inquired as to their most
  //        current values.
  //
  //     One way to minimize the need for synchronization between
  //     background tenuring GC threads is for each tenuring GC thread
  //     to promote young-gen objects into distinct dedicated cluster
  //     ranges.
  //  6. The object_starts information is only required during the
  //     starting of concurrent marking and concurrent evacuation
  //     phases of GC.
  //     Before we start either of these GC phases, the
  //     JVM enters a safepoint and all GC threads perform
  //     commit-write barriers to assure that access to the
  //     object_starts information is coherent.

  // Notes on synchronization of register_object():
  //
  //  1. For efficiency, there is no locking in the implementation of register_object().
  //  2. Thus, it is required that users of this service assure that concurrent/parallel invocations of
  //     register_object() do not pertain to the same card's memory range.  See the discussion below to understand
  //     the risks.
  //  3. When allocating from a TLAB or GCLAB, the mutual exclusion can be guaranteed by assuring that each
  //     LAB's start and end are aligned on card memory boundaries.
  //  4. Use the same lock that guarantees exclusivity when performing free-list allocation within heap regions.
  //
  // Register the newly allocated object while we're holding the global lock since there's no synchronization
  // built in to the implementation of register_object().  There are potential races when multiple independent
  // threads are allocating objects, some of which might span the same card region.  For example, consider
  // a card table's memory region within which three objects are being allocated by three different threads:
  //
  //    objects being "concurrently" allocated:
  //        [-----a------][-----b-----][--------------c------------------]
  //              [---- card table memory range --------------]
  //
  // Before any objects are allocated, this card's memory range holds no objects.  Note that:
  //   allocation of object a wants to set the has-object, first-start, and last-start attributes of the preceding card region.
  //   allocation of object b wants to set the has-object, first-start, and last-start attributes of this card region.
  //   allocation of object c also wants to set the has-object, first-start, and last-start attributes of this card region.
  //
  // The thread allocating b and the thread allocating c can "race" in various ways, resulting in confusion, such as last-start
  // representing object b while first-start represents object c.  This is why we require all register_object()
  // invocations associated with objects that are allocated from "free lists" to provide their own mutual exclusion locking
  // mechanism.

  // Reset the has_object() information to false for all cards in the range between from and to.
  void reset_object_range(HeapWord *from, HeapWord *to);

  // register_object() requires that the caller hold the heap lock
  // before calling it.
  void register_object(HeapWord* address);

  // register_object_wo_lock() does not require that the caller hold
  // the heap lock before calling it, under the assumption that the
  // caller has assured that no other thread will endeavor to concurrently
  // register objects that start within the same card's memory region
  // as address.
  void register_object_wo_lock(HeapWord* address);

  // During the reference updates phase of GC, we walk through each old-gen memory region that was
  // not part of the collection set and we invalidate all unmarked objects.
  // As part of this effort,
  // we coalesce neighboring dead objects in order to make future remembered set scanning more
  // efficient (since future remembered set scanning of any card region containing consecutive
  // dead objects can skip over all of them at once by reading only a single dead object header
  // instead of having to read the header of each of the coalesced dead objects).
  //
  // At some future time, we may implement a further optimization: satisfy future allocation requests
  // by carving new objects out of the range of memory that represents the coalesced dead objects.
  //
  // Suppose we want to combine several dead objects into a single coalesced object.  How does this
  // impact our representation of crossing map information?
  //  1. If the newly coalesced range is contained entirely within a card range, that card's last
  //     start entry either remains the same or it is changed to the start of the coalesced region.
  //  2. For the card that holds the start of the coalesced object, it will not impact the first start
  //     but it may impact the last start.
  //  3. For following cards spanned entirely by the newly coalesced object, it will change has_object
  //     to false (and make first-start and last-start "undefined").
  //  4. For a following card that is spanned partially by the newly coalesced object, it may change
  //     the first-start value, but it will not change the last-start value.
  //
  // The range of addresses represented by the arguments to coalesce_objects() must represent a range
  // of memory that was previously occupied exactly by one or more previously registered objects.  For
  // convenience, it is legal to invoke coalesce_objects() with arguments that span a single previously
  // registered object.
  //
  // The role of coalesce_objects is to change the crossing map information associated with all of the
  // coalesced objects.
  void coalesce_objects(HeapWord* address, size_t length_in_words);

  // The typical use case is going to look something like this:
  //   for each heapregion that comprises old-gen memory
  //     for each card number that corresponds to this heap region
  //       scan the objects contained therein if the card is dirty
  // To avoid excessive lookups in a sparse array, the API queries
  // the card number pertaining to a particular address and then uses the
  // card number for subsequent information lookups and stores.

  // If has_object(card_index), this returns the word offset within this card
  // memory at which the first object begins.  If !has_object(card_index), the
  // result is a don't care value.
  size_t get_first_start(size_t card_index);

  // If has_object(card_index), this returns the word offset within this card
  // memory at which the last object begins.  If !has_object(card_index), the
  // result is a don't care value.
  size_t get_last_start(size_t card_index);

};

// ShenandoahScanRemembered is a concrete class representing the
// ability to scan the old-gen remembered set for references to
// objects residing in young-gen memory.
//
// Scanning normally begins with an invocation of numRegions and ends
// after all clusters of all regions have been scanned.
//
// Throughout the scanning effort, the number of regions does not
// change.
//
// Even though the regions that comprise old-gen memory are not
// necessarily contiguous, the abstraction represented by this class
// identifies each of the old-gen regions with an integer value
// in the range from 0 to (numRegions() - 1) inclusive.
//

template<typename RememberedSet>
class ShenandoahScanRemembered: public CHeapObj<mtGC> {

 private:
  RememberedSet* _rs;
  ShenandoahCardCluster<RememberedSet>* _scc;

 public:
  // How to instantiate this object?
  //   ShenandoahDirectCardMarkRememberedSet *rs =
  //     new ShenandoahDirectCardMarkRememberedSet(card_table, total_card_count);
  //   scr = new
  //     ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet>(rs);
  //
  // or, after the planned implementation of
  // ShenandoahBufferWithSATBRememberedSet has been completed:
  //
  //   ShenandoahBufferWithSATBRememberedSet *rs =
  //     new ShenandoahBufferWithSATBRememberedSet();
  //   scr = new
  //     ShenandoahScanRemembered<ShenandoahBufferWithSATBRememberedSet>(rs);

  ShenandoahScanRemembered(RememberedSet *rs) {
    _rs = rs;
    _scc = new ShenandoahCardCluster<RememberedSet>(rs);
  }

  ~ShenandoahScanRemembered() {
    delete _scc;
  }

  // TODO: We really don't want to share all of these APIs with arbitrary consumers of the ShenandoahScanRemembered abstraction.
  // But in the spirit of quick and dirty for the time being, I'm going to go ahead and publish everything for right now.  Some
  // of the existing code already depends on having access to these services (because existing code has not been written to
  // honor full abstraction of remembered set scanning).  In the not too distant future, we want to try to make most, if not
  // all, of these services private.  Two problems with publicizing:
  //  1. Allowing arbitrary users to reach beneath the hood allows the users to make assumptions about the underlying
  //     implementation.  This will make it more difficult to change the underlying implementation at a future time, such as
  //     when we eventually experiment with an SATB-based implementation of the remembered set representation.
  //  2. If we carefully control sharing of certain of these services, we can reduce the overhead of synchronization by assuring
  //     that all users follow protocols that avoid contention that might require synchronization.  When we publish these APIs,
  //     we lose control over who accesses the data and how.  As a result, we are required to insert more defensive measures
  //     into the implementation, including synchronization locks.

  // Card index is zero-based relative to first spanned card region.
  size_t total_cards();
  size_t card_index_for_addr(HeapWord *p);
  HeapWord *addr_for_card_index(size_t card_index);
  bool is_card_dirty(size_t card_index);
  bool is_write_card_dirty(size_t card_index) { return _rs->is_write_card_dirty(card_index); }
  void mark_card_as_dirty(size_t card_index);
  void mark_range_as_dirty(size_t card_index, size_t num_cards);
  void mark_card_as_clean(size_t card_index);
  void mark_read_card_as_clean(size_t card_index) { _rs->mark_read_card_as_clean(card_index); }
  void mark_range_as_clean(size_t card_index, size_t num_cards);
  void mark_overreach_card_as_dirty(size_t card_index);
  bool is_card_dirty(HeapWord *p);
  void mark_card_as_dirty(HeapWord *p);
  void mark_range_as_dirty(HeapWord *p, size_t num_heap_words);
  void mark_card_as_clean(HeapWord *p);
  void mark_range_as_clean(HeapWord *p, size_t num_heap_words);
  void mark_overreach_card_as_dirty(void *p);
  size_t cluster_count();
  void initialize_overreach(size_t first_cluster, size_t count);
  void merge_overreach(size_t first_cluster, size_t count);

  // Called by GC thread at start of concurrent mark to exchange roles of read and write remembered sets.
  void swap_remset() { _rs->swap_remset(); }

  void reset_remset(HeapWord* start, size_t word_count) { _rs->reset_remset(start, word_count); }

  void merge_write_table(HeapWord* start, size_t word_count) { _rs->merge_write_table(start, word_count); }

  // Called by GC thread after scanning old remembered set in order to prepare for next GC pass
  void clear_old_remset() { _rs->clear_old_remset(); }

  size_t cluster_for_addr(HeapWord *addr);

  void reset_object_range(HeapWord *from, HeapWord *to);
  void register_object(HeapWord *addr);
  void register_object_wo_lock(HeapWord *addr);
  void coalesce_objects(HeapWord *addr, size_t length_in_words);

  // Return true iff this object is "properly" registered.
  bool verify_registration(HeapWord* address, ShenandoahMarkingContext* ctx);

  // Mark the cards spanning this range as clean, and clear the object_starts info to no objects.
  void mark_range_as_empty(HeapWord *addr, size_t length_in_words);

  // process_clusters() scans a portion of the remembered set during a JVM
  // safepoint as part of the root scanning activities that serve to
  // initiate concurrent scanning and concurrent evacuation.  Multiple
  // threads may scan different portions of the remembered set by
  // making parallel invocations of process_clusters() with each
  // invocation scanning different clusters of the remembered set.
  //
  // An invocation of process_clusters() examines all of the
  // intergenerational references spanned by count clusters starting
  // with first_cluster.  The oops argument is assumed to represent a
  // thread-local OopClosure into which addresses of intergenerational
  // pointer values will be accumulated for the purposes of root scanning.
  //
  // A side effect of executing process_clusters() is to update the card
  // table entries, marking dirty cards as clean if they no longer
  // hold references to young-gen memory.  (THIS IS NOT YET IMPLEMENTED.)
  //
  // The implementation of process_clusters() is designed to efficiently
  // minimize work in the large majority of cases for which the
  // associated cluster has very few dirty card-table entries.
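  //
  // A minimal usage sketch for driving a scan over the old generation's
  // remembered set (hypothetical driver code: the closure type and the
  // is_old() predicate are assumptions, not part of this header):
  //
  //   ShenandoahHeap* heap = ShenandoahHeap::heap();
  //   RememberedScanner* scanner = heap->card_scan();  // hypothetical accessor
  //   ShenandoahSomeRootsClosure cl;                   // hypothetical OopClosure
  //   for (size_t i = 0; i < heap->num_regions(); i++) {
  //     ShenandoahHeapRegion* region = heap->get_region(i);
  //     if (region->is_old()) {                        // hypothetical predicate
  //       scanner->process_region(region, &cl);
  //     }
  //   }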
  //
  // At initialization of concurrent marking, invoke process_clusters with
  // ClosureType equal to ShenandoahInitMarkRootsClosure.
  //
  // At initialization of concurrent evacuation, invoke process_clusters with
  // ClosureType equal to ShenandoahEvacuateUpdateRootsClosure.

  // This is big enough that it probably shouldn't be in-lined.  On the other hand, there are only a few places this
  // code is called from, so it might as well be in-lined.  The "real" reason I'm inlining at the moment is because
  // the template expansions were making it difficult for the link/loader to resolve references to the template-
  // parameterized implementations of this service.
  template <typename ClosureType>
  inline void process_clusters(size_t first_cluster, size_t count, HeapWord *end_of_range, ClosureType *oops);

  template <typename ClosureType>
  inline void process_clusters(size_t first_cluster, size_t count, HeapWord *end_of_range, ClosureType *oops, bool use_write_table);

  template <typename ClosureType>
  inline void process_region(ShenandoahHeapRegion* region, ClosureType *cl);

  template <typename ClosureType>
  inline void process_region(ShenandoahHeapRegion* region, ClosureType *cl, bool use_write_table);

  // To Do:
  //  Create subclasses of ShenandoahInitMarkRootsClosure and
  //  ShenandoahEvacuateUpdateRootsClosure and any other closures
  //  that need to participate in remembered set scanning.  Within the
  //  subclasses, add a (probably templated) instance variable that
  //  refers to the associated ShenandoahCardCluster object.  Use this
  //  ShenandoahCardCluster instance to "enhance" the do_oops
  //  processing so that we can:
  //
  //   1. Avoid processing references that correspond to clean card
  //      regions, and
  //   2. Set card status to CLEAN when the associated card region no
  //      longer holds inter-generational references.
  //
  //  To enable efficient implementation of these behaviors, we
  //  probably also want to add a few fields into the
  //  ShenandoahCardCluster object that allow us to precompute and
  //  remember the addresses at which card status is going to change
  //  from dirty to clean and clean to dirty.  The do_oops
  //  implementations will want to update this value each time they
  //  cross one of these boundaries.
  void roots_do(OopIterateClosure* cl);
};

typedef ShenandoahScanRemembered<ShenandoahDirectCardMarkRememberedSet> RememberedScanner;

class ShenandoahScanRememberedTask : public WorkerTask {
 private:
  ShenandoahObjToScanQueueSet* _queue_set;
  ShenandoahObjToScanQueueSet* _old_queue_set;
  ShenandoahReferenceProcessor* _rp;
  ShenandoahRegionIterator* _regions;

 public:
  ShenandoahScanRememberedTask(ShenandoahObjToScanQueueSet* queue_set,
                               ShenandoahObjToScanQueueSet* old_queue_set,
                               ShenandoahReferenceProcessor* rp,
                               ShenandoahRegionIterator* regions);

  void work(uint worker_id);
};

#endif // SHARE_GC_SHENANDOAH_SHENANDOAHSCANREMEMBERED_HPP